diff --git a/java/org/slub/rosetta/dps/repository/plugin/SLUBXmlFormatValidationPlugin.java b/java/org/slub/rosetta/dps/repository/plugin/SLUBXmlFormatValidationPlugin.java index 6a3b442be40fdeb2df8b1c171d3e8e5e65c89b6e..053af3963b908645a8c21558d3744032047c1458 100644 --- a/java/org/slub/rosetta/dps/repository/plugin/SLUBXmlFormatValidationPlugin.java +++ b/java/org/slub/rosetta/dps/repository/plugin/SLUBXmlFormatValidationPlugin.java @@ -31,8 +31,10 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Optional; /** * SLUBXmlFormatValidationPlugin @@ -48,6 +50,19 @@ public class SLUBXmlFormatValidationPlugin implements FormatValidationPlugin { private final DocumentBuilderFactory dbf = DocumentBuilderFactory.newDefaultInstance(); private final List<String> errors = new ArrayList<>(); private final List<String> details = new ArrayList<>(); + private static final HashSet<validationSchema> namespaceSchemaMap = new HashSet<>() { + { + add(new validationSchema("http://www.loc.gov/standards/alto/ns-v2#", ValidationSchemaType.schema, "http://www.loc.gov/standards/alto/alto-v2.0.xsd")); + add(new validationSchema("http://www.loc.gov/mods/v3", ValidationSchemaType.schema, "http://www.loc.gov/standards/mods/v3/mods-3-8.xsd")); + add(new validationSchema("http://www.lido-schema.org", ValidationSchemaType.schema, "http://www.lido-schema.org/schema/v1.1/lido-v1.1.xsd")); + add(new validationSchema( "http://slubarchiv.slub-dresden.de/rights1", ValidationSchemaType.schema, "https://slubarchiv.slub-dresden.de/fileadmin/groups/slubsite/slubarchiv/standards/rights/rights1.xsd")); +// put("http://www.opengis.net/citygml/profiles/base/1.0", ""); +// put("http://www.opengis.net/kml/2.2", ""); +// put("http://www.music-encoding.org/ns/mei", ""); +// put("http://www.tei-c.org/ns/1.0", ""); + } + }; + private ValidationCatalogResolver validationCatalogResolver = null; private final ErrorHandler validationErrorHandler = new ErrorHandler() { @Override @@ -109,6 +124,33 @@ public class SLUBXmlFormatValidationPlugin implements FormatValidationPlugin { + "\n" ); } + private static Optional<validationSchema> assignSchema(Document doc) { + xmlInfoRecord info = getXMLinfo(doc); + Optional<validationSchema> optEle = Optional.empty(); + if (null == info.nameSpaceUri) { + /* try if a DTD is assignable */ + var type = assignDtdIfApplicable(doc); + if (type.equals(ValidationSchemaType.dtd)) { + System.out.println("found schema " + type); + var ele = new validationSchema(info.nameSpaceUri, type, info.systemID); + optEle = Optional.of(ele); + } + } else { + optEle = namespaceSchemaMap.stream() + .filter( + entry -> (entry.schemaType.equals(ValidationSchemaType.schema)) && (entry.nameSpace.equals(info.nameSpaceUri)) + ) + .findAny(); + } + if ( optEle.isPresent() ) { + System.out.println("found namespace " + optEle.get().nameSpace ); + System.out.println("found schematype " + optEle.get().schemaType ); + System.out.println("found schemaURL " + optEle.get().schemaURL ); + } else { + System.out.println("no element found"); + } + return optEle; + } @Override @@ -118,6 +160,7 @@ public class SLUBXmlFormatValidationPlugin implements FormatValidationPlugin { wellformed = true; errors.clear(); valid = validateAgainstSchema(filePath); + System.out.println("ok no error ->" + valid ); } } catch (ParserConfigurationException e) { reportError("ParserconfExc file=" + filePath + " Exc:" + e.getMessage()); @@ -127,44 +170,82 @@ public class SLUBXmlFormatValidationPlugin implements FormatValidationPlugin { reportError("not a XML file, " + e.getMessage(), filePath); e.getStackTrace(); } + // debug + System.out.println("errors:" ); + System.out.println("----"); + for (var line: errors) { + System.out.println("\t" + line); + } + System.out.println("----"); return valid; } + private static ValidationSchemaType assignDtdIfApplicable(Document doc) { + var info = getXMLinfo(doc); + if (null != info.systemID && info.systemID.endsWith(".dtd")) { + return ValidationSchemaType.dtd; + } else if (null != info.systemID ) { + + } + return ValidationSchemaType.nothing; + } + private boolean checkIfWellformed(String filePath) throws ParserConfigurationException, IOException, SAXException { /* detect XML type via NS */ boolean isWellformedXml = false; dbf.setAttribute("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + dbf.setValidating(false); DocumentBuilder db = dbf.newDocumentBuilder(); Document doc = db.parse(new File(filePath)); xmlInfoRecord info = getXMLinfo(doc); reportDetail("detect XML type via NS:" + info.nameSpaceUri); - //printXMLinfo(doc); + /* TODO: align corresponding Schema based on systemID */ + Optional<validationSchema> schema = assignSchema(doc); + if (schema.isEmpty()) { + reportError("there is no related schema found in *our* catalog of allowed XML types.", filePath); + } else { + reportDetail("assigned schema of type: " + schema.get().schemaType); + reportDetail("assigned schema url: " + schema.get().schemaURL); + if (schema.get().schemaType == ValidationSchemaType.dtd) { + assert(dbf.isValidating() == false); + dbf.setValidating(true); /* only used if DTD */ + assert(dbf.isValidating() == true); + dbf.setFeature(XMLConstants.USE_CATALOG, true); + dbf.setAttribute("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + System.out.println("-> dtd detected, use catalog"); + } else if (!schema.get().schemaURL.isBlank()) { + System.out.println("-> set schema to " + schema.get().schemaURL); + dbf.setSchema(schema.get().schemaInst); + assert(dbf.getSchema() != null); + } + } + + printXMLinfo(doc); if (!info.xmlVersion.equals("1.0")) { reportError("not an expected XML 1.0 document, found " + info.xmlVersion, filePath); } else { isWellformedXml = true; reportDetail("checked XML is wellformed"); + } return isWellformedXml; } private boolean validateAgainstSchema(String filePath) throws ParserConfigurationException, SAXException, IOException { boolean isValidXml = false; - dbf.setFeature(XMLConstants.USE_CATALOG, true); - dbf.setAttribute("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); - dbf.setValidating(true); + //dbf.setAttribute(); + dbf.setXIncludeAware(true); + dbf.setNamespaceAware(true); DocumentBuilder dbValidate = dbf.newDocumentBuilder(); Document docValidate; dbValidate.setEntityResolver(validationCatalogResolver); dbValidate.setErrorHandler(validationErrorHandler); reportDetail("align entitity resolver"); docValidate = dbValidate.parse(new File(filePath)); - if (dbValidate.isValidating()) { - docValidate.getXmlVersion(); - reportDetail("reparse using own catalog"); - if (errors.isEmpty()) { - isValidXml = true; - } + docValidate.getXmlVersion(); + reportDetail("reparse using own catalog"); + if (errors.isEmpty()) { + isValidXml = true; } return isValidXml; }