From 69cd7b9b3abd1df196e62ab636c805b712a5ca10 Mon Sep 17 00:00:00 2001 From: Andreas Romeyke <art1@andreas-romeyke.de> Date: Wed, 26 Jan 2022 18:34:42 +0100 Subject: [PATCH] - changed to use mediainfo as mdextractor - loading xslt added - xslt transformation added (needs fixes because of an I/O error) --- ...icalMetadataExtractorMediaConchPlugin.java | 353 +++++++++--------- 1 file changed, 177 insertions(+), 176 deletions(-) diff --git a/java/org/slub/rosetta/dps/repository/plugin/SLUBTechnicalMetadataExtractorMediaConchPlugin.java b/java/org/slub/rosetta/dps/repository/plugin/SLUBTechnicalMetadataExtractorMediaConchPlugin.java index 5c5a3f0..28e6947 100644 --- a/java/org/slub/rosetta/dps/repository/plugin/SLUBTechnicalMetadataExtractorMediaConchPlugin.java +++ b/java/org/slub/rosetta/dps/repository/plugin/SLUBTechnicalMetadataExtractorMediaConchPlugin.java @@ -1,5 +1,5 @@ /* -2017 by Andreas Romeyke (SLUB Dresden) +2017-2022 by Andreas Romeyke (SLUB Dresden) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -19,16 +19,25 @@ package org.slub.rosetta.dps.repository.plugin; import com.exlibris.core.sdk.strings.StringUtils; import com.exlibris.dps.sdk.techmd.MDExtractorPlugin; - +import org.xml.sax.InputSource; +import org.xml.sax.XMLReader; + +import javax.xml.parsers.SAXParserFactory; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.sax.SAXSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.transform.stream.StreamSource; import java.io.BufferedReader; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; +import java.io.OutputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; /** * SLUBTechnicalMetadataExtractorMediaConchPlugin @@ -41,16 +50,14 @@ public class SLUBTechnicalMetadataExtractorMediaConchPlugin implements MDExtract private String mediaconch_binary_path; private String mediaconch_profile_path; - private String ffprobe_binary_path; + private String mediainfo_binary_path; private List<String> extractionErrors = new ArrayList<String>(); private List<String> validationLog = new ArrayList<String>(); private boolean isvalid = false; private boolean iswellformed = false; - - - private Map<String,String> attributes = new HashMap<String, String>(); + //static final ExLogger log = ExLogger.getExLogger(SLUBTechnicalMetadataExtractorMediaConchPlugin.class, ExLogger.VALIDATIONSTACK); /** constructor */ public SLUBTechnicalMetadataExtractorMediaConchPlugin() { @@ -63,32 +70,15 @@ public class SLUBTechnicalMetadataExtractorMediaConchPlugin implements MDExtract public void initParams(Map<String, String> initp) { this.mediaconch_binary_path = initp.get("mediaconch_binary_path").trim(); this.mediaconch_profile_path = initp.get("mediaconch_profile_path").trim(); - this.ffprobe_binary_path = 
initp.get("ffprobe_binary_path").trim(); + this.mediainfo_binary_path = initp.get("mediainfo_binary_path").trim(); System.out.println("SLUBTechnicalMetadataExtractorMediaConchPlugin instantiated with " + " mediaconch_binary_path=" + mediaconch_binary_path + " mediaconch_profile_path=" + mediaconch_profile_path - + " ffprobe_binary_path=" + ffprobe_binary_path + + " mediainfo_binary_path=" + mediainfo_binary_path ); } - /* ffprobe output of metadata - supports different outputs. we are using the flat-model, see WRITERS section in ffprobe manual - the streams will be mapped as: streams.stream.0.$property - the separator is "=" - */ - private void parse_ffprobe_flat_output(String exiftoolxml ) { - // see output of exiftool -X, alternatively check http://ns.exiftool.ca/ExifTool/1.0/ - Pattern p = Pattern.compile("([^=]+)=(.*)"); - Matcher m = p.matcher(exiftoolxml); - if (m.matches()) { - String key = m.group(1); - String value = m.group(2); - System.out.println("matcher: key=" + key + " value=" + value); - attributes.put(key, value); - } - } - @Override public void extract(String filePath) throws Exception { if (StringUtils.isEmptyString(mediaconch_binary_path)) { @@ -99,12 +89,10 @@ public class SLUBTechnicalMetadataExtractorMediaConchPlugin implements MDExtract //log.error("No mediaconch_config_path defined. 
Please set the plugin parameter to hold your mediaconch_config_path."); throw new Exception("mediaconch_profile_path not found"); } - if (StringUtils.isEmptyString(ffprobe_binary_path)) { + if (StringUtils.isEmptyString(mediainfo_binary_path)) { - throw new Exception("ffprobe_binary_path (part of ffmpeg) not found"); + throw new Exception("mediainfo_binary_path not found"); } - - // mediaconch validation try { String execstring = this.mediaconch_binary_path + " " + filePath + " " + this.mediaconch_profile_path; @@ -141,25 +129,48 @@ public class SLUBTechnicalMetadataExtractorMediaConchPlugin implements MDExtract */ try { - String execstring = this.ffprobe_binary_path + " -print_format flat -v error -show_format -show_streams -show_entries stream=r_frame_rate" + filePath; + String execstring = this.mediainfo_binary_path + " -f --Output=XML " + filePath; System.out.println("executing: " + execstring); Process p = Runtime.getRuntime().exec(execstring); p.waitFor(); BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream())); String line=reader.readLine(); - String response=""; + StringBuilder mediainfo_output= new StringBuilder(); while (line != null) { System.out.println(line); - parse_ffprobe_flat_output(line.trim()); - response+=line; + mediainfo_output.append(line); line = reader.readLine(); } - attributes.put("ffprobe-log", response.trim()); - - } catch (IOException e) { - //log.error("exception creation socket, clamd not available at host=" + host + "port=" + port, e); - - + /* xslt transform */ + InputStream stylestream = getClass().getResourceAsStream("resources/transformer.xsl"); + StreamSource stylesource = new StreamSource( stylestream); + // Use a Transformer for output + SAXParserFactory spf = SAXParserFactory.newInstance(); + spf.setNamespaceAware(true); + /* disabled external DTD loading, which does not work with class ressource */ + spf.setValidating(false); + spf.setNamespaceAware(true); + 
spf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + spf.setFeature("http://xml.org/sax/features/use-entity-resolver2", false); + spf.setFeature("http://xml.org/sax/features/validation", false); + spf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); + spf.setFeature("http://xml.org/sax/features/allow-dtd-events-after-endDTD", false); + XMLReader r = spf.newSAXParser().getXMLReader(); +// StreamSource mediainfo_source = new StreamSource(mediainfo_output); + SAXSource mediainfo_sax_source = new SAXSource(r, new InputSource(mediainfo_output.toString())); + TransformerFactory tFactory = TransformerFactory.newInstance(); + //tFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, ""); + //tFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); +// tFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_SCHEMA, ""); +// tFactory.setAttribute(XMLConstants.USE_CATALOG, ""); + Transformer transformer = tFactory.newTransformer(stylesource); + +/* ok, mediainfo is loaded correctly, and xslt loaded too */ + OutputStream debugFile = new FileOutputStream("DEBUG.xml"); + StreamResult result = new StreamResult( debugFile); +// StreamResult result = new StreamResult(System.out); /* FIXME , use StringOutputStream */ + transformer.transform(mediainfo_sax_source, result); + attributes.put("key", "value"); } catch (InterruptedException e) { e.printStackTrace(); @@ -176,25 +187,31 @@ public class SLUBTechnicalMetadataExtractorMediaConchPlugin implements MDExtract * @return string with clamd version and signature version */ public String getAgent() { - String response=""; - response+="mediaconch:\n"; + StringBuilder response= new StringBuilder(); + response.append("mediaconch:\n"); try { - String execstring = this.mediaconch_binary_path + " -v"; - Process p = Runtime.getRuntime().exec(execstring); - p.waitFor(); - BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream())); - String 
line=reader.readLine(); - while (line != null) { - System.out.println(line); - response+=line; - line = reader.readLine(); + String[] executables = { + this.mediaconch_binary_path, + this.mediainfo_binary_path + }; + for(String executable: executables){ + String execstring = executable + " --Version"; + Process p = Runtime.getRuntime().exec(execstring); + p.waitFor(); + BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream())); + String line = reader.readLine(); + while (line != null) { + System.out.println(line); + response.append(line); + line = reader.readLine(); + } } } catch (IOException e) { //log.error("exception creation socket, clamd not available at host=" + host + "port=" + port, e); } catch (InterruptedException e) { e.printStackTrace(); } - return response.trim(); + return response.toString().trim(); } @Override @@ -217,129 +234,113 @@ public class SLUBTechnicalMetadataExtractorMediaConchPlugin implements MDExtract */ @Override public List<String> getSupportedAttributeNames() { - //return new ArrayList<String>(attributes.keySet()); + //return new ArrayList<String>(attributes.keySet()); List<String> available = new ArrayList<String>(); - //available.add("checkit-tiff-conf"); - available.add("format.bit_rate"); - available.add("format.duration"); - available.add("format.filename"); - available.add("format.format_long_name"); - available.add("format.format_name"); - available.add("format.nb_programs"); - available.add("format.nb_streams"); - available.add("format.probe_score"); - available.add("format.size"); - available.add("format.start_time"); - available.add("format.tags.DATE"); - available.add("format.tags.ENCODED_BY"); - available.add("format.tags.ENCODER"); - available.add("format.tags.MAJOR_BRAND"); - available.add("format.tags.MINOR_VERSION"); - available.add("format.tags.ORIGINATOR_REFERENCE"); - available.add("format.tags.TIME_REFERENCE"); - available.add("streams.stream.0.avg_frame_rate"); - 
available.add("streams.stream.0.bit_rate"); - available.add("streams.stream.0.bits_per_raw_sample"); - available.add("streams.stream.0.bits_per_sample"); - available.add("streams.stream.0.channel_layout"); - available.add("streams.stream.0.channels"); - available.add("streams.stream.0.chroma_location"); - available.add("streams.stream.0.codec_long_name"); - available.add("streams.stream.0.codec_name"); - available.add("streams.stream.0.codec_tag"); - available.add("streams.stream.0.codec_tag_string"); - available.add("streams.stream.0.codec_time_base"); - available.add("streams.stream.0.codec_type"); - available.add("streams.stream.0.coded_height"); - available.add("streams.stream.0.coded_width"); - available.add("streams.stream.0.color_primaries"); - available.add("streams.stream.0.color_range"); - available.add("streams.stream.0.color_space"); - available.add("streams.stream.0.color_transfer"); - available.add("streams.stream.0.display_aspect_ratio"); - available.add("streams.stream.0.disposition.attached_pic"); - available.add("streams.stream.0.disposition.clean_effects"); - available.add("streams.stream.0.disposition.comment"); - available.add("streams.stream.0.disposition.default"); - available.add("streams.stream.0.disposition.dub"); - available.add("streams.stream.0.disposition.forced"); - available.add("streams.stream.0.disposition.hearing_impaired"); - available.add("streams.stream.0.disposition.karaoke"); - available.add("streams.stream.0.disposition.lyrics"); - available.add("streams.stream.0.disposition.original"); - available.add("streams.stream.0.disposition.timed_thumbnails"); - available.add("streams.stream.0.disposition.visual_impaired"); - available.add("streams.stream.0.duration"); - available.add("streams.stream.0.duration_ts"); - available.add("streams.stream.0.field_order"); - available.add("streams.stream.0.has_b_frames"); - available.add("streams.stream.0.height"); - available.add("streams.stream.0.id"); - 
available.add("streams.stream.0.index"); - available.add("streams.stream.0.level"); - available.add("streams.stream.0.max_bit_rate"); - available.add("streams.stream.0.nb_frames"); - available.add("streams.stream.0.nb_read_frames"); - available.add("streams.stream.0.nb_read_packets"); - available.add("streams.stream.0.pix_fmt"); - available.add("streams.stream.0.profile"); - available.add("streams.stream.0.refs"); - available.add("streams.stream.0.r_frame_rate"); - available.add("streams.stream.0.sample_aspect_ratio"); - available.add("streams.stream.0.sample_fmt"); - available.add("streams.stream.0.sample_rate"); - available.add("streams.stream.0.start_pts"); - available.add("streams.stream.0.start_time"); - available.add("streams.stream.0.tags.DURATION"); - available.add("streams.stream.0.tags.ENCODER"); - available.add("streams.stream.0.tags.HANDLER_NAME"); - available.add("streams.stream.0.tags.language"); - available.add("streams.stream.0.tags.TIMECODE"); - available.add("streams.stream.0.time_base"); - available.add("streams.stream.0.timecode"); - available.add("streams.stream.0.width"); - available.add("streams.stream.1.avg_frame_rate"); - available.add("streams.stream.1.bit_rate"); - available.add("streams.stream.1.bits_per_raw_sample"); - available.add("streams.stream.1.bits_per_sample"); - available.add("streams.stream.1.channel_layout"); - available.add("streams.stream.1.channels"); - available.add("streams.stream.1.codec_long_name"); - available.add("streams.stream.1.codec_name"); - available.add("streams.stream.1.codec_tag"); - available.add("streams.stream.1.codec_tag_string"); - available.add("streams.stream.1.codec_time_base"); - available.add("streams.stream.1.codec_type"); - available.add("streams.stream.1.disposition.attached_pic"); - available.add("streams.stream.1.disposition.clean_effects"); - available.add("streams.stream.1.disposition.comment"); - available.add("streams.stream.1.disposition.default"); - 
available.add("streams.stream.1.disposition.dub"); - available.add("streams.stream.1.disposition.forced"); - available.add("streams.stream.1.disposition.hearing_impaired"); - available.add("streams.stream.1.disposition.karaoke"); - available.add("streams.stream.1.disposition.lyrics"); - available.add("streams.stream.1.disposition.original"); - available.add("streams.stream.1.disposition.timed_thumbnails"); - available.add("streams.stream.1.disposition.visual_impaired"); - available.add("streams.stream.1.duration"); - available.add("streams.stream.1.duration_ts"); - available.add("streams.stream.1.id"); - available.add("streams.stream.1.index"); - available.add("streams.stream.1.max_bit_rate"); - available.add("streams.stream.1.nb_frames"); - available.add("streams.stream.1.nb_read_frames"); - available.add("streams.stream.1.nb_read_packets"); - available.add("streams.stream.1.profile"); - available.add("streams.stream.1.r_frame_rate"); - available.add("streams.stream.1.sample_fmt"); - available.add("streams.stream.1.sample_rate"); - available.add("streams.stream.1.start_pts"); - available.add("streams.stream.1.start_time"); - available.add("streams.stream.1.tags.DURATION"); - available.add("streams.stream.1.tags.HANDLER_NAME"); - available.add("streams.stream.1.tags.language"); - available.add("streams.stream.1.time_base"); + available.add("mediainfo.track.Audio.BitDepth"); + available.add("mediainfo.track.Audio.BitRate"); + available.add("mediainfo.track.Audio.BitRate_Mode"); + available.add("mediainfo.track.Audio.Channels"); + available.add("mediainfo.track.Audio.CodecID"); + available.add("mediainfo.track.Audio.Commercial_Name"); + available.add("mediainfo.track.Audio.Compression_Mode"); + available.add("mediainfo.track.Audio.Default"); + available.add("mediainfo.track.Audio.Duration"); + available.add("mediainfo.track.Audio.encoded_library"); + available.add("mediainfo.track.Audio.Forced"); + available.add("mediainfo.track.Audio.Format"); + 
available.add("mediainfo.track.Audio.Format_Profile"); + available.add("mediainfo.track.Audio.Format_Settings_Wrapping"); + available.add("mediainfo.track.Audio.Format_Settings_Sign"); + available.add("mediainfo.track.Audio.Format_Version"); + available.add("mediainfo.track.Audio.FrameRate"); + available.add("mediainfo.track.Audio.ID"); + available.add("mediainfo.track.Audio.SamplingRate"); + available.add("mediainfo.track.Audio.ServiceKind"); + available.add("mediainfo.track.Audio.StreamSize"); + available.add("mediainfo.track.Audio.Title"); + available.add("mediainfo.track.General.AudioCount"); + available.add("mediainfo.track.General.CodecID"); + available.add("mediainfo.track.General.CompleteName"); + available.add("mediainfo.track.General.Description"); + available.add("mediainfo.track.General.Duration"); + available.add("mediainfo.track.General.Encoded_Application"); + available.add("mediainfo.track.General.Encoded_Application_CompanyName"); + available.add("mediainfo.track.General.Encoded_Application_Name"); + available.add("mediainfo.track.General.Encoded_Application_Version"); + available.add("mediainfo.track.General.Encoded_Date"); + available.add("mediainfo.track.General.Encoded_Library"); + available.add("mediainfo.track.General.Encoded_Library_Name"); + available.add("mediainfo.track.General.Encoded_Library_Version"); + available.add("mediainfo.track.General.extra.ErrorDetectionType"); + available.add("mediainfo.track.General.extra.IsTruncated"); + available.add("mediainfo.track.General.extra.bext_Present"); + available.add("mediainfo.track.General.FileSize"); + available.add("mediainfo.track.General.Format"); + available.add("mediainfo.track.General.Format_Profile"); + available.add("mediainfo.track.General.Format_Settings"); + available.add("mediainfo.track.General.Format_Version"); + available.add("mediainfo.track.General.IsStreamable"); + available.add("mediainfo.track.General.MenuCount"); + available.add("mediainfo.track.General.OtherCount"); + 
available.add("mediainfo.track.General.OverallBitRate"); + available.add("mediainfo.track.General.OverallBitRateMode"); + available.add("mediainfo.track.General.PackageName"); + available.add("mediainfo.track.General.Producer"); + available.add("mediainfo.track.General.StreamSize"); + available.add("mediainfo.track.General.TextCount"); + available.add("mediainfo.track.General.Title"); + available.add("mediainfo.track.General.UniqueID"); + available.add("mediainfo.track.General.VideoCount"); + available.add("mediainfo.track.Image.BitDepth"); + available.add("mediainfo.track.Image.ColorSpace"); + available.add("mediainfo.track.Image.colour_primaries"); + available.add("mediainfo.track.Image.Compression_Mode"); + available.add("mediainfo.track.Image.DisplayAspectRatio"); + available.add("mediainfo.track.Image.Encoded_Date"); + available.add("mediainfo.track.Image.Encoded_Library"); + available.add("mediainfo.track.Image.Format"); + available.add("mediainfo.track.Image.Format_Version"); + available.add("mediainfo.track.Image.FrameRate"); + available.add("mediainfo.track.Image.Height"); + available.add("mediainfo.track.Image.StreamSize"); + available.add("mediainfo.track.Image.transfer_characteristics"); + available.add("mediainfo.track.Image.Width"); + available.add("mediainfo.track.Video.BitDepth"); + available.add("mediainfo.track.Video.BitRate"); + available.add("mediainfo.track.Video.BitRate_Mode"); + available.add("mediainfo.track.Video.ChromaSubsampling"); + available.add("mediainfo.track.Video.CodecID"); + available.add("mediainfo.track.Video.ColorSpace"); + available.add("mediainfo.track.Video.Compression_Mode"); + available.add("mediainfo.track.Video.Default"); + available.add("mediainfo.track.Video.DisplayAspectRatio"); + available.add("mediainfo.track.Video.Duration"); + available.add("mediainfo.track.Video.Encoded_Library"); + available.add("mediainfo.track.Video.extra.coder_type"); + available.add("mediainfo.track.Video.extra.ErrorDetectionType"); + 
available.add("mediainfo.track.Video.extra.MaxSlicesCount"); + available.add("mediainfo.track.Video.extra.OriginalSourceMedium"); + available.add("mediainfo.track.Video.Forced"); + available.add("mediainfo.track.Video.Format"); + available.add("mediainfo.track.Video.Format_Profile"); + available.add("mediainfo.track.Video.Format_Settings_GOP"); + available.add("mediainfo.track.Video.Format_Settings_Wrapping"); + available.add("mediainfo.track.Video.Format_Version"); + available.add("mediainfo.track.Video.FrameCount"); + available.add("mediainfo.track.Video.FrameRate"); + available.add("mediainfo.track.Video.Height"); + available.add("mediainfo.track.Video.ID"); + available.add("mediainfo.track.Video.PixelAspectRatio"); + available.add("mediainfo.track.Video.PixelAspectRatioOriginal"); + available.add("mediainfo.track.Video.ScanOrder"); + available.add("mediainfo.track.Video.ScanType"); + available.add("mediainfo.track.Video.Standard"); + available.add("mediainfo.track.Video.StreamSize"); + available.add("mediainfo.track.Video.TimeCode_FirstFrame"); + available.add("mediainfo.track.Video.TimeCode_Source"); + available.add("mediainfo.track.Video.Title"); + available.add("mediainfo.track.Video.transfer_characteristics"); + available.add("mediainfo.track.Video.Width"); return available; } @@ -381,7 +382,7 @@ public class SLUBTechnicalMetadataExtractorMediaConchPlugin implements MDExtract Map<String, String> initp = new HashMap<String, String>(); initp.put( "mediaconch_binary_path", "/usr/bin/mediaconch"); initp.put( "mediaconch_profile_path", "/etc/mediaconch/profile.xml"); - initp.put( "ffprobe_binary_path", "/usr/bin/ffprobe"); + initp.put( "mediainfo_binary_path", "/usr/bin/mediainfo"); plugin.initParams( initp ); System.out.println("Agent: '" + plugin.getAgent() + "'"); System.out.println(); -- GitLab