diff --git a/perl/exit_strategy.pl b/perl/exit_strategy.pl
index 7b447f4c2fa58e77b09d30bea38ff4a31ea7cce9..cbbbd03ce8e642e0bd639792d84ed510097b2a67 100644
--- a/perl/exit_strategy.pl
+++ b/perl/exit_strategy.pl
@@ -233,6 +233,35 @@ sub check_if_db_conform ($$) {
     return;
 }
 
+{
+    # XPath context shared by all callers; built on first use so that the
+    # namespace prefixes below are registered only once.
+    my $xp;
+
+    # Returns the shared XML::LibXML::XPathContext. It is created without a
+    # context node, so callers have to call setContextNode() themselves; the
+    # object keeps the node set by the most recent caller.
+    sub get_xpath_context {
+        return $xp if defined $xp;
+        $xp = XML::LibXML::XPathContext->new();
+        $xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx");
+        $xp->registerNs("sru", "http://www.loc.gov/zing/srw/");
+        $xp->registerNs("xsi", "http://www.w3.org/2001/XMLSchema-instance");
+        $xp->registerNs("dc", "http://purl.org/dc/elements/1.1/");
+        $xp->registerNs("mets", "http://www.loc.gov/METS/");
+        $xp->registerNs("rosettamets", "http://www.exlibrisgroup.com/xsd/dps/rosettaMets");
+        $xp->registerNs("mods", "http://www.loc.gov/mods/v3");
+        $xp->registerNs("ns2", "http://dps.exlibris.com/");
+        $xp->registerNs("dv", "http://dfg-viewer.de/");
+        $xp->registerNs("slub", "http://slub-dresden.de/");
+        $xp->registerNs("archive", "http://slub-dresden.de/slubarchiv");
+        $xp->registerNs("premis", "info:lc/xmlns/premis-v2");
+        $xp->registerNs("mix", "http://www.loc.gov/standards/mix/");
+        $xp->registerNs("xlink", "http://www.w3.org/1999/xlink");
+        $xp->registerNs("xlin", "http://www.w3.org/1999/xlink");
+        return $xp;
+    }
+}
 
 ###############################################################################
 #
@@ -244,105 +273,121 @@ sub check_if_db_conform ($$) {
 # Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1]
 #
 ###############################################################################
-sub parse_iexml ($$) {
-    my $filename = shift;
-    my $recovery_flag = shift;
-    # create object
-    #
-    #my $xp = XML::XPath->new (filename => $filename);
-    if ($recovery_flag) {
-        $recoverflag=2; # avoid warnings, see XML::LibXML::Parser POD about 'recovery'
-    }
-    my $dom = XML::LibXML->load_xml (location => $filename, recover => $recovery_flag, no_blanks=>1, compact=>1);
-    my $xp = XML::LibXML::XPathContext->new($dom);
-    $xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx");
-    $xp->registerNs("sru", "http://www.loc.gov/zing/srw/");
-    $xp->registerNs("xsi", "http://www.w3.org/2001/XMLSchema-instance");
-    $xp->registerNs("dc", "http://purl.org/dc/elements/1.1/");
-    $xp->registerNs("mets", "http://www.loc.gov/METS/");
-    $xp->registerNs("rosettamets", "http://www.exlibrisgroup.com/xsd/dps/rosettaMets");
-    $xp->registerNs("mods", "http://www.loc.gov/mods/v3");
-    $xp->registerNs("ns2", "http://dps.exlibris.com/");
-    $xp->registerNs("dv", "http://dfg-viewer.de/");
-    $xp->registerNs("slub", "http://slub-dresden.de/");
-    $xp->registerNs("archive", "http://slub-dresden.de/slubarchiv");
-    $xp->registerNs("premis", "info:lc/xmlns/premis-v2");
-    $xp->registerNs("mix", "http://www.loc.gov/standards/mix/");
-    $xp->registerNs("xlink", "http://www.w3.org/1999/xlink");
-    $xp->registerNs("xlin", "http://www.w3.org/1999/xlink");
-
-    ############################################
-    # get title
-    my $compiled_xpath_titles = '/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]';
-    my $title = $xp->findvalue($compiled_xpath_titles);
-    check_if_db_conform($title, $filename);
-    ############################################
-    # get dc-records
-    my @dcrecords;
-    my $compiled_xpath_dcrecords='/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*';
-    my $dcnodes = $xp->find($compiled_xpath_dcrecords);
-    foreach my $dcnode ($dcnodes->get_nodelist) {
-        my $key = $dcnode->getName();
-        my $value = $dcnode->findvalue(".");
-        if (defined $value) {
-            $value =~ s/\n/ /g;
-            $value =~ s/'/\\'/g;
-            check_if_db_conform($value, $filename);
-            my @pair;
-            push @pair, $key;
-            push @pair, $value;
-            push @dcrecords, \@pair;
+{
+    # XPath expressions used by parse_iexml(). They are compiled once here
+    # and reused for every file instead of being re-parsed per call.
+    # NOTE(review): a bare block initialises these lexicals only when it is
+    # executed at run time -- confirm parse_iexml() is never called earlier.
+    my $compiled_xpath_titles = XML::LibXML::XPathExpression->new('/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]');
+    my $compiled_xpath_dcrecords = XML::LibXML::XPathExpression->new('/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*');
+    my $compiled_xpath_amdsecs = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[starts-with(@ID, \'REP\')]');
+    my $str_local_reps = 'mets:techMD/mets:mdWrap/mets:xmlData/*[local-name()=\'dnx\']/*[local-name()=\'section\']/*[local-name()=\'record\']/*[local-name()=\'key\' and @id=\'label\']';
+    my $compiled_xpath_localreps = XML::LibXML::XPathExpression->new( $str_local_reps);
+    my $compiled_xpath_filegrps = XML::LibXML::XPathExpression->new('/mets:mets/mets:fileSec/mets:fileGrp');
+    my $compiled_xpath_flocat = XML::LibXML::XPathExpression->new('mets:file/mets:FLocat');
+    my $compiled_xpath_id = XML::LibXML::XPathExpression->new('@ID');
+    my $compiled_xpath_admid = XML::LibXML::XPathExpression->new('@ADMID');
+    my $compiled_xpath_xlinhref = XML::LibXML::XPathExpression->new('@xlin:href');
+
+    # Parses one IE METS file and collects what is needed from it.
+    #
+    # Arguments:
+    #   $filename      - path of the XML file, passed to load_xml()
+    #   $recovery_flag - if true, the parser runs with recover => 2
+    #
+    # Returns a hash reference with the keys filename, title, repid, files
+    # (array ref of xlin:href values) and dcrecords (array ref of
+    # [name, value] pairs). If no amdSec with an ID starting with 'REP' is
+    # found, only filename and purged => 1 are returned.
+    sub parse_iexml($$) {
+        my $filename = shift;
+        my $recovery_flag = shift;
+        if ($recovery_flag) {
+            $recovery_flag = 2; # avoid warnings, see XML::LibXML::Parser POD about 'recovery'
         }
-    }
-    ############################################
-    # get right representation ID (has a dnx-section with <key id=label>LOCAL</key>)
-    my $compiled_xpath_amdsecs = '/mets:mets/mets:amdSec';
-    my $compiled_xpath_localreps = 'mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']';
-    my $compiled_xpath_filegrps = '/mets:mets/mets:fileSec/mets:fileGrp';
-    my $compiled_xpath_flocat = 'mets:file/mets:FLocat';
-
-    my $repids = $xp->find($compiled_xpath_amdsecs);
-    my $repid;
-    # FIXME: if only one represenation exists (Qucosa), select this. If there
-    # are more than one, use them with label LOCAL
-    my @repnodes = $repids->get_nodelist;
+        my $dom = XML::LibXML->load_xml(
+            location => $filename,
+            recover => $recovery_flag,
+            no_blanks => 1,
+            compact => 1,
+            no_network => 1,
+        );
+        my $xp = get_xpath_context();
+        $xp->setContextNode($dom);
+        ############################################
+        # get title
+        my $title = $xp->findvalue($compiled_xpath_titles);
+        check_if_db_conform($title, $filename);
+        ############################################
+        # get dc-records
+        my @dcrecords;
 
-    $repid = $repnodes[0]->findvalue('@ID' );
-    foreach my $node (@repnodes) {
-        my $id = $node->findvalue('@ID' );
-        check_if_db_conform($id, $filename);
-        #/mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[1]
-        #
-
-        if ($node->findvalue($compiled_xpath_localreps) eq 'LOCAL') {
-            $repid=$id;
-        }
-        #print XML::XPath::XMLParser::as_string($node), "\n\n";
-    }
-    ############################################
-    # get all files of LOCAL representation
-    my @files;
-    my $filegrpnodes = $xp->find($compiled_xpath_filegrps);
-    foreach my $filegrpnode ($filegrpnodes->get_nodelist) {
-        #die XML::XPath::XMLParser::as_string($filegrpnode), "\n\n";
-        #die Dumper($filegrpnode);
-        if ($filegrpnode->findvalue('@ADMID') eq $repid) {
-            #die Dumper($filegrpnode);
-            my $filesnodes = $filegrpnode ->find($compiled_xpath_flocat);
-            foreach my $filesnode ($filesnodes->get_nodelist) {
-                my $value = $filesnode->findvalue('@xlin:href');
+        my $dcnodes = $xp->find($compiled_xpath_dcrecords);
+        foreach my $dcnode ($dcnodes->get_nodelist) {
+            my $key = $dcnode->nodeName;
+            # textContent, not nodeValue: nodeValue is undef for element nodes,
+            # which would silently drop every dc record.
+            my $value = $dcnode->textContent;
+            if (defined $value) {
+                $value =~ s/\n/ /g;
+                $value =~ s/'/\\'/g;
                 check_if_db_conform($value, $filename);
-                push @files, sprintf("%s", $value);
+                push @dcrecords, [ $key, $value ];
+            }
+        }
+        ############################################
+        # get right representation ID (has a dnx-section with <key id=label>LOCAL</key>)
+        my $repids = $xp->find($compiled_xpath_amdsecs);
+        my $repid;
+        my @repnodes = $repids->get_nodelist;
+        if (scalar @repnodes == 0) {
+            say STDERR "No reppid found in file $filename, is IE purged?";
+            return { "filename" => $filename, "purged" => 1 };
+        } elsif (scalar @repnodes == 1) {
+            $repid = $repnodes[0]->findvalue($compiled_xpath_id);
+            check_if_db_conform($repid, $filename);
+        } else { #multiple representations found
+            # choose repid with LZA, LZA_INTERN or LOCAL (for very old IEs)
+            foreach my $node (@repnodes) {
+                my $id = $node->findvalue($compiled_xpath_id);
+                check_if_db_conform($id, $filename);
+                my $localreps = $node->findvalue($compiled_xpath_localreps);
+                if (
+                    ($localreps eq 'LOCAL')
+                    or ($localreps eq 'LZA')
+                    or ($localreps eq 'LZA_INTERN')
+                ) {
+                    $repid = $id;
+                    last;
+                }
+            }
+        }
+        if (!defined $repid) {
+            say "No repid found in file $filename";
+        }
+        ############################################
+        # get all files of the chosen representation
+        my @files;
+        my $filegrpnodes = $xp->find($compiled_xpath_filegrps);
+        foreach my $filegrpnode ($filegrpnodes->get_nodelist) {
+            # an undefined repid matches no fileGrp; skip the comparison
+            if (defined $repid and $filegrpnode->findvalue($compiled_xpath_admid) eq $repid) {
+                my $filesnodes = $filegrpnode->find($compiled_xpath_flocat);
+                foreach my $filesnode ($filesnodes->get_nodelist) {
+                    my $value = $filesnode->findvalue($compiled_xpath_xlinhref);
+                    check_if_db_conform($value, $filename);
+                    push @files, sprintf("%s", $value);
+                }
             }
         }
+        my %ret;
+        $ret{"filename" } = $filename;
+        $ret{"title"} = $title;
+        $ret{"repid"} = $repid;
+        $ret{"files"} = \@files;
+        $ret{"dcrecords"} = \@dcrecords;
+        return \%ret;
     }
-    my %ret;
-    $ret{"filename" } = $filename;
-    $ret{"title"} = $title;
-    $ret{"repid"} = $repid;
-    $ret{"files"} = \@files;
-    $ret{"dcrecords"} = \@dcrecords;
-    return \%ret;
 }
 
 ###############################################################################
@@ -398,6 +443,7 @@ sub searching_ie_files ($$) {
     my $tmp_ies_unsorted_file = shift;
     my $cnt_unsorted_files = 0;
     my $first_two_levels_of_dirs = 0;
+    ###
     my $wanted_twolevel_dircount = sub {
         my $relpath = $File::Find::name;
         $relpath =~ s{^\Q$dir\E/?}{};
@@ -407,9 +453,11 @@ sub searching_ie_files ($$) {
             and $File::Find::prune = 1;
         if (-d $_) { $first_two_levels_of_dirs++;}
     };
+    ###
     find( $wanted_twolevel_dircount, $dir);
     my $progressbar=Time::Progress->new(min => 0, max => $first_two_levels_of_dirs, smoothing => 1);
     my $dircount = 0;
+    ###
     my $wanted_process_sip = sub {
         if (-f && m/V(\d+)-IE\d+\.xml$/) {
             my $version = $1;
@@ -430,6 +478,7 @@ sub searching_ie_files ($$) {
         }
         return;
     };
+    ###
     find($wanted_process_sip, $dir);
     say "";
     return $cnt_unsorted_files;