Skip to content
Snippets Groups Projects
Commit 12d026cf authored by Andreas Romeyke's avatar Andreas Romeyke
Browse files

- fixed and improved reppid findings

- use compiled xpaths
- use precompiled xpathcontext
- use nodeValue() instead of triggering findvalue()
- handle purged IEs
- handle all variants of localreps (LOCAL, LZA and LZA_INTERN)
parent 621288d6
No related branches found
No related tags found
No related merge requests found
......@@ -233,6 +233,29 @@ sub check_if_db_conform ($$) {
return;
}
{
    # Single XPath context shared by every caller; created on first request.
    my $shared_context;

    # get_xpath_context()
    #
    # Returns the cached XML::LibXML::XPathContext. On the first call the
    # context is created without a context node and all namespace prefixes
    # used by the XPath expressions of this script are registered on it.
    # Later calls return the very same object.
    sub get_xpath_context {
        return $shared_context if defined $shared_context;

        # Prefix/URI pairs in registration order. Note that both "xlink"
        # and "xlin" are bound to the same XLink namespace URI.
        my @namespace_bindings = (
            [ "dnx",         "http://www.exlibrisgroup.com/dps/dnx" ],
            [ "sru",         "http://www.loc.gov/zing/srw/" ],
            [ "xsi",         "http://www.w3.org/2001/XMLSchema-instance" ],
            [ "dc",          "http://purl.org/dc/elements/1.1/" ],
            [ "mets",        "http://www.loc.gov/METS/" ],
            [ "rosettamets", "http://www.exlibrisgroup.com/xsd/dps/rosettaMets" ],
            [ "mods",        "http://www.loc.gov/mods/v3" ],
            [ "ns2",         "http://dps.exlibris.com/" ],
            [ "dv",          "http://dfg-viewer.de/" ],
            [ "slub",        "http://slub-dresden.de/" ],
            [ "archive",     "http://slub-dresden.de/slubarchiv" ],
            [ "premis",      "info:lc/xmlns/premis-v2" ],
            [ "mix",         "http://www.loc.gov/standards/mix/" ],
            [ "xlink",       "http://www.w3.org/1999/xlink" ],
            [ "xlin",        "http://www.w3.org/1999/xlink" ],
        );

        # Store the new context before registering, so the cache variable is
        # populated as soon as the object exists.
        $shared_context = XML::LibXML::XPathContext->new();
        foreach my $binding (@namespace_bindings) {
            my ($prefix, $uri) = @{$binding};
            $shared_context->registerNs($prefix, $uri);
        }
        return $shared_context;
    }
}
###############################################################################
#
......@@ -244,105 +267,113 @@ sub check_if_db_conform ($$) {
# Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1]
#
###############################################################################
sub parse_iexml ($$) {
my $filename = shift;
my $recovery_flag = shift;
# create object
#
#my $xp = XML::XPath->new (filename => $filename);
if ($recovery_flag) {
$recoverflag=2; # avoid warnings, see XML::LibXML::Parser POD about 'recovery'
}
my $dom = XML::LibXML->load_xml (location => $filename, recover => $recovery_flag, no_blanks=>1, compact=>1);
my $xp = XML::LibXML::XPathContext->new($dom);
$xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx");
$xp->registerNs("sru", "http://www.loc.gov/zing/srw/");
$xp->registerNs("xsi", "http://www.w3.org/2001/XMLSchema-instance");
$xp->registerNs("dc", "http://purl.org/dc/elements/1.1/");
$xp->registerNs("mets", "http://www.loc.gov/METS/");
$xp->registerNs("rosettamets", "http://www.exlibrisgroup.com/xsd/dps/rosettaMets");
$xp->registerNs("mods", "http://www.loc.gov/mods/v3");
$xp->registerNs("ns2", "http://dps.exlibris.com/");
$xp->registerNs("dv", "http://dfg-viewer.de/");
$xp->registerNs("slub", "http://slub-dresden.de/");
$xp->registerNs("archive", "http://slub-dresden.de/slubarchiv");
$xp->registerNs("premis", "info:lc/xmlns/premis-v2");
$xp->registerNs("mix", "http://www.loc.gov/standards/mix/");
$xp->registerNs("xlink", "http://www.w3.org/1999/xlink");
$xp->registerNs("xlin", "http://www.w3.org/1999/xlink");
############################################
# get title
my $compiled_xpath_titles = '/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]';
my $title = $xp->findvalue($compiled_xpath_titles);
check_if_db_conform($title, $filename);
############################################
# get dc-records
my @dcrecords;
my $compiled_xpath_dcrecords='/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*';
my $dcnodes = $xp->find($compiled_xpath_dcrecords);
foreach my $dcnode ($dcnodes->get_nodelist) {
my $key = $dcnode->getName();
my $value = $dcnode->findvalue(".");
if (defined $value) {
$value =~ s/\n/ /g;
$value =~ s/'/\\'/g;
check_if_db_conform($value, $filename);
my @pair;
push @pair, $key;
push @pair, $value;
push @dcrecords, \@pair;
{
my $compiled_xpath_titles = XML::LibXML::XPathExpression->new('/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]');
my $compiled_xpath_dcrecords = XML::LibXML::XPathExpression->new('/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*');
my $compiled_xpath_amdsecs = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[starts-with(@ID, \'REP\')]');
my $str_local_reps = 'mets:techMD/mets:mdWrap/mets:xmlData/*[local-name()=\'dnx\']/*[local-name()=\'section\']/*[local-name()=\'record\']/*[local-name()=\'key\' and @id=\'label\']';
my $compiled_xpath_localreps = XML::LibXML::XPathExpression->new( $str_local_reps);
my $compiled_xpath_localreps2 = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec/mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']=\'LOCAL\'');
my $compiled_xpath_filegrps = XML::LibXML::XPathExpression->new('/mets:mets/mets:fileSec/mets:fileGrp');
my $compiled_xpath_flocat = XML::LibXML::XPathExpression->new('mets:file/mets:FLocat');
my $compiled_xpath_id = XML::LibXML::XPathExpression->new('@ID');
my $compiled_xpath_admid = XML::LibXML::XPathExpression->new('@ADMID');
my $compiled_xpath_xlinhref = XML::LibXML::XPathExpression->new('@xlin:href');
my $compiled_xpath_dot = XML::LibXML::XPathExpression->new('.');
sub parse_iexml($$) {
my $filename = shift;
my $recovery_flag = shift;
if ($recovery_flag) {
$recovery_flag = 2; # avoid warnings, see XML::LibXML::Parser POD about 'recovery'
}
}
############################################
# get right representation ID (has a dnx-section with <key id=label>LOCAL</key>)
my $compiled_xpath_amdsecs = '/mets:mets/mets:amdSec';
my $compiled_xpath_localreps = 'mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']';
my $compiled_xpath_filegrps = '/mets:mets/mets:fileSec/mets:fileGrp';
my $compiled_xpath_flocat = 'mets:file/mets:FLocat';
my $repids = $xp->find($compiled_xpath_amdsecs);
my $repid;
# FIXME: if only one representation exists (Qucosa), select it. If there
# is more than one, use the one with label LOCAL
my @repnodes = $repids->get_nodelist;
my $dom = XML::LibXML->load_xml(
location => $filename,
recover => $recovery_flag,
no_blanks => 1,
compact => 1,
no_network => 1,
);
my $xp = get_xpath_context();
$xp->setContextNode($dom);
############################################
# get title
my $title = $xp->findvalue($compiled_xpath_titles);
check_if_db_conform($title, $filename);
############################################
# get dc-records
my @dcrecords;
$repid = $repnodes[0]->findvalue('@ID' );
foreach my $node (@repnodes) {
my $id = $node->findvalue('@ID' );
check_if_db_conform($id, $filename);
#/mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[1]
#
if ($node->findvalue($compiled_xpath_localreps) eq 'LOCAL') {
$repid=$id;
}
#print XML::XPath::XMLParser::as_string($node), "\n\n";
}
############################################
# get all files of LOCAL representation
my @files;
my $filegrpnodes = $xp->find($compiled_xpath_filegrps);
foreach my $filegrpnode ($filegrpnodes->get_nodelist) {
#die XML::XPath::XMLParser::as_string($filegrpnode), "\n\n";
#die Dumper($filegrpnode);
if ($filegrpnode->findvalue('@ADMID') eq $repid) {
#die Dumper($filegrpnode);
my $filesnodes = $filegrpnode ->find($compiled_xpath_flocat);
foreach my $filesnode ($filesnodes->get_nodelist) {
my $value = $filesnode->findvalue('@xlin:href');
my $dcnodes = $xp->find($compiled_xpath_dcrecords);
foreach my $dcnode ($dcnodes->get_nodelist) {
#my $ref = ref $dcnode; use Data::Printer; p( $ref);
my $key = $dcnode->getName();
#my $value = $dcnode->findvalue($compiled_xpath_dot);
my $value = $dcnode->nodeValue;
if (defined $value) {
$value =~ s/\n/ /g;
$value =~ s/'/\\'/g;
check_if_db_conform($value, $filename);
push @files, sprintf("%s", $value);
my @pair;
push @pair, $key;
push @pair, $value;
push @dcrecords, \@pair;
}
}
############################################
# get right representation ID (has a dnx-section with <key id=label> of LOCAL, LZA or LZA_INTERN)
my $repids = $xp->find($compiled_xpath_amdsecs); #/mets:mets/mets:amdSec
my $repid;
my @repnodes = $repids->get_nodelist;
if (scalar @repnodes == 0) {
say STDERR "No reppid found in file $filename, is IE purged?";
my %tmp;
$tmp{"filename"}=$filename;
$tmp{"purged"}=1;
return \%tmp;
} elsif (scalar @repnodes == 1) {
$repid = $repnodes[0]->findvalue($compiled_xpath_id);
} else { #multiple representations found
# choose reppid with LZA, LZA_INTERN or LOCAL (for very old IEs)
foreach my $node (@repnodes) {
my $id = $node->findvalue($compiled_xpath_id);
check_if_db_conform($id, $filename);
my $localreps = $node->findvalue($compiled_xpath_localreps);
if (
($localreps eq 'LOCAL')
or ($localreps eq 'LZA')
or ($localreps eq 'LZA_INTERN')
) {
$repid = $id;
last;
}
#print XML::XPath::XMLParser::as_string($node), "\n\n";
}
}
if (!defined $repid) {
say "No repid found in file $filename";
}
############################################
# get all files of LOCAL representation
my @files;
my $filegrpnodes = $xp->find($compiled_xpath_filegrps);
foreach my $filegrpnode ($filegrpnodes->get_nodelist) {
if ($filegrpnode->findvalue($compiled_xpath_admid) eq $repid) {
my $filesnodes = $filegrpnode->find($compiled_xpath_flocat);
foreach my $filesnode ($filesnodes->get_nodelist) {
my $value = $filesnode->findvalue($compiled_xpath_xlinhref);
check_if_db_conform($value, $filename);
push @files, sprintf("%s", $value);
}
}
}
my %ret;
$ret{"filename" } = $filename;
$ret{"title"} = $title;
$ret{"repid"} = $repid;
$ret{"files"} = \@files;
$ret{"dcrecords"} = \@dcrecords;
return \%ret;
}
my %ret;
$ret{"filename" } = $filename;
$ret{"title"} = $title;
$ret{"repid"} = $repid;
$ret{"files"} = \@files;
$ret{"dcrecords"} = \@dcrecords;
return \%ret;
}
###############################################################################
......@@ -398,6 +429,7 @@ sub searching_ie_files ($$) {
my $tmp_ies_unsorted_file = shift;
my $cnt_unsorted_files = 0;
my $first_two_levels_of_dirs = 0;
###
my $wanted_twolevel_dircount = sub {
my $relpath = $File::Find::name;
$relpath =~ s{^\Q$dir\E/?}{};
......@@ -407,9 +439,11 @@ sub searching_ie_files ($$) {
and $File::Find::prune = 1;
if (-d $_) { $first_two_levels_of_dirs++;}
};
###
find( $wanted_twolevel_dircount, $dir);
my $progressbar=Time::Progress->new(min => 0, max => $first_two_levels_of_dirs, smoothing => 1);
my $dircount = 0;
###
my $wanted_process_sip = sub {
if (-f && m/V(\d+)-IE\d+\.xml$/) {
my $version = $1;
......@@ -430,6 +464,7 @@ sub searching_ie_files ($$) {
}
return;
};
###
find($wanted_process_sip, $dir);
say "";
return $cnt_unsorted_files;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment