Skip to content
Snippets Groups Projects
Commit 12d026cf authored by Andreas Romeyke's avatar Andreas Romeyke
Browse files

- fixed and improved reppid findings

- use compiled xpaths
- use precompiled xpathcontext
- use nodeValue() instead of triggering findvalue()
- handle purged IEs
- handle all variants of localreps (LOCAL and LZA and LZA_INTERN)
parent 621288d6
No related branches found
No related tags found
No related merge requests found
...@@ -233,28 +233,11 @@ sub check_if_db_conform ($$) { ...@@ -233,28 +233,11 @@ sub check_if_db_conform ($$) {
return; return;
} }
{
############################################################################### my $xp;
# sub get_xpath_context {
# /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/dc:record[1]/dc:title[1] if (defined $xp) { return $xp};
# /mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[2] $xp = XML::LibXML::XPathContext->new();
# mit ID=Label und Wert = LOCAL
# dort die ID von techMD (Referenz für Files)
#
# Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1]
#
###############################################################################
sub parse_iexml ($$) {
my $filename = shift;
my $recovery_flag = shift;
# create object
#
#my $xp = XML::XPath->new (filename => $filename);
if ($recovery_flag) {
$recoverflag=2; # avoid warnings, see XML::LibXML::Parser POD about 'recovery'
}
my $dom = XML::LibXML->load_xml (location => $filename, recover => $recovery_flag, no_blanks=>1, compact=>1);
my $xp = XML::LibXML::XPathContext->new($dom);
$xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx"); $xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx");
$xp->registerNs("sru", "http://www.loc.gov/zing/srw/"); $xp->registerNs("sru", "http://www.loc.gov/zing/srw/");
$xp->registerNs("xsi", "http://www.w3.org/2001/XMLSchema-instance"); $xp->registerNs("xsi", "http://www.w3.org/2001/XMLSchema-instance");
...@@ -270,20 +253,62 @@ sub parse_iexml ($$) { ...@@ -270,20 +253,62 @@ sub parse_iexml ($$) {
$xp->registerNs("mix", "http://www.loc.gov/standards/mix/"); $xp->registerNs("mix", "http://www.loc.gov/standards/mix/");
$xp->registerNs("xlink", "http://www.w3.org/1999/xlink"); $xp->registerNs("xlink", "http://www.w3.org/1999/xlink");
$xp->registerNs("xlin", "http://www.w3.org/1999/xlink"); $xp->registerNs("xlin", "http://www.w3.org/1999/xlink");
return $xp;
}
}
###############################################################################
#
# /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/dc:record[1]/dc:title[1]
# /mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[2]
# mit ID=Label und Wert = LOCAL
# dort die ID von techMD (Referenz für Files)
#
# Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1]
#
###############################################################################
{
my $compiled_xpath_titles = XML::LibXML::XPathExpression->new('/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]');
my $compiled_xpath_dcrecords = XML::LibXML::XPathExpression->new('/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*');
my $compiled_xpath_amdsecs = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[starts-with(@ID, \'REP\')]');
my $str_local_reps = 'mets:techMD/mets:mdWrap/mets:xmlData/*[local-name()=\'dnx\']/*[local-name()=\'section\']/*[local-name()=\'record\']/*[local-name()=\'key\' and @id=\'label\']';
my $compiled_xpath_localreps = XML::LibXML::XPathExpression->new( $str_local_reps);
my $compiled_xpath_localreps2 = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec/mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']=\'LOCAL\'');
my $compiled_xpath_filegrps = XML::LibXML::XPathExpression->new('/mets:mets/mets:fileSec/mets:fileGrp');
my $compiled_xpath_flocat = XML::LibXML::XPathExpression->new('mets:file/mets:FLocat');
my $compiled_xpath_id = XML::LibXML::XPathExpression->new('@ID');
my $compiled_xpath_admid = XML::LibXML::XPathExpression->new('@ADMID');
my $compiled_xpath_xlinhref = XML::LibXML::XPathExpression->new('@xlin:href');
my $compiled_xpath_dot = XML::LibXML::XPathExpression->new('.');
sub parse_iexml($$) {
my $filename = shift;
my $recovery_flag = shift;
if ($recovery_flag) {
$recovery_flag = 2; # avoid warnings, see XML::LibXML::Parser POD about 'recovery'
}
my $dom = XML::LibXML->load_xml(
location => $filename,
recover => $recovery_flag,
no_blanks => 1,
compact => 1,
no_network => 1,
);
my $xp = get_xpath_context();
$xp->setContextNode($dom);
############################################ ############################################
# get title # get title
my $compiled_xpath_titles = '/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]';
my $title = $xp->findvalue($compiled_xpath_titles); my $title = $xp->findvalue($compiled_xpath_titles);
check_if_db_conform($title, $filename); check_if_db_conform($title, $filename);
############################################ ############################################
# get dc-records # get dc-records
my @dcrecords; my @dcrecords;
my $compiled_xpath_dcrecords='/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*';
my $dcnodes = $xp->find($compiled_xpath_dcrecords); my $dcnodes = $xp->find($compiled_xpath_dcrecords);
foreach my $dcnode ($dcnodes->get_nodelist) { foreach my $dcnode ($dcnodes->get_nodelist) {
#my $ref = ref $dcnode; use Data::Printer; p( $ref);
my $key = $dcnode->getName(); my $key = $dcnode->getName();
my $value = $dcnode->findvalue("."); #my $value = $dcnode->findvalue($compiled_xpath_dot);
my $value = $dcnode->nodeValue;
if (defined $value) { if (defined $value) {
$value =~ s/\n/ /g; $value =~ s/\n/ /g;
$value =~ s/'/\\'/g; $value =~ s/'/\\'/g;
...@@ -296,41 +321,46 @@ sub parse_iexml ($$) { ...@@ -296,41 +321,46 @@ sub parse_iexml ($$) {
} }
############################################ ############################################
# get right representation ID (has a dnx-section with <key id=label>LOCAL</key>) # get right representation ID (has a dnx-section with <key id=label>LOCAL</key>)
my $compiled_xpath_amdsecs = '/mets:mets/mets:amdSec'; my $repids = $xp->find($compiled_xpath_amdsecs); #/mets:mets/mets:amdSec
my $compiled_xpath_localreps = 'mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']';
my $compiled_xpath_filegrps = '/mets:mets/mets:fileSec/mets:fileGrp';
my $compiled_xpath_flocat = 'mets:file/mets:FLocat';
my $repids = $xp->find($compiled_xpath_amdsecs);
my $repid; my $repid;
# FIXME: if only one represenation exists (Qucosa), select this. If there
# are more than one, use them with label LOCAL
my @repnodes = $repids->get_nodelist; my @repnodes = $repids->get_nodelist;
if (scalar @repnodes == 0) {
$repid = $repnodes[0]->findvalue('@ID' ); say STDERR "No reppid found in file $filename, is IE purged?";
my %tmp;
$tmp{"filename"}=$filename;
$tmp{"purged"}=1;
return \%tmp;
} elsif (scalar @repnodes == 1) {
$repid = $repnodes[0]->findvalue($compiled_xpath_id);
} else { #multiple representations found
# choose reppid with LZA, LZA_INTERN or LOCAL (for very old IEs)
foreach my $node (@repnodes) { foreach my $node (@repnodes) {
my $id = $node->findvalue('@ID' ); my $id = $node->findvalue($compiled_xpath_id);
check_if_db_conform($id, $filename); check_if_db_conform($id, $filename);
#/mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[1] my $localreps = $node->findvalue($compiled_xpath_localreps);
# if (
($localreps eq 'LOCAL')
if ($node->findvalue($compiled_xpath_localreps) eq 'LOCAL') { or ($localreps eq 'LZA')
or ($localreps eq 'LZA_INTERN')
) {
$repid = $id; $repid = $id;
last;
} }
#print XML::XPath::XMLParser::as_string($node), "\n\n"; #print XML::XPath::XMLParser::as_string($node), "\n\n";
} }
}
if (!defined $repid) {
say "No repid found in file $filename";
}
############################################ ############################################
# get all files of LOCAL representation # get all files of LOCAL representation
my @files; my @files;
my $filegrpnodes = $xp->find($compiled_xpath_filegrps); my $filegrpnodes = $xp->find($compiled_xpath_filegrps);
foreach my $filegrpnode ($filegrpnodes->get_nodelist) { foreach my $filegrpnode ($filegrpnodes->get_nodelist) {
#die XML::XPath::XMLParser::as_string($filegrpnode), "\n\n"; if ($filegrpnode->findvalue($compiled_xpath_admid) eq $repid) {
#die Dumper($filegrpnode);
if ($filegrpnode->findvalue('@ADMID') eq $repid) {
#die Dumper($filegrpnode);
my $filesnodes = $filegrpnode->find($compiled_xpath_flocat); my $filesnodes = $filegrpnode->find($compiled_xpath_flocat);
foreach my $filesnode ($filesnodes->get_nodelist) { foreach my $filesnode ($filesnodes->get_nodelist) {
my $value = $filesnode->findvalue('@xlin:href'); my $value = $filesnode->findvalue($compiled_xpath_xlinhref);
check_if_db_conform($value, $filename); check_if_db_conform($value, $filename);
push @files, sprintf("%s", $value); push @files, sprintf("%s", $value);
} }
...@@ -344,6 +374,7 @@ sub parse_iexml ($$) { ...@@ -344,6 +374,7 @@ sub parse_iexml ($$) {
$ret{"dcrecords"} = \@dcrecords; $ret{"dcrecords"} = \@dcrecords;
return \%ret; return \%ret;
} }
}
############################################################################### ###############################################################################
# because ExLibris Rosetta produces filenames of following format: # because ExLibris Rosetta produces filenames of following format:
...@@ -398,6 +429,7 @@ sub searching_ie_files ($$) { ...@@ -398,6 +429,7 @@ sub searching_ie_files ($$) {
my $tmp_ies_unsorted_file = shift; my $tmp_ies_unsorted_file = shift;
my $cnt_unsorted_files = 0; my $cnt_unsorted_files = 0;
my $first_two_levels_of_dirs = 0; my $first_two_levels_of_dirs = 0;
###
my $wanted_twolevel_dircount = sub { my $wanted_twolevel_dircount = sub {
my $relpath = $File::Find::name; my $relpath = $File::Find::name;
$relpath =~ s{^\Q$dir\E/?}{}; $relpath =~ s{^\Q$dir\E/?}{};
...@@ -407,9 +439,11 @@ sub searching_ie_files ($$) { ...@@ -407,9 +439,11 @@ sub searching_ie_files ($$) {
and $File::Find::prune = 1; and $File::Find::prune = 1;
if (-d $_) { $first_two_levels_of_dirs++;} if (-d $_) { $first_two_levels_of_dirs++;}
}; };
###
find( $wanted_twolevel_dircount, $dir); find( $wanted_twolevel_dircount, $dir);
my $progressbar=Time::Progress->new(min => 0, max => $first_two_levels_of_dirs, smoothing => 1); my $progressbar=Time::Progress->new(min => 0, max => $first_two_levels_of_dirs, smoothing => 1);
my $dircount = 0; my $dircount = 0;
###
my $wanted_process_sip = sub { my $wanted_process_sip = sub {
if (-f && m/V(\d+)-IE\d+\.xml$/) { if (-f && m/V(\d+)-IE\d+\.xml$/) {
my $version = $1; my $version = $1;
...@@ -430,6 +464,7 @@ sub searching_ie_files ($$) { ...@@ -430,6 +464,7 @@ sub searching_ie_files ($$) {
} }
return; return;
}; };
###
find($wanted_process_sip, $dir); find($wanted_process_sip, $dir);
say ""; say "";
return $cnt_unsorted_files; return $cnt_unsorted_files;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment