From c98cdcb37bfe7d00db4430c2236050b2c623495f Mon Sep 17 00:00:00 2001 From: Andreas Romeyke <andreas.romeyke@slub-dresden.de> Date: Wed, 10 Apr 2024 17:24:14 +0200 Subject: [PATCH] - improved xpaths in get_filesize() --- perl/exit_strategy.pl | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/perl/exit_strategy.pl b/perl/exit_strategy.pl index 5c95209..2b621e2 100644 --- a/perl/exit_strategy.pl +++ b/perl/exit_strategy.pl @@ -61,7 +61,6 @@ sub check_lzaid ($lza_id) { return ($lza_id =~ m/^SLUB:LZA:$rx_up:$rx_lw:$rx_lw$/); }; - STDOUT->autoflush(1); # guarantee, that output will be UTF8 binmode(STDOUT, ":encoding(UTF-8)"); @@ -467,17 +466,31 @@ sub check_if_db_conform ($string, $filename) { sub get_file_path($xp, $fsp, $filepid) { return $xp->findvalue("mets:fileGrp/mets:file[\@ID=\"$filepid\"]/mets:FLocat/\@xlin:href", $fsp); } + sub get_file_path_rx($xml, $filepid) { + my $nt = qr{[^>]*}; + if ($xml=~m{(?:<mets:file${nt}ID="$filepid"$nt>\s*<mets:FLocat${nt}xlin:href=")([^"]*)}s) { + #say "path $1"; + return $1; + } + } sub get_filesize ($xp, $filepid) { my $xpath =<<"XPATH"; - /mets:mets/mets:amdSec[starts-with(\@ID, \'$filepid\')]/mets:techMD[\@ID=\"$filepid-amd-tech\"] + /mets:mets/mets:amdSec[\@ID='$filepid-amd']/mets:techMD[\@ID='$filepid-amd-tech'] /mets:mdWrap/mets:xmlData/*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='dnx'] - /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='section'] + /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='section' and \@id='generalFileCharacteristics'] /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='record'] /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='key' and \@id='fileSizeBytes']/text() XPATH return $xp->findvalue($xpath); } + sub get_filesize_rx($xml, $filepid) { + #say "size -"; + if ($xml=~m{(?:<mets:techMD\s*ID="$filepid-amd-tech">.*?<key id="fileSizeBytes">)([^<]*)}s) { + #say "size $1"; + return $1; + } + } sub get_purged_states($xp, $amd) { # we need to earch for eventIdentifierValue 272 or 274. @@ -525,6 +538,7 @@ XPATH } my $parser = get_parser($recovery_flag); my $dom = $parser->parse_file($filename); + #my $slurp = path($filename)->slurp; my $xp = get_xpath_context(); $xp->setContextNode($dom); my $dmdsec = $xp->findnodes($compiled_xpath_dmdSec)->[0]; @@ -566,14 +580,20 @@ XPATH # get all files of LOCAL representation $ret->{"filepids"} = get_filepids_ref($xp, $filesec, $repid); $ret->{"files"} = get_files_ref($xp, $filesec, $repid); - foreach my $fpid (@{$ret->{"filepids"}}) { - #say "pid=$fpid"; + my @loc_and_size = map { + my $fpid = $_; my $location = get_file_path($xp, $filesec, $fpid); + #my $location = get_file_path_rx($slurp, $fpid); #say "location=$location"; my $size = get_filesize($xp, $fpid); + #my $size = get_filesize_rx($slurp, $fpid); #say "size=$size"; - $ret->{"sizes"}->{$location} = $size ; + [$location, $size]; + } @{$ret->{"filepids"}}; + foreach my $entry (@loc_and_size) { + my ($location, $size) = @{ $entry }; + $ret->{"sizes"}->{$location} = $size ; } return $ret; } -- GitLab