From c98cdcb37bfe7d00db4430c2236050b2c623495f Mon Sep 17 00:00:00 2001
From: Andreas Romeyke <andreas.romeyke@slub-dresden.de>
Date: Wed, 10 Apr 2024 17:24:14 +0200
Subject: [PATCH] - improved xpaths in get_filesize()

---
 perl/exit_strategy.pl | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/perl/exit_strategy.pl b/perl/exit_strategy.pl
index 5c95209..2b621e2 100644
--- a/perl/exit_strategy.pl
+++ b/perl/exit_strategy.pl
@@ -61,7 +61,6 @@ sub check_lzaid ($lza_id) {
   return ($lza_id =~ m/^SLUB:LZA:$rx_up:$rx_lw:$rx_lw$/);
 };
 
-
 STDOUT->autoflush(1);
 # guarantee, that output will be UTF8
 binmode(STDOUT, ":encoding(UTF-8)");
@@ -467,17 +466,31 @@ sub check_if_db_conform ($string, $filename) {
   sub get_file_path($xp, $fsp, $filepid) {
     return $xp->findvalue("mets:fileGrp/mets:file[\@ID=\"$filepid\"]/mets:FLocat/\@xlin:href", $fsp);
   }
+  sub get_file_path_rx($xml, $filepid) {
+    # Regex variant of get_file_path(): returns the xlin:href of the mets:FLocat directly following the
+    # mets:file tag with ID $filepid in raw text $xml (\Q..\E: literal ID; \s: no ADMID/GROUPID hit).
+    my $nt = qr{[^>]*};
+    return $1 if $xml =~ m{<mets:file${nt}\sID="\Q$filepid\E"$nt>\s*<mets:FLocat${nt}xlin:href="([^"]*)}s;
+    return; # no matching mets:file/mets:FLocat pair found
+  }
 
   sub get_filesize ($xp, $filepid) {
     my $xpath =<<"XPATH";
-        /mets:mets/mets:amdSec[starts-with(\@ID, \'$filepid\')]/mets:techMD[\@ID=\"$filepid-amd-tech\"]
+        /mets:mets/mets:amdSec[\@ID='$filepid-amd']/mets:techMD[\@ID='$filepid-amd-tech']
         /mets:mdWrap/mets:xmlData/*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='dnx']
-      /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='section']
+      /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='section' and \@id='generalFileCharacteristics']
       /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='record']
       /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='key' and \@id='fileSizeBytes']/text()
 XPATH
     return $xp->findvalue($xpath);
   }
+  sub get_filesize_rx($xml, $filepid) {
+    # Regex variant of get_filesize(): returns the text of <key id="fileSizeBytes"> inside the mets:techMD
+    # "$filepid-amd-tech" of raw text $xml. The scan stops at </mets:techMD> so a later file's size is never used.
+    my $rx = qr{<mets:techMD\s+ID="\Q$filepid\E-amd-tech">(?:(?!</mets:techMD>).)*?<key id="fileSizeBytes">([^<]*)}s;
+    return $1 if $xml =~ $rx;
+    return; # section missing or it has no fileSizeBytes key
+  }
 
   sub get_purged_states($xp, $amd) {
     # we need to earch for eventIdentifierValue 272 or 274.
@@ -525,6 +538,7 @@ XPATH
     }
     my $parser = get_parser($recovery_flag);
     my $dom = $parser->parse_file($filename);
+    #my $slurp = path($filename)->slurp;
     my $xp = get_xpath_context();
     $xp->setContextNode($dom);
     my $dmdsec = $xp->findnodes($compiled_xpath_dmdSec)->[0];
@@ -566,14 +580,20 @@ XPATH
     # get all files of LOCAL representation
     $ret->{"filepids"} = get_filepids_ref($xp, $filesec, $repid);
     $ret->{"files"}  = get_files_ref($xp, $filesec, $repid);
-    foreach my $fpid (@{$ret->{"filepids"}}) {
-      #say "pid=$fpid";
+    my @loc_and_size = map {
+      my $fpid = $_;
       my $location = get_file_path($xp, $filesec, $fpid);
+      #my $location = get_file_path_rx($slurp, $fpid);
       #say "location=$location";
       my $size = get_filesize($xp, $fpid);
+      #my $size = get_filesize_rx($slurp, $fpid);
       #say "size=$size";
-      $ret->{"sizes"}->{$location} = $size ;
+      [$location, $size];
+    } @{$ret->{"filepids"}};
 
+    foreach my $entry (@loc_and_size) {
+      my ($location, $size) = @{ $entry };
+      $ret->{"sizes"}->{$location} = $size ;
     }
     return $ret;
   }
-- 
GitLab