From f03c5f04e997daa5ac02884dfd3193879193b72a Mon Sep 17 00:00:00 2001
From: Andreas Romeyke <andreas.romeyke@slub-dresden.de>
Date: Mon, 2 Jan 2023 16:52:48 +0100
Subject: [PATCH] - added cache support to avoid superfluous reading

---
 lib/SLUB/LZA/Rosetta/TA.pm | 87 +++++++++++++++++++++++++++++++++-----
 1 file changed, 76 insertions(+), 11 deletions(-)

diff --git a/lib/SLUB/LZA/Rosetta/TA.pm b/lib/SLUB/LZA/Rosetta/TA.pm
index 03e6f7e..1d6645c 100644
--- a/lib/SLUB/LZA/Rosetta/TA.pm
+++ b/lib/SLUB/LZA/Rosetta/TA.pm
@@ -14,6 +14,8 @@ use Text::CSV_PP;
 # ABSTRACT: main module for ta-tool
 
 our %config;
+our %cache;
+our $cache_path;
 BEGIN{
     my $home = path($ENV{'HOME'});
     if ($home->is_dir() && !$home->is_rootdir) {
@@ -22,9 +24,17 @@ BEGIN{
         if ($config_path->is_file) {
             %config = YAML::LoadFile($config_path);
         }
+        $cache_path = $home->child('.cache')->child('ta-tool.cache');
+        if ($cache_path->is_file) {
+            %cache = YAML::LoadFile($cache_path);
+        }
     }
 }
 
+END { # persist %cache at exit; $cache_path stays undef when BEGIN found no usable home directory
+    if (defined $cache_path) { $cache_path->parent->mkpath; YAML::DumpFile($cache_path, %cache); } # create ~/.cache first if missing
+}
+
 sub sru_search {
     my $searchtype = shift;
     my $query = shift;
@@ -160,7 +170,7 @@ sub helper_scan_log {
             $fh = $file->filehandle;
         }
         if (defined $fh) {
-            $fh_processing->( $fh );
+            $fh_processing->( $fh, $file );
         }
         undef $fh;
     }
@@ -205,9 +215,23 @@ sub trace_log {
         $line_rx3 = Regexp::Optimizer->new->optimize(qr{^$pre_rx.*Loaded \d+ files for: REP$searchid \(IE\d+\)});
         $line_rx4 = Regexp::Optimizer->new->optimize(qr{^$pre_rx.*Representation $searchid IE \d+ Copy ID: \d+});
     }
-
     my $fh_processing_stage1 = sub {
         my $fh = shift;
+        my $file = shift;
+        my $file_md5 = path($file)->digest();
+        return if (
+            exists( $cache{$file_md5} )
+                and (
+                (
+                    exists( $cache{$file_md5}->{deposit_dir}->{$deposit_dir})
+                        and exists( $cache{$file_md5}->{deposit_id}->{$deposit_id})
+                        and exists( $cache{$file_md5}->{sip_id}->{$sip_id})
+                ) or (
+                        exists( $cache{$file_md5}->{ie_pid}->{$ie_pid})
+                        and exists( $cache{$file_md5}->{rep_id}{$rep_id})
+                )
+               )
+        );
         while(<$fh>) {
             if (
                 (defined $sip_id and defined $deposit_id and defined $deposit_dir)
@@ -221,19 +245,47 @@ sub trace_log {
             }
             chomp;
             if ( m/$line_rx1/ ) {
-                if (!defined $sip_id      and m/$sip_rx(\d{6}),/) { $sip_id = $2; }
-                if (!defined $deposit_dir and m/originalDirName=([^,]*),/) { $deposit_dir = $1; }
-                if (!defined $deposit_id  and m/depositId=(\d+),/) { $deposit_id = $1;}
+                if (!defined $sip_id      and m/$sip_rx(\d{6}),/) {
+                    $sip_id = $2;
+                    $cache{$file_md5}->{sip_id}->{$sip_id}=1; # mark as match
+                }
+                if (!defined $deposit_dir and m/originalDirName=([^,]*),/) {
+                    $deposit_dir = $1;
+                    $cache{$file_md5}->{deposit_dir}->{$deposit_dir}=1; # mark as match
+                }
+                if (!defined $deposit_id  and m/depositId=(\d+),/) {
+                    $deposit_id = $1;
+                    $cache{$file_md5}->{deposit_id}->{$deposit_id}=1; # mark as match
+                }
+
             } elsif (m/$line_rx2/) {
-                if (!defined $sip_id and m/SIP (\d{6})/) { $sip_id = $1;}
-                if (!defined $deposit_id and m/Deposit Activity ID=(\d+)/) { $deposit_id = $1;}
+                if (!defined $sip_id and m/SIP (\d{6})/) {
+                    $sip_id = $1;
+                    $cache{$file_md5}->{sip_id}->{$sip_id}=1; # mark as match
+                }
+                if (!defined $deposit_id and m/Deposit Activity ID=(\d+)/) {
+                    $deposit_id = $1;
+                    $cache{$file_md5}->{deposit_id}->{$deposit_id}=1; # mark as match
+                }
             } elsif (m/$line_rx3/) {
-                if (!defined $ie_pid and m/Loaded \d+ files for: REP\d+ \((IE\d+)/) { $ie_pid = $1;}
-                if (!defined $rep_id and m/Loaded \d+ files for: (REP\d+)/) { $rep_id = $1;}
+                if (!defined $ie_pid and m/Loaded \d+ files for: REP\d+ \((IE\d+)/) {
+                    $ie_pid = $1;
+                    $cache{$file_md5}->{ie_pid}->{$ie_pid}=1; # mark as match
+                }
+                if (!defined $rep_id and m/Loaded \d+ files for: (REP\d+)/) {
+                    $rep_id = $1;
+                    $cache{$file_md5}->{rep_id}->{$rep_id}=1; # mark as match
+                }
             } elsif (m/$line_rx4/) {
                 my $rx = qr/Representation (\d+) IE (\d+)/;
-                if (!defined $ie_pid and m/$rx/) { $ie_pid = $2;}
-                if (!defined $rep_id and m/$rx/) { $rep_id = $1;}
+                if (!defined $ie_pid and m/$rx/) {
+                    $ie_pid = $2;
+                    $cache{$file_md5}->{ie_pid}->{$ie_pid}=1; # mark as match
+                }
+                if (!defined $rep_id and m/$rx/) {
+                    $rep_id = $1;
+                    $cache{$file_md5}->{rep_id}->{$rep_id}=1; # mark as match
+                }
             }
         }
         return 1;
@@ -255,6 +307,19 @@ sub trace_log {
     use warnings;
     my $fh_processing_stage2 = sub {
         my $fh = shift;
+        my $file = shift;
+        my $file_md5 = path($file)->digest();
+        return if ! (
+            exists( $cache{$file_md5} )
+            and
+            (
+            exists( $cache{$file_md5}->{deposit_dir}->{$deposit_dir})
+            or exists( $cache{$file_md5}->{deposit_id}->{$deposit_id})
+            or exists( $cache{$file_md5}->{sip_id}->{$sip_id})
+            or exists( $cache{$file_md5}->{ie_pid}->{$ie_pid})
+            or exists( $cache{$file_md5}->{rep_id}->{$rep_id})
+            )
+        );
         while(<$fh>) {
             if (!m/^$date_rx/) {
                 next;
-- 
GitLab