From ffbb9e80186c6477a3b8d1c7845e32a36b299f54 Mon Sep 17 00:00:00 2001
From: Andreas Romeyke <art1@andreas-romeyke.de>
Date: Tue, 11 Jan 2022 11:47:26 +0100
Subject: [PATCH] - autoflush for STDERR enabled - in check_file_fixities()
 use Path::Tiny's openr_raw() instead of openr() to avoid an extra binmode() call
 - extracted the per-file checks from the while-loop in stage2() into
 stage2_for_ie(); the reduced scope could lower the memory footprint - in stage2()
 we now close and reopen the fh instead of seeking. HINT: no real cause for the
 "out of memory" problem was found

---
 deep_fixitycheck.pl | 105 ++++++++++++++++++++++++--------------------
 1 file changed, 58 insertions(+), 47 deletions(-)

diff --git a/deep_fixitycheck.pl b/deep_fixitycheck.pl
index 8aa2557..d98d3b7 100644
--- a/deep_fixitycheck.pl
+++ b/deep_fixitycheck.pl
@@ -37,6 +37,7 @@ use IO::Handle;
 use POSIX qw(strftime);
 use Fcntl qw(SEEK_END SEEK_SET);
 STDOUT->autoflush(1);
+STDERR->autoflush(1);
 # guarantee, that output will be UTF8
 binmode(STDOUT, ":encoding(UTF-8)");
 
@@ -253,6 +254,7 @@ sub check_file_seekable($$) {
   if (!$result->{seekable}) {
     add_error($fileobj, $result, "is not seekable, $!");
   }
+  close($fh);
   return $result;
 }
 
@@ -267,8 +269,7 @@ sub check_file_fixities($$) {
       'SHA256' => Digest::SHA->new(256),
       'SHA512' => Digest::SHA->new(512)
   );
-  my $fh = path($fileobj->{file_mounted})->openr();
-  binmode($fh);
+  my $fh = path($fileobj->{file_mounted})->openr_raw();
   my $buffer;
   my $block=0;
   my $blocksize = 128*1024; # 128kB blocks
@@ -327,6 +328,49 @@ sub stage1 ($$$) {
   return 1;
 }
 
+# Run the stage-2 checks for the not-yet-seen files of one IE.
+# Args: $unseen (arrayref of file objects), $bf (Bloom filter of seen file
+# ids), $map_path, $report_path (report file), $stat (counters, updated).
+# A filter reset is written back through $_[1] (alias of the caller's
+# variable); assigning only to the local copy would be lost on return.
+sub stage2_for_ie {
+  my ($unseen, $bf, $map_path, $report_path, $stat) = @_;
+  foreach my $fileobj (@{ $unseen }) {
+    if ($bf->key_count() >= (0.8 * $capacity)) { # reset Bloom filter if 80% filled
+      $bf = $_[1] = Bloom::Filter->new(
+          capacity   => $capacity, # we cache n last seen files
+          error_rate => 0.0001,
+      );
+    }
+    $bf->add( $fileobj->{fileid} );
+    $fileobj->{file_mounted} = map_file($map_path, $fileobj->{filepath});
+    $stat->{files}++;
+    my $result;
+    $result->{errors} = 0;
+    $result->{report_path} = $report_path;
+    $result = check_if_file_exist($fileobj, $result);
+    if ($result->{exist}) {
+      # only if file exists, do additional checks
+      $result = check_file_size($fileobj, $result);
+      if ($result->{size}) {
+        $stat->{scansize} += $result->{size};
+        $result = check_file_seekable($fileobj, $result);
+        if ($result->{seekable}) {
+          $result = check_file_fixities($fileobj, $result);
+        }
+      }
+    }
+
+    if ($result->{errors} > 0) {
+      $report_path->append_utf8("-" x 60, "\n");
+      $stat->{errors} += $result->{errors};
+    } else { # no errors
+      $report_path->append_utf8(" none\n");
+    }
+  }
+  return $bf; # the filter now in use (a new object after a reset)
+}
+
 sub stage2 ($$$$) {
   my $tmp_ies_unsorted_path = shift;
   my $report_path = shift;
@@ -337,7 +381,7 @@ sub stage2 ($$$$) {
   while (<$fh_unsorted_file>) {
     $cnt_unsorted_files++;
   }
-  seek $fh_unsorted_file, 0, 0; # seek to first byte
+  close($fh_unsorted_file);
   my $count = 0;
   my $bf = Bloom::Filter->new(
       capacity   => $capacity, # we cache n last seen files
@@ -352,57 +396,24 @@ sub stage2 ($$$$) {
   $stat->{errors} = 0;
   $stat->{scansize} = 0;
   $stat->{begin} = time;
-  my $bfreset="⏸";
   my $prev_ie = "";
+  $fh_unsorted_file = $tmp_ies_unsorted_path->openr();
   while (<$fh_unsorted_file>) {
+    chomp(my $actual_ie = $_); # copy without the trailing newline (used in report and parse_iexml)
     # scan each IE
     $stat->{IEs}++;
     chomp;
     my $transferrate_in_MBs = sprintf("%0.2f", $stat->{scansize} / (time - $stat->{begin} + 1) / 1024 / 1024);
-    my $bfusage = int($bf->key_count()*100/$capacity );
-    print $progressbar->report("parse IE files:       %40b $bfreset running: %L ETA: %E ($count/$cnt_unsorted_files IEs, tfr=$transferrate_in_MBs MB/s, bfu=$bfusage%)               \r", ++$count);
+    my $bfusage = int($bf->key_count() * 100 / $capacity);
+    print $progressbar->report("parse IE files:       %40b running: %L ETA: %E ($count/$cnt_unsorted_files IEs, tfr=$transferrate_in_MBs MB/s, bfu=$bfusage%)               \r", ++$count);
     my $timestamp = strftime("%Y-%m-%d %H:%M:%S %z (%Z)", localtime(time));
-    $report_path->append_utf8("$timestamp, IE $_ with following errors:\n");
-    my $ret = parse_iexml($_, $recovery);
-    my $unseen = bloomfilter_to_unseen($bf,  $ret->{files} );
-    if (scalar @{$unseen} == 0 ) { $report_path->append_utf8("skipped because files already checked using IE $prev_ie\n"); }
-    $prev_ie = $_;
-    foreach my $fileobj (@{ $unseen }) {
-      if ($bf->key_count() >= (0.8 * $capacity)) { # reset Bloom filter if 80% filled
-        #print "reset bloomfilter\n";
-        $bf = Bloom::Filter->new(
-            capacity   => $capacity, # we cache n last seen files
-            error_rate => 0.0001,
-        );
-        $bfreset = "⏮";
-      } else { $bfreset = "▶";}
-      $bf->add( $fileobj->{fileid} );
-      $fileobj->{file_mounted} = map_file($map_path, $fileobj->{filepath});
-      $stat->{files}++;
-      my $result;
-      $result->{errors} = 0;
-      $result->{report_path} = $report_path;
-      $result = check_if_file_exist($fileobj, $result);
-      if ($result->{exist}) {
-        # only if file exists, do additional checks
-        $result = check_file_size($fileobj, $result);
-        if ($result->{size}) {
-          $stat->{scansize} += $result->{size};
-          $result = check_file_seekable($fileobj, $result);
-          if ($result->{seekable}) {
-            $result = check_file_fixities($fileobj, $result);
-          }
-        }
-      }
-
-      if ($result->{errors} > 0) {
-        $report_path->append_utf8("-" x 60, "\n");
-        $stat->{errors} += $result->{errors};
-      } else { # no errors
-        $report_path->append_utf8(" none\n");
-      }
-    }
-    # write report
+    $report_path->append_utf8("$timestamp, IE $actual_ie with following errors:\n");
+    my $ret = parse_iexml($actual_ie, $recovery);
+    my $unseen = bloomfilter_to_unseen($bf, $ret->{files});
+    if (scalar @{$unseen} == 0) {$report_path->append_utf8("skipped because files already checked using IE $prev_ie\n");}
+    $prev_ie = $actual_ie;
+    ###
+    stage2_for_ie($unseen, $bf, $map_path, $report_path, $stat);
   }
   say "";
   $stat->{end} = time;
-- 
GitLab