From ed857cad87f52ab82cf422e35581c54e46ac38e1 Mon Sep 17 00:00:00 2001
From: Andreas Romeyke <art1@andreas-romeyke.de>
Date: Fri, 10 Dec 2021 15:58:55 +0100
Subject: [PATCH] - stages can be run separately - typofix - added
 explicit cache-file - added autocomplete support

---
 deep_fixitycheck.pl | 248 +++++++++++++++++++++++++++++---------------
 1 file changed, 164 insertions(+), 84 deletions(-)

diff --git a/deep_fixitycheck.pl b/deep_fixitycheck.pl
index c24fe15..aa54abb 100644
--- a/deep_fixitycheck.pl
+++ b/deep_fixitycheck.pl
@@ -3,7 +3,7 @@
 # Author: Andreas Romeyke
 # SLUB Dresden, Department Longterm Preservation
 #
-# License: This script is free available under GNU Gneral Public License V3.0 or higher,
+# License: This script is free available under GNU General Public License V3.0 or higher,
 # see file LICENSE.txt for details.
 #
 # This script scans a given rosetta repository and checks the fixity deeply
@@ -26,9 +26,8 @@ use File::Find;
 use XML::LibXML;
 use Time::Progress;
 use XML::LibXML::XPathContext;
-use Getopt::Long;
-use constant DEBUG => 0; # no debug
-use Digest::CRC;
+use Getopt::Long::Complete qw(GetOptionsWithCompletion);
+
 use Digest::MD5;
 use Digest::SHA;
 use Pod::Usage;
@@ -50,7 +49,7 @@ my @algorithms = qw(CRC32 MD5 SHA1 SHA256 SHA512); # used fixity algorithm names
 # The function returns the count of found IEs
 sub searching_ie_files ($$) {
   my $dir = shift;
-  my $tmp_ies_unsorted_file = shift;
+  my $tmp_ies_unsorted_path = shift;
   my $cnt_unsorted_files = 0;
   my $first_two_levels_of_dirs = 0;
   my $wanted_twolevel_dircount = sub {
@@ -69,7 +68,7 @@ sub searching_ie_files ($$) {
     if (-f && m/V(\d+)-IE\d+\.xml$/) {
       my $version = $1;
       my $file=$File::Find::name;
-      $tmp_ies_unsorted_file -> append( $file."\n");
+      $tmp_ies_unsorted_path->append( $file."\n");
       $cnt_unsorted_files++;
       $File::Find::prune =1;
     } elsif (-d ) {
@@ -293,6 +292,108 @@ sub check_file_fixities($$) {
   return $result;
 }
 
+# Stage 1 ("find"): lists the IE files below $search_dir in the cache file $tmp_ies_unsorted_path and logs one summary line to $report_path. Returns 1.
+sub stage1 ($$$) {
+  my ($tmp_ies_unsorted_path, $search_dir, $report_path) = @_;
+  my $stat;
+  $stat->{begin} = time;
+  say "Preparing scan";
+  say "searching IE files";
+  # searching_ie_files appends one IE path per line to the cache file and returns the number of IEs found
+  my $cnt_unsorted_files = searching_ie_files($search_dir, $tmp_ies_unsorted_path);
+  say "Scan finished";
+  $stat->{end} = time;
+  $stat->{duration} = $stat->{end} - $stat->{begin};
+  # the trailing newline keeps later report entries (e.g. those of stage 2) on their own lines
+  $report_path->append_utf8("scanned $search_dir in $stat->{duration} seconds, found $cnt_unsorted_files IEs\n");
+  return 1;
+}
+# Stage 2 ("check"): reads one IE file path per line from $tmp_ies_unsorted_path, checks every file listed in each IE and appends errors plus a summary to $report_path. Returns 1.
+sub stage2 ($$$$) {
+  my $tmp_ies_unsorted_path = shift;
+  my $report_path = shift;
+  my $map_path = shift;
+  my $recovery = shift;
+  say "checking IEs";
+  my $fh_unsorted_file = $tmp_ies_unsorted_path->openr();
+  my $cnt_unsorted_files = 0;
+  while (<$fh_unsorted_file>) { # first pass: count the lines, used as maximum of the progress bar
+    $cnt_unsorted_files++;
+  }
+  seek $fh_unsorted_file, 0, 0; # seek to first byte, so the second loop re-reads the same list
+  my $count = 0;
+  my $progressbar = Time::Progress->new(min => 0, max => $cnt_unsorted_files, smoothing => 1);
+  my $stat;
+  $stat->{IEs} = 0;
+  $stat->{files} = 0;
+  $stat->{errors} = 0;
+  $stat->{scansize} = 0;
+  $stat->{begin} = time;
+  while (<$fh_unsorted_file>) {
+    # scan each IE; $_ is one line of the cache file (expected: the path of an IE XML file)
+    $stat->{IEs}++;
+    chomp;
+    my $transferrate_in_MBs = sprintf("%0.2f", $stat->{scansize} / (time - $stat->{begin} + 1) / 1024 / 1024); # "+ 1" keeps the divisor non-zero in the first second
+    print $progressbar->report("parse IE files:       %40b  running: %L ETA: %E ($count/$cnt_unsorted_files IEs, tfr=$transferrate_in_MBs MB/s)         \r", ++$count);
+    my $ret = parse_iexml($_, $recovery);
+    foreach my $fileobj (@{$ret->{files}}) {
+      $fileobj->{file_mounted} = map_file($map_path, $fileobj->{filepath});
+      $stat->{files}++;
+      my $result;
+      $result->{errors} = 0;
+      $result = check_if_file_exist($fileobj, $result);
+      if ($result->{exist}) {
+        # only if file exists, do additional checks; each check runs only if the previous one set its result key
+        $result = check_file_size($fileobj, $result);
+        if ($result->{size}) {
+          $stat->{scansize} += $result->{size};
+          $result = check_file_seekable($fileobj, $result);
+          if ($result->{seekable}) {
+            $result = check_file_fixities($fileobj, $result);
+          }
+        }
+      }
+
+      if ($result->{errors} > 0) {
+        my $timestamp = strftime("%Y-%m-%d %H:%M:%S %z (%Z)", localtime(time));
+        $report_path->append_utf8("$timestamp, IE $_ with following errors:\n");
+        foreach my $errors (@{$result->{error_description}}) {
+          $report_path->append_utf8("\t$errors\n");
+        }
+        $report_path->append_utf8("-" x 60, "\n");
+        $stat->{errors} += $result->{errors};
+      }
+    }
+    # write report (nothing left to do here: errors were already appended per file above)
+  }
+  say "";
+  $stat->{end} = time;
+  $stat->{duration} = $stat->{end} - $stat->{begin};
+  my $human_readable_size; # scansize scaled to the largest binary unit it exceeds
+  my $fac_tera = 1024 * 1024 * 1024 * 1024;
+  my $fac_giga = 1024 * 1024 * 1024;
+  my $fac_mega = 1024 * 1024;
+  my $fac_kilo = 1024;
+  if ($stat->{scansize} > $fac_tera) {$human_readable_size = sprintf "%03.1fTB", $stat->{scansize} / $fac_tera}
+  elsif ($stat->{scansize} > $fac_giga) {$human_readable_size = sprintf "%03.1fGB", $stat->{scansize} / $fac_giga}
+  elsif ($stat->{scansize} > $fac_mega) {$human_readable_size = sprintf "%03.1fMB", $stat->{scansize} / $fac_mega}
+  elsif ($stat->{scansize} > $fac_kilo) {$human_readable_size = sprintf "%03.1fkB", $stat->{scansize} / $fac_kilo}
+  else {$human_readable_size = $stat->{scansize} . "B";}
+  $report_path->append_utf8("=" x 60, "\n");
+  my $summary = sprintf("Scanned %d IEs with %d files (%s) in %d seconds, found %d errors",
+      $stat->{IEs},
+      $stat->{files},
+      $human_readable_size,
+      $stat->{duration},
+      $stat->{errors}
+  );
+  $report_path->append_utf8("$summary\n"); # the same summary goes to the report and to STDOUT
+  say $summary;
+  say "Checking finished";
+  return 1;
+}
+
+
 ###############################################################################
 ###############################################################################
 ############# main ############################################################
@@ -304,12 +405,33 @@ my $map_path;
 my $search_dir;
 my $report_file;
 my $help;
-GetOptions(
+my $stage = 'both'; # or 'find' or 'check'
+my $cache_str = "";
+my $cache_file;
+
+GetOptionsWithCompletion(
+    sub { # completion callback for GetOptionsWithCompletion: offers directories for the path-valued options
+      my %args = @_;
+      my $word = $args{word}; # the word to be completed
+      my $type = $args{type}; # 'optname', 'optval', or 'arg'
+      my $opt = $args{opt};   # can be an array of options if ambiguous, e.g. ['--on-fail', '--on-full']
+      if (defined $opt && $opt eq "--search_dir") { # name must match the "search_dir=s" option spec
+        return Complete::File::complete_dir(starting_path=>"/permanent", recurse=>1, word=>$word)
+      }
+      if (defined $opt && $opt eq "--map_path") {
+        return Complete::File::complete_dir(starting_path=>"/permanent", recurse=>1, word=>$word)
+      }
+      if (defined $opt && $opt eq "--cache") { # NOTE(review): --cache names a file but only directories are offered - confirm intent
+        return Complete::File::complete_dir(starting_path=>"/tmp", recurse=>1, word=>$word)
+      }
+    },
     "recovery"     => \$recovery,
     "map_path=s"   => \$map_path,
     "search_dir=s" => \$search_dir,
     "report=s"     => \$report_file,
-    "help|?"       => \$help
+    "stage:s"      => \$stage,
+    "cache:s"      => \$cache_str,
+    "help|?|h"     => \$help,
 ) or die "Try --help for usage information";
 pod2usage(1) if $help;
 if (!defined $map_path || length($map_path) < 1) {
@@ -330,84 +452,34 @@ if (!path($search_dir)->is_dir) {
 if ($search_dir !~ m/^$map_path/) {
   die "map_path $map_path should be part of search dir $search_dir!";
 }
+
+if ($cache_str eq "") { # no --cache given: fall back to a temporary cache file
+  $cache_file = Path::Tiny->tempfile(TEMPLATE => "deep_fixitycheck_XXXXXXXXXXX", CLEANUP => 1);
+} else {
+  $cache_file = Path::Tiny::path($cache_str)->absolute();
+  $cache_file->touch(); # creates the cache file if it is missing
+  if (!$cache_file->is_file) {
+    die "cache file $cache_str does not exist!";
+  }
+}
+# Validate --stage; the pattern is anchored so that e.g. "12" or "finder" is rejected instead of matching a substring.
+if (!defined $stage || length($stage) < 1) {
+  die "stage is empty!";
+}
+if ($stage !~ m/\A(?:both|find|check|1|2)\z/ ) {
+  die "stage $stage is unknown";
+}
+
+
 # Here starts the scanning
 if (defined $search_dir && -d "$search_dir") {
-    say "Preparing scan";
-    say "searching IE files";
-    # create temporary file to hold list of IEs
-    my $tmp_ies_dir = Path::Tiny->tempdir( TEMPLATE => "deep_fixitycheck_XXXXXXXXXXX", CLEANUP => 1);
-    my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies");
-    $tmp_ies_unsorted_file->touch();
-    my $cnt_unsorted_files = searching_ie_files($search_dir, $tmp_ies_unsorted_file);
-    say "checking IEs";
-    my $fh_unsorted_file = $tmp_ies_unsorted_file->openr();
-    my $count = 0;
-    my $progressbar = Time::Progress->new(min=>0, max=>$cnt_unsorted_files, smoothing => 1);
-    my $stat;
-    $stat->{IEs} = 0;
-    $stat->{files} = 0;
-    $stat->{errors} = 0;
-    $stat->{scansize} = 0;
-    $stat->{begin} = time;
-    while ( <$fh_unsorted_file>) { # scan each IE
-      $stat->{IEs}++;
-      chomp;
-      my $transferrate_in_MBs = sprintf("%0.2f", $stat->{scansize} / (time - $stat->{begin}+1) / 1024 / 1024);
-      print $progressbar->report("parse IE files:       %40b  running: %L ETA: %E ($count/$cnt_unsorted_files IEs, tfr=$transferrate_in_MBs MB/s)         \r", ++$count);
-      my $ret = parse_iexml( $_, $recovery);
-      foreach my $fileobj (@{ $ret->{files} }) {
-        $fileobj->{file_mounted} = map_file($map_path, $fileobj->{filepath});
-        $stat->{files}++;
-        my $result;
-        $result->{errors} = 0;
-        $result = check_if_file_exist($fileobj, $result);
-        if ($result->{exist}) {
-          # only if file exists, do additional checks
-          $result = check_file_size($fileobj, $result);
-          if ($result->{size}) {
-            $stat->{scansize} += $result->{size};
-            $result = check_file_seekable($fileobj, $result);
-            if ($result->{seekable}) {
-              $result = check_file_fixities($fileobj, $result);
-            }
-          }
-        }
-
-        if ($result->{errors} > 0) {
-          my $timestamp = strftime("%Y-%m-%d %H:%M:%S %z (%Z)", localtime(time));
-          path($report_file)->append_utf8("$timestamp, IE $_ with following errors:\n");
-          foreach my $errors (@{ $result->{error_description} }) {
-            path($report_file)->append_utf8("\t$errors\n");
-          }
-          path($report_file)->append_utf8("-"x60,"\n");
-          $stat->{errors} += $result->{errors};
-        }
-      }
-      # write report
-    }
-  say "";
-  $stat->{end} = time;
-  $stat->{duration} = $stat->{end} - $stat->{begin};
-  my $human_readable_size;
-  my $fac_tera = 1024*1024*1024*1024;
-  my $fac_giga = 1024*1024*1024;
-  my $fac_mega = 1024*1024;
-  my $fac_kilo = 1024;
-  if ($stat->{scansize} > $fac_tera) { $human_readable_size = sprintf "%03.1fTB", $stat->{scansize} / $fac_tera}
-  elsif ($stat->{scansize} > $fac_giga) {$human_readable_size = sprintf "%03.1fGB", $stat->{scansize} / $fac_giga}
-  elsif ($stat->{scansize} > $fac_mega) {$human_readable_size = sprintf "%03.1fMB", $stat->{scansize} / $fac_mega}
-  elsif ($stat->{scansize} > $fac_kilo) {$human_readable_size = sprintf "%03.1fkB", $stat->{scansize} / $fac_kilo}
-  else {$human_readable_size = $stat->{scansize} . "B";}
-  path($report_file)->append_utf8("="x60,"\n");
-  my $summary = sprintf ("Scanned %d IEs with %d files (%s) in %d seconds, found %d errors",
-      $stat->{IEs},
-      $stat->{files},
-      $human_readable_size,
-      $stat->{duration},
-      $stat->{errors}
-  );
-  path($report_file)->append_utf8("$summary\n");
-  say $summary;
+  my $report_path = path( $report_file); # one path object shared by both stages
+  if ($stage eq "both" || $stage eq 'find' || $stage eq '1') { # string compare: '==' on a value like 'check' is a non-numeric comparison
+    stage1($cache_file, $search_dir, $report_path);
+  }
+  if ($stage eq "both" || $stage eq 'check' || $stage eq '2') {
+    stage2($cache_file, $report_path, $map_path, $recovery);
+  }
 } else {
   die "no directory given on commandline";
 }
@@ -457,6 +529,14 @@ The search_dir is the directory where the search starts. The search_dir could be
 
 The file where the report is stored. If the file exist, the report will be append without warnings.
 
+=item B<--stage>
+
+Select the stage to run: "find" (or 1), "check" (or 2) or "both". Defaults to "both", which executes stage 1 "find" and then stage 2 "check".
+
+=item B<--cache>
+
+Store results of stage 1 "find" in the given file, defaults to a temporary file in /tmp/. Give the same file to a later run with stage "check" when the stages are run separately.
+
 =back
 
 =cut
-- 
GitLab