From ed857cad87f52ab82cf422e35581c54e46ac38e1 Mon Sep 17 00:00:00 2001 From: Andreas Romeyke <art1@andreas-romeyke.de> Date: Fri, 10 Dec 2021 15:58:55 +0100 Subject: [PATCH] - stages could be running separately - typofix - added explicite cache-file - added autocomplete support --- deep_fixitycheck.pl | 248 +++++++++++++++++++++++++++++--------------- 1 file changed, 164 insertions(+), 84 deletions(-) diff --git a/deep_fixitycheck.pl b/deep_fixitycheck.pl index c24fe15..aa54abb 100644 --- a/deep_fixitycheck.pl +++ b/deep_fixitycheck.pl @@ -3,7 +3,7 @@ # Author: Andreas Romeyke # SLUB Dresden, Department Longterm Preservation # -# License: This script is free available under GNU Gneral Public License V3.0 or higher, +# License: This script is free available under GNU General Public License V3.0 or higher, # see file LICENSE.txt for details. # # This script scans a given rosetta repository and checks the fixity deeply @@ -26,9 +26,8 @@ use File::Find; use XML::LibXML; use Time::Progress; use XML::LibXML::XPathContext; -use Getopt::Long; -use constant DEBUG => 0; # no debug -use Digest::CRC; +use Getopt::Long::Complete qw(GetOptionsWithCompletion); + use Digest::MD5; use Digest::SHA; use Pod::Usage; @@ -50,7 +49,7 @@ my @algorithms = qw(CRC32 MD5 SHA1 SHA256 SHA512); # used fixity algorithm names # The function returns the count of found IEs sub searching_ie_files ($$) { my $dir = shift; - my $tmp_ies_unsorted_file = shift; + my $tmp_ies_unsorted_path = shift; my $cnt_unsorted_files = 0; my $first_two_levels_of_dirs = 0; my $wanted_twolevel_dircount = sub { @@ -69,7 +68,7 @@ sub searching_ie_files ($$) { if (-f && m/V(\d+)-IE\d+\.xml$/) { my $version = $1; my $file=$File::Find::name; - $tmp_ies_unsorted_file -> append( $file."\n"); + $tmp_ies_unsorted_path->append( $file."\n"); $cnt_unsorted_files++; $File::Find::prune =1; } elsif (-d ) { @@ -293,6 +292,108 @@ sub check_file_fixities($$) { return $result; } +sub stage1 ($$$) { + my $tmp_ies_unsorted_path = shift; + my 
$search_dir = shift; + my $report_path = shift; + my $stat; + $stat->{begin} = time; + say "Preparing scan"; + say "searching IE files"; + # create temporary file to hold list of IEs + my $cnt_unsorted_files = searching_ie_files($search_dir, $tmp_ies_unsorted_path); + say "Scan finished"; + $stat->{end} = time; + $stat->{duration} = $stat->{end} - $stat->{begin}; + $report_path->append_utf8("scanned $search_dir in $stat->{duration} seconds, found $cnt_unsorted_files IEs"); + return 1; +} + +sub stage2 ($$$$) { + my $tmp_ies_unsorted_path = shift; + my $report_path = shift; + my $map_path = shift; + my $recovery = shift; + say "checking IEs"; + my $fh_unsorted_file = $tmp_ies_unsorted_path->openr(); + my $cnt_unsorted_files = 0; + while (<$fh_unsorted_file>) { + $cnt_unsorted_files++; + } + seek $fh_unsorted_file, 0, 0; # seek to first byte + my $count = 0; + my $progressbar = Time::Progress->new(min => 0, max => $cnt_unsorted_files, smoothing => 1); + my $stat; + $stat->{IEs} = 0; + $stat->{files} = 0; + $stat->{errors} = 0; + $stat->{scansize} = 0; + $stat->{begin} = time; + while (<$fh_unsorted_file>) { + # scan each IE + $stat->{IEs}++; + chomp; + my $transferrate_in_MBs = sprintf("%0.2f", $stat->{scansize} / (time - $stat->{begin} + 1) / 1024 / 1024); + print $progressbar->report("parse IE files: %40b running: %L ETA: %E ($count/$cnt_unsorted_files IEs, tfr=$transferrate_in_MBs MB/s) \r", ++$count); + my $ret = parse_iexml($_, $recovery); + foreach my $fileobj (@{$ret->{files}}) { + $fileobj->{file_mounted} = map_file($map_path, $fileobj->{filepath}); + $stat->{files}++; + my $result; + $result->{errors} = 0; + $result = check_if_file_exist($fileobj, $result); + if ($result->{exist}) { + # only if file exists, do additional checks + $result = check_file_size($fileobj, $result); + if ($result->{size}) { + $stat->{scansize} += $result->{size}; + $result = check_file_seekable($fileobj, $result); + if ($result->{seekable}) { + $result = 
check_file_fixities($fileobj, $result); + } + } + } + + if ($result->{errors} > 0) { + my $timestamp = strftime("%Y-%m-%d %H:%M:%S %z (%Z)", localtime(time)); + $report_path->append_utf8("$timestamp, IE $_ with following errors:\n"); + foreach my $errors (@{$result->{error_description}}) { + $report_path->append_utf8("\t$errors\n"); + } + $report_path->append_utf8("-" x 60, "\n"); + $stat->{errors} += $result->{errors}; + } + } + # write report + } + say ""; + $stat->{end} = time; + $stat->{duration} = $stat->{end} - $stat->{begin}; + my $human_readable_size; + my $fac_tera = 1024 * 1024 * 1024 * 1024; + my $fac_giga = 1024 * 1024 * 1024; + my $fac_mega = 1024 * 1024; + my $fac_kilo = 1024; + if ($stat->{scansize} > $fac_tera) {$human_readable_size = sprintf "%03.1fTB", $stat->{scansize} / $fac_tera} + elsif ($stat->{scansize} > $fac_giga) {$human_readable_size = sprintf "%03.1fGB", $stat->{scansize} / $fac_giga} + elsif ($stat->{scansize} > $fac_mega) {$human_readable_size = sprintf "%03.1fMB", $stat->{scansize} / $fac_mega} + elsif ($stat->{scansize} > $fac_kilo) {$human_readable_size = sprintf "%03.1fkB", $stat->{scansize} / $fac_kilo} + else {$human_readable_size = $stat->{scansize} . 
"B";} + $report_path->append_utf8("=" x 60, "\n"); + my $summary = sprintf("Scanned %d IEs with %d files (%s) in %d seconds, found %d errors", + $stat->{IEs}, + $stat->{files}, + $human_readable_size, + $stat->{duration}, + $stat->{errors} + ); + $report_path->append_utf8("$summary\n"); + say $summary; + say "Checking finished"; + return 1; +} + + ############################################################################### ############################################################################### ############# main ############################################################ @@ -304,12 +405,33 @@ my $map_path; my $search_dir; my $report_file; my $help; -GetOptions( +my $stage = 'both'; # or 'find' or 'check' +my $cache_str = ""; +my $cache_file; + +GetOptionsWithCompletion( + sub { + my %args = @_; + my $word = $args{word}; # the word to be completed + my $type = $args{type}; # 'optname', 'optval', or 'arg' + my $opt = $args{opt}; # can be an array of oons if ambiguous, e.g. ['--on-fail', '--on-full'] + if (defined $opt && $opt eq "--searchdir") { + return Complete::File::complete_dir(starting_path=>"/permanent", recurse=>1, word=>$word) + } + if (defined $opt && $opt eq "--map_path") { + return Complete::File::complete_dir(starting_path=>"/permanent", recurse=>1, word=>$word) + } + if (defined $opt && $opt eq "--cache") { + return Complete::File::complete_dir(starting_path=>"/tmp", recurse=>1, word=>$word) + } + }, "recovery" => \$recovery, "map_path=s" => \$map_path, "search_dir=s" => \$search_dir, "report=s" => \$report_file, - "help|?" 
=> \$help + "stage:s" => \$stage, + "cache:s" => \$cache_str, + "help|?|h" => \$help, ) or die "Try --help for usage information"; pod2usage(1) if $help; if (!defined $map_path || length($map_path) < 1) { @@ -330,84 +452,34 @@ if (!path($search_dir)->is_dir) { if ($search_dir !~ m/^$map_path/) { die "map_path $map_path should be part of search dir $search_dir!"; } + +if ($cache_str eq "") { + $cache_file = Path::Tiny->tempfile(TEMPLATE => "deep_fixitycheck_XXXXXXXXXXX", CLEANUP => 1); +} else { + $cache_file = Path::Tiny::path($cache_str)->absolute(); + $cache_file->touch(); + if (!$cache_file->is_file) { + die "cache file $cache_str does not exist!"; + } +} + +if (!defined $stage || length($stage) < 1) { + die "stage is empty!"; +} +if ($stage !~ m/(both)|(find)|(check)|(1)|(2)/ ) { + die "stage $stage is unknown"; +} + + # Here starts the scanning if (defined $search_dir && -d "$search_dir") { - say "Preparing scan"; - say "searching IE files"; - # create temporary file to hold list of IEs - my $tmp_ies_dir = Path::Tiny->tempdir( TEMPLATE => "deep_fixitycheck_XXXXXXXXXXX", CLEANUP => 1); - my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies"); - $tmp_ies_unsorted_file->touch(); - my $cnt_unsorted_files = searching_ie_files($search_dir, $tmp_ies_unsorted_file); - say "checking IEs"; - my $fh_unsorted_file = $tmp_ies_unsorted_file->openr(); - my $count = 0; - my $progressbar = Time::Progress->new(min=>0, max=>$cnt_unsorted_files, smoothing => 1); - my $stat; - $stat->{IEs} = 0; - $stat->{files} = 0; - $stat->{errors} = 0; - $stat->{scansize} = 0; - $stat->{begin} = time; - while ( <$fh_unsorted_file>) { # scan each IE - $stat->{IEs}++; - chomp; - my $transferrate_in_MBs = sprintf("%0.2f", $stat->{scansize} / (time - $stat->{begin}+1) / 1024 / 1024); - print $progressbar->report("parse IE files: %40b running: %L ETA: %E ($count/$cnt_unsorted_files IEs, tfr=$transferrate_in_MBs MB/s) \r", ++$count); - my $ret = parse_iexml( $_, $recovery); - foreach my 
$fileobj (@{ $ret->{files} }) { - $fileobj->{file_mounted} = map_file($map_path, $fileobj->{filepath}); - $stat->{files}++; - my $result; - $result->{errors} = 0; - $result = check_if_file_exist($fileobj, $result); - if ($result->{exist}) { - # only if file exists, do additional checks - $result = check_file_size($fileobj, $result); - if ($result->{size}) { - $stat->{scansize} += $result->{size}; - $result = check_file_seekable($fileobj, $result); - if ($result->{seekable}) { - $result = check_file_fixities($fileobj, $result); - } - } - } - - if ($result->{errors} > 0) { - my $timestamp = strftime("%Y-%m-%d %H:%M:%S %z (%Z)", localtime(time)); - path($report_file)->append_utf8("$timestamp, IE $_ with following errors:\n"); - foreach my $errors (@{ $result->{error_description} }) { - path($report_file)->append_utf8("\t$errors\n"); - } - path($report_file)->append_utf8("-"x60,"\n"); - $stat->{errors} += $result->{errors}; - } - } - # write report - } - say ""; - $stat->{end} = time; - $stat->{duration} = $stat->{end} - $stat->{begin}; - my $human_readable_size; - my $fac_tera = 1024*1024*1024*1024; - my $fac_giga = 1024*1024*1024; - my $fac_mega = 1024*1024; - my $fac_kilo = 1024; - if ($stat->{scansize} > $fac_tera) { $human_readable_size = sprintf "%03.1fTB", $stat->{scansize} / $fac_tera} - elsif ($stat->{scansize} > $fac_giga) {$human_readable_size = sprintf "%03.1fGB", $stat->{scansize} / $fac_giga} - elsif ($stat->{scansize} > $fac_mega) {$human_readable_size = sprintf "%03.1fMB", $stat->{scansize} / $fac_mega} - elsif ($stat->{scansize} > $fac_kilo) {$human_readable_size = sprintf "%03.1fkB", $stat->{scansize} / $fac_kilo} - else {$human_readable_size = $stat->{scansize} . 
"B";} - path($report_file)->append_utf8("="x60,"\n"); - my $summary = sprintf ("Scanned %d IEs with %d files (%s) in %d seconds, found %d errors", - $stat->{IEs}, - $stat->{files}, - $human_readable_size, - $stat->{duration}, - $stat->{errors} - ); - path($report_file)->append_utf8("$summary\n"); - say $summary; + my $report_path = path( $report_file); + if ($stage eq "both" || $stage eq 'find' || $stage == 1) { + stage1($cache_file, $search_dir, $report_path); + } + if ($stage eq "both" || $stage eq 'check' || $stage == 2) { + stage2($cache_file, $report_path, $map_path, $recovery); + } } else { die "no directory given on commandline"; } @@ -457,6 +529,14 @@ The search_dir is the directory where the search starts. The search_dir could be The file where the report is stored. If the file exist, the report will be append without warnings. +=item B<--stage> + +Select the stage to run: "find" (or 1), "check" (or 2) or "both"; defaults to "both", which executes stage 1 "find" and then stage 2 "check" + +=item B<--cache> + +Store results of stage 1 "find" in the given file, so that stage 2 "check" can read them in a separate run; defaults to a temporary file in /tmp/ that is removed when the script ends + =back =cut -- GitLab