diff --git a/deep_fixitycheck.pl b/deep_fixitycheck.pl index 8aa255795270a577c9506256d1928bbe77538e99..d98d3b72d1b9bb80c55bc55b69b00678166686ef 100644 --- a/deep_fixitycheck.pl +++ b/deep_fixitycheck.pl @@ -37,6 +37,7 @@ use IO::Handle; use POSIX qw(strftime); use Fcntl qw(SEEK_END SEEK_SET); STDOUT->autoflush(1); +STDERR->autoflush(1); # guarantee, that output will be UTF8 binmode(STDOUT, ":encoding(UTF-8)"); @@ -253,6 +254,7 @@ sub check_file_seekable($$) { if (!$result->{seekable}) { add_error($fileobj, $result, "is not seekable, $!"); } + close($fh); return $result; } @@ -267,8 +269,7 @@ sub check_file_fixities($$) { 'SHA256' => Digest::SHA->new(256), 'SHA512' => Digest::SHA->new(512) ); - my $fh = path($fileobj->{file_mounted})->openr(); - binmode($fh); + my $fh = path($fileobj->{file_mounted})->openr_raw(); my $buffer; my $block=0; my $blocksize = 128*1024; # 128kB blocks @@ -327,6 +328,49 @@ sub stage1 ($$$) { return 1; } +sub stage2_for_ie { + my $unseen = shift; + my $bf = shift; + my $map_path = shift; + my $report_path = shift; + my $stat = shift; + foreach my $fileobj (@{ $unseen }) { + if ($bf->key_count() >= (0.8 * $capacity)) { # reset Bloom filter if 80% filled + #print "reset bloomfilter\n"; + $bf = Bloom::Filter->new( + capacity => $capacity, # we cache n last seen files + error_rate => 0.0001, + ); + } + $bf->add( $fileobj->{fileid} ); + $fileobj->{file_mounted} = map_file($map_path, $fileobj->{filepath}); + $stat->{files}++; + my $result; + $result->{errors} = 0; + $result->{report_path} = $report_path; + $result = check_if_file_exist($fileobj, $result); + if ($result->{exist}) { + # only if file exists, do additional checks + $result = check_file_size($fileobj, $result); + if ($result->{size}) { + $stat->{scansize} += $result->{size}; + $result = check_file_seekable($fileobj, $result); + if ($result->{seekable}) { + $result = check_file_fixities($fileobj, $result); + } + } + } + + if ($result->{errors} > 0) { + $report_path->append_utf8("-" x 60, "\n"); + $stat->{errors} += $result->{errors}; + } else { # no errors + $report_path->append_utf8(" none\n"); + } + } + # write report +} + sub stage2 ($$$$) { my $tmp_ies_unsorted_path = shift; my $report_path = shift; @@ -337,7 +381,7 @@ sub stage2 ($$$$) { while (<$fh_unsorted_file>) { $cnt_unsorted_files++; } - seek $fh_unsorted_file, 0, 0; # seek to first byte + close($fh_unsorted_file); my $count = 0; my $bf = Bloom::Filter->new( capacity => $capacity, # we cache n last seen files @@ -352,57 +396,24 @@ sub stage2 ($$$$) { $stat->{errors} = 0; $stat->{scansize} = 0; $stat->{begin} = time; - my $bfreset="⏸"; my $prev_ie = ""; + $fh_unsorted_file = $tmp_ies_unsorted_path->openr(); while (<$fh_unsorted_file>) { + my $actual_ie = $_; # scan each IE $stat->{IEs}++; chomp; my $transferrate_in_MBs = sprintf("%0.2f", $stat->{scansize} / (time - $stat->{begin} + 1) / 1024 / 1024); - my $bfusage = int($bf->key_count()*100/$capacity ); - print $progressbar->report("parse IE files: %40b $bfreset running: %L ETA: %E ($count/$cnt_unsorted_files IEs, tfr=$transferrate_in_MBs MB/s, bfu=$bfusage%) \r", ++$count); + my $bfusage = int($bf->key_count() * 100 / $capacity); + print $progressbar->report("parse IE files: %40b running: %L ETA: %E ($count/$cnt_unsorted_files IEs, tfr=$transferrate_in_MBs MB/s, bfu=$bfusage%) \r", ++$count); my $timestamp = strftime("%Y-%m-%d %H:%M:%S %z (%Z)", localtime(time)); - $report_path->append_utf8("$timestamp, IE $_ with following errors:\n"); - my $ret = parse_iexml($_, $recovery); - my $unseen = bloomfilter_to_unseen($bf, $ret->{files} ); - if (scalar @{$unseen} == 0 ) { $report_path->append_utf8("skipped because files already checked using IE $prev_ie\n"); } - $prev_ie = $_; - foreach my $fileobj (@{ $unseen }) { - if ($bf->key_count() >= (0.8 * $capacity)) { # reset Bloom filter if 80% filled - #print "reset bloomfilter\n"; - $bf = Bloom::Filter->new( - capacity => $capacity, # we cache n last seen files - error_rate => 0.0001, - ); - $bfreset = "⏮"; - } else { $bfreset = "▶";} - $bf->add( $fileobj->{fileid} ); - $fileobj->{file_mounted} = map_file($map_path, $fileobj->{filepath}); - $stat->{files}++; - my $result; - $result->{errors} = 0; - $result->{report_path} = $report_path; - $result = check_if_file_exist($fileobj, $result); - if ($result->{exist}) { - # only if file exists, do additional checks - $result = check_file_size($fileobj, $result); - if ($result->{size}) { - $stat->{scansize} += $result->{size}; - $result = check_file_seekable($fileobj, $result); - if ($result->{seekable}) { - $result = check_file_fixities($fileobj, $result); - } - } - } - - if ($result->{errors} > 0) { - $report_path->append_utf8("-" x 60, "\n"); - $stat->{errors} += $result->{errors}; - } else { # no errors - $report_path->append_utf8(" none\n"); - } - } - # write report + $report_path->append_utf8("$timestamp, IE $actual_ie with following errors:\n"); + my $ret = parse_iexml($actual_ie, $recovery); + my $unseen = bloomfilter_to_unseen($bf, $ret->{files}); + if (scalar @{$unseen} == 0) {$report_path->append_utf8("skipped because files already checked using IE $prev_ie\n");} + $prev_ie = $actual_ie; + ### + stage2_for_ie($unseen, $bf, $map_path, $report_path, $stat); } say ""; $stat->{end} = time;