Skip to content
Snippets Groups Projects
Commit ffbb9e80 authored by Andreas Romeyke's avatar Andreas Romeyke
Browse files

- autoflush for stderr enabled

- in check_file_fixities() using Path::Tiny's openr_raw() instead of openr() to avoid an extra binmode() call
- extracted the common per-file work from the while-loop in stage2() into stage2_for_ie();
  this could reduce the memory footprint because of the reduced scope
- in stage2() we now close and reopen the fh instead of seeking
HINT: no real reason for the "out of memory" problem was found
parent 0a197336
No related branches found
No related tags found
No related merge requests found
......@@ -37,6 +37,7 @@ use IO::Handle;
use POSIX qw(strftime);
use Fcntl qw(SEEK_END SEEK_SET);
# enable autoflush on both standard output handles
STDOUT->autoflush(1);
STDERR->autoflush(1);
# guarantee that everything written to STDOUT is encoded as UTF-8
binmode(STDOUT, ":encoding(UTF-8)");
......@@ -253,6 +254,7 @@ sub check_file_seekable($$) {
if (!$result->{seekable}) {
add_error($fileobj, $result, "is not seekable, $!");
}
close($fh);
return $result;
}
......@@ -267,8 +269,7 @@ sub check_file_fixities($$) {
'SHA256' => Digest::SHA->new(256),
'SHA512' => Digest::SHA->new(512)
);
my $fh = path($fileobj->{file_mounted})->openr();
binmode($fh);
my $fh = path($fileobj->{file_mounted})->openr_raw();
my $buffer;
my $block=0;
my $blocksize = 128*1024; # 128kB blocks
......@@ -327,6 +328,49 @@ sub stage1 ($$$) {
return 1;
}
# stage2_for_ie($unseen, $bf, $map_path, $report_path, $stat)
#
# Runs the per-file checks for all not-yet-seen files of a single IE.
#
# Parameters:
#   $unseen      - array ref of file objects (hash refs) to check; each one
#                  must carry {fileid} and {filepath}
#   $bf          - Bloom filter remembering already checked file ids
#   $map_path    - passed through to map_file() together with {filepath}
#   $report_path - report object; results are appended via append_utf8()
#   $stat        - hash ref with the counters {files}, {scansize}, {errors},
#                  which are updated in place
#
# Returns the Bloom filter that is current after processing. This is a
# different object than the one passed in if a reset happened.
#
# When the filter reaches 80% of $capacity it is replaced by a fresh one.
# The replacement is written through the @_ alias of the second argument,
# so the caller's variable follows the reset. A plain `$bf = ...` on a
# private copy would leave the caller with the old, filled filter, and all
# ids added after the reset would be lost for the following IEs.
sub stage2_for_ie {
    my ($unseen, undef, $map_path, $report_path, $stat) = @_;
    # elements of @_ are aliases of the caller's arguments, therefore this
    # reference points at the caller's filter variable
    my $bf_ref = \$_[1];
    foreach my $fileobj (@{ $unseen }) {
        if (${$bf_ref}->key_count() >= (0.8 * $capacity)) { # reset Bloom filter if 80% filled
            ${$bf_ref} = Bloom::Filter->new(
                capacity   => $capacity, # we cache n last seen files
                error_rate => 0.0001,
            );
        }
        ${$bf_ref}->add( $fileobj->{fileid} );
        $fileobj->{file_mounted} = map_file($map_path, $fileobj->{filepath});
        $stat->{files}++;
        my $result;
        $result->{errors} = 0;
        $result->{report_path} = $report_path;
        $result = check_if_file_exist($fileobj, $result);
        if ($result->{exist}) {
            # only if file exists, do additional checks
            $result = check_file_size($fileobj, $result);
            if ($result->{size}) {
                $stat->{scansize} += $result->{size};
                $result = check_file_seekable($fileobj, $result);
                if ($result->{seekable}) {
                    $result = check_file_fixities($fileobj, $result);
                }
            }
        }
        # write report
        if ($result->{errors} > 0) {
            $report_path->append_utf8("-" x 60, "\n");
            $stat->{errors} += $result->{errors};
        } else { # no errors
            $report_path->append_utf8(" none\n");
        }
    }
    return ${$bf_ref};
}
sub stage2 ($$$$) {
my $tmp_ies_unsorted_path = shift;
my $report_path = shift;
......@@ -337,7 +381,7 @@ sub stage2 ($$$$) {
while (<$fh_unsorted_file>) {
$cnt_unsorted_files++;
}
seek $fh_unsorted_file, 0, 0; # seek to first byte
close($fh_unsorted_file);
my $count = 0;
my $bf = Bloom::Filter->new(
capacity => $capacity, # we cache n last seen files
......@@ -352,57 +396,24 @@ sub stage2 ($$$$) {
$stat->{errors} = 0;
$stat->{scansize} = 0;
$stat->{begin} = time;
my $bfreset="";
my $prev_ie = "";
$fh_unsorted_file = $tmp_ies_unsorted_path->openr();
while (<$fh_unsorted_file>) {
my $actual_ie = $_;
# scan each IE
$stat->{IEs}++;
chomp;
my $transferrate_in_MBs = sprintf("%0.2f", $stat->{scansize} / (time - $stat->{begin} + 1) / 1024 / 1024);
my $bfusage = int($bf->key_count()*100/$capacity );
print $progressbar->report("parse IE files: %40b $bfreset running: %L ETA: %E ($count/$cnt_unsorted_files IEs, tfr=$transferrate_in_MBs MB/s, bfu=$bfusage%) \r", ++$count);
my $bfusage = int($bf->key_count() * 100 / $capacity);
print $progressbar->report("parse IE files: %40b running: %L ETA: %E ($count/$cnt_unsorted_files IEs, tfr=$transferrate_in_MBs MB/s, bfu=$bfusage%) \r", ++$count);
my $timestamp = strftime("%Y-%m-%d %H:%M:%S %z (%Z)", localtime(time));
$report_path->append_utf8("$timestamp, IE $_ with following errors:\n");
my $ret = parse_iexml($_, $recovery);
my $unseen = bloomfilter_to_unseen($bf, $ret->{files} );
if (scalar @{$unseen} == 0 ) { $report_path->append_utf8("skipped because files already checked using IE $prev_ie\n"); }
$prev_ie = $_;
foreach my $fileobj (@{ $unseen }) {
if ($bf->key_count() >= (0.8 * $capacity)) { # reset Bloom filter if 80% filled
#print "reset bloomfilter\n";
$bf = Bloom::Filter->new(
capacity => $capacity, # we cache n last seen files
error_rate => 0.0001,
);
$bfreset = "";
} else { $bfreset = "";}
$bf->add( $fileobj->{fileid} );
$fileobj->{file_mounted} = map_file($map_path, $fileobj->{filepath});
$stat->{files}++;
my $result;
$result->{errors} = 0;
$result->{report_path} = $report_path;
$result = check_if_file_exist($fileobj, $result);
if ($result->{exist}) {
# only if file exists, do additional checks
$result = check_file_size($fileobj, $result);
if ($result->{size}) {
$stat->{scansize} += $result->{size};
$result = check_file_seekable($fileobj, $result);
if ($result->{seekable}) {
$result = check_file_fixities($fileobj, $result);
}
}
}
if ($result->{errors} > 0) {
$report_path->append_utf8("-" x 60, "\n");
$stat->{errors} += $result->{errors};
} else { # no errors
$report_path->append_utf8(" none\n");
}
}
# write report
$report_path->append_utf8("$timestamp, IE $actual_ie with following errors:\n");
my $ret = parse_iexml($actual_ie, $recovery);
my $unseen = bloomfilter_to_unseen($bf, $ret->{files});
if (scalar @{$unseen} == 0) {$report_path->append_utf8("skipped because files already checked using IE $prev_ie\n");}
$prev_ie = $actual_ie;
###
stage2_for_ie($unseen, $bf, $map_path, $report_path, $stat);
}
say "";
$stat->{end} = time;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment