From 670756078e3dbd989027b616f516ee9dabec4fbf Mon Sep 17 00:00:00 2001 From: Andreas Romeyke <art1@andreas-romeyke.de> Date: Tue, 4 Jan 2022 19:39:00 +0100 Subject: [PATCH] - added bloomfilter to speedup file checks if shared between multiple IEs, see issue https://git.slub-dresden.de/digital-preservation/rosettadeepfixity/-/issues/1 --- deep_fixitycheck.pl | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/deep_fixitycheck.pl b/deep_fixitycheck.pl index 3f9db7b..095c163 100644 --- a/deep_fixitycheck.pl +++ b/deep_fixitycheck.pl @@ -19,6 +19,7 @@ use strict; use warnings; +use utf8; use feature qw(say); use Carp; use Path::Tiny; @@ -30,6 +31,7 @@ use Getopt::Long::Complete qw(GetOptionsWithCompletion); use Digest::CRC; use Digest::MD5; use Digest::SHA; +use Bloom::Filter; use Pod::Usage; use IO::Handle; use POSIX qw(strftime); @@ -39,6 +41,7 @@ STDOUT->autoflush(1); binmode(STDOUT, ":encoding(UTF-8)"); my @algorithms = qw(CRC32 MD5 SHA1 SHA256 SHA512); # used fixity algorithm names in Rosetta +my $capacity = 500; # bloomfilter capacity (n last seen files) # scans a given dir for IEs and stores in temporary file # There are two parts: @@ -293,6 +296,19 @@ sub check_file_fixities($$) { return $result; } +sub bloomfilter_to_unseen { + my $bf = shift; + my $files_ref = shift; + my @fileids = map { $_->{fileid} } @{ $files_ref }; + my @have_seen = $bf->check( @fileids ); + my @unseen; + for (my $idx = 0; $idx <= $#fileids; $idx++) { + next if $have_seen[$idx]; + push @unseen, $files_ref->[$idx]; + } + return \@unseen; +} + sub stage1 ($$$) { my $tmp_ies_unsorted_path = shift; my $search_dir = shift; @@ -315,7 +331,6 @@ sub stage2 ($$$$) { my $report_path = shift; my $map_path = shift; my $recovery = shift; - say "checking IEs"; my $fh_unsorted_file = $tmp_ies_unsorted_path->openr(); my $cnt_unsorted_files = 0; while (<$fh_unsorted_file>) { @@ -323,6 +338,12 @@ sub stage2 ($$$$) { } seek $fh_unsorted_file, 0, 0; # seek to first byte my $count = 0; + my $bf = Bloom::Filter->new( + capacity => $capacity, # we cache n last seen files + error_rate => 0.0001, + ); + say "using bloomfilter of length ".int($bf->length/8)." bytes"; + say "checking IEs"; my $progressbar = Time::Progress->new(min => 0, max => $cnt_unsorted_files, smoothing => 1); my $stat; $stat->{IEs} = 0; @@ -330,16 +351,31 @@ sub stage2 ($$$$) { $stat->{errors} = 0; $stat->{scansize} = 0; $stat->{begin} = time; + my $bfreset="⏸"; + my $prev_ie = ""; while (<$fh_unsorted_file>) { # scan each IE $stat->{IEs}++; chomp; my $transferrate_in_MBs = sprintf("%0.2f", $stat->{scansize} / (time - $stat->{begin} + 1) / 1024 / 1024); - print $progressbar->report("parse IE files: %40b running: %L ETA: %E ($count/$cnt_unsorted_files IEs, tfr=$transferrate_in_MBs MB/s) \r", ++$count); + my $bfusage = int($bf->key_count()*100/$capacity ); + print $progressbar->report("parse IE files: %40b $bfreset running: %L ETA: %E ($count/$cnt_unsorted_files IEs, tfr=$transferrate_in_MBs MB/s, bfu=$bfusage%) \r", ++$count); my $timestamp = strftime("%Y-%m-%d %H:%M:%S %z (%Z)", localtime(time)); $report_path->append_utf8("$timestamp, IE $_ with following errors:\n"); my $ret = parse_iexml($_, $recovery); - foreach my $fileobj (@{$ret->{files}}) { + my $unseen = bloomfilter_to_unseen($bf, $ret->{files} ); + if (scalar @{$unseen} == 0 ) { $report_path->append_utf8("skipped because files already checked using IE $prev_ie\n"); } + $prev_ie = $_; + foreach my $fileobj (@{ $unseen }) { + if ($bf->key_count() >= (0.8 * $capacity)) { # reset Bloom filter if 80% filled + #print "reset bloomfilter\n"; + $bf = Bloom::Filter->new( + capacity => $capacity, # we cache n last seen files + error_rate => 0.0001, + ); + $bfreset = "⏮"; + } else { $bfreset = "▶";} + $bf->add( $fileobj->{fileid} ); $fileobj->{file_mounted} = map_file($map_path, $fileobj->{filepath}); $stat->{files}++; my $result; -- GitLab