Skip to content
Snippets Groups Projects
Commit ed857cad authored by Andreas Romeyke's avatar Andreas Romeyke
Browse files

- stages could be running separately

- typofix
- added explicite cache-file
- added autocomplete support
parent 0391a5ae
No related branches found
No related tags found
No related merge requests found
......@@ -3,7 +3,7 @@
# Author: Andreas Romeyke
# SLUB Dresden, Department Longterm Preservation
#
# License: This script is free available under GNU Gneral Public License V3.0 or higher,
# License: This script is free available under GNU General Public License V3.0 or higher,
# see file LICENSE.txt for details.
#
# This script scans a given rosetta repository and checks the fixity deeply
......@@ -26,9 +26,8 @@ use File::Find;
use XML::LibXML;
use Time::Progress;
use XML::LibXML::XPathContext;
use Getopt::Long;
use constant DEBUG => 0; # no debug
use Digest::CRC;
use Getopt::Long::Complete qw(GetOptionsWithCompletion);
use Digest::MD5;
use Digest::SHA;
use Pod::Usage;
......@@ -50,7 +49,7 @@ my @algorithms = qw(CRC32 MD5 SHA1 SHA256 SHA512); # used fixity algorithm names
# The function returns the count of found IEs
sub searching_ie_files ($$) {
my $dir = shift;
my $tmp_ies_unsorted_file = shift;
my $tmp_ies_unsorted_path = shift;
my $cnt_unsorted_files = 0;
my $first_two_levels_of_dirs = 0;
my $wanted_twolevel_dircount = sub {
......@@ -69,7 +68,7 @@ sub searching_ie_files ($$) {
if (-f && m/V(\d+)-IE\d+\.xml$/) {
my $version = $1;
my $file=$File::Find::name;
$tmp_ies_unsorted_file -> append( $file."\n");
$tmp_ies_unsorted_path->append( $file."\n");
$cnt_unsorted_files++;
$File::Find::prune =1;
} elsif (-d ) {
......@@ -293,54 +292,35 @@ sub check_file_fixities($$) {
return $result;
}
###############################################################################
###############################################################################
############# main ############################################################
###############################################################################
###############################################################################
my $recovery = 1;
my $map_path;
my $search_dir;
my $report_file;
my $help;
GetOptions(
"recovery" => \$recovery,
"map_path=s" => \$map_path,
"search_dir=s" => \$search_dir,
"report=s" => \$report_file,
"help|?" => \$help
) or die "Try --help for usage information";
pod2usage(1) if $help;
if (!defined $map_path || length($map_path) < 1) {
die "map path is empty!";
}
if (!defined $search_dir || length($search_dir) < 1) {
die "search dir is empty!";
}
if (!defined $report_file || length($report_file) < 1) {
die "report file is empty!";
};
if (!path($map_path)->is_dir) {
die "map path $map_path does not exist!";
}
if (!path($search_dir)->is_dir) {
die "search dir $search_dir does not exist!";
}
if ($search_dir !~ m/^$map_path/) {
die "map_path $map_path should be part of search dir $search_dir!";
}
# Here starts the scanning
if (defined $search_dir && -d "$search_dir") {
# Stage 1 ("find"): scan $search_dir for IE files and write a one-line
# summary of the scan to the report.
#
# Arguments (positional):
#   $tmp_ies_unsorted_path - object the list of found IE files is written to;
#                            it is handed on to searching_ie_files() unchanged
#   $search_dir            - directory where the scan starts
#   $report_path           - object whose append_utf8() receives the summary
#
# Returns 1.
sub stage1 ($$$) {
    my $tmp_ies_unsorted_path = shift;
    my $search_dir = shift;
    my $report_path = shift;
    my $stat;
    $stat->{begin} = time;
    say "Preparing scan";
    say "searching IE files";
    # The list of IEs goes only into the path supplied by the caller. No
    # private temporary file is created here, so the tree is walked exactly
    # once and a separately started stage 2 can read the same list.
    my $cnt_unsorted_files = searching_ie_files($search_dir, $tmp_ies_unsorted_path);
    say "Scan finished";
    $stat->{end} = time;
    $stat->{duration} = $stat->{end} - $stat->{begin};
    # Terminate the line so that later report entries start on a fresh line.
    $report_path->append_utf8("scanned $search_dir in $stat->{duration} seconds, found $cnt_unsorted_files IEs\n");
    return 1;
}
sub stage2 ($$$$) {
my $tmp_ies_unsorted_path = shift;
my $report_path = shift;
my $map_path = shift;
my $recovery = shift;
say "checking IEs";
my $fh_unsorted_file = $tmp_ies_unsorted_file->openr();
my $fh_unsorted_file = $tmp_ies_unsorted_path->openr();
my $cnt_unsorted_files = 0;
while (<$fh_unsorted_file>) {
$cnt_unsorted_files++;
}
seek $fh_unsorted_file, 0, 0; # seek to first byte
my $count = 0;
my $progressbar = Time::Progress->new(min => 0, max => $cnt_unsorted_files, smoothing => 1);
my $stat;
......@@ -349,7 +329,8 @@ if (defined $search_dir && -d "$search_dir") {
$stat->{errors} = 0;
$stat->{scansize} = 0;
$stat->{begin} = time;
while ( <$fh_unsorted_file>) { # scan each IE
while (<$fh_unsorted_file>) {
# scan each IE
$stat->{IEs}++;
chomp;
my $transferrate_in_MBs = sprintf("%0.2f", $stat->{scansize} / (time - $stat->{begin} + 1) / 1024 / 1024);
......@@ -375,11 +356,11 @@ if (defined $search_dir && -d "$search_dir") {
if ($result->{errors} > 0) {
my $timestamp = strftime("%Y-%m-%d %H:%M:%S %z (%Z)", localtime(time));
path($report_file)->append_utf8("$timestamp, IE $_ with following errors:\n");
$report_path->append_utf8("$timestamp, IE $_ with following errors:\n");
foreach my $errors (@{$result->{error_description}}) {
path($report_file)->append_utf8("\t$errors\n");
$report_path->append_utf8("\t$errors\n");
}
path($report_file)->append_utf8("-"x60,"\n");
$report_path->append_utf8("-" x 60, "\n");
$stat->{errors} += $result->{errors};
}
}
......@@ -398,7 +379,7 @@ if (defined $search_dir && -d "$search_dir") {
elsif ($stat->{scansize} > $fac_mega) {$human_readable_size = sprintf "%03.1fMB", $stat->{scansize} / $fac_mega}
elsif ($stat->{scansize} > $fac_kilo) {$human_readable_size = sprintf "%03.1fkB", $stat->{scansize} / $fac_kilo}
else {$human_readable_size = $stat->{scansize} . "B";}
path($report_file)->append_utf8("="x60,"\n");
$report_path->append_utf8("=" x 60, "\n");
my $summary = sprintf("Scanned %d IEs with %d files (%s) in %d seconds, found %d errors",
$stat->{IEs},
$stat->{files},
......@@ -406,8 +387,99 @@ if (defined $search_dir && -d "$search_dir") {
$stat->{duration},
$stat->{errors}
);
path($report_file)->append_utf8("$summary\n");
$report_path->append_utf8("$summary\n");
say $summary;
say "Checking finished";
return 1;
}
###############################################################################
###############################################################################
############# main ############################################################
###############################################################################
###############################################################################
# Command line handling and stage dispatch.
#
# Options: --recovery, --map_path, --search_dir, --report, --stage, --cache
# and --help. After validation, stage 1 ("find") and/or stage 2 ("check")
# are run depending on --stage.
#
# NOTE(review): "recovery" is a plain flag whose default is already 1, so it
# cannot be switched off from the command line -- confirm whether a negatable
# "recovery!" was intended.
my $recovery = 1;
my $map_path;
my $search_dir;
my $report_file;
my $help;
my $stage = 'both'; # or 'find' or 'check' (aliases: '1' and '2')
my $cache_str = "";
my $cache_file;

# Directory used as starting point when completing the value of an option.
# The keys must be spelled exactly like the option names declared below.
my %completion_root_for = (
    '--search_dir' => '/permanent',
    '--map_path'   => '/permanent',
    '--cache'      => '/tmp',
);

GetOptionsWithCompletion(
    sub {
        my %args = @_;
        my $word = $args{word}; # the word to be completed
        my $opt  = $args{opt};  # can be an array of options if ambiguous, e.g. ['--on-fail', '--on-full']
        # Decline explicitly unless exactly one known option is being completed.
        return undef if !defined $opt || ref $opt;
        my $root = $completion_root_for{$opt};
        return undef if !defined $root;
        # NOTE(review): Complete::File is called fully qualified; no "use" for
        # it is visible in this part of the file -- confirm it is loaded.
        return Complete::File::complete_dir(starting_path=>$root, recurse=>1, word=>$word);
    },
    "recovery"     => \$recovery,
    "map_path=s"   => \$map_path,
    "search_dir=s" => \$search_dir,
    "report=s"     => \$report_file,
    "stage:s"      => \$stage,
    "cache:s"      => \$cache_str,
    "help|?|h"     => \$help,
) or die "Try --help for usage information";
pod2usage(1) if $help;

# --- mandatory options -------------------------------------------------------
if (!defined $map_path || length($map_path) < 1) {
    die "map path is empty!";
}
if (!defined $search_dir || length($search_dir) < 1) {
    die "search dir is empty!";
}
if (!defined $report_file || length($report_file) < 1) {
    die "report file is empty!";
}
if (!path($map_path)->is_dir) {
    die "map path $map_path does not exist!";
}
if (!path($search_dir)->is_dir) {
    die "search dir $search_dir does not exist!";
}
# Literal prefix test: \Q...\E keeps characters such as '.', '+' or '(' in
# the path from being interpreted as regex syntax.
if ($search_dir !~ m/\A\Q$map_path\E/) {
    die "map_path $map_path should be part of search dir $search_dir!";
}

# --- cache file holding the list of IEs found by stage 1 ---------------------
if ($cache_str eq "") {
    $cache_file = Path::Tiny->tempfile(TEMPLATE => "deep_fixitycheck_XXXXXXXXXXX", CLEANUP => 1);
} else {
    $cache_file = Path::Tiny::path($cache_str)->absolute();
    $cache_file->touch();
    if (!$cache_file->is_file) {
        die "cache file $cache_str does not exist!";
    }
}

# --- stage selection ---------------------------------------------------------
if (!defined $stage || length($stage) < 1) {
    die "stage is empty!";
}
# Anchored, so that only the exact keywords are accepted.
if ($stage !~ m/\A(?:both|find|check|1|2)\z/ ) {
    die "stage $stage is unknown";
}

# Here starts the scanning
if (defined $search_dir && -d "$search_dir") {
    my $report_path = path( $report_file);
    # $stage is a string; compare with eq to avoid "isn't numeric" warnings.
    if ($stage eq "both" || $stage eq 'find' || $stage eq '1') {
        stage1($cache_file, $search_dir, $report_path);
    }
    if ($stage eq "both" || $stage eq 'check' || $stage eq '2') {
        stage2($cache_file, $report_path, $map_path, $recovery);
    }
} else {
    die "no directory given on commandline";
}
......@@ -457,6 +529,14 @@ The search_dir is the directory where the search starts. The search_dir could be
The file where the report is stored. If the file exists, the report is appended to it without warning.
=item B<--stage>
Select the stage to run: "find" (or 1), "check" (or 2) or "both". Defaults to "both", which executes stage 1 "find" and then stage 2 "check".
=item B<--cache>
Store results of stage 1 "find" in given file, defaults to a temporary file in /tmp/
=back
=cut
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment