Select Git revision
metadata_DnxMoveFileOriginalPath.xml
deep_fixitycheck.pl 21.05 KiB
#!/usr/bin/perl -w
###############################################################################
# Author: Andreas Romeyke
# SLUB Dresden, Department Longterm Preservation
#
# License: This script is freely available under the GNU General Public License V3.0 or higher,
# see file LICENSE.txt for details.
#
# This script scans a given Rosetta repository and performs a deep fixity check
#
# For each IE:
#   check that each referenced file exists
#   check that the recorded size equals the file size reported by stat
#   for each algorithm in crc32, md5, sha1, sha256, sha512:
#     check that the recorded fixity equals the fixity of the file
#
# If an error occurs, report it in the output, but continue.
# Report statistics at the end.
use strict;
use warnings;
use utf8;
use feature qw(say);
use Carp;
use Path::Tiny;
use File::Find;
use XML::LibXML;
use Time::Progress;
use XML::LibXML::XPathContext;
use Getopt::Long::Complete qw(GetOptionsWithCompletion);
use Digest::CRC;
use Digest::MD5;
use Digest::SHA;
use Bloom::Filter;
use Pod::Usage;
use IO::Handle;
use POSIX qw(strftime);
use Fcntl qw(SEEK_END SEEK_SET);
# Switch both standard streams to autoflush mode, so that every write is
# flushed right away instead of being buffered.
for my $stream (\*STDOUT, \*STDERR) {
    $stream->autoflush(1);
}
# Guarantee that everything printed to STDOUT is encoded as UTF-8.
binmode STDOUT, ':encoding(UTF-8)';
# Names of the fixity algorithms as they are used in Rosetta.
my @algorithms = ('CRC32', 'MD5', 'SHA1', 'SHA256', 'SHA512');
# Bloom filter capacity (n last seen files).
my $capacity = 5_000;
# Scans a given directory for IEs and stores them in a temporary file.
# There are two parts:
# * first, count the directories in the first two levels of the given directory
# * second, use this count to estimate the progress of the search for
#   Rosetta-specific IE*.xml files
# This helps if you have a very deep directory structure
# (in our case the repository is organized as /repo/FN/YYYY/MM/DD/IE/).
# The function returns the count of found IEs.
sub searching_ie_files ($$) {
my $dir = shift;
my $tmp_ies_unsorted_path = shift;
my $cnt_unsorted_files = 0;
my $first_two_levels_of_dirs = 0;
my $wanted_twolevel_dircount = sub {
    # File::Find callback: counts the directories visited in the first two
    # levels below $dir and keeps File::Find from descending any deeper.

    # Path of the current entry with the leading $dir (and one slash) removed.
    (my $below_root = $File::Find::name) =~ s{^\Q$dir\E/?}{};

    # The number of remaining path components is the depth below $dir.
    my @components = File::Spec->splitdir($below_root);
    if (scalar(@components) >= 2) {
        # Deep enough: tell File::Find not to descend below this entry.
        $File::Find::prune = 1;
    }

    # Every directory that gets visited is counted, pruned ones included.
    $first_two_levels_of_dirs++ if -d $_;
};
$tmp_ies_unsorted_path->spew_utf8("");
find( $wanted_twolevel_dircount, $dir);
my $progressbar=Time::Progress->new(min => 0, max => $first_two_levels_of_dirs, smoothing => 1);