Skip to content
Snippets Groups Projects
Select Git revision
  • 826febd8a8df783e1f63de9d83809bcd8aabd87a
  • master default protected
  • sdk_and_event_update
  • remove_file_dublettes
4 results

metadata_DnxMoveFileOriginalPath.xml

Blame
  • deep_fixitycheck.pl 21.05 KiB
    #!/usr/bin/perl -w
    ###############################################################################
    # Author: Andreas Romeyke
    # SLUB Dresden, Department Longterm Preservation
    #
    # License: This script is freely available under the GNU General Public License V3.0 or higher,
    # see file LICENSE.txt for details.
    #
    # This script scans a given rosetta repository and checks the fixity deeply
    #
    # For each IE
    #   check if referenced file exists
    #   check if size equals stat file size
    #   foreach algorithm in crc32,md5,sha1,sha256,sha512
    #     check if written fixity equals fixity of file
    #
    # if error, report in output, but continue
    # report statistics at end
    
    use strict;
    use warnings;
    use utf8;
    use feature qw(say);
    use Carp;
    use Path::Tiny;
    use File::Find;
    use XML::LibXML;
    use Time::Progress;
    use XML::LibXML::XPathContext;
    use Getopt::Long::Complete qw(GetOptionsWithCompletion);
    use Digest::CRC;
    use Digest::MD5;
    use Digest::SHA;
    use Bloom::Filter;
    use Pod::Usage;
    use IO::Handle;
    use POSIX qw(strftime);
    use Fcntl qw(SEEK_END SEEK_SET);
    # Enable autoflush on both standard streams.
    for my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
    # Text printed to STDOUT is encoded as UTF-8.
    binmode STDOUT, ":encoding(UTF-8)";
    
    # Fixity algorithm names as they are used in Rosetta.
    my @algorithms = ('CRC32', 'MD5', 'SHA1', 'SHA256', 'SHA512');
    # Bloom filter capacity (n last seen files).
    my $capacity = 5000;
    
    # Scans a given directory for IEs and stores them in a temporary file.
    # There are two parts:
    # * first, detect the count of level-2 subdirs
    # * second, use this count to estimate the progress of the search for
    #   Rosetta-specific IE*.xml files
    # This helps if you have a very deep directory structure
    # (in our case the repository is organized in /repo/FN/YYYY/MM/DD/IE/).
    # The function returns the count of found IEs.
    sub searching_ie_files ($$) {
      my $dir = shift;
      my $tmp_ies_unsorted_path = shift;
      my $cnt_unsorted_files = 0;
      my $first_two_levels_of_dirs = 0;
      my $wanted_twolevel_dircount = sub {
        my $relpath = $File::Find::name;
        $relpath =~ s{^\Q$dir\E/?}{};
        my $depth = File::Spec->splitdir($relpath);
    
        $depth >= 2
            and $File::Find::prune = 1;
        if (-d $_) { $first_two_levels_of_dirs++;}
      };
      $tmp_ies_unsorted_path->spew_utf8("");
      find( $wanted_twolevel_dircount, $dir);
      my $progressbar=Time::Progress->new(min => 0, max => $first_two_levels_of_dirs, smoothing => 1);