Skip to content
Snippets Groups Projects
Commit 1619cd6c authored by Andreas Romeyke's avatar Andreas Romeyke
Browse files

- init

parents
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/perl -w
###############################################################################
# Author: Andreas Romeyke
# SLUB Dresden, Department Longterm Preservation
#
# This script scans a given rosetta repository and checks the fixity deeply
#
# For each IE
# check if referenced file exist
# check if size equals stat file size
# foreach algorithm in crc32,md5,sha1,sha256,sha512
# check if written fixity equals fixity of file
#
# if error, report in output, but continue
# report statistics at end
#
# Usage: perl ./deep_fixitycheck.pl report_file.txt /permanent
use strict;
use warnings;
use feature qw(say);
use Carp;
use Path::Tiny;
use File::Find;
use File::Sort qw(sort_file);
use XML::LibXML;
use Time::Progress;
use XML::LibXML::XPathContext;
use Getopt::Long;
use constant DEBUG => 0; # no debug
use IO::Handle;
STDOUT->autoflush(1);
# guarantee, that output will be UTF8
binmode(STDOUT, ":encoding(UTF-8)");
my @algorithms = qw(CRC32 MD5 SHA1 SHA256 SHA512);
###############################################################################
# searching_ie_files($dir, $tmp_ies_unsorted_file)
#
# Walks $dir twice with File::Find:
#   1. a shallow pre-scan that counts the directories of the first two levels
#      below $dir; this count is only used as maximum of the progress bar,
#   2. a full scan that appends the full path of every plain file whose name
#      matches V<digits>-IE<digits>.xml (one path per line) to
#      $tmp_ies_unsorted_file.
#
# Parameters:
#   $dir                   - root directory to scan
#   $tmp_ies_unsorted_file - object providing an append() method
#                            (the caller passes a Path::Tiny file object)
#
# Returns: number of IE files that were appended.
###############################################################################
sub searching_ie_files ($$) {
    my $dir                   = shift;
    my $tmp_ies_unsorted_file = shift;

    # File::Find happens to load File::Spec, but do not rely on that.
    require File::Spec;

    my $cnt_unsorted_files       = 0;
    my $first_two_levels_of_dirs = 0;

    # File::Find reports names without the trailing slash of the start
    # directory. Strip it here as well, otherwise the prefix below does not
    # match the root entry and the root itself is treated as a deep path.
    ( my $base_dir = $dir ) =~ s{(?<=.)/+\z}{};

    # Number of path components of a found entry relative to the start
    # directory (0 for the start directory itself).
    my $depth_below_base = sub {
        my ($found_path) = @_;
        ( my $relpath = $found_path ) =~ s{^\Q$base_dir\E/?}{};
        my @components = File::Spec->splitdir($relpath);
        return scalar @components;
    };

    # Pass 1: count directories, but do not descend below the second level.
    my $wanted_twolevel_dircount = sub {
        if ( $depth_below_base->($File::Find::name) >= 2 ) {
            $File::Find::prune = 1;
        }
        if ( -d $_ ) {
            $first_two_levels_of_dirs++;
        }
        return;
    };
    find( $wanted_twolevel_dircount, $dir );

    my $progressbar = Time::Progress->new(
        min       => 0,
        max       => $first_two_levels_of_dirs,
        smoothing => 1,
    );
    my $dircount = 0;

    # Pass 2: collect IE files; advance the progress bar for every directory
    # that was counted in pass 1.
    # Note: setting $File::Find::prune for a plain file has no effect, so it
    # is not set in the file branch.
    my $wanted_process_sip = sub {
        if ( -f $_ && m/V\d+-IE\d+\.xml$/ ) {
            $tmp_ies_unsorted_file->append( $File::Find::name . "\n" );
            $cnt_unsorted_files++;
        }
        elsif ( -d $_ ) {
            if ( $depth_below_base->($File::Find::name) <= 2 ) {
                print $progressbar->report( "find IE files: %40b ETA: %E \r", $dircount++ );
            }
        }
        return;
    };
    find( $wanted_process_sip, $dir );
    say "";
    return $cnt_unsorted_files;
}
###############################################################################
#
# /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/dc:record[1]/dc:title[1]
# /mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[2]
# with id=label and value = LOCAL
# take the ID of that techMD from there (reference for the files)
#
# Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1]
#
###############################################################################
###############################################################################
# parse_iexml($filename, $recovery_flag)
#
# Parses one IE METS file and collects, for every file of the selected
# representation, the recorded path, file id, size and fixity values.
#
# Parameters:
#   $filename      - path of the IE XML file
#   $recovery_flag - passed as "recover" option to XML::LibXML->load_xml
#
# Returns a hash reference with the keys
#   filename - the given $filename
#   size     - result of -s on the IE XML file itself
#   MD5      - MD5 digest of the IE XML file itself
#   repid    - ID of the selected representation amdSec
#   files    - array reference of hash references with the keys
#              filepath, fileid, size and fixity (hash: algorithm => value;
#              '' if no value was found for an algorithm)
#
# Dies (croak) if the document contains no mets:amdSec element; parser
# errors from XML::LibXML are propagated unchanged.
###############################################################################
sub parse_iexml ($$) {
    my $filename      = shift;
    my $recovery_flag = shift;

    my $dom = XML::LibXML->load_xml(
        location  => $filename,
        recover   => $recovery_flag,
        no_blanks => 1,
        compact   => 1,
    );
    my $xp = XML::LibXML::XPathContext->new($dom);
    my @namespaces = (
        dnx         => "http://www.exlibrisgroup.com/dps/dnx",
        sru         => "http://www.loc.gov/zing/srw/",
        xsi         => "http://www.w3.org/2001/XMLSchema-instance",
        dc          => "http://purl.org/dc/elements/1.1/",
        mets        => "http://www.loc.gov/METS/",
        rosettamets => "http://www.exlibrisgroup.com/xsd/dps/rosettaMets",
        mods        => "http://www.loc.gov/mods/v3",
        ns2         => "http://dps.exlibris.com/",
        dv          => "http://dfg-viewer.de/",
        slub        => "http://slub-dresden.de/",
        archive     => "http://slub-dresden.de/slubarchiv",
        premis      => "info:lc/xmlns/premis-v2",
        mix         => "http://www.loc.gov/standards/mix/",
        xlink       => "http://www.w3.org/1999/xlink",
        xlin        => "http://www.w3.org/1999/xlink",
    );
    while ( my ( $prefix, $uri ) = splice @namespaces, 0, 2 ) {
        $xp->registerNs( $prefix, $uri );
    }

    ############################################
    # XPath expressions; all prefixed queries are evaluated through $xp so
    # that the prefixes registered above are used, independent of the
    # prefixes declared in the document itself.
    my $dnx_uri        = 'http://www.exlibrisgroup.com/dps/dnx';
    my $xpath_amdsecs  = '/mets:mets/mets:amdSec';
    my $xpath_xmldata  = 'mets:techMD/mets:mdWrap/mets:xmlData';
    my $xpath_filegrps = '/mets:mets/mets:fileSec/mets:fileGrp';
    my $xpath_flocat   = 'mets:file/mets:FLocat';
    # The label key is matched by local name, so it is found whether or not
    # the dnx elements are namespace-qualified (an unprefixed name test like
    # "dnx" would only match elements in no namespace).
    my $xpath_localreps = "$xpath_xmldata"
        . "/*[local-name()='dnx']/*[local-name()='section']"
        . "/*[local-name()='record']/*[local-name()='key'][\@id='label']";
    my $xpath_record   = "*[namespace-uri()='${dnx_uri}' and local-name()='record']";
    my $xpath_key_part = "namespace-uri()='${dnx_uri}' and local-name()='key'";
    my $xpath_size     = ".//*[$xpath_key_part and \@id='fileSizeBytes']";
    # one fixity lookup per algorithm, built once instead of once per file
    my %xpath_fixity_for = map {
        $_ => sprintf ".//%s[*[%s and \@id='fixityType']='%s']/*[%s and \@id='fixityValue']",
            $xpath_record, $xpath_key_part, $_, $xpath_key_part
    } @algorithms;

    ############################################
    # get right representation ID (has a dnx-section with <key id=label>LOCAL</key>)
    # FIXME: if only one representation exists (Qucosa), select this. If there
    # are more than one, use them with label LOCAL
    my @amdsec_nodes = $xp->findnodes($xpath_amdsecs);
    if ( !@amdsec_nodes ) {
        croak "no mets:amdSec element found in '$filename'";
    }
    my %amdsec_for;    # amdSec ID => first amdSec node carrying that ID
    my $repid = $amdsec_nodes[0]->findvalue('@ID');
    foreach my $node (@amdsec_nodes) {
        my $id = $node->findvalue('@ID');
        $amdsec_for{$id} //= $node;
        my $has_local_label = grep { $_->textContent eq 'LOCAL' } $xp->findnodes( $xpath_localreps, $node );
        if ($has_local_label) {
            $repid = $id;    # the last amdSec labelled LOCAL wins
        }
    }

    ############################################
    # get all files of the selected representation
    my @files;
    foreach my $filegrpnode ( $xp->findnodes($xpath_filegrps) ) {
        next if $filegrpnode->findvalue('@ADMID') ne $repid;
        foreach my $flocat_node ( $xp->findnodes( $xpath_flocat, $filegrpnode ) ) {
            my %fileobj = (
                filepath => $xp->findvalue( '@xlink:href', $flocat_node ),
                fileid   => $flocat_node->findvalue('../@ID'),
            );
            # technical metadata of this file: first xmlData below the amdSec
            # referenced by the ADMID of the enclosing mets:file
            my $fileadmid = $flocat_node->findvalue('../@ADMID');
            my $fileamdsec = $amdsec_for{$fileadmid};
            my ($fileamd_node) = defined $fileamdsec ? $xp->findnodes( $xpath_xmldata, $fileamdsec ) : ();

            # Without technical metadata nothing can be looked up; leave the
            # values empty so that the caller sees a mismatch instead of a crash.
            $fileobj{size} = defined $fileamd_node ? $fileamd_node->findvalue($xpath_size) : '';
            foreach my $fixity_algorithm (@algorithms) {
                my $checksum = defined $fileamd_node
                    ? $fileamd_node->findvalue( $xpath_fixity_for{$fixity_algorithm} )
                    : undef;
                $fileobj{fixity}->{$fixity_algorithm} = $checksum // '';
            }
            push @files, \%fileobj;
        }
    }

    my %ret = (
        filename => $filename,
        size     => -s $filename,
        MD5      => path($filename)->digest("MD5"),
        repid    => $repid,
        files    => \@files,
    );
    return \%ret;
}
###############################################################################
###############################################################################
############# main ############################################################
###############################################################################
###############################################################################
# Command line: <report_file> <directory>
#   report_file - text file to which detected problems are appended
#   directory   - root of the repository to scan
#
# For every IE file found below the directory the IE XML is parsed and every
# referenced file is checked for existence, recorded size and recorded fixity
# values. Problems are appended to the report file and processing continues;
# the collected statistics are printed at the end.
my $report_file = shift @ARGV;
my $recovery    = 1;
my $dir         = shift @ARGV;
use Data::Printer;    # p() prints the final statistics and the debug dumps

# Path::Tiny->digest hands the algorithm name to Digest->new. For the SHA
# family the Digest front end only maps the hyphenated names to the core
# module Digest::SHA, so translate the names used in the IE metadata.
# Names not listed here are passed through unchanged.
my %digest_name_for = (
    SHA1   => 'SHA-1',
    SHA256 => 'SHA-256',
    SHA512 => 'SHA-512',
);

if ( defined $dir && -d "$dir" ) {
    say "Preparing scan";
    say "searching IE files";
    my $tmp_ies_dir = Path::Tiny->tempdir( TEMPLATE => "deep_fixitycheck_XXXXXXXXXXX", CLEANUP => 0 );
    my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies");
    $tmp_ies_unsorted_file->touch();
    my $cnt_unsorted_files = searching_ie_files( $dir, $tmp_ies_unsorted_file );
    say "checking IEs";
    say $tmp_ies_unsorted_file->absolute()->stringify;
    my $fh_unsorted_file = $tmp_ies_unsorted_file->openr();
    my $count            = 0;
    my $progressbar      = Time::Progress->new( min => 0, max => $cnt_unsorted_files, smoothing => 1 );
    my $stat             = { IEs => 0, files => 0, errors => 0 };

  IE_FILE:
    while ( my $ie_file = <$fh_unsorted_file> ) {
        chomp $ie_file;
        $stat->{IEs}++;
        print $progressbar->report( "parse IE files: %40b ETA: %E \r", $count++ );

        # An unparsable IE must not abort the whole scan: report and continue.
        my $ret = eval { parse_iexml( $ie_file, $recovery ) };
        if ( !defined $ret ) {
            ( my $reason = "$@" ) =~ s/\s+\z//;
            path($report_file)->append_utf8(
                "IE $ie_file with following errors:\n",
                "\tIE could not be parsed: $reason\n",
            );
            $stat->{errors}++;
            next IE_FILE;
        }
        p($ret) if DEBUG;

        foreach my $fileobj ( @{ $ret->{files} } ) {
            $stat->{files}++;
            my $result = { errors => 0, error_description => [] };

            # Path::Tiny refuses empty paths, so guard against a missing href.
            my $recorded_path = $fileobj->{filepath} // '';
            my $filepath = length $recorded_path ? path($recorded_path) : undef;
            $result->{exist} = ( defined $filepath && $filepath->is_file() ) ? 1 : 0;

            if ( !$result->{exist} ) {
                # size and fixity cannot be determined for a missing file
                $result->{errors}++;
                push @{ $result->{error_description} }, "file $recorded_path does not exist nor readable";
            }
            else {
                # check size against the value recorded for this file
                # (stat is used because -s yields a false value for empty files)
                my $expected_size = $fileobj->{size} // '';
                $result->{size} = $filepath->stat->size;
                if ( $result->{size} ne $expected_size ) {
                    $result->{errors}++;
                    push @{ $result->{error_description} },
                        "file $filepath has size $result->{size} but $expected_size was expected";
                }

                # check every fixity value recorded for this file
                foreach my $fixity_algorithm (@algorithms) {
                    my $expected_fixity = $fileobj->{fixity}->{$fixity_algorithm} // '';
                    my $digest_name     = $digest_name_for{$fixity_algorithm} // $fixity_algorithm;
                    my $actual_fixity   = eval { $filepath->digest($digest_name) };
                    if ( !defined $actual_fixity ) {
                        ( my $reason = "$@" ) =~ s/\s+\z//;
                        $result->{errors}++;
                        push @{ $result->{error_description} },
                            "file $filepath: fixity for algorithm $fixity_algorithm could not be calculated: $reason";
                        next;
                    }
                    $result->{fixity}->{$fixity_algorithm} = $actual_fixity;
                    # hexadecimal digests are compared case-insensitively
                    if ( lc $actual_fixity ne lc $expected_fixity ) {
                        $result->{errors}++;
                        push @{ $result->{error_description} },
                            "file $filepath has fixity $actual_fixity for algorithm $fixity_algorithm, but $expected_fixity was expected";
                    }
                }
            }
            p($result) if DEBUG;

            # write report
            if ( $result->{errors} > 0 ) {
                path($report_file)->append_utf8("IE $ie_file with following errors:\n");
                foreach my $error ( @{ $result->{error_description} } ) {
                    path($report_file)->append_utf8("\t$error\n");
                }
                $stat->{errors} += $result->{errors};
            }
        }
    }
    say "";
    p($stat);
}
else {
    die "no directory given on commandline";
}
say "";
1;
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment