#!/usr/bin/perl -w
use strict;
use warnings;
###############################################################################
# Author: Andreas Romeyke
# SLUB Dresden, Department Longterm Preservation
#
# scans a given repository and reports problematic IE XML files
#
# Usage: find_problematic_IEXML_files.pl REPOSITORY_DIR
#
# The script works in two passes:
#   1. walk REPOSITORY_DIR and spool the path of every file whose name
#      matches V<digits>-IE<digits>.xml into a temporary file,
#   2. try to parse each spooled file with XML::LibXML and remember the
#      ones that fail to load.
# Finally the paths of all files that could not be parsed are printed.
###############################################################################

use feature "say";
use Carp;
use File::Find;
use XML::LibXML;
use Time::Progress;
use Path::Tiny;
use IO::Handle;

# Progress lines are rewritten in place with "\r", so output must not be
# held back by buffering.
STDOUT->autoflush(1);

my $dir = shift @ARGV;
if ( !defined $dir || !-e $dir ) {
    die "usage: $0 REPOSITORY_DIR\n";
}

# Temporary directory holding the spool file; removed automatically
# (CLEANUP => 1) once $tmp_ies_dir is destroyed.
my $tmp_ies_dir = Path::Tiny->tempdir(
    TEMPLATE => "exitstrategy_XXXXXXXXXXX",
    CLEANUP  => 1
);
my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies");

# Open the spool file once in append mode. This guarantees that the file
# exists even when no IE file is found (so the later openr() cannot fail on
# a missing file) and avoids reopening it for every single match.
# NOTE(review): the paths are presumably spooled to disk rather than kept in
# an array to bound memory use on very large repositories - confirm.
my $tmp_ies_out = $tmp_ies_unsorted_file->opena();

my $maxcount  = 0;                       # number of IE XML files found
my $allcount  = 0;                       # number of all entries visited
my $sp        = Time::Progress->new();   # timer for the search pass
my @erroneous = ();                      # IE XML files that failed to parse

# check_xml($filename)
#
# Tries to load $filename with XML::LibXML. If loading throws, the filename
# is appended to @erroneous. Returns nothing.
sub check_xml {
    my ($filename) = @_;
    my $parsed_ok = eval {
        XML::LibXML->load_xml(
            location  => $filename,
            no_blanks => 1,
            compact   => 1
        );
        1;    # explicit success flag, independent of the state of $@
    };
    push @erroneous, $filename if !$parsed_ok;
    return;
} ## end sub check_xml

# process_sip()
#
# File::Find callback, invoked once per visited entry. Counts every entry
# and spools the full path ($File::Find::name) of those that look like an
# IE XML file, then refreshes the progress line of the search pass.
sub process_sip {
    my $file = $File::Find::name;
    $allcount++;
    if ( $file =~ m/V\d+-IE\d+\.xml$/ ) {
        print {$tmp_ies_out} $file, "\n"
            or croak "could not write to $tmp_ies_unsorted_file: $!";
        $maxcount++;
        print "find IE files $maxcount of $allcount files, "
            . $sp->report("%4l s \r");
    } ## end if ( $file =~ m/V\d+-IE\d+\.xml$/)

    return;
} ## end sub process_sip

say "searching IE files";

find( \&process_sip, $dir );

# Closing flushes all spooled paths to disk before they are read back.
close $tmp_ies_out
    or croak "could not close $tmp_ies_unsorted_file: $!";
print "\r";
say "checking IE files";
my $fh = $tmp_ies_unsorted_file->openr();
my $p  = Time::Progress->new(
    min => 0,
    max => $maxcount
);
my $i = 0;

while ( my $ie_file = <$fh> ) {
    print $p->report( "check IE files: %40b ETA: %E \r", $i++ );
    chomp $ie_file;
    check_xml($ie_file);
} ## end while ( my $ie_file = <$fh> )
close $fh
    or croak "could not close $tmp_ies_unsorted_file: $!";
print "\r";
say "done.";
foreach my $file (@erroneous) {
    say " $file";
}
1;