From a79401b94a07f8cb1e647ec5796e7921def161af Mon Sep 17 00:00:00 2001
From: Andreas Romeyke <art1@andreas-romeyke.de>
Date: Wed, 2 Sep 2020 11:03:21 +0200
Subject: [PATCH] - init

---
 perl/find_problematic_IEXML_files.pl | 76 ++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 perl/find_problematic_IEXML_files.pl

diff --git a/perl/find_problematic_IEXML_files.pl b/perl/find_problematic_IEXML_files.pl
new file mode 100644
index 0000000..5dd65e1
--- /dev/null
+++ b/perl/find_problematic_IEXML_files.pl
@@ -0,0 +1,76 @@
+#!/usr/bin/perl -w
+use strict;
+use warnings;
+###############################################################################
+# Author: Andreas Romeyke
+# SLUB Dresden, Department Longterm Preservation
+#
+# scans a given repository and reports problematic IE XML files
+###############################################################################
+
+use feature "say";
+use Carp;
+use File::Find;
+use XML::LibXML;
+use Time::Progress;
+use Path::Tiny;
+use IO::Handle;
+STDOUT->autoflush(1);    # progress output ends in "\r", so flush every print
+# Repository root to scan; stop with a usage hint instead of passing undef on.
+my $dir = shift @ARGV;
+die "usage: $0 <repository-directory>\n" if !defined $dir || $dir eq q{};
+# Temporary directory (removed automatically) that holds the list of IE files.
+my $tmp_ies_dir = Path::Tiny->tempdir(
+    TEMPLATE => "exitstrategy_XXXXXXXXXXX", CLEANUP => 1 );
+my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies");
+my ( $maxcount, $allcount ) = ( 0, 0 );    # IE files found / all entries visited
+my $sp         = Time::Progress->new();    # clock for the search phase
+my @errorneous = ();                       # files that failed the XML check
+
+# Attempt to parse the given file as XML. Nothing is done with the parsed
+# document; a file whose parse raises an error is appended to @errorneous.
+sub check_xml {
+    my ($xml_path) = @_;
+    my @parser_opts = ( location => $xml_path, no_blanks => 1, compact => 1 );
+    # A parse failure surfaces as an exception, which eval traps into $@.
+    eval { XML::LibXML->load_xml(@parser_opts); };
+    if ($@) {
+        push @errorneous, $xml_path;
+    }
+} ## end sub check_xml
+
+# File::Find callback: counts each visited entry and appends the path of every
+# IE XML file (name ends in V<digits>-IE<digits>.xml) to the temporary list.
+sub process_sip {    # no "()" prototype; prototypes do not apply to callbacks
+    my $file = $File::Find::name;
+    $allcount++;
+    return if $file !~ m/V\d+-IE\d+\.xml\z/;    # \z: reject trailing newline
+    $tmp_ies_unsorted_file->append( $file . "\n" );
+    $maxcount++;
+    print "find IE files $maxcount of $allcount files, "
+      . $sp->report("%4l s   \r");
+    return;
+} ## end sub process_sip
+# Phase 1: walk the repository; process_sip records every IE XML file it sees.
+say "searching IE files";
+find( \&process_sip, $dir );
+print "\r";
+
+# Phase 2: parse every recorded file and collect those that fail.
+say "checking IE files";
+# The list file only exists once something was appended to it; create it when
+# the search found nothing so that opening it below does not abort the run.
+$tmp_ies_unsorted_file->touch;
+my $fh = $tmp_ies_unsorted_file->openr();
+my $p  = Time::Progress->new( min => 0, max => $maxcount );
+my $i  = 0;
+while ( my $ie_file = <$fh> ) {
+    print $p->report( "check IE files: %40b  ETA: %E   \r", $i++ );
+    chomp $ie_file;
+    check_xml($ie_file);
+} ## end while ( my $ie_file = <$fh> )
+close $fh;
+print "\r";
+say "done.";
+say "   $_" for @errorneous;    # one indented line per unparsable file
+1;
-- 
GitLab