- reintegrate patches from another free-floating repository:

- reorganized modules, tests and examples
- code clean-up: removed all trailing whitespaces
- improved memory usage by using filehandles
- added a recovery mode if AIPs have unclean IE-XML
- fix: increased field size for dc strings to prevent (…); fixed size to 8192

- reintegrate patches from another free-floating repository:
b3c344d5 · Andreas Romeyke · 9fb93a76 · b3c344d5
Commit b3c344d5 authored 5 years ago by Andreas Romeyke
--- a/perl/exit_strategy.pl
+++ b/perl/exit_strategy.pl
@@ -7,23 +7,51 @@
 # This is part of the exit-strategy for details, see asciidoc file
 # exit_strategie.asciidoc (also contains ER-diagram for database)
 #
+# call with:
+#
+# perl ./exit_strategy.pl rosetta_exit_strategy/tmp.sql /permanent/
+#
 # file tested with postgres-database
 #
-# using:  
+# then load the generated SQL file with:
 #         psql -U romeyke -d exit_strategy \
 #              -f rosetta_exit_strategy/tmp.sql -L rosetta_exit.log
 #
 ###############################################################################
+#
+# WARNING:
+#
+# the following messages only occur if you had an unclean SIP ingest process;
+# they mean that your IE-XML contains wrong/unused namespace declarations
+#
+# if parsing some AIPs fails with messages like:
+#
+# '/permanent_storage/normal/2017/07/05/IE1043/V1-IE1043.xml:6: 
+#  namespace error : xmlns:mods: 'http://www.loc.gov/mods/v3
+#   http://www.loc.gov/standards/mods/v3/mods-3-0.xsd' is not a valid URI
+#  s="http://www.loc.gov/mods/v3
+#   http://www.loc.gov/standards/mods/v3/mods-3-0.xsd"'
+#
+# then (and only then) try this:
+#
+# perl ./exit_strategy.pl --recovery rosetta_exit_strategy/tmp.sql /permanent/
+#
+###############################################################################
-use 5.28.0;
 use strict;
 use warnings;
+use feature "say";
 use Carp;
 use Path::Tiny;
 use File::Find;
 use File::Sort qw(sort_file);
-use XML::XPath;
+use XML::LibXML;
-use XML::XPath::XMLParser;
+use Time::Progress;
+use XML::LibXML::XPathContext;
+use Getopt::Long;
+use constant DEBUG => 0; # no debug
 # guarantee, that output will be UTF8
 binmode(STDOUT, ":encoding(UTF-8)");
@@ -38,90 +66,94 @@ my $sourcetype="hdd"; #default value
 #   if IE.xml file found, read its metadata, create SQL add entry
 #   write SQL add entry
 ###############################################################################
-sub write_database_creation {
+sub write_database_creation ($) {
+  my $fh = shift;
     # non standard conform SQL keywords
-     #say "CREATE DATABASE $db_name;";
+     #say $fh "CREATE DATABASE $db_name;";
-     #say "CREATE SCHEMA $schema_name;";
+     #say $fh "CREATE SCHEMA $schema_name;";
-     #say "USE ";
+     #say $fh "USE ";
+  return;
 }
 # write tables creation;:
-sub write_tables_creation {
+sub write_tables_creation ($) {
+  my $fh = shift;
  # Transactions for tables creation
-  say "BEGIN;";
+  say $fh "BEGIN;";
  # SEQUENCE
-  say "/* create SEQUENCE generator */";
+  say $fh "/* create SEQUENCE generator */";
-  say "CREATE SEQUENCE serial START 1;";
+  say $fh "CREATE SEQUENCE serial START 1;";
  # AIP
-  say "/* create AIP table */";
+  say $fh "/* create AIP table */";
-  say "CREATE TABLE aip (";
+  say $fh "CREATE TABLE aip (";
-  say "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
+  say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
-  say "\tie_id VARCHAR(30) NOT NULL UNIQUE";
+  say $fh "\tie_id VARCHAR(30) NOT NULL UNIQUE";
-  say ");";
+  say $fh ");";
  # IEFILE
-  say "/* create IEFILE table */";
+  say $fh "/* create IEFILE table */";
-  say "CREATE TABLE metadatafile (";
+  say $fh "CREATE TABLE metadatafile (";
-  say "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
+  say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
-  say "\taip_id INT NOT NULL REFERENCES aip (id),";
+  say $fh "\taip_id INT NOT NULL REFERENCES aip (id),";
-  say "\tlocation VARCHAR(1024) NOT NULL,";
+  say $fh "\tlocation VARCHAR(1024) NOT NULL,";
-  say "\tsourcetype VARCHAR(30) NOT NULL";
+  say $fh "\tsourcetype VARCHAR(30) NOT NULL";
-  say ");";
+  say $fh ");";
  # DC
-  say "/* create DC table */";
+  say $fh "/* create DC table */";
-  say "CREATE TABLE dc (";
+  say $fh "CREATE TABLE dc (";
-  say "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
+  say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
-  say "\taip_id INT NOT NULL REFERENCES aip (id),";
+  say $fh "\taip_id INT NOT NULL REFERENCES aip (id),";
-  say "\telement VARCHAR(30) NOT NULL,";
+  say $fh "\telement VARCHAR(30) NOT NULL,";
-  say "\tvalue VARCHAR(8192) NOT NULL";
+  say $fh "\tvalue VARCHAR(8192) NOT NULL";
-  say ");";
+  say $fh ");";
  # FILE
-  say "/* create FILE table */";      
+  say $fh "/* create FILE table */";
-  say "CREATE TABLE sourcedatafile (";
+  say $fh "CREATE TABLE sourcedatafile (";
-  say "\tid INT PRIMARY KEY DEFAULT nextval('serial'), ";
+  say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'), ";
-  say "\taip_id INT NOT NULL REFERENCES aip (id),";
+  say $fh "\taip_id INT NOT NULL REFERENCES aip (id),";
-  say "\tname VARCHAR(1024) NOT NULL";
+  say $fh "\tname VARCHAR(1024) NOT NULL";
-  say ");";
+  say $fh ");";
  # LOCAT
-  say "/* create LOCAT table */";            
+  say $fh "/* create LOCAT table */";
-  say "CREATE TABLE sourcedatalocat (";
+  say $fh "CREATE TABLE sourcedatalocat (";
-  say "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
+  say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
-  say "\tfile_id INT NOT NULL REFERENCES sourcedatafile (id),";
+  say $fh "\tfile_id INT NOT NULL REFERENCES sourcedatafile (id),";
-  say "\tlocation VARCHAR(1024) NOT NULL,";
+  say $fh "\tlocation VARCHAR(1024) NOT NULL,";
-  say "\tsourcetype VARCHAR(30) NOT NULL";
+  say $fh "\tsourcetype VARCHAR(30) NOT NULL";
-  say ");";
+  say $fh ");";
  #end transaction
-  say "COMMIT;";
+  say $fh "COMMIT;";
  return;
 }
 ###############################################################################
 # Prepare SQL INSERT Statements for AIPs
 ###############################################################################
-sub write_prepare_insert {
+sub write_prepare_insert ($) {
-  say "BEGIN;";
+  my $fh = shift;
-  say "PREPARE aip_plan (varchar) AS";
+  say $fh "BEGIN;";
-  say "  INSERT INTO aip (ie_id) VALUES (\$1);";
+  say $fh "PREPARE aip_plan (varchar) AS";
-  say "PREPARE ie_plan (varchar, varchar, varchar) AS";
+  say $fh "  INSERT INTO aip (ie_id) VALUES (\$1);";
-  say "  INSERT INTO metadatafile (aip_id, location, sourcetype) VALUES (";
+  say $fh "PREPARE ie_plan (varchar, varchar, varchar) AS";
-  say "    (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3";
+  say $fh "  INSERT INTO metadatafile (aip_id, location, sourcetype) VALUES (";
-  say "  );";
+  say $fh "    (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3";
-  say "PREPARE file_plan (varchar, varchar) AS";
+  say $fh "  );";
-  say "  INSERT INTO sourcedatafile (aip_id, name) VALUES (";
+  say $fh "PREPARE file_plan (varchar, varchar) AS";
-  say "    (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2";
+  say $fh "  INSERT INTO sourcedatafile (aip_id, name) VALUES (";
-  say "  );";
+  say $fh "    (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2";
-  say "PREPARE locat_plan (varchar, varchar, varchar, varchar) AS";
+  say $fh "  );";
-  say "  INSERT INTO sourcedatalocat (file_id, location, sourcetype) VALUES (";
+  say $fh "PREPARE locat_plan (varchar, varchar, varchar, varchar) AS";
-  say "    (SELECT sourcedatafile.id FROM sourcedatafile,aip WHERE";
+  say $fh "  INSERT INTO sourcedatalocat (file_id, location, sourcetype) VALUES (";
-  say "    sourcedatafile.aip_id=aip.id AND aip.ie_id=\$1 AND";
+  say $fh "    (SELECT sourcedatafile.id FROM sourcedatafile,aip WHERE";
-  say "    sourcedatafile.name=\$2), \$3, \$4";
+  say $fh "    sourcedatafile.aip_id=aip.id AND aip.ie_id=\$1 AND";
-  say "  );";
+  say $fh "    sourcedatafile.name=\$2), \$3, \$4";
-  say "PREPARE dc_plan (varchar, varchar, varchar) AS";
+  say $fh "  );";
-  say "  INSERT INTO dc (aip_id, element, value) VALUES (";
+  say $fh "PREPARE dc_plan (varchar, varchar, varchar) AS";
-  say "    (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3";
+  say $fh "  INSERT INTO dc (aip_id, element, value) VALUES (";
-  say "  );";
+  say $fh "    (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3";
-  say "COMMIT;";
+  say $fh "  );";
+  say $fh "COMMIT;";
  return;
 }
@@ -147,28 +179,28 @@ sub write_prepare_insert {
 #     $ret{"files"} = \@files;
 #     $ret{"dcrecords"} = \@dcrecords;
 ###############################################################################
-sub write_addsql {
+sub write_addsql ($$) {
-  my $refhash = $_[0];
+  my $fh = shift;
+  my $refhash = shift;
  my $ieid = path($refhash->{"filename"})->basename(qw/.xml/);
-  say "BEGIN;";
+  say $fh "BEGIN;";
-  say "EXECUTE aip_plan ('$ieid');";
+  say $fh "EXECUTE aip_plan ('$ieid');";
  # FIXME if multiple locations exists
  my $iefile = path($refhash->{"filename"})->basename();
-  say "EXECUTE ie_plan ('$ieid', '$iefile', '$sourcetype');";
+  say $fh "EXECUTE ie_plan ('$ieid', '$iefile', '$sourcetype');";
  foreach my $location (@{$refhash->{"files"}}) {
    my $file = path($location)->basename(); # FIXME if multiple locations
-    say "EXECUTE file_plan ('$ieid', '$file');";
+    say $fh "EXECUTE file_plan ('$ieid', '$file');";
-    say "EXECUTE locat_plan ('$ieid', '$file', '$location', '$sourcetype' );";
+    say $fh "EXECUTE locat_plan ('$ieid', '$file', '$location', '$sourcetype' );";
  }
  foreach my $dcpair   (@{$refhash->{"dcrecords"}}) {
    my ($dckey,$dcvalue) = @{$dcpair};
    # quote ' in dcvalue
    $dcvalue=~tr/'/"/;
-    say "EXECUTE dc_plan ( '$ieid', '$dckey', '$dcvalue');";
+    say $fh "EXECUTE dc_plan ( '$ieid', '$dckey', '$dcvalue');";
  }
-  say "COMMIT;";
+  say $fh "COMMIT;";
-  say "\n"; 
+  say $fh "\n";
  return;
 }
@@ -177,10 +209,11 @@ sub write_addsql {
 ###############################################################################
 # add INDEX and other TRICKs to increase performance
 ###############################################################################
-sub write_index_creation() {
+sub write_index_creation($) {
-  say "-- BEGIN;";
+  my $fh = shift;
-  say "-- CREATE UNIQUE INDEX aip_index on aip (ie_id);";
+  say $fh "-- BEGIN;";
-  say "-- COMMIT;";
+  say $fh "-- CREATE UNIQUE INDEX aip_index on aip (ie_id);";
+  say $fh "-- COMMIT;";
  return;
 }
@@ -210,20 +243,42 @@ sub check_if_db_conform ($$) {
 # Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1]
 #
 ###############################################################################
-sub parse_iexml {
+sub parse_iexml ($$) {
-  my $filename = $_[0];
+  my $filename = shift;
+  my $recovery_flag = shift;
    # create object
-  my $xp = XML::XPath->new (filename => $filename);
+    #
+    #my $xp = XML::XPath->new (filename => $filename);
+  my $dom = XML::LibXML->load_xml (location => $filename, recover => $recovery_flag, no_blanks=>1, compact=>1);
+  my $xp = XML::LibXML::XPathContext->new($dom);
+  $xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx");
+  $xp->registerNs("sru", "http://www.loc.gov/zing/srw/");
+  $xp->registerNs("xsi", "http://www.w3.org/2001/XMLSchema-instance");
+  $xp->registerNs("dc", "http://purl.org/dc/elements/1.1/");
+  $xp->registerNs("mets", "http://www.loc.gov/METS/");
+  $xp->registerNs("rosettamets", "http://www.exlibrisgroup.com/xsd/dps/rosettaMets");
+  $xp->registerNs("mods", "http://www.loc.gov/mods/v3");
+  $xp->registerNs("ns2", "http://dps.exlibris.com/");
+  $xp->registerNs("dv", "http://dfg-viewer.de/");
+  $xp->registerNs("slub", "http://slub-dresden.de/");
+  $xp->registerNs("archive", "http://slub-dresden.de/slubarchiv");
+  $xp->registerNs("premis", "info:lc/xmlns/premis-v2");
+  $xp->registerNs("mix", "http://www.loc.gov/standards/mix/");
+  $xp->registerNs("xlink", "http://www.w3.org/1999/xlink");
+  $xp->registerNs("xlin", "http://www.w3.org/1999/xlink");
  ############################################
  # get title
-  my $title = $xp->findvalue('/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]');
+  my $compiled_xpath_titles = '/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]';
+  my $title = $xp->findvalue($compiled_xpath_titles);
  check_if_db_conform($title, $filename);
  ############################################
  # get dc-records
  my @dcrecords;
-  my $dcnodes = $xp->find('/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*');
+  my $compiled_xpath_dcrecords='/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*';
+  my $dcnodes = $xp->find($compiled_xpath_dcrecords);
  foreach my $dcnode ($dcnodes->get_nodelist) {
-    my $key = $dcnode->getName(".");
+    my $key = $dcnode->getName();
    my $value = $dcnode->findvalue(".");
    if (defined $value) {
      $value=~s/\n/ /g;
@@ -237,7 +292,12 @@ sub parse_iexml {
  }
  ############################################
  # get right representation ID (has a dnx-section with <key id=label>LOCAL</key>)
-  my $repids = $xp->find('/mets:mets/mets:amdSec');
+  my $compiled_xpath_amdsecs = '/mets:mets/mets:amdSec';
+  my $compiled_xpath_localreps = 'mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']';
+  my $compiled_xpath_filegrps = '/mets:mets/mets:fileSec/mets:fileGrp';
+  my $compiled_xpath_flocat = 'mets:file/mets:FLocat';
+  my $repids = $xp->find($compiled_xpath_amdsecs);
  my $repid;
  # FIXME: if only one represenation exists (Qucosa), select this. If there
  # are more than one, use them with label LOCAL
@@ -249,7 +309,8 @@ sub parse_iexml {
    check_if_db_conform($id, $filename);
    #/mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[1]
    #
-    if ($node->findvalue('mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']') eq 'LOCAL') {                   
+    if ($node->findvalue($compiled_xpath_localreps) eq 'LOCAL') {
      $repid=$id;
    }
    #print XML::XPath::XMLParser::as_string($node), "\n\n";
@@ -257,13 +318,13 @@ sub parse_iexml {
  ############################################
  # get all files of LOCAL representation
  my @files;
-  my $filegrpnodes = $xp->find('/mets:mets/mets:fileSec/mets:fileGrp');
+  my $filegrpnodes = $xp->find($compiled_xpath_filegrps);
  foreach my $filegrpnode ($filegrpnodes->get_nodelist) {
    #die XML::XPath::XMLParser::as_string($filegrpnode), "\n\n";
    #die Dumper($filegrpnode);
    if ($filegrpnode->findvalue('@ADMID') eq $repid) {
      #die Dumper($filegrpnode);
-      my $filesnodes = $filegrpnode ->find("mets:file/mets:FLocat");
+      my $filesnodes = $filegrpnode ->find($compiled_xpath_flocat);
      foreach my $filesnode ($filesnodes->get_nodelist) {
        my $value = $filesnode->findvalue('@xlin:href');
        check_if_db_conform($value, $filename);
@@ -295,11 +356,16 @@ sub parse_iexml {
 # and returns an array reference with reduced files using only highest V-value
 # HINT, it only operates on sorted file list with fake versions (with zero-filled prefixes)
 ################################################################################
-sub find_newest_iefile_version ($$) {
+sub find_newest_iefile_version ($$$) {
  my $files_sorted = shift;
  my $files_truncated = shift;
+  my $cnt_files = shift;
+  my $cnt_truncated = 0;
  my $fh = $files_sorted->filehandle("<");
  my $last_entry;
+  # FIXME, how many lines?
+  my $p = Time::Progress->new(min=>0, max=> $cnt_files);
+  my $i=0;
  while(<$fh>) {
    my $entry = $_;
    $entry =~ m/^(.+?V)(\d+)(-IE\d+\.xml)$/;
@@ -312,23 +378,38 @@ sub find_newest_iefile_version ($$) {
    if (($last_prefix eq $prefix ) && ($last_suffix eq $suffix) && ($last_version < $version)) {
    } else {
        $files_truncated->append($last_entry);
+        $cnt_truncated++;
    }
+      print $p->report("find newest IE files: %40b  ETA: %E   \r", $i++);
    $last_entry = $entry;
  }
  $files_truncated->append($last_entry);
-  return 1;
+  $cnt_truncated++;
+  say "";
+  return $cnt_truncated;
 }
-# begin closure
-{
  my $tmp_ies_dir = Path::Tiny->tempdir( TEMPLATE => "exitstrategy_XXXXXXXXXXX", CLEANUP => 1);
  my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies");
  my $tmp_ies_sorted_file = $tmp_ies_dir->child("sorted_ies");
  my $tmp_ies_truncated_file = $tmp_ies_dir->child("truncated_ies");
+  my $cnt_unsorted_files=0;
 ###############################################################################
-# call back function to File::Find
-#
 ###############################################################################
+############# main ############################################################
+###############################################################################
+###############################################################################
+  my $recovery = undef;
+  my @ARGV_tail;
+  GetOptions(
+      "recovery" => \$recovery,
+      '<>' => sub {push @ARGV_tail, @_;}
+  );
+  if ($#ARGV_tail != 1) {
+      die "you need  a SQL-file and a directory as argument\n";
+  }
  sub process_sip () {
    my $file=$File::Find::name;
    if ($file =~ m/V(\d+)-IE\d+\.xml$/) {
@@ -337,43 +418,50 @@ sub find_newest_iefile_version ($$) {
      my $fakeversion = sprintf("%05i",$version);
      $file =~s/V(\d+)-IE/V$fakeversion-IE/;
      $tmp_ies_unsorted_file -> append( $file."\n");
+      $cnt_unsorted_files++;
    }
    return;
  }
-###############################################################################
+  if (defined $recovery) { warn "recovery enabled for XML processing\n"; }
-###############################################################################
+  my $sqlfile = shift @ARGV_tail;
-############# main ############################################################
+  if($sqlfile !~ m/[A-Za-z0-9]+\.sql$/) {die "SQL file should be named like 'foo.sql', but was '$sqlfile'\n";}
-###############################################################################
+  my $dir = shift @ARGV_tail;
-###############################################################################
+  open(my $fh, ">:encoding(UTF-8)", "$sqlfile") || die "could not open file '$sqlfile' for writing, $!";
-  my $dir = shift @ARGV;
  if (defined $dir && -d "$dir") {
-    write_database_creation();
+    say "preparing SQL";
-    write_tables_creation();
+    write_database_creation($fh);
-    write_prepare_insert();
+    write_tables_creation($fh);
+    write_prepare_insert($fh);
    $tmp_ies_unsorted_file->touch();
+    say "searching IE files";
    find(\&process_sip, $dir);
    # /permanent_storage/2020/04/02/IE201080/V1-FL201091.xml
    # /permanent_storage/2020/04/02/IE201080/V2-FL201091.xml
+    say "sorting IE files";
    sort_file({
        I => $tmp_ies_unsorted_file->absolute()->stringify,
        o => $tmp_ies_sorted_file->absolute()->stringify,
    });
-    find_newest_iefile_version ($tmp_ies_sorted_file, $tmp_ies_truncated_file );
+    my $cnt_truncated_files = find_newest_iefile_version ($tmp_ies_sorted_file, $tmp_ies_truncated_file, $cnt_unsorted_files );
    # now operate on truncated
-    my $fh = $tmp_ies_truncated_file->openr();
+    my $fh_truncated_IEs = $tmp_ies_truncated_file->openr();
    my $count=0;
-    while( <$fh> ) {
+    my $p=Time::Progress->new(min => 0, max => $cnt_truncated_files);
+    while( <$fh_truncated_IEs> ) {
      chomp;
-      $count++;
+      print $p->report("parse IE files:       %40b  ETA: %E   \r", $count++);
      s/V(0*)(\d+-IE)/V$2/; # revert fake version
-      my $ret = parse_iexml($_);
+      my $ret = parse_iexml($_, $recovery);
-      write_addsql($ret);
+      write_addsql($fh, $ret);
    }
-    write_index_creation();
+    say "";
-    warn "processed $count uniq IEs\n;"
+    write_index_creation($fh);
+    say "processed $count uniq IEs";
  } else {
    die "no directory given on commandline"
  }
-} #end closure
+  print "\n";
+  close ($fh);
 1;