diff --git a/perl/exit_strategy.pl b/perl/exit_strategy.pl index c1ef3fbe91d8a70a7d7026cc966ee90a9e60c159..4c031d224c1a489afa0408d03a1e2bf0636ff194 100644 --- a/perl/exit_strategy.pl +++ b/perl/exit_strategy.pl @@ -3,27 +3,55 @@ # Author: Andreas Romeyke # SLUB Dresden, Department Longterm Preservation # -# scans a given repository and creates an SQL script to create a database. +# scans a given repository and creates an SQL script to create a database. # This is part of the exit-strategy for details, see asciidoc file # exit_strategie.asciidoc (also contains ER-diagram for database) # +# call with: +# +# perl ./exit_strategy.pl rosetta_exit_strategy/tmp.sql /permanent/ +# # file tested with postgres-database # -# using: +# using then with: # psql -U romeyke -d exit_strategy \ # -f rosetta_exit_strategy/tmp.sql -L rosetta_exit.log # ############################################################################### +# +# WARNING: +# +# the following messages only occure if you had an unclean SIP ingest process, +# it means that in your IE-XML are wrong/unused namespace declarations +# +# if some AIPs are wrong with messages like: +# +# '/permanent_storage/normal/2017/07/05/IE1043/V1-IE1043.xml:6: +# namespace error : xmlns:mods: 'http://www.loc.gov/mods/v3 +# http://www.loc.gov/standards/mods/v3/mods-3-0.xsd' is not a valid URI +# s="http://www.loc.gov/mods/v3 +# http://www.loc.gov/standards/mods/v3/mods-3-0.xsd"' +# +# then (and only then) try this: +# +# perl ./exit_strategy.pl --recover rosetta_exit_strategy/tmp.sql /permanent/ +# +############################################################################### + -use 5.28.0; use strict; use warnings; +use feature "say"; use Carp; use Path::Tiny; use File::Find; use File::Sort qw(sort_file); -use XML::XPath; -use XML::XPath::XMLParser; +use XML::LibXML; +use Time::Progress; +use XML::LibXML::XPathContext; +use Getopt::Long; +use constant DEBUG => 0; # no debug + # guarantee, that output will be UTF8 binmode(STDOUT, ":encoding(UTF-8)"); @@ -38,96 +66,100 @@ my $sourcetype="hdd"; #default value # if IE.xml file found, read its metadata, create SQL add entry # write SQL add entry ############################################################################### -sub write_database_creation { +sub write_database_creation ($) { + my $fh = shift; # non standard conform SQL keywords - #say "CREATE DATABASE $db_name;"; - #say "CREATE SCHEMA $schema_name;"; - #say "USE "; + #say $fh "CREATE DATABASE $db_name;"; + #say $fh "CREATE SCHEMA $schema_name;"; + #say $fh "USE "; + return; } # write tables creation;: -sub write_tables_creation { +sub write_tables_creation ($) { + my $fh = shift; # Transactions for tables creation - say "BEGIN;"; + say $fh "BEGIN;"; # SEQUENCE - say "/* create SEQUENCE generator */"; - say "CREATE SEQUENCE serial START 1;"; + say $fh "/* create SEQUENCE generator */"; + say $fh "CREATE SEQUENCE serial START 1;"; # AIP - say "/* create AIP table */"; - say "CREATE TABLE aip ("; - say "\tid INT PRIMARY KEY DEFAULT nextval('serial'),"; - say "\tie_id VARCHAR(30) NOT NULL UNIQUE"; - say ");"; + say $fh "/* create AIP table */"; + say $fh "CREATE TABLE aip ("; + say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'),"; + say $fh "\tie_id VARCHAR(30) NOT NULL UNIQUE"; + say $fh ");"; # IEFILE - say "/* create IEFILE table */"; - say "CREATE TABLE metadatafile ("; - say "\tid INT PRIMARY KEY DEFAULT nextval('serial'),"; - say "\taip_id INT NOT NULL REFERENCES aip (id),"; - say "\tlocation VARCHAR(1024) NOT NULL,"; - say "\tsourcetype VARCHAR(30) NOT NULL"; - say ");"; + say $fh "/* create IEFILE table */"; + say $fh "CREATE TABLE metadatafile ("; + say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'),"; + say $fh "\taip_id INT NOT NULL REFERENCES aip (id),"; + say $fh "\tlocation VARCHAR(1024) NOT NULL,"; + say $fh "\tsourcetype VARCHAR(30) NOT NULL"; + say $fh ");"; # DC - say "/* create DC table */"; - say "CREATE TABLE dc ("; - say "\tid INT PRIMARY KEY DEFAULT nextval('serial'),"; - say "\taip_id INT NOT NULL REFERENCES aip (id),"; - say "\telement VARCHAR(30) NOT NULL,"; - say "\tvalue VARCHAR(8192) NOT NULL"; - say ");"; + say $fh "/* create DC table */"; + say $fh "CREATE TABLE dc ("; + say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'),"; + say $fh "\taip_id INT NOT NULL REFERENCES aip (id),"; + say $fh "\telement VARCHAR(30) NOT NULL,"; + say $fh "\tvalue VARCHAR(8192) NOT NULL"; + say $fh ");"; # FILE - say "/* create FILE table */"; - say "CREATE TABLE sourcedatafile ("; - say "\tid INT PRIMARY KEY DEFAULT nextval('serial'), "; - say "\taip_id INT NOT NULL REFERENCES aip (id),"; - say "\tname VARCHAR(1024) NOT NULL"; - say ");"; + say $fh "/* create FILE table */"; + say $fh "CREATE TABLE sourcedatafile ("; + say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'), "; + say $fh "\taip_id INT NOT NULL REFERENCES aip (id),"; + say $fh "\tname VARCHAR(1024) NOT NULL"; + say $fh ");"; # LOCAT - say "/* create LOCAT table */"; - say "CREATE TABLE sourcedatalocat ("; - say "\tid INT PRIMARY KEY DEFAULT nextval('serial'),"; - say "\tfile_id INT NOT NULL REFERENCES sourcedatafile (id),"; - say "\tlocation VARCHAR(1024) NOT NULL,"; - say "\tsourcetype VARCHAR(30) NOT NULL"; - say ");"; + say $fh "/* create LOCAT table */"; + say $fh "CREATE TABLE sourcedatalocat ("; + say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'),"; + say $fh "\tfile_id INT NOT NULL REFERENCES sourcedatafile (id),"; + say $fh "\tlocation VARCHAR(1024) NOT NULL,"; + say $fh "\tsourcetype VARCHAR(30) NOT NULL"; + say $fh ");"; #end transaction - say "COMMIT;"; + say $fh "COMMIT;"; return; } ############################################################################### # Prepare SQL INSERT Statements for AIPs ############################################################################### -sub write_prepare_insert { - say "BEGIN;"; - say "PREPARE aip_plan (varchar) AS"; - say " INSERT INTO aip (ie_id) VALUES (\$1);"; - say "PREPARE ie_plan (varchar, varchar, varchar) AS"; - say " INSERT INTO metadatafile (aip_id, location, sourcetype) VALUES ("; - say " (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3"; - say " );"; - say "PREPARE file_plan (varchar, varchar) AS"; - say " INSERT INTO sourcedatafile (aip_id, name) VALUES ("; - say " (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2"; - say " );"; - say "PREPARE locat_plan (varchar, varchar, varchar, varchar) AS"; - say " INSERT INTO sourcedatalocat (file_id, location, sourcetype) VALUES ("; - say " (SELECT sourcedatafile.id FROM sourcedatafile,aip WHERE"; - say " sourcedatafile.aip_id=aip.id AND aip.ie_id=\$1 AND"; - say " sourcedatafile.name=\$2), \$3, \$4"; - say " );"; - say "PREPARE dc_plan (varchar, varchar, varchar) AS"; - say " INSERT INTO dc (aip_id, element, value) VALUES ("; - say " (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3"; - say " );"; - say "COMMIT;"; +sub write_prepare_insert ($) { + my $fh = shift; + say $fh "BEGIN;"; + say $fh "PREPARE aip_plan (varchar) AS"; + say $fh " INSERT INTO aip (ie_id) VALUES (\$1);"; + say $fh "PREPARE ie_plan (varchar, varchar, varchar) AS"; + say $fh " INSERT INTO metadatafile (aip_id, location, sourcetype) VALUES ("; + say $fh " (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3"; + say $fh " );"; + say $fh "PREPARE file_plan (varchar, varchar) AS"; + say $fh " INSERT INTO sourcedatafile (aip_id, name) VALUES ("; + say $fh " (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2"; + say $fh " );"; + say $fh "PREPARE locat_plan (varchar, varchar, varchar, varchar) AS"; + say $fh " INSERT INTO sourcedatalocat (file_id, location, sourcetype) VALUES ("; + say $fh " (SELECT sourcedatafile.id FROM sourcedatafile,aip WHERE"; + say $fh " sourcedatafile.aip_id=aip.id AND aip.ie_id=\$1 AND"; + say $fh " sourcedatafile.name=\$2), \$3, \$4"; + say $fh " );"; + say $fh "PREPARE dc_plan (varchar, varchar, varchar) AS"; + say $fh " INSERT INTO dc (aip_id, element, value) VALUES ("; + say $fh " (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3"; + say $fh " );"; + say $fh "COMMIT;"; return; } ############################################################################### -# write add SQL entry, expects a hashref which contains ff. params +# write add SQL entry, expects a hashref which contains ff. params # (foreach file location/copy): # INSERT INTO aip (ie_id) VALUES ($ieid); # INSERT INTO iefile (aip_id, location, sourcetype) VALUES ( @@ -135,7 +167,7 @@ sub write_prepare_insert { # INSERT INTO file (aip_id, name) VALUES ( # (SELECT id FROM aip where aip.ieid = $ieid), $name); # INSERT INTO locat (file_id, location, sourcetype) VALUES ( -# (SELECT file.aip_id FROM file where file.aip_id = aip.id +# (SELECT file.aip_id FROM file where file.aip_id = aip.id # AND aip.ie_id=$ieid), $location, $sourcetype) # INSERT INTO dc (aip_id, element, value) VALUES ( # (SELECT id FROM aip where aip.ieid = $ieid), $element, $value); @@ -147,28 +179,28 @@ sub write_prepare_insert { # $ret{"files"} = \@files; # $ret{"dcrecords"} = \@dcrecords; ############################################################################### -sub write_addsql { - my $refhash = $_[0]; +sub write_addsql ($$) { + my $fh = shift; + my $refhash = shift; my $ieid = path($refhash->{"filename"})->basename(qw/.xml/); - say "BEGIN;"; - say "EXECUTE aip_plan ('$ieid');"; + say $fh "BEGIN;"; + say $fh "EXECUTE aip_plan ('$ieid');"; # FIXME if multiple locations exists my $iefile = path($refhash->{"filename"})->basename(); - say "EXECUTE ie_plan ('$ieid', '$iefile', '$sourcetype');"; + say $fh "EXECUTE ie_plan ('$ieid', '$iefile', '$sourcetype');"; foreach my $location (@{$refhash->{"files"}}) { - my $file = path($location)->basename(); # FIXME if multiple locations - say "EXECUTE file_plan ('$ieid', '$file');"; - say "EXECUTE locat_plan ('$ieid', '$file', '$location', '$sourcetype' );"; + say $fh "EXECUTE file_plan ('$ieid', '$file');"; + say $fh "EXECUTE locat_plan ('$ieid', '$file', '$location', '$sourcetype' );"; } foreach my $dcpair (@{$refhash->{"dcrecords"}}) { my ($dckey,$dcvalue) = @{$dcpair}; # quote ' in dcvalue $dcvalue=~tr/'/"/; - say "EXECUTE dc_plan ( '$ieid', '$dckey', '$dcvalue');"; + say $fh "EXECUTE dc_plan ( '$ieid', '$dckey', '$dcvalue');"; } - say "COMMIT;"; - say "\n"; + say $fh "COMMIT;"; + say $fh "\n"; return; } @@ -177,10 +209,11 @@ sub write_addsql { ############################################################################### # add INDEX and other TRICKs to increase performance ############################################################################### -sub write_index_creation() { - say "-- BEGIN;"; - say "-- CREATE UNIQUE INDEX aip_index on aip (ie_id);"; - say "-- COMMIT;"; +sub write_index_creation($) { + my $fh = shift; + say $fh "-- BEGIN;"; + say $fh "-- CREATE UNIQUE INDEX aip_index on aip (ie_id);"; + say $fh "-- COMMIT;"; return; } @@ -192,8 +225,8 @@ sub check_if_db_conform ($$) { my $string = "$_[0]"; my $filename = $_[1]; if ($string ne '') { - if ( not utf8::is_utf8($string)) { - croak "no utf8: '$string' in file '$filename'\n"; + if ( not utf8::is_utf8($string)) { + croak "no utf8: '$string' in file '$filename'\n"; } }# return; @@ -203,27 +236,49 @@ sub check_if_db_conform ($$) { ############################################################################### # # /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/dc:record[1]/dc:title[1] -# /mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[2] +# /mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[2] # mit ID=Label und Wert = LOCAL # dort die ID von techMD (Referenz für Files) # # Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1] # ############################################################################### -sub parse_iexml { - my $filename = $_[0]; +sub parse_iexml ($$) { + my $filename = shift; + my $recovery_flag = shift; # create object - my $xp = XML::XPath->new (filename => $filename); + # + #my $xp = XML::XPath->new (filename => $filename); + my $dom = XML::LibXML->load_xml (location => $filename, recover => $recovery_flag, no_blanks=>1, compact=>1); + my $xp = XML::LibXML::XPathContext->new($dom); + $xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx"); + $xp->registerNs("sru", "http://www.loc.gov/zing/srw/"); + $xp->registerNs("xsi", "http://www.w3.org/2001/XMLSchema-instance"); + $xp->registerNs("dc", "http://purl.org/dc/elements/1.1/"); + $xp->registerNs("mets", "http://www.loc.gov/METS/"); + $xp->registerNs("rosettamets", "http://www.exlibrisgroup.com/xsd/dps/rosettaMets"); + $xp->registerNs("mods", "http://www.loc.gov/mods/v3"); + $xp->registerNs("ns2", "http://dps.exlibris.com/"); + $xp->registerNs("dv", "http://dfg-viewer.de/"); + $xp->registerNs("slub", "http://slub-dresden.de/"); + $xp->registerNs("archive", "http://slub-dresden.de/slubarchiv"); + $xp->registerNs("premis", "info:lc/xmlns/premis-v2"); + $xp->registerNs("mix", "http://www.loc.gov/standards/mix/"); + $xp->registerNs("xlink", "http://www.w3.org/1999/xlink"); + $xp->registerNs("xlin", "http://www.w3.org/1999/xlink"); + ############################################ # get title - my $title = $xp->findvalue('/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]'); + my $compiled_xpath_titles = '/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]'; + my $title = $xp->findvalue($compiled_xpath_titles); check_if_db_conform($title, $filename); ############################################ # get dc-records my @dcrecords; - my $dcnodes = $xp->find('/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*'); + my $compiled_xpath_dcrecords='/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*'; + my $dcnodes = $xp->find($compiled_xpath_dcrecords); foreach my $dcnode ($dcnodes->get_nodelist) { - my $key = $dcnode->getName("."); + my $key = $dcnode->getName(); my $value = $dcnode->findvalue("."); if (defined $value) { $value=~s/\n/ /g; @@ -237,7 +292,12 @@ sub parse_iexml { } ############################################ # get right representation ID (has a dnx-section with <key id=label>LOCAL</key>) - my $repids = $xp->find('/mets:mets/mets:amdSec'); + my $compiled_xpath_amdsecs = '/mets:mets/mets:amdSec'; + my $compiled_xpath_localreps = 'mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']'; + my $compiled_xpath_filegrps = '/mets:mets/mets:fileSec/mets:fileGrp'; + my $compiled_xpath_flocat = 'mets:file/mets:FLocat'; + + my $repids = $xp->find($compiled_xpath_amdsecs); my $repid; # FIXME: if only one represenation exists (Qucosa), select this. If there # are more than one, use them with label LOCAL @@ -249,21 +309,22 @@ sub parse_iexml { check_if_db_conform($id, $filename); #/mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[1] # - if ($node->findvalue('mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']') eq 'LOCAL') { + + if ($node->findvalue($compiled_xpath_localreps) eq 'LOCAL') { $repid=$id; } - #print XML::XPath::XMLParser::as_string($node), "\n\n"; + #print XML::XPath::XMLParser::as_string($node), "\n\n"; } ############################################ # get all files of LOCAL representation my @files; - my $filegrpnodes = $xp->find('/mets:mets/mets:fileSec/mets:fileGrp'); + my $filegrpnodes = $xp->find($compiled_xpath_filegrps); foreach my $filegrpnode ($filegrpnodes->get_nodelist) { #die XML::XPath::XMLParser::as_string($filegrpnode), "\n\n"; #die Dumper($filegrpnode); if ($filegrpnode->findvalue('@ADMID') eq $repid) { #die Dumper($filegrpnode); - my $filesnodes = $filegrpnode ->find("mets:file/mets:FLocat"); + my $filesnodes = $filegrpnode ->find($compiled_xpath_flocat); foreach my $filesnode ($filesnodes->get_nodelist) { my $value = $filesnode->findvalue('@xlin:href'); check_if_db_conform($value, $filename); @@ -283,7 +344,7 @@ sub parse_iexml { ############################################################################### # because ExLibris Rosetta produces filenames of following format: # V\d+-IE\d+\.xml -# e.G.: +# e.G.: # V1-IE23891.xml # V1-IE94621.xml # V2-IE23891.xml @@ -295,11 +356,16 @@ sub parse_iexml { # and returns an array reference with reduced files using only highest V-value # HINT, it only operates on sorted file list with fake versions (with zero-filled prefixes) ################################################################################ -sub find_newest_iefile_version ($$) { +sub find_newest_iefile_version ($$$) { my $files_sorted = shift; my $files_truncated = shift; + my $cnt_files = shift; + my $cnt_truncated = 0; my $fh = $files_sorted->filehandle("<"); my $last_entry; + # FIXME, how many lines? + my $p = Time::Progress->new(min=>0, max=> $cnt_files); + my $i=0; while(<$fh>) { my $entry = $_; $entry =~ m/^(.+?V)(\d+)(-IE\d+\.xml)$/; @@ -312,23 +378,38 @@ sub find_newest_iefile_version ($$) { if (($last_prefix eq $prefix ) && ($last_suffix eq $suffix) && ($last_version < $version)) { } else { $files_truncated->append($last_entry); + $cnt_truncated++; } + print $p->report("find newest IE files: %40b ETA: %E \r", $i++); $last_entry = $entry; } $files_truncated->append($last_entry); - return 1; + $cnt_truncated++; + say ""; + return $cnt_truncated; } -# begin closure -{ + my $tmp_ies_dir = Path::Tiny->tempdir( TEMPLATE => "exitstrategy_XXXXXXXXXXX", CLEANUP => 1); my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies"); my $tmp_ies_sorted_file = $tmp_ies_dir->child("sorted_ies"); my $tmp_ies_truncated_file = $tmp_ies_dir->child("truncated_ies"); + my $cnt_unsorted_files=0; ############################################################################### -# call back function to File::Find -# ############################################################################### +############# main ############################################################ +############################################################################### +############################################################################### + my $recovery = undef; + my @ARGV_tail; + GetOptions( + "recovery" => \$recovery, + '<>' => sub {push @ARGV_tail, @_;} + ); + if ($#ARGV_tail != 1) { + die "you need a SQL-file and a directory as argument\n"; + } + sub process_sip () { my $file=$File::Find::name; if ($file =~ m/V(\d+)-IE\d+\.xml$/) { @@ -337,43 +418,50 @@ sub find_newest_iefile_version ($$) { my $fakeversion = sprintf("%05i",$version); $file =~s/V(\d+)-IE/V$fakeversion-IE/; $tmp_ies_unsorted_file -> append( $file."\n"); + $cnt_unsorted_files++; } return; } -############################################################################### -############################################################################### -############# main ############################################################ -############################################################################### -############################################################################### - my $dir = shift @ARGV; + if (defined $recovery) { warn "recovery enabled for XML processing\n"; } + my $sqlfile = shift @ARGV_tail; + if($sqlfile !~ m/[A-Za-z0-9]+\.sql$/) {die "SQL file should be named like 'foo.sql', but was '$sqlfile'\n";} + my $dir = shift @ARGV_tail; + open(my $fh, ">:encoding(UTF-8)", "$sqlfile") || die "could not open file '$sqlfile' for writing, $!"; if (defined $dir && -d "$dir") { - write_database_creation(); - write_tables_creation(); - write_prepare_insert(); + say "preparing SQL"; + write_database_creation($fh); + write_tables_creation($fh); + write_prepare_insert($fh); $tmp_ies_unsorted_file->touch(); + say "searching IE files"; find(\&process_sip, $dir); # /permanent_storage/2020/04/02/IE201080/V1-FL201091.xml # /permanent_storage/2020/04/02/IE201080/V2-FL201091.xml + say "sorting IE files"; sort_file({ I => $tmp_ies_unsorted_file->absolute()->stringify, o => $tmp_ies_sorted_file->absolute()->stringify, }); - find_newest_iefile_version ($tmp_ies_sorted_file, $tmp_ies_truncated_file ); + my $cnt_truncated_files = find_newest_iefile_version ($tmp_ies_sorted_file, $tmp_ies_truncated_file, $cnt_unsorted_files ); # now operate on truncated - my $fh = $tmp_ies_truncated_file->openr(); + my $fh_truncated_IEs = $tmp_ies_truncated_file->openr(); my $count=0; - while( <$fh> ) { + my $p=Time::Progress->new(min => 0, max => $cnt_truncated_files); + while( <$fh_truncated_IEs> ) { chomp; - $count++; + print $p->report("parse IE files: %40b ETA: %E \r", $count++); s/V(0*)(\d+-IE)/V$2/; # revert fake version - my $ret = parse_iexml($_); - write_addsql($ret); + my $ret = parse_iexml($_, $recovery); + write_addsql($fh, $ret); } - write_index_creation(); - warn "processed $count uniq IEs\n;" + say ""; + write_index_creation($fh); + say "processed $count uniq IEs"; } else { die "no directory given on commandline" } -} #end closure + print "\n"; + close ($fh); + 1;