Skip to content
Snippets Groups Projects
Commit b3c344d5 authored by Andreas Romeyke's avatar Andreas Romeyke
Browse files

- reintegrate patches from another free-floating repository:

- reorganized-modules-test-and-examples
- code-clean-up-removed-all-trailing-whitespaces
- improved-memory-usage-by-using-filehandles
- added-a-recovery-mode-if-AIPs-have-unclean-IE-XML
- fix-increased-field-size-for-dc-strings-to-prevent
- fixed-size-to-8192
parent 9fb93a76
No related branches found
No related tags found
No related merge requests found
......@@ -3,27 +3,55 @@
# Author: Andreas Romeyke
# SLUB Dresden, Department Longterm Preservation
#
# scans a given repository and creates an SQL script to create a database.
# scans a given repository and creates an SQL script to create a database.
# This is part of the exit strategy; for details, see the asciidoc file
# exit_strategie.asciidoc (which also contains the ER diagram for the database)
#
# call with:
#
# perl ./exit_strategy.pl rosetta_exit_strategy/tmp.sql /permanent/
#
# file tested with postgres-database
#
# using:
# then load the generated script with:
# psql -U romeyke -d exit_strategy \
# -f rosetta_exit_strategy/tmp.sql -L rosetta_exit.log
#
###############################################################################
#
# WARNING:
#
# the following messages only occur if you had an unclean SIP ingest process;
# they mean that your IE-XML contains wrong/unused namespace declarations
#
# if some AIPs are wrong with messages like:
#
# '/permanent_storage/normal/2017/07/05/IE1043/V1-IE1043.xml:6:
# namespace error : xmlns:mods: 'http://www.loc.gov/mods/v3
# http://www.loc.gov/standards/mods/v3/mods-3-0.xsd' is not a valid URI
# s="http://www.loc.gov/mods/v3
# http://www.loc.gov/standards/mods/v3/mods-3-0.xsd"'
#
# then (and only then) try this:
#
# perl ./exit_strategy.pl --recovery rosetta_exit_strategy/tmp.sql /permanent/
#
###############################################################################
use 5.28.0;
use strict;
use warnings;
use feature "say";
use Carp;
use Path::Tiny;
use File::Find;
use File::Sort qw(sort_file);
use XML::XPath;
use XML::XPath::XMLParser;
use XML::LibXML;
use Time::Progress;
use XML::LibXML::XPathContext;
use Getopt::Long;
use constant DEBUG => 0; # no debug
# guarantee, that output will be UTF8
binmode(STDOUT, ":encoding(UTF-8)");
......@@ -38,96 +66,100 @@ my $sourcetype="hdd"; #default value
# if IE.xml file found, read its metadata, create SQL add entry
# write SQL add entry
###############################################################################
# write_database_creation($fh)
#
# Placeholder for emitting database/schema creation statements. Every `say`
# in the body is commented out, so nothing is written; the handle is taken
# from the argument list but not used. Returns nothing.
#
# NOTE(review): this span is rendered diff text. The first `sub` line is the
# removed header (no arguments), the second is the added header that accepts
# the output filehandle; the commented-out statements are listed in both
# variants for the same reason.
sub write_database_creation {
sub write_database_creation ($) {
my $fh = shift;
# non standard conform SQL keywords
#say "CREATE DATABASE $db_name;";
#say "CREATE SCHEMA $schema_name;";
#say "USE ";
#say $fh "CREATE DATABASE $db_name;";
#say $fh "CREATE SCHEMA $schema_name;";
#say $fh "USE ";
return;
}
# write tables creation:
# write_tables_creation($fh)
#
# Emits the schema DDL as a single transaction (BEGIN; ... COMMIT;):
#   - a sequence named "serial", used as the primary-key default of every table
#   - aip              one row per IE id (ie_id is UNIQUE)
#   - metadatafile     location/sourcetype of the IE XML, references aip(id)
#   - dc               element/value pairs, references aip(id)
#   - sourcedatafile   file names, references aip(id)
#   - sourcedatalocat  location/sourcetype per file, references sourcedatafile(id)
# Returns nothing.
#
# NOTE(review): this span is rendered diff text. Each statement appears twice:
# first the removed variant (`say` without a filehandle), then the added
# variant that writes to the handle passed as first argument. The same holds
# for the two `sub` header lines.
sub write_tables_creation {
sub write_tables_creation ($) {
my $fh = shift;
# Transactions for tables creation
say "BEGIN;";
say $fh "BEGIN;";
# SEQUENCE
say "/* create SEQUENCE generator */";
say "CREATE SEQUENCE serial START 1;";
say $fh "/* create SEQUENCE generator */";
say $fh "CREATE SEQUENCE serial START 1;";
# AIP
say "/* create AIP table */";
say "CREATE TABLE aip (";
say "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
say "\tie_id VARCHAR(30) NOT NULL UNIQUE";
say ");";
say $fh "/* create AIP table */";
say $fh "CREATE TABLE aip (";
say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
say $fh "\tie_id VARCHAR(30) NOT NULL UNIQUE";
say $fh ");";
# IEFILE
say "/* create IEFILE table */";
say "CREATE TABLE metadatafile (";
say "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
say "\taip_id INT NOT NULL REFERENCES aip (id),";
say "\tlocation VARCHAR(1024) NOT NULL,";
say "\tsourcetype VARCHAR(30) NOT NULL";
say ");";
say $fh "/* create IEFILE table */";
say $fh "CREATE TABLE metadatafile (";
say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
say $fh "\taip_id INT NOT NULL REFERENCES aip (id),";
say $fh "\tlocation VARCHAR(1024) NOT NULL,";
say $fh "\tsourcetype VARCHAR(30) NOT NULL";
say $fh ");";
# DC
# the value column is VARCHAR(8192) in both variants shown here
say "/* create DC table */";
say "CREATE TABLE dc (";
say "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
say "\taip_id INT NOT NULL REFERENCES aip (id),";
say "\telement VARCHAR(30) NOT NULL,";
say "\tvalue VARCHAR(8192) NOT NULL";
say ");";
say $fh "/* create DC table */";
say $fh "CREATE TABLE dc (";
say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
say $fh "\taip_id INT NOT NULL REFERENCES aip (id),";
say $fh "\telement VARCHAR(30) NOT NULL,";
say $fh "\tvalue VARCHAR(8192) NOT NULL";
say $fh ");";
# FILE
say "/* create FILE table */";
say "CREATE TABLE sourcedatafile (";
say "\tid INT PRIMARY KEY DEFAULT nextval('serial'), ";
say "\taip_id INT NOT NULL REFERENCES aip (id),";
say "\tname VARCHAR(1024) NOT NULL";
say ");";
say $fh "/* create FILE table */";
say $fh "CREATE TABLE sourcedatafile (";
say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'), ";
say $fh "\taip_id INT NOT NULL REFERENCES aip (id),";
say $fh "\tname VARCHAR(1024) NOT NULL";
say $fh ");";
# LOCAT
say "/* create LOCAT table */";
say "CREATE TABLE sourcedatalocat (";
say "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
say "\tfile_id INT NOT NULL REFERENCES sourcedatafile (id),";
say "\tlocation VARCHAR(1024) NOT NULL,";
say "\tsourcetype VARCHAR(30) NOT NULL";
say ");";
say $fh "/* create LOCAT table */";
say $fh "CREATE TABLE sourcedatalocat (";
say $fh "\tid INT PRIMARY KEY DEFAULT nextval('serial'),";
say $fh "\tfile_id INT NOT NULL REFERENCES sourcedatafile (id),";
say $fh "\tlocation VARCHAR(1024) NOT NULL,";
say $fh "\tsourcetype VARCHAR(30) NOT NULL";
say $fh ");";
#end transaction
say "COMMIT;";
say $fh "COMMIT;";
return;
}
###############################################################################
# Prepare SQL INSERT Statements for AIPs
###############################################################################
# write_prepare_insert($fh)
#
# Emits, inside one transaction, the five PREPARE statements that the
# per-IE EXECUTE lines rely on:
#   aip_plan   (ie_id)                               -> INSERT INTO aip
#   ie_plan    (ie_id, location, sourcetype)         -> INSERT INTO metadatafile
#   file_plan  (ie_id, name)                         -> INSERT INTO sourcedatafile
#   locat_plan (ie_id, name, location, sourcetype)   -> INSERT INTO sourcedatalocat
#   dc_plan    (ie_id, element, value)               -> INSERT INTO dc
# All parameters are declared as varchar. Plans that take an IE id as $1
# resolve it to aip.id with a sub-select; locat_plan additionally resolves the
# sourcedatafile row by name ($2). The `\$` in the Perl strings keeps $1..$4
# literal in the generated SQL. Returns nothing.
#
# NOTE(review): this span is rendered diff text. The first `sub` line and the
# `say` lines without a filehandle are the removed variant (its closing lines
# are not repeated); the second `sub` line and the `say $fh` lines are the
# added variant that writes to the handle passed as first argument.
sub write_prepare_insert {
say "BEGIN;";
say "PREPARE aip_plan (varchar) AS";
say " INSERT INTO aip (ie_id) VALUES (\$1);";
say "PREPARE ie_plan (varchar, varchar, varchar) AS";
say " INSERT INTO metadatafile (aip_id, location, sourcetype) VALUES (";
say " (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3";
say " );";
say "PREPARE file_plan (varchar, varchar) AS";
say " INSERT INTO sourcedatafile (aip_id, name) VALUES (";
say " (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2";
say " );";
say "PREPARE locat_plan (varchar, varchar, varchar, varchar) AS";
say " INSERT INTO sourcedatalocat (file_id, location, sourcetype) VALUES (";
say " (SELECT sourcedatafile.id FROM sourcedatafile,aip WHERE";
say " sourcedatafile.aip_id=aip.id AND aip.ie_id=\$1 AND";
say " sourcedatafile.name=\$2), \$3, \$4";
say " );";
say "PREPARE dc_plan (varchar, varchar, varchar) AS";
say " INSERT INTO dc (aip_id, element, value) VALUES (";
say " (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3";
say " );";
say "COMMIT;";
sub write_prepare_insert ($) {
my $fh = shift;
say $fh "BEGIN;";
say $fh "PREPARE aip_plan (varchar) AS";
say $fh " INSERT INTO aip (ie_id) VALUES (\$1);";
say $fh "PREPARE ie_plan (varchar, varchar, varchar) AS";
say $fh " INSERT INTO metadatafile (aip_id, location, sourcetype) VALUES (";
say $fh " (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3";
say $fh " );";
say $fh "PREPARE file_plan (varchar, varchar) AS";
say $fh " INSERT INTO sourcedatafile (aip_id, name) VALUES (";
say $fh " (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2";
say $fh " );";
say $fh "PREPARE locat_plan (varchar, varchar, varchar, varchar) AS";
say $fh " INSERT INTO sourcedatalocat (file_id, location, sourcetype) VALUES (";
say $fh " (SELECT sourcedatafile.id FROM sourcedatafile,aip WHERE";
say $fh " sourcedatafile.aip_id=aip.id AND aip.ie_id=\$1 AND";
say $fh " sourcedatafile.name=\$2), \$3, \$4";
say $fh " );";
say $fh "PREPARE dc_plan (varchar, varchar, varchar) AS";
say $fh " INSERT INTO dc (aip_id, element, value) VALUES (";
say $fh " (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3";
say $fh " );";
say $fh "COMMIT;";
return;
}
###############################################################################
# write add SQL entry, expects a hashref which contains ff. params
# write add SQL entry, expects a hashref which contains ff. params
# (foreach file location/copy):
# INSERT INTO aip (ie_id) VALUES ($ieid);
# INSERT INTO iefile (aip_id, location, sourcetype) VALUES (
......@@ -135,7 +167,7 @@ sub write_prepare_insert {
# INSERT INTO file (aip_id, name) VALUES (
# (SELECT id FROM aip where aip.ieid = $ieid), $name);
# INSERT INTO locat (file_id, location, sourcetype) VALUES (
# (SELECT file.aip_id FROM file where file.aip_id = aip.id
# (SELECT file.aip_id FROM file where file.aip_id = aip.id
# AND aip.ie_id=$ieid), $location, $sourcetype)
# INSERT INTO dc (aip_id, element, value) VALUES (
# (SELECT id FROM aip where aip.ieid = $ieid), $element, $value);
......@@ -147,28 +179,28 @@ sub write_prepare_insert {
# $ret{"files"} = \@files;
# $ret{"dcrecords"} = \@dcrecords;
###############################################################################
# _sql_literal($text)
#
# Private helper: renders a Perl string as a single-quoted SQL string
# literal. Embedded single quotes are doubled, which is the standard SQL
# escape, so the value survives unchanged and cannot terminate the literal.
sub _sql_literal {
    my ($text) = @_;
    $text =~ s/'/''/g;
    return "'$text'";
}

# write_addsql($fh, $refhash)
#
# Writes the EXECUTE statements for one IE as a single transaction to $fh.
#
# Parameters:
#   $fh      - output filehandle of the SQL script
#   $refhash - hashref with the keys
#                "filename"  => path of the IE XML file (its basename without
#                               ".xml" is used as IE id)
#                "files"     => arrayref of file locations
#                "dcrecords" => arrayref of [element, value] pairs
# Also reads the file-level $sourcetype.
# Returns nothing.
#
# Every value is passed through _sql_literal() before it is placed in the
# statement. Previously only DC values were touched (by turning ' into ",
# which altered the data) and all other fields were interpolated raw, so a
# single quote in a path, file name or DC element name produced broken SQL.
# For values without single quotes the emitted text is unchanged.
sub write_addsql ($$) {
    my $fh      = shift;
    my $refhash = shift;
    my $ie_path = path($refhash->{"filename"});
    my $ieid    = _sql_literal($ie_path->basename(qw/.xml/));
    # FIXME if multiple locations exists
    my $iefile  = _sql_literal($ie_path->basename());
    my $srctype = _sql_literal($sourcetype);
    say $fh "BEGIN;";
    say $fh "EXECUTE aip_plan ($ieid);";
    say $fh "EXECUTE ie_plan ($ieid, $iefile, $srctype);";
    foreach my $location (@{$refhash->{"files"}}) {
        my $file  = _sql_literal(path($location)->basename()); # FIXME if multiple locations
        my $locat = _sql_literal($location);
        say $fh "EXECUTE file_plan ($ieid, $file);";
        say $fh "EXECUTE locat_plan ($ieid, $file, $locat, $srctype );";
    }
    foreach my $dcpair (@{$refhash->{"dcrecords"}}) {
        my ($dckey, $dcvalue) = @{$dcpair};
        my $element = _sql_literal($dckey);
        my $value   = _sql_literal($dcvalue);
        say $fh "EXECUTE dc_plan ( $ieid, $element, $value);";
    }
    say $fh "COMMIT;";
    # `say` appends its own newline, so this leaves two blank lines
    # between consecutive IE transactions.
    say $fh "\n";
    return;
}
......@@ -177,10 +209,11 @@ sub write_addsql {
###############################################################################
# add INDEX and other TRICKs to increase performance
###############################################################################
# write_index_creation($fh)
#
# Emits the index-creation transaction for aip(ie_id). All three emitted
# lines start with "--", i.e. they are written as SQL comments, so the
# generated script documents the index but does not create it.
# Returns nothing.
#
# NOTE(review): this span is rendered diff text. The first `sub` line and the
# `say` lines without a filehandle are the removed variant; the second `sub`
# line and the `say $fh` lines are the added variant that writes to the
# handle passed as first argument.
sub write_index_creation() {
say "-- BEGIN;";
say "-- CREATE UNIQUE INDEX aip_index on aip (ie_id);";
say "-- COMMIT;";
sub write_index_creation($) {
my $fh = shift;
say $fh "-- BEGIN;";
say $fh "-- CREATE UNIQUE INDEX aip_index on aip (ie_id);";
say $fh "-- COMMIT;";
return;
}
......@@ -192,8 +225,8 @@ sub check_if_db_conform ($$) {
my $string = "$_[0]";
my $filename = $_[1];
if ($string ne '') {
if ( not utf8::is_utf8($string)) {
croak "no utf8: '$string' in file '$filename'\n";
if ( not utf8::is_utf8($string)) {
croak "no utf8: '$string' in file '$filename'\n";
}
}#
return;
......@@ -203,27 +236,49 @@ sub check_if_db_conform ($$) {
###############################################################################
#
# /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/dc:record[1]/dc:title[1]
# /mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[2]
# /mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[2]
# with id=label and value = LOCAL;
# take the ID of the techMD found there (it is the reference for the files)
#
# Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1]
#
###############################################################################
sub parse_iexml {
my $filename = $_[0];
sub parse_iexml ($$) {
my $filename = shift;
my $recovery_flag = shift;
# create object
my $xp = XML::XPath->new (filename => $filename);
#
#my $xp = XML::XPath->new (filename => $filename);
my $dom = XML::LibXML->load_xml (location => $filename, recover => $recovery_flag, no_blanks=>1, compact=>1);
my $xp = XML::LibXML::XPathContext->new($dom);
$xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx");
$xp->registerNs("sru", "http://www.loc.gov/zing/srw/");
$xp->registerNs("xsi", "http://www.w3.org/2001/XMLSchema-instance");
$xp->registerNs("dc", "http://purl.org/dc/elements/1.1/");
$xp->registerNs("mets", "http://www.loc.gov/METS/");
$xp->registerNs("rosettamets", "http://www.exlibrisgroup.com/xsd/dps/rosettaMets");
$xp->registerNs("mods", "http://www.loc.gov/mods/v3");
$xp->registerNs("ns2", "http://dps.exlibris.com/");
$xp->registerNs("dv", "http://dfg-viewer.de/");
$xp->registerNs("slub", "http://slub-dresden.de/");
$xp->registerNs("archive", "http://slub-dresden.de/slubarchiv");
$xp->registerNs("premis", "info:lc/xmlns/premis-v2");
$xp->registerNs("mix", "http://www.loc.gov/standards/mix/");
$xp->registerNs("xlink", "http://www.w3.org/1999/xlink");
$xp->registerNs("xlin", "http://www.w3.org/1999/xlink");
############################################
# get title
my $title = $xp->findvalue('/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]');
my $compiled_xpath_titles = '/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]';
my $title = $xp->findvalue($compiled_xpath_titles);
check_if_db_conform($title, $filename);
############################################
# get dc-records
my @dcrecords;
my $dcnodes = $xp->find('/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*');
my $compiled_xpath_dcrecords='/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*';
my $dcnodes = $xp->find($compiled_xpath_dcrecords);
foreach my $dcnode ($dcnodes->get_nodelist) {
my $key = $dcnode->getName(".");
my $key = $dcnode->getName();
my $value = $dcnode->findvalue(".");
if (defined $value) {
$value=~s/\n/ /g;
......@@ -237,7 +292,12 @@ sub parse_iexml {
}
############################################
# get right representation ID (has a dnx-section with <key id=label>LOCAL</key>)
my $repids = $xp->find('/mets:mets/mets:amdSec');
my $compiled_xpath_amdsecs = '/mets:mets/mets:amdSec';
my $compiled_xpath_localreps = 'mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']';
my $compiled_xpath_filegrps = '/mets:mets/mets:fileSec/mets:fileGrp';
my $compiled_xpath_flocat = 'mets:file/mets:FLocat';
my $repids = $xp->find($compiled_xpath_amdsecs);
my $repid;
# FIXME: if only one representation exists (Qucosa), select it. If there
# is more than one, use the one with label LOCAL
......@@ -249,21 +309,22 @@ sub parse_iexml {
check_if_db_conform($id, $filename);
#/mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[1]
#
if ($node->findvalue('mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']') eq 'LOCAL') {
if ($node->findvalue($compiled_xpath_localreps) eq 'LOCAL') {
$repid=$id;
}
#print XML::XPath::XMLParser::as_string($node), "\n\n";
#print XML::XPath::XMLParser::as_string($node), "\n\n";
}
############################################
# get all files of LOCAL representation
my @files;
my $filegrpnodes = $xp->find('/mets:mets/mets:fileSec/mets:fileGrp');
my $filegrpnodes = $xp->find($compiled_xpath_filegrps);
foreach my $filegrpnode ($filegrpnodes->get_nodelist) {
#die XML::XPath::XMLParser::as_string($filegrpnode), "\n\n";
#die Dumper($filegrpnode);
if ($filegrpnode->findvalue('@ADMID') eq $repid) {
#die Dumper($filegrpnode);
my $filesnodes = $filegrpnode ->find("mets:file/mets:FLocat");
my $filesnodes = $filegrpnode ->find($compiled_xpath_flocat);
foreach my $filesnode ($filesnodes->get_nodelist) {
my $value = $filesnode->findvalue('@xlin:href');
check_if_db_conform($value, $filename);
......@@ -283,7 +344,7 @@ sub parse_iexml {
###############################################################################
# because ExLibris Rosetta produces filenames of following format:
# V\d+-IE\d+\.xml
# e.G.:
# e.G.:
# V1-IE23891.xml
# V1-IE94621.xml
# V2-IE23891.xml
......@@ -295,11 +356,16 @@ sub parse_iexml {
# and returns an array reference with reduced files using only highest V-value
# HINT, it only operates on sorted file list with fake versions (with zero-filled prefixes)
################################################################################
sub find_newest_iefile_version ($$) {
sub find_newest_iefile_version ($$$) {
my $files_sorted = shift;
my $files_truncated = shift;
my $cnt_files = shift;
my $cnt_truncated = 0;
my $fh = $files_sorted->filehandle("<");
my $last_entry;
# FIXME, how many lines?
my $p = Time::Progress->new(min=>0, max=> $cnt_files);
my $i=0;
while(<$fh>) {
my $entry = $_;
$entry =~ m/^(.+?V)(\d+)(-IE\d+\.xml)$/;
......@@ -312,23 +378,38 @@ sub find_newest_iefile_version ($$) {
if (($last_prefix eq $prefix ) && ($last_suffix eq $suffix) && ($last_version < $version)) {
} else {
$files_truncated->append($last_entry);
$cnt_truncated++;
}
print $p->report("find newest IE files: %40b ETA: %E \r", $i++);
$last_entry = $entry;
}
$files_truncated->append($last_entry);
return 1;
$cnt_truncated++;
say "";
return $cnt_truncated;
}
# begin closure
{
my $tmp_ies_dir = Path::Tiny->tempdir( TEMPLATE => "exitstrategy_XXXXXXXXXXX", CLEANUP => 1);
my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies");
my $tmp_ies_sorted_file = $tmp_ies_dir->child("sorted_ies");
my $tmp_ies_truncated_file = $tmp_ies_dir->child("truncated_ies");
my $cnt_unsorted_files=0;
###############################################################################
# call back function to File::Find
#
###############################################################################
############# main ############################################################
###############################################################################
###############################################################################
my $recovery = undef;
my @ARGV_tail;
GetOptions(
"recovery" => \$recovery,
'<>' => sub {push @ARGV_tail, @_;}
);
if ($#ARGV_tail != 1) {
die "you need a SQL-file and a directory as argument\n";
}
sub process_sip () {
my $file=$File::Find::name;
if ($file =~ m/V(\d+)-IE\d+\.xml$/) {
......@@ -337,43 +418,50 @@ sub find_newest_iefile_version ($$) {
my $fakeversion = sprintf("%05i",$version);
$file =~s/V(\d+)-IE/V$fakeversion-IE/;
$tmp_ies_unsorted_file -> append( $file."\n");
$cnt_unsorted_files++;
}
return;
}
###############################################################################
###############################################################################
############# main ############################################################
###############################################################################
###############################################################################
# Main driver. Runs inside the closure block so that it shares the temp-file
# lexicals and the file counter with process_sip.
#
# Steps:
#   1. take the SQL file name and the repository directory from @ARGV_tail
#   2. write schema and PREPARE statements to the SQL file
#   3. collect IE files below the directory (process_sip), sort the list and
#      reduce it to the newest version per IE
#   4. parse every remaining IE file and append its EXECUTE statements
#   5. write the index section and close the SQL file
if (defined $recovery) { warn "recovery enabled for XML processing\n"; }
my $sqlfile = shift @ARGV_tail;
if ($sqlfile !~ m/[A-Za-z0-9]+\.sql$/) {
    die "SQL file should be named like 'foo.sql', but was '$sqlfile'\n";
}
my $dir = shift @ARGV_tail;
# Validate the directory before opening the output, so that a bad argument
# does not create or truncate the SQL file.
if (!(defined $dir && -d "$dir")) {
    die "no directory given on commandline"
}
open(my $fh, ">:encoding(UTF-8)", "$sqlfile") or die "could not open file '$sqlfile' for writing, $!";
say "preparing SQL";
write_database_creation($fh);
write_tables_creation($fh);
write_prepare_insert($fh);
# make sure the list file exists even if no IE file is found
$tmp_ies_unsorted_file->touch();
say "searching IE files";
find(\&process_sip, $dir);
# /permanent_storage/2020/04/02/IE201080/V1-FL201091.xml
# /permanent_storage/2020/04/02/IE201080/V2-FL201091.xml
say "sorting IE files";
sort_file({
    I => $tmp_ies_unsorted_file->absolute()->stringify,
    o => $tmp_ies_sorted_file->absolute()->stringify,
});
my $cnt_truncated_files = find_newest_iefile_version ($tmp_ies_sorted_file, $tmp_ies_truncated_file, $cnt_unsorted_files );
# now operate on truncated
my $fh_truncated_IEs = $tmp_ies_truncated_file->openr();
my $count = 0;
my $p = Time::Progress->new(min => 0, max => $cnt_truncated_files);
# A named loop variable is used instead of $_ so that the called subs
# cannot clobber the current entry.
while (my $iefile = <$fh_truncated_IEs>) {
    chomp $iefile;
    print $p->report("parse IE files: %40b ETA: %E \r", $count++);
    $iefile =~ s/V(0*)(\d+-IE)/V$2/; # revert fake version
    my $ret = parse_iexml($iefile, $recovery);
    write_addsql($fh, $ret);
}
say ""; # terminate the progress line
write_index_creation($fh);
say "processed $count uniq IEs";
# $fh is lexical to this closure block, so it has to be closed before the
# block ends; buffered write errors surface here, hence the check.
close($fh) or die "could not close file '$sqlfile', $!";
} #end closure
print "\n";
1;
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment