#!/usr/bin/perl -w ############################################################################### # Author: Andreas Romeyke # SLUB Dresden, Department Longterm Preservation # copyright 2023/2024, licensed under terms of GNU General Public License 3.0, # see file LICENSE.txt for details. # # scans a given repository and creates a SQLite database. # This is part of the exit-strategy for details, see asciidoc file # exit_strategie.asciidoc (also contains ER-diagram for database) # # call with: # # perl ./exit_strategy.pl rosetta_exit_strategy/tmp.db /permanent/ # ############################################################################### # # WARNING: # # the following messages only occure if you had an unclean SIP ingest process, # it means that in your IE-XML are wrong/unused namespace declarations # # if some AIPs are wrong with messages like: # # '/permanent_storage/normal/2017/07/05/IE1043/V1-IE1043.xml:6: # namespace error : xmlns:mods: 'http://www.loc.gov/mods/v3 # http://www.loc.gov/standards/mods/v3/mods-3-0.xsd' is not a valid URI # s="http://www.loc.gov/mods/v3 # http://www.loc.gov/standards/mods/v3/mods-3-0.xsd"' # # then (and only then) try this: # # perl ./exit_strategy.pl --recover rosetta_exit_strategy/tmp.db /permanent/ # # # call help with: # perl ./exit_strategy.pl --help ############################################################################### use v5.36; use utf8; # for debugging output use constant DEBUG => 0; # no debug use Carp; use DBD::SQLite; use DBI; use File::Basename qw(basename dirname); use File::Find; use Getopt::Long; use IO::Handle; use List::Util qw(first zip); use Path::Tiny; use Pod::Usage; use Time::Progress; use XML::LibXML::XPathContext; use XML::LibXML; sub check_lzaid ($lza_id) { my $rx_up=qr{[A-Za-z0-9_-]+}; # archive name & internal workflow my $rx_lw=qr{[a-z0-9_-]+}; # external workflow & external id # SLUB:LZA:Kitodo:testcases:fileoriginalpath_with_http return ($lza_id =~ m/^SLUB:LZA:$rx_up:$rx_lw:$rx_lw$/); }; STDOUT->autoflush(1); # guarantee, that output will be UTF8 binmode(STDOUT, ":encoding(UTF-8)"); my $db_name="exit_strategy"; my $schema_name="exit_strategy"; my $sourcetype="hdd"; #default value ############################################################################### # write database creation # write tables creation # scan repository # if IE.xml file found, read its metadata, create SQL add entry # write SQL add entry ############################################################################### sub write_database_creation ($dbh) { # non standard conform SQL keywords #say $fh "CREATE DATABASE $db_name;"; #say $fh "CREATE SCHEMA $schema_name;"; #say $fh "USE "; my $sql1=<<"SQL_PRAGMA_WAL"; PRAGMA journal_mode=WAL; SQL_PRAGMA_WAL my $sql2=<<"SQL_PRAGMA_ASYNC"; PRAGMA synchronous = OFF; SQL_PRAGMA_ASYNC my $sql3=<<"SQL_PRAGMA_TEMP"; PRAGMA temp_store = OFF; SQL_PRAGMA_TEMP my $sth1 = $dbh->prepare($sql1); $sth1->execute() or die "sql problem detected", $dbh->errstr; my $sth2 = $dbh->prepare($sql2); $sth2->execute() or die "sql problem detected", $dbh->errstr; my $sth3 = $dbh->prepare($sql3); $sth3->execute() or die "sql problem detected", $dbh->errstr; return; } # write tables creation;: sub write_tables_creation ($dbh) { my $sql1=<<"SQL_CREATE1"; /* create AIP table */ CREATE TABLE aip ( id INTEGER, ie_id TEXT NOT NULL, lza_id TEXT, version INTEGER NOT NULL, dir_path TEXT NOT NULL, PRIMARY KEY(id AUTOINCREMENT) ); SQL_CREATE1 my $sql2=<<"SQL_CREATE2"; /* create IEFILE table */ CREATE TABLE metadatafile ( id INTEGER, aip_id INTEGER NOT NULL REFERENCES aip (id), location TEXT NOT NULL, sourcetype TEXT NOT NULL, PRIMARY KEY(id AUTOINCREMENT) ); SQL_CREATE2 my $sql3=<<"SQL_CREATE3"; /* create DC table */ CREATE TABLE dc ( id INTEGER, aip_id INTEGER NOT NULL REFERENCES aip (id), element TEXT NOT NULL, value TEXT NOT NULL, PRIMARY KEY(id AUTOINCREMENT) ); SQL_CREATE3 my $sql4=<<"SQL_CREATE4"; /* create FILE table */ CREATE TABLE sourcedatafile ( id INTEGER, aip_id INTEGER NOT NULL REFERENCES aip (id), name TEXT NOT NULL, PRIMARY KEY(id AUTOINCREMENT) ); SQL_CREATE4 my $sql5=<<"SQL_CREATE5"; /* create LOCAT table */ CREATE TABLE sourcedatalocat ( id INTEGER, file_id INTEGER NOT NULL REFERENCES sourcedatafile (id), location TEXT NOT NULL, sourcetype TEXT NOT NULL, filesize INTEGER, PRIMARY KEY(id AUTOINCREMENT) ); SQL_CREATE5 my $sql6=<<"SQL_CREATE6"; CREATE TABLE deleted ( id INTEGER, aip_id INTEGER NOT NULL REFERENCES aip (id), reason TEXT NOT NULL, note TEXT, date DATE, authorized_by TEXT NOT NULL, PRIMARY KEY(id AUTOINCREMENT) ); SQL_CREATE6 my $sql8=<<"SQL_CREATE8"; CREATE TABLE purged ( id INTEGER, aip_id INTEGER NOT NULL REFERENCES aip (id), reason TEXT, note TEXT, date DATE, authorized_by TEXT NOT NULL, PRIMARY KEY(id AUTOINCREMENT) ); SQL_CREATE8 my $sql7=<<"SQL_CREATE7"; CREATE TABLE transferaip ( id INTEGER, aip_id INTEGER NOT NULL REFERENCES aip (id), lza_id TEXT NOT NULL, transfer_state TEXT NOT NULL, PRIMARY KEY(id AUTOINCREMENT) ); SQL_CREATE7 my $sth1 = $dbh->prepare($sql1); $sth1->execute() or die "sql problem detected", $dbh->errstr; my $sth2 = $dbh->prepare($sql2); $sth2->execute() or die "sql problem detected", $dbh->errstr; my $sth3 = $dbh->prepare($sql3); $sth3->execute() or die "sql problem detected", $dbh->errstr; my $sth4 = $dbh->prepare($sql4); $sth4->execute() or die "sql problem detected", $dbh->errstr; my $sth5 = $dbh->prepare($sql5); $sth5->execute() or die "sql problem detected", $dbh->errstr; my $sth6 = $dbh->prepare($sql6); $sth6->execute() or die "sql problem detected", $dbh->errstr; my $sth7 = $dbh->prepare($sql7); $sth7->execute() or die "sql problem detected", $dbh->errstr; my $sth8 = $dbh->prepare($sql8); $sth8->execute() or die "sql problem detected", $dbh->errstr; return 1; } ############################################################################### # Prepare SQL INSERT Statements for AIPs ############################################################################### sub write_prepare_insert ($dbh) { return 1; } sub prepare_addsql( $dbh) { my $sql_aip_plan=<<"SQL_AIP_PLAN"; INSERT INTO aip (ie_id, lza_id, version, dir_path) VALUES (\$1, \$2, \$3, \$4); SQL_AIP_PLAN my $sql_ie_plan=<<"SQL_IE_PLAN"; INSERT INTO metadatafile (aip_id, location, sourcetype) VALUES ( (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3, \$4 ); SQL_IE_PLAN my $sql_file_plan=<<"SQL_FILE_PLAN"; INSERT INTO sourcedatafile (aip_id, name) VALUES ( (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3 ); SQL_FILE_PLAN my $sql_locat_plan=<<"SQL_LOCAT_PLAN"; INSERT INTO sourcedatalocat (file_id, location, sourcetype, filesize) VALUES ( (SELECT sourcedatafile.id FROM sourcedatafile,aip WHERE sourcedatafile.aip_id=aip.id AND aip.ie_id=\$1 AND aip.version=\$2 AND sourcedatafile.name=\$3), \$4, \$5, \$6 ); SQL_LOCAT_PLAN my $sql_dc_plan=<<"SQL_DC_PLAN"; INSERT INTO dc (aip_id, element, value) VALUES ( (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3, \$4 ); SQL_DC_PLAN my $sql_deleted_plan=<<"SQL_DELETE_PLAN"; INSERT INTO deleted (aip_id, reason, note, date, authorized_by) VALUES ( (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3, \$4, \$5, \$6 ); SQL_DELETE_PLAN my $sql_purged_plan=<<"SQL_PURGED_PLAN"; INSERT INTO purged (aip_id, reason, note, date, authorized_by) VALUES ( (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3, \$4, \$5, \$6 ); SQL_PURGED_PLAN my $sth_aip_plan = $dbh->prepare($sql_aip_plan); my $sth_ie_plan = $dbh->prepare($sql_ie_plan); my $sth_file_plan = $dbh->prepare($sql_file_plan); my $sth_locat_plan = $dbh->prepare($sql_locat_plan); my $sth_dc_plan = $dbh->prepare($sql_dc_plan); my $sth_deleted_plan = $dbh->prepare($sql_deleted_plan); my $sth_purged_plan = $dbh->prepare($sql_purged_plan); my $plans; $plans->{aip} = $sth_aip_plan; $plans->{ie} = $sth_ie_plan; $plans->{file} = $sth_file_plan; $plans->{locat} = $sth_locat_plan; $plans->{dc} = $sth_dc_plan; $plans->{deleted} = $sth_deleted_plan; $plans->{purged} = $sth_purged_plan; return $plans; } ############################################################################### # write add SQL entry, expects a hashref which contains ff. params # (foreach file location/copy): # INSERT INTO aip (ie_id) VALUES ($ieid); # INSERT INTO iefile (aip_id, location, sourcetype) VALUES ( # (SELECT id FROM aip where aip.ieid = $ieid), $location, $sourcetype); # INSERT INTO file (aip_id, name) VALUES ( # (SELECT id FROM aip where aip.ieid = $ieid), $name); # INSERT INTO locat (file_id, location, sourcetype) VALUES ( # (SELECT file.aip_id FROM file where file.aip_id = aip.id # AND aip.ie_id=$ieid), $location, $sourcetype) # INSERT INTO dc (aip_id, element, value) VALUES ( # (SELECT id FROM aip where aip.ieid = $ieid), $element, $value); # TODO: needs additional work # expects a reference of an hash: # $ret{"filename" } = $filename; # $ret{"title"} = $title; # $ret{"repid"} = $repid; # $ret{"files"} = \@files; # $ret{"dcrecords"} = \@dcrecords; ############################################################################### { my $lza_id_counter=0; my %lza_id_counter_hash; sub write_addsql($dbh, $plans, $refhash) { my $iefile = basename($refhash->{"filename"}); my $dir = dirname($refhash->{"filename"}); my ($ieid, $ieversion); if ($iefile =~ m/^V(\d+)-(IE\d*)\.xml$/) { $ieversion = $1; $ieid = $2; } else { die "Could not detect PID and Version from IEFile '$iefile'\n"; } my $lza_id_kv = List::Util::first { my ($dckey, $dcvalue) = @{$_}; (($dckey eq 'dc:identifier') and (check_lzaid($dcvalue) == 1)) } @{$refhash->{"dcrecords"}}; my $lza_id = @{$lza_id_kv}[1]; if (!defined $lza_id) { if (!exists( $lza_id_counter_hash{ $ieid} )) { $lza_id = sprintf("SLUB:LZA:migration_from:rosetta:unknown_%05lu", $lza_id_counter++); warn("no lza id detected on $refhash->{filename}, using $lza_id now!"); $lza_id_counter_hash{ $ieid} = $lza_id; } else { $lza_id = $lza_id_counter_hash{ $ieid}; } } # start SQL insert eval {$plans->{aip}->execute($ieid, $lza_id, $ieversion, $dir)} or die "sql problem detected with $ieid ($ieversion), plz check if there is a dublette dir, ", $dbh->errstr; # FIXME if multiple locations exists eval {$plans->{ie}->execute($ieid, $ieversion, $iefile, $sourcetype)} or die "sql problem detected with $ieid ($ieversion, $iefile, $sourcetype), ", $dbh->errstr; foreach my $location (@{$refhash->{"files"}}) { my $file = basename($location); # FIXME if multiple locations my $size = $refhash->{"sizes"}->{$location}; $location=~s#//#/#g; # post correct file paths, because sometimes in Rosetta // is used eval {$plans->{file}->execute($ieid, $ieversion, $file)} or die "sql problem detected with $ieid ($ieversion, $file), ", $dbh->errstr; eval {$plans->{locat}->execute($ieid, $ieversion, $file, $location, $sourcetype, $size)} or die "sql problem detected with $ieid ($ieversion, $file, $location, $sourcetype, $size), ", $dbh->errstr; } foreach my $dcpair (@{$refhash->{"dcrecords"}}) { my ($dckey, $dcvalue) = @{$dcpair}; # quote ' in dcvalue $dcvalue =~ tr/'/"/; eval {$plans->{dc}->execute($ieid, $ieversion, $dckey, $dcvalue)} or die "sql problem detected with $ieid ($ieversion, $dckey, $dcvalue), ", $dbh->errstr; } foreach my $delstate (qw(purged deleted)) { # we ignore "recovered" state here, because not an extra table if ($refhash->{$delstate}) { my $delete = $refhash->{$delstate}; # TODO: call plan plans->{deleted} my $state = $delete->{'state'}; my $reason = $delete->{'reason'}; my $note = $delete->{'note'}; my $date = $delete->{'date'}; my $authorized_by = $delete->{'authorized_by'}; eval { $plans->{$delstate}->execute($ieid, $ieversion, $reason, $note, $date, $authorized_by) } or die "sql problem detected with $delstate $ieid ($ieversion, $reason, $note, $date, $authorized_by), ", $dbh->errstr; } } return 1; } } ############################################################################### # add INDEX and other TRICKs to increase performance ############################################################################### sub write_index_creation($dbh) { my $sql1=<<"SQL_INDEX1"; CREATE UNIQUE INDEX aip_index ON aip (ie_id, version); SQL_INDEX1 my $sql2=<<"SQL_INDEX2"; CREATE UNIQUE INDEX sourcedata_index ON sourcedatafile (aip_id, name); SQL_INDEX2 my $sql3=<<"SQL_INDEX3"; CREATE INDEX delete_index ON deleted (aip_id); SQL_INDEX3 my $sql4=<<"SQL_INDEX4"; CREATE INDEX aip_index2 ON aip (lza_id); SQL_INDEX4 my $sql5=<<"SQL_INDEX5"; CREATE INDEX aip_index3 ON aip (ie_id,lza_id); SQL_INDEX5 my $sql6=<<"SQL_VIEW"; CREATE VIEW non_deleted_aip(id,ie_id,lza_id,version,dir_path) AS SELECT * FROM aip WHERE NOT EXISTS ( SELECT aip_id FROM deleted WHERE aip.id == aip_id ) AND NOT EXISTS ( SELECT aip_id FROM purged WHERE aip.id == aip_id ); SQL_VIEW my $sth1 = $dbh->prepare($sql1); $sth1->execute() or die "sql problem detected", $dbh->errstr; my $sth2 = $dbh->prepare($sql2); $sth2->execute() or die "sql problem detected", $dbh->errstr; my $sth3 = $dbh->prepare($sql3); $sth3->execute() or die "sql problem detected", $dbh->errstr; my $sth4 = $dbh->prepare($sql4); $sth4->execute() or die "sql problem detected", $dbh->errstr; my $sth5 = $dbh->prepare($sql5); $sth5->execute() or die "sql problem detected", $dbh->errstr; my $sth6 = $dbh->prepare($sql6); $sth6->execute() or die "sql problem detected", $dbh->errstr; return 1; } ############################################################################### # checks if a given string from from a given file contains only utf-8 chars # which are compatible to common used databases ############################################################################### sub check_if_db_conform ($string, $filename) { if ($string ne '') { if ( not utf8::is_utf8($string)) { croak "no utf8: '$string' in file '$filename'\n"; } }# return; } { my $xp; sub get_xpath_context { if (defined $xp) { return $xp;} $xp = XML::LibXML::XPathContext->new(); $xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx"); $xp->registerNs("sru", "http://www.loc.gov/zing/srw/"); $xp->registerNs("xsi", "http://www.w3.org/2001/XMLSchema-instance"); $xp->registerNs("dc", "http://purl.org/dc/elements/1.1/"); $xp->registerNs("mets", "http://www.loc.gov/METS/"); $xp->registerNs("rosettamets", "http://www.exlibrisgroup.com/xsd/dps/rosettaMets"); $xp->registerNs("mods", "http://www.loc.gov/mods/v3"); $xp->registerNs("ns2", "http://dps.exlibris.com/"); $xp->registerNs("dv", "http://dfg-viewer.de/"); $xp->registerNs("slub", "http://slub-dresden.de/"); $xp->registerNs("archive", "http://slub-dresden.de/slubarchiv"); $xp->registerNs("premis", "info:lc/xmlns/premis-v2"); $xp->registerNs("mix", "http://www.loc.gov/standards/mix/"); $xp->registerNs("xlink", "http://www.w3.org/1999/xlink"); $xp->registerNs("xlin", "http://www.w3.org/1999/xlink"); return $xp; } } { my $parser; sub get_parser ( $recovery_flag ){ if (defined $parser) { return $parser;} $parser = XML::LibXML->new(recover => $recovery_flag, compact => 1, no_network => 1, recover => $recovery_flag, no_blanks => 1, ); return $parser; } } ############################################################################### # # /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/dc:record[1]/dc:title[1] # /mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[2] # mit ID=Label und Wert = LOCAL # dort die ID von techMD (Referenz für Files) # # Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1] # ############################################################################### my $compiled_xpath_dmdSec = XML::LibXML::XPathExpression->new('/mets:mets/mets:dmdSec'); my $compiled_xpath_amdSec = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec'); my $compiled_xpath_amdSec_rep = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[starts-with(@ID, \'REP\')]'); my $compiled_xpath_amdSec_fl = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[starts-with(@ID, \'FL\')]'); my $compiled_xpath_amdSec_ie = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[@ID="ie-amd"]'); my $compiled_xpath_fileSec = XML::LibXML::XPathExpression->new('/mets:mets/mets:fileSec'); my $compiled_xpath_titles = XML::LibXML::XPathExpression->new('mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]'); my $compiled_xpath_fileGrp = XML::LibXML::XPathExpression->new('/mets:mets/mets:fileSec/mets:fileGrp'); my $compiled_xpath_files = XML::LibXML::XPathExpression->new("mets:file/mets:FLocat/\@xlin:href"); my $compiled_xpath_filepids = XML::LibXML::XPathExpression->new('mets:file/@ID'); my $compiled_xpath_dcrecords = XML::LibXML::XPathExpression->new('mets:mdWrap/mets:xmlData/dc:record/*'); my $compiled_xpath_repid = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[starts-with(@ID, \'REP\')]/@ID'); my $str_local_record = "mets:techMD/mets:mdWrap/mets:xmlData/*[local-name()=\'dnx\']/*[local-name()=\'section\']/*[local-name()=\'record\']"; my $str_local_reps = "$str_local_record/*[local-name()=\'key\' and \@id=\'label\' and (. = \'LOCAL\' or . = \'LZA_INTERN\' or . = \'LZA\')]"; my $str_repid_old = "/mets:mets/mets:amdSec[starts-with(\@ID, \'REP\') and $str_local_reps]/\@ID"; my $compiled_xpath_repid_old = XML::LibXML::XPathExpression->new($str_repid_old); # only event 272 or 274 should be used, next line uses a multiple-predicate hack for XPATH 1.0 my $xpath_if_anydelete_event_expr = 'mets:digiprovMD[@ID="ie-amd-digiprov"]/mets:mdWrap/mets:xmlData/dnx:dnx/dnx:section[@id="event"]/dnx:record[dnx:key[@id="eventIdentifierValue"][. >= 272][. <= 274]]'; my $compiled_xpath_anydelete_event = XML::LibXML::XPathExpression->new('dnx:key[@id="eventDescription"]/text()'); my $compiled_xpath_anydelete_date = XML::LibXML::XPathExpression->new('dnx:key[@id="eventDateTime"]/text()'); my $compiled_xpath_anydelete_authorized_by = XML::LibXML::XPathExpression->new('dnx:key[@id="linkingAgentIdentifierValue1"]/text()'); my $compiled_xpath_anydelete_reason = XML::LibXML::XPathExpression->new('dnx:key[@id="eventOutcomeDetail1"]/text()'); ############################################################ sub get_title ($xp, $dmd){ # get title my $title = $xp->findvalue($compiled_xpath_titles, $dmd); return $title; } sub get_dcrecords_ref ($xp, $dmd){ my @dcnodes = $xp->findnodes($compiled_xpath_dcrecords, $dmd); my @dcrecords = map { $_->[1]=~ tr/\n/ /; $_->[1] =~ s/'/\\'/g; $_; } grep { defined $_->[0] && defined $_->[1]; } map { # array ref [$_->getName, $_->textContent]; } @dcnodes; return \@dcrecords; } sub get_repid ($xp){ my $repid; my @repnodes = $xp->findnodes($compiled_xpath_repid); #/mets:mets/mets:amdSec/@ID my $found = scalar @repnodes; if (1 == $found) { $repid = $repnodes[0]->nodeValue; } elsif ($found > 1) { #multiple representations found # choose reppid with LZA, LZA_INTERN or LOCAL (for very old IEs) $repid = $xp->findvalue( $compiled_xpath_repid_old); } # if $found == 0 do nothing return $repid; } sub get_filepids_ref ($xp, $filegrp) { my @files_nodes = $xp->findnodes($compiled_xpath_filepids, $filegrp); my @filepids = map { $_->nodeValue; } @files_nodes; return \@filepids; } sub get_files_ref ($xp, $filegrp){ my @files_nodes = $xp->findnodes($compiled_xpath_files, $filegrp); my @files = map { $_->nodeValue; } @files_nodes; return \@files; } sub get_file_path($xp, $filegrp, $filepid) { return $xp->findvalue("./mets:file[\@ID=\"$filepid\"]/mets:FLocat/\@xlin:href", $filegrp); } sub get_file_path_rx($xml, $filepid) { # this method is around 2.5x slower than get_file_path my $nt = qr{[^>]*}; if ($xml=~m{(?:<mets:file${nt}ID="$filepid"$nt>\s*<mets:FLocat${nt}xlin:href=")([^"]*)}s) { #warn "path '$1'\n"; return $1; } } sub get_filesize ($xp, $filepid) { my $xpath =<<"XPATH"; /mets:mets/mets:amdSec[\@ID='$filepid-amd']/mets:techMD[\@ID='$filepid-amd-tech'] /mets:mdWrap/mets:xmlData/*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='dnx'] /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='section' and \@id='generalFileCharacteristics'] /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='record'] /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='key' and \@id='fileSizeBytes']/text() XPATH return $xp->findvalue($xpath); } sub get_filesize_rx($xml, $filepid) { #say "size -"; if ($xml=~m{(?:<mets:techMD\s*ID="$filepid-amd-tech">.*?<key id="fileSizeBytes">)([^<]*)}s) { #say "size $1"; return $1; } } sub get_anydelete_state($xp, $dnx) { # we need to earch for eventIdentifierValue 272 or 274. # the eventDescription should be # a) IE has been deleted # b) IE has been purged my @del_nodes = map {$_->nodeValue} $xp->findnodes($compiled_xpath_anydelete_event, $dnx); my @states = map {$_ =~ s/^IE has been (deleted|purged|recovered)$/$1/; $_} @del_nodes; return $states[0]; } sub has_anydelete_entries($xp, $dnx) { # we need to earch for eventIdentifierValue 272 or 274. # the eventDescription should be # a) IE has been deleted # b) IE has been purged return $xp->exists($compiled_xpath_anydelete_event, $dnx); } sub get_anydelete_date($xp, $dnx) { my @date_nodes = $xp->findnodes($compiled_xpath_anydelete_date, $dnx); my @dates = map {$_->nodeValue} @date_nodes; return $dates[0]; } sub get_anydelete_note($xp, $dnx) { my @entries_str = map {$_->nodeValue} $xp->findnodes($compiled_xpath_anydelete_reason, $dnx); my @results; foreach my $e ( @entries_str) { my @entries = split(/;/, $e); push @results, List::Util::first{ s/note=//; } @entries } return $results[0]; } sub get_anydelete_reason($xp, $dnx) { my @entries_str = map {$_->nodeValue} $xp->findnodes($compiled_xpath_anydelete_reason, $dnx); my @results; foreach my $e ( @entries_str) { my @entries = split(/;/, $e); push @results, List::Util::first{ s/reason=//; } @entries } return $results[0]; } sub get_anydelete_authorized_by($xp, $dnx) { my @auth = map {$_->nodeValue} $xp->findnodes($compiled_xpath_anydelete_authorized_by, $dnx); return $auth[0]; } sub check_if_file_is_readable($filename) { # workaround to check if NFS read errors my $ok = open(my $FH, '<', $filename); if (!$ok) { warn "file '$filename' could not be opened, $!\n"; return; } binmode($FH); my $buf; my $readc = sysread $FH, $buf, 4; unless (4 == $readc) { warn "file '$filename' could not read 4 bytes from (beginning file), $!\n"; return; } my $pos = sysseek $FH, -4, 2; unless (defined $pos) { warn "file '$filename' could not seek 4 bytes before end, $!\n"; return; } unless (4 == $readc) { warn "file '$filename' could not read 4 bytes from (end file), $!\n"; return; } close $FH; return 1; } sub _handle_deleted_or_purged_aips($xp, $ret) { my $amdsec_ie = $xp->findnodes($compiled_xpath_amdSec_ie)->[0]; my @ctxs = $xp->findnodes($xpath_if_anydelete_event_expr, $amdsec_ie); my $cmp_by_date_and_state = sub ($a, $b) { my $adate = get_anydelete_date($xp, $a); my $bdate = get_anydelete_date($xp, $b); my $dateres = $bdate cmp $adate; if (0 == $dateres) { my $astate = get_anydelete_state($xp, $a); my $bstate = get_anydelete_state($xp, $b); return $bstate cmp $astate # "purged" > "delete" } return $dateres; # newer > older }; my $latest_ctx = (sort { $cmp_by_date_and_state->($a, $b) } @ctxs)[0]; # check if latest entry is either deleted or purged, only than add entry my $is_deleted_or_purged = has_anydelete_entries($xp, $latest_ctx); if ($is_deleted_or_purged) { my $state = get_anydelete_state($xp, $latest_ctx); $ret->{$state}->{state} = $state; $ret->{$state}->{reason} = get_anydelete_reason($xp, $latest_ctx); $ret->{$state}->{note} = get_anydelete_note($xp, $latest_ctx); $ret->{$state}->{date} = get_anydelete_date($xp, $latest_ctx); $ret->{$state}->{authorized_by} = get_anydelete_authorized_by($xp, $latest_ctx); } return $is_deleted_or_purged; } sub parse_iexml($filename, $recovery_flag) { my $retry = 3; while ($retry > 0) { if (!check_if_file_is_readable($filename)) { $retry--; my $wait = int(rand(60)); warn "detected non-readable file $filename, retrying after $wait seconds ... ($retry)"; sleep $wait; next; } if ($recovery_flag) { $recovery_flag = 2; # avoid warnings, see XML::LibXML::Parser POD about 'recovery' } my $parser = get_parser($recovery_flag); my $dom = $parser->parse_file($filename); #my $slurp = path($filename)->slurp; my $xp = get_xpath_context(); $xp->setContextNode($dom); my $dmdsec = $xp->findnodes($compiled_xpath_dmdSec)->[0]; #my $amdsec_rep = $xp->findnodes($compiled_xpath_amdSec_rep)->[0]; my $filesec = $xp->findnodes($compiled_xpath_fileSec)->[0]; my $ret; $ret->{"filename"} = $filename; ############################################ # get title $ret->{"title"} = get_title($xp, $dmdsec); ############################################ # get dc-records $ret->{"dcrecords"} = get_dcrecords_ref($xp, $dmdsec); ############################################ my $is_deleted = _handle_deleted_or_purged_aips($xp, $ret); ############################################ # get right representation ID (has a dnx-section with <key id=label>LOCAL</key>) my $repid = get_repid($xp); if (!defined $repid) { say STDERR "No repid found in file $filename, is IE intentionally purged?"; if ($is_deleted) { say STDERR " Yes, a corresponding purge/delete/recover event is found."; } else { say STDERR " No, a corresponding purge/delete/recover event was missed. This indicates an error in archive."; } return $ret; } $ret->{"repid"} = $repid; ############################################ # get all files of LOCAL representation my $filegrp = $xp->findnodes("mets:fileGrp[\@ADMID='$repid']", $filesec)->[0]; $ret->{"filepids"} = get_filepids_ref($xp, $filegrp); $ret->{"files"} = get_files_ref($xp, $filegrp); my @locations = map {get_file_path($xp, $filegrp, $_)} @{$ret->{"filepids"}}; my @sizes = map {get_filesize($xp, $_)} @{$ret->{"filepids"}}; my %ls = map { $_->[0] => $_->[1] } zip \@locations, \@sizes; $ret->{"sizes"} = \%ls; #use Data::Printer; p($ret); return $ret; } return; } # returns count of subdirs of $dir sub searching_relevant_subdirs ($dir) { my $first_two_levels_of_dirs = 0; my $rx_dir = qr{^\Q$dir\E/?}; ### my $wanted_twolevel_dircount = sub { if (/^\.[A-Za-z0-9]/) { # ignore dot-dirs $File::Find::prune = 1; } else { my $relpath = $File::Find::name; $relpath =~ s{$rx_dir}{}; my $depth = File::Spec->splitdir($relpath); $depth >= 2 and $File::Find::prune = 1; if (-d $_) {$first_two_levels_of_dirs++;} } }; ### find($wanted_twolevel_dircount, $dir); return $first_two_levels_of_dirs; } sub searching_relevant_ie_files ($dir, $tmp_ies_unsorted_file, $first_two_levels_of_dirs) { my $progressbar = Time::Progress->new(min => 0, max => $first_two_levels_of_dirs, smoothing => 1); my $rx_dir = qr{^\Q$dir\E/?}; ### my $dircount = 0; my $wanted_process_sip = sub { if (-f && m/V(\d+)-IE\d+\.xml$/) { my $version = $1; my $file = $File::Find::name; # fake name to ue alphabetical sort my $fakeversion = sprintf("%05i", $version); $file =~ s/V\d+-IE/V$fakeversion-IE/; $tmp_ies_unsorted_file->append($file . "\n"); $dircount++; $File::Find::prune = 1; } elsif (-d) { if (/^\.[A-Za-z0-9]/) { # ignore dot-dirs $File::Find::prune = 1; } else { my $relpath = $File::Find::name; $relpath =~ s{$rx_dir}{}; my $depth = File::Spec->splitdir($relpath); if ($depth <= 2) { print $progressbar->report("find IE files: %40b ETA: %E \r", $dircount); } } } return; }; ### find($wanted_process_sip, $dir); return $dircount; } sub searching_ie_files ($dirs_ref, $tmp_ies_unsorted_file) { my $cnt_unsorted_files = 0; my $pass = 0; my @dirs = @{ $dirs_ref }; my $maxpass=scalar @dirs; foreach my $dir (@dirs) { $pass++; my $begin_time = time; my $first_two_levels_of_dirs = searching_relevant_subdirs($dir); my $count = searching_relevant_ie_files($dir, $tmp_ies_unsorted_file, $first_two_levels_of_dirs); my $end_time = time; my $diff_time = $end_time - $begin_time; say "found $count IEs in pass $pass/$maxpass (dir '$dir') in $diff_time s "; $cnt_unsorted_files+=$count; } say "\r "; return $cnt_unsorted_files; } ############################################################################### ############################################################################### ############# main ############################################################ ############################################################################### ############################################################################### my $tmp_ies_dir = Path::Tiny->tempdir(TEMPLATE => "exitstrategy_XXXXXXXXXXX", CLEANUP => 1); my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies"); my $flag_recovery = undef; my $flag_sqldump = undef; my $flag_continue = undef; my $db_filename = $db_name . ".db"; my $debug_to_file = undef; my @ARGV_tail; GetOptions( "help|?" => sub { say <<"HELP"; call $0 with following options --help ............... this help --recovery ........... set special recovery flag, if set deleted IEs will be ignored (handle with care!) --continue ........... tries to add IEs to existing database, ignores IEs which already exists in DB (dangerous!) --sqlitedb-file=FILE . set database to file FILE --enable-sqldump ..... dumps a given database as SQL to STDOUT --debug-to-file=FILE . dumps internal structure as YAML to FILE create an exit-DB: $0 [--recovery] [--sqlitedb-file=FILE] DIR [DIR…] dump an exit-DB as SQL $0 [--sqlitedb-file=FILE] --enable-sqldump HELP exit; }, "continue" => \$flag_continue, "recovery" => \$flag_recovery, "sqlitedb-file=s" => \$db_filename, "enable-sqldump" => \$flag_sqldump, "debug-to-file=s" => \$debug_to_file, '<>' => sub {push @ARGV_tail, @_;} ) or pod2usage(2); if (defined $flag_recovery) {warn "recovery enabled for XML processing\n";} if (defined $flag_sqldump) { open my $cmd, '|-', '/usr/bin/sqlite3', $db_filename, '.dump' or die "Failed to dump DB: $!\n"; close $cmd or die "Failed to dumb DB: $!\n"; exit; } if ($#ARGV_tail < 0) { die "you need at least a directory as argument\n"; } my $old_stdout = select(STDOUT); $| = 1; select($old_stdout); if ( (! defined $flag_continue) || (! -e $db_filename) ) { say "preparing SQL"; my $dbh = DBI->connect("dbi:SQLite:dbname=$db_filename", "", "", { RaiseError => 1, sqlite_unicode => 1, }) or die "could not connect to database (file '$db_filename')", $DBI::errstr; write_database_creation($dbh); write_tables_creation($dbh); write_prepare_insert($dbh); write_index_creation($dbh); $dbh->disconnect or warn("disconnecting problems, ", $dbh->errstr); } else { say "using existing DB (continue mode!)"; } if (0==@ARGV_tail ){ die "no directory given on commandline" } my @dirs; while (@ARGV_tail > 0) { my $dir = shift @ARGV_tail; if (defined $dir && -d "$dir") { push @dirs, $dir; } } $tmp_ies_unsorted_file->touch(); say "searching IE files"; my $cnt_unsorted_files = searching_ie_files(\@dirs, $tmp_ies_unsorted_file); # /permanent_storage/2020/04/02/IE201080/V1-FL201091.xml # /permanent_storage/2020/04/02/IE201080/V2-FL201091.xml my $fh_unsorted_IEs = $tmp_ies_unsorted_file->openr(); my $count = 0; my $progressbar = Time::Progress->new(min => 1, max => $cnt_unsorted_files); my $dbh = DBI->connect("dbi:SQLite:dbname=$db_filename", "", "", { RaiseError => 1, sqlite_unicode => 1, }) or die "could not connect to database (file '$db_filename')", $DBI::errstr; my $begin_time = time; my $plans = prepare_addsql( $dbh); while (<$fh_unsorted_IEs>) { chomp; print $progressbar->report("parse IE files: %40b ETA: %E \r", $count++); s/V0*(\d+-IE)/V$1/; # revert fake version my $ret = parse_iexml($_, $flag_recovery); if (defined $debug_to_file and length($debug_to_file)>2) { require YAML::XS; YAML::XS::DumpFile($debug_to_file, $ret) or warn "could not dump to file $debug_to_file"; } $dbh->begin_work; eval { write_addsql($dbh, $plans, $ret); }; if ($@) { $dbh->rollback; say "Rollback, because $@"; } else { $dbh->commit; } } $dbh->disconnect or warn("disconnecting problems, ", $dbh->errstr); my $end_time = time; my $diff_time = $end_time - $begin_time; say "\rprocessed $count uniq IEs in $diff_time s "; say ""; 1;