diff --git a/perl/exit_strategy.pl b/perl/exit_strategy.pl index 0c13d5eaf4431a3127376d1a4fb6d8bfc8420d37..f1b080f4c52c508ee6249d86ed806444d0462404 100644 --- a/perl/exit_strategy.pl +++ b/perl/exit_strategy.pl @@ -3,19 +3,13 @@ # Author: Andreas Romeyke # SLUB Dresden, Department Longterm Preservation # -# scans a given repository and creates an SQL script to create a database. +# scans a given repository and creates a SQLite database. # This is part of the exit-strategy for details, see asciidoc file # exit_strategie.asciidoc (also contains ER-diagram for database) # # call with: # -# perl ./exit_strategy.pl rosetta_exit_strategy/tmp.sql /permanent/ -# -# file tested with postgres-database -# -# using then with: -# psql -U romeyke -d exit_strategy \ -# -f rosetta_exit_strategy/tmp.sql -L rosetta_exit.log +# perl ./exit_strategy.pl rosetta_exit_strategy/tmp.db /permanent/ # ############################################################################### # @@ -34,7 +28,7 @@ # # then (and only then) try this: # -# perl ./exit_strategy.pl --recover rosetta_exit_strategy/tmp.sql /permanent/ +# perl ./exit_strategy.pl --recover rosetta_exit_strategy/tmp.db /permanent/ # ############################################################################### @@ -47,7 +41,6 @@ use utf8; # for debugging output use Carp; use Path::Tiny; use File::Find; -use File::Sort qw(sort_file); use XML::LibXML; use Time::Progress; use XML::LibXML::XPathContext; @@ -93,7 +86,8 @@ sub write_tables_creation ($dbh) { /* create AIP table */ CREATE TABLE aip ( id INTEGER, - ie_id TEXT NOT NULL UNIQUE, + ie_id TEXT NOT NULL, + version INTEGER NOT NULL, PRIMARY KEY(id AUTOINCREMENT) ); SQL_CREATE1 @@ -175,30 +169,36 @@ sub write_prepare_insert ($dbh) { # $ret{"dcrecords"} = \@dcrecords; ############################################################################### sub write_addsql ($dbh, $refhash) { - my $ieid = path($refhash->{"filename"})->basename(qw/.xml/); + my $iefile = path($refhash->{"filename"})->basename(); + my ($ieid,$ieversion); + if ($iefile =~ m/^V(\d+)-(IE\d*)\.xml$/) { + $ieversion=$1; $ieid=$2; + } else { + die "Could not detect PID and Version from IEFile '$iefile'\n"; + } my $sql_aip_plan=<<"SQL_AIP_PLAN"; - INSERT INTO aip (ie_id) VALUES (\$1); + INSERT INTO aip (ie_id, version) VALUES (\$1, \$2); SQL_AIP_PLAN my $sql_ie_plan=<<"SQL_IE_PLAN"; INSERT INTO metadatafile (aip_id, location, sourcetype) VALUES ( - (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3 + (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3, \$4 ); SQL_IE_PLAN my $sql_file_plan=<<"SQL_FILE_PLAN"; INSERT INTO sourcedatafile (aip_id, name) VALUES ( - (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2 + (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3 ); SQL_FILE_PLAN my $sql_locat_plan=<<"SQL_LOCAT_PLAN"; INSERT INTO sourcedatalocat (file_id, location, sourcetype) VALUES ( (SELECT sourcedatafile.id FROM sourcedatafile,aip WHERE - sourcedatafile.aip_id=aip.id AND aip.ie_id=\$1 AND - sourcedatafile.name=\$2), \$3, \$4 + sourcedatafile.aip_id=aip.id AND aip.ie_id=\$1 AND aip.version=\$2 AND + sourcedatafile.name=\$3), \$4, \$5 ); SQL_LOCAT_PLAN my $sql_dc_pan=<<"SQL_DC_PLAN"; INSERT INTO dc (aip_id, element, value) VALUES ( - (SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3 + (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3, \$4 ); SQL_DC_PLAN my $sth_aip_plan = $dbh->prepare($sql_aip_plan); @@ -207,20 +207,20 @@ SQL_DC_PLAN my $sth_locat_plan = $dbh->prepare($sql_locat_plan); my $sth_dc_plan = $dbh->prepare($sql_dc_pan); # start SQL insert - $sth_aip_plan->execute($ieid) or die "sql problem detected", $dbh->errstr; + $sth_aip_plan->execute($ieid, $ieversion) or die "sql problem detected", $dbh->errstr; # FIXME if multiple locations exists - my $iefile = path($refhash->{"filename"})->basename(); - $sth_ie_plan->execute( $ieid, $iefile, $sourcetype) or die "sql problem detected", $dbh->errstr; + + $sth_ie_plan->execute( $ieid, $ieversion, $iefile, $sourcetype) or die "sql problem detected", $dbh->errstr; foreach my $location (@{$refhash->{"files"}}) { my $file = path($location)->basename(); # FIXME if multiple locations - $sth_file_plan->execute($ieid, $file) or die "sql problem detected", $dbh->errstr; - $sth_locat_plan->execute($ieid, $file, $location, $sourcetype) or die "sql problem detected", $dbh->errstr; + $sth_file_plan->execute($ieid, $ieversion, $file) or die "sql problem detected", $dbh->errstr; + $sth_locat_plan->execute($ieid, $ieversion, $file, $location, $sourcetype) or die "sql problem detected", $dbh->errstr; } foreach my $dcpair (@{$refhash->{"dcrecords"}}) { my ($dckey,$dcvalue) = @{$dcpair}; # quote ' in dcvalue $dcvalue=~tr/'/"/; - $sth_dc_plan->execute($ieid, $dckey, $dcvalue) or die "sql problem detected", $dbh->errstr; + $sth_dc_plan->execute($ieid, $ieversion, $dckey, $dcvalue) or die "sql problem detected", $dbh->errstr; } return 1; } @@ -232,7 +232,7 @@ SQL_DC_PLAN ############################################################################### sub write_index_creation($dbh) { my $sql1=<<"SQL_INDEX1"; - CREATE UNIQUE INDEX aip_index on aip (ie_id); + CREATE UNIQUE INDEX aip_index on aip (ie_id, version); SQL_INDEX1 my $sql2=<<"SQL_INDEX2"; CREATE UNIQUE INDEX sourcedata_index on sourcedatafile (aip_id, name); @@ -258,7 +258,7 @@ sub check_if_db_conform ($string, $filename) { { my $xp; sub get_xpath_context { - if (defined $xp) { return $xp}; + if (defined $xp) { return $xp;} $xp = XML::LibXML::XPathContext->new(); $xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx"); $xp->registerNs("sru", "http://www.loc.gov/zing/srw/"); @@ -279,6 +279,18 @@ sub check_if_db_conform ($string, $filename) { } } +{ + my $parser; + sub get_parser ( $recovery_flag ){ + if (defined $parser) { return $parser;} + $parser = XML::LibXML->new( recover => $recovery_flag, + compact => 1, + no_network => 1, + recover => $recovery_flag, + ); + } +} + ############################################################################### # # /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/dc:record[1]/dc:title[1] @@ -346,14 +358,9 @@ sub check_if_db_conform ($string, $filename) { if ($recovery_flag) { $recovery_flag = 2; # avoid warnings, see XML::LibXML::Parser POD about 'recovery' } - my $dom = XML::LibXML->load_xml( - location => $filename, - recover => $recovery_flag, - compact => 1, - no_network => 1, - ); - my $xp = get_xpath_context(); - $xp->setContextNode($dom); + my $parser = get_parser( $recovery_flag); + my $dom = $parser->parse_file( $filename ); + my $xp = get_xpath_context(); $xp->setContextNode($dom); ############################################ # get title my $title = get_title($xp, $filename); @@ -365,21 +372,21 @@ sub check_if_db_conform ($string, $filename) { my $repid = get_repid($xp, $filename); if (!defined $repid) { say STDERR "No repid found in file $filename, is IE purged?"; - my %tmp; - $tmp{"filename"}=$filename; - $tmp{"purged"}=1; - return \%tmp; + my $tmp; + $tmp->{"filename"}=$filename; + $tmp->{"purged"}=1; + return $tmp; } ############################################ # get all files of LOCAL representation my $files_ref = get_files_ref ($xp, $filename, $repid); - my %ret; - $ret{"filename" } = $filename; - $ret{"title"} = $title; - $ret{"repid"} = $repid; - $ret{"files"} = $files_ref; - $ret{"dcrecords"} = $dcrecords_ref; - return \%ret; + my $ret; + $ret->{"filename" } = $filename; + $ret->{"title"} = $title; + $ret->{"repid"} = $repid; + $ret->{"files"} = $files_ref; + $ret->{"dcrecords"} = $dcrecords_ref; + return $ret; } } @@ -473,16 +480,14 @@ sub searching_ie_files ($dir, $tmp_ies_unsorted_file) { } - my $tmp_ies_dir = Path::Tiny->tempdir( TEMPLATE => "exitstrategy_XXXXXXXXXXX", CLEANUP => 1); - my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies"); - my $tmp_ies_sorted_file = $tmp_ies_dir->child("sorted_ies"); - my $tmp_ies_truncated_file = $tmp_ies_dir->child("truncated_ies"); - + ############################################################################### ############################################################################### ############# main ############################################################ ############################################################################### ############################################################################### +my $tmp_ies_dir = Path::Tiny->tempdir( TEMPLATE => "exitstrategy_XXXXXXXXXXX", CLEANUP => 1); +my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies"); my $flag_recovery = undef; my $flag_sqldump = undef; my $db_filename = $db_name.".db"; @@ -507,22 +512,18 @@ sub searching_ie_files ($dir, $tmp_ies_unsorted_file) { my $cnt_unsorted_files = searching_ie_files($dir, $tmp_ies_unsorted_file); # /permanent_storage/2020/04/02/IE201080/V1-FL201091.xml # /permanent_storage/2020/04/02/IE201080/V2-FL201091.xml - say "sorting IE files"; - sort_file({ - I => $tmp_ies_unsorted_file->absolute()->stringify, - o => $tmp_ies_sorted_file->absolute()->stringify, - }); - my $cnt_truncated_files = find_newest_iefile_version ($tmp_ies_sorted_file, $tmp_ies_truncated_file, $cnt_unsorted_files ); - # now operate on truncated - my $fh_truncated_IEs = $tmp_ies_truncated_file->openr(); + my $fh_unsorted_IEs = $tmp_ies_unsorted_file->openr(); my $count=0; - my $progressbar =Time::Progress->new(min => 0, max => $cnt_truncated_files, smoothing => 1); - my $dbh = DBI->connect("dbi:SQLite:dbname=$db_filename", "", "") or die "could not connect to database (file '$db_filename')", $DBI::errstr; + my $progressbar =Time::Progress->new(min => 0, max => $cnt_unsorted_files, smoothing => 1); + my $dbh = DBI->connect("dbi:SQLite:dbname=$db_filename", "", "", { + RaiseError => 1, + sqlite_unicode => 1, + }) or die "could not connect to database (file '$db_filename')", $DBI::errstr; write_database_creation($dbh); write_tables_creation($dbh); write_prepare_insert($dbh); write_index_creation($dbh); - while( <$fh_truncated_IEs> ) { + while( <$fh_unsorted_IEs> ) { chomp; print $progressbar->report("parse IE files: %40b ETA: %E \r", $count++); s/V0*(\d+-IE)/V$1/; # revert fake version @@ -530,7 +531,6 @@ sub searching_ie_files ($dir, $tmp_ies_unsorted_file) { write_addsql($dbh, $ret); } say ""; - $dbh->disconnect or warn("disconnecting problems, ", $dbh->errstr); say "processed $count uniq IEs"; } else {