Skip to content
Snippets Groups Projects
Commit 03790c71 authored by Andreas Romeyke's avatar Andreas Romeyke
Browse files

- fixed comments

- added support for multiple AIP versions
- fixed 'ie_id' in table 'aip' (== IEPID)
- removed deprecated code
parent 12ca063a
No related branches found
No related tags found
No related merge requests found
......@@ -3,19 +3,13 @@
# Author: Andreas Romeyke
# SLUB Dresden, Department Longterm Preservation
#
# scans a given repository and creates an SQL script to create a database.
# scans a given repository and creates a SQLite database.
# This is part of the exit-strategy for details, see asciidoc file
# exit_strategie.asciidoc (also contains ER-diagram for database)
#
# call with:
#
# perl ./exit_strategy.pl rosetta_exit_strategy/tmp.sql /permanent/
#
# file tested with postgres-database
#
# using then with:
# psql -U romeyke -d exit_strategy \
# -f rosetta_exit_strategy/tmp.sql -L rosetta_exit.log
# perl ./exit_strategy.pl rosetta_exit_strategy/tmp.db /permanent/
#
###############################################################################
#
......@@ -34,7 +28,7 @@
#
# then (and only then) try this:
#
# perl ./exit_strategy.pl --recover rosetta_exit_strategy/tmp.sql /permanent/
# perl ./exit_strategy.pl --recover rosetta_exit_strategy/tmp.db /permanent/
#
###############################################################################
......@@ -47,7 +41,6 @@ use utf8; # for debugging output
use Carp;
use Path::Tiny;
use File::Find;
use File::Sort qw(sort_file);
use XML::LibXML;
use Time::Progress;
use XML::LibXML::XPathContext;
......@@ -93,7 +86,8 @@ sub write_tables_creation ($dbh) {
/* create AIP table */
CREATE TABLE aip (
id INTEGER,
ie_id TEXT NOT NULL UNIQUE,
ie_id TEXT NOT NULL,
version INTEGER NOT NULL,
PRIMARY KEY(id AUTOINCREMENT)
);
SQL_CREATE1
......@@ -175,30 +169,36 @@ sub write_prepare_insert ($dbh) {
# $ret{"dcrecords"} = \@dcrecords;
###############################################################################
sub write_addsql ($dbh, $refhash) {
my $ieid = path($refhash->{"filename"})->basename(qw/.xml/);
my $iefile = path($refhash->{"filename"})->basename();
my ($ieid,$ieversion);
if ($iefile =~ m/^V(\d+)-(IE\d*)\.xml$/) {
$ieversion=$1; $ieid=$2;
} else {
die "Could not detect PID and Version from IEFile '$iefile'\n";
}
my $sql_aip_plan=<<"SQL_AIP_PLAN";
INSERT INTO aip (ie_id) VALUES (\$1);
INSERT INTO aip (ie_id, version) VALUES (\$1, \$2);
SQL_AIP_PLAN
my $sql_ie_plan=<<"SQL_IE_PLAN";
INSERT INTO metadatafile (aip_id, location, sourcetype) VALUES (
(SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3
(SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3, \$4
);
SQL_IE_PLAN
my $sql_file_plan=<<"SQL_FILE_PLAN";
INSERT INTO sourcedatafile (aip_id, name) VALUES (
(SELECT id FROM aip WHERE aip.ie_id=\$1), \$2
(SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3
);
SQL_FILE_PLAN
my $sql_locat_plan=<<"SQL_LOCAT_PLAN";
INSERT INTO sourcedatalocat (file_id, location, sourcetype) VALUES (
(SELECT sourcedatafile.id FROM sourcedatafile,aip WHERE
sourcedatafile.aip_id=aip.id AND aip.ie_id=\$1 AND
sourcedatafile.name=\$2), \$3, \$4
sourcedatafile.aip_id=aip.id AND aip.ie_id=\$1 AND aip.version=\$2 AND
sourcedatafile.name=\$3), \$4, \$5
);
SQL_LOCAT_PLAN
my $sql_dc_pan=<<"SQL_DC_PLAN";
INSERT INTO dc (aip_id, element, value) VALUES (
(SELECT id FROM aip WHERE aip.ie_id=\$1), \$2, \$3
(SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3, \$4
);
SQL_DC_PLAN
my $sth_aip_plan = $dbh->prepare($sql_aip_plan);
......@@ -207,20 +207,20 @@ SQL_DC_PLAN
my $sth_locat_plan = $dbh->prepare($sql_locat_plan);
my $sth_dc_plan = $dbh->prepare($sql_dc_pan);
# start SQL insert
$sth_aip_plan->execute($ieid) or die "sql problem detected", $dbh->errstr;
$sth_aip_plan->execute($ieid, $ieversion) or die "sql problem detected", $dbh->errstr;
# FIXME if multiple locations exists
my $iefile = path($refhash->{"filename"})->basename();
$sth_ie_plan->execute( $ieid, $iefile, $sourcetype) or die "sql problem detected", $dbh->errstr;
$sth_ie_plan->execute( $ieid, $ieversion, $iefile, $sourcetype) or die "sql problem detected", $dbh->errstr;
foreach my $location (@{$refhash->{"files"}}) {
my $file = path($location)->basename(); # FIXME if multiple locations
$sth_file_plan->execute($ieid, $file) or die "sql problem detected", $dbh->errstr;
$sth_locat_plan->execute($ieid, $file, $location, $sourcetype) or die "sql problem detected", $dbh->errstr;
$sth_file_plan->execute($ieid, $ieversion, $file) or die "sql problem detected", $dbh->errstr;
$sth_locat_plan->execute($ieid, $ieversion, $file, $location, $sourcetype) or die "sql problem detected", $dbh->errstr;
}
foreach my $dcpair (@{$refhash->{"dcrecords"}}) {
my ($dckey,$dcvalue) = @{$dcpair};
# quote ' in dcvalue
$dcvalue=~tr/'/"/;
$sth_dc_plan->execute($ieid, $dckey, $dcvalue) or die "sql problem detected", $dbh->errstr;
$sth_dc_plan->execute($ieid, $ieversion, $dckey, $dcvalue) or die "sql problem detected", $dbh->errstr;
}
return 1;
}
......@@ -232,7 +232,7 @@ SQL_DC_PLAN
###############################################################################
sub write_index_creation($dbh) {
my $sql1=<<"SQL_INDEX1";
CREATE UNIQUE INDEX aip_index on aip (ie_id);
CREATE UNIQUE INDEX aip_index on aip (ie_id, version);
SQL_INDEX1
my $sql2=<<"SQL_INDEX2";
CREATE UNIQUE INDEX sourcedata_index on sourcedatafile (aip_id, name);
......@@ -258,7 +258,7 @@ sub check_if_db_conform ($string, $filename) {
{
my $xp;
sub get_xpath_context {
if (defined $xp) { return $xp};
if (defined $xp) { return $xp;}
$xp = XML::LibXML::XPathContext->new();
$xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx");
$xp->registerNs("sru", "http://www.loc.gov/zing/srw/");
......@@ -279,6 +279,18 @@ sub check_if_db_conform ($string, $filename) {
}
}
{
my $parser;
sub get_parser ( $recovery_flag ){
if (defined $parser) { return $parser;}
$parser = XML::LibXML->new( recover => $recovery_flag,
compact => 1,
no_network => 1,
recover => $recovery_flag,
);
}
}
###############################################################################
#
# /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/dc:record[1]/dc:title[1]
......@@ -346,14 +358,9 @@ sub check_if_db_conform ($string, $filename) {
if ($recovery_flag) {
$recovery_flag = 2; # avoid warnings, see XML::LibXML::Parser POD about 'recovery'
}
my $dom = XML::LibXML->load_xml(
location => $filename,
recover => $recovery_flag,
compact => 1,
no_network => 1,
);
my $xp = get_xpath_context();
$xp->setContextNode($dom);
my $parser = get_parser( $recovery_flag);
my $dom = $parser->parse_file( $filename );
my $xp = get_xpath_context(); $xp->setContextNode($dom);
############################################
# get title
my $title = get_title($xp, $filename);
......@@ -365,21 +372,21 @@ sub check_if_db_conform ($string, $filename) {
my $repid = get_repid($xp, $filename);
if (!defined $repid) {
say STDERR "No repid found in file $filename, is IE purged?";
my %tmp;
$tmp{"filename"}=$filename;
$tmp{"purged"}=1;
return \%tmp;
my $tmp;
$tmp->{"filename"}=$filename;
$tmp->{"purged"}=1;
return $tmp;
}
############################################
# get all files of LOCAL representation
my $files_ref = get_files_ref ($xp, $filename, $repid);
my %ret;
$ret{"filename" } = $filename;
$ret{"title"} = $title;
$ret{"repid"} = $repid;
$ret{"files"} = $files_ref;
$ret{"dcrecords"} = $dcrecords_ref;
return \%ret;
my $ret;
$ret->{"filename" } = $filename;
$ret->{"title"} = $title;
$ret->{"repid"} = $repid;
$ret->{"files"} = $files_ref;
$ret->{"dcrecords"} = $dcrecords_ref;
return $ret;
}
}
......@@ -473,16 +480,14 @@ sub searching_ie_files ($dir, $tmp_ies_unsorted_file) {
}
my $tmp_ies_dir = Path::Tiny->tempdir( TEMPLATE => "exitstrategy_XXXXXXXXXXX", CLEANUP => 1);
my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies");
my $tmp_ies_sorted_file = $tmp_ies_dir->child("sorted_ies");
my $tmp_ies_truncated_file = $tmp_ies_dir->child("truncated_ies");
###############################################################################
###############################################################################
############# main ############################################################
###############################################################################
###############################################################################
my $tmp_ies_dir = Path::Tiny->tempdir( TEMPLATE => "exitstrategy_XXXXXXXXXXX", CLEANUP => 1);
my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies");
my $flag_recovery = undef;
my $flag_sqldump = undef;
my $db_filename = $db_name.".db";
......@@ -507,22 +512,18 @@ sub searching_ie_files ($dir, $tmp_ies_unsorted_file) {
my $cnt_unsorted_files = searching_ie_files($dir, $tmp_ies_unsorted_file);
# /permanent_storage/2020/04/02/IE201080/V1-FL201091.xml
# /permanent_storage/2020/04/02/IE201080/V2-FL201091.xml
say "sorting IE files";
sort_file({
I => $tmp_ies_unsorted_file->absolute()->stringify,
o => $tmp_ies_sorted_file->absolute()->stringify,
});
my $cnt_truncated_files = find_newest_iefile_version ($tmp_ies_sorted_file, $tmp_ies_truncated_file, $cnt_unsorted_files );
# now operate on truncated
my $fh_truncated_IEs = $tmp_ies_truncated_file->openr();
my $fh_unsorted_IEs = $tmp_ies_unsorted_file->openr();
my $count=0;
my $progressbar =Time::Progress->new(min => 0, max => $cnt_truncated_files, smoothing => 1);
my $dbh = DBI->connect("dbi:SQLite:dbname=$db_filename", "", "") or die "could not connect to database (file '$db_filename')", $DBI::errstr;
my $progressbar =Time::Progress->new(min => 0, max => $cnt_unsorted_files, smoothing => 1);
my $dbh = DBI->connect("dbi:SQLite:dbname=$db_filename", "", "", {
RaiseError => 1,
sqlite_unicode => 1,
}) or die "could not connect to database (file '$db_filename')", $DBI::errstr;
write_database_creation($dbh);
write_tables_creation($dbh);
write_prepare_insert($dbh);
write_index_creation($dbh);
while( <$fh_truncated_IEs> ) {
while( <$fh_unsorted_IEs> ) {
chomp;
print $progressbar->report("parse IE files: %40b ETA: %E \r", $count++);
s/V0*(\d+-IE)/V$1/; # revert fake version
......@@ -530,7 +531,6 @@ sub searching_ie_files ($dir, $tmp_ies_unsorted_file) {
write_addsql($dbh, $ret);
}
say "";
$dbh->disconnect or warn("disconnecting problems, ", $dbh->errstr);
say "processed $count uniq IEs";
} else {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment