Skip to content
Snippets Groups Projects
Commit b3c344d5 authored by Andreas Romeyke's avatar Andreas Romeyke
Browse files

- reintegrate patches from another free-floating repository:

- reorganized-modules-test-and-examples
- code-clean-up-removed-all-trailing-whitespaces
- improved-memory-usage-by-using-filehandles
- added-a-recovery-mode-if-AIPs-have-unclean-IE-XML
- fix-increased-field-size-for-dc-strings-to-prevent
- fixed-size-to-8192
parent 9fb93a76
No related branches found
No related tags found
No related merge requests found
...@@ -7,23 +7,51 @@ ...@@ -7,23 +7,51 @@
# This is part of the exit-strategy for details, see asciidoc file # This is part of the exit-strategy for details, see asciidoc file
# exit_strategie.asciidoc (also contains ER-diagram for database) # exit_strategie.asciidoc (also contains ER-diagram for database)
# #
# call with:
#
# perl ./exit_strategy.pl rosetta_exit_strategy/tmp.sql /permanent/
#
# file tested with postgres-database # file tested with postgres-database
# #
# using: # using then with:
# psql -U romeyke -d exit_strategy \ # psql -U romeyke -d exit_strategy \
# -f rosetta_exit_strategy/tmp.sql -L rosetta_exit.log # -f rosetta_exit_strategy/tmp.sql -L rosetta_exit.log
# #
############################################################################### ###############################################################################
#
# WARNING:
#
# the following messages only occur if you had an unclean SIP ingest process;
# they mean that your IE-XML contains wrong/unused namespace declarations
#
# if some AIPs are wrong with messages like:
#
# '/permanent_storage/normal/2017/07/05/IE1043/V1-IE1043.xml:6:
# namespace error : xmlns:mods: 'http://www.loc.gov/mods/v3
# http://www.loc.gov/standards/mods/v3/mods-3-0.xsd' is not a valid URI
# s="http://www.loc.gov/mods/v3
# http://www.loc.gov/standards/mods/v3/mods-3-0.xsd"'
#
# then (and only then) try this:
#
# perl ./exit_strategy.pl --recovery rosetta_exit_strategy/tmp.sql /permanent/
#
###############################################################################
use 5.28.0;
use strict; use strict;
use warnings; use warnings;
use feature "say";
use Carp; use Carp;
use Path::Tiny; use Path::Tiny;
use File::Find; use File::Find;
use File::Sort qw(sort_file); use File::Sort qw(sort_file);
use XML::XPath; use XML::LibXML;
use XML::XPath::XMLParser; use Time::Progress;
use XML::LibXML::XPathContext;
use Getopt::Long;
use constant DEBUG => 0; # no debug
# guarantee, that output will be UTF8 # guarantee, that output will be UTF8
binmode(STDOUT, ":encoding(UTF-8)"); binmode(STDOUT, ":encoding(UTF-8)");
...@@ -38,90 +66,94 @@ my $sourcetype="hdd"; #default value ...@@ -38,90 +66,94 @@ my $sourcetype="hdd"; #default value
# if IE.xml file found, read its metadata, create SQL add entry # if IE.xml file found, read its metadata, create SQL add entry
# write SQL add entry # write SQL add entry
############################################################################### ###############################################################################
# Placeholder for database/schema creation statements.
#
# Parameter:
#   $fh - output file handle of the SQL file being generated
#
# Nothing is written at the moment: the statements below are kept commented
# out because they use non-standard SQL keywords.
# Returns nothing.
sub write_database_creation ($) {
    my $fh = shift;
    # non standard conform SQL keywords
    #say $fh "CREATE DATABASE $db_name;";
    #say $fh "CREATE SCHEMA $schema_name;";
    #say $fh "USE ";
    return;
}
# write tables creation;: # write tables creation;:
# Write the DDL for the sequence and all tables (aip, metadatafile, dc,
# sourcedatafile, sourcedatalocat), wrapped in one BEGIN/COMMIT transaction.
#
# Parameter:
#   $fh - output file handle of the SQL file being generated
#
# All output goes to $fh only; nothing is printed to STDOUT.
# Returns nothing.
sub write_tables_creation ($) {
    my $fh = shift;
    my @ddl_lines = (
        # Transactions for tables creation
        "BEGIN;",
        # SEQUENCE
        "/* create SEQUENCE generator */",
        "CREATE SEQUENCE serial START 1;",
        # AIP
        "/* create AIP table */",
        "CREATE TABLE aip (",
        "\tid INT PRIMARY KEY DEFAULT nextval('serial'),",
        "\tie_id VARCHAR(30) NOT NULL UNIQUE",
        ");",
        # IEFILE
        "/* create IEFILE table */",
        "CREATE TABLE metadatafile (",
        "\tid INT PRIMARY KEY DEFAULT nextval('serial'),",
        "\taip_id INT NOT NULL REFERENCES aip (id),",
        "\tlocation VARCHAR(1024) NOT NULL,",
        "\tsourcetype VARCHAR(30) NOT NULL",
        ");",
        # DC
        "/* create DC table */",
        "CREATE TABLE dc (",
        "\tid INT PRIMARY KEY DEFAULT nextval('serial'),",
        "\taip_id INT NOT NULL REFERENCES aip (id),",
        "\telement VARCHAR(30) NOT NULL,",
        "\tvalue VARCHAR(8192) NOT NULL",
        ");",
        # FILE
        "/* create FILE table */",
        "CREATE TABLE sourcedatafile (",
        "\tid INT PRIMARY KEY DEFAULT nextval('serial'), ",
        "\taip_id INT NOT NULL REFERENCES aip (id),",
        "\tname VARCHAR(1024) NOT NULL",
        ");",
        # LOCAT
        "/* create LOCAT table */",
        "CREATE TABLE sourcedatalocat (",
        "\tid INT PRIMARY KEY DEFAULT nextval('serial'),",
        "\tfile_id INT NOT NULL REFERENCES sourcedatafile (id),",
        "\tlocation VARCHAR(1024) NOT NULL,",
        "\tsourcetype VARCHAR(30) NOT NULL",
        ");",
        #end transaction
        "COMMIT;",
    );
    foreach my $ddl_line (@ddl_lines) {
        say {$fh} $ddl_line;
    }
    return;
}
############################################################################### ###############################################################################
# Prepare SQL INSERT Statements for AIPs # Prepare SQL INSERT Statements for AIPs
############################################################################### ###############################################################################
# Write the PREPARE statements (aip_plan, ie_plan, file_plan, locat_plan,
# dc_plan) that the later EXECUTE statements rely on, wrapped in one
# BEGIN/COMMIT transaction.
#
# Parameter:
#   $fh - output file handle of the SQL file being generated
#
# All output goes to $fh only; nothing is printed to STDOUT.
# Returns nothing.
sub write_prepare_insert ($) {
    my $fh = shift;
    # single-quoted on purpose: $1..$4 are SQL placeholders, not Perl variables
    my @plan_lines = (
        'BEGIN;',
        'PREPARE aip_plan (varchar) AS',
        ' INSERT INTO aip (ie_id) VALUES ($1);',
        'PREPARE ie_plan (varchar, varchar, varchar) AS',
        ' INSERT INTO metadatafile (aip_id, location, sourcetype) VALUES (',
        ' (SELECT id FROM aip WHERE aip.ie_id=$1), $2, $3',
        ' );',
        'PREPARE file_plan (varchar, varchar) AS',
        ' INSERT INTO sourcedatafile (aip_id, name) VALUES (',
        ' (SELECT id FROM aip WHERE aip.ie_id=$1), $2',
        ' );',
        'PREPARE locat_plan (varchar, varchar, varchar, varchar) AS',
        ' INSERT INTO sourcedatalocat (file_id, location, sourcetype) VALUES (',
        ' (SELECT sourcedatafile.id FROM sourcedatafile,aip WHERE',
        ' sourcedatafile.aip_id=aip.id AND aip.ie_id=$1 AND',
        ' sourcedatafile.name=$2), $3, $4',
        ' );',
        'PREPARE dc_plan (varchar, varchar, varchar) AS',
        ' INSERT INTO dc (aip_id, element, value) VALUES (',
        ' (SELECT id FROM aip WHERE aip.ie_id=$1), $2, $3',
        ' );',
        'COMMIT;',
    );
    foreach my $plan_line (@plan_lines) {
        say {$fh} $plan_line;
    }
    return;
}
...@@ -147,28 +179,28 @@ sub write_prepare_insert { ...@@ -147,28 +179,28 @@ sub write_prepare_insert {
# $ret{"files"} = \@files; # $ret{"files"} = \@files;
# $ret{"dcrecords"} = \@dcrecords; # $ret{"dcrecords"} = \@dcrecords;
############################################################################### ###############################################################################
# Write the EXECUTE statements that register one IE in the database, wrapped
# in one BEGIN/COMMIT transaction.
#
# Parameters:
#   $fh      - output file handle of the SQL file being generated
#   $refhash - hash reference with the keys
#                "filename"  => path of the IE XML file,
#                "files"     => array reference of file locations,
#                "dcrecords" => array reference of [key, value] pairs
#
# Every value is emitted as a single-quoted SQL literal with embedded single
# quotes doubled, so a quote inside a file name or location cannot break the
# generated SQL.
# Returns nothing.
sub write_addsql ($$) {
    my $fh      = shift;
    my $refhash = shift;

    # render a value as SQL string literal, doubling embedded single quotes
    my $as_sql_literal = sub {
        my $text = shift;
        $text =~ s/'/''/g;
        return "'$text'";
    };

    my $ieid   = path($refhash->{"filename"})->basename(qw/.xml/);
    # FIXME if multiple locations exists
    my $iefile = path($refhash->{"filename"})->basename();

    my $sql_ieid       = $as_sql_literal->($ieid);
    my $sql_iefile     = $as_sql_literal->($iefile);
    my $sql_sourcetype = $as_sql_literal->($sourcetype);

    say {$fh} "BEGIN;";
    say {$fh} "EXECUTE aip_plan ($sql_ieid);";
    say {$fh} "EXECUTE ie_plan ($sql_ieid, $sql_iefile, $sql_sourcetype);";
    foreach my $location (@{ $refhash->{"files"} }) {
        my $file         = path($location)->basename(); # FIXME if multiple locations
        my $sql_file     = $as_sql_literal->($file);
        my $sql_location = $as_sql_literal->($location);
        say {$fh} "EXECUTE file_plan ($sql_ieid, $sql_file);";
        say {$fh} "EXECUTE locat_plan ($sql_ieid, $sql_file, $sql_location, $sql_sourcetype );";
    }
    foreach my $dcpair (@{ $refhash->{"dcrecords"} }) {
        my ($dckey, $dcvalue) = @{$dcpair};
        # existing behaviour kept: ' in dcvalue is mapped to "
        $dcvalue =~ tr/'/"/;
        my $sql_dckey   = $as_sql_literal->($dckey);
        my $sql_dcvalue = $as_sql_literal->($dcvalue);
        say {$fh} "EXECUTE dc_plan ( $sql_ieid, $sql_dckey, $sql_dcvalue);";
    }
    say {$fh} "COMMIT;";
    # "\n" plus the newline added by say leaves two blank lines between IEs
    say {$fh} "\n";
    return;
}
...@@ -177,10 +209,11 @@ sub write_addsql { ...@@ -177,10 +209,11 @@ sub write_addsql {
############################################################################### ###############################################################################
# add INDEX and other TRICKs to increase performance # add INDEX and other TRICKs to increase performance
############################################################################### ###############################################################################
# Write the index-creation section of the SQL file.
#
# Parameter:
#   $fh - output file handle of the SQL file being generated
#
# The statements are emitted as SQL comments ("-- ..."), so running the
# generated file does not create the index unless they are uncommented.
# All output goes to $fh only; nothing is printed to STDOUT.
# Returns nothing.
sub write_index_creation ($) {
    my $fh = shift;
    say {$fh} "-- BEGIN;";
    say {$fh} "-- CREATE UNIQUE INDEX aip_index on aip (ie_id);";
    say {$fh} "-- COMMIT;";
    return;
}
...@@ -210,20 +243,42 @@ sub check_if_db_conform ($$) { ...@@ -210,20 +243,42 @@ sub check_if_db_conform ($$) {
# Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1] # Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1]
# #
############################################################################### ###############################################################################
sub parse_iexml { sub parse_iexml ($$) {
my $filename = $_[0]; my $filename = shift;
my $recovery_flag = shift;
# create object # create object
my $xp = XML::XPath->new (filename => $filename); #
#my $xp = XML::XPath->new (filename => $filename);
my $dom = XML::LibXML->load_xml (location => $filename, recover => $recovery_flag, no_blanks=>1, compact=>1);
my $xp = XML::LibXML::XPathContext->new($dom);
$xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx");
$xp->registerNs("sru", "http://www.loc.gov/zing/srw/");
$xp->registerNs("xsi", "http://www.w3.org/2001/XMLSchema-instance");
$xp->registerNs("dc", "http://purl.org/dc/elements/1.1/");
$xp->registerNs("mets", "http://www.loc.gov/METS/");
$xp->registerNs("rosettamets", "http://www.exlibrisgroup.com/xsd/dps/rosettaMets");
$xp->registerNs("mods", "http://www.loc.gov/mods/v3");
$xp->registerNs("ns2", "http://dps.exlibris.com/");
$xp->registerNs("dv", "http://dfg-viewer.de/");
$xp->registerNs("slub", "http://slub-dresden.de/");
$xp->registerNs("archive", "http://slub-dresden.de/slubarchiv");
$xp->registerNs("premis", "info:lc/xmlns/premis-v2");
$xp->registerNs("mix", "http://www.loc.gov/standards/mix/");
$xp->registerNs("xlink", "http://www.w3.org/1999/xlink");
$xp->registerNs("xlin", "http://www.w3.org/1999/xlink");
############################################ ############################################
# get title # get title
my $title = $xp->findvalue('/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]'); my $compiled_xpath_titles = '/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]';
my $title = $xp->findvalue($compiled_xpath_titles);
check_if_db_conform($title, $filename); check_if_db_conform($title, $filename);
############################################ ############################################
# get dc-records # get dc-records
my @dcrecords; my @dcrecords;
my $dcnodes = $xp->find('/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*'); my $compiled_xpath_dcrecords='/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*';
my $dcnodes = $xp->find($compiled_xpath_dcrecords);
foreach my $dcnode ($dcnodes->get_nodelist) { foreach my $dcnode ($dcnodes->get_nodelist) {
my $key = $dcnode->getName("."); my $key = $dcnode->getName();
my $value = $dcnode->findvalue("."); my $value = $dcnode->findvalue(".");
if (defined $value) { if (defined $value) {
$value=~s/\n/ /g; $value=~s/\n/ /g;
...@@ -237,7 +292,12 @@ sub parse_iexml { ...@@ -237,7 +292,12 @@ sub parse_iexml {
} }
############################################ ############################################
# get right representation ID (has a dnx-section with <key id=label>LOCAL</key>) # get right representation ID (has a dnx-section with <key id=label>LOCAL</key>)
my $repids = $xp->find('/mets:mets/mets:amdSec'); my $compiled_xpath_amdsecs = '/mets:mets/mets:amdSec';
my $compiled_xpath_localreps = 'mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']';
my $compiled_xpath_filegrps = '/mets:mets/mets:fileSec/mets:fileGrp';
my $compiled_xpath_flocat = 'mets:file/mets:FLocat';
my $repids = $xp->find($compiled_xpath_amdsecs);
my $repid; my $repid;
# FIXME: if only one represenation exists (Qucosa), select this. If there # FIXME: if only one represenation exists (Qucosa), select this. If there
# are more than one, use them with label LOCAL # are more than one, use them with label LOCAL
...@@ -249,7 +309,8 @@ sub parse_iexml { ...@@ -249,7 +309,8 @@ sub parse_iexml {
check_if_db_conform($id, $filename); check_if_db_conform($id, $filename);
#/mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[1] #/mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[1]
# #
if ($node->findvalue('mets:techMD/mets:mdWrap/mets:xmlData/dnx/section/record/key[@id=\'label\']') eq 'LOCAL') {
if ($node->findvalue($compiled_xpath_localreps) eq 'LOCAL') {
$repid=$id; $repid=$id;
} }
#print XML::XPath::XMLParser::as_string($node), "\n\n"; #print XML::XPath::XMLParser::as_string($node), "\n\n";
...@@ -257,13 +318,13 @@ sub parse_iexml { ...@@ -257,13 +318,13 @@ sub parse_iexml {
############################################ ############################################
# get all files of LOCAL representation # get all files of LOCAL representation
my @files; my @files;
my $filegrpnodes = $xp->find('/mets:mets/mets:fileSec/mets:fileGrp'); my $filegrpnodes = $xp->find($compiled_xpath_filegrps);
foreach my $filegrpnode ($filegrpnodes->get_nodelist) { foreach my $filegrpnode ($filegrpnodes->get_nodelist) {
#die XML::XPath::XMLParser::as_string($filegrpnode), "\n\n"; #die XML::XPath::XMLParser::as_string($filegrpnode), "\n\n";
#die Dumper($filegrpnode); #die Dumper($filegrpnode);
if ($filegrpnode->findvalue('@ADMID') eq $repid) { if ($filegrpnode->findvalue('@ADMID') eq $repid) {
#die Dumper($filegrpnode); #die Dumper($filegrpnode);
my $filesnodes = $filegrpnode ->find("mets:file/mets:FLocat"); my $filesnodes = $filegrpnode ->find($compiled_xpath_flocat);
foreach my $filesnode ($filesnodes->get_nodelist) { foreach my $filesnode ($filesnodes->get_nodelist) {
my $value = $filesnode->findvalue('@xlin:href'); my $value = $filesnode->findvalue('@xlin:href');
check_if_db_conform($value, $filename); check_if_db_conform($value, $filename);
...@@ -295,11 +356,16 @@ sub parse_iexml { ...@@ -295,11 +356,16 @@ sub parse_iexml {
# and returns an array reference with reduced files using only highest V-value # and returns an array reference with reduced files using only highest V-value
# HINT, it only operates on sorted file list with fake versions (with zero-filled prefixes) # HINT, it only operates on sorted file list with fake versions (with zero-filled prefixes)
################################################################################ ################################################################################
sub find_newest_iefile_version ($$) { sub find_newest_iefile_version ($$$) {
my $files_sorted = shift; my $files_sorted = shift;
my $files_truncated = shift; my $files_truncated = shift;
my $cnt_files = shift;
my $cnt_truncated = 0;
my $fh = $files_sorted->filehandle("<"); my $fh = $files_sorted->filehandle("<");
my $last_entry; my $last_entry;
# FIXME, how many lines?
my $p = Time::Progress->new(min=>0, max=> $cnt_files);
my $i=0;
while(<$fh>) { while(<$fh>) {
my $entry = $_; my $entry = $_;
$entry =~ m/^(.+?V)(\d+)(-IE\d+\.xml)$/; $entry =~ m/^(.+?V)(\d+)(-IE\d+\.xml)$/;
...@@ -312,23 +378,38 @@ sub find_newest_iefile_version ($$) { ...@@ -312,23 +378,38 @@ sub find_newest_iefile_version ($$) {
if (($last_prefix eq $prefix ) && ($last_suffix eq $suffix) && ($last_version < $version)) { if (($last_prefix eq $prefix ) && ($last_suffix eq $suffix) && ($last_version < $version)) {
} else { } else {
$files_truncated->append($last_entry); $files_truncated->append($last_entry);
$cnt_truncated++;
} }
print $p->report("find newest IE files: %40b ETA: %E \r", $i++);
$last_entry = $entry; $last_entry = $entry;
} }
$files_truncated->append($last_entry); $files_truncated->append($last_entry);
return 1; $cnt_truncated++;
say "";
return $cnt_truncated;
} }
# begin closure
{
my $tmp_ies_dir = Path::Tiny->tempdir( TEMPLATE => "exitstrategy_XXXXXXXXXXX", CLEANUP => 1); my $tmp_ies_dir = Path::Tiny->tempdir( TEMPLATE => "exitstrategy_XXXXXXXXXXX", CLEANUP => 1);
my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies"); my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies");
my $tmp_ies_sorted_file = $tmp_ies_dir->child("sorted_ies"); my $tmp_ies_sorted_file = $tmp_ies_dir->child("sorted_ies");
my $tmp_ies_truncated_file = $tmp_ies_dir->child("truncated_ies"); my $tmp_ies_truncated_file = $tmp_ies_dir->child("truncated_ies");
my $cnt_unsorted_files=0;
############################################################################### ###############################################################################
# call back function to File::Find
#
############################################################################### ###############################################################################
############# main ############################################################
###############################################################################
###############################################################################
my $recovery = undef;
my @ARGV_tail;
GetOptions(
"recovery" => \$recovery,
'<>' => sub {push @ARGV_tail, @_;}
);
if ($#ARGV_tail != 1) {
die "you need a SQL-file and a directory as argument\n";
}
sub process_sip () { sub process_sip () {
my $file=$File::Find::name; my $file=$File::Find::name;
if ($file =~ m/V(\d+)-IE\d+\.xml$/) { if ($file =~ m/V(\d+)-IE\d+\.xml$/) {
...@@ -337,43 +418,50 @@ sub find_newest_iefile_version ($$) { ...@@ -337,43 +418,50 @@ sub find_newest_iefile_version ($$) {
my $fakeversion = sprintf("%05i",$version); my $fakeversion = sprintf("%05i",$version);
$file =~s/V(\d+)-IE/V$fakeversion-IE/; $file =~s/V(\d+)-IE/V$fakeversion-IE/;
$tmp_ies_unsorted_file -> append( $file."\n"); $tmp_ies_unsorted_file -> append( $file."\n");
$cnt_unsorted_files++;
} }
return; return;
} }
############################################################################### if (defined $recovery) { warn "recovery enabled for XML processing\n"; }
############################################################################### my $sqlfile = shift @ARGV_tail;
############# main ############################################################ if($sqlfile !~ m/[A-Za-z0-9]+\.sql$/) {die "SQL file should be named like 'foo.sql', but was '$sqlfile'\n";}
############################################################################### my $dir = shift @ARGV_tail;
############################################################################### open(my $fh, ">:encoding(UTF-8)", "$sqlfile") || die "could not open file '$sqlfile' for writing, $!";
my $dir = shift @ARGV;
if (defined $dir && -d "$dir") { if (defined $dir && -d "$dir") {
write_database_creation(); say "preparing SQL";
write_tables_creation(); write_database_creation($fh);
write_prepare_insert(); write_tables_creation($fh);
write_prepare_insert($fh);
$tmp_ies_unsorted_file->touch(); $tmp_ies_unsorted_file->touch();
say "searching IE files";
find(\&process_sip, $dir); find(\&process_sip, $dir);
# /permanent_storage/2020/04/02/IE201080/V1-FL201091.xml # /permanent_storage/2020/04/02/IE201080/V1-FL201091.xml
# /permanent_storage/2020/04/02/IE201080/V2-FL201091.xml # /permanent_storage/2020/04/02/IE201080/V2-FL201091.xml
say "sorting IE files";
sort_file({ sort_file({
I => $tmp_ies_unsorted_file->absolute()->stringify, I => $tmp_ies_unsorted_file->absolute()->stringify,
o => $tmp_ies_sorted_file->absolute()->stringify, o => $tmp_ies_sorted_file->absolute()->stringify,
}); });
find_newest_iefile_version ($tmp_ies_sorted_file, $tmp_ies_truncated_file ); my $cnt_truncated_files = find_newest_iefile_version ($tmp_ies_sorted_file, $tmp_ies_truncated_file, $cnt_unsorted_files );
# now operate on truncated # now operate on truncated
my $fh = $tmp_ies_truncated_file->openr(); my $fh_truncated_IEs = $tmp_ies_truncated_file->openr();
my $count=0; my $count=0;
while( <$fh> ) { my $p=Time::Progress->new(min => 0, max => $cnt_truncated_files);
while( <$fh_truncated_IEs> ) {
chomp; chomp;
$count++; print $p->report("parse IE files: %40b ETA: %E \r", $count++);
s/V(0*)(\d+-IE)/V$2/; # revert fake version s/V(0*)(\d+-IE)/V$2/; # revert fake version
my $ret = parse_iexml($_); my $ret = parse_iexml($_, $recovery);
write_addsql($ret); write_addsql($fh, $ret);
} }
write_index_creation(); say "";
warn "processed $count uniq IEs\n;" write_index_creation($fh);
say "processed $count uniq IEs";
} else { } else {
die "no directory given on commandline" die "no directory given on commandline"
} }
} #end closure print "\n";
close ($fh);
1; 1;
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment