Skip to content
Snippets Groups Projects
Commit 9015f8e7 authored by Andreas Romeyke's avatar Andreas Romeyke
Browse files

Merge branch 'development' into 'master'

r2022.1

Closes #5

See merge request !1
parents effe3162 0014a1e8
No related branches found
No related tags found
1 merge request!1r2022.1
...@@ -41,20 +41,21 @@ use warnings; ...@@ -41,20 +41,21 @@ use warnings;
use feature qw( say signatures ); use feature qw( say signatures );
no warnings "experimental::signatures"; no warnings "experimental::signatures";
use utf8; # for debugging output use utf8; # for debugging output
use constant DEBUG => 0; # no debug
use Carp; use Carp;
use Path::Tiny;
use File::Basename qw(basename);
use File::Find;
use XML::LibXML;
use Time::Progress;
use XML::LibXML::XPathContext;
use DBD::SQLite; use DBD::SQLite;
use DBI; use DBI;
use File::Basename qw(basename);
use File::Find;
use Getopt::Long; use Getopt::Long;
use IO::Handle;
use List::Util qw(any);
use Path::Tiny;
use Pod::Usage; use Pod::Usage;
use constant DEBUG => 0; # no debug use Time::Progress;
use XML::LibXML::XPathContext;
use XML::LibXML;
use IO::Handle;
STDOUT->autoflush(1); STDOUT->autoflush(1);
# guarantee, that output will be UTF8 # guarantee, that output will be UTF8
binmode(STDOUT, ":encoding(UTF-8)"); binmode(STDOUT, ":encoding(UTF-8)");
...@@ -273,7 +274,6 @@ sub check_if_db_conform ($string, $filename) { ...@@ -273,7 +274,6 @@ sub check_if_db_conform ($string, $filename) {
}# }#
return; return;
} }
{ {
my $xp; my $xp;
sub get_xpath_context { sub get_xpath_context {
...@@ -310,6 +310,9 @@ sub check_if_db_conform ($string, $filename) { ...@@ -310,6 +310,9 @@ sub check_if_db_conform ($string, $filename) {
} }
} }
############################################################################### ###############################################################################
# #
# /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/dc:record[1]/dc:title[1] # /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/dc:record[1]/dc:title[1]
...@@ -320,28 +323,30 @@ sub check_if_db_conform ($string, $filename) { ...@@ -320,28 +323,30 @@ sub check_if_db_conform ($string, $filename) {
# Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1] # Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1]
# #
############################################################################### ###############################################################################
{ my $compiled_xpath_dmdSec = XML::LibXML::XPathExpression->new('/mets:mets/mets:dmdSec');
my $compiled_xpath_titles = XML::LibXML::XPathExpression->new('/mets:mets/mets:dmdSec/mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]'); my $compiled_xpath_amdSec = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[starts-with(@ID, \'REP\')]');
my $compiled_xpath_dcrecords = XML::LibXML::XPathExpression->new('/mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/dc:record/*'); my $compiled_xpath_fileSec = XML::LibXML::XPathExpression->new('/mets:mets/mets:fileSec');
my $compiled_xpath_titles = XML::LibXML::XPathExpression->new('mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]');
my $compiled_xpath_files = XML::LibXML::XPathExpression->new("mets:file/mets:FLocat/\@xlin:href");
my $compiled_xpath_dcrecords = XML::LibXML::XPathExpression->new('mets:mdWrap/mets:xmlData/dc:record/*');
my $compiled_xpath_repid = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[starts-with(@ID, \'REP\')]/@ID'); my $compiled_xpath_repid = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[starts-with(@ID, \'REP\')]/@ID');
my $str_local_record = "mets:techMD/mets:mdWrap/mets:xmlData/*[local-name()=\'dnx\']/*[local-name()=\'section\']/*[local-name()=\'record\']"; my $str_local_record = "mets:techMD/mets:mdWrap/mets:xmlData/*[local-name()=\'dnx\']/*[local-name()=\'section\']/*[local-name()=\'record\']";
my $str_local_reps = "$str_local_record/*[local-name()=\'key\' and \@id=\'label\' and (. = \'LOCAL\' or . = \'LZA_INTERN\' or . = \'LZA\')]"; my $str_local_reps = "$str_local_record/*[local-name()=\'key\' and \@id=\'label\' and (. = \'LOCAL\' or . = \'LZA_INTERN\' or . = \'LZA\')]";
my $str_repid_old = "/mets:mets/mets:amdSec[starts-with(\@ID, \'REP\') and $str_local_reps]/\@ID"; my $str_repid_old = "/mets:mets/mets:amdSec[starts-with(\@ID, \'REP\') and $str_local_reps]/\@ID";
my $compiled_xpath_repid_old = XML::LibXML::XPathExpression->new($str_repid_old); my $compiled_xpath_repid_old = XML::LibXML::XPathExpression->new($str_repid_old);
# Selects the eventDescription text of every IE-level provenance event whose
# eventIdentifierValue is 272 or 274. Both comparisons are written as a
# predicate on the eventIdentifierValue key itself; a bare "or . = 274" at
# record level would compare the string value of the whole dnx:record with
# 274 and therefore never select event 274.
my $compiled_xpath_ifpurged_event = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[@ID="ie-amd"]/mets:digiprovMD[@ID="ie-amd-digiprov"]/mets:mdWrap/mets:xmlData/dnx:dnx/dnx:section[@id="event"]/dnx:record[dnx:key[@id="eventIdentifierValue"][. = 272 or . = 274]]/dnx:key[@id="eventDescription"]/text()');
############################################################ ############################################################
sub get_title ($xp, $filename){ sub get_title ($xp, $dmd){
# get title # get title
my $title = $xp->findvalue($compiled_xpath_titles); my $title = $xp->findvalue($compiled_xpath_titles, $dmd);
#check_if_db_conform($title, $filename);
return $title; return $title;
} }
sub get_dcrecords_ref ($xp, $filename){ sub get_dcrecords_ref ($xp, $dmd){
my @dcnodes = $xp->findnodes($compiled_xpath_dcrecords); my @dcnodes = $xp->findnodes($compiled_xpath_dcrecords, $dmd);
my @dcrecords = map { my @dcrecords = map {
$_->[1]=~ s/\n/ /g; $_->[1]=~ tr/\n/ /;
$_->[1] =~ s/'/\\'/g; $_->[1] =~ s/'/\\'/g;
#check_if_db_conform($_->[1], $filename);
$_; $_;
} grep { } grep {
defined $_->[0] && defined $_->[0] &&
...@@ -353,7 +358,7 @@ sub check_if_db_conform ($string, $filename) { ...@@ -353,7 +358,7 @@ sub check_if_db_conform ($string, $filename) {
return \@dcrecords; return \@dcrecords;
} }
sub get_repid ($xp, $filename){ sub get_repid ($xp){
my $repid; my $repid;
my @repnodes = $xp->findnodes($compiled_xpath_repid); #/mets:mets/mets:amdSec/@ID my @repnodes = $xp->findnodes($compiled_xpath_repid); #/mets:mets/mets:amdSec/@ID
my $found = scalar @repnodes; my $found = scalar @repnodes;
...@@ -367,30 +372,49 @@ sub check_if_db_conform ($string, $filename) { ...@@ -367,30 +372,49 @@ sub check_if_db_conform ($string, $filename) {
return $repid; return $repid;
} }
sub get_files_ref ($xp, $filename, $repid){ sub get_files_ref ($xp, $fsp, $repid){
my @files_nodes = $xp->findnodes("/mets:mets/mets:fileSec/mets:fileGrp[\@ADMID='$repid']/mets:file/mets:FLocat/\@xlin:href"); my $filegrp = $xp->findnodes("mets:fileGrp[\@ADMID='$repid']", $fsp)->[0];
my @files_nodes = $xp->findnodes($compiled_xpath_files, $filegrp);
my @files = map { my $tmp= $_->nodeValue; $tmp=~ s#//#/#g; $tmp } @files_nodes; my @files = map { my $tmp= $_->nodeValue; $tmp=~ s#//#/#g; $tmp } @files_nodes;
return \@files; return \@files;
} }
# Reports whether the parsed IE XML contains an explicit delete/purge event.
#
# $xp  - XPath context the compiled purge-event expression is evaluated with
# $amd - node passed along to the lookup as its context node
#
# Returns true if at least one selected eventDescription reads exactly
# "IE has been deleted" or "IE has been purged", false otherwise.
sub has_purged_entry($xp, $amd) {
    # we need to search for eventIdentifierValue 272 or 274.
    # the eventDescription should be
    # a) IE has been deleted
    # b) IE has been purged
    #
    # findnodes() delivers one node per matching description. findvalue()
    # would return a single string with all descriptions concatenated, which
    # the anchored pattern below can no longer match once more than one
    # event is present.
    my @descriptions = map { $_->textContent } $xp->findnodes($compiled_xpath_ifpurged_event, $amd);
    return (List::Util::any { m/^IE has been (?:deleted|purged)$/ } @descriptions);
}
sub parse_iexml($filename, $recovery_flag) { sub parse_iexml($filename, $recovery_flag) {
if ($recovery_flag) { if ($recovery_flag) {
$recovery_flag = 2; # avoid warnings, see XML::LibXML::Parser POD about 'recovery' $recovery_flag = 2; # avoid warnings, see XML::LibXML::Parser POD about 'recovery'
} }
my $parser = get_parser($recovery_flag); my $parser = get_parser($recovery_flag);
my $dom = $parser->parse_file($filename); my $dom = $parser->parse_file($filename);
my $xp = get_xpath_context(); $xp->setContextNode($dom); my $xp = get_xpath_context();
$xp->setContextNode($dom);
my $dmdsec = $xp->findnodes($compiled_xpath_dmdSec)->[0];
my $amdsec = $xp->findnodes($compiled_xpath_amdSec)->[0];
my $filesec= $xp->findnodes($compiled_xpath_fileSec)->[0];
############################################ ############################################
# get title # get title
my $title = get_title($xp, $filename); my $title = get_title($xp, $dmdsec);
############################################ ############################################
# get dc-records # get dc-records
my $dcrecords_ref = get_dcrecords_ref($xp, $filename); my $dcrecords_ref = get_dcrecords_ref($xp, $dmdsec);
############################################ ############################################
# get right representation ID (has a dnx-section with <key id=label>LOCAL</key>) # get right representation ID (has a dnx-section with <key id=label>LOCAL</key>)
my $repid = get_repid($xp, $filename); my $repid = get_repid($xp);
if (!defined $repid) { if (!defined $repid) {
say STDERR "No repid found in file $filename, is IE purged?"; say STDERR "No repid found in file $filename, is IE intentionally purged?";
if (has_purged_entry($xp, $amdsec)) {
say STDERR " Yes, a corresponding purge event is found.";
} else {
say STDERR " No, a corresponding purge event was missed. This indicates an error in archive.";
}
my $tmp; my $tmp;
$tmp->{"filename"} = $filename; $tmp->{"filename"} = $filename;
$tmp->{"purged"} = 1; $tmp->{"purged"} = 1;
...@@ -398,7 +422,7 @@ sub check_if_db_conform ($string, $filename) { ...@@ -398,7 +422,7 @@ sub check_if_db_conform ($string, $filename) {
} }
############################################ ############################################
# get all files of LOCAL representation # get all files of LOCAL representation
my $files_ref = get_files_ref ($xp, $filename, $repid); my $files_ref = get_files_ref($xp, $filesec, $repid);
my $ret; my $ret;
$ret->{"filename" } = $filename; $ret->{"filename" } = $filename;
$ret->{"title"} = $title; $ret->{"title"} = $title;
...@@ -406,9 +430,10 @@ sub check_if_db_conform ($string, $filename) { ...@@ -406,9 +430,10 @@ sub check_if_db_conform ($string, $filename) {
$ret->{"files"} = $files_ref; $ret->{"files"} = $files_ref;
$ret->{"dcrecords"} = $dcrecords_ref; $ret->{"dcrecords"} = $dcrecords_ref;
return $ret; return $ret;
}
} }
# returns count of subdirs of $dir
sub searching_relevant_subdirs ($dir) { sub searching_relevant_subdirs ($dir) {
my $first_two_levels_of_dirs = 0; my $first_two_levels_of_dirs = 0;
### ###
...@@ -465,7 +490,7 @@ sub searching_ie_files ($dirs_ref, $tmp_ies_unsorted_file) { ...@@ -465,7 +490,7 @@ sub searching_ie_files ($dirs_ref, $tmp_ies_unsorted_file) {
$pass++; $pass++;
my $first_two_levels_of_dirs = searching_relevant_subdirs($dir); my $first_two_levels_of_dirs = searching_relevant_subdirs($dir);
my $count = searching_relevant_ie_files($dir, $tmp_ies_unsorted_file, $first_two_levels_of_dirs); my $count = searching_relevant_ie_files($dir, $tmp_ies_unsorted_file, $first_two_levels_of_dirs);
say "found $count IEs in pass $pass/$maxpass ($dir) "; say "found $count IEs in pass $pass/$maxpass (dir '$dir') ";
$cnt_unsorted_files+=$count; $cnt_unsorted_files+=$count;
} }
say "\r "; say "\r ";
...@@ -489,8 +514,8 @@ GetOptions( ...@@ -489,8 +514,8 @@ GetOptions(
say <<"HELP"; say <<"HELP";
call $0 with following options call $0 with following options
--help ............... this help --help ............... this help
--recovery ........... set special recovery flag --recovery ........... set special recovery flag, if set deleted IEs will be ignored
--continue ........... tries to add IEs to existing database, ignores IEs which already exists (dangerous!) --continue ........... tries to add IEs to existing database, ignores IEs which already exists in DB (dangerous!)
--sqlitedb-file=FILE . set database to file FILE --sqlitedb-file=FILE . set database to file FILE
--enable-sqldump ..... dumps a given database as SQL to STDOUT --enable-sqldump ..... dumps a given database as SQL to STDOUT
......
This diff is collapsed.
PRAGMA foreign_keys=OFF;
BEGIN TRANSACTION;
CREATE TABLE aip (
id INTEGER,
ie_id TEXT NOT NULL,
version INTEGER NOT NULL,
PRIMARY KEY(id AUTOINCREMENT)
);
INSERT INTO aip VALUES(1,'IE200928',1);
CREATE TABLE metadatafile (
id INTEGER,
aip_id INTEGER NOT NULL REFERENCES aip (id),
location TEXT NOT NULL,
sourcetype TEXT NOT NULL,
PRIMARY KEY(id AUTOINCREMENT)
);
INSERT INTO metadatafile VALUES(1,1,'V1-IE200928.xml','hdd');
CREATE TABLE dc (
id INTEGER,
aip_id INTEGER NOT NULL REFERENCES aip (id),
element TEXT NOT NULL,
value TEXT NOT NULL,
PRIMARY KEY(id AUTOINCREMENT)
);
INSERT INTO dc VALUES(1,1,'dc:identifier','SLUB:LZA:Kitodo:goobi:202001030001');
INSERT INTO dc VALUES(2,1,'dc:coverage','DE-14');
INSERT INTO dc VALUES(3,1,'dc:coverage','Hist.Sax.M.37.t,120');
INSERT INTO dc VALUES(4,1,'dc:relation','Saxonica');
INSERT INTO dc VALUES(5,1,'dc:identifier','oai:de:slub-dresden:db:id-319037843');
INSERT INTO dc VALUES(6,1,'dc:format','[1] Bl.');
INSERT INTO dc VALUES(7,1,'dc:identifier','goobi:202001030001');
INSERT INTO dc VALUES(8,1,'dc:identifier','urn:nbn:de:bsz:14-db-id3190378431');
INSERT INTO dc VALUES(9,1,'dc:identifier',': UTF8-Test: Ae: Ä, Oe: Ö, ue: ü, sz: ß, long-s: ſ, euro: €');
INSERT INTO dc VALUES(10,1,'dc:title','Eingabe der Handelskammer zu Leipzig den Entwurf eines Tabak-Steuer-Gesetzes betr. an den Reichstag zu Berlin');
INSERT INTO dc VALUES(11,1,'dc:language','de');
INSERT INTO dc VALUES(12,1,'dc:date','1893');
INSERT INTO dc VALUES(13,1,'dc:subject','eingdehaz');
CREATE TABLE sourcedatafile (
id INTEGER,
aip_id INTEGER NOT NULL REFERENCES aip (id),
name TEXT NOT NULL,
PRIMARY KEY(id AUTOINCREMENT)
);
INSERT INTO sourcedatafile VALUES(1,1,'V1-FL200931.tif');
INSERT INTO sourcedatafile VALUES(2,1,'V1-FL200930.xml');
INSERT INTO sourcedatafile VALUES(3,1,'V1-FL200932.mkv');
CREATE TABLE sourcedatalocat (
id INTEGER,
file_id INTEGER NOT NULL REFERENCES sourcedatafile (id),
location TEXT NOT NULL,
sourcetype TEXT NOT NULL,
PRIMARY KEY(id AUTOINCREMENT)
);
INSERT INTO sourcedatalocat VALUES(1,1,'/permanent_storage/2020/01/03/IE200928/V1-FL200931.tif','hdd');
INSERT INTO sourcedatalocat VALUES(2,2,'/permanent_storage/2020/01/03/IE200928/V1-FL200930.xml','hdd');
INSERT INTO sourcedatalocat VALUES(3,3,'/permanent_storage/2020/01/03/IE200928/V1-FL200932.mkv','hdd');
DELETE FROM sqlite_sequence;
INSERT INTO sqlite_sequence VALUES('aip',1);
INSERT INTO sqlite_sequence VALUES('metadatafile',1);
INSERT INTO sqlite_sequence VALUES('sourcedatafile',3);
INSERT INTO sqlite_sequence VALUES('sourcedatalocat',3);
INSERT INTO sqlite_sequence VALUES('dc',13);
CREATE UNIQUE INDEX aip_index on aip (ie_id, version);
CREATE UNIQUE INDEX sourcedata_index on sourcedatafile (aip_id, name);
COMMIT;
#!/usr/bin/perl
# End-to-end test for perl/exit_strategy.pl, driven through Test::Cmd.
# Three phases are exercised, each checking stdout, stderr and exit status:
#   1. help    - '-h' prints the usage text
#   2. parsing - the directory perl/t/ is scanned into a fresh SQLite file
#   3. sqldump - that SQLite file is dumped and compared with expected.sql
use strict;
use warnings;
use Test::More;
use Test::Cmd;
use Path::Tiny;

my $test = Test::Cmd->new(
    prog        => 'perl/exit_strategy.pl',
    interpreter => "/usr/bin/perl",
    workdir     => "",
);
# reference dump the sqldump phase is compared against; loaded up front so a
# missing fixture aborts the run before any assertion is reported
my $expected = path(Test::Cmd::here())->child('perl')->child('t')->child('expected.sql')->slurp();

#################### help
$test->run(args => '-h');
like( $test->stdout, qr/call .*exit_strategy.pl with following options/, 'help, standard out' );
is( $test->stderr, "", 'help, standard error' );
is( $? >> 8, 0, 'help, exit status' );

#################### parsing
# the database file lives in the Test::Cmd working directory
my $tmp = $test->workdir("");
my $current_db = "$tmp/sqlite.db";
$test->run(args => "--sqlitedb-file=$current_db perl/t/");
# labelled 'parsing' so a failure here is distinguishable from the dump phase
like( $test->stdout, qr{preparing SQL.*processed 1 uniq IEs}s, 'parsing, standard out' );
is( $test->stderr, "", 'parsing, standard error' );
is( $? >> 8, 0, 'parsing, exit status' );

##################### after parsing dump SQL
$test->run(args => "--sqlitedb-file=$current_db --enable-sqldump");
is( $test->stdout, $expected, 'sqldump, standard out' );
is( $test->stderr, "", 'sqldump, standard error' );
is( $? >> 8, 0, 'sqldump, exit status' );

done_testing();
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment