From d50fddbf508c0ce983dcfdba20dfdb32a2761992 Mon Sep 17 00:00:00 2001 From: Andreas Romeyke <art1@andreas-romeyke.de> Date: Sat, 23 Oct 2021 14:06:02 +0200 Subject: [PATCH] - refactoring, extracted package SLUB::LZA::SIPBuilderBagIt into lib/ --- bin/slubsipbuilderbagit.pl | 488 +------------------------------- lib/SLUB/LZA/SIPBuilderBagIt.pm | 484 +++++++++++++++++++++++++++++++ 2 files changed, 486 insertions(+), 486 deletions(-) create mode 100644 lib/SLUB/LZA/SIPBuilderBagIt.pm diff --git a/bin/slubsipbuilderbagit.pl b/bin/slubsipbuilderbagit.pl index 98cf51e..4be3aa1 100644 --- a/bin/slubsipbuilderbagit.pl +++ b/bin/slubsipbuilderbagit.pl @@ -44,491 +44,6 @@ use strict; use warnings; use utf8; -package SLUB::LZA::SIPBuilderBagIt; - use utf8; - use feature 'say'; - use Archive::BagIt; - use Carp; - use Encode; - use File::Copy; - use File::Copy::Recursive qw(dircopy dirmove); - use File::Basename; - use File::Find; - use List::Util qw(none); - use LWP::UserAgent; # to get MARC data - use MARC::Record; - use Path::Tiny; - use Term::ANSIColor; # colored print - use XML::LibXML; - use XML::XPath; - - our $VERBOSE = 0; # print output switch 0 => on, 1 and larger = off - our $with_debug = 0; # output debug infos and files - - # catalogue infos - my $swb_url = 'https://sru.bsz-bw.de/swb'; - my $searchkey_swb = 'pica.swn'; - my $searchkey_k10p = 'pica.ppn'; - my $recordschema_swb = 'marcxmlvbos'; - my $recordschema_k10p = 'marcxmlk10os'; - - # MARC21 -> bag-info.txt mapping - # reference: https://www.loc.gov/marc/bibliographic/bd20x24x.html - my $marc21_to_baginfo_mappings = [ - { tag => "245", code => "a", key => "Title" }, - { tag => "245", code => "b", key => "Title" }, - { tag => "245", code => "c", key => "Title" }, - { tag => "246", code => "a", key => "Title" }, - { tag => "246", code => "b", key => "Title" }, - ]; - - sub add_metadata($$$$) { - if(ref($_[0]) ne 'ARRAY') { croak "array of hashes not defined for bag-info.txt!"; } - - my $refAddBagInfo = $_[0]; # ref to array of hashes - my $ppn = $_[1]; - my $noppn = $_[2]; - my $marc21 = $_[3]; - - # adding entries to bag-info key list - push @{$refAddBagInfo}, {'External-Identifier' => "PPN:$ppn"} unless (! $ppn); - push @{$refAddBagInfo}, {'External-Identifier' => "$noppn"} unless (! $noppn); - if ($marc21) { - foreach my $mapping (@{$marc21_to_baginfo_mappings}) { - my $text = get_marc21_text_node($marc21, $mapping); - push @{$refAddBagInfo}, {$mapping->{key} => $text} unless (! $text); - } - } - return 1; - } - - sub get_marc21_text_node($$) { - if(! $_[0]) { croak "empty marc21!"; } - if(ref($_[1]) ne 'HASH') { croak "not a mapping hash!"; } - - my $marc21 = $_[0]; - my $mapping = $_[1]; - - # retrieve value based on mapping using a xpath - my $xpath = "//*[\@tag='" . $mapping->{tag} . "']/"; - $xpath .= "*[local-name()='subfield' and \@code='" . $mapping->{code} . "']/" unless (! exists $mapping->{code}); - $xpath .= "text()"; - # HINT: xpath example //*[@tag='245']/*[local-name()='subfield' and @code='a']/text() - my $text = XML::XPath->new($marc21)->findnodes_as_string($xpath); - - return $text; - } - - sub get_marc21_from_catalogue($){ - if(! $_[0]){ croak "ppn not defined!"; } - - my $ppn = $_[0]; - my $marc21; - my $use_fallback; - - eval { # try SWB catalogue - $marc21 = SLUB::LZA::SIPBuilderBagIt::get_marc21_from($swb_url, $ppn, $searchkey_swb, $recordschema_swb); - }; - if ($@) { # error - say "$@" unless (! $with_debug); - if ($@ =~ m#ERROR: Did not get any <recordData/> for PPN#) { - print_scalar_data("", "WARNING: '$ppn' not a SWB PPN, fallback to K10plus", "", "yellow"); - $use_fallback = 1; - } - else { - die "$@"; # throw exception again - } - } - - if ($use_fallback) { - eval { # try K10plus catalogue - $marc21 = SLUB::LZA::SIPBuilderBagIt::get_marc21_from($swb_url, $ppn, $searchkey_k10p, $recordschema_k10p); - }; - if ($@) { # error - say "$@" unless (!$with_debug); - if ($@ =~ m#ERROR: Did not get any <recordData/> for PPN#) { - print_scalar_data("", "ERROR: '$ppn' also not a K10plus PPN", "", "red"); - exit 1; - } - else { - die "$@"; # throw exception again - } - } - } - - print_scalar_data("", "INFO: Found metadata for PPN '$ppn'", "", "white"); - return $marc21; - } - - sub get_marc21_from($$$$){ - # specification SRU/SRW BSZ: https://wiki.k10plus.de/pages/viewpage.action?pageId=132874251 - if(! defined $_[0]) { croak "url not defined!"; } - if(! defined $_[1]) { croak "ppn not defined!"; } - if(! defined $_[2]) { croak "key not defined!"; } - if(! defined $_[3]) { croak "schema not defined!"; } - if($_[0] eq "" || $_[1] eq "" || $_[2] eq "" || $_[3] eq ""){ - die "invalid parameters."; - } - my $url = shift; - my $ppn = shift; # example: "457035137" for "Der Fichtelberg" - my $key = shift; - my $schema = shift; - - my $ua = LWP::UserAgent->new; - $ua->agent("MyApp/0.1 "); - $ua->timeout(3600); #1h - - my $srubase = $url; # host - my $srusearchkey = $key; # SRU search key - my $sruvalue = $ppn; - my $srumaxrecords = 1; - my $srustartrecord = 1; - my $sruschema = $schema; - my $sru = "${srubase}?version=1.1&query=${srusearchkey}%3D${sruvalue}&operation=searchRetrieve&maximumRecords=${srumaxrecords}&startRecord=${srustartrecord}&recordSchema=${sruschema}"; - - if ($with_debug) {say "catalog-URL='$sru'";} - my $response = $ua->get($sru); # ask SWB for given PPN - if ($response->is_success) { - # parse ZiNG response, extract MARC-data - my $xp = XML::XPath->new($response->decoded_content); - my $parser = XML::LibXML->new(); - if ($with_debug) { - say "write DEBUG_${ppn}_catalog_response.xml"; - SLUB::LZA::SIPBuilderBagIt::write_file("DEBUG_${ppn}_catalog_response.xml", $response->decoded_content); - } - my $recordData = $xp->findnodes_as_string('/*[local-name()="searchRetrieveResponse"]/*[local-name()="records"]/*[local-name()="record"]/*[local-name()="recordData"]/*'); - if (!$recordData) { croak("ERROR: Did not get any <recordData/> for PPN '$ppn' using '$sru'");} - my $marc_doc = $parser->parse_string($recordData); - add_marc21_schema_location($marc_doc); - if ($with_debug) { - say "write DEBUG_${ppn}_extracted_marc_record.xml"; - SLUB::LZA::SIPBuilderBagIt::write_file("DEBUG_${ppn}_extracted_marc_record.xml", $marc_doc); - } - return $marc_doc->serialize(); - } - else { - carp("Problem asking catalogue at $url using $ppn"); - } - - return; - } - - sub add_marc21_schema_location($) { - if(ref($_[0]) ne 'XML::LibXML::Document') { croak "marc xml document required!"; } - my $marc_doc = $_[0]; - my $root = $marc_doc->findnodes('/*[local-name()="record"]')->[0]; # get root element (XML::LibXML::Element) - $root->setAttribute('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'); # add xsi namespace (simple) - $root->setAttribute('xsi:schemaLocation', 'http://www.loc.gov/MARC21/slim https://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd'); # add MARCXML schema location - # HINT: method sets or replaces the node's attribute - 1; - } - - sub write_file($$){ - if(! defined $_[0]) { croak "filename not defined!"; } - if(! defined $_[1]) { croak "value not defined!"; } - if($_[0] eq "" || $_[1] eq ""){ die "invalid parameters."; } - - my $filename = $_[0]; - my $value = $_[1]; - - # write data to file (UTF-8) - open(my $fh, '>:encoding(UTF-8)', $filename) || (croak "Can't open '$filename', $!"); - print $fh $value; - close($fh) || (croak "could not close file '$filename', $!"); - return 1; - } - - - sub generateBagName($$$){ - my $file_date = $_[0]; - my $ppn = $_[1]; - my $noppn = $_[2]; - - $file_date =~ s/T/_/g; # replace 'T' with '_' - $file_date =~ s/:/-/g; # replace ':' with '-' - return (defined $ppn)? "PPN-${ppn}_${file_date}" : "ID-${noppn}_${file_date}"; - } - - sub createDir($){ - my $path = $_[0]; - - if(! -d $path && ! -f $path){ - eval { path($path)->mkpath; }; - if ($@) { # error - print_scalar_data("", "ERROR: Failed to create directory '$path'.", "", "red"); - die $@; - } - } - return 1; - } - - sub copyFilesToMeta($$$$$){ - my $ppn = $_[0]; - my $metaPath = $_[1]; - my $rightsFilePath = $_[2]; - my $aRefAddMetaFile = $_[3]; - my $marc21 = $_[4]; - - my @addMetaFile = @{ $aRefAddMetaFile }; - - if(@addMetaFile){ - my $i = 1; - foreach my $file(@addMetaFile){ - # check reserved file names - my $meta_file_name = basename($file); - if($meta_file_name eq "rights.xml" || $meta_file_name eq "marc21.xml"){ - foreach my $f(@addMetaFile){ - # check if numbered xml file exists - my $f_name = basename($f); - while($f_name eq "$i.xml"){ - $i++; - } - } - print_scalar_data("", "WARNING: Renaming " . $meta_file_name . " to $i.xml, because meta filename <rights.xml> or <marc21.xml> is reserved.", "", "yellow"); - print_scalar_data("", "INFO: Read Docu for more information at https://slubarchiv.slub-dresden.de/technische-standards-fuer-die-ablieferung-von-digitalen-dokumenten/", "", "white"); - copy($file, $metaPath) or die "Copy failed: $!"; - rename("$metaPath/$meta_file_name", "$metaPath/$i.xml"); - $i++; - }else{ - copy($file, $metaPath) or die "Copy failed: $!"; - } - } - } - - # marc21.xml - if ($ppn && $marc21) { - write_file("$metaPath/marc21.xml", $marc21); - } - - # rights.xml - if(basename($rightsFilePath) ne "rights.xml"){ - my $rights_name = basename($rightsFilePath); - print_scalar_data("", "WARNING: Renaming " . $rights_name . " to rights.xml, because meta filename <rights.xml> is required.", "", "yellow"); - print_scalar_data("", "INFO: Read Docu for more information at https://slubarchiv.slub-dresden.de/technische-standards-fuer-die-ablieferung-von-digitalen-dokumenten/", "", "white"); - copy($rightsFilePath, $metaPath) or die "Copy failed: $!"; - rename("$metaPath/$rights_name", "$metaPath/rights.xml"); - }else{ - copy($rightsFilePath, $metaPath) or die "Copy failed: $!"; - } - - return 1; - } - - sub validateRightsXML($$){ - if(! $_[0]) { croak "rightsFilePath not defined!"; } - if(! defined $_[1]) { croak "xsd name not defined!"; } - - my $rightsFilePath = $_[0]; - my $xsdName = $_[1]; - - my $rightsName = basename($rightsFilePath); - my $xsd_file = path(__FILE__)->parent->realpath->parent->child("xsd")->child($xsdName); # absolute path - - my $schema = XML::LibXML::Schema->new(location => $xsd_file); - my $parser = XML::LibXML->new; - my $doc = $parser->parse_file($rightsFilePath); - - eval { - $schema->validate($doc); - }; - if ($@) { # error - die "File $rightsName failed validation: $@"; - }; - return 1; - } - - sub print_scalar_data($$$$){ - my $priority = 0; - my $header = $_[0]; - my $p = $_[1]; - my $arrRef = $_[2]; - my $pColor = $_[3]; - - # header - if($header ne ""){ - print "----------------------------------------------------------------------------------\n" if $priority >= $VERBOSE; - printf "%" . length($header) . "s\n", colored($header, 'bold green') if $priority >= $VERBOSE; - print "----------------------------------------------------------------------------------\n" if $priority >= $VERBOSE; - } - # paragraph - if(($p ne "") && ($arrRef eq "")){ - printf "%" . length($p) . "s\n", colored($p, "bold $pColor") if $priority >= $VERBOSE; - } - - # paragraph with converted array - if(($p ne "") && ($arrRef ne "")){ - my $string = ""; - for my $i (0 .. $#$arrRef){ - if($i eq $#$arrRef){ - $string .= " $$arrRef[$i]"; - } else { - $string .= " $$arrRef[$i],"; - } - } - my $body = $p . $string; - printf "%" . length($body) . "s\n", colored($body, "bold $pColor") if $priority >= $VERBOSE; - } - - return 1; - } - - sub create_slub_bagit($$){ - if(! defined $_[0]) { croak "bagPath is not defined!"; } - if(! defined $_[1]) { croak "array of hashes not defined for bag-info.txt!"; } - if(! -d $_[0]) { croak "path is not directory!" } - - my $bagPath = $_[0]; - my $refAddBagInfo = $_[1]; # ref to array of hashes - - # construct bag - my $oArchiveBagIt = Archive::BagIt->make_bag($bagPath); - $oArchiveBagIt->bag_info($refAddBagInfo); - $oArchiveBagIt->store(); - print_scalar_data("", "SUCCESS: SLUB SIP at $bagPath build successfully!", "", "green"); - - return 1; - } - - sub buildBagWithCopyOption($$$$$$$$$){ - my $ppn = $_[0]; - my $noppn = $_[1]; - my $ieDirectory = $_[2]; - my $rightsFilePath = $_[3]; - my $aRefAddMetaFile = $_[4]; - my $refAddBagInfo = $_[5]; # ref to array of hashes - my $bagPath = $_[6]; - my $dataPath = $_[7]; - my $metaPath = $_[8]; - - my @addMetaFile = @{ $aRefAddMetaFile }; - my @addBagInfo = @{ $refAddBagInfo }; - my $marc21; - - # get descriptive metadata from catalog - if ($ppn) { - $marc21 = get_marc21_from_catalogue($ppn); - } - # create bag dir - SLUB::LZA::SIPBuilderBagIt::createDir($bagPath); - # create data dir - SLUB::LZA::SIPBuilderBagIt::createDir($dataPath); - # copy payload files to data - print_scalar_data("", "INFO: copying original data...", "", "white"); - dircopy($ieDirectory, $dataPath) || die ("Error coping $ieDirectory to $dataPath"); - # create meta dir - SLUB::LZA::SIPBuilderBagIt::createDir($metaPath); - # copy not payload files to meta - SLUB::LZA::SIPBuilderBagIt::copyFilesToMeta($ppn, $metaPath, $rightsFilePath, \@addMetaFile, $marc21); - # add metadata for bag-info.txt - add_metadata(\@addBagInfo, $ppn, $noppn, $marc21); - # metadata warnings - SLUB::LZA::SIPBuilderBagIt::checkForTitle(\@addBagInfo); - # create slub bagit - print_scalar_data("", "INFO: building SIP...", "", "white"); - SLUB::LZA::SIPBuilderBagIt::create_slub_bagit($bagPath, \@addBagInfo); - - return 1; - } - - sub buildBagWithMoveOption($$$$$$$$$){ - my $ppn = $_[0]; - my $noppn = $_[1]; - my $ieDirectory = $_[2]; - my $rightsFilePath = $_[3]; - my $aRefAddMetaFile = $_[4]; - my $refAddBagInfo = $_[5]; # ref to array of hashes - my $bagPath = $_[6]; - my $dataPath = $_[7]; - my $metaPath = $_[8]; - - my @addMetaFile = @{ $aRefAddMetaFile }; - my @addBagInfo = @{ $refAddBagInfo }; - my $marc21; - - # get descriptive metadata from catalog - if ($ppn) { - $marc21 = get_marc21_from_catalogue($ppn); - } - # create bag dir - SLUB::LZA::SIPBuilderBagIt::createDir($bagPath); - # move payloads to data dir (which is automatically renamed from IE Directory) - print_scalar_data("", "INFO: moving original data...", "", "white"); - dirmove($ieDirectory, $dataPath) || die ("Error moving $ieDirectory to $dataPath"); - # create meta dir - SLUB::LZA::SIPBuilderBagIt::createDir($metaPath); - # copy not payload files to meta - SLUB::LZA::SIPBuilderBagIt::copyFilesToMeta($ppn, $metaPath, $rightsFilePath, \@addMetaFile, $marc21); - # add metadata for bag-info.txt - add_metadata(\@addBagInfo, $ppn, $noppn, $marc21); - # metadata warnings - SLUB::LZA::SIPBuilderBagIt::checkForTitle(\@addBagInfo); - # create slub bagit - print_scalar_data("", "INFO: building SIP...", "", "white"); - SLUB::LZA::SIPBuilderBagIt::create_slub_bagit($bagPath, \@addBagInfo); - - return 1; - } - - sub buildBagWithReplaceOption($$$$$$$$$){ - my $ppn = $_[0]; - my $noppn = $_[1]; - my $ieDirectory = $_[2]; - my $rightsFilePath = $_[3]; - my $aRefAddMetaFile = $_[4]; - my $refAddBagInfo = $_[5]; # ref to array of hashes - my $bagPath = $_[6]; - my $dataPath = $_[7]; - my $metaPath = $_[8]; - - my @addMetaFile = @{ $aRefAddMetaFile }; - my @addBagInfo = @{ $refAddBagInfo }; - my $marc21; - - # get descriptive metadata from catalog - if ($ppn) { - $marc21 = get_marc21_from_catalogue($ppn); - } - # create bag dir - SLUB::LZA::SIPBuilderBagIt::createDir($bagPath); - # move payloads to data dir (which is automatically renamed from IE Directory) - print_scalar_data("", "INFO: moving original data...", "", "white"); - dirmove($ieDirectory, $dataPath) || die ("Error moving $ieDirectory to $dataPath"); - # create meta dir - SLUB::LZA::SIPBuilderBagIt::createDir($metaPath); - # copy not payload files to meta - SLUB::LZA::SIPBuilderBagIt::copyFilesToMeta($ppn, $metaPath, $rightsFilePath, \@addMetaFile, $marc21); - # add metadata for bag-info.txt - add_metadata(\@addBagInfo, $ppn, $noppn, $marc21); - # metadata warnings - SLUB::LZA::SIPBuilderBagIt::checkForTitle(\@addBagInfo); - # create slub bagit - print_scalar_data("", "INFO: building SIP...", "", "white"); - SLUB::LZA::SIPBuilderBagIt::create_slub_bagit($bagPath, \@addBagInfo); - - return 1; - } - - sub checkForTitle($){ - if(ref($_[0]) ne 'ARRAY') { croak "no array of hashes with bag-info.txt key values given!"; } - - my $key_hash_list_aref = $_[0]; - - # collect all bag-info keys for easy check - my @keys; - for my $hash (@{$key_hash_list_aref}) { - push @keys, keys %{$hash}; - } - # check for title - if (none { $_ =~ m/^[Tt][Ii][Tt][Ll][Ee]/ } @keys) { - print_scalar_data("", "WARNING: No Title given. Please consider adding a human-readable title with '--add_key_value=Title:YOUR-TITLE'", "", "yellow"); - return 0; - } - return 1; - } - -# end package - -package main; #=============================================================================== BEGIN{ @@ -794,9 +309,10 @@ sub main{ #=============================================================================== main(); - +1; #=============================================================================== + __END__ =pod diff --git a/lib/SLUB/LZA/SIPBuilderBagIt.pm b/lib/SLUB/LZA/SIPBuilderBagIt.pm new file mode 100644 index 0000000..6b390b7 --- /dev/null +++ b/lib/SLUB/LZA/SIPBuilderBagIt.pm @@ -0,0 +1,484 @@ +package SLUB::LZA::SIPBuilderBagIt; +use strict; +use warnings; +use utf8; +use feature 'say'; +use Archive::BagIt; +use Carp; +use Encode; +use File::Copy; +use File::Copy::Recursive qw(dircopy dirmove); +use File::Basename; +use File::Find; +use List::Util qw(none); +use LWP::UserAgent; # to get MARC data +use MARC::Record; +use Path::Tiny; +use Term::ANSIColor; # colored print +use XML::LibXML; +use XML::XPath; + +our $VERBOSE = 0; # print output switch 0 => on, 1 and larger = off +our $with_debug = 0; # output debug infos and files + +# catalogue infos +my $swb_url = 'https://sru.bsz-bw.de/swb'; +my $searchkey_swb = 'pica.swn'; +my $searchkey_k10p = 'pica.ppn'; +my $recordschema_swb = 'marcxmlvbos'; +my $recordschema_k10p = 'marcxmlk10os'; + +# MARC21 -> bag-info.txt mapping +# reference: https://www.loc.gov/marc/bibliographic/bd20x24x.html +my $marc21_to_baginfo_mappings = [ + { tag => "245", code => "a", key => "Title" }, + { tag => "245", code => "b", key => "Title" }, + { tag => "245", code => "c", key => "Title" }, + { tag => "246", code => "a", key => "Title" }, + { tag => "246", code => "b", key => "Title" }, +]; + +sub add_metadata($$$$) { + if(ref($_[0]) ne 'ARRAY') { croak "array of hashes not defined for bag-info.txt!"; } + + my $refAddBagInfo = $_[0]; # ref to array of hashes + my $ppn = $_[1]; + my $noppn = $_[2]; + my $marc21 = $_[3]; + + # adding entries to bag-info key list + push @{$refAddBagInfo}, {'External-Identifier' => "PPN:$ppn"} unless (! $ppn); + push @{$refAddBagInfo}, {'External-Identifier' => "$noppn"} unless (! $noppn); + if ($marc21) { + foreach my $mapping (@{$marc21_to_baginfo_mappings}) { + my $text = get_marc21_text_node($marc21, $mapping); + push @{$refAddBagInfo}, {$mapping->{key} => $text} unless (! $text); + } + } + return 1; +} + +sub get_marc21_text_node($$) { + if(! $_[0]) { croak "empty marc21!"; } + if(ref($_[1]) ne 'HASH') { croak "not a mapping hash!"; } + + my $marc21 = $_[0]; + my $mapping = $_[1]; + + # retrieve value based on mapping using a xpath + my $xpath = "//*[\@tag='" . $mapping->{tag} . "']/"; + $xpath .= "*[local-name()='subfield' and \@code='" . $mapping->{code} . "']/" unless (! exists $mapping->{code}); + $xpath .= "text()"; + # HINT: xpath example //*[@tag='245']/*[local-name()='subfield' and @code='a']/text() + my $text = XML::XPath->new($marc21)->findnodes_as_string($xpath); + + return $text; +} + +sub get_marc21_from_catalogue($){ + if(! $_[0]){ croak "ppn not defined!"; } + + my $ppn = $_[0]; + my $marc21; + my $use_fallback; + + eval { # try SWB catalogue + $marc21 = SLUB::LZA::SIPBuilderBagIt::get_marc21_from($swb_url, $ppn, $searchkey_swb, $recordschema_swb); + }; + if ($@) { # error + say "$@" unless (! $with_debug); + if ($@ =~ m#ERROR: Did not get any <recordData/> for PPN#) { + print_scalar_data("", "WARNING: '$ppn' not a SWB PPN, fallback to K10plus", "", "yellow"); + $use_fallback = 1; + } + else { + die "$@"; # throw exception again + } + } + + if ($use_fallback) { + eval { # try K10plus catalogue + $marc21 = SLUB::LZA::SIPBuilderBagIt::get_marc21_from($swb_url, $ppn, $searchkey_k10p, $recordschema_k10p); + }; + if ($@) { # error + say "$@" unless (!$with_debug); + if ($@ =~ m#ERROR: Did not get any <recordData/> for PPN#) { + print_scalar_data("", "ERROR: '$ppn' also not a K10plus PPN", "", "red"); + exit 1; + } + else { + die "$@"; # throw exception again + } + } + } + + print_scalar_data("", "INFO: Found metadata for PPN '$ppn'", "", "white"); + return $marc21; +} + +sub get_marc21_from($$$$){ + # specification SRU/SRW BSZ: https://wiki.k10plus.de/pages/viewpage.action?pageId=132874251 + if(! defined $_[0]) { croak "url not defined!"; } + if(! defined $_[1]) { croak "ppn not defined!"; } + if(! defined $_[2]) { croak "key not defined!"; } + if(! defined $_[3]) { croak "schema not defined!"; } + if($_[0] eq "" || $_[1] eq "" || $_[2] eq "" || $_[3] eq ""){ + die "invalid parameters."; + } + my $url = shift; + my $ppn = shift; # example: "457035137" for "Der Fichtelberg" + my $key = shift; + my $schema = shift; + + my $ua = LWP::UserAgent->new; + $ua->agent("MyApp/0.1 "); + $ua->timeout(3600); #1h + + my $srubase = $url; # host + my $srusearchkey = $key; # SRU search key + my $sruvalue = $ppn; + my $srumaxrecords = 1; + my $srustartrecord = 1; + my $sruschema = $schema; + my $sru = "${srubase}?version=1.1&query=${srusearchkey}%3D${sruvalue}&operation=searchRetrieve&maximumRecords=${srumaxrecords}&startRecord=${srustartrecord}&recordSchema=${sruschema}"; + + if ($with_debug) {say "catalog-URL='$sru'";} + my $response = $ua->get($sru); # ask SWB for given PPN + if ($response->is_success) { + # parse ZiNG response, extract MARC-data + my $xp = XML::XPath->new($response->decoded_content); + my $parser = XML::LibXML->new(); + if ($with_debug) { + say "write DEBUG_${ppn}_catalog_response.xml"; + SLUB::LZA::SIPBuilderBagIt::write_file("DEBUG_${ppn}_catalog_response.xml", $response->decoded_content); + } + my $recordData = $xp->findnodes_as_string('/*[local-name()="searchRetrieveResponse"]/*[local-name()="records"]/*[local-name()="record"]/*[local-name()="recordData"]/*'); + if (!$recordData) { croak("ERROR: Did not get any <recordData/> for PPN '$ppn' using '$sru'");} + my $marc_doc = $parser->parse_string($recordData); + add_marc21_schema_location($marc_doc); + if ($with_debug) { + say "write DEBUG_${ppn}_extracted_marc_record.xml"; + SLUB::LZA::SIPBuilderBagIt::write_file("DEBUG_${ppn}_extracted_marc_record.xml", $marc_doc); + } + return $marc_doc->serialize(); + } + else { + carp("Problem asking catalogue at $url using $ppn"); + } + + return; +} + +sub add_marc21_schema_location($) { + if(ref($_[0]) ne 'XML::LibXML::Document') { croak "marc xml document required!"; } + my $marc_doc = $_[0]; + my $root = $marc_doc->findnodes('/*[local-name()="record"]')->[0]; # get root element (XML::LibXML::Element) + $root->setAttribute('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'); # add xsi namespace (simple) + $root->setAttribute('xsi:schemaLocation', 'http://www.loc.gov/MARC21/slim https://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd'); # add MARCXML schema location + # HINT: method sets or replaces the node's attribute + 1; +} + +sub write_file($$){ + if(! defined $_[0]) { croak "filename not defined!"; } + if(! defined $_[1]) { croak "value not defined!"; } + if($_[0] eq "" || $_[1] eq ""){ die "invalid parameters."; } + + my $filename = $_[0]; + my $value = $_[1]; + + # write data to file (UTF-8) + open(my $fh, '>:encoding(UTF-8)', $filename) || (croak "Can't open '$filename', $!"); + print $fh $value; + close($fh) || (croak "could not close file '$filename', $!"); + return 1; +} + + +sub generateBagName($$$){ + my $file_date = $_[0]; + my $ppn = $_[1]; + my $noppn = $_[2]; + + $file_date =~ s/T/_/g; # replace 'T' with '_' + $file_date =~ s/:/-/g; # replace ':' with '-' + return (defined $ppn)? "PPN-${ppn}_${file_date}" : "ID-${noppn}_${file_date}"; +} + +sub createDir($){ + my $path = $_[0]; + + if(! -d $path && ! -f $path){ + eval { path($path)->mkpath; }; + if ($@) { # error + print_scalar_data("", "ERROR: Failed to create directory '$path'.", "", "red"); + die $@; + } + } + return 1; +} + +sub copyFilesToMeta($$$$$){ + my $ppn = $_[0]; + my $metaPath = $_[1]; + my $rightsFilePath = $_[2]; + my $aRefAddMetaFile = $_[3]; + my $marc21 = $_[4]; + + my @addMetaFile = @{ $aRefAddMetaFile }; + + if(@addMetaFile){ + my $i = 1; + foreach my $file(@addMetaFile){ + # check reserved file names + my $meta_file_name = basename($file); + if($meta_file_name eq "rights.xml" || $meta_file_name eq "marc21.xml"){ + foreach my $f(@addMetaFile){ + # check if numbered xml file exists + my $f_name = basename($f); + while($f_name eq "$i.xml"){ + $i++; + } + } + print_scalar_data("", "WARNING: Renaming " . $meta_file_name . " to $i.xml, because meta filename <rights.xml> or <marc21.xml> is reserved.", "", "yellow"); + print_scalar_data("", "INFO: Read Docu for more information at https://slubarchiv.slub-dresden.de/technische-standards-fuer-die-ablieferung-von-digitalen-dokumenten/", "", "white"); + copy($file, $metaPath) or die "Copy failed: $!"; + rename("$metaPath/$meta_file_name", "$metaPath/$i.xml"); + $i++; + }else{ + copy($file, $metaPath) or die "Copy failed: $!"; + } + } + } + + # marc21.xml + if ($ppn && $marc21) { + write_file("$metaPath/marc21.xml", $marc21); + } + + # rights.xml + if(basename($rightsFilePath) ne "rights.xml"){ + my $rights_name = basename($rightsFilePath); + print_scalar_data("", "WARNING: Renaming " . $rights_name . " to rights.xml, because meta filename <rights.xml> is required.", "", "yellow"); + print_scalar_data("", "INFO: Read Docu for more information at https://slubarchiv.slub-dresden.de/technische-standards-fuer-die-ablieferung-von-digitalen-dokumenten/", "", "white"); + copy($rightsFilePath, $metaPath) or die "Copy failed: $!"; + rename("$metaPath/$rights_name", "$metaPath/rights.xml"); + }else{ + copy($rightsFilePath, $metaPath) or die "Copy failed: $!"; + } + + return 1; +} + +sub validateRightsXML($$){ + if(! $_[0]) { croak "rightsFilePath not defined!"; } + if(! defined $_[1]) { croak "xsd name not defined!"; } + + my $rightsFilePath = $_[0]; + my $xsdName = $_[1]; + + my $rightsName = basename($rightsFilePath); + my $xsd_file = path(__FILE__)->parent->realpath->parent->child("xsd")->child($xsdName); # absolute path + + my $schema = XML::LibXML::Schema->new(location => $xsd_file); + my $parser = XML::LibXML->new; + my $doc = $parser->parse_file($rightsFilePath); + + eval { + $schema->validate($doc); + }; + if ($@) { # error + die "File $rightsName failed validation: $@"; + }; + return 1; +} + +sub print_scalar_data($$$$){ + my $priority = 0; + my $header = $_[0]; + my $p = $_[1]; + my $arrRef = $_[2]; + my $pColor = $_[3]; + + # header + if($header ne ""){ + print "----------------------------------------------------------------------------------\n" if $priority >= $VERBOSE; + printf "%" . length($header) . "s\n", colored($header, 'bold green') if $priority >= $VERBOSE; + print "----------------------------------------------------------------------------------\n" if $priority >= $VERBOSE; + } + # paragraph + if(($p ne "") && ($arrRef eq "")){ + printf "%" . length($p) . "s\n", colored($p, "bold $pColor") if $priority >= $VERBOSE; + } + + # paragraph with converted array + if(($p ne "") && ($arrRef ne "")){ + my $string = ""; + for my $i (0 .. $#$arrRef){ + if($i eq $#$arrRef){ + $string .= " $$arrRef[$i]"; + } else { + $string .= " $$arrRef[$i],"; + } + } + my $body = $p . $string; + printf "%" . length($body) . "s\n", colored($body, "bold $pColor") if $priority >= $VERBOSE; + } + + return 1; +} + +sub create_slub_bagit($$){ + if(! defined $_[0]) { croak "bagPath is not defined!"; } + if(! defined $_[1]) { croak "array of hashes not defined for bag-info.txt!"; } + if(! -d $_[0]) { croak "path is not directory!" } + + my $bagPath = $_[0]; + my $refAddBagInfo = $_[1]; # ref to array of hashes + + # construct bag + my $oArchiveBagIt = Archive::BagIt->make_bag($bagPath); + $oArchiveBagIt->bag_info($refAddBagInfo); + $oArchiveBagIt->store(); + print_scalar_data("", "SUCCESS: SLUB SIP at $bagPath build successfully!", "", "green"); + + return 1; +} + +sub buildBagWithCopyOption($$$$$$$$$){ + my $ppn = $_[0]; + my $noppn = $_[1]; + my $ieDirectory = $_[2]; + my $rightsFilePath = $_[3]; + my $aRefAddMetaFile = $_[4]; + my $refAddBagInfo = $_[5]; # ref to array of hashes + my $bagPath = $_[6]; + my $dataPath = $_[7]; + my $metaPath = $_[8]; + + my @addMetaFile = @{ $aRefAddMetaFile }; + my @addBagInfo = @{ $refAddBagInfo }; + my $marc21; + + # get descriptive metadata from catalog + if ($ppn) { + $marc21 = get_marc21_from_catalogue($ppn); + } + # create bag dir + SLUB::LZA::SIPBuilderBagIt::createDir($bagPath); + # create data dir + SLUB::LZA::SIPBuilderBagIt::createDir($dataPath); + # copy payload files to data + print_scalar_data("", "INFO: copying original data...", "", "white"); + dircopy($ieDirectory, $dataPath) || die ("Error coping $ieDirectory to $dataPath"); + # create meta dir + SLUB::LZA::SIPBuilderBagIt::createDir($metaPath); + # copy not payload files to meta + SLUB::LZA::SIPBuilderBagIt::copyFilesToMeta($ppn, $metaPath, $rightsFilePath, \@addMetaFile, $marc21); + # add metadata for bag-info.txt + add_metadata(\@addBagInfo, $ppn, $noppn, $marc21); + # metadata warnings + SLUB::LZA::SIPBuilderBagIt::checkForTitle(\@addBagInfo); + # create slub bagit + print_scalar_data("", "INFO: building SIP...", "", "white"); + SLUB::LZA::SIPBuilderBagIt::create_slub_bagit($bagPath, \@addBagInfo); + + return 1; +} + +sub buildBagWithMoveOption($$$$$$$$$){ + my $ppn = $_[0]; + my $noppn = $_[1]; + my $ieDirectory = $_[2]; + my $rightsFilePath = $_[3]; + my $aRefAddMetaFile = $_[4]; + my $refAddBagInfo = $_[5]; # ref to array of hashes + my $bagPath = $_[6]; + my $dataPath = $_[7]; + my $metaPath = $_[8]; + + my @addMetaFile = @{ $aRefAddMetaFile }; + my @addBagInfo = @{ $refAddBagInfo }; + my $marc21; + + # get descriptive metadata from catalog + if ($ppn) { + $marc21 = get_marc21_from_catalogue($ppn); + } + # create bag dir + SLUB::LZA::SIPBuilderBagIt::createDir($bagPath); + # move payloads to data dir (which is automatically renamed from IE Directory) + print_scalar_data("", "INFO: moving original data...", "", "white"); + dirmove($ieDirectory, $dataPath) || die ("Error moving $ieDirectory to $dataPath"); + # create meta dir + SLUB::LZA::SIPBuilderBagIt::createDir($metaPath); + # copy not payload files to meta + SLUB::LZA::SIPBuilderBagIt::copyFilesToMeta($ppn, $metaPath, $rightsFilePath, \@addMetaFile, $marc21); + # add metadata for bag-info.txt + add_metadata(\@addBagInfo, $ppn, $noppn, $marc21); + # metadata warnings + SLUB::LZA::SIPBuilderBagIt::checkForTitle(\@addBagInfo); + # create slub bagit + print_scalar_data("", "INFO: building SIP...", "", "white"); + SLUB::LZA::SIPBuilderBagIt::create_slub_bagit($bagPath, \@addBagInfo); + + return 1; +} + +sub buildBagWithReplaceOption($$$$$$$$$){ + my $ppn = $_[0]; + my $noppn = $_[1]; + my $ieDirectory = $_[2]; + my $rightsFilePath = $_[3]; + my $aRefAddMetaFile = $_[4]; + my $refAddBagInfo = $_[5]; # ref to array of hashes + my $bagPath = $_[6]; + my $dataPath = $_[7]; + my $metaPath = $_[8]; + + my @addMetaFile = @{ $aRefAddMetaFile }; + my @addBagInfo = @{ $refAddBagInfo }; + my $marc21; + + # get descriptive metadata from catalog + if ($ppn) { + $marc21 = get_marc21_from_catalogue($ppn); + } + # create bag dir + SLUB::LZA::SIPBuilderBagIt::createDir($bagPath); + # move payloads to data dir (which is automatically renamed from IE Directory) + print_scalar_data("", "INFO: moving original data...", "", "white"); + dirmove($ieDirectory, $dataPath) || die ("Error moving $ieDirectory to $dataPath"); + # create meta dir + SLUB::LZA::SIPBuilderBagIt::createDir($metaPath); + # copy not payload files to meta + SLUB::LZA::SIPBuilderBagIt::copyFilesToMeta($ppn, $metaPath, $rightsFilePath, \@addMetaFile, $marc21); + # add metadata for bag-info.txt + add_metadata(\@addBagInfo, $ppn, $noppn, $marc21); + # metadata warnings + SLUB::LZA::SIPBuilderBagIt::checkForTitle(\@addBagInfo); + # create slub bagit + print_scalar_data("", "INFO: building SIP...", "", "white"); + SLUB::LZA::SIPBuilderBagIt::create_slub_bagit($bagPath, \@addBagInfo); + + return 1; +} + +sub checkForTitle($){ + if(ref($_[0]) ne 'ARRAY') { croak "no array of hashes with bag-info.txt key values given!"; } + + my $key_hash_list_aref = $_[0]; + + # collect all bag-info keys for easy check + my @keys; + for my $hash (@{$key_hash_list_aref}) { + push @keys, keys %{$hash}; + } + # check for title + if (none { $_ =~ m/^[Tt][Ii][Tt][Ll][Ee]/ } @keys) { + print_scalar_data("", "WARNING: No Title given. Please consider adding a human-readable title with '--add_key_value=Title:YOUR-TITLE'", "", "yellow"); + return 0; + } + return 1; +} +1; \ No newline at end of file -- GitLab