package SLUB::LZA::SIPBuilderBagIt; use strict; use warnings; use utf8; use feature 'say'; no warnings "experimental::signatures"; use feature 'signatures'; use Archive::BagIt; use Carp; use Encode; use File::Copy; use File::Copy::Recursive qw(dircopy dirmove); use File::Basename; use File::Find; use List::Util qw(none); use LWP::UserAgent; # to get MARC data use MARC::Record; use Path::Tiny; use Term::ANSIColor; # colored print use XML::LibXML; use XML::XPath; our $VERBOSE = 0; # print output switch 0 => on, 1 and larger = off our $with_debug = 0; # output debug infos and files # catalogue infos my $swb_url = 'https://sru.bsz-bw.de/swb'; my $searchkey_swb = 'pica.swn'; my $searchkey_k10p = 'pica.ppn'; my $recordschema_swb = 'marcxmlvbos'; my $recordschema_k10p = 'marcxmlk10os'; # MARC21 -> bag-info.txt mapping # reference: https://www.loc.gov/marc/bibliographic/bd20x24x.html my $marc21_to_baginfo_mappings = [ { tag => "245", code => "a", key => "Title" }, { tag => "245", code => "b", key => "Title" }, { tag => "245", code => "c", key => "Title" }, { tag => "246", code => "a", key => "Title" }, { tag => "246", code => "b", key => "Title" }, ]; sub add_metadata($refAddBagInfo, $ppn, $noppn, $marc21) { # adding entries to bag-info key list push @{$refAddBagInfo}, {'External-Identifier' => "PPN:$ppn"} unless (! $ppn); push @{$refAddBagInfo}, {'External-Identifier' => "$noppn"} unless (! $noppn); if ($marc21) { foreach my $mapping (@{$marc21_to_baginfo_mappings}) { my $text = get_marc21_text_node($marc21, $mapping); push @{$refAddBagInfo}, {$mapping->{key} => $text} unless (! $text); } } return 1; } sub get_marc21_text_node($marc21, $mapping) { if(ref($mapping) ne 'HASH') { croak "not a mapping hash!"; } # retrieve value based on mapping using a xpath my $xpath = "//*[\@tag='" . $mapping->{tag} . "']/"; $xpath .= "*[local-name()='subfield' and \@code='" . $mapping->{code} . "']/" unless (! exists $mapping->{code}); $xpath .= "text()"; # HINT: xpath example //*[@tag='245']/*[local-name()='subfield' and @code='a']/text() my $text = XML::XPath->new($marc21)->findnodes_as_string($xpath); return $text; } sub get_marc21_from_catalogue($ppn){ if(! $_[0]){ croak "ppn not defined!"; } my $marc21; my $use_fallback; eval { # try SWB catalogue $marc21 = SLUB::LZA::SIPBuilderBagIt::get_marc21_from($swb_url, $ppn, $searchkey_swb, $recordschema_swb); }; if ($@) { # error say "$@" unless (! $with_debug); if ($@ =~ m#ERROR: Did not get any <recordData/> for PPN#) { print_scalar_data("", "WARNING: '$ppn' not a SWB PPN, fallback to K10plus", "", "yellow"); $use_fallback = 1; } else { die "$@"; # throw exception again } } if ($use_fallback) { eval { # try K10plus catalogue $marc21 = SLUB::LZA::SIPBuilderBagIt::get_marc21_from($swb_url, $ppn, $searchkey_k10p, $recordschema_k10p); }; if ($@) { # error say "$@" unless (!$with_debug); if ($@ =~ m#ERROR: Did not get any <recordData/> for PPN#) { print_scalar_data("", "ERROR: '$ppn' also not a K10plus PPN", "", "red"); exit 1; } else { die "$@"; # throw exception again } } } print_scalar_data("", "INFO: Found metadata for PPN '$ppn'", "", "white"); return $marc21; } sub get_marc21_from($url, $ppn,$key,$schema){ # example: ppn="457035137" for "Der Fichtelberg" # specification SRU/SRW BSZ: https://wiki.k10plus.de/pages/viewpage.action?pageId=132874251 my $ua = LWP::UserAgent->new; $ua->agent("MyApp/0.1 "); $ua->timeout(3600); #1h my $srubase = $url; # host my $srusearchkey = $key; # SRU search key my $sruvalue = $ppn; my $srumaxrecords = 1; my $srustartrecord = 1; my $sruschema = $schema; my $sru = "${srubase}?version=1.1&query=${srusearchkey}%3D${sruvalue}&operation=searchRetrieve&maximumRecords=${srumaxrecords}&startRecord=${srustartrecord}&recordSchema=${sruschema}"; if ($with_debug) {say "catalog-URL='$sru'";} my $response = $ua->get($sru); # ask SWB for given PPN if ($response->is_success) { # parse ZiNG response, extract MARC-data my $xp = XML::XPath->new($response->decoded_content); my $parser = XML::LibXML->new(); if ($with_debug) { say "write DEBUG_${ppn}_catalog_response.xml"; SLUB::LZA::SIPBuilderBagIt::write_file("DEBUG_${ppn}_catalog_response.xml", $response->decoded_content); } my $recordData = $xp->findnodes_as_string('/*[local-name()="searchRetrieveResponse"]/*[local-name()="records"]/*[local-name()="record"]/*[local-name()="recordData"]/*'); if (!$recordData) { croak("ERROR: Did not get any <recordData/> for PPN '$ppn' using '$sru'");} my $marc_doc = $parser->parse_string($recordData); add_marc21_schema_location($marc_doc); if ($with_debug) { say "write DEBUG_${ppn}_extracted_marc_record.xml"; SLUB::LZA::SIPBuilderBagIt::write_file("DEBUG_${ppn}_extracted_marc_record.xml", $marc_doc); } return $marc_doc->serialize(); } else { carp("Problem asking catalogue at $url using $ppn"); } return; } sub add_marc21_schema_location($marc_doc) { if(ref($marc_doc) ne 'XML::LibXML::Document') { croak "marc xml document required!"; } my $root = $marc_doc->findnodes('/*[local-name()="record"]')->[0]; # get root element (XML::LibXML::Element) $root->setAttribute('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'); # add xsi namespace (simple) $root->setAttribute('xsi:schemaLocation', 'http://www.loc.gov/MARC21/slim https://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd'); # add MARCXML schema location # HINT: method sets or replaces the node's attribute return 1; } sub write_file($filename, $value){ # write data to file (UTF-8) open(my $fh, '>:encoding(UTF-8)', $filename) || (croak "Can't open '$filename', $!"); print $fh $value; close($fh) || (croak "could not close file '$filename', $!"); return 1; } sub generateBagName($file_date, $ppn, $noppn){ $file_date =~ s/T/_/g; # replace 'T' with '_' $file_date =~ s/:/-/g; # replace ':' with '-' return (defined $ppn)? "PPN-${ppn}_${file_date}" : "ID-${noppn}_${file_date}"; } sub createDir($path){ if(! -d $path && ! -f $path){ eval { path($path)->mkpath; }; if ($@) { # error print_scalar_data("", "ERROR: Failed to create directory '$path'.", "", "red"); die $@; } } return 1; } sub copyFilesToMeta($ppn, $metaPath,$rightsFilePath,$aRefAddMetaFile,$marc21){ my @addMetaFile = @{ $aRefAddMetaFile }; if(@addMetaFile){ my $i = 1; foreach my $file(@addMetaFile){ # check reserved file names my $meta_file_name = basename($file); if($meta_file_name eq "rights.xml" || $meta_file_name eq "marc21.xml"){ foreach my $f(@addMetaFile){ # check if numbered xml file exists my $f_name = basename($f); while($f_name eq "$i.xml"){ $i++; } } print_scalar_data("", "WARNING: Renaming " . $meta_file_name . " to $i.xml, because meta filename <rights.xml> or <marc21.xml> is reserved.", "", "yellow"); print_scalar_data("", "INFO: Read Docu for more information at https://slubarchiv.slub-dresden.de/technische-standards-fuer-die-ablieferung-von-digitalen-dokumenten/", "", "white"); copy($file, $metaPath) or die "Copy failed: $!"; rename("$metaPath/$meta_file_name", "$metaPath/$i.xml"); $i++; }else{ copy($file, $metaPath) or die "Copy failed: $!"; } } } # marc21.xml if ($ppn && $marc21) { write_file("$metaPath/marc21.xml", $marc21); } # rights.xml if(basename($rightsFilePath) ne "rights.xml"){ my $rights_name = basename($rightsFilePath); print_scalar_data("", "WARNING: Renaming " . $rights_name . " to rights.xml, because meta filename <rights.xml> is required.", "", "yellow"); print_scalar_data("", "INFO: Read Docu for more information at https://slubarchiv.slub-dresden.de/technische-standards-fuer-die-ablieferung-von-digitalen-dokumenten/", "", "white"); copy($rightsFilePath, $metaPath) or die "Copy failed: $!"; rename("$metaPath/$rights_name", "$metaPath/rights.xml"); }else{ copy($rightsFilePath, $metaPath) or die "Copy failed: $!"; } return 1; } sub validateRightsXML($rightsFilePathString,$xsdName){ my $rightsFilePath = path($rightsFilePathString); if (!$rightsFilePath->is_file) { die "rightsFilePath '$rightsFilePath' is not readable/does not exist"; } my $rightsName = $rightsFilePath->basename(); my $xsd_file = path(__FILE__)->parent->realpath->parent->child("xsd")->child($xsdName); # absolute path if (! $xsd_file->is_file) { # retry, because assembled path for lib/ $xsd_file = path(__FILE__)->parent->parent->parent->realpath->parent->child("xsd")->child($xsdName); } my $schema = XML::LibXML::Schema->new(location => $xsd_file); my $parser = XML::LibXML->new; my $doc = $parser->parse_file($rightsFilePath); eval { $schema->validate($doc); }; if ($@) { # error die "File $rightsName failed validation: $@"; }; return 1; } sub print_scalar_data($header,$p,$arrRef,$pColor){ my $priority = 0; # header if($header ne ""){ print "----------------------------------------------------------------------------------\n" if $priority >= $VERBOSE; printf "%" . length($header) . "s\n", colored($header, 'bold green') if $priority >= $VERBOSE; print "----------------------------------------------------------------------------------\n" if $priority >= $VERBOSE; } # paragraph if(($p ne "") && ($arrRef eq "")){ printf "%" . length($p) . "s\n", colored($p, "bold $pColor") if $priority >= $VERBOSE; } # paragraph with converted array if(($p ne "") && ($arrRef ne "")){ my $string = ""; for my $i (0 .. $#$arrRef){ if($i eq $#$arrRef){ $string .= " $$arrRef[$i]"; } else { $string .= " $$arrRef[$i],"; } } my $body = $p . $string; printf "%" . length($body) . "s\n", colored($body, "bold $pColor") if $priority >= $VERBOSE; } return 1; } sub create_slub_bagit($bagPath,$refAddBagInfo){ # construct bag if(! defined $bagPath) { croak "bagPath is not defined!"; } if(! -d $bagPath) { croak "path '$bagPath' is not directory!" } if(! defined $refAddBagInfo ) { croak "array of hashes not defined for bag-info.txt!"; } my $oArchiveBagIt = Archive::BagIt->make_bag($bagPath); $oArchiveBagIt->bag_info($refAddBagInfo); $oArchiveBagIt->store(); print_scalar_data("", "SUCCESS: SLUB SIP at $bagPath build successfully!", "", "green"); return 1; } sub buildBagWithCopyOption($ppn,$noppn,$ieDirectory,$rightsFilePath,$aRefAddMetaFile,$refAddBagInfo,$bagPath,$dataPath,$metaPath){ my @addMetaFile = @{ $aRefAddMetaFile }; my @addBagInfo = @{ $refAddBagInfo }; my $marc21; # get descriptive metadata from catalog if ($ppn) { $marc21 = get_marc21_from_catalogue($ppn); } # create bag dir SLUB::LZA::SIPBuilderBagIt::createDir($bagPath); # create data dir SLUB::LZA::SIPBuilderBagIt::createDir($dataPath); # copy payload files to data print_scalar_data("", "INFO: copying original data...", "", "white"); dircopy($ieDirectory, $dataPath) || die ("Error coping $ieDirectory to $dataPath"); # create meta dir SLUB::LZA::SIPBuilderBagIt::createDir($metaPath); # copy not payload files to meta SLUB::LZA::SIPBuilderBagIt::copyFilesToMeta($ppn, $metaPath, $rightsFilePath, \@addMetaFile, $marc21); # add metadata for bag-info.txt add_metadata(\@addBagInfo, $ppn, $noppn, $marc21); # metadata warnings SLUB::LZA::SIPBuilderBagIt::checkForTitle(\@addBagInfo); # create slub bagit print_scalar_data("", "INFO: building SIP...", "", "white"); SLUB::LZA::SIPBuilderBagIt::create_slub_bagit($bagPath, \@addBagInfo); return 1; } sub buildBagWithMoveOption($ppn,$noppn,$ieDirectory,$rightsFilePath,$aRefAddMetaFile,$refAddBagInfo,$bagPath,$dataPath,$metaPath){ my @addMetaFile = @{ $aRefAddMetaFile }; my @addBagInfo = @{ $refAddBagInfo }; my $marc21; # get descriptive metadata from catalog if ($ppn) { $marc21 = get_marc21_from_catalogue($ppn); } # create bag dir SLUB::LZA::SIPBuilderBagIt::createDir($bagPath); # move payloads to data dir (which is automatically renamed from IE Directory) print_scalar_data("", "INFO: moving original data...", "", "white"); dirmove($ieDirectory, $dataPath) || die ("Error moving $ieDirectory to $dataPath"); # create meta dir SLUB::LZA::SIPBuilderBagIt::createDir($metaPath); # copy not payload files to meta SLUB::LZA::SIPBuilderBagIt::copyFilesToMeta($ppn, $metaPath, $rightsFilePath, \@addMetaFile, $marc21); # add metadata for bag-info.txt add_metadata(\@addBagInfo, $ppn, $noppn, $marc21); # metadata warnings SLUB::LZA::SIPBuilderBagIt::checkForTitle(\@addBagInfo); # create slub bagit print_scalar_data("", "INFO: building SIP...", "", "white"); SLUB::LZA::SIPBuilderBagIt::create_slub_bagit($bagPath, \@addBagInfo); return 1; } sub buildBagWithReplaceOption($ppn,$noppn,$ieDirectory,$rightsFilePath,$aRefAddMetaFile,$refAddBagInfo,$bagPath,$dataPath,$metaPath){ my @addMetaFile = @{ $aRefAddMetaFile }; my @addBagInfo = @{ $refAddBagInfo }; my $marc21; # get descriptive metadata from catalog if ($ppn) { $marc21 = get_marc21_from_catalogue($ppn); } # create bag dir SLUB::LZA::SIPBuilderBagIt::createDir($bagPath); # move payloads to data dir (which is automatically renamed from IE Directory) print_scalar_data("", "INFO: moving original data...", "", "white"); dirmove($ieDirectory, $dataPath) || die ("Error moving $ieDirectory to $dataPath"); # create meta dir SLUB::LZA::SIPBuilderBagIt::createDir($metaPath); # copy not payload files to meta SLUB::LZA::SIPBuilderBagIt::copyFilesToMeta($ppn, $metaPath, $rightsFilePath, \@addMetaFile, $marc21); # add metadata for bag-info.txt add_metadata(\@addBagInfo, $ppn, $noppn, $marc21); # metadata warnings SLUB::LZA::SIPBuilderBagIt::checkForTitle(\@addBagInfo); # create slub bagit print_scalar_data("", "INFO: building SIP...", "", "white"); SLUB::LZA::SIPBuilderBagIt::create_slub_bagit($bagPath, \@addBagInfo); return 1; } sub checkForTitle($key_hash_list_aref){ # collect all bag-info keys for easy check my @keys; for my $hash (@{$key_hash_list_aref}) { push @keys, keys %{$hash}; } # check for title if (none { $_ =~ m/^[Tt][Ii][Tt][Ll][Ee]/ } @keys) { print_scalar_data("", "WARNING: No Title given. Please consider adding a human-readable title with '--add_key_value=Title:YOUR-TITLE'", "", "yellow"); return 0; } return 1; } 1;