diff --git a/bin/slubsipbuilder.pl b/bin/slubsipbuilder.pl index be608c31d3be8fc16d6eca2db61f96ba7e77298a..d520dfcfb18cb2d92adca46638085e98125fa7a7 100755 --- a/bin/slubsipbuilder.pl +++ b/bin/slubsipbuilder.pl @@ -17,96 +17,48 @@ # CREATED: 2019-07-23 #=============================================================================== + + use strict; use warnings; use Carp; -use 5.20.0; -use strict; -use warnings; -use Archive::Zip::SimpleZip qw($SimpleZipError); -use DateTime::Format::ISO8601; -use Digest::MD5 qw(md5); -use File::Basename; -use File::Copy qw(cp); -use File::Find; -use Path::Tiny; -use Getopt::Long; -use LWP::UserAgent; # to get MARC data -use MARC::Record; -use Pod::Usage; -use XML::LibXML; -use XML::LibXSLT; -use XML::XPath; -use constant buffer => 100*1024*1024; # use 100MB as Buffer - -#=============================================================================== - -my $directory; -my $ppn; -my $noppn; -my $output; -my $as_zip; -my $external_id; -my $external_workflow; -my $external_isil=""; -my $external_value_descr; -my $external_conservation_flag; -my $with_debug; -my $help; -my $man; - -our $VERSION = '1.1'; -GetOptions( - "IE_directory=s" => \$directory, # required - "ppn=s" => \$ppn, # semi-optional (choice 1 of 2) - "noppn=s" => \$noppn, # semi-optional (choice 2 of 2) - "SIP_output_path=s" => \$output, # required - "as_zip" => \$as_zip, # optional, default: do not zip - "external_id=s" => \$external_id, # required - "external_workflow=s" => \$external_workflow, # required - "external_ISIL=s" => \$external_isil, # optional, default: no ISIL - "external_value_descr=s" => \$external_value_descr, # required - "external_conservation_flag" => \$external_conservation_flag, # optional, default: no special conservation - "debug" => \$with_debug, # optional - "help|?" => \$help, # optional - "man" => \$man, # optional -) or pod2usage(2); - -if ($help) { pod2usage(1); } -if ($man) { pod2usage(-exitval => 0, -verbose => 2); } -if (!defined $directory) { confess("you need to specify an IE directory, which needs to be archived"); } -if ((defined $ppn) && (defined $noppn)) { confess("you can only specify either -ppn or -noppn"); } -if ((!defined $ppn) && (!defined $noppn)) { confess("you need to specify a PPN with -ppn or use --noppn"); } -if (!defined $output) { confess("you need to specify an output path, where the SIP will be stored"); } -if (!defined $external_conservation_flag) { $external_conservation_flag="false"; } else { $external_conservation_flag="true"; } -if (! -d $directory) { confess("you need to specify an IE directory, which needs to be archived, $!"); } -$directory = path($directory)->realpath->stringify; -path($output)->mkpath; -$output = path($output)->realpath->stringify; -if ($external_id !~ m#^[a-z0-9]+$#) { confess("you need to specify a valid external ID (^[a-z0-9]+\$)"); } -if ($external_workflow !~ m#^[a-z0-9]+$#) { confess("you need to specify a valid external workflow (^[a-z0-9]+\$)"); } -if (!$external_value_descr) { confess("you need to specify an external value description (reason for archiving)"); } -my ($url, $searchkey, $recordschema); -if (defined $ppn) { $url="https://sru.bsz-bw.de/swb"; $searchkey="pica.swn"; $recordschema="marcxmlvbos"; }; - -#=============================================================================== - -# write data to file (UTF-8) -sub write_file ($$) { # - my $filename = $_[0]; - my $value = $_[1]; - my $fh; - open($fh, '>:encoding(UTF-8)', $filename) || ( die "Can't open '$filename', $!"); - print $fh $value; - close($fh) || (die "could not close file '$filename', $!"); - return 1; -} +use 5.28.0; +package SLUB::LZA::SIPBuilder; + use DateTime::Format::ISO8601; + use File::Copy qw(cp); + use File::Find; + use Path::Tiny; + use LWP::UserAgent; # to get MARC data + use MARC::Record; + use Pod::Usage; + use XML::LibXML; + use XML::LibXSLT; + use XML::XPath; + use Carp; + my $marc_mods_url = 'http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3-6.xsl'; + my $marc_utils_url = 'http://www.loc.gov/standards/marcxml/xslt/MARC21slimUtils.xsl'; + my $swb_url = 'https://sru.bsz-bw.de/swb'; + my $searchkey = "pica.swn"; + my $recordschema = "marcxmlvbos"; + our $VERSION = '1.2'; + our $with_debug=0; + + + # write data to file (UTF-8) + sub write_file($$) { + my $filename = $_[0]; + my $value = $_[1]; + open(my $fh, '>:encoding(UTF-8)', $filename) || (croak "Can't open '$filename', $!"); + print $fh $value; + close($fh) || (croak "could not close file '$filename', $!"); + return 1; + } -# this will patch the mods-xml as a workaround for bugs in LOCs xslt files -sub patch_mods($) { - my $modsobj = shift; # mods expected as XML Parser object - # TODO: Bugfix for /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/mods:modsCollection[1]/mods:mods[1]/mods:relatedItem[2]/mods:internetMediaType[1] - my $xslt_patch_string =<<PATCH; + # this will patch the mods-xml as a workaround for bugs in LOCs xslt files + sub patch_mods($) { + my $modsobj = shift; # mods expected as XML Parser object + # TODO: Bugfix for /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/mods:modsCollection[1]/mods:mods[1]/mods:relatedItem[2]/mods:internetMediaType[1] + my $xslt_patch_string = <<'PATCH'; <?xml version="1.0" encoding="UTF-8"?> <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" @@ -127,16 +79,16 @@ sub patch_mods($) { </xsl:template> </xsl:stylesheet> PATCH - my $xslt = XML::LibXSLT->new(); - my $xslt_patch = XML::LibXML->load_xml(string=>$xslt_patch_string, no_cdata=>1); - my $stylesheet = $xslt->parse_stylesheet ( $xslt_patch); - my $result = $stylesheet->transform( $modsobj ); - return $result; -} + my $xslt = XML::LibXSLT->new(); + my $xslt_patch = XML::LibXML->load_xml(string => $xslt_patch_string, no_cdata => 1); + my $stylesheet = $xslt->parse_stylesheet($xslt_patch); + my $result = $stylesheet->transform($modsobj); + return $result; + } -sub patch_marc_response($) { - my $marcobj = shift; # marcobj expected as XML Parser object - my $xslt_patch_string =<<PATCH2; + sub patch_marc_response($) { + my $marcobj = shift; # marcobj expected as XML Parser object + my $xslt_patch_string = <<'PATCH2'; <?xml version="1.0" encoding="UTF-8"?> <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns="http://www.loc.gov/MARC21/slim" xmlns:srw="http://www.loc.gov/zing/srw/" @@ -161,110 +113,208 @@ sub patch_marc_response($) { </xsl:template> </xsl:stylesheet> PATCH2 - my $xslt = XML::LibXSLT->new(); - my $xslt_patch = XML::LibXML->load_xml(string=>$xslt_patch_string, no_cdata=>1); - my $stylesheet = $xslt->parse_stylesheet ( $xslt_patch); - my $result = $stylesheet->transform( $marcobj ); - return $result; -} - -# specification SRU/SRW BSZ: https://wiki.k10plus.de/pages/viewpage.action?pageId=132874251 -sub get_mods_from ($$$$) { # $mods = ($url, $ppn, $searchkey, $recordschema) - my $url = shift; - my $ppn = shift; # example: "457035137" for "Der Fichtelberg" - my $key = shift; - my $schema = shift; - - # check xsl directory - my $xsl_dir = path( dirname(__FILE__) )->realpath->parent->child("xsl")->stringify; - if (! -d $xsl_dir) { - say "Rebuilding XSL directory '$xsl_dir'"; - mkpath($xsl_dir) || confess ("could not mkdir '$xsl_dir', $!");; + my $xslt = XML::LibXSLT->new(); + my $xslt_patch = XML::LibXML->load_xml(string => $xslt_patch_string, no_cdata => 1); + my $stylesheet = $xslt->parse_stylesheet($xslt_patch); + my $result = $stylesheet->transform($marcobj); + return $result; } - #### where to find XSLT - my $marc_mods_url = 'http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3-6.xsl'; - my $marc_mods_path = path($xsl_dir)->child( basename($marc_mods_url) )->stringify; - my $marc_mods_patched_path = path($xsl_dir)->child( basename($marc_mods_url,".xsl").".patched.xsl" )->stringify; - my $marc_utils_url = 'http://www.loc.gov/standards/marcxml/xslt/MARC21slimUtils.xsl'; - my $marc_utils_path = path($xsl_dir)->child( basename($marc_utils_url) )->stringify; - - my $ua = LWP::UserAgent->new; - $ua->agent("MyApp/0.1 "); - $ua->timeout(3600); #1h - # check MARC21 utility xsl - if (! -f $marc_utils_path) { - say "Downloading MARC21 utility xsl '$marc_utils_url'"; - my $result = $ua->get($marc_utils_url); - if ($result->is_error) { - die "Failed to download '$marc_utils_url', " . $result->error_as_HTML; + sub check_marc21_utility { + my $xsl_dir = shift; + my $ua = shift; + my $marc_utils_basename = path($marc_utils_url)->basename; + my $marc_utils_path = path($xsl_dir)->child($marc_utils_basename); + if (!$marc_utils_path->is_file) { + say "Downloading MARC21 utility xsl '$marc_utils_url'"; + my $result = $ua->get($marc_utils_url); + if ($result->is_error) { + croak "Failed to download '$marc_utils_url', " . $result->error_as_HTML; + } + say "Saving MARC21 utility xsl to file '$marc_utils_path'"; + my $xsl = $result->decoded_content; + write_file($marc_utils_path, $xsl); } - say "Saving MARC21 utility xsl to file '$marc_utils_path'"; - my $xsl = $result->decoded_content; - write_file($marc_utils_path, $xsl); + return $marc_utils_path; } # check MARC21->MODS xsl - if (! -f $marc_mods_patched_path) { - say "Downloading MARC21->MODS xsl '$marc_mods_url'"; - my $result = $ua->get($marc_mods_url); - if ($result->is_error) { - die "Failed to download '$marc_mods_url', " . $result->error_as_HTML; + sub check_marc21_mods_xsl { + my $xsl_dir = shift; + my $ua = shift; + my $marc_mods_basename = path($marc_mods_url)->basename; + my $marc_mods_path = path($xsl_dir)->child($marc_mods_basename)->stringify; + my $marc_mods_patched_basename = path($marc_mods_url)->basename(".xsl") . ".patched.xsl"; + my $marc_mods_patched_path = path($xsl_dir)->child($marc_mods_patched_basename); + if (! $marc_mods_patched_path->is_file) { + say "Downloading MARC21->MODS xsl '$marc_mods_url'"; + my $result = $ua->get($marc_mods_url); + if ($result->is_error) { + croak "Failed to download '$marc_mods_url', " . $result->error_as_HTML; + } + say "Modifying MARC21->MODS xsl for offline use"; + my $xsl = $result->decoded_content; + write_file($marc_mods_path, $xsl); + my $xsl_modified = $xsl; + my $marc_utils_path = check_marc21_utility( $xsl_dir, $ua); + $xsl_modified =~ s#$marc_utils_url#$marc_utils_path#g; + say "Saving MARC21->MODS xsl to file '$marc_mods_path'"; + write_file($marc_mods_patched_path, $xsl_modified); } - say "Modifying MARC21->MODS xsl for offline use"; - my $xsl = $result->decoded_content; - write_file($marc_mods_path, $xsl); - my $xsl_modified = $xsl; - $xsl_modified =~ s#$marc_utils_url#$marc_utils_path#g; - say "Saving MARC21->MODS xsl to file '$marc_mods_path'"; - write_file($marc_mods_patched_path, $xsl_modified); + return $marc_mods_patched_path; } - my $srubase=$url; # host - my $srusearchkey=$key; # SRU search key - my $sruvalue=$ppn; - my $srumaxrecords=1; - my $srustartrecord=1; - my $sruschema=$schema; - my $sru = "${srubase}?version=1.1&query=${srusearchkey}%3D${sruvalue}&operation=searchRetrieve&maximumRecords=${srumaxrecords}&startRecord=${srustartrecord}&recordSchema=${sruschema}"; - if ($with_debug) { say "catalog-URL='$sru'"; }; - my $record = $ua->get($sru); # ask SWB for given PPN - if ($record->is_success) { - # parse ZiNG repsonse, extract MARC-data - my $xp = XML::XPath->new( $record->decoded_content ); - my $parser = XML::LibXML->new(); - if ($with_debug) { - say "write DEBUG_${ppn}_response.xml"; - write_file("DEBUG_${ppn}_response.xml", $record->decoded_content); + sub check_xsl_directory { + # check xsl directory + my $xsl_dir = path(__FILE__)->parent->realpath->parent->child("xsl"); + if (! $xsl_dir->is_dir) { + say "Rebuilding XSL directory '$xsl_dir'"; + $xsl_dir->mkpath() || confess("could not mkdir '$xsl_dir', $!"); } - my $recordData = $xp->findnodes_as_string('/*[local-name()="searchRetrieveResponse"]/*[local-name()="records"]/*[local-name()="record"]/*[local-name()="recordData"]/*'); - if (!$recordData) { die("ERROR: Did not get any <recordData/> for PPN '$ppn' using '$sru'"); } - my $marcblob = $parser->parse_string( $recordData ); - my $marcblob_patched = patch_marc_response( $marcblob ); - if ($with_debug) { - say "write DEBUG_${ppn}_marc_unpatched.xml"; - write_file("DEBUG_${ppn}_marc_unpatched.xml", $marcblob); - say "write DEBUG_${ppn}_marc.xml"; - write_file("DEBUG_${ppn}_marc.xml", $marcblob_patched); + return $xsl_dir; + } + + # specification SRU/SRW BSZ: https://wiki.k10plus.de/pages/viewpage.action?pageId=132874251 + sub get_mods_from($$$$) { + # $mods = ($url, $ppn, $searchkey, $recordschema) + my $url = shift; + my $ppn = shift; # example: "457035137" for "Der Fichtelberg" + my $key = shift; + my $schema = shift; + + + #### where to find XSLT + + + + + my $ua = LWP::UserAgent->new; + $ua->agent("MyApp/0.1 "); + $ua->timeout(3600); #1h + + my $xsl_dir = check_xsl_directory(); + check_marc21_utility($xsl_dir, $ua); + check_marc21_mods_xsl($xsl_dir, $ua); + + + my $srubase = $url; # host + my $srusearchkey = $key; # SRU search key + my $sruvalue = $ppn; + my $srumaxrecords = 1; + my $srustartrecord = 1; + my $sruschema = $schema; + my $sru = "${srubase}?version=1.1&query=${srusearchkey}%3D${sruvalue}&operation=searchRetrieve&maximumRecords=${srumaxrecords}&startRecord=${srustartrecord}&recordSchema=${sruschema}"; + if ($with_debug) {say "catalog-URL='$sru'";} + my $response = $ua->get($sru); # ask SWB for given PPN + if ($response->is_success) { + # parse ZiNG repsonse, extract MARC-data + my $xp = XML::XPath->new($response->decoded_content); + my $parser = XML::LibXML->new(); + if ($with_debug) { + say "write DEBUG_${ppn}_response.xml"; + write_file("DEBUG_${ppn}_response.xml", $response->decoded_content); + } + my $recordData = $xp->findnodes_as_string('/*[local-name()="searchRetrieveResponse"]/*[local-name()="records"]/*[local-name()="record"]/*[local-name()="recordData"]/*'); + if (!$recordData) { croak("ERROR: Did not get any <recordData/> for PPN '$ppn' using '$sru'");} + my $marcblob = $parser->parse_string($recordData); + + + my $marcblob_patched = patch_marc_response($marcblob); + if ($with_debug) { + say "write DEBUG_${ppn}_marc_unpatched.xml"; + write_file("DEBUG_${ppn}_marc_unpatched.xml", $marcblob); + say "write DEBUG_${ppn}_marc.xml"; + write_file("DEBUG_${ppn}_marc.xml", $marcblob_patched); + } + my $marc_mods_patched_path = check_marc21_mods_xsl($xsl_dir, $ua); + my $xslt = XML::LibXSLT->new(); + my $marcmods = XML::LibXML->load_xml(location => $marc_mods_patched_path, no_cdata => 1); + my $stylesheet = $xslt->parse_stylesheet($marcmods); + my $marc = $parser->parse_string($marcblob_patched); + my $result = $stylesheet->transform($marc); + if ($with_debug) { + say "write DEBUG_${ppn}_unpatched_mods.xml"; + write_file("DEBUG_${ppn}_unpatched_mods.xml", $stylesheet->output_string($result)); + } + + $result = patch_mods($result); + my $result_string = $stylesheet->output_string($result); + return $result_string; } - my $xslt = XML::LibXSLT->new(); - my $marcmods = XML::LibXML->load_xml( location=>$marc_mods_patched_path, no_cdata=>1 ); - my $stylesheet = $xslt->parse_stylesheet( $marcmods ); - my $marc = $parser->parse_string( $marcblob_patched ); - my $result = $stylesheet->transform( $marc ); - if ($with_debug) { - say "write DEBUG_${ppn}_unpatched_mods.xml"; - write_file("DEBUG_${ppn}_unpatched_mods.xml", $stylesheet->output_string( $result )); + else { + carp("Problem asking catalogue at $url using $ppn"); } - $result = patch_mods( $result ); - my $result_string = $stylesheet->output_string( $result ); - return $result_string; - } else { - carp ("Problem asking catalogue at $url using $ppn"); + return; } - return; + # end package + +package main; +#=============================================================================== + +BEGIN{ + $INC{'SLUB/LZA/SIPBuilder.pm'} = 1; # needed because inlined module } +return 1 if caller; # avoids main code running if module stuff is needed +use SLUB::LZA::SIPBuilder; +use Archive::Zip::SimpleZip qw($SimpleZipError); +use Getopt::Long; +use Path::Tiny; +use Digest::MD5; +use constant buffer => 100 * 1024 * 1024; # use 100MB as Buffer +use File::Find; +use File::Copy qw(cp); + +my $directory; +my $ppn; +my $noppn; +my $output; +my $as_zip; +my $external_id; +my $external_workflow; +my $external_isil=""; +my $external_value_descr; +my $external_conservation_flag; + +my $help; +my $man; + + +GetOptions( + "IE_directory=s" => \$directory, # required + "ppn=s" => \$ppn, # semi-optional (choice 1 of 2) + "noppn=s" => \$noppn, # semi-optional (choice 2 of 2) + "SIP_output_path=s" => \$output, # required + "as_zip" => \$as_zip, # optional, default: do not zip + "external_id=s" => \$external_id, # required + "external_workflow=s" => \$external_workflow, # required + "external_ISIL=s" => \$external_isil, # optional, default: no ISIL + "external_value_descr=s" => \$external_value_descr, # required + "external_conservation_flag" => \$external_conservation_flag, # optional, default: no special conservation + "debug" => \$SLUB::LZA::SIPBuilder::with_debug, # optional + "help|?" => \$help, # optional + "man" => \$man, # optional +) or pod2usage(2); + +if ($help) { pod2usage(1); } +if ($man) { pod2usage(-exitval => 0, -verbose => 2); } +if (!defined $directory) { confess("you need to specify an IE directory, which needs to be archived"); } +if ((defined $ppn) && (defined $noppn)) { confess("you can only specify either -ppn or -noppn"); } +if ((!defined $ppn) && (!defined $noppn)) { confess("you need to specify a PPN with -ppn or use --noppn"); } +if (!defined $output) { confess("you need to specify an output path, where the SIP will be stored"); } +if (!defined $external_conservation_flag) { $external_conservation_flag="false"; } else { $external_conservation_flag="true"; } +if (! -d $directory) { confess("you need to specify an IE directory, which needs to be archived, $!"); } +$directory = path($directory)->realpath->stringify; +path($output)->mkpath; +$output = path($output)->realpath->stringify; +if ($external_id !~ m#^[a-z0-9]+$#) { confess("you need to specify a valid external ID (^[a-z0-9]+\$)"); } +if ($external_workflow !~ m#^[a-z0-9]+$#) { confess("you need to specify a valid external workflow (^[a-z0-9]+\$)"); } +if (!$external_value_descr) { confess("you need to specify an external value description (reason for archiving)"); } + +#=============================================================================== + + + #=============================================================================== @@ -289,14 +339,14 @@ sub main { # prepare dmd-sec my $mods; if (defined $ppn) { - $mods = get_mods_from($url, $ppn, $searchkey, $recordschema); + $mods = SLUB::LZA::SIPBuilder::get_mods_from($swb_url, $ppn, $searchkey, $recordschema); if ($with_debug) { - write_file("DEBUG_${ppn}_mods.xml", $mods); + SLUB::LZA::SIPBuilder::write_file("DEBUG_${ppn}_mods.xml", $mods); } # remove the <xml /> from beginning of the answer $mods=~ s#<\?xml version="1.0" encoding="UTF-8"\?>#<!-- removed xml header from mods part -->#; } elsif (defined $noppn) { - $mods =<<MODS; + $mods =<<'MODS'; <mods version="3.6" xmlns="http://www.loc.gov/mods/v3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" @@ -305,7 +355,7 @@ sub main { </mods> MODS } - my $dmd =<<DMD; + my $dmd =<<'DMD'; <mets:dmdSec ID="DMDLOG_0000"> <!-- bibliographic metadata --> <mets:mdWrap MDTYPE="MODS"> @@ -317,7 +367,7 @@ MODS DMD # prepare amd-sec - my $amd =<<AMD; + my $amd =<<'AMD'; <mets:amdSec ID="AMD"> <!-- SIP metadata for automated processing by submission application --> <mets:techMD ID="ARCHIVE"> @@ -342,7 +392,7 @@ AMD my $wanted=sub { if (-d $_) { # dir, do nothing - () + (); } else { my $file=$File::Find::name; if ($file !~ m#^[-A-Za-z0-9_\./]+$#) { @@ -367,7 +417,7 @@ AMD finddepth($wanted, $directory); # create fileSec - my $filesec=<<FILESEC1; + my $filesec=<<'FILESEC1'; <mets:fileSec> <mets:fileGrp USE="LZA"> FILESEC1 @@ -382,13 +432,13 @@ FILESEC1 } $filesec = join("\n", $filesec, @fsec); } - $filesec = $filesec . <<FILESEC2; + $filesec = $filesec . <<'FILESEC2'; </mets:fileGrp> </mets:fileSec> FILESEC2 # prepare structmap - my $structmap =<<STRUCTMAP1; + my $structmap =<<'STRUCTMAP1'; <mets:structMap TYPE="PHYSICAL"> <mets:div ID="PHYS_0000" TYPE="ieDir"> STRUCTMAP1 @@ -403,13 +453,13 @@ STRUCTMAP1 } $structmap = join("\n", $structmap, @ssec); } - $structmap = $structmap . <<STRUCTMAP2; + $structmap = $structmap . <<'STRUCTMAP2'; </mets:div> </mets:structMap> STRUCTMAP2 # create sip.xml - my $sip =<<METS; + my $sip =<<'METS'; <?xml version="1.0" encoding="utf-8"?> <mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" @@ -423,7 +473,7 @@ METS # compress if needed if (!defined $as_zip) { - write_file( path($output)->child($sip_root_dir)->child("sip.xml")->stringify, $sip ); + SLUB::LZA::SIPBuilder::write_file( path($output)->child($sip_root_dir)->child("sip.xml")->stringify, $sip ); # copy source to target foreach my $source (sort keys (%filecopyhash)) { my $target = path($filecopyhash{$source}->{"target"})->stringify; # CHECK ON WINDOWS @@ -450,6 +500,7 @@ METS } say "SIP '$sip_root_dir' build successfully in '$zip_file_path'"; } + return; }