diff --git a/bin/slubsipbuilder.pl b/bin/slubsipbuilder.pl old mode 100644 new mode 100755 index de574ed0d573fae8e6010473aae62928395d97ed..932b8e3661bdb52e39eecbf93024d705ce87e006 --- a/bin/slubsipbuilder.pl +++ b/bin/slubsipbuilder.pl @@ -10,44 +10,54 @@ # REQUIREMENTS: --- # BUGS: --- # NOTES: related to official document -# "SIP Spezifikation (v1.4.1)" +# "SIP Spezifikation (v1.4.2)" # AUTHOR: Andreas Romeyke (romeyke@slub-dresden.de) # ORGANIZATION: SLUB # VERSION: 1.1 -# CREATED: 10.05.2016 +# CREATED: 2019-07-23 #=============================================================================== + + use strict; use warnings; use Carp; -use 5.20.0; -use strict; -use warnings; -use Archive::Zip::SimpleZip qw($SimpleZipError); -use Cwd; -use DateTime::Format::ISO8601; -use Digest::MD5 qw(md5); -use File::Basename; -use File::Copy qw(cp); -use File::Find; -use File::Path; -use File::Slurp; -use Getopt::Long; -use LWP::UserAgent; # to get MARC data -use MARC::Record; -use Pod::Usage; -use XML::LibXML; -use XML::LibXSLT; -use XML::XPath; -use constant buffer => 100*1024*1024; # use 100MB as Buffer - -my $with_debug = 0; - -# this will patch the mods-xml as a workaround for bugs in LOCs xslt files -sub patch_mods($) { - my $modsobj = shift; # mods expected as XML Parser object - # TODO: Bugfix for /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/mods:modsCollection[1]/mods:mods[1]/mods:relatedItem[2]/mods:internetMediaType[1] - my $xslt_patch_string =<<PATCH; +use 5.28.0; +package SLUB::LZA::SIPBuilder; + use DateTime::Format::ISO8601; + use File::Copy qw(cp); + use File::Find; + use Path::Tiny; + use LWP::UserAgent; # to get MARC data + use MARC::Record; + use XML::LibXML; + use XML::LibXSLT; + use XML::XPath; + use Carp; + my $marc_mods_url = 'http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3-6.xsl'; + my $marc_utils_url = 'http://www.loc.gov/standards/marcxml/xslt/MARC21slimUtils.xsl'; + my $swb_url = 'https://sru.bsz-bw.de/swb'; + my $searchkey = 
"pica.swn"; + my $recordschema = "marcxmlvbos"; + our $VERSION = '1.2'; + our $with_debug=0; + + + # write data to file (UTF-8) + sub write_file($$) { + my $filename = $_[0]; + my $value = $_[1]; + open(my $fh, '>:encoding(UTF-8)', $filename) || (croak "Can't open '$filename', $!"); + print $fh $value; + close($fh) || (croak "could not close file '$filename', $!"); + return 1; + } + + # this will patch the mods-xml as a workaround for bugs in LOCs xslt files + sub patch_mods($) { + my $modsobj = shift; # mods expected as XML Parser object + # TODO: Bugfix for /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/mods:modsCollection[1]/mods:mods[1]/mods:relatedItem[2]/mods:internetMediaType[1] + my $xslt_patch_string = <<'PATCH'; <?xml version="1.0" encoding="UTF-8"?> <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" @@ -68,16 +78,16 @@ sub patch_mods($) { </xsl:template> </xsl:stylesheet> PATCH - my $xslt = XML::LibXSLT->new(); - my $xslt_patch = XML::LibXML->load_xml(string=>$xslt_patch_string, no_cdata=>1); - my $stylesheet = $xslt->parse_stylesheet ( $xslt_patch); - my $result = $stylesheet->transform( $modsobj ); - return $result; -} + my $xslt = XML::LibXSLT->new(); + my $xslt_patch = XML::LibXML->load_xml(string => $xslt_patch_string, no_cdata => 1); + my $stylesheet = $xslt->parse_stylesheet($xslt_patch); + my $result = $stylesheet->transform($modsobj); + return $result; + } -sub patch_marc_response($) { - my $marcobj = shift; # marcobj expected as XML Parser object - my $xslt_patch_string =<<PATCH2; + sub patch_marc_response($) { + my $marcobj = shift; # marcobj expected as XML Parser object + my $xslt_patch_string = <<'PATCH2'; <?xml version="1.0" encoding="UTF-8"?> <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns="http://www.loc.gov/MARC21/slim" xmlns:srw="http://www.loc.gov/zing/srw/" @@ -102,155 +112,206 @@ sub 
patch_marc_response($) { </xsl:template> </xsl:stylesheet> PATCH2 - my $xslt = XML::LibXSLT->new(); - my $xslt_patch = XML::LibXML->load_xml(string=>$xslt_patch_string, no_cdata=>1); - my $stylesheet = $xslt->parse_stylesheet ( $xslt_patch); - my $result = $stylesheet->transform( $marcobj ); - return $result; -} - + my $xslt = XML::LibXSLT->new(); + my $xslt_patch = XML::LibXML->load_xml(string => $xslt_patch_string, no_cdata => 1); + my $stylesheet = $xslt->parse_stylesheet($xslt_patch); + my $result = $stylesheet->transform($marcobj); + return $result; + } + # check MARC21 utility xsl + sub check_marc21_utility { + my $xsl_dir = shift; + my $ua = shift; + my $marc_utils_basename = path($marc_utils_url)->basename; + my $marc_utils_path = path($xsl_dir)->child($marc_utils_basename); + if (!$marc_utils_path->is_file) { + say "Downloading MARC21 utility xsl '$marc_utils_url'"; + my $result = $ua->get($marc_utils_url); + if ($result->is_error) { + croak "Failed to download '$marc_utils_url', " . 
$result->error_as_HTML; + } + say "Saving MARC21 utility xsl to file '$marc_utils_path'"; + my $xsl = $result->decoded_content; + write_file($marc_utils_path, $xsl); + } + return $marc_utils_path; + } -# the "old" approach does not handle umlauts or UTF8-chars above ASCII table -# old: http://swb2.bsz-bw.de/sru/DB=2.1/username=/password=/?query=pica.ppn+%3D+"494384174"&startRecord=1&maximumRecords=10&recordSchema=marcxml -# new: http://swb2.bsz-bw.de/sru/DB=2.1/username=/password=/?query=pica.ppn+%3D+"494384174"&startRecord=1&maximumRecords=10&recordSchema=marc21&recordPacking=xml&version=1.1 -# with stylesheet: -# http://swb2.bsz-bw.de/sru/DB=2.1/username=/password=/?query=pica.ppn+%3D+%22494384174%22&version=1.1&operation=searchRetrieve&stylesheet=http%3A%2F%2Fswb2.bsz-bw.de%2Fsru%2FDB%3D2.1%2F%3Fxsl%3DsearchRetrieveResponse&recordSchema=marc21&maximumRecords=10&startRecord=1&recordPacking=xml&sortKeys=none&x-info-5-mg-requestGroupings=none -sub get_mods_from ($$) { # $mods = ($url, $ppn) - my $url = shift; - my $ppn = shift; # example: "457035137" for "Der Fichtelberg" - #### where to find XSLT - # my $marc_dc_url = 'http://www.loc.gov/standards/marcxml/xslt/MARC21slim2RDFDC.xsl'; - my $marc_mods_url = 'http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3-6.xsl'; + # check MARC21->MODS xsl + sub check_marc21_mods_xsl { + my $xsl_dir = shift; + my $ua = shift; + my $marc_mods_basename = path($marc_mods_url)->basename; + my $marc_mods_path = path($xsl_dir)->child($marc_mods_basename)->stringify; + my $marc_mods_patched_basename = path($marc_mods_url)->basename(".xsl") . ".patched.xsl"; + my $marc_mods_patched_path = path($xsl_dir)->child($marc_mods_patched_basename); + if (! $marc_mods_patched_path->is_file) { + say "Downloading MARC21->MODS xsl '$marc_mods_url'"; + my $result = $ua->get($marc_mods_url); + if ($result->is_error) { + croak "Failed to download '$marc_mods_url', " . 
$result->error_as_HTML; + } + say "Modifying MARC21->MODS xsl for offline use"; + my $xsl = $result->decoded_content; + write_file($marc_mods_path, $xsl); + my $xsl_modified = $xsl; + my $marc_utils_path = check_marc21_utility( $xsl_dir, $ua); + $xsl_modified =~ s#$marc_utils_url#$marc_utils_path#g; + say "Saving MARC21->MODS xsl to file '$marc_mods_path'"; + write_file($marc_mods_patched_path, $xsl_modified); + } + return $marc_mods_patched_path; + } - my $ua = LWP::UserAgent->new; - $ua->agent("MyApp/0.1 "); - $ua->timeout(3600); #1h - my $srubase=$url; # host - my $srusearchkey="pica.ppn"; - my $sruvalue=$ppn; - my $srumaxrecords=1; - #my $sruschema="marcxml"; - my $sruschema="marc21"; - #my $sru = "${srubase}?query=${srusearchkey}+%3D+%22${sruvalue}%22&startRecord=1&maximumRecords=${srumaxrecords}&recordSchema=${sruschema}"; - my $sru = "${srubase}?query=${srusearchkey}+%3D+%22${sruvalue}%22&startRecord=1&maximumRecords=${srumaxrecords}&recordSchema=${sruschema}&recordPacking=xml&version=1.1&stylesheet=http%3A%2F%2Fswb2.bsz-bw.de%2Fsru%2FDB%3D2.1%2F%3Fxsl%3DsearchRetrieveResponse"; - #p ($sru); # debug output - my $record = $ua->get($sru); # ask SWB for given PPN - if ($record->is_success) { - # parse ZiNG repsonse, extract MARC-data - my $xp = XML::XPath->new( $record->decoded_content ); - my $parser = XML::LibXML->new(); - if ($with_debug) { - say "write DEBUG_${ppn}_response.xml"; - write_file("DEBUG_${ppn}_response.xml", {binmode => ':utf8'}, $record->decoded_content); + sub check_xsl_directory { + # check xsl directory + my $xsl_dir = path(__FILE__)->parent->realpath->parent->child("xsl"); + if (! 
$xsl_dir->is_dir) { + say "Rebuilding XSL directory '$xsl_dir'"; + $xsl_dir->mkpath() || confess("could not mkdir '$xsl_dir', $!"); } - my $marcblob = $parser->parse_string( - $xp->findnodes_as_string('/*[local-name()="searchRetrieveResponse"]/*[local-name()="records"]/*[local-name()="record"]/*[local-name()="recordData"]/*') - ); - my $marcblob_patched = patch_marc_response( $marcblob ); - if ($with_debug) { - say "write DEBUG_${ppn}_marc_unpatched.xml"; - write_file("DEBUG_${ppn}_marc_unpatched.xml", {binmode => ':utf8'}, $marcblob); - say "write DEBUG_${ppn}_marc.xml"; - write_file("DEBUG_${ppn}_marc.xml", {binmode => ':utf8'}, $marcblob_patched); + return $xsl_dir; + } + + # specification SRU/SRW BSZ: https://wiki.k10plus.de/pages/viewpage.action?pageId=132874251 + sub get_mods_from($$$$) { + # $mods = ($url, $ppn, $searchkey, $recordschema) + my $url = shift; + my $ppn = shift; # example: "457035137" for "Der Fichtelberg" + my $key = shift; + my $schema = shift; + + + #### where to find XSLT + + + + + my $ua = LWP::UserAgent->new; + $ua->agent("MyApp/0.1 "); + $ua->timeout(3600); #1h + + my $xsl_dir = check_xsl_directory(); + check_marc21_utility($xsl_dir, $ua); + check_marc21_mods_xsl($xsl_dir, $ua); + + + my $srubase = $url; # host + my $srusearchkey = $key; # SRU search key + my $sruvalue = $ppn; + my $srumaxrecords = 1; + my $srustartrecord = 1; + my $sruschema = $schema; + my $sru = "${srubase}?version=1.1&query=${srusearchkey}%3D${sruvalue}&operation=searchRetrieve&maximumRecords=${srumaxrecords}&startRecord=${srustartrecord}&recordSchema=${sruschema}"; + if ($with_debug) {say "catalog-URL='$sru'";} + my $response = $ua->get($sru); # ask SWB for given PPN + if ($response->is_success) { + # parse ZiNG repsonse, extract MARC-data + my $xp = XML::XPath->new($response->decoded_content); + my $parser = XML::LibXML->new(); + if ($with_debug) { + say "write DEBUG_${ppn}_response.xml"; + write_file("DEBUG_${ppn}_response.xml", $response->decoded_content); + } + 
my $recordData = $xp->findnodes_as_string('/*[local-name()="searchRetrieveResponse"]/*[local-name()="records"]/*[local-name()="record"]/*[local-name()="recordData"]/*'); + if (!$recordData) { croak("ERROR: Did not get any <recordData/> for PPN '$ppn' using '$sru'");} + my $marcblob = $parser->parse_string($recordData); + + + my $marcblob_patched = patch_marc_response($marcblob); + if ($with_debug) { + say "write DEBUG_${ppn}_marc_unpatched.xml"; + write_file("DEBUG_${ppn}_marc_unpatched.xml", $marcblob); + say "write DEBUG_${ppn}_marc.xml"; + write_file("DEBUG_${ppn}_marc.xml", $marcblob_patched); + } + my $marc_mods_patched_path = check_marc21_mods_xsl($xsl_dir, $ua); + my $xslt = XML::LibXSLT->new(); + my $marcmods = XML::LibXML->load_xml(location => $marc_mods_patched_path, no_cdata => 1); + my $stylesheet = $xslt->parse_stylesheet($marcmods); + my $marc = $parser->parse_string($marcblob_patched); + my $result = $stylesheet->transform($marc); + if ($with_debug) { + say "write DEBUG_${ppn}_unpatched_mods.xml"; + write_file("DEBUG_${ppn}_unpatched_mods.xml", $stylesheet->output_string($result)); + } + + $result = patch_mods($result); + my $result_string = $stylesheet->output_string($result); + return $result_string; } - my $xslt = XML::LibXSLT->new(); - my $marcmods = XML::LibXML->load_xml(location=>$marc_mods_url, no_cdata=>1); - my $stylesheet = $xslt->parse_stylesheet ( $marcmods); - my $marc = $parser->parse_string( $marcblob_patched ); - my $result = $stylesheet->transform( $marc); - if ($with_debug) { - say "write DEBUG_${ppn}_unpatched_mods.xml"; - write_file("DEBUG_${ppn}_unpatched_mods.xml", {binmode => ':utf8'}, $stylesheet->output_string( $result )); + else { + carp("Problem asking catalogue at $url using $ppn"); } - $result = patch_mods( $result); - my $result_string = $stylesheet->output_string( $result ); - return $result_string; - } else { - carp ("Problem asking catalogue at $url using $ppn"); + return; } - return; -} 
-#=============================================================================== - -my $directory; -my $ppn; -my $noppn; -my $output; -my $url; -my $as_zip; -my $external_id; -my $external_workflow; -my $external_isil=""; -my $external_value_descr; -my $external_conservation_flag; +sub create_filecopyhash { + my $directory = shift; + my $content = shift; + my %filecopyhash; + my $wanted=sub { + if (-d $_) { + # dir, do nothing + (); + } else { + my $file=$File::Find::name; + if ($file !~ m#^[-A-Za-z0-9_\.:\\/]+$#) { + confess("file '$file' does not match regex '^[-A-Za-z0-9_\.:\\/]+\$'"); + } + my $source = $file; + $filecopyhash{$source}->{'source'}=$file; + $file=~s#^$directory/?##; + $filecopyhash{$source}{'relative'}="data/$file"; + $filecopyhash{$source}{'target'}="$content/$file"; + my $fh; + open($fh, "<", $source) or confess ("Can't open '$source', $!"); + binmode($fh); + my $ctx = Digest::MD5->new; + $ctx->addfile(*$fh); + close ($fh); + my $md5 = $ctx->hexdigest; + $filecopyhash{$source}{'md5sum'}=$md5; + } + }; -our $VERSION = '1.0'; -GetOptions( - "IE_directory=s" => \$directory, - "ppn=s" => \$ppn, - "noppn=s" => \$noppn, - "SIP_output_path=s" => \$output, - "as_zip" => \$as_zip, - "url=s" => \$url, - "external_id=s" => \$external_id, - "external_workflow=s" => \$external_workflow, - "external_ISIL=s" => \$external_isil, - "external_value_descr=s" => \$external_value_descr, - "external_conservation_flag" => \$external_conservation_flag, - "debug" => \$with_debug, - - "help" => sub { pod2usage(1); exit(0); }, - ) or pod2usage(2); - -if (!defined $directory) { confess ("you need to specify an IE directory, which needs to be archived"); } -if (!defined $ppn && !defined $noppn) { confess ("you need to specify a PPN, which exists in SWB catalogue"); } -if (defined $ppn && defined $noppn) {confess ("you could only use --ppn=foo or --noppn=bar"); } -if (!defined $output) { confess (" you need to specify an output path, where the SIP will be stored"); } -if 
($output !~ m#^/#) { confess("you need to specify an output path using absoluet paths, $!"); } -if (!defined $url) { $url = "http://swb.bsz-bw.de/sru/DB=2.1/username=/password=/";} -if (!defined $external_conservation_flag) { $external_conservation_flag="false"; } else { $external_conservation_flag="true"; } -# additional checks -if (! -d $directory) { confess("you need to specify an IE directory, which needs to be archived, $!"); } -if ($directory !~ m#^/#) { confess("you need to specify an IE directory using absoluet paths, $!"); } -#if (! -d $output) { confess("you need to specify an output path, where the SIP will be stored, $!"); } - -# get date -my $export_to_archive_date = DateTime->now->iso8601();# -my $file_date = $export_to_archive_date; -$file_date =~ s/T/_/g; -$file_date =~ s/:/-/g; -# create output dir -mkpath "$output" || confess("could not create SIP directory for '$output', $!"); -my $sip_root_dir = "PPN-${ppn}_${file_date}"; -my $content = "$output/$sip_root_dir/data"; -if (!defined $as_zip) { - mkpath "$output/$sip_root_dir" || confess("could not create SIP directory for '$output/$sip_root_dir', $!"); - mkpath "$content" || confess("could not create SIP subdirectory for '$content', $!"); + finddepth($wanted, $directory); + return \%filecopyhash; } - -# prepare dmd-sec -my $mods; -if (defined $ppn) { - $mods = get_mods_from($url, $ppn); - if (1 == $with_debug) { - write_file("DEBUG_${ppn}_mods.xml", {binmode => ':utf8'}, $mods); + sub prepare_dmd_section_with_ppn ($) { + my $ppn = shift; + my $mods = SLUB::LZA::SIPBuilder::get_mods_from($swb_url, $ppn, $searchkey, $recordschema); + if ($with_debug) { + SLUB::LZA::SIPBuilder::write_file("DEBUG_${ppn}_mods.xml", $mods); + } + # remove the <xml /> from beginning of the answer + $mods=~ s#<\?xml version="1.0" encoding="UTF-8"\?>#<!-- removed xml header from mods part -->#; + my $dmd =<<"DMD"; +<mets:dmdSec ID="DMDLOG_0000"> + <!-- bibliographic metadata --> + <mets:mdWrap MDTYPE="MODS"> + 
<mets:xmlData> + $mods + </mets:xmlData> + </mets:mdWrap> +</mets:dmdSec> +DMD + return $dmd; } -# remove the <xml /> from beginning of the answer - $mods=~ s#<\?xml version="1.0" encoding="UTF-8"\?>#<!-- removed xml header from mods part -->#; -} elsif (defined $noppn) { - $mods =<<MODS; -<mods version="3.5" + + sub prepare_dmd_section_with_noppn ($) { + my $noppn = shift; + my $mods =<<"MODS"; +<mods version="3.6" xmlns="http://www.loc.gov/mods/v3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-5.xsd"> + xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd"> <identifier>$noppn</identifier> </mods> MODS -} -my $dmd =<<DMD; + my $dmd =<<"DMD"; <mets:dmdSec ID="DMDLOG_0000"> <!-- bibliographic metadata --> <mets:mdWrap MDTYPE="MODS"> @@ -260,9 +321,17 @@ my $dmd =<<DMD; </mets:mdWrap> </mets:dmdSec> DMD + return $dmd; + } -# prepare amd-sec -my $amd =<<AMD; + sub prepare_amd_section($$$$$$) { + my $export_to_archive_date = shift; + my $external_workflow = shift; + my $external_id = shift; + my $external_conservation_flag = shift; + my $external_isil = shift; + my $external_value_descr = shift; + my $amd =<<"AMD"; <mets:amdSec ID="AMD"> <!-- SIP metadata for automated processing by submission application --> <mets:techMD ID="ARCHIVE"> @@ -281,80 +350,149 @@ my $amd =<<AMD; </mets:techMD> </mets:amdSec> AMD - -# create filecopyhash -my %filecopyhash; -my $wanted=sub { - if (-d $_) { - # dir, do nothing - () - } else { - my $file=$File::Find::name; - if ($file !~ m#^[-A-Za-z0-9_\./]+$#) { - confess("file '$file' does not match regex '^[-A-Za-z0-9_\./]+\$'"); - } - my $source = $file; - $filecopyhash{$source}->{'source'}=$file; - $file=~s#^$directory/?##; - $filecopyhash{$source}{'relative'}="data/$file"; - $filecopyhash{$source}{'target'}="$content/$file"; - my $fh; - open($fh, "<", $source) or confess ("Can't open '$source' 
(current='",getcwd,"', $!\n"); - binmode($fh); - my $ctx = Digest::MD5->new; - $ctx->addfile(*$fh); - close ($fh); - my $md5 = $ctx->hexdigest; - $filecopyhash{$source}{'md5sum'}=$md5; + return $amd; } -}; - -finddepth($wanted, $directory); -# create fileSec -my $filesec=<<FILESEC1; + sub prepare_files_sections($) { + my $filecopyhash = shift; + my @fsec; + my $i=0; + foreach my $fkey (sort keys (%{$filecopyhash})) { + push @fsec, sprintf("<mets:file ID=\"FILE_%015u_LZA\" CHECKSUMTYPE=\"MD5\" CHECKSUM=\"%s\">", $i, $filecopyhash->{$fkey}->{"md5sum"}); + push @fsec, sprintf("<mets:FLocat xmlns:xlink=\"http://www.w3.org/1999/xlink\" LOCTYPE=\"URL\" xlink:href=\"file://%s\"/>", $filecopyhash->{$fkey}->{"relative"}); + push @fsec, "</mets:file>"; + $i++; + } + my $files = join("\n", @fsec); + my $filesec=<<"FILESEC"; <mets:fileSec> <mets:fileGrp USE="LZA"> -FILESEC1 -{ - my @fsec; - my $i=0; - foreach my $fkey (sort keys (%filecopyhash)) { - push @fsec, sprintf("<mets:file ID=\"FILE_%015u_LZA\" CHECKSUMTYPE=\"MD5\" CHECKSUM=\"%s\">", $i, $filecopyhash{$fkey}->{"md5sum"}); - push @fsec, sprintf("<mets:FLocat xmlns:xlink=\"http://www.w3.org/1999/xlink\" LOCTYPE=\"URL\" xlink:href=\"file://%s\"/>", $filecopyhash{$fkey}->{"relative"}); - push @fsec, "</mets:file>"; - $i++; - } - $filesec = join("\n", $filesec, @fsec); -} -$filesec = $filesec . 
<<FILESEC2; + $files </mets:fileGrp> </mets:fileSec> -FILESEC2 +FILESEC + return $filesec; + } -# prepare structmap -my $structmap =<<STRUCTMAP1; + sub prepare_struct_map($) { + my $filecopyhash = shift; + my @ssec; + my $i=0; + foreach my $fkey (sort keys (%{$filecopyhash})) { + push @ssec, sprintf("<mets:div ID=\"PHYS_%015u_LZA\" TYPE=\"fileorderSequence\">", $i); + push @ssec, sprintf("<mets:fptr FILEID=\"FILE_%015u_LZA\" />", $i); + push @ssec, "</mets:div>"; + $i++; + } + my $structs = join("\n", @ssec); + my $structmap =<<"STRUCTMAP"; <mets:structMap TYPE="PHYSICAL"> <mets:div ID="PHYS_0000" TYPE="ieDir"> -STRUCTMAP1 -{ - my @ssec; - my $i=0; - foreach my $fkey (sort keys (%filecopyhash)) { - push @ssec, sprintf("<mets:div ID=\"PHYS_%015u_LZA\" TYPE=\"fileorderSequence\">", $i); - push @ssec, sprintf("<mets:fptr FILEID=\"FILE_%015u_LZA\" />", $i); - push @ssec, "</mets:div>"; - $i++; - } - $structmap = join("\n", $structmap, @ssec); -} -$structmap = $structmap . <<STRUCTMAP2; + $structs </mets:div> </mets:structMap> -STRUCTMAP2 +STRUCTMAP + return $structmap; + } + + # end package + +package main; +#=============================================================================== + +BEGIN{ + $INC{'SLUB/LZA/SIPBuilder.pm'} = 1; # needed because inlined module +} +return 1 if caller; # avoids main code running if module stuff is needed +use SLUB::LZA::SIPBuilder; +use Archive::Zip::SimpleZip qw($SimpleZipError); +use Getopt::Long; +use Path::Tiny; +use Digest::MD5; +use constant buffer => 100 * 1024 * 1024; # use 100MB as Buffer +use File::Find; +use File::Copy qw(cp); +use Pod::Usage; + +my $directory; +my $ppn; +my $noppn; +my $output; +my $as_zip; +my $external_id; +my $external_workflow; +my $external_isil=""; +my $external_value_descr; +my $external_conservation_flag; -# create sip.xml -my $sip =<<METS; +my $help; +my $man; + + +GetOptions( + "IE_directory=s" => \$directory, # required + "ppn=s" => \$ppn, # semi-optional (choice 1 of 2) + "noppn=s" => 
\$noppn, # semi-optional (choice 2 of 2) + "SIP_output_path=s" => \$output, # required + "as_zip" => \$as_zip, # optional, default: do not zip + "external_id=s" => \$external_id, # required + "external_workflow=s" => \$external_workflow, # required + "external_ISIL=s" => \$external_isil, # optional, default: no ISIL + "external_value_descr=s" => \$external_value_descr, # required + "external_conservation_flag" => \$external_conservation_flag, # optional, default: no special conservation + "debug" => \$SLUB::LZA::SIPBuilder::with_debug, # optional + "help|?" => \$help, # optional + "man" => \$man, # optional +) or pod2usage(2); + +if ($help) { pod2usage(1); } +if ($man) { pod2usage(-exitval => 0, -verbose => 2); } +if (!defined $directory) { confess("you need to specify an IE directory, which needs to be archived"); } +if ((defined $ppn) && (defined $noppn)) { confess("you can only specify either -ppn or -noppn"); } +if ((!defined $ppn) && (!defined $noppn)) { confess("you need to specify a PPN with -ppn or use --noppn"); } +if (!defined $output) { confess("you need to specify an output path, where the SIP will be stored"); } +if (!defined $external_conservation_flag) { $external_conservation_flag="false"; } else { $external_conservation_flag="true"; } +if (! 
-d $directory) { confess("you need to specify an IE directory, which needs to be archived, $!"); } +$directory = path($directory)->realpath->stringify; +path($output)->mkpath; +$output = path($output)->realpath->stringify; +if ($external_id !~ m#^[a-z0-9]+$#) { confess("you need to specify a valid external ID (^[a-z0-9]+\$)"); } +if ($external_workflow !~ m#^[a-z0-9]+$#) { confess("you need to specify a valid external workflow (^[a-z0-9]+\$)"); } +if (!$external_value_descr) { confess("you need to specify an external value description (reason for archiving)"); } + +#=============================================================================== + +sub main { + # get date + my $export_to_archive_date = DateTime->now->iso8601(); + my $file_date = $export_to_archive_date; + $file_date =~ s/T/_/g; # replace 'T' with '_' + $file_date =~ s/:/-/g; # replace ':' with '-' + # prepare dirs + my $sip_root_dir = (defined $ppn)? "PPN-${ppn}_${file_date}" : "ID-${noppn}_${file_date}"; + my $content = path($output)->child($sip_root_dir)->child("data")->stringify; + if (!defined $as_zip) { + path($content)->mkpath; + } + my $filecopyhash = SLUB::LZA::SIPBuilder::create_filecopyhash($directory, $content); + + # prepare dmd-sec + my $dmd = (defined $ppn)? 
SLUB::LZA::SIPBuilder::prepare_dmd_section_with_ppn( $ppn ) : SLUB::LZA::SIPBuilder::prepare_dmd_section_with_noppn( $noppn ); + # prepare amd-sec + my $amd = SLUB::LZA::SIPBuilder::prepare_amd_section( + $export_to_archive_date, + $external_workflow, + $external_id, + $external_conservation_flag, + $external_isil, + $external_value_descr + ); + # create fileSec + my $filesec = SLUB::LZA::SIPBuilder::prepare_files_sections($filecopyhash); + # prepare structmap + my $structmap = SLUB::LZA::SIPBuilder::prepare_struct_map($filecopyhash); + # create sip.xml + my $sip =<<"METS"; <?xml version="1.0" encoding="utf-8"?> <mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" @@ -366,38 +504,48 @@ my $sip =<<METS; </mets:mets> METS -# compress if needed -if (!defined $as_zip) { - write_file( "${output}/${sip_root_dir}/sip.xml",{binmode => ':utf8'}, $sip ); - # copy source to target - foreach my $source (sort keys (%filecopyhash)) { - my $target = $filecopyhash{$source}->{"target"}; - my $basename = dirname($target); - #say "cp $source, $target ($basename)"; - if (! -d $basename) { - mkpath $basename || confess ("could not mkdir '$basename', $!"); + # write stuff out + if (!defined $as_zip) { + SLUB::LZA::SIPBuilder::write_file( path($output)->child($sip_root_dir)->child("sip.xml")->stringify, $sip ); + # copy source to target + foreach my $source (sort keys (%{$filecopyhash})) { + my $target = path($filecopyhash->{$source}->{"target"})->stringify; # CHECK ON WINDOWS + my $basename = path($target)->parent->stringify; + if (! 
-d $basename) { + path($basename)->mkpath; + } + cp($source, $target, buffer) || confess ("could not copy from '$source' to '$target', $!"); } - cp($source, $target, buffer) || confess ("could not copy from '$source' to '$target', $!"); - } - say "SIP '$sip_root_dir' build successfully in '$output'"; -} else { - # compress it - my $zip_file_path = "$output/$sip_root_dir.zip"; - my $zip = Archive::Zip::SimpleZip->new( $zip_file_path, Zip64=>1 ); - $zip->addString($sip, Name=> "$sip_root_dir/sip.xml" ); - # copy source to target - foreach my $source (sort keys (%filecopyhash)) { - my $target = "$sip_root_dir/".$filecopyhash{$source}->{"relative"}; - my $basename = dirname($target); - #say "cp $source, $target ($basename)"; - $zip->add( $source, Name=> $target) || confess ("could not zip copy from '$source' to '$target', $!"); - } - unless ( $zip->close()) { - confess "write error to '$zip_file_path', $SimpleZipError, $!"; + say "SIP '$sip_root_dir' build successfully in '$output'"; + } else { + # compress it + my $zip_file_path = path($output)->child("$sip_root_dir.zip")->stringify; + my $zip = Archive::Zip::SimpleZip->new( $zip_file_path, Zip64=>1 ); + $zip->addString($sip, Name=>path($sip_root_dir)->child("sip.xml")->stringify); + # copy source to target + foreach my $source (sort keys (%{$filecopyhash})) { + my $target = path($sip_root_dir)->child($filecopyhash->{$source}->{"relative"})->stringify; # CHECK ON WINDOWS + my $basename = path($target)->parent->stringify; + $zip->add( $source, Name=> $target) || confess ("could not zip copy from '$source' to '$target', $!"); + } + unless ( $zip->close()) { + confess "write error to '$zip_file_path', $SimpleZipError, $!"; + } + say "SIP '$sip_root_dir' build successfully in '$zip_file_path'"; } - say "SIP '$sip_root_dir' build successfully in '$zip_file_path'"; + return; } + +#=============================================================================== + +main(); + 
+#=============================================================================== + + +__END__ + =pod =head1 NAME @@ -413,17 +561,16 @@ slubsipbuilder.pl [options] -man full documentation -IE_directory=<IE dir> existing IE directory (absolute path!) - -ppn=<ppn>|-noppn=<noppn> PPN (swb catalogue) or any identifier (uses minimalistic MODS) + -ppn=<ppn>|-noppn=<noppn> SWB-PPN or any identifier (uses minimalistic MODS) -SIP_output_path=<target dir> where to put the SIP dir (absolute path!) -as_zip optional, if set a ZIP will be created - -url=<SRU url> optional, URL of the SRU for PICA catalogues -external_id=<id> mandatory, should be uniqe ID -external_workflow=<workflow> mandatory, should be uniqe workflow name -external_ISIL=<isil> optional, ISIL number of library -external_value_descr=<text> mandatory, the reason why to archive -external_conservation_flag optional, if set no other "original" still exists -slubsipbuilder.pl --IE_directory=/processdir_from_goobi/10008 --ppn=457035137 --SIP_output_path=/tmp/mysip --external_id=10008 --external_workflow=goobitest --external_ISIL=de-14 --external_value_descr="Gesetzlicher Auftrag" --as_zip +slubsipbuilder.pl --IE_directory=/export_dir_kitodo/10008 --ppn=457035137 --SIP_output_path=/tmp/mysip --external_id=10008 --external_workflow=kitodo --external_ISIL=DE-14 --external_value_descr="Gesetzlicher Auftrag" =head1 OPTIONS @@ -440,4 +587,3 @@ Print a brief help message and exits. 
B<This program> will process the given IE directory, add bibliographic metadata from catalogue with given PICA number and check and create a SIP directory ready for SLUBarchiv =cut -# vim: set tabstop=4 diff --git a/t/slubsipbuilder.t b/t/slubsipbuilder.t new file mode 100644 index 0000000000000000000000000000000000000000..30945bcd0abe33b68370ab5210f351f0f4a5525f --- /dev/null +++ b/t/slubsipbuilder.t @@ -0,0 +1,326 @@ +#!/usr/bin/perl -w +use strict; +use warnings; +use diagnostics; + +use Test::More tests => 8; +use Test::Exception; +use Test::File; +use Path::Tiny; + +### prepare +BEGIN { + use Path::Tiny; + push @INC, Path::Tiny::path(__FILE__)->parent->parent->path("bin")->absolute->stringify; + require "slubsipbuilder.pl"; + $INC{'SLUB/LZA/SIPBuilder.pm'} = 1; # needed because inlined module +} +my $unpatched_mods=<<'UNPATCHED_MODS'; +<?xml version="1.0" encoding="UTF-8"?> +<mods xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/mods/v3" version="3.6" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd"><titleInfo><nonSort xml:space="preserve">Der </nonSort><title>Fichtelberg</title><subTitle>Berg der unbekannten Rekorrde</subTitle></titleInfo><name type="personal"><namePart>Schneider, Dirk</namePart><role><roleTerm type="text">FilmemacherIn</roleTerm></role><role><roleTerm authority="marcrelator" type="code">fmk</roleTerm></role><nameIdentifier>(DE-627)1235502279 (DE-576)165502274</nameIdentifier></name><typeOfResource>moving image</typeOfResource><genre authority="rdacontent">zweidimensionales bewegtes Bild</genre><genre authority="gnd-content">Film</genre><originInfo><place><placeTerm type="code" authority="marccountry">xx</placeTerm></place><dateIssued encoding="marc">2014</dateIssued><issuance>monographic</issuance></originInfo><originInfo eventType="publication"><place><placeTerm type="text">[Leipzig]</placeTerm></place><publisher>top ten 
tv</publisher><dateIssued>[2014]</dateIssued></originInfo><language><languageTerm authority="iso639-2b" type="code">ger</languageTerm></language><physicalDescription><form authority="marccategory">electronic resource</form><form authority="marcsmd">remote</form><extent>1 Online-Ressource (1 Videodatei, 29:49) farbig</extent><form type="media" authority="rdamedia">Computermedien</form><form type="carrier" authority="rdacarrier">Online-Ressource</form></physicalDescription><targetAudience authority="marctarget">juvenile</targetAudience><note type="statement of responsibility" altRepGroup="00">ein Film von Dirk Schneider</note><note>Dokumentarfilm. Deutschland. 2014</note><relatedItem type="series"><titleInfo><title>MDR</title></titleInfo></relatedItem><relatedItem type="series"><titleInfo><title>Der Osten - entdecke wo du lebst</title></titleInfo></relatedItem><identifier type="oclc">946544758</identifier><recordInfo><descriptionStandard>rda</descriptionStandard><recordContentSource authority="marcorg">DE-576</recordContentSource><recordCreationDate encoding="marc">160304</recordCreationDate><recordChangeDate encoding="iso8601">20160510144338.0</recordChangeDate><recordIdentifier source="DE-576">457035137</recordIdentifier><recordOrigin>Converted from MARCXML to MODS version 3.6 using MARC21slim2MODS3-6.xsl + (Revision 1.119 2018/06/21)</recordOrigin><languageOfCataloging><languageTerm authority="iso639-2b" type="code">ger</languageTerm></languageOfCataloging></recordInfo></mods> +UNPATCHED_MODS +my $patched_mods=<<'PATCHED_MODS'; +<?xml version="1.0" encoding="UTF-8"?> +<mods xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/mods/v3" version="3.6" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd"><titleInfo><nonSort xml:space="preserve">Der </nonSort><title>Fichtelberg</title><subTitle>Berg der unbekannten Rekorrde</subTitle></titleInfo><name type="personal"><namePart>Schneider, 
Dirk</namePart><role><roleTerm type="text">FilmemacherIn</roleTerm></role><role><roleTerm authority="marcrelator" type="code">fmk</roleTerm></role><nameIdentifier>(DE-627)1235502279 (DE-576)165502274</nameIdentifier></name><typeOfResource>moving image</typeOfResource><genre authority="rdacontent">zweidimensionales bewegtes Bild</genre><genre authority="gnd-content">Film</genre><originInfo><place><placeTerm type="code" authority="marccountry">xx</placeTerm></place><dateIssued encoding="marc">2014</dateIssued><issuance>monographic</issuance></originInfo><originInfo eventType="publication"><place><placeTerm type="text">[Leipzig]</placeTerm></place><publisher>top ten tv</publisher><dateIssued>[2014]</dateIssued></originInfo><language><languageTerm authority="iso639-2b" type="code">ger</languageTerm></language><physicalDescription><form authority="marccategory">electronic resource</form><form authority="marcsmd">remote</form><extent>1 Online-Ressource (1 Videodatei, 29:49) farbig</extent><form type="media" authority="rdamedia">Computermedien</form><form type="carrier" authority="rdacarrier">Online-Ressource</form></physicalDescription><targetAudience authority="marctarget">juvenile</targetAudience><note type="statement of responsibility" altRepGroup="00">ein Film von Dirk Schneider</note><note>Dokumentarfilm. Deutschland. 
2014</note><relatedItem type="series"><titleInfo><title>MDR</title></titleInfo></relatedItem><relatedItem type="series"><titleInfo><title>Der Osten - entdecke wo du lebst</title></titleInfo></relatedItem><identifier type="oclc">946544758</identifier><recordInfo><descriptionStandard>rda</descriptionStandard><recordContentSource authority="marcorg">DE-576</recordContentSource><recordCreationDate encoding="marc">160304</recordCreationDate><recordChangeDate encoding="iso8601">20160510144338.0</recordChangeDate><recordIdentifier source="DE-576">457035137</recordIdentifier><recordOrigin>Converted from MARCXML to MODS version 3.6 using MARC21slim2MODS3-6.xsl + (Revision 1.119 2018/06/21)</recordOrigin><languageOfCataloging><languageTerm authority="iso639-2b" type="code">ger</languageTerm></languageOfCataloging></recordInfo></mods> +PATCHED_MODS +my $unpatched_mods_obj = XML::LibXML->load_xml(string => $unpatched_mods); + +my $unpatched_marcblob=<<'UNPATCHED_MARCBLOB'; +<?xml version="1.0"?> +<record xmlns="http://www.loc.gov/MARC21/slim"> + <leader> cgm a22 4500</leader> + <controlfield tag="001">457035137</controlfield> + <controlfield tag="003">DE-576</controlfield> + <controlfield tag="005">20160510144338.0</controlfield> + <controlfield tag="006">m o | | </controlfield> + <controlfield tag="007">cr uuu---uuuuu</controlfield> + <controlfield tag="007">vu uuuuuu</controlfield> + <controlfield tag="008">160304s2014 xx ger c</controlfield> + <datafield tag="035" ind1=" " ind2=" "> + <subfield code="a">(DE-627)1655506501</subfield> + </datafield> + <datafield tag="035" ind1=" " ind2=" "> + <subfield code="a">(DE-576)457035137</subfield> + </datafield> + <datafield tag="035" ind1=" " ind2=" "> + <subfield code="a">(DE-599)BSZ457035137</subfield> + </datafield> + <datafield tag="035" ind1=" " ind2=" "> + <subfield code="a">(OCoLC)946544758</subfield> + </datafield> + <datafield tag="040" ind1=" " ind2=" "> + <subfield code="a">DE-576</subfield> + <subfield 
code="b">ger</subfield> + <subfield code="c">DE-576</subfield> + <subfield code="e">rda</subfield> + </datafield> + <datafield tag="041" ind1=" " ind2=" "> + <subfield code="a">ger</subfield> + </datafield> + <datafield tag="245" ind1="1" ind2="4"> + <subfield code="a">Der Fichtelberg</subfield> + <subfield code="b">Berg der unbekannten Rekorrde</subfield> + <subfield code="c">ein Film von Dirk Schneider</subfield> + </datafield> + <datafield tag="264" ind1=" " ind2="1"> + <subfield code="a">[Leipzig]</subfield> + <subfield code="b">top ten tv</subfield> + <subfield code="c">[2014]</subfield> + </datafield> + <datafield tag="264" ind1=" " ind2="4"> + <subfield code="c">© 2014</subfield> + </datafield> + <datafield tag="300" ind1=" " ind2=" "> + <subfield code="a">1 Online-Ressource (1 Videodatei, 29:49)</subfield> + <subfield code="b">farbig</subfield> + </datafield> + <datafield tag="336" ind1=" " ind2=" "> + <subfield code="a">zweidimensionales bewegtes Bild</subfield> + <subfield code="b">tdi</subfield> + <subfield code="2">rdacontent</subfield> + </datafield> + <datafield tag="337" ind1=" " ind2=" "> + <subfield code="a">Computermedien</subfield> + <subfield code="b">c</subfield> + <subfield code="2">rdamedia</subfield> + </datafield> + <datafield tag="338" ind1=" " ind2=" "> + <subfield code="a">Online-Ressource</subfield> + <subfield code="b">cr</subfield> + <subfield code="2">rdacarrier</subfield> + </datafield> + <datafield tag="490" ind1="0" ind2=" "> + <subfield code="a">MDR</subfield> + </datafield> + <datafield tag="490" ind1="0" ind2=" "> + <subfield code="a">Der Osten - entdecke wo du lebst</subfield> + </datafield> + <datafield tag="500" ind1=" " ind2=" "> + <subfield code="a">Dokumentarfilm. Deutschland. 
2014</subfield> + </datafield> + <datafield tag="591" ind1=" " ind2=" "> + <subfield code="a">Fernsehmitschnitt (SWB)</subfield> + </datafield> + <datafield tag="655" ind1=" " ind2="7"> + <subfield code="a">Film</subfield> + <subfield code="0">(DE-588)4017102-4</subfield> + <subfield code="0">(DE-627)104559683</subfield> + <subfield code="0">(DE-576)208918531</subfield> + <subfield code="2">gnd-content</subfield> + </datafield> + <datafield tag="700" ind1="1" ind2=" "> + <subfield code="a">Schneider, Dirk</subfield> + <subfield code="e">FilmemacherIn</subfield> + <subfield code="0">(DE-627)1235502279</subfield> + <subfield code="0">(DE-576)165502274</subfield> + <subfield code="4">fmk</subfield> + </datafield> + <datafield tag="935" ind1=" " ind2=" "> + <subfield code="c">vide</subfield> + </datafield> + <datafield tag="937" ind1=" " ind2=" "> + <subfield code="a">Dokumentarfilm</subfield> + <subfield code="b">Deutschland</subfield> + <subfield code="c">2014</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">000 xxxxxcx a22 zn 4500</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">001 901795887</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">003 DE-576</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">004 457035137</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">005 20160510125331</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">008 160304||||||||||||||||ger|||||||</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">040 </subfield> + <subfield code="a">DE-14</subfield> + <subfield code="c">DE-576</subfield> + <subfield code="d">DE-14</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">852 </subfield> + <subfield code="z">Fernsehmitschnitt: MDR, 04.02.2014. 
- Beilage</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">852 </subfield> + <subfield code="a">DE-14</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">852 1</subfield> + <subfield code="9">00</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">866 </subfield> + <subfield code="x">ddsu,pn</subfield> + </datafield> +</record> +UNPATCHED_MARCBLOB + +my $patched_marcblob =<<'PATCHED_MARCBLOB'; +<?xml version="1.0"?> +<record xmlns="http://www.loc.gov/MARC21/slim"> + <leader> cgm a22 4500</leader> + <controlfield tag="001">457035137</controlfield> + <controlfield tag="003">DE-576</controlfield> + <controlfield tag="005">20160510144338.0</controlfield> + <controlfield tag="006">m o | | </controlfield> + <controlfield tag="007">cr uuu---uuuuu</controlfield> + <controlfield tag="007">vu uuuuuu</controlfield> + <controlfield tag="008">160304s2014 xx ger c</controlfield> + <datafield tag="035" ind1=" " ind2=" "> + <subfield code="a">(DE-627)1655506501</subfield> + </datafield> + <datafield tag="035" ind1=" " ind2=" "> + <subfield code="a">(DE-576)457035137</subfield> + </datafield> + <datafield tag="035" ind1=" " ind2=" "> + <subfield code="a">(DE-599)BSZ457035137</subfield> + </datafield> + <datafield tag="035" ind1=" " ind2=" "> + <subfield code="a">(OCoLC)946544758</subfield> + </datafield> + <datafield tag="040" ind1=" " ind2=" "> + <subfield code="a">DE-576</subfield> + <subfield code="b">ger</subfield> + <subfield code="c">DE-576</subfield> + <subfield code="e">rda</subfield> + </datafield> + <datafield tag="041" ind1=" " ind2=" "> + <subfield code="a">ger</subfield> + </datafield> + <datafield tag="245" ind1="1" ind2="4"> + <subfield code="a">Der Fichtelberg</subfield> + <subfield code="b">Berg der unbekannten Rekorrde</subfield> + <subfield code="c">ein Film von Dirk Schneider</subfield> + </datafield> + <datafield tag="264" ind1=" " ind2="1"> + 
<subfield code="a">[Leipzig]</subfield> + <subfield code="b">top ten tv</subfield> + <subfield code="c">[2014]</subfield> + </datafield> + <datafield tag="264" ind1=" " ind2="4"> + <subfield code="c">© 2014</subfield> + </datafield> + <datafield tag="300" ind1=" " ind2=" "> + <subfield code="a">1 Online-Ressource (1 Videodatei, 29:49)</subfield> + <subfield code="b">farbig</subfield> + </datafield> + <datafield tag="336" ind1=" " ind2=" "> + <subfield code="a">zweidimensionales bewegtes Bild</subfield> + <subfield code="b">tdi</subfield> + <subfield code="2">rdacontent</subfield> + </datafield> + <datafield tag="337" ind1=" " ind2=" "> + <subfield code="a">Computermedien</subfield> + <subfield code="b">c</subfield> + <subfield code="2">rdamedia</subfield> + </datafield> + <datafield tag="338" ind1=" " ind2=" "> + <subfield code="a">Online-Ressource</subfield> + <subfield code="b">cr</subfield> + <subfield code="2">rdacarrier</subfield> + </datafield> + <datafield tag="490" ind1="0" ind2=" "> + <subfield code="a">MDR</subfield> + </datafield> + <datafield tag="490" ind1="0" ind2=" "> + <subfield code="a">Der Osten - entdecke wo du lebst</subfield> + </datafield> + <datafield tag="500" ind1=" " ind2=" "> + <subfield code="a">Dokumentarfilm. Deutschland. 
2014</subfield> + </datafield> + <datafield tag="591" ind1=" " ind2=" "> + <subfield code="a">Fernsehmitschnitt (SWB)</subfield> + </datafield> + <datafield tag="655" ind1=" " ind2="7"> + <subfield code="a">Film</subfield> + <subfield code="0">(DE-588)4017102-4</subfield> + <subfield code="0">(DE-627)104559683</subfield> + <subfield code="0">(DE-576)208918531</subfield> + <subfield code="2">gnd-content</subfield> + </datafield> + <datafield tag="700" ind1="1" ind2=" "> + <subfield code="a">Schneider, Dirk</subfield> + <subfield code="e">FilmemacherIn</subfield> + <subfield code="0">(DE-627)1235502279</subfield> + <subfield code="0">(DE-576)165502274</subfield> + <subfield code="4">fmk</subfield> + </datafield> + <datafield tag="935" ind1=" " ind2=" "> + <subfield code="c">vide</subfield> + </datafield> + <datafield tag="937" ind1=" " ind2=" "> + <subfield code="a">Dokumentarfilm</subfield> + <subfield code="b">Deutschland</subfield> + <subfield code="c">2014</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">000 xxxxxcx a22 zn 4500</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">001 901795887</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">003 DE-576</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">004 457035137</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">005 20160510125331</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">008 160304||||||||||||||||ger|||||||</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">040 </subfield> + <subfield code="a">DE-14</subfield> + <subfield code="c">DE-576</subfield> + <subfield code="d">DE-14</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">852 </subfield> + <subfield code="z">Fernsehmitschnitt: MDR, 04.02.2014. 
- Beilage</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">852 </subfield> + <subfield code="a">DE-14</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">852 1</subfield> + <subfield code="9">00</subfield> + </datafield> + <datafield tag="LOK" ind1=" " ind2=" "> + <subfield code="0">866 </subfield> + <subfield code="x">ddsu,pn</subfield> + </datafield> +</record>
+PATCHED_MARCBLOB
+my $unpatched_marcblob_obj = XML::LibXML->load_xml(string => $unpatched_marcblob); # parsed MARC fixture, input for patch_marc_response()
+
+### tests (8 assertions, matching the plan declared with Test::More)
+BEGIN { use_ok("SLUB::LZA::SIPBuilder"); }
+# NOTE(review): get_mods_from() is given the live SWB SRU URL, so it presumably needs network access - confirm. The former pattern qr// matched every string and could never fail; the MODS namespace is required instead (assumes the return value stringifies to MODS XML - TODO confirm).
+like(SLUB::LZA::SIPBuilder::get_mods_from("https://sru.bsz-bw.de/swb", "457035137", "pica.swn", "marcxmlvbos"), qr{http://www\.loc\.gov/mods/v3}, "get_mods_from()");
+is(SLUB::LZA::SIPBuilder::patch_mods($unpatched_mods_obj), $patched_mods, "patch_mods()" ); # compares the stringified result with the expected fixture
+is(SLUB::LZA::SIPBuilder::patch_marc_response($unpatched_marcblob_obj), $patched_marcblob, "patch_marc_response()"); # same comparison for the MARC fixture
+my $xsl_path = path(__FILE__)->parent->parent->child('xsl'); # <repo>/xsl, resolved relative to this test file
+if ($xsl_path->is_dir) { $xsl_path->remove_tree; } # ensure no dir exists, then run test
+is(SLUB::LZA::SIPBuilder::check_xsl_directory(), $xsl_path->absolute, "check_xsl_directory(), return value if not exist");
+ok($xsl_path->is_dir, "check_xsl_directory(), created if not exist");
+is(SLUB::LZA::SIPBuilder::check_xsl_directory(), $xsl_path->absolute, "check_xsl_directory(), return value if exist"); # second call: the directory exists now
+ok($xsl_path->is_dir, "check_xsl_directory(), untouched if exist");
+
+1;
diff --git a/xsd/archive.xsd b/xsd/archive.xsd index 4bc783c9dc25239bbc43f6d0aa786202571eb7b3..15d034a11877d937a19c3d5eea17397bfecf0d07 100644 --- a/xsd/archive.xsd +++ b/xsd/archive.xsd @@ -1,25 +1,30 @@ <?xml version="1.0" encoding="UTF-8"?> -<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" targetNamespace="http://slub-dresden.de" xmlns:archive="http://slub-dresden.de" xmlns:v3="http://www.loc.gov/mods/v3" xmlns:mets="http://www.loc.gov/METS/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink"> - <xs:import namespace="http://www.loc.gov/METS/" schemaLocation="sip.xsd"/> - <xs:import namespace="http://www.loc.gov/mods/v3" schemaLocation="v3.xsd"/> - <xs:import namespace="http://www.w3.org/1999/xlink" schemaLocation="xlink.xsd"/> - <xs:import namespace="http://www.w3.org/2001/XMLSchema-instance" schemaLocation="xsi.xsd"/> - <xs:element name="record"> - <xs:complexType> - <xs:sequence> - <xs:element ref="archive:exportToArchiveDate"/> - <xs:element ref="archive:externalId"/> - <xs:element ref="archive:externalWorkflow"/> - <xs:element ref="archive:hasConservationReason"/> - <xs:element ref="archive:externalIsilId"/> - <xs:element ref="archive:archivalValueDescription"/> - </xs:sequence> - </xs:complexType> - </xs:element> - <xs:element name="exportToArchiveDate" type="xs:dateTime"/> - <xs:element name="externalId" type="xs:string"/> - <xs:element name="externalWorkflow" type="xs:string"/> - <xs:element name="hasConservationReason" type="xs:boolean"/> - <xs:element name="externalIsilId" type="xs:string"/> - <xs:element name="archivalValueDescription" type="xs:string"/> +<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" attributeFormDefault="unqualified" targetNamespace="http://slub-dresden.de/slubarchiv" xmlns:archive="http://slub-dresden.de/slubarchiv"> + <xs:element name="record" type="archive:record"/> <!-- root element, typed by the complexType of the same name below --> + <xs:complexType name="record"> <!-- xs:all: the six child elements may appear in any order; only externalIsilId is optional (minOccurs 0) --> + <xs:all> + <xs:element name="archivalValueDescription" type="archive:NonEmptyString" minOccurs="1" maxOccurs="1"/> + <xs:element name="exportToArchiveDate" type="xs:dateTime" minOccurs="1" maxOccurs="1"/> + <xs:element name="externalId" type="archive:LzaIdSubstring" minOccurs="1" maxOccurs="1"/> + <xs:element name="externalIsilId" type="archive:NonEmptyString" minOccurs="0" maxOccurs="1"/> + <xs:element name="externalWorkflow" type="archive:LzaIdSubstring" minOccurs="1" maxOccurs="1"/> +
<xs:element name="hasConservationReason" type="xs:boolean" minOccurs="1" maxOccurs="1"/> + </xs:all> + <xs:attribute name="version" type="archive:SipVersionString" use="required"/> <!-- mandatory attribute on record --> + </xs:complexType> + <xs:simpleType name="NonEmptyString"> <!-- string containing at least one character other than U+0020; NOTE(review): tab and line-break characters also satisfy [^ ], confirm that is intended --> + <xs:restriction base="xs:string"> + <xs:pattern value="[\s\S]*[^ ][\s\S]*"/> + </xs:restriction> + </xs:simpleType> + <xs:simpleType name="LzaIdSubstring"> <!-- one or more characters, each from a-z or 0-9 --> + <xs:restriction base="xs:string"> + <xs:pattern value="[a-z0-9]+"/> + </xs:restriction> + </xs:simpleType> + <xs:simpleType name="SipVersionString"> <!-- the only accepted value is v2017.1 --> + <xs:restriction base="xs:string"> + <xs:enumeration value="v2017.1"/> + </xs:restriction> + </xs:simpleType> </xs:schema>