diff --git a/bin/slubsipbuilder.pl b/bin/slubsipbuilder.pl index a9f8d4735a2624c909c277f998e610eb584180a8..8dcdb43305b77e682605cd88ee8726cd4be39baf 100755 --- a/bin/slubsipbuilder.pl +++ b/bin/slubsipbuilder.pl @@ -246,6 +246,155 @@ PATCH2 } return; } + +sub create_filecopyhash { + my $directory = shift; + my $content = shift; + my %filecopyhash; + my $wanted=sub { + if (-d $_) { + # dir, do nothing + (); + } else { + my $file=$File::Find::name; + if ($file !~ m#^[-A-Za-z0-9_\./]+$#) { + confess("file '$file' does not match regex '^[-A-Za-z0-9_\./]+\$'"); + } + my $source = $file; + $filecopyhash{$source}->{'source'}=$file; + $file=~s#^$directory/?##; + $filecopyhash{$source}{'relative'}="data/$file"; + $filecopyhash{$source}{'target'}="$content/$file"; + my $fh; + open($fh, "<", $source) or confess ("Can't open '$source', $!"); + binmode($fh); + my $ctx = Digest::MD5->new; + $ctx->addfile(*$fh); + close ($fh); + my $md5 = $ctx->hexdigest; + $filecopyhash{$source}{'md5sum'}=$md5; + } + }; + + finddepth($wanted, $directory); + return \%filecopyhash; +} + + sub prepare_dmd_section_with_ppn ($) { + my $ppn = shift; + my $mods = SLUB::LZA::SIPBuilder::get_mods_from($swb_url, $ppn, $searchkey, $recordschema); + if ($with_debug) { + SLUB::LZA::SIPBuilder::write_file("DEBUG_${ppn}_mods.xml", $mods); + } + # remove the <xml /> from beginning of the answer + $mods=~ s#<\?xml version="1.0" encoding="UTF-8"\?>#<!-- removed xml header from mods part -->#; + my $dmd =<<"DMD"; +<mets:dmdSec ID="DMDLOG_0000"> + <!-- bibliographic metadata --> + <mets:mdWrap MDTYPE="MODS"> + <mets:xmlData> + $mods + </mets:xmlData> + </mets:mdWrap> +</mets:dmdSec> +DMD + return $dmd; + } + + sub prepare_dmd_section_with_noppn ($) { + my $noppn = shift; + my $mods =<<'MODS'; +<mods version="3.6" + xmlns="http://www.loc.gov/mods/v3" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd"> + <identifier>$noppn</identifier> +</mods> +MODS + my $dmd =<<"DMD"; +<mets:dmdSec ID="DMDLOG_0000"> + <!-- bibliographic metadata --> + <mets:mdWrap MDTYPE="MODS"> + <mets:xmlData> + $mods + </mets:xmlData> + </mets:mdWrap> +</mets:dmdSec> +DMD + return $dmd; + } + + sub prepare_amd_section($$$$$$) { + my $export_to_archive_date = shift; + my $external_workflow = shift; + my $external_id = shift; + my $external_conservation_flag = shift; + my $external_isil = shift; + my $external_value_descr = shift; + my $amd =<<"AMD"; +<mets:amdSec ID="AMD"> + <!-- SIP metadata for automated processing by submission application --> + <mets:techMD ID="ARCHIVE"> + <mets:mdWrap MDTYPE="OTHER" MIMETYPE="text/xml" OTHERMDTYPE="ARCHIVE"> + <mets:xmlData> + <archive:record version="v2017.1" xmlns:archive="http://slub-dresden.de/slubarchiv"> + <archive:exportToArchiveDate>$export_to_archive_date</archive:exportToArchiveDate> + <archive:externalId>$external_id</archive:externalId> + <archive:externalWorkflow>$external_workflow</archive:externalWorkflow> + <archive:hasConservationReason>$external_conservation_flag</archive:hasConservationReason> + <archive:externalIsilId>$external_isil</archive:externalIsilId> + <archive:archivalValueDescription>$external_value_descr</archive:archivalValueDescription> + </archive:record> + </mets:xmlData> + </mets:mdWrap> + </mets:techMD> + </mets:amdSec> +AMD + return $amd; + } + + sub prepare_files_sections($) { + my $filecopyhash = shift; + my @fsec; + my $i=0; + foreach my $fkey (sort keys (%{$filecopyhash})) { + push @fsec, sprintf("<mets:file ID=\"FILE_%015u_LZA\" CHECKSUMTYPE=\"MD5\" CHECKSUM=\"%s\">", $i, $filecopyhash->{$fkey}->{"md5sum"}); + push @fsec, sprintf("<mets:FLocat xmlns:xlink=\"http://www.w3.org/1999/xlink\" LOCTYPE=\"URL\" xlink:href=\"file://%s\"/>", $filecopyhash->{$fkey}->{"relative"}); + push @fsec, "</mets:file>"; + $i++; + } + my $files = join("\n", @fsec); + my $filesec=<<"FILESEC"; +<mets:fileSec> + <mets:fileGrp USE="LZA"> + $files + </mets:fileGrp> +</mets:fileSec> +FILESEC + return $filesec; + } + + sub prepare_struct_map($) { + my $filecopyhash = shift; + my @ssec; + my $i=0; + foreach my $fkey (sort keys (%{$filecopyhash})) { + push @ssec, sprintf("<mets:div ID=\"PHYS_%015u_LZA\" TYPE=\"fileorderSequence\">", $i); + push @ssec, sprintf("<mets:fptr FILEID=\"FILE_%015u_LZA\" />", $i); + push @ssec, "</mets:div>"; + $i++; + } + my $structs = join("\n", @ssec); + my $structmap =<<"STRUCTMAP"; +<mets:structMap TYPE="PHYSICAL"> + <mets:div ID="PHYS_0000" TYPE="ieDir"> + $structs + </mets:div> +</mets:structMap> +STRUCTMAP + return $structmap; + } + # end package package main; @@ -311,11 +460,6 @@ if ($external_id !~ m#^[a-z0-9]+$#) { confess("you need to specify a val if ($external_workflow !~ m#^[a-z0-9]+$#) { confess("you need to specify a valid external workflow (^[a-z0-9]+\$)"); } if (!$external_value_descr) { confess("you need to specify an external value description (reason for archiving)"); } -#=============================================================================== - - - - #=============================================================================== sub main { @@ -324,142 +468,31 @@ sub main { my $file_date = $export_to_archive_date; $file_date =~ s/T/_/g; # replace 'T' with '_' $file_date =~ s/:/-/g; # replace ':' with '-' - # create output dir - #~ if (! -d $output) { - #~ mkpath("$output") || confess("could not create SIP directory for '$output', $!"); - #~ } - #~ $output = abs_path($output); + # prepare dirs my $sip_root_dir = (defined $ppn)? "PPN-${ppn}_${file_date}" : "ID-${noppn}_${file_date}"; my $content = path($output)->child($sip_root_dir)->child("data")->stringify; if (!defined $as_zip) { path($content)->mkpath; } - + my $filecopyhash = SLUB::LZA::SIPBuilder::create_filecopyhash($directory, $content); # prepare dmd-sec - my $mods; - if (defined $ppn) { - $mods = SLUB::LZA::SIPBuilder::get_mods_from($swb_url, $ppn, $searchkey, $recordschema); - if ($with_debug) { - SLUB::LZA::SIPBuilder::write_file("DEBUG_${ppn}_mods.xml", $mods); - } - # remove the <xml /> from beginning of the answer - $mods=~ s#<\?xml version="1.0" encoding="UTF-8"\?>#<!-- removed xml header from mods part -->#; - } elsif (defined $noppn) { - $mods =<<'MODS'; -<mods version="3.6" - xmlns="http://www.loc.gov/mods/v3" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd"> - <identifier>$noppn</identifier> -</mods> -MODS - } - my $dmd =<<'DMD'; -<mets:dmdSec ID="DMDLOG_0000"> - <!-- bibliographic metadata --> - <mets:mdWrap MDTYPE="MODS"> - <mets:xmlData> - $mods - </mets:xmlData> - </mets:mdWrap> -</mets:dmdSec> -DMD - + my $dmd = (defined $ppn)? SLUB::LZA::SIPBuilder::prepare_dmd_section_with_ppn( $ppn ) : SLUB::LZA::SIPBuilder::prepare_dmd_section_with_noppn( $noppn ); # prepare amd-sec - my $amd =<<'AMD'; -<mets:amdSec ID="AMD"> - <!-- SIP metadata for automated processing by submission application --> - <mets:techMD ID="ARCHIVE"> - <mets:mdWrap MDTYPE="OTHER" MIMETYPE="text/xml" OTHERMDTYPE="ARCHIVE"> - <mets:xmlData> - <archive:record version="v2017.1" xmlns:archive="http://slub-dresden.de/slubarchiv"> - <archive:exportToArchiveDate>$export_to_archive_date</archive:exportToArchiveDate> - <archive:externalId>$external_id</archive:externalId> - <archive:externalWorkflow>$external_workflow</archive:externalWorkflow> - <archive:hasConservationReason>$external_conservation_flag</archive:hasConservationReason> - <archive:externalIsilId>$external_isil</archive:externalIsilId> - <archive:archivalValueDescription>$external_value_descr</archive:archivalValueDescription> - </archive:record> - </mets:xmlData> - </mets:mdWrap> - </mets:techMD> - </mets:amdSec> -AMD - - # create filecopyhash - my %filecopyhash; - my $wanted=sub { - if (-d $_) { - # dir, do nothing - (); - } else { - my $file=$File::Find::name; - if ($file !~ m#^[-A-Za-z0-9_\./]+$#) { - confess("file '$file' does not match regex '^[-A-Za-z0-9_\./]+\$'"); - } - my $source = $file; - $filecopyhash{$source}->{'source'}=$file; - $file=~s#^$directory/?##; - $filecopyhash{$source}{'relative'}="data/$file"; - $filecopyhash{$source}{'target'}="$content/$file"; - my $fh; - open($fh, "<", $source) or confess ("Can't open '$source', $!"); - binmode($fh); - my $ctx = Digest::MD5->new; - $ctx->addfile(*$fh); - close ($fh); - my $md5 = $ctx->hexdigest; - $filecopyhash{$source}{'md5sum'}=$md5; - } - }; - - finddepth($wanted, $directory); - + my $amd = SLUB::LZA::SIPBuilder::prepare_amd_section( + $export_to_archive_date, + $external_workflow, + $external_id, + $external_conservation_flag, + $external_isil, + $external_value_descr + ); # create fileSec - my $filesec=<<'FILESEC1'; -<mets:fileSec> - <mets:fileGrp USE="LZA"> -FILESEC1 - { - my @fsec; - my $i=0; - foreach my $fkey (sort keys (%filecopyhash)) { - push @fsec, sprintf("<mets:file ID=\"FILE_%015u_LZA\" CHECKSUMTYPE=\"MD5\" CHECKSUM=\"%s\">", $i, $filecopyhash{$fkey}->{"md5sum"}); - push @fsec, sprintf("<mets:FLocat xmlns:xlink=\"http://www.w3.org/1999/xlink\" LOCTYPE=\"URL\" xlink:href=\"file://%s\"/>", $filecopyhash{$fkey}->{"relative"}); - push @fsec, "</mets:file>"; - $i++; - } - $filesec = join("\n", $filesec, @fsec); - } - $filesec = $filesec . <<'FILESEC2'; - </mets:fileGrp> -</mets:fileSec> -FILESEC2 - + my $filesec = SLUB::LZA::SIPBuilder::prepare_files_sections($filecopyhash); # prepare structmap - my $structmap =<<'STRUCTMAP1'; -<mets:structMap TYPE="PHYSICAL"> - <mets:div ID="PHYS_0000" TYPE="ieDir"> -STRUCTMAP1 - { - my @ssec; - my $i=0; - foreach my $fkey (sort keys (%filecopyhash)) { - push @ssec, sprintf("<mets:div ID=\"PHYS_%015u_LZA\" TYPE=\"fileorderSequence\">", $i); - push @ssec, sprintf("<mets:fptr FILEID=\"FILE_%015u_LZA\" />", $i); - push @ssec, "</mets:div>"; - $i++; - } - $structmap = join("\n", $structmap, @ssec); - } - $structmap = $structmap . <<'STRUCTMAP2'; - </mets:div> -</mets:structMap> -STRUCTMAP2 - + my $structmap = SLUB::LZA::SIPBuilder::prepare_struct_map($filecopyhash); # create sip.xml - my $sip =<<'METS'; + my $sip =<<"METS"; <?xml version="1.0" encoding="utf-8"?> <mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" @@ -471,12 +504,12 @@ STRUCTMAP2 </mets:mets> METS - # compress if needed + # write stuff out if (!defined $as_zip) { SLUB::LZA::SIPBuilder::write_file( path($output)->child($sip_root_dir)->child("sip.xml")->stringify, $sip ); # copy source to target - foreach my $source (sort keys (%filecopyhash)) { - my $target = path($filecopyhash{$source}->{"target"})->stringify; # CHECK ON WINDOWS + foreach my $source (sort keys (%{$filecopyhash})) { + my $target = path($filecopyhash->{$source}->{"target"})->stringify; # CHECK ON WINDOWS my $basename = path($target)->parent->stringify; if (! -d $basename) { path($basename)->mkpath; @@ -490,8 +523,8 @@ METS my $zip = Archive::Zip::SimpleZip->new( $zip_file_path, Zip64=>1 ); $zip->addString($sip, Name=>path($sip_root_dir)->child("sip.xml")->stringify); # copy source to target - foreach my $source (sort keys (%filecopyhash)) { - my $target = path($sip_root_dir)->child($filecopyhash{$source}->{"relative"})->stringify; # CHECK ON WINDOWS + foreach my $source (sort keys (%{$filecopyhash})) { + my $target = path($sip_root_dir)->child($filecopyhash->{$source}->{"relative"})->stringify; # CHECK ON WINDOWS my $basename = path($target)->parent->stringify; $zip->add( $source, Name=> $target) || confess ("could not zip copy from '$source' to '$target', $!"); }