Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
SLUB_SIP_Builder
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Digital Preservation
SLUB_SIP_Builder
Commits
f97b1ccf
Commit
f97b1ccf
authored
5 years ago
by
Andreas Romeyke
Browse files
Options
Downloads
Patches
Plain Diff
- refactoring, extracting helper functions
parent
66a7aeda
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
bin/slubsipbuilder.pl
+168
-135
168 additions, 135 deletions
bin/slubsipbuilder.pl
with
168 additions
and
135 deletions
bin/slubsipbuilder.pl
+
168
−
135
View file @
f97b1ccf
...
...
@@ -246,6 +246,155 @@ PATCH2
}
return
;
}
sub
create_filecopyhash
{
my
$directory
=
shift
;
my
$content
=
shift
;
my
%filecopyhash
;
my
$wanted
=
sub
{
if
(
-
d
$_
)
{
# dir, do nothing
();
}
else
{
my
$file
=
$
File::Find::
name
;
if
(
$file
!~
m#^[-A-Za-z0-9_\./]+$#
)
{
confess
("
file '
$file
' does not match regex '^[-A-Za-z0-9_
\
./]+
\$
'
");
}
my
$source
=
$file
;
$filecopyhash
{
$source
}
->
{'
source
'}
=
$file
;
$file
=~
s#^$directory/?##
;
$filecopyhash
{
$source
}{'
relative
'}
=
"
data/
$file
";
$filecopyhash
{
$source
}{'
target
'}
=
"
$content
/
$file
";
my
$fh
;
open
(
$fh
,
"
<
",
$source
)
or
confess
("
Can't open '
$source
', $!
");
binmode
(
$fh
);
my
$ctx
=
Digest::
MD5
->
new
;
$ctx
->
addfile
(
*$fh
);
close
(
$fh
);
my
$md5
=
$ctx
->
hexdigest
;
$filecopyhash
{
$source
}{'
md5sum
'}
=
$md5
;
}
};
finddepth
(
$wanted
,
$directory
);
return
\
%filecopyhash
;
}
sub
prepare_dmd_section_with_ppn
($)
{
my
$ppn
=
shift
;
my
$mods
=
SLUB::LZA::SIPBuilder::
get_mods_from
(
$swb_url
,
$ppn
,
$searchkey
,
$recordschema
);
if
(
$with_debug
)
{
SLUB::LZA::SIPBuilder::
write_file
("
DEBUG_
${ppn}
_mods.xml
",
$mods
);
}
# remove the <xml /> from beginning of the answer
$mods
=~
s#<\?xml version="1.0" encoding="UTF-8"\?>#<!-- removed xml header from mods part -->#
;
my
$dmd
=
<<"DMD";
<mets:dmdSec ID="DMDLOG_0000">
<!-- bibliographic metadata -->
<mets:mdWrap MDTYPE="MODS">
<mets:xmlData>
$mods
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
DMD
return
$dmd
;
}
sub
prepare_dmd_section_with_noppn
($)
{
my
$noppn
=
shift
;
my
$mods
=
<<'MODS';
<mods version="3.6"
xmlns="http://www.loc.gov/mods/v3"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd">
<identifier>$noppn</identifier>
</mods>
MODS
my
$dmd
=
<<"DMD";
<mets:dmdSec ID="DMDLOG_0000">
<!-- bibliographic metadata -->
<mets:mdWrap MDTYPE="MODS">
<mets:xmlData>
$mods
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
DMD
return
$dmd
;
}
sub
prepare_amd_section
($$$$$$)
{
my
$export_to_archive_date
=
shift
;
my
$external_workflow
=
shift
;
my
$external_id
=
shift
;
my
$external_conservation_flag
=
shift
;
my
$external_isil
=
shift
;
my
$external_value_descr
=
shift
;
my
$amd
=
<<"AMD";
<mets:amdSec ID="AMD">
<!-- SIP metadata for automated processing by submission application -->
<mets:techMD ID="ARCHIVE">
<mets:mdWrap MDTYPE="OTHER" MIMETYPE="text/xml" OTHERMDTYPE="ARCHIVE">
<mets:xmlData>
<archive:record version="v2017.1" xmlns:archive="http://slub-dresden.de/slubarchiv">
<archive:exportToArchiveDate>$export_to_archive_date</archive:exportToArchiveDate>
<archive:externalId>$external_id</archive:externalId>
<archive:externalWorkflow>$external_workflow</archive:externalWorkflow>
<archive:hasConservationReason>$external_conservation_flag</archive:hasConservationReason>
<archive:externalIsilId>$external_isil</archive:externalIsilId>
<archive:archivalValueDescription>$external_value_descr</archive:archivalValueDescription>
</archive:record>
</mets:xmlData>
</mets:mdWrap>
</mets:techMD>
</mets:amdSec>
AMD
return
$amd
;
}
sub
prepare_files_sections
($)
{
my
$filecopyhash
=
shift
;
my
@fsec
;
my
$i
=
0
;
foreach
my
$fkey
(
sort
keys
(
%
{
$filecopyhash
}))
{
push
@fsec
,
sprintf
("
<mets:file ID=
\"
FILE_%015u_LZA
\"
CHECKSUMTYPE=
\"
MD5
\"
CHECKSUM=
\"
%s
\"
>
",
$i
,
$filecopyhash
->
{
$fkey
}
->
{"
md5sum
"});
push
@fsec
,
sprintf
("
<mets:FLocat xmlns:xlink=
\"
http://www.w3.org/1999/xlink
\"
LOCTYPE=
\"
URL
\"
xlink:href=
\"
file://%s
\"
/>
",
$filecopyhash
->
{
$fkey
}
->
{"
relative
"});
push
@fsec
,
"
</mets:file>
";
$i
++
;
}
my
$files
=
join
("
\n
",
@fsec
);
my
$filesec
=
<<"FILESEC";
<mets:fileSec>
<mets:fileGrp USE="LZA">
$files
</mets:fileGrp>
</mets:fileSec>
FILESEC
return
$filesec
;
}
sub
prepare_struct_map
($)
{
my
$filecopyhash
=
shift
;
my
@ssec
;
my
$i
=
0
;
foreach
my
$fkey
(
sort
keys
(
%
{
$filecopyhash
}))
{
push
@ssec
,
sprintf
("
<mets:div ID=
\"
PHYS_%015u_LZA
\"
TYPE=
\"
fileorderSequence
\"
>
",
$i
);
push
@ssec
,
sprintf
("
<mets:fptr FILEID=
\"
FILE_%015u_LZA
\"
/>
",
$i
);
push
@ssec
,
"
</mets:div>
";
$i
++
;
}
my
$structs
=
join
("
\n
",
@ssec
);
my
$structmap
=
<<"STRUCTMAP";
<mets:structMap TYPE="PHYSICAL">
<mets:div ID="PHYS_0000" TYPE="ieDir">
$structs
</mets:div>
</mets:structMap>
STRUCTMAP
return
$structmap
;
}
# end package
package
main
;
...
...
@@ -311,11 +460,6 @@ if ($external_id !~ m#^[a-z0-9]+$#) { confess("you need to specify a val
if
(
$external_workflow
!~
m#^[a-z0-9]+$#
)
{
confess
("
you need to specify a valid external workflow (^[a-z0-9]+
\$
)
");
}
if
(
!
$external_value_descr
)
{
confess
("
you need to specify an external value description (reason for archiving)
");
}
#===============================================================================
#===============================================================================
sub
main
{
...
...
@@ -324,142 +468,31 @@ sub main {
my
$file_date
=
$export_to_archive_date
;
$file_date
=~
s/T/_/g
;
# replace 'T' with '_'
$file_date
=~
s/:/-/g
;
# replace ':' with '-'
# create output dir
#~ if (! -d $output) {
#~ mkpath("$output") || confess("could not create SIP directory for '$output', $!");
#~ }
#~ $output = abs_path($output);
# prepare dirs
my
$sip_root_dir
=
(
defined
$ppn
)?
"
PPN-
${ppn}
_
${file_date}
"
:
"
ID-
${noppn}
_
${file_date}
";
my
$content
=
path
(
$output
)
->
child
(
$sip_root_dir
)
->
child
("
data
")
->
stringify
;
if
(
!
defined
$as_zip
)
{
path
(
$content
)
->
mkpath
;
}
my
$filecopyhash
=
SLUB::LZA::SIPBuilder::
create_filecopyhash
(
$directory
,
$content
);
# prepare dmd-sec
my
$mods
;
if
(
defined
$ppn
)
{
$mods
=
SLUB::LZA::SIPBuilder::
get_mods_from
(
$swb_url
,
$ppn
,
$searchkey
,
$recordschema
);
if
(
$with_debug
)
{
SLUB::LZA::SIPBuilder::
write_file
("
DEBUG_
${ppn}
_mods.xml
",
$mods
);
}
# remove the <xml /> from beginning of the answer
$mods
=~
s#<\?xml version="1.0" encoding="UTF-8"\?>#<!-- removed xml header from mods part -->#
;
}
elsif
(
defined
$noppn
)
{
$mods
=
<<'MODS';
<mods version="3.6"
xmlns="http://www.loc.gov/mods/v3"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd">
<identifier>$noppn</identifier>
</mods>
MODS
}
my
$dmd
=
<<'DMD';
<mets:dmdSec ID="DMDLOG_0000">
<!-- bibliographic metadata -->
<mets:mdWrap MDTYPE="MODS">
<mets:xmlData>
$mods
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
DMD
my
$dmd
=
(
defined
$ppn
)?
SLUB::LZA::SIPBuilder::
prepare_dmd_section_with_ppn
(
$ppn
)
:
SLUB::LZA::SIPBuilder::
prepare_dmd_section_with_noppn
(
$noppn
);
# prepare amd-sec
my
$amd
=
<<'AMD';
<mets:amdSec ID="AMD">
<!-- SIP metadata for automated processing by submission application -->
<mets:techMD ID="ARCHIVE">
<mets:mdWrap MDTYPE="OTHER" MIMETYPE="text/xml" OTHERMDTYPE="ARCHIVE">
<mets:xmlData>
<archive:record version="v2017.1" xmlns:archive="http://slub-dresden.de/slubarchiv">
<archive:exportToArchiveDate>$export_to_archive_date</archive:exportToArchiveDate>
<archive:externalId>$external_id</archive:externalId>
<archive:externalWorkflow>$external_workflow</archive:externalWorkflow>
<archive:hasConservationReason>$external_conservation_flag</archive:hasConservationReason>
<archive:externalIsilId>$external_isil</archive:externalIsilId>
<archive:archivalValueDescription>$external_value_descr</archive:archivalValueDescription>
</archive:record>
</mets:xmlData>
</mets:mdWrap>
</mets:techMD>
</mets:amdSec>
AMD
# create filecopyhash
my
%filecopyhash
;
my
$wanted
=
sub
{
if
(
-
d
$_
)
{
# dir, do nothing
();
}
else
{
my
$file
=
$
File::Find::
name
;
if
(
$file
!~
m#^[-A-Za-z0-9_\./]+$#
)
{
confess
("
file '
$file
' does not match regex '^[-A-Za-z0-9_
\
./]+
\$
'
");
}
my
$source
=
$file
;
$filecopyhash
{
$source
}
->
{'
source
'}
=
$file
;
$file
=~
s#^$directory/?##
;
$filecopyhash
{
$source
}{'
relative
'}
=
"
data/
$file
";
$filecopyhash
{
$source
}{'
target
'}
=
"
$content
/
$file
";
my
$fh
;
open
(
$fh
,
"
<
",
$source
)
or
confess
("
Can't open '
$source
', $!
");
binmode
(
$fh
);
my
$ctx
=
Digest::
MD5
->
new
;
$ctx
->
addfile
(
*$fh
);
close
(
$fh
);
my
$md5
=
$ctx
->
hexdigest
;
$filecopyhash
{
$source
}{'
md5sum
'}
=
$md5
;
}
};
finddepth
(
$wanted
,
$directory
);
my
$amd
=
SLUB::LZA::SIPBuilder::
prepare_amd_section
(
$export_to_archive_date
,
$external_workflow
,
$external_id
,
$external_conservation_flag
,
$external_isil
,
$external_value_descr
);
# create fileSec
my
$filesec
=
<<'FILESEC1';
<mets:fileSec>
<mets:fileGrp USE="LZA">
FILESEC1
{
my
@fsec
;
my
$i
=
0
;
foreach
my
$fkey
(
sort
keys
(
%filecopyhash
))
{
push
@fsec
,
sprintf
("
<mets:file ID=
\"
FILE_%015u_LZA
\"
CHECKSUMTYPE=
\"
MD5
\"
CHECKSUM=
\"
%s
\"
>
",
$i
,
$filecopyhash
{
$fkey
}
->
{"
md5sum
"});
push
@fsec
,
sprintf
("
<mets:FLocat xmlns:xlink=
\"
http://www.w3.org/1999/xlink
\"
LOCTYPE=
\"
URL
\"
xlink:href=
\"
file://%s
\"
/>
",
$filecopyhash
{
$fkey
}
->
{"
relative
"});
push
@fsec
,
"
</mets:file>
";
$i
++
;
}
$filesec
=
join
("
\n
",
$filesec
,
@fsec
);
}
$filesec
=
$filesec
.
<<'FILESEC2';
</mets:fileGrp>
</mets:fileSec>
FILESEC2
my
$filesec
=
SLUB::LZA::SIPBuilder::
prepare_files_sections
(
$filecopyhash
);
# prepare structmap
my
$structmap
=
<<'STRUCTMAP1';
<mets:structMap TYPE="PHYSICAL">
<mets:div ID="PHYS_0000" TYPE="ieDir">
STRUCTMAP1
{
my
@ssec
;
my
$i
=
0
;
foreach
my
$fkey
(
sort
keys
(
%filecopyhash
))
{
push
@ssec
,
sprintf
("
<mets:div ID=
\"
PHYS_%015u_LZA
\"
TYPE=
\"
fileorderSequence
\"
>
",
$i
);
push
@ssec
,
sprintf
("
<mets:fptr FILEID=
\"
FILE_%015u_LZA
\"
/>
",
$i
);
push
@ssec
,
"
</mets:div>
";
$i
++
;
}
$structmap
=
join
("
\n
",
$structmap
,
@ssec
);
}
$structmap
=
$structmap
.
<<'STRUCTMAP2';
</mets:div>
</mets:structMap>
STRUCTMAP2
my
$structmap
=
SLUB::LZA::SIPBuilder::
prepare_struct_map
(
$filecopyhash
);
# create sip.xml
my
$sip
=
<<
'
METS
'
;
my
$sip
=
<<
"
METS
"
;
<?xml version="1.0" encoding="utf-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
...
...
@@ -471,12 +504,12 @@ STRUCTMAP2
</mets:mets>
METS
#
compress if needed
#
write stuff out
if
(
!
defined
$as_zip
)
{
SLUB::LZA::SIPBuilder::
write_file
(
path
(
$output
)
->
child
(
$sip_root_dir
)
->
child
("
sip.xml
")
->
stringify
,
$sip
);
# copy source to target
foreach
my
$source
(
sort
keys
(
%filecopyhash
))
{
my
$target
=
path
(
$filecopyhash
{
$source
}
->
{"
target
"})
->
stringify
;
# CHECK ON WINDOWS
foreach
my
$source
(
sort
keys
(
%
{
$
filecopyhash
}
))
{
my
$target
=
path
(
$filecopyhash
->
{
$source
}
->
{"
target
"})
->
stringify
;
# CHECK ON WINDOWS
my
$basename
=
path
(
$target
)
->
parent
->
stringify
;
if
(
!
-
d
$basename
)
{
path
(
$basename
)
->
mkpath
;
...
...
@@ -490,8 +523,8 @@ METS
my
$zip
=
Archive::Zip::
SimpleZip
->
new
(
$zip_file_path
,
Zip64
=>
1
);
$zip
->
addString
(
$sip
,
Name
=>
path
(
$sip_root_dir
)
->
child
("
sip.xml
")
->
stringify
);
# copy source to target
foreach
my
$source
(
sort
keys
(
%filecopyhash
))
{
my
$target
=
path
(
$sip_root_dir
)
->
child
(
$filecopyhash
{
$source
}
->
{"
relative
"})
->
stringify
;
# CHECK ON WINDOWS
foreach
my
$source
(
sort
keys
(
%
{
$
filecopyhash
}
))
{
my
$target
=
path
(
$sip_root_dir
)
->
child
(
$filecopyhash
->
{
$source
}
->
{"
relative
"})
->
stringify
;
# CHECK ON WINDOWS
my
$basename
=
path
(
$target
)
->
parent
->
stringify
;
$zip
->
add
(
$source
,
Name
=>
$target
)
||
confess
("
could not zip copy from '
$source
' to '
$target
', $!
");
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment