diff --git a/lib/SLUB/LZA/TA/Command/report.pm b/lib/SLUB/LZA/TA/Command/report.pm index 4e77eac2f2f9094b2f57152b4ece572168f82aa2..ea496adc86ddcf9974df6e9833979fe9e0aaae30 100644 --- a/lib/SLUB/LZA/TA/Command/report.pm +++ b/lib/SLUB/LZA/TA/Command/report.pm @@ -12,6 +12,42 @@ use namespace::autoclean -except => qr{SLUB::LZA::TA::.*}; # ABSTRACT: report module for ta-tool +use constant { + FULL => 0, + LDP => 1, + NO_LDP => 2, +}; +use constant SETS => ( + "total", + "LDP only", + "no LDP", +); +use constant { + NEW => 0, + UPDATE => 1, + TOTAL => 2 +}; +use constant AIPSTATE => ( + "first ingest", + "AIP update", + "all AIPs" +); + +use constant { + COUNT => 0, + FILES => 1, + SIZE => 2, + PAYLOAD_FILES => 3, + PAYLOAD_SIZE => 4, +}; +use constant FLAVOURS => ( + "count of aips", + "count of files", + "size in B", + "count of payload files", + "payload size in B" +); + sub abstract { return "print AIP reports about Archival Information System (AIS)";} my $base_cmd = "$0 report"; my $dummycmd = " "x length($base_cmd); @@ -216,7 +252,6 @@ PAINLESS ); $aips_response->{from_date}=$opt->{creationdate_epochs}->{from_string}; $aips_response->{to_date}=$opt->{creationdate_epochs}->{to_string}; - #p($aips_response); return $aips_response; } @@ -257,6 +292,129 @@ sub get_ldp_projects { return @ret; } +sub get_filestypes_by_aips { + my ($self, $opt, $args) = @_; + my $aips_query = SLUB::LZA::TA::Archivematica::Elasticsearch::PrepareQuery::prepare_aip_query($opt); + $aips_query->{"_source"} = { + "includes" => 'uuid' + }; + if (exists $opt->{debug}) { + use Data::Printer; + say STDERR "query:"; + say STDERR "------------------------"; + say STDERR np($aips_query); + say STDERR "------------------------"; + } + #p($aips_query); + my $aips_response = SLUB::LZA::TA::Archivematica::Elasticsearch::query_elasticsearch( + $SLUB::LZA::TA::config{elasticsearch_protocol}, + $SLUB::LZA::TA::config{elasticsearch_host}, + $SLUB::LZA::TA::config{elasticsearch_port}, + 'aips', # indexname + $aips_query, # query_hash ref + ); + my @aips = map { $_->{_source}->{uuid} } @{ $aips_response->{hits}->{hits} }; + my %results; + foreach my $aip (@aips) { + my $files_query = { + query => { + bool => { + must => + [ + { + match => { "AIPUUID" => $aip + #"METS.amdSec.mets:amdSec_dict.mets:techMD_dict.mets:mdWrap_dict.mets:xmlData_dict.premis:object_dict.premis:formatRegistry_dict.premis:formatRegistryKey" => "$pronom_id" + #"premis:formatRegistryKey" => "$pronom_id" + } + } + ], + } + }, + "size" => 10000, + # fields not supported in ES6, therefore we must use _source! + "_source" => { + "includes" => [ + 'fileExtension', + join(".", qw( + METS + amdSec + mets:amdSec_dict + mets:techMD_dict + mets:mdWrap_dict + mets:xmlData_dict + premis:object_dict + premis:objectCharacteristics_dict + premis:size + ) + ), + join(".", qw( + METS + amdSec + mets:amdSec_dict + mets:techMD_dict + mets:mdWrap_dict + mets:xmlData_dict + premis:object_dict + premis:objectCharacteristics_dict + premis:format_dict + premis:formatRegistry_dict + premis:formatRegistryKey + ) + ), + ] + } + }; + my $files_response = SLUB::LZA::TA::Archivematica::Elasticsearch::query_elasticsearch( + $SLUB::LZA::TA::config{elasticsearch_protocol}, + $SLUB::LZA::TA::config{elasticsearch_host}, + $SLUB::LZA::TA::config{elasticsearch_port}, + 'aipfiles', # indexname + $files_query, # query_hash ref + ); + if (exists $opt->{debug}) { + use Data::Printer; + say "------------------------"; + say "query=",np( $files_query); + say "resp =",np($files_response); + say "------------------------"; + } + foreach my $file_response (@{ $files_response->{hits}->{hits} }) { + my $pronom_id = $file_response->{_source} + ->{'METS'} + ->{'amdSec'} + ->{'mets:amdSec_dict'} + ->{'mets:techMD_dict'} + ->{'mets:mdWrap_dict'} + ->{'mets:xmlData_dict'} + ->{'premis:object_dict'} + ->{'premis:objectCharacteristics_dict'} + ->{'premis:format_dict'} + ->{'premis:formatRegistry_dict'} + ->{'premis:formatRegistryKey'}; + my $size = $file_response->{_source} + ->{'METS'} + ->{'amdSec'} + ->{'mets:amdSec_dict'} + ->{'mets:techMD_dict'} + ->{'mets:mdWrap_dict'} + ->{'mets:xmlData_dict'} + ->{'premis:object_dict'} + ->{'premis:objectCharacteristics_dict'} + ->{'premis:size'}; + my $file_extension = $file_response->{_source}->{fileExtension} // "(no extension)"; + #my %tmp; + #$tmp{pronom_id} = $pronom_id; + #$tmp{size} = $size; + #$tmp{file_extension} = $file_extension; + $results{pronom_id}->{$pronom_id}->{(FLAVOURS)[SIZE]} += $size; + $results{pronom_id}->{$pronom_id}->{(FLAVOURS)[FILES]}++; + $results{file_extension}->{$file_extension}->{(FLAVOURS)[SIZE]} += $size; + $results{file_extension}->{$file_extension}->{(FLAVOURS)[FILES]}++; + } + } + return \%results; +} + sub execute { my ($self, $opt, $args) = @_; my %results; @@ -268,44 +426,59 @@ sub execute { if ($opt->{with_ldp}) { @ldp_projects = get_ldp_projects(@_); } - foreach my $aip_state (qw(total new updated)) { + + foreach my $aip_state (AIPSTATE) { my %newhash = %{$opt}; - if ($aip_state eq 'new') {$newhash{only_new} = 1;} - elsif ($aip_state eq 'updated') {$newhash{only_updated} = 1;} - foreach my $ldp_funder (qw(saxony misc none full)) { - delete $newhash{only_ldp_saxon}; - delete $newhash{only_ldp_without_saxon}; + if ($aip_state eq (AIPSTATE)[NEW]) {$newhash{only_new} = 1;} + elsif ($aip_state eq (AIPSTATE)[UPDATE]) {$newhash{only_updated} = 1;} + foreach my $set (SETS) { + delete $newhash{only_ldp}; delete $newhash{no_ldp}; - if ($ldp_funder eq 'saxony') {$newhash{only_ldp_saxon} = 1} - elsif ($ldp_funder eq 'misc') {$newhash{only_ldp_without_saxon} = 1;} - elsif ($ldp_funder eq 'none') {$newhash{no_ldp} = 1;} + if ($set eq (SETS)[LDP]) {$newhash{only_ldp} = 1} + elsif ($set eq (SETS)[NO_LDP]) {$newhash{no_ldp} = 1;} else { # full, do not filter for ldp } + my $filetypes_hashref = get_filestypes_by_aips($self, \%newhash, $args); my $res = _execute($self, \%newhash, $args); - $results{flavour}->{count}->{$aip_state}->{$ldp_funder} = $res->{hits}->{total}; - $results{flavour}->{size}->{$aip_state}->{$ldp_funder} = $res->{aggregations}->{total_aip_size}->{value}*1024*1024; - $results{flavour}->{files}->{$aip_state}->{$ldp_funder} = $res->{aggregations}->{total_file_count}->{value}; - $results{flavour}->{payload_size}->{$aip_state}->{$ldp_funder} = $res->{aggregations}->{total_payload_size}->{value}; - $results{flavour}->{payload_files}->{$aip_state}->{$ldp_funder} = $res->{aggregations}->{total_payload_filecount}->{value}; - + $results{flavour}->{(FLAVOURS)[COUNT]}->{$aip_state}->{$set}->{""} = $res->{hits}->{total}; + $results{flavour}->{(FLAVOURS)[SIZE]}->{$aip_state}->{$set}->{""} = $res->{aggregations}->{total_aip_size}->{value}*1024*1024; + $results{flavour}->{(FLAVOURS)[FILES]}->{$aip_state}->{$set}->{""} = $res->{aggregations}->{total_file_count}->{value}; + $results{flavour}->{(FLAVOURS)[PAYLOAD_SIZE]}->{$aip_state}->{$set}->{""} = $res->{aggregations}->{total_payload_size}->{value}; + $results{flavour}->{(FLAVOURS)[PAYLOAD_FILES]}->{$aip_state}->{$set}->{""} = $res->{aggregations}->{total_payload_filecount}->{value}; + foreach my $file_extension (sort keys %{ $filetypes_hashref->{file_extension} }) { + $results{flavour}->{(FLAVOURS)[FILES]}->{$aip_state}->{$set}->{sprintf "%20s %10s", "file extension", $file_extension} = $filetypes_hashref->{file_extension}->{$file_extension}->{(FLAVOURS)[FILES]}; + $results{flavour}->{(FLAVOURS)[SIZE]}->{$aip_state}->{$set}->{sprintf "%20s %10s", "file extension", $file_extension} = $filetypes_hashref->{file_extension}->{$file_extension}->{(FLAVOURS)[SIZE]}; + } + foreach my $pronom_id (sort keys %{ $filetypes_hashref->{pronom_id} }) { + $results{flavour}->{(FLAVOURS)[FILES]}->{$aip_state}->{$set}->{sprintf "%20s %10s", "pronom id",$pronom_id} = $filetypes_hashref->{pronom_id}->{$pronom_id}->{(FLAVOURS)[FILES]}; + $results{flavour}->{(FLAVOURS)[SIZE]}->{$aip_state}->{$set}->{sprintf "%20s %10s", "pronom id",$pronom_id} = $filetypes_hashref->{pronom_id}->{$pronom_id}->{(FLAVOURS)[SIZE]}; + } } undef %newhash; foreach my $ldp_project (@ldp_projects) { # only if @ldp_projects have size > 1 %newhash = %{$opt}; - if ($aip_state eq 'new') {$newhash{only_new} = 1;} - elsif ($aip_state eq 'updated') {$newhash{only_updated} = 1;} + if ($aip_state eq (AIPSTATE)[NEW]) {$newhash{only_new} = 1;} + elsif ($aip_state eq (AIPSTATE)[UPDATE]) {$newhash{only_updated} = 1;} $newhash{only_ldp_project} = $ldp_project; + my $filetypes_hashref = get_filestypes_by_aips($self, \%newhash, $args); my $res = _execute($self, \%newhash, $args); - $results{ldp_project}->{$ldp_project}->{flavour}->{count}->{$aip_state} = $res->{hits}->{total}; - $results{ldp_project}->{$ldp_project}->{flavour}->{size}->{$aip_state} = $res->{aggregations}->{total_aip_size}->{value}*1024*1024; - $results{ldp_project}->{$ldp_project}->{flavour}->{files}->{$aip_state} = $res->{aggregations}->{total_file_count}->{value}; - $results{ldp_project}->{$ldp_project}->{flavour}->{payload_size}->{$aip_state} = $res->{aggregations}->{total_payload_size}->{value}; - $results{ldp_project}->{$ldp_project}->{flavour}->{payload_files}->{$aip_state} = $res->{aggregations}->{total_payload_filecount}->{value}; + $results{ldp_project}->{$ldp_project}->{flavour}->{(FLAVOURS)[COUNT]}->{$aip_state}->{""} = $res->{hits}->{total}; + $results{ldp_project}->{$ldp_project}->{flavour}->{(FLAVOURS)[SIZE]}->{$aip_state}->{""} = $res->{aggregations}->{total_aip_size}->{value}*1024*1024; + $results{ldp_project}->{$ldp_project}->{flavour}->{(FLAVOURS)[FILES]}->{$aip_state}->{""} = $res->{aggregations}->{total_file_count}->{value}; + $results{ldp_project}->{$ldp_project}->{flavour}->{(FLAVOURS)[PAYLOAD_SIZE]}->{$aip_state}->{""} = $res->{aggregations}->{total_payload_size}->{value}; + $results{ldp_project}->{$ldp_project}->{flavour}->{(FLAVOURS)[PAYLOAD_FILES]}->{$aip_state}->{""} = $res->{aggregations}->{total_payload_filecount}->{value}; + + foreach my $file_extension (sort keys %{ $filetypes_hashref->{file_extension} }) { + $results{ldp_project}->{$ldp_project}->{flavour}->{(FLAVOURS)[FILES]}->{$aip_state}->{sprintf "%20s %10s", "file extension", $file_extension} = $filetypes_hashref->{file_extension}->{$file_extension}->{(FLAVOURS)[FILES]}; + $results{ldp_project}->{$ldp_project}->{flavour}->{(FLAVOURS)[SIZE]}->{$aip_state}->{sprintf "%20s %10s", "file extension", $file_extension} = $filetypes_hashref->{file_extension}->{$file_extension}->{(FLAVOURS)[SIZE]}; + } + foreach my $pronom_id (sort keys %{ $filetypes_hashref->{pronom_id} }) { + $results{ldp_project}->{$ldp_project}->{flavour}->{(FLAVOURS)[FILES]}->{$aip_state}->{sprintf "%20s %10s", "pronom id",$pronom_id} = $filetypes_hashref->{pronom_id}->{$pronom_id}->{(FLAVOURS)[FILES]}; + $results{ldp_project}->{$ldp_project}->{flavour}->{(FLAVOURS)[SIZE]}->{$aip_state}->{sprintf "%20s %10s", "pronom id",$pronom_id} = $filetypes_hashref->{pronom_id}->{$pronom_id}->{(FLAVOURS)[SIZE]}; + } } - } - if ($opt->{output_format} eq 'output_as_asciidoc') { print_humanreadable_report(\%results); } else { @@ -320,29 +493,44 @@ sub execute { sub prepare_for_table($results, $ldp_projects) { my @table; - foreach my $aip_state (qw(new updated total)) { - foreach my $flavour (sort keys %{$results->{flavour}}) { # count size file - foreach my $ldp_funder (qw(saxony misc none full)) { - my $line; - $line->{_set} = $ldp_funder; - $line->{_subset} = ""; - $line->{_timespan_from} =$results->{from}; - $line->{_timespan_to} =$results->{to}; - $line->{flavour} = $flavour; - $line->{aip_state} = $aip_state; - $line->{value} = $results->{flavour}->{$flavour}->{$aip_state}->{$ldp_funder}; - push @table, $line; + foreach my $set (SETS) { + foreach my $aip_state (AIPSTATE) { + foreach my $flavour (FLAVOURS) { # count size file + foreach my $filter (sort keys %{$results->{flavour}->{$flavour}->{$aip_state}->{$set}}) { + + my $line; + $line->{_set} = $set; + $line->{_subset} = ""; + $line->{_timespan_from} = $results->{from}; + $line->{_timespan_to} = $results->{to}; + $line->{flavour} = $flavour; + $line->{aip_state} = $aip_state; + $line->{filter} = $filter; + $line->{value} = $results->{flavour}->{$flavour}->{$aip_state}->{$set}->{$filter}; + push @table, $line; + } } - foreach my $ldp_project (@{ $ldp_projects }) { - my $line; - $line->{_set} = "LDP"; - $line->{_subset} = $ldp_project; - $line->{_timespan_from} =$results->{from}; - $line->{_timespan_to} =$results->{to}; - $line->{flavour} = $flavour; - $line->{aip_state} = $aip_state; - $line->{value} = $results->{ldp_project}->{$ldp_project}->{flavour}->{$flavour}->{$aip_state}; - push @table, $line; + } + } + foreach my $ldp_project (@{ $ldp_projects }) { + foreach my $aip_state (AIPSTATE) { + foreach my $flavour (FLAVOURS) { + #$results{ldp_project}->{$ldp_project}->{flavour}->{(FLAVOURS)[FILES]}->{$aip_state}->{"file_extension $file_extension"} + + foreach my $filter (sort keys %{$results->{ldp_project}->{$ldp_project}->{flavour}->{$flavour}->{$aip_state}}) { + say "FILTER=$filter\n"; + # count size file + my $line; + $line->{_set} = "LDP"; + $line->{_subset} = $ldp_project; + $line->{_timespan_from} = $results->{from}; + $line->{_timespan_to} = $results->{to}; + $line->{flavour} = $flavour; + $line->{aip_state} = $aip_state; + $line->{filter} = $filter; + $line->{value} = $results->{ldp_project}->{$ldp_project}->{flavour}->{$flavour}->{$aip_state}->{$filter}; + push @table, $line; + } } } } @@ -350,59 +538,58 @@ sub prepare_for_table($results, $ldp_projects) { return \@headers, \@table; } -sub print_humanreadable_report ($result) { +sub print_humanreadable_report ($results) { + no warnings; + say <<"RPTHEADER"; :lang: en -:date: $result->{date} -:generator: $0 ($result->{package}) - -== Report from $result->{from} to $result->{to} +:doctype: article +:date: $results->{date} +:generator: $0 ($results->{package}) +:toc: += Report from $results->{from} to $results->{to} RPTHEADER - - foreach my $flavour (sort keys %{$result->{flavour}}) { - say <<"REPORT"; - -=== $flavour - - first ingest: $result->{flavour}->{$flavour}->{new}->{full} - (ldp saxony: $result->{flavour}->{$flavour}->{new}->{saxony}) - (ldp misc: $result->{flavour}->{$flavour}->{new}->{misc}) - (no ldp: $result->{flavour}->{$flavour}->{new}->{none}) - - aip update: $result->{flavour}->{$flavour}->{updated}->{full} - (ldp saxony: $result->{flavour}->{$flavour}->{updated}->{saxony}) - (ldp misc: $result->{flavour}->{$flavour}->{updated}->{misc}) - (no ldp: $result->{flavour}->{$flavour}->{updated}->{none}) - - total: $result->{flavour}->{$flavour}->{total}->{full} - -REPORT + foreach my $flavour (sort keys %{$results->{flavour}}) { + + say "== $flavour\n"; + foreach my $aipstate (AIPSTATE) { + printf "* %10s:\n", $aipstate; + foreach my $set (SETS) { + printf "** %15s: %10u\n", + $set, + $results->{flavour}->{$flavour}->{$aipstate}->{$set}->{""}; + } + say; + } } - - foreach my $ldp_project (sort keys %{$result->{ldp_project}}) { - say <<"LDPREPORT_HEADER"; - -=== LDP project '$ldp_project' - -LDPREPORT_HEADER - - foreach my $flavour (sort keys %{$result->{ldp_project}->{$ldp_project}->{flavour}}) { - say <<"LDPREPORT"; - -==== $flavour - - first ingest: $result->{ldp_project}->{$ldp_project}->{flavour}->{$flavour}->{new} - - aip update: $result->{ldp_project}->{$ldp_project}->{flavour}->{$flavour}->{updated} - - total: $result->{ldp_project}->{$ldp_project}->{flavour}->{$flavour}->{total} - -LDPREPORT + foreach my $ldp_project (sort keys %{$results->{ldp_project}}) { + say "== LDP project '$ldp_project'\n"; + foreach my $flavour (sort keys %{$results->{ldp_project}->{$ldp_project}->{flavour}}) { + say "=== $flavour\n"; + foreach my $aip_state (AIPSTATE) { + printf "* %10s:\n", $aip_state; + foreach my $filter (sort keys %{$results->{ldp_project}->{$ldp_project}->{flavour}->{$flavour}->{$aip_state}}) { + if ($filter eq "") { + printf "** %15s: %10u\n", + "total", + $results->{ldp_project}->{$ldp_project}->{flavour}->{$flavour}->{$aip_state}->{$filter}, + ; + } else { + printf "*** %25s: %10u\n", + $filter, + $results->{ldp_project}->{$ldp_project}->{flavour}->{$flavour}->{$aip_state}->{$filter}, + ; + } + } + say; + } } } return 1; } + + 1;