Skip to content
Snippets Groups Projects
Verified Commit 9f74b321 authored by Andreas Romeyke's avatar Andreas Romeyke
Browse files

- init

parent f327a807
No related branches found
No related tags found
No related merge requests found
package SLUB::LZA::TA::Command::sample_testing;
use SLUB::LZA::TA -command;
use v5.36;
use utf8;
use SLUB::LZA::TA::Archivematica::Elasticsearch;
use SLUB::LZA::TA::Archivematica::Elasticsearch::PrepareQuery;
use SLUB::LZA::TA::Output;
use Archive::BagIt::Fast;
use Date::Calc qw(Date_to_Time Today Add_Delta_YM Add_Delta_YMD Day_of_Week);
use List::Util qw(sample);
use namespace::autoclean -except => qr{SLUB::LZA::TA::.*};
# VERSION
# ABSTRACT: sample testing module for ta-tool
sub abstract { return "tests samples of AIPs in current Archival Information System (AIS)";}
my $base_cmd = "$0 sample-test";
my $dummycmd = " "x length($base_cmd);
my $description=<<"DESCR";
Ask an AIS for a sample of AIPs and tests them.
Overview:
$base_cmd [--daily| --weekly | --monthly | --yearly] [--workflow WORKFLOW] [--factor FACTOR]
$dummycmd --version
$dummycmd --help
Examples:
* Report sample testing statistics of AIPs last month for workflow Kitodo
'$base_cmd --monthly --workflow Kitodo'
* Report 1‰ sample statistics of AIPs last year
'$base_cmd --yearly --factor 0.001'
A printable PDF version could be generated using ff. commands:
'$base_cmd | asciidoctor-pdf - > report.pdf'
HINT: If you want lists, use the 'search' command instead!
HINT: ensure the ElasticSearch server allows to return >10.000 results if your archive is large
DESCR
sub description {
return "$description"
}
sub opt_spec {
my @global_opts= SLUB::LZA::TA::common_global_opt_spec();
my @local_opts = (
[ 'datemode' => hidden => {
one_of => [
[ 'daily|d' => 'sample based on last day'],
[ 'weekly|W' => 'sample based on last week'],
[ 'monthly|m' => 'sample based on last month' ],
[ 'yearly|y' => 'sample based on last year' ],
[ 'ldpyearly' => 'sample based on last LDP year 01.11. - 31.10.'],
[ 'complete|c'=> 'sample based on all AIPs (default)'],
[ 'date-from=s' => 'sample based on date range, beginning date in format "YYYY-MM-DD", implies "--date-to"'],
],
},
],
[ 'date-to=s' => 'report based on date range, end date in format "YYYY-MM-DD", implies "--date-from"'],
[],
[ 'output-format' => hidden => {
one_of => [
[ 'output-as-csv|C' => 'prints output as Comma Separated Values (CSV)' ],
[ 'output-as-raw|R' => 'print raw hash output' ],
[ 'output-as-rsv|r' => 'prints output as Raw Strings Values (RSV)' ],
[ 'output-as-asciidoc|a' => 'prints output as AsciiDoc [default]' ],
],
}
],
[],
[ 'factor|f=f' => 'use the given sampling factor (based on AIP count, default: 0.001)', {default => 0.001}],
[],
);
return (@global_opts, [], @local_opts);
}
sub validate_args { ## no critic (CognitiveComplexity::ProhibitExcessCognitiveComplexity)
my ($self, $opt, $args) = @_;
SLUB::LZA::TA::common_global_validate($self, $opt, $args);
# no args allowed but options!
$self->usage_error("No args allowed") if @$args;
my ($cyear, $cmonth, $cday) = Today();
my ($from_year, $from_month, $from_day);
my ($to_year, $to_month, $to_day);
unless (exists $opt->{datemode}) {
$opt->{datemode} = "complete";
$opt->{complete} = 1;
}
my %date_recipe;
$date_recipe{daily} = sub {
($from_year, $from_month, $from_day) = Add_Delta_YMD($cyear, $cmonth, $cday, 0, 0, -1);
($to_year, $to_month, $to_day) = ($from_year, $from_month, $from_day);
};
$date_recipe{weekly} = sub {
($from_year, $from_month, $from_day) = Add_Delta_YMD($cyear, $cmonth, $cday, 0, 0, -Day_of_Week($cyear, $cmonth, $cday) - 6);
($to_year, $to_month, $to_day) = Add_Delta_YMD($from_year, $from_month, $from_day, 0, 0, 6);
};
$date_recipe{monthly} = sub {
($from_year, $from_month, $from_day) = Add_Delta_YM($cyear, $cmonth, 1, 0, -1);
($to_year, $to_month, $to_day) = Add_Delta_YMD($from_year, $from_month, $from_day, 0, 1, -1);
};
$date_recipe{yearly} = sub {
($from_year, $from_month, $from_day) = Add_Delta_YM($cyear, 1, 1, -1, 0);
($to_year, $to_month, $to_day) = Add_Delta_YMD($from_year, $from_month, $from_day, 1, 0, -1);
};
$date_recipe{complete} = sub {
$from_year = 2015;
$from_month = 1;
$from_day = 1;
$to_year = $cyear;
$to_month = $cmonth;
$to_day = $cday;
};
$date_recipe{ldpyearly} = sub {
($from_year, $from_month, $from_day) = Add_Delta_YM($cyear, 1, 1, -1, -2);
($to_year, $to_month, $to_day) = Add_Delta_YMD($from_year, $from_month, $from_day, 1, 0, -1);
};
$date_recipe{date_from} = sub {
$self->usage_error('--date-from implies --date-to"') unless exists $opt->{date_to};
if ($opt->{date_from} =~ m/^(\d{4})-(\d{2})-(\d{2})$/) {
($from_year, $from_month, $from_day) = ($1, $2, $3);
} else {
$self->usage_error('--date-from expects date in format "YYYY-MM-DD", got "' . $opt->{date_from} . '"');
}
};
$date_recipe{date_to} = sub {
$self->usage_error('--date-to implies --date-from"') unless exists $opt->{date_from};
if ($opt->{date_to} =~ m/^(\d{4})-(\d{2})-(\d{2})$/) {
($to_year, $to_month, $to_day) = ($1, $2, $3);
} else {
$self->usage_error('--date-to expects date in format "YYYY-MM-DD", got "', $opt->{date_to} . '"');
}
};
foreach my $key (keys %{ $opt } ) {
$date_recipe{$key}->() if (defined $date_recipe{$key} and ref $date_recipe{$key} eq 'CODE');
}
$opt->{output_format} = 'output_as_asciidoc' unless (exists $opt->{output_format});
my $from_epoch = Date_to_Time($from_year, $from_month, $from_day, 0, 0, 0);
my $to_epoch = Date_to_Time($to_year, $to_month, $to_day, 0, 0, 0);
$self->usage_error('--date-to should have a date newer than --date-from') if ($from_epoch > $to_epoch);
printf STDERR "reporting for period %04u-%02u-%02u … %04u-%02u-%02u\n", $from_year, $from_month, $from_day, $to_year, $to_month, $to_day;
$opt->{creationdate_epochs}->{from} = $from_epoch;
$opt->{creationdate_epochs}->{to} = $to_epoch;
$opt->{creationdate_epochs}->{from_string} = sprintf("%04u-%02u-%02u", $from_year, $from_month, $from_day);
$opt->{creationdate_epochs}->{to_string} = sprintf("%04u-%02u-%02u", $to_year, $to_month, $to_day);
$self->usage_error('--factor FACTOR expects a FACTOR between 0 and 1, got ', $opt->{factor}) if ( $opt->{factor} > 1.0 or $opt->{factor} < 0.0);
return 1;
}
sub _check_aip ($sample) {
my $path = $sample->{filePath};
my $is_valid=0;
if (-d $path) {
my $bag = Archive::BagIt::Fast->new($path);
$bag->use_plugins('Archive::BagIt::Plugin::Algorithm::SHA512');
$is_valid = $bag->verify_bag( {report_all_errors => 1} );
if (! $is_valid) {$is_valid=0;}
}
my $line;
$line->{aipid}=$sample->{uuid};
$line->{filePath}=$sample->{filePath};
$line->{lzaid}=$sample->{transferMetadata}->[0]->{'bim:bag-info_dict'}->{'bim:SLUBArchiv-lzaId'};
$line->{isvalid}=$is_valid ;
$line->{checkdate}=localtime;
$line->{creationdate}=localtime $sample->{created};
return $line;
}
sub _execute {
my ($self, $opt, $args) = @_;
my $aips_query;
my $aips_response;
use Data::Printer;
# only index aips needed
$aips_query = SLUB::LZA::TA::Archivematica::Elasticsearch::PrepareQuery::prepare_aip_query($opt);
# next lines extend query with reporting
$aips_query->{'_source'} = {'includes' =>
['uuid', 'filePath', 'transferMetadata.bim:bag-info_dict.bim:SLUBArchiv-lzaId', 'created']};
#p($aips_query);
$aips_response = SLUB::LZA::TA::Archivematica::Elasticsearch::query_elasticsearch(
$SLUB::LZA::TA::config{elasticsearch_protocol},
$SLUB::LZA::TA::config{elasticsearch_host},
$SLUB::LZA::TA::config{elasticsearch_port},
'aips', # indexname
$aips_query, # query_hash ref
{
debug => $opt->{debug},
}
);
my $max_found_aips = $aips_response->{hits}->{total};
my $sample_size = int($max_found_aips * $opt->{factor} + 0.5);
if ($opt->{debug}) {
say STDERR "found $max_found_aips AIPs, use only a $opt->{factor} subsample of $sample_size AIPs";
}
# select first sample size aips
my @sample_set = sort {$a->{uuid} cmp $b->{uuid}} List::Util::sample $sample_size, map {$_->{'_source'} } @{ $aips_response->{hits}->{hits} };
my @result = map { _check_aip($_) } @sample_set;
return \@result;
}
sub execute($self, $opt, $args) {
my %results;
$results{date} = sprintf("%04u-%02u-%02u", Today());
$results{package} = __PACKAGE__;
$results{from} = $opt->{creationdate_epochs}->{from_string};
$results{to} = $opt->{creationdate_epochs}->{to_string};
# we need only _source->filePath and AIPID (and LZA-ID?)
my $res = _execute($self, $opt, $args);
$results{lines} = $res;
my ($headers, $table) = prepare_for_table($res);
print_humanreadable_report(\%results) if ($opt->{output_format} eq 'output_as_asciidoc');
SLUB::LZA::TA::Output::RSV::print_results($table) if ($opt->{output_format} eq 'output_as_rsv');
SLUB::LZA::TA::Output::CSV::print_results($table) if ($opt->{output_format} eq 'output_as_csv');
SLUB::LZA::TA::Output::Raw::print_results(\%results) if ($opt->{output_format} eq 'output_as_raw');
say STDERR "report is already sent to STDOUT.";
return 1;
}
sub prepare_for_table($results) { ## no critic (CognitiveComplexity::ProhibitExcessCognitiveComplexity)
my @table = @{ $results };
my @headers = sort keys %{ $table[0] };
return \@headers, \@table;
}
sub _print_aip($line) {
my $aipid=$line->{aipid};
#my $isvalid=($line->{isvalid}?"valid (✅)":"*invalid* (❎)");
my $isvalid=($line->{isvalid}?"valid":"invalid");
my $lzaid=$line->{lzaid}?"With lzaid '$line->{lzaid}'":"No lzaid defined";
my $path=$line->{filePath};
say " * AIP `$aipid`";
say " ** Is $isvalid";
say " ** $lzaid";
say " ** Path is `$path`";
say " ** processed at $line->{checkdate}, created at $line->{creationdate}";
return 1;
}
sub print_humanreadable_report ($results) { ## no critic (CognitiveComplexity::ProhibitExcessCognitiveComplexity)
say <<"RPTHEADER";
:lang: en
:doctype: article
:date: $results->{date}
:icons:
:text-align: left
:generator: $0 ($results->{package})
= Report from $results->{from} to $results->{to}
RPTHEADER
say "== Complete archive\n";
say "=== Invalid AIPs";
foreach my $line (grep {!($_->{is_valid})} @{$results->{lines} }) {
_print_aip($line)
}
say "=== Valid AIPs";
foreach my $line (grep {!($_->{is_valid})} @{$results->{lines} }) {
_print_aip($line)
}
return 1;
}
1;
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment