#!/usr/bin/perl -w
###############################################################################
# Author: Andreas Romeyke
# SLUB Dresden, Department Longterm Preservation
# copyright 2023/2024, licensed under terms of GNU General Public License 3.0,
# see file LICENSE.txt for details.
#
# scans a given repository and creates a SQLite database.
# This is part of the exit-strategy for details, see asciidoc file
# exit_strategie.asciidoc (also contains ER-diagram for database)
#
# call with:
#
# perl ./exit_strategy.pl rosetta_exit_strategy/tmp.db /permanent/
#
###############################################################################
#
# WARNING:
#
# the following messages only occure if you had an unclean SIP ingest process,
# it means that in your IE-XML are wrong/unused namespace declarations
#
# if some AIPs are wrong with messages like:
#
# '/permanent_storage/normal/2017/07/05/IE1043/V1-IE1043.xml:6: 
#  namespace error : xmlns:mods: 'http://www.loc.gov/mods/v3
#   http://www.loc.gov/standards/mods/v3/mods-3-0.xsd' is not a valid URI
#  s="http://www.loc.gov/mods/v3
#   http://www.loc.gov/standards/mods/v3/mods-3-0.xsd"'
#
# then (and only then) try this:
#
# perl ./exit_strategy.pl --recover rosetta_exit_strategy/tmp.db /permanent/
#
#
# call help with:
# perl ./exit_strategy.pl --help
###############################################################################

use v5.36;
use utf8; # for debugging output
use constant DEBUG => 0; # no debug
use Carp;
use DBD::SQLite;
use DBI;
use File::Basename qw(basename dirname);
use File::Find;
use Getopt::Long;
use IO::Handle;
use List::Util qw(first zip);
use Path::Tiny;
use Pod::Usage;
use Time::Progress;
use XML::LibXML::XPathContext;
use XML::LibXML;

sub check_lzaid ($lza_id) {
  my $rx_up=qr{[A-Za-z0-9_-]+}; # archive name & internal workflow
  my $rx_lw=qr{[a-z0-9_-]+};    # external workflow & external id
  # SLUB:LZA:Kitodo:testcases:fileoriginalpath_with_http
  return ($lza_id =~ m/^SLUB:LZA:$rx_up:$rx_lw:$rx_lw$/);
};

STDOUT->autoflush(1);
# guarantee, that output will be UTF8
binmode(STDOUT, ":encoding(UTF-8)");
my $db_name="exit_strategy";
my $schema_name="exit_strategy";
my $sourcetype="hdd"; #default value

###############################################################################
# write database creation
# write tables creation
# scan repository
#   if IE.xml file found, read its metadata, create SQL add entry
#   write SQL add entry
###############################################################################
sub write_database_creation ($dbh) {
     # non standard conform SQL keywords
     #say $fh "CREATE DATABASE $db_name;";
     #say $fh "CREATE SCHEMA $schema_name;";
     #say $fh "USE ";
  my $sql1=<<"SQL_PRAGMA_WAL";
PRAGMA journal_mode=WAL;
SQL_PRAGMA_WAL
  my $sql2=<<"SQL_PRAGMA_ASYNC";
PRAGMA synchronous = OFF;
SQL_PRAGMA_ASYNC
  my $sql3=<<"SQL_PRAGMA_TEMP";
PRAGMA temp_store = OFF;
SQL_PRAGMA_TEMP
  my $sth1 = $dbh->prepare($sql1); $sth1->execute() or die "sql problem detected", $dbh->errstr;
  my $sth2 = $dbh->prepare($sql2); $sth2->execute() or die "sql problem detected", $dbh->errstr;
  my $sth3 = $dbh->prepare($sql3); $sth3->execute() or die "sql problem detected", $dbh->errstr;
  return;
}

# write tables creation;:
sub write_tables_creation ($dbh) {
  my $sql1=<<"SQL_CREATE1";
  /* create AIP table */
  CREATE TABLE aip (
	id	INTEGER,
	ie_id	TEXT NOT NULL,
	lza_id  TEXT,
	version INTEGER NOT NULL,
	dir_path TEXT NOT NULL,
	PRIMARY KEY(id AUTOINCREMENT)
  );
SQL_CREATE1
  my $sql2=<<"SQL_CREATE2";
  /* create IEFILE table */
  CREATE TABLE metadatafile (
    id INTEGER,
    aip_id INTEGER NOT NULL REFERENCES aip (id),
    location TEXT NOT NULL,
    sourcetype TEXT NOT NULL,
    PRIMARY KEY(id AUTOINCREMENT)
  );
SQL_CREATE2
  my $sql3=<<"SQL_CREATE3";
  /* create DC table */
  CREATE TABLE dc (
    id INTEGER,
    aip_id INTEGER NOT NULL REFERENCES aip (id),
    element TEXT NOT NULL,
    value TEXT NOT NULL,
    PRIMARY KEY(id AUTOINCREMENT)
  );
SQL_CREATE3
  my $sql4=<<"SQL_CREATE4";
  /* create FILE table */
  CREATE TABLE sourcedatafile (
    id INTEGER,
    aip_id INTEGER NOT NULL REFERENCES aip (id),
    name TEXT NOT NULL,
    PRIMARY KEY(id AUTOINCREMENT)
  );
SQL_CREATE4
  my $sql5=<<"SQL_CREATE5";
  /* create LOCAT table */
  CREATE TABLE sourcedatalocat (
    id INTEGER,
    file_id INTEGER NOT NULL REFERENCES sourcedatafile (id),
    location TEXT NOT NULL,
    sourcetype TEXT NOT NULL,
    filesize INTEGER,
    PRIMARY KEY(id AUTOINCREMENT)
  );
SQL_CREATE5
  my $sql6=<<"SQL_CREATE6";
CREATE TABLE deleted (
    id INTEGER,
    aip_id INTEGER NOT NULL REFERENCES aip (id),
    reason TEXT NOT NULL,
    note TEXT,
    date DATE,
    authorized_by TEXT NOT NULL,
    PRIMARY KEY(id AUTOINCREMENT)
  );
SQL_CREATE6
  my $sql8=<<"SQL_CREATE8";
  CREATE TABLE purged (
      id INTEGER,
      aip_id INTEGER NOT NULL REFERENCES aip (id),
      reason TEXT,
      note TEXT,
      date DATE,
      authorized_by TEXT NOT NULL,
      PRIMARY KEY(id AUTOINCREMENT)
  );
SQL_CREATE8
  my $sql7=<<"SQL_CREATE7";
CREATE TABLE transferaip (
  id INTEGER,
  aip_id INTEGER NOT NULL REFERENCES aip (id),
  lza_id TEXT NOT NULL,
  transfer_state TEXT NOT NULL,
  PRIMARY KEY(id AUTOINCREMENT)
  );
SQL_CREATE7

  my $sth1 = $dbh->prepare($sql1); $sth1->execute() or die "sql problem detected", $dbh->errstr;
  my $sth2 = $dbh->prepare($sql2); $sth2->execute() or die "sql problem detected", $dbh->errstr;
  my $sth3 = $dbh->prepare($sql3); $sth3->execute() or die "sql problem detected", $dbh->errstr;
  my $sth4 = $dbh->prepare($sql4); $sth4->execute() or die "sql problem detected", $dbh->errstr;
  my $sth5 = $dbh->prepare($sql5); $sth5->execute() or die "sql problem detected", $dbh->errstr;
  my $sth6 = $dbh->prepare($sql6); $sth6->execute() or die "sql problem detected", $dbh->errstr;
  my $sth7 = $dbh->prepare($sql7); $sth7->execute() or die "sql problem detected", $dbh->errstr;
  my $sth8 = $dbh->prepare($sql8); $sth8->execute() or die "sql problem detected", $dbh->errstr;
  return 1;
}

###############################################################################
# Prepare SQL INSERT Statements for AIPs
###############################################################################
sub write_prepare_insert ($dbh) {
  return 1;
}

sub prepare_addsql( $dbh) {
  my $sql_aip_plan=<<"SQL_AIP_PLAN";
  INSERT INTO aip (ie_id, lza_id, version, dir_path) VALUES (\$1, \$2, \$3, \$4);
SQL_AIP_PLAN
  my $sql_ie_plan=<<"SQL_IE_PLAN";
  INSERT INTO metadatafile (aip_id, location, sourcetype) VALUES (
      (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3, \$4
    );
SQL_IE_PLAN
  my $sql_file_plan=<<"SQL_FILE_PLAN";
    INSERT INTO sourcedatafile (aip_id, name) VALUES (
      (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3
    );
SQL_FILE_PLAN
  my $sql_locat_plan=<<"SQL_LOCAT_PLAN";
    INSERT INTO sourcedatalocat (file_id, location, sourcetype, filesize) VALUES (
      (SELECT sourcedatafile.id FROM sourcedatafile,aip WHERE
      sourcedatafile.aip_id=aip.id AND aip.ie_id=\$1 AND aip.version=\$2 AND
      sourcedatafile.name=\$3), \$4, \$5, \$6
    );
SQL_LOCAT_PLAN
  my $sql_dc_plan=<<"SQL_DC_PLAN";
    INSERT INTO dc (aip_id, element, value) VALUES (
      (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3, \$4
    );
SQL_DC_PLAN
  my $sql_deleted_plan=<<"SQL_DELETE_PLAN";
    INSERT INTO deleted (aip_id, reason, note, date, authorized_by) VALUES (
        (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3, \$4, \$5, \$6
    );
SQL_DELETE_PLAN
  my $sql_purged_plan=<<"SQL_PURGED_PLAN";
    INSERT INTO purged (aip_id, reason, note, date, authorized_by) VALUES (
        (SELECT id FROM aip WHERE aip.ie_id=\$1 AND aip.version=\$2), \$3, \$4, \$5, \$6
    );
SQL_PURGED_PLAN


  my $sth_aip_plan = $dbh->prepare($sql_aip_plan);
  my $sth_ie_plan = $dbh->prepare($sql_ie_plan);
  my $sth_file_plan = $dbh->prepare($sql_file_plan);
  my $sth_locat_plan = $dbh->prepare($sql_locat_plan);
  my $sth_dc_plan = $dbh->prepare($sql_dc_plan);
  my $sth_deleted_plan = $dbh->prepare($sql_deleted_plan);
  my $sth_purged_plan = $dbh->prepare($sql_purged_plan);
  my $plans;
  $plans->{aip} = $sth_aip_plan;
  $plans->{ie} = $sth_ie_plan;
  $plans->{file} = $sth_file_plan;
  $plans->{locat} = $sth_locat_plan;
  $plans->{dc} = $sth_dc_plan;
  $plans->{deleted} = $sth_deleted_plan;
  $plans->{purged} = $sth_purged_plan;
  return $plans;
}

###############################################################################
# write add SQL entry, expects a hashref which contains ff. params
# (foreach file location/copy):
# INSERT INTO aip (ie_id) VALUES ($ieid);
# INSERT INTO iefile (aip_id, location, sourcetype) VALUES (
#       (SELECT id FROM aip where aip.ieid = $ieid), $location, $sourcetype);
# INSERT INTO file (aip_id, name) VALUES (
#       (SELECT id FROM aip where aip.ieid = $ieid), $name);
# INSERT INTO locat (file_id, location, sourcetype) VALUES (
#       (SELECT file.aip_id FROM file where file.aip_id = aip.id
#        AND aip.ie_id=$ieid), $location, $sourcetype)
# INSERT INTO dc (aip_id, element, value) VALUES (
#       (SELECT id FROM aip where aip.ieid = $ieid), $element, $value);
# TODO: needs additional work
# expects a reference of an hash:
#    $ret{"filename" } = $filename;
#     $ret{"title"} = $title;
#     $ret{"repid"} = $repid;
#     $ret{"files"} = \@files;
#     $ret{"dcrecords"} = \@dcrecords;
###############################################################################
{
  my $lza_id_counter=0;
  my %lza_id_counter_hash;
  sub write_addsql($dbh, $plans, $refhash) {
    my $iefile = basename($refhash->{"filename"});
    my $dir = dirname($refhash->{"filename"});
    my ($ieid, $ieversion);
    if ($iefile =~ m/^V(\d+)-(IE\d*)\.xml$/) {
      $ieversion = $1;
      $ieid = $2;
    }
    else {
      die "Could not detect PID and Version from IEFile '$iefile'\n";
    }
    my $lza_id_kv = List::Util::first {
      my ($dckey, $dcvalue) = @{$_};
      (($dckey eq 'dc:identifier')
          and (check_lzaid($dcvalue) == 1))
    } @{$refhash->{"dcrecords"}};
    my $lza_id = @{$lza_id_kv}[1];
    if (!defined $lza_id) {
      if (!exists( $lza_id_counter_hash{ $ieid} )) {
        $lza_id = sprintf("SLUB:LZA:migration_from:rosetta:unknown_%05lu", $lza_id_counter++);
        warn("no lza id detected on $refhash->{filename}, using $lza_id now!");
        $lza_id_counter_hash{ $ieid} = $lza_id;
      } else {
        $lza_id = $lza_id_counter_hash{ $ieid};
      }
    }
    # start SQL insert
    eval {$plans->{aip}->execute($ieid, $lza_id, $ieversion, $dir)} or die "sql problem detected with $ieid ($ieversion), plz check if there is a dublette dir, ", $dbh->errstr;
    # FIXME if multiple locations exists

    eval {$plans->{ie}->execute($ieid, $ieversion, $iefile, $sourcetype)} or die "sql problem detected with $ieid ($ieversion, $iefile, $sourcetype), ", $dbh->errstr;
    foreach my $location (@{$refhash->{"files"}}) {
      my $file = basename($location); # FIXME if multiple locations
      my $size = $refhash->{"sizes"}->{$location};
      $location=~s#//#/#g; # post correct file paths, because sometimes in Rosetta // is used
      eval {$plans->{file}->execute($ieid, $ieversion, $file)} or die "sql problem detected with $ieid ($ieversion, $file), ", $dbh->errstr;
      eval {$plans->{locat}->execute($ieid, $ieversion, $file, $location, $sourcetype, $size)} or die "sql problem detected with $ieid ($ieversion, $file, $location, $sourcetype, $size), ", $dbh->errstr;
    }
    foreach my $dcpair (@{$refhash->{"dcrecords"}}) {
      my ($dckey, $dcvalue) = @{$dcpair};
      # quote ' in dcvalue
      $dcvalue =~ tr/'/"/;
      eval {$plans->{dc}->execute($ieid, $ieversion, $dckey, $dcvalue)} or die "sql problem detected with $ieid ($ieversion, $dckey, $dcvalue), ", $dbh->errstr;
    }
    foreach my $delstate (qw(purged deleted)) { # we ignore "recovered" state here, because not an extra table
      if ($refhash->{$delstate}) {
        my $delete = $refhash->{$delstate};
        # TODO: call plan plans->{deleted}
        my $state = $delete->{'state'};
        my $reason = $delete->{'reason'};
        my $note = $delete->{'note'};
        my $date = $delete->{'date'};
        my $authorized_by = $delete->{'authorized_by'};
        eval {
          $plans->{$delstate}->execute($ieid, $ieversion, $reason, $note, $date, $authorized_by)
        } or die "sql problem detected with $delstate $ieid ($ieversion, $reason, $note, $date, $authorized_by), ", $dbh->errstr;
      }
    }
    return 1;
  }
}



###############################################################################
# add INDEX and other TRICKs to increase performance
###############################################################################
sub write_index_creation($dbh) {
  my $sql1=<<"SQL_INDEX1";
  CREATE UNIQUE INDEX aip_index ON aip (ie_id, version);
SQL_INDEX1
  my $sql2=<<"SQL_INDEX2";
  CREATE UNIQUE INDEX sourcedata_index ON sourcedatafile (aip_id, name);
SQL_INDEX2
  my $sql3=<<"SQL_INDEX3";
  CREATE INDEX delete_index ON deleted (aip_id);
SQL_INDEX3
  my $sql4=<<"SQL_INDEX4";
  CREATE INDEX aip_index2 ON aip (lza_id);
SQL_INDEX4
  my $sql5=<<"SQL_INDEX5";
  CREATE INDEX aip_index3 ON aip (ie_id,lza_id);
SQL_INDEX5
  my $sql6=<<"SQL_VIEW";
  CREATE VIEW non_deleted_aip(id,ie_id,lza_id,version,dir_path) AS 
    SELECT * FROM aip WHERE NOT EXISTS (
      SELECT aip_id FROM deleted WHERE aip.id == aip_id
    ) AND NOT EXISTS (
      SELECT aip_id FROM purged WHERE aip.id == aip_id
    );
SQL_VIEW
  my $sth1 = $dbh->prepare($sql1); $sth1->execute() or die "sql problem detected", $dbh->errstr;
  my $sth2 = $dbh->prepare($sql2); $sth2->execute() or die "sql problem detected", $dbh->errstr;
  my $sth3 = $dbh->prepare($sql3); $sth3->execute() or die "sql problem detected", $dbh->errstr;
  my $sth4 = $dbh->prepare($sql4); $sth4->execute() or die "sql problem detected", $dbh->errstr;
  my $sth5 = $dbh->prepare($sql5); $sth5->execute() or die "sql problem detected", $dbh->errstr;
  my $sth6 = $dbh->prepare($sql6); $sth6->execute() or die "sql problem detected", $dbh->errstr;
  return 1;
}

###############################################################################
# checks if a given string from from a given file contains only utf-8 chars
# which are compatible to common used databases
###############################################################################
sub check_if_db_conform ($string, $filename) {
  if ($string ne '') {
    if ( not utf8::is_utf8($string)) {
      croak "no utf8: '$string' in file '$filename'\n";
    }
  }#
  return;
}
{
  my $xp;
  sub get_xpath_context {
    if (defined $xp) { return $xp;}
    $xp = XML::LibXML::XPathContext->new();
    $xp->registerNs("dnx", "http://www.exlibrisgroup.com/dps/dnx");
    $xp->registerNs("sru", "http://www.loc.gov/zing/srw/");
    $xp->registerNs("xsi", "http://www.w3.org/2001/XMLSchema-instance");
    $xp->registerNs("dc", "http://purl.org/dc/elements/1.1/");
    $xp->registerNs("mets", "http://www.loc.gov/METS/");
    $xp->registerNs("rosettamets", "http://www.exlibrisgroup.com/xsd/dps/rosettaMets");
    $xp->registerNs("mods", "http://www.loc.gov/mods/v3");
    $xp->registerNs("ns2", "http://dps.exlibris.com/");
    $xp->registerNs("dv", "http://dfg-viewer.de/");
    $xp->registerNs("slub", "http://slub-dresden.de/");
    $xp->registerNs("archive", "http://slub-dresden.de/slubarchiv");
    $xp->registerNs("premis", "info:lc/xmlns/premis-v2");
    $xp->registerNs("mix", "http://www.loc.gov/standards/mix/");
    $xp->registerNs("xlink", "http://www.w3.org/1999/xlink");
    $xp->registerNs("xlin", "http://www.w3.org/1999/xlink");
    return $xp;
  }
}

{
  my $parser;
  sub get_parser ( $recovery_flag ){
    if (defined $parser) { return $parser;}
    $parser = XML::LibXML->new(recover => $recovery_flag,
        compact                        => 1,
        no_network                     => 1,
        recover                        => $recovery_flag,
        no_blanks                      => 1,
    );
    return $parser;
  }
}

###############################################################################
#
# /mets:mets/mets:dmdSec[1]/mets:mdWrap[1]/mets:xmlData[1]/dc:record[1]/dc:title[1]
# /mets:mets/mets:amdSec[1]/mets:techMD[1]/mets:mdWrap[1]/mets:xmlData[1]/dnx[1]/section[1]/record[1]/key[2]
# mit ID=Label und Wert = LOCAL
# dort die ID von techMD (Referenz für Files)
#
# Files via /mets:mets/mets:fileSec[1]/mets:fileGrp[1]/mets:file[1]/mets:FLocat[1]
#
###############################################################################
  my $compiled_xpath_dmdSec = XML::LibXML::XPathExpression->new('/mets:mets/mets:dmdSec');
  my $compiled_xpath_amdSec = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec');
  my $compiled_xpath_amdSec_rep = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[starts-with(@ID, \'REP\')]');
  my $compiled_xpath_amdSec_fl = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[starts-with(@ID, \'FL\')]');
  my $compiled_xpath_amdSec_ie = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[@ID="ie-amd"]');
  my $compiled_xpath_fileSec = XML::LibXML::XPathExpression->new('/mets:mets/mets:fileSec');
  my $compiled_xpath_titles = XML::LibXML::XPathExpression->new('mets:mdWrap[1]/mets:xmlData[1]/dc:record/dc:title[1]');
  my $compiled_xpath_fileGrp = XML::LibXML::XPathExpression->new('/mets:mets/mets:fileSec/mets:fileGrp');
  my $compiled_xpath_files = XML::LibXML::XPathExpression->new("mets:file/mets:FLocat/\@xlin:href");
  my $compiled_xpath_filepids = XML::LibXML::XPathExpression->new('mets:file/@ID');
  my $compiled_xpath_dcrecords = XML::LibXML::XPathExpression->new('mets:mdWrap/mets:xmlData/dc:record/*');
  my $compiled_xpath_repid = XML::LibXML::XPathExpression->new('/mets:mets/mets:amdSec[starts-with(@ID, \'REP\')]/@ID');
  my $str_local_record = "mets:techMD/mets:mdWrap/mets:xmlData/*[local-name()=\'dnx\']/*[local-name()=\'section\']/*[local-name()=\'record\']";
  my $str_local_reps = "$str_local_record/*[local-name()=\'key\' and \@id=\'label\' and (. = \'LOCAL\'  or . = \'LZA_INTERN\' or . = \'LZA\')]";
  my $str_repid_old = "/mets:mets/mets:amdSec[starts-with(\@ID, \'REP\') and $str_local_reps]/\@ID";
  my $compiled_xpath_repid_old = XML::LibXML::XPathExpression->new($str_repid_old);
  #  only event 272 or 274 should be used, next line uses a multiple-predicate hack for XPATH 1.0
  my $xpath_if_anydelete_event_expr = 'mets:digiprovMD[@ID="ie-amd-digiprov"]/mets:mdWrap/mets:xmlData/dnx:dnx/dnx:section[@id="event"]/dnx:record[dnx:key[@id="eventIdentifierValue"][. >= 272][. <= 274]]';
  my $compiled_xpath_anydelete_event = XML::LibXML::XPathExpression->new('dnx:key[@id="eventDescription"]/text()');
  my $compiled_xpath_anydelete_date = XML::LibXML::XPathExpression->new('dnx:key[@id="eventDateTime"]/text()');
  my $compiled_xpath_anydelete_authorized_by = XML::LibXML::XPathExpression->new('dnx:key[@id="linkingAgentIdentifierValue1"]/text()');
  my $compiled_xpath_anydelete_reason = XML::LibXML::XPathExpression->new('dnx:key[@id="eventOutcomeDetail1"]/text()');

  ############################################################
  sub get_title ($xp, $dmd){
    # get title
    my $title = $xp->findvalue($compiled_xpath_titles, $dmd);
    return $title;
  }

  sub get_dcrecords_ref ($xp, $dmd){
    my @dcnodes = $xp->findnodes($compiled_xpath_dcrecords, $dmd);
    my @dcrecords = map {
        $_->[1]=~ tr/\n/ /;
        $_->[1] =~ s/'/\\'/g;
      $_;
    } grep {
      defined $_->[0] &&
      defined $_->[1];
    } map {
      # array ref
      [$_->getName, $_->textContent];
    } @dcnodes;
     return \@dcrecords;
  }

  sub get_repid ($xp){
    my $repid;
    my @repnodes = $xp->findnodes($compiled_xpath_repid); #/mets:mets/mets:amdSec/@ID
    my $found = scalar @repnodes;
    if (1 == $found) {
      $repid = $repnodes[0]->nodeValue;
    } elsif ($found > 1) { #multiple representations found
      # choose reppid with LZA, LZA_INTERN or LOCAL (for very old IEs)
      $repid = $xp->findvalue( $compiled_xpath_repid_old);
    }
    # if $found == 0 do nothing
    return $repid;
  }
  sub get_filepids_ref ($xp, $filegrp) {
    my @files_nodes = $xp->findnodes($compiled_xpath_filepids, $filegrp);
    my @filepids =  map { $_->nodeValue; } @files_nodes;
    return \@filepids;
  }

  sub get_files_ref ($xp, $filegrp){
    my @files_nodes = $xp->findnodes($compiled_xpath_files, $filegrp);
    my @files =  map { $_->nodeValue; } @files_nodes;
    return \@files;
  }

  sub get_file_path($xp, $filegrp, $filepid) {
    return $xp->findvalue("./mets:file[\@ID=\"$filepid\"]/mets:FLocat/\@xlin:href", $filegrp);
  }
  sub get_file_path_rx($xml, $filepid) { # this method is around 2.5x slower than get_file_path
    my $nt = qr{[^>]*};
    if ($xml=~m{(?:<mets:file${nt}ID="$filepid"$nt>\s*<mets:FLocat${nt}xlin:href=")([^"]*)}s) {
      #warn "path '$1'\n";
      return $1;
    }
  }

  sub get_filesize ($xp, $filepid) {
    my $xpath =<<"XPATH";
        /mets:mets/mets:amdSec[\@ID='$filepid-amd']/mets:techMD[\@ID='$filepid-amd-tech']
        /mets:mdWrap/mets:xmlData/*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='dnx']
      /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='section' and \@id='generalFileCharacteristics']
      /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='record']
      /*[namespace-uri()='http://www.exlibrisgroup.com/dps/dnx' and local-name()='key' and \@id='fileSizeBytes']/text()
XPATH
    return $xp->findvalue($xpath);
  }
  sub get_filesize_rx($xml, $filepid) {
    #say "size -";
    if ($xml=~m{(?:<mets:techMD\s*ID="$filepid-amd-tech">.*?<key id="fileSizeBytes">)([^<]*)}s) {
      #say "size $1";
      return $1;
    }
  }

  sub get_anydelete_state($xp, $dnx) {
    # we need to earch for eventIdentifierValue 272 or 274.
    # the eventDescription should be
    #   a) IE has been deleted
    #   b) IE has been purged
    my @del_nodes = map {$_->nodeValue} $xp->findnodes($compiled_xpath_anydelete_event, $dnx);
    my @states = map {$_ =~ s/^IE has been (deleted|purged|recovered)$/$1/; $_}  @del_nodes;
    return $states[0];
  }

  sub has_anydelete_entries($xp, $dnx) {
    # we need to earch for eventIdentifierValue 272 or 274.
    # the eventDescription should be
    #   a) IE has been deleted
    #   b) IE has been purged
    return $xp->exists($compiled_xpath_anydelete_event, $dnx);
  }

  sub get_anydelete_date($xp, $dnx) {
    my @date_nodes = $xp->findnodes($compiled_xpath_anydelete_date, $dnx);
    my @dates = map {$_->nodeValue} @date_nodes;
    return $dates[0];
  }

sub get_anydelete_note($xp, $dnx) {
  my @entries_str = map {$_->nodeValue} $xp->findnodes($compiled_xpath_anydelete_reason, $dnx);
  my @results;
  foreach my $e ( @entries_str) {
    my @entries = split(/;/, $e);
    push @results, List::Util::first{ s/note=//; } @entries
  }
  return $results[0];
}

  sub get_anydelete_reason($xp, $dnx) {
    my @entries_str = map {$_->nodeValue} $xp->findnodes($compiled_xpath_anydelete_reason, $dnx);
    my @results;
    foreach my $e ( @entries_str) {
      my @entries = split(/;/, $e);
      push @results, List::Util::first{ s/reason=//; } @entries
    }
    return $results[0];
  }

  sub get_anydelete_authorized_by($xp, $dnx) {
    my @auth = map {$_->nodeValue} $xp->findnodes($compiled_xpath_anydelete_authorized_by, $dnx);
    return $auth[0];
  }

sub check_if_file_is_readable($filename) {
  # workaround to check if NFS read errors
  my $ok = open(my $FH, '<', $filename);
  if (!$ok) {
    warn "file '$filename' could not be opened, $!\n";
    return;
  }
  binmode($FH);
  my $buf;
  my $readc = sysread $FH, $buf, 4;
  unless (4 == $readc) {
    warn "file '$filename' could not read 4 bytes from (beginning file), $!\n";
    return;
  }
  my $pos = sysseek $FH, -4, 2;
  unless (defined $pos) {
    warn "file '$filename' could not seek 4 bytes before end, $!\n";
    return;
  }
  unless (4 == $readc) {
    warn "file '$filename' could not read 4 bytes from (end file), $!\n";
    return;
  }
  close $FH;
  return 1;
}

sub _handle_deleted_or_purged_aips($xp, $ret) {
  my $amdsec_ie = $xp->findnodes($compiled_xpath_amdSec_ie)->[0];
  my @ctxs = $xp->findnodes($xpath_if_anydelete_event_expr, $amdsec_ie);
  my $cmp_by_date_and_state = sub ($a, $b) {
    my $adate = get_anydelete_date($xp, $a);
    my $bdate = get_anydelete_date($xp, $b);
    my $dateres = $bdate cmp $adate;
    if (0 == $dateres) {
      my $astate = get_anydelete_state($xp, $a);
      my $bstate = get_anydelete_state($xp, $b);
      return $bstate cmp $astate # "purged" > "delete"
    }
    return $dateres; # newer > older
  };
  my $latest_ctx = (sort { $cmp_by_date_and_state->($a, $b) } @ctxs)[0];
  # check if latest entry is either deleted or purged, only than add entry
  my $is_deleted_or_purged = has_anydelete_entries($xp, $latest_ctx);
  if ($is_deleted_or_purged) {
      my $state = get_anydelete_state($xp, $latest_ctx);
      $ret->{$state}->{state} = $state;
      $ret->{$state}->{reason} = get_anydelete_reason($xp, $latest_ctx);
      $ret->{$state}->{note} = get_anydelete_note($xp, $latest_ctx);
      $ret->{$state}->{date} = get_anydelete_date($xp, $latest_ctx);
      $ret->{$state}->{authorized_by} = get_anydelete_authorized_by($xp, $latest_ctx);
    }
  return $is_deleted_or_purged;
}

sub parse_iexml($filename, $recovery_flag) {
  my $retry = 3;
  while ($retry > 0) {

    if (!check_if_file_is_readable($filename)) {
      $retry--;
      my $wait = int(rand(60));
      warn "detected non-readable file $filename, retrying after $wait seconds ... ($retry)";
      sleep $wait;
      next;
    }

    if ($recovery_flag) {
      $recovery_flag = 2; # avoid warnings, see XML::LibXML::Parser POD about 'recovery'
    }
    my $parser = get_parser($recovery_flag);
    my $dom = $parser->parse_file($filename);
    #my $slurp = path($filename)->slurp;
    my $xp = get_xpath_context();
    $xp->setContextNode($dom);
    my $dmdsec = $xp->findnodes($compiled_xpath_dmdSec)->[0];
    #my $amdsec_rep = $xp->findnodes($compiled_xpath_amdSec_rep)->[0];

    my $filesec = $xp->findnodes($compiled_xpath_fileSec)->[0];
    my $ret;
    $ret->{"filename"} = $filename;
    ############################################
    # get title
    $ret->{"title"} = get_title($xp, $dmdsec);
    ############################################
    # get dc-records
    $ret->{"dcrecords"} = get_dcrecords_ref($xp, $dmdsec);
    ############################################
    my $is_deleted = _handle_deleted_or_purged_aips($xp, $ret);
    ############################################
    # get right representation ID (has a dnx-section with <key id=label>LOCAL</key>)
    my $repid = get_repid($xp);
    if (!defined $repid) {
      say STDERR "No repid found in file $filename, is IE intentionally purged?";
      if ($is_deleted) {
        say STDERR "   Yes, a corresponding purge/delete/recover event is found.";
      }
      else {
        say STDERR "   No, a corresponding purge/delete/recover event was missed. This indicates an error in archive.";
      }
      return $ret;
    }

    $ret->{"repid"} = $repid;

    ############################################
    # get all files of LOCAL representation
    my $filegrp = $xp->findnodes("mets:fileGrp[\@ADMID='$repid']", $filesec)->[0];
    $ret->{"filepids"} = get_filepids_ref($xp, $filegrp);
    $ret->{"files"} = get_files_ref($xp, $filegrp);
    my @locations = map {get_file_path($xp, $filegrp, $_)} @{$ret->{"filepids"}};
    my @sizes = map {get_filesize($xp, $_)} @{$ret->{"filepids"}};
    my %ls = map { $_->[0] => $_->[1] } zip \@locations, \@sizes;
    $ret->{"sizes"} = \%ls;
    #use Data::Printer; p($ret);

    return $ret;
  }
  return;
}


# returns count of subdirs of $dir
sub searching_relevant_subdirs ($dir) {
  my $first_two_levels_of_dirs = 0;
  my $rx_dir = qr{^\Q$dir\E/?};
  ###
  my $wanted_twolevel_dircount = sub {
    if (/^\.[A-Za-z0-9]/) { # ignore dot-dirs
      $File::Find::prune = 1;
    } else {
      my $relpath = $File::Find::name;
      $relpath =~ s{$rx_dir}{};
      my $depth = File::Spec->splitdir($relpath);

      $depth >= 2
          and $File::Find::prune = 1;
      if (-d $_) {$first_two_levels_of_dirs++;}
    }
  };
  ###
  find($wanted_twolevel_dircount, $dir);
  return $first_two_levels_of_dirs;
}

sub searching_relevant_ie_files ($dir, $tmp_ies_unsorted_file, $first_two_levels_of_dirs) {
  my $progressbar = Time::Progress->new(min => 0, max => $first_two_levels_of_dirs, smoothing => 1);
  my $rx_dir = qr{^\Q$dir\E/?};
  ###
  my $dircount = 0;

  my $wanted_process_sip = sub {
    if (-f && m/V(\d+)-IE\d+\.xml$/) {
      my $version = $1;
      my $file = $File::Find::name;
      # fake name to ue alphabetical sort
      my $fakeversion = sprintf("%05i", $version);
      $file =~ s/V\d+-IE/V$fakeversion-IE/;
      $tmp_ies_unsorted_file->append($file . "\n");
      $dircount++;
      $File::Find::prune = 1;
    }
    elsif (-d) {
      if (/^\.[A-Za-z0-9]/) { # ignore dot-dirs
        $File::Find::prune = 1;
      } else {
        my $relpath = $File::Find::name;
        $relpath =~ s{$rx_dir}{};
        my $depth = File::Spec->splitdir($relpath);
        if ($depth <= 2) {
          print $progressbar->report("find IE files:        %40b  ETA: %E   \r", $dircount);
        }
      }
    }
    return;
  };
  ###
  find($wanted_process_sip, $dir);
  return $dircount;
}

sub searching_ie_files ($dirs_ref, $tmp_ies_unsorted_file) {
  my $cnt_unsorted_files = 0;
  my $pass = 0;
  my @dirs = @{ $dirs_ref };
  my $maxpass=scalar @dirs;
  foreach my $dir (@dirs) {
    $pass++;
    my $begin_time = time;
    my $first_two_levels_of_dirs = searching_relevant_subdirs($dir);
    my $count = searching_relevant_ie_files($dir, $tmp_ies_unsorted_file, $first_two_levels_of_dirs);
    my $end_time = time;
    my $diff_time = $end_time - $begin_time;
    say "found $count IEs in pass $pass/$maxpass (dir '$dir') in $diff_time s                    ";
    $cnt_unsorted_files+=$count;
  }
  say "\r                                                                                                             ";
  return $cnt_unsorted_files;
}

###############################################################################
###############################################################################
############# main ############################################################
###############################################################################
###############################################################################
my $tmp_ies_dir = Path::Tiny->tempdir(TEMPLATE => "exitstrategy_XXXXXXXXXXX", CLEANUP => 1);
my $tmp_ies_unsorted_file = $tmp_ies_dir->child("unsorted_ies");
my $flag_recovery = undef;
my $flag_sqldump = undef;
my $flag_continue = undef;
my $db_filename = $db_name . ".db";
my $debug_to_file = undef;
my @ARGV_tail;
GetOptions(
    "help|?"          => sub {
      say <<"HELP";
call $0 with following options
        --help ............... this help
        --recovery ........... set special recovery flag, if set deleted IEs will be ignored (handle with care!)
        --continue ........... tries to add IEs to existing database, ignores IEs which already exists in DB (dangerous!)
        --sqlitedb-file=FILE . set database to file FILE
        --enable-sqldump ..... dumps a given database as SQL to STDOUT
        --debug-to-file=FILE . dumps internal structure as YAML to FILE

        create an exit-DB:
        $0 [--recovery] [--sqlitedb-file=FILE] DIR [DIR…]

        dump an exit-DB as SQL
        $0 [--sqlitedb-file=FILE] --enable-sqldump
HELP
      exit;
    },
    "continue"        => \$flag_continue,
    "recovery"        => \$flag_recovery,
    "sqlitedb-file=s" => \$db_filename,
    "enable-sqldump"  => \$flag_sqldump,
    "debug-to-file=s" => \$debug_to_file,
    '<>'              => sub {push @ARGV_tail, @_;}
) or pod2usage(2);

if (defined $flag_recovery) {warn "recovery enabled for XML processing\n";}
if (defined $flag_sqldump) {
  open my $cmd, '|-', '/usr/bin/sqlite3', $db_filename, '.dump' or die "Failed to dump DB: $!\n";
  close $cmd or die "Failed to dumb DB: $!\n";
  exit;
}
if ($#ARGV_tail < 0) {
  die "you need at least a directory as argument\n";
}
my $old_stdout = select(STDOUT);
$| = 1;
select($old_stdout);

if (
    (! defined $flag_continue)
        || (! -e $db_filename)
) {
  say "preparing SQL";
  my $dbh = DBI->connect("dbi:SQLite:dbname=$db_filename", "", "", {
      RaiseError     => 1,
      sqlite_unicode => 1,
  }) or die "could not connect to database (file '$db_filename')", $DBI::errstr;
  write_database_creation($dbh);
  write_tables_creation($dbh);
  write_prepare_insert($dbh);
  write_index_creation($dbh);
  $dbh->disconnect or warn("disconnecting problems, ", $dbh->errstr);
} else {
  say "using existing DB (continue mode!)";
}
if (0==@ARGV_tail ){
    die "no directory given on commandline"
}
my @dirs;
while (@ARGV_tail > 0) {
  my $dir = shift @ARGV_tail;
  if (defined $dir && -d "$dir") {
    push @dirs, $dir;
  }
}
$tmp_ies_unsorted_file->touch();
say "searching IE files";
my $cnt_unsorted_files = searching_ie_files(\@dirs, $tmp_ies_unsorted_file);
# /permanent_storage/2020/04/02/IE201080/V1-FL201091.xml
# /permanent_storage/2020/04/02/IE201080/V2-FL201091.xml
my $fh_unsorted_IEs = $tmp_ies_unsorted_file->openr();
my $count = 0;

my $progressbar = Time::Progress->new(min => 1, max => $cnt_unsorted_files);
my $dbh = DBI->connect("dbi:SQLite:dbname=$db_filename", "", "", {
    RaiseError     => 1,
    sqlite_unicode => 1,
}) or die "could not connect to database (file '$db_filename')", $DBI::errstr;
my $begin_time = time;
my $plans = prepare_addsql( $dbh);
while (<$fh_unsorted_IEs>) {
  chomp;
  print $progressbar->report("parse IE files:       %40b  ETA: %E   \r", $count++);
  s/V0*(\d+-IE)/V$1/; # revert fake version
  my $ret = parse_iexml($_, $flag_recovery);
  if (defined $debug_to_file and length($debug_to_file)>2) {
    require YAML::XS;
    YAML::XS::DumpFile($debug_to_file, $ret) or warn "could not dump to file $debug_to_file";
  }
  $dbh->begin_work;
  eval {
    write_addsql($dbh, $plans, $ret);
  };
  if ($@) {
    $dbh->rollback;
    say "Rollback, because $@";
  } else {
    $dbh->commit;
  }
}

$dbh->disconnect or warn("disconnecting problems, ", $dbh->errstr);
my $end_time = time;
my $diff_time = $end_time - $begin_time;
say "\rprocessed $count uniq IEs in $diff_time s                                                                      ";
say "";

1;