Select Git revision
validate_workflow.sh
validate_workflow.sh 18.88 KiB
#!/usr/bin/env bash
# Author: Andreas Romeyke
# SLUB Dresden, Department Longterm Preservation
# copyright 2023, licensed under terms of GNU General Public License 3.0,
# see file LICENSE.txt for details.
### META
# AUTHORS:
# - Andreas Romeyke (<Andreas.Romeyke@slub-dresden.de>)
# HINT for developers
# - indent code with 4 spaces
# - use UTF8 without BOM
# - use 'newline' as line ending
# in IntelliJ idea the settings are:
# Indent case statements
# Use Unix line separators
# Tab size: 4
# Indent: 4
# Shfmt formatter: ~/.local/share/JetBrains/IdeaIC2022.1/Shell Script/shfmt
# in vim use:
# set tabstop=4
# set shiftwidth=4
# set expandtab
#hh A cli tool which uses different validators to validate SLUB workflows.
#hh
#hh Usage: validate_workflow.sh [-h] | [-s] | -w <DIR> -r <DIR> [-d] [...]
#hh
#hh Options:
#hh
#hh -h, --help
#hh help output
#hh -w, --watch-folder <DIR>
#hh watches folder for files which should be evaluated
#hh -r, --result-folder <DIR>
#hh target folder to store validation results
#hh -f, --files-mode [sort|delete|nothing]
#hh mode=sort sorts files to valid- and invalid-folder,
#hh mode=delete deletes already checked files from watch-folder
#hh mode=nothing leaves files in watch-folder untouched
#hh The mode=nothing is default.
#hh -v, --valid-folder <DIR>
#hh only needed if files-mode=sort, moves valid files from
#hh watch-folder to valid-folder
#hh -i, --invalid-folder <DIR>
#hh only needed if files-mode=sort, moves invalid files from
#hh watch-folder to invalid folder
#hh -s, --statistics
#hh print a statistic
#hh -m, --mode [auto|mediathek|fotothek|save|ddz|digas]
#hh the mode 'auto' tries to check files based on file mime-types.
#hh The other modes are actual workflow names.
#hh -d, --daemon
#hh starts a daemon, works only with --files-mode=delete or --files-mode=nothing
#hh -t, --stage [current|upcoming|any]
#hh validates with current or upcoming profile/validator
#hh or any if any is valid
#hh -p, --pipe
#hh validates a single filestream from STDIN, writes result to STDOUT
#hh no daemon, no folder nor filemode params needed
#hh
#hh
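# expected programs:
# file, find, flock, sed, awk, tail, wc, cut, mktemp and the validators
# configured below (mediaconch, checkit_tiff_current, checkit_tiff_upcoming, iccDumpProfile)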
# DEFAULTS
# Feature switches (0 = off, 1 = on), changed by get_cli_args.
WITH_DAEMON=0
WITH_DEBUG=0
WITH_PIPE=0
# Selectors, see --stage, --mode and --files-mode in the help text.
STAGE="any"
MODE="auto"
FILES_MODE="nothing"
# Statistics file and the lock file guarding access to it.
STATFILE="${HOME}/.cache/validate_workflows/statistics.cnt"
LOCKFILE="/var/lock/validate_workflows.lock"
# Folders stay empty until they are given on the command line.
WATCH_FOLDER=
RESULT_FOLDER=
VALID_FOLDER=
INVALID_FOLDER=
# When the statistics file has more than MAX_STAT_LINES lines it gets
# trimmed to its last MIN_STAT_LINES lines.
MAX_STAT_LINES=100000
MIN_STAT_LINES=10000
# PREDEFINED VALIDATORS
# Lookup table validators[workflow][filetype][stage], flattened into one key:
#   workflow has max 11 chars
#   filetype has max 4 chars
#   stage    has max 9 chars
# Each part is right-aligned and padded on the left with '_'.
# Each validator should return true if the file was valid. The token FILE is
# the placeholder for the name of the file to check.
declare -A validators=(
    [__mediathek_mka__current]="/usr/bin/mediaconch -ft -p /usr/local/etc/SLUB_mediaconch_policy_all.xml FILE"
    [__mediathek_mka_upcoming]="/usr/bin/mediaconch -ft -p /usr/local/etc/SLUB_mediaconch_policy_all.xml FILE"
    [_______save_mkv__current]="/usr/bin/mediaconch -ft -p /usr/local/etc/SLUB_mediaconch_policy_all.xml FILE"
    [_______save_mkv_upcoming]="/usr/bin/mediaconch -ft -p /usr/local/etc/SLUB_mediaconch_policy_all.xml FILE"
    [_______save_mka__current]="/usr/bin/mediaconch -ft -p /usr/local/etc/SLUB_mediaconch_policy_all.xml FILE"
    [_______save_mka_upcoming]="/usr/bin/mediaconch -ft -p /usr/local/etc/SLUB_mediaconch_policy_all.xml FILE"
    [________ddz_tif__current]="/usr/bin/checkit_tiff_current /usr/local/etc/cit_tiff6_baseline_SLUB_current.cfg FILE"
    [________ddz_tif_upcoming]="/usr/bin/checkit_tiff_upcoming /usr/local/etc/cit_tiff6_baseline_SLUB_upcoming.cfg FILE"
    [______digas_tif__current]="/usr/bin/checkit_tiff_current /usr/local/etc/cit_tiff6_geotiff_SLUB_current.cfg FILE"
    [______digas_tif_upcoming]="/usr/bin/checkit_tiff_upcoming /usr/local/etc/cit_tiff6_geotiff_SLUB_upcoming.cfg FILE"
    [___fotothek_tif__current]="/usr/bin/checkit_tiff_current /usr/local/etc/cit_tiff_retrofotos_SLUB_current.cfg FILE"
    [___fotothek_tif_upcoming]="/usr/bin/checkit_tiff_upcoming /usr/local/etc/cit_tiff_retrofotos_SLUB_upcoming.cfg FILE"
    [________ddz_icc__current]="/usr/local/bin/iccDumpProfile -v FILE"
    [________ddz_icc_upcoming]="/usr/local/bin/iccDumpProfile -v FILE"
)
# Abort on the first failing command and treat unset variables as an error.
set -o errexit -o nounset
# Aborts the script unless the calling function got the expected number of
# params.
# Arguments: $1 - number of params the caller received (its $#)
#            $2 - number of params the caller expects
check_argument_count() {
    local got="$1"
    local want="$2"
    # nothing to report if the counts agree
    [[ "${got}" -ne "${want}" ]] || return 0
    error "called function ${FUNCNAME[1]} expected ${want} params, but got ${got} by caller function ${FUNCNAME[2]} (line ${BASH_LINENO[2]})"
    exit 1
}
# Aborts the script if the given value is the empty string.
# Arguments: $1 - value to check
check_argument_notempty() {
    local value="$1"
    if [[ -n "${value}" ]]; then
        return 0
    fi
    error "called function ${FUNCNAME[1]} expected non-empty params, but some are empty set in line ${BASH_LINENO[0]}, eventually taken from caller function ${FUNCNAME[1]} (line ${BASH_LINENO[1]})"
    exit 1
}
# Prints the help text: every line of this script starting with the help
# marker, with the marker and one optional following blank removed.
# Don't just call this function "help()", as that's a reserved command in Bash.
comment_help() {
    # the 'p' flag prints only those lines on which the substitution happened
    sed -rn 's/^#hh ?//p' "$0"
}
# Reads the statistics file and prints "<total> <invalid>" on one line.
# Each record has the form date,is_valid,duration,ftype,workflow,stage where
# is_valid is the exit code of the validator (0 means valid). Records with
# fewer than two fields are ignored. A missing or unreadable statistics file
# counts as "no records".
calc_statistics() {
    if [[ ! -r "${STATFILE}" ]]; then
        echo "0 0"
        return 0
    fi
    # field 2 is the validator exit code, everything but 0 is an invalid file;
    # "+ 0" forces numeric output even if no record was seen
    flock -x "${LOCKFILE}" cat "${STATFILE}" \
        | awk -F "," 'NF < 2 { next } { total++ } $2 != 0 { cnt_invalid++ } END { print total + 0, cnt_invalid + 0 }'
}
# Prints a summary of valid and invalid files and the ratio of valid files
# in percent (0% if nothing was validated yet).
print_statistics() {
    local stat
    local cnt_total
    local cnt_valid
    local cnt_invalid
    local ratio
    stat=$(calc_statistics)
    read -r cnt_total cnt_invalid <<< "${stat}"
    cnt_valid=$((cnt_total - cnt_invalid))
    # avoid a division by zero on an empty statistics file
    if ((cnt_total > 0)); then
        ratio=$((100 * cnt_valid / cnt_total))
    else
        ratio=0
    fi
    echo "Validation Statistics"
    echo "valid files: ${cnt_valid}"
    echo "invalid files: ${cnt_invalid}"
    echo "ratio: ${ratio}% valid"
}
# Appends one record "date,is_valid,duration,ftype,workflow,stage" to the
# statistics file while holding an exclusive lock on LOCKFILE.
# Arguments: $1 - exit code of the validator (0 means valid)
#            $2 - duration of the validation in seconds
#            $3 - file type
#            $4 - workflow
#            $5 - stage
update_statistics() {
    check_argument_count $# 5
    local is_valid="$1"
    local duration="$2"
    local ftype="$3"
    local workflow="$4"
    local stage="$5"
    check_argument_notempty "${is_valid}"
    check_argument_notempty "${duration}"
    check_argument_notempty "${ftype}"
    check_argument_notempty "${workflow}"
    check_argument_notempty "${stage}"
    local timestamp
    timestamp=$(date +"%F%T")
    debug "date=${timestamp}"
    # Open the statistics file only after the lock is held. A redirection on
    # the flock call itself would be opened by the calling shell before the
    # lock is taken, so a concurrent trim could replace the file in between
    # and the record would be written to the old, already replaced file.
    # shellcheck disable=SC2016
    flock -x "${LOCKFILE}" sh -c 'printf "%s\n" "$1" >> "$2"' sh \
        "${timestamp},${is_valid},${duration},${ftype},${workflow},${stage}" "${STATFILE}"
}
# Shrinks the statistics file to its last MIN_STAT_LINES lines. The trimmed
# copy is written to "${STATFILE}.new" first and then moved over the original.
# If the copy cannot be written the original stays untouched and the script
# is aborted.
trim_statistics() {
    debug "trim_statistics"
    if ! tail -n "${MIN_STAT_LINES}" "${STATFILE}" > "${STATFILE}.new"; then
        # do not leave a partial copy behind and never move it over the original
        rm -f "${STATFILE}.new"
        error "could not trim ${STATFILE} to ${STATFILE}.new"
        exit 1
    fi
    if ! mv "${STATFILE}.new" "${STATFILE}"; then
        error "could not trim ${STATFILE}, because could not mv ${STATFILE}.new to ${STATFILE}"
        exit 1
    fi
}
# Writes a debug message to STDERR, but only if WITH_DEBUG is 1.
# Arguments: $1 - message
debug() {
    [[ "${WITH_DEBUG}" -eq 1 ]] || return 0
    printf 'DEBUG: %s\n' "$1" >&2
}
# Writes a warning to STDERR.
# Arguments: $1 - message
warn() {
    printf 'WARN: %s\n' "$1" >&2
}
# Writes an error message to STDERR and terminates with exit code 1.
# Arguments: $1 - message
error() {
    printf 'ERROR: %s\n' "$1" >&2
    exit 1
}
# Prints the mime-type of a file as reported by file(1).
# Arguments: $1 - name of the file
get_mimetype() {
    check_argument_count $# 1
    local filename="$1"
    local mime
    check_argument_notempty "${filename}"
    # file prints "<name>: <mime-type>", keep only the part after the last ": "
    mime=$(file --mime-type "${filename}" | sed -e 's/^.*: //')
    printf '%s\n' "${mime}"
}
# Helper of get_cli_args: aborts if the option in $1 is not followed by a value.
# Arguments: the remaining command line, starting with the option itself
_cli_require_value() {
    if [[ $# -lt 2 ]]; then
        error "param '$1' requires a value. Please, give '$(basename "$0") --help' a chance!"
        exit 1
    fi
}
# Parses and validates the command line, prepares the statistics cache dir
# and trims the statistics file if it has more than MAX_STAT_LINES lines.
# --help and --statistics are executed as soon as they are seen and end the
# script with exit code 0.
# Globals written: WITH_DEBUG, WITH_DAEMON, WITH_PIPE, WATCH_FOLDER,
#                  RESULT_FOLDER, VALID_FOLDER, INVALID_FOLDER, MODE, STAGE,
#                  FILES_MODE
# Arguments: the command line of the script ("$@")
get_cli_args() {
    local lines
    local cachedir
    while [[ $# -gt 0 ]]; do
        case $1 in
            -h | --help)
                comment_help
                exit 0
                ;;
            -D | --debug)
                WITH_DEBUG=1
                shift
                ;;
            -s | --statistics)
                print_statistics
                exit 0
                ;;
            -w | --watch-folder)
                _cli_require_value "$@"
                WATCH_FOLDER="$2"
                shift 2
                ;;
            -r | --result-folder)
                _cli_require_value "$@"
                RESULT_FOLDER="$2"
                shift 2
                ;;
            -v | --valid-folder)
                _cli_require_value "$@"
                VALID_FOLDER="$2"
                shift 2
                ;;
            -i | --invalid-folder)
                _cli_require_value "$@"
                INVALID_FOLDER="$2"
                shift 2
                ;;
            -m | --mode)
                _cli_require_value "$@"
                MODE="$2"
                shift 2
                ;;
            -t | --stage)
                _cli_require_value "$@"
                STAGE="$2"
                shift 2
                ;;
            -d | --daemon)
                WITH_DAEMON=1
                shift
                ;;
            -f | --files-mode)
                _cli_require_value "$@"
                FILES_MODE="$2"
                shift 2
                ;;
            -p | --pipe)
                WITH_PIPE=1
                shift
                ;;
            *)
                error "'$1' is invalid param. Please, give '$(basename "$0") --help' a chance!"
                exit 1
                ;;
        esac
    done
    case "${FILES_MODE}" in
        sort | delete | nothing) ;;
        *)
            error "param --files-mode must be 'sort', 'delete' or 'nothing'!"
            exit 1
            ;;
    esac
    case "${MODE}" in
        auto | mediathek | fotothek | save | ddz | digas) ;;
        *)
            error "param --mode must be 'auto', 'mediathek', 'fotothek', 'save', 'ddz' or 'digas'!"
            exit 1
            ;;
    esac
    # The stage selects the validator in pipe mode as well, so it is checked
    # for every mode of operation.
    case "${STAGE}" in
        current | upcoming | any) ;;
        *)
            error "--param stage must be 'any', 'current' or 'upcoming'!"
            exit 1
            ;;
    esac
    if [[ "${WITH_PIPE}" -eq 1 ]]; then
        if
            [[ "${WITH_DAEMON}" -eq 1 ]] \
                || [[ -n "${WATCH_FOLDER}" ]] \
                || [[ -n "${RESULT_FOLDER}" ]] \
                || [[ -n "${VALID_FOLDER}" ]] \
                || [[ -n "${INVALID_FOLDER}" ]] \
                || [[ "${FILES_MODE}" = "sort" ]]
        then
            error "param --pipe not combinable with params --daemon, --result-folder, --watch-folder, --valid-folder, --invalid-folder, --files-mode"
            exit 1
        fi
    else
        if [[ "${WITH_DAEMON}" -eq 1 ]] && [[ "${FILES_MODE}" = "sort" ]]; then
            error "param --daemon does only work with param --files-mode='delete' or --files-mode='nothing'!"
            exit 1
        fi
        if [[ ! -d "${WATCH_FOLDER}" ]]; then
            error "watch folder '${WATCH_FOLDER}' does not exist!"
            exit 1
        fi
        if [[ ! -d "${RESULT_FOLDER}" ]]; then
            error "result folder '${RESULT_FOLDER}' does not exist!"
            exit 1
        fi
        if [[ "${FILES_MODE}" = "sort" ]]; then
            if [[ ! -d "${VALID_FOLDER}" ]]; then
                error "valid folder '${VALID_FOLDER}' does not exist!"
                exit 1
            fi
            if [[ ! -d "${INVALID_FOLDER}" ]]; then
                error "invalid folder '${INVALID_FOLDER}' does not exist!"
                exit 1
            fi
        fi
    fi
    cachedir=$(dirname "${STATFILE}")
    if [[ ! -d "${cachedir}" ]]; then
        mkdir -p "${cachedir}" || error "Could not create dir ${cachedir}, $?"
    fi
    if [[ -e "${STATFILE}" ]]; then
        # wc prints "<count> <filename>", keep the count only
        lines=$(flock -x "${LOCKFILE}" wc -l "${STATFILE}" | cut -d " " -f 1)
        debug "found ${lines} lines in ${STATFILE})"
        if [[ "${lines}" -gt "${MAX_STAT_LINES}" ]]; then
            # NOTE(review): if the lock is held by somebody else the subshell
            # fails and, with 'set -e' active, ends the script - confirm that
            # this is intended.
            (
                flock -n 9 || exit 1
                trim_statistics
            ) 9> "${LOCKFILE}"
        fi
    fi
}
# Prints the validator command line for a workflow, file type and stage.
# The lookup key is built from the three right-aligned parts (11, 4 and 9
# chars wide) with every blank replaced by '_'. If the validators table has
# no non-empty entry for the key, a command is printed which only echoes
# 'no validation tool detected!'.
# NOTE(review): that fallback command succeeds, so such files are treated
# like valid ones by the callers - confirm that this is intended.
# Arguments: $1 - workflow, $2 - file type, $3 - stage
prepare_cmd() {
    check_argument_count $# 3
    local mode="$1"
    local ftype="$2"
    local stage="$3"
    check_argument_notempty "${mode}"
    check_argument_notempty "${ftype}"
    check_argument_notempty "${stage}"
    local lookup_key
    local validator_cmd
    printf -v lookup_key '%11s%4s%9s' "${mode}" "${ftype}" "${stage}"
    lookup_key=${lookup_key// /_}
    check_argument_notempty "${lookup_key}"
    debug "prepare_cmd, key=${lookup_key}"
    validator_cmd=${validators[${lookup_key}]:-}
    if [[ -z "${validator_cmd}" ]]; then
        debug "no valid command found using key ${lookup_key}"
        echo "echo 'no validation tool detected!'"
        return 0
    fi
    check_argument_notempty "${validator_cmd}"
    debug "prepare_cmd, cmd=${validator_cmd}"
    echo "${validator_cmd}"
}
# Prints the short file type ("tif", "mkv" or "icc") for a mime-type.
# Unknown mime-types produce a warning and the file type "???".
# Arguments: $1 - mime-type
prepare_ftype() {
    check_argument_count $# 1
    local mimetype="$1"
    check_argument_notempty "${mimetype}"
    # stays "???" unless one of the known mime-types matches
    local detected="???"
    debug "prepare_ftype, using mimetype: ${mimetype}"
    case "${mimetype}" in
        image/tiff) detected="tif" ;;
        video/x-matroska) detected="mkv" ;;
        application/vnd.iccprofile) detected="icc" ;;
        *) warn "unknown file format with mime-type '${mimetype}'" ;;
    esac
    check_argument_notempty "${detected}"
    debug "prepare_ftype, detect ftype: ${detected}"
    echo "${detected}"
}
# Prints the best guess for the workflow of a file with the given mime-type:
# "ddz" for TIFF images and ICC profiles, "save" for Matroska videos.
# Unknown mime-types produce a warning and the workflow "???".
# The guess is returned on STDOUT only, the global MODE is left untouched.
# Arguments: $1 - mime-type
estimate_mode() {
    check_argument_count $# 1
    local mimetype="$1"
    local workflow
    check_argument_notempty "${mimetype}"
    debug "estimate_mode, using mimetype: ${mimetype}"
    case ${mimetype} in
        "image/tiff")
            workflow="ddz"
            ;;
        "video/x-matroska")
            workflow="save"
            ;;
        "application/vnd.iccprofile")
            workflow="ddz"
            ;;
        *)
            warn "workflow not detectable for mimetype ${mimetype}"
            workflow="???"
            ;;
    esac
    check_argument_notempty "${workflow}"
    debug "estimate_mode, detected mode: ${workflow}"
    echo "${workflow}"
}
# Runs a validator command, appends its STDOUT and STDERR to the log file,
# records the result in the statistics and prints the exit code of the
# validator (0 means valid).
# Arguments: $1 - command line to run (evaluated by the shell)
#            $2 - file type
#            $3 - workflow
#            $4 - stage
#            $5 - log file
exec_cmd() {
    check_argument_count $# 5
    local cmd="$1"
    local ftype="$2"
    local workflow="$3"
    local stage="$4"
    local log="$5"
    local start_t
    local stop_t
    local duration
    local is_valid=0
    check_argument_notempty "${cmd}"
    check_argument_notempty "${ftype}"
    check_argument_notempty "${workflow}"
    check_argument_notempty "${stage}"
    check_argument_notempty "${log}"
    start_t=$(date +"%s")
    debug "scan_file, calling cmd='${cmd}'"
    # Take the exit code of the validator right here, any later command
    # (like the chmod below) would overwrite $?. The redirection is applied
    # to eval itself so that the log name is never parsed as shell code.
    eval "${cmd}" >> "${log}" 2>&1 || is_valid=$?
    chmod o+w "${log}" || warn "could not make log '${log}' writable for others"
    check_argument_notempty "${is_valid}"
    stop_t=$(date +"%s")
    duration=$((stop_t - start_t))
    debug "exec_cmd, duration=${duration} is_valid=${is_valid} log=${log}"
    update_statistics "${is_valid}" "${duration}" "${ftype}" "${workflow}" "${stage}"
    echo "${is_valid}"
}
# Treats an already checked file as requested by FILES_MODE:
#   sort   - move it to VALID_FOLDER (is_valid is 0) or to INVALID_FOLDER
#   delete - remove it
#   any other mode leaves the file untouched
# Arguments: $1 - name of the checked file
#            $2 - validation result, 0 means valid
handle_input_if_requested() {
    check_argument_count $# 2
    local filename="$1"
    local is_valid="$2"
    check_argument_notempty "${filename}"
    check_argument_notempty "${is_valid}"
    debug "handle_input_if_requested, filename=${filename} is_valid=${is_valid}"
    case "${FILES_MODE}" in
        sort)
            local target="${INVALID_FOLDER}"
            local reason="invalid"
            if [[ "${is_valid}" -eq 0 ]]; then
                target="${VALID_FOLDER}"
                reason="valid"
            fi
            debug "handle_input_if_requested, mv ${filename} to ${target}, because ${reason}"
            mv "${filename}" "${target}"
            ;;
        delete)
            debug "handle_input_if_requested, rm ${filename} from watch-folder ${WATCH_FOLDER}"
            rm -f "${filename}"
            ;;
    esac
}
# Prints the name of the log file for a file to check and creates the
# directory of the log file if it does not exist yet.
# With a watch folder (and not in pipe mode) the leading WATCH_FOLDER of the
# file name is replaced by RESULT_FOLDER, in every case ".log" is appended.
# Arguments: $1 - name of the file to check
get_logfile() {
    check_argument_count $# 1
    local filename="$1"
    local logname
    local logdir
    check_argument_notempty "${filename}"
    # The prefix is compared and stripped as a literal string, so that
    # characters like '#', '.' or '&' in the folder names do no harm.
    if [[ -n "${WATCH_FOLDER}" ]] && [[ "${WITH_PIPE}" -eq 0 ]] \
        && [[ "${filename}" == "${WATCH_FOLDER}"* ]]; then
        logname="${RESULT_FOLDER}${filename#"${WATCH_FOLDER}"}.log"
    else # pipe uses a temp filename
        logname="${filename}.log"
    fi
    check_argument_notempty "${logname}"
    logdir=$(dirname "${logname}")
    check_argument_notempty "${logdir}"
    if [[ ! -d "${logdir}" ]]; then
        debug "get_logfile, mkdir ${logdir}"
        mkdir -p "${logdir}"
    fi
    debug "get_logfile, logname=${logname} (filename=${filename})"
    echo "${logname}"
}
# Validates a single file and treats it afterwards as requested by
# FILES_MODE. With MODE "auto" the workflow is guessed from the mime-type for
# this file only, the global MODE is not changed. With STAGE "any" the
# validators of the stages "upcoming" and "current" are tried in this order
# until one of them accepts the file.
# SIGINT is ignored while the validation is running.
# Arguments: $1 - name of the file to check
scan_file() {
    check_argument_count $# 1
    local filename="$1"
    local mimetype
    local ftype
    local logname
    local cmd
    local is_valid
    local workflow
    local quoted_filename
    local stage
    check_argument_notempty "${filename}"
    debug "scan_file, using filename: ${filename}"
    mimetype=$(get_mimetype "${filename}")
    ftype=$(prepare_ftype "${mimetype}")
    logname=$(get_logfile "${filename}")
    workflow="${MODE}"
    if [[ "${workflow}" = "auto" ]]; then
        # try best guess, for this file only
        workflow=$(estimate_mode "${mimetype}")
    fi
    check_argument_notempty "${mimetype}"
    check_argument_notempty "${ftype}"
    check_argument_notempty "${logname}"
    check_argument_notempty "${workflow}"
    # The command line is evaluated by the shell later on, therefore the file
    # name is inserted in shell-quoted form. This keeps names with blanks or
    # shell metacharacters in one piece and prevents them from being executed.
    printf -v quoted_filename '%q' "${filename}"
    trap "" SIGINT
    debug "scan_file, === entering protected area ==="
    if [[ "${STAGE}" = "any" ]]; then
        is_valid=1
        for stage in upcoming current; do
            cmd=$(prepare_cmd "${workflow}" "${ftype}" "${stage}")
            # the replacement is quoted to be taken literally (no '&' handling)
            cmd=${cmd/FILE/"${quoted_filename}"}
            is_valid=$(exec_cmd "${cmd}" "${ftype}" "${workflow}" "${stage}" "${logname}")
            if [[ "${is_valid}" -eq 0 ]]; then
                debug "scan_file, early break"
                break
            else
                debug "scan_file, no early break, because is_valid='${is_valid}'"
            fi
        done
        handle_input_if_requested "${filename}" "${is_valid}"
    else
        cmd=$(prepare_cmd "${workflow}" "${ftype}" "${STAGE}")
        cmd=${cmd/FILE/"${quoted_filename}"}
        is_valid=$(exec_cmd "${cmd}" "${ftype}" "${workflow}" "${STAGE}" "${logname}")
        handle_input_if_requested "${filename}" "${is_valid}"
    fi
    debug "scan_file, === leaving protected area ==="
    trap - SIGINT
    debug "---"
}
# Validates every regular file below a directory whose status and content
# were last changed more than one minute ago.
# Arguments: $1 - directory to scan
scan_dir() {
    check_argument_count $# 1
    check_argument_notempty "$1"
    local candidate
    # NUL-separated names keep file names with blanks or newlines intact
    find "$1" -type f -cmin +1 -mmin +1 -print0 \
        | while IFS= read -r -d '' candidate; do
            scan_file "${candidate}"
        done
}
#### MAIN
get_cli_args "$@"
#trap signalhandler SIGINT SIGABRT #sigint
if [[ "${WITH_PIPE}" -eq 1 ]]; then
    # cli mode, use stdin: the stream is buffered in a temp file, validated
    # and the log of the validator is written to STDOUT
    debug "checking stream"
    filename=$(mktemp --tmpdir validate_wrg.XXXX)
    # Remove the temp files on every exit path, also if the validation
    # aborts the script. The names are expanded when the trap fires.
    trap 'rm -f -- "${filename}" "${filename}.log"' EXIT
    cat - > "${filename}"
    scan_file "${filename}"
    cat "${filename}.log"
else
    if [[ "${WITH_DAEMON}" -eq 1 ]]; then
        # TODO: protect DAEMON from STRG-C for clean shutdown
        # echo daemon mode, use inotify to watch changes
        debug "starting daemon"
        while true; do
            scan_dir "${WATCH_FOLDER}" # to clean up existing files
            sleep 10
        done
        # /usr/bin/inotifywait --monitor --recursive --event create \
        # --event attrib --event moved_to --format "%w%f" "${WATCH_FOLDER}" \
        # | while read -r filename; do
        # debug "called inotifywait using /usr/bin/inotifywait --monitor
        # --recursive --event create --event attrib --event moved_to --format '%f' ${WATCH_FOLDER}"
        # scan_file "${filename}"
        # done
        # not reached as long as the loop above runs forever
        debug "stopping daemon"
    else
        # cli mode, scan watch folder once
        debug "checking dir ${WATCH_FOLDER}"
        scan_dir "${WATCH_FOLDER}"
    fi
fi