Select Git revision
validate_workflow.sh 14.81 KiB
#!/usr/bin/env bash
### META
# AUTHORS:
# - Andreas Romeyke (<Andreas.Romeyke@slub-dresden.de>)
# HINT for developers
# - indent code with 4 spaces
# - use UTF8 without BOM
# - use 'newline' as line ending
# in IntelliJ idea the settings are:
# Indent case statements
# Use Unix line separators
# Tab size: 4
# Indent: 4
# Shfmt formatter: ~/.local/share/JetBrains/IdeaIC2022.1/Shell Script/shfmt
# in vim use:
# set tabstop=4
# set shiftwidth=4
#hh A cli tool which uses different validators to validate SLUB workflows.
#hh
#hh Usage: validate_workflow.sh [-h] | [-s] | -w watchfolder -r resultfolder [-d] [...]
#hh
#hh Options:
#hh
#hh -h, --help
#hh help output
#hh -w, --watch-folder <DIR>
#hh watches folder for files which should be evaluated
#hh -r, --result-folder <DIR>
#hh target folder to store validation results
#hh -f, --files-mode [sort|delete|nothing]
#hh mode=sort sorts files to valid- and invalid-folder,
#hh mode=delete deletes already checked files from watch-folder
#hh mode=nothing files in watch-folder remain untouched
#hh The mode=nothing is default.
#hh -v, --valid-folder <DIR>
#hh only needed if files-mode=sort, moves valid files from
#hh watch-folder to valid-folder
#hh -i, --invalid-folder <DIR>
#hh only needed if files-mode=sort, moves invalid files from
#hh watch-folder to invalid folder
#hh -s, --statistics
#hh print a statistic
#hh -m, --mode [auto, mediathek, fotothek, save, kitodo, lfulg]
#hh the mode 'auto' tries to check files based on file mime-types.
#hh The other modes are actual workflow names.
#hh -d, --daemon
#hh starts a daemon, works only in --files-mode=delete
#hh -t, --stage [current,upcoming,any]
#hh validates with current or upcoming profile/validator
#hh or any if any is valid
#hh -p, --pipe
#hh validates a single filestream from STDIN, writes result to STDOUT
#hh no daemon, no folder nor filemode params needed
#hh
#hh
# expected programs:
# file, ...
# DEFAULTS
# Feature switches (0 = off, 1 = on); the command line parser may flip them.
WITH_DAEMON=0
WITH_DEBUG=0
WITH_PIPE=0

# Validation behaviour: profile stage, workflow and what happens to checked files.
STAGE="any"
MODE="auto"
FILES_MODE="nothing"

# Folders; empty until they are given on the command line.
WATCH_FOLDER=""
RESULT_FOLDER=""
VALID_FOLDER=""
INVALID_FOLDER=""

# Statistics bookkeeping: one record per validator run is appended to STATFILE,
# LOCKFILE serialises access to it. Once STATFILE holds more than
# MAX_STAT_LINES lines it is cut back to its last MIN_STAT_LINES lines.
STATFILE="${HOME}/.cache/validate_workflows/statistics.cnt"
LOCKFILE="/var/lock/validate_workflows.lock"
MAX_STAT_LINES=100000
MIN_STAT_LINES=10000
# PREDEFINED VALIDATORS
# Lookup table from "workflow + filetype + stage" to the validator command line.
# The key is the concatenation of three right-aligned, '_'-padded fields:
#   workflow (11 chars) | filetype (4 chars) | stage (9 chars)
# The literal word FILE in a command is the placeholder for the file to check.
# Each validator should return true (exit status 0) if the file was valid.
# NOTE(review): the save/mka "upcoming" entry points at the *_current.xml
# profile - possibly a copy/paste slip; confirm against /etc/mediaconch.
declare -A validators=(
    [__mediathek_mka__current]="/usr/bin/mediaconch -ft -p /etc/mediaconch/mediathek_retroaudio.xml FILE"
    [__mediathek_mka_upcoming]="/usr/bin/mediaconch -ft -p /etc/mediaconch/mediathek_retroaudio.xml FILE"
    [_______save_mkv__current]="/usr/bin/mediaconch -ft -p /etc/mediaconch/save_retrovideofilm_current.xml FILE"
    [_______save_mkv_upcoming]="/usr/bin/mediaconch -ft -p /etc/mediaconch/save_retrovideofilm_upcoming.xml FILE"
    [_______save_mka__current]="/usr/bin/mediaconch -ft -p /etc/mediaconch/save_retroaudio_current.xml FILE"
    [_______save_mka_upcoming]="/usr/bin/mediaconch -ft -p /etc/mediaconch/save_retroaudio_current.xml FILE"
    [_____kitodo_tif__current]="/usr/bin/checkit_tiff_current /etc/checkit_tiff/retromono_current FILE"
    [_____kitodo_tif_upcoming]="/usr/bin/checkit_tiff_upcoming /etc/checkit_tiff/retromono_upcoming FILE"
    [______lfulg_tif__current]="/usr/bin/checkit_tiff_current /etc/checkit_tiff/retrogeomono_current FILE"
    [______lfulg_tif_upcoming]="/usr/bin/checkit_tiff_upcoming /etc/checkit_tiff/retrogeomono_upcoming FILE"
    [___fotothek_tif__current]="/usr/bin/checkit_tiff_current /etc/checkit_tiff/retrofoto_current FILE"
    [___fotothek_tif_upcoming]="/usr/bin/checkit_tiff_upcoming /etc/checkit_tiff/retrofoto_upcoming FILE"
)

# From here on, expanding an unset variable is an error.
set -o nounset
# Don't just call this function "help()", as that's a reserved command in Bash.
# Prints the usage text: every line of this script that starts with the help
# marker (hash, "hh", optional blank) is written to stdout without the marker.
comment_help() {
    sed -rn -e 's/^#hh ?//p' "$0"
}
# Counts the records in $STATFILE.
# Records have the form "date,is_valid,duration,ftype,workflow,stage" where
# is_valid is the validator's exit status (0 = valid).
# Globals:  STATFILE (read), LOCKFILE (lock held while reading)
# Outputs:  "<total> <invalid>" on stdout; "0 0" if there is no statistics file
calc_statistics() {
    if [ ! -r "$STATFILE" ]; then
        echo "0 0"
        return 0
    fi
    # Field 2 is the validity flag (field 3 is the duration); every record
    # with a non-zero flag counts as exactly one invalid file.
    flock -x "$LOCKFILE" cat "$STATFILE" \
        | awk -F "," 'NF >= 2 { total++; if ($2 != 0) cnt_invalid++ }
                      END { print total + 0, cnt_invalid + 0 }'
}

# Prints a human readable summary of the validation statistics to stdout.
# The ratio line is replaced by a "n/a" note while no records exist, instead
# of dividing by zero.
print_statistics() {
    local stat
    local cnt_total
    local cnt_valid
    local cnt_invalid
    local ratio
    stat=$(calc_statistics)
    # calc_statistics reports the total first, the invalid count second.
    read -r cnt_total cnt_invalid <<< "$stat"
    cnt_total=${cnt_total:-0}
    cnt_invalid=${cnt_invalid:-0}
    cnt_valid=$((cnt_total - cnt_invalid))
    echo "Validation Statistics"
    echo "valid files: $cnt_valid"
    echo "invalid files: $cnt_invalid"
    if [ "$cnt_total" -gt 0 ]; then
        ratio=$((100 * cnt_valid / cnt_total))
        echo "ratio: $ratio% valid"
    else
        echo "ratio: n/a (no files validated yet)"
    fi
}
# Appends one record "date,is_valid,duration,ftype,workflow,stage" to $STATFILE.
# Arguments: $1 - validator exit status (0 = valid)
#            $2 - duration of the validator run in seconds
#            $3 - file type, $4 - workflow, $5 - stage
# Globals:   STATFILE (appended), LOCKFILE (exclusive lock while writing)
# Returns:   non-zero if the lock could not be taken or the write failed
update_statistics() {
    local is_valid=$1
    local duration=$2
    local ftype=$3
    local workflow=$4
    local stage=$5
    local date
    date=$(date +"%F%T")
    debug "date=$date"
    # Take the lock first and open $STATFILE only afterwards: if the file is
    # replaced by a trim while we wait for the lock, the record must go into
    # the new file and not into the already unlinked one.
    (
        flock -x 9 || exit 1
        printf '%s\n' "$date,$is_valid,$duration,$ftype,$workflow,$stage" >> "$STATFILE"
    ) 9> "$LOCKFILE"
}
# Shrinks $STATFILE to its last $MIN_STAT_LINES lines.
# The shortened copy is written to "$STATFILE.new" and then moved over the
# original. If either step fails the original file is left as it is.
# Globals:  STATFILE, MIN_STAT_LINES (read)
# Returns:  0 on success, 1 on failure
trim_statistics() {
    debug "trim_statistics"
    if ! tail -n "$MIN_STAT_LINES" "$STATFILE" > "$STATFILE.new"; then
        error "could not trim $STATFILE to $STATFILE.new"
        # never move a partial copy over the real statistics
        rm -f "$STATFILE.new"
        return 1
    fi
    if ! mv "$STATFILE.new" "$STATFILE"; then
        error "could not trim $STATFILE, because could not mv $STATFILE.new to $STATFILE"
        return 1
    fi
}
# Writes "DEBUG: <message>" to stderr, but only while WITH_DEBUG is 1.
# Arguments: $1 - message
debug() {
    # stay silent (and successful) unless debugging is switched on
    [ "$WITH_DEBUG" -eq 1 ] || return 0
    printf 'DEBUG: %s\n' "$1" >&2
}
# Writes "ERROR: <message>" to stderr.
# Arguments: $1 - message
error() {
    printf 'ERROR: %s\n' "$1" >&2
}
# Prints the mime type that file(1) reports for a file.
# Arguments: $1 - path of the file
# Outputs:   the text after the last ": " of file's output line
get_mimetype() {
    local filename=$1
    local report
    report=$(file --mime-type "$filename")
    # file prints "<name>: <type>"; drop everything up to the last ": "
    printf '%s\n' "${report##*: }"
}
# Parses the command line into the global settings and validates them.
# Arguments: "$@" of the script
# Globals:   writes WITH_DEBUG, WITH_DAEMON, WITH_PIPE, WATCH_FOLDER,
#            RESULT_FOLDER, VALID_FOLDER, INVALID_FOLDER, MODE, STAGE and
#            FILES_MODE; reads STATFILE, LOCKFILE and MAX_STAT_LINES
# Exits:     0 after --help / --statistics, 1 on any invalid parameter
# Side effects: creates the directory of $STATFILE if it is missing and trims
#            the statistics file once it has more than MAX_STAT_LINES lines
get_cli_args() {
    local lines
    local cachedir
    while [[ $# -gt 0 ]]; do
        case ${1} in
            -h | --help)
                comment_help
                exit 0
                ;;
            -D | --debug)
                WITH_DEBUG=1
                shift
                ;;
            -s | --statistics)
                print_statistics
                exit 0
                ;;
            -d | --daemon)
                WITH_DAEMON=1
                shift
                ;;
            -p | --pipe)
                WITH_PIPE=1
                shift
                ;;
            -w | --watch-folder | -r | --result-folder | -v | --valid-folder | -i | --invalid-folder | -m | --mode | -t | --stage | -f | --files-mode)
                # All of these take a value. Without this check a missing
                # value would abort with a bare "unbound variable" message,
                # because nounset is active.
                if [[ $# -lt 2 ]]; then
                    error "param '${1}' needs a value. Please, give '$(basename "${0}") --help' a chance!"
                    exit 1
                fi
                case ${1} in
                    -w | --watch-folder) WATCH_FOLDER="${2}" ;;
                    -r | --result-folder) RESULT_FOLDER="${2}" ;;
                    -v | --valid-folder) VALID_FOLDER="${2}" ;;
                    -i | --invalid-folder) INVALID_FOLDER="${2}" ;;
                    -m | --mode) MODE="${2}" ;;
                    -t | --stage) STAGE="${2}" ;;
                    -f | --files-mode) FILES_MODE="${2}" ;;
                esac
                shift 2
                ;;
            *)
                error "'${1}' is invalid param. Please, give '$(basename "${0}") --help' a chance!"
                exit 1
                ;;
        esac
    done

    # Checks that apply to every way of running the tool.
    if [ "$FILES_MODE" != "sort" ] && [ "$FILES_MODE" != "delete" ] && [ "$FILES_MODE" != "nothing" ]; then
        error "param --files-mode must be 'sort', 'delete' or 'nothing'!"
        exit 1
    fi
    if [ "$MODE" != "auto" ] \
        && [ "$MODE" != "mediathek" ] \
        && [ "$MODE" != "fotothek" ] \
        && [ "$MODE" != "save" ] \
        && [ "$MODE" != "kitodo" ] \
        && [ "$MODE" != "lfulg" ]; then
        error "param --mode must be 'auto', 'mediathek', 'fotothek', 'save', 'kitodo' or 'lfulg'!"
        exit 1
    fi
    # The stage matters in pipe mode as well, so it is checked up here.
    if [ "$STAGE" != "current" ] && [ "$STAGE" != "upcoming" ] && [ "$STAGE" != "any" ]; then
        error "param --stage must be 'any', 'current' or 'upcoming'!"
        exit 1
    fi

    if [ "$WITH_PIPE" -eq 1 ]; then
        # Pipe mode works on STDIN only; folder related params make no sense.
        if [ "$WITH_DAEMON" -eq 1 ] \
            || [ -n "$WATCH_FOLDER" ] \
            || [ -n "$RESULT_FOLDER" ] \
            || [ -n "$VALID_FOLDER" ] \
            || [ -n "$INVALID_FOLDER" ] \
            || [ "$FILES_MODE" = "sort" ]; then
            error "param --pipe not combineable with params --daemon, --result-folder, --watch-folder, --valid-folder, --invalid-folder, --files-mode"
            exit 1
        fi
    else
        if [ "$WITH_DAEMON" -eq 1 ] && [ "$FILES_MODE" = "sort" ]; then
            error "param --daemon does only work with param --files-mode='delete' or --files-mode='nothing'!"
            exit 1
        fi
        if [ ! -d "$WATCH_FOLDER" ]; then
            error "watch folder '$WATCH_FOLDER' does not exist!"
            exit 1
        fi
        if [ ! -d "$RESULT_FOLDER" ]; then
            error "result folder '$RESULT_FOLDER' does not exist!"
            exit 1
        fi
        if [ "$FILES_MODE" = "sort" ]; then
            if [ ! -d "$VALID_FOLDER" ]; then
                error "valid folder '$VALID_FOLDER' does not exist!"
                exit 1
            fi
            if [ ! -d "$INVALID_FOLDER" ]; then
                error "invalid folder '$INVALID_FOLDER' does not exist!"
                exit 1
            fi
        fi
    fi

    # Make sure the statistics file has a home and does not grow without bound.
    cachedir=$(dirname "$STATFILE")
    if [ ! -d "$cachedir" ]; then
        mkdir -p "$cachedir" || error "Could not create dir $cachedir, $?"
    fi
    if [ -e "$STATFILE" ]; then
        lines=$(flock -x "$LOCKFILE" wc -l "$STATFILE" | cut -d " " -f 1)
        debug "found $lines lines in $STATFILE"
        if [ "$lines" -gt "$MAX_STAT_LINES" ]; then
            (
                # non-blocking: skip trimming if the lock is currently held
                flock -n 9 || exit 1
                trim_statistics
            ) 9> "$LOCKFILE"
        fi
    fi
}
# Looks up the validator command for a workflow / filetype / stage triple.
# Arguments: $1 - workflow (mode), $2 - file type, $3 - stage
# Globals:   validators (read)
# Outputs:   the command template on stdout, nothing if no entry exists
prepare_cmd() {
    local mode=$1
    local ftype=$2
    local stage=$3
    local key
    local cmd
    # Same layout as the table keys: right-aligned fields of width 11, 4
    # and 9, with the padding blanks turned into underscores.
    printf -v key '%11s%4s%9s' "$mode" "$ftype" "$stage"
    key=${key// /_}
    debug "prepare_cmd, key=$key"
    if [[ -z ${validators[$key]:-} ]]; then
        debug "no valid command found using key $key"
        return 0
    fi
    cmd=${validators[$key]}
    debug "prepare_cmd, cmd=$cmd"
    echo "$cmd"
}
# Translates a mime type into the short file type used in the validator keys.
# Arguments: $1 - mime type
# Outputs:   "tif" or "mkv" on stdout
# Exits:     1 (after an error message) for any other mime type; callers
#            using $(...) only see this as a failed command substitution
prepare_ftype() {
    local mimetype=$1
    local ftype
    debug "prepare_ftype, using mimetype: $mimetype"
    if [[ $mimetype == "image/tiff" ]]; then
        ftype="tif"
    elif [[ $mimetype == "video/x-matroska" ]]; then
        ftype="mkv"
    else
        error "unknown file format '$mimetype'"
        exit 1
    fi
    debug "prepare_ftype, detect ftype: $ftype"
    echo "$ftype"
}
# Guesses the workflow from a mime type and stores it in the global MODE.
# Arguments: $1 - mime type
# Globals:   MODE (written)
# Outputs:   the guessed workflow on stdout ("kitodo" for TIFF, "save" for
#            Matroska)
# Exits:     1 (after an error message) if the mime type is not known
estimate_mode() {
    local mimetype=$1
    local guess
    debug "estimate_mode, using mimetype: $mimetype"
    case "$mimetype" in
        image/tiff) guess="kitodo" ;;
        video/x-matroska) guess="save" ;;
        *)
            error "workflow not detectable"
            exit 1
            ;;
    esac
    MODE=$guess
    debug "estimate_mode, detected mode: $MODE"
    echo "$MODE"
}
# Runs one validator command, appends its output to the log and records the
# outcome in the statistics.
# Arguments: $1 - complete command line (gets word-split before execution)
#            $2 - file type, $3 - workflow, $4 - stage (for the statistics)
#            $5 - log file, validator output is appended to it
# Outputs:   the validator's exit status on stdout (0 = valid); 127 if no
#            command was given at all
exec_cmd() {
    local cmd=$1
    local ftype=$2
    local workflow=$3
    local stage=$4
    local log=$5
    local start_t
    local stop_t
    local duration
    local is_valid
    local logdir
    # An empty command would "succeed" without checking anything, which must
    # never be reported as a valid file.
    if [[ -z "${cmd//[[:space:]]/}" ]]; then
        error "exec_cmd, no validator command for workflow=$workflow ftype=$ftype stage=$stage"
        echo 127
        return 0
    fi
    # The log path may point into a folder that does not exist yet; without
    # it the redirection below fails before the validator even starts.
    logdir=$(dirname "$log")
    if [ ! -d "$logdir" ]; then
        mkdir -p "$logdir" || error "could not create log dir '$logdir'"
    fi
    start_t=$(date +"%s")
    debug "exec_cmd, calling cmd='$cmd'"
    # shellcheck disable=SC2086 # cmd holds a whole command line and must be split
    $cmd >> "$log" 2>&1
    is_valid=$?
    stop_t=$(date +"%s")
    duration=$((stop_t - start_t))
    debug "exec_cmd, duration=$duration is_valid=$is_valid log=$log"
    update_statistics "$is_valid" "$duration" "$ftype" "$workflow" "$stage"
    echo "$is_valid"
}
# Applies the configured files-mode to a file that has been checked.
# Arguments: $1 - path of the checked file
#            $2 - validation result (0 = valid, anything else = invalid)
# Globals:   FILES_MODE, VALID_FOLDER, INVALID_FOLDER, WATCH_FOLDER (read)
# Modes:     sort   - move the file to the valid or the invalid folder
#            delete - remove the file
#            others - leave the file where it is
handle_input_if_requested() {
    local filename=$1
    local is_valid=$2
    debug "handle_input_if_requested, filename=$filename is_valid=$is_valid"
    case "$FILES_MODE" in
        sort)
            if [ "$is_valid" -eq 0 ]; then
                debug "handle_input_if_requested, mv $filename to $VALID_FOLDER, because valid"
                mv "$filename" "$VALID_FOLDER"
            else
                debug "handle_input_if_requested, mv $filename to $INVALID_FOLDER, because invalid"
                mv "$filename" "$INVALID_FOLDER"
            fi
            ;;
        delete)
            debug "handle_input_if_requested, rm $filename from watchfolder $WATCH_FOLDER"
            rm -f "$filename"
            ;;
    esac
}
# Derives the log file name that belongs to an input file: the leading
# $WATCH_FOLDER of the path is replaced by $RESULT_FOLDER and ".log" is
# appended. Paths outside of $WATCH_FOLDER only get the ".log" suffix.
# Arguments: $1 - path of the input file
# Globals:   WATCH_FOLDER, RESULT_FOLDER (read)
# Outputs:   the log file name on stdout
get_logfile() {
    local filename=$1
    local logname
    # Plain string operations instead of a sed program: folder names are
    # data and may contain characters such as '#', '&' or '.' that would be
    # special inside a sed expression.
    if [[ $filename == "$WATCH_FOLDER"* ]]; then
        logname="${RESULT_FOLDER}${filename#"$WATCH_FOLDER"}.log"
    else
        logname="${filename}.log"
    fi
    debug "get_logfile, logname=$logname (filename=$filename)"
    echo "$logname"
}
# Validates one file and then applies the files-mode to it.
# With STAGE=any the stages "current" and "upcoming" are tried in this order
# and the first one that accepts the file wins. A file whose type or workflow
# cannot be determined, or for which no validator exists, counts as invalid.
# Arguments: $1 - path of the file to check
# Globals:   MODE, STAGE (read only; auto detection is redone for every file)
# NOTE: the validator command is executed as one word-split string, so file
#       names containing whitespace are still split into several arguments.
scan_file() {
    local filename="$1"
    local mimetype
    local ftype
    local logname
    local template
    local cmd
    local mode
    local stage
    local -a stages
    # invalid until some validator says otherwise
    local is_valid=1
    debug "scan_file, using filename: $filename"
    mimetype=$(get_mimetype "$filename")
    logname=$(get_logfile "$filename")
    # Work on a copy: the global MODE has to stay "auto" for the next file.
    mode=$MODE
    if ! ftype=$(prepare_ftype "$mimetype"); then
        ftype=""
    elif [ "$mode" = "auto" ] && ! mode=$(estimate_mode "$mimetype"); then
        # try best guess failed
        mode=""
    fi
    if [ "$STAGE" = "any" ]; then
        stages=(current upcoming)
    else
        stages=("$STAGE")
    fi
    if [ -n "$ftype" ] && [ -n "$mode" ]; then
        for stage in "${stages[@]}"; do
            template=$(prepare_cmd "$mode" "$ftype" "$stage")
            if [ -z "$template" ]; then
                error "no validator for workflow '$mode', filetype '$ftype', stage '$stage'"
                continue
            fi
            # Literal replacement of the first FILE placeholder; the quoted
            # replacement keeps '&', '#' and '\' in file names untouched.
            cmd=${template/FILE/"$filename"}
            is_valid=$(exec_cmd "$cmd" "$ftype" "$mode" "$stage" "$logname")
            if [ "$is_valid" -eq 0 ]; then
                break
            fi
        done
    fi
    handle_input_if_requested "$filename" "$is_valid"
}
# Runs scan_file on every regular file below a directory (recursively).
# Arguments: $1 - directory to walk
# File names travel NUL-separated, so blanks and newlines in names survive.
scan_dir() {
    local dir=$1
    local entry
    find "$dir" -type f -print0 \
        | while IFS= read -r -d '' entry; do
            scan_file "$entry"
        done
}
#### MAIN
# Three ways of running, chosen by the command line:
#   --pipe    validate the stream on STDIN, print the validator output
#   --daemon  scan the watch folder, then keep validating files that show up
#   default   scan the watch folder once
get_cli_args "$@"
if [ "$WITH_PIPE" -eq 1 ]; then
    #cli mode, use stdin
    debug "checking stream"
    filename=$(mktemp --tmpdir validate_wrg.XXXX) || {
        error "could not create temporary file"
        exit 1
    }
    # scan_file logs to the name get_logfile derives from the input file
    logname=$(get_logfile "$filename")
    # remove the temporary files on every way out, including interrupts
    trap 'rm -f -- "$filename" "$logname"' EXIT
    cat - > "$filename"
    scan_file "$filename"
    # hand the validator output to the caller, as promised in the help text
    if [ -s "$logname" ]; then
        cat "$logname"
    fi
    rm -f "$filename" "$logname" || error "could not remove temporary file '$filename'"
else
    if [ "$WITH_DAEMON" -eq 1 ]; then
        # TODO: protect DAEMON from STRG-C for clean shutdown
        # daemon mode, use inotify to watch changes
        debug "starting daemon"
        scan_dir "$WATCH_FOLDER" # to clean up existing files
        debug "calling inotifywait using /usr/bin/inotifywait --monitor --recursive --event create --event attrib --event moved_to --format '%w%f' $WATCH_FOLDER"
        # NOTE(review): "create" fires as soon as a file appears, possibly
        # before it is written completely - confirm this is intended.
        /usr/bin/inotifywait --monitor --recursive --event create \
            --event attrib --event moved_to --format "%w%f" "$WATCH_FOLDER" \
            | while IFS= read -r filename; do
                # events are also reported for directories and for files a
                # previous event already moved or deleted; skip those
                [ -f "$filename" ] || continue
                scan_file "$filename"
            done
        debug "stopping daemon"
    else
        # cli mode, scan watch folder once
        debug "checking dir $WATCH_FOLDER"
        scan_dir "$WATCH_FOLDER"
    fi
fi