Skip to content
Snippets Groups Projects
Select Git revision
  • ae77be0bffd3679ceefc65b9ef8c44594c9a52b8
  • master default protected
  • feature_bash_completion
3 results

validate_workflow.sh

Blame
  • validate_workflow.sh 18.88 KiB
    #!/usr/bin/env bash
    # Author: Andreas Romeyke
    # SLUB Dresden, Department Longterm Preservation
    # copyright 2023, licensed under terms of GNU General Public License 3.0,
    # see file LICENSE.txt for details.
    ### META
    # AUTHORS:
    #  - Andreas Romeyke (<Andreas.Romeyke@slub-dresden.de>)
    
    # HINT for developers
    #  - indent code with 4 spaces
    #  - use UTF8 without BOM
    #  - use 'newline' as line ending
    # in IntelliJ idea the settings are:
    #  Indent case statements
    #  Use Unix line separators
    #  Tab size: 4
    #  Indent: 4
    #  Shfmt formatter: ~/.local/share/JetBrains/IdeaIC2022.1/Shell Script/shfmt
    # in vim use:
    #   set tabstop=4
    #   set shiftwidth=4
    
    #hh A cli tool which uses different validators to validate SLUB workflows.
    #hh
    #hh Usage: validate_workflow.sh [-h] | [-s] | -w <DIR> -r <DIR> [-d] [...]
    #hh
    #hh Options:
    #hh
    #hh      -h, --help
    #hh                 help output
    #hh      -w, --watch-folder <DIR>
    #hh                 watches folder for files which should be evaluated
    #hh      -r, --result-folder <DIR>
    #hh                 target folder to store validation results
    #hh      -f, --files-mode [sort|delete|nothing]
    #hh                 mode=sort sorts files to valid- and invalid-folder,
    #hh                 mode=delete deletes already checked files from watch-folder
    #hh                 mode=nothing leaves files in watch-folder untouched
    #hh                 The mode=nothing is default.
    #hh      -v, --valid-folder <DIR> 
    #hh                 only needed if files-mode=sort, moves valid files from
    #hh                 watch-folder to valid-folder
    #hh      -i, --invalid-folder <DIR>
    #hh                 only needed if files-mode=sort, moves invalid files from
    #hh                 watch-folder to invalid folder
    #hh      -s, --statistics
    #hh                 print a statistic
    #hh      -m, --mode [auto|mediathek|fotothek|save|ddz|digas]
    #hh                the mode 'auto' tries to check files based on file mime-types.
    #hh                The other modes are actual workflow names.
    #hh      -d, --daemon
    #hh                starts a daemon, does not work with --files-mode=sort
    #hh      -t, --stage [current|upcoming|any]
    #hh                validates with current or upcoming profile/validator
    #hh                or any if any is valid
    #hh      -p, --pipe
    #hh                validates a single filestream from STDIN, writes result to STDOUT
    #hh                no daemon, no folder nor filemode params needed
    #hh
    #hh
    
    # expected programs:
    # file, ...
    
    # DEFAULTS
    # --- switches (0 = off, 1 = on), toggled by get_cli_args ---
    WITH_DAEMON="0"
    WITH_DEBUG="0"
    WITH_PIPE="0"

    # --- selection of validator: stage, workflow and handling of input files ---
    STAGE="any"
    MODE="auto"
    FILES_MODE="nothing"

    # --- statistics file and the lock file guarding access to it ---
    STATFILE="${HOME}/.cache/validate_workflows/statistics.cnt"
    LOCKFILE="/var/lock/validate_workflows.lock"

    # --- folders, empty until given on the command line ---
    WATCH_FOLDER=""
    RESULT_FOLDER=""
    VALID_FOLDER=""
    INVALID_FOLDER=""

    # --- the statistics file is cut down to MIN_STAT_LINES once it has more
    #     than MAX_STAT_LINES lines ---
    MAX_STAT_LINES="100000"
    MIN_STAT_LINES="10000"
    
    # PREDEFINED VALIDATORS
    # Lookup table of validator command lines.
    # The key is built as <workflow><filetype><stage>, each part right-aligned
    # and padded with '_':
    #   workflow has max 11 chars
    #   filetype has max 4 chars
    #   stage has max 9 chars
    # The word FILE inside a value is the placeholder for the file to check.
    # Each validator should return true if file was valid.
    declare -A validators=(
        # Matroska audio/video, checked by mediaconch
        [__mediathek_mka__current]="/usr/bin/mediaconch -ft -p /usr/local/etc/SLUB_mediaconch_policy_all.xml FILE"
        [__mediathek_mka_upcoming]="/usr/bin/mediaconch -ft -p /usr/local/etc/SLUB_mediaconch_policy_all.xml FILE"
        [_______save_mkv__current]="/usr/bin/mediaconch -ft -p /usr/local/etc/SLUB_mediaconch_policy_all.xml FILE"
        [_______save_mkv_upcoming]="/usr/bin/mediaconch -ft -p /usr/local/etc/SLUB_mediaconch_policy_all.xml FILE"
        [_______save_mka__current]="/usr/bin/mediaconch -ft -p /usr/local/etc/SLUB_mediaconch_policy_all.xml FILE"
        [_______save_mka_upcoming]="/usr/bin/mediaconch -ft -p /usr/local/etc/SLUB_mediaconch_policy_all.xml FILE"
        # TIFF, checked by checkit_tiff with a workflow specific profile
        [________ddz_tif__current]="/usr/bin/checkit_tiff_current /usr/local/etc/cit_tiff6_baseline_SLUB_current.cfg FILE"
        [________ddz_tif_upcoming]="/usr/bin/checkit_tiff_upcoming /usr/local/etc/cit_tiff6_baseline_SLUB_upcoming.cfg FILE"
        [______digas_tif__current]="/usr/bin/checkit_tiff_current /usr/local/etc/cit_tiff6_geotiff_SLUB_current.cfg FILE"
        [______digas_tif_upcoming]="/usr/bin/checkit_tiff_upcoming /usr/local/etc/cit_tiff6_geotiff_SLUB_upcoming.cfg FILE"
        [___fotothek_tif__current]="/usr/bin/checkit_tiff_current /usr/local/etc/cit_tiff_retrofotos_SLUB_current.cfg FILE"
        [___fotothek_tif_upcoming]="/usr/bin/checkit_tiff_upcoming /usr/local/etc/cit_tiff_retrofotos_SLUB_upcoming.cfg FILE"
        # ICC profiles, checked by iccDumpProfile
        [________ddz_icc__current]="/usr/local/bin/iccDumpProfile -v FILE"
        [________ddz_icc_upcoming]="/usr/local/bin/iccDumpProfile -v FILE"
    )
    
    
    # -u: treat unset variables as an error
    # -e: leave the script as soon as a command fails
    set -u -e
    
    # Aborts the script if a function was called with an unexpected number of
    # parameters.
    # Arguments: $1 - number of params the checked function received (its $#)
    #            $2 - number of params the checked function expects
    # Outputs:   an error message via error() on mismatch
    # Exits:     1 on mismatch
    check_argument_count() {
        local count="$1"
        local expected="$2"
        if [[ "${count}" -ne "${expected}" ]]; then
            # FUNCNAME[1] is the checked function, FUNCNAME[2] the function
            # which called it. BASH_LINENO[1] is the line inside FUNCNAME[2]
            # where the checked function was invoked.
            error "called function ${FUNCNAME[1]:-main} expected ${expected} params, but got ${count} by caller function ${FUNCNAME[2]:-main} (line ${BASH_LINENO[1]:-0})"
            exit 1
        fi
    }
    
    # Aborts the script if the given value is empty.
    # Arguments: $1 - value to check
    # Outputs:   an error message via error() if the value is empty
    # Exits:     1 if the value is empty
    check_argument_notempty() {
        local param="${1-}"
        if [[ -z "${param}" ]]; then
            # FUNCNAME[1] is the function doing the check (at line
            # BASH_LINENO[0]); FUNCNAME[2] is its caller, which invoked it at
            # line BASH_LINENO[1] and most likely handed over the empty value.
            error "called function ${FUNCNAME[1]:-main} expected non-empty params, but some are empty set in line ${BASH_LINENO[0]:-0}, eventually taken from caller function ${FUNCNAME[2]:-main} (line ${BASH_LINENO[1]:-0})"
            exit 1
        fi
    }
    
    # Don't just call this function "help()", as "help" is already a Bash builtin.
    # Prints the usage text: every line of this script starting with '#hh',
    # with that marker (and one following blank) stripped.
    comment_help() {
        sed -rn 's/^#hh ?//p' "$0"
    }
    
    # Evaluates the statistics file.
    # Each record is "date,is_valid,duration,ftype,workflow,stage", where
    # is_valid is the exit code of the validator (0 means valid).
    # Globals:   STATFILE (read), LOCKFILE (used for locking)
    # Outputs:   "<total> <valid>" on STDOUT, "0 0" if there are no records
    calc_statistics() {
        if [[ ! -r "${STATFILE}" ]]; then
            # nothing validated yet
            echo "0 0"
            return 0
        fi
        # count the rows whose second column (is_valid) is 0; adding 0 in END
        # turns uninitialised counters into "0" for an empty file
        flock -x "${LOCKFILE}" cat "${STATFILE}" \
            | awk -F "," '{ total++; if ($2 == 0) cnt_valid++ } END { print total + 0, cnt_valid + 0 }'
    }
    
    # Prints a human readable summary of the validation statistics.
    # Reads "<total> <valid>" from calc_statistics.
    # Outputs: the summary on STDOUT
    print_statistics() {
        local stat
        local cnt_total
        local cnt_valid
        local cnt_invalid
        local ratio
        stat=$(calc_statistics)
        # '|| true': read reports failure on empty input, the defaults below
        # take care of that case
        read -r cnt_total cnt_valid <<< "${stat}" || true
        cnt_total=${cnt_total:-0}
        cnt_valid=${cnt_valid:-0}
        cnt_invalid=$((cnt_total - cnt_valid))
        echo "Validation Statistics"
        echo "valid files:   ${cnt_valid}"
        echo "invalid files: ${cnt_invalid}"
        if [[ "${cnt_total}" -gt 0 ]]; then
            ratio=$((100 * cnt_valid / cnt_total))
            echo "ratio:         ${ratio}% valid"
        else
            # no records yet, a ratio would be a division by zero
            echo "ratio:         n/a (no validated files)"
        fi
    }
    
    # Appends one record "date,is_valid,duration,ftype,workflow,stage" to the
    # statistics file.
    # Globals:   STATFILE (appended to), LOCKFILE (used for locking)
    # Arguments: $1 - exit code of the validator
    #            $2 - duration of the validation
    #            $3 - file type
    #            $4 - workflow
    #            $5 - stage
    update_statistics() {
        check_argument_count $# 5
        local is_valid="$1"
        local duration="$2"
        local ftype="$3"
        local workflow="$4"
        local stage="$5"
        check_argument_notempty "${is_valid}"
        check_argument_notempty "${duration}"
        check_argument_notempty "${ftype}"
        check_argument_notempty "${workflow}"
        check_argument_notempty "${stage}"
        local date
        # ISO 8601 timestamp; 'T' separates the date from the time
        date=$(date +"%FT%T")
        debug "date=${date}"
        # The statistics file is opened by the locked command itself. A
        # redirection on the flock call would open it before the lock is
        # held, so a concurrent trim_statistics could replace the file and
        # the record would end up in the dropped one.
        flock -x "${LOCKFILE}" sh -c 'printf "%s\n" "$1" >> "$2"' sh \
            "${date},${is_valid},${duration},${ftype},${workflow},${stage}" "${STATFILE}"
    }
    
    # Shrinks the statistics file to its last MIN_STAT_LINES lines.
    # Globals: STATFILE (rewritten), MIN_STAT_LINES (read)
    # Exits:   1 (via error) if the file could not be trimmed
    trim_statistics() {
        debug "trim_statistics"
        # No subshell around the error handling: an 'exit' inside '( ... )'
        # only leaves the subshell, so the mv below could replace the
        # statistics by an incomplete copy.
        if ! tail -n "${MIN_STAT_LINES}" "${STATFILE}" > "${STATFILE}.new"; then
            rm -f "${STATFILE}.new"
            error "could not trim ${STATFILE} to ${STATFILE}.new"
            exit 1
        fi
        if ! mv "${STATFILE}.new" "${STATFILE}"; then
            error "could not trim ${STATFILE}, because could not mv ${STATFILE}.new to ${STATFILE}"
            exit 1
        fi
    }
    
    # Writes a message prefixed with "DEBUG: " to STDERR, but only if
    # debugging was switched on (WITH_DEBUG is 1).
    # Arguments: $1 - message
    debug() {
        [[ "${WITH_DEBUG}" -eq 1 ]] || return 0
        printf 'DEBUG: %s\n' "$1" >&2
    }
    
    # Writes a message prefixed with "WARN: " to STDERR.
    # Arguments: $1 - message
    warn() {
        printf 'WARN: %s\n' "$1" >&2
    }
    
    # Writes a message prefixed with "ERROR: " to STDERR and terminates the
    # (sub)shell with exit code 1.
    # Arguments: $1 - message
    error() {
        printf 'ERROR: %s\n' "$1" >&2
        exit 1
    }
    
    # Detects the mime-type of a file using file(1).
    # Arguments: $1 - path of the file
    # Outputs:   the mime-type on STDOUT
    get_mimetype() {
        check_argument_count $# 1
        local path="$1"
        check_argument_notempty "${path}"
        # file prints "<name>: <mime-type>"; drop everything up to the last ": "
        file --mime-type "${path}" | sed -e 's/^.*: //'
    }
    
    
    # Parses the command line, checks the combination of the given params and
    # prepares the statistics cache.
    # Globals:   WITH_DEBUG, WITH_DAEMON, WITH_PIPE, WATCH_FOLDER,
    #            RESULT_FOLDER, VALID_FOLDER, INVALID_FOLDER, MODE, STAGE,
    #            FILES_MODE (written); STATFILE, LOCKFILE, MAX_STAT_LINES (read)
    # Arguments: the params of the script ("$@")
    # Exits:     0 after --help or --statistics, 1 (via error) on invalid params
    get_cli_args() {
        local lines
        local cachedir
        while [[ $# -gt 0 ]]; do
            # Params taking a value must be followed by one. Without this
            # check the access to "$2" aborts with a bare "unbound variable"
            # message (nounset) or silently uses an empty value.
            case $1 in
                -w | --watch-folder | -r | --result-folder | -v | --valid-folder \
                    | -i | --invalid-folder | -m | --mode | -t | --stage \
                    | -f | --files-mode)
                    if [[ $# -lt 2 ]]; then
                        error "param '$1' needs a value. Please, give '$(basename "$0") --help' a chance!"
                        exit 1
                    fi
                    ;;
            esac
            case $1 in
                -h | --help)
                    comment_help
                    exit 0
                    ;;
                -D | --debug)
                    WITH_DEBUG=1
                    shift
                    ;;
                -s | --statistics)
                    print_statistics
                    exit 0
                    ;;
                -w | --watch-folder)
                    WATCH_FOLDER="$2"
                    shift 2
                    ;;
                -r | --result-folder)
                    RESULT_FOLDER="$2"
                    shift 2
                    ;;
                -v | --valid-folder)
                    VALID_FOLDER="$2"
                    shift 2
                    ;;
                -i | --invalid-folder)
                    INVALID_FOLDER="$2"
                    shift 2
                    ;;
                -m | --mode)
                    MODE="$2"
                    shift 2
                    ;;
                -t | --stage)
                    STAGE="$2"
                    shift 2
                    ;;
                -d | --daemon)
                    WITH_DAEMON=1
                    shift
                    ;;
                -f | --files-mode)
                    FILES_MODE="$2"
                    shift 2
                    ;;
                -p | --pipe)
                    WITH_PIPE=1
                    shift
                    ;;
                *)
                    error "'$1' is invalid param. Please, give '$(basename "$0") --help' a chance!"
                    exit 1
                    ;;
            esac
        done
        # --- checks valid for every kind of call ---
        if [[ "${FILES_MODE}" != "sort" ]] && [[ "${FILES_MODE}" != "delete" ]] && [[ "${FILES_MODE}" != "nothing" ]]; then
            error "param --files-mode must be 'sort', 'delete' or 'nothing'!"
            exit 1
        fi
        if [[ "${MODE}" != "auto" ]] \
            && [[ "${MODE}" != "mediathek" ]] \
            && [[ "${MODE}" != "fotothek" ]] \
            && [[ "${MODE}" != "save" ]] \
            && [[ "${MODE}" != "ddz" ]] \
            && [[ "${MODE}" != "digas" ]]; then
            error "param --mode must be 'auto', 'mediathek', 'fotothek', 'save', 'ddz' or 'digas'!"
            exit 1
        fi
        # The stage is used in pipe mode, too. An unknown stage finds no
        # validator, therefore it must be rejected for both kinds of call.
        if [[ "${STAGE}" != "current" ]] && [[ "${STAGE}" != "upcoming" ]] && [[ "${STAGE}" != "any" ]]; then
            error "param --stage must be 'any', 'current' or 'upcoming'!"
            exit 1
        fi
        if [[ "${WITH_PIPE}" -eq 1 ]]; then
            # --- pipe mode: no folders, no daemon, no sorting ---
            if
                [[ "${WITH_DAEMON}" -eq 1 ]] \
                || [[ -n "${WATCH_FOLDER}" ]] \
                || [[ -n "${RESULT_FOLDER}" ]] \
                || [[ -n "${VALID_FOLDER}" ]] \
                || [[ -n "${INVALID_FOLDER}" ]] \
                || [[ "${FILES_MODE}" = "sort" ]] \
                ; then
                error "param --pipe not combinable with params --daemon, --result-folder, --watch-folder, --valid-folder, --invalid-folder, --files-mode"
                exit 1
            fi
        else
            # --- folder mode: the needed folders must exist ---
            if [[ "${WITH_DAEMON}" -eq 1 ]] && [[ "${FILES_MODE}" = "sort" ]]; then
                error "param --daemon does only work with param --files-mode='delete' or --files-mode='nothing'!"
                exit 1
            fi
            if [[ ! -d "${WATCH_FOLDER}" ]]; then
                error "watch folder '${WATCH_FOLDER}' does not exist!"
                exit 1
            fi
            if [[ ! -d "${RESULT_FOLDER}" ]]; then
                error "result folder '${RESULT_FOLDER}' does not exist!"
                exit 1
            fi
            if [[ "${FILES_MODE}" = "sort" ]]; then
                if [[ ! -d "${VALID_FOLDER}" ]]; then
                    error "valid folder '${VALID_FOLDER}' does not exist!"
                    exit 1
                fi
                if [[ ! -d "${INVALID_FOLDER}" ]]; then
                    error "invalid folder '${INVALID_FOLDER}' does not exist!"
                    exit 1
                fi
            fi
        fi
        # --- statistics cache: create its dir, trim the file if it grew too large ---
        cachedir=$(dirname "${STATFILE}")
        if [[ ! -d "${cachedir}" ]]; then
            mkdir -p "${cachedir}" || error "Could not create dir ${cachedir}, $?"
        fi
        if [[ -e "${STATFILE}" ]]; then
            lines=$(flock -x "${LOCKFILE}" wc -l "${STATFILE}" | cut -d " " -f 1)
            debug "found ${lines} lines in ${STATFILE})"
            if [[ "${lines}" -gt "${MAX_STAT_LINES}" ]]; then
                (
                flock -n 9 || exit 1
                trim_statistics
                ) 9>"${LOCKFILE}"
            fi
        fi
    }
    
    # Looks up the validator command line for a workflow, file type and stage.
    # Globals:   validators (read)
    # Arguments: $1 - workflow
    #            $2 - file type
    #            $3 - stage
    # Outputs:   the command line (containing the placeholder FILE) on STDOUT,
    #            or an echo command reporting that no validator is known
    prepare_cmd() {
        check_argument_count $# 3
        local mode="$1"
        local ftype="$2"
        local stage="$3"
        check_argument_notempty "${mode}"
        check_argument_notempty "${ftype}"
        check_argument_notempty "${stage}"
        local lookup
        local validator
        # right-align the three parts and fill the gaps with '_'
        printf -v lookup '%11s%4s%9s' "${mode}" "${ftype}" "${stage}"
        lookup=${lookup// /_}
        check_argument_notempty "${lookup}"
        debug "prepare_cmd, key=${lookup}"
        if [[ -z ${validators[${lookup}]:+1} ]]; then
            debug "no valid command found using key ${lookup}"
            echo "echo 'no validation tool detected!'"
            return
        fi
        validator=${validators[${lookup}]}
        check_argument_notempty "${validator}"
        debug "prepare_cmd, cmd=${validator}"
        echo "${validator}"
    }
    
    # Maps a mime-type to the short file type used in the validator keys.
    # Arguments: $1 - mime-type
    # Outputs:   "tif", "mkv" or "icc" on STDOUT, "???" (plus a warning on
    #            STDERR) for any other mime-type
    prepare_ftype() {
        check_argument_count $# 1
        local mimetype="$1"
        check_argument_notempty "${mimetype}"
        local ftype
        local -A ftype_of=(
            ["image/tiff"]="tif"
            ["video/x-matroska"]="mkv"
            ["application/vnd.iccprofile"]="icc"
        )
        debug "prepare_ftype, using mimetype: ${mimetype}"
        if [[ -n "${ftype_of[${mimetype}]:+1}" ]]; then
            ftype="${ftype_of[${mimetype}]}"
        else
            warn "unknown file format with mime-type '${mimetype}'"
            ftype="???"
        fi
        check_argument_notempty "${ftype}"
        debug "prepare_ftype, detect ftype: ${ftype}"
        echo "${ftype}"
    }
    
    # Guesses the workflow from a mime-type.
    # Globals:   MODE (written)
    # Arguments: $1 - mime-type
    # Outputs:   "ddz" or "save" on STDOUT, "???" (plus a warning on STDERR)
    #            for any other mime-type
    estimate_mode() {
        check_argument_count $# 1
        local mimetype="$1"
        local guess
        check_argument_notempty "${mimetype}"
        debug "estimate_mode, using mimetype: ${mimetype}"
        case ${mimetype} in
            "image/tiff" | "application/vnd.iccprofile")
                guess="ddz"
                ;;
            "video/x-matroska")
                guess="save"
                ;;
            *)
                warn "workflow not detectable for mimetype ${mimetype}"
                guess="???"
                ;;
        esac
        MODE="${guess}"
        check_argument_notempty "${MODE}"
        debug "estimate_mode, detected mode: ${MODE}"
        echo "${MODE}"
    }
    
    # Runs a validator command, appends its output to the log file and
    # records the result in the statistics.
    # Arguments: $1 - command line to run
    #            $2 - file type
    #            $3 - workflow
    #            $4 - stage
    #            $5 - log file, receives STDOUT and STDERR of the command
    # Outputs:   the exit code of the command on STDOUT (0 means valid)
    exec_cmd() {
        check_argument_count $# 5
        local cmd="$1"
        local ftype="$2"
        local workflow="$3"
        local stage="$4"
        local log="$5"
        local start_t
        local stop_t
        local is_valid=0
        local duration
        check_argument_notempty "${cmd}"
        check_argument_notempty "${ftype}"
        check_argument_notempty "${workflow}"
        check_argument_notempty "${stage}"
        check_argument_notempty "${log}"
        start_t=$(date +"%s")
        debug "exec_cmd, calling cmd='${cmd}'"
        # The exit code must be taken right here. Reading $? after the chmod
        # below reports the result of chmod, so every file looks valid.
        # The redirection is applied to eval itself, therefore the name of the
        # log file is never parsed as shell code.
        eval "${cmd}" >>"${log}" 2>&1 || is_valid=$?
        # NOTE(review): makes the log writable for everyone - confirm that
        # this is really wanted.
        chmod o+w "${log}"
        check_argument_notempty "${is_valid}"
        stop_t=$(date +"%s")
        duration=$((stop_t - start_t))
        debug "exec_cmd, duration=${duration} is_valid=${is_valid} log=${log}"
        update_statistics "${is_valid}" "${duration}" "${ftype}" "${workflow}" "${stage}"
        echo "${is_valid}"
    }
    
    # Treats a checked file as requested by FILES_MODE:
    #   sort   - move it to VALID_FOLDER or INVALID_FOLDER
    #   delete - remove it
    #   any other value leaves the file untouched
    # Arguments: $1 - path of the checked file
    #            $2 - exit code of the validator (0 means valid)
    handle_input_if_requested() {
        check_argument_count $# 2
        local filename="$1"
        local is_valid="$2"
        local target
        local verdict
        check_argument_notempty "${filename}"
        check_argument_notempty "${is_valid}"
        debug "handle_input_if_requested, filename=${filename} is_valid=${is_valid}"
        case "${FILES_MODE}" in
            sort)
                if [[ "${is_valid}" -eq 0 ]]; then
                    target="${VALID_FOLDER}"
                    verdict="valid"
                else
                    target="${INVALID_FOLDER}"
                    verdict="invalid"
                fi
                debug "handle_input_if_requested, mv ${filename} to ${target}, because ${verdict}"
                mv "${filename}" "${target}"
                ;;
            delete)
                debug "handle_input_if_requested, rm ${filename} from watch-folder ${WATCH_FOLDER}"
                rm -f "${filename}"
                ;;
        esac
    }
    
    # Determines the log file belonging to a file and creates its directory.
    # For files below WATCH_FOLDER (and not in pipe mode) the log is placed at
    # the same relative path below RESULT_FOLDER, otherwise next to the file.
    # In both cases ".log" is appended to the file name.
    # Globals:   WATCH_FOLDER, RESULT_FOLDER, WITH_PIPE (read)
    # Arguments: $1 - path of the file to check
    # Outputs:   the path of the log file on STDOUT
    get_logfile() {
        check_argument_count $# 1
        local filename="$1"
        local logname
        local logdir
        local relative
        check_argument_notempty "${filename}"
        if [[ -n "${WATCH_FOLDER}" ]] && [[ "${WITH_PIPE}" -eq 0 ]] \
            && [[ "${filename}" == "${WATCH_FOLDER}"* ]]; then
            # The quoted pattern strips the prefix literally. Using the folder
            # name as a sed regex breaks on names containing '#', '.', '[' ...
            relative=${filename#"${WATCH_FOLDER}"}
            logname="${RESULT_FOLDER}${relative}.log"
        else # pipe uses a temp filename
            logname="${filename}.log"
        fi
        check_argument_notempty "${logname}"
        logdir=$(dirname "${logname}")
        check_argument_notempty "${logdir}"
        if [[ ! -d "${logdir}" ]]; then
            debug "get_logfile, mkdir ${logdir}"
            mkdir -p "${logdir}"
        fi
        debug "get_logfile, logname=${logname} (filename=${filename})"
        echo "${logname}"
    }
    
    
    # Validates a single file: detects its type, runs the validator(s) of the
    # requested stage and afterwards handles the file as requested by
    # FILES_MODE. With STAGE=any the file is accepted as soon as one of the
    # stages "upcoming", "current" accepts it.
    # SIGINT is ignored while the validation is running.
    # Globals:   MODE, STAGE (read)
    # Arguments: $1 - path of the file to check
    scan_file() {
        check_argument_count $# 1
        local filename="$1"
        local mimetype
        local ftype
        local logname
        local cmd
        local is_valid
        local mode
        local stage
        local template
        local quoted
        local -a stages
        check_argument_notempty "${filename}"
        debug "scan_file, using filename: ${filename}"
        mimetype=$(get_mimetype "${filename}")
        ftype=$(prepare_ftype "${mimetype}")
        logname=$(get_logfile "${filename}")
        # The guessed workflow is kept in a local variable. Writing it back to
        # MODE would switch off the detection for every following file.
        mode="${MODE}"
        if [[ "${mode}" = "auto" ]]; then
            # try best guess
            mode=$(estimate_mode "${mimetype}")
        fi
        check_argument_notempty "${mimetype}"
        check_argument_notempty "${ftype}"
        check_argument_notempty "${logname}"
        if [[ "${STAGE}" = "any" ]]; then
            stages=(upcoming current)
        else
            stages=("${STAGE}")
        fi
        # The command line is evaluated by the shell later on, therefore the
        # file name is inserted shell-quoted. This keeps names with blanks,
        # '#', '&', '$' ... intact and hinders them from being run as code.
        printf -v quoted '%q' "${filename}"
        trap "" SIGINT
        debug "scan_file, === entering protected area ==="
        is_valid=1
        for stage in "${stages[@]}"; do
            template=$(prepare_cmd "${mode}" "${ftype}" "${stage}")
            if [[ "${template}" == *FILE* ]]; then
                # replace the first placeholder FILE
                cmd="${template%%FILE*}${quoted}${template#*FILE}"
            else
                cmd="${template}"
            fi
            is_valid=$(exec_cmd "${cmd}" "${ftype}" "${mode}" "${stage}" "${logname}")
            if [[ "${is_valid}" -eq 0 ]]; then
                debug "scan_file, early break"
                break
            else
                debug "scan_file, no early break, because is_valid='${is_valid}'"
            fi
        done
        handle_input_if_requested "${filename}" "${is_valid}"
        debug "scan_file, === leaving protected area ==="
        trap - SIGINT
        debug "---"
    }
    
    # Validates every regular file below a directory whose status and content
    # were last changed more than one minute ago.
    # Arguments: $1 - directory to scan
    scan_dir() {
        check_argument_count $# 1
        check_argument_notempty "$1"
        local dir="$1"
        local candidate
        # NUL separated names keep blanks and newlines in file names intact
        find "${dir}" -type f -cmin +1 -mmin +1 -print0 \
            | while IFS= read -r -d '' candidate; do
                scan_file "${candidate}"
            done
    }
    
    #### MAIN
    
    # Entry point: parse the params, then either validate the stream given on
    # STDIN (--pipe), scan the watch folder every 10 seconds (--daemon), or
    # scan the watch folder once.
    get_cli_args "$@"
    #trap signalhandler SIGINT SIGABRT #sigint
    if [[ "${WITH_PIPE}" -eq 1 ]]; then
        #cli mode, use stdin
        debug "checking stream"
        filename=$(mktemp --tmpdir validate_wrg.XXXX)
        # Safety net: remove the temporary files even if one of the following
        # steps aborts the script.
        trap 'rm -f -- "${filename}" "${filename}.log"' EXIT
        cat - > "${filename}"
        scan_file "${filename}"
        cat "${filename}.log"
        rm -f "${filename}.log" || error "could not remove temporary file '${filename}.log'"
        rm -f "${filename}" || error "could not remove temporary file '${filename}'"
        trap - EXIT
    else
        if [[ "${WITH_DAEMON}" -eq 1 ]]; then
            # TODO: protect DAEMON from STRG-C for clean shutdown
            # echo daemon mode, use inotify to watch changes
            debug "starting daemon"
            while true; do
                scan_dir "${WATCH_FOLDER}" # to clean up existing files
                sleep 10
            done
    #        /usr/bin/inotifywait --monitor --recursive --event create \
    #            --event attrib --event moved_to --format "%w%f" "${WATCH_FOLDER}" \
    #            | while read -r filename; do
    #                    debug "called inotifywait using /usr/bin/inotifywait --monitor
    #                    --recursive --event create --event attrib --event moved_to --format '%f' ${WATCH_FOLDER}"
    #                    scan_file "${filename}"
    #                done
            # not reached as long as the loop above has no exit condition
            debug "stopping daemon"
        else
            # cli mode, scan watch folder once
            debug "checking dir ${WATCH_FOLDER}"
            scan_dir "${WATCH_FOLDER}"
        fi
    fi