From 74e2248fb05ddaf9299d1816d32934bee3ec6ddf Mon Sep 17 00:00:00 2001
From: Andreas Romeyke <art1@andreas-romeyke.de>
Date: Wed, 12 Oct 2022 16:51:40 +0200
Subject: [PATCH] - added code to calc and print statistics - fixed
 get_mimetype() - fixed cli param order handling - added autotrim - added
 exist-check in prepare_cmd() - added exec_cmd()

---
 validate_workflow.sh | 96 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 80 insertions(+), 16 deletions(-)

diff --git a/validate_workflow.sh b/validate_workflow.sh
index 42ca793..6a94bc4 100755
--- a/validate_workflow.sh
+++ b/validate_workflow.sh
@@ -73,6 +73,8 @@ WATCH_FOLDER=""
 RESULT_FOLDER=""
 VALID_FOLDER=""
 INVALID_FOLDER=""
+MAX_STAT_LINES=100000 # statistics file is trimmed once it holds more lines than this
+MIN_STAT_LINES=10000 # number of most recent lines kept when the statistics file is trimmed
 
 # PREDEFINED VALIDATORS
 declare -A validators
@@ -102,8 +104,38 @@ comment_help() {
     sed -rn 's/^#hh ?//;T;p' "$0" 
 }
 
+calc_statistics() { # prints "<total> <invalid>" for the records in $STATFILE (date,is_valid,duration,ftype,workflow,stage)
+    flock -x "$LOCKFILE" cat "$STATFILE" | awk -F "," '{cnt_invalid+=($2!=0);total++} END {print total+0, cnt_invalid+0}' # $2 is the recorded exit status (0 = valid); +0 prints 0 instead of blanks for an empty file
+}
+
 print_statistics() {
-    echo "Not implemented yet"
+    local cnt_total cnt_valid cnt_invalid ratio=0 # calc_statistics prints "<total> <invalid>"
+    read -r cnt_total cnt_invalid <<< "$(calc_statistics)"
+    cnt_total=${cnt_total:-0}; cnt_invalid=${cnt_invalid:-0} # blanks (no records yet) count as 0
+    cnt_valid=$((cnt_total - cnt_invalid))
+    if (( cnt_total > 0 )); then ratio=$(( 100 * cnt_valid / cnt_total )); fi # guard against division by zero
+    echo "Validation Statistics"
+    echo "valid files:   $cnt_valid"
+    echo "invalid files: $cnt_invalid"
+    echo "ratio:         $ratio% valid"
+
+}
+
+update_statistics() { # appends one CSV record "date,is_valid,duration,ftype,workflow,stage" to $STATFILE
+    local is_valid=$1 # exec_cmd passes the validator's exit status here (0 = valid)
+    local duration=$2 # exec_cmd passes the elapsed whole seconds
+    local ftype=$3
+    local workflow=$4
+    local stage=$5
+    local date; date=$(date +"%F%T")
+    debug "date=$date"
+    flock -x "$LOCKFILE" echo "$date,$is_valid,$duration,$ftype,$workflow,$stage" >> "$STATFILE" # lock path quoted so it survives spaces
+}
+
+trim_statistics() { # keeps only the last $MIN_STAT_LINES lines of $STATFILE; returns 1 on failure
+    debug "trim_statistics"
+    tail -n "$MIN_STAT_LINES" "$STATFILE" > "$STATFILE.new" || { error "could not trim $STATFILE to $STATFILE.new"; return 1; } # braces, not a subshell: must stop before mv replaces the file
+    mv "$STATFILE.new" "$STATFILE" || { error "could not trim $STATFILE,because could not mv $STATFILE.new to $STATFILE"; return 1; }
 }
 
 debug() {
@@ -118,10 +150,11 @@ error() {
 
 get_mimetype() {
     filename=$1
-    res=$(file --mime-type "$filename")
+    res=$(file --brief --mime-type -- "$filename") # --brief omits the "name: " prefix; -- protects names starting with '-'
     echo "$res"
 }
 
+
 get_cli_args() { 
     while [[ $# -gt 0 ]]; do
         case ${1} in
@@ -129,6 +162,10 @@ get_cli_args() {
                 comment_help
                 exit 0
                 ;;
+            -D | --debug)
+                WITH_DEBUG=1
+                shift
+                ;;
             -s | --statistics)
                 print_statistics
                 exit 0
@@ -176,10 +213,6 @@ get_cli_args() {
                 WITH_PIPE=1
                 shift
                 ;;
-            -D | --debug)
-                WITH_DEBUG=1
-                shift
-                ;;
             *)
                 error "'${1}' is invalid param. Please, give '$(basename "${0}") --help' a chance!"
                 exit 1
@@ -239,6 +272,20 @@ get_cli_args() {
             fi
         fi
     fi
+    cachedir=$(dirname "$STATFILE")
+    if [ ! -d "$cachedir" ]; then
+        mkdir -p "$cachedir" || error "Could not create dir $cachedir, $?"
+    fi
+    if [ -e "$STATFILE" ]; then
+        lines=$( flock -x "$LOCKFILE" wc -l "$STATFILE" | cut -d " " -f 1)
+        debug "found $lines lines in $STATFILE)"
+        if [ "$lines" -gt $MAX_STAT_LINES ]; then
+            (
+            flock -n 9 || exit 1
+            trim_statistics
+            ) 9>"$LOCKFILE"
+        fi
+    fi
 }
 
 prepare_cmd() {
@@ -247,9 +294,13 @@ prepare_cmd() {
     stage=$3
     key=$(printf "%11s%4s%9s" "$mode" "$ftype" "$stage"|sed -e "y/ /_/")
     debug "prepare_cmd, key=$key"
-    cmd=${validators[$key]};
-    debug "prepare_cmd, cmd=$cmd"
-    echo "$cmd"
+    if [[ ${validators[$key]:+1} ]]; then
+        cmd=${validators[$key]};
+        debug "prepare_cmd, cmd=$cmd"
+        echo "$cmd"
+    else
+        debug "no valid command found using key $key"
+    fi
 }
 
 prepare_ftype() {
@@ -290,10 +341,26 @@ estimate_mode() {
     echo $MODE
 }
 
+exec_cmd() { # runs validator command line $1, then records outcome and duration via update_statistics
+    local cmd=$1 ftype=$2 workflow=$3 stage=$4
+    local start_t stop_t duration is_valid # is_valid is a status: 0 = command succeeded, 1 = it failed
+    # An empty $cmd would expand to nothing, "succeed" and be recorded as a valid file.
+    if [[ -z "$cmd" ]]; then error "no command to execute (ftype=$ftype workflow=$workflow stage=$stage)"; return 1; fi
+    start_t=$(date +"%s")
+    debug "scan_file, calling cmd='$cmd'"
+    # $cmd is left unquoted on purpose: the command line has to be word-split into arguments.
+    if $cmd; then is_valid=0; else error "failed call of '$cmd', $?"; is_valid=1; fi
+    stop_t=$(date +"%s")
+    duration=$((stop_t - start_t))
+    debug "scan_file, duration=$duration is_valid=$is_valid"
+    update_statistics "$is_valid" "$duration" "$ftype" "$workflow" "$stage"
+}
+
+
 scan_file() {
     filename="$1"
     debug "scan_file, using filename: $filename"
-    mimetype=$(get_mimetype "$filename" | cut -d " " -f 2)
+    mimetype=$(get_mimetype "$filename")
     ftype=$(prepare_ftype "$mimetype")
     if [ "$MODE" = "auto" ]; then
         # try best guess
@@ -301,16 +368,13 @@ scan_file() {
     fi
     if [ "$STAGE" = "any" ]; then
         for stage in current upcoming; do
-            debug "scan_file, using stage: $stage (STAGE mode '$STAGE')"
             cmd=$(prepare_cmd "$MODE" "$ftype" "$stage" | sed -e "s#FILE#'$filename'#")
-            debug "scan_file, calling cmd='$cmd'"
-            $cmd || ( error "failed call of '$cmd', $?"; exit 1 )
+            exec_cmd "$cmd" "$ftype" "$MODE" "$stage"
         done
     else
-        debug "scan_file, using stage: $STAGE"
         cmd=$(prepare_cmd "$MODE" "$ftype" "$STAGE" | sed -e "s#FILE#'$filename'#")
-        debug "scan_file, calling cmd='$cmd'"
-        $cmd || ( error "failed call of '$cmd', $?"; exit 1 )
+        exec_cmd "$cmd" "$ftype" "$MODE" "$STAGE" # single-stage branch: the loop variable $stage is not set here
+
     fi
 
 }
-- 
GitLab