diff --git a/files/error-summary.sh b/files/error-summary.sh
new file mode 100755
index 0000000000000000000000000000000000000000..186d4bf34f94193484f459639763e71a82c6ae91
--- /dev/null
+++ b/files/error-summary.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+
+### META
+# AUTHORS:
+#   - Jörg Sachse (<Joerg.Sachse@slub-dresden.de>)
+#   - Jens Steidl (<Jens.Steidl@slub-dresden.de>)
+# INVOCATION:
+#   ./error-summary.sh [-h] [-n NUM] [-d DAYS] [-w] [-c]
+# KNOWN PROBLEMS:
+#   - Showing Warnings with "-w" is buggy, use with caution.
+
+
+### DEFAULTS
+HOW_MANY_CHARS=200
+HOW_MANY_DAYS=10
+WARN=""  # Default: do not filter for Warnings, only show Errors
+CLASS_ONLY=""  # set to "yes" by -c/--class: match only the first [bracketed] field after the log level
+
+# get correct log path, this differs between servers
+if [[ -d "/operational_shared/logs/${HOSTNAME}.slub-dresden.de/" ]]; then
+  SERVER_LOG_DIR="/operational_shared/logs/${HOSTNAME}.slub-dresden.de/"
+else
+  SERVER_LOG_DIR="/operational_shared/logs/${HOSTNAME}/"
+fi
+
+
+### CLI ARGUMENTS (consumed with "shift", so an option's value is never parsed as an option itself)
+while [[ $# -gt 0 ]]; do
+  case "${1}" in
+    -h|--help)
+      echo "call help function here"
+      exit 0
+      ;;
+    -n|--chars)
+      HOW_MANY_CHARS="${2:?option '${1}' requires a value}"
+      shift
+      ;;
+    -d|--days)
+      HOW_MANY_DAYS="${2:?option '${1}' requires a value}"
+      shift
+      ;;
+    -w|--warn)
+      WARN="|WARN"
+      ;;
+    -c|--class)
+      CLASS_ONLY="yes"
+      ;;
+    *)
+      echo "'${1}' is not a valid parameter. Please use '$( basename "${0}" ) --help'. Exiting."
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+# Build the filter only after parsing, so that -n, -w and -c take effect regardless of their order.
+FILTER_STRING=" (ERROR${WARN})\s+\[.{0,${HOW_MANY_CHARS}}"
+if [[ -n "${CLASS_ONLY}" ]]; then FILTER_STRING=" (ERROR${WARN})\s+\[[^[]*\]"; fi
+
+echo ""
+echo "USING THE FOLLOWING SETTINGS:"
+echo -e "\tSERVERNAME: '${HOSTNAME}'"
+echo -e "\tFILTERING LOGS FROM $( date -d "${HOW_MANY_DAYS} days ago" +%Y-%m-%d ) UNTIL $( date -d today +%Y-%m-%d )"
+echo -e "\tHOW MANY CHARACTERS: ${HOW_MANY_CHARS}"
+echo -en "\tCOUNT WARNINGS: "; [[ "${WARN}" == "|WARN" ]] && echo "yes" || echo "no"
+
+### FUNCTIONS
+
+# thx @Steidl!
+# grep -o "START_STRING..\{NUMBER_OF_CHARS_AFTER_START_STRING\}" | sort | uniq -c | sort -nr
+
+# Print "<count> <match>" for each distinct FILTER_STRING hit, most frequent first.
+# Reads the globals SERVER_LOG_DIR, HOW_MANY_DAYS and FILTER_STRING; handles plain and gzipped server.log* files.
+get_last_n_days(){
+  local LOGFILE GREP_CMD
+  while IFS= read -r -d '' LOGFILE; do
+    case "$( file -b "${LOGFILE}" )" in
+      *"gzip compressed data"*) GREP_CMD="zgrep" ;;
+      *text*) GREP_CMD="grep" ;;  # also covers non-ASCII (e.g. UTF-8) text logs
+      *)
+        >&2 echo "ERROR: unknown filetype, cannot find error strings from file '${LOGFILE}'."
+        continue  # never run a stale or empty GREP_CMD against an unsupported file
+        ;;
+    esac
+    "${GREP_CMD}" -oP "${FILTER_STRING}" "${LOGFILE}"
+  done < <( find "${SERVER_LOG_DIR}/" -maxdepth 1 -mtime -"${HOW_MANY_DAYS}" -name "server.log*" -print0 ) | \
+    sort | uniq -c | sort -nr
+}
+
+### MAIN
+echo ""
+echo "REPORT LAST ${HOW_MANY_DAYS} DAYS:"
+get_last_n_days
+
+echo ""
+echo "REPORT SINCE YESTERDAY 00:00 O'CLOCK:"
+HOW_MANY_DAYS=1
+get_last_n_days
diff --git a/handlers/main.yml b/handlers/main.yml
index 8998fdb09036f9f3dd418ce67c56904ef28a38ad..9d366ba642c91297ddee1ef35d987799615b4751 100644
--- a/handlers/main.yml
+++ b/handlers/main.yml
@@ -37,3 +37,7 @@
 - name: activate kernel parameter changes
   command: sysctl -p
   ignore_errors: true
+
+- name: daemon-reload
+  ansible.builtin.systemd:
+    daemon_reload: true
diff --git a/tasks/rosetta/install_error_summary.yml b/tasks/rosetta/install_error_summary.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6c5a3b4532f3e00cff254abeecb5e047ebd525a3
--- /dev/null
+++ b/tasks/rosetta/install_error_summary.yml
@@ -0,0 +1,55 @@
+---
+- name: install error summary script
+  ansible.builtin.copy:
+    src: "error-summary.sh"
+    dest: "/operational_shared/software/error-summary.sh"
+    owner: "{{ vault_rosetta_user }}"
+    group: "{{ vault_rosetta_group }}"
+    mode: "0755"
+
+- name: install error summary SystemD service/timer
+  ansible.builtin.template:
+    src: "{{ item }}.j2"
+    dest: "/etc/systemd/user/{{ item }}"
+    mode: "0644"
+  loop:
+    - "error-summary.service"
+    - "error-summary.timer"
+  notify: daemon-reload
+
+
+
+
+- name: find error summary systemd units so we don't have to hardcode their names in the loops
+  ansible.builtin.find:
+    path: "/etc/systemd/user/"
+    pattern: "error-summary.*"
+  register: error_summary_units
+
+- name: check if error summary units are already enabled
+  ansible.builtin.command: "systemctl is-enabled {{ item.path | basename }}"
+  loop: "{{ error_summary_units.files }}"
+  register: error_summary_enabled
+  changed_when: false
+  failed_when:
+    - error_summary_enabled.stdout != "enabled"
+    - error_summary_enabled.stdout != "disabled"
+    - '"No such file or directory" not in error_summary_enabled.stderr'
+
+- name: manually enable error_summary.service, because it cannot be found by the ansible.builtin.systemd module when the timer is located below "/etc/systemd/user/"
+  ansible.builtin.command: "systemctl enable {{ item.item.path }}"
+  loop: "{{ error_summary_enabled.results }}"
+  when:
+    - item.stdout != "enabled"
+  register: error_summary_enablecmd
+  # report "changed" only if systemctl announced a new symlink; check both streams, the message may go to stderr
+  changed_when: '"Created symlink" in (error_summary_enablecmd.stdout ~ error_summary_enablecmd.stderr)'
+
+
+
+
+
+- name: start error summary SystemD timer (the service doesn't need to be started, that's done by the timer)
+  ansible.builtin.systemd:
+    name: "error-summary.timer"
+    state: started
diff --git a/tasks/rosetta/main_rosetta.yml b/tasks/rosetta/main_rosetta.yml
index 18a6a1e368a24b95951556292a9eb0c0aa1b4ab9..6c56b5b25803e49a4546c8bc6b85c53875f7a7bd 100644
--- a/tasks/rosetta/main_rosetta.yml
+++ b/tasks/rosetta/main_rosetta.yml
@@ -21,3 +21,5 @@
   tags: [backup]
 - import_tasks: rosetta/install_format_library_xsds.yml
   tags: [rosetta, xsd]
+- import_tasks: "rosetta/install_error_summary.yml"
+  tags: [monitoring, reporting, visibility, errorsummary]
diff --git a/templates/error-summary.service.j2 b/templates/error-summary.service.j2
new file mode 100644
index 0000000000000000000000000000000000000000..5124c74abf26b24a41e76f6c501dc3daabbf4c99
--- /dev/null
+++ b/templates/error-summary.service.j2
@@ -0,0 +1,18 @@
+[Unit]
+Description=service daily Rosetta error summary
+
+[Service]
+Type=oneshot
+Restart=no
+# "%H" is systemd's host name specifier. "${HOSTNAME}" would be substituted by systemd itself (not by bash) and HOSTNAME is normally not set in the service environment, leaving the subject without a host name.
+ExecStart=/bin/bash -c '/operational_shared/software/error-summary.sh | /usr/bin/mail -s "Rosetta Error Summary from %H" "langzeitarchiv@slub-dresden.de"'
+User={{ vault_rosetta_user }}
+Group={{ vault_rosetta_group }}
+
+# https://unix.stackexchange.com/a/231201 says:
+# /usr/bin/mail performs a double fork to daemonize sendmail for sending the email. This sendmail proc gets reowned to init, so normally it wouldn't be affected by anything that happens with the original parent - except in the systemd case that reowned grandchild is still in the same cgroup as the original service. When systemd tears things down, it kills all processes within the cgroup, including the reowned sendmail process. The mail command itself ran fine, but sendmail was getting killed by systemd before it had a chance to do its thing.
+# You can get around this by setting KillMode to process (the default is control-group); the directive belongs in the [Service] section, as done here. That will cause systemd to only kill the process which it directly fired.
+KillMode=process
+
+[Install]
+WantedBy=default.target
diff --git a/templates/error-summary.timer.j2 b/templates/error-summary.timer.j2
new file mode 100644
index 0000000000000000000000000000000000000000..a3c9e59ffdcdc6f3a114b4660a6fc0f06cecd9d9
--- /dev/null
+++ b/templates/error-summary.timer.j2
@@ -0,0 +1,15 @@
+[Unit]
+Description=timer daily Rosetta error summary
+
+[Timer]
+# Run every day (daily ==> *-*-* 00:00:00)
+OnCalendar=daily
+# If the last scheduled run was missed (for example because the system was powered off), trigger the service immediately when the timer is activated
+Persistent=true
+# Unit to activate when the timer elapses (the default is the unit with the same name as the timer, except for the suffix)
+Unit=error-summary.service
+
+[Install]
+# Required so that the timer can be enabled permanently:
+# enabling the unit creates a symbolic link in the "default.target.wants/" directory
+WantedBy=default.target