Skip to content
Snippets Groups Projects
Commit 8b3e57a1 authored by Jörg Sachse's avatar Jörg Sachse
Browse files

feat: create error-summary report for Rosetta log errors and mail it to admins

parent 9e77885b
Branches
No related tags found
No related merge requests found
#!/usr/bin/env bash
### META
# AUTHORS:
# - Jörg Sachse (<Joerg.Sachse@slub-dresden.de>)
# - Jens Steidl (<Jens.Steidl@slub-dresden.de>)
# INVOCATION:
# ./error-summary.sh [-h] [-n NUM] [-d DAYS] [-w] [-c]
# KNOWN PROBLEMS:
# - Showing Warnings with "-w" is buggy, use with caution.
### DEFAULTS
# Maximum number of characters captured after the opening "[" of a log entry.
HOW_MANY_CHARS=200
# Look-back window (in days) used when selecting log files.
HOW_MANY_DAYS=10
# Empty: only ERROR entries are matched. "-w" sets this to "|WARN".
WARN=""
# PCRE handed to grep -oP: log level, whitespace, "[" and up to
# HOW_MANY_CHARS following characters.
FILTER_STRING=" (ERROR${WARN})\s+\[.{0,${HOW_MANY_CHARS}}"

# The log directory name differs between servers: prefer the variant with the
# domain suffix when it exists, otherwise use the bare hostname.
SERVER_LOG_DIR="/operational_shared/logs/${HOSTNAME}/"
if [[ -d "/operational_shared/logs/${HOSTNAME}.slub-dresden.de/" ]]; then
  SERVER_LOG_DIR="/operational_shared/logs/${HOSTNAME}.slub-dresden.de/"
fi
### CLI ARGUMENTS
#######################################
# Parse the command line and derive the grep filter from the result.
# Globals:
#   HOW_MANY_CHARS (written by -n/--chars, read for the filter)
#   HOW_MANY_DAYS  (written by -d/--days)
#   WARN           (written by -w/--warn, read for the filter)
#   FILTER_STRING  (always rewritten once parsing has finished)
# Arguments:
#   The script's positional parameters.
# Outputs:
#   stdout: help placeholder for -h/--help
#   stderr: message for an invalid parameter or a missing/non-numeric value
# Returns:
#   Exits 0 after -h/--help, exits 1 on invalid input.
#######################################
parse_cli_arguments() {
  local class_only=""

  # The parameters are consumed with shift, so the value following -n/-d is
  # never looked at again as if it were an option of its own.
  while (( $# > 0 )); do
    case "${1}" in
      -h|--help)
        echo "call help function here"
        exit 0
        ;;
      -n|--chars)
        if [[ ! "${2-}" =~ ^[0-9]+$ ]]; then
          >&2 echo "'${1}' needs a non-negative integer as its value. Please use '$( basename "${0}" ) --help'. Exiting."
          exit 1
        fi
        HOW_MANY_CHARS="${2}"
        shift
        ;;
      -d|--days)
        if [[ ! "${2-}" =~ ^[0-9]+$ ]]; then
          >&2 echo "'${1}' needs a non-negative integer as its value. Please use '$( basename "${0}" ) --help'. Exiting."
          exit 1
        fi
        HOW_MANY_DAYS="${2}"
        shift
        ;;
      -w|--warn)
        WARN="|WARN"
        ;;
      -c|--class)
        class_only="yes"
        ;;
      *)
        >&2 echo "'${1}' is not a valid parameter. Please use '$( basename "${0}" ) --help'. Exiting."
        exit 1
        ;;
    esac
    shift
  done

  # The filter is assembled only now, so that -n and -w take effect no matter
  # in which order the options were given.
  if [[ -n "${class_only}" ]]; then
    # -c/--class: match only up to the closing "]" of the bracketed field.
    FILTER_STRING=" (ERROR${WARN})\s+\[[^[]*\]"
  else
    FILTER_STRING=" (ERROR${WARN})\s+\[.{0,${HOW_MANY_CHARS}}"
  fi
}
parse_cli_arguments "$@"
# Print the effective settings ahead of the reports.
printf '\n'
printf '%s\n' "USING THE FOLLOWING SETTINGS:"
echo -e "\tSERVERNAME: '${HOSTNAME}'"
echo -e "\tFILTERING LOGS FROM $( date -d "${HOW_MANY_DAYS} days ago" +%Y-%m-%d ) UNTIL $( date -d today +%Y-%m-%d )"
echo -e "\tHOW MANY CHARACTERS: ${HOW_MANY_CHARS}"
# WARN only ever holds "" or "|WARN"; translate that into yes/no.
if [[ "${WARN}" == "|WARN" ]]; then
  echo -e "\tCOUNT WARNINGS: yes"
else
  echo -e "\tCOUNT WARNINGS: no"
fi
### FUNCTIONS
# thx @Steidl!
# Basic idea: grep -o "START_STRING..\{NUMBER_OF_CHARS_AFTER_START_STRING\}" | sort | uniq -c | sort -nr
#######################################
# Print a frequency table of the filter matches found in recent server logs.
# Globals:
#   SERVER_LOG_DIR (read) - directory that holds the server.log* files
#   HOW_MANY_DAYS  (read) - only files modified less than this many days ago
#   FILTER_STRING  (read) - PCRE passed to grep -oP / zgrep -oP
# Arguments:
#   None
# Outputs:
#   stdout: one "COUNT MATCH" line per distinct match, most frequent first
#   stderr: a message for every file whose type is not recognised
#######################################
get_last_n_days(){
  local logfile filetype grep_cmd

  # find emits NUL-delimited paths so that names containing whitespace are
  # handed to grep as a single argument.
  while IFS= read -r -d '' logfile; do
    filetype="$( file -b "${logfile}" )"
    case "${filetype}" in
      *"gzip compressed data"*)
        grep_cmd="zgrep"
        ;;
      *text*)
        # "ASCII text" as well as other text encodings reported by file(1),
        # e.g. UTF-8 logs.
        grep_cmd="grep"
        ;;
      *)
        >&2 echo "ERROR: unknown filetype, cannot find error strings from file '${logfile}'."
        # Skip the file instead of running a stale or empty grep command.
        continue
        ;;
    esac
    "${grep_cmd}" -oP "${FILTER_STRING}" "${logfile}"
  done < <( find "${SERVER_LOG_DIR}/" -maxdepth 1 -mtime -"${HOW_MANY_DAYS}" -name "server.log*" -print0 ) \
    | sort | uniq -c | sort -nr
}
### MAIN
# First report: uses the configured look-back window.
printf '\n%s\n' "REPORT LAST ${HOW_MANY_DAYS} DAYS:"
get_last_n_days

# Second report: the same function, with the window narrowed to one day.
# NOTE(review): the selection is based on file modification time (find -mtime),
# which is not the same as "since 00:00 yesterday" - confirm the heading.
printf '\n%s\n' "REPORT SINCE YESTERDAY 00:00 O'CLOCK:"
HOW_MANY_DAYS=1
get_last_n_days
......@@ -37,3 +37,7 @@
- name: activate kernel parameter changes
command: sysctl -p
ignore_errors: true
# Handler: calls the systemd module with daemon_reload set. It is triggered by
# tasks that carry "notify: daemon-reload".
- name: daemon-reload
ansible.builtin.systemd:
daemon_reload: true
---
# Copy the report script to the shared software directory, owned by the
# Rosetta user/group (both taken from vault variables) and executable.
- name: install error summary script
ansible.builtin.copy:
src: "error-summary.sh"
dest: "/operational_shared/software/error-summary.sh"
owner: "{{ vault_rosetta_user }}"
group: "{{ vault_rosetta_group }}"
mode: "0755"
# Render the "<unit>.j2" templates for the service and the timer into
# /etc/systemd/user/ and notify the daemon-reload handler.
- name: install error summary SystemD service/timer
ansible.builtin.template:
src: "{{ item }}.j2"
dest: "/etc/systemd/user/{{ item }}"
mode: "0644"
loop:
- "error-summary.service"
- "error-summary.timer"
notify: daemon-reload
# Collect every file below /etc/systemd/user/ that matches "error-summary.*";
# the result feeds the loops of the following tasks.
- name: find error summary systemd units so we don't have to hardcode their names in the loops
ansible.builtin.find:
path: "/etc/systemd/user/"
pattern: "error-summary.*"
register: error_summary_units
# Query the enablement state of each unit that was found. This is a read-only
# check, hence changed_when is false.
- name: check if error summary units are already enabled
ansible.builtin.command: "systemctl is-enabled {{ item.path | basename }}"
loop: "{{ error_summary_units.files }}"
register: error_summary_enabled
changed_when: false
# All list entries must hold for the task to fail: stdout is neither "enabled"
# nor "disabled", and stderr does not report a missing unit file.
failed_when:
- error_summary_enabled.stdout != "enabled"
- error_summary_enabled.stdout != "disabled"
- '"No such file or directory" not in error_summary_enabled.stderr'
# Enable every unit whose "is-enabled" check did not report "enabled", by
# passing the unit's full path to systemctl.
- name: manually enable error_summary.service, because it cannot be found by the ansible.builtin.systemd module when the timer is located below "/etc/systemd/user/"
  ansible.builtin.command: "systemctl enable {{ item.item.path }}"
  loop: "{{ error_summary_enabled.results }}"
  when:
    - item.stdout != "enabled"
  register: error_summary_enablecmd
  # Report "changed" only when systemctl announced a new symlink. The needle
  # has to be on the left-hand side of "in"; both output streams are checked
  # because the message is not guaranteed to arrive on stdout.
  changed_when: '"Created symlink" in (error_summary_enablecmd.stdout ~ error_summary_enablecmd.stderr)'
# Make sure the timer unit is in the "started" state; per the task name, the
# service itself is left to be triggered by the timer.
- name: start error summary SystemD timer (the service doesn't need to be started, that's done by the timer)
ansible.builtin.systemd:
name: "error-summary.timer"
state: started
......@@ -21,3 +21,5 @@
tags: [backup]
- import_tasks: rosetta/install_format_library_xsds.yml
tags: [rosetta, xsd]
# Pull in the error summary installation tasks; they can be selected with any
# of the tags listed below.
- import_tasks: "rosetta/install_error_summary.yml"
tags: [monitoring, reporting, visibility, errorsummary]
# Oneshot service that runs the error summary script and pipes its output to
# /usr/bin/mail. User and group are filled in from vault variables when the
# template is rendered.
[Unit]
Description=service daily Rosetta error summary
[Service]
Type=oneshot
Restart=no
# The host name is inserted with the systemd specifier %H. A "${HOSTNAME}"
# inside ExecStart would be substituted by systemd from the unit's environment
# before bash ever sees it, which is likely to leave the subject without a host name.
ExecStart=/bin/bash -c '/operational_shared/software/error-summary.sh | /usr/bin/mail -s "Rosetta Error Summary from %H" "langzeitarchiv@slub-dresden.de"'
User={{ vault_rosetta_user }}
Group={{ vault_rosetta_group }}
# https://unix.stackexchange.com/a/231201 says:
# /usr/bin/mail performs a double fork to daemonize sendmail for sending the email. This sendmail proc gets reowned to init, so normally it wouldn't be affected by anything that happens with the original parent - except in the systemd case that reowned grandchild is still in the same cgroup as the original service. When systemd tears things down, it kills all processes within the cgroup, including the reowned sendmail process.
# The mail command itself ran fine, but sendmail was getting killed by systemd before it had a chance to do its thing.
# You can get around this by setting KillMode to process (the default is control-group). That will cause systemd to only kill the process which it directly fired.
# NOTE: KillMode is a [Service] setting, which is why it is placed in this section.
KillMode=process
[Install]
WantedBy=default.target
# Timer that triggers error-summary.service once per day.
[Unit]
Description=timer daily Rosetta error summary
[Timer]
# Run every day (daily ==> *-*-* 00:00:00)
OnCalendar=daily
# Trigger the service immediately on activation if the last scheduled run was missed, for example because the system was powered off
Persistent=true
# Unit to activate when the timer elapses (defaults to the unit with the same name as the timer, except for the suffix)
Unit=error-summary.service
[Install]
# Required so that the timer can be enabled permanently.
# Enabling the timer creates a symbolic link in the "wants" directory of the target named below.
WantedBy=default.target
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment