diff --git a/tasks/rosetta/configure_nfs_monitoring.yml b/tasks/rosetta/configure_nfs_monitoring.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2a20ef86e4cd8e38a2cc44a70b511e7fbebd4672
--- /dev/null
+++ b/tasks/rosetta/configure_nfs_monitoring.yml
@@ -0,0 +1,38 @@
+---
+# Install and enable the systemd service that watches the journal for NFS
+# timeouts; the unit and its helper script are rendered from templates.
+- name: create directory for systemd units
+  ansible.builtin.file:
+    path: "/usr/local/lib/systemd/system/"
+    mode: "0755"
+    state: directory
+    owner: "root"
+    group: "root"
+
+- name: install NFS monitoring services
+  ansible.builtin.template:
+    src: "usr/local/lib/systemd/system/{{ item.name }}.j2"
+    dest: "/usr/local/lib/systemd/system/{{ item.name }}"
+    mode: "{{ item.mode }}"
+    owner: "root"
+    group: "{{ item.group }}"
+  loop:
+    - name: "nfs_timeout_notification.service"
+      mode: "0600"
+      group: "root"
+    # The unit runs this script under its own User=/Group=, so that group
+    # must be able to read it; root:root 0600 would make ExecStart fail for
+    # any service user other than root.
+    - name: "nfs_timeout_notification.service.sh"
+      mode: "0640"
+      group: "{{ vault_nfs_timeout_notification_service.group }}"
+  notify: daemon-reload
+
+- name: enable NFS monitoring services
+  ansible.builtin.systemd_service:
+    name: "{{ item.name }}"
+    enabled: "{{ item.enabled | default(true) }}"
+    state: "{{ item.state | default('started') }}"
+    daemon_reload: true
+  loop:
+    - name: "nfs_timeout_notification.service"
diff --git a/tasks/rosetta/main_rosetta.yml b/tasks/rosetta/main_rosetta.yml
index 3cca01e3069a9758489f09e30164d32021860fbf..865be2e8d8f5c01bfd8924878d0f0c3d32268b68 100644
--- a/tasks/rosetta/main_rosetta.yml
+++ b/tasks/rosetta/main_rosetta.yml
@@ -52,3 +52,6 @@
 - name: configure shell environment
   ansible.builtin.import_tasks: "rosetta/configure_shell.yml"
   tags: [shell, csh, alias, aliases]
+- name: configure NFS monitoring
+  ansible.builtin.import_tasks: "rosetta/configure_nfs_monitoring.yml"
+  tags: [rosetta, systemd, nfs_monitoring]
diff --git a/templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.j2 b/templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.j2
new file mode 100644
index 0000000000000000000000000000000000000000..c08722bc3892841eeecda5ccde2994a4e2d79ffc
--- /dev/null
+++ b/templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.j2
@@ -0,0 +1,15 @@
+[Unit]
+Description=monitor journal for NFS timeouts and notify staff
+After=remote-fs.target
+
+[Service]
+Type=simple
+RemainAfterExit=no
+Restart=no
+ExecStart=/bin/bash /usr/local/lib/systemd/system/nfs_timeout_notification.service.sh
+# The helper script above must be readable by this user or group.
+User={{ vault_nfs_timeout_notification_service.owner }}
+Group={{ vault_nfs_timeout_notification_service.group }}
+
+[Install]
+WantedBy=multi-user.target
diff --git a/templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.sh.j2 b/templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.sh.j2
new file mode 100644
index 0000000000000000000000000000000000000000..e0d36292fd2b613bd5b0a461229e8c51cfa11396
--- /dev/null
+++ b/templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.sh.j2
@@ -0,0 +1,23 @@
+#!/bin/bash
+#
+# Follow the systemd journal and mail staff whenever a line matching
+# NFS_TIMEOUT_REGEX appears, sending at most one mail per
+# WAIT_BETWEEN_MAILS_IN_SECONDS.
+
+WAIT_BETWEEN_MAILS_IN_SECONDS={{ vault_nfs_timeout_notification_service.wait_between_mails_in_seconds }};
+NFS_TIMEOUT_REGEX='nfs.*timed out';
+# Epoch second before which no mail is sent; 0 lets the first hit through.
+NO_MAIL_UNTIL_EPOCH=0;
+
+# The loop runs in the pipeline's subshell; NO_MAIL_UNTIL_EPOCH persists
+# across iterations there, which is all the rate limit needs.
+journalctl -f | while read -r LINE; do
+  if [[ "${LINE}" =~ ${NFS_TIMEOUT_REGEX} ]]; then
+    TIME=$(date +%s);
+    if [[ "${TIME}" -ge "${NO_MAIL_UNTIL_EPOCH}" ]]; then
+      NO_MAIL_UNTIL_EPOCH=$((TIME + WAIT_BETWEEN_MAILS_IN_SECONDS));
+      echo "NFS timeout detected, sending mail to staff";
+      printf '%s\n' "${LINE}" | /usr/bin/mail -s "NFS timeout detected on $(hostname -f)" {{ vault_nfs_timeout_notification_service.staff_mail }};
+    fi;
+  fi;
+done;