From 69e88104da50cd9d5f40cad8cb505f4513c4eadc Mon Sep 17 00:00:00 2001
From: Jens Steidl <Jens.Steidl@slub-dresden.de>
Date: Thu, 15 May 2025 18:09:26 +0200
Subject: [PATCH] feat: nfs timeout notification

---
 tasks/rosetta/configure_nfs_monitoring.yml    | 29 +++++++++++++++++++
 tasks/rosetta/main_rosetta.yml                |  3 ++
 .../nfs_timeout_notification.service.j2       | 14 +++++++++
 .../nfs_timeout_notification.service.sh.j2    | 16 ++++++++++
 4 files changed, 62 insertions(+)
 create mode 100644 tasks/rosetta/configure_nfs_monitoring.yml
 create mode 100644 templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.j2
 create mode 100644 templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.sh.j2

diff --git a/tasks/rosetta/configure_nfs_monitoring.yml b/tasks/rosetta/configure_nfs_monitoring.yml
new file mode 100644
index 0000000..2a20ef8
--- /dev/null
+++ b/tasks/rosetta/configure_nfs_monitoring.yml
@@ -0,0 +1,29 @@
+---
+- name: create directory for systemd units
+  ansible.builtin.file:
+    state: directory  # holds the unit file and its helper script
+    path: "/usr/local/lib/systemd/system/"
+    owner: "root"
+    group: "root"
+    mode: "0755"
+
+- name: install NFS monitoring services  # renders the unit and its script
+  ansible.builtin.template:
+    src: "usr/local/lib/systemd/system/{{ item }}.j2"
+    dest: "/usr/local/lib/systemd/system/{{ item }}"
+    mode: "0644"  # must be readable by the unit's User= (may be non-root)
+    owner: "root"
+    group: "root"
+  loop:
+    - "nfs_timeout_notification.service"
+    - "nfs_timeout_notification.service.sh"
+  notify: daemon-reload
+
+- name: enable NFS monitoring services
+  ansible.builtin.systemd_service:
+    daemon_reload: true  # reload unit files before the unit is touched
+    name: "{{ item.name }}"
+    state: "{{ item.state | default('started') }}"
+    enabled: "{{ item.enabled | default(true) }}"
+  loop:
+    - name: "nfs_timeout_notification.service"
diff --git a/tasks/rosetta/main_rosetta.yml b/tasks/rosetta/main_rosetta.yml
index 3cca01e..865be2e 100644
--- a/tasks/rosetta/main_rosetta.yml
+++ b/tasks/rosetta/main_rosetta.yml
@@ -52,3 +52,6 @@
 - name: configure shell environment
   ansible.builtin.import_tasks: "rosetta/configure_shell.yml"
   tags: [shell, csh, alias, aliases]
+- name: configure NFS monitoring  # installs and enables the NFS timeout unit
+  ansible.builtin.import_tasks: "rosetta/configure_nfs_monitoring.yml"
+  tags: [rosetta, systemd, nfs_monitoring]
diff --git a/templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.j2 b/templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.j2
new file mode 100644
index 0000000..c08722b
--- /dev/null
+++ b/templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.j2
@@ -0,0 +1,18 @@
+# Starts the journal watcher script that mails staff on NFS timeouts.
+# User/Group are taken from vault_nfs_timeout_notification_service.
+[Unit]
+Description=monitor journal for NFS timeouts and notify staff
+# Ordered after remote-fs.target, i.e. after remote file system mounts.
+After=remote-fs.target
+
+[Service]
+User={{ vault_nfs_timeout_notification_service.owner }}
+Group={{ vault_nfs_timeout_notification_service.group }}
+Type=simple
+ExecStart=/bin/bash /usr/local/lib/systemd/system/nfs_timeout_notification.service.sh
+# No automatic restart; the unit is not considered active once it exits.
+Restart=no
+RemainAfterExit=no
+
+[Install]
+WantedBy=multi-user.target
diff --git a/templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.sh.j2 b/templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.sh.j2
new file mode 100644
index 0000000..e0d3629
--- /dev/null
+++ b/templates/usr/local/lib/systemd/system/nfs_timeout_notification.service.sh.j2
@@ -0,0 +1,23 @@
+#!/bin/bash
+#
+# Follows the systemd journal and mails staff whenever a line matches
+# "nfs.*timed out". After a mail has been sent, further mails are
+# suppressed for WAIT_BETWEEN_MAILS_IN_SECONDS seconds.
+
+WAIT_BETWEEN_MAILS_IN_SECONDS={{ vault_nfs_timeout_notification_service.wait_between_mails_in_seconds }};
+# Epoch second before which no mail is sent; 0 lets the first match through.
+NO_MAIL_UNTIL_EPOCH=0;
+journalctl -f | while read -r LINE; do
+  TIMEOUT=$(echo "${LINE}" | grep "nfs.*timed out");
+  if [[ -n "${TIMEOUT}" ]]; then
+    TIME=$(date +%s);
+    # Compare against NO_MAIL_UNTIL_EPOCH, the variable that is actually
+    # maintained; an unset name evaluates as 0 and would disable throttling.
+    if [[ "${TIME}" -ge "${NO_MAIL_UNTIL_EPOCH}" ]]; then
+      NO_MAIL_UNTIL_EPOCH=$((TIME + WAIT_BETWEEN_MAILS_IN_SECONDS));
+      echo "NFS timeout detected, sending mail to staff";
+      echo "${TIMEOUT}" | /usr/bin/mail -s "NFS timeout detected on $(hostname -f)" {{ vault_nfs_timeout_notification_service.staff_mail }};
+    fi;
+  fi;
+  TIMEOUT="";
+done;
-- 
GitLab