diff --git a/README b/README index c64a064..24f845e 100644 --- a/README +++ b/README @@ -95,7 +95,8 @@ The Notification System When a check return a failure, a previously defined notifier will be called. This will be triggered only after reed-alert find **3** -failures (not more or less) in a row for this check, this is a default +failures (not more or less, but this can be changed globally by +modifying the *tries* variable) in a row for this check, this is a default value that can be changed per probe with the :try parameter as explained later in this document. This is to prevent reed-alert to spam notifications for a long time (number of failures very high, like @@ -108,6 +109,13 @@ reed-alert will use the notifier system when it reach its try number and when the problem is fixed, so you know when it begins and when it ends. +It is possible to be reminded about a failure every n tries by setting +the keyword :reminder to a number. This is useful if you want +to be reminded from time to time if a problem is not fixed, because some +alerts such as mails can easily be overlooked or lost in a large volume +of mail. The :reminder is a setting per check. For a global reminder +setting, one can set the *reminder* variable. + reed-alert keep tracks of the count of failures with one file per probe failing in the "states" folder. 
To ensure unique filenames, the following format is used (+ means it's concatenated) : diff --git a/functions.lisp b/functions.lisp index 2c29730..ee0c6a0 100644 --- a/functions.lisp +++ b/functions.lisp @@ -3,6 +3,7 @@ (require 'asdf)) (defparameter *tries* 3) +(defparameter *reminder* 0) (defparameter *alerts* '()) (defparameter *states-dir* "~/.reed-alert/states/") (ensure-directories-exist *states-dir*) @@ -69,7 +70,10 @@ (defun trigger-alert(level function params result state) (let* ((notifier-command (assoc level *alerts*)) (command-string (cadr notifier-command))) - (setf command-string (replace-all command-string "%state%" (if (eql 'error state) "Start" "End"))) + (setf command-string (replace-all command-string "%state%" (cond + ((eql state 'START) "Begin") + ((eql state 'REMINDER) "Reminder") + (t "End")))) (setf command-string (replace-all command-string "%result%" (format nil "~a" result))) (setf command-string (replace-all command-string "%hostname%" (machine-instance))) (setf command-string (replace-all command-string "%os%" (software-type))) @@ -84,7 +88,7 @@ (get-decoded-time) (format nil "~a/~a/~a ~a:~a:~a" year month day hour minute second)))) command-string)) - + (defmacro stop-if-error(&body body) `(progn (and ,@body))) @@ -129,12 +133,21 @@ t) ;; failure handling - (let ((trigger-now? (= (+ 1 tries) (getf params :try *tries*)))) ; we add +1 because it's failing right now + (let ((trigger-now? (or + ;; we add +1 to tries because it's failing right now + (and (= (+ 1 tries) (getf params :try *tries*)) + 'START) ;; it starts failing + + ;; if reminder is set and a valid value (> 0) + (when (< 0 (getf params :reminder *reminder*)) + (and (= 0 (mod (+ 1 tries) (getf params :reminder *reminder*))) + 'REMINDER))))) ;; do we need to remind it's failing? + (format t " => ~aerror (~a failure(s) before)~a~a~%" *red* tries *white* (if trigger-now? " NOTIFIED" "")) ;; more error than limit, send alert once (when trigger-now? 
- (uiop:run-program (trigger-alert level fonction params (cadr result) 'error) :output t)) + (uiop:run-program (trigger-alert level fonction params (cadr result) trigger-now?) :output t)) ;; increment the number of tries by 1 (with-open-file (stream-out filepath :direction :output :if-exists :supersede)