mirror of git://bitreich.org/reed-alert
Count failures and send begin/end notifications
This commit is contained in:
parent
1b2f15bf29
commit
f352b8458e
37
README
37
README
|
@ -63,9 +63,29 @@ The configuration is explained below.
|
|||
The Notification System
|
||||
=======================
|
||||
|
||||
When a check return an error, a previously defined notifier will be
|
||||
called. The notifier is a shell command with a name. The shell command
|
||||
can contains variables from reed-alert.
|
||||
When a check returns a failure, a previously defined notifier will be
|
||||
called. This will be triggered only after reed-alert finds **3**
|
||||
failures (not more or less) in a row for this check, this is a default
|
||||
value that can be changed per probe with the :try parameter as
|
||||
explained later in this document. This prevents reed-alert from
|
||||
spamming notifications for a long time (a very high number of failures,
|
||||
like a disk space usage that can't be fixed for a long time) and from
|
||||
sending notifications about a check that sits on the edge
|
||||
of the limit, like a ping almost working but failing from time to time
|
||||
or a load average hovering around the limit.
|
||||
|
||||
reed-alert will use the notifier system when a check reaches its try number
|
||||
and when the problem is fixed, so you know when it begins and when it
|
||||
ends.
|
||||
|
||||
reed-alert keeps track of the count of failures with one file per
|
||||
probe failing in the "states" folder. To ensure unique filenames, the
|
||||
following format is used (+ means it's concatenated) :
|
||||
|
||||
alert-name + "-" + probe-name + "-" + hash of probe parameters
|
||||
|
||||
The notifier is a shell command with a name. The shell command can
|
||||
contain variables from reed-alert.
|
||||
|
||||
+ %function% : the name of the probe
|
||||
+ %date% : the current date with format YYYY/MM/DD hh:mm:ss
|
||||
|
@ -76,6 +96,7 @@ can contains variables from reed-alert.
|
|||
+ %level% : the type of notification used
|
||||
+ %os% : the type of operating system (FreeBSD/Linux/OpenBSD)
|
||||
+ %newline% : a newline character
|
||||
+ %state% : "Start" / "End" when the problem happens / is solved
|
||||
|
||||
|
||||
Example Probe 1: 'Check For Load Average'
|
||||
|
@ -119,6 +140,16 @@ does. It can be put in every probe.
|
|||
:desc "STRING"
|
||||
|
||||
|
||||
The :try Parameter
|
||||
------------------
|
||||
The :try parameter allows you to change how many failures to wait for
|
||||
before the alert is triggered. By default, it's triggered after 3
|
||||
failures. Sometimes, when using ping for example, you want to be
|
||||
notified only when it fails for a few cycles and not at the first failure.
|
||||
|
||||
:try INTEGER
|
||||
|
||||
|
||||
Overview
|
||||
--------
|
||||
As of this commit, reed-alert ships with the following probes:
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
(load "functions.lisp")
|
||||
|
||||
(alert mail "echo -n 'Problem with %function% %date% %params%' | mail -s alarm mail@isp.net")
|
||||
(alert sms "/home/user/sms.sh '%date% %function% %params% %hostname%")
|
||||
(alert available-variables "REMINDER : %function% %params% %date% %hostname% %desc% %level% %os% %newline% %result%")
|
||||
(alert mail "echo -n '[%state%] Problem with %function% %date% %params%' | mail -s '[%state%] alarm' mail@isp.net")
|
||||
(alert sms "/home/user/sms.sh '%date% %state% %function% %params% %hostname%")
|
||||
(alert available-variables "REMINDER : %function% %params% %date% %hostname% %desc% %level% %os% %newline% %result% %state%")
|
||||
(alert empty "")
|
||||
|
||||
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
(load "functions.lisp")
|
||||
|
||||
(alert dont-use-it "REMINDER %function% %params% %date% %hostname% %desc% %level% %os% %newline% _ %space% %result%")
|
||||
(alert dont-use-it "REMINDER %state% %function% %params% %date% %hostname% %desc% %level% %os% %newline% _ %space% %result%")
|
||||
(alert empty "")
|
||||
(alert mail "")
|
||||
(alert peroket "echo 'problem at %date% with %function% %params%'")
|
||||
(alert peroket "echo '%state% problem at %date% with %function% %params% : %result%'")
|
||||
(alert sms "echo -n '%date% %function% CRITICAL on %hostname%' | curl http://somewebservice")
|
||||
;(alert mail "echo -n '%date% %hostname% had problem on %function% %newline% %params% values %result% %newline%
|
||||
; %desc%' | mail -s '[Error] %function% - %hostname%' foo@bar.com")
|
||||
|
@ -15,8 +15,8 @@
|
|||
(=> peroket disk-usage :path "/tmp" :limit 0) ;; failure
|
||||
|
||||
;; check if :path file exists
|
||||
(=> mail file-exists :path "/bsd.rd" :desc "OpenBSD kernel /bsd.rd")
|
||||
(=> empty file-exists :path "/non-existant-file") ;; failure file not found
|
||||
(=> mail file-exists :path "/bsd.rd" :desc "OpenBSD kernel /bsd.rd")
|
||||
(=> empty file-exists :path "/non-existant-file" :try 1) ;; failure file not found
|
||||
|
||||
;; check if :path file exists and has been updated since :limit minutes
|
||||
(=> empty file-updated :path "/var/log/messages" :limit 400)
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
(require 'asdf)
|
||||
|
||||
(defparameter *tries* 3)
|
||||
(defparameter *alerts* '())
|
||||
(ensure-directories-exist "states/")
|
||||
|
||||
(defun color(num1 num2)
|
||||
(format nil "~a[~a;~am" #\Escape num1 num2))
|
||||
|
@ -57,9 +59,10 @@
|
|||
(push (list ',name ,string)
|
||||
*alerts*)))
|
||||
|
||||
(defun trigger-alert(level function params result)
|
||||
(defun trigger-alert(level function params result state)
|
||||
(let* ((notifier-command (assoc level *alerts*))
|
||||
(command-string (cadr notifier-command)))
|
||||
(setf command-string (replace-all command-string "%state%" (if (eql 'error state) "Start" "End")))
|
||||
(setf command-string (replace-all command-string "%result%" (format nil "~a" result)))
|
||||
(setf command-string (replace-all command-string "%hostname%" (machine-instance)))
|
||||
(setf command-string (replace-all command-string "%os%" (software-type)))
|
||||
|
@ -85,15 +88,53 @@
|
|||
|
||||
(defun =>(level fonction &rest params)
|
||||
(format t "[~a~a ~20A~a] ~45A" *yellow* level fonction *white* (getf params :desc params))
|
||||
(let ((hash (fnv-hash (format nil "~{~a~}" (nconc (list level fonction) (remove-if #'symbolp params)))))
|
||||
(result (funcall fonction params)))
|
||||
(let* ((hash (fnv-hash (format nil "~{~a~}" (remove-if #'symbolp params))))
|
||||
(result (funcall fonction params))
|
||||
(filename (format nil "~a-~a-~a" level fonction hash))
|
||||
(filepath (format nil "states/~a" filename)))
|
||||
(if (not (listp result))
|
||||
(progn
|
||||
(format t " => ~asuccess~a~%" *green* *white*)
|
||||
(if (probe-file filepath)
|
||||
;; last time was a failure
|
||||
(progn
|
||||
(uiop:run-program (trigger-alert level fonction params t 'success) :output t)
|
||||
(delete-file filepath)
|
||||
(format t " => ~afailure => success~a~%" *green* *white*))
|
||||
;; last time was a success
|
||||
(format t " => ~asuccess~a~%" *green* *white*))
|
||||
;; we return t because it's ok
|
||||
t)
|
||||
|
||||
(progn
|
||||
(format t " => ~aerror~a~%" *red* *white*)
|
||||
(uiop:run-program (trigger-alert level fonction params (cadr result)) :output t)
|
||||
(if (probe-file filepath)
|
||||
;; error before
|
||||
;; but how many ?
|
||||
(with-open-file (stream filepath :direction :input)
|
||||
(let ((tries (parse-integer (read-line stream 0 nil))))
|
||||
(format t " => ~aerror (~a failures before)~a~%" *red* tries *white*)
|
||||
|
||||
;; more error than limit, send alert once
|
||||
(when (= tries (getf params :try *tries*))
|
||||
(uiop:run-program (trigger-alert level fonction params (cadr result) 'error) :output t))
|
||||
|
||||
;; increment the file
|
||||
(progn
|
||||
(with-open-file (stream-out filepath :direction :output
|
||||
:if-exists :supersede)
|
||||
(format stream-out "~a~%~a~%" (+ 1 tries) params)))))
|
||||
|
||||
;; file doesn't exist
|
||||
(with-open-file (stream-out filepath :direction :output
|
||||
:if-exists :supersede)
|
||||
(format t " => ~aerror (first failure)~a~%" *red* *white*)
|
||||
|
||||
;; maybe we would be warned at first error ?
|
||||
;; code is duplicated from above because it
|
||||
;; requires reading the non existent file
|
||||
(when (= 1 (getf params :try *tries*))
|
||||
(uiop:run-program (trigger-alert level fonction params (cadr result) 'error) :output t))
|
||||
|
||||
(format stream-out "1~%~a~%" params)))
|
||||
nil))))
|
||||
|
||||
(load "probes.lisp")
|
||||
|
|
Loading…
Reference in New Issue