;;;; Example monitoring configuration.
;;;;
;;;; Layout:
;;;;   1. alert definitions : (alert NAME "shell command template")
;;;;   2. checks            : (=> ALERT-NAME PROBE-NAME :key value ...)
;;;;   3. a custom probe declared with CREATE-PROBE, and checks using it
;;;;
;;;; Every form and every comment sits on its own line: in Lisp a `;`
;;;; comments out the rest of the physical line, so joining lines would
;;;; silently disable every form that follows a comment.

;;; ------------------------------------------------------------------
;;; Alerts
;;; ------------------------------------------------------------------

;; Not meant to be used: it only lists the %placeholders% that can
;; appear in an alert command template.
(alert dont-use-it "REMINDER %state% %function% %params% %date% %hostname% %desc% %level% %os% %newline% _ %space% %result%")

;; Alerts with an empty command string.
(alert empty "")
(alert mail "")

(alert peroket "echo '%state% problem at %date% with %function% %params% : %result%'")

;; NOTE(review): placeholder URL. As written, curl is not told to read
;; the piped text (that needs something like `-d @-`); adapt before use.
(alert sms "echo -n '%date% %function% CRITICAL on %hostname%' | curl http://somewebservice")

;; Example of a real mail alert (disabled; would replace the empty
;; `mail` alert above):
;(alert mail "echo -n '%date% %hostname% had problem on %function% %newline% %params% values %result% %newline% ; %desc%' | mail -s '[Error] %function% - %hostname%' foo@bar.com")

;;; ------------------------------------------------------------------
;;; Checks
;;; ------------------------------------------------------------------

;; Check whether the used percentage of the partition at :path is more
;; than :limit.
(=> peroket disk-usage :path "/" :limit 90)
(=> peroket disk-usage :path "/usr" :limit 85)
(=> peroket disk-usage :path "/tmp" :limit 0) ; expected failure

;; Check whether the file at :path exists.
(=> mail check-file-exists :path "/bsd.rd" :desc "OpenBSD kernel /bsd.rd")
(=> empty check-file-exists :path "/non-existant-file" :try 3) ; expected failure: file not found

;; Check whether the file at :path exists and has been updated within
;; the last :limit minutes.
(=> empty file-updated :path "/var/log/messages" :limit 400)
(=> mail file-updated :path "/bsd.rd" :limit 1 :desc "OpenBSD kernel") ; expected failure
(=> mail file-updated :path "/tmp/reed-alert.txt" :limit 10)

;; Check whether the process named by the pid file at :path is running.
(=> mail pid-running :path "/var/run/xdm.pid" :desc "XDM pid")
(=> mail pid-running :path "/home/user/test.pid") ; expected failure

;; Check whether the number of processes on the system is more than
;; :limit.
(=> mail number-of-processes :limit 200)
(=> mail number-of-processes :limit 1) ; expected failure

;; Check whether a service is running.
(=> mail service :name "httpd" :reminder 2) ; reminds every 2 checks while it is failing
(=> mail service :name "ospfd")             ; expected failure: not started
(=> mail service :name "unknown")           ; expected failure: not known

;; Check whether the load average over 1/5/15 minutes is more than
;; :limit.
(=> mail load-average-1 :limit 4)
;;(=> mail load-average-5 :limit 2)
;;(=> mail load-average-15 :limit 1)
(=> mail load-average-1 :limit 0.2) ; should trigger an error

;; Check whether host :host is reachable.
;;(=> mail ping :host "8.8.8.8" :desc "Google DNS")
;;(=> empty ping :host "127.40.30.21" :desc "Certainly not used address") ; fails: time out
;; Checks are ordinary Lisp forms, so they can be generated in a loop.
(loop for host in (list "8.8.8.8" "8.8.4.4" "127.0.0.1")
      do (=> empty ping :host host))

;; Check whether the shell command :command returns 0 (success) or
;; something else (error).
(=> empty command :command "echo hello")              ; success
(=> empty command :command "ls /non-existent-file")   ; expected failure

;; Check that a web page answers in time. This goes through the generic
;; `command` probe; the time limit is curl's own `-m 10` (seconds).
(=> empty command :command "curl -m 10 http://google.fr/")

;; Check whether a certificate is still valid within a time range.
(=> mail ssl-expiration :host "google.fr" :seconds 1296000)               ; 15 days
(=> mail ssl-expiration :host "freenode.net" :seconds (* 7 24 60 60))     ; 7 days
(=> mail ssl-expiration :host "freenode.net" :seconds 1296000 :port 6697) ; 15 days, non-default port

;; Update a file's modification time.
(=> mail write-to-file :path "/tmp/reed-alert.txt")

;;; ------------------------------------------------------------------
;;; Custom probe
;;; ------------------------------------------------------------------

;; Declare a new probe: fetch :url with curl and grep the output,
;; case-insensitively, for the regex :pattern. The outcome is whatever
;; COMMAND-RETURN-CODE makes of the pipeline's exit status.
;;
;; Both values are single-quoted before being spliced into the shell
;; command, so URLs containing `&`, `?`, `;` or patterns containing
;; spaces / shell metacharacters reach curl and grep unchanged instead
;; of being interpreted by the shell.
;;
;; NOTE(review): assumes CREATE-PROBE evaluates its body as ordinary
;; Lisp code with PARAMS bound to the check's plist - confirm against
;; the macro definition.
(create-probe
 check-http-pattern
 (flet ((shell-quote (value)
          ;; Wrap VALUE in single quotes; an embedded ' becomes '\''
          ;; (close quote, escaped quote, reopen quote).
          (with-output-to-string (out)
            (write-char #\' out)
            (loop for ch across (princ-to-string value)
                  do (if (char= ch #\')
                         (write-string "'\\''" out)
                         (write-char ch out)))
            (write-char #\' out))))
   (command-return-code
    (format nil "curl ~a | grep -i ~a"
            (shell-quote (getf params :url))
            (shell-quote (getf params :pattern))))))

;; Check whether the web page :url contains text matching the regex
;; :pattern.
(=> empty check-http-pattern :url "http://google.fr/" :pattern "html")
(=> empty check-http-pattern :url "http://127.0.0.1/" :pattern "HTML")
(=> empty check-http-pattern :url "http://google.fr/" :pattern "hello") ; expected failure