I had a situation where a daily sync of a table from one database to another was failing. This table was updated daily so the query should return something like this when it was synced correctly:
1 2 3 4 5 |
jasondb=# select date, count(*) from table group by date order by date desc limit 1; date | count ------------+-------- 2017-05-15 | 486706 (1 row) |
I use Nagios very heavily and I setup a custom plugin to check the query’s date against today’s date, this should warn or critical based on user supplied arguments. Here is what a failure looks like when running from the nagios servers command line. This worked well at alerting me when the sync failed, this was integrated into the nagios subsystem and emails and slack alerts are generated as expected.
1 2 3 4 5 6 7 |
[11:54:03] [nagios@nagios ~] $ /usr/local/nagios/libexec/check_pg_table_DATE.py -u nagios -p secretpass -H hostproddb -d proddb -w 1 -c 2 DATETIME NEGATIVE DELTA: [-4 days, 0:00:00] DATETIME NOW: [2017-05-15 00:00:00] DATETIME NOW OF QUERY: [2017-05-11 00:00:00] DATETIME CRIT VALUE: [2 days, 0:00:00] DATETIME WARN VALUE: [1 day, 0:00:00] NAGIOS CRIT: [-4 days, 0:00:00] is greater or equal to [2 days, 0:00:00] |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
#!/usr/bin/python """ MODULE IMPORTS """ """ THIS SCRIPT REQUIRES PSYCOPG2 - INSTALL WITH - 'sudo yum install python-psycopg2' """ import psycopg2 import string import datetime import time import optparse import sys import os import getpass """ NAGIOS API RETURN CODES """ nag_ret_dict = {'NagOk': 0, 'NagWarn': 1, 'NagCrit': 2, 'NagUnknown': 3} """ COMMAND LINE OPTION PARSING """ usage = "USAGE: python %prog -d DATABASE -u DB_USER -H REMOTE_HOST -p DB_PASSWORD -w -2 -c -3" parser = optparse.OptionParser(usage=usage) parser.add_option('-d', action="store", default="postgres", help="Database selection", type="string") parser.add_option('-u', action="store", default="postgres", help="DB User", type="string") parser.add_option('-H', action="store", default="localhost", help="DB Host system", type="string") parser.add_option('-p', action="store", default="", help="DB Password", type="string") parser.add_option('-c', action="store", default="-3", help="Critical Date Thresh -nn", type="int") parser.add_option('-w', action="store", default="-2", help="Warning Date Thresh -nn", type="int") options, args = parser.parse_args() """ ARG COUNT CHECK """ if len(sys.argv[1:]) == 0: parser.print_help() sys.exit(nag_ret_dict['NagCrit']) """ TEST DATABASE CONNECTION """ try: conn = psycopg2.connect("dbname='%s' user='%s' host='%s' password='%s'" \ % (options.d, options.u, options.H, options.p)) except psycopg2.DatabaseError, e: print e sys.exit(nag_ret_dict['NagCrit']) """ LOCAL SCOPE FUNCTIONS """ def query_pg_for_date(): try: cur = conn.cursor() cur.execute("""select date, count(*) From table group by date order by date desc limit 1;""") row = cur.fetchone() date_s = row[0] cur.close() conn.close() return date_s.strftime("%Y%m%d") except psycopg2.DatabaseError, e: print ( 'Error %s' % e ) parser.print_help() sys.exit(nag_ret_dict['nagiCrit']) def get_date_now(): date_now = time.strftime("%Y%m%d") return date_now.split()[0] """ MAIN FUNCTION """ def main(): queryDate = datetime.datetime.strptime(str(query_pg_for_date()), "%Y%m%d") actualDate = datetime.datetime.strptime(str(get_date_now()), "%Y%m%d") deltaCrit = datetime.timedelta(days=options.c) deltaWarn = datetime.timedelta(days=options.w) diff = queryDate - actualDate print("DATETIME NEGATIVE DELTA: [%s]") % diff print("DATETIME NOW: [%s]") % actualDate print("DATETIME NOW OF QUERY: [%s]") % queryDate print("DATETIME CRIT VALUE: [%s]") % deltaCrit print("DATETIME WARN VALUE: [%s]") % deltaWarn """ DEFINE LOGIC """ if diff <= deltaCrit: print("NAGIOS CRIT: [%s] is greater or equal to [%s]") % (diff, deltaCrit) sys.exit(nag_ret_dict['NagCrit']) elif diff <= deltaWarn: print("NAGIOS WARN: [%s] is greater than or equal to [%s]") % (diff, deltaWarn) sys.exit(nag_ret_dict['NagWarn']) else: print("NAGIOS OK: [%s] is less than or equal to [%s]") % (diff, deltaCrit) sys.exit(nag_ret_dict['NagOk']) if __name__ == "__main__": main() |