This python code when called with a user that can query the STV_RECENTS table will check the duration on a current running query against the threshold set by the config in microseconds and send an alert to slack if it exceeds 30 minutes. I have it cronned up and running every 30 minutes.
CLI example:
1 |
[Prompt#]>python rs_long_running_query_alert.py |
You will need slackclient:
https://pypi.python.org/pypi/slackclient
You will need psycopg2:
https://pypi.python.org/pypi/psycopg2
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
#!/usr/bin/python2.6 __author__ = "Jason Ralph" from datetime import timedelta import os import psycopg2 import ConfigParser from slackclient import SlackClient slack_token = os.environ["SLACK_API_TOKEN"] sc = SlackClient(slack_token) config = ConfigParser.ConfigParser() config.read(('%s/qa.ini') % (os.environ['QA_INI_PATH'])) config.sections() # redshift rs_host = config.get('redshift', 'hostname') rs_port = config.get('redshift', 'port') rs_database = config.get('redshift', 'database') rs_sys_user = config.get('redshift', 'sys_user') rs_sys_pass = config.get('redshift', 'sys_password') rs_app_lq_thresh = config.get('redshift', 'lq_app_threshold') """ Get running queries over set threshold set in config in microseconds. """ def get_long_running_queries(): try: conn = psycopg2.connect(("dbname=%s user=%s host=%s password=%s port=%s") % (rs_database, rs_sys_user, rs_host, rs_sys_pass, rs_port)) except psycopg2.Error as e: print e cast_thresh = int(rs_app_lq_thresh) sql = ("select userid, \ status, \ starttime, \ duration, \ user_name, \ db_name, \ query, \ pid from stv_recents \ where duration >= %s and status='Running';") % cast_thresh cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) cursor.execute(sql) count = 0 while True: row = cursor.fetchone() if row is None: break userid = row['userid'] status = row['status'] starttime = row['starttime'] duration = row['duration'] user_name = row['user_name'] db_name = row['db_name'] query = row['query'] pid = row['pid'] human_date_string = timedelta(microseconds=duration) if (user_name == 'user_1' or user_name == 'user_2' or user_name == 'user_3' and duration >= rs_app_lq_thresh): count += 1 msg_items = [] msg_items.append('%s: App Limit Breached - Long Running Query Alert\n' % rs_database) msg_items.append('PID: %s\n' % pid) msg_items.append('COUNT: %s\n' % count) msg_items.append('USERID: %s\n' % userid) msg_items.append('STATUS: %s\n' % status) msg_items.append('STARTTIME: %s\n' % starttime) msg_items.append('DURATION: %s\n' % human_date_string) msg_items.append('USER_NAME: %s\n' % user_name) msg_items.append('DB_NAME: %s\n' % db_name) msg_items.append('QUERY: %s' % query) msg = ''.join(msg_items) sc.api_call('chat.postMessage', channel='#python', text="'%s' :tada:") % msg conn.close() def main(): get_long_running_queries() if __name__ == '__main__': main() |
INI file:
1 2 3 4 5 6 7 8 9 10 |
[redshift] hostname = redshift.amazonaws.com port = 5439 database = redshift user = user password = password schema = public sys_user = rs_sys sys_password = password lq_app_threshold = 3600000000 |
Slack message example:
1 2 3 4 5 6 7 8 9 10 11 |
'db-redshift' APP [10:15 AM] redshift_dev: Long Running Query Alert PID: 10723 COUNT: 1 USERID: 100 STATUS: Running STARTTIME: 2017-09-16 11:00:01.417040 DURATION: 3:15:00.735017 USER_NAME: admin DB_NAME: database QUERY: vacuum; |