June 2019 – Jason R. Ralph

I needed a utility to alert our team whenever long-running queries were executing on a production PostgreSQL cluster, so I came up with the following Python code that does just that. It alerts Slack if a query has been running for more than 45 minutes. The script also takes user parameters, and I will demonstrate how to call it. Hope it helps someone.

CRON CALL:

### postgres long running query check ###
*/15 * * * * /usr/bin/python2.7 /home/postgres/bin/pg_long_running_query.py --database proddb --dbhost proddb01 --user postgres --alert_time_mins 45 >> /home/postgres/pg_long_running_query.log 2>&1

### postgres long running query check ###
*/15 * * * * /usr/bin/python2.7 /home/postgres/bin/pg_long_running_query.py --database proddb --dbhost proddb01 --user postgres --alert_time_mins 45 >> /home/postgres/pg_long_running_query.log 2>&1

CODE:

#!/usr/bin/python2.7

__author__ = "Jason Ralph"


import psycopg2
import psycopg2.extras
import argparse
import urllib


def send_message_to_slack(text):
    """Post ``text`` to Slack through the hard-coded incoming-webhook URL.

    Args:
        text: Message body; it is rendered with ``%s`` formatting and sent
            as the ``text`` field of the JSON payload.

    Raises:
        ValueError: If Slack answers with any status code other than 200.
        requests.exceptions.RequestException: If the request cannot be
            completed, including when it exceeds the timeout.
    """
    # Imported here rather than at module level, so importing this module
    # does not require ``requests``.
    import requests
    import json

    webhook_url = 'https://hooks.slack.com/services/--redacted--'
    # Wrap in a 1-tuple so a tuple-valued ``text`` is formatted as a whole
    # instead of being unpacked as multiple format arguments.
    slack_data = {'text': "%s" % (text,)}

    # Without a timeout requests waits indefinitely, which would stall the
    # whole run if Slack stops responding.
    response = requests.post(
        webhook_url, data=json.dumps(slack_data),
        headers={'Content-Type': 'application/json'},
        timeout=10
    )
    if response.status_code != 200:
        raise ValueError(
            'Request to slack returned an error %s, the response is:\n%s'
            % (response.status_code, response.text)
        )


def get_long_running_queries():
    """Send one Slack alert per long-running, non-idle query on the host.

    Reads ``--database``, ``--dbhost``, ``--user`` and ``--alert_time_mins``
    from the command line, connects to that database on port 5432 and selects
    the rows of ``pg_stat_activity`` whose ``query_start`` is more than
    ``alert_time_mins`` minutes in the past.  Rows belonging to the
    ``postgres`` user and rows in the ``idle`` state are skipped; every other
    row is reported through ``send_message_to_slack``.

    Raises:
        SystemExit: Raised by argparse when an argument is missing or
            ``--alert_time_mins`` is not a number.
        ValueError: Propagated from ``send_message_to_slack`` when Slack
            rejects a message.

    Errors raised by psycopg2 while connecting or querying propagate
    unchanged.
    """
    parser = argparse.ArgumentParser(description='Check long Running '
                                                 'Queries On Postgres '
                                                 'Databases And Alert')
    # All four options are required: an omitted one would otherwise reach the
    # database as the literal string "None".
    parser.add_argument('--database', required=True, help='target database')
    parser.add_argument('--dbhost', required=True, help='target dbhost')
    parser.add_argument('--user', required=True, help='database user')
    parser.add_argument('--alert_time_mins', required=True, type=float,
                        help='alert time in mins: e.g 30')
    args = parser.parse_args()

    # Keyword arguments let psycopg2 do the quoting instead of a hand-built
    # DSN string that breaks on values containing quotes or spaces.
    conn = psycopg2.connect(dbname=args.database, host=args.dbhost,
                            user=args.user, port=5432)

    # The threshold is bound as a query parameter rather than interpolated
    # into the SQL text.
    sql = """SELECT pid, usename,
                    now() - pg_stat_activity.query_start AS duration,
                    query, state
             FROM pg_stat_activity
             WHERE (now() - pg_stat_activity.query_start)
                   > %s * interval '1 minute';"""

    try:
        cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cursor.execute(sql, (args.alert_time_mins,))
        count = 0
        for row in cursor:
            if row['usename'] == 'postgres' or row['state'] == 'idle':
                continue
            # COUNT numbers the alerts sent during this run.
            count += 1
            msg_items = ['LONG RUNNING QUERY ON HOST: %s\n' % args.dbhost,
                         'PID: %s\n' % row['pid'],
                         'DURATION: %s\n' % row['duration'],
                         'QUERY: %s\n' % row['query'],
                         'STATE: %s\n' % row['state'],
                         'USER: %s\n' % row['usename'],
                         'COUNT: %s\n' % count]
            send_message_to_slack(''.join(msg_items))
    finally:
        # Close the connection even when the query or a Slack post fails.
        conn.close()

def main():
    """Script entry point: run the long-running-query check once."""
    get_long_running_queries()


# Only run the check when executed as a script, not when imported.
if __name__ == "__main__":
    main()

#!/usr/bin/python2.7

__author__ = "Jason Ralph"

import psycopg2

import psycopg2.extras

import argparse

import urllib

def send_message_to_slack(text):
    """Post ``text`` to Slack through the hard-coded incoming-webhook URL.

    Args:
        text: Message body; it is rendered with ``%s`` formatting and sent
            as the ``text`` field of the JSON payload.

    Raises:
        ValueError: If Slack answers with any status code other than 200.
        requests.exceptions.RequestException: If the request cannot be
            completed, including when it exceeds the timeout.
    """
    # Imported here rather than at module level, so importing this module
    # does not require ``requests``.
    import requests
    import json

    webhook_url = 'https://hooks.slack.com/services/--redacted--'
    # Wrap in a 1-tuple so a tuple-valued ``text`` is formatted as a whole
    # instead of being unpacked as multiple format arguments.
    slack_data = {'text': "%s" % (text,)}

    # Without a timeout requests waits indefinitely, which would stall the
    # whole run if Slack stops responding.
    response = requests.post(
        webhook_url, data=json.dumps(slack_data),
        headers={'Content-Type': 'application/json'},
        timeout=10
    )
    if response.status_code != 200:
        raise ValueError(
            'Request to slack returned an error %s, the response is:\n%s'
            % (response.status_code, response.text)
        )

def get_long_running_queries():
    """Send one Slack alert per long-running, non-idle query on the host.

    Reads ``--database``, ``--dbhost``, ``--user`` and ``--alert_time_mins``
    from the command line, connects to that database on port 5432 and selects
    the rows of ``pg_stat_activity`` whose ``query_start`` is more than
    ``alert_time_mins`` minutes in the past.  Rows belonging to the
    ``postgres`` user and rows in the ``idle`` state are skipped; every other
    row is reported through ``send_message_to_slack``.

    Raises:
        SystemExit: Raised by argparse when an argument is missing or
            ``--alert_time_mins`` is not a number.
        ValueError: Propagated from ``send_message_to_slack`` when Slack
            rejects a message.

    Errors raised by psycopg2 while connecting or querying propagate
    unchanged.
    """
    parser = argparse.ArgumentParser(description='Check long Running '
                                                 'Queries On Postgres '
                                                 'Databases And Alert')
    # All four options are required: an omitted one would otherwise reach the
    # database as the literal string "None".
    parser.add_argument('--database', required=True, help='target database')
    parser.add_argument('--dbhost', required=True, help='target dbhost')
    parser.add_argument('--user', required=True, help='database user')
    parser.add_argument('--alert_time_mins', required=True, type=float,
                        help='alert time in mins: e.g 30')
    args = parser.parse_args()

    # Keyword arguments let psycopg2 do the quoting instead of a hand-built
    # DSN string that breaks on values containing quotes or spaces.
    conn = psycopg2.connect(dbname=args.database, host=args.dbhost,
                            user=args.user, port=5432)

    # The threshold is bound as a query parameter rather than interpolated
    # into the SQL text.
    sql = """SELECT pid, usename,
                    now() - pg_stat_activity.query_start AS duration,
                    query, state
             FROM pg_stat_activity
             WHERE (now() - pg_stat_activity.query_start)
                   > %s * interval '1 minute';"""

    try:
        cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cursor.execute(sql, (args.alert_time_mins,))
        count = 0
        for row in cursor:
            if row['usename'] == 'postgres' or row['state'] == 'idle':
                continue
            # COUNT numbers the alerts sent during this run.
            count += 1
            msg_items = ['LONG RUNNING QUERY ON HOST: %s\n' % args.dbhost,
                         'PID: %s\n' % row['pid'],
                         'DURATION: %s\n' % row['duration'],
                         'QUERY: %s\n' % row['query'],
                         'STATE: %s\n' % row['state'],
                         'USER: %s\n' % row['usename'],
                         'COUNT: %s\n' % count]
            send_message_to_slack(''.join(msg_items))
    finally:
        # Close the connection even when the query or a Slack post fails.
        conn.close()

def main():
    """Script entry point: run the long-running-query check once."""
    get_long_running_queries()


# Only run the check when executed as a script, not when imported.
if __name__ == '__main__':
    main()

SLACK MESSAGE:

LONG RUNNING QUERY ON HOST: proddb01
PID: 30270
DURATION: 0:55:02.748624
QUERY: SELECT --redacted--
STATE: active
USER: dbuser
COUNT: 1

LONG RUNNING QUERY ON HOST: proddb01

PID: 30270

DURATION: 0:55:02.748624

QUERY: SELECT --redacted--

STATE: active

USER: dbuser

COUNT: 1

I have a project that rsyncs data from an RPM repository to maintain a local copy of that repo. The issue I faced was that the remote mirror would sometimes stall the rsync because of an overloaded network or other unforeseen problems. I wanted to take advantage of rsync's hashing algorithm so that a restarted transfer picks up right where it left off, so I wrote a function to do this. If the 900-second timeout was hit, it usually meant there was an issue with the transfer. I also want to state here that I observed rsync stop serving on many mirrors, so it was not just an issue with the TCP network. I use this in production, and it logs each iteration or restart. The function below also kills the current rsync so that multiple copies are not running at the same time. I only wanted to attempt rsync up to 5 times on error or timeout, so I use a while loop here.

Here are the individual rsync commands in the INI configuration.

[rsync_cmds]
rsync01 = /usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/os/x86_64/ 7/x86_64/
rsync02 = /usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/
rsync03 = /usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/centosplus/x86_64/ 7/centosplus/x86_64/
rsync04 = /usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/extras/x86_64/ 7/extras/x86_64

[rsync_cmds]

rsync01 = /usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/os/x86_64/ 7/x86_64/

rsync02 = /usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/

rsync03 = /usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/centosplus/x86_64/ 7/centosplus/x86_64/

rsync04 = /usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/extras/x86_64/ 7/extras/x86_64

Here is how I call the execute_jobs_timeout() function:

# Job name -> rsync command line, read from the [rsync_cmds] config section.
rsync_commands = dict(config.items('rsync_cmds'))


def rsync_data():
    """Run every configured rsync command, ordered by job name."""
    for job_name in sorted(rsync_commands):
        execute_jobs_timeout(rsync_commands[job_name])

# Job name -> rsync command line, read from the [rsync_cmds] config section.
rsync_commands = dict(config.items('rsync_cmds'))


def rsync_data():
    """Run every configured rsync command, ordered by job name."""
    for name, cmds in sorted(rsync_commands.items()):
        execute_jobs_timeout(cmds)

The function:

def execute_jobs_timeout(cmd, max_attempts=5, timeout=900):
    """Run ``cmd``, restarting it after a failure or a timeout.

    The command is split with ``shlex.split`` and started in its own session
    so the whole process group can be killed if it exceeds ``timeout``.  It is
    retried until it exits with status 0 or ``max_attempts`` runs have been
    made.  Progress is reported through the module-level ``logger``.

    Args:
        cmd: Command line to execute, as a single string.
        max_attempts: Maximum number of times the command is started.
        timeout: Seconds to wait for one run before killing it.

    Returns:
        None.  Success and failure are reported through log messages only.

    Exceptions raised by ``subprocess.Popen`` itself (for example when the
    executable does not exist) are not caught and propagate to the caller.
    """
    safe_cmd = sanitize(cmd)
    argv = shlex.split(cmd)

    for attempt in range(1, max_attempts + 1):
        logger.info('Start Command: [%s]', safe_cmd)
        # Capture the output so the failure log below has something to show;
        # without PIPE, communicate() returns (None, None).
        proc = subprocess.Popen(argv,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                start_new_session=True)
        try:
            stdout_data, stderr_data = proc.communicate(timeout=timeout)
        except subprocess.SubprocessError as e:
            # TimeoutExpired is a SubprocessError.  communicate() does not
            # kill the child on timeout, so kill the whole process group.
            try:
                os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
            except ProcessLookupError:
                pass  # the process is already gone
            # Reap the killed child and close its pipes to avoid a zombie.
            proc.communicate()
            logger.warning('[%s]', e)
        else:
            if proc.returncode == 0:
                logger.info('Success: [%s]', safe_cmd)
                return
            logger.critical(
                "%r failed, status code %s stdout %r stderr %r",
                safe_cmd, proc.returncode, stdout_data, stderr_data)

        # Announce a restart only when another attempt will actually follow.
        if attempt < max_attempts:
            logger.info('Restarting [%s]', safe_cmd)

    logger.critical('Execute Jobs Failed After %d Iterations.', max_attempts)

def execute_jobs_timeout(cmd, max_attempts=5, timeout=900):
    """Run ``cmd``, restarting it after a failure or a timeout.

    The command is split with ``shlex.split`` and started in its own session
    so the whole process group can be killed if it exceeds ``timeout``.  It is
    retried until it exits with status 0 or ``max_attempts`` runs have been
    made.  Progress is reported through the module-level ``logger``.

    Args:
        cmd: Command line to execute, as a single string.
        max_attempts: Maximum number of times the command is started.
        timeout: Seconds to wait for one run before killing it.

    Returns:
        None.  Success and failure are reported through log messages only.

    Exceptions raised by ``subprocess.Popen`` itself (for example when the
    executable does not exist) are not caught and propagate to the caller.
    """
    safe_cmd = sanitize(cmd)
    argv = shlex.split(cmd)

    for attempt in range(1, max_attempts + 1):
        logger.info('Start Command: [%s]', safe_cmd)
        # Capture the output so the failure log below has something to show;
        # without PIPE, communicate() returns (None, None).
        proc = subprocess.Popen(argv,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                start_new_session=True)
        try:
            stdout_data, stderr_data = proc.communicate(timeout=timeout)
        except subprocess.SubprocessError as e:
            # TimeoutExpired is a SubprocessError.  communicate() does not
            # kill the child on timeout, so kill the whole process group.
            try:
                os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
            except ProcessLookupError:
                pass  # the process is already gone
            # Reap the killed child and close its pipes to avoid a zombie.
            proc.communicate()
            logger.warning('[%s]', e)
        else:
            if proc.returncode == 0:
                logger.info('Success: [%s]', safe_cmd)
                return
            logger.critical(
                "%r failed, status code %s stdout %r stderr %r",
                safe_cmd, proc.returncode, stdout_data, stderr_data)

        # Announce a restart only when another attempt will actually follow.
        if attempt < max_attempts:
            logger.info('Restarting [%s]', safe_cmd)

    logger.critical('Execute Jobs Failed After %d Iterations.', max_attempts)

Log Snippet showing each command executing:

2019-05-25 03:15:03,872 - __main__ - INFO - Restarting [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/os/x86_64/ 7/x86_64/] - devdbadmin
2019-05-25 03:15:03,875 - __main__ - INFO - Start Command: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/os/x86_64/ 7/x86_64/] - devdbadmin
2019-05-25 03:27:53,801 - __main__ - INFO - Success: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/os/x86_64/ 7/x86_64/] - devdbadmin
2019-05-25 03:27:53,821 - __main__ - INFO - Start Command: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/] - devdbadmin
2019-05-25 03:42:53,821 - __main__ - WARNING - [Command '['/usr/local/bin/rsync', '-a', 'rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/', '7/updates/x86_64/']' timed out after 899.9999316609465 seconds] - devdbadmin
2019-05-25 03:42:53,822 - __main__ - INFO - Restarting [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/] - devdbadmin
2019-05-25 03:42:53,850 - __main__ - INFO - Start Command: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/] - devdbadmin
2019-05-25 03:57:53,851 - __main__ - WARNING - [Command '['/usr/local/bin/rsync', '-a', 'rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/', '7/updates/x86_64/']' timed out after 899.9999369028956 seconds] - devdbadmin
2019-05-25 03:57:53,852 - __main__ - INFO - Restarting [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/] - devdbadmin
2019-05-25 03:57:53,854 - __main__ - INFO - Start Command: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/] - devdbadmin
2019-05-25 04:01:28,522 - __main__ - INFO - Success: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/] - devdbadmin
2019-05-25 04:01:28,524 - __main__ - INFO - Start Command: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/centosplus/x86_64/ 7/centosplus/x86_64/] - devdbadmin
2019-05-25 04:16:28,527 - __main__ - WARNING - [Command '['/usr/local/bin/rsync', '-a', 'rsync://mirror.cogentco.com/CentOS/7/centosplus/x86_64/', '7/centosplus/x86_64/']' timed out after 899.9999288369436 seconds] - devdbadmin

2019-05-25 03:15:03,872 - __main__ - INFO - Restarting [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/os/x86_64/ 7/x86_64/] - devdbadmin

2019-05-25 03:15:03,875 - __main__ - INFO - Start Command: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/os/x86_64/ 7/x86_64/] - devdbadmin

2019-05-25 03:27:53,801 - __main__ - INFO - Success: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/os/x86_64/ 7/x86_64/] - devdbadmin

2019-05-25 03:27:53,821 - __main__ - INFO - Start Command: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/] - devdbadmin

2019-05-25 03:42:53,821 - __main__ - WARNING - [Command '['/usr/local/bin/rsync', '-a', 'rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/', '7/updates/x86_64/']' timed out after 899.9999316609465 seconds] - devdbadmin

2019-05-25 03:42:53,822 - __main__ - INFO - Restarting [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/] - devdbadmin

2019-05-25 03:42:53,850 - __main__ - INFO - Start Command: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/] - devdbadmin

2019-05-25 03:57:53,851 - __main__ - WARNING - [Command '['/usr/local/bin/rsync', '-a', 'rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/', '7/updates/x86_64/']' timed out after 899.9999369028956 seconds] - devdbadmin

2019-05-25 03:57:53,852 - __main__ - INFO - Restarting [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/] - devdbadmin

2019-05-25 03:57:53,854 - __main__ - INFO - Start Command: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/] - devdbadmin

2019-05-25 04:01:28,522 - __main__ - INFO - Success: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/updates/x86_64/ 7/updates/x86_64/] - devdbadmin

2019-05-25 04:01:28,524 - __main__ - INFO - Start Command: [/usr/local/bin/rsync -a rsync://mirror.cogentco.com/CentOS/7/centosplus/x86_64/ 7/centosplus/x86_64/] - devdbadmin

2019-05-25 04:16:28,527 - __main__ - WARNING - [Command '['/usr/local/bin/rsync', '-a', 'rsync://mirror.cogentco.com/CentOS/7/centosplus/x86_64/', '7/centosplus/x86_64/']' timed out after 899.9999288369436 seconds] - devdbadmin

Jason R. Ralph

Linux All Day Everyday

Month: June 2019

Postgres Long Running Active Queries Send To Slack

Python Function Execute Subprocess With Timeout