Python Linux Find Files With Pattern Accessed Older Than N Days And Remove

July 7, 2022July 14, 2022 adminLeave a comment

This is a neat utility to keep in your sysadmin bag of tricks. It walks the directory you define recursively, collects every file that matches a list of patterns, and compares each file's last access time against a command-line parameter for days ago. If a file was last accessed more than N days ago, it is removed. What’s really nice about this utility is that it has a debug mode, so you can see what would be deleted before you turn debug off and execute it for real.

#!/usr/bin/env python3

import argparse
import fnmatch
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path

# Timestamp captured once at start-up; it prefixes every message printed below.
now = datetime.today()

# setup dir to clean
home = str(Path.home())  # not referenced by the rest of this script
target_dir = '/home/jasonr' # CHANGE TO WHERE YOU WANT TO SEARCH

# dir to clean
dirs_to_clean = target_dir

# setup cli arguments.
parser = argparse.ArgumentParser(
    description='''
[--days_ago 60] will keep 60 days worth of files.
[--debug yes] will print out statements with no actions.''',
    formatter_class=argparse.RawTextHelpFormatter)
# type=int lets argparse reject a non-numeric value with a usage error up
# front instead of a ValueError traceback when the value is converted later.
parser.add_argument('--days_ago',
                    type=int,
                    help='[--days_ago NN]')
parser.add_argument('--debug',
                    help='[--debug (yes|no)]')
args = parser.parse_args()

# allowed arguments from cli.
accepted_cli_args = ['yes', 'no']

# sanity check, assign days to keep on system (60 when --days_ago is omitted).
if args.days_ago is None:
    days = 60
else:
    days = args.days_ago

# A negative value would place the cutoff in the future, which makes every
# matching file look "old" and therefore removable. Refuse it outright.
if days < 0:
    parser.error("--days_ago must be zero or greater: [{0}]".format(days))

# define a list of patterns
patterns = ['*.csv', '*.txt'] # YOU CAN ADD ANY PATTERN TO LIST

# sanity check, assign debug true or false.
# --debug must be given explicitly as yes or no; anything else (including
# leaving it out) stops the script before any file is touched.
if args.debug not in accepted_cli_args:
    print("{0}: Wrong parameter --debug (yes or no): [{1}]"
          .format(now, args.debug))
    sys.exit(1)
debug = args.debug == 'yes'


def find_files(dir_to_clean):
    """Remove matching files under *dir_to_clean* that have not been accessed recently.

    Walks the tree with ``os.walk``, keeps every file whose name matches one
    of the module-level ``patterns``, and removes those whose access time is
    older than the module-level ``days`` value. When the module-level
    ``debug`` flag is true nothing is removed; a "DEBUG" line is printed for
    each file that would have been.

    Args:
        dir_to_clean: Root directory to search recursively.

    Raises:
        SystemExit: With status 1 when removing a file fails with ``OSError``.
    """
    cutoff = datetime.now() - timedelta(days=int(days))

    # A set keeps a file from being handled twice when its name matches more
    # than one pattern; sorting happens once, after the walk.
    matches = set()
    for root, _dirs, files in os.walk(dir_to_clean):
        for pattern in patterns:
            for filename in fnmatch.filter(files, pattern):
                matches.add(os.path.join(root, filename))

    for path in sorted(matches):
        try:
            file_atime = datetime.fromtimestamp(os.path.getatime(path))
        except (OSError, OverflowError, ValueError) as e:
            print("{0}: File Access Time Get Failed: [{1}]"
                  .format(now, e))
            # Without a usable access time there is nothing to compare, so
            # skip this entry instead of falling through to the age check.
            continue

        if file_atime >= cutoff or not os.path.isfile(path):
            continue

        if debug:
            print("{0}: DEBUG: Removing file: [{1}]"
                  .format(now, path))
            continue

        print("{0}: Removing file: [{1}]"
              .format(now, path))
        try:
            os.remove(path)
        except OSError as e:
            print("{0}: File Clean Up Failed: [{1}]"
                  .format(now, e))
            sys.exit(1)


# main function.
def main():
    """Run the clean-up against the configured ``dirs_to_clean`` directory."""
    find_files(dirs_to_clean)


if __name__ == "__main__":
    main()

#!/usr/bin/env python3

import argparse

import fnmatch

import os

import sys

from datetime import datetime, timedelta

from pathlib import Path

# Timestamp captured once at start-up; it prefixes every message printed below.
now = datetime.today()

# setup dir to clean
home = str(Path.home())  # not referenced by the rest of this script
target_dir = '/home/jasonr' # CHANGE TO WHERE YOU WANT TO SEARCH

# dir to clean
dirs_to_clean = target_dir

# setup cli arguments.
parser = argparse.ArgumentParser(
    description='''
[--days_ago 60] will keep 60 days worth of files.
[--debug yes] will print out statements with no actions.''',
    formatter_class=argparse.RawTextHelpFormatter)
# type=int lets argparse reject a non-numeric value with a usage error up
# front instead of a ValueError traceback when the value is converted later.
parser.add_argument('--days_ago',
                    type=int,
                    help='[--days_ago NN]')
parser.add_argument('--debug',
                    help='[--debug (yes|no)]')
args = parser.parse_args()

# allowed arguments from cli.
accepted_cli_args = ['yes', 'no']

# sanity check, assign days to keep on system (60 when --days_ago is omitted).
if args.days_ago is None:
    days = 60
else:
    days = args.days_ago

# A negative value would place the cutoff in the future, which makes every
# matching file look "old" and therefore removable. Refuse it outright.
if days < 0:
    parser.error("--days_ago must be zero or greater: [{0}]".format(days))

# define a list of patterns
patterns = ['*.csv', '*.txt'] # YOU CAN ADD ANY PATTERN TO LIST

# sanity check, assign debug true or false.
# --debug must be given explicitly as yes or no; anything else (including
# leaving it out) stops the script before any file is touched.
if args.debug not in accepted_cli_args:
    print("{0}: Wrong parameter --debug (yes or no): [{1}]"
          .format(now, args.debug))
    sys.exit(1)
debug = args.debug == 'yes'

def find_files(dir_to_clean):
    """Remove matching files under *dir_to_clean* that have not been accessed recently.

    Walks the tree with ``os.walk``, keeps every file whose name matches one
    of the module-level ``patterns``, and removes those whose access time is
    older than the module-level ``days`` value. When the module-level
    ``debug`` flag is true nothing is removed; a "DEBUG" line is printed for
    each file that would have been.

    Args:
        dir_to_clean: Root directory to search recursively.

    Raises:
        SystemExit: With status 1 when removing a file fails with ``OSError``.
    """
    cutoff = datetime.now() - timedelta(days=int(days))

    # A set keeps a file from being handled twice when its name matches more
    # than one pattern; sorting happens once, after the walk.
    matches = set()
    for root, _dirs, files in os.walk(dir_to_clean):
        for pattern in patterns:
            for filename in fnmatch.filter(files, pattern):
                matches.add(os.path.join(root, filename))

    for path in sorted(matches):
        try:
            file_atime = datetime.fromtimestamp(os.path.getatime(path))
        except (OSError, OverflowError, ValueError) as e:
            print("{0}: File Access Time Get Failed: [{1}]"
                  .format(now, e))
            # Without a usable access time there is nothing to compare, so
            # skip this entry instead of falling through to the age check.
            continue

        if file_atime >= cutoff or not os.path.isfile(path):
            continue

        if debug:
            print("{0}: DEBUG: Removing file: [{1}]"
                  .format(now, path))
            continue

        print("{0}: Removing file: [{1}]"
              .format(now, path))
        try:
            os.remove(path)
        except OSError as e:
            print("{0}: File Clean Up Failed: [{1}]"
                  .format(now, e))
            sys.exit(1)

# main function.

def main():
    """Run the clean-up against the configured ``dirs_to_clean`` directory."""
    find_files(dirs_to_clean)


# Only run the clean-up when executed as a script, not when imported.
if __name__ == "__main__":
    main()

[jasonr@sb-jralph-8 ~]$ python3 finder.py --days_ago 90 --debug yes
2022-07-07 11:22:57.524454: DEBUG: Removing file: [/home/jasonr/aws/dist/awscli/examples/emr/create-cluster-synopsis.txt]
2022-07-07 11:22:57.524454: DEBUG: Removing file: [/home/jasonr/aws/dist/cryptography-3.3.2-py3.8.egg-info/top_level.txt]
2022-07-07 11:22:57.524454: DEBUG: Removing file: [/home/jasonr/aws/dist/docutils/parsers/rst/include/README.txt]
2022-07-07 11:22:57.524454: DEBUG: Removing file: [/home/jasonr/aws/dist/docutils/parsers/rst/include/isoamsa.txt]
2022-07-07 11:22:57.524454: DEBUG: Removing file: [/home/jasonr/aws/dist/docutils/parsers/rst/include/isoamsb.txt]
2022-07-07 11:22:57.524454: DEBUG: Removing file: [/home/jasonr/aws/dist/docutils/parsers/rst/include/isoamsc.txt]

[jasonr@sb-jralph-8 ~]$ python3 finder.py --days_ago 90 --debug yes

2022-07-07 11:22:57.524454: DEBUG: Removing file: [/home/jasonr/aws/dist/awscli/examples/emr/create-cluster-synopsis.txt]

2022-07-07 11:22:57.524454: DEBUG: Removing file: [/home/jasonr/aws/dist/cryptography-3.3.2-py3.8.egg-info/top_level.txt]

2022-07-07 11:22:57.524454: DEBUG: Removing file: [/home/jasonr/aws/dist/docutils/parsers/rst/include/README.txt]

2022-07-07 11:22:57.524454: DEBUG: Removing file: [/home/jasonr/aws/dist/docutils/parsers/rst/include/isoamsa.txt]

2022-07-07 11:22:57.524454: DEBUG: Removing file: [/home/jasonr/aws/dist/docutils/parsers/rst/include/isoamsb.txt]

2022-07-07 11:22:57.524454: DEBUG: Removing file: [/home/jasonr/aws/dist/docutils/parsers/rst/include/isoamsc.txt]

AWS EMR ImportError: this version of pandas is incompatible with numpy < 1.17.3

May 10, 2022August 5, 2022 admin7 Comments

I found another one that I thought was worth writing a quick blog post about. We use AWS Elastic Map Reduce with transient clusters, so in order to get the python libraries installed, we need to use the bootstrap feature. We ran into many issues trying the standard bootstrap script which looked something like this:

[09:43:14] jason@jralph-mbp14:~ $ cat bootstrap.sh
aws s3 cp s3://bucket1-us-east-1/EMR/requirements.txt .
sudo python3 -m pip install -r requirements.txt

[09:43:14] jason@jralph-mbp14:~ $ cat bootstrap.sh

aws s3 cp s3://bucket1-us-east-1/EMR/requirements.txt .

sudo python3 -m pip install -r requirements.txt

The contents of requirements.txt looked like this:

[09:43:14] jason@jralph-mbp14:~ $ cat requirements.txt
boto3
botocore
awscli
requests
scikit-learn
numpy
pandas

[09:43:14] jason@jralph-mbp14:~ $ cat requirements.txt

boto3

botocore

awscli

requests

scikit-learn

numpy

pandas

We would get all the nodes in the cluster to bootstrap properly however the logs showed the following:

Traceback (most recent call last):
  File "analysis.py", line 6, in <module>
    import pandas as pd
  File "/usr/local/lib64/python3.7/site-packages/pandas/__init__.py", line 22, in <module>
    from pandas.compat import (
  File "/usr/local/lib64/python3.7/site-packages/pandas/compat/__init__.py", line 15, in <module>
    from pandas.compat.numpy import (
  File "/usr/local/lib64/python3.7/site-packages/pandas/compat/numpy/__init__.py", line 27, in <module>
    f"this version of pandas is incompatible with numpy < {_min_numpy_ver}\n"
ImportError: this version of pandas is incompatible with numpy < 1.17.3
your numpy version is 1.16.5.
Please upgrade numpy to >= 1.17.3 to use this pandas version

Traceback (most recent call last):

File "analysis.py", line 6, in <module>

import pandas as pd

File "/usr/local/lib64/python3.7/site-packages/pandas/__init__.py", line 22, in <module>

from pandas.compat import (

File "/usr/local/lib64/python3.7/site-packages/pandas/compat/__init__.py", line 15, in <module>

from pandas.compat.numpy import (

File "/usr/local/lib64/python3.7/site-packages/pandas/compat/numpy/__init__.py", line 27, in <module>

f"this version of pandas is incompatible with numpy < {_min_numpy_ver}\n"

ImportError: this version of pandas is incompatible with numpy < 1.17.3

your numpy version is 1.16.5.

Please upgrade numpy to >= 1.17.3 to use this pandas version

And when trying to import from pyspark, we saw this:

Traceback (most recent call last):
  File "analysis.py", line 6, in <module>
    import pandas as pd
ModuleNotFoundError: No module named 'pandas'

Traceback (most recent call last):

File "analysis.py", line 6, in <module>

import pandas as pd

ModuleNotFoundError: No module named 'pandas'

After speaking with AWS support, it turns out this was a known issue. When a cluster is launched, EMR first provisions the EC2 instances, after that it runs the bootstrap actions. Thus, when the bootstrap action runs, it installs the desired version. However, since the applications are installed after the bootstrap action, these applications override the custom installation for the Python packages. In order to get around the issue of the version being overridden, the workaround is to make use of a Bootstrap Action that delays the installation of the packages until the nodes are fully up and running. This will resolve the conflict that we have been seeing with pandas and numpy. Here is what our final working bootstrap.sh looks like, hope this helps, it was a tough one to solve:

#!/bin/bash
# EMR bootstrap action. It writes a helper script to /var/tmp and starts it in
# the background, so the pip installs happen only after the node reports that
# provisioning has finished.
set -x

cat > /var/tmp/fix-bootstap.sh <<'EOF'
#!/bin/bash
set -x

# Poll the instance-controller state file until the status field of
# localInstance -> nodeProvisionCheckinRecord reads SUCCESSFUL.
while true; do
    NODEPROVISIONSTATE=`sed -n '/localInstance [{]/,/[}]/{
    /nodeProvisionCheckinRecord [{]/,/[}]/ {
    /status: / { p }
    /[}]/a
    }
    /[}]/a
    }' /emr/instance-controller/lib/info/job-flow-state.txt | awk ' { print $2 }'`

    if [ "$NODEPROVISIONSTATE" == "SUCCESSFUL" ]; then
        echo "Running my post provision bootstrap"
        # Enter your code here
        sudo python3 -m pip install --upgrade pip
        sudo python3 -m pip install boto3
        sudo python3 -m pip install botocore
        # The PyPI project is named scikit-learn; "sklearn" is a deprecated
        # placeholder name and should not be installed.
        sudo python3 -m pip install scikit-learn
        sudo python3 -m pip install requests
        sudo python3 -m pip install numpy
        sudo python3 -m pip install pandas
        echo '-------BOOTSTRAP COMPLETE-------'

        exit
    else
        echo "Sleeping Till Node is Provisioned"
        sleep 10
    fi
done

EOF

chmod +x /var/tmp/fix-bootstap.sh
# Run detached so this bootstrap action returns immediately while the helper
# keeps polling.
nohup /var/tmp/fix-bootstap.sh  2>&1 &

#!/bin/bash
# EMR bootstrap action. It writes a helper script to /var/tmp and starts it in
# the background, so the pip installs happen only after the node reports that
# provisioning has finished.
set -x

cat > /var/tmp/fix-bootstap.sh <<'EOF'
#!/bin/bash
set -x

# Poll the instance-controller state file until the status field of
# localInstance -> nodeProvisionCheckinRecord reads SUCCESSFUL.
while true; do
    NODEPROVISIONSTATE=`sed -n '/localInstance [{]/,/[}]/{
    /nodeProvisionCheckinRecord [{]/,/[}]/ {
    /status: / { p }
    /[}]/a
    }
    /[}]/a
    }' /emr/instance-controller/lib/info/job-flow-state.txt | awk ' { print $2 }'`

    if [ "$NODEPROVISIONSTATE" == "SUCCESSFUL" ]; then
        echo "Running my post provision bootstrap"
        # Enter your code here
        sudo python3 -m pip install --upgrade pip
        sudo python3 -m pip install boto3
        sudo python3 -m pip install botocore
        # The PyPI project is named scikit-learn; "sklearn" is a deprecated
        # placeholder name and should not be installed.
        sudo python3 -m pip install scikit-learn
        sudo python3 -m pip install requests
        sudo python3 -m pip install numpy
        sudo python3 -m pip install pandas
        echo '-------BOOTSTRAP COMPLETE-------'

        exit
    else
        echo "Sleeping Till Node is Provisioned"
        sleep 10
    fi
done

EOF

chmod +x /var/tmp/fix-bootstap.sh
# Run detached so this bootstrap action returns immediately while the helper
# keeps polling.
nohup /var/tmp/fix-bootstap.sh 2>&1 &

10 Year Anniversary: www.jasonralph.org

April 18, 2022April 18, 2022 adminLeave a comment

I had not posted too much lately, lots of stuff going on with my work and personal life, my wife and I moved into a new house in 2022, and for work we have been grinding on a large migration. I looked at my blog this morning and noticed that I have had this spare time project running for 10 years.

So for 10 years I have had jasonralph.org up and continuously available, with analytics, it started in my apartment on an old IBM stand alone server, it now runs on a single Rocky Linux 8 VM from linode for 10 dollars a month. I hope to have some new content soon, but for now, I am happy for the 10 year anniversary.

AWS Apache Managed Airflow EMR ModuleNotFoundError: No module named ‘requests’ Bootstrap

November 2, 2021November 9, 2021 adminLeave a comment

I came across another fun one the other day, we are in the process of migrating our on premise elastic map reduce system into the cloud. We are using AWS EMR and have AWS Managed Airflow as the executor (DAG). We came across an odd situation with a pyspark application. When using Airflow with a SparkSubmitHook, the job would bootstrap looking just fine according to the run logs, however it would fail with No module named 'requests' when the application tried to import it. This was very odd since we have this application running from spark-submit just fine when calling it from the master node command line.

I decided to investigate the differences, our bootstrap script for installing python modules via pip which we call from the EMR API RunJobFlow call looks like this:

#!/bin/bash
pip_bin=pip3
${pip_bin} install --user -U pip
${pip_bin} install --user boto3
${pip_bin} install --user boto
${pip_bin} install --user requests
${pip_bin} install --user psycopg2-binary

#!/bin/bash

pip_bin=pip3

${pip_bin} install --user -U pip

${pip_bin} install --user boto3

${pip_bin} install --user boto

${pip_bin} install --user requests

${pip_bin} install --user psycopg2-binary

This is very basic, all it does is upgrade PIP and run PIP install to install each of the modules. When checking the bootstrap log I can see that PIP upgrades and goes out to the repo and installs the packages just fine. So why were we getting the No module named 'requests' error when executing through airflow? After a ton of googling and research I have found the issue and applied a solution that worked. Turns out airflow will run as the root user when bootstrapping, and if you notice, we use the --user argument in pip. This will instruct the packages to be installed in the calling user's home directory, the kicker is the code is run by the hadoop user on the EMR cluster nodes after executing from airflow. So turns out, the hadoop user is unable to access the requests module since root installed it with --user. I changed the bootstrap script to the following and it all started working: by removing --user and prefixing with sudo, the packages now get installed in a globally available area for all users. I am sure there are better ways to do this, I am still learning and researching, but if you run into this, the change below will get you out of the woods.

#!/bin/bash
sudo python3 -m pip install \
                        boto3 \
	                    boto \
		                requests \
                        psycopg2-binary

#!/bin/bash

sudo python3 -m pip install \

boto3 \

boto \

requests \

psycopg2-binary

After some further research and testing, we decided to utilize a requirements.txt file to be called by the bootstrap shell script in the RunJobFlow call. First create a requirements.txt file; I like to hardcode the versions so nothing changes unexpectedly as you bootstrap a new cluster and it reaches out to PyPI to get the packages.

https://docs.aws.amazon.com/emr/latest/APIReference/API_RunJobFlow.html

Add your desired packages and version numbers to a file called requirements.txt like below:

boto3==1.17.54
boto==2.49.0
requests==2.18.4
psycopg2-binary==2.8.6

boto3==1.17.54

boto==2.49.0

requests==2.18.4

psycopg2-binary==2.8.6

Then you will need to copy this file into a bucket you have access to:

aws s3 cp requirements.txt s3://YOUR_S3_BUCKET_NAME/requirements.txt

1	aws s3 cp requirements.txt s3://YOUR_S3_BUCKET_NAME/requirements.txt

Then create a shell script that has the following, call it bootstrap.sh:

#!/bin/bash

set -x 

echo '-----------RUNNING BOOTSTRAP------------------------'

echo '-----------COPYING REQUIREMENTS FILE LOCALLY--------'

aws s3 cp s3://YOUR_S3_BUCKET_NAME/requirements.txt .

echo '-----------INSTALLING REQUIREMENTS------------------'

sudo python3 -m pip install -r requirements.txt

echo '-----------DONE BOOTSTRAP---------------------------'

#!/bin/bash

set -x

echo '-----------RUNNING BOOTSTRAP------------------------'

echo '-----------COPYING REQUIREMENTS FILE LOCALLY--------'

aws s3 cp s3://YOUR_S3_BUCKET_NAME/requirements.txt .

echo '-----------INSTALLING REQUIREMENTS------------------'

sudo python3 -m pip install -r requirements.txt

echo '-----------DONE BOOTSTRAP---------------------------'

Copy that shell script to your bucket:

aws s3 cp bootstrap.sh s3://YOUR_S3_BUCKET_NAME/bootstrap.sh

1	aws s3 cp bootstrap.sh s3://YOUR_S3_BUCKET_NAME/bootstrap.sh

And execute it via the bootstrap actions in the RunJobFlow EMR API call:

"BootstrapActions": [
    {
      "Name": "string",
      "ScriptBootstrapAction": {
        "Path": "s3://YOUR_S3_BUCKET_NAME/bootstrap.sh"
      }
    }
  ],

"BootstrapActions": [

{

"Name": "string",

"ScriptBootstrapAction": {

"Path": "s3://YOUR_S3_BUCKET_NAME/bootstrap.sh"

}

As you can see the shell script will be executed which will copy the requirements.txt file locally and then run pip -r against it which will install all the packages. If you want to see the log on a running cluster, you can ssh to the master node and view the logs here to see the bootstrapping take place:

/emr/instance-controller/log/bootstrap-actions

1	/emr/instance-controller/log/bootstrap-actions

You should see the stdout log as so:

-----------RUNNING BOOTSTRAP------------------
-----------COPYING REQUIREMENTS FILE LOCALLY--------
Completed 67 Bytes/67 Bytes (629 Bytes/s) with 1 file(s) remaining
download: s3://YOUR_S3_BUCKET_NAME/requirements.txt to ./requirements.txt
-----------INSTALLING REQUIREMENTS------------------
Collecting boto==2.48.0
  Downloading boto-2.48.0-py2.py3-none-any.whl (1.4 MB)
Collecting boto3==1.6.15
  Downloading boto3-1.6.15-py2.py3-none-any.whl (128 kB)
Collecting requests==2.18.4
  Downloading requests-2.18.4-py2.py3-none-any.whl (88 kB)
Collecting psycopg2-binary==2.8.6
  Downloading psycopg2_binary-2.8.6-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
Collecting botocore<1.10.0,>=1.9.15
  Downloading botocore-1.9.23-py2.py3-none-any.whl (4.1 MB)
Collecting s3transfer<0.2.0,>=0.1.10
  Downloading s3transfer-0.1.13-py2.py3-none-any.whl (59 kB)
Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.7/site-packages (from boto3==1.6.15->-r jason_requirements.txt (line 2)) (0.10.0)
Collecting urllib3<1.23,>=1.21.1
  Downloading urllib3-1.22-py2.py3-none-any.whl (132 kB)
Collecting certifi>=2017.4.17
  Downloading certifi-2021.10.8-py2.py3-none-any.whl (149 kB)
Collecting idna<2.7,>=2.5
  Downloading idna-2.6-py2.py3-none-any.whl (56 kB)
Collecting chardet<3.1.0,>=3.0.2
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
Requirement already satisfied: docutils>=0.10 in /usr/lib/python3.7/site-packages (from botocore<1.10.0,>=1.9.15->boto3==1.6.15->-r jason_requirements.txt (line 2)) (0.14)
Collecting python-dateutil<2.7.0,>=2.1
  Downloading python_dateutil-2.6.1-py2.py3-none-any.whl (194 kB)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil<2.7.0,>=2.1->botocore<1.10.0,>=1.9.15->boto3==1.6.15->-r jason_requirements.txt (line 2)) (1.13.0)
Installing collected packages: boto, python-dateutil, botocore, s3transfer, boto3, urllib3, certifi, idna, chardet, requests, psycopg2-binary
  Attempting uninstall: boto
    Found existing installation: boto 2.49.0
    Uninstalling boto-2.49.0:
      Successfully uninstalled boto-2.49.0
Successfully installed boto-2.48.0 boto3-1.6.15 botocore-1.9.23 certifi-2021.10.8 chardet-3.0.4 idna-2.6 psycopg2-binary-2.8.6 python-dateutil-2.6.1 requests-2.18.4 s3transfer-0.1.13 urllib3-1.22
-----------DONE BOOTSTRAP---------------------

-----------RUNNING BOOTSTRAP------------------

-----------COPYING REQUIREMENTS FILE LOCALLY--------

Completed 67 Bytes/67 Bytes (629 Bytes/s) with 1 file(s) remaining

download: s3://YOUR_S3_BUCKET_NAME/requirements.txt to ./requirements.txt

-----------INSTALLING REQUIREMENTS------------------

Collecting boto==2.48.0

Downloading boto-2.48.0-py2.py3-none-any.whl (1.4 MB)

Collecting boto3==1.6.15

Downloading boto3-1.6.15-py2.py3-none-any.whl (128 kB)

Collecting requests==2.18.4

Downloading requests-2.18.4-py2.py3-none-any.whl (88 kB)

Collecting psycopg2-binary==2.8.6

Downloading psycopg2_binary-2.8.6-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)

Collecting botocore<1.10.0,>=1.9.15

Downloading botocore-1.9.23-py2.py3-none-any.whl (4.1 MB)

Collecting s3transfer<0.2.0,>=0.1.10

Downloading s3transfer-0.1.13-py2.py3-none-any.whl (59 kB)

Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.7/site-packages (from boto3==1.6.15->-r jason_requirements.txt (line 2)) (0.10.0)

Collecting urllib3<1.23,>=1.21.1

Downloading urllib3-1.22-py2.py3-none-any.whl (132 kB)

Collecting certifi>=2017.4.17

Downloading certifi-2021.10.8-py2.py3-none-any.whl (149 kB)

Collecting idna<2.7,>=2.5

Downloading idna-2.6-py2.py3-none-any.whl (56 kB)

Collecting chardet<3.1.0,>=3.0.2

Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)

Requirement already satisfied: docutils>=0.10 in /usr/lib/python3.7/site-packages (from botocore<1.10.0,>=1.9.15->boto3==1.6.15->-r jason_requirements.txt (line 2)) (0.14)

Collecting python-dateutil<2.7.0,>=2.1

Downloading python_dateutil-2.6.1-py2.py3-none-any.whl (194 kB)

Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil<2.7.0,>=2.1->botocore<1.10.0,>=1.9.15->boto3==1.6.15->-r jason_requirements.txt (line 2)) (1.13.0)

Installing collected packages: boto, python-dateutil, botocore, s3transfer, boto3, urllib3, certifi, idna, chardet, requests, psycopg2-binary

Attempting uninstall: boto

Found existing installation: boto 2.49.0

Uninstalling boto-2.49.0:

Successfully uninstalled boto-2.49.0

Successfully installed boto-2.48.0 boto3-1.6.15 botocore-1.9.23 certifi-2021.10.8 chardet-3.0.4 idna-2.6 psycopg2-binary-2.8.6 python-dateutil-2.6.1 requests-2.18.4 s3transfer-0.1.13 urllib3-1.22

-----------DONE BOOTSTRAP---------------------

Hope this helps.

Node Application Stopped Sending Updates To Slack – can’t identify protocol

June 24, 2021 adminLeave a comment

I wanted to share my experience with a node application that I support. This particular application is an API, and it happens to log each and every request it receives to an internal slack channel. Our team uses this channel for many things: to verify when the API is in maintenance, to check that requests are processing, to see status on the overall health of the API, etc.

Once in a while out of nowhere we would stop receiving these updates to slack. I set out to troubleshoot why this may be happening, at first we thought that we were hitting the slack rate limits, which is clearly defined here:

https://api.slack.com/docs/rate-limits

However after reading the linked doc, I was skeptical. The API does serve a lot of requests, but not enough to hit their limit. We have 2 servers that send slack messages and process the API requests and when they stopped sending it would be both servers, not just one. Also we have run into this before and restarting the service fixed the issue, so I was sure we did not hit the rate limit. Also trying to send a manual slack update using curl would not work! I knew this had to be something with the linux OS itself, and not the Slack service.

I tried to use netstat to see if we were hitting some type of OS limit, and all looked well. Next I tried one of my favorite tools, LSOF. At first I grepped for deleted to see if something was being held and not released. I did not see anything that stood out, so next I grepped for node and, lo and behold, I saw this:

[root@ip-172-x-x-x ~]# lsof | grep node
--SNIP--
node       1794 nodeuser   19u     sock                0,6       0t0     651101 can't identify protocol
node       1794 nodeuser   20w      REG              202,1 209793922     294970 /opt/afs/mc_api_logs/debug.log
node       1794 nodeuser   21w      REG              202,1   2409554     274199 /opt/afs/mc_api_logs/exceptions.log
node       1794 nodeuser   22w      REG              202,1    572278     294971 /opt/afs/mc_api_logs/error.log
node       1794 nodeuser   23w      REG              202,1   2409554     274199 /opt/afs/mc_api_logs/exceptions.log
node       1794 nodeuser   24w      REG              202,1   2258649     294980 /opt/afs/mc_api_logs/warn.log
node       1794 nodeuser   25w      REG              202,1   2409554     274199 /opt/afs/mc_api_logs/exceptions.log
node       1794 nodeuser   26w      REG              202,1         0     294989 /opt/afs/mc_api_logs/info.log
node       1794 nodeuser   27w      REG              202,1   2409554     274199 /opt/afs/mc_api_logs/exceptions.log
node       1794 nodeuser   28u     IPv4              13731       0t0        TCP *:pcsync-https (LISTEN)
node       1794 nodeuser   29u     sock                0,6       0t0     512828 can't identify protocol
node       1794 nodeuser   30u     sock                0,6       0t0      14507 can't identify protocol
node       1794 nodeuser   31u     sock                0,6       0t0      14028 can't identify protocol
node       1794 nodeuser   32u     sock                0,6       0t0      15183 can't identify protocol
node       1794 nodeuser   33u     sock                0,6       0t0      15628 can't identify protocol
node       1794 nodeuser   34u     sock                0,6       0t0      16346 can't identify protocol
node       1794 nodeuser   35u     sock                0,6       0t0      15778 can't identify protocol
node       1794 nodeuser   36u     sock                0,6       0t0      16847 can't identify protocol
node       1794 nodeuser   37u     sock                0,6       0t0      17512 can't identify protocol
node       1794 nodeuser   38u     sock                0,6       0t0      25572 can't identify protocol
node       1794 nodeuser   39u     sock                0,6       0t0      18437 can't identify protocol
--SNIP--

[root@ip-172-x-x-x ~]# lsof | grep node

--SNIP--

node 1794 nodeuser 19u sock 0,6 0t0 651101 can't identify protocol

node 1794 nodeuser 20w REG 202,1 209793922 294970 /opt/afs/mc_api_logs/debug.log

node 1794 nodeuser 21w REG 202,1 2409554 274199 /opt/afs/mc_api_logs/exceptions.log

node 1794 nodeuser 22w REG 202,1 572278 294971 /opt/afs/mc_api_logs/error.log

node 1794 nodeuser 23w REG 202,1 2409554 274199 /opt/afs/mc_api_logs/exceptions.log

node 1794 nodeuser 24w REG 202,1 2258649 294980 /opt/afs/mc_api_logs/warn.log

node 1794 nodeuser 25w REG 202,1 2409554 274199 /opt/afs/mc_api_logs/exceptions.log

node 1794 nodeuser 26w REG 202,1 0 294989 /opt/afs/mc_api_logs/info.log

node 1794 nodeuser 27w REG 202,1 2409554 274199 /opt/afs/mc_api_logs/exceptions.log

node 1794 nodeuser 28u IPv4 13731 0t0 TCP *:pcsync-https (LISTEN)

node 1794 nodeuser 29u sock 0,6 0t0 512828 can't identify protocol

node 1794 nodeuser 30u sock 0,6 0t0 14507 can't identify protocol

node 1794 nodeuser 31u sock 0,6 0t0 14028 can't identify protocol

node 1794 nodeuser 32u sock 0,6 0t0 15183 can't identify protocol

node 1794 nodeuser 33u sock 0,6 0t0 15628 can't identify protocol

node 1794 nodeuser 34u sock 0,6 0t0 16346 can't identify protocol

node 1794 nodeuser 35u sock 0,6 0t0 15778 can't identify protocol

node 1794 nodeuser 36u sock 0,6 0t0 16847 can't identify protocol

node 1794 nodeuser 37u sock 0,6 0t0 17512 can't identify protocol

node 1794 nodeuser 38u sock 0,6 0t0 25572 can't identify protocol

node 1794 nodeuser 39u sock 0,6 0t0 18437 can't identify protocol

--SNIP--

My eyes went right to the “can’t identify protocol”, I opened up a browser and started to research, first hit when searching “can’t identify protocol” was a stack overflow article with the solution.

https://stackoverflow.com/questions/7911840/seeing-too-many-lsof-cant-identify-protocol

When lsof prints “Can’t identify protocol”, this usually relates to sockets (it should also say ‘sock’ in the relevant output lines).

So, somewhere in your code you are probably connecting sockets and not closing them properly (perhaps you need a finally block).

I suggest you step through your code with a debugger (easiest to use your IDE, potentially with a remote debugger, if necessary), while running lsof side-by-side. You should eventually be able to see which thread / line of code is creating these File Descriptors.

Turns out that the node application was opening file descriptors / sockets and not closing them properly, this caused the system to hit the hard limit on open files / file descriptors. You can view the hard and soft limit like so, switch to the user that application is running as and run:

[nodeuser@ip-172-x-x-x ~]$ ulimit -Hn
4096
[nodeuser@ip-172-x-x-x ~]$ ulimit -Sn
1024

[nodeuser@ip-172-x-x-x ~]$ ulimit -Hn

4096

[nodeuser@ip-172-x-x-x ~]$ ulimit -Sn

1024

So you can see that the nodeuser has a hard limit of 4096 open files, which due to the application not properly closing them, we hit the ceiling. This explains why restarting the server or the process fixed it. It would release the open file descriptors and the system was able to open sockets again. I spoke with the developer and we researched, looks like one of the modules we were using was the cause of the issue, perhaps we were using it wrong? I found this out from this article:
https://stackoverflow.com/questions/24922745/node-js-winston-how-to-safely-drain-a-logger

Question:

I have experimented with instantiating and closing winston loggers as (half) described on https://github.com/flatiron/winston#instantiating-your-own-logger, to no avail. I run into trouble closing file transports of Winston’s – walking through it’s source code, I found that the proper way to close off a logger would seem to be the close method. I expected this to take care of closing the transport file used by the logger – however that turned out to be not so.

Varying in frequency according to node.js server load, winston would still hold on to many transport files, infinitely long after the close method had been called for them, indefinitely long after no new writes were being initiated to them. I observed that through the node.js process file descriptors table (lsof -p). Even though close has been called for a Winston logger, it would indefinitely keep the file descriptor of the log file “in use”, i.e. the log file never gets really closed. Thus leaking file descriptors and eventually making the node.js process bump into the ulimit (-n) limit after my application has been up for long.

Should there be a specific programming pattern for draining a Winston logger such that it can be eventually closed?

Answer:

Create only one logger instance and then derive children from it. In this case, winston will hold only one open file handler. Might also be better for performance.

So that was it, the developers agreed and set out to create a patch, problem solved.

Capture AWS CLI Output With Timestamps On Each Line Of Output

December 31, 2020 adminLeave a comment

I needed a way to capture aws cli output in a log file with timestamps; out of the box, the aws cli output has no timestamps. If you execute an aws s3 cp command, something like this:

aws s3 cp s3://jason-test-bucket-1/test_part_00 s3://jason-test-bucket-2/jason_test/

1	aws s3 cp s3://jason-test-bucket-1/test_part_00 s3://jason-test-bucket-2/jason_test/

You will see output like so:

copy: s3://jason-test-bucket-1/test_part_00 to s3://jason-test-bucket-2/jason_test/test_part_00

1	copy: s3://jason-test-bucket-1/test_part_00 to s3://jason-test-bucket-2/jason_test/test_part_00

As you can see, the aws cli does not print a timestamp on each line of output. So I scoured the internet and found out some interesting things. It turns out that the aws cli separates its progress output with carriage returns instead of newlines, so the standard awk piping methods were not working. The aws cli can also change its output format, so I needed to add a cli parameter to set the output to text. Next I needed to use tr to substitute the carriage returns with newlines; finally we can pipe to awk and print a timestamp on each line of output from the aws cli. The final command and output look like this:

#!/bin/bash
log='test.log'
aws s3 --output text cp s3://jason-test-bucket-1/test_part_00 s3://jason-test-bucket-2/jason_test/ | tr "\r" "\n" > >(awk '{print strftime("%Y-%m-%d:%H:%M:%S ") $0}') | tee >> $log 2>&1

#!/bin/bash

log='test.log'

aws s3 --output text cp s3://jason-test-bucket-1/test_part_00 s3://jason-test-bucket-2/jason_test/ | tr "\r" "\n" > >(awk '{print strftime("%Y-%m-%d:%H:%M:%S ") $0}') | tee >> $log 2>&1

Produces the following in the log file which is my desired result:

2020-12-31:13:32:13 Completed 726.3 KiB/726.3 KiB (3.8 MiB/s) with 1 file(s) remaining
2020-12-31:13:32:13 copy: s3://jason-test-bucket-1/test_part_00 to s3://jason-test-bucket-2/jason_test/test_part_00

1 2	2020-12-31:13:32:13 Completed 726.3 KiB/726.3 KiB (3.8 MiB/s) with 1 file(s) remaining 2020-12-31:13:32:13 copy: s3://jason-test-bucket-1/test_part_00 to s3://jason-test-bucket-2/jason_test/test_part_00

I hope this helps someone else as it was a bear to solve for me.

centos8 postgresql-11-check-db-dir[]: is missing or empty

October 30, 2020November 18, 2020 adminLeave a comment

We have been rolling out CENTOS8 in our lower environments for testing, we use a dedicated vmware virtual server with centos8 minimal install, we only apply hardening techniques to these systems other than the main application, which is pg11 here. These systems use a LVM mounted ext4 filesystem for the data directory.

/dev/mapper/vg01-data1				/u02/data1		ext4    defaults, nofail		0 2

1	/dev/mapper/vg01-data1 /u02/data1 ext4 defaults, nofail 0 2

Recently on 3 of the new PG VMS after reboot we noticed that PG did not start, this also seemed intermittent, even though we have enabled the systemd service to start on reboots. So I checked the pg startup log and did not find too much about the issue. So I checked /var/log/messages and found the issue.

postgresql-11-check-db-dir[1038]: "/u02/data1/pg/data11" is missing or empty.

1	postgresql-11-check-db-dir[1038]: "/u02/data1/pg/data11" is missing or empty.

I checked the systemd service file and saw that out of the box postgres had the following:

[Unit]
Description=PostgreSQL 11 database server
Documentation=https://www.postgresql.org/docs/11/static/
After=syslog.target
After=network.target

[Install]
WantedBy=multi-user.target

[Unit]

Description=PostgreSQL 11 database server

Documentation=https://www.postgresql.org/docs/11/static/

After=syslog.target

After=network.target

[Install]

WantedBy=multi-user.target

After=Syslog.target This is a special target unit in systemd and is the standardized name to pull in a syslog implementation.

After=network.target has very little meaning during start-up. It only indicates that the network management stack is up after it has been reached. Whether any network interfaces are already configured when it is reached is undefined.

WantedBy=multi-user.target normally defines a system state where all network services are started up and the system will accept logins, but a local GUI is not started. This is the typical default system state for server systems, which might be rack-mounted headless systems in a remote server room.

Those options above will not ensure that all filesystems in fstab are mounted before postgres starts. So what we were seeing was a classic race condition where postgres started before the data directory was mounted. As I previously mentioned we use a custom PGDATA location. So after some research I found my option that fixed this. You will need to edit the pg11 service and add the following, then reload systemd and reboot and all should work. You can find your LVM mount by running the following:

[root@server ~]# systemctl list-units --type=mount
UNIT                          LOAD   ACTIVE SUB     DESCRIPTION                     
-.mount                       loaded active mounted Root Mount                      
boot-efi.mount                loaded active mounted /boot/efi                       
boot.mount                    loaded active mounted /boot                           
dev-hugepages.mount           loaded active mounted Huge Pages File System          
dev-mqueue.mount              loaded active mounted POSIX Message Queue File System 
run-user-1328029883.mount     loaded active mounted /run/user/1328029883            
sys-fs-fuse-connections.mount loaded active mounted FUSE Control File System        
sys-kernel-config.mount       loaded active mounted Kernel Configuration File System
sys-kernel-debug.mount        loaded active mounted Kernel Debug File System        
u02-data1.mount               loaded active mounted /u02/data1                      

LOAD   = Reflects whether the unit definition was properly loaded.
ACTIVE = The high-level unit activation state, i.e. generalization of SUB.
SUB    = The low-level unit activation state, values depend on unit type.

10 loaded units listed. Pass --all to see loaded but inactive units, too.
To show all installed unit files use 'systemctl list-unit-files'.

[root@server ~]# systemctl list-units --type=mount

UNIT LOAD ACTIVE SUB DESCRIPTION

-.mount loaded active mounted Root Mount

boot-efi.mount loaded active mounted /boot/efi

boot.mount loaded active mounted /boot

dev-hugepages.mount loaded active mounted Huge Pages File System

dev-mqueue.mount loaded active mounted POSIX Message Queue File System

run-user-1328029883.mount loaded active mounted /run/user/1328029883

sys-fs-fuse-connections.mount loaded active mounted FUSE Control File System

sys-kernel-config.mount loaded active mounted Kernel Configuration File System

sys-kernel-debug.mount loaded active mounted Kernel Debug File System

u02-data1.mount loaded active mounted /u02/data1

LOAD = Reflects whether the unit definition was properly loaded.

ACTIVE = The high-level unit activation state, i.e. generalization of SUB.

SUB = The low-level unit activation state, values depend on unit type.

10 loaded units listed. Pass --all to see loaded but inactive units, too.

To show all installed unit files use 'systemctl list-unit-files'.

You can see my u02-data1.mount in the output, so edit and add the override file with the following, if you have multiple mounts, you can add them as well.
Edit with: systemctl edit postgresql-11

[Unit]
After=local-fs.target u02-data1.mount

[Service]
Environment=PGDATA=/u02/data1/pg/data11

[Unit]

After=local-fs.target u02-data1.mount

[Service]

Environment=PGDATA=/u02/data1/pg/data11

Reload the daemon with: systemctl daemon-reload

After=local-fs.target systemd-fstab-generator(3) automatically adds dependencies of type Before= to all mount units that refer to local mount points for this target unit. In addition, it adds dependencies of type Wants= to this target unit for those mounts listed in /etc/fstab that have the auto mount option set.

Automate pg_dump pg_restore Of Tables From Config File Send Slack Update

July 6, 2020July 15, 2020 adminLeave a comment

You can use this python code to set up a cron job that syncs postgres tables from one database to another. It reads from a config file and can handle multiple tables in the same run. This can be useful to sync a daily table from a source to a destination. It also sends an alert to Slack for each command, marked either ok or critical.

[logging]
log_file = pg_table_sync_dev_to_prod.log
log_path = /home/postgres

[pg_table_source_dest]
public.jason_test_table1 = public.jason_test_table1
public.jason_test_table2 = public.jason_test_table2
public.jason_test_table3 = public.jason_test_table3

[hosts]
source_db = db-sbx01
dest_db = db10

[database]
dev_db = devdb
prod_db = proddb

[dump_location]
local_location = /u04/pg_data_dumps/transfer_tables/

[slack]
webhook = https://hooks.slack.com/services/<yourwebhookhere>

[logging]

log_file = pg_table_sync_dev_to_prod.log

log_path = /home/postgres

[pg_table_source_dest]

public.jason_test_table1 = public.jason_test_table1

public.jason_test_table2 = public.jason_test_table2

public.jason_test_table3 = public.jason_test_table3

[hosts]

source_db = db-sbx01

dest_db = db10

[database]

dev_db = devdb

prod_db = proddb

[dump_location]

local_location = /u04/pg_data_dumps/transfer_tables/

[slack]

webhook = https://hooks.slack.com/services/<yourwebhookhere>

__author__ = 'jralph'
__version__ = '1.0.0'

import configparser
import os
import sys
import logging
import subprocess
import shlex
import socket
import datetime
import requests

# set hostname.
hostname = socket.gethostname()

# set date now.
now = datetime.datetime.now()

# obtain script name and assign to variable.
# Use basename/splitext rather than splitting on the first '.': that split
# returns '' for './script.py' and truncates any path that contains a dot.
script_name = os.path.splitext(os.path.basename(sys.argv[0]))[0]

# sanity check for configuration environment variable.
if "INI_PATH" not in os.environ:
    print('INI_PATH is not set, check the .bashrc')
    sys.exit(1)

# parse the configuration sections of the ini file.
# config.read() silently skips a missing file, so a bad INI_PATH surfaces here
# as a missing section.  configparser.Error covers both a missing section and
# a missing option; either one is fatal because nothing below can run without
# these values, so exit non-zero instead of continuing with undefined names.
config = configparser.ConfigParser()
try:
    config.read(os.path.join(os.environ['INI_PATH'],
                             'pg_table_sync_dev_prod.ini'))
    log_file = config.get('logging', 'log_file')
    log_path = config.get('logging', 'log_path')
    slack_hook = config.get('slack', 'webhook')
except configparser.Error as e:
    print('FATAL: Command failed with error [{0}]'.format(e))
    sys.exit(1)

# setup logging.
# basicConfig opens the log file immediately, so an unwritable or missing
# log_path raises OSError here; report it and stop.
try:
    logging.basicConfig(filename='%s/%s' % (log_path, log_file),
                        format='%(asctime)s %(message)s',
                        datefmt='%m-%d-%Y %I:%M:%S %p -',
                        level=logging.DEBUG)
except OSError as e:
    print('FATAL: Command failed with error [{0}]'.format(e))
    sys.exit(1)

# get hosts and tablenames.
# pg_tables_to_sync maps each option name in [pg_table_source_dest] to its
# value; the option name is what the dump/restore commands are built from.
try:
    pg_tables_to_sync = dict(config.items('pg_table_source_dest'))
    source_db = config.get('hosts', 'source_db')
    dest_db = config.get('hosts', 'dest_db')
    dump_location = config.get('dump_location', 'local_location')
    dev_db = config.get('database', 'dev_db')
    prod_db = config.get('database', 'prod_db')
except configparser.Error as e:
    logging.critical('FATAL: Command failed with error [{0}]'.format(e))
    sys.exit(1)


# pg_dump function.
def pg_dump():
    """Build one ``pg_dump`` command line per configured table.

    Reads the module-level settings ``pg_tables_to_sync``, ``source_db``,
    ``dev_db``, ``dump_location`` and ``now``.  Each command dumps the table
    named by a key of ``pg_tables_to_sync`` in custom format (``-Fc``) to
    ``<dump_location><table>.<YYYYMMDD>.pgdump``.

    Every interpolated value is passed through ``shlex.quote`` because the
    command string is later split with ``shlex.split``; without quoting, a
    table name or path containing whitespace would be split into several
    arguments.  Values made only of shell-safe characters are unchanged.

    Returns:
        list[str]: the commands, ordered by table name.  Empty when
        ``pg_tables_to_sync`` was never defined (the error is logged).
    """
    try:
        tables = sorted(pg_tables_to_sync)
    except NameError as e:
        logging.critical('FATAL: Command failed with error [{0}]'.format(e))
        return []

    stamp = now.strftime("%Y%m%d")
    return [
        'pg_dump -Fc -h {0} -d {1} -t {2} -f {3}'.format(
            shlex.quote(source_db),
            shlex.quote(dev_db),
            shlex.quote(table),
            shlex.quote('{0}{1}.{2}.pgdump'.format(
                dump_location, table, stamp)),
        )
        for table in tables
    ]

# pg_restore function.
def pg_restore():
    """Build one ``pg_restore`` command line per configured table.

    Reads the module-level settings ``pg_tables_to_sync``, ``dest_db``,
    ``prod_db``, ``dump_location`` and ``now``.  Each command restores, with
    ``-c`` (clean), the archive ``<dump_location><table>.<YYYYMMDD>.pgdump``
    where ``<table>`` is a key of ``pg_tables_to_sync``.

    Every interpolated value is passed through ``shlex.quote`` because the
    command string is later split with ``shlex.split``; without quoting, a
    table name or path containing whitespace would be split into several
    arguments.  Values made only of shell-safe characters are unchanged.

    Returns:
        list[str]: the commands, ordered by table name.  Empty when
        ``pg_tables_to_sync`` was never defined (the error is logged).
    """
    try:
        tables = sorted(pg_tables_to_sync)
    except NameError as e:
        logging.critical('FATAL: Command failed with error [{0}]'.format(e))
        return []

    stamp = now.strftime("%Y%m%d")
    return [
        'pg_restore -c -h {0} -d {1} {2}'.format(
            shlex.quote(dest_db),
            shlex.quote(prod_db),
            shlex.quote('{0}{1}.{2}.pgdump'.format(
                dump_location, table, stamp)),
        )
        for table in tables
    ]

# send to slack function.
def send_to_slack(slack_url, state, command, date_format, priority, target_os):
    """Post a single-attachment status message to a Slack webhook URL.

    Args:
        slack_url: webhook URL the JSON payload is POSTed to.
        state: text shown as the value of the attachment's only field.
        command: text shown as the attachment's ``author_name``.
        date_format: value rendered with ``%s`` as the attachment text.
        priority: value used for the attachment ``color``.
        target_os: value rendered with ``%s`` as the field title.

    Raises:
        ValueError: if the request cannot be completed (connection error,
            timeout, ...) or the response status is not 200.  Transport
            errors are converted to ValueError because that is the only
            exception the callers in this script handle.
    """
    attachment = {
        "fallback": "Required plain-text summary of the attachment.",
        "color": priority,
        "pretext": "PG Table Sync",
        "author_name": command,
        "text": "%s" % date_format,
        "fields": [
            {
                "title": "%s" % target_os,
                "value": state,
                "short": "false"
            }
        ],
        "footer": "AFS Slack",
        "footer_icon": "https://platform.slack-edge.com"
                       "/img/default_application_icon.png"
    }
    slack_data = {'attachments': [attachment]}

    try:
        # requests never times out by default; bound the wait so an
        # unreachable Slack endpoint cannot hang the whole run.
        response = requests.post(slack_url, json=slack_data, timeout=10)
    except requests.exceptions.RequestException as exc:
        raise ValueError(
            'Request to slack failed with error %s' % exc) from exc

    if response.status_code != 200:
        raise ValueError(
            'Request to slack returned an error %s, the response is:\n%s'
            % (response.status_code, response.text))


# execute with logging.
def execute_jobs(cmd):
    """Run one command line, log the outcome and report it to Slack.

    Args:
        cmd: command line as a single string; it is split with
            ``shlex.split`` and executed without a shell.

    Returns:
        bool: True when the command exited with status 0, False when it
        exited non-zero or could not be started (e.g. binary not found).
    """
    logging.info('Start Command: [%s]', cmd)
    try:
        subprocess.run(shlex.split(cmd), check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable binary, which would
        # otherwise abort the whole run with a traceback.
        logging.critical('[%s] FATAL: Command failed with error [%s]', cmd, e)
        _notify_slack('Critical', cmd, 'danger')
        return False

    logging.info('Command Success: [%s]', cmd)
    _notify_slack('Ok', cmd, 'good')
    return True


def _notify_slack(state, cmd, priority):
    """Send a status message to Slack; a failed post is logged, not raised."""
    try:
        send_to_slack(slack_hook, state, cmd,
                      datetime.datetime.today(), priority, hostname)
    except ValueError as e:
        logging.critical('FATAL: Slack post failed with error [%s]', e)


# main
def main():
    """Dump every configured table, then restore those that dumped cleanly.

    ``pg_dump()`` and ``pg_restore()`` both return their commands ordered by
    table name, so the two lists line up index by index.  A restore is
    skipped when its dump failed: ``pg_restore -c`` drops the destination
    objects first, so it must not run against a missing or incomplete
    archive.
    """
    dump_results = [execute_jobs(command) for command in pg_dump()]

    for dumped, command in zip(dump_results, pg_restore()):
        if dumped:
            execute_jobs(command)
        else:
            logging.critical('Skipping restore, dump failed: [%s]', command)

    logging.info('finished ' + script_name)


if __name__ == "__main__":
    main()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

__author__ = 'jralph'

__version__ = '1.0.0'

import configparser

import os

import sys

import logging

import subprocess

import shlex

import socket

import datetime

import requests

# set hostname.

hostname = socket.gethostname()

# set date now.

now = datetime.datetime.now()

# obtain script name and assign to variable.

script_name = sys.argv[0].split('.')[0]

# sanity check for configuration environment variable.

if "INI_PATH" not in os.environ.keys():

print('INI_PATH is not set, check the .bashrc')

sys.exit(1)

# parse the configuration sections of the ini file.

config = configparser.ConfigParser()

try:

config.read(os.environ['INI_PATH'] + '/pg_table_sync_dev_prod.ini')

config.sections()

log_file = config.get('logging', 'log_file')

log_path = config.get('logging', 'log_path')

slack_hook = config.get('slack', 'webhook')

except configparser.NoSectionError as e:

print('FATAL: Command failed with error [{0}]'.format(e))

# setup logging.

try:

logging.basicConfig(filename='%s/%s' % (log_path, log_file),

format='%(asctime)s %(message)s',

datefmt='%m-%d-%Y %I:%M:%S %p -',

level=logging.DEBUG)

except NameError as e:

print('FATAL: Command failed with error [{0}]'.format(e))

# get hosts and tablenames.

try:

pg_tables_to_sync = dict(config.items('pg_table_source_dest'))

source_db = config.get('hosts', 'source_db')

dest_db = config.get('hosts', 'dest_db')

dump_location = config.get('dump_location', 'local_location')

dev_db = config.get('database', 'dev_db')

prod_db = config.get('database', 'prod_db')

except (configparser.NoSectionError, NameError) as e:

logging.critical('FATAL: Command failed with error [{0}]'.format(e))

# pg_dump function.

def pg_dump():

cmd_list = []

tables = {}

try:

tables = sorted(pg_tables_to_sync.items())

except NameError as e:

logging.critical('FATAL: Command failed with error [{0}]'.format(e))

for key, value in tables:

dump_cmd = 'pg_dump -Fc -h {0} -d {1} -t {2} -f {4}{2}.{3}.pgdump'.format(

source_db, dev_db, key, now.strftime("%Y%m%d"), dump_location)

cmd_list.append(dump_cmd)

return cmd_list

# pg_restore function.

def pg_restore():

cmd_list = []

tables = {}

try:

tables = sorted(pg_tables_to_sync.items())

except NameError as e:

logging.critical('FATAL: Command failed with error [{0}]'.format(e))

for key, value in tables:

dump_cmd = 'pg_restore -c -h {0} -d {1} {4}{2}.{3}.pgdump'.format(

dest_db, prod_db, key, now.strftime("%Y%m%d"), dump_location)

cmd_list.append(dump_cmd)

return cmd_list

# send to slack function.

def send_to_slack(slack_url, state, command, date_format, priority, target_os):

slack_data = {'attachments': [

{

"fallback": "Required plain-text summary of the attachment.",

"color": priority,

"pretext": "PG Table Sync",

"author_name": command,

"text": "%s" % date_format,

"fields": [

{

"title": "%s" % target_os,

"value": state,

"short": "false"

}

"footer": "AFS Slack",

"footer_icon": "https://platform.slack-edge.com"

"/img/default_application_icon.png"

}

]}

response = requests.post(

slack_url, json=slack_data)

if response.status_code != 200:

raise ValueError(

'Request to slack returned an error %s, the response is:\n%s'

% (response.status_code, response.text))

# execute with logging.

def execute_jobs(cmd):

try:

logging.info('Start Command: [%s]' % cmd)

subprocess.run(shlex.split(cmd), check=True)

logging.info('Command Success: [%s]' % cmd)

try:

send_to_slack(slack_hook, 'Ok', cmd,

datetime.datetime.today(), 'good', hostname)

except ValueError as e:

logging.critical('FATAL: Slack post failed with error [%s]'

% e)

except subprocess.CalledProcessError as e:

logging.critical('[%s] FATAL: Command failed with error [%s]'

% (cmd, e))

try:

send_to_slack(slack_hook, 'Critical', cmd,

datetime.datetime.today(), 'danger', hostname)

except ValueError as e:

logging.critical('FATAL: Slack post failed with error [%s]'

% e)

# main

def main():

for command in pg_dump():

execute_jobs(command)

for command in pg_restore():

execute_jobs(command)

logging.info('finished ' + script_name)

if __name__ == "__main__":

main()

LOGGING EXAMPLE:

07-06-2020 11:44:28 AM - Start Command: [pg_dump -Fc -h db-sbx01 -d db1 -t public.jason_test_table -f /u04/pg_data_dumps/transfer_tables/public.jason_test_table.20200706.pdump]
07-06-2020 11:44:30 AM - Command Success: [pg_dump -Fc -h db-sbx01 -d db1 -t public.jason_test_table -f /u04/pg_data_dumps/transfer_tables/public.jason_test_table.20200706.pgdump]
07-06-2020 11:44:30 AM - Starting new HTTPS connection (1): hooks.slack.com
07-06-2020 11:44:30 AM - https://hooks.slack.com:443 "POST /services/T04MEPB2K/B72JPEUUB/nfEqv7bsKafUUjLoKgo0oT5S HTTP/1.1" 200 22
07-06-2020 11:44:30 AM - Start Command: [pg_restore -c -h db10 -d db1 /u04/pg_data_dumps/transfer_tables/public.jason_test_table.20200706]
07-06-2020 11:44:31 AM - Command Success: [pg_restore -c -h db10 -d db1 /u04/pg_data_dumps/transfer_tables/public.jason_test_table.20200706]
07-06-2020 11:44:31 AM - Starting new HTTPS connection (1): hooks.slack.com
07-06-2020 11:44:31 AM - https://hooks.slack.com:443 "POST /services/T04MEPB2K/B72JPEUUB/nfEqv7bsKafUUjLoKgo0oT5S HTTP/1.1" 200 22
07-06-2020 11:44:31 AM - finished PgTableSyncDevProd

07-06-2020 11:44:28 AM - Start Command: [pg_dump -Fc -h db-sbx01 -d db1 -t public.jason_test_table -f /u04/pg_data_dumps/transfer_tables/public.jason_test_table.20200706.pdump]

07-06-2020 11:44:30 AM - Command Success: [pg_dump -Fc -h db-sbx01 -d db1 -t public.jason_test_table -f /u04/pg_data_dumps/transfer_tables/public.jason_test_table.20200706.pgdump]

07-06-2020 11:44:30 AM - Starting new HTTPS connection (1): hooks.slack.com

07-06-2020 11:44:30 AM - https://hooks.slack.com:443 "POST /services/T04MEPB2K/B72JPEUUB/nfEqv7bsKafUUjLoKgo0oT5S HTTP/1.1" 200 22

07-06-2020 11:44:30 AM - Start Command: [pg_restore -c -h db10 -d db1 /u04/pg_data_dumps/transfer_tables/public.jason_test_table.20200706]

07-06-2020 11:44:31 AM - Command Success: [pg_restore -c -h db10 -d db1 /u04/pg_data_dumps/transfer_tables/public.jason_test_table.20200706]

07-06-2020 11:44:31 AM - Starting new HTTPS connection (1): hooks.slack.com

07-06-2020 11:44:31 AM - https://hooks.slack.com:443 "POST /services/T04MEPB2K/B72JPEUUB/nfEqv7bsKafUUjLoKgo0oT5S HTTP/1.1" 200 22

07-06-2020 11:44:31 AM - finished PgTableSyncDevProd

Python Remove Files That Match Pattern Older Than N Days

January 16, 2020January 22, 2020 adminLeave a comment

Neat little script that implements find in pure python; it can be given different patterns and directories. The script walks the directories and matches the patterns, then generates a list of files and gets the ctime of each (note that on Linux, ctime is the time of the last inode change, not the creation time). Each ctime is compared against a cutoff (60 days ago in the code below) and any older file is removed. This is great for cleaning up application logs that clog up the filesystem.

#!/usr/bin/python3.5

import fnmatch
import os
from datetime import datetime, timedelta
from pathlib import Path

# set variables for dirs to clean.
# LOG_PATH must be set in the environment: the lookup below raises KeyError
# as soon as the script starts if it is missing.
log_path = os.environ["LOG_PATH"]
user_prod_home = str(Path.home())

# set lists of dirs and patterns to clean
# Both directories are walked recursively by find_files(), so every file under
# the home directory whose name matches a pattern is a removal candidate.
dirs_to_clean = [log_path, user_prod_home]
# fnmatch-style patterns, matched against the bare file name (not the path).
patterns = ['*.log', 'app_*']


# function to loop and search patterns and rm files.
def find_files(dir_to_clean, days=60, file_patterns=None):
    """Remove files under ``dir_to_clean`` whose ctime is older than ``days``.

    The directory is walked recursively and file names are matched against
    fnmatch-style patterns.

    Args:
        dir_to_clean: root directory to walk.
        days: age threshold in days; files whose ctime is earlier than
            ``now - days`` are removed.  Defaults to 60.
        file_patterns: iterable of fnmatch patterns; defaults to the
            module-level ``patterns`` list.

    Returns:
        list[str]: paths that were removed, in sorted order.
    """
    if file_patterns is None:
        file_patterns = patterns
    cutoff = datetime.now() - timedelta(days=days)

    # Collect into a set: a name such as 'app_1.log' matches both '*.log' and
    # 'app_*', and listing it twice makes the second pass stat a file that
    # was already removed.
    candidates = set()
    for root, _dirs, files in os.walk(dir_to_clean):
        for pattern in file_patterns:
            candidates.update(
                os.path.join(root, name)
                for name in fnmatch.filter(files, pattern))

    removed = []
    for path in sorted(candidates):
        try:
            # getctime sits inside the try so a file that vanished since the
            # walk (or a dangling symlink) is reported instead of crashing.
            changed = datetime.fromtimestamp(os.path.getctime(path))
            if changed >= cutoff or not os.path.isfile(path):
                continue
            print("Removing file :[{0}]".format(path))
            os.remove(path)
            removed.append(path)
        except OSError as e:
            print('File Clean Up Failed: [{0}]'.format(e))
    return removed


# main function
def main():
    """Run the clean-up once for every directory in ``dirs_to_clean``."""
    for target_dir in dirs_to_clean:
        find_files(target_dir)


if __name__ == "__main__":
    main()

#!/usr/bin/python3.5

import fnmatch

import os

from datetime import datetime, timedelta

from pathlib import Path

# set variables for dirs to clean.

log_path = os.environ["LOG_PATH"]

user_prod_home = str(Path.home())

# set lists of dirs and patterns to clean

dirs_to_clean = [log_path, user_prod_home]

patterns = ['*.log', 'app_*']

# function to loop and search patterns and rm files.

def find_files(dir_to_clean):

file_list = []

days_ago = datetime.now() - timedelta(days=60)

for root, dirs, files in os.walk(dir_to_clean):

for pattern in patterns:

for filename in fnmatch.filter(files, pattern):

file_list.append(os.path.join(root, filename))

file_list.sort()

for file in file_list:

file_ctime = datetime.fromtimestamp(os.path.getctime(file))

if file_ctime < days_ago:

if os.path.isfile(file):

try:

print("Removing file :[{0}]".format(file))

os.remove(file)

except OSError as e:

print('File Clean Up Failed: [{0}]'.format(e))

# main function

def main():

for dirs in dirs_to_clean:

find_files(dirs)

if __name__ == "__main__":

main()

AWS CLI Max Concurrent Requests Tuning

January 3, 2020November 11, 2021 admin4 Comments

In this post I would like to go over how I tuned a test server for copying / syncing files from the local filesystem to S3 over the internet. If you ever had the task of doing this, you will notice that as the file count grows, so does the time it takes to upload the files to S3. After some web searching I found out that AWS allows you to tune the config to allow more concurrency than default.
AWS CLI S3 Config

The parameter that we will be playing with is max_concurrent_requests
This has a default value of 10, which allows at most 10 concurrent requests to the AWS API for S3. Let's see if we can make some changes to that value and get some performance gains. My test setup is as follows:

2 x Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
8GB RAM
CentOS release 6.10 (Final)

2 x Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

8GB RAM

CentOS release 6.10 (Final)

I have 56 102MB files in the test directory:

-rw-r--r-- 1 jasonr domain^users 101M Sep 24 11:44 sample__0_0_7.csv.gz
-rw-r--r-- 1 jasonr domain^users 102M Sep 24 11:44 sample__0_0_53.csv.gz
-rw-r--r-- 1 jasonr domain^users 101M Sep 24 11:44 sample__0_0_6.csv.gz
-rw-r--r-- 1 jasonr domain^users 101M Sep 24 11:44 sample__0_0_8.csv.gz
-rw-r--r-- 1 jasonr domain^users 101M Sep 24 11:44 sample__0_0_55.csv.gz
--snip--
[jasonr@jr-sandbox jason_test]$ ls| wc -l
56

-rw-r--r-- 1 jasonr domain^users 101M Sep 24 11:44 sample__0_0_7.csv.gz

-rw-r--r-- 1 jasonr domain^users 102M Sep 24 11:44 sample__0_0_53.csv.gz

-rw-r--r-- 1 jasonr domain^users 101M Sep 24 11:44 sample__0_0_6.csv.gz

-rw-r--r-- 1 jasonr domain^users 101M Sep 24 11:44 sample__0_0_8.csv.gz

-rw-r--r-- 1 jasonr domain^users 101M Sep 24 11:44 sample__0_0_55.csv.gz

--snip--

[jasonr@jr-sandbox jason_test]$ ls| wc -l

For the first test I am going to run aws s3 sync with no changes, so out of the box it should have 10 max_concurrent_requests. Lets use the Linux time command to gather the time result to copy all 56 files to S3. I will delete the folder on S3 with each iteration to keep the test the same. You can also view the 443 requests via netstat and count them as well to show whats going on. In all the tests my best result was 250. So as you can see you will need to play with the settings to get the best result, these settings will change along with the server specs.

1. 1m25.919s with the default configuration:

[jasonr@jr-sandbox jason_test]$ time aws s3 sync . s3://dev-redshift/jason_sync_test/
upload: ./sample__0_0_0.csv.gz to s3://dev-redshift/jason_sync_test/sample__0_0_0.csv.gz
upload: ./sample__0_0_10.csv.gz to s3://dev-redshift/jason_sync_test/sample__0_0_10.csv.gz
upload: ./sample__0_0_11.csv.gz to s3://dev-redshift/jason_sync_test/sample__0_0_11.csv.gz
upload: ./sample__0_0_12.csv.gz to s3://dev-redshift/jason_sync_test/sample__0_0_12.csv.gz
upload: ./sample__0_0_13.csv.gz to s3://dev-redshift/jason_sync_test/sample__0_0_13.csv.gz
--snip--

real	1m25.919s
user	0m35.153s
sys	0m15.879s

[jasonr@jr-sandbox jason_test]$ time aws s3 sync . s3://dev-redshift/jason_sync_test/

upload: ./sample__0_0_0.csv.gz to s3://dev-redshift/jason_sync_test/sample__0_0_0.csv.gz

upload: ./sample__0_0_10.csv.gz to s3://dev-redshift/jason_sync_test/sample__0_0_10.csv.gz

upload: ./sample__0_0_11.csv.gz to s3://dev-redshift/jason_sync_test/sample__0_0_11.csv.gz

upload: ./sample__0_0_12.csv.gz to s3://dev-redshift/jason_sync_test/sample__0_0_12.csv.gz

upload: ./sample__0_0_13.csv.gz to s3://dev-redshift/jason_sync_test/sample__0_0_13.csv.gz

--snip--

real 1m25.919s

user 0m35.153s

sys 0m15.879s

2. Now let's set the max concurrent requests to 20 and try again. You can do this with the command below; after running it we can see a small gain.

[jasonr@jr-sandbox jason_test]$ aws configure set default.s3.max_concurrent_requests 20
[jasonr@jr-sandbox jason_test]$ cat ~/.aws/config 
[default]
s3 =
    max_concurrent_requests = 20
[root@jr-sandbox ~]# netstat -an| grep 443| wc -l
20

real	1m13.277s
user	0m36.186s
sys	0m16.462s

[jasonr@jr-sandbox jason_test]$ aws configure set default.s3.max_concurrent_requests 20

[jasonr@jr-sandbox jason_test]$ cat ~/.aws/config

[default]

s3 =

max_concurrent_requests = 20

[root@jr-sandbox ~]# netstat -an| grep 443| wc -l

real 1m13.277s

user 0m36.186s

sys 0m16.462s

3. Bumped up to 50 shows a bit more gain:

[jasonr@jr-sandbox jason_test]$ aws configure set default.s3.max_concurrent_requests 50
[jasonr@jr-sandbox jason_test]$ cat ~/.aws/config 
[default]
s3 =
    max_concurrent_requests = 50

[root@jr-sandbox ~]# netstat -an| grep 443| wc -l
49
real	1m0.720s
user	0m37.669s
sys	0m19.344s

[jasonr@jr-sandbox jason_test]$ aws configure set default.s3.max_concurrent_requests 50

[jasonr@jr-sandbox jason_test]$ cat ~/.aws/config

[default]

s3 =

max_concurrent_requests = 50

[root@jr-sandbox ~]# netstat -an| grep 443| wc -l

real 1m0.720s

user 0m37.669s

sys 0m19.344s

4. Bumped up to 100, I start to notice that we lost some speed:

[jasonr@jr-sandbox jason_test]$ aws configure set default.s3.max_concurrent_requests 100
[jasonr@jr-sandbox jason_test]$ cat ~/.aws/config 
[default]
s3 =
    max_concurrent_requests = 100
[root@jr-sandbox ~]# netstat -an| grep 443| wc -l
95
real	1m4.212s
user	0m39.737s
sys	0m21.847s

[jasonr@jr-sandbox jason_test]$ aws configure set default.s3.max_concurrent_requests 100

[jasonr@jr-sandbox jason_test]$ cat ~/.aws/config

[default]

s3 =

max_concurrent_requests = 100

[root@jr-sandbox ~]# netstat -an| grep 443| wc -l

real 1m4.212s

user 0m39.737s

sys 0m21.847s

5. Bumped up to 250 we see the best result so far:

[jasonr@jr-sandbox jason_test]$ aws configure set default.s3.max_concurrent_requests 250
[jasonr@jr-sandbox jason_test]$ cat ~/.aws/config 
[default]
s3 =
    max_concurrent_requests = 250
[root@jr-sandbox ~]# netstat -an| grep 443| wc -l
234
real	0m55.036s
user	0m42.841s
sys	0m21.409s

[jasonr@jr-sandbox jason_test]$ aws configure set default.s3.max_concurrent_requests 250

[jasonr@jr-sandbox jason_test]$ cat ~/.aws/config

[default]

s3 =

max_concurrent_requests = 250

[root@jr-sandbox ~]# netstat -an| grep 443| wc -l

234

real 0m55.036s

user 0m42.841s

sys 0m21.409s

6. Bumped up to 500, we lose performance, most likely due to the machine resources.

[jasonr@jr-sandbox jason_test]$ aws configure set default.s3.max_concurrent_requests 500
[jasonr@jr-sandbox jason_test]$ cat ~/.aws/config 
[default]
s3 =
    max_concurrent_requests = 500
[root@jr-sandbox ~]# netstat -an| grep 443| wc -l
465
real	1m16.593s
user	0m50.336s
sys	0m25.806s

[jasonr@jr-sandbox jason_test]$ aws configure set default.s3.max_concurrent_requests 500

[jasonr@jr-sandbox jason_test]$ cat ~/.aws/config

[default]

s3 =

max_concurrent_requests = 500

[root@jr-sandbox ~]# netstat -an| grep 443| wc -l

465

real 1m16.593s

user 0m50.336s

sys 0m25.806s

So to wrap up, you can tune the amount of concurrent requests allowed from the aws cli to s3, you will need to play with this setting to get the best results for your machine.

Jason R. Ralph

Linux All Day Everyday

Python Linux Find Files With Pattern Accessed Older Than N Days And Remove

AWS EMR ImportError: this version of pandas is incompatible with numpy < 1.17.3

10 Year Anniversary: www.jasonralph.org

AWS Apache Managed Airflow EMR ModuleNotFoundError: No module named ‘requests’ Bootstrap

Node Application Stopped Sending Updates To Slack – can’t identify protocol

Capture AWS CLI Output With Timestamps On Each Line Of Output

centos8 postgresql-11-check-db-dir[]: is missing or empty

Automate pg_dump pg_restore Of Tables From Config File Send Slack Update

Python Remove Files That Match Pattern Older Than N Days

AWS CLI Max Concurrent Requests Tuning