Code Review: asked on December 19, 2021
This is my scraper for AWS ECS (Elastic Container Service). My primary use for this is to grep
through the resulting file to see which cluster a given service is in.
I’ve been coding in Python for a few years now, but I don’t feel like my code is as Pythonic as it should be. I’ve run this through pylint
and it passes cleanly now. I’m open to any suggestions for how to make this cleaner, more reliable, or more Pythonic.
#!/usr/bin/env python3
"""scrape ECS for inventory"""
import pprint  # pylint: disable=unused-import
import sys
import time

import boto3  # pylint: disable=import-error

import sre_logging


def ecs_services(ecs):
    """get all ECS services"""
    # get list of clusters from AWS
    ecs_clusters = []
    paginator1 = ecs.get_paginator('list_clusters')
    for page in paginator1.paginate():
        for cluster in page['clusterArns']:
            ecs_clusters.append(cluster)

    service_map = {}
    for cluster in ecs_clusters:
        # describe cluster
        cluster_details = ecs.describe_clusters(clusters=[cluster])
        cluster_name = cluster_details['clusters'][0]['clusterName']
        cluster_tags = cluster_details['clusters'][0]['tags']
        logger.info("getting services for cluster %s, tags: %s...", cluster_name, cluster_tags)
        service_map[cluster_name] = {}

        # get services
        paginator2 = ecs.get_paginator('list_services')
        this_ecs_services = []
        for page in paginator2.paginate(cluster=cluster):
            for service in page['serviceArns']:
                this_ecs_services.append(service)
        #pp.pprint(this_ecs_services)

        for service in this_ecs_services:
            service_details = ecs.describe_services(cluster=cluster, services=[service])
            service_name = service_details['services'][0]['serviceName']
            service_definition = service_details['services'][0]['taskDefinition']
            service_map[cluster_name][service_name] = {
                "serviceArn": service,
                "clusterArn": cluster,
                "taskDefinition": service_definition
            }
    return service_map


def find_consul(env):
    """pull consul variables out of environment"""
    #pp.pprint(env)
    host = ''
    port = ''
    prefix = ''
    for entry in env:
        if entry['name'] == 'CONSUL_HOST':
            host = entry['value']
        if entry['name'] == 'CONSUL_PORT':
            port = entry['value']
        if entry['name'] == 'CONSUL_SERVICE_PREFIX':
            prefix = entry['value']
    if prefix:
        consul_url = "%s:%s/%s" % (host, port, prefix)
    else:
        consul_url = "undefined"
    return consul_url


def scrape_ecs_stats():
    """do one pass of scraping ECS"""
    ecs = boto3.client('ecs')
    services = ecs_services(ecs)
    #pp.pprint(services)
    logger.info("getting task definitions...")
    # find consul config
    result_txt = []
    for cluster in services:
        for service in services[cluster]:
            task_def_arn = services[cluster][service]["taskDefinition"]
            task_def = ecs.describe_task_definition(taskDefinition=task_def_arn)
            env = task_def['taskDefinition']['containerDefinitions'][0]['environment']
            consul_url = find_consul(env)
            result_txt.append("%s\t%s\t%s\t%s\n" % (cluster, service, task_def_arn, consul_url))
    send_content = "".join(result_txt)
    #pp.pprint(send_content)
    # upload to S3
    bucket = "redacted"
    filepath = "ecs/ecs_services.txt"
    boto_s3 = boto3.client('s3')
    boto_s3.put_object(Bucket=bucket, Body=send_content, ContentType="text/plain", Key=filepath)
    logger.info("sent %s to s3", filepath)


def main():
    """main control loop for scraping ECS stats"""
    while True:
        try:
            start = time.time()
            scrape_ecs_stats()
            end = time.time()
            duration = end - start
            naptime = 600 - duration
            if naptime < 0:
                naptime = 0
            logger.info("ran %d seconds, sleeping %3.2f minutes", duration, naptime / 60)
            time.sleep(naptime)
        except KeyboardInterrupt:
            sys.exit(1)
        except:  # pylint: disable=bare-except
            # so we can log unexpected and intermittent errors, from boto mostly
            logger.exception("top level catch")
            time.sleep(180)  # 3 minutes


if __name__ == "__main__":
    logger = sre_logging.configure_logging()
    pp = pprint.PrettyPrinter(indent=4)
    main()
Scraping should be the last resort for getting data from a service that does not have an API. Thankfully, not only does ECS have a full API, you're using it via Boto - which means that you are not scraping. Scraping is hacking together unstructured content into structured data, which is not what you're doing.
ecs_clusters = []
paginator1 = ecs.get_paginator('list_clusters')
for page in paginator1.paginate():
    for cluster in page['clusterArns']:
        ecs_clusters.append(cluster)
can be
ecs_clusters = [
    cluster
    for page in paginator1.paginate()
    for cluster in page['clusterArns']
]
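If nested comprehensions read poorly to you, the standard library can do the same flattening; a sketch assuming paginator1 as above (itertools isn't something your code currently imports):

from itertools import chain

# flatten the pages of cluster ARNs into one list
ecs_clusters = list(chain.from_iterable(
    page['clusterArns'] for page in paginator1.paginate()
))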
cluster_details['clusters'][0] should be assigned to a variable, since it's used multiple times.
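For example, a small sketch of that refactor (the cluster_detail name is just illustrative):

# look up the single described cluster once
cluster_detail = ecs.describe_clusters(clusters=[cluster])['clusters'][0]
cluster_name = cluster_detail['clusterName']
cluster_tags = cluster_detail['tags']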
I want to call out that you're using logging properly, and that's not nothing. Good job; keep it up.
"%s:%s/%s" % (host, port, prefix)
can be
f"{host}:{port}/{prefix}"
As these things go, "".join(result_txt) is not the worst way to do concatenation; it's generally guaranteed to be more performant than +=. That said, I consider the StringIO interface simpler to use than having to manage and then aggregate a sequence, and it has the added advantage that you can more readily swap it out for an actual file object (including, maybe, an HTTP stream) if that's useful.
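A minimal sketch of the StringIO version, reusing the lookups from your existing loop (so it assumes services, ecs, and find_consul as defined above):

import io

out = io.StringIO()
for cluster in services:
    for service in services[cluster]:
        task_def_arn = services[cluster][service]["taskDefinition"]
        task_def = ecs.describe_task_definition(taskDefinition=task_def_arn)
        env = task_def['taskDefinition']['containerDefinitions'][0]['environment']
        consul_url = find_consul(env)
        # write one tab-separated record per service
        out.write(f"{cluster}\t{service}\t{task_def_arn}\t{consul_url}\n")
send_content = out.getvalue()

Because StringIO is file-like, the loop body would work unchanged if out were later replaced by an open file or another writable stream.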
naptime = 600 - duration
if naptime < 0:
    naptime = 0
can be
naptime = max(0, 600 - duration)
Answered by Reinderien on December 19, 2021