diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ed07f3..d2a3803 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added + +- implement Prometheus exporter that provides all quota results + +### Changed + +- display AWS account ID instead of profile name in check scope + ## [1.1.0] - 2021-02-27 ### Added diff --git a/README.md b/README.md index 49fc4e5..b0f8296 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,9 @@ A tool that helps keeping track of your AWS quota utilization. It'll determine t This is especially useful cause today, cloud resources are being created from all kinds of sources, e.g. IaC and Kubernetes operators. This tool will give you a head start for requesting quota increases before you hit a quota limit to prevent being stuck with a production system not being able to scale anymore. -A usual use case is to add it to your CI pipeline right after applying your IaC or run it on a regular basis. Feel free to leave a vote on [this issue](https://github.com/brennerm/aws-quota-checker/issues/1) if you'd like to see a Prometheus exporter. +A usual use case is to add it to your CI pipeline right after applying your IaC or run it on a regular basis. It also comes with a Prometheus exporter mode that allows you to visualize the data with your tool of choice, e.g. Grafana. + +![Example Grafana dashboard that uses metrics of the Prometheus exporter](https://raw.githubusercontent.com/brennerm/aws-quota-checker/master/img/example-grafana-dashboard.png) ## Installation @@ -63,6 +65,62 @@ $ aws-quota-checker check-instance vpc_acls_per_vpc vpc-0123456789 Network ACLs per VPC [default/eu-central-1/vpc-0123456789]: 0/200 ``` +### Prometheus exporter + +The Prometheus exporter requires additional dependencies that you need to install with `pip install aws-quota-checker[prometheus]`. 
+ +```bash +$ aws-quota-checker prometheus-exporter all +AWS profile: default | AWS region: us-east-1 | Active checks: am_mesh_count,asg_count,cf_stack_count,cw_alarm_count,dyndb_table_count,ebs_snapshot_count,ec2_eip_count,ec2_on_demand_f_count,ec2_on_demand_g_count,ec2_on_demand_inf_count,ec2_on_demand_p_count,ec2_on_demand_standard_count,ec2_on_demand_x_count,ec2_spot_f_count,ec2_spot_g_count,ec2_spot_inf_count,ec2_spot_p_count,ec2_spot_standard_count,ec2_spot_x_count,ec2_tgw_count,ec2_vpn_connection_count,ecs_count,eks_count,elasticbeanstalk_application_count,elasticbeanstalk_environment_count,elb_alb_count,elb_clb_count,elb_listeners_per_alb,elb_listeners_per_clb,elb_listeners_per_nlb,elb_nlb_count,elb_target_group_count,iam_attached_policy_per_group,iam_attached_policy_per_role,iam_attached_policy_per_user,iam_group_count,iam_policy_count,iam_policy_version_count,iam_server_certificate_count,iam_user_count,ig_count,lc_count,ni_count,route53_health_check_count,route53_hosted_zone_count,route53_records_per_hosted_zone,route53_reusable_delegation_set_count,route53_traffic_policy_count,route53_traffic_policy_instance_count,route53_vpcs_per_hosted_zone,route53resolver_endpoint_count,route53resolver_rule_association_count,route53resolver_rule_count,s3_bucket_count,secretsmanager_secrets_count,sg_count,sns_pending_subscriptions_count,sns_subscriptions_per_topic,sns_topics_count,vpc_acls_per_vpc,vpc_count,vpc_subnets_per_vpc +09-Mar-21 20:15:11 [INFO] botocore.credentials - Found credentials in shared credentials file: ~/.aws/credentials +09-Mar-21 20:15:11 [INFO] aws_quota.prometheus - starting /metrics endpoint on port 8080 +09-Mar-21 20:15:11 [INFO] aws_quota.prometheus - collecting checks +09-Mar-21 20:15:19 [INFO] aws_quota.prometheus - collected 110 checks +09-Mar-21 20:15:19 [INFO] aws_quota.prometheus - refreshing limits +09-Mar-21 20:16:34 [INFO] aws_quota.prometheus - limits refreshed +09-Mar-21 20:16:34 [INFO] aws_quota.prometheus - refreshing current values +09-Mar-21 20:18:15 [INFO] aws_quota.prometheus - current values refreshed +``` + +The exporter will return the following metrics: + +- awsquota_$checkkey: the current value of each quota check +- awsquota_$checkkey_limit: the limit value of each quota check +- awsquota_check_count: the number of quota checks that are being executed +- awsquota_check_limits_duration_seconds: the number of seconds that was necessary to query all quota limits +- awsquota_check_currents_duration_seconds: the number of seconds that was necessary to query all current quota values +- awsquota_info: info gauge that will expose the current AWS account and region as labels + +Depending on the check type, labels for the AWS account, the AWS region and the instance ID will be attached to the metric. 
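Because each check exposes both a current-value gauge and a matching `_limit` gauge with identical labels, quota utilization can be computed directly in PromQL. A minimal sketch for a Grafana panel or alert rule, assuming the default `awsquota` namespace and using the `vpc_count` check as an example (the 0.8 threshold simply mirrors the checker's default warning threshold):

```
awsquota_vpc_count / awsquota_vpc_count_limit > 0.8
```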
+
+Below you can find a few example metrics:
+
+```
+# HELP awsquota_info AWS quota checker info
+# TYPE awsquota_info gauge
+awsquota_info{account="123456789",region="us-east-1"} 1.0
+# HELP awsquota_check_count Number of AWS Quota Checks
+# TYPE awsquota_check_count gauge
+awsquota_check_count 110.0
+# HELP awsquota_collect_checks_duration_seconds Time to collect all quota checks
+# TYPE awsquota_collect_checks_duration_seconds gauge
+awsquota_collect_checks_duration_seconds{account="123456789",region="us-east-1"} 7.885610818862915
+# HELP awsquota_asg_count_limit Auto Scaling groups per region Limit
+# TYPE awsquota_asg_count_limit gauge
+awsquota_asg_count_limit{account="123456789",region="us-east-1"} 200.0
+# HELP awsquota_ec2_on_demand_standard_count Running On-Demand Standard (A, C, D, H, I, M, R, T, Z) EC2 instances
+# TYPE awsquota_ec2_on_demand_standard_count gauge
+awsquota_ec2_on_demand_standard_count{account="123456789"} 22.0
+# HELP awsquota_elb_listeners_per_clb Listeners per Classic Load Balancer
+# TYPE awsquota_elb_listeners_per_clb gauge
+awsquota_elb_listeners_per_clb{account="123456789",instance="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",region="us-east-1"} 10.0
+awsquota_elb_listeners_per_clb{account="123456789",instance="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",region="us-east-1"} 2.0
+```
+
+As querying all quotas may take some time, depending on the number of resources to check, the exporter works asynchronously. That means requesting the /metrics endpoint returns cached results and does not trigger a recheck of all quotas. Instead, all checks are executed and refreshed in the background. That's why no metrics are available right after starting the exporter.
+
+Hence it doesn't make much sense to scrape /metrics every few seconds, because the values only refresh once per interval. The check intervals of the background jobs can be adjusted to your needs using command line arguments (see the example invocation below).
+
 ## Missing a quota check?
 
 Feel free to create a new issue with the _New Check_ label including a description which quota check you are missing.
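The interval flags referenced above are added to the `prometheus-exporter` command in `aws_quota/cli.py` further down in this diff. As a sketch, an invocation with less aggressive refresh intervals could look like this (the values are only examples):

```bash
aws-quota-checker prometheus-exporter all \
  --port 9090 \
  --currents-check-interval 600 \
  --limits-check-interval 3600 \
  --reload-checks-interval 1800
```

A Prometheus scrape interval in the same order of magnitude as `--currents-check-interval` is then sufficient, since more frequent scrapes would only re-read cached values.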
diff --git a/aws_quota/check/elb.py b/aws_quota/check/elb.py index dfd0eb8..b52654a 100644 --- a/aws_quota/check/elb.py +++ b/aws_quota/check/elb.py @@ -1,3 +1,4 @@ +from aws_quota.exceptions import InstanceWithIdentifierNotFound import typing import boto3 from .quota_check import QuotaCheck, InstanceQuotaCheck, QuotaScope @@ -36,9 +37,12 @@ def get_all_identifiers(session: boto3.Session) -> typing.List[str]: @property def current(self): - return len(self.boto_session.client('elb').describe_load_balancers( - LoadBalancerNames=[self.instance_id] - )['LoadBalancerDescriptions'][0]['ListenerDescriptions']) + try: + return len(self.boto_session.client('elb').describe_load_balancers( + LoadBalancerNames=[self.instance_id] + )['LoadBalancerDescriptions'][0]['ListenerDescriptions']) + except self.boto_session.client('elb').exceptions.AccessPointNotFoundException as e: + raise InstanceWithIdentifierNotFound(self) from e class NetworkLoadBalancerCountCheck(QuotaCheck): @@ -67,8 +71,11 @@ def get_all_identifiers(session: boto3.Session) -> typing.List[str]: @property def current(self): - return len(self.boto_session.client('elbv2').describe_listeners( - LoadBalancerArn=self.instance_id)['Listeners']) + try: + return len(self.boto_session.client('elbv2').describe_listeners( + LoadBalancerArn=self.instance_id)['Listeners']) + except self.boto_session.client('elbv2').exceptions.LoadBalancerNotFoundException as e: + raise InstanceWithIdentifierNotFound(self) from e class ApplicationLoadBalancerCountCheck(QuotaCheck): @@ -97,8 +104,11 @@ def get_all_identifiers(session: boto3.Session) -> typing.List[str]: @property def current(self) -> int: - return len(self.boto_session.client('elbv2').describe_listeners( - LoadBalancerArn=self.instance_id)['Listeners']) + try: + return len(self.boto_session.client('elbv2').describe_listeners( + LoadBalancerArn=self.instance_id)['Listeners']) + except self.boto_session.client('elbv2').exceptions.LoadBalancerNotFoundException as e: + raise InstanceWithIdentifierNotFound(self) from e class TargetGroupCountCheck(QuotaCheck): diff --git a/aws_quota/check/iam.py b/aws_quota/check/iam.py index ba6b2ec..f362434 100644 --- a/aws_quota/check/iam.py +++ b/aws_quota/check/iam.py @@ -1,3 +1,4 @@ +from aws_quota.exceptions import InstanceWithIdentifierNotFound import typing import boto3 @@ -89,7 +90,10 @@ def maximum(self): @property def current(self): - return len(self.boto_session.client('iam').list_user_policies(UserName=self.instance_id)['PolicyNames']) + try: + return len(self.boto_session.client('iam').list_user_policies(UserName=self.instance_id)['PolicyNames']) + except self.boto_session.client('iam').exceptions.NoSuchEntityException as e: + raise InstanceWithIdentifierNotFound(self) from e class AttachedPolicyPerGroupCheck(InstanceQuotaCheck): key = "iam_attached_policy_per_group" @@ -106,7 +110,10 @@ def maximum(self): @property def current(self): - return len(self.boto_session.client('iam').list_group_policies(GroupName=self.instance_id)['PolicyNames']) + try: + return len(self.boto_session.client('iam').list_group_policies(GroupName=self.instance_id)['PolicyNames']) + except self.boto_session.client('iam').exceptions.NoSuchEntityException as e: + raise InstanceWithIdentifierNotFound(self) from e class AttachedPolicyPerRoleCheck(InstanceQuotaCheck): key = "iam_attached_policy_per_role" @@ -123,4 +130,7 @@ def maximum(self): @property def current(self): - return len(self.boto_session.client('iam').list_role_policies(RoleName=self.instance_id)['PolicyNames']) + try: + return 
len(self.boto_session.client('iam').list_role_policies(RoleName=self.instance_id)['PolicyNames']) + except self.boto_session.client('iam').exceptions.NoSuchEntityException as e: + raise InstanceWithIdentifierNotFound(self) from e diff --git a/aws_quota/check/quota_check.py b/aws_quota/check/quota_check.py index b839015..1911b6c 100644 --- a/aws_quota/check/quota_check.py +++ b/aws_quota/check/quota_check.py @@ -1,3 +1,4 @@ +from aws_quota.utils import get_account_id import enum import typing @@ -23,6 +24,22 @@ def __init__(self, boto_session: boto3.Session) -> None: self.boto_session = boto_session self.sq_client = boto_session.client('service-quotas') + def __str__(self) -> str: + return f'{self.key}{self.label_values}' + + @property + def label_values(self): + if self.scope == QuotaScope.ACCOUNT: + return {'account': get_account_id(self.boto_session)} + elif self.scope == QuotaScope.REGION: + return {'account': get_account_id(self.boto_session), 'region': self.boto_session.region_name} + elif self.scope == QuotaScope.INSTANCE: + return { + 'account': get_account_id(self.boto_session), + 'region': self.boto_session.region_name, + 'instance': self.instance_id + } + @property def maximum(self) -> int: try: diff --git a/aws_quota/check/route53.py b/aws_quota/check/route53.py index 37b415d..ec9d0f9 100644 --- a/aws_quota/check/route53.py +++ b/aws_quota/check/route53.py @@ -1,3 +1,4 @@ +from aws_quota.exceptions import InstanceWithIdentifierNotFound import typing import boto3 from .quota_check import InstanceQuotaCheck, QuotaCheck, QuotaScope @@ -84,11 +85,17 @@ def get_all_identifiers(session: boto3.Session) -> typing.List[str]: @property def maximum(self): - return self.boto_session.client('route53').get_hosted_zone_limit(Type='MAX_RRSETS_BY_ZONE', HostedZoneId=self.instance_id)['Limit']['Value'] + try: + return self.boto_session.client('route53').get_hosted_zone_limit(Type='MAX_RRSETS_BY_ZONE', HostedZoneId=self.instance_id)['Limit']['Value'] + except self.boto_session.client('route53').exceptions.NoSuchHostedZone as e: + raise InstanceWithIdentifierNotFound(self) from e @property def current(self): - return self.boto_session.client('route53').get_hosted_zone_limit(Type='MAX_RRSETS_BY_ZONE', HostedZoneId=self.instance_id)['Count'] + try: + return self.boto_session.client('route53').get_hosted_zone_limit(Type='MAX_RRSETS_BY_ZONE', HostedZoneId=self.instance_id)['Count'] + except self.boto_session.client('route53').exceptions.NoSuchHostedZone as e: + raise InstanceWithIdentifierNotFound(self) from e class AssociatedVpcHostedZoneCheck(InstanceQuotaCheck): @@ -102,8 +109,14 @@ def get_all_identifiers(session: boto3.Session) -> typing.List[str]: @property def maximum(self): - return self.boto_session.client('route53').get_hosted_zone_limit(Type='MAX_VPCS_ASSOCIATED_BY_ZONE', HostedZoneId=self.instance_id)['Limit']['Value'] + try: + return self.boto_session.client('route53').get_hosted_zone_limit(Type='MAX_VPCS_ASSOCIATED_BY_ZONE', HostedZoneId=self.instance_id)['Limit']['Value'] + except self.boto_session.client('route53').exceptions.NoSuchHostedZone as e: + raise InstanceWithIdentifierNotFound(self) from e @property def current(self): - return self.boto_session.client('route53').get_hosted_zone_limit(Type='MAX_VPCS_ASSOCIATED_BY_ZONE', HostedZoneId=self.instance_id)['Count'] + try: + return self.boto_session.client('route53').get_hosted_zone_limit(Type='MAX_VPCS_ASSOCIATED_BY_ZONE', HostedZoneId=self.instance_id)['Count'] + except self.boto_session.client('route53').exceptions.NoSuchHostedZone 
as e: + raise InstanceWithIdentifierNotFound(self) from e diff --git a/aws_quota/check/sns.py b/aws_quota/check/sns.py index 73d9ca9..cd93d96 100644 --- a/aws_quota/check/sns.py +++ b/aws_quota/check/sns.py @@ -1,3 +1,4 @@ +from aws_quota.exceptions import InstanceWithIdentifierNotFound import typing import boto3 @@ -37,6 +38,7 @@ class SubscriptionsPerTopicCheck(InstanceQuotaCheck): description = "SNS subscriptions per topics" service_code = 'sns' quota_code = 'L-A4340BCD' + instance_id = 'Topic ARN' @staticmethod def get_all_identifiers(session: boto3.Session) -> typing.List[str]: @@ -44,5 +46,9 @@ def get_all_identifiers(session: boto3.Session) -> typing.List[str]: @property def current(self): - topic_attrs = self.boto_session.client('sns').get_topic_attributes(TopicArn=self.instance_id)['Attributes'] + try: + topic_attrs = self.boto_session.client('sns').get_topic_attributes(TopicArn=self.instance_id)['Attributes'] + except self.boto_session.client('sns').exceptions.NotFoundException as e: + raise InstanceWithIdentifierNotFound(self) from e + return int(topic_attrs['SubscriptionsConfirmed']) + int(topic_attrs['SubscriptionsPending']) diff --git a/aws_quota/check/vpc.py b/aws_quota/check/vpc.py index ef50cc1..d32de95 100644 --- a/aws_quota/check/vpc.py +++ b/aws_quota/check/vpc.py @@ -1,8 +1,18 @@ +from aws_quota.exceptions import InstanceWithIdentifierNotFound import typing import boto3 +import botocore.exceptions from .quota_check import QuotaCheck, InstanceQuotaCheck, QuotaScope +def check_if_vpc_exists(session: boto3.Session, vpc_id: str) -> bool: + client = session.client('ec2') + try: + client.describe_vpcs(VpcIds=[vpc_id]) + except botocore.exceptions.ClientError as e: + return False + return True + class VpcCountCheck(QuotaCheck): key = "vpc_count" @@ -66,11 +76,14 @@ def get_all_identifiers(session: boto3.Session) -> typing.List[str]: @property def current(self): - return len(self.boto_session.client('ec2').describe_subnets(Filters=[ - { - 'Name': 'vpc-id', - 'Values': [self.instance_id] - }])['Subnets']) + if check_if_vpc_exists(self.boto_session, self.instance_id): + return len(self.boto_session.client('ec2').describe_subnets(Filters=[ + { + 'Name': 'vpc-id', + 'Values': [self.instance_id] + }])['Subnets']) + else: + raise InstanceWithIdentifierNotFound(self) class AclsPerVpcCountCheck(InstanceQuotaCheck): @@ -86,8 +99,11 @@ def get_all_identifiers(session: boto3.Session) -> typing.List[str]: @property def current(self) -> int: - return len(self.boto_session.client('ec2').describe_network_acls(Filters=[ - { - 'Name': 'vpc-id', - 'Values': [self.instance_id] - }])['NetworkAcls']) + if check_if_vpc_exists(self.boto_session, self.instance_id): + return len(self.boto_session.client('ec2').describe_network_acls(Filters=[ + { + 'Name': 'vpc-id', + 'Values': [self.instance_id] + }])['NetworkAcls']) + else: + raise InstanceWithIdentifierNotFound(self) diff --git a/aws_quota/cli.py b/aws_quota/cli.py index 139c79c..56745fe 100644 --- a/aws_quota/cli.py +++ b/aws_quota/cli.py @@ -1,3 +1,5 @@ +import logging +from aws_quota.utils import get_account_id import enum import typing import sys @@ -64,11 +66,11 @@ def run_checks(self): maximum = chk.maximum if chk.scope == QuotaScope.ACCOUNT: - scope = self.session.profile_name + scope = get_account_id(self.session) elif chk.scope == QuotaScope.REGION: - scope = f'{self.session.profile_name}/{self.session.region_name}' + scope = f'{get_account_id(self.session)}/{self.session.region_name}' elif chk.scope == QuotaScope.INSTANCE: - scope = 
f'{self.session.profile_name}/{self.session.region_name}/{chk.instance_id}' + scope = f'{get_account_id(self.session)}/{self.session.region_name}/{chk.instance_id}' result = self.__report(chk.description, scope, current, maximum) @@ -86,21 +88,28 @@ def cli(): pass -def common_check_options(function): +def common_scope_options(function): function = click.option( '--region', help='Region to use for region scoped quotas, defaults to current')(function) function = click.option( '--profile', help='AWS profile name to use, defaults to current')(function) + + return function + + +def common_check_options(function): function = click.option( '--warning-threshold', help='Warning threshold percentage for quota utilization, defaults to 0.8', default=0.8)(function) function = click.option( '--error-threshold', help='Error threshold percentage for quota utilization, defaults to 0.9', default=0.9)(function) function = click.option('--fail-on-warning/--no-fail-on-warning', help='Exit with non-zero error code on quota warning, defaults to false', default=False)(function) + return function @cli.command() +@common_scope_options @common_check_options @click.argument('check-keys') def check(check_keys, region, profile, warning_threshold, error_threshold, fail_on_warning): @@ -115,7 +124,7 @@ def check(check_keys, region, profile, warning_threshold, error_threshold, fail_ Execute list-checks command to get available check keys Pass all to run all checks - + For instance checks it'll run through each individual instance available""" split_check_keys = check_keys.split(',') @@ -134,7 +143,8 @@ def check(check_keys, region, profile, warning_threshold, error_threshold, fail_ selected_checks = list( filter(lambda c: c.key in whitelisted_check_keys, ALL_CHECKS)) - selected_checks = list(filter(lambda c: c.key not in blacklisted_check_keys, selected_checks)) + selected_checks = list( + filter(lambda c: c.key not in blacklisted_check_keys, selected_checks)) session = boto3.Session(region_name=region, profile_name=profile) @@ -158,6 +168,7 @@ def check(check_keys, region, profile, warning_threshold, error_threshold, fail_ @cli.command() +@common_scope_options @common_check_options @click.argument('check-key') @click.argument('instance-id') @@ -182,11 +193,78 @@ def check_instance(check_key, instance_id, region, profile, warning_threshold, e error_threshold, fail_on_warning).run_checks() +@cli.command() +@common_scope_options +@click.option('--port', help='Port on which to expose the Prometheus /metrics endpoint, defaults to 8080', default=8080) +@click.option('--namespace', help='Namespace/prefix for Prometheus metrics, defaults to awsquota', default='awsquota') +@click.option('--limits-check-interval', help='Interval in seconds at which to check the limit quota value, defaults to 600', default=600) +@click.option('--currents-check-interval', help='Interval in seconds at which to check the current quota value, defaults to 300', default=300) +@click.option('--reload-checks-interval', help='Interval in seconds at which to collect new checks e.g. 
when a new resource has been created, defaults to 600', default=600) +@click.option('--enable-duration-metrics/--disable-duration-metrics', help='Flag to control whether to collect/expose duration metrics, defaults to true', default=True) +@click.argument('check-keys') +def prometheus_exporter(check_keys, region, profile, port, namespace, limits_check_interval, currents_check_interval, reload_checks_interval, enable_duration_metrics): + """Start a Prometheus exporter for quota checks + + Set checks to execute with CHECK_KEYS + + e.g. check vpc_count,ecs_count + + Blacklist checks by prefixing them with ! + + e.g. check all,!vpc_count + + Execute list-checks command to get available check keys + + Pass all to run all checks + """ + from aws_quota.prometheus import PrometheusExporter, PrometheusExporterSettings + + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s [%(levelname)s] %(name)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S' + ) + + split_check_keys = check_keys.split(',') + blacklisted_check_keys = [] + whitelisted_check_keys = [] + + for key in split_check_keys: + if key.startswith('!'): + blacklisted_check_keys.append(key.lstrip('!')) + else: + whitelisted_check_keys.append(key) + + if 'all' in whitelisted_check_keys: + selected_checks = ALL_CHECKS + else: + selected_checks = list( + filter(lambda c: c.key in whitelisted_check_keys, ALL_CHECKS)) + + selected_checks = list( + filter(lambda c: c.key not in blacklisted_check_keys, selected_checks)) + + session = boto3.Session(region_name=region, profile_name=profile) + + click.echo( + f'AWS profile: {session.profile_name} | AWS region: {session.region_name} | Active checks: {",".join([check.key for check in selected_checks])}') + + settings = PrometheusExporterSettings( + port=port, + namespace=namespace, + get_currents_interval=currents_check_interval, + get_limits_interval=limits_check_interval, + reload_checks_interval=reload_checks_interval, + enable_duration_metrics=enable_duration_metrics + ) + + PrometheusExporter(session, selected_checks, settings).start() + + @cli.command() def list_checks(): """List available quota checks""" click.echo(tabulate.tabulate([(chk.key, chk.description, chk.scope.name, getattr(chk, 'instance_id', 'N/A')) - for chk in ALL_CHECKS], headers=['Key', 'Description', 'Scope', 'Instance ID'])) + for chk in ALL_CHECKS], headers=['Key', 'Description', 'Scope', 'Instance ID'])) if __name__ == '__main__': diff --git a/aws_quota/exceptions.py b/aws_quota/exceptions.py new file mode 100644 index 0000000..29dbf5c --- /dev/null +++ b/aws_quota/exceptions.py @@ -0,0 +1,10 @@ +class AwsQuotaCheckerException(RuntimeError): + pass + + +class InstanceWithIdentifierNotFound(AwsQuotaCheckerException): + def __init__(self, check) -> None: + self.check = check + + def __str__(self) -> str: + return f'check {self.check.key} could not find instance with ID "{self.check.instance_id}"' diff --git a/aws_quota/prometheus.py b/aws_quota/prometheus.py new file mode 100644 index 0000000..8c23b6c --- /dev/null +++ b/aws_quota/prometheus.py @@ -0,0 +1,201 @@ +import asyncio +from aws_quota.exceptions import InstanceWithIdentifierNotFound +from aws_quota.utils import get_account_id +import dataclasses +import logging +import signal +import time +import contextlib +import typing + +from aws_quota.check.quota_check import InstanceQuotaCheck, QuotaCheck + +import boto3 +import prometheus_client as prom + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class PrometheusExporterSettings: + port: int + 
namespace: str + get_currents_interval: int + get_limits_interval: int + reload_checks_interval: int + enable_duration_metrics: bool + + +class PrometheusExporter: + def __init__(self, + session: boto3.Session, + check_classes: typing.List[QuotaCheck], + settings: PrometheusExporterSettings): + self.session = session + self.check_classes = check_classes + self.checks = [] + self.settings = settings + + # unregister default collectors + for name in list(prom.REGISTRY._names_to_collectors.values()): + with contextlib.suppress(KeyError): + prom.REGISTRY.unregister(name) + + prom.Info(f'{self.settings.namespace}', 'AWS quota checker info').info({ + **self.default_labels + }) + + @property + def default_labels(self): + return { + 'account': get_account_id(self.session), + 'region': self.session.region_name + } + + @contextlib.contextmanager + def timeit_gauge(self, prefix: str, labels: dict = None, **kwargs): + if labels is None: + labels = self.default_labels + + start = time.time() + try: + yield + finally: + duration = time.time() - start + + if self.settings.enable_duration_metrics: + PrometheusExporter.get_or_create_gauge( + name=f'{prefix}_duration_seconds', + labelnames=labels.keys(), + **kwargs).labels(**labels).set(duration) + + @staticmethod + def get_or_create_gauge(name, **kwargs) -> prom.Gauge: + if name in prom.REGISTRY._names_to_collectors: + return prom.REGISTRY._names_to_collectors[name] + + return prom.Gauge(name, **kwargs) + + def drop_obsolete_check(self): + raise NotImplementedError + + async def load_checks_job(self): + g = PrometheusExporter.get_or_create_gauge( + f'{self.settings.namespace}_check_count', + documentation='Number of AWS Quota Checks' + ) + + while True: + with self.timeit_gauge( + f'{self.settings.namespace}_collect_checks', + documentation='Time to collect all quota checks' + ): + logger.info('collecting checks') + checks = [] + for chk in self.check_classes: + try: + if issubclass(chk, InstanceQuotaCheck): + for identifier in chk.get_all_identifiers(self.session): + checks.append( + chk(self.session, identifier) + ) + else: + checks.append(chk(self.session)) + except Exception: + logger.error('failed to collect check %s', chk) + + g.set(len(checks)) + self.checks = checks + logger.info(f'collected {len(checks)} checks') + await asyncio.sleep(self.settings.reload_checks_interval) + + async def get_limits_job(self): + + while True: + with self.timeit_gauge( + f'{self.settings.namespace}_check_limits', + documentation='Time to check limits of all quotas' + ): + logger.info('refreshing limits') + checks_to_drop = [] + + for check in self.checks: + labels = check.label_values + name = f'{self.settings.namespace}_{check.key}_limit' + + try: + value = check.maximum + + PrometheusExporter.get_or_create_gauge( + name, + documentation=f'{check.description} Limit', + labelnames=labels.keys() + ).labels(**check.label_values).set(value) + except InstanceWithIdentifierNotFound as e: + logger.warn( + 'instance with identifier %s does not exist anymore, dropping it...', e.check.instance_id) + checks_to_drop.append(e.check) + except Exception: + logger.error( + 'getting maximum of quota %s failed', check) + + for check in checks_to_drop: + self.checks.remove(check) + + logger.info('limits refreshed') + await asyncio.sleep(self.settings.get_limits_interval) + + async def get_currents_job(self): + + while True: + with self.timeit_gauge( + f'{self.settings.namespace}_check_currents', + documentation='Time to check limits of all quotas' + ): + + logger.info('refreshing 
current values')
+                checks_to_drop = []
+                for check in self.checks:
+                    labels = check.label_values
+                    name = f'{self.settings.namespace}_{check.key}'
+
+                    try:
+                        value = check.current
+
+                        PrometheusExporter.get_or_create_gauge(
+                            name,
+                            documentation=f'{check.description}',
+                            labelnames=labels.keys()
+                        ).labels(**check.label_values).set(value)
+                    except InstanceWithIdentifierNotFound as e:
+                        logger.warn(
+                            'instance with identifier %s does not exist anymore, dropping it...', e.check.instance_id)
+                        checks_to_drop.append(e.check)
+                    except Exception:
+                        logger.error(
+                            'getting current value of quota %s failed', check)
+
+                for check in checks_to_drop:
+                    self.checks.remove(check)
+
+                logger.info('current values refreshed')
+            await asyncio.sleep(self.settings.get_currents_interval)
+
+    def serve(self):
+        logger.info(f'starting /metrics endpoint on port {self.settings.port}')
+        prom.start_http_server(self.settings.port)
+
+    async def background_jobs(self):
+        await asyncio.gather(
+            self.load_checks_job(),
+            self.get_limits_job(),
+            self.get_currents_job(),
+            return_exceptions=True
+        )
+
+    def start(self):
+        self.serve()
+        try:
+            asyncio.run(self.background_jobs())
+        except KeyboardInterrupt:
+            logger.info('shutting down...')
diff --git a/aws_quota/utils.py b/aws_quota/utils.py
new file mode 100644
index 0000000..cd8e2ad
--- /dev/null
+++ b/aws_quota/utils.py
@@ -0,0 +1,7 @@
+import functools
+import boto3
+
+
+@functools.lru_cache
+def get_account_id(session: boto3.Session) -> str:
+    return session.client('sts').get_caller_identity()['Account']
\ No newline at end of file
diff --git a/img/example-grafana-dashboard.png b/img/example-grafana-dashboard.png
new file mode 100644
index 0000000..85c525c
Binary files /dev/null and b/img/example-grafana-dashboard.png differ
diff --git a/setup.py b/setup.py
index c96ab11..69dcae8 100644
--- a/setup.py
+++ b/setup.py
@@ -23,6 +23,9 @@
             'pylint',
             'keepachangelog',
             'wheel'
+        },
+        'prometheus':{
+            'prometheus-client'
         }
     },
     entry_points='''