diff --git a/.mypy.ini b/.mypy.ini index 4f4d63ef6b..9d05cf30c3 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -68,6 +68,7 @@ modules = scripts.post_deploy_tdr, scripts.find_in_snapshots, scripts.can_bundle, + scripts.delete_older_function_versions, scripts.zenhub_to_github, azul.plugins.metadata.anvil.bundle, azul.plugins.metadata.anvil.schema, diff --git a/scripts/delete_older_function_versions.py b/scripts/delete_older_function_versions.py new file mode 100644 index 0000000000..79e4c52d60 --- /dev/null +++ b/scripts/delete_older_function_versions.py @@ -0,0 +1,48 @@ +""" +Delete all versions of a Lambda function prior to the specified one. +""" +import argparse +import logging +import sys + +from azul import ( + R, + config, +) +from azul.args import ( + AzulArgumentHelpFormatter, +) +from azul.lambdas import ( + LambdaFunctions, +) +from azul.logging import ( + configure_script_logging, +) + +log = logging.getLogger(__name__) + + +def main(argv: list[str]): + assert config.terraform_component == '', R( + 'This script cannot be run with a Terraform component selected', + config.terraform_component) + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=AzulArgumentHelpFormatter) + parser.add_argument('--function-name', '-f', + required=True, + help='The name of the Lambda function.') + parser.add_argument('--function-version', '-v', + type=int, + required=True, + help='The Lambda function version to keep. Must be an ' + 'integer.') + args = parser.parse_args(argv) + log.info('Deleting function %r versions older than %r', + args.function_name, args.function_version) + functions = LambdaFunctions() + functions.delete_older_versions(args.function_name, args.function_version) + + +if __name__ == '__main__': + configure_script_logging(log) + main(sys.argv[1:]) diff --git a/scripts/generate_openapi_document.py b/scripts/generate_openapi_document.py index bd7bd15285..92bfb1f329 100644 --- a/scripts/generate_openapi_document.py +++ b/scripts/generate_openapi_document.py @@ -42,14 +42,14 @@ def main(): sources=set()) } - lambda_name = Path.cwd().name - assert lambda_name in config.lambda_names(), lambda_name + app_name = Path.cwd().name + assert app_name in config.app_names(), app_name # To create a normalized OpenAPI document, we patch any # deployment-specific variables that affect the document. with ( patch_config('catalogs', catalogs), - patch_config(f'{lambda_name}_function_name', f'azul-{lambda_name}-dev'), + patch_config(f'{app_name}_function_name', f'azul-{app_name}-dev'), patch_config('enable_log_forwarding', False), patch_config('enable_replicas', True), patch_config('monitoring_email', 'azul-group@ucsc.edu') @@ -58,10 +58,10 @@ def main(): with patch.object(target=AzulChaliceApp, attribute='base_url', new=lambda_endpoint): - app_module = load_app_module(lambda_name) + app_module = load_app_module(app_name) assert app_module.app.base_url == lambda_endpoint app_spec = app_module.app.spec() - doc_path = Path(config.project_root) / 'lambdas' / lambda_name / 'openapi.json' + doc_path = Path(config.project_root) / 'lambdas' / app_name / 'openapi.json' with write_file_atomically(doc_path) as file: json.dump(app_spec, file, indent=4) diff --git a/scripts/manage_lambdas.py b/scripts/manage_lambdas.py index ea17269f40..d47da19c35 100644 --- a/scripts/manage_lambdas.py +++ b/scripts/manage_lambdas.py @@ -2,7 +2,7 @@ import logging from azul.lambdas import ( - Lambdas, + LambdaFunctions, ) from azul.logging import ( configure_script_logging, @@ -18,4 +18,5 @@ group.add_argument('--disable', dest='enabled', action='store_false') args = parser.parse_args() assert args.enabled is not None - Lambdas().manage_lambdas(args.enabled) + functions = LambdaFunctions() + functions.manage_lambdas(args.enabled) diff --git a/scripts/reset_lambda_role.py b/scripts/reset_lambda_role.py index 8bd234b8e8..ea18cb7536 100644 --- a/scripts/reset_lambda_role.py +++ b/scripts/reset_lambda_role.py @@ -1,10 +1,16 @@ +""" +Attempt to fix KMSAccessDeniedException when invoking a function. + +See Troubleshooting section in README.md for details. +""" from azul.lambdas import ( - Lambdas, + LambdaFunctions, ) def main(): - Lambdas().reset_lambda_roles() + functions = LambdaFunctions() + functions.reset_lambda_roles() if __name__ == '__main__': diff --git a/scripts/sell_unused_slots.py b/scripts/sell_unused_slots.py index 29f31a6443..ce0ff9ab11 100644 --- a/scripts/sell_unused_slots.py +++ b/scripts/sell_unused_slots.py @@ -27,8 +27,8 @@ aws, ) from azul.lambdas import ( - Lambda, - Lambdas, + LambdaFunction, + LambdaFunctions, ) from azul.logging import ( configure_script_logging, @@ -65,17 +65,18 @@ def is_reindex_active(self) -> bool: @classmethod @cache - def _list_contribution_lambda_functions(cls) -> list[Lambda]: + def _list_contribution_lambda_functions(cls) -> list[LambdaFunction]: """ Search Lambda functions for the names of contribution Lambdas. """ + functions = LambdaFunctions() return [ - lambda_ - for lambda_ in Lambdas().list_lambdas() - if lambda_.is_contribution_lambda + function + for function in functions.list_functions() + if function.contributes ] - def _lambda_invocation_counts(self) -> dict[Lambda, int]: + def _lambda_invocation_counts(self) -> dict[LambdaFunction, int]: # FIXME: DeprecationWarning for datetime methods in Python 3.12 # https://github.com/DataBiosphere/azul/issues/5953 end = datetime.utcnow() @@ -95,6 +96,9 @@ def _lambda_invocation_counts(self) -> dict[Lambda, int]: 'Namespace': 'AWS/Lambda', 'MetricName': 'Invocations', 'Dimensions': [{ + # The 'FunctionName' dimension returns + # aggregate metrics for all versions and + # aliases of the function. 'Name': 'FunctionName', 'Value': lambda_.name }] diff --git a/src/azul/__init__.py b/src/azul/__init__.py index 435c8c7768..bd00085529 100644 --- a/src/azul/__init__.py +++ b/src/azul/__init__.py @@ -743,7 +743,7 @@ def drs_endpoint(self) -> mutable_furl: else: return self.service_endpoint - def lambda_names(self) -> list[str]: + def app_names(self) -> list[str]: return ['indexer', 'service'] @property @@ -760,13 +760,15 @@ def indexer_function_name(self, handler_name: str | None = None): def service_function_name(self, handler_name: str | None = None): return self._function_name('service', handler_name) - def _function_name(self, lambda_name: str, handler_name: str | None): + def _function_name(self, app_name: str, handler_name: str | None): if handler_name is None: - return self.qualified_resource_name(lambda_name) + return self.qualified_resource_name(app_name) else: # FIXME: Eliminate hardcoded separator # https://github.com/databiosphere/azul/issues/2964 - return self.qualified_resource_name(lambda_name, suffix='-' + handler_name) + return self.qualified_resource_name(app_name, suffix='-' + handler_name) + + active_function_alias_name = 'active' qualifier_re = re.compile(r'[a-z][a-z0-9]{1,16}') diff --git a/src/azul/chalice.py b/src/azul/chalice.py index 2f31d8606d..c84ef7ed0d 100644 --- a/src/azul/chalice.py +++ b/src/azul/chalice.py @@ -26,7 +26,6 @@ ) import attrs -import chalice from chalice import ( Chalice, ChaliceViewError, @@ -34,6 +33,7 @@ from chalice.app import ( BadRequestError, CaseInsensitiveMapping, + EventSourceHandler, HeadersType, MultiDict, NotFoundError, @@ -598,7 +598,7 @@ class metric_alarm(HandlerDecorator): period: int def __call__(self, f): - assert isinstance(f, chalice.app.EventSourceHandler), f + assert isinstance(f, EventSourceHandler), f try: metric_alarms = getattr(f, 'metric_alarms') except AttributeError: @@ -611,6 +611,14 @@ def __call__(self, f): def tf_resource_name(self) -> str: return f'{self.tf_function_resource_name}_{self.metric.name}' + @property + def event_source_handlers(self) -> dict[str, EventSourceHandler]: + return { + handler_name: handler + for handler_name, handler in self.handler_map.items() + if isinstance(handler, EventSourceHandler) + } + @property def metric_alarms(self) -> Iterator[metric_alarm]: for metric in LambdaMetric: @@ -622,19 +630,16 @@ def metric_alarms(self) -> Iterator[metric_alarm]: threshold=0, period=60 * 60 if for_errors else 5 * 60) yield alarm.bind(self) - for handler_name, handler in self.handler_map.items(): - if isinstance(handler, chalice.app.EventSourceHandler): - try: - metric_alarms = getattr(handler, 'metric_alarms') - except AttributeError: - metric_alarms = ( - self.metric_alarm(metric=metric, - threshold=0, - period=5 * 60) - for metric in LambdaMetric - ) - for metric_alarm in metric_alarms: - yield metric_alarm.bind(self, handler_name) + for handler_name, handler in self.event_source_handlers.items(): + try: + metric_alarms = getattr(handler, 'metric_alarms') + except AttributeError: + metric_alarms = ( + self.metric_alarm(metric=metric, threshold=0, period=5 * 60) + for metric in LambdaMetric + ) + for metric_alarm in metric_alarms: + yield metric_alarm.bind(self, handler_name) # noinspection PyPep8Naming @attrs.frozen @@ -650,20 +655,25 @@ class retry(HandlerDecorator): num_retries: int def __call__(self, f): - assert isinstance(f, chalice.app.EventSourceHandler), f + assert isinstance(f, EventSourceHandler), f setattr(f, 'retry', self) return f @property def retries(self) -> Iterator[retry]: - for handler_name, handler in self.handler_map.items(): - if isinstance(handler, chalice.app.EventSourceHandler): - try: - retry = getattr(handler, 'retry') - except AttributeError: - pass - else: - yield retry.bind(self, handler_name) + for handler_name, handler in self.event_source_handlers.items(): + try: + retry = getattr(handler, 'retry') + except AttributeError: + pass + else: + yield retry.bind(self, handler_name) + + @property + def tf_function_resource_names(self) -> Iterator[str]: + yield self.unqualified_app_name + for handler_name in self.event_source_handlers: + yield f'{self.unqualified_app_name}_{handler_name}' def default_routes(self): diff --git a/src/azul/health.py b/src/azul/health.py index 4a5f0031db..31a2186300 100644 --- a/src/azul/health.py +++ b/src/azul/health.py @@ -93,7 +93,7 @@ def description(self): @attr.s(frozen=True, kw_only=True, auto_attribs=True) class HealthController(AppController): - lambda_name: str + app_name: str @cached_property def storage_service(self): @@ -134,7 +134,7 @@ def cached_health(self) -> JSON: self.app.catalog, config.default_catalog) else: try: - cache = json.loads(self.storage_service.get(f'health/{self.lambda_name}')) + cache = json.loads(self.storage_service.get(f'health/{self.app_name}')) except StorageObjectNotFound: raise NotFoundError('Cached health object does not exist') else: @@ -148,7 +148,7 @@ def cached_health(self) -> JSON: def update_cache(self) -> None: assert self.app.catalog == config.default_catalog health_object = dict(time=time.time(), health=self._health.as_json_fast()) - self.storage_service.put(object_key=f'health/{self.lambda_name}', + self.storage_service.put(object_key=f'health/{self.app_name}', data=json.dumps(health_object).encode()) @property @@ -182,7 +182,7 @@ class does not examine any resources, only accessing the individual @property def lambda_name(self): - return self.controller.lambda_name + return self.controller.app_name def as_json(self, keys: Iterable[str]) -> JSON: keys = frozenset(keys) @@ -200,9 +200,9 @@ def other_lambdas(self) -> JSON: Indicates whether the companion REST API responds to HTTP requests. """ response = { - lambda_name: self._lambda(lambda_name) - for lambda_name in config.lambda_names() - if lambda_name != self.lambda_name + app_name: self._lambda(app_name) + for app_name in config.app_names() + if app_name != self.lambda_name } return { 'up': all(json_bool(v['up']) for v in response.values()), @@ -333,7 +333,7 @@ class HealthApp(AzulChaliceApp): @cached_property def health_controller(self) -> HealthController: - return HealthController(app=self, lambda_name=self.unqualified_app_name) + return HealthController(app=self, app_name=self.unqualified_app_name) def default_routes(self): _routes = super().default_routes() diff --git a/src/azul/lambdas.py b/src/azul/lambdas.py index 4ed12c197a..e3134df42e 100644 --- a/src/azul/lambdas.py +++ b/src/azul/lambdas.py @@ -33,31 +33,30 @@ @attr.s(auto_attribs=True, kw_only=True, frozen=True) -class Lambda: +class LambdaFunction: name: str role: str slot_location: Optional[str] @property - def is_contribution_lambda(self) -> bool: - for lambda_name in self._contribution_lambda_names(): + def contributes(self) -> bool: + unqualify = config.unqualified_resource_name + for handler_name in self._contribution_handler_names(): try: # FIXME: Eliminate hardcoded separator # https://github.com/databiosphere/azul/issues/2964 - resource_name, _ = config.unqualified_resource_name(self.name, - suffix='-' + lambda_name) + app_name, _ = unqualify(self.name, suffix='-' + handler_name) except AssertionError as e: if not R.caused(e): raise else: - if resource_name == 'indexer': + if app_name == 'indexer': return True return False @classmethod @cache - def _contribution_lambda_names(cls) -> frozenset[str]: - indexer = load_app_module('indexer') + def _contribution_handler_names(cls) -> frozenset[str]: notification_queue_names = { config.notifications_queue.derive(retry=retry).unqual_name for retry in (False, True) @@ -72,11 +71,12 @@ def has_notification_queue(handler) -> bool: resource_name, _, _ = config.unqualified_resource_name_and_suffix(queue) return resource_name in notification_queue_names - return frozenset(( + indexer = load_app_module('indexer') + return frozenset( handler.name for handler in vars(indexer).values() if has_notification_queue(handler) - )) + ) @classmethod def from_response(cls, response: 'FunctionConfigurationTypeDef') -> Self: @@ -92,97 +92,128 @@ def from_response(cls, response: 'FunctionConfigurationTypeDef') -> Self: def __attrs_post_init__(self): if self.slot_location is None: - assert not self.is_contribution_lambda, self + assert not self.contributes, self else: allowed_locations = config.tdr_allowed_source_locations assert self.slot_location in allowed_locations, self.slot_location -class Lambdas: +class LambdaFunctions: tag_name = 'azul-original-concurrency-limit' @property def _lambda(self): return aws.lambda_ - def list_lambdas(self) -> list[Lambda]: + def list_functions(self) -> list[LambdaFunction]: + # Note that this method returns the $LATEST version, which is what + # Amazon also refers to as the "unpublished" version. return [ - Lambda.from_response(function) + LambdaFunction.from_response(function) for response in self._lambda.get_paginator('list_functions').paginate() for function in response['Functions'] ] + def delete_older_versions(self, function_name: str, keep_version: int) -> None: + """ + Delete all versions of a Lambda function prior to the specified one. + + :param function_name: The fully qualified name of the function + e.g. 'azul-service-dev' + + :param keep_version: The version of the function to not delete. + """ + paginator = self._lambda.get_paginator('list_versions_by_function') + versions = [ + function['Version'] + for page in paginator.paginate(FunctionName=function_name) + for function in page['Versions'] + if ( + function['Version'] != '$LATEST' # The so-called "unpublished" version + and int(function['Version']) < keep_version + ) + ] + for version in versions: + log.info('Deleting version %r of %r', version, function_name) + self._lambda.delete_function(FunctionName=function_name, + Qualifier=version) + def manage_lambdas(self, enabled: bool): paginator = self._lambda.get_paginator('list_functions') - lambda_prefixes = [config.qualified_resource_name(lambda_infix) for lambda_infix in config.lambda_names()] - assert all(lambda_prefixes) - for lambda_page in paginator.paginate(FunctionVersion='ALL', MaxItems=500): - for lambda_name in [metadata['FunctionName'] for metadata in lambda_page['Functions']]: - if any(lambda_name.startswith(prefix) for prefix in lambda_prefixes): - self.manage_lambda(lambda_name, enabled) - - def manage_lambda(self, lambda_name: str, enable: bool): - lambda_settings = self._lambda.get_function(FunctionName=lambda_name) - lambda_arn = lambda_settings['Configuration']['FunctionArn'] - lambda_tags = self._lambda.list_tags(Resource=lambda_arn)['Tags'] - lambda_name = lambda_settings['Configuration']['FunctionName'] + prefixes = [ + config.qualified_resource_name(app_name) + for app_name in config.app_names() + ] + assert all(prefixes) + for response in paginator.paginate(MaxItems=500): + for function in response['Functions']: + function_name = function['FunctionName'] + if any(function_name.startswith(prefix) for prefix in prefixes): + self.manage_function(function_name, enabled) + + def manage_function(self, function_name: str, enable: bool): + function = self._lambda.get_function(FunctionName=function_name) + assert function_name == function['Configuration']['FunctionName'] + function_arn = function['Configuration']['FunctionArn'] + tags = self._lambda.list_tags(Resource=function_arn)['Tags'] if enable: - if self.tag_name in lambda_tags.keys(): - original_concurrency_limit = ast.literal_eval(lambda_tags[self.tag_name]) - + if self.tag_name in tags.keys(): + original_concurrency_limit = ast.literal_eval(tags[self.tag_name]) if original_concurrency_limit is not None: - log.info(f'Setting concurrency limit for {lambda_name} back to {original_concurrency_limit}.') - self._lambda.put_function_concurrency(FunctionName=lambda_name, + log.info('Setting concurrency limit on %r back to %r.', + function_name, original_concurrency_limit) + self._lambda.put_function_concurrency(FunctionName=function_name, ReservedConcurrentExecutions=original_concurrency_limit) else: - log.info(f'Removed concurrency limit for {lambda_name}.') - self._lambda.delete_function_concurrency(FunctionName=lambda_name) + log.info('Removed concurrency limit on %r.', function_name) + self._lambda.delete_function_concurrency(FunctionName=function_name) - lambda_arn = lambda_settings['Configuration']['FunctionArn'] - self._lambda.untag_resource(Resource=lambda_arn, TagKeys=[self.tag_name]) + self._lambda.untag_resource(Resource=function_arn, TagKeys=[self.tag_name]) else: - log.warning(f'{lambda_name} is already enabled.') + log.warning('Function %r is already enabled.', function_name) else: - if self.tag_name not in lambda_tags.keys(): + if self.tag_name in tags.keys(): + log.warning('Function %r is already disabled.', function_name) + else: try: - concurrency = lambda_settings['Concurrency'] + concurrency = function['Concurrency'] except KeyError: - # If a lambda doesn't have a limit for concurrency - # executions, Lambda.Client.get_function() - # doesn't return a response with the key, `Concurrency`. + # Function doesn't have a concurrency limit concurrency_limit = None else: concurrency_limit = concurrency['ReservedConcurrentExecutions'] - - log.info(f'Setting concurrency limit for {lambda_name} to zero.') + log.info('Setting concurrency limit on %r to zero.', function_name) new_tag = {self.tag_name: repr(concurrency_limit)} - self._lambda.tag_resource(Resource=lambda_settings['Configuration']['FunctionArn'], Tags=new_tag) - self._lambda.put_function_concurrency(FunctionName=lambda_name, ReservedConcurrentExecutions=0) - else: - log.warning(f'{lambda_name} is already disabled.') + self._lambda.tag_resource(Resource=function_arn, Tags=new_tag) + self._lambda.put_function_concurrency(FunctionName=function_name, ReservedConcurrentExecutions=0) def reset_lambda_roles(self): + """ + Attempt to fix KMSAccessDeniedException when invoking a function. + + See Troubleshooting section in README.md for details. + """ client = self._lambda - lambda_names = set(config.lambda_names()) - - for lambda_ in self.list_lambdas(): - for lambda_name in lambda_names: - if lambda_.name.startswith(config.qualified_resource_name(lambda_name)): - other_lambda_name = one(lambda_names - {lambda_name}) - temporary_role = lambda_.role.replace( - config.qualified_resource_name(lambda_name), - config.qualified_resource_name(other_lambda_name) + app_names = set(config.app_names()) + + for function in self.list_functions(): + for app_name in app_names: + if function.name.startswith(config.qualified_resource_name(app_name)): + other_app_name = one(app_names - {app_name}) + temporary_role = function.role.replace( + config.qualified_resource_name(app_name), + config.qualified_resource_name(other_app_name) ) - log.info('Temporarily updating %r to role %r', lambda_.name, temporary_role) - client.update_function_configuration(FunctionName=lambda_.name, + log.info('Temporarily updating %r to role %r', function.name, temporary_role) + client.update_function_configuration(FunctionName=function.name, Role=temporary_role) - log.info('Updating %r to role %r', lambda_.name, lambda_.role) + log.info('Updating %r to role %r', function.name, function.role) while True: try: - client.update_function_configuration(FunctionName=lambda_.name, - Role=lambda_.role) + client.update_function_configuration(FunctionName=function.name, + Role=function.role) except client.exceptions.ResourceConflictException: - log.info('Function %r is being updated. Retrying ...', lambda_.name) + log.info('Function %r is being updated. Retrying ...', function.name) time.sleep(1) else: break diff --git a/src/azul/queues.py b/src/azul/queues.py index 3bc4fc3de0..1befd3dcc6 100644 --- a/src/azul/queues.py +++ b/src/azul/queues.py @@ -60,7 +60,7 @@ Serializable, ) from azul.lambdas import ( - Lambdas, + LambdaFunctions, ) from azul.modules import ( load_app_module, @@ -502,21 +502,27 @@ def _wait_for_queue_empty(self, queue: 'Queue'): time.sleep(3) queue.reload() - def _manage_sqs_push(self, function_name: str, queue: 'Queue', enable: bool): + def _manage_sqs_push(self, + *, + function: str, + alias: str, + queue: 'Queue', + enable: bool): lambda_ = aws.lambda_ - response = lambda_.list_event_source_mappings(FunctionName=function_name, + partial_arn = f'{function}:{alias}' + response = lambda_.list_event_source_mappings(FunctionName=partial_arn, EventSourceArn=queue.attributes['QueueArn']) mapping_uuid = one(response['EventSourceMappings'])['UUID'] def update_(): log.info('%s push from %r to lambda function %r', - 'Enabling' if enable else 'Disabling', queue.url, function_name) + 'Enabling' if enable else 'Disabling', queue.url, function) lambda_.update_event_source_mapping(UUID=mapping_uuid, Enabled=enable) state = one(response['EventSourceMappings'])['State'] while True: log.info('Push from %r to lambda function %r is in state %r.', - queue.url, function_name, state) + queue.url, function, state) if state in ('Disabling', 'Enabling', 'Updating'): pass elif state == 'Enabled': @@ -570,17 +576,21 @@ def submit(f, *args, **kwargs): if queue_name == config.notifications_queue.name: # Prevent new notifications from being added submit(self._manage_lambda, config.indexer_name, enable) - submit(self._manage_sqs_push, function, queue, enable) + submit(self._manage_sqs_push, + function=function, + alias=config.active_function_alias_name, + queue=queue, + enable=enable) self._handle_futures(futures) futures = [tpe.submit(self._wait_for_queue_idle, queue) for queue in queues.values()] self._handle_futures(futures) def _manage_lambda(self, function_name: str, enable: bool): - self._lambdas.manage_lambda(function_name, enable) + self._functions.manage_function(function_name, enable) @cached_property - def _lambdas(self) -> Lambdas: - return Lambdas() + def _functions(self) -> LambdaFunctions: + return LambdaFunctions() def _handle_futures(self, futures: Iterable[Future]): errors = [] diff --git a/src/azul/terraform.py b/src/azul/terraform.py index 186b8686af..bfe8c86ce2 100644 --- a/src/azul/terraform.py +++ b/src/azul/terraform.py @@ -652,15 +652,17 @@ def ref(block_name: str, resource_type: str, name: str) -> str: # Sort in reverse so that keys that are prefixes of other keys go last rev_ref_map = sorted(ref_map.items(), reverse=True) - def patch_refs(v: AnyMutableJSON) -> AnyMutableJSON: + def patch_refs[T: AnyMutableJSON](v: T) -> T: if isinstance(v, dict): - return {k: patch_refs(v) for k, v in json_dict(v).items()} + return type(v)((k, patch_refs(v)) for k, v in json_dict(v).items()) elif isinstance(v, list): - return list(map(patch_refs, v)) + return type(v)(map(patch_refs, v)) elif isinstance(v, str): + r: str = v for old_ref, new_ref in rev_ref_map: - v = v.replace(old_ref, new_ref) - return v + r = r.replace(old_ref, new_ref) + assert isinstance(r, type(v)) + return r else: return v @@ -755,10 +757,14 @@ def tf_config(self, app_name): '${aws_vpc_endpoint.%s.id}' % app_name ] - functions = json_item_dicts(json_dict(resources['aws_lambda_function'])) - for _, resource in functions: + functions = json_item_dicts(resources['aws_lambda_function']) + for resource_name, resource in functions: assert 'layers' not in resource resource['layers'] = ['${aws_lambda_layer_version.dependencies.arn}'] + # Publishing a new Lambda function version each time lets us perform + # an atomic update of the function, avoiding a race condition + # between the update of the function's configuration and its code. + resource['publish'] = True env = config.es_endpoint_env( es_endpoint=( aws.es_endpoint @@ -776,6 +782,25 @@ def tf_config(self, app_name): resource['source_code_hash'] = '${filebase64sha256("%s")}' % package_zip resource['filename'] = package_zip + # Replace any references to unqualified function ARNs emitted by Chalice + # with references to the alias. + # + def function_to_alias[T: AnyMutableJSON](v: T) -> T: + if isinstance(v, dict): + return type(v)((k, function_to_alias(v)) for k, v in v.items()) + elif isinstance(v, list): + return type(v)(map(function_to_alias, v)) + elif isinstance(v, str) and v.endswith(('.arn}', '.invoke_arn}')): + r = v.replace('${aws_lambda_function', '${aws_lambda_alias') + assert isinstance(r, type(v)) + return r + else: + return v + + openapi_spec = json_dict(json.loads(json_str(locals[app_name]))) + openapi_spec = function_to_alias(openapi_spec) + resources = function_to_alias(resources) + assert 'aws_cloudwatch_log_group' not in resources functions = json_item_dicts(resources['aws_lambda_function']) resources['aws_cloudwatch_log_group'] = { @@ -867,7 +892,6 @@ def tf_config(self, app_name): # default responses for the API, so we use these extensions for that # purpose, too. # - openapi_spec = json.loads(json_str(locals[app_name])) rest_apis = json_dict(resources['aws_api_gateway_rest_api']) rest_api = json_dict(rest_apis[app_name]) assert 'minimum_compression_size' not in rest_api, rest_api @@ -881,43 +905,43 @@ def tf_config(self, app_name): # # https://docs.aws.amazon.com/apigateway/latest/developerguide/request-response-data-mappings.html#mapping-response-parameters # - security_headers = { + security_headers: MutableJSON = { f'gatewayresponse.header.{k}': f"'{v}'" for k, v in AzulChaliceApp.security_headers().items() } - assert 'aws_api_gateway_gateway_response' not in resources, resources - openapi_spec['x-amazon-apigateway-gateway-responses'] = ( - { - f'DEFAULT_{response_type}': { - 'responseParameters': security_headers - } for response_type in ['4XX', '5XX'] - } | { - 'INTEGRATION_FAILURE': { - 'statusCode': '502', - 'responseParameters': security_headers, - 'responseTemplates': { - "application/json": json.dumps({ - 'message': '502 Bad Gateway. The server was unable ' - 'to complete your request.' - }) - } + default_responses: MutableJSON = { + f'DEFAULT_{response_type}': {'responseParameters': security_headers} + for response_type in ['4XX', '5XX'] + } + error_responses: MutableJSON = { + 'INTEGRATION_FAILURE': { + 'statusCode': '502', + 'responseParameters': security_headers, + 'responseTemplates': { + "application/json": json.dumps({ + 'message': '502 Bad Gateway. The server was unable ' + 'to complete your request.' + }) + } + }, + 'INTEGRATION_TIMEOUT': { + 'statusCode': '504', + 'responseParameters': { + **security_headers, + 'gatewayresponse.header.Retry-After': "'10'" }, - 'INTEGRATION_TIMEOUT': { - 'statusCode': '504', - 'responseParameters': { - **security_headers, - 'gatewayresponse.header.Retry-After': "'10'" - }, - 'responseTemplates': { - "application/json": json.dumps({ - 'message': '504 Gateway Timeout. Wait the number of ' - 'seconds specified in the `Retry-After` ' - 'header before retrying the request.' - }) - } + 'responseTemplates': { + "application/json": json.dumps({ + 'message': '504 Gateway Timeout. Wait the number of ' + 'seconds specified in the `Retry-After` ' + 'header before retrying the request.' + }) } } - ) + } + gateway_responses = error_responses | default_responses + assert 'aws_api_gateway_gateway_response' not in resources, resources + openapi_spec['x-amazon-apigateway-gateway-responses'] = gateway_responses locals[app_name] = json.dumps(openapi_spec) # Replace the hard-coded ARN emitted by Chalice with a resource @@ -932,6 +956,14 @@ def tf_config(self, app_name): sqs_name, _ = config.unqualified_resource_name(resource_name, suffix) resource['event_source_arn'] = f'${{aws_sqs_queue.{sqs_name}.arn}}' + # Ensure that the Lambda permissions for the previous aliases aren't + # deleted until after the permissions for the new aliases have been + # created. + # + for name, resource in json_item_dicts(resources['aws_lambda_permission']): + assert 'lifecycle' not in resource, resource + resource['lifecycle'] = {'create_before_destroy': True} + return { 'resource': resources, 'data': data, diff --git a/terraform/api_gateway.tf.json.template.py b/terraform/api_gateway.tf.json.template.py index 2c1d6da5e7..e78e4b61d8 100644 --- a/terraform/api_gateway.tf.json.template.py +++ b/terraform/api_gateway.tf.json.template.py @@ -763,10 +763,38 @@ def add_waf_blocked_alarm(resources: JSON) -> JSON: retry.tf_function_resource_name: { 'function_name': '${aws_lambda_function.%s.function_name}' % retry.tf_function_resource_name, + 'qualifier': '${aws_lambda_alias.%s.name}' + % retry.tf_function_resource_name, 'maximum_retry_attempts': retry.num_retries } for app in apps for retry in app.chalice.retries + }, + 'aws_lambda_alias': { + resource_name: { + 'name': config.active_function_alias_name, + 'function_name': '${aws_lambda_function.%s.function_name}' % resource_name, + 'function_version': '${aws_lambda_function.%s.version}' % resource_name + } + for app in apps + for resource_name in app.chalice.tf_function_resource_names + }, + 'terraform_data': { + resource_name: { + 'triggers_replace': ['${aws_lambda_alias.%s.function_version}' % resource_name], + 'provisioner': { + 'local-exec': { + 'command': ' '.join([ + 'python', + f'{config.project_root}/scripts/delete_older_function_versions.py', + '--function-name ${aws_lambda_alias.%s.function_name}' % resource_name, + '--function-version ${aws_lambda_alias.%s.function_version}' % resource_name + ]) + } + } + } + for app in apps + for resource_name in app.chalice.tf_function_resource_names } }), *( diff --git a/terraform/cloudwatch.tf.json.template.py b/terraform/cloudwatch.tf.json.template.py index ebab05344f..51372af534 100644 --- a/terraform/cloudwatch.tf.json.template.py +++ b/terraform/cloudwatch.tf.json.template.py @@ -72,12 +72,12 @@ def dashboard_body(name: str) -> str: *( { 'aws_cloudwatch_metric_alarm': { - f'{lambda_}_5xx': { - 'alarm_name': config.qualified_resource_name(lambda_ + '_5xx'), + f'{app_name}_5xx': { + 'alarm_name': config.qualified_resource_name(app_name + '_5xx'), 'namespace': 'AWS/ApiGateway', 'metric_name': '5XXError', 'dimensions': { - 'ApiName': config.qualified_resource_name(lambda_), + 'ApiName': config.qualified_resource_name(app_name), 'Stage': config.deployment_stage, }, 'statistic': 'Sum', @@ -92,21 +92,21 @@ def dashboard_body(name: str) -> str: } } } - for lambda_ in config.lambda_names() + for app_name in config.app_names() ), *( { 'aws_cloudwatch_log_metric_filter': { - f'{lambda_}cachehealth': { - 'name': config.qualified_resource_name(f'{lambda_}cachehealth', suffix='.filter'), + f'{app_name}cachehealth': { + 'name': config.qualified_resource_name(f'{app_name}cachehealth', suffix='.filter'), 'pattern': '', 'log_group_name': ( '/aws/lambda/' - + config.qualified_resource_name(lambda_) - + f'-{lambda_}cachehealth' + + config.qualified_resource_name(app_name) + + f'-{app_name}cachehealth' ), 'metric_transformation': { - 'name': config.qualified_resource_name(f'{lambda_}cachehealth'), + 'name': config.qualified_resource_name(f'{app_name}cachehealth'), 'namespace': 'LogMetrics', 'value': 1, 'default_value': 0, @@ -114,13 +114,13 @@ def dashboard_body(name: str) -> str: } } } - for lambda_ in config.lambda_names() + for app_name in config.app_names() ), *( { 'aws_cloudwatch_metric_alarm': { - f'{lambda_}cachehealth': { - 'alarm_name': config.qualified_resource_name(f'{lambda_}cachehealth', suffix='.alarm'), + f'{app_name}cachehealth': { + 'alarm_name': config.qualified_resource_name(f'{app_name}cachehealth', suffix='.alarm'), # CloudWatch uses an unconfigurable "evaluation range" when missing # data is involved. In practice this means that an alarm on the # absence of logs with an evaluation window of ten minutes would @@ -133,7 +133,7 @@ def dashboard_body(name: str) -> str: 'metric': { 'namespace': 'LogMetrics', 'metric_name': '${aws_cloudwatch_log_metric_filter.' - '%scachehealth.metric_transformation[0].name}' % lambda_, + '%scachehealth.metric_transformation[0].name}' % app_name, 'stat': 'Sum', 'period': 10 * 60, } @@ -154,7 +154,7 @@ def dashboard_body(name: str) -> str: } } } - for lambda_ in config.lambda_names() + for app_name in config.app_names() ), { 'aws_cloudwatch_metric_alarm': { @@ -254,8 +254,8 @@ def dashboard_body(name: str) -> str: 'ok_actions': ['${data.aws_sns_topic.monitoring.arn}'], 'treat_missing_data': 'notBreaching', } - for lambda_name in config.lambda_names() - for metric_alarm in load_app_module(lambda_name).app.metric_alarms + for app_name in config.app_names() + for metric_alarm in load_app_module(app_name).app.metric_alarms }, 'waf_rate_blocked': { 'alarm_name': config.qualified_resource_name('waf_rate_blocked'), diff --git a/test/app_test_case.py b/test/app_test_case.py index 8f289f1193..33dab873fd 100644 --- a/test/app_test_case.py +++ b/test/app_test_case.py @@ -75,11 +75,11 @@ class LocalAppTestCase(CatalogTestCase, metaclass=ABCMeta): @classmethod @abstractmethod - def lambda_name(cls) -> str: + def app_name(cls) -> str: """ - Return the name of the AWS Lambda function to start locally. Must match - the name of a subdirectory of ${project_root}/lambdas. Subclasses must - override this to select which AWS Lambda function to start locally. + Return the name of the application to start locally. Must match the name + of a directory in ${project_root}/lambdas. Subclasses must override this + method. """ raise NotImplementedError @@ -100,7 +100,7 @@ def setUpClass(cls): # app modules from different lambdas loaded by different concrete # subclasses. It does, however, violate this one invariant: # `sys.modules[module.__name__] == module` - cls.app_module = load_app_module(cls.lambda_name()) + cls.app_module = load_app_module(cls.app_name()) @classmethod def tearDownClass(cls): diff --git a/test/health_check_test_case.py b/test/health_check_test_case.py index a8ef810bb8..6828fc4eec 100644 --- a/test/health_check_test_case.py +++ b/test/health_check_test_case.py @@ -137,7 +137,7 @@ def test_cached_health(self): # A successful response is obtained when all the systems are functional self._create_mock_queues() - app = load_app_module(self.lambda_name()) + app = load_app_module(self.app_name()) with self._mock(): app.update_health_cache(MagicMock(), MagicMock()) response = self._test('/health/cached') @@ -225,9 +225,9 @@ def _endpoint(self, relative_url: str) -> str: def _other_lambda_names(self) -> list[str]: return [ - lambda_name - for lambda_name in config.lambda_names() - if lambda_name != self.lambda_name() + app_name + for app_name in config.app_names() + if app_name != self.app_name() ] def _expected_other_lambdas(self, *, up: bool) -> MutableJSON: diff --git a/test/indexer/test_health_check.py b/test/indexer/test_health_check.py index 76ec0d3f89..0596e80637 100644 --- a/test/indexer/test_health_check.py +++ b/test/indexer/test_health_check.py @@ -24,7 +24,7 @@ def setUpModule(): class TestIndexerHealthCheck(DCP1TestCase, HealthCheckTestCase): @classmethod - def lambda_name(cls) -> str: + def app_name(cls) -> str: return 'indexer' def _expected_health(self, diff --git a/test/indexer/test_mirror_controller.py b/test/indexer/test_mirror_controller.py index 216afb61f5..0257649bcd 100644 --- a/test/indexer/test_mirror_controller.py +++ b/test/indexer/test_mirror_controller.py @@ -66,7 +66,7 @@ class TestMirrorController(DCP2TestCase, MirrorTestCase): @classmethod - def lambda_name(cls) -> str: + def app_name(cls) -> str: return 'indexer' def test_mirroring(self): diff --git a/test/indexer/test_notifications.py b/test/indexer/test_notifications.py index f662b4e1f6..8cc8834700 100644 --- a/test/indexer/test_notifications.py +++ b/test/indexer/test_notifications.py @@ -35,7 +35,7 @@ class TestValidNotificationRequests(LocalAppTestCase, SqsTestCase): @classmethod - def lambda_name(cls) -> str: + def app_name(cls) -> str: return 'indexer' @mock_aws diff --git a/test/service/__init__.py b/test/service/__init__.py index 972ed0a323..36249d77b6 100644 --- a/test/service/__init__.py +++ b/test/service/__init__.py @@ -105,7 +105,7 @@ def bundles(cls) -> list[SourcedBundleFQID]: ] @classmethod - def lambda_name(cls) -> str: + def app_name(cls) -> str: return 'service' @classmethod diff --git a/test/service/test_app_logging.py b/test/service/test_app_logging.py index 4a69f68e52..a2c7ae31f9 100644 --- a/test/service/test_app_logging.py +++ b/test/service/test_app_logging.py @@ -52,7 +52,7 @@ def tearDownClass(cls): super().tearDownClass() @classmethod - def lambda_name(cls) -> str: + def app_name(cls) -> str: return 'service' def test_request_logs(self): diff --git a/test/service/test_cache_poisoning.py b/test/service/test_cache_poisoning.py index f9dd68323e..74ad5b64b9 100644 --- a/test/service/test_cache_poisoning.py +++ b/test/service/test_cache_poisoning.py @@ -45,7 +45,7 @@ def tearDownClass(cls): super().tearDownClass() @classmethod - def lambda_name(cls) -> str: + def app_name(cls) -> str: return 'service' def _test(self): diff --git a/test/service/test_drs.py b/test/service/test_drs.py index 6a8fdf4fd9..c465104728 100644 --- a/test/service/test_drs.py +++ b/test/service/test_drs.py @@ -146,7 +146,7 @@ class TestDRSEndpoint(DCP1CannedBundleTestCase, LocalAppTestCase): gs_url = 'gs://important-bucket/object/path' @classmethod - def lambda_name(cls) -> str: + def app_name(cls) -> str: return 'service' def test_drs(self): diff --git a/test/service/test_health_check.py b/test/service/test_health_check.py index 39f7edc2cc..9ea10a50ac 100644 --- a/test/service/test_health_check.py +++ b/test/service/test_health_check.py @@ -24,7 +24,7 @@ def setUpModule(): class TestServiceHealthCheck(DCP1TestCase, HealthCheckTestCase): @classmethod - def lambda_name(cls) -> str: + def app_name(cls) -> str: return 'service' def _expected_health(self, diff --git a/test/service/test_manifest_async.py b/test/service/test_manifest_async.py index 8cdc0ea4b4..06324b830f 100644 --- a/test/service/test_manifest_async.py +++ b/test/service/test_manifest_async.py @@ -173,7 +173,7 @@ def test_status_failed(self, _sfn): class TestManifestController(DCP1TestCase, LocalAppTestCase): @classmethod - def lambda_name(cls) -> str: + def app_name(cls) -> str: return 'service' generation_id = UUID('1ea94a54-a64d-54f1-8b41-15455fb958db') diff --git a/test/service/test_repository_files.py b/test/service/test_repository_files.py index 513e481c5f..3738703bcd 100644 --- a/test/service/test_repository_files.py +++ b/test/service/test_repository_files.py @@ -94,7 +94,7 @@ def setUpModule(): class RepositoryFilesTestCase(LocalAppTestCase, metaclass=ABCMeta): @classmethod - def lambda_name(cls) -> str: + def app_name(cls) -> str: return 'service' def chalice_config(self): diff --git a/test/service/test_response.py b/test/service/test_response.py index 8c08ab4b6b..d7caf42a2e 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -3690,7 +3690,7 @@ class TestListCatalogsResponse(DCP1CannedBundleTestCase, LocalAppTestCase): maxDiff = None @classmethod - def lambda_name(cls) -> str: + def app_name(cls) -> str: return 'service' def test(self):