[2/2] Fix: Repeat manifest requests after failed generation raise InvalidGeneration (DataBiosphere/azul-private#212)

hannes-ucsc · dsotirho-ucsc · commit d21b07de6743 · 2025-10-02T17:47:58.000-07:00
Add fix
diff --git a/src/azul/service/__init__.py b/src/azul/service/__init__.py
@@ -1,17 +1,34 @@
+from collections import (
+    defaultdict,
+)
+from functools import (
+    partial,
+)
+from itertools import (
+    chain,
+)
+import json
 import logging
 from typing import (
     Protocol,
     Self,
     TypedDict,
+    cast,
+    get_args,
 )
 
 import attr
 from chalice import (
     ForbiddenError,
 )
+from more_itertools import (
+    one,
+    only,
+)
 
 from azul import (
     CatalogName,
+    R,
     mutable_furl,
 )
 from azul.json import (
@@ -25,6 +42,7 @@
     FlatJSON,
     JSON,
     PrimitiveJSON,
+    reify,
 )
 
 log = logging.getLogger(__name__)
@@ -55,6 +73,250 @@
 
 type FiltersJSON = dict[FieldName, FilterJSON]
 
+_filter_operators = FilterJSON.__optional_keys__
+_simple_filter_value_types = reify(PrimitiveJSON)
+_dict_filter_value_types = reify(get_args(FlatJSON.__value__)[1])
+assert _simple_filter_value_types == _dict_filter_value_types
+_filter_range_end_types = reify(FilterRangeEnd)
+
+
+def parse_filters(raw_filters: str | None) -> FiltersJSON:
+    """
+    Deserialize, validate and normalize the given string form of the `filters`
+    request parameter. The aim of normalization is to eliminate any
+    insignificant differences so that serializing the value returned from calls
+    to this method with semantically equivalent and valid arguments yields
+    exactly the same JSON string. Two valid arguments are considered
+    semantically equivalent if they match the same subset of all possible
+    documents.
+
+    >>> parse_filters(None)
+    {}
+
+    >>> parse_filters('{}')
+    {}
+
+    >>> parse_filters('{"x":{"is":[null]}}')
+    {'x': {'is': [None]}}
+
+    Values are sorted.
+
+    >>> parse_filters('{"x":{"is":[2,1,null]}}')
+    {'x': {'is': [None, 1, 2]}}
+
+    The entries in value dictionaries are sorted by key.
+
+    >>> parse_filters('{"x":{"is":[{"b":2,"a":1}]}}')
+    {'x': {'is': [{'a': 1, 'b': 2}]}}
+
+    Value dictionaries are sorted by their values, in order of the key. If two
+    dictionaries have equal values at the first key, the value at the second key
+    is used as a tie breaker and so on.
+
+    >>> parse_filters('{"x":{"is":[{"b":2,"a":1},{"a":0,"b":3},{"b":null,"a":1}]}}')
+    {'x': {'is': [{'a': 0, 'b': 3}, {'a': 1, 'b': None}, {'a': 1, 'b': 2}]}}
+
+    Ranges are sorted by start and end value.
+
+    >>> parse_filters('{"x":{"within":[[3,4],[1,2],[1,1]]}}')
+    {'x': {'within': [[1, 1], [1, 2], [3, 4]]}}
+
+    Overall, filters are sorted by field name.
+
+    >>> parse_filters('{"y":{"within":[[1,2]]},"x":{"is":[4, 3]}}')
+    {'x': {'is': [3, 4]}, 'y': {'within': [[1, 2]]}}
+
+    >>> parse_filters('[]')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Filters must be an object')
+
+    >>> parse_filters('{"":42}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Empty field name')
+
+    >>> parse_filters('{"x":42}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Filter must be an object', 'x')
+
+    >>> parse_filters('{"x":{}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Need exactly one filter per field', 'x')
+
+    >>> parse_filters('{"x":{"is":[1],"contains":[1]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Need exactly one filter per field', 'x')
+
+    >>> parse_filters('{"x":{"foo":[2]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Invalid operator', 'x', 'foo')
+
+    >>> parse_filters('{"x":{"is":[]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Need at least one value', 'x')
+
+    >>> parse_filters('{"x":{"is":[1,1]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Duplicate values', 'x')
+
+    >>> parse_filters('{"x":{"within":[1]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Value does not match operator', 'x', <class 'int'>, 'within')
+
+    >>> parse_filters('{"x":{"is":[42,4.1]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Inconsistent value types', 'x')
+
+    >>> parse_filters('{"x":{"within":[[1]]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Range must be list of length 2', 'x')
+
+    >>> parse_filters('{"x":{"within":[[2,1]]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Range is inverted', 'x')
+
+    >>> parse_filters('{"x":{"within":[[1,2],["",""]]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Inconsistent range ends', 'x')
+
+    >>> parse_filters('{"x":{"within":[[1,1.1],[2,2.2]]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Inconsistent range ends', 'x')
+
+    >>> parse_filters('{"x":{"within":[[false,true]]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Invalid range end', 'x')
+
+    >>> parse_filters('{"x":{"within":[{}]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Value does not match operator', 'x', <class 'dict'>, 'within')
+
+    >>> parse_filters('{"x":{"within":[[1,2],[1,2]]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Duplicate ranges', 'x')
+
+    >>> parse_filters('{"x":{"is":[{}]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Empty object', 'x')
+
+    >>> parse_filters('{"x":{"is":[{"":1}]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Empty property name', 'x')
+
+    >>> parse_filters('{"x":{"is":[{"y":1},{"z":2}]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Inconsistent property names', 'x')
+
+    >>> parse_filters('{"x":{"is":[{"y":1,"z":[]}]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Invalid property value', 'x')
+
+    >>> parse_filters('{"x":{"is":[{"y":1,"z":2},{"y":"","z":3}]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Inconsistent property values', 'x')
+
+    >>> parse_filters('{"x":{"is":[{"a":1,"b":2},{"b":2,"a":1}]}}')
+    Traceback (most recent call last):
+        ...
+    AssertionError: R('Duplicate objects', 'x')
+    """
+    if raw_filters is None:
+        return {}
+    else:
+        filters = json.loads(raw_filters)
+        assert type(filters) is dict, R('Filters must be an object')
+        for field, filter in filters.items():
+            assert len(field) > 0, R('Empty field name')
+            assert type(filter) is dict, R('Filter must be an object', field)
+            assert len(filter) == 1, R('Need exactly one filter per field', field)
+            operator, values = one(filter.items())
+            assert operator in _filter_operators, R('Invalid operator', field, operator)
+            assert len(values) > 0, R('Need at least one value', field)
+            num_value_types = set(map(type, values))
+            num_value_types.discard(type(None))
+            assert len(num_value_types) < 2, R('Inconsistent value types', field)
+            value_type = only(num_value_types)
+            mismatch = R('Value does not match operator', field, value_type, operator)
+            if value_type is None:
+                assert operator in {'is'}, mismatch
+            elif value_type in _simple_filter_value_types:
+                assert operator in {'is', 'contains'}, mismatch
+                assert len(set(values)) == len(values), R('Duplicate values', field)
+            elif value_type is list:
+                assert operator in {'contains', 'within', 'intersects'}, mismatch
+                for range in values:
+                    assert len(range) == 2, R('Range must be list of length 2', field)
+                    assert range[0] <= range[1], R('Range is inverted', field)
+                assert len(set(map(tuple, values))) == len(values), R('Duplicate ranges', field)
+                end_types = set(chain.from_iterable(map(partial(map, type), values)))
+                assert len(end_types) == 1, R('Inconsistent range ends', field)
+                end_type = one(end_types)
+                assert end_type in _filter_range_end_types, R('Invalid range end', field)
+            elif value_type is dict:
+                assert operator == 'is', mismatch
+                key_sets = set(map(frozenset, map(dict.keys, values)))
+                assert len(key_sets) == 1, R('Inconsistent property names', field)
+                keys = one(key_sets)
+                assert len(keys) > 0, R('Empty object', field)
+                assert '' not in keys, R('Empty property name', field)
+                value_types_by_key = defaultdict(set)
+                for value in values:
+                    for k, v in value.items():
+                        assert type(v) in _dict_filter_value_types, R('Invalid property value', field)
+                        if v is not None:
+                            value_types_by_key[k].add(type(v))
+                num_value_types = set(map(len, value_types_by_key.values()))
+                assert num_value_types == {1}, R('Inconsistent property values', field)
+                # Sort each value dictionary in place by key (and value, but key
+                # is already unique). This makes sorting the values and checking
+                # their uniqueness easier.
+                for value in values:
+                    sorted_value = dict(sorted(value.items()))
+                    value.clear()
+                    value.update(sorted_value)
+                unique_values = set(map(tuple, map(dict.items, values)))
+                assert len(unique_values) == len(values), R('Duplicate objects', field)
+            else:
+                assert False, R('Invalid value', field)
+
+            def key(v):
+                if v is None:
+                    return False, v
+                elif type(v) is dict:
+                    # The entries in the dict are already sorted by key, the
+                    # values are primitive so we just need to handle None values
+                    # and "freeze" the iterable of entries.
+                    return True, tuple((k, key(v)) for k, v in v.items())
+                else:
+                    return True, v
+
+            values.sort(key=key)
+
+        filters = {k: v for k, v in sorted(filters.items())}
+
+        return cast(FiltersJSON, filters)
+
 
 @attr.s(auto_attribs=True, kw_only=True, frozen=True)
 class Filters:
diff --git a/src/azul/service/app_controller.py b/src/azul/service/app_controller.py
@@ -1,4 +1,3 @@
-import json
 from typing import (
     Any,
     Callable,
@@ -8,6 +7,7 @@
 import attr
 from chalice import (
     BadRequestError as BRE,
+    BadRequestError,
     NotFoundError,
 )
 
@@ -22,6 +22,7 @@
 from azul.service import (
     FileUrlFunc,
     FiltersJSON,
+    parse_filters,
 )
 from azul.strings import (
     pluralize,
@@ -33,14 +34,13 @@ class ServiceAppController(AppController):
     file_url_func: FileUrlFunc
 
     def _parse_filters(self, filters: str | None) -> FiltersJSON:
-        """
-        Parses a string with Azul filters in JSON syntax. Handles default cases
-        where filters are None or '{}'.
-        """
-        if filters is None:
-            return {}
-        else:
-            return json.loads(filters)
+        try:
+            return parse_filters(filters)
+        except AssertionError as e:
+            if R.caused(e):
+                raise R.propagate(e, BadRequestError)
+            else:
+                raise
 
 
 def validate_catalog(catalog):
diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py
@@ -110,10 +110,6 @@
 from azul.json import (
     copy_json,
 )
-from azul.json_freeze import (
-    freeze,
-    sort_frozen,
-)
 from azul.plugins import (
     ColumnMapping,
     DocumentSlice,
@@ -984,7 +980,9 @@ def manifest_key(self) -> ManifestKey:
         different return values.
         """
         git_commit = config.lambda_git_status['commit']
-        filter_string = repr(sort_frozen(freeze(self.filters.explicit)))
+        # The explicit filters are already normalized so we don't need to do
+        # anything special to desensitize the hash to insignificant differences
+        filter_string = json.dumps(self.filters.explicit)
         content_hash = str(self.manifest_content_hash)
         catalog = self.catalog
         format = self.format()
diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py
@@ -923,11 +923,11 @@ def test_manifest_content_disposition_header(self):
                     # a pair of deterministically derived v5 UUIDs.
                     (
                         {'project': {'is': ['Single of human pancreas', 'Mouse Melanoma']}},
-                        'hca-manifest-20d97863-d8cf-54f3-8575-0f9593d3d7ef.4bc67e84-4873-591f-b524-a5fe4ec215eb'
+                        'hca-manifest-89bc9973-de91-5fc4-9c6a-8c1f547d45c6.4bc67e84-4873-591f-b524-a5fe4ec215eb'
                     ),
                     (
                         {},
-                        'hca-manifest-c3cf398e-1927-5aae-ba2a-81d8d1800b2d.4bc67e84-4873-591f-b524-a5fe4ec215eb'
+                        'hca-manifest-832a257c-5540-567b-bcb6-260d2e374508.4bc67e84-4873-591f-b524-a5fe4ec215eb'
                     )
                 ]:
                     with self.subTest(filters=filters, format=format):
diff --git a/test/service/test_manifest_async.py b/test/service/test_manifest_async.py

Original file line number	Diff line number	Diff line change
`@@ -923,11 +923,11 @@ def test_manifest_content_disposition_header(self):`
`923`	`923`	`# a pair of deterministically derived v5 UUIDs.`
`924`	`924`	`(`
`925`	`925`	`{'project': {'is': ['Single of human pancreas', 'Mouse Melanoma']}},`
`926`		`- 'hca-manifest-20d97863-d8cf-54f3-8575-0f9593d3d7ef.4bc67e84-4873-591f-b524-a5fe4ec215eb'`
	`926`	`+ 'hca-manifest-89bc9973-de91-5fc4-9c6a-8c1f547d45c6.4bc67e84-4873-591f-b524-a5fe4ec215eb'`
`927`	`927`	`),`
`928`	`928`	`(`
`929`	`929`	`{},`
`930`		`- 'hca-manifest-c3cf398e-1927-5aae-ba2a-81d8d1800b2d.4bc67e84-4873-591f-b524-a5fe4ec215eb'`
	`930`	`+ 'hca-manifest-832a257c-5540-567b-bcb6-260d2e374508.4bc67e84-4873-591f-b524-a5fe4ec215eb'`
`931`	`931`	`)`
`932`	`932`	`]:`
`933`	`933`	`with self.subTest(filters=filters, format=format):`