Skip to content

Commit d21b07d

Browse files
hannes-ucscdsotirho-ucsc
authored andcommitted
[2/2] Fix: Repeat manifest requests after failed generation raise InvalidGeneration (DataBiosphere/azul-private#212)
Add fix
1 parent 07a347c commit d21b07d

File tree

5 files changed

+282
-28
lines changed

5 files changed

+282
-28
lines changed

src/azul/service/__init__.py

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,34 @@
1+
from collections import (
2+
defaultdict,
3+
)
4+
from functools import (
5+
partial,
6+
)
7+
from itertools import (
8+
chain,
9+
)
10+
import json
111
import logging
212
from typing import (
313
Protocol,
414
Self,
515
TypedDict,
16+
cast,
17+
get_args,
618
)
719

820
import attr
921
from chalice import (
1022
ForbiddenError,
1123
)
24+
from more_itertools import (
25+
one,
26+
only,
27+
)
1228

1329
from azul import (
1430
CatalogName,
31+
R,
1532
mutable_furl,
1633
)
1734
from azul.json import (
@@ -25,6 +42,7 @@
2542
FlatJSON,
2643
JSON,
2744
PrimitiveJSON,
45+
reify,
2846
)
2947

3048
log = logging.getLogger(__name__)
@@ -55,6 +73,250 @@
5573

5674
type FiltersJSON = dict[FieldName, FilterJSON]
5775

76+
_filter_operators = FilterJSON.__optional_keys__
77+
_simple_filter_value_types = reify(PrimitiveJSON)
78+
_dict_filter_value_types = reify(get_args(FlatJSON.__value__)[1])
79+
assert _simple_filter_value_types == _dict_filter_value_types
80+
_filter_range_end_types = reify(FilterRangeEnd)
81+
82+
83+
def parse_filters(raw_filters: str | None) -> FiltersJSON:
84+
"""
85+
Deserialize, validate and normalize the given string form of the `filters`
86+
request parameter. The aim of normalization is to eliminate any
87+
insignificant differences so that serializing the value returned from calls
88+
to this method with semantically equivalent and valid arguments yields
89+
exactly the same JSON string. Two valid arguments are considered
90+
semantically equivalent if they match the same subset of all possible
91+
documents.
92+
93+
>>> parse_filters(None)
94+
{}
95+
96+
>>> parse_filters('{}')
97+
{}
98+
99+
>>> parse_filters('{"x":{"is":[null]}}')
100+
{'x': {'is': [None]}}
101+
102+
Values are sorted.
103+
104+
>>> parse_filters('{"x":{"is":[2,1,null]}}')
105+
{'x': {'is': [None, 1, 2]}}
106+
107+
The entries in value dictionaries are sorted by key.
108+
109+
>>> parse_filters('{"x":{"is":[{"b":2,"a":1}]}}')
110+
{'x': {'is': [{'a': 1, 'b': 2}]}}
111+
112+
Value dictionaries are sorted by their values, in order of the key. If two
113+
dictionaries have equal values at the first key, the value at the second key
114+
is used as a tie breaker and so on.
115+
116+
>>> parse_filters('{"x":{"is":[{"b":2,"a":1},{"a":0,"b":3},{"b":null,"a":1}]}}')
117+
{'x': {'is': [{'a': 0, 'b': 3}, {'a': 1, 'b': None}, {'a': 1, 'b': 2}]}}
118+
119+
Ranges are sorted by start and end value.
120+
121+
>>> parse_filters('{"x":{"within":[[3,4],[1,2],[1,1]]}}')
122+
{'x': {'within': [[1, 1], [1, 2], [3, 4]]}}
123+
124+
Overall, filters are sorted by field name.
125+
126+
>>> parse_filters('{"y":{"within":[[1,2]]},"x":{"is":[4, 3]}}')
127+
{'x': {'is': [3, 4]}, 'y': {'within': [[1, 2]]}}
128+
129+
>>> parse_filters('[]')
130+
Traceback (most recent call last):
131+
...
132+
AssertionError: R('Filters must be an object')
133+
134+
>>> parse_filters('{"":42}')
135+
Traceback (most recent call last):
136+
...
137+
AssertionError: R('Empty field name')
138+
139+
>>> parse_filters('{"x":42}')
140+
Traceback (most recent call last):
141+
...
142+
AssertionError: R('Filter must be an object', 'x')
143+
144+
>>> parse_filters('{"x":{}}')
145+
Traceback (most recent call last):
146+
...
147+
AssertionError: R('Need exactly one filter per field', 'x')
148+
149+
>>> parse_filters('{"x":{"is":[1],"contains":[1]}}')
150+
Traceback (most recent call last):
151+
...
152+
AssertionError: R('Need exactly one filter per field', 'x')
153+
154+
>>> parse_filters('{"x":{"foo":[2]}}')
155+
Traceback (most recent call last):
156+
...
157+
AssertionError: R('Invalid operator', 'x', 'foo')
158+
159+
>>> parse_filters('{"x":{"is":[]}}')
160+
Traceback (most recent call last):
161+
...
162+
AssertionError: R('Need at least one value', 'x')
163+
164+
>>> parse_filters('{"x":{"is":[1,1]}}')
165+
Traceback (most recent call last):
166+
...
167+
AssertionError: R('Duplicate values', 'x')
168+
169+
>>> parse_filters('{"x":{"within":[1]}}')
170+
Traceback (most recent call last):
171+
...
172+
AssertionError: R('Value does not match operator', 'x', <class 'int'>, 'within')
173+
174+
>>> parse_filters('{"x":{"is":[42,4.1]}}')
175+
Traceback (most recent call last):
176+
...
177+
AssertionError: R('Inconsistent value types', 'x')
178+
179+
>>> parse_filters('{"x":{"within":[[1]]}}')
180+
Traceback (most recent call last):
181+
...
182+
AssertionError: R('Range must be list of length 2', 'x')
183+
184+
>>> parse_filters('{"x":{"within":[[2,1]]}}')
185+
Traceback (most recent call last):
186+
...
187+
AssertionError: R('Range is inverted', 'x')
188+
189+
>>> parse_filters('{"x":{"within":[[1,2],["",""]]}}')
190+
Traceback (most recent call last):
191+
...
192+
AssertionError: R('Inconsistent range ends', 'x')
193+
194+
>>> parse_filters('{"x":{"within":[[1,1.1],[2,2.2]]}}')
195+
Traceback (most recent call last):
196+
...
197+
AssertionError: R('Inconsistent range ends', 'x')
198+
199+
>>> parse_filters('{"x":{"within":[[false,true]]}}')
200+
Traceback (most recent call last):
201+
...
202+
AssertionError: R('Invalid range end', 'x')
203+
204+
>>> parse_filters('{"x":{"within":[{}]}}')
205+
Traceback (most recent call last):
206+
...
207+
AssertionError: R('Value does not match operator', 'x', <class 'dict'>, 'within')
208+
209+
>>> parse_filters('{"x":{"within":[[1,2],[1,2]]}}')
210+
Traceback (most recent call last):
211+
...
212+
AssertionError: R('Duplicate ranges', 'x')
213+
214+
>>> parse_filters('{"x":{"is":[{}]}}')
215+
Traceback (most recent call last):
216+
...
217+
AssertionError: R('Empty object', 'x')
218+
219+
>>> parse_filters('{"x":{"is":[{"":1}]}}')
220+
Traceback (most recent call last):
221+
...
222+
AssertionError: R('Empty property name', 'x')
223+
224+
>>> parse_filters('{"x":{"is":[{"y":1},{"z":2}]}}')
225+
Traceback (most recent call last):
226+
...
227+
AssertionError: R('Inconsistent property names', 'x')
228+
229+
>>> parse_filters('{"x":{"is":[{"y":1,"z":[]}]}}')
230+
Traceback (most recent call last):
231+
...
232+
AssertionError: R('Invalid property value', 'x')
233+
234+
>>> parse_filters('{"x":{"is":[{"y":1,"z":2},{"y":"","z":3}]}}')
235+
Traceback (most recent call last):
236+
...
237+
AssertionError: R('Inconsistent property values', 'x')
238+
239+
>>> parse_filters('{"x":{"is":[{"a":1,"b":2},{"b":2,"a":1}]}}')
240+
Traceback (most recent call last):
241+
...
242+
AssertionError: R('Duplicate objects', 'x')
243+
"""
244+
if raw_filters is None:
245+
return {}
246+
else:
247+
filters = json.loads(raw_filters)
248+
assert type(filters) is dict, R('Filters must be an object')
249+
for field, filter in filters.items():
250+
assert len(field) > 0, R('Empty field name')
251+
assert type(filter) is dict, R('Filter must be an object', field)
252+
assert len(filter) == 1, R('Need exactly one filter per field', field)
253+
operator, values = one(filter.items())
254+
assert operator in _filter_operators, R('Invalid operator', field, operator)
255+
assert len(values) > 0, R('Need at least one value', field)
256+
num_value_types = set(map(type, values))
257+
num_value_types.discard(type(None))
258+
assert len(num_value_types) < 2, R('Inconsistent value types', field)
259+
value_type = only(num_value_types)
260+
mismatch = R('Value does not match operator', field, value_type, operator)
261+
if value_type is None:
262+
assert operator in {'is'}, mismatch
263+
elif value_type in _simple_filter_value_types:
264+
assert operator in {'is', 'contains'}, mismatch
265+
assert len(set(values)) == len(values), R('Duplicate values', field)
266+
elif value_type is list:
267+
assert operator in {'contains', 'within', 'intersects'}, mismatch
268+
for range in values:
269+
assert len(range) == 2, R('Range must be list of length 2', field)
270+
assert range[0] <= range[1], R('Range is inverted', field)
271+
assert len(set(map(tuple, values))) == len(values), R('Duplicate ranges', field)
272+
end_types = set(chain.from_iterable(map(partial(map, type), values)))
273+
assert len(end_types) == 1, R('Inconsistent range ends', field)
274+
end_type = one(end_types)
275+
assert end_type in _filter_range_end_types, R('Invalid range end', field)
276+
elif value_type is dict:
277+
assert operator == 'is', mismatch
278+
key_sets = set(map(frozenset, map(dict.keys, values)))
279+
assert len(key_sets) == 1, R('Inconsistent property names', field)
280+
keys = one(key_sets)
281+
assert len(keys) > 0, R('Empty object', field)
282+
assert '' not in keys, R('Empty property name', field)
283+
value_types_by_key = defaultdict(set)
284+
for value in values:
285+
for k, v in value.items():
286+
assert type(v) in _dict_filter_value_types, R('Invalid property value', field)
287+
if v is not None:
288+
value_types_by_key[k].add(type(v))
289+
num_value_types = set(map(len, value_types_by_key.values()))
290+
assert num_value_types == {1}, R('Inconsistent property values', field)
291+
# Sort each value dictionary in place by key (and value, but key
292+
# is already unique). This makes sorting the values and checking
293+
# their uniqueness easier.
294+
for value in values:
295+
sorted_value = dict(sorted(value.items()))
296+
value.clear()
297+
value.update(sorted_value)
298+
unique_values = set(map(tuple, map(dict.items, values)))
299+
assert len(unique_values) == len(values), R('Duplicate objects', field)
300+
else:
301+
assert False, R('Invalid value', field)
302+
303+
def key(v):
304+
if v is None:
305+
return False, v
306+
elif type(v) is dict:
307+
# The entries in the dict are alteady sorted by key, the
308+
# values are primitive so we just need to handle None values
309+
# and "freeze" the iterable of entries.
310+
return True, tuple((k, key(v)) for k, v in v.items())
311+
else:
312+
return True, v
313+
314+
values.sort(key=key)
315+
316+
filters = {k: v for k, v in sorted(filters.items())}
317+
318+
return cast(FiltersJSON, filters)
319+
58320

59321
@attr.s(auto_attribs=True, kw_only=True, frozen=True)
60322
class Filters:

src/azul/service/app_controller.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import json
21
from typing import (
32
Any,
43
Callable,
@@ -8,6 +7,7 @@
87
import attr
98
from chalice import (
109
BadRequestError as BRE,
10+
BadRequestError,
1111
NotFoundError,
1212
)
1313

@@ -22,6 +22,7 @@
2222
from azul.service import (
2323
FileUrlFunc,
2424
FiltersJSON,
25+
parse_filters,
2526
)
2627
from azul.strings import (
2728
pluralize,
@@ -33,14 +34,13 @@ class ServiceAppController(AppController):
3334
file_url_func: FileUrlFunc
3435

3536
def _parse_filters(self, filters: str | None) -> FiltersJSON:
36-
"""
37-
Parses a string with Azul filters in JSON syntax. Handles default cases
38-
where filters are None or '{}'.
39-
"""
40-
if filters is None:
41-
return {}
42-
else:
43-
return json.loads(filters)
37+
try:
38+
return parse_filters(filters)
39+
except AssertionError as e:
40+
if R.caused(e):
41+
raise R.propagate(e, BadRequestError)
42+
else:
43+
raise
4444

4545

4646
def validate_catalog(catalog):

src/azul/service/manifest_service.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,6 @@
110110
from azul.json import (
111111
copy_json,
112112
)
113-
from azul.json_freeze import (
114-
freeze,
115-
sort_frozen,
116-
)
117113
from azul.plugins import (
118114
ColumnMapping,
119115
DocumentSlice,
@@ -984,7 +980,9 @@ def manifest_key(self) -> ManifestKey:
984980
different return values.
985981
"""
986982
git_commit = config.lambda_git_status['commit']
987-
filter_string = repr(sort_frozen(freeze(self.filters.explicit)))
983+
# The explicit filters are already normalized so we don't to do anything
984+
# special to desensitize the hash to insignificat differences
985+
filter_string = json.dumps(self.filters.explicit)
988986
content_hash = str(self.manifest_content_hash)
989987
catalog = self.catalog
990988
format = self.format()

test/service/test_manifest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -923,11 +923,11 @@ def test_manifest_content_disposition_header(self):
923923
# a pair of deterministically derived v5 UUIDs.
924924
(
925925
{'project': {'is': ['Single of human pancreas', 'Mouse Melanoma']}},
926-
'hca-manifest-20d97863-d8cf-54f3-8575-0f9593d3d7ef.4bc67e84-4873-591f-b524-a5fe4ec215eb'
926+
'hca-manifest-89bc9973-de91-5fc4-9c6a-8c1f547d45c6.4bc67e84-4873-591f-b524-a5fe4ec215eb'
927927
),
928928
(
929929
{},
930-
'hca-manifest-c3cf398e-1927-5aae-ba2a-81d8d1800b2d.4bc67e84-4873-591f-b524-a5fe4ec215eb'
930+
'hca-manifest-832a257c-5540-567b-bcb6-260d2e374508.4bc67e84-4873-591f-b524-a5fe4ec215eb'
931931
)
932932
]:
933933
with self.subTest(filters=filters, format=format):

0 commit comments

Comments
 (0)