Skip to content

Commit fd02e32

Browse files
committed
[r] Fix: AnVIL uses uncommon encoding for MD5 digests (#7154)
1 parent 28cb74e commit fd02e32

File tree

8 files changed

+60
-44
lines changed

8 files changed

+60
-44
lines changed

src/azul/plugins/repository/tdr_anvil/__init__.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,15 @@ def list_files(self, source: TDRSourceRef, prefix: str) -> list[AnvilFile]:
351351
batch = self._get_batch(source.spec,
352352
'anvil_file',
353353
prefix,
354-
key_column='file_md5sum')
354+
key_column=self._column_from_64_to_hex('file_md5sum'))
355+
356+
# Report whether a row has no MD5 digest (its 'file_md5sum' value is None).
# The list comprehension further down uses this to leave such rows out of the
# returned files (`if not missing_md5(row)`).
def missing_md5(row: BigQueryRow) -> bool:
357+
missing = row['file_md5sum'] is None
358+
if missing:
359+
# A missing digest is only accepted for this one named source spec; for
# any other source the assertion fails, with its message built by `R` from
# the source and the row contents.
# NOTE(review): `assert` statements are removed when Python runs with -O,
# so this check would not fire in that mode — confirm that is acceptable.
assert source.spec.name == 'ANVIL_1000G_2019_Dev_20230609_ANV5_202306121732', R(
360+
'File lacks MD5 digest', source, dict(row))
361+
return missing
362+
355363
return [
356364
AnvilFile(uuid=ref.entity_id,
357365
name=row['file_name'],
@@ -360,6 +368,7 @@ def list_files(self, source: TDRSourceRef, prefix: str) -> list[AnvilFile]:
360368
md5=row['file_md5sum'],
361369
drs_uri=row['file_ref'])
362370
for ref, row in batch
371+
if not missing_md5(row)
363372
]
364373

365374
def _emulate_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle:
@@ -953,4 +962,11 @@ def _columns(self, table_name: str) -> set[str]:
953962
else:
954963
columns = set(columns)
955964
columns.add('datarepo_row_id')
965+
if table_name == 'anvil_file':
966+
column = 'file_md5sum'
967+
columns.remove(column)
968+
columns.add(f'{self._column_from_64_to_hex(column)} AS {column}')
956969
return columns
970+
971+
# Build the SQL expression that wraps `column` in FROM_BASE64 and then TO_HEX,
# i.e. decodes a Base64 value and re-encodes it as a hexadecimal string.
# `column` is interpolated verbatim; callers in this diff pass the literal
# column name 'file_md5sum'.
def _column_from_64_to_hex(self, column: str) -> str:
972+
return f'TO_HEX(FROM_BASE64({column}))'

test/indexer/data/595c469e-604d-ab34-af39-f5b9f5d61818.tdr.anvil.json

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json

Lines changed: 28 additions & 28 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.tdr.anvil.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/service/data/manifest/verbatim/jsonl/anvil/linked.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@
127127
"datarepo_row_id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6",
128128
"file_format": ".vcf.gz",
129129
"file_id": "1e269f04-4347-4188-b060-1dcc69e71d67",
130-
"file_md5sum": "vuxgbuCqKZ/fkT9CWTFmIg==",
130+
"file_md5sum": "beec606ee0aa299fdf913f4259316622",
131131
"file_name": "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz",
132132
"file_ref": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67",
133133
"file_size": 213021639,
@@ -149,7 +149,7 @@
149149
"datarepo_row_id": "3b17377b-16b1-431c-9967-e5d01fc5923f",
150150
"file_format": ".bam",
151151
"file_id": "8b722e88-8103-49c1-b351-e64fa7c6ab37",
152-
"file_md5sum": "fNn9e1SovzgOROk3BvH6LQ==",
152+
"file_md5sum": "7cd9fd7b54a8bf380e44e93706f1fa2d",
153153
"file_name": "307500.merged.matefixed.sorted.markeddups.recal.bam",
154154
"file_ref": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37",
155155
"file_size": 3306845592,
@@ -210,7 +210,7 @@
210210
"data_modality": [],
211211
"datarepo_row_id": "6b0f6c0f-5d80-4242-accb-840921351cd5",
212212
"file_format": ".txt",
213-
"file_md5sum": "S/GBrRjzZAQYqh3rdiPYzA==",
213+
"file_md5sum": "4bf181ad18f3640418aa1deb7623d8cc",
214214
"file_id": "1fab11f5-7eab-4318-9a58-68d8d06e0715",
215215
"file_name": "CCDG_13607_B01_GRM_WGS_2019-02-19_chr15.recalibrated_variants.annotated.coding.txt",
216216
"file_ref": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1fab11f5-7eab-4318-9a58-68d8d06e0715",

test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@
294294
"drs_uri": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1fab11f5-7eab-4318-9a58-68d8d06e0715",
295295
"file_format": ".txt",
296296
"file_id": "1fab11f5-7eab-4318-9a58-68d8d06e0715",
297-
"file_md5sum": "S/GBrRjzZAQYqh3rdiPYzA==",
297+
"file_md5sum": "4bf181ad18f3640418aa1deb7623d8cc",
298298
"file_name": "CCDG_13607_B01_GRM_WGS_2019-02-19_chr15.recalibrated_variants.annotated.coding.txt",
299299
"file_ref": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1fab11f5-7eab-4318-9a58-68d8d06e0715",
300300
"file_size": 15079345,
@@ -419,7 +419,7 @@
419419
"drs_uri": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37",
420420
"file_format": ".bam",
421421
"file_id": "8b722e88-8103-49c1-b351-e64fa7c6ab37",
422-
"file_md5sum": "fNn9e1SovzgOROk3BvH6LQ==",
422+
"file_md5sum": "7cd9fd7b54a8bf380e44e93706f1fa2d",
423423
"file_name": "307500.merged.matefixed.sorted.markeddups.recal.bam",
424424
"file_ref": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37",
425425
"file_size": 3306845592,

test/service/test_manifest.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1691,9 +1691,9 @@ def test_compact_manifest(self):
16911691
),
16921692
(
16931693
'files.file_md5sum',
1694-
'S/GBrRjzZAQYqh3rdiPYzA==',
1695-
'vuxgbuCqKZ/fkT9CWTFmIg==',
1696-
'fNn9e1SovzgOROk3BvH6LQ=='
1694+
'4bf181ad18f3640418aa1deb7623d8cc',
1695+
'beec606ee0aa299fdf913f4259316622',
1696+
'7cd9fd7b54a8bf380e44e93706f1fa2d'
16971697
),
16981698
(
16991699
'files.reference_assembly',

0 commit comments

Comments
 (0)