Small updates for compatibility with WebDataset pipelines #171
base: main
Conversation
✅ Result of Pytest Coverage: platform linux, python 3.10.0-final-0
Force-pushed from `6af7f96` to `cc55aa4`
MaxiBoether left a comment:
Thanks for the work! Let's clean this up a bit.
```python
import os
import argparse
import requests
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
```
Unnecessary changes, no?
| "CC12MDataset", | ||
| "MSCOCODataset", | ||
| "LAION400MDataset", | ||
| "COYO700MDataset", |
As discussed, this must change
```python
class MixteraDataPipeline(wds.DataPipeline):
    """
    Supports building arbitrary webdataset pipelines with Mixtera's `MixteraTorchDataset` as the data source.
    """

    def __init__(
        self,
        client: MixteraClient,
        query: Query,
        query_execution_args: QueryExecutionArgs,
        result_streaming_args: ResultStreamingArgs,
        pipeline: Iterable[Any],
    ):
        super().__init__(*pipeline)
        self.client = client
        self.query = query
        self.query_execution_args = query_execution_args
        self.result_streaming_args = result_streaming_args

        torch_dataset = MixteraTorchDataset(
            client=client,
            query=query,
```
I don't understand what this is necessary for. The MixteraTorchDataset is a user-facing abstraction.
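To illustrate the reviewer's point with a hedged sketch: if `MixteraTorchDataset` is already an iterable of samples, it may be usable directly as the source stage of a pipeline, with no wrapper subclass needed. The stand-in classes and the stage-composition helper below are illustrative assumptions, not Mixtera's or webdataset's actual API.

```python
from typing import Any, Iterable, Iterator

class FakeTorchDataset:
    """Stand-in for MixteraTorchDataset: just yields dict samples."""

    def __iter__(self) -> Iterator[dict[str, Any]]:
        for i in range(3):
            yield {"__key__": f"{i:06d}", "txt": f"caption {i}"}

def run_pipeline(source: Iterable[dict], *stages) -> list[dict]:
    """Minimal DataPipeline-style composition: each stage transforms an iterator."""
    it: Iterable = source
    for stage in stages:
        it = stage(it)
    return list(it)

def uppercase_txt(it: Iterable[dict]) -> Iterator[dict]:
    # An example downstream stage, analogous to a webdataset map step.
    for sample in it:
        yield {**sample, "txt": sample["txt"].upper()}

# The iterable dataset plugs straight into the pipeline as its first stage.
samples = run_pipeline(FakeTorchDataset(), uppercase_txt)
```

The design point: composing an existing iterable with pipeline stages avoids duplicating the dataset's constructor arguments in a second user-facing class.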
```python
def decode(sample: dict[str, Any], decode_image: bool = True) -> dict[str, Any]:
def decode_sample(sample: dict[str, Any]) -> dict[str, Any]:
```
It's odd to have a function called `decode_sample` which you can optionally tell not to decode. Can you find a cleaner way to disable image decoding?
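One cleaner design, sketched here as an assumption rather than Mixtera's actual API: build the decoder from composable per-field handlers, so image decoding is disabled by simply omitting the image handler instead of threading a boolean flag through `decode_sample`. All names below are illustrative.

```python
from typing import Any, Callable

def make_decoder(handlers: dict[str, Callable[[bytes], Any]]) -> Callable[[dict], dict]:
    """Build a decode function from per-field handlers; unhandled fields pass through."""

    def decode(sample: dict[str, Any]) -> dict[str, Any]:
        out = {}
        for key, value in sample.items():
            handler = handlers.get(key)
            out[key] = handler(value) if handler else value
        return out

    return decode

# Text-only decoding: no "jpg" handler registered, so image bytes stay raw.
text_only = make_decoder({"txt": lambda b: b.decode("utf-8")})

sample = {"__key__": "000001", "txt": b"a caption", "jpg": b"\xff\xd8"}
decoded = text_only(sample)
```

The flag disappears entirely: callers pick behavior by choosing which handlers to register.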
```python
class MMIndexedTarRawBytes(MMIndexedTar):
    """
    A subclass of `MMIndexedTar` that returns the raw bytes instead of an IOBytes object.
    """

    def get_file(self, i: int) -> tuple[str, bytes]:
        filename, data = self.get_at_index(i)
        return filename, data
```
Why is that necessary?
```python
return self.decoder(sample)
raise ValueError("Co")
sample[ext[1:]] = data
sample["__key__"] = key  # type: ignore
```
Why the `type: ignore`?
```python
self.add_metadata(sample_id=line_number, dataset=dataset_name)


class DomainNetMetadataParser(MetadataParser):
```
We need to clean up the datasets vs. metadata parsers here. I'm not sure why, in addition to the datasets you added, you also added this parser. Let's find a cleaner abstraction for this. You could have something like a "LlavaMetadataParser" that infers, e.g. from the path, which dataset the sample comes from. Or you could have one MetadataParser per dataset. You can actually register multiple datasets within the same data collection, so that should not be an issue either.
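The "infer the dataset from the path" idea could be sketched as follows. The path patterns and the `infer_dataset` helper are illustrative assumptions for this review, not Mixtera's actual API; only the dataset class names come from the diff above.

```python
import re

# Hypothetical mapping from path fragments to the dataset names seen in this PR.
DATASET_PATTERNS = {
    r"/cc12m/": "CC12MDataset",
    r"/mscoco/": "MSCOCODataset",
    r"/laion400m/": "LAION400MDataset",
    r"/coyo700m/": "COYO700MDataset",
}

def infer_dataset(path: str) -> str:
    """Return the dataset name whose path pattern matches, or raise if none does."""
    for pattern, name in DATASET_PATTERNS.items():
        if re.search(pattern, path):
            return name
    raise ValueError(f"cannot infer dataset from path: {path}")
```

A single parser built around a lookup like this would then call `add_metadata` with the inferred dataset name, instead of requiring one hard-coded parser class per dataset.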