Commit bb362a0

Performance improvement for processing (#146)
1 parent 62aeb58 commit bb362a0
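
Summary: `BinaryWriter.add_item` used to call `_should_write()` on every insertion, and `_should_write()` re-walked the contiguous run of serialized items from the minimum index each time, recomputing the byte and item counts from scratch. This commit caches that scan state in `_per_sample_num_bytes`, `_per_sample_num_items`, and `_max_index`, so each call resumes where the previous scan stopped, and skips the check entirely when the incoming index cannot extend the current run.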

File tree

src/litdata/streaming/writer.py
tests/streaming/test_writer.py

2 files changed: +68 −8 lines

src/litdata/streaming/writer.py

Lines changed: 42 additions & 8 deletions
```diff
@@ -99,6 +99,9 @@ def __init__(
         self._distributed_env = _DistributedEnv.detect()
         self._follow_tensor_dimension = follow_tensor_dimension
 
+        self._per_sample_num_bytes = 0
+        self._per_sample_num_items = 0
+
     @property
     def filled(self) -> bool:
         """Returns whether the caching phase is done."""
@@ -277,8 +280,9 @@ def __setitem__(self, index: int, items: Any) -> None:
         self.add_item(index, items)
 
     def add_item(self, index: int, items: Any) -> Optional[str]:
-        # Track the minimum index provided to the writer
-        # Serialize the items and store an Item object.
+        """Serialize the items for the given index and store an Item object in the
+        growing `_serialized_items` dictionary."""
+
         if index in self._serialized_items:
             raise ValueError(f"The provided index {index} already exists in the cache.")
 
@@ -289,23 +293,50 @@ def add_item(self, index: int, items: Any) -> Optional[str]:
             bytes=len(data),
             dim=dim,
         )
-
-        if not self._should_write():
+        if self._min_index is None:
+            # When processing the first item for the current chunk
+            indexes = list(self._serialized_items.keys())
+            self._max_index = self._min_index = indexes[0] if len(indexes) == 1 else min(*indexes)
+            self._per_sample_num_items = self._per_sample_num_bytes = 0
+            if not self._should_write():
+                return None
+        elif index < self._min_index:
+            # A smaller index arrived: reset the "temp" chunk
+            self._max_index = self._min_index = index
+            self._per_sample_num_items = self._per_sample_num_bytes = 0
+            if not self._should_write():
+                return None
+        elif index == self._max_index:
+            if not self._should_write():
+                return None
+        else:
             return None
+
         filepath = os.path.join(self._cache_dir, self.get_chunk_filename())
+
         self.write_chunk()
+
+        # Reset the per-chunk state
         self._min_index = None
         self._max_index = None
+        self._per_sample_num_bytes = 0
+        self._per_sample_num_items = 0
+
         return filepath
 
     def _should_write(self) -> bool:
         # TODO: Misleading method name, it modifies `self._min_index` and `self._max_index`!
         if not self._serialized_items:
             return False
-        indexes = list(self._serialized_items.keys())
-        self._min_index = index = indexes[0] if len(indexes) == 1 else min(*indexes)
-        num_bytes = 0
-        num_items = 0
+
+        if not isinstance(self._max_index, int):
+            return False
+
+        # The indexes in the interval `min_index` to `max_index` have already been
+        # validated to be in `_serialized_items`, so resume from the cached counters.
+        num_bytes = self._per_sample_num_bytes
+        num_items = self._per_sample_num_items
+        index = self._max_index
         while True:
             item = self._serialized_items.get(index, None)
             if item:
@@ -318,6 +349,9 @@ def _should_write(self) -> bool:
                     self._max_index = index - 1
                     return True
             else:
+                self._per_sample_num_bytes = num_bytes
+                self._per_sample_num_items = num_items
+                self._max_index = index
                 return False
 
     def write_chunk_to_file(
```
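
To make the new control flow easier to follow, here is a minimal, self-contained sketch of the caching pattern the diff above introduces. It is not litdata's implementation: `ToyWriter`, `add`, and `_maybe_flush` are hypothetical names, only the item-count budget is modelled (the real `BinaryWriter` also flushes on a byte budget), and "flushing" just returns the index range that would become a chunk.

```python
from typing import Dict, Optional


class ToyWriter:
    """Hypothetical, simplified stand-in for BinaryWriter's chunk accounting."""

    def __init__(self, chunk_size: int) -> None:
        self.chunk_size = chunk_size
        self.items: Dict[int, bytes] = {}  # parked serialized samples, keyed by index
        self.min_index: Optional[int] = None  # first index of the current run
        self.max_index: Optional[int] = None  # where the previous scan stopped
        self.num_items = 0  # cached count of items already scanned

    def add(self, index: int, payload: bytes) -> Optional[range]:
        if index in self.items:
            raise ValueError(f"The provided index {index} already exists.")
        self.items[index] = payload
        if self.min_index is None:
            # First item for the current chunk: start a new run.
            self.min_index = self.max_index = min(self.items)
            self.num_items = 0
        elif index < self.min_index:
            # A smaller index arrived late: restart the run from it.
            self.min_index = self.max_index = index
            self.num_items = 0
        elif index != self.max_index:
            # Cannot extend the contiguous run; leave the item parked.
            return None
        return self._maybe_flush()

    def _maybe_flush(self) -> Optional[range]:
        # Resume at the cached position instead of re-walking the whole
        # run from `min_index` on every call (the old behaviour).
        index, num_items = self.max_index, self.num_items
        while index in self.items:
            num_items += 1
            index += 1
            if num_items >= self.chunk_size:
                chunk = range(self.min_index, index)
                for i in chunk:  # a real writer would serialize these to disk
                    del self.items[i]
                self.min_index = self.max_index = None
                self.num_items = 0
                return chunk
        # Not enough for a full chunk yet: remember how far we scanned.
        self.max_index, self.num_items = index, num_items
        return None
```

With the old scan, each `add_item` call was O(n) in the length of the pending run, so filling a chunk was quadratic in its size; the cached counters make it amortized O(1) per item, which is presumably where the performance improvement in this commit comes from.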

tests/streaming/test_writer.py

Lines changed: 26 additions & 0 deletions
```diff
@@ -226,3 +226,29 @@ def test_writer_human_format(tmpdir):
 
     binary_writer = BinaryWriter(tmpdir, chunk_bytes="64MB")
     assert binary_writer._chunk_bytes == 64000000
+
+
+def test_writer_unordered_indexes(tmpdir):
+    cache_dir = os.path.join(tmpdir, "chunks")
+    os.makedirs(cache_dir, exist_ok=True)
+
+    binary_writer = BinaryWriter(cache_dir, chunk_size=5)
+
+    arr = [2, 3, 1, 4, 6, 5, 7, 8, 11, 9, 10, 12]
+
+    for i in arr:
+        binary_writer[i] = i - 1
+
+    binary_writer.done()
+    binary_writer.merge()
+
+    reader = BinaryReader(cache_dir)
+    for i in range(12):
+        assert i == reader.read(ChunkedIndex(i, chunk_index=i // 5))
+
+    with open(os.path.join(cache_dir, "index.json")) as f:
+        data = json.load(f)
+
+    assert data["chunks"][0]["chunk_size"] == 5
+    assert data["chunks"][1]["chunk_size"] == 5
+    assert data["chunks"][2]["chunk_size"] == 2
```
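
The test feeds the writer out-of-order indexes and then checks that, after `done()` and `merge()`, the samples read back in index order and pack into sequential chunks of sizes 5, 5, and 2. Driving the hypothetical `ToyWriter` sketch above with the same arrival order reproduces those boundaries:

```python
writer = ToyWriter(chunk_size=5)
for i in [2, 3, 1, 4, 6, 5, 7, 8, 11, 9, 10, 12]:
    chunk = writer.add(i, payload=b"x")
    if chunk is not None:
        print(list(chunk))
# Prints [1, 2, 3, 4, 5], then [6, 7, 8, 9, 10]. The two parked items
# {11, 12} mirror the final partial chunk that `done()` writes, i.e. the
# chunk with chunk_size == 2 asserted above.
```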
