 #include "dali/pipeline/operator/common.h"
 #include "dali/pipeline/operator/operator.h"

+
 #if not(WITH_DYNAMIC_NVIMGCODEC_ENABLED)
 nvimgcodecStatus_t get_libjpeg_turbo_extension_desc(nvimgcodecExtensionDesc_t *ext_desc);
 nvimgcodecStatus_t get_libtiff_extension_desc(nvimgcodecExtensionDesc_t *ext_desc);
@@ -674,58 +675,79 @@ class ImageDecoder : public StatelessOperator<Backend> {
     TensorListShape<> out_shape(nsamples, 3);

     const bool use_cache = cache_ && cache_->IsCacheEnabled() && dtype_ == DALI_UINT8;
-    auto setup_block = [&](int block_idx, int nblocks, int tid) {
-      int i_start = nsamples * block_idx / nblocks;
-      int i_end = nsamples * (block_idx + 1) / nblocks;
-      DomainTimeRange tr("Setup #" + std::to_string(block_idx) + "/" + std::to_string(nblocks),
-                         DomainTimeRange::kOrange);
-      for (int i = i_start; i < i_end; i++) {
-        auto *st = state_[i].get();
-        st->image_info.buffer = nullptr;
-        assert(st != nullptr);
-        const auto &input_sample = input[i];
-
-        auto src_info = input.GetMeta(i).GetSourceInfo();
-        if (use_cache && cache_->IsInCache(src_info)) {
-          auto cached_shape = cache_->CacheImageShape(src_info);
-          auto roi = GetRoi(spec_, ws, i, cached_shape);
-          if (!roi.use_roi()) {
-            out_shape.set_tensor_shape(i, st->out_shape);
-            st->load_from_cache = true;
-            continue;
-          }
-        }
-        st->load_from_cache = false;
-        ParseSample(st->parsed_sample,
-                    span<const uint8_t>{static_cast<const uint8_t *>(input_sample.raw_data()),
-                                        volume(input_sample.shape())});
-        st->sub_encoded_stream.reset();
-        st->out_shape = st->parsed_sample.dali_img_info.shape;
-        st->out_shape[2] = NumberOfChannels(format_, st->out_shape[2]);
-        if (use_orientation_ &&
-            (st->parsed_sample.nvimgcodec_img_info.orientation.rotated % 180 != 0)) {
-          std::swap(st->out_shape[0], st->out_shape[1]);
+    auto setup_sample = [&](int sample_idx, int tid) {
+      auto *st = state_[sample_idx].get();
+      assert(st != nullptr);
+      st->image_info.buffer = nullptr;
+      const auto &input_sample = input[sample_idx];
+
+      auto src_info = input.GetMeta(sample_idx).GetSourceInfo();
+      if (use_cache && cache_->IsInCache(src_info)) {
+        auto cached_shape = cache_->CacheImageShape(src_info);
+        auto roi = GetRoi(spec_, ws, sample_idx, cached_shape);
+        if (!roi.use_roi()) {
+          out_shape.set_tensor_shape(sample_idx, st->out_shape);
+          st->load_from_cache = true;
+          return;
         }
+      }
+      st->load_from_cache = false;
+      ParseSample(st->parsed_sample,
+                  span<const uint8_t>{static_cast<const uint8_t *>(input_sample.raw_data()),
+                                      volume(input_sample.shape())});
+      st->sub_encoded_stream.reset();
+      st->out_shape = st->parsed_sample.dali_img_info.shape;
+      st->out_shape[2] = NumberOfChannels(format_, st->out_shape[2]);
+      if (use_orientation_ &&
+          (st->parsed_sample.nvimgcodec_img_info.orientation.rotated % 180 != 0)) {
+        std::swap(st->out_shape[0], st->out_shape[1]);
+      }

-        ROI &roi = rois_[i] = GetRoi(spec_, ws, i, st->out_shape);
-        if (roi.use_roi()) {
-          auto roi_sh = roi.shape();
-          if (roi.end.size() >= 2) {
-            DALI_ENFORCE(0 <= roi.end[0] && roi.end[0] <= st->out_shape[0] && 0 <= roi.end[1] &&
-                             roi.end[1] <= st->out_shape[1],
-                         "ROI end must fit within the image bounds");
-          }
-          if (roi.begin.size() >= 2) {
-            DALI_ENFORCE(0 <= roi.begin[0] && roi.begin[0] <= st->out_shape[0] &&
-                             0 <= roi.begin[1] && roi.begin[1] <= st->out_shape[1],
-                         "ROI begin must fit within the image bounds");
-          }
-          st->out_shape[0] = roi_sh[0];
-          st->out_shape[1] = roi_sh[1];
+      ROI &roi = rois_[sample_idx] = GetRoi(spec_, ws, sample_idx, st->out_shape);
+      if (roi.use_roi()) {
+        auto roi_sh = roi.shape();
+        if (roi.end.size() >= 2) {
+          DALI_ENFORCE(0 <= roi.end[0] && roi.end[0] <= st->out_shape[0] && 0 <= roi.end[1] &&
+                           roi.end[1] <= st->out_shape[1],
+                       "ROI end must fit within the image bounds");
+        }
+        if (roi.begin.size() >= 2) {
+          DALI_ENFORCE(0 <= roi.begin[0] && roi.begin[0] <= st->out_shape[0] &&
+                           0 <= roi.begin[1] && roi.begin[1] <= st->out_shape[1],
+                       "ROI begin must fit within the image bounds");
+        }
         }
-        out_shape.set_tensor_shape(i, st->out_shape);
-        PrepareOutput(*state_[i], rois_[i], ws);
-        assert(!ws.has_stream() || ws.stream() == st->image_info.cuda_stream);
+        st->out_shape[0] = roi_sh[0];
+        st->out_shape[1] = roi_sh[1];
+      }
+      out_shape.set_tensor_shape(sample_idx, st->out_shape);
+      PrepareOutput(*state_[sample_idx], rois_[sample_idx], ws);
+      assert(!ws.has_stream() || ws.stream() == st->image_info.cuda_stream);
+    };
+
+    // The image descriptors are created in parallel, in a block-wise fashion.
+    auto init_desc_task = [&](int sample_idx) {
+      auto &st = *state_[sample_idx];
+      if (use_cache && st.load_from_cache) {
+        return;
+      }
+      if (!st.need_processing) {
+        st.image_info.buffer = output.raw_mutable_tensor(sample_idx);
+      }
+      st.image = NvImageCodecImage::Create(instance_, &st.image_info);
+      if (rois_[sample_idx].use_roi()) {
+        auto &roi = rois_[sample_idx];
+        nvimgcodecCodeStreamView_t cs_view = {
+            NVIMGCODEC_STRUCTURE_TYPE_CODE_STREAM_VIEW,
+            sizeof(nvimgcodecCodeStreamView_t),
+            nullptr,
+            0,  // image_idx
+            {NVIMGCODEC_STRUCTURE_TYPE_REGION, sizeof(nvimgcodecRegion_t), nullptr, 2}};
+        cs_view.region.start[0] = roi.begin[0];
+        cs_view.region.start[1] = roi.begin[1];
+        cs_view.region.end[0] = roi.end[0];
+        cs_view.region.end[1] = roi.end[1];
+        st.sub_encoded_stream = NvImageCodecCodeStream::FromSubCodeStream(
+            st.parsed_sample.encoded_stream.get(), &cs_view);
       }
     };

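For reference, the ROI handling in `setup_sample` above validates the requested region against the decoded image bounds and then shrinks the output shape to the region's extent. A minimal standalone sketch of that logic, not DALI code: the `Roi` struct and `ApplyRoi` helper are hypothetical, with plain `assert`s standing in for `DALI_ENFORCE`:

```cpp
#include <array>
#include <cassert>
#include <cstdint>

// Hypothetical stand-in for DALI's ROI: begin/end in (height, width) order.
struct Roi {
  std::array<int64_t, 2> begin{0, 0}, end{0, 0};
  bool use_roi = false;
};

// Mirrors setup_sample: validate the ROI against the image bounds, then
// shrink the output shape (H, W, C) to the ROI extent.
std::array<int64_t, 3> ApplyRoi(std::array<int64_t, 3> shape, const Roi &roi) {
  if (!roi.use_roi) return shape;
  for (int d = 0; d < 2; d++) {
    assert(0 <= roi.begin[d] && roi.begin[d] <= shape[d]);  // DALI_ENFORCE in the operator
    assert(0 <= roi.end[d] && roi.end[d] <= shape[d]);
    shape[d] = roi.end[d] - roi.begin[d];
  }
  return shape;
}

int main() {
  Roi roi{{10, 20}, {110, 220}, true};
  auto out = ApplyRoi({480, 640, 3}, roi);
  assert(out[0] == 100 && out[1] == 200 && out[2] == 3);
}
```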
@@ -734,91 +756,70 @@ class ImageDecoder : public StatelessOperator<Backend> {
     int ntasks = std::min<int>(nblocks, std::min<int>(8, tp_->NumThreads() + 1));

     if (ntasks < 2) {
+      // run all in current thread
       DomainTimeRange tr("Setup", DomainTimeRange::kOrange);
-      setup_block(0, 1, -1);  // run all in current thread
+      {
+        DomainTimeRange tr("Parse", DomainTimeRange::kOrange);
+        for (int sample_idx = 0; sample_idx < nsamples; sample_idx++) {
+          setup_sample(sample_idx, -1);
+        }
+      }
+      {
+        DomainTimeRange tr("Alloc output", DomainTimeRange::kOrange);
+        output.Resize(out_shape);
+      }
+      {
+        DomainTimeRange tr("Create images", DomainTimeRange::kOrange);
+        for (int sample_idx = 0; sample_idx < nsamples; sample_idx++) {
+          init_desc_task(sample_idx);
+        }
+      }
     } else {
+      // run in parallel
       int block_idx = 0;
-      atomic_idx_.store(0);
-      auto setup_task = [&, nblocks](int tid) {
+      // relaxed, only need atomicity, not ordering
+      atomic_idx_.store(0, std::memory_order_relaxed);
+      parse_barrier_.Reset(ntasks);
+      alloc_output_barrier_.Reset(ntasks);
+      auto setup_task = [&](int tid) {
+        int sample_idx;
         DomainTimeRange tr("Setup", DomainTimeRange::kOrange);
-        int block_idx;
-        while ((block_idx = atomic_idx_.fetch_add(1)) < nblocks) {
-          setup_block(block_idx, nblocks, tid);
+        {
+          DomainTimeRange tr("Parse", DomainTimeRange::kOrange);
+          while ((sample_idx = atomic_idx_.fetch_add(1, std::memory_order_relaxed)) < nsamples) {
+            setup_sample(sample_idx, tid);
+          }
         }
-      };
-
-      for (int task_idx = 0; task_idx < ntasks - 1; task_idx++) {
-        tp_->AddWork(setup_task, -task_idx);
-      }
-      assert(ntasks >= 2);
-      tp_->RunAll(false);  // start work but not wait
-      setup_task(-1);  // last task in current thread
-      tp_->WaitForWork();  // wait for the other threads
-    }
-
-    // Allocate the memory for the outputs...
-    {
-      DomainTimeRange tr("Alloc output", DomainTimeRange::kOrange);
-      output.Resize(out_shape);
-    }
-    // ... and create image descriptors.
+        parse_barrier_.ArriveAndWait();  // wait until parsing is done

-    // The image descriptors are created in parallel, in block-wise fashion.
-    auto init_desc_task = [&](int start_sample, int end_sample) {
-      DomainTimeRange tr(
-          "Create images " + std::to_string(start_sample) + ".." + std::to_string(end_sample),
-          DomainTimeRange::kOrange);
-      for (int orig_idx = start_sample; orig_idx < end_sample; orig_idx++) {
-        auto &st = *state_[orig_idx];
-        if (use_cache && st.load_from_cache) {
-          continue;
-        }
-        if (!st.need_processing) {
-          st.image_info.buffer = output.raw_mutable_tensor(orig_idx);
-        }
-        st.image = NvImageCodecImage::Create(instance_, &st.image_info);
-        if (rois_[orig_idx].use_roi()) {
-          auto &roi = rois_[orig_idx];
-          nvimgcodecCodeStreamView_t cs_view = {
-              NVIMGCODEC_STRUCTURE_TYPE_CODE_STREAM_VIEW,
-              sizeof(nvimgcodecCodeStreamView_t),
-              nullptr,
-              0,  // image_idx
-              {NVIMGCODEC_STRUCTURE_TYPE_REGION, sizeof(nvimgcodecRegion_t), nullptr, 2}};
-          cs_view.region.start[0] = roi.begin[0];
-          cs_view.region.start[1] = roi.begin[1];
-          cs_view.region.end[0] = roi.end[0];
-          cs_view.region.end[1] = roi.end[1];
-          st.sub_encoded_stream = NvImageCodecCodeStream::FromSubCodeStream(
-              st.parsed_sample.encoded_stream.get(), &cs_view);
+        if (tid == -1) {
+          DomainTimeRange tr("Alloc output", DomainTimeRange::kOrange);
+          output.Resize(out_shape);
+          atomic_idx_.store(0, std::memory_order_relaxed);
+          alloc_output_barrier_.Arrive();  // No need to wait here, we are in the main thread
+        } else {
+          alloc_output_barrier_.ArriveAndWait();  // wait until allocation is done
         }
-      }
-    };

-    // Just one task? Run it in this thread!
-    if (ntasks < 2) {
-      DomainTimeRange tr("Create images", DomainTimeRange::kOrange);
-      init_desc_task(0, nsamples);
-    } else {
-      DomainTimeRange tr("Create images", DomainTimeRange::kOrange);
-      // Many tasks? Run in thread pool.
-      int block_idx = 0;
-      atomic_idx_.store(0);
-      auto create_images_task = [&, nblocks](int tid) {
-        int block_idx;
-        while ((block_idx = atomic_idx_.fetch_add(1)) < nblocks) {
-          int64_t start = nsamples * block_idx / nblocks;
-          int64_t end = nsamples * (block_idx + 1) / nblocks;
-          init_desc_task(start, end);
+        // Create image descriptors
+        {
+          DomainTimeRange tr("Create images", DomainTimeRange::kOrange);
+          while ((sample_idx = atomic_idx_.fetch_add(16, std::memory_order_relaxed)) < nsamples) {
+            int sample_start = sample_idx;
+            int sample_end = std::min(sample_idx + 16, nsamples);
+            for (int i = sample_start; i < sample_end; i++) {
+              init_desc_task(i);
+            }
+          }
         }
       };

       for (int task_idx = 0; task_idx < ntasks - 1; task_idx++) {
-        tp_->AddWork(create_images_task, -task_idx);
+        tp_->AddWork(setup_task, -task_idx);
       }
       assert(ntasks >= 2);
       tp_->RunAll(false);  // start work but not wait
-      create_images_task(-1);
+      setup_task(-1);  // last task in current thread
       tp_->WaitForWork();  // wait for the other threads
     }

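The restructured parallel path above replaces fixed block partitioning with dynamic work claiming: each task repeatedly `fetch_add`s a shared atomic index (relaxed ordering is enough, since the mutex-based barriers provide the inter-phase synchronization), and the descriptor-creation phase claims chunks of 16 samples to reduce contention on the counter. Below is a self-contained sketch of the same pattern; `std::barrier` (C++20) plays the role of the PR's hand-rolled `ThreadBarrier`, and `Parse`, `CreateDescriptor`, `kNumSamples`, and `kNumTasks` are hypothetical stand-ins:

```cpp
#include <algorithm>
#include <atomic>
#include <barrier>
#include <thread>
#include <vector>

constexpr int kNumSamples = 100;  // hypothetical batch size
constexpr int kNumTasks = 4;      // hypothetical ntasks

std::atomic<int> next_idx{0};

void Parse(int /*sample*/) {}             // stand-in for setup_sample
void CreateDescriptor(int /*sample*/) {}  // stand-in for init_desc_task

int main() {
  // The completion function runs in exactly one thread once all participants
  // have arrived; it plays the role of the main thread's "Alloc output" step
  // and resets the shared index for the next phase.
  std::barrier sync(kNumTasks, []() noexcept {
    // output.Resize(out_shape) would go here
    next_idx.store(0, std::memory_order_relaxed);
  });

  auto task = [&] {
    int i;
    // Phase 1: claim one sample at a time; relaxed ordering suffices because
    // each index only has to be handed out exactly once.
    while ((i = next_idx.fetch_add(1, std::memory_order_relaxed)) < kNumSamples)
      Parse(i);
    sync.arrive_and_wait();
    // Phase 2: claim chunks of 16 samples to reduce traffic on the counter.
    while ((i = next_idx.fetch_add(16, std::memory_order_relaxed)) < kNumSamples)
      for (int j = i, end = std::min(i + 16, kNumSamples); j < end; j++)
        CreateDescriptor(j);
  };

  std::vector<std::thread> threads;
  for (int t = 1; t < kNumTasks; t++) threads.emplace_back(task);
  task();  // like the PR, run one task in the current thread
  for (auto &t : threads) t.join();
}
```

Batching the second phase by 16 trades a little load-balancing granularity for 16x fewer atomic read-modify-writes on the shared counter.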
@@ -985,6 +986,56 @@ class ImageDecoder : public StatelessOperator<Backend> {
   std::vector<nvimgcodecExtension_t> extensions_;

   std::vector<std::function<void(int)>> nvimgcodec_scheduled_tasks_;
+
+  class ThreadBarrier {
+   public:
+    explicit ThreadBarrier(std::size_t count) : count_(count), current_(count) {}
+    // Arrive without blocking; the last arrival wakes up all waiting threads.
+    void Arrive() {
+      std::unique_lock<std::mutex> lock(lock_);
+      if (current_ == 0) {
+        throw std::logic_error("barrier is already completed");
+      }
+      current_--;
+      if (current_ == 0) {
+        phase_++;
+        cv_.notify_all();
+      }
+    }
+    void ArriveAndWait(bool reset = false) {
+      std::unique_lock<std::mutex> lock(lock_);
+      if (current_ == 0) {
+        throw std::logic_error("barrier is already completed");
+      }
+      current_--;
+      if (current_ == 0) {
+        if (reset)
+          current_ = count_;
+        phase_++;
+        cv_.notify_all();
+      } else {
+        // Wait for the phase to advance rather than for current_ == 0, so that
+        // a reset performed by the last arrival cannot leave waiters blocked.
+        std::size_t phase = phase_;
+        cv_.wait(lock, [this, phase] { return phase_ != phase; });
+      }
+    }
+    void Reset(std::size_t count) {
+      std::lock_guard<std::mutex> lock(lock_);
+      count_ = count;
+      current_ = count;
+    }
+
+   private:
+    std::mutex lock_;
+    std::condition_variable cv_;
+    size_t count_;
+    size_t current_;
+    size_t phase_ = 0;
+  };
+
+  ThreadBarrier parse_barrier_{0};
+  ThreadBarrier alloc_output_barrier_{0};
 };

 }  // namespace imgcodec
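A minimal usage sketch of `ThreadBarrier`, a hand-rolled pre-C++20 counterpart of `std::barrier` with an extra non-blocking `Arrive()`. This assumes the class above is in scope; `AllocateSharedOutput`, `RunBatch`, and the worker body are hypothetical. It mirrors how the operator drives the barrier: `Reset(n)` before each batch, workers call `ArriveAndWait()`, and the main thread calls `Arrive()` because it produces the result the workers wait for and has no reason to block:

```cpp
#include <thread>
#include <vector>

void AllocateSharedOutput() {}  // hypothetical: e.g. output.Resize(out_shape)

void RunBatch(ThreadBarrier &barrier, int nworkers) {
  barrier.Reset(nworkers + 1);  // the workers and the main thread all participate
  std::vector<std::thread> workers;
  for (int t = 0; t < nworkers; t++) {
    workers.emplace_back([&] {
      // Blocks until all nworkers + 1 participants have arrived, i.e. until
      // the main thread has finished the allocation below.
      barrier.ArriveAndWait();
      // ... work that may touch the shared output ...
    });
  }
  AllocateSharedOutput();
  barrier.Arrive();  // non-blocking: the main thread moves on to its own work
  for (auto &t : workers) t.join();
}
```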