@@ -765,6 +765,9 @@ class ImageDecoder : public StatelessOperator<Backend> {
765765
766766 // The image descriptors are created in parallel, in block-wise fashion.
767767 auto init_desc_task = [&](int start_sample, int end_sample) {
768+ DomainTimeRange tr (
769+ " Create images " + std::to_string (start_sample) + " .." + std::to_string (end_sample),
770+ DomainTimeRange::kOrange );
768771 for (int orig_idx = start_sample; orig_idx < end_sample; orig_idx++) {
769772 auto &st = *state_[orig_idx];
770773 if (use_cache && st.load_from_cache ) {
@@ -799,19 +802,24 @@ class ImageDecoder : public StatelessOperator<Backend> {
799802 } else {
800803 DomainTimeRange tr (" Create images" , DomainTimeRange::kOrange );
801804 // Many tasks? Run in thread pool.
802- // The first span of tasks is processed in the main operator thread.
803- for (int i = 1 ; i < ntasks; i++) {
804- int start = i * nsamples / ntasks;
805- int end = (i + 1 ) * nsamples / ntasks;
806- tp_->AddWork ([&, start, end](int ) {
805+ int block_idx = 0 ;
806+ atomic_idx_.store (0 );
807+ auto create_images_task = [&, nblocks](int tid) {
808+ int block_idx;
809+ while ((block_idx = atomic_idx_.fetch_add (1 )) < nblocks) {
810+ int64_t start = nsamples * block_idx / nblocks;
811+ int64_t end = nsamples * (block_idx + 1 ) / nblocks;
807812 init_desc_task (start, end);
808- });
813+ }
814+ };
815+
816+ for (int task_idx = 0 ; task_idx < ntasks - 1 ; task_idx++) {
817+ tp_->AddWork (create_images_task, -task_idx);
809818 }
810- // Start processing of subsequent segments...
811- tp_->RunAll (false );
812- // ...and process the 1st segment in this thread.
813- init_desc_task (0 , nsamples / ntasks);
814- tp_->WaitForWork ();
819+ assert (ntasks >= 2 );
820+ tp_->RunAll (false ); // start work but not wait
821+ create_images_task (-1 );
822+ tp_->WaitForWork (); // wait for the other threads
815823 }
816824
817825 bool any_need_processing = false ;
0 commit comments