
Commit f50c87f

feat(engine): Prefetch inputs of Merge pipeline concurrently (#19039)
This PR adds prefetching to the inputs of the Merge node. The Merge is a pipeline that takes N inputs and sequentially consumes each one of them, completely exhausting an input before moving to the next one. However, the inputs do not prefetch any data, so calling Read() may wait on I/O (fetching data from object storage, decoding, etc.). Prefetching allows the next batch to be fetched while the caller node is still processing the current batch.

This PR changes the behaviour of the Merge so that its inputs are prefetched. Since the inputs of the Merge pipeline are read sequentially, the current input is always prefetched. The setting `-querier.engine.merge-prefetch-count` controls how many of the next inputs are prefetched concurrently.

Signed-off-by: Christian Haudum <[email protected]>
Co-authored-by: Ashwanth Goli <[email protected]>
1 parent 04f8929 commit f50c87f
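For reference, a minimal sketch of how the new setting might be written in a Loki configuration file. The exact nesting of the `engine` block is an assumption inferred from the `-querier.engine.` flag prefix; only the key name and default come from this PR:

```yaml
querier:
  engine:
    # Assumed placement, mirroring the -querier.engine.* flag prefix.
    batch_size: 100
    # Prefetch the current input and the next two inputs of each Merge node.
    merge_prefetch_count: 2
```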

File tree

6 files changed, +210 -111 lines changed

docs/sources/shared/configuration.md

Lines changed: 7 additions & 0 deletions
```diff
@@ -4913,6 +4913,13 @@ engine:
   # CLI flag: -querier.engine.batch-size
   [batch_size: <int> | default = 100]
 
+  # Experimental: The number of inputs that are prefetched simultaneously by any
+  # Merge node. A value of 0 means that only the currently processed input is
+  # prefetched, 1 means that only the next input is prefetched, and so on. A
+  # negative value means that all inputs are prefetched in parallel.
+  # CLI flag: -querier.engine.merge-prefetch-count
+  [merge_prefetch_count: <int> | default = 0]
+
   # Experimental: Maximum total size of future pages for DataObjScan to download
   # before they are needed, for roundtrip reduction to object storage. Setting
   # to zero disables downloading future pages. Only used in the next generation
```

pkg/engine/engine.go

Lines changed: 1 addition & 0 deletions
```diff
@@ -194,6 +194,7 @@ func (e *QueryEngine) Execute(ctx context.Context, params logql.Params) (logqlmo
 
 	cfg := executor.Config{
 		BatchSize:                int64(e.opts.BatchSize),
+		MergePrefetchCount:       e.opts.MergePrefetchCount,
 		Bucket:                   e.bucket,
 		DataobjScanPageCacheSize: int64(e.opts.DataobjScanPageCacheSize),
 	}
```
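The `MergePrefetchCount` option presumably reaches `e.opts` via the engine's flag registration, which is not part of this excerpt. A hedged sketch of what that registration could look like; the `Opts` struct shape and `RegisterFlags` signature are assumptions, while the flag name, default, and help text come from the docs diff above:

```go
package engine

import "flag"

// Opts is a sketch only: the struct name and RegisterFlags signature are
// assumptions; the flag name, default, and help text come from this PR.
type Opts struct {
	BatchSize                int
	MergePrefetchCount       int
	DataobjScanPageCacheSize int
}

func (o *Opts) RegisterFlags(f *flag.FlagSet) {
	f.IntVar(&o.MergePrefetchCount, "querier.engine.merge-prefetch-count", 0,
		"Experimental: The number of inputs that are prefetched simultaneously by any Merge node. "+
			"A value of 0 means that only the currently processed input is prefetched, 1 means that "+
			"only the next input is prefetched, and so on. A negative value means that all inputs "+
			"are prefetched in parallel.")
}
```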

pkg/engine/executor/executor.go

Lines changed: 9 additions & 5 deletions
```diff
@@ -27,14 +27,16 @@ type Config struct {
 	Bucket objstore.Bucket
 
 	DataobjScanPageCacheSize int64
+	MergePrefetchCount       int
 }
 
 func Run(ctx context.Context, cfg Config, plan *physical.Plan, logger log.Logger) Pipeline {
 	c := &Context{
-		plan:      plan,
-		batchSize: cfg.BatchSize,
-		bucket:    cfg.Bucket,
-		logger:    logger,
+		plan:               plan,
+		batchSize:          cfg.BatchSize,
+		mergePrefetchCount: cfg.MergePrefetchCount,
+		bucket:             cfg.Bucket,
+		logger:             logger,
 
 		dataobjScanPageCacheSize: cfg.DataobjScanPageCacheSize,
 	}
@@ -51,12 +53,14 @@ func Run(ctx context.Context, cfg Config, plan *physical.Plan, logger log.Logger
 // Context is the execution context
 type Context struct {
 	batchSize int64
+
 	logger    log.Logger
 	plan      *physical.Plan
 	evaluator expressionEvaluator
 	bucket    objstore.Bucket
 
 	dataobjScanPageCacheSize int64
+	mergePrefetchCount       int
 }
 
 func (c *Context) execute(ctx context.Context, node physical.Node) Pipeline {
@@ -314,7 +318,7 @@ func (c *Context) executeMerge(ctx context.Context, _ *physical.Merge, inputs []
 		return emptyPipeline()
 	}
 
-	pipeline, err := NewMergePipeline(inputs)
+	pipeline, err := newMergePipeline(inputs, c.mergePrefetchCount)
 	if err != nil {
 		return errorPipeline(ctx, err)
 	}
```
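To make the prefetch window concrete: with `maxPrefetch = K`, inputs `0..K` start prefetching when the Merge is first read, and whenever input `i` is exhausted, input `i+1+K` starts. A small, self-contained sketch of that indexing (the names here are illustrative, not part of the executor):

```go
package main

import "fmt"

// prefetchSchedule prints which input starts prefetching at which point,
// mirroring the indexing in Merge.init and Merge.read.
func prefetchSchedule(numInputs, maxPrefetch int) {
	// Same clamping as newMergePipeline: negative or too-large values
	// mean "prefetch everything".
	if maxPrefetch < 0 || maxPrefetch >= numInputs {
		maxPrefetch = numInputs - 1
	}
	for i := 0; i <= maxPrefetch; i++ {
		fmt.Printf("init: input %d starts prefetching\n", i)
	}
	for curr := 0; curr < numInputs; curr++ {
		if next := curr + 1 + maxPrefetch; next < numInputs {
			fmt.Printf("input %d exhausted: input %d starts prefetching\n", curr, next)
		}
	}
}

func main() {
	prefetchSchedule(5, 1) // five inputs, prefetch current plus the next one
}
```

For five inputs and `maxPrefetch = 1`, inputs 0 and 1 prefetch at init, and inputs 2, 3, and 4 start as inputs 0, 1, and 2 are exhausted, so at most two inputs ever fetch concurrently.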

pkg/engine/executor/merge.go

Lines changed: 73 additions & 30 deletions
```diff
@@ -4,103 +4,146 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"slices"
 
 	"github.com/apache/arrow-go/v18/arrow"
 )
 
 // Merge is a pipeline that takes N inputs and sequentially consumes each one of them.
 // It completely exhausts an input before moving to the next one.
 type Merge struct {
-	inputs    []Pipeline
-	exhausted []bool
-	state     state
+	inputs      []Pipeline
+	maxPrefetch int
+	initialized bool
+	currInput   int // index of the currently processed input
+	state       state
 }
 
 var _ Pipeline = (*Merge)(nil)
 
-func NewMergePipeline(inputs []Pipeline) (*Merge, error) {
+// newMergePipeline creates a new merge pipeline that merges N inputs into a single output.
+//
+// The argument maxPrefetch controls how many inputs are prefetched simultaneously while the current one is consumed.
+// Set maxPrefetch to 0 to disable prefetching of the next input.
+// Set maxPrefetch to 1 to prefetch only the next input, and so on.
+// Set maxPrefetch to -1 to prefetch all inputs at once.
+func newMergePipeline(inputs []Pipeline, maxPrefetch int) (*Merge, error) {
 	if len(inputs) == 0 {
-		return nil, fmt.Errorf("no inputs provided for merge pipeline")
+		return nil, fmt.Errorf("merge pipeline: no inputs provided")
 	}
 
+	// Default to the number of inputs if maxPrefetch is negative or exceeds the number of inputs.
+	if maxPrefetch < 0 || maxPrefetch >= len(inputs) {
+		maxPrefetch = len(inputs) - 1
+	}
+
+	// Wrap inputs into prefetching pipelines.
 	for i := range inputs {
+		// Only wrap the input, but do not call init() on it, as that would start prefetching.
+		// Prefetching is started in the [Merge.init] function.
 		inputs[i] = newPrefetchingPipeline(inputs[i])
 	}
 
 	return &Merge{
-		inputs:    inputs,
-		exhausted: make([]bool, len(inputs)),
+		inputs:      inputs,
+		maxPrefetch: maxPrefetch,
 	}, nil
 }
 
+func (m *Merge) init(ctx context.Context) {
+	if m.initialized {
+		return
+	}
+
+	// Initialize prefetching of the inputs defined by maxPrefetch.
+	// The first/current input is always initialized.
+	for i := range m.inputs {
+		if i <= m.maxPrefetch {
+			m.startPrefetchingInputAtIndex(ctx, i)
+		}
+	}
+
+	m.initialized = true
+}
+
+// startPrefetchingInputAtIndex initializes the input at the given index i,
+// if the index is not out of bounds and the input is of type [prefetchWrapper].
+// Initializing the input starts its prefetching.
+func (m *Merge) startPrefetchingInputAtIndex(ctx context.Context, i int) {
+	if i >= len(m.inputs) {
+		return
+	}
+	inp, ok := m.inputs[i].(*prefetchWrapper)
+	if ok {
+		inp.init(ctx)
+	}
+}
+
 // Read reads the next value into its state.
 // It returns an error if reading fails or when the pipeline is exhausted.
 func (m *Merge) Read(ctx context.Context) error {
 	if m.state.err != nil {
 		return m.state.err
 	}
 
+	m.init(ctx)
 	record, err := m.read(ctx)
 	m.state = newState(record, err)
 
 	if err != nil {
-		return fmt.Errorf("run merge: %w", err)
+		return err
 	}
 
 	return nil
 }
 
 func (m *Merge) read(ctx context.Context) (arrow.Record, error) {
-	if !slices.Contains(m.exhausted, false) {
+	// All inputs have been consumed and are exhausted.
+	if m.currInput >= len(m.inputs) {
 		return nil, EOF
 	}
 
-	for i, input := range m.inputs {
-		if m.exhausted[i] {
-			continue
-		}
+	for m.currInput < len(m.inputs) {
+		input := m.inputs[m.currInput]
 
 		if err := input.Read(ctx); err != nil {
 			if errors.Is(err, EOF) {
 				input.Close()
-				m.exhausted[i] = true
+				// Proceed to the next input.
+				m.currInput++
+				// Initialize the next input so it starts prefetching.
+				m.startPrefetchingInputAtIndex(ctx, m.currInput+m.maxPrefetch)
 				continue
 			}
 
 			return nil, err
 		}
 
-		// not updating reference counts as this pipeline is not consuming
-		// the record.
 		return input.Value()
 	}
 
-	// return EOF if none of the inputs returned a record.
+	// Return EOF if none of the inputs returned a record.
 	return nil, EOF
 }
 
+// Value returns the current value in state.
+func (m *Merge) Value() (arrow.Record, error) {
+	return m.state.Value()
+}
+
 // Close implements Pipeline.
 func (m *Merge) Close() {
-	for i, input := range m.inputs {
-		// exhausted inputs are already closed
-		if !m.exhausted[i] {
-			input.Close()
-		}
+	// Exhausted inputs are already closed.
+	for _, input := range m.inputs[m.currInput:] {
+		input.Close()
 	}
 }
 
-// Inputs implements Pipeline.
+// Inputs returns the inputs of the pipeline.
 func (m *Merge) Inputs() []Pipeline {
 	return m.inputs
 }
 
-// Transport implements Pipeline.
+// Transport returns the type of transport of the implementation.
 func (m *Merge) Transport() Transport {
 	return Local
 }
-
-// Value implements Pipeline.
-func (m *Merge) Value() (arrow.Record, error) {
-	return m.state.Value()
-}
```
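The Merge relies on `newPrefetchingPipeline` and `prefetchWrapper`, which are defined elsewhere and not shown in this diff. As an illustration only, not Loki's actual implementation, a read-ahead wrapper along these lines would satisfy the contract Merge depends on: `init()` starts background reads, and `Read()` hands out the next already-fetched result. All type and method shapes below are assumptions:

```go
package prefetch

import (
	"context"
	"io"
)

// Minimal stand-ins for the executor types; names are assumptions.
type Pipeline interface {
	Read(ctx context.Context) error
	Value() (any, error)
	Close()
}

// EOF marks pipeline exhaustion; the real executor defines its own sentinel.
var EOF = io.EOF

type result struct {
	value any
	err   error
}

// prefetchWrapper reads ahead from its inner pipeline on a goroutine, so the
// consumer's Read() usually returns a batch that has already been fetched.
type prefetchWrapper struct {
	inner   Pipeline
	results chan result
	curr    result
	cancel  context.CancelFunc
}

func newPrefetchingPipeline(inner Pipeline) *prefetchWrapper {
	return &prefetchWrapper{inner: inner}
}

// init starts the background prefetch loop. Keeping it separate from the
// constructor is what lets Merge decide when each input begins doing I/O.
func (p *prefetchWrapper) init(ctx context.Context) {
	if p.results != nil {
		return // already prefetching
	}
	ctx, p.cancel = context.WithCancel(ctx)
	p.results = make(chan result, 1) // buffer of 1: one batch read ahead
	go func() {
		defer close(p.results)
		for {
			if err := p.inner.Read(ctx); err != nil {
				select {
				case p.results <- result{err: err}:
				case <-ctx.Done():
				}
				return
			}
			v, err := p.inner.Value()
			select {
			case p.results <- result{value: v, err: err}:
			case <-ctx.Done():
				return
			}
		}
	}()
}

func (p *prefetchWrapper) Read(ctx context.Context) error {
	p.init(ctx) // no-op if Merge already started prefetching
	r, ok := <-p.results
	if !ok {
		return EOF // producer finished and channel drained
	}
	p.curr = r
	return r.err
}

func (p *prefetchWrapper) Value() (any, error) { return p.curr.value, p.curr.err }

func (p *prefetchWrapper) Close() {
	if p.cancel != nil {
		p.cancel()
	}
	p.inner.Close()
}
```

In a design like this, the buffered channel is what bounds memory: each wrapped input holds at most one batch in flight, so `merge_prefetch_count` caps concurrent I/O rather than total buffered data.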
