@@ -4,103 +4,146 @@ import (
4
4
"context"
5
5
"errors"
6
6
"fmt"
7
- "slices"
8
7
9
8
"github.com/apache/arrow-go/v18/arrow"
10
9
)
11
10
12
11
// Merge is a pipeline that takes N inputs and sequentially consumes each one of them.
13
12
// It completely exhausts an input before moving to the next one.
14
13
type Merge struct {
15
- inputs []Pipeline
16
- exhausted []bool
17
- state state
14
+ inputs []Pipeline
15
+ maxPrefetch int
16
+ initialized bool
17
+ currInput int // index of the currently processed input
18
+ state state
18
19
}
19
20
20
21
var _ Pipeline = (* Merge )(nil )
21
22
22
- func NewMergePipeline (inputs []Pipeline ) (* Merge , error ) {
23
+ // newMergePipeline creates a new merge pipeline that merges N inputs into a single output.
24
+ //
25
+ // The argument maxPrefetch controls how many inputs are prefetched simultaneously while the current one is consumed.
26
+ // Set maxPrefetch to 0 to disable prefetching of the next input.
27
+ // Set maxPrefetch to 1 to prefetch only the next input, and so on.
28
+ // Set maxPrefetch to -1 to pretetch all inputs at once.
29
+ func newMergePipeline (inputs []Pipeline , maxPrefetch int ) (* Merge , error ) {
23
30
if len (inputs ) == 0 {
24
- return nil , fmt .Errorf ("no inputs provided for merge pipeline " )
31
+ return nil , fmt .Errorf ("merge pipeline: no inputs provided" )
25
32
}
26
33
34
+ // Default to number of inputs if maxConcurrency is negative or exceeds the number of inputs.
35
+ if maxPrefetch < 0 || maxPrefetch >= len (inputs ) {
36
+ maxPrefetch = len (inputs ) - 1
37
+ }
38
+
39
+ // Wrap inputs into prefetching pipeline.
27
40
for i := range inputs {
41
+ // Only wrap input, but do not call init() on it, as it would start prefetching.
42
+ // Prefetching is started in the [Merge.init] function
28
43
inputs [i ] = newPrefetchingPipeline (inputs [i ])
29
44
}
30
45
31
46
return & Merge {
32
- inputs : inputs ,
33
- exhausted : make ([] bool , len ( inputs )) ,
47
+ inputs : inputs ,
48
+ maxPrefetch : maxPrefetch ,
34
49
}, nil
35
50
}
36
51
52
+ func (m * Merge ) init (ctx context.Context ) {
53
+ if m .initialized {
54
+ return
55
+ }
56
+
57
+ // Initialize pre-fetching of inputs defined by maxPrefetch.
58
+ // The first/current input is always initialized.
59
+ for i := range m .inputs {
60
+ if i <= m .maxPrefetch {
61
+ m .startPrefetchingInputAtIndex (ctx , i )
62
+ }
63
+ }
64
+
65
+ m .initialized = true
66
+ }
67
+
68
+ // startPrefetchingInputAtIndex initializes the input at given index i,
69
+ // if the index is not out of bounds and if the input is of type [prefetchWrapper].
70
+ // Initializing the input will start its prefetching.
71
+ func (m * Merge ) startPrefetchingInputAtIndex (ctx context.Context , i int ) {
72
+ if i >= len (m .inputs ) {
73
+ return
74
+ }
75
+ inp , ok := m .inputs [i ].(* prefetchWrapper )
76
+ if ok {
77
+ inp .init (ctx )
78
+ }
79
+ }
80
+
37
81
// Read reads the next value into its state.
38
82
// It returns an error if reading fails or when the pipeline is exhausted.
39
83
func (m * Merge ) Read (ctx context.Context ) error {
40
84
if m .state .err != nil {
41
85
return m .state .err
42
86
}
43
87
88
+ m .init (ctx )
44
89
record , err := m .read (ctx )
45
90
m .state = newState (record , err )
46
91
47
92
if err != nil {
48
- return fmt . Errorf ( "run merge: %w" , err )
93
+ return err
49
94
}
50
95
51
96
return nil
52
97
}
53
98
54
99
func (m * Merge ) read (ctx context.Context ) (arrow.Record , error ) {
55
- if ! slices .Contains (m .exhausted , false ) {
100
+ // All inputs have been consumed and are exhausted
101
+ if m .currInput >= len (m .inputs ) {
56
102
return nil , EOF
57
103
}
58
104
59
- for i , input := range m .inputs {
60
- if m .exhausted [i ] {
61
- continue
62
- }
105
+ for m .currInput < len (m .inputs ) {
106
+ input := m .inputs [m .currInput ]
63
107
64
108
if err := input .Read (ctx ); err != nil {
65
109
if errors .Is (err , EOF ) {
66
110
input .Close ()
67
- m .exhausted [i ] = true
111
+ // Proceed to the next input
112
+ m .currInput ++
113
+ // Initialize the next input so it starts prefetching
114
+ m .startPrefetchingInputAtIndex (ctx , m .currInput + m .maxPrefetch )
68
115
continue
69
116
}
70
117
71
118
return nil , err
72
119
}
73
120
74
- // not updating reference counts as this pipeline is not consuming
75
- // the record.
76
121
return input .Value ()
77
122
}
78
123
79
- // return EOF if none of the inputs returned a record.
124
+ // Return EOF if none of the inputs returned a record.
80
125
return nil , EOF
81
126
}
82
127
128
+ // Value returns the current value in state.
129
+ func (m * Merge ) Value () (arrow.Record , error ) {
130
+ return m .state .Value ()
131
+ }
132
+
83
133
// Close implements Pipeline.
84
134
func (m * Merge ) Close () {
85
- for i , input := range m .inputs {
86
- // exhausted inputs are already closed
87
- if ! m .exhausted [i ] {
88
- input .Close ()
89
- }
135
+ // exhausted inputs are already closed
136
+ for _ , input := range m .inputs [m .currInput :] {
137
+ input .Close ()
90
138
}
91
139
}
92
140
93
- // Inputs implements Pipeline .
141
+ // Inputs returns the inputs of the pipeline .
94
142
func (m * Merge ) Inputs () []Pipeline {
95
143
return m .inputs
96
144
}
97
145
98
- // Transport implements Pipeline .
146
+ // Transport returns the type of transport of the implementation .
99
147
func (m * Merge ) Transport () Transport {
100
148
return Local
101
149
}
102
-
103
- // Value implements Pipeline.
104
- func (m * Merge ) Value () (arrow.Record , error ) {
105
- return m .state .Value ()
106
- }
0 commit comments