Skip to content

Commit 54bb2fd

Browse files
authored
SQL: Use specialized virtual columns for JSON_OBJECT, JSON_MERGE. (#18553)
* SQL: Use specialized virtual columns for JSON_OBJECT, JSON_MERGE.

Building on #18521, this patch makes virtual column specialization recursive. It also now happens immediately on calling getOrCreateVirtualColumnForExpression. Specializations are added for JSON_OBJECT and JSON_MERGE. Now, chains of JSON_MERGE, JSON_OBJECT, and JSON_VALUE can preserve lazy evaluation, index usage, dictionary usage, etc.

There is a change to VirtualColumnCreator that can affect extensions that add SQL operators. To allow the creator to access rewritten arguments, a "DruidExpression self" parameter is added. The "String expression" parameter is no longer needed, so it is removed.

* Correct javadoc.
* Update expectations.
* NestedFieldVirtualColumn: Process arrays when making object selectors, even if not from VariantColumn.
* Update test.
1 parent 4195ffd commit 54bb2fd

File tree

13 files changed

+682
-184
lines changed

13 files changed

+682
-184
lines changed

processing/src/main/java/org/apache/druid/segment/virtual/NestedFieldVirtualColumn.java

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -414,8 +414,14 @@ public ColumnValueSelector<?> makeColumnValueSelector(
414414
return new ArrayElementColumnValueSelector(arraySelector, elementNumber);
415415
}
416416

417-
// we are not a nested column and are being asked for a path that will never exist, so we are nil selector
418-
return NilColumnValueSelector.instance();
417+
if (holder.getCapabilities().isArray() || ColumnType.NESTED_DATA.equals(holder.getCapabilities().toColumnType())) {
418+
// Not a root access and no specialized path available. But the underlying column is array or nested typed,
419+
// so we may still be able to walk it using exprs. Try that.
420+
return makeColumnValueSelectorUsingColumnSelectorFactory(selectorFactory);
421+
} else {
422+
// we are not a nested or array column, and are being asked for a path that will never exist, so nil selector
423+
return NilColumnValueSelector.instance();
424+
}
419425
}
420426

421427
@Override
@@ -486,11 +492,11 @@ public VectorObjectSelector makeVectorObjectSelector(
486492
final NestedVectorColumnSelectorFactory nestedColumnSelectorFactory =
487493
column.as(NestedVectorColumnSelectorFactory.class);
488494

489-
if (isNestedColumn(holder)) {
495+
if (isNestedColumn(holder) || holder.getCapabilities().isArray()) {
490496
if (fieldSpec.processFromRaw || nestedTypeInspector == null || nestedColumnSelectorFactory == null) {
491497
// 1) If processFromRaw is true, that means JSON_QUERY.
492-
// 2) If no nestedTypeInspector, nestedColumnSelectorFactory then that means this is a nested type that is
493-
// not exposed as a nested column.
498+
// 2) If no nestedTypeInspector, nestedColumnSelectorFactory then that means this is a nested or array
499+
// type that is not exposed as a nested column.
494500
// Either way, we read and process raw objects.
495501
return new RawFieldVectorObjectSelector(
496502
selectorFactory.makeObjectSelector(fieldSpec.columnName),
@@ -499,7 +505,9 @@ public VectorObjectSelector makeVectorObjectSelector(
499505
);
500506
}
501507
final ColumnType leastRestrictiveType = nestedTypeInspector.getFieldLogicalType(fieldSpec.parts);
502-
if (leastRestrictiveType != null && leastRestrictiveType.isNumeric() && !Types.isNumeric(fieldSpec.expectedType)) {
508+
if (leastRestrictiveType != null
509+
&& leastRestrictiveType.isNumeric()
510+
&& !Types.isNumeric(fieldSpec.expectedType)) {
503511
return ExpressionVectorSelectors.castValueSelectorToObject(
504512
offset,
505513
columnName,

processing/src/main/java/org/apache/druid/segment/virtual/NestedObjectVirtualColumn.java

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,12 @@ public NestedObjectVirtualColumn(
8888
StringUtils.format(
8989
"%s(%s)",
9090
NestedDataExpressions.JsonObjectExprMacro.NAME,
91-
keyExprMap.entrySet().stream().sorted(Map.Entry.comparingByKey()).map(entry -> {
92-
final String key = entry.getKey();
93-
final TypedExpression valueExpr = entry.getValue();
94-
return Parser.constant(key).stringify() + ',' + valueExpr.expression;
95-
}).collect(Collectors.joining(","))
91+
Preconditions.checkNotNull(keyExprMap, "object")
92+
.entrySet().stream().map(entry -> {
93+
final String key = entry.getKey();
94+
final TypedExpression valueExpr = entry.getValue();
95+
return Parser.constant(key).stringify() + ',' + valueExpr.expression;
96+
}).collect(Collectors.joining(","))
9697
),
9798
ColumnType.NESTED_DATA,
9899
macroTable

processing/src/test/java/org/apache/druid/segment/virtual/NestedObjectVirtualColumnTest.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import org.junit.Assert;
3030
import org.junit.Test;
3131

32-
import java.util.HashMap;
3332
import java.util.Map;
3433

3534
public class NestedObjectVirtualColumnTest
@@ -56,9 +55,10 @@ public void testSerde() throws JsonProcessingException
5655
@Test
5756
public void testGetKeyExprMap()
5857
{
59-
Map<String, NestedObjectVirtualColumn.TypedExpression> keyExprMap = new HashMap<>();
60-
keyExprMap.put("key1", new NestedObjectVirtualColumn.TypedExpression("expr1", ColumnType.STRING));
61-
keyExprMap.put("key2", new NestedObjectVirtualColumn.TypedExpression("expr2", ColumnType.DOUBLE));
58+
Map<String, NestedObjectVirtualColumn.TypedExpression> keyExprMap = ImmutableMap.of(
59+
"key1", new NestedObjectVirtualColumn.TypedExpression("expr1", ColumnType.STRING),
60+
"key2", new NestedObjectVirtualColumn.TypedExpression("expr2", ColumnType.DOUBLE)
61+
);
6262

6363
NestedObjectVirtualColumn column = new NestedObjectVirtualColumn(
6464
"test_obj",

quidem-ut/src/test/quidem/org.apache.druid.quidem.QTest/qaArray/ops_funcs_mv_funcs.08.all.iq

Lines changed: 175 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -119,16 +119,34 @@ FROM test_array;
119119
#-------------------------------------------------------------------------
120120
SELECT mv_to_array(json_value(a_nested, '$[0]' RETURNING boolean array)) AS col
121121
FROM test_array;
122-
should be an identifier expression. Use array() instead
123-
!error
122+
+--------------+
123+
| col |
124+
+--------------+
125+
| [1, 0, null] |
126+
| [1, 0, null] |
127+
| [null, 0, 1] |
128+
| [null, 0, 1] |
129+
+--------------+
130+
(4 rows)
131+
132+
!ok
124133

125134
#-------------------------------------------------------------------------
126135
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C3_D1
127136
#-------------------------------------------------------------------------
128137
SELECT mv_to_array(json_value(a_nested, '$[7][0]' RETURNING boolean array)) AS col
129138
FROM test_array;
130-
should be an identifier expression. Use array() instead
131-
!error
139+
+--------------+
140+
| col |
141+
+--------------+
142+
| [1, 0, null] |
143+
| [1, 0, null] |
144+
| [null, 0, 1] |
145+
| [null, 0, 1] |
146+
+--------------+
147+
(4 rows)
148+
149+
!ok
132150

133151
#-------------------------------------------------------------------------
134152
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C4_D1
@@ -152,16 +170,34 @@ FROM test_array;
152170
#-------------------------------------------------------------------------
153171
SELECT mv_to_array(json_value(a_nested, '$[1]' RETURNING bigint array)) AS col
154172
FROM test_array;
155-
should be an identifier expression. Use array() instead
156-
!error
173+
+--------------+
174+
| col |
175+
+--------------+
176+
| [1, 2, null] |
177+
| [1, 2, null] |
178+
| [null, 2, 1] |
179+
| [null, 2, 1] |
180+
+--------------+
181+
(4 rows)
182+
183+
!ok
157184

158185
#-------------------------------------------------------------------------
159186
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C6_D1
160187
#-------------------------------------------------------------------------
161188
SELECT mv_to_array(json_value(a_nested, '$[7][1]' RETURNING bigint array)) AS col
162189
FROM test_array;
163-
should be an identifier expression. Use array() instead
164-
!error
190+
+--------------+
191+
| col |
192+
+--------------+
193+
| [1, 2, null] |
194+
| [1, 2, null] |
195+
| [null, 2, 1] |
196+
| [null, 2, 1] |
197+
+--------------+
198+
(4 rows)
199+
200+
!ok
165201

166202
#-------------------------------------------------------------------------
167203
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C7_D1
@@ -185,16 +221,34 @@ FROM test_array;
185221
#-------------------------------------------------------------------------
186222
SELECT mv_to_array(json_value(a_nested, '$[2]' RETURNING decimal array)) AS col
187223
FROM test_array;
188-
should be an identifier expression. Use array() instead
189-
!error
224+
+------------------+
225+
| col |
226+
+------------------+
227+
| [0.1, 0.2, null] |
228+
| [0.1, 0.2, null] |
229+
| [null, 0.2, 0.1] |
230+
| [null, 0.2, 0.1] |
231+
+------------------+
232+
(4 rows)
233+
234+
!ok
190235

191236
#-------------------------------------------------------------------------
192237
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C9_D1
193238
#-------------------------------------------------------------------------
194239
SELECT mv_to_array(json_value(a_nested, '$[7][2]' RETURNING decimal array)) AS col
195240
FROM test_array;
196-
should be an identifier expression. Use array() instead
197-
!error
241+
+------------------+
242+
| col |
243+
+------------------+
244+
| [0.1, 0.2, null] |
245+
| [0.1, 0.2, null] |
246+
| [null, 0.2, 0.1] |
247+
| [null, 0.2, 0.1] |
248+
+------------------+
249+
(4 rows)
250+
251+
!ok
198252

199253
#-------------------------------------------------------------------------
200254
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C10_D1
@@ -218,16 +272,34 @@ FROM test_array;
218272
#-------------------------------------------------------------------------
219273
SELECT mv_to_array(json_value(a_nested, '$[3]' RETURNING varchar array)) AS col
220274
FROM test_array;
221-
should be an identifier expression. Use array() instead
222-
!error
275+
+----------------+
276+
| col |
277+
+----------------+
278+
| [S1, S2, null] |
279+
| [S1, S2, null] |
280+
| [null, S2, S1] |
281+
| [null, S2, S1] |
282+
+----------------+
283+
(4 rows)
284+
285+
!ok
223286

224287
#-------------------------------------------------------------------------
225288
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C12_D1
226289
#-------------------------------------------------------------------------
227290
SELECT mv_to_array(json_value(a_nested, '$[7][3]' RETURNING varchar array)) AS col
228291
FROM test_array;
229-
should be an identifier expression. Use array() instead
230-
!error
292+
+----------------+
293+
| col |
294+
+----------------+
295+
| [S1, S2, null] |
296+
| [S1, S2, null] |
297+
| [null, S2, S1] |
298+
| [null, S2, S1] |
299+
+----------------+
300+
(4 rows)
301+
302+
!ok
231303

232304
#-------------------------------------------------------------------------
233305
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C13_D1
@@ -251,16 +323,34 @@ FROM test_array;
251323
#-------------------------------------------------------------------------
252324
SELECT mv_to_array(json_value(a_nested, '$[4]' RETURNING varchar array)) AS col
253325
FROM test_array;
254-
should be an identifier expression. Use array() instead
255-
!error
326+
+--------------------+
327+
| col |
328+
+--------------------+
329+
| [null, null, null] |
330+
| [null, null, null] |
331+
| [null, null, null] |
332+
| [null, null, null] |
333+
+--------------------+
334+
(4 rows)
335+
336+
!ok
256337

257338
#-------------------------------------------------------------------------
258339
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C15_D1
259340
#-------------------------------------------------------------------------
260341
SELECT mv_to_array(json_value(a_nested, '$[7][4]' RETURNING varchar array)) AS col
261342
FROM test_array;
262-
should be an identifier expression. Use array() instead
263-
!error
343+
+--------------------+
344+
| col |
345+
+--------------------+
346+
| [null, null, null] |
347+
| [null, null, null] |
348+
| [null, null, null] |
349+
| [null, null, null] |
350+
+--------------------+
351+
(4 rows)
352+
353+
!ok
264354

265355
#-------------------------------------------------------------------------
266356
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C16_D1
@@ -284,16 +374,34 @@ FROM test_array;
284374
#-------------------------------------------------------------------------
285375
SELECT mv_to_array(json_value(a_nested, '$[5]' RETURNING varchar array)) AS col
286376
FROM test_array;
287-
should be an identifier expression. Use array() instead
288-
!error
377+
+-----+
378+
| col |
379+
+-----+
380+
| [] |
381+
| [] |
382+
| [] |
383+
| [] |
384+
+-----+
385+
(4 rows)
386+
387+
!ok
289388

290389
#-------------------------------------------------------------------------
291390
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C18_D1
292391
#-------------------------------------------------------------------------
293392
SELECT mv_to_array(json_value(a_nested, '$[7][5]' RETURNING varchar array)) AS col
294393
FROM test_array;
295-
should be an identifier expression. Use array() instead
296-
!error
394+
+-----+
395+
| col |
396+
+-----+
397+
| [] |
398+
| [] |
399+
| [] |
400+
| [] |
401+
+-----+
402+
(4 rows)
403+
404+
!ok
297405

298406
#-------------------------------------------------------------------------
299407
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C19_D1
@@ -317,16 +425,34 @@ FROM test_array;
317425
#-------------------------------------------------------------------------
318426
SELECT mv_to_array(json_value(a_nested, '$[6]' RETURNING varchar array)) AS col
319427
FROM test_array;
320-
should be an identifier expression. Use array() instead
321-
!error
428+
+--------------------------+
429+
| col |
430+
+--------------------------+
431+
| [null, S1, 0.1, 1, true] |
432+
| [null, S1, 0.1, 1, true] |
433+
| [true, 1, 0.1, S1, null] |
434+
| [true, 1, 0.1, S1, null] |
435+
+--------------------------+
436+
(4 rows)
437+
438+
!ok
322439

323440
#-------------------------------------------------------------------------
324441
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C21_D1
325442
#-------------------------------------------------------------------------
326443
SELECT mv_to_array(json_value(a_nested, '$[7][6]' RETURNING varchar array)) AS col
327444
FROM test_array;
328-
should be an identifier expression. Use array() instead
329-
!error
445+
+--------------------------+
446+
| col |
447+
+--------------------------+
448+
| [null, S1, 0.1, 1, true] |
449+
| [null, S1, 0.1, 1, true] |
450+
| [true, 1, 0.1, S1, null] |
451+
| [true, 1, 0.1, S1, null] |
452+
+--------------------------+
453+
(4 rows)
454+
455+
!ok
330456

331457
#-------------------------------------------------------------------------
332458
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C22_D1
@@ -341,14 +467,31 @@ Cannot apply 'MV_TO_ARRAY' to arguments of type 'MV_TO_ARRAY(
341467
#-------------------------------------------------------------------------
342468
SELECT mv_to_array(json_value(a_nested, '$[7]' RETURNING varchar array)) AS col
343469
FROM test_array;
344-
should be an identifier expression. Use array() instead
345-
!error
470+
+-----+
471+
| col |
472+
+-----+
473+
| |
474+
| |
475+
| |
476+
| |
477+
+-----+
478+
(4 rows)
479+
480+
!ok
346481

347482
#-------------------------------------------------------------------------
348483
# TESTCASE: test_mv_funcs TEST_ID: A1_B16_C24_D1
349484
#-------------------------------------------------------------------------
350485
SELECT mv_to_array(json_value(a_nested, '$[7][7]' RETURNING varchar array)) AS col
351486
FROM test_array;
352-
should be an identifier expression. Use array() instead
353-
!error
487+
+-----+
488+
| col |
489+
+-----+
490+
| |
491+
| |
492+
| |
493+
| |
494+
+-----+
495+
(4 rows)
354496

497+
!ok

0 commit comments

Comments
 (0)