apache · comphead · Aug 11, 2025 · Jul 2, 2025 · Jul 2, 2025 · Jul 2, 2025
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
@@ -85,6 +85,13 @@ use datafusion::physical_expr::window::WindowExpr;
 use datafusion::physical_expr::LexOrdering;
 
 use crate::parquet::parquet_exec::init_datasource_exec;
+use arrow::array::{
+    BinaryBuilder, BooleanArray, Date32Array, Decimal128Array, Float32Array, Float64Array,
+    Int16Array, Int32Array, Int64Array, Int8Array, NullArray, StringBuilder,
+    TimestampMicrosecondArray,
+};
+use arrow::buffer::BooleanBuffer;
+use datafusion::common::utils::SingleRowListArrayBuilder;
 use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
 use datafusion::physical_plan::filter::FilterExec as DataFusionFilterExec;
 use datafusion_comet_proto::spark_operator::SparkFilePartition;
@@ -474,6 +481,125 @@ impl PhysicalPlanner {
                                     )))
                                 }
                             }
+                        },
+                        Value::ListVal(values) => {
+                            if let DataType::List(f) = data_type {
+                                match f.data_type() {
+                                    DataType::Null => {
+                                        SingleRowListArrayBuilder::new(Arc::new(NullArray::new(values.clone().null_mask.len())))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Boolean => {
+                                        let vals = values.clone();
+                                        SingleRowListArrayBuilder::new(Arc::new(BooleanArray::new(BooleanBuffer::from(vals.boolean_values), Some(vals.null_mask.into()))))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Int8 => {
+                                        let vals = values.clone();
+                                        SingleRowListArrayBuilder::new(Arc::new(Int8Array::new(vals.byte_values.iter().map(|&x| x as i8).collect::<Vec<_>>().into(), Some(vals.null_mask.into()))))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Int16 => {
+                                        let vals = values.clone();
+                                        SingleRowListArrayBuilder::new(Arc::new(Int16Array::new(vals.short_values.iter().map(|&x| x as i16).collect::<Vec<_>>().into(), Some(vals.null_mask.into()))))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Int32 => {
+                                        let vals = values.clone();
+                                        SingleRowListArrayBuilder::new(Arc::new(Int32Array::new(vals.int_values.into(), Some(vals.null_mask.into()))))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Int64 => {
+                                        let vals = values.clone();
+                                        SingleRowListArrayBuilder::new(Arc::new(Int64Array::new(vals.long_values.into(), Some(vals.null_mask.into()))))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Float32 => {
+                                        let vals = values.clone();
+                                        SingleRowListArrayBuilder::new(Arc::new(Float32Array::new(vals.float_values.into(), Some(vals.null_mask.into()))))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Float64 => {
+                                        let vals = values.clone();
+                                        SingleRowListArrayBuilder::new(Arc::new(Float64Array::new(vals.double_values.into(), Some(vals.null_mask.into()))))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Timestamp(TimeUnit::Microsecond, None) => {
+                                        let vals = values.clone();
+                                        SingleRowListArrayBuilder::new(Arc::new(TimestampMicrosecondArray::new(vals.long_values.into(), Some(vals.null_mask.into()))))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => {
+                                        let vals = values.clone();
+                                        SingleRowListArrayBuilder::new(Arc::new(TimestampMicrosecondArray::new(vals.long_values.into(), Some(vals.null_mask.into())).with_timezone(Arc::clone(tz))))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Date32 => {
+                                        let vals = values.clone();
+                                        SingleRowListArrayBuilder::new(Arc::new(Date32Array::new(vals.int_values.into(), Some(vals.null_mask.into()))))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Binary => {
+                                        // A builder tracks offsets and validity for us. Capacity hints are taken from
+                                        // `bytes_values`, the field this arm consumes (`string_values` is for Utf8 lists).
+                                        let vals = values.clone();
+                                        let item_capacity = vals.bytes_values.len();
+                                        let data_capacity = vals.bytes_values.first().map(|b| b.len() * item_capacity).unwrap_or(0);
+                                        let mut arr = BinaryBuilder::with_capacity(item_capacity, data_capacity);
+
+                                        for (i, v) in vals.bytes_values.into_iter().enumerate() {
+                                            if vals.null_mask[i] {
+                                                arr.append_value(v);
+                                            } else {
+                                                arr.append_null();
+                                            }
+                                        }
+
+                                        SingleRowListArrayBuilder::new(Arc::new(arr.finish()))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Utf8 => {
+                                        // Using a builder as it is cumbersome to create StringArray from a vector with nulls
+                                        // and calculate correct offsets
+                                        let vals = values.clone();
+                                        let item_capacity = vals.string_values.len();
+                                        let data_capacity = vals.string_values.first().map(|s| s.len() * item_capacity).unwrap_or(0);
+                                        let mut arr = StringBuilder::with_capacity(item_capacity, data_capacity);
+
+                                        for (i, v) in vals.string_values.into_iter().enumerate() {
+                                            if vals.null_mask[i] {
+                                                arr.append_value(v);
+                                            } else {
+                                                arr.append_null();
+                                            }
+                                        }
+
+                                        SingleRowListArrayBuilder::new(Arc::new(arr.finish()))
+                                            .build_list_scalar()
+                                    }
+                                    DataType::Decimal128(p, s) => {
+                                        // Decode big-endian two's-complement bytes; an out-of-range value is returned as an error, not a panic.
+                                        let vals = values.clone();
+                                        let decimals = vals.decimal_values.iter().map(|v| {
+                                            let big_integer = BigInt::from_signed_bytes_be(v);
+                                            big_integer.to_i128().ok_or_else(|| GeneralError(format!(
+                                                "Cannot parse {big_integer:?} as i128 for Decimal literal"
+                                            )))
+                                        }).collect::<Result<Vec<_>, _>>()?;
+                                        SingleRowListArrayBuilder::new(Arc::new(Decimal128Array::new(decimals.into(), Some(vals.null_mask.into())).with_precision_and_scale(*p, *s)?)).build_list_scalar()
+                                    }
+                                    dt => {
+                                        return Err(GeneralError(format!(
+                                            "DataType::List literal does not support {dt:?} type"
+                                        )))
+                                    }
+                                }
+
+                            } else {
+                                return Err(GeneralError(format!(
+                                    "Expected DataType::List but got {data_type:?}"
+                                )))
+                            }
                         }
                     }
                 };
@@ -1300,6 +1426,7 @@ impl PhysicalPlanner {
                 // The `ScanExec` operator will take actual arrays from Spark during execution
                 let scan =
                     ScanExec::new(self.exec_context_id, input_source, &scan.source, data_types)?;
+
                 Ok((
                     vec![scan.clone()],
                     Arc::new(SparkPlan::new(spark_plan.plan_id, Arc::new(scan), vec![])),
@@ -2322,7 +2449,6 @@ impl PhysicalPlanner {
                         other => other,
                     };
                     let func = self.session_ctx.udf(fun_name)?;
-
                     let coerced_types = func
                         .coerce_types(&input_expr_types)
                         .unwrap_or_else(|_| input_expr_types.clone());

diff --git a/native/proto/src/lib.rs b/native/proto/src/lib.rs
@@ -21,6 +21,7 @@
 
 // Include generated modules from .proto files.
 #[allow(missing_docs)]
+#[allow(clippy::large_enum_variant)]
 pub mod spark_expression {
     include!(concat!("generated", "/spark.spark_expression.rs"));
 }

diff --git a/native/proto/src/proto/expr.proto b/native/proto/src/proto/expr.proto
@@ -21,6 +21,8 @@ syntax = "proto3";
 
 package spark.spark_expression;
 
+import "types.proto";
+
 option java_package = "org.apache.comet.serde";
 
 // The basic message representing a Spark expression.
@@ -112,13 +114,13 @@ enum StatisticsType {
 }
 
 message Count {
-   repeated Expr children = 1;
+  repeated Expr children = 1;
 }
 
 message Sum {
-   Expr child = 1;
-   DataType datatype = 2;
-   bool fail_on_error = 3;
+  Expr child = 1;
+  DataType datatype = 2;
+  bool fail_on_error = 3;
 }
 
 message Min {
@@ -215,10 +217,11 @@ message Literal {
     string string_val = 8;
     bytes bytes_val = 9;
     bytes decimal_val = 10;
-   }
+    ListLiteral list_val = 11;
+  }
 
-   DataType datatype = 11;
-   bool is_null = 12;
+  DataType datatype = 12;
+  bool is_null = 13;
 }
 
 enum EvalMode {
@@ -478,5 +481,4 @@ message DataType {
   }
 
   DataTypeInfo type_info = 2;
-}
-
+}
diff --git a/native/proto/src/proto/types.proto b/native/proto/src/proto/types.proto
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+
+syntax = "proto3";
+
+package spark.spark_expression;
+
+option java_package = "org.apache.comet.serde";
+
+message ListLiteral {
+  // Element values of an array literal; only the field matching the element type is expected to be populated
+  repeated bool boolean_values = 1;
+  repeated int32 byte_values = 2;
+  repeated int32 short_values = 3;
+  repeated int32 int_values = 4;
+  repeated int64 long_values = 5;
+  repeated float float_values = 6;
+  repeated double double_values = 7;
+  repeated string string_values = 8;
+  repeated bytes bytes_values = 9;
+  repeated bytes decimal_values = 10;
+  repeated ListLiteral list_values = 11;
+  // Indexed in parallel with the values: null_mask[i] == true means element i is present, false means it is null
+  repeated bool null_mask = 12;
+}
diff --git a/native/spark-expr/src/conversion_funcs/cast.rs b/native/spark-expr/src/conversion_funcs/cast.rs
@@ -20,6 +20,7 @@ use crate::utils::array_with_timezone;
 use crate::{EvalMode, SparkError, SparkResult};
 use arrow::array::builder::StringBuilder;
 use arrow::array::{DictionaryArray, StringArray, StructArray};
+use arrow::compute::can_cast_types;
 use arrow::datatypes::{DataType, Schema};
 use arrow::{
     array::{
@@ -968,6 +969,9 @@ fn cast_array(
             to_type,
             cast_options,
         )?),
+        (List(_), List(_)) if can_cast_types(from_type, to_type) => {
+            Ok(cast_with_options(&array, to_type, &CAST_OPTIONS)?)
+        }
         (UInt8 | UInt16 | UInt32 | UInt64, Int8 | Int16 | Int32 | Int64)
             if cast_options.allow_cast_unsigned_ints =>
         {
@@ -1018,7 +1022,7 @@ fn is_datafusion_spark_compatible(
         DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
             // note that the cast from Int32/Int64 -> Decimal128 here is actually
             // not compatible with Spark (no overflow checks) but we have tests that
-            // rely on this cast working so we have to leave it here for now
+            // rely on this cast working, so we have to leave it here for now
             matches!(
                 to_type,
                 DataType::Boolean

diff --git a/spark/src/main/scala/org/apache/comet/DataTypeSupport.scala b/spark/src/main/scala/org/apache/comet/DataTypeSupport.scala
@@ -73,4 +73,9 @@ object DataTypeSupport {
   val ARRAY_ELEMENT = "array element"
   val MAP_KEY = "map key"
   val MAP_VALUE = "map value"
+  /** Returns true when `dt` is a struct, array, or map type; false for every other type. */
+  def isComplexType(dt: DataType): Boolean = dt match {
+    case _: StructType | _: ArrayType | _: MapType => true
+    case _ => false
+  }
 }
diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
@@ -19,7 +19,7 @@
 
 package org.apache.comet.expressions
 
-import org.apache.spark.sql.types.{DataType, DataTypes, DecimalType, StructType}
+import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, NullType, StructType}
 
 sealed trait SupportLevel
 
@@ -62,6 +62,9 @@ object CometCast {
     }
 
     (fromType, toType) match {
+      case (dt: ArrayType, _: ArrayType) if dt.elementType == NullType => Compatible()
+      case (dt: ArrayType, dt1: ArrayType) =>
+        isSupported(dt.elementType, dt1.elementType, timeZoneId, evalMode)
       case (dt: DataType, _) if dt.typeName == "timestamp_ntz" =>
         // https://github.com/apache/datafusion-comet/issues/378
         toType match {

diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
@@ -37,6 +37,7 @@ import org.apache.spark.sql.types._
 import org.apache.comet.{CometConf, CometSparkSessionExtensions, DataTypeSupport}
 import org.apache.comet.CometConf._
 import org.apache.comet.CometSparkSessionExtensions.{isCometLoaded, isCometScanEnabled, withInfo, withInfos}
+import org.apache.comet.DataTypeSupport.isComplexType
 import org.apache.comet.parquet.{CometParquetScan, SupportsComet}
 
 /**
@@ -277,11 +278,6 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] {
     val partitionSchemaSupported =
       typeChecker.isSchemaSupported(partitionSchema, fallbackReasons)
 
-    def isComplexType(dt: DataType): Boolean = dt match {
-      case _: StructType | _: ArrayType | _: MapType => true
-      case _ => false
-    }
-
     def hasMapsContainingStructs(dataType: DataType): Boolean = {
       dataType match {
         case s: StructType => s.exists(field => hasMapsContainingStructs(field.dataType))