SupportsPushDownVariants.java
@@ -0,0 +1,77 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.connector.read;

import org.apache.spark.annotation.Evolving;

/**
 * A mix-in interface for {@link Scan}. Data sources can implement this interface to
 * support pushing variant field access operations down into the scan.
* <p>
* When variant columns are accessed with specific field extractions (e.g., variant_get),
* the optimizer can push these accesses down to the data source. The data source can then
* read only the required fields from variant columns, reducing I/O and improving performance.
* <p>
* The typical workflow is:
* <ol>
* <li>Optimizer analyzes the query plan and identifies variant field accesses</li>
* <li>Optimizer calls {@link #pushVariantAccess} with the access information</li>
* <li>Data source validates and stores the variant access information</li>
* <li>Optimizer retrieves pushed information via {@link #pushedVariantAccess}</li>
* <li>Data source uses the information to optimize reading in {@link #readSchema()}
* and readers</li>
* </ol>
*
* @since 4.1.0
*/
@Evolving
public interface SupportsPushDownVariants extends Scan {

/**
* Pushes down variant field access information to the data source.
* <p>
 * Implementations should validate whether the variant accesses can be pushed down based on
* the data source's capabilities. If some accesses cannot be pushed down, the implementation
* can choose to:
* <ul>
* <li>Push down only the supported accesses and return true</li>
* <li>Reject all pushdown and return false</li>
* </ul>
* <p>
* The implementation should store the variant access information that can be pushed down.
* The stored information will be retrieved later via {@link #pushedVariantAccess()}.
*
* @param variantAccessInfo Array of variant access information, one per variant column
* @return true if at least some variant accesses were pushed down, false if none were pushed
*/
boolean pushVariantAccess(VariantAccessInfo[] variantAccessInfo);

/**
* Returns the variant access information that has been pushed down to this scan.
* <p>
* This method is called by the optimizer after {@link #pushVariantAccess} to retrieve
* what variant accesses were actually accepted by the data source. The optimizer uses
* this information to rewrite the query plan.
* <p>
* If {@link #pushVariantAccess} was not called or returned false, this should return
* an empty array.
*
* @return Array of pushed down variant access information
*/
VariantAccessInfo[] pushedVariantAccess();
}
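
A minimal sketch of how a connector might implement this interface, following the five-step workflow in the class comment. The MyScan class, its constructor, and the column-name check are hypothetical illustrations, not part of this PR; only SupportsPushDownVariants, VariantAccessInfo, and Scan's readSchema() come from the API above.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.connector.read.SupportsPushDownVariants;
import org.apache.spark.sql.connector.read.VariantAccessInfo;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Hypothetical scan that accepts pushdown only for variant columns it recognizes.
public class MyScan implements SupportsPushDownVariants {
  private final StructType fullSchema;
  private VariantAccessInfo[] pushed = new VariantAccessInfo[0];

  public MyScan(StructType fullSchema) {
    this.fullSchema = fullSchema;
  }

  @Override
  public boolean pushVariantAccess(VariantAccessInfo[] variantAccessInfo) {
    // Step 3: keep only the accesses this source can serve; reject the rest.
    List<VariantAccessInfo> accepted = new ArrayList<>();
    for (VariantAccessInfo info : variantAccessInfo) {
      if (Arrays.asList(fullSchema.fieldNames()).contains(info.columnName())) {
        accepted.add(info);
      }
    }
    pushed = accepted.toArray(new VariantAccessInfo[0]);
    return pushed.length > 0;
  }

  @Override
  public VariantAccessInfo[] pushedVariantAccess() {
    // Step 4: report what was accepted; empty if nothing was pushed,
    // as the contract above requires.
    return pushed;
  }

  @Override
  public StructType readSchema() {
    // Step 5: substitute each pushed variant column's type with its extracted
    // struct so readers materialize only the accessed fields.
    StructType result = new StructType();
    for (StructField f : fullSchema.fields()) {
      StructType extracted = null;
      for (VariantAccessInfo info : pushed) {
        if (info.columnName().equals(f.name())) {
          extracted = info.extractedSchema();
        }
      }
      result = result.add(f.name(), extracted != null ? extracted : f.dataType());
    }
    return result;
  }
}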
VariantAccessInfo.java
@@ -0,0 +1,105 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.connector.read;

import java.io.Serializable;
import java.util.Objects;

import org.apache.spark.annotation.Evolving;
import org.apache.spark.sql.types.StructType;

/**
* Variant access information that describes how variant fields are accessed in a query.
* <p>
* This class captures the information needed by data sources to optimize reading variant columns.
* Instead of reading the entire variant value, the data source can read only the fields that
* are actually accessed, represented as a structured schema.
* <p>
 * For example, if a query accesses {@code variant_get(v, '$.a', 'int')} and
 * {@code variant_get(v, '$.b', 'string')}, the extracted schema would be
 * {@code struct<0:int, 1:string>} where field ordinals correspond to the access order.
*
* @since 4.1.0
*/
@Evolving
public final class VariantAccessInfo implements Serializable {
Review comment (Contributor): this can be a java record?
Reply (Member, author): Looks okay. Is it necessary though?

private final String columnName;
private final StructType extractedSchema;

/**
* Creates variant access information for a variant column.
*
* @param columnName The name of the variant column
* @param extractedSchema The schema representing extracted fields from the variant.
* Each field represents one variant field access, with field names
* typically being ordinals (e.g., "0", "1", "2") and metadata
* containing variant-specific information like JSON path.
*/
public VariantAccessInfo(String columnName, StructType extractedSchema) {
this.columnName = Objects.requireNonNull(columnName, "columnName cannot be null");
this.extractedSchema =
Objects.requireNonNull(extractedSchema, "extractedSchema cannot be null");
}

/**
* Returns the name of the variant column.
*/
public String columnName() {
return columnName;
}

/**
* Returns the schema representing fields extracted from the variant column.
* <p>
* The schema structure is:
* <ul>
* <li>Field names: Typically ordinals ("0", "1", "2", ...) representing access order</li>
* <li>Field types: The target data type for each field extraction</li>
* <li>Field metadata: Contains variant-specific information such as JSON path,
* timezone, and error handling mode</li>
* </ul>
* <p>
* Data sources should use this schema to determine what fields to extract from the variant
* and what types they should be converted to.
*/
public StructType extractedSchema() {
return extractedSchema;
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
VariantAccessInfo that = (VariantAccessInfo) o;
return columnName.equals(that.columnName) &&
extractedSchema.equals(that.extractedSchema);
}

@Override
public int hashCode() {
return Objects.hash(columnName, extractedSchema);
}

@Override
public String toString() {
return "VariantAccessInfo{" +
"columnName='" + columnName + '\'' +
", extractedSchema=" + extractedSchema +
'}';
}
}
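
To make the schema contract concrete, a small self-contained construction of the struct<0:int, 1:string> example from the class comment above. The field metadata (JSON path, timezone, error mode) is left empty here because its exact keys are not specified in this file.

import org.apache.spark.sql.connector.read.VariantAccessInfo;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class VariantAccessInfoExample {
  public static void main(String[] args) {
    // One field per access, named by ordinal in access order:
    // "0" <- variant_get(v, '$.a', 'int'), "1" <- variant_get(v, '$.b', 'string').
    StructType extracted = new StructType()
        .add("0", DataTypes.IntegerType)
        .add("1", DataTypes.StringType);

    VariantAccessInfo info = new VariantAccessInfo("v", extracted);
    System.out.println(info.columnName());      // v
    System.out.println(info.extractedSchema()); // StructType with fields 0:int, 1:string
  }
}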
SparkOptimizer.scala
@@ -40,11 +40,11 @@ class SparkOptimizer(
       SchemaPruning,
       GroupBasedRowLevelOperationScanPlanning,
       V1Writes,
-      PushVariantIntoScan,
       V2ScanRelationPushDown,
       V2ScanPartitioningAndOrdering,
       V2Writes,
-      PruneFileSourcePartitions)
+      PruneFileSourcePartitions,
+      PushVariantIntoScan)
 
   override def preCBORules: Seq[Rule[LogicalPlan]] =
     Seq(OptimizeMetadataOnlyDeleteFromTable)