feat: auto scan mode should check for supported file location (#1930)

andygrove · web-flow · commit eab58d4925f2 · 2025-06-27T11:58:42.000-06:00
diff --git a/dev/diffs/3.4.3.diff b/dev/diffs/3.4.3.diff
@@ -2868,6 +2868,28 @@ index 52abd248f3a..7a199931a08 100644
        case h: HiveTableScanExec => h.partitionPruningPred.collect {
          case d: DynamicPruningExpression => d.child
        }
+diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+index de3b1ffccf0..2a76d127093 100644
+--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
++++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+@@ -23,14 +23,15 @@ import java.util.concurrent.{Executors, TimeUnit}
+ import org.scalatest.BeforeAndAfterEach
+ 
+ import org.apache.spark.metrics.source.HiveCatalogMetrics
+-import org.apache.spark.sql.QueryTest
++import org.apache.spark.sql.{IgnoreCometSuite, QueryTest}
+ import org.apache.spark.sql.execution.datasources.FileStatusCache
+ import org.apache.spark.sql.hive.test.TestHiveSingleton
+ import org.apache.spark.sql.internal.SQLConf
+ import org.apache.spark.sql.test.SQLTestUtils
+ 
+ class PartitionedTablePerfStatsSuite
+-  extends QueryTest with TestHiveSingleton with SQLTestUtils with BeforeAndAfterEach {
++  extends QueryTest with TestHiveSingleton with SQLTestUtils with BeforeAndAfterEach
++    with IgnoreCometSuite {
+ 
+   override def beforeEach(): Unit = {
+     super.beforeEach()
 diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
 index a902cb3a69e..800a3acbe99 100644
 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
diff --git a/dev/diffs/3.5.6.diff b/dev/diffs/3.5.6.diff
@@ -2883,6 +2883,28 @@ index 549431ef4f4..e48f1730da6 100644
      withTempDir { dir =>
        withSQLConf(
          "parquet.crypto.factory.class" ->
+diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+index de3b1ffccf0..2a76d127093 100644
+--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
++++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+@@ -23,14 +23,15 @@ import java.util.concurrent.{Executors, TimeUnit}
+ import org.scalatest.BeforeAndAfterEach
+ 
+ import org.apache.spark.metrics.source.HiveCatalogMetrics
+-import org.apache.spark.sql.QueryTest
++import org.apache.spark.sql.{IgnoreCometSuite, QueryTest}
+ import org.apache.spark.sql.execution.datasources.FileStatusCache
+ import org.apache.spark.sql.hive.test.TestHiveSingleton
+ import org.apache.spark.sql.internal.SQLConf
+ import org.apache.spark.sql.test.SQLTestUtils
+ 
+ class PartitionedTablePerfStatsSuite
+-  extends QueryTest with TestHiveSingleton with SQLTestUtils with BeforeAndAfterEach {
++  extends QueryTest with TestHiveSingleton with SQLTestUtils with BeforeAndAfterEach
++    with IgnoreCometSuite {
+ 
+   override def beforeEach(): Unit = {
+     super.beforeEach()
 diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
 index 6160c3e5f6c..0956d7d9edc 100644
 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
diff --git a/dev/diffs/4.0.0-preview1.diff b/dev/diffs/4.0.0-preview1.diff
@@ -3079,6 +3079,28 @@ index 52abd248f3a..7a199931a08 100644
        case h: HiveTableScanExec => h.partitionPruningPred.collect {
          case d: DynamicPruningExpression => d.child
        }
+diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+index de3b1ffccf0..2a76d127093 100644
+--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
++++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+@@ -23,14 +23,15 @@ import java.util.concurrent.{Executors, TimeUnit}
+ import org.scalatest.BeforeAndAfterEach
+ 
+ import org.apache.spark.metrics.source.HiveCatalogMetrics
+-import org.apache.spark.sql.QueryTest
++import org.apache.spark.sql.{IgnoreCometSuite, QueryTest}
+ import org.apache.spark.sql.execution.datasources.FileStatusCache
+ import org.apache.spark.sql.hive.test.TestHiveSingleton
+ import org.apache.spark.sql.internal.SQLConf
+ import org.apache.spark.sql.test.SQLTestUtils
+ 
+ class PartitionedTablePerfStatsSuite
+-  extends QueryTest with TestHiveSingleton with SQLTestUtils with BeforeAndAfterEach {
++  extends QueryTest with TestHiveSingleton with SQLTestUtils with BeforeAndAfterEach
++    with IgnoreCometSuite {
+ 
+   override def beforeEach(): Unit = {
+     super.beforeEach()
 diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
 index 0bcac639443..8957c76886f 100644
 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
@@ -258,11 +258,15 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] {
   }
 
   private def selectScan(scanExec: FileSourceScanExec, partitionSchema: StructType): String = {
-    // TODO these checks are not yet exhaustive. For example, native_iceberg_compat does
-    //  not support reading from S3
 
     val fallbackReasons = new ListBuffer[String]()
 
+    // native_iceberg_compat only supports the local filesystem (file://) and S3 via s3a://
+    if (!scanExec.relation.inputFiles
+        .forall(path => path.startsWith("file://") || path.startsWith("s3a://"))) {
+      fallbackReasons += s"$SCAN_NATIVE_ICEBERG_COMPAT only supports local filesystem and S3"
+    }
+
     val typeChecker = CometScanTypeChecker(SCAN_NATIVE_ICEBERG_COMPAT)
     val schemaSupported =
       typeChecker.isSchemaSupported(scanExec.requiredSchema, fallbackReasons)
@@ -297,7 +301,8 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] {
       fallbackReasons += s"$SCAN_NATIVE_ICEBERG_COMPAT requires ${COMET_EXEC_ENABLED.key}=true"
     }
 
-    if (cometExecEnabled && schemaSupported && partitionSchemaSupported && !knownIssues) {
+    if (cometExecEnabled && schemaSupported && partitionSchemaSupported && !knownIssues &&
+      fallbackReasons.isEmpty) {
       logInfo(s"Auto scan mode selecting $SCAN_NATIVE_ICEBERG_COMPAT")
       SCAN_NATIVE_ICEBERG_COMPAT
     } else {
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala
@@ -74,6 +74,8 @@ case class CometScanExec(
     with ShimCometScanExec
     with CometPlan {
 
+  assert(scanImpl != CometConf.SCAN_AUTO)
+
   // FIXME: ideally we should reuse wrapped.supportsColumnar, however that fails many tests
   override lazy val supportsColumnar: Boolean =
     relation.fileFormat.supportBatch(relation.sparkSession, schema)