
Commit 1f1bacc

davidm-db and cloud-fan committed
[SPARK-53143][SQL] Fix self join in DataFrame API - Join is not the only expected output from analyzer
### What changes were proposed in this pull request?

`Dataset.resolveSelfJoinCondition` expects the analyzer output to always be of `Join` type, which is not correct: there are edge cases where the analyzer produces a plan with `Project` as the top node. This PR fixes failures in such cases. Check [SPARK-53143](https://issues.apache.org/jira/browse/SPARK-53143) for more details.

### Why are the changes needed?

Bug fix.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Manually in the shell, plus a unit test covering the problematic case.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #51873 from davidm-db/spark-53143.

Lead-authored-by: David Milicevic <[email protected]>
Co-authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 0c5797a · commit 1f1bacc

File tree: 2 files changed, +73 −25 lines
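Some orientation before the diff: the plan shape the old cast missed is easy to observe on its own, because the analyzer's `ResolveNaturalAndUsingJoin` rule rewrites a `USING` join into a `Project` over a `Join`, so the analyzed plan's top node is not a `Join`. A minimal spark-shell sketch (an illustration written for this page, not code from the patch; assumes `spark` and `spark.implicits._` are in scope):

```scala
val a = Seq(1, 2).toDF("id")
val b = Seq(2, 3).toDF("id")

// A "using" join: the analyzer rewrites it to Project(output, Join(...)),
// so the top node of the analyzed plan is a Project, not a Join.
val joined = a.join(b, usingColumns = Seq("id"))
println(joined.queryExecution.analyzed.getClass.getSimpleName)  // Project
```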

sql/core/src/main/scala/org/apache/spark/sql/classic/Dataset.scala

Lines changed: 47 additions & 25 deletions
```diff
@@ -28,7 +28,7 @@ import scala.util.control.NonFatal
 
 import org.apache.commons.text.StringEscapeUtils
 
-import org.apache.spark.{sql, TaskContext}
+import org.apache.spark.{sql, SparkException, TaskContext}
 import org.apache.spark.annotation.{DeveloperApi, Stable, Unstable}
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.java.function._
```
```diff
@@ -649,7 +649,7 @@ class Dataset[T] private[sql](
   private def resolveSelfJoinCondition(
       right: Dataset[_],
       joinExprs: Option[Column],
-      joinType: String): Join = {
+      joinType: String): LogicalPlan = {
     // Note that in this function, we introduce a hack in the case of self-join to automatically
     // resolve ambiguous join conditions into ones that might make sense [SPARK-6231].
     // Consider this case: df.join(df, df("key") === df("key"))
```
```diff
@@ -660,28 +660,40 @@
 
     // Trigger analysis so in the case of self-join, the analyzer will clone the plan.
     // After the cloning, left and right side will have distinct expression ids.
-    val plan = withPlan(
-      Join(logicalPlan, right.logicalPlan,
-        JoinType(joinType), joinExprs.map(_.expr), JoinHint.NONE))
-      .queryExecution.analyzed.asInstanceOf[Join]
+    val planToAnalyze = Join(
+      logicalPlan, right.logicalPlan, JoinType(joinType), joinExprs.map(_.expr), JoinHint.NONE)
+    val analyzedJoinPlan = withPlan(planToAnalyze).queryExecution.analyzed
 
     // If auto self join alias is disabled, return the plan.
     if (!sparkSession.sessionState.conf.dataFrameSelfJoinAutoResolveAmbiguity) {
-      return plan
+      return analyzedJoinPlan
     }
 
     // If left/right have no output set intersection, return the plan.
     val lanalyzed = this.queryExecution.analyzed
     val ranalyzed = right.queryExecution.analyzed
     if (lanalyzed.outputSet.intersect(ranalyzed.outputSet).isEmpty) {
-      return plan
+      return analyzedJoinPlan
     }
 
     // Otherwise, find the trivially true predicates and automatically resolves them to both sides.
     // By the time we get here, since we have already run analysis, all attributes should've been
     // resolved and become AttributeReference.
-
-    JoinWith.resolveSelfJoinCondition(sparkSession.sessionState.analyzer.resolver, plan)
+    analyzedJoinPlan match {
+      case project @ Project(_, join: Join) =>
+        // SPARK-53143: Handle the edge case where the `AddMetadataColumns` analyzer rule adds a
+        // `Project` node on top of the `Join` node.
+        // See "SPARK-53143: self join edge-case when Join is not returned by the analyzer" in
+        // `DataFrameSelfJoinSuite` for more details.
+        val newProject = project.copy(child = JoinWith.resolveSelfJoinCondition(
+          sparkSession.sessionState.analyzer.resolver, join))
+        newProject.copyTagsFrom(project)
+        newProject
+      case join: Join =>
+        JoinWith.resolveSelfJoinCondition(sparkSession.sessionState.analyzer.resolver, join)
+      case _ => throw SparkException.internalError(
+        s"Unexpected plan type: ${analyzedJoinPlan.getClass.getName} for self join resolution.")
+    }
   }
 
   /** @inheritdoc */
```
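One detail worth noting in the `Project` branch above: `project.copy(...)` produces a fresh node, and Catalyst tree tags (e.g. the hidden-output attributes the analyzer records on a using join's `Project`) do not travel through a case-class `copy`, hence the explicit `copyTagsFrom(project)`. A small self-contained sketch of that behavior (a demo against Catalyst internals written for this page, not code from the patch; requires spark-catalyst on the classpath):

```scala
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Project}
import org.apache.spark.sql.catalyst.trees.TreeNodeTag

object CopyTagsDemo extends App {
  val tag = TreeNodeTag[String]("demo")

  val project = Project(Seq(Literal(1).as("a")), LocalRelation())
  project.setTagValue(tag, "metadata the analyzer attached")

  val copied = project.copy()        // tags live outside the constructor args...
  println(copied.getTagValue(tag))   // ...so this prints None

  copied.copyTagsFrom(project)       // what the patch does explicitly
  println(copied.getTagValue(tag))   // Some(metadata the analyzer attached)
}
```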
```diff
@@ -781,28 +793,38 @@
       tolerance: Column,
       allowExactMatches: Boolean,
       direction: String): DataFrame = {
-    val joined = resolveSelfJoinCondition(other, Option(joinExprs), joinType)
-    val leftAsOfExpr = leftAsOf.expr.transformUp {
-      case a: AttributeReference if logicalPlan.outputSet.contains(a) =>
-        val index = logicalPlan.output.indexWhere(_.exprId == a.exprId)
-        joined.left.output(index)
-    }
-    val rightAsOfExpr = rightAsOf.expr.transformUp {
-      case a: AttributeReference if other.logicalPlan.outputSet.contains(a) =>
-        val index = other.logicalPlan.output.indexWhere(_.exprId == a.exprId)
-        joined.right.output(index)
-    }
-    withPlan {
+
+    def createAsOfJoinPlan(joinPlan: Join): AsOfJoin = {
+      val leftAsOfExpr = leftAsOf.expr.transformUp {
+        case a: AttributeReference if logicalPlan.outputSet.contains(a) =>
+          val index = logicalPlan.output.indexWhere(_.exprId == a.exprId)
+          joinPlan.left.output(index)
+      }
+      val rightAsOfExpr = rightAsOf.expr.transformUp {
+        case a: AttributeReference if other.logicalPlan.outputSet.contains(a) =>
+          val index = other.logicalPlan.output.indexWhere(_.exprId == a.exprId)
+          joinPlan.right.output(index)
+      }
       AsOfJoin(
-        joined.left, joined.right,
+        joinPlan.left, joinPlan.right,
         leftAsOfExpr, rightAsOfExpr,
-        joined.condition,
-        joined.joinType,
+        joinPlan.condition,
+        joinPlan.joinType,
         Option(tolerance).map(_.expr),
         allowExactMatches,
         AsOfJoinDirection(direction)
       )
     }
+
+    resolveSelfJoinCondition(other, Option(joinExprs), joinType) match {
+      case project @ Project(_, join: Join) =>
+        val newProjectPlan = project.copy(child = createAsOfJoinPlan(join))
+        newProjectPlan.copyTagsFrom(project)
+        withPlan { newProjectPlan }
+      case join: Join => withPlan { createAsOfJoinPlan(join) }
+      case plan => throw SparkException.internalError(
+        s"Unexpected plan type: ${plan.getClass.getName} returned from self join resolution.")
+    }
   }
 
   /** @inheritdoc */
```

sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala

Lines changed: 26 additions & 0 deletions
```diff
@@ -527,4 +527,30 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
       }
     }
   }
+
+  test("SPARK-53143: self join edge-case when Join is not returned by the analyzer") {
+    withTable("table_1", "table_2") {
+      // Edge case with multiple joins. Example: two joins, where the latter one is a self join.
+      // The first one is a "using" join - in this case, the analyzer's
+      // `ResolveNaturalAndUsingJoin` rule adds a `Project` as the top node.
+      // The second join is a self join with an explicit join condition (i.e. `joinExprs`) -
+      // if the join condition uses columns that are not part of the project list (of the first
+      // join), the `AddMetadataColumns` rule kicks in to add metadata for those columns. As a
+      // consequence, a `Project` is added on top of the joined plan to return the
+      // original/expected list of projected columns.
+      // While a similar shape (i.e. a `Project` node on top) can occur in multiple other cases,
+      // from the `Dataset` perspective the issue is specific to self joins, since
+      // `resolveSelfJoinCondition` assumed the analyzed plan would always be of `Join` type.
+      sql("CREATE TABLE IF NOT EXISTS table_1 (id INT);")
+      sql("INSERT INTO table_1 VALUES (1), (2);")
+      sql("CREATE TABLE IF NOT EXISTS table_2 (id INT, col_1 STRING);")
+      sql("INSERT INTO table_2 VALUES (1, 'str'), (2, 'test');")
+      val df = spark.table("table_2").where("col_1 = 'test'").select("id")
+      assert(
+        spark.table("table_1").alias("t")
+          .join(df.alias("df1"), usingColumns = Seq("id"))
+          .join(df.alias("df2"), joinExprs = $"df1.id" === $"df2.id", joinType = "left")
+          .count() == 1)
+    }
+  }
 }
```
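To see the shape the fix handles end to end, the test's join chain can be replayed in a shell and its analyzed plan inspected. A sketch, assuming `table_1` and `table_2` exist with the same data the test inserts, and `spark.implicits._` is in scope:

```scala
val df = spark.table("table_2").where("col_1 = 'test'").select("id")

// Before this patch, constructing `joined` would already throw from
// resolveSelfJoinCondition: the analyzed self-join plan was a Project over
// a Join, so the old `asInstanceOf[Join]` cast failed. With the patch:
val joined = spark.table("table_1").alias("t")
  .join(df.alias("df1"), usingColumns = Seq("id"))
  .join(df.alias("df2"), joinExprs = $"df1.id" === $"df2.id", joinType = "left")

println(joined.queryExecution.analyzed.treeString)  // expect a Project over the left-outer Join
println(joined.count())  // 1, matching the test's assertion
```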
