diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index b3236bbfa3755..298658a95627f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -743,10 +743,11 @@ object SupportedBinaryExpr { object LikeSimplification extends Rule[LogicalPlan] with PredicateHelper { // if guards below protect from escapes on trailing %. // Cases like "something\%" are not optimized, but this does not affect correctness. - private val startsWith = "([^_%]+)%".r - private val endsWith = "%([^_%]+)".r - private val startsAndEndsWith = "([^_%]+)%([^_%]+)".r - private val contains = "%([^_%]+)%".r + // Consecutive '%' wildcards are equivalent to a single '%'; this does not hold for '_'. + private val startsWith = "([^_%]+)%+".r + private val endsWith = "%+([^_%]+)".r + private val startsAndEndsWith = "([^_%]+)%+([^_%]+)".r + private val contains = "%+([^_%]+)%+".r private val equalTo = "([^_%]*)".r private def simplifyLike( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala index 992170dbc0d24..e8cb2b2dd8b06 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala @@ -165,6 +165,54 @@ class LikeSimplificationSuite extends PlanTest { comparePlans(optimized5, correctAnswer5) } + test("SPARK-52817: Spark SQL LIKE expressions show poor performance when using multiple '%'") { + val originalQuery1 = + testRelation + .where($"a" like "abc%%") + val optimized1 = Optimize.execute(originalQuery1.analyze) + val correctAnswer1 
= testRelation + .where(StartsWith($"a", "abc")) + .analyze + comparePlans(optimized1, correctAnswer1) + + val originalQuery2 = + testRelation + .where($"a" like "%%xyz") + val optimized2 = Optimize.execute(originalQuery2.analyze) + val correctAnswer2 = testRelation + .where(EndsWith($"a", "xyz")) + .analyze + comparePlans(optimized2, correctAnswer2) + + val originalQuery3 = + testRelation + .where($"a" like "abc%%def") + val optimized3 = Optimize.execute(originalQuery3.analyze) + val correctAnswer3 = testRelation + .where( + (Length($"a") >= 6 && (StartsWith($"a", "abc") && EndsWith($"a", "def")))) + .analyze + comparePlans(optimized3, correctAnswer3) + + val originalQuery4 = + testRelation + .where(($"a" like "%%mn%%")) + val optimized4 = Optimize.execute(originalQuery4.analyze) + val correctAnswer4 = testRelation + .where(Contains($"a", "mn")) + .analyze + comparePlans(optimized4, correctAnswer4) + + val originalQuery5 = + testRelation + .where(($"a" like "%%%mn%%%")) + val optimized5 = Optimize.execute(originalQuery5.analyze) + val correctAnswer5 = testRelation + .where(Contains($"a", "mn")) + .analyze + comparePlans(optimized5, correctAnswer5) + } + test("simplify LikeAll") { val originalQuery = testRelation