From 2050ca6696a7b8123752d85f63b2f5379c22c6b7 Mon Sep 17 00:00:00 2001 From: Yikf Date: Thu, 7 Jul 2022 14:56:26 +0800 Subject: [PATCH 1/2] Support url encode/decode function --- .../catalyst/expressions/ExpressionInfo.java | 2 +- .../catalyst/analysis/FunctionRegistry.scala | 6 +- .../expressions/stringExpressions.scala | 177 ----------- .../catalyst/expressions/urlExpressions.scala | 290 ++++++++++++++++++ .../sql-functions/sql-expression-schema.md | 2 + .../sql-tests/inputs/url-functions.sql | 19 ++ .../sql-tests/results/url-functions.sql.out | 111 +++++++ .../spark/sql/StringFunctionsSuite.scala | 47 --- .../apache/spark/sql/UrlFunctionsSuite.scala | 85 +++++ 9 files changed, 513 insertions(+), 226 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala create mode 100644 sql/core/src/test/resources/sql-tests/inputs/url-functions.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/url-functions.sql.out create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java index 9ed764a348503..be2b3dbe81990 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java @@ -45,7 +45,7 @@ public class ExpressionInfo { "collection_funcs", "predicate_funcs", "conditional_funcs", "conversion_funcs", "csv_funcs", "datetime_funcs", "generator_funcs", "hash_funcs", "json_funcs", "lambda_funcs", "map_funcs", "math_funcs", "misc_funcs", "string_funcs", "struct_funcs", - "window_funcs", "xml_funcs", "table_funcs")); + "window_funcs", "xml_funcs", "table_funcs", "url_funcs")); private static final Set validSources = new HashSet<>(Arrays.asList("built-in", "hive", "python_udf", "scala_udf", "java_udf")); diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 52d84cfa17504..f25f6d3060f35 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -544,7 +544,6 @@ object FunctionRegistry { expressionBuilder("lpad", LPadExpressionBuilder), expression[StringTrimLeft]("ltrim"), expression[JsonTuple]("json_tuple"), - expression[ParseUrl]("parse_url"), expression[StringLocate]("position", true), expression[FormatString]("printf", true), expression[RegExpExtract]("regexp_extract"), @@ -586,6 +585,11 @@ object FunctionRegistry { expression[XPathString]("xpath_string"), expression[RegExpCount]("regexp_count"), + // url functions + expression[UrlEncode]("url_encode"), + expression[UrlDecode]("url_decode"), + expression[ParseUrl]("parse_url"), + // datetime functions expression[AddMonths]("add_months"), expression[CurrentDate]("current_date"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index a4c5af582fae8..bc24a12f08335 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -17,11 +17,9 @@ package org.apache.spark.sql.catalyst.expressions -import java.net.{URI, URISyntaxException} import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols} import java.util.{Base64 => JBase64} import java.util.{HashMap, Locale, Map => JMap} -import java.util.regex.Pattern import scala.collection.mutable.ArrayBuffer @@ -1626,181 +1624,6 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression = Litera copy(str = newFirst, len = newSecond, pad = newThird) } -object ParseUrl { - private val HOST = UTF8String.fromString("HOST") - private val PATH = UTF8String.fromString("PATH") - private val QUERY = UTF8String.fromString("QUERY") - private val REF = UTF8String.fromString("REF") - private val PROTOCOL = UTF8String.fromString("PROTOCOL") - private val FILE = UTF8String.fromString("FILE") - private val AUTHORITY = UTF8String.fromString("AUTHORITY") - private val USERINFO = UTF8String.fromString("USERINFO") - private val REGEXPREFIX = "(&|^)" - private val REGEXSUBFIX = "=([^&]*)" -} - -/** - * Extracts a part from a URL - */ -@ExpressionDescription( - usage = "_FUNC_(url, partToExtract[, key]) - Extracts a part from a URL.", - examples = """ - Examples: - > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST'); - spark.apache.org - > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY'); - query=1 - > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query'); - 1 - """, - since = "2.0.0", - group = "string_funcs") -case class ParseUrl(children: Seq[Expression], failOnError: Boolean = SQLConf.get.ansiEnabled) - extends Expression with ExpectsInputTypes with CodegenFallback { - def this(children: Seq[Expression]) = this(children, SQLConf.get.ansiEnabled) - - override def nullable: Boolean = true - override def inputTypes: Seq[DataType] = Seq.fill(children.size)(StringType) - override def dataType: DataType = StringType - override def prettyName: String = "parse_url" - - // If the url is a constant, cache the URL object so that we don't need to convert url - // from UTF8String to String to URL for every row. - @transient private lazy val cachedUrl = children(0) match { - case Literal(url: UTF8String, _) if url ne null => getUrl(url) - case _ => null - } - - // If the key is a constant, cache the Pattern object so that we don't need to convert key - // from UTF8String to String to StringBuilder to String to Pattern for every row. - @transient private lazy val cachedPattern = children(2) match { - case Literal(key: UTF8String, _) if key ne null => getPattern(key) - case _ => null - } - - // If the partToExtract is a constant, cache the Extract part function so that we don't need - // to check the partToExtract for every row. 
- @transient private lazy val cachedExtractPartFunc = children(1) match { - case Literal(part: UTF8String, _) => getExtractPartFunc(part) - case _ => null - } - - import ParseUrl._ - - override def checkInputDataTypes(): TypeCheckResult = { - if (children.size > 3 || children.size < 2) { - TypeCheckResult.TypeCheckFailure(s"$prettyName function requires two or three arguments") - } else { - super[ExpectsInputTypes].checkInputDataTypes() - } - } - - private def getPattern(key: UTF8String): Pattern = { - Pattern.compile(REGEXPREFIX + key.toString + REGEXSUBFIX) - } - - private def getUrl(url: UTF8String): URI = { - try { - new URI(url.toString) - } catch { - case e: URISyntaxException if failOnError => - throw QueryExecutionErrors.invalidUrlError(url, e) - case _: URISyntaxException => null - } - } - - private def getExtractPartFunc(partToExtract: UTF8String): URI => String = { - - // partToExtract match { - // case HOST => _.toURL().getHost - // case PATH => _.toURL().getPath - // case QUERY => _.toURL().getQuery - // case REF => _.toURL().getRef - // case PROTOCOL => _.toURL().getProtocol - // case FILE => _.toURL().getFile - // case AUTHORITY => _.toURL().getAuthority - // case USERINFO => _.toURL().getUserInfo - // case _ => (url: URI) => null - // } - - partToExtract match { - case HOST => _.getHost - case PATH => _.getRawPath - case QUERY => _.getRawQuery - case REF => _.getRawFragment - case PROTOCOL => _.getScheme - case FILE => - (url: URI) => - if (url.getRawQuery ne null) { - url.getRawPath + "?" + url.getRawQuery - } else { - url.getRawPath - } - case AUTHORITY => _.getRawAuthority - case USERINFO => _.getRawUserInfo - case _ => (url: URI) => null - } - } - - private def extractValueFromQuery(query: UTF8String, pattern: Pattern): UTF8String = { - val m = pattern.matcher(query.toString) - if (m.find()) { - UTF8String.fromString(m.group(2)) - } else { - null - } - } - - private def extractFromUrl(url: URI, partToExtract: UTF8String): UTF8String = { - if (cachedExtractPartFunc ne null) { - UTF8String.fromString(cachedExtractPartFunc.apply(url)) - } else { - UTF8String.fromString(getExtractPartFunc(partToExtract).apply(url)) - } - } - - private def parseUrlWithoutKey(url: UTF8String, partToExtract: UTF8String): UTF8String = { - if (cachedUrl ne null) { - extractFromUrl(cachedUrl, partToExtract) - } else { - val currentUrl = getUrl(url) - if (currentUrl ne null) { - extractFromUrl(currentUrl, partToExtract) - } else { - null - } - } - } - - override def eval(input: InternalRow): Any = { - val evaluated = children.map{e => e.eval(input).asInstanceOf[UTF8String]} - if (evaluated.contains(null)) return null - if (evaluated.size == 2) { - parseUrlWithoutKey(evaluated(0), evaluated(1)) - } else { - // 3-arg, i.e. 
QUERY with key - assert(evaluated.size == 3) - if (evaluated(1) != QUERY) { - return null - } - - val query = parseUrlWithoutKey(evaluated(0), evaluated(1)) - if (query eq null) { - return null - } - - if (cachedPattern ne null) { - extractValueFromQuery(query, cachedPattern) - } else { - extractValueFromQuery(query, getPattern(evaluated(2))) - } - } - } - - override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): ParseUrl = - copy(children = newChildren) -} - /** * Returns the input formatted according do printf-style format strings */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala new file mode 100644 index 0000000000000..bbbed3130ca90 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala @@ -0,0 +1,290 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import java.net.{URI, URISyntaxException, URLDecoder, URLEncoder} +import java.util.regex.Pattern + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke +import org.apache.spark.sql.catalyst.trees.UnaryLike +import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType} +import org.apache.spark.unsafe.types.UTF8String + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(str) - Translates a string into {@code application/x-www-form-urlencoded} format using a specific encoding scheme. 
+ """, + arguments = """ + Arguments: + str - a string expression to be translated + """, + examples = """ + Examples: + > SELECT _FUNC_('https://spark.apache.org'); + https%3A%2F%2Fspark.apache.org + """, + since = "3.4.0", + group = "url_funcs") +// scalastyle:on line.size.limit +case class UrlEncode(child: Expression) + extends RuntimeReplaceable with UnaryLike[Expression] with ImplicitCastInputTypes { + + override def replacement: Expression = + StaticInvoke( + UrlCodec.getClass, + StringType, + "encode", + Seq(child, Literal("UTF-8")), + Seq(StringType)) + + override protected def withNewChildInternal(newChild: Expression): Expression = { + copy(child = newChild) + } + + override def inputTypes: Seq[AbstractDataType] = Seq(StringType) + + override def prettyName: String = "url_encode" +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(str) - Decodes a {@code application/x-www-form-urlencoded} `str`. + """, + arguments = """ + Arguments: + * str - a string expression to decode + """, + examples = """ + Examples: + > SELECT _FUNC_('https%3A%2F%2Fspark.apache.org'); + https://spark.apache.org + """, + since = "3.4.0", + group = "url_funcs") +// scalastyle:on line.size.limit +case class UrlDecode(child: Expression) + extends RuntimeReplaceable with UnaryLike[Expression] with ImplicitCastInputTypes { + + override def replacement: Expression = + StaticInvoke( + UrlCodec.getClass, + StringType, + "decode", + Seq(child, Literal("UTF-8")), + Seq(StringType)) + + override protected def withNewChildInternal(newChild: Expression): Expression = { + copy(child = newChild) + } + + override def inputTypes: Seq[AbstractDataType] = Seq(StringType) + + override def prettyName: String = "url_decode" +} + +object UrlCodec { + def encode(src: UTF8String, enc: UTF8String): UTF8String = { + UTF8String.fromString(URLEncoder.encode(src.toString, enc.toString)) + } + + def decode(src: UTF8String, enc: UTF8String): UTF8String = { + UTF8String.fromString(URLDecoder.decode(src.toString, enc.toString)) + } +} + +object ParseUrl { + private val HOST = UTF8String.fromString("HOST") + private val PATH = UTF8String.fromString("PATH") + private val QUERY = UTF8String.fromString("QUERY") + private val REF = UTF8String.fromString("REF") + private val PROTOCOL = UTF8String.fromString("PROTOCOL") + private val FILE = UTF8String.fromString("FILE") + private val AUTHORITY = UTF8String.fromString("AUTHORITY") + private val USERINFO = UTF8String.fromString("USERINFO") + private val REGEXPREFIX = "(&|^)" + private val REGEXSUBFIX = "=([^&]*)" +} + +/** + * Extracts a part from a URL + */ +@ExpressionDescription( + usage = "_FUNC_(url, partToExtract[, key]) - Extracts a part from a URL.", + examples = """ + Examples: + > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST'); + spark.apache.org + > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY'); + query=1 + > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query'); + 1 + """, + since = "2.0.0", + group = "url_funcs") +case class ParseUrl(children: Seq[Expression], failOnError: Boolean = SQLConf.get.ansiEnabled) + extends Expression with ExpectsInputTypes with CodegenFallback { + def this(children: Seq[Expression]) = this(children, SQLConf.get.ansiEnabled) + + override def nullable: Boolean = true + override def inputTypes: Seq[DataType] = Seq.fill(children.size)(StringType) + override def dataType: DataType = StringType + override def prettyName: String = "parse_url" + + // If the url is a constant, cache 
the URL object so that we don't need to convert url + // from UTF8String to String to URL for every row. + @transient private lazy val cachedUrl = children(0) match { + case Literal(url: UTF8String, _) if url ne null => getUrl(url) + case _ => null + } + + // If the key is a constant, cache the Pattern object so that we don't need to convert key + // from UTF8String to String to StringBuilder to String to Pattern for every row. + @transient private lazy val cachedPattern = children(2) match { + case Literal(key: UTF8String, _) if key ne null => getPattern(key) + case _ => null + } + + // If the partToExtract is a constant, cache the Extract part function so that we don't need + // to check the partToExtract for every row. + @transient private lazy val cachedExtractPartFunc = children(1) match { + case Literal(part: UTF8String, _) => getExtractPartFunc(part) + case _ => null + } + + import ParseUrl._ + + override def checkInputDataTypes(): TypeCheckResult = { + if (children.size > 3 || children.size < 2) { + TypeCheckResult.TypeCheckFailure(s"$prettyName function requires two or three arguments") + } else { + super[ExpectsInputTypes].checkInputDataTypes() + } + } + + private def getPattern(key: UTF8String): Pattern = { + Pattern.compile(REGEXPREFIX + key.toString + REGEXSUBFIX) + } + + private def getUrl(url: UTF8String): URI = { + try { + new URI(url.toString) + } catch { + case e: URISyntaxException if failOnError => + throw QueryExecutionErrors.invalidUrlError(url, e) + case _: URISyntaxException => null + } + } + + private def getExtractPartFunc(partToExtract: UTF8String): URI => String = { + + // partToExtract match { + // case HOST => _.toURL().getHost + // case PATH => _.toURL().getPath + // case QUERY => _.toURL().getQuery + // case REF => _.toURL().getRef + // case PROTOCOL => _.toURL().getProtocol + // case FILE => _.toURL().getFile + // case AUTHORITY => _.toURL().getAuthority + // case USERINFO => _.toURL().getUserInfo + // case _ => (url: URI) => null + // } + + partToExtract match { + case HOST => _.getHost + case PATH => _.getRawPath + case QUERY => _.getRawQuery + case REF => _.getRawFragment + case PROTOCOL => _.getScheme + case FILE => + (url: URI) => + if (url.getRawQuery ne null) { + url.getRawPath + "?" + url.getRawQuery + } else { + url.getRawPath + } + case AUTHORITY => _.getRawAuthority + case USERINFO => _.getRawUserInfo + case _ => (url: URI) => null + } + } + + private def extractValueFromQuery(query: UTF8String, pattern: Pattern): UTF8String = { + val m = pattern.matcher(query.toString) + if (m.find()) { + UTF8String.fromString(m.group(2)) + } else { + null + } + } + + private def extractFromUrl(url: URI, partToExtract: UTF8String): UTF8String = { + if (cachedExtractPartFunc ne null) { + UTF8String.fromString(cachedExtractPartFunc.apply(url)) + } else { + UTF8String.fromString(getExtractPartFunc(partToExtract).apply(url)) + } + } + + private def parseUrlWithoutKey(url: UTF8String, partToExtract: UTF8String): UTF8String = { + if (cachedUrl ne null) { + extractFromUrl(cachedUrl, partToExtract) + } else { + val currentUrl = getUrl(url) + if (currentUrl ne null) { + extractFromUrl(currentUrl, partToExtract) + } else { + null + } + } + } + + override def eval(input: InternalRow): Any = { + val evaluated = children.map{e => e.eval(input).asInstanceOf[UTF8String]} + if (evaluated.contains(null)) return null + if (evaluated.size == 2) { + parseUrlWithoutKey(evaluated(0), evaluated(1)) + } else { + // 3-arg, i.e. 
QUERY with key + assert(evaluated.size == 3) + if (evaluated(1) != QUERY) { + return null + } + + val query = parseUrlWithoutKey(evaluated(0), evaluated(1)) + if (query eq null) { + return null + } + + if (cachedPattern ne null) { + extractValueFromQuery(query, cachedPattern) + } else { + extractValueFromQuery(query, getPattern(evaluated(2))) + } + } + } + + override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): ParseUrl = + copy(children = newChildren) +} diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index e24ae5d0f7bf9..3bae88e23c3b0 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -330,6 +330,8 @@ | org.apache.spark.sql.catalyst.expressions.UnixTimestamp | unix_timestamp | SELECT unix_timestamp() | struct | | org.apache.spark.sql.catalyst.expressions.Upper | ucase | SELECT ucase('SparkSql') | struct | | org.apache.spark.sql.catalyst.expressions.Upper | upper | SELECT upper('SparkSql') | struct | +| org.apache.spark.sql.catalyst.expressions.UrlDecode | url_decode | SELECT url_decode('https%3A%2F%2Fspark.apache.org') | struct | +| org.apache.spark.sql.catalyst.expressions.UrlEncode | url_encode | SELECT url_encode('https://spark.apache.org') | struct | | org.apache.spark.sql.catalyst.expressions.Uuid | uuid | SELECT uuid() | struct | | org.apache.spark.sql.catalyst.expressions.WeekDay | weekday | SELECT weekday('2009-07-30') | struct | | org.apache.spark.sql.catalyst.expressions.WeekOfYear | weekofyear | SELECT weekofyear('2008-02-20') | struct | diff --git a/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql new file mode 100644 index 0000000000000..9f8af7eac7e33 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql @@ -0,0 +1,19 @@ +-- parse_url function +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'HOST'); +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'PATH'); +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'QUERY'); +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'REF'); +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'PROTOCOL'); +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'FILE'); +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'AUTHORITY'); +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'USERINFO'); + +-- url_encode function +select url_encode('https://spark.apache.org'); +select url_encode('inva lid://user:pass@host/file\\;param?query\\;p2'); +select url_encode(null); + +-- url_decode function +select url_decode('https%3A%2F%2Fspark.apache.org'); +select url_decode('inva lid://user:pass@host/file\\;param?query\\;p2'); +select url_decode(null); \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out new file mode 100644 index 0000000000000..fc714bfc41bff --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out @@ -0,0 +1,111 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'HOST') +-- !query schema +struct +-- !query 
output +spark.apache.org + + +-- !query +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'PATH') +-- !query schema +struct +-- !query output +/path + + +-- !query +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'QUERY') +-- !query schema +struct +-- !query output +query=1 + + +-- !query +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'REF') +-- !query schema +struct +-- !query output +Ref + + +-- !query +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'PROTOCOL') +-- !query schema +struct +-- !query output +http + + +-- !query +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'FILE') +-- !query schema +struct +-- !query output +/path?query=1 + + +-- !query +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'AUTHORITY') +-- !query schema +struct +-- !query output +userinfo@spark.apache.org + + +-- !query +select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'USERINFO') +-- !query schema +struct +-- !query output +userinfo + + +-- !query +select url_encode('https://spark.apache.org') +-- !query schema +struct +-- !query output +https%3A%2F%2Fspark.apache.org + + +-- !query +select url_encode('inva lid://user:pass@host/file\\;param?query\\;p2') +-- !query schema +struct +-- !query output +inva+lid%3A%2F%2Fuser%3Apass%40host%2Ffile%5C%3Bparam%3Fquery%5C%3Bp2 + + +-- !query +select url_encode(null) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select url_decode('https%3A%2F%2Fspark.apache.org') +-- !query schema +struct +-- !query output +https://spark.apache.org + + +-- !query +select url_decode('inva lid://user:pass@host/file\\;param?query\\;p2') +-- !query schema +struct +-- !query output +inva lid://user:pass@host/file\;param?query\;p2 + + +-- !query +select url_decode(null) +-- !query schema +struct +-- !query output +NULL diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 2f118f236e2c4..d07be9c19714e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -346,53 +346,6 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { Row("???hi", "hi???", "h", "h")) } - test("string parse_url function") { - - def testUrl(url: String, expected: Row): Unit = { - checkAnswer(Seq[String]((url)).toDF("url").selectExpr( - "parse_url(url, 'HOST')", "parse_url(url, 'PATH')", - "parse_url(url, 'QUERY')", "parse_url(url, 'REF')", - "parse_url(url, 'PROTOCOL')", "parse_url(url, 'FILE')", - "parse_url(url, 'AUTHORITY')", "parse_url(url, 'USERINFO')", - "parse_url(url, 'QUERY', 'query')"), expected) - } - - testUrl( - "http://userinfo@spark.apache.org/path?query=1#Ref", - Row("spark.apache.org", "/path", "query=1", "Ref", - "http", "/path?query=1", "userinfo@spark.apache.org", "userinfo", "1")) - - testUrl( - "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two", - Row("example.com", "/dir%20/pa%20th.HTML", "query=x%20y&q2=2", "Ref%20two", - "https", "/dir%20/pa%20th.HTML?query=x%20y&q2=2", "use%20r:pas%20s@example.com", - "use%20r:pas%20s", "x%20y")) - - testUrl( - "http://user:pass@host", - Row("host", "", null, null, "http", "", "user:pass@host", "user:pass", null)) - - testUrl( - "http://user:pass@host/", - Row("host", "/", null, null, "http", "/", "user:pass@host", 
"user:pass", null)) - - testUrl( - "http://user:pass@host/?#", - Row("host", "/", "", "", "http", "/?", "user:pass@host", "user:pass", null)) - - testUrl( - "http://user:pass@host/file;param?query;p2", - Row("host", "/file;param", "query;p2", null, "http", "/file;param?query;p2", - "user:pass@host", "user:pass", null)) - - withSQLConf(SQLConf.ANSI_ENABLED.key -> "false") { - testUrl( - "inva lid://user:pass@host/file;param?query;p2", - Row(null, null, null, null, null, null, null, null, null)) - } - - } - test("string repeat function") { val df = Seq(("hi", 2)).toDF("a", "b") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala new file mode 100644 index 0000000000000..85f0d70df7be0 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession + +class UrlFunctionsSuite extends QueryTest with SharedSparkSession { + import testImplicits._ + + test("url parse_url function") { + + def testUrl(url: String, expected: Row): Unit = { + checkAnswer(Seq[String]((url)).toDF("url").selectExpr( + "parse_url(url, 'HOST')", "parse_url(url, 'PATH')", + "parse_url(url, 'QUERY')", "parse_url(url, 'REF')", + "parse_url(url, 'PROTOCOL')", "parse_url(url, 'FILE')", + "parse_url(url, 'AUTHORITY')", "parse_url(url, 'USERINFO')", + "parse_url(url, 'QUERY', 'query')"), expected) + } + + testUrl( + "http://userinfo@spark.apache.org/path?query=1#Ref", + Row("spark.apache.org", "/path", "query=1", "Ref", + "http", "/path?query=1", "userinfo@spark.apache.org", "userinfo", "1")) + + testUrl( + "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two", + Row("example.com", "/dir%20/pa%20th.HTML", "query=x%20y&q2=2", "Ref%20two", + "https", "/dir%20/pa%20th.HTML?query=x%20y&q2=2", "use%20r:pas%20s@example.com", + "use%20r:pas%20s", "x%20y")) + + testUrl( + "http://user:pass@host", + Row("host", "", null, null, "http", "", "user:pass@host", "user:pass", null)) + + testUrl( + "http://user:pass@host/", + Row("host", "/", null, null, "http", "/", "user:pass@host", "user:pass", null)) + + testUrl( + "http://user:pass@host/?#", + Row("host", "/", "", "", "http", "/?", "user:pass@host", "user:pass", null)) + + testUrl( + "http://user:pass@host/file;param?query;p2", + Row("host", "/file;param", "query;p2", null, "http", "/file;param?query;p2", + "user:pass@host", "user:pass", null)) + + withSQLConf(SQLConf.ANSI_ENABLED.key -> "false") { + testUrl( + "inva lid://user:pass@host/file;param?query;p2", + Row(null, null, null, null, null, 
null, null, null, null)) + } + } + + test("url encode/decode function") { + def testUrl(url: String, fn: String, expected: Row): Unit = { + checkAnswer(Seq[String]((url)).toDF("url") + .selectExpr(s"$fn(url)"), expected) + } + + testUrl("https://spark.apache.org", "url_encode", Row("https%3A%2F%2Fspark.apache.org")) + testUrl("null", "url_encode", Row("null")) + + testUrl("https%3A%2F%2Fspark.apache.org", "url_decode", Row("https://spark.apache.org")) + testUrl("null", "url_decode", Row("null")) + } + +} From 61344dfcb1578c116d8fbdc814c49e830aeebc65 Mon Sep 17 00:00:00 2001 From: Yikf Date: Tue, 12 Jul 2022 17:00:05 +0800 Subject: [PATCH 2/2] fix comment --- .../spark/sql/catalyst/expressions/urlExpressions.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala index bbbed3130ca90..174e60371af68 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala @@ -33,7 +33,7 @@ import org.apache.spark.unsafe.types.UTF8String // scalastyle:off line.size.limit @ExpressionDescription( usage = """ - _FUNC_(str) - Translates a string into {@code application/x-www-form-urlencoded} format using a specific encoding scheme. + _FUNC_(str) - Translates a string into 'application/x-www-form-urlencoded' format using a specific encoding scheme. """, arguments = """ Arguments: @@ -70,7 +70,7 @@ case class UrlEncode(child: Expression) // scalastyle:off line.size.limit @ExpressionDescription( usage = """ - _FUNC_(str) - Decodes a {@code application/x-www-form-urlencoded} `str`. + _FUNC_(str) - Decodes a `str` in 'application/x-www-form-urlencoded' format using a specific encoding scheme. """, arguments = """ Arguments:
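
For reference, url_encode and url_decode bottom out in the new UrlCodec helper, which delegates to java.net.URLEncoder/URLDecoder with a hard-coded "UTF-8" charset. The standalone sketch below is illustrative only (the object name UrlCodecDemo and the sample strings are not part of the patch); it reproduces the application/x-www-form-urlencoded semantics behind the expected outputs in url-functions.sql.out:

import java.net.{URLDecoder, URLEncoder}

// Standalone sketch, not part of the patch: url_encode/url_decode reduce to
// these JDK calls with a fixed "UTF-8" charset, i.e.
// application/x-www-form-urlencoded rules rather than generic RFC 3986
// URI escaping.
object UrlCodecDemo {
  def main(args: Array[String]): Unit = {
    // Reserved characters such as ':' and '/' are percent-encoded.
    println(URLEncoder.encode("https://spark.apache.org", "UTF-8"))
    // https%3A%2F%2Fspark.apache.org

    // Form encoding maps a space to '+', which is why the expected output for
    // the 'inva lid://...' test row starts with "inva+lid".
    println(URLEncoder.encode("inva lid", "UTF-8"))
    // inva+lid

    // decode reverses encode, so values round-trip.
    println(URLDecoder.decode("https%3A%2F%2Fspark.apache.org", "UTF-8"))
    // https://spark.apache.org
  }
}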
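
Both expressions are RuntimeReplaceable, so the analyzer rewrites them into a StaticInvoke on UrlCodec before execution; because StaticInvoke propagates nulls by default, a NULL input short-circuits to NULL without reaching the JDK codec, matching the NULL rows in url-functions.sql.out. A minimal usage sketch, assuming a spark-shell built with this patch applied (`spark` is the shell's SparkSession; the sample strings are arbitrary):

// Hypothetical spark-shell session; not part of the patch.
spark.sql("SELECT url_encode('a b/c'), url_decode('a+b%2Fc')").show(false)
// expected single row under the form-encoding rules above: a+b%2Fc | a b/c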