Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-51040][SQL] Enforce determinism when assigning implicit aliases to collation types #49735

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ package object util extends Logging {
// generated column names don't contain back-ticks or double-quotes.
def usePrettyExpression(e: Expression): Expression = e transform {
case a: Attribute => new PrettyAttribute(a)
+case Literal(s: UTF8String, collationStringType: StringType)
+if collationStringType.collationId != 0 =>
+PrettyAttribute(s.toString, StringType)
case Literal(s: UTF8String, StringType) => PrettyAttribute(s.toString, StringType)
case Literal(v, t: NumericType) if v != null => PrettyAttribute(v.toString, t)
case Literal(null, dataType) => PrettyAttribute("NULL", dataType)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -442,77 +442,77 @@ Project [array_except(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_l
-- !query
select 'a' collate unicode < 'A'
-- !query analysis
-Project [(collate(a, unicode) < A) AS (collate(a, unicode) < 'A' collate UNICODE)#x]
+Project [(collate(a, unicode) < A) AS (collate(a, unicode) < A)#x]
+- OneRowRelation


-- !query
select 'a' collate unicode_ci = 'A'
-- !query analysis
-Project [(collate(a, unicode_ci) = A) AS (collate(a, unicode_ci) = 'A' collate UNICODE_CI)#x]
+Project [(collate(a, unicode_ci) = A) AS (collate(a, unicode_ci) = A)#x]
+- OneRowRelation


-- !query
select 'a' collate unicode_ai = 'å'
-- !query analysis
-Project [(collate(a, unicode_ai) = å) AS (collate(a, unicode_ai) = 'å' collate UNICODE_AI)#x]
+Project [(collate(a, unicode_ai) = å) AS (collate(a, unicode_ai) = å)#x]
+- OneRowRelation


-- !query
select 'a' collate unicode_ci_ai = 'Å'
-- !query analysis
-Project [(collate(a, unicode_ci_ai) = Å) AS (collate(a, unicode_ci_ai) = 'Å' collate UNICODE_CI_AI)#x]
+Project [(collate(a, unicode_ci_ai) = Å) AS (collate(a, unicode_ci_ai) = Å)#x]
+- OneRowRelation


-- !query
select 'a' collate en < 'A'
-- !query analysis
-Project [(collate(a, en) < A) AS (collate(a, en) < 'A' collate en)#x]
+Project [(collate(a, en) < A) AS (collate(a, en) < A)#x]
+- OneRowRelation


-- !query
select 'a' collate en_ci = 'A'
-- !query analysis
-Project [(collate(a, en_ci) = A) AS (collate(a, en_ci) = 'A' collate en_CI)#x]
+Project [(collate(a, en_ci) = A) AS (collate(a, en_ci) = A)#x]
+- OneRowRelation


-- !query
select 'a' collate en_ai = 'å'
-- !query analysis
-Project [(collate(a, en_ai) = å) AS (collate(a, en_ai) = 'å' collate en_AI)#x]
+Project [(collate(a, en_ai) = å) AS (collate(a, en_ai) = å)#x]
+- OneRowRelation


-- !query
select 'a' collate en_ci_ai = 'Å'
-- !query analysis
-Project [(collate(a, en_ci_ai) = Å) AS (collate(a, en_ci_ai) = 'Å' collate en_CI_AI)#x]
+Project [(collate(a, en_ci_ai) = Å) AS (collate(a, en_ci_ai) = Å)#x]
+- OneRowRelation


-- !query
select 'Kypper' collate sv < 'Köpfe'
-- !query analysis
-Project [(collate(Kypper, sv) < Köpfe) AS (collate(Kypper, sv) < 'Köpfe' collate sv)#x]
+Project [(collate(Kypper, sv) < Köpfe) AS (collate(Kypper, sv) < Köpfe)#x]
+- OneRowRelation


-- !query
select 'Kypper' collate de > 'Köpfe'
-- !query analysis
-Project [(collate(Kypper, de) > Köpfe) AS (collate(Kypper, de) > 'Köpfe' collate de)#x]
+Project [(collate(Kypper, de) > Köpfe) AS (collate(Kypper, de) > Köpfe)#x]
+- OneRowRelation


-- !query
select 'I' collate tr_ci = 'ı'
-- !query analysis
-Project [(collate(I, tr_ci) = ı) AS (collate(I, tr_ci) = 'ı' collate tr_CI)#x]
+Project [(collate(I, tr_ci) = ı) AS (collate(I, tr_ci) = ı)#x]
+- OneRowRelation


Expand Down Expand Up @@ -919,7 +919,7 @@ Project [elt(1, collate(utf8_binary#x, utf8_binary), cast(utf8_lcase#x as string
-- !query
select elt(1, utf8_binary, 'word'), elt(1, utf8_lcase, 'word') from t5
-- !query analysis
-Project [elt(1, utf8_binary#x, word, true) AS elt(1, utf8_binary, word)#x, elt(1, utf8_lcase#x, word, true) AS elt(1, utf8_lcase, 'word' collate UTF8_LCASE)#x]
+Project [elt(1, utf8_binary#x, word, true) AS elt(1, utf8_binary, word)#x, elt(1, utf8_lcase#x, word, true) AS elt(1, utf8_lcase, word)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet

Expand Down Expand Up @@ -1684,7 +1684,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"inputType" : "\"STRING COLLATE UNICODE_AI\"",
"paramIndex" : "first",
"requiredType" : "\"STRING\"",
-"sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai), collate(utf8_lcase, unicode_ai), 'abc' collate UNICODE_AI)\""
+"sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai), collate(utf8_lcase, unicode_ai), abc)\""
},
"queryContext" : [ {
"objectType" : "",
Expand Down Expand Up @@ -2073,7 +2073,7 @@ Project [overlay(collate(utf8_binary#x, utf8_lcase), collate(utf8_lcase#x, utf8_
-- !query
select overlay(utf8_binary, 'a', 2), overlay(utf8_lcase, 'a', 2) from t5
-- !query analysis
-Project [overlay(utf8_binary#x, a, 2, -1) AS overlay(utf8_binary, a, 2, -1)#x, overlay(utf8_lcase#x, a, 2, -1) AS overlay(utf8_lcase, 'a' collate UTF8_LCASE, 2, -1)#x]
+Project [overlay(utf8_binary#x, a, 2, -1) AS overlay(utf8_binary, a, 2, -1)#x, overlay(utf8_lcase#x, a, 2, -1) AS overlay(utf8_lcase, a, 2, -1)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet

Expand Down
28 changes: 14 additions & 14 deletions sql/core/src/test/resources/sql-tests/results/collations.sql.out
Original file line number Diff line number Diff line change
Expand Up @@ -479,87 +479,87 @@ struct<array_except(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lca
-- !query
select 'a' collate unicode < 'A'
-- !query schema
-struct<(collate(a, unicode) < 'A' collate UNICODE):boolean>
+struct<(collate(a, unicode) < A):boolean>
-- !query output
true


-- !query
select 'a' collate unicode_ci = 'A'
-- !query schema
-struct<(collate(a, unicode_ci) = 'A' collate UNICODE_CI):boolean>
+struct<(collate(a, unicode_ci) = A):boolean>
-- !query output
true


-- !query
select 'a' collate unicode_ai = 'å'
-- !query schema
-struct<(collate(a, unicode_ai) = 'å' collate UNICODE_AI):boolean>
+struct<(collate(a, unicode_ai) = å):boolean>
-- !query output
true


-- !query
select 'a' collate unicode_ci_ai = 'Å'
-- !query schema
-struct<(collate(a, unicode_ci_ai) = 'Å' collate UNICODE_CI_AI):boolean>
+struct<(collate(a, unicode_ci_ai) = Å):boolean>
-- !query output
true


-- !query
select 'a' collate en < 'A'
-- !query schema
-struct<(collate(a, en) < 'A' collate en):boolean>
+struct<(collate(a, en) < A):boolean>
-- !query output
true


-- !query
select 'a' collate en_ci = 'A'
-- !query schema
-struct<(collate(a, en_ci) = 'A' collate en_CI):boolean>
+struct<(collate(a, en_ci) = A):boolean>
-- !query output
true


-- !query
select 'a' collate en_ai = 'å'
-- !query schema
-struct<(collate(a, en_ai) = 'å' collate en_AI):boolean>
+struct<(collate(a, en_ai) = å):boolean>
-- !query output
true


-- !query
select 'a' collate en_ci_ai = 'Å'
-- !query schema
-struct<(collate(a, en_ci_ai) = 'Å' collate en_CI_AI):boolean>
+struct<(collate(a, en_ci_ai) = Å):boolean>
-- !query output
true


-- !query
select 'Kypper' collate sv < 'Köpfe'
-- !query schema
-struct<(collate(Kypper, sv) < 'Köpfe' collate sv):boolean>
+struct<(collate(Kypper, sv) < Köpfe):boolean>
-- !query output
true


-- !query
select 'Kypper' collate de > 'Köpfe'
-- !query schema
-struct<(collate(Kypper, de) > 'Köpfe' collate de):boolean>
+struct<(collate(Kypper, de) > Köpfe):boolean>
-- !query output
true


-- !query
select 'I' collate tr_ci = 'ı'
-- !query schema
-struct<(collate(I, tr_ci) = 'ı' collate tr_CI):boolean>
+struct<(collate(I, tr_ci) = ı):boolean>
-- !query output
true

Expand Down Expand Up @@ -1120,7 +1120,7 @@ kitten
-- !query
select elt(1, utf8_binary, 'word'), elt(1, utf8_lcase, 'word') from t5
-- !query schema
-struct<elt(1, utf8_binary, word):string,elt(1, utf8_lcase, 'word' collate UTF8_LCASE):string collate UTF8_LCASE>
+struct<elt(1, utf8_binary, word):string,elt(1, utf8_lcase, word):string collate UTF8_LCASE>
-- !query output
Hello, world! Nice day. Hello, world! Nice day.
Something else. Nothing here. Something else. Nothing here.
Expand Down Expand Up @@ -2549,7 +2549,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"inputType" : "\"STRING COLLATE UNICODE_AI\"",
"paramIndex" : "first",
"requiredType" : "\"STRING\"",
-"sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai), collate(utf8_lcase, unicode_ai), 'abc' collate UNICODE_AI)\""
+"sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai), collate(utf8_lcase, unicode_ai), abc)\""
},
"queryContext" : [ {
"objectType" : "",
Expand Down Expand Up @@ -3413,7 +3413,7 @@ ksitTing
-- !query
select overlay(utf8_binary, 'a', 2), overlay(utf8_lcase, 'a', 2) from t5
-- !query schema
-struct<overlay(utf8_binary, a, 2, -1):string,overlay(utf8_lcase, 'a' collate UTF8_LCASE, 2, -1):string collate UTF8_LCASE>
+struct<overlay(utf8_binary, a, 2, -1):string,overlay(utf8_lcase, a, 2, -1):string collate UTF8_LCASE>
-- !query output
Hallo, world! Nice day. Hallo, world! Nice day.
Saark SaL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1209,10 +1209,9 @@ class CollationSQLExpressionsSuite
condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
sqlState = Some("42K09"),
parameters = Map(
-"sqlExpr" -> ("\"str_to_map('a:1,b:2,c:3' collate " + s"${t.collation}, " +
-"'?' collate " + s"${t.collation}, '?' collate ${t.collation})" + "\""),
+"sqlExpr" -> "\"str_to_map(a:1,b:2,c:3, ?, ?)\"",
"paramIndex" -> "first",
-"inputSql" -> ("\"'a:1,b:2,c:3' collate " + s"${t.collation}" + "\""),
+"inputSql" -> "\"a:1,b:2,c:3\"",
"inputType" -> ("\"STRING COLLATE " + s"${t.collation}" + "\""),
"requiredType" -> "\"STRING\""),
context = ExpectedContext(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -451,8 +451,7 @@ class CollationSQLRegexpSuite
condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
parameters = Map(
"sqlExpr" ->
-("\"regexp_replace(collate(ABCDE, UNICODE_CI), '.c.' collate UNICODE_CI," +
-" 'FFF' collate UNICODE_CI, 1)\""),
+("\"regexp_replace(collate(ABCDE, UNICODE_CI), .c., FFF, 1)\""),
"paramIndex" -> "first",
"inputSql" -> "\"collate(ABCDE, UNICODE_CI)\"",
"inputType" -> "\"STRING COLLATE UNICODE_CI\"",
Expand Down
Loading