Skip to content

Commit ec5387b

Browse files
zhuqi-lucasThomasAWalker
authored andcommitted
Support utf8view datatype for window (apache#15257)
* Support utf8view datatype for window * Fix table name * Also support subquery testing
1 parent c94163f commit ec5387b

File tree

4 files changed

+96
-5
lines changed

4 files changed

+96
-5
lines changed

datafusion/expr/src/type_coercion/mod.rs

+6-3
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,12 @@ pub fn is_datetime(dt: &DataType) -> bool {
7979
)
8080
}
8181

82-
/// Determine whether the given data type `dt` is a `Utf8` or `LargeUtf8`.
83-
pub fn is_utf8_or_large_utf8(dt: &DataType) -> bool {
84-
matches!(dt, DataType::Utf8 | DataType::LargeUtf8)
82+
/// Determine whether the given data type `dt` is a `Utf8` or `Utf8View` or `LargeUtf8`.
83+
pub fn is_utf8_or_utf8view_or_large_utf8(dt: &DataType) -> bool {
84+
matches!(
85+
dt,
86+
DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8
87+
)
8588
}
8689

8790
/// Determine whether the given data type `dt` is a `Decimal`.

datafusion/optimizer/src/analyzer/type_coercion.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ use datafusion_expr::type_coercion::functions::{
4646
use datafusion_expr::type_coercion::other::{
4747
get_coerce_type_for_case_expression, get_coerce_type_for_list,
4848
};
49-
use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_large_utf8};
49+
use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_utf8view_or_large_utf8};
5050
use datafusion_expr::utils::merge_schema;
5151
use datafusion_expr::{
5252
is_false, is_not_false, is_not_true, is_not_unknown, is_true, is_unknown, not,
@@ -709,7 +709,7 @@ fn coerce_frame_bound(
709709

710710
fn extract_window_frame_target_type(col_type: &DataType) -> Result<DataType> {
711711
if col_type.is_numeric()
712-
|| is_utf8_or_large_utf8(col_type)
712+
|| is_utf8_or_utf8view_or_large_utf8(col_type)
713713
|| matches!(col_type, DataType::Null)
714714
|| matches!(col_type, DataType::Boolean)
715715
{

datafusion/sqllogictest/test_files/subquery_sort.slt

+29
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,35 @@ physical_plan
104104
05)--------SortExec: expr=[c1@0 DESC], preserve_partitioning=[false]
105105
06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3, c9], file_type=csv, has_header=true
106106

107+
#Test with utf8view for window function
108+
statement ok
109+
CREATE TABLE sink_table_with_utf8view AS
110+
SELECT arrow_cast(c1, 'Utf8View') AS c1, c2, c3, c9
111+
FROM sink_table;
112+
113+
114+
query TT
115+
EXPLAIN SELECT t2.c1, t2.r FROM (SELECT c1, RANK() OVER (ORDER BY c1 DESC) AS r, c3, c9 FROM sink_table_with_utf8view ORDER BY c1, c3 LIMIT 2) AS t2 ORDER BY t2.c1, t2.c3, t2.c9;
116+
----
117+
logical_plan
118+
01)Projection: t2.c1, t2.r
119+
02)--Sort: t2.c1 ASC NULLS LAST, t2.c3 ASC NULLS LAST, t2.c9 ASC NULLS LAST
120+
03)----SubqueryAlias: t2
121+
04)------Sort: sink_table_with_utf8view.c1 ASC NULLS LAST, sink_table_with_utf8view.c3 ASC NULLS LAST, fetch=2
122+
05)--------Projection: sink_table_with_utf8view.c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS r, sink_table_with_utf8view.c3, sink_table_with_utf8view.c9
123+
06)----------WindowAggr: windowExpr=[[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
124+
07)------------TableScan: sink_table_with_utf8view projection=[c1, c3, c9]
125+
physical_plan
126+
01)ProjectionExec: expr=[c1@0 as c1, r@1 as r]
127+
02)--SortExec: TopK(fetch=2), expr=[c1@0 ASC NULLS LAST, c3@2 ASC NULLS LAST, c9@3 ASC NULLS LAST], preserve_partitioning=[false]
128+
03)----ProjectionExec: expr=[c1@0 as c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as r, c3@1 as c3, c9@2 as c9]
129+
04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Utf8View(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
130+
05)--------SortPreservingMergeExec: [c1@0 DESC]
131+
06)----------SortExec: expr=[c1@0 DESC], preserve_partitioning=[true]
132+
07)------------DataSourceExec: partitions=4, partition_sizes=[1, 0, 0, 0]
133+
134+
statement ok
135+
DROP TABLE sink_table_with_utf8view;
107136

108137
query TT
109138
EXPLAIN SELECT c1, c2 FROM (SELECT DISTINCT ON (c1) c1, c2, c3, c9 FROM sink_table ORDER BY c1, c3 DESC, c9) AS t2 ORDER BY t2.c1, t2.c3 DESC, t2.c9

datafusion/sqllogictest/test_files/window.slt

+59
Original file line numberDiff line numberDiff line change
@@ -5538,3 +5538,62 @@ physical_plan
55385538
01)ProjectionExec: expr=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as max_c5]
55395539
02)--WindowAggExec: wdw=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
55405540
03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5], file_type=csv, has_header=true
5541+
5542+
# Testing Utf8View with window
5543+
statement ok
5544+
CREATE TABLE aggregate_test_100_utf8view AS SELECT
5545+
arrow_cast(c1, 'Utf8View') as c1,
5546+
c9,
5547+
c13
5548+
FROM aggregate_test_100;
5549+
5550+
5551+
#fn window_frame_ranges_string_check
5552+
query II
5553+
SELECT
5554+
SUM(LENGTH(c13)) OVER(ORDER BY c13),
5555+
SUM(LENGTH(c1)) OVER(ORDER BY c1)
5556+
FROM aggregate_test_100_utf8view
5557+
ORDER BY c9
5558+
LIMIT 5
5559+
----
5560+
2100 100
5561+
510 79
5562+
1440 21
5563+
1830 61
5564+
2010 21
5565+
5566+
5567+
#fn test_window_rank
5568+
query IIIIIRR
5569+
SELECT
5570+
c9,
5571+
RANK() OVER(ORDER BY c1) AS rank1,
5572+
RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as rank2,
5573+
DENSE_RANK() OVER(ORDER BY c1) as dense_rank1,
5574+
DENSE_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as dense_rank2,
5575+
PERCENT_RANK() OVER(ORDER BY c1) as percent_rank1,
5576+
PERCENT_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as percent_rank2
5577+
FROM aggregate_test_100_utf8view
5578+
ORDER BY c9
5579+
LIMIT 5
5580+
----
5581+
28774375 80 80 5 5 0.79797979798 0.79797979798
5582+
63044568 62 62 4 4 0.616161616162 0.616161616162
5583+
141047417 1 1 1 1 0 0
5584+
141680161 41 41 3 3 0.40404040404 0.40404040404
5585+
145294611 1 1 1 1 0 0
5586+
5587+
5588+
# CTAS with NTILE function
5589+
statement ok
5590+
CREATE TABLE new_table AS SELECT NTILE(2) OVER(ORDER BY c1) AS ntile_2 FROM aggregate_test_100_utf8view;
5591+
5592+
statement ok
5593+
DROP TABLE new_table;
5594+
5595+
statement ok
5596+
DROP TABLE aggregate_test_100_utf8view;
5597+
5598+
statement ok
5599+
DROP TABLE aggregate_test_100

0 commit comments

Comments
 (0)