Skip to content

Commit e380fa1

Browse files
committed
Update ASCII scalar function to support Utf8View apache#11834
1 parent 0bbce5d commit e380fa1

File tree

2 files changed

+206
-28
lines changed

2 files changed

+206
-28
lines changed

datafusion/functions/src/string/ascii.rs

+107-28
Original file line numberDiff line numberDiff line change
@@ -16,33 +16,15 @@
1616
// under the License.
1717

1818
use crate::utils::make_scalar_function;
19-
use arrow::array::Int32Array;
20-
use arrow::array::{ArrayRef, OffsetSizeTrait};
19+
use arrow::array::{Int32Array, ArrayRef, AsArray, ArrayAccessor, ArrayIter};
20+
use arrow::error::ArrowError;
2121
use arrow::datatypes::DataType;
22-
use datafusion_common::{cast::as_generic_string_array, internal_err, Result};
22+
use datafusion_common::{Result, internal_err};
2323
use datafusion_expr::ColumnarValue;
2424
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
2525
use std::any::Any;
2626
use std::sync::Arc;
2727

28-
/// Returns the numeric code of the first character of the argument.
29-
/// ascii('x') = 120
30-
pub fn ascii<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
31-
let string_array = as_generic_string_array::<T>(&args[0])?;
32-
33-
let result = string_array
34-
.iter()
35-
.map(|string| {
36-
string.map(|string: &str| {
37-
let mut chars = string.chars();
38-
chars.next().map_or(0, |v| v as i32)
39-
})
40-
})
41-
.collect::<Int32Array>();
42-
43-
Ok(Arc::new(result) as ArrayRef)
44-
}
45-
4628
#[derive(Debug)]
4729
pub struct AsciiFunc {
4830
signature: Signature,
@@ -60,7 +42,7 @@ impl AsciiFunc {
6042
Self {
6143
signature: Signature::uniform(
6244
1,
63-
vec![Utf8, LargeUtf8],
45+
vec![Utf8, LargeUtf8, Utf8View],
6446
Volatility::Immutable,
6547
),
6648
}
@@ -87,12 +69,109 @@ impl ScalarUDFImpl for AsciiFunc {
8769
}
8870

8971
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
90-
match args[0].data_type() {
91-
DataType::Utf8 => make_scalar_function(ascii::<i32>, vec![])(args),
92-
DataType::LargeUtf8 => {
93-
return make_scalar_function(ascii::<i64>, vec![])(args);
94-
}
95-
_ => internal_err!("Unsupported data type"),
72+
make_scalar_function(ascii, vec![])(args)
73+
}
74+
}
75+
76+
// fn calculate_ascii<'a, I>(string_array: I) -> Result<ArrayRef>
77+
// where
78+
// I: IntoIterator<Item = Option<&'a str>>,
79+
// {
80+
// let result = string_array
81+
// .into_iter()
82+
// .map(|string| {
83+
// string.map(|s| {
84+
// let mut chars = s.chars();
85+
// chars.next().map_or(0, |v| v as i32)
86+
// })
87+
// })
88+
// .collect::<Int32Array>();
89+
90+
// Ok(Arc::new(result) as ArrayRef)
91+
// }
92+
93+
fn calculate_ascii<'a, V>(array: V) -> Result<ArrayRef, ArrowError>
94+
where
95+
V: ArrayAccessor<Item = &'a str>,
96+
{
97+
let iter = ArrayIter::new(array);
98+
let result = iter
99+
.map(|string| {
100+
string.map(|s| {
101+
let mut chars = s.chars();
102+
chars.next().map_or(0, |v| v as i32)
103+
})
104+
})
105+
.collect::<Int32Array>();
106+
107+
Ok(Arc::new(result) as ArrayRef)
108+
}
109+
110+
/// Returns the numeric code of the first character of the argument.
111+
pub fn ascii(args: &[ArrayRef]) -> Result<ArrayRef> {
112+
match args[0].data_type() {
113+
DataType::Utf8 => {
114+
let string_array = args[0].as_string::<i32>();
115+
Ok(calculate_ascii(string_array)?)
116+
}
117+
DataType::LargeUtf8 => {
118+
let string_array = args[0].as_string::<i64>();
119+
Ok(calculate_ascii(string_array)?)
96120
}
121+
DataType::Utf8View => {
122+
let string_array = args[0].as_string_view();
123+
Ok(calculate_ascii(string_array)?)
124+
}
125+
_ => internal_err!("Unsupported data type"),
126+
}
127+
}
128+
129+
#[cfg(test)]
130+
mod tests {
131+
use crate::string::ascii::AsciiFunc;
132+
use crate::utils::test::test_function;
133+
use arrow::array::{Array, Int32Array};
134+
use arrow::datatypes::DataType::Int32;
135+
use datafusion_common::{Result, ScalarValue};
136+
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
137+
138+
macro_rules! test_ascii {
139+
($INPUT:expr, $EXPECTED:expr) => {
140+
test_function!(
141+
AsciiFunc::new(),
142+
&[ColumnarValue::Scalar(ScalarValue::Utf8($INPUT))],
143+
$EXPECTED,
144+
i32,
145+
Int32,
146+
Int32Array
147+
);
148+
149+
test_function!(
150+
AsciiFunc::new(),
151+
&[ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT))],
152+
$EXPECTED,
153+
i32,
154+
Int32,
155+
Int32Array
156+
);
157+
158+
test_function!(
159+
AsciiFunc::new(),
160+
&[ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT))],
161+
$EXPECTED,
162+
i32,
163+
Int32,
164+
Int32Array
165+
);
166+
};
167+
}
168+
169+
#[test]
170+
fn test_functions() -> Result<()> {
171+
test_ascii!(Some(String::from("x")), Ok(Some(120)));
172+
test_ascii!(Some(String::from("a")), Ok(Some(97)));
173+
test_ascii!(Some(String::from("")), Ok(Some(0)));
174+
test_ascii!(None, Ok(None));
175+
Ok(())
97176
}
98177
}

datafusion/sqllogictest/test_files/string_view.slt

+99
Original file line numberDiff line numberDiff line change
@@ -500,3 +500,102 @@ select column2|| ' ' ||column3 from temp;
500500
----
501501
rust fast
502502
datafusion cool
503+
504+
### ASCII
505+
# Setup the initial test data
506+
statement ok
507+
create table test_source as values
508+
('Andrew', 'X'),
509+
('Xiangpeng', 'Xiangpeng'),
510+
('Raphael', 'R'),
511+
(NULL, 'R');
512+
513+
# Table with the different combination of column types
514+
statement ok
515+
create table test as
516+
SELECT
517+
arrow_cast(column1, 'Utf8') as column1_utf8,
518+
arrow_cast(column2, 'Utf8') as column2_utf8,
519+
arrow_cast(column1, 'LargeUtf8') as column1_large_utf8,
520+
arrow_cast(column2, 'LargeUtf8') as column2_large_utf8,
521+
arrow_cast(column1, 'Utf8View') as column1_utf8view,
522+
arrow_cast(column2, 'Utf8View') as column2_utf8view
523+
FROM test_source;
524+
525+
# Test ASCII on Utf8View, Utf8, and LargeUtf8 columns
526+
# (should be no casts)
527+
query TT
528+
EXPLAIN SELECT
529+
ASCII(column1_utf8view) as c1,
530+
ASCII(column2_utf8) as c2,
531+
ASCII(column2_large_utf8) as c3
532+
FROM test;
533+
----
534+
logical_plan
535+
01)Projection: ascii(test.column1_utf8view) AS c1, ascii(test.column2_utf8) AS c2, ascii(test.column2_large_utf8) AS c3
536+
02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view]
537+
538+
query III
539+
SELECT
540+
ASCII(column1_utf8view) as c1,
541+
ASCII(column2_utf8) as c2,
542+
ASCII(column2_large_utf8) as c3
543+
FROM test;
544+
----
545+
65 88 88
546+
88 88 88
547+
82 82 82
548+
NULL 82 82
549+
550+
query TT
551+
EXPLAIN SELECT
552+
ASCII(column1_utf8) as c1,
553+
ASCII(column1_large_utf8) as c2,
554+
ASCII(column2_utf8view) as c3,
555+
ASCII('hello') as c4,
556+
ASCII(arrow_cast('world', 'Utf8View')) as c5
557+
FROM test;
558+
----
559+
logical_plan
560+
01)Projection: ascii(test.column1_utf8) AS c1, ascii(test.column1_large_utf8) AS c2, ascii(test.column2_utf8view) AS c3, Int32(104) AS c4, Int32(119) AS c5
561+
02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column2_utf8view]
562+
563+
query IIIII
564+
SELECT
565+
ASCII(column1_utf8) as c1,
566+
ASCII(column1_large_utf8) as c2,
567+
ASCII(column2_utf8view) as c3,
568+
ASCII('hello') as c4,
569+
ASCII(arrow_cast('world', 'Utf8View')) as c5
570+
FROM test;
571+
----
572+
65 65 88 104 119
573+
88 88 88 104 119
574+
82 82 82 104 119
575+
NULL NULL 82 104 119
576+
577+
# Test ASCII with literals cast to Utf8View
578+
query TT
579+
EXPLAIN SELECT
580+
ASCII(arrow_cast('äöüß', 'Utf8View')) as c1,
581+
ASCII(arrow_cast('', 'Utf8View')) as c2,
582+
ASCII(arrow_cast(NULL, 'Utf8View')) as c3
583+
FROM test;
584+
----
585+
logical_plan
586+
01)Projection: Int32(228) AS c1, Int32(0) AS c2, Int32(NULL) AS c3
587+
02)--TableScan: test projection=[]
588+
589+
query III
590+
SELECT
591+
ASCII(arrow_cast('äöüß', 'Utf8View')) as c1,
592+
ASCII(arrow_cast('', 'Utf8View')) as c2,
593+
ASCII(arrow_cast(NULL, 'Utf8View')) as c3
594+
----
595+
228 0 NULL
596+
597+
statement ok
598+
drop table test;
599+
600+
statement ok
601+
drop table test_source;

0 commit comments

Comments
 (0)