Skip to content

Commit b2458bd

Browse files
authored
StringView support in arrow-csv (#6062)
* StringView support in arrow-csv
* review and micro-benches
1 parent 66390ff commit b2458bd

File tree

2 files changed

+128
-8
lines changed

arrow-csv/src/reader/mod.rs

+86 -8
Original file line numberDiff line numberDiff line change
@@ -795,6 +795,14 @@ fn parse(
795795
})
796796
.collect::<StringArray>(),
797797
) as ArrayRef),
798+
DataType::Utf8View => Ok(Arc::new(
799+
rows.iter()
800+
.map(|row| {
801+
let s = row.get(i);
802+
(!null_regex.is_null(s)).then_some(s)
803+
})
804+
.collect::<StringViewArray>(),
805+
) as ArrayRef),
798806
DataType::Dictionary(key_type, value_type)
799807
if value_type.as_ref() == &DataType::Utf8 =>
800808
{
@@ -2380,17 +2388,27 @@ mod tests {
23802388
}
23812389

23822390
fn err_test(csv: &[u8], expected: &str) {
2383-
let schema = Arc::new(Schema::new(vec![
2391+
fn err_test_with_schema(csv: &[u8], expected: &str, schema: Arc<Schema>) {
2392+
let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv));
2393+
let b = ReaderBuilder::new(schema)
2394+
.with_batch_size(2)
2395+
.build_buffered(buffer)
2396+
.unwrap();
2397+
let err = b.collect::<Result<Vec<_>, _>>().unwrap_err().to_string();
2398+
assert_eq!(err, expected)
2399+
}
2400+
2401+
let schema_utf8 = Arc::new(Schema::new(vec![
23842402
Field::new("text1", DataType::Utf8, true),
23852403
Field::new("text2", DataType::Utf8, true),
23862404
]));
2387-
let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv));
2388-
let b = ReaderBuilder::new(schema)
2389-
.with_batch_size(2)
2390-
.build_buffered(buffer)
2391-
.unwrap();
2392-
let err = b.collect::<Result<Vec<_>, _>>().unwrap_err().to_string();
2393-
assert_eq!(err, expected)
2405+
err_test_with_schema(csv, expected, schema_utf8);
2406+
2407+
let schema_utf8view = Arc::new(Schema::new(vec![
2408+
Field::new("text1", DataType::Utf8View, true),
2409+
Field::new("text2", DataType::Utf8View, true),
2410+
]));
2411+
err_test_with_schema(csv, expected, schema_utf8view);
23942412
}
23952413

23962414
#[test]
@@ -2587,4 +2605,64 @@ mod tests {
25872605
&vec![2, 22]
25882606
);
25892607
}
2608+
2609+
#[test]
2610+
fn test_parse_string_view_single_column() {
2611+
let csv = ["foo", "something_cannot_be_inlined", "foobar"].join("\n");
2612+
let schema = Arc::new(Schema::new(vec![Field::new(
2613+
"c1",
2614+
DataType::Utf8View,
2615+
true,
2616+
)]));
2617+
2618+
let mut decoder = ReaderBuilder::new(schema).build_decoder();
2619+
2620+
let decoded = decoder.decode(csv.as_bytes()).unwrap();
2621+
assert_eq!(decoded, csv.len());
2622+
decoder.decode(&[]).unwrap();
2623+
2624+
let batch = decoder.flush().unwrap().unwrap();
2625+
assert_eq!(batch.num_columns(), 1);
2626+
assert_eq!(batch.num_rows(), 3);
2627+
let col = batch.column(0).as_string_view();
2628+
assert_eq!(col.data_type(), &DataType::Utf8View);
2629+
assert_eq!(col.value(0), "foo");
2630+
assert_eq!(col.value(1), "something_cannot_be_inlined");
2631+
assert_eq!(col.value(2), "foobar");
2632+
}
2633+
2634+
#[test]
2635+
fn test_parse_string_view_multi_column() {
2636+
let csv = ["foo,", ",something_cannot_be_inlined", "foobarfoobar,bar"].join("\n");
2637+
let schema = Arc::new(Schema::new(vec![
2638+
Field::new("c1", DataType::Utf8View, true),
2639+
Field::new("c2", DataType::Utf8View, true),
2640+
]));
2641+
2642+
let mut decoder = ReaderBuilder::new(schema).build_decoder();
2643+
2644+
let decoded = decoder.decode(csv.as_bytes()).unwrap();
2645+
assert_eq!(decoded, csv.len());
2646+
decoder.decode(&[]).unwrap();
2647+
2648+
let batch = decoder.flush().unwrap().unwrap();
2649+
assert_eq!(batch.num_columns(), 2);
2650+
assert_eq!(batch.num_rows(), 3);
2651+
let c1 = batch.column(0).as_string_view();
2652+
let c2 = batch.column(1).as_string_view();
2653+
assert_eq!(c1.data_type(), &DataType::Utf8View);
2654+
assert_eq!(c2.data_type(), &DataType::Utf8View);
2655+
2656+
assert!(!c1.is_null(0));
2657+
assert!(c1.is_null(1));
2658+
assert!(!c1.is_null(2));
2659+
assert_eq!(c1.value(0), "foo");
2660+
assert_eq!(c1.value(2), "foobarfoobar");
2661+
2662+
assert!(c2.is_null(0));
2663+
assert!(!c2.is_null(1));
2664+
assert!(!c2.is_null(2));
2665+
assert_eq!(c2.value(1), "something_cannot_be_inlined");
2666+
assert_eq!(c2.value(2), "bar");
2667+
}
25902668
}

arrow/benches/csv_reader.rs

+42
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ extern crate criterion;
2121
use std::io::Cursor;
2222
use std::sync::Arc;
2323

24+
use arrow::util::bench_util::create_string_view_array_with_len;
2425
use criterion::*;
2526
use rand::Rng;
2627

@@ -59,6 +60,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec<ArrayRef>) {
5960
fn criterion_benchmark(c: &mut Criterion) {
6061
let mut rng = seedable_rng();
6162

63+
// Single Primitive Column tests
6264
let values = Int32Array::from_iter_values((0..4096).map(|_| rng.gen_range(0..1024)));
6365
let cols = vec![Arc::new(values) as ArrayRef];
6466
do_bench(c, "4096 i32_small(0)", cols);
@@ -101,6 +103,7 @@ fn criterion_benchmark(c: &mut Criterion) {
101103
let cols = vec![Arc::new(values) as ArrayRef];
102104
do_bench(c, "4096 f64(0)", cols);
103105

106+
// Single String Column tests
104107
let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0., 10)) as ArrayRef];
105108
do_bench(c, "4096 string(10, 0)", cols);
106109

@@ -113,6 +116,20 @@ fn criterion_benchmark(c: &mut Criterion) {
113116
let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 100)) as ArrayRef];
114117
do_bench(c, "4096 string(100, 0.5)", cols);
115118

119+
// Single StringView Column tests
120+
let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 10, false)) as ArrayRef];
121+
do_bench(c, "4096 StringView(10, 0)", cols);
122+
123+
let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef];
124+
do_bench(c, "4096 StringView(30, 0)", cols);
125+
126+
let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 100, false)) as ArrayRef];
127+
do_bench(c, "4096 StringView(100, 0)", cols);
128+
129+
let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0.5, 100, false)) as ArrayRef];
130+
do_bench(c, "4096 StringView(100, 0.5)", cols);
131+
132+
// Multi-Column(with String) tests
116133
let cols = vec![
117134
Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 20)) as ArrayRef,
118135
Arc::new(create_string_array_with_len::<i32>(4096, 0., 30)) as ArrayRef,
@@ -136,6 +153,31 @@ fn criterion_benchmark(c: &mut Criterion) {
136153
"4096 string(20, 0.5), string(30, 0), f64(0), i64(0)",
137154
cols,
138155
);
156+
157+
// Multi-Column(with StringView) tests
158+
let cols = vec![
159+
Arc::new(create_string_view_array_with_len(4096, 0.5, 20, false)) as ArrayRef,
160+
Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef,
161+
Arc::new(create_string_view_array_with_len(4096, 0., 100, false)) as ArrayRef,
162+
Arc::new(create_primitive_array::<Int64Type>(4096, 0.)) as ArrayRef,
163+
];
164+
do_bench(
165+
c,
166+
"4096 StringView(20, 0.5), StringView(30, 0), StringView(100, 0), i64(0)",
167+
cols,
168+
);
169+
170+
let cols = vec![
171+
Arc::new(create_string_view_array_with_len(4096, 0.5, 20, false)) as ArrayRef,
172+
Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef,
173+
Arc::new(create_primitive_array::<Float64Type>(4096, 0.)) as ArrayRef,
174+
Arc::new(create_primitive_array::<Int64Type>(4096, 0.)) as ArrayRef,
175+
];
176+
do_bench(
177+
c,
178+
"4096 StringView(20, 0.5), StringView(30, 0), f64(0), i64(0)",
179+
cols,
180+
);
139181
}
140182

141183
criterion_group!(benches, criterion_benchmark);

0 commit comments

Comments (0)