Skip to content

Commit eb285e8

Browse files
committed
HIVE-29197: Disable vectorization for multi-column COUNT(DISTINCT)
1 parent 481d274 commit eb285e8

File tree

4 files changed

+627
-1
lines changed

4 files changed

+627
-1
lines changed

ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4501,6 +4501,12 @@ public static ImmutablePair<VectorAggregationDesc,String> getVectorAggregationDe
45014501
vecAggrClasses = new Class[] {
45024502
VectorUDAFComputeDsKllSketchDouble.class, VectorUDAFComputeDsKllSketchFinal.class
45034503
};
4504+
} else if (VECTORIZABLE_UDAF.COUNT.toString().equalsIgnoreCase(aggregationName) && parameterList.size() > 1) {
4505+
// Handle unsupported multi-column COUNT DISTINCT
4506+
String issue = "Unsupported COUNT DISTINCT with multiple columns: "
4507+
+ aggregationName + "(" + parameterList + "). "
4508+
+ "Hive only supports COUNT(DISTINCT col) in vectorized execution. ";
4509+
return new ImmutablePair<>(null, issue);
45044510
} else {
45054511
VectorizedUDAFs annotation =
45064512
AnnotationUtils.getAnnotation(evaluator.getClass(), VectorizedUDAFs.class);
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
drop table if exists test_vector;
2+
create external table test_vector(id string, pid bigint) PARTITIONED BY (full_date int);
3+
insert into test_vector (pid, full_date, id) values (1, '20240305', '6150');
4+
5+
--------------------------------------------------------------------------------
6+
-- 1. Basic COUNT cases (valid in vectorization)
7+
--------------------------------------------------------------------------------
8+
SELECT COUNT(pid) AS cnt_col, COUNT(*) AS cnt_star, COUNT(20240305) AS cnt_const, COUNT(DISTINCT pid) as cnt_distinct, COUNT(1) AS CNT
9+
FROM test_vector WHERE full_date=20240305;
10+
EXPLAIN VECTORIZATION EXPRESSION
11+
SELECT COUNT(pid) AS cnt_col, COUNT(*) AS cnt_star, COUNT(20240305) AS cnt_const,COUNT(DISTINCT pid) as cnt_distinct, COUNT(1) AS CNT
12+
FROM test_vector WHERE full_date=20240305;
13+
14+
--------------------------------------------------------------------------------
15+
-- 2. COUNT(DISTINCT column, constant) (query is valid, but NOT vectorized)
16+
--------------------------------------------------------------------------------
17+
SELECT COUNT(DISTINCT pid, 20240305) AS CNT FROM test_vector WHERE full_date=20240305;
18+
EXPLAIN VECTORIZATION EXPRESSION
19+
SELECT COUNT(DISTINCT pid, 20240305) AS CNT FROM test_vector WHERE full_date=20240305;
20+
21+
--------------------------------------------------------------------------------
22+
-- 3. COUNT(DISTINCT pid, full_date) (multi-col distinct → query succeeds, but is NOT vectorized)
23+
--------------------------------------------------------------------------------
24+
SELECT COUNT(DISTINCT pid, full_date) AS CNT FROM test_vector WHERE full_date=20240305;
25+
EXPLAIN VECTORIZATION EXPRESSION
26+
SELECT COUNT(DISTINCT pid, full_date) AS CNT FROM test_vector WHERE full_date=20240305;
27+
28+
--------------------------------------------------------------------------------
29+
-- 4. COUNT(DISTINCT pid, full_date, id) (multi-col distinct → query succeeds, but is NOT vectorized)
30+
--------------------------------------------------------------------------------
31+
SELECT COUNT(DISTINCT pid, full_date, id) AS CNT FROM test_vector WHERE full_date=20240305;
32+
EXPLAIN VECTORIZATION EXPRESSION
33+
SELECT COUNT(DISTINCT pid, full_date, id) AS CNT FROM test_vector WHERE full_date=20240305;
34+
35+
DROP TABLE test_vector;

ql/src/test/results/clientpositive/llap/vector_count.q.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ STAGE PLANS:
212212
enabled: true
213213
enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
214214
inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
215-
notVectorizedReason: GROUPBY operator: Aggregations with > 1 parameter are not supported unless all the extra parameters are constants count([Column[a], Column[b]])
215+
notVectorizedReason: GROUPBY operator: Unsupported COUNT DISTINCT with multiple columns: count([Column[a], Column[b]]). Hive only supports COUNT(DISTINCT col) in vectorized execution.
216216
vectorized: false
217217
Reducer 2
218218
Execution mode: llap

0 commit comments

Comments
 (0)