Skip to content

Commit e2c5767

Browse files
authored
fix(ge_profiler): support nonnull_count for complex types (#14631)
1 parent acffdce commit e2c5767

File tree

1 file changed: +15 -2 lines changed

metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -307,7 +307,6 @@ def _is_single_row_query_method(query: Any) -> bool:
         "get_column_max",
         "get_column_mean",
         "get_column_stdev",
-        "get_column_nonnull_count",
         "get_column_unique_count",
     }
     CONSTANT_ROW_QUERY_METHODS = {
@@ -331,6 +330,7 @@ def _is_single_row_query_method(query: Any) -> bool:
 
     FIRST_PARTY_SINGLE_ROW_QUERY_METHODS = {
         "get_column_unique_count_dh_patch",
+        "_get_column_cardinality",
     }
 
     # We'll do this the inefficient way since the arrays are pretty small.
@@ -497,7 +497,20 @@ def _get_column_cardinality(
         self, column_spec: _SingleColumnSpec, column: str
     ) -> None:
         try:
-            nonnull_count = self.dataset.get_column_nonnull_count(column)
+            # Don't use Great Expectations get_column_nonnull_count because it
+            # generates this SQL:
+            #
+            #     sum(CASE WHEN (mycolumn IN (NULL) OR mycolumn IS NULL) THEN 1 ELSE 0 END)
+            #
+            # which fails for complex types (such as Databricks maps) that don't
+            # support the IN operator.
+            nonnull_count = convert_to_json_serializable(
+                self.dataset.engine.execute(
+                    sa.select(sa.func.count(sa.column(column))).select_from(
+                        self.dataset._table
+                    )
+                ).scalar()
+            )
             column_spec.nonnull_count = nonnull_count
         except Exception as e:
             logger.debug(

0 commit comments

Comments
 (0)