Skip to content

Commit c1e871d

Browse files
feat: Add rank(pct=True) support (#2084)
1 parent dabac32 commit c1e871d

File tree

10 files changed: 100 additions and 69 deletions.

bigframes/core/block_transforms.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,7 @@ def rank(
417417
ascending: bool = True,
418418
grouping_cols: tuple[str, ...] = (),
419419
columns: tuple[str, ...] = (),
420+
pct: bool = False,
420421
):
421422
if method not in ["average", "min", "max", "first", "dense"]:
422423
raise ValueError(
@@ -459,6 +460,12 @@ def rank(
459460
),
460461
skip_reproject_unsafe=(col != columns[-1]),
461462
)
463+
if pct:
464+
block, max_id = block.apply_window_op(
465+
rownum_id, agg_ops.max_op, windows.unbound(grouping_keys=grouping_cols)
466+
)
467+
block, rownum_id = block.project_expr(ops.div_op.as_expr(rownum_id, max_id))
468+
462469
rownum_col_ids.append(rownum_id)
463470

464471
# Step 2: Apply aggregate to groups of like input values.

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,11 @@ def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFr
181181
return self._aggregate_all(agg_ops.median_op, numeric_only=True)
182182

183183
def rank(
184-
self, method="average", ascending: bool = True, na_option: str = "keep"
184+
self,
185+
method="average",
186+
ascending: bool = True,
187+
na_option: str = "keep",
188+
pct: bool = False,
185189
) -> df.DataFrame:
186190
return df.DataFrame(
187191
block_ops.rank(
@@ -191,6 +195,7 @@ def rank(
191195
ascending,
192196
grouping_cols=tuple(self._by_col_ids),
193197
columns=tuple(self._selected_cols),
198+
pct=pct,
194199
)
195200
)
196201

bigframes/core/groupby/series_group_by.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,11 @@ def mean(self, *args) -> series.Series:
100100
return self._aggregate(agg_ops.mean_op)
101101

102102
def rank(
103-
self, method="average", ascending: bool = True, na_option: str = "keep"
103+
self,
104+
method="average",
105+
ascending: bool = True,
106+
na_option: str = "keep",
107+
pct: bool = False,
104108
) -> series.Series:
105109
return series.Series(
106110
block_ops.rank(
@@ -110,6 +114,7 @@ def rank(
110114
ascending,
111115
grouping_cols=tuple(self._by_col_ids),
112116
columns=(self._value_column,),
117+
pct=pct,
113118
)
114119
)
115120

bigframes/dataframe.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4990,9 +4990,12 @@ def rank(
49904990
numeric_only=False,
49914991
na_option: str = "keep",
49924992
ascending=True,
4993+
pct: bool = False,
49934994
) -> DataFrame:
49944995
df = self._drop_non_numeric() if numeric_only else self
4995-
return DataFrame(block_ops.rank(df._block, method, na_option, ascending))
4996+
return DataFrame(
4997+
block_ops.rank(df._block, method, na_option, ascending, pct=pct)
4998+
)
49964999

49975000
def first_valid_index(self):
49985001
return

bigframes/series.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -851,8 +851,11 @@ def rank(
851851
numeric_only=False,
852852
na_option: str = "keep",
853853
ascending: bool = True,
854+
pct: bool = False,
854855
) -> Series:
855-
return Series(block_ops.rank(self._block, method, na_option, ascending))
856+
return Series(
857+
block_ops.rank(self._block, method, na_option, ascending, pct=pct)
858+
)
856859

857860
def fillna(self, value=None) -> Series:
858861
return self._apply_binary_op(value, ops.fillna_op)

tests/system/small/test_dataframe.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5442,13 +5442,13 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna):
54425442

54435443

54445444
@pytest.mark.parametrize(
5445-
("na_option", "method", "ascending", "numeric_only"),
5445+
("na_option", "method", "ascending", "numeric_only", "pct"),
54465446
[
5447-
("keep", "average", True, True),
5448-
("top", "min", False, False),
5449-
("bottom", "max", False, False),
5450-
("top", "first", False, False),
5451-
("bottom", "dense", False, False),
5447+
("keep", "average", True, True, True),
5448+
("top", "min", False, False, False),
5449+
("bottom", "max", False, False, True),
5450+
("top", "first", False, False, False),
5451+
("bottom", "dense", False, False, True),
54525452
],
54535453
)
54545454
def test_df_rank_with_nulls(
@@ -5458,6 +5458,7 @@ def test_df_rank_with_nulls(
54585458
method,
54595459
ascending,
54605460
numeric_only,
5461+
pct,
54615462
):
54625463
unsupported_columns = ["geography_col"]
54635464
bf_result = (
@@ -5467,6 +5468,7 @@ def test_df_rank_with_nulls(
54675468
method=method,
54685469
ascending=ascending,
54695470
numeric_only=numeric_only,
5471+
pct=pct,
54705472
)
54715473
.to_pandas()
54725474
)
@@ -5477,6 +5479,7 @@ def test_df_rank_with_nulls(
54775479
method=method,
54785480
ascending=ascending,
54795481
numeric_only=numeric_only,
5482+
pct=pct,
54805483
)
54815484
.astype(pd.Float64Dtype())
54825485
)

tests/system/small/test_groupby.py

Lines changed: 17 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -96,63 +96,36 @@ def test_dataframe_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q
9696

9797

9898
@pytest.mark.parametrize(
99-
("na_option", "method", "ascending"),
99+
("na_option", "method", "ascending", "pct"),
100100
[
101101
(
102102
"keep",
103103
"average",
104104
True,
105-
),
106-
(
107-
"top",
108-
"min",
109-
False,
110-
),
111-
(
112-
"bottom",
113-
"max",
114-
False,
115-
),
116-
(
117-
"top",
118-
"first",
119-
False,
120-
),
121-
(
122-
"bottom",
123-
"dense",
124105
False,
125106
),
107+
("top", "min", False, False),
108+
("bottom", "max", False, False),
109+
("top", "first", False, True),
110+
("bottom", "dense", False, True),
126111
],
127112
)
128113
def test_dataframe_groupby_rank(
129-
scalars_df_index,
130-
scalars_pandas_df_index,
131-
na_option,
132-
method,
133-
ascending,
114+
scalars_df_index, scalars_pandas_df_index, na_option, method, ascending, pct
134115
):
135116
# TODO: supply a reason why this isn't compatible with pandas 1.x
136117
pytest.importorskip("pandas", minversion="2.0.0")
137118
col_names = ["int64_too", "float64_col", "int64_col", "string_col"]
138119
bf_result = (
139120
scalars_df_index[col_names]
140121
.groupby("string_col")
141-
.rank(
142-
na_option=na_option,
143-
method=method,
144-
ascending=ascending,
145-
)
122+
.rank(na_option=na_option, method=method, ascending=ascending, pct=pct)
146123
).to_pandas()
147124
pd_result = (
148125
(
149126
scalars_pandas_df_index[col_names]
150127
.groupby("string_col")
151-
.rank(
152-
na_option=na_option,
153-
method=method,
154-
ascending=ascending,
155-
)
128+
.rank(na_option=na_option, method=method, ascending=ascending, pct=pct)
156129
)
157130
.astype("float64")
158131
.astype("Float64")
@@ -737,63 +710,51 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index):
737710

738711

739712
@pytest.mark.parametrize(
740-
("na_option", "method", "ascending"),
713+
("na_option", "method", "ascending", "pct"),
741714
[
742-
(
743-
"keep",
744-
"average",
745-
True,
746-
),
715+
("keep", "average", True, False),
747716
(
748717
"top",
749718
"min",
750719
False,
720+
True,
751721
),
752722
(
753723
"bottom",
754724
"max",
755725
False,
726+
True,
756727
),
757728
(
758729
"top",
759730
"first",
760731
False,
732+
True,
761733
),
762734
(
763735
"bottom",
764736
"dense",
765737
False,
738+
False,
766739
),
767740
],
768741
)
769742
def test_series_groupby_rank(
770-
scalars_df_index,
771-
scalars_pandas_df_index,
772-
na_option,
773-
method,
774-
ascending,
743+
scalars_df_index, scalars_pandas_df_index, na_option, method, ascending, pct
775744
):
776745
# TODO: supply a reason why this isn't compatible with pandas 1.x
777746
pytest.importorskip("pandas", minversion="2.0.0")
778747
col_names = ["int64_col", "string_col"]
779748
bf_result = (
780749
scalars_df_index[col_names]
781750
.groupby("string_col")["int64_col"]
782-
.rank(
783-
na_option=na_option,
784-
method=method,
785-
ascending=ascending,
786-
)
751+
.rank(na_option=na_option, method=method, ascending=ascending, pct=pct)
787752
).to_pandas()
788753
pd_result = (
789754
(
790755
scalars_pandas_df_index[col_names]
791756
.groupby("string_col")["int64_col"]
792-
.rank(
793-
na_option=na_option,
794-
method=method,
795-
ascending=ascending,
796-
)
757+
.rank(na_option=na_option, method=method, ascending=ascending, pct=pct)
797758
)
798759
.astype("float64")
799760
.astype("Float64")

tests/system/small/test_series.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2704,10 +2704,48 @@ def test_series_nsmallest(scalars_df_index, scalars_pandas_df_index, keep):
27042704
)
27052705

27062706

2707-
def test_rank_ints(scalars_df_index, scalars_pandas_df_index):
2707+
@pytest.mark.parametrize(
2708+
("na_option", "method", "ascending", "numeric_only", "pct"),
2709+
[
2710+
("keep", "average", True, True, False),
2711+
("top", "min", False, False, True),
2712+
("bottom", "max", False, False, False),
2713+
("top", "first", False, False, True),
2714+
("bottom", "dense", False, False, False),
2715+
],
2716+
)
2717+
def test_series_rank(
2718+
scalars_df_index,
2719+
scalars_pandas_df_index,
2720+
na_option,
2721+
method,
2722+
ascending,
2723+
numeric_only,
2724+
pct,
2725+
):
27082726
col_name = "int64_too"
2709-
bf_result = scalars_df_index[col_name].rank().to_pandas()
2710-
pd_result = scalars_pandas_df_index[col_name].rank().astype(pd.Float64Dtype())
2727+
bf_result = (
2728+
scalars_df_index[col_name]
2729+
.rank(
2730+
na_option=na_option,
2731+
method=method,
2732+
ascending=ascending,
2733+
numeric_only=numeric_only,
2734+
pct=pct,
2735+
)
2736+
.to_pandas()
2737+
)
2738+
pd_result = (
2739+
scalars_pandas_df_index[col_name]
2740+
.rank(
2741+
na_option=na_option,
2742+
method=method,
2743+
ascending=ascending,
2744+
numeric_only=numeric_only,
2745+
pct=pct,
2746+
)
2747+
.astype(pd.Float64Dtype())
2748+
)
27112749

27122750
pd.testing.assert_series_equal(
27132751
bf_result,

third_party/bigframes_vendored/pandas/core/generic.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1042,6 +1042,10 @@ def rank(
10421042
ascending (bool, default True):
10431043
Whether or not the elements should be ranked in ascending order.
10441044
1045+
pct (bool, default False):
1046+
Whether or not to display the returned rankings in percentile
1047+
form.
1048+
10451049
Returns:
10461050
bigframes.pandas.DataFrame or bigframes.pandas.Series:
10471051
Return a Series or DataFrame with data ranks as values.

third_party/bigframes_vendored/pandas/core/groupby/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,8 @@ def rank(
428428
* keep: leave NA values where they are.
429429
* top: smallest rank if ascending.
430430
* bottom: smallest rank if descending.
431+
pct (bool, default False):
432+
Compute percentage rank of data within each group.
431433
432434
Returns:
433435
DataFrame with ranking of values within each group

0 commit comments

Comments
 (0)