feat: Add rank(pct=True) support (#2084)

TrevorBergeron · web-flow · commit c1e871d9327b · 2025-09-15T17:36:08.000-07:00
diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
@@ -417,6 +417,7 @@ def rank(
     ascending: bool = True,
     grouping_cols: tuple[str, ...] = (),
     columns: tuple[str, ...] = (),
+    pct: bool = False,
 ):
     if method not in ["average", "min", "max", "first", "dense"]:
         raise ValueError(
@@ -459,6 +460,12 @@ def rank(
             ),
             skip_reproject_unsafe=(col != columns[-1]),
         )
+        if pct:
+            block, max_id = block.apply_window_op(
+                rownum_id, agg_ops.max_op, windows.unbound(grouping_keys=grouping_cols)
+            )
+            block, rownum_id = block.project_expr(ops.div_op.as_expr(rownum_id, max_id))
+
         rownum_col_ids.append(rownum_id)
 
     # Step 2: Apply aggregate to groups of like input values.
diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py
@@ -181,7 +181,11 @@ def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFr
         return self._aggregate_all(agg_ops.median_op, numeric_only=True)
 
     def rank(
-        self, method="average", ascending: bool = True, na_option: str = "keep"
+        self,
+        method="average",
+        ascending: bool = True,
+        na_option: str = "keep",
+        pct: bool = False,
     ) -> df.DataFrame:
         return df.DataFrame(
             block_ops.rank(
@@ -191,6 +195,7 @@ def rank(
                 ascending,
                 grouping_cols=tuple(self._by_col_ids),
                 columns=tuple(self._selected_cols),
+                pct=pct,
             )
         )
 
diff --git a/bigframes/core/groupby/series_group_by.py b/bigframes/core/groupby/series_group_by.py
@@ -100,7 +100,11 @@ def mean(self, *args) -> series.Series:
         return self._aggregate(agg_ops.mean_op)
 
     def rank(
-        self, method="average", ascending: bool = True, na_option: str = "keep"
+        self,
+        method="average",
+        ascending: bool = True,
+        na_option: str = "keep",
+        pct: bool = False,
     ) -> series.Series:
         return series.Series(
             block_ops.rank(
@@ -110,6 +114,7 @@ def rank(
                 ascending,
                 grouping_cols=tuple(self._by_col_ids),
                 columns=(self._value_column,),
+                pct=pct,
             )
         )
 
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -4990,9 +4990,12 @@ def rank(
         numeric_only=False,
         na_option: str = "keep",
         ascending=True,
+        pct: bool = False,
     ) -> DataFrame:
         df = self._drop_non_numeric() if numeric_only else self
-        return DataFrame(block_ops.rank(df._block, method, na_option, ascending))
+        return DataFrame(
+            block_ops.rank(df._block, method, na_option, ascending, pct=pct)
+        )
 
     def first_valid_index(self):
         return
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -851,8 +851,11 @@ def rank(
         numeric_only=False,
         na_option: str = "keep",
         ascending: bool = True,
+        pct: bool = False,
     ) -> Series:
-        return Series(block_ops.rank(self._block, method, na_option, ascending))
+        return Series(
+            block_ops.rank(self._block, method, na_option, ascending, pct=pct)
+        )
 
     def fillna(self, value=None) -> Series:
         return self._apply_binary_op(value, ops.fillna_op)
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -5442,13 +5442,13 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna):
 
 
 @pytest.mark.parametrize(
-    ("na_option", "method", "ascending", "numeric_only"),
+    ("na_option", "method", "ascending", "numeric_only", "pct"),
     [
-        ("keep", "average", True, True),
-        ("top", "min", False, False),
-        ("bottom", "max", False, False),
-        ("top", "first", False, False),
-        ("bottom", "dense", False, False),
+        ("keep", "average", True, True, True),
+        ("top", "min", False, False, False),
+        ("bottom", "max", False, False, True),
+        ("top", "first", False, False, False),
+        ("bottom", "dense", False, False, True),
     ],
 )
 def test_df_rank_with_nulls(
@@ -5458,6 +5458,7 @@ def test_df_rank_with_nulls(
     method,
     ascending,
     numeric_only,
+    pct,
 ):
     unsupported_columns = ["geography_col"]
     bf_result = (
@@ -5467,6 +5468,7 @@ def test_df_rank_with_nulls(
             method=method,
             ascending=ascending,
             numeric_only=numeric_only,
+            pct=pct,
         )
         .to_pandas()
     )
@@ -5477,6 +5479,7 @@ def test_df_rank_with_nulls(
             method=method,
             ascending=ascending,
             numeric_only=numeric_only,
+            pct=pct,
         )
         .astype(pd.Float64Dtype())
     )
diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py
@@ -96,63 +96,36 @@ def test_dataframe_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q
 
 
 @pytest.mark.parametrize(
-    ("na_option", "method", "ascending"),
+    ("na_option", "method", "ascending", "pct"),
     [
         (
             "keep",
             "average",
             True,
-        ),
-        (
-            "top",
-            "min",
-            False,
-        ),
-        (
-            "bottom",
-            "max",
-            False,
-        ),
-        (
-            "top",
-            "first",
-            False,
-        ),
-        (
-            "bottom",
-            "dense",
             False,
         ),
+        ("top", "min", False, False),
+        ("bottom", "max", False, False),
+        ("top", "first", False, True),
+        ("bottom", "dense", False, True),
     ],
 )
 def test_dataframe_groupby_rank(
-    scalars_df_index,
-    scalars_pandas_df_index,
-    na_option,
-    method,
-    ascending,
+    scalars_df_index, scalars_pandas_df_index, na_option, method, ascending, pct
 ):
     # TODO: supply a reason why this isn't compatible with pandas 1.x
     pytest.importorskip("pandas", minversion="2.0.0")
     col_names = ["int64_too", "float64_col", "int64_col", "string_col"]
     bf_result = (
         scalars_df_index[col_names]
         .groupby("string_col")
-        .rank(
-            na_option=na_option,
-            method=method,
-            ascending=ascending,
-        )
+        .rank(na_option=na_option, method=method, ascending=ascending, pct=pct)
     ).to_pandas()
     pd_result = (
         (
             scalars_pandas_df_index[col_names]
             .groupby("string_col")
-            .rank(
-                na_option=na_option,
-                method=method,
-                ascending=ascending,
-            )
+            .rank(na_option=na_option, method=method, ascending=ascending, pct=pct)
         )
         .astype("float64")
         .astype("Float64")
@@ -737,63 +710,51 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index):
 
 
 @pytest.mark.parametrize(
-    ("na_option", "method", "ascending"),
+    ("na_option", "method", "ascending", "pct"),
     [
-        (
-            "keep",
-            "average",
-            True,
-        ),
+        ("keep", "average", True, False),
         (
             "top",
             "min",
             False,
+            True,
         ),
         (
             "bottom",
             "max",
             False,
+            True,
         ),
         (
             "top",
             "first",
             False,
+            True,
         ),
         (
             "bottom",
             "dense",
             False,
+            False,
         ),
     ],
 )
 def test_series_groupby_rank(
-    scalars_df_index,
-    scalars_pandas_df_index,
-    na_option,
-    method,
-    ascending,
+    scalars_df_index, scalars_pandas_df_index, na_option, method, ascending, pct
 ):
     # TODO: supply a reason why this isn't compatible with pandas 1.x
     pytest.importorskip("pandas", minversion="2.0.0")
     col_names = ["int64_col", "string_col"]
     bf_result = (
         scalars_df_index[col_names]
         .groupby("string_col")["int64_col"]
-        .rank(
-            na_option=na_option,
-            method=method,
-            ascending=ascending,
-        )
+        .rank(na_option=na_option, method=method, ascending=ascending, pct=pct)
     ).to_pandas()
     pd_result = (
         (
             scalars_pandas_df_index[col_names]
             .groupby("string_col")["int64_col"]
-            .rank(
-                na_option=na_option,
-                method=method,
-                ascending=ascending,
-            )
+            .rank(na_option=na_option, method=method, ascending=ascending, pct=pct)
         )
         .astype("float64")
         .astype("Float64")
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
@@ -2704,10 +2704,48 @@ def test_series_nsmallest(scalars_df_index, scalars_pandas_df_index, keep):
     )
 
 
-def test_rank_ints(scalars_df_index, scalars_pandas_df_index):
+@pytest.mark.parametrize(
+    ("na_option", "method", "ascending", "numeric_only", "pct"),
+    [
+        ("keep", "average", True, True, False),
+        ("top", "min", False, False, True),
+        ("bottom", "max", False, False, False),
+        ("top", "first", False, False, True),
+        ("bottom", "dense", False, False, False),
+    ],
+)
+def test_series_rank(
+    scalars_df_index,
+    scalars_pandas_df_index,
+    na_option,
+    method,
+    ascending,
+    numeric_only,
+    pct,
+):
     col_name = "int64_too"
-    bf_result = scalars_df_index[col_name].rank().to_pandas()
-    pd_result = scalars_pandas_df_index[col_name].rank().astype(pd.Float64Dtype())
+    bf_result = (
+        scalars_df_index[col_name]
+        .rank(
+            na_option=na_option,
+            method=method,
+            ascending=ascending,
+            numeric_only=numeric_only,
+            pct=pct,
+        )
+        .to_pandas()
+    )
+    pd_result = (
+        scalars_pandas_df_index[col_name]
+        .rank(
+            na_option=na_option,
+            method=method,
+            ascending=ascending,
+            numeric_only=numeric_only,
+            pct=pct,
+        )
+        .astype(pd.Float64Dtype())
+    )
 
     pd.testing.assert_series_equal(
         bf_result,
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -1042,6 +1042,10 @@ def rank(
             ascending (bool, default True):
                 Whether or not the elements should be ranked in ascending order.
 
+            pct (bool, default False):
+                Whether or not to display the returned rankings in percentile
+                form.
+
         Returns:
             bigframes.pandas.DataFrame or bigframes.pandas.Series:
                 Return a Series or DataFrame with data ranks as values.
diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py
@@ -428,6 +428,8 @@ def rank(
                 * keep: leave NA values where they are.
                 * top: smallest rank if ascending.
                 * bottom: smallest rank if descending.
+            pct (bool, default False):
+                Compute percentage rank of data within each group.
 
         Returns:
             DataFrame with ranking of values within each group