From 19838669b103f34a98061f251f05e27b83722f3c Mon Sep 17 00:00:00 2001 From: Pedro Diogo Date: Tue, 3 Jun 2025 19:54:49 +0100 Subject: [PATCH 1/5] feature #49580: support new-style float_format string in to_csv feat(to_csv): support new-style float_format strings using str.format Detect and process new-style format strings (e.g., "{:,.2f}") in the float_format parameter of to_csv. - Check if float_format is a string and matches new-style pattern - Convert it to a callable (e.g., lambda x: float_format.format(x)) - Ensure compatibility with NaN values and mixed data types - Improves formatting output for floats when exporting to CSV Example: df = pd.DataFrame([1234.56789, 9876.54321]) df.to_csv(float_format="{:,.2f}") # now outputs formatted values like 1,234.57 Co-authored-by: Pedro Santos --- pandas/_libs/tslibs/offsets.pyx | 8 +- pandas/io/formats/format.py | 30 +++- .../tests/io/formats/test_csv_benchmarks.py | 29 ++++ pandas/tests/io/formats/test_to_csv.py | 141 ++++++++++++++++++ 4 files changed, 203 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/io/formats/test_csv_benchmarks.py diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index a16964435ef50..5ffa363ea3ea8 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -5108,8 +5108,8 @@ def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " - f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\'" - f" instead.", + f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " + f"instead.", FutureWarning, stacklevel=find_stack_level(), ) @@ -5122,8 +5122,8 @@ def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " - f"\'{_name}\'" - f" instead.", + f"\'{_name}\' " + f"instead.", FutureWarning, stacklevel=find_stack_level(), 
)
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index fb799361fea67..4046ffd21af3b 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -455,7 +455,7 @@ def __init__(
         self.na_rep = na_rep
         self.formatters = self._initialize_formatters(formatters)
         self.justify = self._initialize_justify(justify)
-        self.float_format = float_format
+        self.float_format = self._validate_float_format(float_format)
         self.sparsify = self._initialize_sparsify(sparsify)
         self.show_index_names = index_names
         self.decimal = decimal
@@ -850,6 +850,34 @@ def _get_column_name_list(self) -> list[Hashable]:
             names.append("" if columns.name is None else columns.name)
         return names
 
+    def _validate_float_format(
+        self, fmt: FloatFormatType | None
+    ) -> FloatFormatType | None:
+        """
+        Validate ``float_format``; turn new-style templates into callables.
+        None, callables and usable ``%`` templates are returned unchanged.
+        Raises ValueError for any other type or an unusable template.
+        """
+        from string import Formatter
+
+        if fmt is None or callable(fmt):
+            return fmt
+        if not isinstance(fmt, str):
+            raise ValueError("float_format must be a string or callable")
+        try:
+            # Probe, don't search for "%": "{:.2%}" is a new-style template.
+            _ = fmt % 1.0
+            return fmt
+        except (TypeError, ValueError):
+            pass  # not old-style; try str.format below
+        try:
+            if all(field is None for _, field, _, _ in Formatter().parse(fmt)):
+                raise ValueError("template has no replacement field")
+            _ = fmt.format(1.0)  # Test with an arbitrary float
+        except (ValueError, LookupError, TypeError, AttributeError) as e:
+            raise ValueError(f"Invalid new-style format string {fmt!r}") from e
+        return fmt.format
+
 
 class DataFrameRenderer:
     """Class for creating dataframe output in multiple formats.
diff --git a/pandas/tests/io/formats/test_csv_benchmarks.py b/pandas/tests/io/formats/test_csv_benchmarks.py new file mode 100644 index 0000000000000..420faba96516e --- /dev/null +++ b/pandas/tests/io/formats/test_csv_benchmarks.py @@ -0,0 +1,29 @@ +import numpy as np +import pytest + +from pandas import DataFrame + +pytestmark = pytest.mark.usefixtures("benchmark") + +# Create a single generator instance for all tests +rng = np.random.default_rng(seed=42) + + +def test_benchmark_old_style_format(benchmark): + df = DataFrame(rng.random((1000, 1000))) + benchmark(lambda: df.to_csv(float_format="%.6f")) + + +def test_benchmark_new_style_format(benchmark): + df = DataFrame(rng.random((1000, 1000))) + benchmark(lambda: df.to_csv(float_format="{:.6f}")) + + +def test_benchmark_new_style_thousands(benchmark): + df = DataFrame(rng.random((1000, 1000))) + benchmark(lambda: df.to_csv(float_format="{:,.2f}")) + + +def test_benchmark_callable_format(benchmark): + df = DataFrame(rng.random((1000, 1000))) + benchmark(lambda: df.to_csv(float_format=lambda x: f"{x:.6f}")) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 6d762fdeb8d79..afe8c23725392 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -1,6 +1,7 @@ import io import os import sys +import warnings from zipfile import ZipFile from _csv import Error @@ -741,3 +742,143 @@ def test_to_csv_iterative_compression_buffer(compression): pd.read_csv(buffer, compression=compression, index_col=0), df ) assert not buffer.closed + + +def test_new_style_float_format_basic(): + df = DataFrame({"A": [1234.56789, 9876.54321]}) + result = df.to_csv(float_format="{:.2f}") + expected = ",A\n0,1234.57\n1,9876.54\n" + assert result == expected + + +def test_new_style_float_format_thousands(): + df = DataFrame({"A": [1234.56789, 9876.54321]}) + result = df.to_csv(float_format="{:,.2f}") + expected = ',A\n0,"1,234.57"\n1,"9,876.54"\n' + assert 
result == expected + + +def test_new_style_scientific_format(): + df = DataFrame({"A": [0.000123, 0.000456]}) + result = df.to_csv(float_format="{:.2e}") + expected = ",A\n0,1.23e-04\n1,4.56e-04\n" + assert result == expected + + +def test_new_style_with_nan(): + df = DataFrame({"A": [1.23, np.nan, 4.56]}) + result = df.to_csv(float_format="{:.2f}", na_rep="NA") + expected = ",A\n0,1.23\n1,NA\n2,4.56\n" + assert result == expected + + +def test_new_style_with_mixed_types(): + df = DataFrame({"A": [1.23, 4.56], "B": ["x", "y"]}) + result = df.to_csv(float_format="{:.2f}") + expected = ",A,B\n0,1.23,x\n1,4.56,y\n" + assert result == expected + + +def test_new_style_with_mixed_types_in_column(): + df = DataFrame({"A": [1.23, "text", 4.56]}) + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + result = df.to_csv(float_format="{:.2f}") + + expected = ",A\n0,1.23\n1,text\n2,4.56\n" + assert result == expected + + +def test_invalid_new_style_format_missing_brace(): + df = DataFrame({"A": [1.23]}) + with pytest.raises(ValueError, match="Invalid new-style format string '{:.2f"): + df.to_csv(float_format="{:.2f") + + +def test_invalid_new_style_format_specifier(): + df = DataFrame({"A": [1.23]}) + with pytest.raises(ValueError, match="Invalid new-style format string '{:.2z}'"): + df.to_csv(float_format="{:.2z}") + + +def test_old_style_format_compatibility(): + df = DataFrame({"A": [1234.56789, 9876.54321]}) + result = df.to_csv(float_format="%.2f") + expected = ",A\n0,1234.57\n1,9876.54\n" + assert result == expected + + +def test_callable_float_format_compatibility(): + df = DataFrame({"A": [1234.56789, 9876.54321]}) + result = df.to_csv(float_format=lambda x: f"{x:,.2f}") + expected = ',A\n0,"1,234.57"\n1,"9,876.54"\n' + assert result == expected + + +def test_no_float_format(): + df = DataFrame({"A": [1.23, 4.56]}) + result = df.to_csv(float_format=None) + expected = ",A\n0,1.23\n1,4.56\n" + assert result == expected + + +def 
test_large_numbers(): + df = DataFrame({"A": [1e308, 2e308]}) + result = df.to_csv(float_format="{:.2e}") + expected = ",A\n0,1.00e+308\n1,inf\n" + assert result == expected + + +def test_zero_and_negative(): + df = DataFrame({"A": [0.0, -1.23456]}) + result = df.to_csv(float_format="{:+.2f}") + expected = ",A\n0,+0.00\n1,-1.23\n" + assert result == expected + + +def test_unicode_format(): + df = DataFrame({"A": [1.23, 4.56]}) + result = df.to_csv(float_format="{:.2f}€", encoding="utf-8") + expected = ",A\n0,1.23€\n1,4.56€\n" + assert result == expected + + +def test_empty_dataframe(): + df = DataFrame({"A": []}) + result = df.to_csv(float_format="{:.2f}") + expected = ",A\n" + assert result == expected + + +def test_multi_column_float(): + df = DataFrame({"A": [1.23, 4.56], "B": [7.89, 0.12]}) + result = df.to_csv(float_format="{:.2f}") + expected = ",A,B\n0,1.23,7.89\n1,4.56,0.12\n" + assert result == expected + + +def test_invalid_float_format_type(): + df = DataFrame({"A": [1.23]}) + with pytest.raises(ValueError, match="float_format must be a string or callable"): + df.to_csv(float_format=123) + + +def test_new_style_with_inf(): + df = DataFrame({"A": [1.23, np.inf, -np.inf]}) + result = df.to_csv(float_format="{:.2f}", na_rep="NA") + expected = ",A\n0,1.23\n1,inf\n2,-inf\n" + assert result == expected + + +def test_new_style_with_precision_edge(): + df = DataFrame({"A": [1.23456789]}) + result = df.to_csv(float_format="{:.10f}") + expected = ",A\n0,1.2345678900\n" + assert result == expected + + +def test_new_style_with_template(): + df = DataFrame({"A": [1234.56789]}) + result = df.to_csv(float_format="Value: {:,.2f}") + expected = ',A\n0,"Value: 1,234.57"\n' + assert result == expected From 7eccc899d8d9c34cbf6ec19fcaaa6486d0d6c57c Mon Sep 17 00:00:00 2001 From: Pedro Diogo Date: Fri, 20 Jun 2025 17:48:51 +0100 Subject: [PATCH 2/5] update benchmark test --- asv_bench/benchmarks/io/csv.py | 17 +++++++++++ .../tests/io/formats/test_csv_benchmarks.py | 29 
------------------- 2 files changed, 17 insertions(+), 29 deletions(-) delete mode 100644 pandas/tests/io/formats/test_csv_benchmarks.py diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 3a15f754ae523..8b68568783ad7 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -52,6 +52,23 @@ def setup(self, kind): def time_frame(self, kind): self.df.to_csv(self.fname) +class ToCSVFloatFormatVariants(BaseIO): + fname = "__test__.csv" + + def setup(self): + self.df = DataFrame(np.random.default_rng(seed=42).random((1000, 1000))) + + def time_old_style_percent_format(self): + self.df.to_csv(self.fname, float_format="%.6f") + + def time_new_style_brace_format(self): + self.df.to_csv(self.fname, float_format="{:.6f}") + + def time_new_style_thousands_format(self): + self.df.to_csv(self.fname, float_format="{:,.2f}") + + def time_callable_format(self): + self.df.to_csv(self.fname, float_format=lambda x: f"{x:.6f}") class ToCSVMultiIndexUnusedLevels(BaseIO): fname = "__test__.csv" diff --git a/pandas/tests/io/formats/test_csv_benchmarks.py b/pandas/tests/io/formats/test_csv_benchmarks.py deleted file mode 100644 index 420faba96516e..0000000000000 --- a/pandas/tests/io/formats/test_csv_benchmarks.py +++ /dev/null @@ -1,29 +0,0 @@ -import numpy as np -import pytest - -from pandas import DataFrame - -pytestmark = pytest.mark.usefixtures("benchmark") - -# Create a single generator instance for all tests -rng = np.random.default_rng(seed=42) - - -def test_benchmark_old_style_format(benchmark): - df = DataFrame(rng.random((1000, 1000))) - benchmark(lambda: df.to_csv(float_format="%.6f")) - - -def test_benchmark_new_style_format(benchmark): - df = DataFrame(rng.random((1000, 1000))) - benchmark(lambda: df.to_csv(float_format="{:.6f}")) - - -def test_benchmark_new_style_thousands(benchmark): - df = DataFrame(rng.random((1000, 1000))) - benchmark(lambda: df.to_csv(float_format="{:,.2f}")) - - -def 
test_benchmark_callable_format(benchmark): - df = DataFrame(rng.random((1000, 1000))) - benchmark(lambda: df.to_csv(float_format=lambda x: f"{x:.6f}")) From c81352abb8b4f520f789b23b925a2bd6130a8e1e Mon Sep 17 00:00:00 2001 From: Pedro Diogo Date: Fri, 20 Jun 2025 17:58:03 +0100 Subject: [PATCH 3/5] fixed pre commit --- asv_bench/benchmarks/io/csv.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 8b68568783ad7..9ee867260aa39 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -52,6 +52,7 @@ def setup(self, kind): def time_frame(self, kind): self.df.to_csv(self.fname) + class ToCSVFloatFormatVariants(BaseIO): fname = "__test__.csv" @@ -70,6 +71,7 @@ def time_new_style_thousands_format(self): def time_callable_format(self): self.df.to_csv(self.fname, float_format=lambda x: f"{x:.6f}") + class ToCSVMultiIndexUnusedLevels(BaseIO): fname = "__test__.csv" From cbc096f4019bae9bc8236bc1051368a1d654e3ad Mon Sep 17 00:00:00 2001 From: Pedro Diogo Date: Fri, 20 Jun 2025 19:07:12 +0100 Subject: [PATCH 4/5] fixed offsets.pyx --- pandas/_libs/tslibs/offsets.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 5ffa363ea3ea8..a16964435ef50 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -5108,8 +5108,8 @@ def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " - f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " - f"instead.", + f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\'" + f" instead.", FutureWarning, stacklevel=find_stack_level(), ) @@ -5122,8 +5122,8 @@ def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use 
" - f"\'{_name}\' " - f"instead.", + f"\'{_name}\'" + f" instead.", FutureWarning, stacklevel=find_stack_level(), ) From fec8e4a11adaa24a79e3db3db7dba9f7c86c49b0 Mon Sep 17 00:00:00 2001 From: Pedro Diogo Date: Sat, 21 Jun 2025 16:36:26 +0100 Subject: [PATCH 5/5] fixed tests to windows --- pandas/tests/io/formats/test_to_csv.py | 34 +++++++++++++------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index afe8c23725392..958713c29a395 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -746,35 +746,35 @@ def test_to_csv_iterative_compression_buffer(compression): def test_new_style_float_format_basic(): df = DataFrame({"A": [1234.56789, 9876.54321]}) - result = df.to_csv(float_format="{:.2f}") + result = df.to_csv(float_format="{:.2f}", lineterminator="\n") expected = ",A\n0,1234.57\n1,9876.54\n" assert result == expected def test_new_style_float_format_thousands(): df = DataFrame({"A": [1234.56789, 9876.54321]}) - result = df.to_csv(float_format="{:,.2f}") + result = df.to_csv(float_format="{:,.2f}", lineterminator="\n") expected = ',A\n0,"1,234.57"\n1,"9,876.54"\n' assert result == expected def test_new_style_scientific_format(): df = DataFrame({"A": [0.000123, 0.000456]}) - result = df.to_csv(float_format="{:.2e}") + result = df.to_csv(float_format="{:.2e}", lineterminator="\n") expected = ",A\n0,1.23e-04\n1,4.56e-04\n" assert result == expected def test_new_style_with_nan(): df = DataFrame({"A": [1.23, np.nan, 4.56]}) - result = df.to_csv(float_format="{:.2f}", na_rep="NA") + result = df.to_csv(float_format="{:.2f}", na_rep="NA", lineterminator="\n") expected = ",A\n0,1.23\n1,NA\n2,4.56\n" assert result == expected def test_new_style_with_mixed_types(): df = DataFrame({"A": [1.23, 4.56], "B": ["x", "y"]}) - result = df.to_csv(float_format="{:.2f}") + result = df.to_csv(float_format="{:.2f}", lineterminator="\n") 
expected = ",A,B\n0,1.23,x\n1,4.56,y\n" assert result == expected @@ -783,7 +783,7 @@ def test_new_style_with_mixed_types_in_column(): df = DataFrame({"A": [1.23, "text", 4.56]}) with warnings.catch_warnings(record=True): warnings.simplefilter("always") - result = df.to_csv(float_format="{:.2f}") + result = df.to_csv(float_format="{:.2f}", lineterminator="\n") expected = ",A\n0,1.23\n1,text\n2,4.56\n" assert result == expected @@ -803,56 +803,56 @@ def test_invalid_new_style_format_specifier(): def test_old_style_format_compatibility(): df = DataFrame({"A": [1234.56789, 9876.54321]}) - result = df.to_csv(float_format="%.2f") + result = df.to_csv(float_format="%.2f", lineterminator="\n") expected = ",A\n0,1234.57\n1,9876.54\n" assert result == expected def test_callable_float_format_compatibility(): df = DataFrame({"A": [1234.56789, 9876.54321]}) - result = df.to_csv(float_format=lambda x: f"{x:,.2f}") + result = df.to_csv(float_format=lambda x: f"{x:,.2f}", lineterminator="\n") expected = ',A\n0,"1,234.57"\n1,"9,876.54"\n' assert result == expected def test_no_float_format(): df = DataFrame({"A": [1.23, 4.56]}) - result = df.to_csv(float_format=None) + result = df.to_csv(float_format=None, lineterminator="\n") expected = ",A\n0,1.23\n1,4.56\n" assert result == expected def test_large_numbers(): df = DataFrame({"A": [1e308, 2e308]}) - result = df.to_csv(float_format="{:.2e}") + result = df.to_csv(float_format="{:.2e}", lineterminator="\n") expected = ",A\n0,1.00e+308\n1,inf\n" assert result == expected def test_zero_and_negative(): df = DataFrame({"A": [0.0, -1.23456]}) - result = df.to_csv(float_format="{:+.2f}") + result = df.to_csv(float_format="{:+.2f}", lineterminator="\n") expected = ",A\n0,+0.00\n1,-1.23\n" assert result == expected def test_unicode_format(): df = DataFrame({"A": [1.23, 4.56]}) - result = df.to_csv(float_format="{:.2f}€", encoding="utf-8") + result = df.to_csv(float_format="{:.2f}€", encoding="utf-8", lineterminator="\n") expected = 
",A\n0,1.23€\n1,4.56€\n" assert result == expected def test_empty_dataframe(): df = DataFrame({"A": []}) - result = df.to_csv(float_format="{:.2f}") + result = df.to_csv(float_format="{:.2f}", lineterminator="\n") expected = ",A\n" assert result == expected def test_multi_column_float(): df = DataFrame({"A": [1.23, 4.56], "B": [7.89, 0.12]}) - result = df.to_csv(float_format="{:.2f}") + result = df.to_csv(float_format="{:.2f}", lineterminator="\n") expected = ",A,B\n0,1.23,7.89\n1,4.56,0.12\n" assert result == expected @@ -865,20 +865,20 @@ def test_invalid_float_format_type(): def test_new_style_with_inf(): df = DataFrame({"A": [1.23, np.inf, -np.inf]}) - result = df.to_csv(float_format="{:.2f}", na_rep="NA") + result = df.to_csv(float_format="{:.2f}", na_rep="NA", lineterminator="\n") expected = ",A\n0,1.23\n1,inf\n2,-inf\n" assert result == expected def test_new_style_with_precision_edge(): df = DataFrame({"A": [1.23456789]}) - result = df.to_csv(float_format="{:.10f}") + result = df.to_csv(float_format="{:.10f}", lineterminator="\n") expected = ",A\n0,1.2345678900\n" assert result == expected def test_new_style_with_template(): df = DataFrame({"A": [1234.56789]}) - result = df.to_csv(float_format="Value: {:,.2f}") + result = df.to_csv(float_format="Value: {:,.2f}", lineterminator="\n") expected = ',A\n0,"Value: 1,234.57"\n' assert result == expected