Merge remote-tracking branch 'upstream/main' into sty/ruff/docstrings

mroeschke · Dec 21, 2023 · 534f4f0 · 534f4f0
2 parents 91fb539 + 4ac340e
commit 534f4f0
Show file tree

Hide file tree

Showing 17 changed files with 201 additions and 49 deletions.
diff --git a/.github/workflows/broken-linkcheck.yml b/.github/workflows/broken-linkcheck.yml
@@ -9,6 +9,7 @@ on:
       - "doc/make.py"
 jobs:
   linkcheck:
+    if: false
     runs-on: ubuntu-latest
     defaults:
       run:

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -81,6 +81,9 @@ delim_whitespace : boolean, default False
   If this option is set to ``True``, nothing should be passed in for the
   ``delimiter`` parameter.
 
+  .. deprecated:: 2.2.0
+    Use ``sep="\s+"`` instead.
+
 Column and index locations and names
 ++++++++++++++++++++++++++++++++++++
 

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -484,6 +484,7 @@ Other Deprecations
 - Deprecated support for combining parsed datetime columns in :func:`read_csv` along with the ``keep_date_col`` keyword (:issue:`55569`)
 - Deprecated the :attr:`.DataFrameGroupBy.grouper` and :attr:`SeriesGroupBy.grouper`; these attributes will be removed in a future version of pandas (:issue:`56521`)
 - Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`)
+- Deprecated the ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep="\s+"`` instead (:issue:`55569`)
 - Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`)
 - Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`)
 - Deprecated the ``kind`` keyword in :meth:`Series.resample` and :meth:`DataFrame.resample`, explicitly cast the object's ``index`` instead (:issue:`55895`)
@@ -618,6 +619,7 @@ Indexing
 - Bug in :meth:`DataFrame.loc` when setting :class:`Series` with extension dtype into NumPy dtype (:issue:`55604`)
 - Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`)
 - Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`)
+- Fixed bug when creating a new column with missing values by setting a single string value (:issue:`56204`)
 
 Missing
 ^^^^^^^

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -68,6 +68,7 @@
 from pandas.core.construction import (
     array as pd_array,
     extract_array,
+    sanitize_array,
 )
 from pandas.core.indexers import (
     check_array_indexer,
@@ -1904,7 +1905,13 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"):
                                 return
 
                             self.obj[key] = empty_value
-
+                        elif not is_list_like(value):
+                            # Find our empty_value dtype by constructing an array
+                            #  from our value and doing a .take on it
+                            arr = sanitize_array(value, Index(range(1)), copy=False)
+                            taker = -1 * np.ones(len(self.obj), dtype=np.intp)
+                            empty_value = algos.take_nd(arr, taker)
+                            self.obj[key] = empty_value
                         else:
                             # FIXME: GH#42099#issuecomment-864326014
                             self.obj[key] = infer_fill_value(value)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -746,7 +746,7 @@ def astype(
         Block
         """
         values = self.values
-        if squeeze and values.ndim == 2:
+        if squeeze and values.ndim == 2 and is_1d_only_ea_dtype(dtype):
             if values.shape[0] != 1:
                 raise ValueError("Can not squeeze with more than one column.")
             values = values[0, :]  # type: ignore[call-overload]

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -403,6 +403,9 @@
     used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option
     is set to ``True``, nothing should be passed in for the ``delimiter``
     parameter.
+
+    .. deprecated:: 2.2.0
+        Use ``sep="\\s+"`` instead.
 low_memory : bool, default True
     Internally process the file in chunks, resulting in lower memory use
     while parsing, but possibly mixed type inference.  To ensure no mixed
@@ -670,7 +673,7 @@ def read_csv(
     encoding_errors: str | None = ...,
     dialect: str | csv.Dialect | None = ...,
     on_bad_lines=...,
-    delim_whitespace: bool = ...,
+    delim_whitespace: bool | lib.NoDefault = ...,
     low_memory: bool = ...,
     memory_map: bool = ...,
     float_precision: Literal["high", "legacy"] | None = ...,
@@ -730,7 +733,7 @@ def read_csv(
     encoding_errors: str | None = ...,
     dialect: str | csv.Dialect | None = ...,
     on_bad_lines=...,
-    delim_whitespace: bool = ...,
+    delim_whitespace: bool | lib.NoDefault = ...,
     low_memory: bool = ...,
     memory_map: bool = ...,
     float_precision: Literal["high", "legacy"] | None = ...,
@@ -790,7 +793,7 @@ def read_csv(
     encoding_errors: str | None = ...,
     dialect: str | csv.Dialect | None = ...,
     on_bad_lines=...,
-    delim_whitespace: bool = ...,
+    delim_whitespace: bool | lib.NoDefault = ...,
     low_memory: bool = ...,
     memory_map: bool = ...,
     float_precision: Literal["high", "legacy"] | None = ...,
@@ -850,7 +853,7 @@ def read_csv(
     encoding_errors: str | None = ...,
     dialect: str | csv.Dialect | None = ...,
     on_bad_lines=...,
-    delim_whitespace: bool = ...,
+    delim_whitespace: bool | lib.NoDefault = ...,
     low_memory: bool = ...,
     memory_map: bool = ...,
     float_precision: Literal["high", "legacy"] | None = ...,
@@ -928,7 +931,7 @@ def read_csv(
     # Error Handling
     on_bad_lines: str = "error",
     # Internal
-    delim_whitespace: bool = False,
+    delim_whitespace: bool | lib.NoDefault = lib.no_default,
     low_memory: bool = _c_parser_defaults["low_memory"],
     memory_map: bool = False,
     float_precision: Literal["high", "legacy"] | None = None,
@@ -978,6 +981,17 @@ def read_csv(
             stacklevel=find_stack_level(),
         )
 
+    if delim_whitespace is not lib.no_default:
+        # GH#55569
+        warnings.warn(
+            "The 'delim_whitespace' keyword in pd.read_csv is deprecated and "
+            "will be removed in a future version. Use ``sep='\\s+'`` instead",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+    else:
+        delim_whitespace = False
+
     if verbose is not lib.no_default:
         # GH#55569
         warnings.warn(
@@ -1305,7 +1319,7 @@ def read_table(
     # Error Handling
     on_bad_lines: str = "error",
     # Internal
-    delim_whitespace: bool = False,
+    delim_whitespace: bool | lib.NoDefault = lib.no_default,
     low_memory: bool = _c_parser_defaults["low_memory"],
     memory_map: bool = False,
     float_precision: str | None = None,
@@ -1346,6 +1360,17 @@ def read_table(
             stacklevel=find_stack_level(),
         )
 
+    if delim_whitespace is not lib.no_default:
+        # GH#55569
+        warnings.warn(
+            "The 'delim_whitespace' keyword in pd.read_table is deprecated and "
+            "will be removed in a future version. Use ``sep='\\s+'`` instead",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+    else:
+        delim_whitespace = False
+
     if verbose is not lib.no_default:
         # GH#55569
         warnings.warn(
@@ -2131,6 +2156,9 @@ def _refine_defaults_read(
         used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
         is set to True, nothing should be passed in for the ``delimiter``
         parameter.
+
+        .. deprecated:: 2.2.0
+            Use ``sep="\\s+"`` instead.
     engine : {{'c', 'python'}}
         Parser engine to use. The C engine is faster while the python engine is
         currently more feature-complete.

diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -1935,6 +1935,26 @@ def test_adding_new_conditional_column() -> None:
     tm.assert_frame_equal(df, expected)
 
 
+@pytest.mark.parametrize(
+    ("dtype", "infer_string"),
+    [
+        (object, False),
+        ("string[pyarrow_numpy]", True),
+    ],
+)
+def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
+    # https://github.com/pandas-dev/pandas/issues/56204
+    pytest.importorskip("pyarrow")
+
+    df = DataFrame({"a": [1, 2], "b": [3, 4]})
+    with pd.option_context("future.infer_string", infer_string):
+        df.loc[df["a"] == 1, "c"] = "1"
+    expected = DataFrame({"a": [1, 2], "b": [3, 4], "c": ["1", float("nan")]}).astype(
+        {"a": "int64", "b": "int64", "c": dtype}
+    )
+    tm.assert_frame_equal(df, expected)
+
+
 def test_add_new_column_infer_string():
     # GH#55366
     pytest.importorskip("pyarrow")

diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py
@@ -32,10 +32,7 @@ def test_set_value_resize(self, float_frame, using_infer_string):
         else:
             assert res["baz"].dtype == np.object_
         res = float_frame.copy()
-        with tm.assert_produces_warning(
-            FutureWarning, match="Setting an item of incompatible dtype"
-        ):
-            res._set_value("foobar", "baz", True)
+        res._set_value("foobar", "baz", True)
         assert res["baz"].dtype == np.object_
 
         res = float_frame.copy()

diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py
@@ -193,3 +193,10 @@ def test_convert_dtypes_avoid_block_splitting(self):
         )
         tm.assert_frame_equal(result, expected)
         assert result._mgr.nblocks == 2
+
+    def test_convert_dtypes_from_arrow(self):
+        # GH#56581
+        df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])
+        result = df.convert_dtypes()
+        expected = df.astype({"a": "string[python]"})
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
@@ -500,13 +500,21 @@ def test_trailing_spaces(all_parsers, kwargs, expected):
     data = "A B C  \nrandom line with trailing spaces    \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n   \n5.1,NaN,10.0\n"  # noqa: E501
     parser = all_parsers
 
+    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
+
     if parser.engine == "pyarrow":
         msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
         with pytest.raises(ValueError, match=msg):
-            parser.read_csv(StringIO(data.replace(",", "  ")), **kwargs)
+            with tm.assert_produces_warning(
+                FutureWarning, match=depr_msg, check_stacklevel=False
+            ):
+                parser.read_csv(StringIO(data.replace(",", "  ")), **kwargs)
         return
 
-    result = parser.read_csv(StringIO(data.replace(",", "  ")), **kwargs)
+    with tm.assert_produces_warning(
+        FutureWarning, match=depr_msg, check_stacklevel=False
+    ):
+        result = parser.read_csv(StringIO(data.replace(",", "  ")), **kwargs)
     tm.assert_frame_equal(result, expected)
 
 
@@ -515,8 +523,12 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers):
     data = "a b c\n1 2 3"
     parser = all_parsers
 
+    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
     with pytest.raises(ValueError, match="you can only specify one"):
-        parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
+        with tm.assert_produces_warning(
+            FutureWarning, match=depr_msg, check_stacklevel=False
+        ):
+            parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
 
 
 def test_read_filepath_or_buffer(all_parsers):
@@ -539,18 +551,27 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
 b\n"""
 
     expected = DataFrame({"MyColumn": list("abab")})
+    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
 
     if parser.engine == "pyarrow":
         msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
         with pytest.raises(ValueError, match=msg):
-            parser.read_csv(
-                StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
-            )
+            with tm.assert_produces_warning(
+                FutureWarning, match=depr_msg, check_stacklevel=False
+            ):
+                parser.read_csv(
+                    StringIO(data),
+                    skipinitialspace=True,
+                    delim_whitespace=delim_whitespace,
+                )
         return
 
-    result = parser.read_csv(
-        StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
-    )
+    with tm.assert_produces_warning(
+        FutureWarning, match=depr_msg, check_stacklevel=False
+    ):
+        result = parser.read_csv(
+            StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
+        )
     tm.assert_frame_equal(result, expected)
 
 
@@ -798,12 +819,20 @@ def test_read_table_delim_whitespace_default_sep(all_parsers):
     f = StringIO("a  b  c\n1 -2 -3\n4  5   6")
     parser = all_parsers
 
+    depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
+
     if parser.engine == "pyarrow":
         msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
         with pytest.raises(ValueError, match=msg):
-            parser.read_table(f, delim_whitespace=True)
+            with tm.assert_produces_warning(
+                FutureWarning, match=depr_msg, check_stacklevel=False
+            ):
+                parser.read_table(f, delim_whitespace=True)
         return
-    result = parser.read_table(f, delim_whitespace=True)
+    with tm.assert_produces_warning(
+        FutureWarning, match=depr_msg, check_stacklevel=False
+    ):
+        result = parser.read_table(f, delim_whitespace=True)
     expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
     tm.assert_frame_equal(result, expected)
 
@@ -817,11 +846,15 @@ def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
         "Specified a delimiter with both sep and "
         "delim_whitespace=True; you can only specify one."
     )
-    with pytest.raises(ValueError, match=msg):
-        parser.read_csv(f, delim_whitespace=True, sep=delimiter)
+    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
+    with tm.assert_produces_warning(
+        FutureWarning, match=depr_msg, check_stacklevel=False
+    ):
+        with pytest.raises(ValueError, match=msg):
+            parser.read_csv(f, delim_whitespace=True, sep=delimiter)
 
-    with pytest.raises(ValueError, match=msg):
-        parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)
+        with pytest.raises(ValueError, match=msg):
+            parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)
 
 
 def test_read_csv_delimiter_and_sep_no_default(all_parsers):
@@ -858,11 +891,15 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
         "Specified a delimiter with both sep and "
         "delim_whitespace=True; you can only specify one."
     )
-    with pytest.raises(ValueError, match=msg):
-        parser.read_table(f, delim_whitespace=True, sep=delimiter)
+    depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
+    with tm.assert_produces_warning(
+        FutureWarning, match=depr_msg, check_stacklevel=False
+    ):
+        with pytest.raises(ValueError, match=msg):
+            parser.read_table(f, delim_whitespace=True, sep=delimiter)
 
-    with pytest.raises(ValueError, match=msg):
-        parser.read_table(f, delim_whitespace=True, delimiter=delimiter)
+        with pytest.raises(ValueError, match=msg):
+            parser.read_table(f, delim_whitespace=True, delimiter=delimiter)
 
 
 @skip_pyarrow

diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
@@ -51,7 +51,11 @@ def test_delim_whitespace_custom_terminator(c_parser_only):
     data = "a b c~1 2 3~4 5 6~7 8 9"
     parser = c_parser_only
 
-    df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
+    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
+    with tm.assert_produces_warning(
+        FutureWarning, match=depr_msg, check_stacklevel=False
+    ):
+        df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
     expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
     tm.assert_frame_equal(df, expected)