BUG: to_stata erroring when encoded text and normal text have mismatched length (#61629)

eicchen · web-flow · commit acccdac0d9ec · 2025-06-30T11:14:28.000-07:00
* Initial testcase provided in Issue

* Replaced the length check on the encoded values with a check on the unencoded values, to avoid edge cases where the two lengths differ

* replaced type check with isinstance()

* Updated patch notes

* pre-commit checks
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -777,6 +777,7 @@ I/O
 - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
 - Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`)
 - Bug in :meth:`DataFrame.to_stata` when exporting a column containing both long strings (Stata strL) and :class:`pd.NA` values (:issue:`23633`)
+- Bug in :meth:`DataFrame.to_stata` raising an error when a string's encoded byte length differs from its character length (:issue:`61583`)
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)
 - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`)
 - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -2739,7 +2739,7 @@ def _encode_strings(self) -> None:
                 encoded = self.data[col].str.encode(self._encoding)
                 # If larger than _max_string_length do nothing
                 if (
-                    max_len_string_array(ensure_object(encoded._values))
+                    max_len_string_array(ensure_object(self.data[col]._values))
                     <= self._max_string_length
                 ):
                     self.data[col] = encoded
@@ -3263,11 +3263,15 @@ def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes:
             bio.write(gso_type)
 
             # llll
-            utf8_string = bytes(strl, "utf-8")
-            bio.write(struct.pack(len_type, len(utf8_string) + 1))
+            if isinstance(strl, str):
+                strl_convert = bytes(strl, "utf-8")
+            else:
+                strl_convert = strl
+
+            bio.write(struct.pack(len_type, len(strl_convert) + 1))
 
             # xxx...xxx
-            bio.write(utf8_string)
+            bio.write(strl_convert)
             bio.write(null)
 
         return bio.getvalue()
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -2601,3 +2601,13 @@ def test_strl_missings(temp_file, version):
         ]
     )
     df.to_stata(temp_file, version=version)
+
+
+@pytest.mark.parametrize("version", [117, 118, 119, None])
+def test_ascii_error(temp_file, version):
+    # GH #61583
+    # Check that 2-byte unicode characters don't cause an export error
+    df = DataFrame({"doubleByteCol": ["§" * 1500]})
+    df.to_stata(temp_file, write_index=0, version=version)
+    df_input = read_stata(temp_file)
+    tm.assert_frame_equal(df, df_input)

Original file line number	Diff line number	Diff line change
`@@ -2601,3 +2601,13 @@ def test_strl_missings(temp_file, version):`
`2601`	`2601`	`]`
`2602`	`2602`	`)`
`2603`	`2603`	`df.to_stata(temp_file, version=version)`
	`2604`	`+`
	`2605`	`+`
	`2606`	`+@pytest.mark.parametrize("version", [117, 118, 119, None])`
	`2607`	`+def test_ascii_error(temp_file, version):`
	`2608`	`+ # GH #61583`
	`2609`	`+ # Check that 2-byte unicode characters don't cause an export error`
	`2610`	`+ df = DataFrame({"doubleByteCol": ["§" * 1500]})`
	`2611`	`+ df.to_stata(temp_file, write_index=0, version=version)`
	`2612`	`+ df_input = read_stata(temp_file)`
	`2613`	`+ tm.assert_frame_equal(df, df_input)`