snowflakedb · sfc-gh-helmeleegy · Mar 11, 2025 · Mar 13, 2025 · Mar 13, 2025 · Mar 13, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,7 @@
 #### Improvements
 
 - Support relaxed consistency and ordering guarantees in `pd.read_snowflake` for non-query data sources.
+- Improved how a missing `index_label` in `DataFrame.to_snowflake` and `Series.to_snowflake` is handled when `index=True`. Instead of raising a `ValueError`, system-defined labels are used for the index columns.
 
 ## 1.29.1 (2025-03-12)
 

@@ -1432,6 +1432,7 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
             ValueError if index/data column label is None, because snowflake column requires a column identifier.
         """
 
+        frame = self._modin_frame
         index_column_labels = []
         if index:
             # Include index columns
@@ -1444,11 +1445,19 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
                         f"Length of 'index_label' should match number of levels, which is {self._modin_frame.num_index_columns}"
                     )
             else:
-                index_column_labels = self._modin_frame.index_column_pandas_labels
+                index_column_labels = frame.index_column_pandas_labels
+
+            if any(
+                is_all_label_components_none(label) for label in index_column_labels
+            ):
+                frame = self.reset_index()._modin_frame
+                index = False
+                index_column_labels = []
 
         if data_column_labels is None:
-            data_column_labels = self._modin_frame.data_column_pandas_labels
-        if self._modin_frame.is_unnamed_series():
+            data_column_labels = frame.data_column_pandas_labels
+
+        if frame.is_unnamed_series():
             # this is an unnamed Snowpark pandas series, there is no customer visible pandas
             # label for the data column, set the label to be None
             data_column_labels = [None]
@@ -1460,11 +1469,6 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
                 "Please give it a name by set the dataframe columns like df.columns=['A', 'B'],"
                 " or set the series name if it is a series like series.name='A'."
             )
-        if any(is_all_label_components_none(label) for label in index_column_labels):
-            raise ValueError(
-                f"Label None is found in the index columns {index_column_labels}, which is invalid in Snowflake. "
-                "Please give it a name by passing index_label arguments."
-            )
 
         # perform a column name duplication check
         index_and_data_columns = data_column_labels + index_column_labels
@@ -1484,12 +1488,12 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
         # the data column identifiers
         if index:
             identifiers_to_retain.extend(
-                self._modin_frame.index_column_snowflake_quoted_identifiers
+                frame.index_column_snowflake_quoted_identifiers
             )
         identifiers_to_retain.extend(
             [
                 t[0]
-                for t in self._modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels(
+                for t in frame.get_snowflake_quoted_identifiers_group_by_pandas_labels(
                     data_column_labels, include_index=False
                 )
             ]
@@ -1504,9 +1508,8 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
             rename_mapper[snowflake_identifier] = snowflake_quoted_identifier_to_save
 
         # first do a select to project out all unnecessary columns, then rename to avoid conflict
-        ordered_dataframe = self._modin_frame.ordered_dataframe.select(
-            identifiers_to_retain
-        )
+        ordered_dataframe = frame.ordered_dataframe.select(identifiers_to_retain)
+
         return ordered_dataframe.to_projected_snowpark_dataframe(
             col_mapper=rename_mapper
         )

@@ -181,21 +181,48 @@ def test_to_snowflake_column_with_quotes(session, test_table_name):
 
 
 # one extra query to convert index to native pandas when creating the snowpark pandas dataframe
-@sql_count_checker(query_count=1)
-def test_to_snowflake_index_label_none_raises(test_table_name):
+@sql_count_checker(query_count=8)
+def test_to_snowflake_index_label_none(test_table_name):
     df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df.to_snowflake(test_table_name, if_exists="replace")
+    verify_columns(test_table_name, ["index", "a", "b"])
 
-    message = re.escape(
-        "Label None is found in the index columns [None], which is invalid in Snowflake."
+    df = pd.DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6]}, index=pd.Index([2, 3, 4], name="index")
     )
-    with pytest.raises(ValueError, match=message):
-        df.to_snowflake(test_table_name, if_exists="replace")
+    df.to_snowflake(test_table_name, if_exists="replace", index_label=[None])
+    verify_columns(test_table_name, ["index", "a", "b"])
+
+    # nameless index
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=pd.Index([2, 3, 4]))
+    df.to_snowflake(test_table_name, if_exists="replace", index_label=[None])
+    verify_columns(test_table_name, ["index", "a", "b"])
+
+
+# one extra query to convert index to native pandas when creating the snowpark pandas dataframe
+@sql_count_checker(query_count=6)
+def test_to_snowflake_index_label_none_data_column_conflict(test_table_name):
+    df = pd.DataFrame({"index": [1, 2, 3], "a": [4, 5, 6]})
+    df.to_snowflake(test_table_name, if_exists="replace")
+    # If the column name "index" is taken by one of the data columns,
+    # then "level_0" is used instead for naming the index column.
+    # This is based on the behavior of reset_index.
+    verify_columns(test_table_name, ["level_0", "index", "a"])
 
     df = pd.DataFrame(
-        {"a": [1, 2, 3], "b": [4, 5, 6]}, index=pd.Index([2, 3, 4], name="index")
+        {"index": [1, 2, 3], "a": [4, 5, 6]}, index=pd.Index([2, 3, 4], name="index")
     )
-    with pytest.raises(ValueError, match=message):
+    # If the index already has a name, "index", then "level_0" is not used,
+    # and a ValueError is raised instead.
+    # This is based on the behavior of reset_index.
+    with pytest.raises(ValueError):
         df.to_snowflake(test_table_name, if_exists="replace", index_label=[None])
+    # verify_columns(test_table_name, ["level_0", "index", "a"])
+
+    # nameless index
+    df = pd.DataFrame({"index": [1, 2, 3], "a": [4, 5, 6]}, index=pd.Index([2, 3, 4]))
+    df.to_snowflake(test_table_name, if_exists="replace", index_label=[None])
+    verify_columns(test_table_name, ["level_0", "index", "a"])
 
 
 # one extra query to convert index to native pandas when creating the snowpark pandas dataframe

@@ -64,7 +64,8 @@ def test_to_snowpark_with_read_snowflake(tmp_table_basic, index) -> None:
         assert snowpark_df.schema[start].column_identifier == '"row_number"'
         assert isinstance(snowpark_df.schema[start].datatype, LongType)
         start += 1
-    # verify the rest of data column is included
+
+    # verify the rest of data columns are included
     assert snowpark_df.schema[start].column_identifier.name == "ID"
     assert snowpark_df.schema[start].column_identifier.quoted_name == '"ID"'
     assert isinstance(snowpark_df.schema[start].datatype, LongType)
@@ -94,7 +95,8 @@ def test_to_snowpark_from_pandas_df(native_pandas_df_basic) -> None:
     # verify the index column is included
     assert snowpark_df.schema[0].column_identifier.quoted_name == '"ID"'
     assert isinstance(snowpark_df.schema[0].datatype, LongType)
-    # verify the rest of data column is included
+
+    # verify the rest of data columns are included
     assert snowpark_df.schema[1].column_identifier.name == "FOOT_SIZE"
     assert snowpark_df.schema[1].column_identifier.quoted_name == '"FOOT_SIZE"'
     assert isinstance(snowpark_df.schema[1].datatype, DoubleType)
@@ -161,7 +163,8 @@ def test_to_snowpark_with_operations(session, tmp_table_basic) -> None:
     assert isinstance(snowpark_df.schema[0].datatype, DoubleType)
     assert snowpark_df.schema[1].column_identifier.quoted_name == '"ID"'
     assert isinstance(snowpark_df.schema[1].datatype, LongType)
-    # verify the rest of data column is included
+
+    # verify the rest of data columns are included
     assert snowpark_df.schema[2].column_identifier.quoted_name == '"model"'
     assert isinstance(snowpark_df.schema[2].datatype, StringType)
 
@@ -181,15 +184,25 @@ def test_to_snowpark_with_duplicated_columns_raise(native_pandas_df_basic) -> No
 
 
 @sql_count_checker(query_count=1)
-def test_to_snowpark_with_none_index_label_raises(tmp_table_basic) -> None:
-    snow_dataframe = pd.read_snowflake(tmp_table_basic)
+def test_to_snowpark_with_none_index_label(tmp_table_basic) -> None:
+    snowpandas_df = pd.read_snowflake(tmp_table_basic)
 
-    message = re.escape(
-        "Label None is found in the index columns [None], which is invalid in Snowflake. "
-        "Please give it a name by passing index_label arguments."
-    )
-    with pytest.raises(ValueError, match=message):
-        snow_dataframe.to_snowpark()
+    snowpark_df = snowpandas_df.to_snowpark()
+
+    # verify the index column is included
+    assert snowpark_df.schema[0].column_identifier == '"index"'
+    assert isinstance(snowpark_df.schema[0].datatype, LongType)
+
+    # verify the rest of data columns are included
+    assert snowpark_df.schema[1].column_identifier.name == "ID"
+    assert snowpark_df.schema[1].column_identifier.quoted_name == '"ID"'
+    assert isinstance(snowpark_df.schema[1].datatype, LongType)
+    assert snowpark_df.schema[2].column_identifier.name == "FOOT_SIZE"
+    assert snowpark_df.schema[2].column_identifier.quoted_name == '"FOOT_SIZE"'
+    assert isinstance(snowpark_df.schema[2].datatype, DoubleType)
+    assert snowpark_df.schema[3].column_identifier.name == "SHOE_MODEL"
+    assert snowpark_df.schema[3].column_identifier.quoted_name == '"SHOE_MODEL"'
+    assert isinstance(snowpark_df.schema[3].datatype, StringType)
 
 
 @sql_count_checker(query_count=0)

@@ -59,14 +59,11 @@ def test_to_snowflake_index_name_conflict_negative(test_table_name, snow_series)
         snow_series.to_snowflake(test_table_name, if_exists="replace", index_label="a")
 
 
-@sql_count_checker(query_count=0)
-def test_to_snowflake_index_label_none_raises(test_table_name):
+@sql_count_checker(query_count=2)
+def test_to_snowflake_index_label_none(test_table_name):
     snow_series = pd.Series([1, 2, 3], name="a", index=native_pd.Index([4, 5, 6]))
-    message = re.escape(
-        "Label None is found in the index columns [None], which is invalid in Snowflake."
-    )
-    with pytest.raises(ValueError, match=message):
-        snow_series.to_snowflake(test_table_name, if_exists="replace", index=True)
+    snow_series.to_snowflake(test_table_name, if_exists="replace", index=True)
+    _verify_columns(test_table_name, ["index", "a"])
 
 
 @sql_count_checker(query_count=2)

@@ -47,7 +47,8 @@ def test_to_snowpark_from_pandas_series(snow_series_basic, index, index_label) -
         )
         assert isinstance(snowpark_df.schema[start].datatype, StringType)
         start += 1
-    # verify the rest of data column is included
+
+    # verify the data column is included
     assert snowpark_df.schema[start].column_identifier.quoted_name == '"SER"'
     assert isinstance(snowpark_df.schema[start].datatype, LongType)
 
@@ -77,13 +78,18 @@ def test_to_snowpark_with_no_series_name_raises(snow_series_basic) -> None:
 
 
 @sql_count_checker(query_count=0)
-def test_to_snowpark_with_no_index_name_raises() -> None:
+def test_to_snowpark_with_no_index_name() -> None:
     native_series = native_pd.Series(
         [1, 2, 3, 4], index=native_pd.Index(["A", "B", "C", "D"]), name="SER"
     )
     snow_series = pd.Series(native_series)
-    message = re.escape(
-        "Label None is found in the index columns [None], which is invalid in Snowflake. "
-    )
-    with pytest.raises(ValueError, match=message):
-        snow_series.to_snowpark()
+
+    snowpark_df = snow_series.to_snowpark()
+
+    # verify the index column is included
+    assert snowpark_df.schema[0].column_identifier.quoted_name == '"index"'
+    assert isinstance(snowpark_df.schema[0].datatype, StringType)
+
+    # verify the data column is included
+    assert snowpark_df.schema[1].column_identifier.quoted_name == '"SER"'
+    assert isinstance(snowpark_df.schema[1].datatype, LongType)