Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SNOW-1432531: Improve handling of missing index_label in DataFrame/Series.to_snowflake when index=True #3143

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#### Improvements

- Support relaxed consistency and ordering guarantees in `pd.read_snowflake` for non-query data sources.
- Improved how a missing `index_label` in `DataFrame.to_snowflake` and `Series.to_snowflake` is handled when `index=True`. Instead of raising a `ValueError`, system-defined labels are used for the index columns.

## 1.29.1 (2025-03-12)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1432,6 +1432,7 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
ValueError if index/data column label is None, because snowflake column requires a column identifier.
"""

frame = self._modin_frame
index_column_labels = []
if index:
# Include index columns
Expand All @@ -1444,11 +1445,19 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
f"Length of 'index_label' should match number of levels, which is {self._modin_frame.num_index_columns}"
)
else:
index_column_labels = self._modin_frame.index_column_pandas_labels
index_column_labels = frame.index_column_pandas_labels

if any(
is_all_label_components_none(label) for label in index_column_labels
):
frame = self.reset_index()._modin_frame
index = False
index_column_labels = []

if data_column_labels is None:
data_column_labels = self._modin_frame.data_column_pandas_labels
if self._modin_frame.is_unnamed_series():
data_column_labels = frame.data_column_pandas_labels

if frame.is_unnamed_series():
# this is an unnamed Snowpark pandas series, there is no customer visible pandas
# label for the data column, set the label to be None
data_column_labels = [None]
Expand All @@ -1460,11 +1469,6 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
"Please give it a name by set the dataframe columns like df.columns=['A', 'B'],"
" or set the series name if it is a series like series.name='A'."
)
if any(is_all_label_components_none(label) for label in index_column_labels):
raise ValueError(
f"Label None is found in the index columns {index_column_labels}, which is invalid in Snowflake. "
"Please give it a name by passing index_label arguments."
)

# perform a column name duplication check
index_and_data_columns = data_column_labels + index_column_labels
Expand All @@ -1484,12 +1488,12 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
# the data column identifiers
if index:
identifiers_to_retain.extend(
self._modin_frame.index_column_snowflake_quoted_identifiers
frame.index_column_snowflake_quoted_identifiers
)
identifiers_to_retain.extend(
[
t[0]
for t in self._modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels(
for t in frame.get_snowflake_quoted_identifiers_group_by_pandas_labels(
data_column_labels, include_index=False
)
]
Expand All @@ -1504,9 +1508,8 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
rename_mapper[snowflake_identifier] = snowflake_quoted_identifier_to_save

# first do a select to project out all unnecessary columns, then rename to avoid conflict
ordered_dataframe = self._modin_frame.ordered_dataframe.select(
identifiers_to_retain
)
ordered_dataframe = frame.ordered_dataframe.select(identifiers_to_retain)

return ordered_dataframe.to_projected_snowpark_dataframe(
col_mapper=rename_mapper
)
Expand Down
43 changes: 35 additions & 8 deletions tests/integ/modin/frame/test_to_snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,21 +181,48 @@ def test_to_snowflake_column_with_quotes(session, test_table_name):


# one extra query to convert index to native pandas when creating the snowpark pandas dataframe
@sql_count_checker(query_count=1)
def test_to_snowflake_index_label_none_raises(test_table_name):
@sql_count_checker(query_count=8)
def test_to_snowflake_index_label_none(test_table_name):
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df.to_snowflake(test_table_name, if_exists="replace")
verify_columns(test_table_name, ["index", "a", "b"])

message = re.escape(
"Label None is found in the index columns [None], which is invalid in Snowflake."
df = pd.DataFrame(
{"a": [1, 2, 3], "b": [4, 5, 6]}, index=pd.Index([2, 3, 4], name="index")
)
with pytest.raises(ValueError, match=message):
df.to_snowflake(test_table_name, if_exists="replace")
df.to_snowflake(test_table_name, if_exists="replace", index_label=[None])
verify_columns(test_table_name, ["index", "a", "b"])

# nameless index
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=pd.Index([2, 3, 4]))
df.to_snowflake(test_table_name, if_exists="replace", index_label=[None])
verify_columns(test_table_name, ["index", "a", "b"])


# one extra query to convert index to native pandas when creating the snowpark pandas dataframe
@sql_count_checker(query_count=6)
def test_to_snowflake_index_label_none_data_column_conflict(test_table_name):
df = pd.DataFrame({"index": [1, 2, 3], "a": [4, 5, 6]})
df.to_snowflake(test_table_name, if_exists="replace")
# If the column name "index" is taken by one of the data columns,
# then "level_0" is used instead for naming the index column.
# This is based on the behavior of reset_index.
verify_columns(test_table_name, ["level_0", "index", "a"])

df = pd.DataFrame(
{"a": [1, 2, 3], "b": [4, 5, 6]}, index=pd.Index([2, 3, 4], name="index")
{"index": [1, 2, 3], "a": [4, 5, 6]}, index=pd.Index([2, 3, 4], name="index")
)
with pytest.raises(ValueError, match=message):
# If the index already has a name, "index", then "level_0" is not used,
# and a ValueError is raised instead.
# This is based on the behavior of reset_index.
with pytest.raises(ValueError):
df.to_snowflake(test_table_name, if_exists="replace", index_label=[None])
# verify_columns(test_table_name, ["level_0", "index", "a"])

# nameless index
df = pd.DataFrame({"index": [1, 2, 3], "a": [4, 5, 6]}, index=pd.Index([2, 3, 4]))
df.to_snowflake(test_table_name, if_exists="replace", index_label=[None])
verify_columns(test_table_name, ["level_0", "index", "a"])


# one extra query to convert index to native pandas when creating the snowpark pandas dataframe
Expand Down
35 changes: 24 additions & 11 deletions tests/integ/modin/frame/test_to_snowpark.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ def test_to_snowpark_with_read_snowflake(tmp_table_basic, index) -> None:
assert snowpark_df.schema[start].column_identifier == '"row_number"'
assert isinstance(snowpark_df.schema[start].datatype, LongType)
start += 1
# verify the rest of data column is included

# verify the rest of data columns are included
assert snowpark_df.schema[start].column_identifier.name == "ID"
assert snowpark_df.schema[start].column_identifier.quoted_name == '"ID"'
assert isinstance(snowpark_df.schema[start].datatype, LongType)
Expand Down Expand Up @@ -94,7 +95,8 @@ def test_to_snowpark_from_pandas_df(native_pandas_df_basic) -> None:
# verify the index column is included
assert snowpark_df.schema[0].column_identifier.quoted_name == '"ID"'
assert isinstance(snowpark_df.schema[0].datatype, LongType)
# verify the rest of data column is included

# verify the rest of data columns are included
assert snowpark_df.schema[1].column_identifier.name == "FOOT_SIZE"
assert snowpark_df.schema[1].column_identifier.quoted_name == '"FOOT_SIZE"'
assert isinstance(snowpark_df.schema[1].datatype, DoubleType)
Expand Down Expand Up @@ -161,7 +163,8 @@ def test_to_snowpark_with_operations(session, tmp_table_basic) -> None:
assert isinstance(snowpark_df.schema[0].datatype, DoubleType)
assert snowpark_df.schema[1].column_identifier.quoted_name == '"ID"'
assert isinstance(snowpark_df.schema[1].datatype, LongType)
# verify the rest of data column is included

# verify the rest of data columns are included
assert snowpark_df.schema[2].column_identifier.quoted_name == '"model"'
assert isinstance(snowpark_df.schema[2].datatype, StringType)

Expand All @@ -181,15 +184,25 @@ def test_to_snowpark_with_duplicated_columns_raise(native_pandas_df_basic) -> No


@sql_count_checker(query_count=1)
def test_to_snowpark_with_none_index_label_raises(tmp_table_basic) -> None:
snow_dataframe = pd.read_snowflake(tmp_table_basic)
def test_to_snowpark_with_none_index_label(tmp_table_basic) -> None:
snowpandas_df = pd.read_snowflake(tmp_table_basic)

message = re.escape(
"Label None is found in the index columns [None], which is invalid in Snowflake. "
"Please give it a name by passing index_label arguments."
)
with pytest.raises(ValueError, match=message):
snow_dataframe.to_snowpark()
snowpark_df = snowpandas_df.to_snowpark()

# verify the index column is included
assert snowpark_df.schema[0].column_identifier == '"index"'
assert isinstance(snowpark_df.schema[0].datatype, LongType)

# verify the rest of data columns are included
assert snowpark_df.schema[1].column_identifier.name == "ID"
assert snowpark_df.schema[1].column_identifier.quoted_name == '"ID"'
assert isinstance(snowpark_df.schema[1].datatype, LongType)
assert snowpark_df.schema[2].column_identifier.name == "FOOT_SIZE"
assert snowpark_df.schema[2].column_identifier.quoted_name == '"FOOT_SIZE"'
assert isinstance(snowpark_df.schema[2].datatype, DoubleType)
assert snowpark_df.schema[3].column_identifier.name == "SHOE_MODEL"
assert snowpark_df.schema[3].column_identifier.quoted_name == '"SHOE_MODEL"'
assert isinstance(snowpark_df.schema[3].datatype, StringType)


@sql_count_checker(query_count=0)
Expand Down
11 changes: 4 additions & 7 deletions tests/integ/modin/series/test_to_snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,11 @@ def test_to_snowflake_index_name_conflict_negative(test_table_name, snow_series)
snow_series.to_snowflake(test_table_name, if_exists="replace", index_label="a")


@sql_count_checker(query_count=0)
def test_to_snowflake_index_label_none_raises(test_table_name):
@sql_count_checker(query_count=2)
def test_to_snowflake_index_label_none(test_table_name):
snow_series = pd.Series([1, 2, 3], name="a", index=native_pd.Index([4, 5, 6]))
message = re.escape(
"Label None is found in the index columns [None], which is invalid in Snowflake."
)
with pytest.raises(ValueError, match=message):
snow_series.to_snowflake(test_table_name, if_exists="replace", index=True)
snow_series.to_snowflake(test_table_name, if_exists="replace", index=True)
_verify_columns(test_table_name, ["index", "a"])


@sql_count_checker(query_count=2)
Expand Down
20 changes: 13 additions & 7 deletions tests/integ/modin/series/test_to_snowpark.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def test_to_snowpark_from_pandas_series(snow_series_basic, index, index_label) -
)
assert isinstance(snowpark_df.schema[start].datatype, StringType)
start += 1
# verify the rest of data column is included

# verify the data column is included
assert snowpark_df.schema[start].column_identifier.quoted_name == '"SER"'
assert isinstance(snowpark_df.schema[start].datatype, LongType)

Expand Down Expand Up @@ -77,13 +78,18 @@ def test_to_snowpark_with_no_series_name_raises(snow_series_basic) -> None:


@sql_count_checker(query_count=0)
def test_to_snowpark_with_no_index_name_raises() -> None:
def test_to_snowpark_with_no_index_name() -> None:
native_series = native_pd.Series(
[1, 2, 3, 4], index=native_pd.Index(["A", "B", "C", "D"]), name="SER"
)
snow_series = pd.Series(native_series)
message = re.escape(
"Label None is found in the index columns [None], which is invalid in Snowflake. "
)
with pytest.raises(ValueError, match=message):
snow_series.to_snowpark()

snowpark_df = snow_series.to_snowpark()

# verify the index column is included
assert snowpark_df.schema[0].column_identifier.quoted_name == '"index"'
assert isinstance(snowpark_df.schema[0].datatype, StringType)

# verify the data column is included
assert snowpark_df.schema[1].column_identifier.quoted_name == '"SER"'
assert isinstance(snowpark_df.schema[1].datatype, LongType)
Loading