Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SNOW-1830033: Dataframe API improvements #2811

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,17 @@
- Fixed a bug in options sql generation that could cause multiple values to be formatted incorrectly.
- Fixed a bug in `Session.catalog` where empty strings for database or schema were not handled correctly and were generating erroneous sql statements.

#### Behavior Changes

- Added new methods in class `DataFrame`:
  - `col_regex`: Select columns that match the provided regex.
  - `map` and its alias `foreach`: A method to apply a user function to each row with a one-to-one mapping.
  - `flat_map`: A method to apply a user function to each row with a one-to-many mapping.
  - `toJSON` and its alias `to_json`: Convert each row of the dataframe into a JSON string.
  - `transform`: Chain multiple transformations on a dataframe.
- Removed Snowpark Python function `snowflake_cortex_summarize`. Users can install snowflake-ml-python and use the snowflake.cortex.summarize function instead.
- Removed Snowpark Python function `snowflake_cortex_sentiment`. Users can install snowflake-ml-python and use the snowflake.cortex.sentiment function instead.

#### Experimental Features

- Added support for writing pyarrow Tables to Snowflake tables.
Expand Down Expand Up @@ -103,6 +114,7 @@

#### New Features

- Added support for `DataFrame.summary()` to compute desired statistics of a DataFrame.
- Added support for the following functions in `functions.py`
- `array_reverse`
- `divnull`
Expand Down Expand Up @@ -235,6 +247,7 @@

#### New Features

- Added support for property `version` and class method `get_active_session` for `Session` class.
- Added new methods and variables to enhance data type handling and JSON serialization/deserialization:
- To `DataType`, its derived classes, and `StructField`:
Expand Down
8 changes: 7 additions & 1 deletion docs/source/snowpark/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ DataFrame
DataFrame.fillna
DataFrame.filter
DataFrame.first
DataFrame.flat_map
DataFrame.flatMap
DataFrame.flatten
DataFrame.groupBy
DataFrame.group_by
Expand All @@ -60,6 +62,7 @@ DataFrame
DataFrame.join
DataFrame.join_table_function
DataFrame.limit
DataFrame.map
DataFrame.minus
DataFrame.natural_join
DataFrame.orderBy
Expand All @@ -81,15 +84,19 @@ DataFrame
DataFrame.show
DataFrame.sort
DataFrame.subtract
DataFrame.summary
DataFrame.take
DataFrame.toDF
DataFrame.toJSON
DataFrame.toLocalIterator
DataFrame.toPandas
DataFrame.to_df
DataFrame.to_local_iterator
DataFrame.to_json
DataFrame.to_pandas
DataFrame.to_pandas_batches
DataFrame.to_snowpark_pandas
DataFrame.transform
DataFrame.union
DataFrame.unionAll
DataFrame.unionAllByName
Expand Down Expand Up @@ -119,7 +126,6 @@ DataFrame
DataFrameAnalyticsFunctions.compute_lag
DataFrameAnalyticsFunctions.compute_lead
DataFrameAnalyticsFunctions.time_series_agg
dataframe.map



Expand Down
2 changes: 0 additions & 2 deletions docs/source/snowpark/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -312,8 +312,6 @@ Functions
sinh
size
skew
snowflake_cortex_sentiment
snowflake_cortex_summarize
sort_array
soundex
split
Expand Down
17 changes: 14 additions & 3 deletions src/snowflake/snowpark/_internal/analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
SubfieldString,
UnresolvedAttribute,
WithinGroup,
UnresolvedColumnRegex,
)
from snowflake.snowpark._internal.analyzer.grouping_set import (
GroupingSet,
Expand Down Expand Up @@ -332,7 +333,7 @@ def analyze(
if isinstance(expr, WindowSpecDefinition):
return window_spec_expression(
[
self.analyze(
self.to_sql_try_avoid_cast(
x, df_aliased_col_name_to_real_col_name, parse_local_name
)
for x in expr.partition_spec
Expand Down Expand Up @@ -424,6 +425,14 @@ def analyze(
]
)

if isinstance(expr, UnresolvedColumnRegex):
return ",".join(
[
self.analyze(e, df_aliased_col_name_to_real_col_name)
for e in expr.expressions
]
)

if isinstance(expr, SnowflakeUDF):
if expr.api_call_source is not None:
self.session._conn._telemetry_client.send_function_usage_telemetry(
Expand Down Expand Up @@ -454,7 +463,7 @@ def analyze(
return table_function_partition_spec(
expr.over,
[
self.analyze(
self.to_sql_try_avoid_cast(
x, df_aliased_col_name_to_real_col_name, parse_local_name
)
for x in expr.partition_spec
Expand Down Expand Up @@ -621,7 +630,9 @@ def table_function_expression_extractor(
"NamedArgumentsTableFunction, GeneratorTableFunction, or FlattenFunction."
)
partition_spec_sql = (
self.analyze(expr.partition_spec, df_aliased_col_name_to_real_col_name)
self.to_sql_try_avoid_cast(
expr.partition_spec, df_aliased_col_name_to_real_col_name
)
if expr.partition_spec
else ""
)
Expand Down
19 changes: 19 additions & 0 deletions src/snowflake/snowpark/_internal/analyzer/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,25 @@ def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
)


class UnresolvedColumnRegex(Expression):
    """Expression holding the resolved set of columns selected by a regex match.

    Wraps one ``Attribute`` per matched column; the analyzer renders this as a
    plain comma-separated column list (``SELECT col1, col2, ... FROM child``).
    """

    def __init__(self, expressions: List[Attribute]) -> None:
        super().__init__()
        # A regex that matches nothing should be rejected before reaching here.
        assert expressions
        self.expressions = expressions

    def dependent_column_names(self) -> Optional[AbstractSet[str]]:
        """Set of column names this expression depends on (deduplicated)."""
        return derive_dependent_columns(*self.expressions)

    def dependent_column_names_with_duplication(self) -> List[str]:
        """Dependent column names, preserving duplicates and order."""
        return derive_dependent_columns_with_duplication(*self.expressions)

    @property
    def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
        """Plan complexity: one COLUMN unit per matched column in the projection."""
        matched_column_count = len(self.expressions)
        return {PlanNodeCategory.COLUMN: matched_column_count}


class UnresolvedAttribute(Expression, NamedExpression):
def __init__(
self, name: str, is_sql_text: bool = False, df_alias: Optional[str] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
Star,
UnresolvedAttribute,
derive_dependent_columns,
UnresolvedColumnRegex,
)
from snowflake.snowpark._internal.analyzer.schema_utils import analyze_attributes
from snowflake.snowpark._internal.analyzer.snowflake_plan import Query, SnowflakePlan
Expand Down Expand Up @@ -1801,11 +1802,14 @@ def derive_column_states_from_subquery(
analyzer = from_.analyzer
column_states = ColumnStateDict()
for c in cols:
if isinstance(c, UnresolvedAlias) and isinstance(c.child, Star):
if isinstance(c, UnresolvedAlias) and (
isinstance(c.child, Star) or isinstance(c.child, UnresolvedColumnRegex)
):
if c.child.expressions:
# df.select(df["*"]) will have child expressions. df.select("*") doesn't.
# df.select(df.colRegex(...)) will have a column expressions
columns_from_star = [copy(e) for e in c.child.expressions]
elif c.child.df_alias:
elif isinstance(c.child, Star) and c.child.df_alias:
if c.child.df_alias not in from_.df_aliased_col_name_to_real_col_name:
raise SnowparkClientExceptionMessages.DF_ALIAS_NOT_RECOGNIZED(
c.child.df_alias
Expand Down
Loading
Loading