[SPARK-51065][SQL] Disallowing non-nullable schema when Avro encoding is used for TransformWithState

ericm-db · HeartSaVioR · commit 6794e15c514c · 2025-02-09T13:19:23.000+09:00
### What changes were proposed in this pull request? Right now, we effectively set all fields in a schema to nullable, regardless of what the user specifies. - However, when Avro encoding is used, we want to enforce nullability in order to enable the schema evolution cases we support. - Nullability can only be set by the user in Python, so when non-nullable fields are defined, we throw an error - In Scala, Encoders.product sets fields to non-nullable by default (user cannot configure this), so we turn the fields to nullable ### Why are the changes needed? In order to keep parity between the user-specified schema and the actual schema that we use, and to enable the schema evolution use cases we want ### Does this PR introduce _any_ user-facing change? This error is thrown if the schema is defined as non-nullable ``` Traceback (most recent call last): File "/Users/eric.marnadi/spark/python/pyspark/sql/tests/pandas/test_pandas_transform_with_state.py", line 1496, in test_not_nullable_fails self._run_evolution_test( File "/Users/eric.marnadi/spark/python/pyspark/sql/tests/pandas/test_pandas_transform_with_state.py", line 1344, in _run_evolution_test q.processAllAvailable() File "/Users/eric.marnadi/spark/python/pyspark/sql/streaming/query.py", line 351, in processAllAvailable return self._jsq.processAllAvailable() File "/Users/eric.marnadi/spark/python/lib/py4j-0.10.9.9-src.zip/py4j/java_gateway.py", line 1362, in __call__ return_value = get_return_value( File "/Users/eric.marnadi/spark/python/pyspark/errors/exceptions/captured.py", line 258, in deco raise converted from None pyspark.errors.exceptions.captured.StreamingQueryException: [STREAM_FAILED] Query [id = 541c5df0-24e4-4702-b87a-c4edfb6a952c, runId = 4259c7b9-3846-4f73-9204-c3d71b07018c] terminated with exception: [STATE_STORE_SCHEMA_MUST_BE_NULLABLE] If schema evolution is enabled, all the fields in the schema for column family state must be nullable. 
Please set the 'spark.sql.streaming.stateStore.encodingFormat' to 'UnsafeRow' or make the schema nullable. Current schema: StructType(StructField(id,IntegerType,false),StructField(name,StringType,false)) SQLSTATE: XXKST SQLSTATE: XXKST === Streaming Query === Identifier: evolution_test [id = 541c5df0-24e4-4702-b87a-c4edfb6a952c, runId = 4259c7b9-3846-4f73-9204-c3d71b07018c] Current Committed Offsets: {} Current ``` ### How was this patch tested? Unit tests ### Was this patch authored or co-authored using generative AI tooling? No Closes #49751 from ericm-db/disallow-non-nullable-schema. Authored-by: Eric Marnadi <eric.marnadi@databricks.com> Signed-off-by: Jungtaek Lim <kabhwan.opensource@gmail.com> (cherry picked from commit 301b666) Signed-off-by: Jungtaek Lim <kabhwan.opensource@gmail.com>
diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
@@ -5059,6 +5059,14 @@
     ],
     "sqlState" : "42601"
   },
+  "TRANSFORM_WITH_STATE_SCHEMA_MUST_BE_NULLABLE" : {
+    "message" : [
+      "If Avro encoding is enabled, all the fields in the schema for column family <columnFamilyName> must be nullable",
+      "when using the TransformWithState operator.",
+      "Please make the schema nullable. Current schema: <schema>"
+    ],
+    "sqlState" : "XXKST"
+  },
   "TRANSPOSE_EXCEED_ROW_LIMIT" : {
     "message" : [
       "Number of rows exceeds the allowed limit of <maxValues> for TRANSPOSE. If this was intended, set <config> to at least the current row count."
diff --git a/python/pyspark/sql/tests/pandas/test_pandas_transform_with_state.py b/python/pyspark/sql/tests/pandas/test_pandas_transform_with_state.py
@@ -1470,6 +1470,39 @@ def check_exception(error):
                     check_exception=check_exception,
                 )
 
+    def test_not_nullable_fails(self):
+        """Check the error raised for a non-nullable state schema with Avro encoding.
+
+        Sets ``spark.sql.streaming.stateStore.encodingFormat`` to ``avro`` and runs
+        ``BasicProcessorNotNullable`` through ``_run_evolution_test``, supplying a
+        ``check_exception`` callback that only accepts a ``StreamingQueryException``
+        mentioning ``TRANSFORM_WITH_STATE_SCHEMA_MUST_BE_NULLABLE``.
+        """
+        with self.sql_conf({"spark.sql.streaming.stateStore.encodingFormat": "avro"}):
+            with tempfile.TemporaryDirectory() as checkpoint_dir:
+                # NOTE(review): unlike checkpoint_dir, this mkdtemp() directory is not
+                # removed in this method - confirm whether cleanup happens elsewhere.
+                input_path = tempfile.mkdtemp()
+                self._prepare_test_resource1(input_path)
+
+                df = self._build_test_df(input_path)
+
+                # Batch callback handed to _run_evolution_test; it asserts on the
+                # first collected row of the batch.
+                def check_basic_state(batch_df, batch_id):
+                    result = batch_df.collect()[0]
+                    assert result.value["id"] == 0  # First ID from test data
+                    assert result.value["name"] == "name-0"
+
+                # Returns True only for a StreamingQueryException whose message
+                # contains both the error class and the nullability text.
+                def check_exception(error):
+                    from pyspark.errors.exceptions.captured import StreamingQueryException
+
+                    if not isinstance(error, StreamingQueryException):
+                        return False
+
+                    error_msg = str(error)
+                    return (
+                        "[TRANSFORM_WITH_STATE_SCHEMA_MUST_BE_NULLABLE]" in error_msg
+                        and "column family state must be nullable" in error_msg
+                    )
+
+                self._run_evolution_test(
+                    BasicProcessorNotNullable(),
+                    checkpoint_dir,
+                    check_basic_state,
+                    df,
+                    check_exception=check_exception,
+                )
+
 
 class SimpleStatefulProcessorWithInitialState(StatefulProcessor):
     # this dict is the same as input initial state dataframe
@@ -1893,6 +1926,27 @@ def close(self) -> None:
         pass
 
 
+class BasicProcessorNotNullable(StatefulProcessor):
+    """Stateful processor whose value state schema declares non-nullable fields.
+
+    Used by ``test_not_nullable_fails``.
+    """
+
+    # Schema definitions
+    # Both fields pass False as the third StructField argument (nullable).
+    state_schema = StructType(
+        [StructField("id", IntegerType(), False), StructField("name", StringType(), False)]
+    )
+
+    def init(self, handle):
+        # Register a value state named "state" using the non-nullable schema above.
+        self.state = handle.getValueState("state", self.state_schema)
+
+    def handleInputRows(self, key, rows, timerValues) -> Iterator[pd.DataFrame]:
+        # Iterate over every input batch without using its contents.
+        for pdf in rows:
+            pass
+        # The state value is derived from the grouping key only.
+        id_val = int(key[0])
+        name = f"name-{id_val}"
+        self.state.update((id_val, name))
+        # Emit one row holding the key and the value that was just stored.
+        yield pd.DataFrame({"id": [key[0]], "value": [{"id": id_val, "name": name}]})
+
+    def close(self) -> None:
+        # Nothing to clean up.
+        pass
+
+
 class AddFieldsProcessor(StatefulProcessor):
     state_schema = StructType(
         [
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/TransformWithStateInPandasExec.scala
@@ -115,9 +115,14 @@ case class TransformWithStateInPandasExec(
   override def operatorStateMetadataVersion: Int = 2
 
   override def getColFamilySchemas(
-      setNullableFields: Boolean
+      shouldBeNullable: Boolean
   ): Map[String, StateStoreColFamilySchema] = {
-    driverProcessorHandle.getColumnFamilySchemas(setNullableFields)
+    // For Python, the user can explicitly set nullability on schema, so
+    // we need to throw an error if the schema is non-nullable
+    driverProcessorHandle.getColumnFamilySchemas(
+      shouldCheckNullable = shouldBeNullable,
+      shouldSetNullable = shouldBeNullable
+    )
   }
 
   override def getStateVariableInfos(): Map[String, TransformWithStateVariableInfo] = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ListStateMetricsImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ListStateMetricsImpl.scala
@@ -38,7 +38,7 @@ trait ListStateMetricsImpl {
   // We keep track of the count of entries in the list in a separate column family
   // to avoid scanning the entire list to get the count.
   private val counterCFValueSchema: StructType =
-    StructType(Seq(StructField("count", LongType, nullable = false)))
+    StructType(Seq(StructField("count", LongType, nullable = true)))
 
   private val counterCFProjection = UnsafeProjection.create(counterCFValueSchema)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateStoreColumnFamilySchemaUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateStoreColumnFamilySchemaUtils.scala
@@ -47,7 +47,7 @@ object StateStoreColumnFamilySchemaUtils {
         // Byte type is converted to Int in Avro, which doesn't work for us as Avro
         // uses zig-zag encoding as opposed to big-endian for Ints
         Seq(
-          StructField(s"${field.name}_marker", BinaryType, nullable = false),
+          StructField(s"${field.name}_marker", BinaryType, nullable = true),
           field.copy(name = s"${field.name}_value", BinaryType)
         )
       } else {
@@ -117,7 +117,7 @@ object StateStoreColumnFamilySchemaUtils {
       getRowCounterCFName(stateName), keySchemaId = 0,
       keyEncoder.schema,
       valueSchemaId = 0,
-      StructType(Seq(StructField("count", LongType, nullable = false))),
+      StructType(Seq(StructField("count", LongType, nullable = true))),
       Some(NoPrefixKeyStateEncoderSpec(keyEncoder.schema)))
     schemas.put(counterSchema.colFamilyName, counterSchema)
 
@@ -149,7 +149,7 @@ object StateStoreColumnFamilySchemaUtils {
         keySchemaId = 0,
         keyEncoder.schema,
         valueSchemaId = 0,
-        StructType(Seq(StructField("count", LongType, nullable = false))),
+        StructType(Seq(StructField("count", LongType, nullable = true))),
         Some(NoPrefixKeyStateEncoderSpec(keyEncoder.schema)))
       schemas.put(countSchema.colFamilyName, countSchema)
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala
@@ -363,18 +363,38 @@ class DriverStatefulProcessorHandleImpl(timeMode: TimeMode, keyExprEnc: Expressi
     addTimerColFamily()
   }
 
+  /**
+   * Returns all column family schemas, validating and/or enforcing value-schema nullability
+   * as requested. Both flags are only set to true when Avro encoding is enabled.
+   * @param shouldCheckNullable Whether we need to check the nullability. This is set to
+   *                            true when using Python, as this is the only avenue through
+   *                            which users can set nullability
+   * @param shouldSetNullable Whether we need to set the fields as nullable. This is set to
+   *                          true when using Scala, as primitive type encoders set the field
+   *                          to non-nullable. Changing fields from non-nullable to nullable
+   *                          does not break anything (and is required for Avro encoding), so
+   *                          we can safely make this change.
+   * @return column family schemas used by this stateful processor.
+   */
   def getColumnFamilySchemas(
-      setNullableFields: Boolean
+      shouldCheckNullable: Boolean,
+      shouldSetNullable: Boolean
   ): Map[String, StateStoreColFamilySchema] = {
     val schemas = columnFamilySchemas.toMap
-    if (setNullableFields) {
-      schemas.map { case (colFamilyName, stateStoreColFamilySchema) =>
-        colFamilyName -> stateStoreColFamilySchema.copy(
-          valueSchema = stateStoreColFamilySchema.valueSchema.toNullable
+    schemas.map { case (colFamilyName, schema) =>
+      schema.valueSchema.fields.foreach { field =>
+        if (!field.nullable && shouldCheckNullable) {
+          throw StateStoreErrors.twsSchemaMustBeNullable(
+            schema.colFamilyName, schema.valueSchema.toString())
+        }
+      }
+      if (shouldSetNullable) {
+        colFamilyName -> schema.copy(
+          valueSchema = schema.valueSchema.toNullable
         )
+      } else {
+        colFamilyName -> schema
       }
-    } else {
-      schemas
     }
   }
 
@@ -549,7 +569,7 @@ class DriverStatefulProcessorHandleImpl(timeMode: TimeMode, keyExprEnc: Expressi
       elementKeySchema: StructType): StateStoreColFamilySchema = {
     val countIndexName = s"$$count_$stateName"
     val countValueSchema = StructType(Seq(
-      StructField("count", LongType, nullable = false)
+      StructField("count", LongType)
     ))
 
     StateStoreColFamilySchema(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TTLState.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TTLState.scala
@@ -357,7 +357,7 @@ abstract class OneToManyTTLState(
   // Schema of the entry count index: elementKey -> count
   private val COUNT_INDEX = "$count_" + stateName
   private val COUNT_INDEX_VALUE_SCHEMA: StructType =
-    StructType(Seq(StructField("count", LongType, nullable = false)))
+    StructType(Seq(StructField("count", LongType)))
   private val countIndexValueProjector = UnsafeProjection.create(COUNT_INDEX_VALUE_SCHEMA)
 
   // Reused internal row that we use to create an UnsafeRow with the schema of
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala
@@ -140,7 +140,7 @@ case class TransformWithStateExec(
    * after init is called.
    */
   override def getColFamilySchemas(
-      setNullableFields: Boolean
+      shouldBeNullable: Boolean
   ): Map[String, StateStoreColFamilySchema] = {
     val keySchema = keyExpressions.toStructType
     // we have to add the default column family schema because the RocksDBStateEncoder
@@ -149,8 +149,11 @@ case class TransformWithStateExec(
       0, keyExpressions.toStructType, 0, DUMMY_VALUE_ROW_SCHEMA,
       Some(NoPrefixKeyStateEncoderSpec(keySchema)))
 
+    // For Scala, the user can't explicitly set nullability on schema, so there is
+    // no reason to throw an error, and we can simply set the schema to nullable.
     val columnFamilySchemas = getDriverProcessorHandle()
-      .getColumnFamilySchemas(setNullableFields) ++
+      .getColumnFamilySchemas(
+        shouldCheckNullable = false, shouldSetNullable = shouldBeNullable) ++
         Map(StateStore.DEFAULT_COL_FAMILY_NAME -> defaultSchema)
     closeProcessorHandle()
     columnFamilySchemas
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateVariableUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateVariableUtils.scala
@@ -175,7 +175,9 @@ object TransformWithStateOperatorProperties extends Logging {
  */
 trait TransformWithStateMetadataUtils extends Logging {
 
-  def getColFamilySchemas(setNullableFields: Boolean): Map[String, StateStoreColFamilySchema]
+  // This method will return the column family schemas, and check whether the fields in the
+  // schema are nullable. If Avro encoding is used, we want to enforce nullability
+  def getColFamilySchemas(shouldBeNullable: Boolean): Map[String, StateStoreColFamilySchema]
 
   def getStateVariableInfos(): Map[String, TransformWithStateVariableInfo]
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreErrors.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreErrors.scala
@@ -145,6 +145,12 @@ object StateStoreErrors {
     new StateStoreValueSchemaNotCompatible(storedValueSchema, newValueSchema)
   }
 
+  /**
+   * Builds the error reported when a TransformWithState column family schema
+   * contains a non-nullable field.
+   *
+   * @param columnFamilyName name of the offending column family
+   * @param schema string form of the schema, included in the error message
+   * @return a new [[TWSSchemaMustBeNullable]] instance
+   */
+  def twsSchemaMustBeNullable(
+      columnFamilyName: String,
+      schema: String): TWSSchemaMustBeNullable = {
+    new TWSSchemaMustBeNullable(columnFamilyName, schema)
+  }
+
   def stateStoreInvalidValueSchemaEvolution(
       oldValueSchema: String,
       newValueSchema: String): StateStoreInvalidValueSchemaEvolution = {
@@ -346,6 +352,15 @@ class StateStoreValueSchemaNotCompatible(
       "storedValueSchema" -> storedValueSchema,
       "newValueSchema" -> newValueSchema))
 
+/**
+ * Exception for the TRANSFORM_WITH_STATE_SCHEMA_MUST_BE_NULLABLE error condition.
+ *
+ * @param columnFamilyName value substituted for `columnFamilyName` in the error message
+ * @param schema value substituted for `schema` in the error message
+ */
+class TWSSchemaMustBeNullable(
+    columnFamilyName: String,
+    schema: String)
+  extends SparkUnsupportedOperationException(
+    errorClass = "TRANSFORM_WITH_STATE_SCHEMA_MUST_BE_NULLABLE",
+    messageParameters = Map(
+      "columnFamilyName" -> columnFamilyName,
+      "schema" -> schema))
+
 class StateStoreInvalidValueSchemaEvolution(
     oldValueSchema: String,
     newValueSchema: String)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateSuite.scala
@@ -1406,7 +1406,7 @@ abstract class TransformWithStateSuite extends StateStoreMetricsTest
             val schema3 = StateStoreColFamilySchema(
               "$rowCounter_listState", 0,
               keySchema, 0,
-              new StructType().add("count", LongType, nullable = shouldBeNullable),
+              new StructType().add("count", LongType, nullable = true),
               Some(NoPrefixKeyStateEncoderSpec(keySchema)),
               None
             )
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithValueStateTTLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithValueStateTTLSuite.scala
@@ -317,15 +317,15 @@ class TransformWithValueStateTTLSuite extends TransformWithStateTTLTest {
         val schema2 = StateStoreColFamilySchema(
           "$count_listState", 0,
           keySchema, 0,
-          new StructType().add("count", LongType, nullable = shouldBeNullable),
+          new StructType().add("count", LongType, nullable = true),
           Some(NoPrefixKeyStateEncoderSpec(keySchema)),
           None
         )
 
         val schema3 = StateStoreColFamilySchema(
           "$rowCounter_listState", 0,
           keySchema, 0,
-          new StructType().add("count", LongType, nullable = shouldBeNullable),
+          new StructType().add("count", LongType, nullable = true),
           Some(NoPrefixKeyStateEncoderSpec(keySchema)),
           None
         )

Original file line number	Diff line number	Diff line change
`@@ -1406,7 +1406,7 @@ abstract class TransformWithStateSuite extends StateStoreMetricsTest`
`1406`	`1406`	`val schema3 = StateStoreColFamilySchema(`
`1407`	`1407`	`"$rowCounter_listState", 0,`
`1408`	`1408`	`keySchema, 0,`
`1409`		`- new StructType().add("count", LongType, nullable = shouldBeNullable),`
	`1409`	`+ new StructType().add("count", LongType, nullable = true),`
`1410`	`1410`	`Some(NoPrefixKeyStateEncoderSpec(keySchema)),`
`1411`	`1411`	`None`
`1412`	`1412`	`)`