[SPARK-51065][SQL] Disallowing non-nullable schema when Avro encoding is used for TransformWithState #49751
base: master
Changes from 19 commits
```diff
@@ -1470,6 +1470,39 @@ def check_exception(error):
             check_exception=check_exception,
         )

+    def test_not_nullable_fails(self):
+        with self.sql_conf({"spark.sql.streaming.stateStore.encodingFormat": "avro"}):
+            with tempfile.TemporaryDirectory() as checkpoint_dir:
+                input_path = tempfile.mkdtemp()
+                self._prepare_test_resource1(input_path)
+
+                df = self._build_test_df(input_path)
+
+                def check_basic_state(batch_df, batch_id):
+                    result = batch_df.collect()[0]
+                    assert result.value["id"] == 0  # First ID from test data
+                    assert result.value["name"] == "name-0"
+
+                def check_exception(error):
+                    from pyspark.errors.exceptions.captured import StreamingQueryException
+
+                    if not isinstance(error, StreamingQueryException):
+                        return False
+
+                    error_msg = str(error)
+                    return (
+                        "[TRANSFORM_WITH_STATE_SCHEMA_MUST_BE_NULLABLE]" in error_msg
+                        and "column family state must be nullable" in error_msg
+                    )
+
+                self._run_evolution_test(
+                    BasicProcessorNotNullable(),
+                    checkpoint_dir,
+                    check_basic_state,
+                    df,
+                    check_exception=check_exception,
+                )
+
+
 class SimpleStatefulProcessorWithInitialState(StatefulProcessor):
     # this dict is the same as input initial state dataframe
```

Review discussion on `test_not_nullable_fails`:

- Why not have an identical test in Scala as well? I don't see a new test verifying the error.
- The thing is, there is no way for the user to specify this using Scala.
- Yes, and probably also no. I agree moderate users may never try to get around it and will just stick with case classes or POJOs. But "we" can imagine a way to get around it, exactly the same way we could support PySpark: this is how we came up with the state encoder for the Python version of FMGWS. It does serde through the Row interface - my rough memory says it's Row rather than InternalRow, so it most likely works with GenericRow, but we can try both GenericRow and InternalRow. I'm OK with deferring this as a follow-up.
- Now I get that you are not actually able to test this, as we have to just accept a non-nullable column and change it to nullable. Again, I doubt this is just a bug though.
- (Not really a bug, we figured out.)
```diff
@@ -1893,6 +1926,27 @@ def close(self) -> None:
         pass


+class BasicProcessorNotNullable(StatefulProcessor):
+    # Schema definitions
+    state_schema = StructType(
+        [StructField("id", IntegerType(), False), StructField("name", StringType(), False)]
+    )
+
+    def init(self, handle):
+        self.state = handle.getValueState("state", self.state_schema)
+
+    def handleInputRows(self, key, rows, timerValues) -> Iterator[pd.DataFrame]:
+        for pdf in rows:
+            pass
+        id_val = int(key[0])
+        name = f"name-{id_val}"
+        self.state.update((id_val, name))
+        yield pd.DataFrame({"id": [key[0]], "value": [{"id": id_val, "name": name}]})
+
+    def close(self) -> None:
+        pass
+
+
 class AddFieldsProcessor(StatefulProcessor):
     state_schema = StructType(
         [
```
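`BasicProcessorNotNullable` declares both value fields with `nullable=False`, which the Avro path now rejects. The rule can be sketched as a small framework-free Python check; the names `Field` and `validate_value_schema` are hypothetical and not Spark APIs:

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class Field:
    name: str
    dtype: str
    nullable: bool = True


def validate_value_schema(col_family: str, fields: list) -> None:
    # Mirrors the new check: with Avro encoding, every value-schema field
    # must be nullable; otherwise raise the dedicated error.
    for f in fields:
        if not f.nullable:
            raise ValueError(
                "[TRANSFORM_WITH_STATE_SCHEMA_MUST_BE_NULLABLE] column family "
                f"{col_family} state must be nullable, got non-nullable {f.name!r}"
            )


# The schema shape used by BasicProcessorNotNullable trips the check:
bad = [Field("id", "int", nullable=False), Field("name", "string", nullable=False)]
try:
    validate_value_schema("state", bad)
    raised = False
except ValueError as e:
    raised = "MUST_BE_NULLABLE" in str(e)
print(raised)  # True
```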
```diff
@@ -47,7 +47,7 @@ object StateStoreColumnFamilySchemaUtils {
       // Byte type is converted to Int in Avro, which doesn't work for us as Avro
       // uses zig-zag encoding as opposed to big-endian for Ints
       Seq(
-        StructField(s"${field.name}_marker", BinaryType, nullable = false),
+        StructField(s"${field.name}_marker", BinaryType, nullable = true),
         field.copy(name = s"${field.name}_value", BinaryType)
       )
     } else {
```
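The code comment in this hunk notes that Avro encodes ints with zig-zag rather than big-endian, which is why these byte fields are kept as `BinaryType`. A pure-Python sketch (the `zigzag` helper is illustrative, not a Spark or Avro API) shows why zig-zag output cannot be compared byte-wise for ordering:

```python
def zigzag(n: int) -> int:
    # ZigZag maps signed ints onto unsigned ones so small magnitudes stay
    # small: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
    # (64-bit form; Python ints are arbitrary precision, so this works as-is.)
    return (n << 1) ^ (n >> 63)


# Big-endian fixed-width bytes preserve numeric order; zig-zag interleaves
# negatives and positives, so comparing the encoded values no longer matches
# numeric comparison of the originals.
vals = [-2, -1, 0, 1, 2]
encoded = [zigzag(v) for v in vals]
print(encoded)  # [3, 1, 0, 2, 4] -- not monotonically increasing
```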
```diff
@@ -117,7 +117,7 @@ object StateStoreColumnFamilySchemaUtils {
       getRowCounterCFName(stateName), keySchemaId = 0,
       keyEncoder.schema,
       valueSchemaId = 0,
-      StructType(Seq(StructField("count", LongType, nullable = false))),
+      StructType(Seq(StructField("count", LongType, nullable = true))),
       Some(NoPrefixKeyStateEncoderSpec(keyEncoder.schema)))
     schemas.put(counterSchema.colFamilyName, counterSchema)
```

```diff
@@ -149,7 +149,7 @@ object StateStoreColumnFamilySchemaUtils {
       keySchemaId = 0,
       keyEncoder.schema,
       valueSchemaId = 0,
-      StructType(Seq(StructField("count", LongType, nullable = false))),
+      StructType(Seq(StructField("count", LongType, nullable = true))),
       Some(NoPrefixKeyStateEncoderSpec(keyEncoder.schema)))
     schemas.put(countSchema.colFamilyName, countSchema)
   }
```
```diff
@@ -363,18 +363,36 @@ class DriverStatefulProcessorHandleImpl(timeMode: TimeMode, keyExprEnc: Expressi
     addTimerColFamily()
   }

+  /**
+   * This method returns all column family schemas, and checks and enforces nullability
+   * if need be. The nullability check and set are only enabled when Avro is used.
+   * @param shouldCheckNullable Whether we need to check the nullability. This is set to
+   *                            true when using Python, as this is the only avenue through
+   *                            which users can set nullability.
+   * @param shouldSetNullable Whether we need to set the fields as nullable. This is set to
+   *                          true when using Scala, as case classes are set to
+   *                          non-nullable by default.
+   * @return column family schemas used by this stateful processor.
+   */
   def getColumnFamilySchemas(
-      setNullableFields: Boolean
+      shouldCheckNullable: Boolean,
+      shouldSetNullable: Boolean
   ): Map[String, StateStoreColFamilySchema] = {
     val schemas = columnFamilySchemas.toMap
-    if (setNullableFields) {
-      schemas.map { case (colFamilyName, stateStoreColFamilySchema) =>
-        colFamilyName -> stateStoreColFamilySchema.copy(
-          valueSchema = stateStoreColFamilySchema.valueSchema.toNullable
-        )
-      }
-    } else {
-      schemas
-    }
+    schemas.map { case (colFamilyName, schema) =>
+      schema.valueSchema.fields.foreach { field =>
+        if (!field.nullable && shouldCheckNullable) {
+          throw StateStoreErrors.twsSchemaMustBeNullable(
+            schema.colFamilyName, schema.valueSchema.toString())
+        }
+      }
+      if (shouldSetNullable) {
+        colFamilyName -> schema.copy(
+          valueSchema = schema.valueSchema.toNullable
+        )
+      } else {
+        colFamilyName -> schema
+      }
+    }
   }
```

Review discussion on `shouldSetNullable` (Scala case classes defaulting to non-nullable):

- I'm actually surprised, and it sounds like a bug to me. (Sorry, you had to handle Python and Scala differently due to this. My bad.) What if you set …? If this is indeed a bug and we can fix it, then we can simplify things a lot. I'm OK if you want to defer this, but we definitely need a follow-up ticket for it.
- Maybe it is only true for primitive types? If so, it might make sense as an optimization for types that can never be null. If you see non-nullable for String or the like, that should be a bug.
- I found this, …
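To summarize the two flags above: `shouldCheckNullable` (the Python path) rejects non-nullable fields, while `shouldSetNullable` (the Scala path) silently coerces them to nullable. A condensed pure-Python model of that branch logic, with `normalize_schema` as a hypothetical name and fields modeled as `(name, nullable)` pairs:

```python
def normalize_schema(fields, should_check_nullable, should_set_nullable):
    # Python path: users can declare nullability, so reject bad schemas.
    if should_check_nullable:
        for name, nullable in fields:
            if not nullable:
                raise ValueError(f"field {name!r} must be nullable")
    # Scala path: users cannot declare nullability, so coerce everything
    # to nullable (the equivalent of StructType.toNullable).
    if should_set_nullable:
        return [(name, True) for name, _ in fields]
    return fields


fields = [("count", False)]
# Scala-style call site: no check, coerce.
coerced = normalize_schema(fields, should_check_nullable=False, should_set_nullable=True)
print(coerced)  # [('count', True)]
```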
```diff
@@ -549,7 +567,7 @@ class DriverStatefulProcessorHandleImpl(timeMode: TimeMode, keyExprEnc: Expressi
       elementKeySchema: StructType): StateStoreColFamilySchema = {
     val countIndexName = s"$$count_$stateName"
     val countValueSchema = StructType(Seq(
-      StructField("count", LongType, nullable = false)
+      StructField("count", LongType)
     ))

     StateStoreColFamilySchema(
```
```diff
@@ -140,7 +140,7 @@ case class TransformWithStateExec(
    * after init is called.
    */
   override def getColFamilySchemas(
-      setNullableFields: Boolean
+      shouldBeNullable: Boolean
   ): Map[String, StateStoreColFamilySchema] = {
     val keySchema = keyExpressions.toStructType
     // we have to add the default column family schema because the RocksDBStateEncoder
```

```diff
@@ -149,8 +149,11 @@ case class TransformWithStateExec(
       0, keyExpressions.toStructType, 0, DUMMY_VALUE_ROW_SCHEMA,
       Some(NoPrefixKeyStateEncoderSpec(keySchema)))

+    // For Scala, the user can't explicitly set nullability on schema, so there is
+    // no reason to throw an error, and we can simply set the schema to nullable.
     val columnFamilySchemas = getDriverProcessorHandle()
-      .getColumnFamilySchemas(setNullableFields) ++
+      .getColumnFamilySchemas(
+        shouldCheckNullable = false, shouldSetNullable = shouldBeNullable) ++
       Map(StateStore.DEFAULT_COL_FAMILY_NAME -> defaultSchema)
     closeProcessorHandle()
     columnFamilySchemas
```

Review discussion on the new comment:

- As I mentioned in another comment, it is not impossible to set nullability on the encoder (although I tend to agree most users won't). Let's not make this conditional. Also, this concerns me: if we are very confident that users would never be able to set a column's nullability, why do we need to change the schema when we all know it has to be nullable? What are we worried about if we just do the same as Python?
- Reply: #49751 (comment)
```diff
@@ -145,6 +145,12 @@ object StateStoreErrors {
     new StateStoreValueSchemaNotCompatible(storedValueSchema, newValueSchema)
   }

+  def twsSchemaMustBeNullable(
+      columnFamilyName: String,
+      schema: String): TWSSchemaMustBeNullable = {
+    new TWSSchemaMustBeNullable(columnFamilyName, schema)
+  }
+
   def stateStoreInvalidValueSchemaEvolution(
       oldValueSchema: String,
       newValueSchema: String): StateStoreInvalidValueSchemaEvolution = {
```

Review comment on `twsSchemaMustBeNullable`:

- I think TWS deserves its own error collection class, but I agree this is out of scope. Let's make a follow-up.

```diff
@@ -346,6 +352,15 @@ class StateStoreValueSchemaNotCompatible(
     "storedValueSchema" -> storedValueSchema,
     "newValueSchema" -> newValueSchema))

+class TWSSchemaMustBeNullable(
+    columnFamilyName: String,
+    schema: String)
+  extends SparkUnsupportedOperationException(
+    errorClass = "TRANSFORM_WITH_STATE_SCHEMA_MUST_BE_NULLABLE",
+    messageParameters = Map(
+      "columnFamilyName" -> columnFamilyName,
+      "schema" -> schema))
+
 class StateStoreInvalidValueSchemaEvolution(
     oldValueSchema: String,
     newValueSchema: String)
```
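The `TRANSFORM_WITH_STATE_SCHEMA_MUST_BE_NULLABLE` error class carries `columnFamilyName` and `schema` as message parameters. As a rough sketch of how such a parameterized template renders (the template text and `render` helper here are illustrative, not the exact message or mechanism in Spark's error-conditions file):

```python
# Hypothetical template keyed by error class. Spark keeps such templates in a
# JSON error-conditions file and substitutes <param> placeholders.
TEMPLATES = {
    "TRANSFORM_WITH_STATE_SCHEMA_MUST_BE_NULLABLE": (
        "Schema for the <columnFamilyName> column family state must be "
        "nullable when Avro encoding is used. Schema: <schema>"
    ),
}


def render(error_class: str, params: dict) -> str:
    # Substitute each <key> placeholder, then prepend the bracketed class
    # name the way Spark error messages surface it to users.
    msg = TEMPLATES[error_class]
    for key, value in params.items():
        msg = msg.replace(f"<{key}>", value)
    return f"[{error_class}] {msg}"


msg_out = render(
    "TRANSFORM_WITH_STATE_SCHEMA_MUST_BE_NULLABLE",
    {"columnFamilyName": "state", "schema": "StructType(...)"},
)
print(msg_out)
```

Note that the PySpark test above matches on both the bracketed class name and a substring of the rendered message, which is what this sketch reproduces.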
Review discussion on the error message wording:

- Which do we think is easier to understand, "using Avro" or "schema evolution is enabled"? I foresee the direction of using Avro for all stateful operators (unless there is an outstanding regression), and once we make Avro the default, this message will be confusing to consume because users don't do anything with schema evolution. IMO it is "indirect" information, and they would probably try to figure out how to disable schema evolution instead, without knowing that Avro and schema evolution are coupled. cc @anishshri-db to hear his voice.
- Yeah, I think it's fine to say that we refer to the `transformWithState` case relative to `Avro` being used - we don't need to explicitly call out schema evolution here.