-
Notifications
You must be signed in to change notification settings - Fork 28.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-51065][SQL] Disallowing non-nullable schema when Avro encoding is used for TransformWithState #49751
[SPARK-51065][SQL] Disallowing non-nullable schema when Avro encoding is used for TransformWithState #49751
Changes from all commits
98e423a
df5359e
16106a6
fbfa147
6786133
96b14c3
570820a
6697723
8897e58
84b5342
ac28140
409164f
45de742
77a9c3f
42affc0
9b947cb
5452b68
ade7074
4ef6468
ce8bb14
f0b3d8c
2f33bb4
a28aba8
d628b8d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,7 +47,7 @@ object StateStoreColumnFamilySchemaUtils { | |
// Byte type is converted to Int in Avro, which doesn't work for us as Avro | ||
// uses zig-zag encoding as opposed to big-endian for Ints | ||
Seq( | ||
StructField(s"${field.name}_marker", BinaryType, nullable = false), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lets say |
||
StructField(s"${field.name}_marker", BinaryType, nullable = true), | ||
field.copy(name = s"${field.name}_value", BinaryType) | ||
) | ||
} else { | ||
|
@@ -117,7 +117,7 @@ object StateStoreColumnFamilySchemaUtils { | |
getRowCounterCFName(stateName), keySchemaId = 0, | ||
keyEncoder.schema, | ||
valueSchemaId = 0, | ||
StructType(Seq(StructField("count", LongType, nullable = false))), | ||
StructType(Seq(StructField("count", LongType, nullable = true))), | ||
Some(NoPrefixKeyStateEncoderSpec(keyEncoder.schema))) | ||
schemas.put(counterSchema.colFamilyName, counterSchema) | ||
|
||
|
@@ -149,7 +149,7 @@ object StateStoreColumnFamilySchemaUtils { | |
keySchemaId = 0, | ||
keyEncoder.schema, | ||
valueSchemaId = 0, | ||
StructType(Seq(StructField("count", LongType, nullable = false))), | ||
StructType(Seq(StructField("count", LongType, nullable = true))), | ||
Some(NoPrefixKeyStateEncoderSpec(keyEncoder.schema))) | ||
schemas.put(countSchema.colFamilyName, countSchema) | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -140,7 +140,7 @@ case class TransformWithStateExec( | |
* after init is called. | ||
*/ | ||
override def getColFamilySchemas( | ||
setNullableFields: Boolean | ||
shouldBeNullable: Boolean | ||
): Map[String, StateStoreColFamilySchema] = { | ||
val keySchema = keyExpressions.toStructType | ||
// we have to add the default column family schema because the RocksDBStateEncoder | ||
|
@@ -149,8 +149,11 @@ case class TransformWithStateExec( | |
0, keyExpressions.toStructType, 0, DUMMY_VALUE_ROW_SCHEMA, | ||
Some(NoPrefixKeyStateEncoderSpec(keySchema))) | ||
|
||
// For Scala, the user can't explicitly set nullability on schema, so there is | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Likewise I mentioned in other comment, it is not impossible to set nullability on encoder (although I tend to agree most users won't). Let's not make this be conditional. Also, this is concerning me - if we are very confident that users would never be able to set column to be nullable, why we need to change the schema as we all know it has to be nullable? What we are worrying about if we just do the same with Python? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. #49751 (comment) |
||
// no reason to throw an error, and we can simply set the schema to nullable. | ||
val columnFamilySchemas = getDriverProcessorHandle() | ||
.getColumnFamilySchemas(setNullableFields) ++ | ||
.getColumnFamilySchemas( | ||
shouldCheckNullable = false, shouldSetNullable = shouldBeNullable) ++ | ||
Map(StateStore.DEFAULT_COL_FAMILY_NAME -> defaultSchema) | ||
closeProcessorHandle() | ||
columnFamilySchemas | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -145,6 +145,12 @@ object StateStoreErrors { | |
new StateStoreValueSchemaNotCompatible(storedValueSchema, newValueSchema) | ||
} | ||
|
||
def twsSchemaMustBeNullable( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think TWS deserves its own error collection class, but I agree this is out of scope. Let's make a follow-up. |
||
columnFamilyName: String, | ||
schema: String): TWSSchemaMustBeNullable = { | ||
new TWSSchemaMustBeNullable(columnFamilyName, schema) | ||
} | ||
|
||
def stateStoreInvalidValueSchemaEvolution( | ||
oldValueSchema: String, | ||
newValueSchema: String): StateStoreInvalidValueSchemaEvolution = { | ||
|
@@ -346,6 +352,15 @@ class StateStoreValueSchemaNotCompatible( | |
"storedValueSchema" -> storedValueSchema, | ||
"newValueSchema" -> newValueSchema)) | ||
|
||
class TWSSchemaMustBeNullable( | ||
columnFamilyName: String, | ||
schema: String) | ||
extends SparkUnsupportedOperationException( | ||
errorClass = "TRANSFORM_WITH_STATE_SCHEMA_MUST_BE_NULLABLE", | ||
messageParameters = Map( | ||
"columnFamilyName" -> columnFamilyName, | ||
"schema" -> schema)) | ||
|
||
class StateStoreInvalidValueSchemaEvolution( | ||
oldValueSchema: String, | ||
newValueSchema: String) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not having identical test in Scala as well? I don't see a new test verifying the error.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The thing is, there is no way for user to specify this using Scala.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes and probably also no.
I agree moderate users may not ever try to get over and just stick with case class or POJO or so. But "we" can imagine a way to get over, exactly the same way how we could support PySpark:
This is how we come up with state encoder for Python version of FMGWS. This is to serde with Row interface - my rough memory says it's not InternalRow but Row, so, most likely work with GenericRow, but we can try with both GenericRow and InternalRow.
I'm OK with deferring this as follow-up.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now I get that you are not able to test this actually, as we have to just accept non-nullable column and change to nullable. Again I doubt this is just a bug though.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Not really a bug, we figured out.)