Skip to content

Commit f866549

Browse files
viirya authored and beliefer committed
[SPARK-45816][SQL] Return NULL when overflowing during casting from timestamp to integers
### What changes were proposed in this pull request? Spark cast works in two modes: ansi and non-ansi. When overflowing during casting, the common behavior under non-ansi mode is to return null. However, casting from Timestamp to Int/Short/Byte returns a wrapping value now. The behavior to silently overflow doesn't make sense. This patch changes it to the common behavior, i.e., returning null. ### Why are the changes needed? Returning a wrapping value, e.g., negative one, during casting Timestamp to Int/Short/Byte could implicitly cause misinterpret casted result without caution. We also should follow the common behavior of overflowing handling. ### Does this PR introduce _any_ user-facing change? Yes. Overflowing during casting from Timestamp to Int/Short/Byte under non-ansi mode, returns null instead of wrapping value. ### How was this patch tested? Will add test or update test if any existing ones fail ### Was this patch authored or co-authored using generative AI tooling? No Closes #43694 from viirya/fix_cast_integers. Authored-by: Liang-Chi Hsieh <[email protected]> Signed-off-by: Jiaan Geng <[email protected]>
1 parent 1d8df4f commit f866549

File tree

3 files changed

+33
-25
lines changed

3 files changed

+33
-25
lines changed

docs/sql-migration-guide.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ license: |
2828
- Since Spark 4.0, any read of SQL tables takes into consideration the SQL configs `spark.sql.files.ignoreCorruptFiles`/`spark.sql.files.ignoreMissingFiles` instead of the core config `spark.files.ignoreCorruptFiles`/`spark.files.ignoreMissingFiles`.
2929
- Since Spark 4.0, `spark.sql.hive.metastore` drops the support of Hive prior to 2.0.0 as they require JDK 8 that Spark does not support anymore. Users should migrate to higher versions.
3030
- Since Spark 4.0, `spark.sql.parquet.compression.codec` drops the support of codec name `lz4raw`, please use `lz4_raw` instead.
31+
- Since Spark 4.0, when overflowing during casting timestamp to byte/short/int under non-ansi mode, Spark will return null instead of a wrapping value.
3132

3233
## Upgrading from Spark SQL 3.4 to 3.5
3334

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala

Lines changed: 29 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,7 @@ object Cast extends QueryErrorsBase {
344344
case (StringType, _) => true
345345
case (_, StringType) => false
346346

347+
case (TimestampType, ByteType | ShortType | IntegerType) => true
347348
case (FloatType | DoubleType, TimestampType) => true
348349
case (TimestampType, DateType) => false
349350
case (_, DateType) => true
@@ -777,6 +778,14 @@ case class Cast(
777778
buildCast[Int](_, i => yearMonthIntervalToInt(i, x.startField, x.endField).toLong)
778779
}
779780

781+
private def errorOrNull(t: Any, from: DataType, to: DataType) = {
782+
if (ansiEnabled) {
783+
throw QueryExecutionErrors.castingCauseOverflowError(t, from, to)
784+
} else {
785+
null
786+
}
787+
}
788+
780789
// IntConverter
781790
private[this] def castToInt(from: DataType): Any => Any = from match {
782791
case StringType if ansiEnabled =>
@@ -788,17 +797,15 @@ case class Cast(
788797
buildCast[Boolean](_, b => if (b) 1 else 0)
789798
case DateType =>
790799
buildCast[Int](_, d => null)
791-
case TimestampType if ansiEnabled =>
800+
case TimestampType =>
792801
buildCast[Long](_, t => {
793802
val longValue = timestampToLong(t)
794803
if (longValue == longValue.toInt) {
795804
longValue.toInt
796805
} else {
797-
throw QueryExecutionErrors.castingCauseOverflowError(t, from, IntegerType)
806+
errorOrNull(t, from, IntegerType)
798807
}
799808
})
800-
case TimestampType =>
801-
buildCast[Long](_, t => timestampToLong(t).toInt)
802809
case x: NumericType if ansiEnabled =>
803810
val exactNumeric = PhysicalNumericType.exactNumeric(x)
804811
b => exactNumeric.toInt(b)
@@ -826,17 +833,15 @@ case class Cast(
826833
buildCast[Boolean](_, b => if (b) 1.toShort else 0.toShort)
827834
case DateType =>
828835
buildCast[Int](_, d => null)
829-
case TimestampType if ansiEnabled =>
836+
case TimestampType =>
830837
buildCast[Long](_, t => {
831838
val longValue = timestampToLong(t)
832839
if (longValue == longValue.toShort) {
833840
longValue.toShort
834841
} else {
835-
throw QueryExecutionErrors.castingCauseOverflowError(t, from, ShortType)
842+
errorOrNull(t, from, ShortType)
836843
}
837844
})
838-
case TimestampType =>
839-
buildCast[Long](_, t => timestampToLong(t).toShort)
840845
case x: NumericType if ansiEnabled =>
841846
val exactNumeric = PhysicalNumericType.exactNumeric(x)
842847
b =>
@@ -875,17 +880,15 @@ case class Cast(
875880
buildCast[Boolean](_, b => if (b) 1.toByte else 0.toByte)
876881
case DateType =>
877882
buildCast[Int](_, d => null)
878-
case TimestampType if ansiEnabled =>
883+
case TimestampType =>
879884
buildCast[Long](_, t => {
880885
val longValue = timestampToLong(t)
881886
if (longValue == longValue.toByte) {
882887
longValue.toByte
883888
} else {
884-
throw QueryExecutionErrors.castingCauseOverflowError(t, from, ByteType)
889+
errorOrNull(t, from, ByteType)
885890
}
886891
})
887-
case TimestampType =>
888-
buildCast[Long](_, t => timestampToLong(t).toByte)
889892
case x: NumericType if ansiEnabled =>
890893
val exactNumeric = PhysicalNumericType.exactNumeric(x)
891894
b =>
@@ -1661,22 +1664,26 @@ case class Cast(
16611664
integralType: String,
16621665
from: DataType,
16631666
to: DataType): CastFunction = {
1664-
if (ansiEnabled) {
1665-
val longValue = ctx.freshName("longValue")
1666-
val fromDt = ctx.addReferenceObj("from", from, from.getClass.getName)
1667-
val toDt = ctx.addReferenceObj("to", to, to.getClass.getName)
1668-
(c, evPrim, _) =>
1669-
code"""
1667+
1668+
val longValue = ctx.freshName("longValue")
1669+
val fromDt = ctx.addReferenceObj("from", from, from.getClass.getName)
1670+
val toDt = ctx.addReferenceObj("to", to, to.getClass.getName)
1671+
1672+
(c, evPrim, evNull) =>
1673+
val overflow = if (ansiEnabled) {
1674+
code"""throw QueryExecutionErrors.castingCauseOverflowError($c, $fromDt, $toDt);"""
1675+
} else {
1676+
code"$evNull = true;"
1677+
}
1678+
1679+
code"""
16701680
long $longValue = ${timestampToLongCode(c)};
16711681
if ($longValue == ($integralType) $longValue) {
16721682
$evPrim = ($integralType) $longValue;
16731683
} else {
1674-
throw QueryExecutionErrors.castingCauseOverflowError($c, $fromDt, $toDt);
1684+
$overflow
16751685
}
16761686
"""
1677-
} else {
1678-
(c, evPrim, _) => code"$evPrim = ($integralType) ${timestampToLongCode(c)};"
1679-
}
16801687
}
16811688

16821689
private[this] def castDayTimeIntervalToIntegralTypeCode(

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastWithAnsiOffSuite.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -514,9 +514,9 @@ class CastWithAnsiOffSuite extends CastSuiteBase {
514514
val negativeTs = Timestamp.valueOf("1900-05-05 18:34:56.1")
515515
assert(negativeTs.getTime < 0)
516516
val expectedSecs = Math.floorDiv(negativeTs.getTime, MILLIS_PER_SECOND)
517-
checkEvaluation(cast(negativeTs, ByteType), expectedSecs.toByte)
518-
checkEvaluation(cast(negativeTs, ShortType), expectedSecs.toShort)
519-
checkEvaluation(cast(negativeTs, IntegerType), expectedSecs.toInt)
517+
checkEvaluation(cast(negativeTs, ByteType), null)
518+
checkEvaluation(cast(negativeTs, ShortType), null)
519+
checkEvaluation(cast(negativeTs, IntegerType), null)
520520
checkEvaluation(cast(negativeTs, LongType), expectedSecs)
521521
}
522522
}

0 commit comments

Comments (0)