
Commit 9d6fb58

asl3 authored and cloud-fan committed
[SPARK-51034][SQL] Reformat Describe As JSON statistics dict for parse-ability
### What changes were proposed in this pull request?
Reformat the `Describe As JSON` statistics table metadata into a dict rather than a string, for improved parse-ability.

### Why are the changes needed?
The existing `Describe` output formats table statistics into a string of the form `xxx bytes, xxx rows`, which is not conducive to parsing. This PR stores the table statistics metadata in raw form and delegates the formatting to the caller of `toJsonLinkedHashMap`, so the JSON statistics can be parsed directly.

### Does this PR introduce _any_ user-facing change?
Yes, it changes the output of `Describe As JSON`.

### How was this patch tested?
Added tests to `DescribeTableSuite`.

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #49728 from asl3/asl3/describetable-statistics.

Authored-by: Amanda Liu <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 2581ca1 commit 9d6fb58
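
Illustrative before/after sketch of the user-facing change described in the commit message above (the byte and row counts are made up; the field names follow the test changes further below). A `DESCRIBE ... AS JSON` statistics entry that was previously emitted as a preformatted string

    "statistics": "1024 bytes, 10 rows"

is now emitted as a nested dict that callers can parse directly:

    "statistics": {"size_in_bytes": 1024, "num_rows": 10}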

2 files changed: +68 −28 lines

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala (+60 −26)

@@ -28,7 +28,7 @@ import com.fasterxml.jackson.annotation.JsonInclude.Include
 import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
 import com.fasterxml.jackson.module.scala.{ClassTagExtensions, DefaultScalaModule}
 import org.apache.commons.lang3.StringUtils
-import org.json4s.JsonAST.{JArray, JBool, JDouble, JInt, JLong, JNull, JObject, JString, JValue}
+import org.json4s.JsonAST.{JArray, JBool, JDecimal, JDouble, JInt, JLong, JNull, JObject, JString, JValue}
 import org.json4s.jackson.JsonMethods._

 import org.apache.spark.SparkException
@@ -61,36 +61,60 @@ trait MetadataMapSupport {
     jsonToString(toJsonLinkedHashMap)
   }

+  /**
+   * Some fields from JsonLinkedHashMap are reformatted for human readability in `describe table`.
+   * If a field does not require special re-formatting, it is simply handled by `jsonToString`.
+   */
+  private def jsonToStringReformat(key: String, jValue: JValue): Option[(String, String)] = {
+    val reformattedValue: Option[String] = key match {
+      case "Statistics" =>
+        jValue match {
+          case JObject(fields) =>
+            Some(fields.flatMap {
+              case ("size_in_bytes", JDecimal(bytes)) => Some(s"$bytes bytes")
+              case ("num_rows", JDecimal(rows)) => Some(s"$rows rows")
+              case _ => None
+            }.mkString(", "))
+          case _ => Some(jValue.values.toString)
+        }
+      case "Created Time" | "Last Access" =>
+        jValue match {
+          case JLong(value) => Some(new Date(value).toString)
+          case _ => Some(jValue.values.toString)
+        }
+      case _ => None
+    }
+    reformattedValue.map(value => key -> value)
+  }
+
   protected def jsonToString(
       jsonMap: mutable.LinkedHashMap[String, JValue]): mutable.LinkedHashMap[String, String] = {
     val map = new mutable.LinkedHashMap[String, String]()
-    val timestampKeys = Set("Created Time", "Last Access")
     jsonMap.foreach { case (key, jValue) =>
-      val stringValue = jValue match {
-        case JString(value) => value
-        case JArray(values) =>
-          values.map(_.values)
-            .map {
-              case str: String => quoteIdentifier(str)
-              case other => other.toString
-            }
-            .mkString("[", ", ", "]")
-        case JObject(fields) =>
-          fields.map { case (k, v) =>
-            s"$k=${v.values.toString}"
-          }
-          .mkString("[", ", ", "]")
-        case JInt(value) => value.toString
-        case JDouble(value) => value.toString
-        case JLong(value) =>
-          if (timestampKeys.contains(key)) {
-            new Date(value).toString
-          } else {
-            value.toString
+      jsonToStringReformat(key, jValue) match {
+        case Some((formattedKey, formattedValue)) =>
+          map.put(formattedKey, formattedValue)
+        case None =>
+          val stringValue = jValue match {
+            case JString(value) => value
+            case JArray(values) =>
+              values.map(_.values)
+                .map {
+                  case str: String => quoteIdentifier(str)
+                  case other => other.toString
+                }
+                .mkString("[", ", ", "]")
+            case JObject(fields) =>
+              fields.map { case (k, v) =>
+                s"$k=${v.values.toString}"
+              }.mkString("[", ", ", "]")
+            case JInt(value) => value.toString
+            case JDouble(value) => value.toString
+            case JLong(value) => value.toString
+            case _ => jValue.values.toString
           }
-        case _ => jValue.values.toString
+          map.put(key, stringValue)
       }
-      map.put(key, stringValue)
     }
     map
   }
@@ -642,7 +666,9 @@ case class CatalogTable(
       map += "View Query Output Columns" -> viewQueryOutputColumns
     }
     if (tableProperties != JNull) map += "Table Properties" -> tableProperties
-    if (stats.isDefined) map += "Statistics" -> JString(stats.get.simpleString)
+    stats.foreach { s =>
+      map += "Statistics" -> JObject(s.jsonString.toList)
+    }
     map ++= storage.toJsonLinkedHashMap.map { case (k, v) => k -> v }
     if (tracksPartitionsInCatalog) map += "Partition Provider" -> JString("Catalog")
     if (partitionColumns != JNull) map += "Partition Columns" -> partitionColumns
@@ -811,6 +837,14 @@ case class CatalogStatistics(
     val rowCountString = if (rowCount.isDefined) s", ${rowCount.get} rows" else ""
     s"$sizeInBytes bytes$rowCountString"
   }
+
+  def jsonString: Map[String, JValue] = {
+    val rowCountInt: BigInt = rowCount.getOrElse(0L)
+    Map(
+      "size_in_bytes" -> JDecimal(BigDecimal(sizeInBytes)),
+      "num_rows" -> JDecimal(BigDecimal(rowCountInt))
+    )
+  }
 }

 /**
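
A minimal standalone sketch (plain json4s outside Spark; the object and value names are mine, the numbers illustrative) of how the two pieces above fit together: it builds the same two-key dict shape that `CatalogStatistics.jsonString` produces, then collapses it back to the old human-readable string the way the "Statistics" branch of `jsonToStringReformat` does for non-JSON `DESCRIBE` output.

import org.json4s.JsonAST.{JDecimal, JObject}

object StatisticsReformatSketch {
  def main(args: Array[String]): Unit = {
    // Same shape as CatalogStatistics.jsonString (illustrative values).
    val statistics = JObject(List(
      "size_in_bytes" -> JDecimal(BigDecimal(1024)),
      "num_rows" -> JDecimal(BigDecimal(10))
    ))
    // Collapse the dict into the legacy "<n> bytes, <n> rows" form,
    // mirroring the "Statistics" branch of jsonToStringReformat.
    val humanReadable = statistics.obj.flatMap {
      case ("size_in_bytes", JDecimal(bytes)) => Some(s"$bytes bytes")
      case ("num_rows", JDecimal(rows)) => Some(s"$rows rows")
      case _ => None
    }.mkString(", ")
    println(humanReadable) // prints: 1024 bytes, 10 rows
  }
}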

sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala (+8 −2)

@@ -298,7 +298,7 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
     }
   }

-  test("DESCRIBE AS JSON partition spec") {
+  test("DESCRIBE AS JSON partition spec and statistics") {
     withNamespaceAndTable("ns", "table") { t =>
       val tableCreationStr =
         s"""
@@ -314,6 +314,7 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
           |""".stripMargin
       spark.sql(tableCreationStr)
       spark.sql(s"ALTER TABLE $t ADD PARTITION (region='USA', category='tech')")
+      spark.sql(s"ANALYZE TABLE $t COMPUTE STATISTICS FOR ALL COLUMNS")

       val descriptionDf =
         spark.sql(s"DESCRIBE FORMATTED $t PARTITION (region='USA', category='tech') AS JSON")
@@ -349,7 +350,11 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
        },
        partition_provider = Some("Catalog"),
        partition_columns = Some(List("region", "category")),
-        partition_values = Some(Map("region" -> "USA", "category" -> "tech"))
+        partition_values = Some(Map("region" -> "USA", "category" -> "tech")),
+        statistics = Some(Map(
+          "size_in_bytes" -> 0,
+          "num_rows" -> 0
+        ))
       )

       assert(parsedOutput.location.isDefined)
@@ -751,6 +756,7 @@ case class DescribeTableJson(
  partition_provider: Option[String] = None,
  partition_columns: Option[List[String]] = Some(Nil),
  partition_values: Option[Map[String, String]] = None,
+  statistics: Option[Map[String, Any]] = None,
  view_text: Option[String] = None,
  view_original_text: Option[String] = None,
  view_schema_mode: Option[String] = None,
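
A hedged usage sketch (e.g. in spark-shell) of consuming the new dict from the caller side. It assumes a running SparkSession `spark`, an existing table `ns.table` that has been ANALYZEd, and that the single-row result of `DESCRIBE ... AS JSON` carries the JSON text in its first column; the variable names are illustrative and not part of this commit, and the field names follow the `DescribeTableJson` expectations in the test above.

import org.json4s._
import org.json4s.jackson.JsonMethods._

implicit val formats: Formats = DefaultFormats
// Fetch the JSON description and parse it with json4s.
val jsonText = spark.sql("DESCRIBE FORMATTED ns.table AS JSON").head().getString(0)
val parsed = parse(jsonText)
// With the dict format, the numbers are available directly instead of
// having to split a "<n> bytes, <n> rows" string.
val sizeInBytes = (parsed \ "statistics" \ "size_in_bytes").extract[BigInt]
val numRows = (parsed \ "statistics" \ "num_rows").extract[BigInt]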
