-
Notifications
You must be signed in to change notification settings - Fork 28.7k
[SPARK-52582][SQL] Improve the memory usage of XML parser #51287
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
xiaonanyang-db
wants to merge
14
commits into
apache:master
Choose a base branch
from
xiaonanyang-db:SPARK-52582
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
05e975c
draft
xiaonanyang-db 854d3de
add test
xiaonanyang-db 6d2a960
fix
xiaonanyang-db 57c9685
address comments
xiaonanyang-db f8f8216
u
xiaonanyang-db f787809
u
xiaonanyang-db 6baa722
u
xiaonanyang-db 02b09f6
u
xiaonanyang-db 57d3105
u
xiaonanyang-db 2cb368f
u
xiaonanyang-db 5fe3877
u
xiaonanyang-db 26a8565
u
xiaonanyang-db 3e959a8
u
xiaonanyang-db b52e78c
use optimized parser in schema inference
xiaonanyang-db File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,7 +41,7 @@ import org.apache.spark.{SparkIllegalArgumentException, SparkUpgradeException} | |
import org.apache.spark.internal.Logging | ||
import org.apache.spark.sql.catalyst.InternalRow | ||
import org.apache.spark.sql.catalyst.expressions.{ExprUtils, GenericInternalRow} | ||
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, BadRecordException, DateFormatter, DropMalformedMode, FailureSafeParser, GenericArrayData, MapData, ParseMode, PartialResultArrayException, PartialResultException, PermissiveMode, TimestampFormatter} | ||
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, BadRecordException, DateFormatter, FailureSafeParser, GenericArrayData, MapData, ParseMode, PartialResultArrayException, PartialResultException, TimestampFormatter} | ||
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT | ||
import org.apache.spark.sql.catalyst.xml.StaxXmlParser.convertStream | ||
import org.apache.spark.sql.errors.QueryExecutionErrors | ||
|
@@ -118,23 +118,11 @@ class StaxXmlParser( | |
}.flatten | ||
} | ||
|
||
def parseColumn(xml: String, schema: StructType): InternalRow = { | ||
// The user-specified schema from from_xml, etc. will typically not include a | ||
// "corrupted record" column. In PERMISSIVE mode, which puts bad records in | ||
// such a column, this would cause an error. In this mode, if such a column | ||
// is not manually specified, then fall back to DROPMALFORMED, which will return | ||
// null column values where parsing fails. | ||
val parseMode = | ||
if (options.parseMode == PermissiveMode && | ||
!schema.fields.exists(_.name == options.columnNameOfCorruptRecord)) { | ||
DropMalformedMode | ||
} else { | ||
options.parseMode | ||
} | ||
val xsdSchema = Option(options.rowValidationXSDPath).map(ValidatorUtil.getSchema) | ||
doParseColumn(xml, parseMode, xsdSchema).orNull | ||
} | ||
|
||
/** | ||
* Parse the given XML string record as an InternalRow | ||
* @param xml The single XML record string to parse | ||
* @param xsdSchema The xsd schema to validate the XML against, if provided. | ||
*/ | ||
def doParseColumn(xml: String, | ||
parseMode: ParseMode, | ||
xsdSchema: Option[Schema]): Option[InternalRow] = { | ||
|
@@ -186,6 +174,94 @@ class StaxXmlParser( | |
} | ||
} | ||
|
||
/** | ||
* The optimized version of the XML stream parser that reads XML records from the input file | ||
* stream sequentially without loading each individual XML record string into memory. | ||
*/ | ||
def parseStreamOptimized( | ||
inputStream: InputStream, | ||
schema: StructType, | ||
streamLiteral: () => UTF8String): Iterator[InternalRow] = { | ||
// XSD validation would require converting to string first, which defeats the purpose | ||
// For now, skip XSD validation in the optimized parsing mode to maintain memory efficiency | ||
if (Option(options.rowValidationXSDPath).isDefined) { | ||
logWarning("XSD validation is not supported in streaming mode and will be skipped") | ||
} | ||
val safeParser = new FailureSafeParser[XMLEventReader]( | ||
input => { | ||
// The first event is guaranteed to be a StartElement, so we can read attributes from it | ||
// without using StaxXmlParserUtils.skipUntil. | ||
val attributes = input.nextEvent().asStartElement().getAttributes.asScala.toArray | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Move this inside |
||
doParseColumn(input, attributes, streamLiteral) | ||
}, | ||
options.parseMode, | ||
schema, | ||
options.columnNameOfCorruptRecord | ||
) | ||
|
||
val xmlTokenizer = new OptimizedXmlTokenizer(inputStream, options) | ||
StaxXmlParser.convertStream(xmlTokenizer) { tokens => | ||
safeParser.parse(tokens) | ||
}.flatten | ||
} | ||
|
||
/** | ||
* Parse the next XML record from the event stream. | ||
* @param parser The XML event reader over the entire XML file stream. The first event has been | ||
* advanced to the next record in the file. | ||
* @param rootAttributes The attributes of the record root element. | ||
* @param xmlLiteral A function that returns the entire XML file content as a UTF8String. Used | ||
* to create a BadRecordException in case of parsing errors. | ||
* TODO: Only include the file content starting with the current record. | ||
*/ | ||
def doParseColumn( | ||
parser: XMLEventReader, | ||
rootAttributes: Array[Attribute], | ||
xmlLiteral: () => UTF8String): Option[InternalRow] = { | ||
try { | ||
options.singleVariantColumn match { | ||
case Some(_) => | ||
// If the singleVariantColumn is specified, parse the entire xml record as a Variant | ||
val v = StaxXmlParser.parseVariant(parser, rootAttributes, options) | ||
Some(InternalRow(v)) | ||
case _ => | ||
// Otherwise, parse the xml record as Structs | ||
val result = Some(convertObject(parser, schema, rootAttributes)) | ||
result | ||
} | ||
} catch { | ||
case e: SparkUpgradeException => throw e | ||
case e@(_: RuntimeException | _: XMLStreamException | _: MalformedInputException | ||
| _: SAXException) => | ||
// Skip rest of the content in the parser and put the whole XML file in the | ||
// BadRecordException. | ||
parser.close() | ||
// XML parser currently doesn't support partial results for corrupted records. | ||
// For such records, all fields other than the field configured by | ||
// `columnNameOfCorruptRecord` are set to `null`. | ||
throw BadRecordException(xmlLiteral, () => Array.empty, e) | ||
case e: CharConversionException if options.charset.isEmpty => | ||
val msg = | ||
"""XML parser cannot handle a character in its input. | ||
|Specifying encoding as an input option explicitly might help to resolve the issue. | ||
|""".stripMargin + e.getMessage | ||
val wrappedCharException = new CharConversionException(msg) | ||
wrappedCharException.initCause(e) | ||
throw BadRecordException(xmlLiteral, () => Array.empty, | ||
wrappedCharException) | ||
case PartialResultException(row, cause) => | ||
throw BadRecordException( | ||
record = xmlLiteral, | ||
partialResults = () => Array(row), | ||
cause) | ||
case PartialResultArrayException(rows, cause) => | ||
throw BadRecordException( | ||
record = xmlLiteral, | ||
partialResults = () => rows, | ||
cause) | ||
} | ||
} | ||
|
||
/** | ||
* Parse the current token (and related children) according to a desired schema | ||
*/ | ||
|
@@ -637,6 +713,14 @@ class StaxXmlParser( | |
} | ||
} | ||
|
||
trait XmlTokenizerBase[T] extends Logging { | ||
/** | ||
* Finds the next XML record in the stream. | ||
* @return an Option containing the next XML record of type T, or None if no more records | ||
*/ | ||
def next(): Option[T] | ||
} | ||
|
||
/** | ||
* XMLRecordReader class to read through a given xml document to output xml blocks as records | ||
* as specified by the start tag and end tag. | ||
|
@@ -645,7 +729,7 @@ class StaxXmlParser( | |
*/ | ||
class XmlTokenizer( | ||
inputStream: InputStream, | ||
options: XmlOptions) extends Logging { | ||
options: XmlOptions) extends XmlTokenizerBase[String] { | ||
private var reader = new BufferedReader( | ||
new InputStreamReader(inputStream, Charset.forName(options.charset))) | ||
private var currentStartTag: String = _ | ||
|
@@ -665,7 +749,7 @@ class XmlTokenizer( | |
* @return an Option containing the next XML record as a String, or None if no more records | ||
*/ | ||
def next(): Option[String] = { | ||
override def next(): Option[String] = { | ||
var nextString: Option[String] = None | ||
try { | ||
if (readUntilStartElement()) { | ||
|
@@ -898,6 +982,84 @@ class XmlTokenizer( | |
} | ||
} | ||
|
||
/** | ||
* Optimized XML tokenizer that avoids loading entire XML records into memory. | ||
* - Uses XMLEventReader to parse XML stream directly | ||
* - Never buffers complete XML records in memory | ||
* - Allows the parser to work directly with XML events | ||
*/ | ||
class OptimizedXmlTokenizer(inputStream: InputStream, options: XmlOptions) | ||
extends XmlTokenizerBase[XMLEventReader] { | ||
private var reader = StaxXmlParserUtils.filteredReader(inputStream, options) | ||
|
||
/** | ||
* Returns the next XML record as a positioned XMLEventReader. | ||
* This avoids creating intermediate string representations. | ||
*/ | ||
override def next(): Option[XMLEventReader] = { | ||
var nextRecord: Option[XMLEventReader] = None | ||
try { | ||
// Skip to the next row start element | ||
if (skipToNextRowStart()) { | ||
nextRecord = Some(reader) | ||
} | ||
} catch { | ||
case e: FileNotFoundException if options.ignoreMissingFiles => | ||
logWarning("Skipping the rest of the content in the missing file", e) | ||
case NonFatal(e) => | ||
ExceptionUtils.getRootCause(e) match { | ||
case _: AccessControlException | _: BlockMissingException => | ||
close() | ||
throw e | ||
case _: RuntimeException | _: IOException if options.ignoreCorruptFiles => | ||
logWarning("Skipping the rest of the content in the corrupted file", e) | ||
case _: XMLStreamException => | ||
logWarning("Skipping the rest of the content in the corrupted file", e) | ||
case e: Throwable => | ||
close() | ||
throw e | ||
} | ||
} finally { | ||
if (nextRecord.isEmpty && reader != null) { | ||
close() | ||
} | ||
} | ||
nextRecord | ||
} | ||
|
||
def close(): Unit = { | ||
if (reader != null) { | ||
reader.close() | ||
inputStream.close() | ||
reader = null | ||
} | ||
} | ||
|
||
/** | ||
* Skip through the XML stream until we find the next row start element. | ||
*/ | ||
private def skipToNextRowStart(): Boolean = { | ||
val rowTagName = options.rowTag | ||
while (reader.hasNext) { | ||
val event = reader.peek() | ||
event match { | ||
case startElement: StartElement => | ||
val elementName = StaxXmlParserUtils.getName(startElement.getName, options) | ||
if (elementName == rowTagName) { | ||
return true | ||
} | ||
case _: EndDocument => | ||
return false | ||
case _ => | ||
// Continue searching | ||
} | ||
// if not the event we want, advance the reader | ||
reader.nextEvent() | ||
} | ||
false | ||
} | ||
} | ||
|
||
object StaxXmlParser { | ||
/** | ||
* Parses a stream that contains XML strings and turns it into an iterator of tokens. | ||
|
@@ -907,15 +1069,22 @@ object StaxXmlParser { | |
convertStream(xmlTokenizer)(tokens => tokens) | ||
} | ||
|
||
private def convertStream[T]( | ||
xmlTokenizer: XmlTokenizer)( | ||
convert: String => T) = new Iterator[T] { | ||
def tokenizeStreamOptimized( | ||
inputStream: InputStream, | ||
options: XmlOptions): Iterator[XMLEventReader] = { | ||
val xmlTokenizer = new OptimizedXmlTokenizer(inputStream, options) | ||
convertStream(xmlTokenizer)(tokens => tokens) | ||
} | ||
|
||
private def convertStream[TokenType, ResultType]( | ||
xmlTokenizer: XmlTokenizerBase[TokenType])( | ||
convert: TokenType => ResultType) = new Iterator[ResultType] { | ||
|
||
private var nextRecord = xmlTokenizer.next() | ||
|
||
override def hasNext: Boolean = nextRecord.nonEmpty | ||
|
||
override def next(): T = { | ||
override def next(): ResultType = { | ||
if (!hasNext) { | ||
throw QueryExecutionErrors.endOfStreamError() | ||
} | ||
|
@@ -931,9 +1100,17 @@ object StaxXmlParser { | |
def parseVariant(xml: String, options: XmlOptions): VariantVal = { | ||
val parser = StaxXmlParserUtils.filteredReader(xml) | ||
val rootAttributes = StaxXmlParserUtils.gatherRootAttributes(parser) | ||
val v = convertVariant(parser, rootAttributes, options) | ||
val v = parseVariant(parser, rootAttributes, options) | ||
parser.close() | ||
v | ||
new VariantVal(v.getValue, v.getMetadata) | ||
} | ||
|
||
def parseVariant( | ||
parser: XMLEventReader, | ||
rootAttributes: Array[Attribute], | ||
options: XmlOptions): VariantVal = { | ||
val v = convertVariant(parser, rootAttributes, options) | ||
new VariantVal(v.getValue, v.getMetadata) | ||
} | ||
|
||
/** | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -16,7 +16,8 @@ | |||||||
*/ | ||||||||
package org.apache.spark.sql.catalyst.xml | ||||||||
|
||||||||
import java.io.StringReader | ||||||||
import java.io.{InputStreamReader, StringReader} | ||||||||
import java.nio.charset.Charset | ||||||||
import javax.xml.namespace.QName | ||||||||
import javax.xml.stream.{EventFilter, XMLEventReader, XMLInputFactory, XMLStreamConstants} | ||||||||
import javax.xml.stream.events._ | ||||||||
|
@@ -35,26 +36,33 @@ object StaxXmlParserUtils { | |||||||
factory | ||||||||
} | ||||||||
|
||||||||
val filter = new EventFilter { | ||||||||
override def accept(event: XMLEvent): Boolean = | ||||||||
event.getEventType match { | ||||||||
// Ignore comments and processing instructions | ||||||||
case XMLStreamConstants.COMMENT | XMLStreamConstants.PROCESSING_INSTRUCTION => false | ||||||||
// unsupported events | ||||||||
case XMLStreamConstants.DTD | | ||||||||
XMLStreamConstants.ENTITY_DECLARATION | | ||||||||
XMLStreamConstants.ENTITY_REFERENCE | | ||||||||
XMLStreamConstants.NOTATION_DECLARATION => false | ||||||||
case _ => true | ||||||||
} | ||||||||
} | ||||||||
|
||||||||
def filteredReader(xml: String): XMLEventReader = { | ||||||||
val filter = new EventFilter { | ||||||||
override def accept(event: XMLEvent): Boolean = | ||||||||
event.getEventType match { | ||||||||
// Ignore comments and processing instructions | ||||||||
case XMLStreamConstants.COMMENT | XMLStreamConstants.PROCESSING_INSTRUCTION => false | ||||||||
// unsupported events | ||||||||
case XMLStreamConstants.DTD | | ||||||||
XMLStreamConstants.ENTITY_DECLARATION | | ||||||||
XMLStreamConstants.ENTITY_REFERENCE | | ||||||||
XMLStreamConstants.NOTATION_DECLARATION => false | ||||||||
case _ => true | ||||||||
} | ||||||||
} | ||||||||
// It does not have to skip for white space, since `XmlInputFormat` | ||||||||
// always finds the root tag without a heading space. | ||||||||
val eventReader = factory.createXMLEventReader(new StringReader(xml)) | ||||||||
factory.createFilteredReader(eventReader, filter) | ||||||||
} | ||||||||
|
||||||||
def filteredReader(inputStream: java.io.InputStream, options: XmlOptions): XMLEventReader = { | ||||||||
val inputStreamReader = new InputStreamReader(inputStream, Charset.forName(options.charset)) | ||||||||
val eventReader = factory.createXMLEventReader(inputStreamReader) | ||||||||
Comment on lines
+61
to
+62
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any issue with this?
Suggested change
|
||||||||
factory.createFilteredReader(eventReader, filter) | ||||||||
} | ||||||||
|
||||||||
def gatherRootAttributes(parser: XMLEventReader): Array[Attribute] = { | ||||||||
val rootEvent = | ||||||||
StaxXmlParserUtils.skipUntil(parser, XMLStreamConstants.START_ELEMENT) | ||||||||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The Validator::validate function can take as input a StreamSource or a StAXSource. Either one can be extended to pass an individual rowTag element to the validate function.