-
Notifications
You must be signed in to change notification settings - Fork 441
Use the same library when Reading and Writing CSV #30
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 4 commits
2a24bee
411a0ae
54782ac
508dbbe
2909010
b266557
6987735
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ class CsvParser { | |
| private var useHeader: Boolean = true | ||
| private var delimiter: Character = ',' | ||
| private var quote: Character = '"' | ||
| private var escape: Character = null | ||
| private var schema: StructType = null | ||
|
|
||
| def withUseHeader(flag: Boolean): CsvParser = { | ||
|
|
@@ -43,14 +44,19 @@ class CsvParser { | |
| this | ||
| } | ||
|
|
||
| def withEscapeChar(escape: Character): CsvParser = { | ||
| this.escape = escape | ||
| this | ||
| } | ||
|
|
||
| def withSchema(schema: StructType): CsvParser = { | ||
| this.schema = schema | ||
| this | ||
| } | ||
|
|
||
| /** Returns a Schema RDD for the given CSV path. */ | ||
| def csvFile(sqlContext: SQLContext, path: String): DataFrame = { | ||
| val relation: CsvRelation = CsvRelation(path, useHeader, delimiter, quote, schema)(sqlContext) | ||
| val relation: CsvRelation = CsvRelation(path, useHeader, delimiter, quote, escape, schema)(sqlContext) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is more than 100 characters. |
||
| sqlContext.baseRelationToDataFrame(relation) | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,7 +33,8 @@ case class CsvRelation protected[spark] ( | |
| location: String, | ||
| useHeader: Boolean, | ||
| delimiter: Char, | ||
| quote: Char, | ||
| quote: Character, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use Char |
||
| escape: Character, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use Char |
||
| userSchema: StructType = null)(@transient val sqlContext: SQLContext) | ||
| extends BaseRelation with TableScan with InsertableRelation { | ||
|
|
||
|
|
@@ -53,6 +54,7 @@ case class CsvRelation protected[spark] ( | |
| val csvFormat = CSVFormat.DEFAULT | ||
| .withDelimiter(delimiter) | ||
| .withQuote(quote) | ||
| .withEscape(escape) | ||
| .withSkipHeaderRecord(false) | ||
| .withHeader(fieldNames: _*) | ||
|
|
||
|
|
@@ -78,6 +80,7 @@ case class CsvRelation protected[spark] ( | |
| val csvFormat = CSVFormat.DEFAULT | ||
| .withDelimiter(delimiter) | ||
| .withQuote(quote) | ||
| .withEscape(escape) | ||
| .withSkipHeaderRecord(false) | ||
| val firstRow = CSVParser.parse(firstLine, csvFormat).getRecords.head.toList | ||
| val header = if (useHeader) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,13 @@ package com.databricks.spark | |
|
|
||
| import org.apache.spark.sql.{SQLContext, DataFrame} | ||
|
|
||
| import org.apache.commons.csv.CSVFormat; | ||
| import org.apache.commons.csv.CSVPrinter; | ||
|
|
||
| import java.io.StringWriter; | ||
|
|
||
| import scala.collection.convert.WrapAsJava | ||
|
|
||
| package object csv { | ||
|
|
||
| /** | ||
|
|
@@ -28,7 +35,8 @@ package object csv { | |
| location = filePath, | ||
| useHeader = true, | ||
| delimiter = ',', | ||
| quote = '"')(sqlContext) | ||
| quote = '"', | ||
| escape = '\\')(sqlContext) | ||
| sqlContext.baseRelationToDataFrame(csvRelation) | ||
| } | ||
|
|
||
|
|
@@ -37,36 +45,38 @@ package object csv { | |
| location = filePath, | ||
| useHeader = true, | ||
| delimiter = '\t', | ||
| quote = '"')(sqlContext) | ||
| quote = '"', | ||
| escape = '\\')(sqlContext) | ||
| sqlContext.baseRelationToDataFrame(csvRelation) | ||
| } | ||
| } | ||
|
|
||
| implicit class CsvSchemaRDD(dataFrame: DataFrame) { | ||
| def saveAsCsvFile(path: String, parameters: Map[String, String] = Map()): Unit = { | ||
| // TODO(hossein): For nested types, we may want to perform special work | ||
| val delimiter = parameters.getOrElse("delimiter", ",") | ||
| val delimiter = parameters.getOrElse("delimiter", ",").charAt(0) | ||
| val quote = parameters.getOrElse("quote", "\"").charAt(0) | ||
| val escape = parameters.getOrElse("escape", "\\").charAt(0) | ||
| val generateHeader = parameters.getOrElse("header", "false").toBoolean | ||
| val header = if (generateHeader) { | ||
| dataFrame.columns.map(c => s""""$c"""").mkString(delimiter) | ||
| } else { | ||
| "" // There is no need to generate header in this case | ||
| } | ||
| val strRDD = dataFrame.rdd.mapPartitions { iter => | ||
| new Iterator[String] { | ||
| var firstRow: Boolean = generateHeader | ||
| val header = dataFrame.columns | ||
|
|
||
| override def hasNext = iter.hasNext | ||
| var firstRow: Boolean = generateHeader | ||
| val csvFileFormat = CSVFormat.DEFAULT | ||
| .withDelimiter(delimiter) | ||
| .withQuote(quote) | ||
| .withEscape(escape) | ||
|
|
||
| override def next: String = { | ||
| if (firstRow) { | ||
| firstRow = false | ||
| header + "\n" + iter.next.mkString(delimiter) | ||
| } else { | ||
| iter.next.mkString(delimiter) | ||
| } | ||
| } | ||
| val strRDD = dataFrame.rdd.mapPartitions { iter => | ||
| var firstRow: Boolean = generateHeader | ||
| val newIter = iter.map(_.toSeq.toArray) | ||
| val stringWriter = new StringWriter() | ||
| val csvPrinter = new CSVPrinter(stringWriter, csvFileFormat) | ||
| if (firstRow) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This condition is always true for each partition. Note that previously we were making the check inside the Iterator.next(). |
||
| firstRow = false | ||
| csvPrinter.printRecord(header:_*) | ||
| } | ||
| csvPrinter.printRecords(WrapAsJava.asJavaIterable(newIter.toIterable)) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This can be inefficient because it seems to me we are traversing the iterator multiple times for each partition. I know CSVPrinter API is not very flexible here, but can we avoid multiple traversals? |
||
| Iterator(stringWriter.toString) | ||
| } | ||
| strRDD.saveAsTextFile(path) | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| year,make,model,comment | ||
| 2012,VW,Touran,"The ideal car for \"families\" and all their \"bags\", \"boxes\" and \"barbecues\"" | ||
| 2013,Seat,Alhambra,"It is a great \"family\" car, for big families" | ||
| 2014,Peugeot,5008,"It is a fine \"family\" car" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This file is removed in master. No need for these changes.