
Commit a52f3b7

[SPARK-50854][SS] Make path fully qualified before passing it to FileStreamSink
1 parent ad8222a commit a52f3b7

File tree: 5 files changed (+51, -17 lines)


sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala

Lines changed: 4 additions & 0 deletions
@@ -2950,4 +2950,8 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE
       messageParameters = Map("option" -> option)
     )
   }
+
+  def notAbsolutePathError(path: Path): SparkException = {
+    SparkException.internalError(s"$path is not absolute path.")
+  }
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala

Lines changed: 8 additions & 5 deletions
@@ -120,6 +120,11 @@ case class DataSource(
   private def newHadoopConfiguration(): Configuration =
     sparkSession.sessionState.newHadoopConfWithOptions(options)
 
+  private def makeQualified(path: Path): Path = {
+    val fs = path.getFileSystem(newHadoopConfiguration())
+    path.makeQualified(fs.getUri, fs.getWorkingDirectory)
+  }
+
   lazy val sourceInfo: SourceInfo = sourceSchema()
   private val caseInsensitiveOptions = CaseInsensitiveMap(options)
   private val equality = sparkSession.sessionState.conf.resolver
@@ -319,9 +324,9 @@ case class DataSource(
         s.createSink(sparkSession.sqlContext, caseInsensitiveOptions, partitionColumns, outputMode)
 
       case fileFormat: FileFormat =>
-        val path = caseInsensitiveOptions.getOrElse("path", {
+        val path = makeQualified(new Path(caseInsensitiveOptions.getOrElse("path", {
           throw QueryExecutionErrors.dataPathNotSpecifiedError()
-        })
+        }))).toString
         if (outputMode != OutputMode.Append) {
           throw QueryCompilationErrors.dataSourceOutputModeUnsupportedError(className, outputMode)
         }
@@ -456,9 +461,7 @@ case class DataSource(
       // 3. It's OK that the output path doesn't exist yet;
       val allPaths = paths ++ caseInsensitiveOptions.get("path")
       val outputPath = if (allPaths.length == 1) {
-        val path = new Path(allPaths.head)
-        val fs = path.getFileSystem(newHadoopConfiguration())
-        path.makeQualified(fs.getUri, fs.getWorkingDirectory)
+        makeQualified(new Path(allPaths.head))
      } else {
        throw QueryExecutionErrors.multiplePathsSpecifiedError(allPaths)
      }
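
For context on the makeQualified helper added above: Hadoop's Path.makeQualified resolves a relative path against the file system's URI and working directory, which also attaches a scheme (this is why the expected sink name in the async-progress-tracking test below changes from FileSink[/tmp] to FileSink[file:/tmp]). A minimal standalone sketch of that behavior, assuming only a default Hadoop Configuration on the local file system; the object and variable names are illustrative and not part of this commit:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object QualifyPathSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    // A relative path, as a user might pass it to writeStream.start(...)
    val relative = new Path("test.parquet")
    val fs = relative.getFileSystem(conf) // resolves to the local file system here
    val qualified = relative.makeQualified(fs.getUri, fs.getWorkingDirectory)
    println(relative.isAbsolute)  // false
    println(qualified)            // e.g. file:/<working-dir>/test.parquet
    println(qualified.isAbsolute) // true
  }
}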

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala

Lines changed: 3 additions & 0 deletions
@@ -134,6 +134,9 @@ class FileStreamSink(
 
   private val hadoopConf = sparkSession.sessionState.newHadoopConf()
   private val basePath = new Path(path)
+  if (!basePath.isAbsolute) {
+    throw QueryExecutionErrors.notAbsolutePathError(basePath)
+  }
   private val logPath = getMetadataLogPath(basePath.getFileSystem(hadoopConf), basePath,
     sparkSession.sessionState.conf)
   private val retention = options.get("retention").map(Utils.timeStringAsMs)
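
Because the DataSource change above qualifies the path before constructing the sink, the new check in FileStreamSink acts as a defensive invariant: it throws SparkException.internalError rather than a user-facing error. A hedged sketch of the user-visible flow that the qualification covers, assuming a streaming DataFrame named df obtained from an active SparkSession (names and paths are illustrative):

// Sketch only: with this commit, a relative output path passed to start()
// is made fully qualified in DataSource.createSink, so FileStreamSink's
// basePath is always absolute and scheme-qualified.
val query = df.writeStream
  .format("parquet")
  .option("checkpointLocation", "/tmp/checkpoint")
  .start("relative/output/dir") // resolved to e.g. file:/<working-dir>/relative/output/dir

query.processAllAvailable()
query.stop()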

sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/AsyncProgressTrackingMicroBatchExecutionSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -1101,7 +1101,7 @@ class AsyncProgressTrackingMicroBatchExecutionSuite
         .start("/tmp")
     }
 
-    e.getMessage should equal("Sink FileSink[/tmp] does not support async progress tracking")
+    e.getMessage should equal("Sink FileSink[file:/tmp] does not support async progress tracking")
   }
 
   test("with log purging") {

sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala

Lines changed: 35 additions & 11 deletions
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.streaming
 
 import java.io.{File, IOException}
-import java.nio.file.Files
+import java.nio.file.{Files, Paths}
 import java.util.Locale
 
 import scala.collection.mutable.ArrayBuffer
@@ -27,7 +27,7 @@ import scala.jdk.CollectionConverters._
 import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem}
 import org.apache.hadoop.mapreduce.JobContext
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkException}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.io.FileCommitProtocol
 import org.apache.spark.paths.SparkPath
@@ -36,6 +36,7 @@ import org.apache.spark.sql.{AnalysisException, DataFrame}
 import org.apache.spark.sql.catalyst.util.stringToFile
 import org.apache.spark.sql.execution.DataSourceScanExec
 import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, DataSourceV2Relation, FileScan, FileTable}
 import org.apache.spark.sql.execution.streaming._
 import org.apache.spark.sql.functions._
@@ -292,35 +293,48 @@ abstract class FileStreamSinkSuite extends StreamTest {
   test("parquet") {
     testFormat(None) // should not throw error as default format parquet when not specified
     testFormat(Some("parquet"))
+    testFormat(None, relativizeOutputPath = true)
+    testFormat(Some("parquet"), relativizeOutputPath = true)
   }
 
   test("orc") {
     testFormat(Some("orc"))
+    testFormat(Some("orc"), relativizeOutputPath = true)
   }
 
   test("text") {
     testFormat(Some("text"))
+    testFormat(Some("text"), relativizeOutputPath = true)
   }
 
   test("json") {
     testFormat(Some("json"))
+    testFormat(Some("json"), relativizeOutputPath = true)
   }
 
-  def testFormat(format: Option[String]): Unit = {
-    val inputData = MemoryStream[Int]
+  def testFormat(format: Option[String], relativizeOutputPath: Boolean = false): Unit = {
+    val inputData = MemoryStream[String] // text format only supports String
     val ds = inputData.toDS()
 
-    val outputDir = Utils.createTempDir(namePrefix = "stream.output").getCanonicalPath
+    val tempDir = Utils.createTempDir(namePrefix = "stream.output")
+    val outputPath = if (relativizeOutputPath) {
+      Paths.get("").toAbsolutePath.relativize(tempDir.toPath).toString
+    } else {
+      tempDir.getCanonicalPath
+    }
     val checkpointDir = Utils.createTempDir(namePrefix = "stream.checkpoint").getCanonicalPath
 
-    var query: StreamingQuery = null
+    val writer = ds.toDF("value").writeStream
+      .option("checkpointLocation", checkpointDir)
+    if (format.nonEmpty) {
+      writer.format(format.get)
+    }
 
+    var query: StreamingQuery = null
     try {
-      val writer = ds.map(i => (i, i * 1000)).toDF("id", "value").writeStream
-      if (format.nonEmpty) {
-        writer.format(format.get)
-      }
-      query = writer.option("checkpointLocation", checkpointDir).start(outputDir)
+      query = writer.start(outputPath)
+      inputData.addData("data")
+      query.processAllAvailable()
     } finally {
       if (query != null) {
        query.stop()
@@ -664,6 +678,16 @@ abstract class FileStreamSinkSuite extends StreamTest {
           s" $path."))
     }
   }
+
+  test("SPARK-50854: Make path fully qualified before passing it to FileStreamSink") {
+    val fileFormat = new ParquetFileFormat() // any valid FileFormat
+    val partitionColumnNames = Seq.empty[String]
+    val options = Map.empty[String, String]
+    val exception = intercept[SparkException] {
+      new FileStreamSink(spark, "test.parquet", fileFormat, partitionColumnNames, options)
+    }
+    assert(exception.getMessage.contains("is not absolute path."))
+  }
 }
 
 object PendingCommitFilesTrackingManifestFileCommitProtocol {
