Github user petermaxlee commented on a diff in the pull request: https://github.com/apache/spark/pull/14728#discussion_r76095002 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala --- @@ -41,36 +40,59 @@ class FileStreamSource( metadataPath: String, options: Map[String, String]) extends Source with Logging { + import FileStreamSource._ + + private val sourceOptions = new FileStreamOptions(options) + private val fs = new Path(path).getFileSystem(sparkSession.sessionState.newHadoopConf()) private val qualifiedBasePath = fs.makeQualified(new Path(path)) // can contains glob patterns - private val metadataLog = new HDFSMetadataLog[Seq[String]](sparkSession, metadataPath) + + private val metadataLog = new HDFSMetadataLog[Seq[FileEntry]](sparkSession, metadataPath) + private var maxBatchId = metadataLog.getLatest().map(_._1).getOrElse(-1L) /** Maximum number of new files to be considered in each batch */ - private val maxFilesPerBatch = getMaxFilesPerBatch() + private val maxFilesPerBatch = sourceOptions.maxFilesPerTrigger + + /** A mapping from a file that we have processed to some timestamp it was last modified. */ + // Visible for testing. + val seenFiles = new SeenFilesMap(sourceOptions.maxFileAgeMs) - private val seenFiles = new OpenHashSet[String] - metadataLog.get(None, Some(maxBatchId)).foreach { case (batchId, files) => - files.foreach(seenFiles.add) + metadataLog.get(None, Some(maxBatchId)).foreach { case (batchId, entry) => + entry.foreach(seenFiles.add) + seenFiles.purge() --- End diff -- This is not safe until we truncate the logs.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org