Github user gatorsmile commented on a diff in the pull request: https://github.com/apache/spark/pull/14038#discussion_r118045500 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala --- @@ -31,6 +31,34 @@ import org.apache.spark.sql.types.StructType /** + * A filter class to list qualified paths in parallel. + */ +abstract class PathFilter extends Serializable { + final def accept(path: Path): Boolean = isDataPath(path) || isMetaDataPath(path) + def isDataPath(path: Path): Boolean = false + def isMetaDataPath(path: Path): Boolean = false +} + +object PathFilter { + + /** A default path filter to pass through all input paths. */ + val defaultPathFilter = new PathFilter { + + override def isDataPath(path: Path): Boolean = { + // We filter out the following paths: + // 1. everything that starts with _ and ., except _common_metadata and _metadata, + // because Parquet needs to find those metadata files from leaf files returned by this method. + // We should refactor this logic to not mix metadata files with data files. + // 2. everything that ends with `._COPYING_`, because this is an intermediate state of a file. We + // should skip this file to avoid double reading. + val name = path.getName + !((name.startsWith("_") && !name.contains("=")) || name.startsWith(".") || + name.endsWith("._COPYING_")) --- End diff -- Like @rxin said, this sounds risky to me too.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes it to be, or if the feature is enabled but not working, please contact infrastructure at infrastructure@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org