j-y-matsubara commented on a change in pull request #9531: URL: https://github.com/apache/airflow/pull/9531#discussion_r448466461
########## File path: airflow/utils/file.py ########## @@ -90,6 +90,48 @@ def open_maybe_zipped(fileloc, mode='r'): return io.open(fileloc, mode=mode) +def find_path_from_directory( + base_dir_path: str, + ignore_list_file: str) -> Generator[str, None, None]: + """ + Search the file and return the path of the file that should not be ignored. + :param base_dir_path: the base path to be searched for. + :param ignore_file_list_name: the file name in which specifies a regular expression pattern is written. + + :return : file path not to be ignored + """ + + patterns_by_dir: Dict[str, List[Pattern[str]]] = {} + + for root, dirs, files in os.walk(str(base_dir_path), followlinks=True): + patterns: List[Pattern[str]] = patterns_by_dir.get(root, []) + + ignore_list_file_path = os.path.join(root, ignore_list_file) + if os.path.isfile(ignore_list_file_path): + with open(ignore_list_file_path, 'r') as file: + lines_no_comments = [re.compile(r"\s*#.*").sub("", line) for line in file.read().split("\n")] + patterns += [re.compile(line) for line in lines_no_comments if line] + patterns = list(set(patterns)) + + dirs[:] = [ + subdir + for subdir in dirs + if not any(p.search( + os.path.join(os.path.relpath(root, str(base_dir_path)), subdir)) for p in patterns) + ] + + for subdir in dirs: + patterns_by_dir[os.path.join(root, subdir)] = patterns.copy() Review comment: This is necessary. A canonical pattern that is evaluated in a parent directory must also be evaluated in that directory's child directories. At least that's how .airflowignore (DAG selection) is currently specified. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org