turbaszek commented on a change in pull request #9531:
URL: https://github.com/apache/airflow/pull/9531#discussion_r448366593



##########
File path: airflow/utils/file.py
##########
@@ -90,6 +90,48 @@ def open_maybe_zipped(fileloc, mode='r'):
         return io.open(fileloc, mode=mode)
 
 
+def find_path_from_directory(
+        base_dir_path: str,
+        ignore_list_file: str) -> Generator[str, None, None]:
+    """
+    Search the file and return the path of the file that should not be ignored.
+    :param base_dir_path: the base path to be searched for.
+    :param ignore_file_list_name: the file name in which specifies a regular 
expression pattern is written.
+
+    :return : file path not to be ignored
+    """
+
+    patterns_by_dir: Dict[str, List[Pattern[str]]] = {}
+
+    for root, dirs, files in os.walk(str(base_dir_path), followlinks=True):
+        patterns: List[Pattern[str]] = patterns_by_dir.get(root, [])
+
+        ignore_list_file_path = os.path.join(root, ignore_list_file)
+        if os.path.isfile(ignore_list_file_path):
+            with open(ignore_list_file_path, 'r') as file:
+                lines_no_comments = [re.compile(r"\s*#.*").sub("", line) for 
line in file.read().split("\n")]
+                patterns += [re.compile(line) for line in lines_no_comments if 
line]
+                patterns = list(set(patterns))
+
+        dirs[:] = [
+            subdir
+            for subdir in dirs
+            if not any(p.search(
+                os.path.join(os.path.relpath(root, str(base_dir_path)), 
subdir)) for p in patterns)
+        ]
+
+        for subdir in dirs:
+            patterns_by_dir[os.path.join(root, subdir)] = patterns.copy()

Review comment:
       ```suggestion
   
           patterns_by_dir  = {os.path.join(root, sd): patterns.copy() for sd 
in dirs}
   ```
   WDYT? Also, do we have to create copy of `patterns` each time?

##########
File path: airflow/plugins_manager.py
##########
@@ -164,34 +165,34 @@ def load_plugins_from_plugin_directory():
     global plugins  # pylint: disable=global-statement
     log.debug("Loading plugins from directory: %s", settings.PLUGINS_FOLDER)
 
-    # Crawl through the plugins folder to find AirflowPlugin derivatives
-    for root, _, files in os.walk(settings.PLUGINS_FOLDER, followlinks=True):  
# noqa # pylint: disable=too-many-nested-blocks
-        for f in files:
-            filepath = os.path.join(root, f)
-            try:
-                if not os.path.isfile(filepath):
-                    continue
-                mod_name, file_ext = os.path.splitext(
-                    os.path.split(filepath)[-1])
-                if file_ext != '.py':
-                    continue
-
-                log.debug('Importing plugin module %s', filepath)
-
-                loader = importlib.machinery.SourceFileLoader(mod_name, 
filepath)
-                spec = importlib.util.spec_from_loader(mod_name, loader)
-                mod = importlib.util.module_from_spec(spec)
-                sys.modules[spec.name] = mod
-                loader.exec_module(mod)
-                for mod_attr_value in list(mod.__dict__.values()):
-                    if is_valid_plugin(mod_attr_value):
-                        plugin_instance = mod_attr_value()
-                        plugins.append(plugin_instance)
-            except Exception as e:  # pylint: disable=broad-except
-                log.exception(e)
-                path = filepath or str(f)
-                log.error('Failed to import plugin %s', path)
-                import_errors[path] = str(e)
+    ignore_list_file = ".airflowignore"

Review comment:
       Should we make it a constant?

##########
File path: airflow/utils/file.py
##########
@@ -90,6 +90,48 @@ def open_maybe_zipped(fileloc, mode='r'):
         return io.open(fileloc, mode=mode)
 
 
+def find_path_from_directory(
+        base_dir_path: str,
+        ignore_list_file: str) -> Generator[str, None, None]:
+    """
+    Search the file and return the path of the file that should not be ignored.
+    :param base_dir_path: the base path to be searched for.
+    :param ignore_file_list_name: the file name in which specifies a regular 
expression pattern is written.
+
+    :return : file path not to be ignored
+    """
+
+    patterns_by_dir: Dict[str, List[Pattern[str]]] = {}
+
+    for root, dirs, files in os.walk(str(base_dir_path), followlinks=True):
+        patterns: List[Pattern[str]] = patterns_by_dir.get(root, [])
+
+        ignore_list_file_path = os.path.join(root, ignore_list_file)
+        if os.path.isfile(ignore_list_file_path):
+            with open(ignore_list_file_path, 'r') as file:
+                lines_no_comments = [re.compile(r"\s*#.*").sub("", line) for 
line in file.read().split("\n")]
+                patterns += [re.compile(line) for line in lines_no_comments if 
line]
+                patterns = list(set(patterns))
+
+        dirs[:] = [
+            subdir
+            for subdir in dirs
+            if not any(p.search(
+                os.path.join(os.path.relpath(root, str(base_dir_path)), 
subdir)) for p in patterns)
+        ]
+
+        for subdir in dirs:
+            patterns_by_dir[os.path.join(root, subdir)] = patterns.copy()
+
+        for file in files:  # type: ignore
+            if file == ignore_list_file:
+                continue
+            file_path = os.path.join(root, str(file))
+            if any([re.findall(p, file_path) for p in patterns]):

Review comment:
       ```suggestion
               if any(re.findall(p, file_path) for p in patterns):
   ```
   Should work also

##########
File path: airflow/plugins_manager.py
##########
@@ -164,34 +165,34 @@ def load_plugins_from_plugin_directory():
     global plugins  # pylint: disable=global-statement
     log.debug("Loading plugins from directory: %s", settings.PLUGINS_FOLDER)
 
-    # Crawl through the plugins folder to find AirflowPlugin derivatives
-    for root, _, files in os.walk(settings.PLUGINS_FOLDER, followlinks=True):  
# noqa # pylint: disable=too-many-nested-blocks
-        for f in files:
-            filepath = os.path.join(root, f)
-            try:
-                if not os.path.isfile(filepath):
-                    continue
-                mod_name, file_ext = os.path.splitext(
-                    os.path.split(filepath)[-1])
-                if file_ext != '.py':
-                    continue
-
-                log.debug('Importing plugin module %s', filepath)
-
-                loader = importlib.machinery.SourceFileLoader(mod_name, 
filepath)
-                spec = importlib.util.spec_from_loader(mod_name, loader)
-                mod = importlib.util.module_from_spec(spec)
-                sys.modules[spec.name] = mod
-                loader.exec_module(mod)
-                for mod_attr_value in list(mod.__dict__.values()):
-                    if is_valid_plugin(mod_attr_value):
-                        plugin_instance = mod_attr_value()
-                        plugins.append(plugin_instance)
-            except Exception as e:  # pylint: disable=broad-except
-                log.exception(e)
-                path = filepath or str(f)
-                log.error('Failed to import plugin %s', path)
-                import_errors[path] = str(e)
+    ignore_list_file = ".airflowignore"
+
+    for file_path in find_path_from_directory(  # pylint: 
disable=too-many-nested-blocks
+            str(settings.PLUGINS_FOLDER), str(ignore_list_file)):
+
+        try:
+            if not os.path.isfile(file_path):
+                continue
+            mod_name, file_ext = os.path.splitext(
+                os.path.split(file_path)[-1])
+            if file_ext != '.py':
+                continue
+
+            log.info('Importing plugin module %s', file_path)
+
+            loader = importlib.machinery.SourceFileLoader(mod_name, file_path)
+            spec = importlib.util.spec_from_loader(mod_name, loader)
+            mod = importlib.util.module_from_spec(spec)
+            sys.modules[spec.name] = mod
+            loader.exec_module(mod)
+            for mod_attr_value in list(mod.__dict__.values()):
+                if is_valid_plugin(mod_attr_value):
+                    plugin_instance = mod_attr_value()
+                    plugins.append(plugin_instance)

Review comment:
       ```suggestion
               for mod_attr_value in (m for m in mod.__dict__.values() if 
is_valid_plugin(m)):
                   plugin_instance = mod_attr_value()
                   plugins.append(plugin_instance)
   ```
   No strong opinion here but in this way we may be able to fix the `# pylint: 
disable=too-many-nested-blocks`

##########
File path: airflow/plugins_manager.py
##########
@@ -164,34 +165,34 @@ def load_plugins_from_plugin_directory():
     global plugins  # pylint: disable=global-statement
     log.debug("Loading plugins from directory: %s", settings.PLUGINS_FOLDER)
 
-    # Crawl through the plugins folder to find AirflowPlugin derivatives
-    for root, _, files in os.walk(settings.PLUGINS_FOLDER, followlinks=True):  
# noqa # pylint: disable=too-many-nested-blocks
-        for f in files:
-            filepath = os.path.join(root, f)
-            try:
-                if not os.path.isfile(filepath):
-                    continue
-                mod_name, file_ext = os.path.splitext(
-                    os.path.split(filepath)[-1])
-                if file_ext != '.py':
-                    continue
-
-                log.debug('Importing plugin module %s', filepath)
-
-                loader = importlib.machinery.SourceFileLoader(mod_name, 
filepath)
-                spec = importlib.util.spec_from_loader(mod_name, loader)
-                mod = importlib.util.module_from_spec(spec)
-                sys.modules[spec.name] = mod
-                loader.exec_module(mod)
-                for mod_attr_value in list(mod.__dict__.values()):
-                    if is_valid_plugin(mod_attr_value):
-                        plugin_instance = mod_attr_value()
-                        plugins.append(plugin_instance)
-            except Exception as e:  # pylint: disable=broad-except
-                log.exception(e)
-                path = filepath or str(f)
-                log.error('Failed to import plugin %s', path)
-                import_errors[path] = str(e)
+    ignore_list_file = ".airflowignore"
+
+    for file_path in find_path_from_directory(  # pylint: 
disable=too-many-nested-blocks
+            str(settings.PLUGINS_FOLDER), str(ignore_list_file)):
+
+        try:
+            if not os.path.isfile(file_path):
+                continue
+            mod_name, file_ext = os.path.splitext(
+                os.path.split(file_path)[-1])
+            if file_ext != '.py':
+                continue

Review comment:
       Does this part have to be in `try` clause? 

##########
File path: airflow/plugins_manager.py
##########
@@ -164,34 +165,34 @@ def load_plugins_from_plugin_directory():
     global plugins  # pylint: disable=global-statement
     log.debug("Loading plugins from directory: %s", settings.PLUGINS_FOLDER)
 
-    # Crawl through the plugins folder to find AirflowPlugin derivatives
-    for root, _, files in os.walk(settings.PLUGINS_FOLDER, followlinks=True):  
# noqa # pylint: disable=too-many-nested-blocks
-        for f in files:
-            filepath = os.path.join(root, f)
-            try:
-                if not os.path.isfile(filepath):
-                    continue
-                mod_name, file_ext = os.path.splitext(
-                    os.path.split(filepath)[-1])
-                if file_ext != '.py':
-                    continue
-
-                log.debug('Importing plugin module %s', filepath)
-
-                loader = importlib.machinery.SourceFileLoader(mod_name, 
filepath)
-                spec = importlib.util.spec_from_loader(mod_name, loader)
-                mod = importlib.util.module_from_spec(spec)
-                sys.modules[spec.name] = mod
-                loader.exec_module(mod)
-                for mod_attr_value in list(mod.__dict__.values()):
-                    if is_valid_plugin(mod_attr_value):
-                        plugin_instance = mod_attr_value()
-                        plugins.append(plugin_instance)
-            except Exception as e:  # pylint: disable=broad-except
-                log.exception(e)
-                path = filepath or str(f)
-                log.error('Failed to import plugin %s', path)
-                import_errors[path] = str(e)
+    ignore_list_file = ".airflowignore"
+
+    for file_path in find_path_from_directory(  # pylint: 
disable=too-many-nested-blocks
+            str(settings.PLUGINS_FOLDER), str(ignore_list_file)):

Review comment:
       Is this cast to `str` necessary?

##########
File path: airflow/utils/file.py
##########
@@ -90,6 +90,48 @@ def open_maybe_zipped(fileloc, mode='r'):
         return io.open(fileloc, mode=mode)
 
 
+def find_path_from_directory(
+        base_dir_path: str,
+        ignore_list_file: str) -> Generator[str, None, None]:
+    """
+    Search the file and return the path of the file that should not be ignored.
+    :param base_dir_path: the base path to be searched for.
+    :param ignore_file_list_name: the file name in which specifies a regular 
expression pattern is written.
+
+    :return : file path not to be ignored
+    """
+
+    patterns_by_dir: Dict[str, List[Pattern[str]]] = {}
+
+    for root, dirs, files in os.walk(str(base_dir_path), followlinks=True):
+        patterns: List[Pattern[str]] = patterns_by_dir.get(root, [])
+
+        ignore_list_file_path = os.path.join(root, ignore_list_file)
+        if os.path.isfile(ignore_list_file_path):
+            with open(ignore_list_file_path, 'r') as file:
+                lines_no_comments = [re.compile(r"\s*#.*").sub("", line) for 
line in file.read().split("\n")]

Review comment:
       Should we compile this only once?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to