This is an automated email from the ASF dual-hosted git repository. csantanapr pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-openwhisk-utilities.git
The following commit(s) were added to refs/heads/master by this push: new e1faec9 Fix exclusion directory and file matching and add support for reading a top level gitignore file (#57) e1faec9 is described below commit e1faec929990f9107238c5287bae2209eca1b1a2 Author: rodric rabbah <rod...@gmail.com> AuthorDate: Tue Apr 2 22:36:42 2019 -0400 Fix exclusion directory and file matching and add support for reading a top level gitignore file (#57) --- .gitignore | 3 + LICENSE.txt | 12 ++ README.md | 4 +- licenses/LICENSE-pathspec.txt | 374 ++++++++++++++++++++++++++++++++++++++++++ scancode/lib/compat.py | 45 +++++ scancode/lib/gitwildmatch.py | 325 ++++++++++++++++++++++++++++++++++++ scancode/lib/pathspec.py | 146 +++++++++++++++++ scancode/lib/pattern.py | 155 +++++++++++++++++ scancode/lib/util.py | 359 ++++++++++++++++++++++++++++++++++++++++ scancode/scanCode.py | 57 ++++--- scancode/travis.cfg | 3 + 11 files changed, 1455 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index 23c57c9..fd8ce7c 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,6 @@ ansible/roles/nginx/files/*cert.pem # .zip files must be explicited whitelisted *.zip + +# .pyc files +*.pyc diff --git a/LICENSE.txt b/LICENSE.txt index a2fe52f..23fba0a 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -200,3 +200,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + + +======================================================================== +Mozilla Public License 2.0 +======================================================================== + +This distribution bundles the following component, which is +available under an Mozilla Public License 2.0 +(https://www.mozilla.org/en-US/MPL/2.0/). + +Pathspec 0.5.9 (https://pypi.org/project/pathspec/) under scanCode/lib. +License included at licenses/LICENSE-pathspec.txt. 
diff --git a/README.md b/README.md index 93c7de3..fecac52 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ Scan detected 3 error(s) in 1 file(s): To make sure this never happens to you, you can run the same tests on your local machine every time you commit changes. -1. Clone the OpenWhisk utilities project repo.: +1. Clone the OpenWhisk utilities project: ```bash $ git clone https://github.com/apache/incubator-openwhisk-utilities.git ``` @@ -66,7 +66,7 @@ $ cat /path/to/openwhisk/.git/hooks/pre-commit # determine openwhisk base directory root="$(git rev-parse --show-toplevel)" -python /path/to/incubator-openwhisk-utilities/scancode/scanCode.py . --config $root/tools/ +python /path/to/incubator-openwhisk-utilities/scancode/scanCode.py . --config $root/tools/ --gitignore $root/.gitignore ``` _Note_: A hook a locally installed, so if you check out the repository again, you will need to reinstall it. diff --git a/licenses/LICENSE-pathspec.txt b/licenses/LICENSE-pathspec.txt new file mode 100644 index 0000000..52d1351 --- /dev/null +++ b/licenses/LICENSE-pathspec.txt @@ -0,0 +1,374 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. 
"Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. 
For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. 
+Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. + diff --git a/scancode/lib/compat.py b/scancode/lib/compat.py new file mode 100644 index 0000000..540599b --- /dev/null +++ b/scancode/lib/compat.py @@ -0,0 +1,45 @@ +# encoding: utf-8 +# +# This is a copy of source code from Pathspec 0.5.9 +# (https://pypi.org/project/pathspec/) which is +# available under an Mozilla Public License 2.0 +# (https://www.mozilla.org/en-US/MPL/2.0/). +# A copy of the license is also available in +# ../../licenses/LICENSE-pathspec.txt. +# +""" +This module provides compatibility between Python 2 and 3. Hardly +anything is used by this project to constitute including `six`_. + +.. _`six`: http://pythonhosted.org/six +""" + +import sys + +if sys.version_info[0] < 3: + # Python 2. + unicode = unicode + string_types = (basestring,) + + from itertools import izip_longest + + def iterkeys(mapping): + return mapping.iterkeys() + +else: + # Python 3. + unicode = str + string_types = (unicode,) + + from itertools import zip_longest as izip_longest + + def iterkeys(mapping): + return mapping.keys() + +try: + # Python 3.6+. + from collections.abc import Collection as collection_type +except ImportError: + # Python 2.7 - 3.5. + from collections import Container as collection_type + diff --git a/scancode/lib/gitwildmatch.py b/scancode/lib/gitwildmatch.py new file mode 100644 index 0000000..5076bd3 --- /dev/null +++ b/scancode/lib/gitwildmatch.py @@ -0,0 +1,325 @@ +# encoding: utf-8 +# +# This is a copy of source code from Pathspec 0.5.9 +# (https://pypi.org/project/pathspec/) which is +# available under an Mozilla Public License 2.0 +# (https://www.mozilla.org/en-US/MPL/2.0/). 
+# A copy of the license is also available in +# ../../licenses/LICENSE-pathspec.txt. +# +""" +This module implements Git's wildmatch pattern matching which itself is +derived from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" +files. +""" + +from __future__ import unicode_literals + +import re +import warnings + +import util +from compat import unicode +from pattern import RegexPattern + +#: The encoding to use when parsing a byte string pattern. +_BYTES_ENCODING = 'latin1' + + +class GitWildMatchPattern(RegexPattern): + """ + The :class:`GitWildMatchPattern` class represents a compiled Git + wildmatch pattern. + """ + + # Keep the dict-less class hierarchy. + __slots__ = () + + @classmethod + def pattern_to_regex(cls, pattern): + """ + Convert the pattern into a regular expression. + + *pattern* (:class:`unicode` or :class:`bytes`) is the pattern to + convert into a regular expression. + + Returns the uncompiled regular expression (:class:`unicode`, :class:`bytes`, + or :data:`None`), and whether matched files should be included + (:data:`True`), excluded (:data:`False`), or if it is a + null-operation (:data:`None`). + """ + if isinstance(pattern, unicode): + return_type = unicode + elif isinstance(pattern, bytes): + return_type = bytes + pattern = pattern.decode(_BYTES_ENCODING) + else: + raise TypeError("pattern:{!r} is not a unicode or byte string.".format(pattern)) + + pattern = pattern.strip() + + if pattern.startswith('#'): + # A pattern starting with a hash ('#') serves as a comment + # (neither includes nor excludes files). Escape the hash with a + # back-slash to match a literal hash (i.e., '\#'). + regex = None + include = None + + elif pattern == '/': + # EDGE CASE: According to `git check-ignore` (v2.4.1), a single + # '/' does not match any file. + regex = None + include = None + + elif pattern: + + if pattern.startswith('!'): + # A pattern starting with an exclamation mark ('!') negates the + # pattern (exclude instead of include). 
Escape the exclamation + # mark with a back-slash to match a literal exclamation mark + # (i.e., '\!'). + include = False + # Remove leading exclamation mark. + pattern = pattern[1:] + else: + include = True + + if pattern.startswith('\\'): + # Remove leading back-slash escape for escaped hash ('#') or + # exclamation mark ('!'). + pattern = pattern[1:] + + # Split pattern into segments. + pattern_segs = pattern.split('/') + + # Normalize pattern to make processing easier. + + if not pattern_segs[0]: + # A pattern beginning with a slash ('/') will only match paths + # directly on the root directory instead of any descendant + # paths. So, remove empty first segment to make pattern relative + # to root. + del pattern_segs[0] + + elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]): + # A single pattern without a beginning slash ('/') will match + # any descendant path. This is equivalent to "**/{pattern}". So, + # prepend with double-asterisks to make pattern relative to + # root. + # EDGE CASE: This also holds for a single pattern with a + # trailing slash (e.g. dir/). + if pattern_segs[0] != '**': + pattern_segs.insert(0, '**') + + else: + # EDGE CASE: A pattern without a beginning slash ('/') but + # contains at least one prepended directory (e.g. + # "dir/{pattern}") should not match "**/dir/{pattern}", + # according to `git check-ignore` (v2.4.1). + pass + + if not pattern_segs[-1] and len(pattern_segs) > 1: + # A pattern ending with a slash ('/') will match all descendant + # paths if it is a directory but not if it is a regular file. + # This is equivilent to "{pattern}/**". So, set last segment to + # double asterisks to include all descendants. + pattern_segs[-1] = '**' + + # Build regular expression from pattern. 
+ output = ['^'] + need_slash = False + end = len(pattern_segs) - 1 + for i, seg in enumerate(pattern_segs): + if seg == '**': + if i == 0 and i == end: + # A pattern consisting solely of double-asterisks ('**') + # will match every path. + output.append('.+') + elif i == 0: + # A normalized pattern beginning with double-asterisks + # ('**') will match any leading path segments. + output.append('(?:.+/)?') + need_slash = False + elif i == end: + # A normalized pattern ending with double-asterisks ('**') + # will match any trailing path segments. + output.append('/.*') + else: + # A pattern with inner double-asterisks ('**') will match + # multiple (or zero) inner path segments. + output.append('(?:/.+)?') + need_slash = True + elif seg == '*': + # Match single path segment. + if need_slash: + output.append('/') + output.append('[^/]+') + need_slash = True + else: + # Match segment glob pattern. + if need_slash: + output.append('/') + output.append(cls._translate_segment_glob(seg)) + if i == end and include is True: + # A pattern ending without a slash ('/') will match a file + # or a directory (with paths underneath it). E.g., "foo" + # matches "foo", "foo/bar", "foo/bar/baz", etc. + # EDGE CASE: However, this does not hold for exclusion cases + # according to `git check-ignore` (v2.4.1). + output.append('(?:/.*)?') + need_slash = True + output.append('$') + regex = ''.join(output) + + else: + # A blank pattern is a null-operation (neither includes nor + # excludes files). + regex = None + include = None + + if regex is not None and return_type is bytes: + regex = regex.encode(_BYTES_ENCODING) + + return regex, include + + @staticmethod + def _translate_segment_glob(pattern): + """ + Translates the glob pattern to a regular expression. This is used in + the constructor to translate a path segment glob pattern to its + corresponding regular expression. + + *pattern* (:class:`str`) is the glob pattern. + + Returns the regular expression (:class:`str`). 
+ """ + # NOTE: This is derived from `fnmatch.translate()` and is similar to + # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set. + + escape = False + regex = '' + i, end = 0, len(pattern) + while i < end: + # Get next character. + char = pattern[i] + i += 1 + + if escape: + # Escape the character. + escape = False + regex += re.escape(char) + + elif char == '\\': + # Escape character, escape next character. + escape = True + + elif char == '*': + # Multi-character wildcard. Match any string (except slashes), + # including an empty string. + regex += '[^/]*' + + elif char == '?': + # Single-character wildcard. Match any single character (except + # a slash). + regex += '[^/]' + + elif char == '[': + # Braket expression wildcard. Except for the beginning + # exclamation mark, the whole braket expression can be used + # directly as regex but we have to find where the expression + # ends. + # - "[][!]" matchs ']', '[' and '!'. + # - "[]-]" matchs ']' and '-'. + # - "[!]a-]" matchs any character except ']', 'a' and '-'. + j = i + # Pass brack expression negation. + if j < end and pattern[j] == '!': + j += 1 + # Pass first closing braket if it is at the beginning of the + # expression. + if j < end and pattern[j] == ']': + j += 1 + # Find closing braket. Stop once we reach the end or find it. + while j < end and pattern[j] != ']': + j += 1 + + if j < end: + # Found end of braket expression. Increment j to be one past + # the closing braket: + # + # [...] + # ^ ^ + # i j + # + j += 1 + expr = '[' + + if pattern[i] == '!': + # Braket expression needs to be negated. + expr += '^' + i += 1 + elif pattern[i] == '^': + # POSIX declares that the regex braket expression negation + # "[^...]" is undefined in a glob pattern. Python's + # `fnmatch.translate()` escapes the caret ('^') as a + # literal. To maintain consistency with undefined behavior, + # I am escaping the '^' as well. + expr += '\\^' + i += 1 + + # Build regex braket expression. 
Escape slashes so they are + # treated as literal slashes by regex as defined by POSIX. + expr += pattern[i:j].replace('\\', '\\\\') + + # Add regex braket expression to regex result. + regex += expr + + # Set i to one past the closing braket. + i = j + + else: + # Failed to find closing braket, treat opening braket as a + # braket literal instead of as an expression. + regex += '\\[' + + else: + # Regular character, escape it for regex. + regex += re.escape(char) + + return regex + +util.register_pattern('gitwildmatch', GitWildMatchPattern) + + +class GitIgnorePattern(GitWildMatchPattern): + """ + The :class:`GitIgnorePattern` class is deprecated by :class:`GitWildMatchPattern`. + This class only exists to maintain compatibility with v0.4. + """ + + def __init__(self, *args, **kw): + """ + Warn about deprecation. + """ + self._deprecated() + return super(GitIgnorePattern, self).__init__(*args, **kw) + + @staticmethod + def _deprecated(): + """ + Warn about deprecation. + """ + warnings.warn("GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern ('gitwildmatch') instead.", DeprecationWarning, stacklevel=3) + + @classmethod + def pattern_to_regex(cls, *args, **kw): + """ + Warn about deprecation. + """ + cls._deprecated() + return super(GitIgnorePattern, cls).pattern_to_regex(*args, **kw) + +# Register `GitIgnorePattern` as "gitignore" for backward compatibility +# with v0.4. +util.register_pattern('gitignore', GitIgnorePattern) + diff --git a/scancode/lib/pathspec.py b/scancode/lib/pathspec.py new file mode 100644 index 0000000..da08db1 --- /dev/null +++ b/scancode/lib/pathspec.py @@ -0,0 +1,146 @@ +# encoding: utf-8 +# +# This is a copy of source code from Pathspec 0.5.9 +# (https://pypi.org/project/pathspec/) which is +# available under an Mozilla Public License 2.0 +# (https://www.mozilla.org/en-US/MPL/2.0/). +# A copy of the license is also available in +# ../../licenses/LICENSE-pathspec.txt. 
class PathSpec(object):
    """
    The :class:`PathSpec` class is a wrapper around a list of compiled
    :class:`.Pattern` instances.
    """

    def __init__(self, patterns):
        """
        Initializes the :class:`PathSpec` instance.

        *patterns* (:class:`~collections.abc.Collection` or
        :class:`~collections.abc.Iterable`) yields each compiled pattern
        (:class:`.Pattern`).
        """
        # *patterns* holds the compiled patterns. A real collection is kept
        # as-is; any other iterable is copied into a list so that it can be
        # traversed again on every match call.
        self.patterns = patterns if isinstance(patterns, collection_type) else list(patterns)

    def __eq__(self, other):
        """
        Tests the equality of this path-spec with *other* (:class:`PathSpec`)
        by comparing their :attr:`~PathSpec.patterns` attributes pairwise.

        Returns :data:`NotImplemented` when *other* is not a
        :class:`PathSpec`.
        """
        if isinstance(other, PathSpec):
            # izip_longest pads the shorter side, so specs of different
            # lengths end up comparing a pattern against the fill value.
            paired_patterns = izip_longest(self.patterns, other.patterns)
            return all(a == b for a, b in paired_patterns)
        else:
            return NotImplemented

    def __ne__(self, other):
        """
        Tests the inequality of this path-spec with *other*.

        Python 2 does not derive ``!=`` from :meth:`__eq__`; without this
        method two equal path-specs would also compare as unequal there.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __len__(self):
        """
        Returns the number of compiled patterns this path-spec contains
        (:class:`int`).
        """
        return len(self.patterns)

    @classmethod
    def from_lines(cls, pattern_factory, lines):
        """
        Compiles the pattern lines.

        *pattern_factory* can be either the name of a registered pattern
        factory (:class:`str`), or a :class:`~collections.abc.Callable` used
        to compile patterns. It must accept an uncompiled pattern
        (:class:`str`) and return the compiled pattern (:class:`.Pattern`).

        *lines* (:class:`~collections.abc.Iterable`) yields each uncompiled
        pattern (:class:`str`). This simply has to yield each line so it can
        be a :class:`file` (e.g., from :func:`open` or :class:`io.StringIO`)
        or the result from :meth:`str.splitlines`.

        Raises :exc:`TypeError` if *pattern_factory* is not callable or if
        *lines* is a single string.

        Returns the :class:`PathSpec` instance.
        """
        if isinstance(pattern_factory, string_types):
            pattern_factory = util.lookup_pattern(pattern_factory)
        if not callable(pattern_factory):
            raise TypeError("pattern_factory:{!r} is not callable.".format(pattern_factory))

        # A bare string is iterable character by character, which is never
        # what the caller meant; reject it explicitly.
        if isinstance(lines, (bytes, unicode)):
            raise TypeError("lines:{!r} is not an iterable.".format(lines))

        # Empty lines are skipped before compilation.
        lines = [pattern_factory(line) for line in lines if line]
        return cls(lines)

    def match_file(self, file, separators=None):
        """
        Matches the file to this path-spec.

        *file* (:class:`str`) is the file path to be matched against
        :attr:`self.patterns <PathSpec.patterns>`.

        *separators* (:class:`~collections.abc.Collection` of :class:`str`)
        optionally contains the path separators to normalize. See
        :func:`~pathspec.util.normalize_file` for more information.

        Returns :data:`True` if *file* matched; otherwise, :data:`False`.
        """
        norm_file = util.normalize_file(file, separators=separators)
        return util.match_file(self.patterns, norm_file)

    def match_files(self, files, separators=None):
        """
        Matches the files to this path-spec.

        *files* (:class:`~collections.abc.Iterable` of :class:`str`) contains
        the file paths to be matched against :attr:`self.patterns
        <PathSpec.patterns>`.

        *separators* (:class:`~collections.abc.Collection` of :class:`str`;
        or :data:`None`) optionally contains the path separators to
        normalize. See :func:`~pathspec.util.normalize_file` for more
        information.

        Returns the matched files (:class:`~collections.abc.Iterable` of
        :class:`str`). This is a generator, so the :exc:`TypeError` for a
        string argument is raised on first iteration.
        """
        if isinstance(files, (bytes, unicode)):
            raise TypeError("files:{!r} is not an iterable.".format(files))

        # Match on the normalized paths but hand back the caller's originals.
        file_map = util.normalize_files(files, separators=separators)
        matched_files = util.match_files(self.patterns, iterkeys(file_map))
        for path in matched_files:
            yield file_map[path]

    def match_tree(self, root, on_error=None, follow_links=None):
        """
        Walks the specified root path for all files and matches them to this
        path-spec.

        *root* (:class:`str`) is the root directory to search for files.

        *on_error* (:class:`~collections.abc.Callable` or :data:`None`)
        optionally is the error handler for file-system exceptions. See
        :func:`~pathspec.util.iter_tree` for more information.

        *follow_links* (:class:`bool` or :data:`None`) optionally is whether
        to walk symbolic links that resolve to directories. See
        :func:`~pathspec.util.iter_tree` for more information.

        Returns the matched files (:class:`~collections.abc.Iterable` of
        :class:`str`).
        """
        files = util.iter_tree(root, on_error=on_error, follow_links=follow_links)
        return self.match_files(files)
class Pattern(object):
    """
    The :class:`Pattern` class is the abstract definition of a pattern.
    """

    # Make the class dict-less.
    __slots__ = ('include',)

    def __init__(self, include):
        """
        Initializes the :class:`Pattern` instance.

        *include* (:class:`bool` or :data:`None`) is whether the matched
        files should be included (:data:`True`), excluded (:data:`False`),
        or is a null-operation (:data:`None`).
        """
        self.include = include

    def match(self, files):
        """
        Matches this pattern against the specified files.

        *files* (:class:`~collections.abc.Iterable` of :class:`str`) contains
        each file relative to the root directory (e.g.,
        ``"relative/path/to/file"``).

        Returns an :class:`~collections.abc.Iterable` yielding each matched
        file path (:class:`str`). The base class always raises
        :exc:`NotImplementedError`.
        """
        raise NotImplementedError("{}.{} must override match().".format(self.__class__.__module__, self.__class__.__name__))


class RegexPattern(Pattern):
    """
    The :class:`RegexPattern` class is an implementation of a pattern
    using regular expressions.
    """

    # Make the class dict-less.
    __slots__ = ('regex',)

    def __init__(self, pattern, include=None):
        """
        Initializes the :class:`RegexPattern` instance.

        *pattern* (:class:`unicode`, :class:`bytes`, :class:`re.RegexObject`,
        or :data:`None`) is the pattern to compile into a regular
        expression.

        *include* (:class:`bool` or :data:`None`) must be :data:`None`
        unless *pattern* is a precompiled regular expression
        (:class:`re.RegexObject`) in which case it is whether matched files
        should be included (:data:`True`), excluded (:data:`False`), or is
        a null operation (:data:`None`).

        Raises :exc:`TypeError` if *pattern* is none of the accepted types.

        .. NOTE:: Subclasses do not need to support the *include*
           parameter.
        """
        if pattern is None:
            # A null pattern is a null-operation. The regex must still be
            # bound here, otherwise the assignment at the end of this
            # method fails with UnboundLocalError.
            assert include is None, "include:{!r} must be null when pattern:{!r} is null.".format(include, pattern)
            regex = None

        elif isinstance(pattern, (unicode, bytes)):
            assert include is None, "include:{!r} must be null when pattern:{!r} is a string.".format(include, pattern)
            regex, include = self.pattern_to_regex(pattern)
            # NOTE: Make sure to allow a null regular expression to be
            # returned for a null-operation.
            if include is not None:
                regex = re.compile(regex)

        elif hasattr(pattern, 'match'):
            # Assume pattern is a precompiled regular expression.
            # - NOTE: Uses the caller-specified *include*.
            regex = pattern

        else:
            raise TypeError("pattern:{!r} is not a string, RegexObject, or None.".format(pattern))

        super(RegexPattern, self).__init__(include)
        # *regex* is the regular expression for the pattern, or None for a
        # null-operation.
        self.regex = regex

    def __eq__(self, other):
        """
        Tests the equality of this regex pattern with *other*
        (:class:`RegexPattern`) by comparing their :attr:`~Pattern.include`
        and :attr:`~RegexPattern.regex` attributes.
        """
        if isinstance(other, RegexPattern):
            return self.include == other.include and self.regex == other.regex
        else:
            return NotImplemented

    def __ne__(self, other):
        """
        Tests the inequality of this regex pattern with *other*.

        Python 2 does not derive ``!=`` from :meth:`__eq__`, so it is
        spelled out here.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def match(self, files):
        """
        Matches this pattern against the specified files.

        *files* (:class:`~collections.abc.Iterable` of :class:`str`)
        contains each file relative to the root directory (e.g.,
        "relative/path/to/file").

        Returns an :class:`~collections.abc.Iterable` yielding each matched
        file path (:class:`str`). A null-operation pattern yields nothing.
        """
        if self.include is not None:
            for path in files:
                if self.regex.match(path) is not None:
                    yield path

    @classmethod
    def pattern_to_regex(cls, pattern):
        """
        Convert the pattern into an uncompiled regular expression.

        *pattern* (:class:`str`) is the pattern to convert into a regular
        expression.

        Returns the uncompiled regular expression (:class:`str` or
        :data:`None`), and whether matched files should be included
        (:data:`True`), excluded (:data:`False`), or is a null-operation
        (:data:`None`).

        .. NOTE:: The default implementation simply returns *pattern* and
           :data:`True`.
        """
        return pattern, True
def iter_tree(root, on_error=None, follow_links=None):
    """
    Walks the specified directory for all files.

    *root* (:class:`str`) is the root directory to search for files.

    *on_error* (:class:`~collections.abc.Callable` or :data:`None`)
    optionally is the error handler for file-system exceptions. It is
    called with the exception (:exc:`OSError`) raised while inspecting a
    child node. Default is :data:`None`, which skips the node silently.

    *follow_links* (:class:`bool` or :data:`None`) optionally is whether
    to walk symbolic links that resolve to directories. Default is
    :data:`None` for :data:`True`.

    Raises :exc:`TypeError` if *on_error* is neither :data:`None` nor
    callable, and :exc:`RecursionError` if recursion is detected.

    Returns an :class:`~collections.abc.Iterable` yielding the path to
    each file (:class:`str`) relative to *root*.
    """
    if not (on_error is None or callable(on_error)):
        raise TypeError("on_error:{!r} is not callable.".format(on_error))

    walk_links = True if follow_links is None else follow_links

    # The walk starts at the root itself (empty relative path) with an
    # empty ancestor memo.
    start = os.path.abspath(root)
    for found in _iter_tree_next(start, '', {}, on_error, walk_links):
        yield found

def _iter_tree_next(root_full, dir_rel, memo, on_error, follow_links):
    """
    Scan the directory for all descendant files.

    *root_full* (:class:`str`) the absolute path to the root directory.

    *dir_rel* (:class:`str`) the path to the directory to scan relative to
    *root_full*.

    *memo* (:class:`dict`) keeps track of ancestor directories
    encountered. Maps each ancestor real path (:class:`str`) to relative
    path (:class:`str`).

    *on_error* (:class:`~collections.abc.Callable` or :data:`None`)
    optionally is the error handler for file-system exceptions.

    *follow_links* (:class:`bool`) is whether to walk symbolic links that
    resolve to directories.
    """
    dir_full = os.path.join(root_full, dir_rel)
    dir_real = os.path.realpath(dir_full)

    # A canonical (real) path that is already among the ancestors means a
    # link has led back into the directory currently being walked.
    if dir_real in memo:
        raise RecursionError(real_path=dir_real, first_path=memo[dir_real], second_path=dir_rel)
    memo[dir_real] = dir_rel

    for entry in os.listdir(dir_full):
        node_rel = os.path.join(dir_rel, entry)
        node_full = os.path.join(root_full, node_rel)

        # Inspect the child node; for a link, inspect its target instead.
        try:
            node_stat = os.lstat(node_full)
            is_link = stat.S_ISLNK(node_stat.st_mode)
            if is_link:
                node_stat = os.stat(node_full)
        except OSError as e:
            if on_error is not None:
                on_error(e)
            continue

        mode = node_stat.st_mode
        if stat.S_ISDIR(mode) and (follow_links or not is_link):
            # Directory: recurse and pass its descendant files upward.
            for found in _iter_tree_next(root_full, node_rel, memo, on_error, follow_links):
                yield found

        elif stat.S_ISREG(mode):
            # Regular file: report it.
            yield node_rel

    # NOTE: Forget this directory once it has been fully scanned, so the
    # same real directory may legitimately show up again through a sibling
    # path without being mistaken for recursion.
    # See <https://github.com/cpburnz/python-path-specification/pull/7>.
    del memo[dir_real]

def lookup_pattern(name):
    """
    Looks up a registered pattern factory by name.

    *name* (:class:`str`) is the name of the pattern factory.

    Returns the registered pattern factory
    (:class:`~collections.abc.Callable`). If no pattern factory is
    registered, raises :exc:`KeyError`.
    """
    factory = _registered_patterns[name]
    return factory
def match_file(patterns, file):
    """
    Matches the file to the patterns.

    *patterns* (:class:`~collections.abc.Iterable` of
    :class:`~pathspec.pattern.Pattern`) contains the patterns to use.

    *file* (:class:`str`) is the normalized file path to be matched
    against *patterns*.

    Returns :data:`True` if *file* matched; otherwise, :data:`False`. The
    last pattern that matches decides the outcome.
    """
    verdict = False
    for candidate in patterns:
        # Null-operation patterns never take part in matching.
        if candidate.include is None:
            continue
        if file in candidate.match((file,)):
            verdict = candidate.include
    return verdict

def match_files(patterns, files):
    """
    Matches the files to the patterns.

    *patterns* (:class:`~collections.abc.Iterable` of
    :class:`~pathspec.pattern.Pattern`) contains the patterns to use.

    *files* (:class:`~collections.abc.Iterable` of :class:`str`) contains
    the normalized file paths to be matched against *patterns*.

    Returns the matched files (:class:`set` of :class:`str`).
    """
    # Every pattern scans the full list, so a one-shot iterable has to be
    # materialized first.
    if isinstance(files, collection_type):
        candidates = files
    else:
        candidates = list(files)

    selected = set()
    for candidate in patterns:
        if candidate.include is None:
            continue
        hits = candidate.match(candidates)
        if candidate.include:
            selected.update(hits)
        else:
            selected.difference_update(hits)
    return selected

def normalize_file(file, separators=None):
    """
    Normalizes the file path to use the POSIX path separator (i.e., ``'/'``).

    *file* (:class:`str`) is the file path.

    *separators* (:class:`~collections.abc.Collection` of :class:`str`; or
    :data:`None`) optionally contains the path separators to normalize.
    This does not need to include the POSIX path separator (``'/'``), but
    including it will not affect the results. Default is :data:`None` for
    :data:`NORMALIZE_PATH_SEPS`. To prevent normalization, pass an empty
    container (e.g., an empty tuple ``()``).

    Returns the normalized file path (:class:`str`).
    """
    seps = NORMALIZE_PATH_SEPS if separators is None else separators

    # Rewrite every listed separator to the POSIX one.
    result = file
    for sep in seps:
        result = result.replace(sep, posixpath.sep)

    # Drop a leading current-directory prefix.
    return result[2:] if result.startswith('./') else result

def normalize_files(files, separators=None):
    """
    Normalizes the file paths to use the POSIX path separator.

    *files* (:class:`~collections.abc.Iterable` of :class:`str`) contains
    the file paths to be normalized.

    *separators* (:class:`~collections.abc.Collection` of :class:`str`; or
    :data:`None`) optionally contains the path separators to normalize.
    See :func:`normalize_file` for more information.

    Returns a :class:`dict` mapping each normalized file path
    (:class:`str`) to the original file path (:class:`str`).
    """
    return {normalize_file(original, separators=separators): original for original in files}

def register_pattern(name, pattern_factory, override=None):
    """
    Registers the specified pattern factory.

    *name* (:class:`str`) is the name to register the pattern factory
    under.

    *pattern_factory* (:class:`~collections.abc.Callable`) is used to
    compile patterns. It must accept an uncompiled pattern (:class:`str`)
    and return the compiled pattern (:class:`.Pattern`).

    *override* (:class:`bool` or :data:`None`) optionally is whether to
    allow overriding an already registered pattern under the same name
    (:data:`True`), instead of raising an :exc:`AlreadyRegisteredError`
    (:data:`False`). Default is :data:`None` for :data:`False`.

    Raises :exc:`TypeError` if *name* is not a string or *pattern_factory*
    is not callable.
    """
    if not isinstance(name, string_types):
        raise TypeError("name:{!r} is not a string.".format(name))
    if not callable(pattern_factory):
        raise TypeError("pattern_factory:{!r} is not callable.".format(pattern_factory))

    taken = name in _registered_patterns
    if taken and not override:
        raise AlreadyRegisteredError(name, _registered_patterns[name])
    _registered_patterns[name] = pattern_factory
class AlreadyRegisteredError(Exception):
    """
    The :exc:`AlreadyRegisteredError` exception is raised when a pattern
    factory is registered under a name already in use.
    """

    def __init__(self, name, pattern_factory):
        """
        Initializes the :exc:`AlreadyRegisteredError` instance.

        *name* (:class:`str`) is the name of the registered pattern.

        *pattern_factory* (:class:`~collections.abc.Callable`) is the
        registered pattern factory.
        """
        # Both values are kept in ``args``; the properties read them back.
        super(AlreadyRegisteredError, self).__init__(name, pattern_factory)

    @property
    def message(self):
        """
        *message* (:class:`str`) is the error message.
        """
        template = "{name!r} is already registered for pattern factory:{pattern_factory!r}."
        return template.format(name=self.name, pattern_factory=self.pattern_factory)

    @property
    def name(self):
        """
        *name* (:class:`str`) is the name of the registered pattern.
        """
        registered_name = self.args[0]
        return registered_name

    @property
    def pattern_factory(self):
        """
        *pattern_factory* (:class:`~collections.abc.Callable`) is the
        registered pattern factory.
        """
        factory = self.args[1]
        return factory


class RecursionError(Exception):
    """
    The :exc:`RecursionError` exception is raised when recursion is
    detected.
    """

    def __init__(self, real_path, first_path, second_path):
        """
        Initializes the :exc:`RecursionError` instance.

        *real_path* (:class:`str`) is the real path that recursion was
        encountered on.

        *first_path* (:class:`str`) is the first path encountered for
        *real_path*.

        *second_path* (:class:`str`) is the second path encountered for
        *real_path*.
        """
        # Stored positionally in ``args`` as (real, first, second).
        super(RecursionError, self).__init__(real_path, first_path, second_path)

    @property
    def first_path(self):
        """
        *first_path* (:class:`str`) is the first path encountered for
        :attr:`self.real_path <RecursionError.real_path>`.
        """
        first = self.args[1]
        return first

    @property
    def message(self):
        """
        *message* (:class:`str`) is the error message.
        """
        template = "Real path {real!r} was encountered at {first!r} and then {second!r}."
        return template.format(real=self.real_path, first=self.first_path, second=self.second_path)

    @property
    def real_path(self):
        """
        *real_path* (:class:`str`) is the real path that recursion was
        encountered on.
        """
        real = self.args[0]
        return real

    @property
    def second_path(self):
        """
        *second_path* (:class:`str`) is the second path encountered for
        :attr:`self.real_path <RecursionError.real_path>`.
        """
        second = self.args[2]
        return second
-WARN_SCAN_EXCLUDED_PATH_SUMMARY = "Scan excluded (%s) directories:" +WARN_SCAN_EXCLUDED_PATH_SUMMARY = "Scan excluded (%s) patterns:" WARN_SCAN_EXCLUDED_FILE_SUMMARY = "Scan excluded (%s) files:" WARN_SCAN_EXCLUDED_FILE = " Excluded file: %s" -WARN_SCAN_EXCLUDED_PATH = " Excluded path: %s" +WARN_SCAN_EXCLUDED_PATH = " Excluded pattern: %s" MSG_DESCRIPTION = "Scans all source code under specified directory for " \ "project compliance using provided configuration." @@ -211,18 +218,21 @@ def read_license_files(config): raise Exception(ERR_REQUIRED_SECTION % SECTION_LICENSE) -def read_path_exclusions(config): +def read_path_exclusions(config, gitignore_file): """Read the list of paths to exclude from the scan.""" path_dict = get_config_section_dict(config, SECTION_EXCLUDE) # vprint("path_dict: " + str(path_dict)) if path_dict is not None: # each 'key' is an exclusion path for key in path_dict: + key = str.strip(key) if key is not None: exclusion_paths.append(key) - else: - raise Exception(ERR_REQUIRED_SECTION % SECTION_LICENSE) + if gitignore_file is not None: + print_highlight(MSG_READING_GITIGNORE % gitignore_file.name) + for line in gitignore_file.read().splitlines(): + exclusion_paths.append(line) def read_scan_options(config): """Read the Options from the configuration file.""" @@ -251,7 +261,7 @@ def read_regex(config): raise Exception(ERR_REQUIRED_SECTION % SECTION_REGEX) -def read_config_file(file): +def read_config_file(file, gitignore_file): """Read in and validate configuration file.""" try: print_highlight(MSG_READING_CONFIGURATION % file.name) @@ -263,7 +273,7 @@ def read_config_file(file): config.readfp(file) read_license_files(config) read_path_inclusions(config) - read_path_exclusions(config) + read_path_exclusions(config, gitignore_file) read_scan_options(config) read_regex(config) except Exception as e: @@ -399,32 +409,20 @@ def run_line_checks(file_path, checks): errors.append((line_number, err)) return errors - def all_paths(root_dir): """Generator 
that returns files with known extensions that can be scanned. Iteration is recursive beginning at the passed root directory and skipping directories that are listed as exception paths. """ - # For every file in every directory (path) starting at "root_dir" + spec = pathspec.PathSpec.from_lines(GitWildMatchPattern, exclusion_paths) + exclusion_files_set = set(map(lambda f: os.path.join(root_dir, f), spec.match_tree(root_dir))) + for dir_path, dir_names, files in os.walk(root_dir): for f in files: filename = os.path.join(dir_path, f) - - # Map will contain a boolean for each exclusion path tested - # as input to the lambda function. - # only if all() values in the Map are "True" (meaning the file is - # not excluded) then it should yield the filename to run checks on. - # not dir_path.endswith(p) and - if all(map(lambda p: p not in dir_path, exclusion_paths)): - # directory not excluded, now check for any file exclusions - if all(map(lambda p: p not in filename, exclusion_paths)): - yield filename - else: - exclusion_files_set.add(filename) - else: - # directory is excluded - exclusion_files_set.add(filename) + if filename not in exclusion_files_set: + yield filename def colors(): """Create a collection of helper functions to colorize strings.""" @@ -488,6 +486,11 @@ if __name__ == "__main__": dest="config", default=DEFAULT_CONFIG_FILE, help=HELP_CONFIG_FILE) + parser.add_argument("--gitignore", + type=argparse.FileType('r'), + action="store", + dest="gitignore", + help=HELP_GITIGNORE_FILE) parser.add_argument("root_directory", type=str, default=DEFAULT_ROOT_DIR, @@ -500,6 +503,7 @@ if __name__ == "__main__": # Config file at this point is an actual file object config_file = args.config + gitignore_file = args.gitignore # Assign supported scan functions to either file or line globals # These checks run once per-file @@ -517,7 +521,7 @@ if __name__ == "__main__": }) # Read / load configuration file from file (pointer) - if read_config_file(config_file) == -1: + if 
read_config_file(config_file, gitignore_file) == -1: exit(1) # Verify starting path parameter is valid @@ -532,11 +536,12 @@ if __name__ == "__main__": # Runs all listed checks on all relevant files. all_errors = [] + paths_to_check = set(all_paths(root_dir)) for fltr, chks1, chks2 in FILTERS_WITH_CHECK_FUNCTIONS: # print_error(col.cyan(MSG_SCANNING_FILTER % fltr)) # print_error("chks1=" + str(chks1)) # print_error("chks2=" + str(chks2)) - for path in fnmatch.filter(all_paths(root_dir), fltr): + for path in fnmatch.filter(paths_to_check, fltr): errors = run_file_checks(path, chks1) errors += run_line_checks(path, chks2) all_errors += map(lambda p: (path, p[0], p[1]), errors) diff --git a/scancode/travis.cfg b/scancode/travis.cfg index 9c10889..8f90563 100644 --- a/scancode/travis.cfg +++ b/scancode/travis.cfg @@ -52,6 +52,9 @@ ASFMinifiedLicenseHeaderREM.txt # Scancode unit tests tests/exclude +# Pathspec library +lib/ + [Options] # Not all code files allow licenses to appear starting at the first character # of the file. This option tells the scan to allow licenses to appear starting