Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package python-urlextract for openSUSE:Factory checked in at 2022-10-30 18:29:09
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-urlextract (Old)
 and      /work/SRC/openSUSE:Factory/.python-urlextract.new.2275 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-urlextract"

Sun Oct 30 18:29:09 2022 rev:4 rq:1032275 version:1.7.0

Changes:
--------
--- /work/SRC/openSUSE:Factory/python-urlextract/python-urlextract.changes     2022-08-17 18:26:24.095620815 +0200
+++ /work/SRC/openSUSE:Factory/.python-urlextract.new.2275/python-urlextract.changes   2022-10-30 18:29:35.274628167 +0100
@@ -1,0 +2,9 @@
+Sat Oct 29 16:25:27 UTC 2022 - Yogalakshmi Arunachalam <yarunacha...@suse.com>
+
+- Update to v1.7.0
+  * correct handling when authority starts with @ symbol
+  * remove unreserved characters from the beginning of found URL
+  * added typing and mypy checkcs - by mimi89999
+  * updated list of TLDs
+
+-------------------------------------------------------------------

Old:
----
  urlextract-1.6.0.tar.gz

New:
----
  urlextract-1.7.0.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-urlextract.spec ++++++
--- /var/tmp/diff_new_pack.p5I030/_old  2022-10-30 18:29:35.698630458 +0100
+++ /var/tmp/diff_new_pack.p5I030/_new  2022-10-30 18:29:35.706630500 +0100
@@ -19,7 +19,7 @@
 %{?!python_module:%define python_module() python-%{**} python3-%{**}}
 %define skip_python2 1
 Name:           python-urlextract
-Version:        1.6.0
+Version:        1.7.0
 Release:        0
 Summary:        Collects and extracts URLs from given text
 License:        MIT
++++++ urlextract-1.6.0.tar.gz -> urlextract-1.7.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/.bumpversion.cfg new/URLExtract-1.7.0/.bumpversion.cfg
--- old/URLExtract-1.6.0/.bumpversion.cfg       2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/.bumpversion.cfg       2022-10-22 19:41:56.000000000 +0200
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.6.0
+current_version = 1.7.0
 commit = True
 tag = True
 message = Version {new_version}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/CHANGELOG.rst new/URLExtract-1.7.0/CHANGELOG.rst
--- old/URLExtract-1.6.0/CHANGELOG.rst  2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/CHANGELOG.rst  2022-10-22 19:41:56.000000000 +0200
@@ -2,6 +2,12 @@
 ~~~~~~~~~
 - N/A
 
+- 1.7.0 (2022-10-22)
+  - correct handling when authority starts with @ symbol
+  - remove unreserved characters from the beginning of found URL
+  - added typing and mypy checkcs - by mimi89999
+  - updated list of TLDs
+
 - 1.6.0 (2022-05-17)
   - Add a list of URLs allowed to extract (issue #125) - by khoben
   - correct order of actual and expected in tests
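To make the behavioural changes above concrete, here is a minimal usage sketch (not part of the
package diff; the inputs mirror the new upstream test cases, and the expected outputs assume
urlextract 1.7.0):

    from urlextract import URLExtract

    extractor = URLExtract()

    # an authority starting with "@" is no longer reported as a URL,
    # so only the bare domain-like token before it is returned
    print(extractor.find_urls("bad.email @address.net>"))
    # expected (per the updated test suite): ['bad.email']

    # leading unreserved characters ("-", ".", "~", "_") are stripped from a match
    print(extractor.find_urls("* test link -https://www.example.com"))
    # expected: ['https://www.example.com']
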
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/README.rst new/URLExtract-1.7.0/README.rst
--- old/URLExtract-1.6.0/README.rst     2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/README.rst     2022-10-22 19:41:56.000000000 +0200
@@ -4,8 +4,8 @@
 URLExtract is python class for collecting (extracting) URLs from given
 text based on locating TLD.
 
-.. image:: https://img.shields.io/travis/lipoja/URLExtract/master.svg
-   :target: https://travis-ci.org/lipoja/URLExtract
+.. image:: https://img.shields.io/github/workflow/status/lipoja/URLExtract/Upload%20Python%20Package
+   :target: https://github.com/lipoja/URLExtract/actions/workflows/python-publish.yml
    :alt: Build Status
 .. image:: https://img.shields.io/github/tag/lipoja/URLExtract.svg
    :target: https://github.com/lipoja/URLExtract/tags
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/setup.py new/URLExtract-1.7.0/setup.py
--- old/URLExtract-1.6.0/setup.py       2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/setup.py       2022-10-22 19:41:56.000000000 +0200
@@ -16,7 +16,7 @@
 
 # version of URLExtract
 # (do not forget to change it in urlextract_core.py as well)
-__version__ = "1.6.0"
+__version__ = "1.7.0"
 
 
 def read(readme):
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/tests/unit/test_extract_email.py new/URLExtract-1.7.0/tests/unit/test_extract_email.py
--- old/URLExtract-1.6.0/tests/unit/test_extract_email.py      2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/tests/unit/test_extract_email.py      2022-10-22 19:41:56.000000000 +0200
@@ -34,6 +34,7 @@
     [
         ("Do not extract emails by default j...@example.com", ["j...@example.com"]),
         ("<em...@address.net>", ["em...@address.net"]),
+        ("whitespace @address.net>", []),
         ("Given URIs are not mail j...@example.com/asdasd j...@example.com:1234", []),
         ("Given URIs are not mail j...@example.com?not j...@example.com#not", []),
     ],
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/tests/unit/test_find_urls.py new/URLExtract-1.7.0/tests/unit/test_find_urls.py
--- old/URLExtract-1.6.0/tests/unit/test_find_urls.py  2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/tests/unit/test_find_urls.py  2022-10-22 19:41:56.000000000 +0200
@@ -37,8 +37,8 @@
             ["https://example.com/what.com"],
         ),
         (
-            "https://i2.wp.com/siliconfilter.com/2011/06/example.jpg",
-            ["https://i2.wp.com/siliconfilter.com/2011/06/example.jpg"],
+            "* test link -https://www.example.com",
+            ["https://www.example.com"],
         ),
         (
             "https://www.test.org/paper/apostrophe'in-url",
@@ -57,6 +57,7 @@
             "<script src='//www.example.com/somejsfile.js'>",
             ["www.example.com/somejsfile.js"],
         ),
+        ("bad.email @address.net>", ['bad.email']),
     ],
 )
 def test_find_urls(urlextract, text, expected):
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/tox.ini new/URLExtract-1.7.0/tox.ini
--- old/URLExtract-1.6.0/tox.ini        2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/tox.ini        2022-10-22 19:41:56.000000000 +0200
@@ -3,6 +3,7 @@
     py-{nocache,cache}
     black
     flake8
+    mypy
 skip_missing_interpreters = true
 
 [testenv]
@@ -30,3 +31,9 @@
     black urlextract --check --skip-string-normalization
     black tests --check --skip-string-normalization
     black setup.py --check --skip-string-normalization
+
+[testenv:mypy]
+deps =
+    mypy
+commands =
+    mypy --install-types --non-interactive --namespace-packages urlextract
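Since the new mypy environment is also added to the tox envlist above, the type checks can be run
on their own locally (assuming tox is installed):

    tox -e mypy
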
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/urlextract/cachefile.py new/URLExtract-1.7.0/urlextract/cachefile.py
--- old/URLExtract-1.6.0/urlextract/cachefile.py        2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/urlextract/cachefile.py        2022-10-22 19:41:56.000000000 +0200
@@ -12,10 +12,12 @@
 import os
 import tempfile
 import urllib.request
+from typing import Set, Iterable, Tuple, List, Union, NoReturn
+
 from datetime import datetime
 from urllib.error import URLError, HTTPError
 
-import idna
+import idna  # type: ignore
 import filelock
 from platformdirs import user_cache_dir
@@ -61,7 +63,7 @@
         self._tld_list_path = self._get_default_cache_file_path()
         self._default_cache_file = True
 
-    def _get_default_cache_dir(self):
+    def _get_default_cache_dir(self) -> str:
         """
         Returns default cache directory (data directory)
 
@@ -72,7 +74,7 @@
 
         return os.path.join(os.path.dirname(__file__), self._DATA_DIR)
 
-    def _get_default_cache_file_path(self):
+    def _get_default_cache_file_path(self) -> str:
         """
         Returns default cache file path
 
@@ -91,7 +93,7 @@
 
         return default_list_path
 
-    def _get_writable_cache_dir(self):
+    def _get_writable_cache_dir(self) -> str:
         """
         Get writable cache directory with fallback to user's cache directory
         and global temp directory
@@ -124,7 +126,7 @@
 
         raise CacheFileError("Cache directories are not writable.")
 
-    def _get_cache_file_path(self):
+    def _get_cache_file_path(self) -> str:
         """
         Get path for cache file
 
@@ -148,7 +150,7 @@
 
         # get path for cached file
         return os.path.join(cache_dir, self._CACHE_FILE_NAME)
 
-    def _get_cache_lock_file_path(self):
+    def _get_cache_lock_file_path(self) -> str:
         """
         Get path for cache file lock
 
@@ -158,7 +160,7 @@
         """
         return self._get_cache_file_path() + ".lock"
 
-    def _download_tlds_list(self):
+    def _download_tlds_list(self) -> bool:
         """
         Function downloads list of TLDs from IANA.
         LINK: https://data.iana.org/TLD/tlds-alpha-by-domain.txt
@@ -215,7 +217,7 @@
 
         return True
 
-    def _load_cached_tlds(self):
+    def _load_cached_tlds(self) -> Set[str]:
         """
         Loads TLDs from cached file to set.
 
@@ -231,7 +233,7 @@
             )
             raise CacheFileError("Cached file is not readable for current user.")
 
-        set_of_tlds = set()
+        set_of_tlds: Set[str] = set()
 
         with filelock.FileLock(self._get_cache_lock_file_path()):
             with open(self._tld_list_path, "r") as f_cache_tld:
@@ -249,7 +251,7 @@
 
         return set_of_tlds
 
-    def _get_last_cachefile_modification(self):
+    def _get_last_cachefile_modification(self) -> Union[datetime, None]:
         """
         Get last modification of cache file with TLDs.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/urlextract/data/tlds-alpha-by-domain.txt new/URLExtract-1.7.0/urlextract/data/tlds-alpha-by-domain.txt
--- old/URLExtract-1.6.0/urlextract/data/tlds-alpha-by-domain.txt      2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/urlextract/data/tlds-alpha-by-domain.txt      2022-10-22 19:41:56.000000000 +0200
@@ -1,4 +1,4 @@
-# Version 2022051700, Last Updated Tue May 17 07:07:01 2022 UTC
+# Version 2022102200, Last Updated Sat Oct 22 07:07:01 2022 UTC
 AAA
 AARP
 ABARTH
@@ -176,7 +176,6 @@
 BRUSSELS
 BS
 BT
-BUGATTI
 BUILD
 BUILDERS
 BUSINESS
@@ -196,7 +195,6 @@
 CAM
 CAMERA
 CAMP
-CANCERRESEARCH
 CANON
 CAPETOWN
 CAPITAL
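The bundled IANA snapshot above only changes when a new release is cut; at runtime the cached TLD
list can be refreshed through the public API whose typed signature appears in the urlextract_core.py
diff below. A minimal sketch (network access to data.iana.org assumed):

    from urlextract import URLExtract

    extractor = URLExtract()
    # re-download https://data.iana.org/TLD/tlds-alpha-by-domain.txt
    # if the cached copy is older than 7 days
    extractor.update_when_older(7)
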
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/urlextract/urlextract_core.py new/URLExtract-1.7.0/urlextract/urlextract_core.py
--- old/URLExtract-1.6.0/urlextract/urlextract_core.py  2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/urlextract/urlextract_core.py  2022-10-22 19:41:56.000000000 +0200
@@ -8,22 +8,24 @@
 .. codeauthor:: Jan Lipovský <janlipov...@gmail.com>, janlipovsky.cz
 .. contributors: https://github.com/lipoja/URLExtract/graphs/contributors
 """
+from argparse import Namespace
 import functools
 import ipaddress
 import logging
 import re
 import socket
+from typing import Set, Iterable, Tuple, List, Union, NoReturn, Generator
 import string
 import sys
 from collections import OrderedDict
 from datetime import datetime, timedelta
 
-import uritools
+import uritools  # type: ignore
 
 from urlextract.cachefile import CacheFile, CacheFileError
 
 # version of URLExtract (do not forget to change it in setup.py as well)
-__version__ = "1.6.0"
+__version__ = "1.7.0"
 
 # default value for maximum count of processed URLs by find_url
 DEFAULT_LIMIT = 10000
@@ -67,8 +69,8 @@
     }
     _ipv4_tld = [".{}".format(ip) for ip in reversed(range(256))]
 
-    _ignore_list = set()
-    _permit_list = set()
+    _ignore_list: Set[str] = set()
+    _permit_list: Set[str] = set()
 
     _limit = DEFAULT_LIMIT
 
@@ -116,7 +118,7 @@
         # characters that are allowed to be right after TLD
         self._after_tld_chars = self._get_after_tld_chars()
 
-    def _get_after_tld_chars(self):
+    def _get_after_tld_chars(self) -> Set[str]:
         """Initialize after tld characters"""
         after_tld_chars = set(string.whitespace)
         after_tld_chars |= {"/", '"', "'", "<", ">", "?", ":", ".", ","}
@@ -142,7 +144,7 @@
         self._tlds_re = re.compile("|".join(re_escaped), flags=re.IGNORECASE)
 
     @property
-    def extract_email(self):
+    def extract_email(self) -> bool:
         """
         If set to True email will be extracted from text
 
@@ -151,7 +153,7 @@
         return self._extract_email
 
     @extract_email.setter
-    def extract_email(self, extract):
+    def extract_email(self, extract: bool):
         """
         Set if emails will be extracted from text
 
@@ -160,7 +162,7 @@
         self._extract_email = extract
 
     @property
-    def extract_localhost(self):
+    def extract_localhost(self) -> bool:
         """
         If set to True 'localhost' will be extracted as URL from text
 
@@ -169,7 +171,7 @@
         return self._extract_localhost
 
     @extract_localhost.setter
-    def extract_localhost(self, enable):
+    def extract_localhost(self, enable: bool):
         """
         Set if 'localhost' will be extracted as URL from text
 
@@ -179,7 +181,7 @@
         self._extract_localhost = enable
 
     @property
-    def ignore_list(self):
+    def ignore_list(self) -> Set[str]:
         """
         Returns set of URLs on ignore list
 
@@ -189,7 +191,7 @@
         return self._ignore_list
 
     @ignore_list.setter
-    def ignore_list(self, ignore_list):
+    def ignore_list(self, ignore_list: Set[str]):
         """
         Set of URLs to be ignored (not returned) while extracting from text
 
@@ -256,7 +258,7 @@
 
         return True
 
-    def update_when_older(self, days):
+    def update_when_older(self, days: int) -> bool:
         """
         Update TLD list cache file if the list is older than
         number of days given in parameter `days` or if does not exist.
@@ -278,7 +280,7 @@
         return True
 
     @staticmethod
-    def get_version():
+    def get_version() -> str:
         """
         Returns version number.
 
@@ -288,7 +290,7 @@
 
         return __version__
 
-    def get_after_tld_chars(self):
+    def get_after_tld_chars(self) -> List[str]:
         """
         Returns list of chars that are allowed after TLD
 
@@ -298,7 +300,7 @@
 
         return list(self._after_tld_chars)
 
-    def set_after_tld_chars(self, after_tld_chars):
+    def set_after_tld_chars(self, after_tld_chars: Iterable[str]):
         """
         Set chars that are allowed after TLD.
 
@@ -307,7 +309,7 @@
 
         self._after_tld_chars = set(after_tld_chars)
 
-    def get_stop_chars_left(self):
+    def get_stop_chars_left(self) -> Set[str]:
         """
         Returns set of stop chars for text on left from TLD.
 
@@ -316,7 +318,7 @@
         """
         return self._stop_chars_left
 
-    def set_stop_chars_left(self, stop_chars):
+    def set_stop_chars_left(self, stop_chars: Set[str]):
         """
         Set stop characters for text on left from TLD.
         Stop characters are used when determining end of URL.
@@ -332,7 +334,7 @@
 
         self._stop_chars_left = stop_chars
 
-    def get_stop_chars_right(self):
+    def get_stop_chars_right(self) -> Set[str]:
         """
         Returns set of stop chars for text on right from TLD.
 
@@ -341,7 +343,7 @@
         """
         return self._stop_chars_right
 
-    def set_stop_chars_right(self, stop_chars):
+    def set_stop_chars_right(self, stop_chars: Set[str]):
         """
         Set stop characters for text on right from TLD.
         Stop characters are used when determining end of URL.
@@ -357,7 +359,7 @@
 
         self._stop_chars_right = stop_chars
 
-    def get_enclosures(self):
+    def get_enclosures(self) -> Set[Tuple[str, str]]:
         """
         Returns set of enclosure pairs that might be used to enclosure URL.
         For example brackets (example.com), [example.com], {example.com}
@@ -367,7 +369,7 @@
         """
         return self._enclosure
 
-    def add_enclosure(self, left_char, right_char):
+    def add_enclosure(self, left_char: str, right_char: str):
         """
         Add new enclosure pair of characters. That and should be removed
         when their presence is detected at beginning and end of found URL
@@ -381,7 +383,7 @@
 
         self._after_tld_chars = self._get_after_tld_chars()
 
-    def remove_enclosure(self, left_char, right_char):
+    def remove_enclosure(self, left_char: str, right_char: str):
         """
         Remove enclosure pair from set of enclosures.
 
@@ -397,8 +399,8 @@
         self._after_tld_chars = self._get_after_tld_chars()
 
     def _complete_url(
-        self, text, tld_pos, tld, check_dns=False, with_schema_only=False
-    ):
+        self, text: str, tld_pos: int, tld: str, check_dns=False, with_schema_only=False
+    ) -> str:
         """
         Expand string in both sides to match whole URL.
 
@@ -486,6 +488,9 @@
         # URL should not start with two backslashes
         if complete_url.startswith("//"):
             complete_url = complete_url[2:]
+        # URL should not start with unreserved characters
+        if complete_url.startswith(("-", ".", "~", "_")):
+            complete_url = complete_url[1:]
         if not self._is_domain_valid(
             complete_url, tld, check_dns=check_dns, with_schema_only=with_schema_only
         ):
@@ -493,7 +498,7 @@
 
         return complete_url
 
-    def _validate_tld_match(self, text, matched_tld, tld_pos):
+    def _validate_tld_match(self, text: str, matched_tld: str, tld_pos: int) -> bool:
         """
         Validate TLD match - tells if at found position is really TLD.
 
@@ -517,7 +522,9 @@
 
         return False
 
-    def _is_domain_valid(self, url, tld, check_dns=False, with_schema_only=False):
+    def _is_domain_valid(
+        self, url: str, tld: str, check_dns=False, with_schema_only=False
+    ):
         """
         Checks if given URL has valid domain name (ignores subdomains)
 
@@ -570,6 +577,10 @@
         url_parts = uritools.urisplit(url)
         # <scheme>://<authority>/<path>?<query>#<fragment>
 
+        # authority can't start with @
+        if url_parts.authority.startswith('@'):
+            return False
+
         # if URI contains user info and schema was automatically added
         # the url is probably an email
         if url_parts.getuserinfo() and added_schema:
@@ -653,7 +664,7 @@
 
         return True
 
-    def _remove_enclosure_from_url(self, text_url, tld_pos, tld):
+    def _remove_enclosure_from_url(self, text_url: str, tld_pos: int, tld: str) -> str:
         """
         Removes enclosure characters from URL given in text_url.
         For example: (example.com) -> example.com
@@ -707,7 +718,7 @@
         return new_url
 
     @staticmethod
-    def _split_markdown(text_url, tld_pos):
+    def _split_markdown(text_url: str, tld_pos: int) -> str:
         """
         Split markdown URL. There is an issue wen Markdown URL is found.
         Parsing of the URL does not stop on right place so wrongly found URL
@@ -736,7 +747,8 @@
 
         return text_url
 
     @staticmethod
-    def _get_tld_pos(url, tld):
+    # TODO: fix DOC to accomodate to return value
+    def _get_tld_pos(url: str, tld: str) -> int:
         """
         Return position of TLD in hostname.
 
@@ -751,9 +763,11 @@
         offset = url.find(host)
         return host.rfind(tld) + offset
 
+    # TODO: move type assertion to be Generator based
+    # found https://stackoverflow.com/a/38423388/14669675
     def gen_urls(
-        self, text, check_dns=False, get_indices=False, with_schema_only=False
-    ):
+        self, text: str, check_dns=False, get_indices=False, with_schema_only=False
+    ) -> Generator[Union[str, Tuple[str, Tuple[int, int]]], None, None]:
         """
         Creates generator over found URLs in given text.
 
@@ -814,12 +828,12 @@
 
     def find_urls(
         self,
-        text,
+        text: str,
         only_unique=False,
         check_dns=False,
         get_indices=False,
         with_schema_only=False,
-    ):
+    ) -> List[Union[str, Tuple[str, Tuple[int, int]]]]:
         """
         Find all URLs in given text.
 
@@ -847,7 +861,7 @@
             return list(OrderedDict.fromkeys(urls))
         return list(urls)
 
-        result_urls = []
+        result_urls: List[Union[str, Tuple[str, Tuple[int, int]]]] = []
         url = next(urls, "")
         url_count = 1
         while url:
@@ -867,7 +881,7 @@
             return list(OrderedDict.fromkeys(result_urls))
         return result_urls
 
-    def has_urls(self, text, check_dns=False, with_schema_only=False):
+    def has_urls(self, text: str, check_dns=False, with_schema_only=False) -> bool:
         """
         Checks if text contains any valid URL.
         Returns True if text contains at least one URL.
@@ -928,7 +942,8 @@
     """
     import argparse
 
-    def get_args():
+    # TODO: add type checking here
+    def get_args() -> Namespace:
         """Parse programs arguments"""
         parser = argparse.ArgumentParser(
             description="urlextract - prints out all URLs that were "
@@ -1046,10 +1061,10 @@
         args.input_file.close()
 
 
-def dns_cache_install():
+def dns_cache_install() -> None:
     try:
-        from dns import resolver as dnspython_resolver_module
-        from dns_cache.resolver import ExceptionCachingResolver
+        from dns import resolver as dnspython_resolver_module  # type: ignore
+        from dns_cache.resolver import ExceptionCachingResolver  # type: ignore
 
         if not dnspython_resolver_module.default_resolver:
             dnspython_resolver_module.default_resolver = ExceptionCachingResolver()
@@ -1058,7 +1073,7 @@
         pass
 
     try:
-        from dns.resolver import (
+        from dns.resolver import (  # type: ignore
             LRUCache,
             Resolver,
             _resolver,