Hello community, here is the log from the commit of package urlwatch for openSUSE:Factory checked in at 2018-06-08 23:16:18 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/urlwatch (Old) and /work/SRC/openSUSE:Factory/.urlwatch.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "urlwatch" Fri Jun 8 23:16:18 2018 rev:11 rq:614546 version:2.13 Changes: -------- --- /work/SRC/openSUSE:Factory/urlwatch/urlwatch.changes 2018-05-22 17:02:34.102901647 +0200 +++ /work/SRC/openSUSE:Factory/.urlwatch.new/urlwatch.changes 2018-06-08 23:16:24.210031361 +0200 @@ -1,0 +2,12 @@ +Wed Jun 6 11:14:19 UTC 2018 - kbabi...@suse.com + +- Update to 2.13: + * Added support for specifying a `diff_tool` (e.g. `wdiff`) for each job + * Added support for testing filters via `--test-filter JOB` + * Remove default parameter from internal `html2text` module (Fixes #239) + * Better error/exception reporting in `--verbose` mode (Fixes #164) + +- Update to 2.12: + * Bugfix: Do not 'forget' old data if an exception occurs + +------------------------------------------------------------------- Old: ---- urlwatch-2.11.tar.gz New: ---- urlwatch-2.13.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ urlwatch.spec ++++++ --- /var/tmp/diff_new_pack.dVaQR7/_old 2018-06-08 23:16:25.022002035 +0200 +++ /var/tmp/diff_new_pack.dVaQR7/_new 2018-06-08 23:16:25.026001890 +0200 @@ -17,7 +17,7 @@ Name: urlwatch -Version: 2.11 +Version: 2.13 Release: 0 Summary: A tool for monitoring webpages for updates License: BSD-3-Clause @@ -65,7 +65,7 @@ %files %defattr(-,root,root,-) -%doc ChangeLog README.md +%doc CHANGELOG* README* %license COPYING* %{_bindir}/%{name} %{_mandir}/man1/%{name}.1%{ext_man} ++++++ urlwatch-2.11.tar.gz -> urlwatch-2.13.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.11/CHANGELOG.md new/urlwatch-2.13/CHANGELOG.md --- old/urlwatch-2.11/CHANGELOG.md 1970-01-01 01:00:00.000000000 +0100 +++ new/urlwatch-2.13/CHANGELOG.md 2018-06-03 14:42:56.000000000 +0200 @@ -0,0 +1,112 @@ +# Changelog + +All notable changes to this project will be documented in this file. 
+ +The format mostly follows [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). + +## [2.13] -- 2018-06-03 + +### Added +- Support for specifying a `diff_tool` (e.g. `wdiff`) for each job (Fixes #243) +- Support for testing filters via `--test-filter JOB` (Fixes #237) + +### Changed +- Moved ChangeLog file to CHANGELOG.md and using Keep a Changelog format. +- Force version check in `setup.py`, to exclude Python 2 (Fixes #244) +- Remove default parameter from internal `html2text` module (Fixes #239) +- Better error/exception reporting in `--verbose` mode (Fixes #164) + +### Removed +- Old ChangeLog entries + + +## [2.12] -- 2018-06-01 + +### Fixed +- Bugfix: Do not 'forget' old data if an exception occurs (Fixes #242) + + +## [2.11] -- 2018-05-19 + +### Fixed +- Retry: Make sure `tries` is initialized to zero on load (Fixes #241) + +### Changed +- html2text: Make sure the bs4 method strips HTML tags (by Louis Sautier) + + +## [2.10] -- 2018-05-17 + +### Added +- Browser: Add support for browser jobs using `requests-html` (Fixes #215) +- Retry: Add support for optional retry count in job list (by cmichi, fixes #235) +- HTTP: Add support for specifying optional headers (by Tero Mononen) + +### Changed +- File editing: Fix issue when `$EDITOR` contains spaces (Fixes #220) +- ChangeLog: Add versions to recent ChangeLog entries (Fixes #235) + + +## [2.9] -- 2018-03-24 + +### Added +- E-Mail: Add support for `--smtp-login` and document GMail SMTP usage +- Pushover: Device and sound attribute (by Tobias Haupenthal) + +### Changed +- XDG: Move cache file to `XDG_CACHE_DIR` (by Maxime Werlen) +- Migration: Unconditionally migrate urlwatch 1.x cache dirs (Fixes #206) + +### Fixed +- Cleanups: Fix out-of-date debug message, use https (by Jakub Wilk) + + +## [2.8] -- 2018-01-28 + +### Changed +- Documentation: Mention `appdirs` (by e-dschungel) + +### Fixed +- SMTP: Fix handling of missing `user` field (by e-dschungel) +- Manpage: Fix documentation of XDG environment 
variables (by Jelle van der Waa) +- Unit tests: Fix imports for out-of-source-tree tests (by Maxime Werlen) + + +## [2.7] -- 2017-11-08 + +### Added +- Filtering: `style` (by gvandenbroucke), `tag` (by cmichi) +- New reporter: Telegram support (by gvandenbroucke) +- Paths: Add `XDG_CONFIG_DIR` support (by Jelle van der Waa) + +### Changed +- ElementsByAttribute: look for matching tag in handle_endtag (by Gaetan Leurent) +- HTTP: Option to avoid 304 responses, `Content-Type` header (by Vinicius Massuchetto) +- html2text: Configuration options (by Vinicius Massuchetto) + +### Fixed +- Issue #127: Fix error reporting +- E-Mail: Fix encodings (by Seokjin Han), Allow `user` parameter for SMTP (by Jay Sitter) + + +## [2.6] -- 2016-12-04 + +### Added +- New filters: `sha1sum`, `hexdump`, `element-by-class` +- New reporters: pushbullet (by R0nd); mailgun (by lechuckcaptain) + +### Changed +- Improved filters: `BeautifulSoup` support for `html2txt` (by lechuckcaptain) +- Improved handlers: HTTP Proxy (by lechuckcaptain); support for `file://` URIs +- CI Integration: Build configuration for Travis CI (by lechuckcaptain) +- Consistency: Feature list is now sorted by name + +### Fixed +- Issue #108: Fix creation of example files on first startup +- Issue #118: Fix match filters for missing keys +- Small fixes by: Jakub Wilk, Marc Urben, Adam Dobrawy and Louis Sautier + + +Older ChangeLog entries can be found in the +[old ChangeLog file](https://github.com/thp/urlwatch/blob/2.12/ChangeLog), +or with `git show 2.12:ChangeLog` on the command line. 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.11/ChangeLog new/urlwatch-2.13/ChangeLog --- old/urlwatch-2.11/ChangeLog 2018-05-19 18:42:00.000000000 +0200 +++ new/urlwatch-2.13/ChangeLog 1970-01-01 01:00:00.000000000 +0100 @@ -1,239 +0,0 @@ -2008-03-04 Thomas Perl <thp.io/about> - * Initial Version - -2008-03-17 Thomas Perl <thp.io/about> - * Release version 1.0 - -2008-03-20 Lukas Vana <fab...@fabian.cz> - * Add support for error handling missing URLs - * Notify users when NEW sites appear - * Option "display_errors" can be set in watch.py - -2008-03-22 Thomas Perl <thp.io/about> - * Release version 1.1 - -2008-05-09 Lukas Upton <hagaku...@gmail.com> - * Fix problem with Mac OS X 10.5.2 and Ubuntu 8.04 - -2008-05-10 Thomas Perl <thp.io/about> - * Release version 1.2 - -2008-05-15 Craig Hoffman <craig.hoffm...@gmail.com> - * Add support for sending a User-Agent header - -2008-05-16 Thomas Perl <thp.io/about> - * Release version 1.3 - -2008-11-14 Thomas Perl <thp.io/about> - + Add example for using HTML Tidy (needs python-utidylib) - + Add example for using the ical2txt module (needs python-vobject) - + Add ical2txt.py module for converting ics to plaintext - * More comments in hooks.py for better user documentation - * Release version 1.4 - -2008-11-18 Thomas Perl <thp.io/about> - * Support for installing into the system - * Use ~/.urlwatch/ for config, cache and hooks - * Apply BSD license - * Add setup.py (and remove makefile) - * Command-line options - * Verbose logging mode - * Example urls.txt and hooks.py - * Update README - * Add manpage (urlwatch.1) - * Release version 1.5 - -2008-12-23 Thomas Perl <thp.io/about> - * Use hashlib in Python 2.5 and above for SHA-1 generation - * Release version 1.6 - -2009-01-03 Thomas Perl <thp.io/about> - * Add urlwatch.html2txt module to convert/format HTML to plaintext - * Add example of using html2txt in the example hooks file - * The html-to-plaintext feature has 
been suggested by Evert Meulie - * Release version 1.7 - -2009-01-05 Thomas Perl <thp.io/about> - * Fix a problem with relative links in Lynx' "-dump" mode - -2009-01-07 Thomas Perl <thp.io/about> - * Fix another problem with file-relative links in html2text w/ Lynx - -2009-01-12 Thomas Perl <thp.io/about> - * Describe ical2txt and html2txt with examples in manpage - -2009-01-15 Thomas Perl <thp.io/about> - * Add TODO list - -2009-01-20 Thomas Perl <thp.io/about> - * Set the socket timeout to one minute to avoid hangs - -2009-07-27 Thomas Perl <thp.io/about> - * Catch and handle IOErrors from FTP timeouts - -2009-08-01 Thomas Perl <thp.io/about> - * Add error handling for socket timeouts (HTTP mode) - -2009-08-10 Thomas Perl <thp.io/about> - * Handle httplib errors (Debian bug 529740) - (Thanks to Bastian Kleineidam and Franck Joncourt) - * urlwatch 1.8 released - -2009-09-29 Thomas Perl <thp.io/about> - * Support for shell pipe (|) in urls.txt - * Support for If-Modified-Since header + HTTP 304 - * Show previous/current timestamp in diff output - * Remove TODO list - * urlwatch 1.9 released - -2010-05-10 Thomas Perl <thp.io/about> - * Get encoding from headers and convert to UTF-8 - (suggested by Ján Ondrej) - * urlwatch 1.10 released - -2010-07-30 Thomas Perl <thp.io/about> - * Detect non-zero shell command exit codes and raise an error - * urlwatch 1.11 released - -2011-02-10 Thomas Perl <thp.io/about> - * Allow None as return value for filters - (if a filter returns None, interpret it as "don't filter") - * Update website URL, contact info and copyright years - * urlwatch 1.12 released - -2011-08-22 Thomas Perl <thp.io/about> - * Support for POST requests (suggested by Sébastien Fricker) - * Use concurrent.futures for parallel execution (needs Python 3.2 - or "futures" from PyPI for older Python versions, including 2.x) - * Various code changes to enhance compatibility with Python 3 - * Add convert-to-python3.sh script to convert the codebase into - Python 3 
format using the "2to3" utility included with Python - * urlwatch 1.13 released - -2011-11-15 Thomas Perl <thp.io/about> - * Fix an encoding issue related to the html2txt module (thanks to - Thomas Dziedzic for reporting this issue and testing the patch) - * urlwatch 1.14 released - -2012-08-30 Thomas Perl <thp.io/about> - * Merge changes from Slavko <slav...@slavino.sk> related to UTF-8 - and html2txt, this has been tested on Debian-based systems - * urlwatch 1.15 released - -2012-09-13 Xavier Izard <ctrl.alt....@free.fr> - * Added basic support for email delivery, using internal SMTP lib. - (see options --mailto, --mailfrom and --smtp) - -2013-03-11 Thomas Perl <thp.io/about> - * Minimalistic, automatic setup.py script (based on jabberbot) - * Move files around ({examples,urlwatch.1} -> share/...) - * Update Python 3 migration script and MANIFEST.in with new paths - -2013-11-23 Thomas Perl <thp.io/about> - * Fix a bug with parsing content-encoding headers - -2014-01-29 Thomas Perl <thp.io/about> - * Update manpage - * urlwatch 1.16 released - -2014-08-01 Thomas Perl <thp.io/about> - * Handle invalid encoding sent by server (fixes Debian bug 731931) - * Fix lynx handing for relative URLs (fixes Debian bug 732112) - * Fix resolving of relative URL filenames (fixes Debian bug 748905) - * urlwatch 1.17 released - -2015-02-27 Thomas Perl <thp.io/about> - * Fallback to using pwd if os.getlogin() fails (fixes #2) - * Handle HTTP compression (Content-encoding: gzip/deflate) - * Add option to suppress output on stdout (-q/--quiet) - * Allow customizing subject when sending e-mail (-S/--subject) - * Added support for TLS and SMTP auth (-p/--pass, -T/--tls, -A/--auth) - * Added support for specifying cache directory (-c/--cache) - * Add support for HTTP Auth to urlwatch.handler (fixes #10) - -2016-01-16 Thomas Perl <thp.io/about> - * Version 2.0 with lots of changes, only a few listed here - * Requires Python 3, support for Python 2 dropped - * Uses SQLite 3 / minidb for 
cache storage - * Uses PyYAML for the URL list and configuration file - * Subclass-based hooking features - * Custom job types by subclassing Job - * Custom reporters by subclassing ReporterBase - * Custom filters by subclassing FilterBase - * Old data will be migrated as good as possible to the new formats - -2016-02-03 Thomas Perl <thp.io/about> - * Replace urllib usage with requests (by Louis Sautier) - * Add cookies support (by Louis Sautier) - * Convert README to Markdown (README.md, by Louis Sautier) - * Add a new auto-applying filter that uses regexes, fixes #37 (by Louis Sautier) - * Use setuptools, install dependencies (Fixes #33) - * Fix HTTP basic authentication (Fixes #26) - * Add ssl_no_verify option for UrlJob - * Update list of dependencies (add requests) - * Fix unit tests for files only in source tree (Fixes #34) - * Add test/data to source tarball (#34) - * Workaround a requests shortcoming related to encoding - -2016-06-14 Thomas Perl <thp.io/about> - * Add support for pushover (by Richard Palmer) - * html2txt: Use -nonumbers and UTF-8 output for Lynx - * Fix SMTP server connection setup (fixes #50) - * setup.py: Allow running from non-source directory (Fixes #52) - * Fix adding URLs with = in them (Fixes #59) - * Add option to use sendmail instead of SMTP (by e-dschungel) - * Add InverseGrepFilter which removes lines matching a regex (by e-dschungel) - * New html2text method "pyhtml2text" using the Python module "html2text" (by e-dschungel) - -2016-07-12 Thomas Perl <thp.io/about> - * Check current directory and use os.path.relpath (Fixes #73) - * Add link to watched location in email report (by Guillaume Maudoux) - * setup.py: Remove the discovery logic that fails with pip, just hardcode most things - * Windows compatibility fixes (os.rename, shelljob checks) - * Do not copy example files if they do not exist - * Handle SIGPIPE (fixes #77) - -2016-12-04 Thomas Perl <thp.io/about> [2.6] - * New filters: sha1sum, hexdump, element-by-class - * New 
reporters: pushbullet (by R0nd); mailgun (by lechuckcaptain) - * Improved filters: BeautifulSoup support for html2txt (by lechuckcaptain) - * Improved handlers: HTTP Proxy (by lechuckcaptain); support for file:// URIs - * CI Integration: Build configuration for Travis CI (by lechuckcaptain) - * Consistency: Feature list is now sorted by name - * Issue #108: Fix creation of example files on first startup - * Issue #118: Fix match filters for missing keys - * Small fixes by: Jakub Wilk, Marc Urben, Adam Dobrawy and Louis Sautier - -2017-11-08 Thomas Perl <thp.io/about> [2.7] - * Issue #127: Fix error reporting - * ElementsByAttribute: look for matching tag in handle_endtag (by Gaetan Leurent) - * Paths: Add XDG_CONFIG_DIR support (by Jelle van der Waa) - * E-Mail: Fix encodings (by Seokjin Han), Allow 'user' parameter for SMTP (by Jay Sitter) - * HTTP: Option to avoid 304 responses, Content-Type header (by Vinicius Massuchetto) - * html2text: Configuration options (by Vinicius Massuchetto) - * Filtering: style (by gvandenbroucke), tag (by cmichi) - * New reporter: Telegram support (by gvandenbroucke) - -2018-01-28 Thomas Perl <m...@thp.io> [2.8] - * Documentation: Mention appdirs (by e-dschungel) - * SMTP: Fix handling of missing user field (by e-dschungel) - * Manpage: Fix documentation of XDG environment variables (by Jelle van der Waa) - * Unit tests: Fix imports for out-of-source-tree tests (by Maxime Werlen) - -2018-03-24 Thomas Perl <m...@thp.io> [2.9] - * Pushover: Device and sound attribute (by Tobias Haupenthal) - * XDG: Move cache file to XDG_CACHE_DIR (by Maxime Werlen) - * E-Mail: Add support for --smtp-login and document GMail SMTP usage - * Cleanups: Fix out-of-date debug message, use https (by Jakub Wilk) - * Migration: Unconditionally migrate urlwatch 1.x cache dirs (Fixes #206) - -2018-05-17 Thomas Perl <m...@thp.io> [2.10] - * File editing: Fix issue when $EDITOR contains spaces (Fixes #220) - * Browser: Add support for browser jobs using 
requests-html (Fixes #215) - * Retry: Add support for optional retry count in job list (by cmichi, fixes #235) - * HTTP: Add support for specifying optional headers (by Tero Mononen) - * ChangeLog: Add versions to recent ChangeLog entries (Fixes #235) - -2018-05-19 Thomas Perl <m...@thp.io> [2.11] - * Retry: Make sure "tries" is initialized to zero on load (Fixes #241) - * html2text: Make sure the bs4 method strips HTML tags (by Louis Sautier) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.11/MANIFEST.in new/urlwatch-2.13/MANIFEST.in --- old/urlwatch-2.11/MANIFEST.in 2018-05-19 18:42:00.000000000 +0200 +++ new/urlwatch-2.13/MANIFEST.in 2018-06-03 14:42:56.000000000 +0200 @@ -1,3 +1,3 @@ -include ChangeLog COPYING README.md +include CHANGELOG.md COPYING README.md recursive-include share * recursive-include test/data * diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.11/README.md new/urlwatch-2.13/README.md --- old/urlwatch-2.11/README.md 2018-05-19 18:42:00.000000000 +0200 +++ new/urlwatch-2.13/README.md 2018-06-03 14:42:56.000000000 +0200 @@ -113,12 +113,37 @@ your urls.yaml page without requiring a custom hook where previously you would have needed to write custom filtering code in Python. 
+If you are using the `grep` filter, you can grep for a comma (`,`) +by using `\054` (`:` does not need to be escaped separately and +can be used as-is), for example to convert HTML to text, then grep +for `a,b:`, and then strip whitespace, use this: + +```yaml +url: https://example.org/ +filter: html2text,grep:a\054b:,strip +``` + If you want to extract only the body tag you can use this filter: ```yaml url: https://thp.io/2008/urlwatch/ filter: element-by-tag:body ``` +You can also specify an external `diff`-style tool (a tool that takes +two filenames (old, new) as parameters and returns on its standard output +the difference of the files), for example to use GNU `wdiff` to get +word-based differences instead of line-based differences: + +```yaml +url: https://example.com/ +diff_tool: wdiff +``` + +Note that `diff_tool` specifies an external command-line tool, so that +tool must be installed separately (e.g. `apt install wdiff` on Debian or +`brew install wdiff` on macOS). Coloring is supported for `wdiff`-style +output, but potentially not for other diff tools. + PUSHOVER -------- @@ -197,6 +222,23 @@ password. +TESTING FILTERS +--------------- + +While creating your filter pipeline, you might want to preview what the filtered +output looks like. You can do so by first configuring your job and then running +urlwatch with the `--test-filter` command, passing in the index (from `--list`) +or the URL/location of the job to be tested: + +``` +urlwatch --test-filter 1 # Test the first job in the list +urlwatch --test-filter https://example.net/ # Test the job with the given URL +``` + +The output of this command will be the filtered plaintext of the job; this is the +output that will (in a real urlwatch run) be the input to the diff algorithm. 
+ + CONTACT ------- diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.11/lib/urlwatch/__init__.py new/urlwatch-2.13/lib/urlwatch/__init__.py --- old/urlwatch-2.11/lib/urlwatch/__init__.py 2018-05-19 18:42:00.000000000 +0200 +++ new/urlwatch-2.13/lib/urlwatch/__init__.py 2018-06-03 14:42:56.000000000 +0200 @@ -12,5 +12,5 @@ __author__ = 'Thomas Perl <m...@thp.io>' __license__ = 'BSD' __url__ = 'https://thp.io/2008/urlwatch/' -__version__ = '2.11' +__version__ = '2.13' __user_agent__ = '%s/%s (+https://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.11/lib/urlwatch/command.py new/urlwatch-2.13/lib/urlwatch/command.py --- old/urlwatch-2.11/lib/urlwatch/command.py 2018-05-19 18:42:00.000000000 +0200 +++ new/urlwatch-2.13/lib/urlwatch/command.py 2018-06-03 14:42:56.000000000 +0200 @@ -35,7 +35,8 @@ import sys from .filters import FilterBase -from .jobs import JobBase +from .handler import JobState +from .jobs import JobBase, UrlJob from .reporters import ReporterBase from .util import atomic_rename, edit_file from .mailer import set_password, have_password @@ -102,26 +103,45 @@ print('%d: %s' % (idx + 1, pretty_name)) return 0 + def _find_job(self, query): + try: + index = int(query) + if index <= 0: + return None + try: + return self.urlwatcher.jobs[index - 1] + except IndexError: + return None + except ValueError: + return next((job for job in self.urlwatcher.jobs if job.get_location() == query), None) + + def test_filter(self): + job = self._find_job(self.urlwatch_config.test_filter) + if job is None: + print('Not found: %r' % (self.urlwatch_config.test_filter,)) + return 1 + + if isinstance(job, UrlJob): + # Force re-retrieval of job, as we're testing filters + job.ignore_cached = True + + job_state = JobState(self.urlwatcher.cache_storage, job) + job_state.process() + print(job_state.new_data) + # We 
do not save the job state or job on purpose here, since we are possibly modifying the job + # (ignore_cached) and we do not want to store the newly-retrieved data yet (filter testing) + return 0 + def modify_urls(self): save = True if self.urlwatch_config.delete is not None: - try: - index = int(self.urlwatch_config.delete) - 1 - try: - job = self.urlwatcher.jobs.pop(index) - print('Removed %r' % (job,)) - except IndexError: - print('Not found: %r' % (index,)) - save = False - except ValueError: - job = next((job for job in self.urlwatcher.jobs if job.get_location() == self.urlwatch_config.delete), - None) - try: - self.urlwatcher.jobs.remove(job) - print('Removed %r' % (job,)) - except ValueError: - print('Not found: %r' % (self.urlwatch_config.delete,)) - save = False + job = self._find_job(self.urlwatch_config.delete) + if job is not None: + self.urlwatcher.jobs.remove(job) + print('Removed %r' % (job,)) + else: + print('Not found: %r' % (self.urlwatch_config.delete,)) + save = False if self.urlwatch_config.add is not None: d = {k: v for k, v in (item.split('=', 1) for item in self.urlwatch_config.add.split(','))} @@ -144,6 +164,8 @@ sys.exit(self.urlwatcher.urls_storage.edit(self.urlwatch_config.urls_yaml_example)) if self.urlwatch_config.edit_hooks: sys.exit(self.edit_hooks()) + if self.urlwatch_config.test_filter: + sys.exit(self.test_filter()) if self.urlwatch_config.list: sys.exit(self.list_urls()) if self.urlwatch_config.add is not None or self.urlwatch_config.delete is not None: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.11/lib/urlwatch/config.py new/urlwatch-2.13/lib/urlwatch/config.py --- old/urlwatch-2.11/lib/urlwatch/config.py 2018-05-19 18:42:00.000000000 +0200 +++ new/urlwatch-2.13/lib/urlwatch/config.py 2018-06-03 14:42:56.000000000 +0200 @@ -94,6 +94,7 @@ group.add_argument('--list', action='store_true', help='list jobs') group.add_argument('--add', metavar='JOB', help='add job 
(key1=value1,key2=value2,...)') group.add_argument('--delete', metavar='JOB', help='delete job by location or index') + group.add_argument('--test-filter', metavar='JOB', help='test filter output of job by location or index') group = parser.add_argument_group('interactive commands ($EDITOR/$VISUAL)') group.add_argument('--edit', action='store_true', help='edit URL/job list') group.add_argument('--edit-config', action='store_true', help='edit configuration file') diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.11/lib/urlwatch/handler.py new/urlwatch-2.13/lib/urlwatch/handler.py --- old/urlwatch-2.11/lib/urlwatch/handler.py 2018-05-19 18:42:00.000000000 +0200 +++ new/urlwatch-2.13/lib/urlwatch/handler.py 2018-06-03 14:42:56.000000000 +0200 @@ -58,6 +58,10 @@ self.tries = 0 def save(self): + if self.new_data is None and self.exception is not None: + # If no new data has been retrieved due to an exception, use the old job data + self.new_data = self.old_data + self.cache_storage.save(self.job, self.job.get_guid(), self.new_data, time.time(), self.tries) def process(self): @@ -107,7 +111,9 @@ def _result(self, verb, job_state): if job_state.exception is not None: - logger.debug('Got exception while processing %r: %s', job_state.job, job_state.exception) + # TODO: Once we require Python >= 3.5, we can just pass in job_state.exception as "exc_info" parameter + exc_info = (type(job_state.exception), job_state.exception, job_state.exception.__traceback__) + logger.debug('Got exception while processing %r', job_state.job, exc_info=exc_info) job_state.verb = verb self.job_states.append(job_state) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.11/lib/urlwatch/html2txt.py new/urlwatch-2.13/lib/urlwatch/html2txt.py --- old/urlwatch-2.11/lib/urlwatch/html2txt.py 2018-05-19 18:42:00.000000000 +0200 +++ new/urlwatch-2.13/lib/urlwatch/html2txt.py 2018-06-03 
14:42:56.000000000 +0200 @@ -36,14 +36,13 @@ logger = logging.getLogger(__name__) -def html2text(data, method='lynx', options=None): - +def html2text(data, method, options): """ Convert a string consisting of HTML to plain text for easy difference checking. Method may be one of: - 'lynx' (default) - Use "lynx -dump" for conversion + 'lynx' - Use "lynx -dump" for conversion options: see "lynx -help" output for options that work with "-dump" 'html2text' - Use "html2text -nobs" for conversion options: https://linux.die.net/man/1/html2text @@ -54,9 +53,6 @@ 'pyhtml2text' - Use Python module "html2text" options: https://github.com/Alir3z4/html2text/blob/master/docs/usage.md#available-options """ - if options is None: - options = {} - if method == 're': stripped_tags = re.sub(r'<[^>]*>', '', data) d = '\n'.join((l.rstrip() for l in stripped_tags.splitlines() if l.strip() != '')) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.11/lib/urlwatch/jobs.py new/urlwatch-2.13/lib/urlwatch/jobs.py --- old/urlwatch-2.11/lib/urlwatch/jobs.py 2018-05-19 18:42:00.000000000 +0200 +++ new/urlwatch-2.13/lib/urlwatch/jobs.py 2018-06-03 14:42:56.000000000 +0200 @@ -146,7 +146,7 @@ class Job(JobBase): __required__ = () - __optional__ = ('name', 'filter', 'max_tries') + __optional__ = ('name', 'filter', 'max_tries', 'diff_tool') def pretty_name(self): return self.name if self.name else self.get_location() diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.11/lib/urlwatch/reporters.py new/urlwatch-2.13/lib/urlwatch/reporters.py --- old/urlwatch-2.11/lib/urlwatch/reporters.py 2018-05-19 18:42:00.000000000 +0200 +++ new/urlwatch-2.13/lib/urlwatch/reporters.py 2018-06-03 14:42:56.000000000 +0200 @@ -28,6 +28,10 @@ import difflib +import tempfile +import subprocess +import re +import shlex import email.utils import itertools import logging @@ -55,6 +59,11 @@ logger = 
logging.getLogger(__name__) +# Regular expressions that match the added/removed markers of GNU wdiff output +WDIFF_ADDED_RE = r'[{][+].*?[+][}]' +WDIFF_REMOVED_RE = r'[[][-].*?[-][]]' + + class ReporterBase(object, metaclass=TrackSubClasses): __subclasses__ = {} @@ -98,6 +107,21 @@ raise NotImplementedError() def unified_diff(self, job_state): + if job_state.job.diff_tool is not None: + with tempfile.NamedTemporaryFile() as old_file, tempfile.NamedTemporaryFile() as new_file: + old_file.write(job_state.old_data.encode('utf-8')) + old_file.flush() + new_file.write(job_state.new_data.encode('utf-8')) + new_file.flush() + cmdline = shlex.split(job_state.job.diff_tool) + [old_file.name, new_file.name] + proc = subprocess.Popen(cmdline, stdout=subprocess.PIPE) + stdout, _ = proc.communicate() + # Diff tools return 0 for "nothing changed" or 1 for "files differ", anything else is an error + if proc.returncode in (0, 1): + return stdout.decode('utf-8') + else: + raise subprocess.CalledProcessError(result, cmdline) + timestamp_old = email.utils.formatdate(job_state.timestamp, localtime=1) timestamp_new = email.utils.formatdate(time.time(), localtime=1) return ''.join(difflib.unified_diff([l + '\n' for l in job_state.old_data.splitlines()], @@ -306,6 +330,10 @@ body = '\n'.join(super().submit()) for line in body.splitlines(): + # Basic colorization for wdiff-style differences + line = re.sub(WDIFF_ADDED_RE, lambda x: self._green(x.group(0)), line) + line = re.sub(WDIFF_REMOVED_RE, lambda x: self._red(x.group(0)), line) + # FIXME: This isn't ideal, but works for now... 
if line in separators: print(line) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.11/setup.py new/urlwatch-2.13/setup.py --- old/urlwatch-2.11/setup.py 2018-05-19 18:42:00.000000000 +0200 +++ new/urlwatch-2.13/setup.py 2018-06-03 14:42:56.000000000 +0200 @@ -4,11 +4,15 @@ import os import re +import sys main_py = open(os.path.join('lib', 'urlwatch', '__init__.py')).read() m = dict(re.findall("\n__([a-z]+)__ = '([^']+)'", main_py)) docs = re.findall('"""(.*?)"""', main_py, re.DOTALL) +if sys.version_info < (3, 3): + sys.exit('urlwatch requires Python 3.3 or newer') + m['name'] = 'urlwatch' m['author'], m['author_email'] = re.match(r'(.*) <(.*)>', m['author']).groups() m['description'], m['long_description'] = docs[0].strip().split('\n\n', 1) @@ -16,6 +20,7 @@ m['scripts'] = ['urlwatch'] m['package_dir'] = {'': 'lib'} m['packages'] = ['urlwatch'] +m['python_requires'] = '>3.3.0' m['data_files'] = [ ('share/man/man1', ['share/man/man1/urlwatch.1']), ('share/urlwatch/examples', [