Hello community, here is the log from the commit of package python-html2text for openSUSE:Factory checked in at 2016-03-16 10:34:32 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/python-html2text (Old) and /work/SRC/openSUSE:Factory/.python-html2text.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-html2text" Changes: -------- --- /work/SRC/openSUSE:Factory/python-html2text/python-html2text.changes 2015-04-21 10:53:48.000000000 +0200 +++ /work/SRC/openSUSE:Factory/.python-html2text.new/python-html2text.changes 2016-03-16 10:34:33.000000000 +0100 @@ -1,0 +2,38 @@ +Sun Jan 3 21:18:57 UTC 2016 - frei...@opensuse.org + +Update to version 2015-11.4: + +* Fix #38: Long links wrapping controlled by `--no-wrap-links`. +* Note: `--no-wrap-links` implies `--reference-links` +* Feature #83: Add callback-on-tag. +* Fix #87: Decode errors can be handled via command line. +* Feature #95: Docs, decode errors spelling mistake. +* Fix #84: Make bodywidth kwarg overridable using config. + + +Additional changes from version 2015.6.21 + +* Fix #31: HTML entities stay inside link. +* Fix #71: Coverage detects command line tests. +* Fix #39: Documentation update. +* Fix #61: Functionality added for optional use of automatic links. +* Feature #80: ``title`` attribute is preserved in both inline and reference links. +* Feature #82: More command line options. See docs. + +Additional changes from version 2015.6.12 + +* Feature #76: Making ``pre`` blocks clearer for further automatic formatting. +* Fix #71: Coverage detects tests carried out in ``subprocesses`` + +Additional changes from version 2015.6.6 + +* Fix #24: ``3.200.3`` vs ``2014.7.3`` output quirks. +* Fix #61. Malformed links in markdown output. +* Feature #62: Automatic version number. +* Fix #63: Nested code, anchor bug. +* Fix #64: Proper handling of anchors with content that starts with tags. +* Feature #67: Documentation all over the module. +* Feature #70: Adding tests for the module. +* Fix #73: Typo in config documentation. + +------------------------------------------------------------------- Old: ---- html2text-2015.4.14.tar.gz New: ---- html2text-2015.11.4.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-html2text.spec ++++++ --- /var/tmp/diff_new_pack.eg5XFb/_old 2016-03-16 10:34:34.000000000 +0100 +++ /var/tmp/diff_new_pack.eg5XFb/_new 2016-03-16 10:34:34.000000000 +0100 @@ -1,7 +1,7 @@ # # spec file for package python-html2text # -# Copyright (c) 2015 SUSE LINUX GmbH, Nuernberg, Germany. +# Copyright (c) 2016 SUSE LINUX GmbH, Nuernberg, Germany. # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -17,7 +17,7 @@ Name: python-html2text -Version: 2015.4.14 +Version: 2015.11.4 Release: 0 Url: https://github.com/Alir3z4/html2text/ Summary: Turn HTML into equivalent Markdown-structured text ++++++ html2text-2015.4.14.tar.gz -> html2text-2015.11.4.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/AUTHORS.rst new/html2text-2015.11.4/AUTHORS.rst --- old/html2text-2015.4.14/AUTHORS.rst 2015-04-13 17:48:22.000000000 +0200 +++ new/html2text-2015.11.4/AUTHORS.rst 2015-11-04 15:32:38.000000000 +0100 @@ -14,6 +14,11 @@ * Miguel Tavares <mgon...@gmail.com> * Scott Blackburn <sc...@skipflag.com> * Peter Wu <pe...@lekensteyn.nl> +* Arjoonn Sharma <gh: theSage21> +* Ali Mohammad <gh: alawibaba> +* Albert Berger <gh: nbdsp> +* Etienne Millon <m...@emillon.org> +* John C F <gh: critiqjo> Maintainer: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/ChangeLog.rst new/html2text-2015.11.4/ChangeLog.rst --- old/html2text-2015.4.14/ChangeLog.rst 2015-04-14 19:05:05.000000000 +0200 +++ new/html2text-2015.11.4/ChangeLog.rst 2015-11-04 15:48:46.000000000 +0100 @@ -1,3 +1,49 @@ +2015.11.4 +========= +---- + +* Fix #38: Long links wrapping controlled by `--no-wrap-links`. +* Note: `--no-wrap-links` implies `--reference-links` +* Feature #83: Add callback-on-tag. +* Fix #87: Decode errors can be handled via command line. +* Feature #95: Docs, decode errors spelling mistake. +* Fix #84: Make bodywidth kwarg overridable using config. + + +2015.6.21 +========= +---- + +* Fix #31: HTML entities stay inside link. +* Fix #71: Coverage detects command line tests. +* Fix #39: Documentation update. +* Fix #61: Functionality added for optional use of automatic links. +* Feature #80: ``title`` attribute is preserved in both inline and reference links. +* Feature #82: More command line options. See docs. + + +2015.6.12 +========= +---- + +* Feature #76: Making ``pre`` blocks clearer for further automatic formatting. +* Fix #71: Coverage detects tests carried out in ``subprocesses`` + + +2015.6.6 +======== +---- + +* Fix #24: ``3.200.3`` vs ``2014.7.3`` output quirks. +* Fix #61. Malformed links in markdown output. +* Feature #62: Automatic version number. +* Fix #63: Nested code, anchor bug. +* Fix #64: Proper handling of anchors with content that starts with tags. +* Feature #67: Documentation all over the module. +* Feature #70: Adding tests for the module. +* Fix #73: Typo in config documentation. + + 2015.4.14 ========= ---- @@ -16,7 +62,7 @@ 2015.2.18 -========== +========= ---- * Fix #38: Anchor tags with empty text or with `<img>` tags inside are no longer stripped. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/PKG-INFO new/html2text-2015.11.4/PKG-INFO --- old/html2text-2015.4.14/PKG-INFO 2015-04-14 19:09:30.000000000 +0200 +++ new/html2text-2015.11.4/PKG-INFO 2015-11-04 16:23:02.000000000 +0100 @@ -1,6 +1,6 @@ Metadata-Version: 1.1 Name: html2text -Version: 2015.4.14 +Version: 2015.11.4 Summary: Turn HTML into equivalent Markdown-structured text. Home-page: https://github.com/Alir3z4/html2text/ Author: Alireza Savand @@ -23,3 +23,4 @@ Classifier: Programming Language :: Python :: 3.1 Classifier: Programming Language :: Python :: 3.2 Classifier: Programming Language :: Python :: 3.3 +Classifier: Programming Language :: Python :: 3.4 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/README.md new/html2text-2015.11.4/README.md --- old/html2text-2015.4.14/README.md 2015-04-14 19:05:05.000000000 +0200 +++ new/html2text-2015.11.4/README.md 2015-11-04 15:32:38.000000000 +0100 @@ -15,24 +15,16 @@ Usage: `html2text [(filename|url) [encoding]]` - | Option | Description |--------------------------------------------------------|--------------------------------------------------- | `--version` | Show program's version number and exit | `-h`, `--help` | Show this help message and exit | `--ignore-links` | Don't include any formatting for links -|`--protect-links` | Protect links from line breaks surrounding them "+" with angle brackets -|`--ignore-images` | Don't include any formatting for images -|`--images-to-alt` | Discard image data, only keep alt text -|`--images-with-size` | Write image tags with height and width attrs as raw html to retain dimensions -|`-g`, `--google-doc` | Convert an html-exported Google Document -|`-d`, `--dash-unordered-list` | Use a dash rather than a star for unordered list items -|`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH` | Number of characters per output line, `0` for no wrap -|`-i` `LIST_INDENT`, `--google-list-indent`=`LIST_INDENT`| Number of pixels Google indents nested lists -|`-s`, `--hide-strikethrough` | Hide strike-through text. only relevent when `-g` is specified as well |`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. -| `--bypass-tables` | Format tables in HTML rather than Markdown syntax. -| `--single-line-break` | Use a single line break after a block element rather than two. +| `--reference-links` | Use reference links instead of links to create markdown +| `--mark-code` | Mark preformatted and code blocks with [code]...[/code] + +For a complete list of options see the [docs](docs/usage.md) Or you can use it from within `Python`: @@ -83,3 +75,14 @@ ## How to run unit tests PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v + +To see the coverage results: + + coverage combine + coverage html + +then open the `./htmlcov/index.html` file in your browser. + +## Documentation + +Documentation lives [here](docs/index.md) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/html2text/__init__.py new/html2text-2015.11.4/html2text/__init__.py --- old/html2text-2015.4.14/html2text/__init__.py 2015-04-14 19:09:03.000000000 +0200 +++ new/html2text-2015.11.4/html2text/__init__.py 2015-11-04 15:48:14.000000000 +0100 @@ -7,7 +7,7 @@ try: from textwrap import wrap -except ImportError: +except ImportError: # pragma: no cover pass from html2text.compat import urlparse, HTMLParser @@ -29,7 +29,7 @@ skipwrap ) -__version__ = "2015.4.14" +__version__ = (2015, 11, 4) # TODO: @@ -50,29 +50,36 @@ self.split_next_td = False self.td_count = 0 self.table_start = False - self.unicode_snob = config.UNICODE_SNOB - self.escape_snob = config.ESCAPE_SNOB + self.unicode_snob = config.UNICODE_SNOB # covered in cli + self.escape_snob = config.ESCAPE_SNOB # covered in cli self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH - self.body_width = bodywidth - self.skip_internal_links = config.SKIP_INTERNAL_LINKS - self.inline_links = config.INLINE_LINKS - self.protect_links = config.PROTECT_LINKS - self.google_list_indent = config.GOOGLE_LIST_INDENT - self.ignore_links = config.IGNORE_ANCHORS - self.ignore_images = config.IGNORE_IMAGES - self.images_to_alt = config.IMAGES_TO_ALT - self.images_with_size = config.IMAGES_WITH_SIZE - self.ignore_emphasis = config.IGNORE_EMPHASIS - self.bypass_tables = config.BYPASS_TABLES - self.google_doc = False - self.ul_item_mark = '*' - self.emphasis_mark = '_' + self.body_width = bodywidth # covered in cli + self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli + self.inline_links = config.INLINE_LINKS # covered in cli + self.protect_links = config.PROTECT_LINKS # covered in cli + self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli + self.ignore_links = config.IGNORE_ANCHORS # covered in cli + self.ignore_images = config.IGNORE_IMAGES # covered in cli + self.images_to_alt = config.IMAGES_TO_ALT # covered in cli + self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli + self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli + self.bypass_tables = config.BYPASS_TABLES # covered in cli + self.google_doc = False # covered in cli + self.ul_item_mark = '*' # covered in cli + self.emphasis_mark = '_' # covered in cli self.strong_mark = '**' + self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli + self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli + self.hide_strikethrough = False # covered in cli + self.mark_code = config.MARK_CODE self.single_line_break = config.SINGLE_LINE_BREAK + self.use_automatic_links = config.USE_AUTOMATIC_LINKS + self.wrap_links = config.WRAP_LINKS # covered in cli + self.tag_callback = None - if out is None: + if out is None: # pragma: no cover self.out = self.outtextf - else: + else: # pragma: no cover self.out = out # empty list to store output characters before they are "joined" @@ -165,13 +172,14 @@ charref = self.charref(c) if not self.code and not self.pre: charref = cgi.escape(charref) - self.o(charref, 1) + self.handle_data(charref, True) def handle_entityref(self, c): entityref = self.entityref(c) - if not self.code and not self.pre and entityref != ' _place_holder;': + if (not self.code and not self.pre + and entityref != ' _place_holder;'): entityref = cgi.escape(entityref) - self.o(entityref, 1) + self.handle_data(entityref, True) def handle_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) @@ -187,9 +195,8 @@ self.a list. If the set of attributes is not found, returns None :rtype: int """ - if 'href' not in attrs: + if 'href' not in attrs: # pragma: no cover return None - i = -1 for a in self.a: i += 1 @@ -275,6 +282,18 @@ else: attrs = dict(attrs) + if self.tag_callback is not None: + if self.tag_callback(self, tag, attrs, start) is True: + return + + # first thing inside the anchor tag is another tag that produces some output + if (start and not self.maybe_automatic_link is None + and tag not in ['p', 'div', 'style', 'dl', 'dt'] + and (tag != "img" or self.ignore_images)): + self.o("[") + self.maybe_automatic_link = None + self.empty_link = False + if self.google_doc: # the attrs parameter is empty for a closing tag. in addition, we # need the attributes of the parent nodes in order to get a @@ -396,7 +415,13 @@ self.empty_link = False self.maybe_automatic_link = None if self.inline_links: - self.o("](" + escape_md(a['href']) + ")") + try: + title = escape_md(a['title']) + except KeyError: + self.o("](" + escape_md(a['href']) + ")") + else: + self.o("](" + escape_md(a['href']) + + ' "' + title + '" )') else: i = self.previousIndex(a) if i is not None: @@ -553,16 +578,22 @@ self.pre = 1 else: self.pre = 0 + if self.mark_code: + self.out("\n[/code]") self.p() + # TODO: Add docstring for these one letter functions def pbr(self): + "Pretty print has a line break" if self.p_p == 0: self.p_p = 1 def p(self): + "Set pretty print to 1 or 2 lines" self.p_p = 1 if self.single_line_break else 2 def soft_br(self): + "Soft breaks" self.pbr() self.br_toggle = ' ' @@ -598,6 +629,9 @@ #self.out(" :") #TODO: not output when already one there if not data.startswith("\n"): # <pre>stuff... data = "\n" + data + if self.mark_code: + self.out("\n[code]") + self.p_p = 0 bq = (">" * self.blockquote) if not (force and data and data[0] == ">") and self.blockquote: @@ -668,7 +702,7 @@ self.out(data) self.outcount += 1 - def handle_data(self, data): + def handle_data(self, data, entity_char=False): if r'\/script>' in data: self.quiet -= 1 @@ -677,7 +711,8 @@ if not self.maybe_automatic_link is None: href = self.maybe_automatic_link - if href == data and self.absolute_url_matcher.match(href): + if (href == data and self.absolute_url_matcher.match(href) + and self.use_automatic_links): self.o("<" + data + ">") self.empty_link = False return @@ -686,11 +721,12 @@ self.maybe_automatic_link = None self.empty_link = False - if not self.code and not self.pre: + if not self.code and not self.pre and not entity_char: data = escape_md_section(data, snob=self.escape_snob) self.o(data, 1) - def unknown_decl(self, data): + def unknown_decl(self, data): # pragma: no cover + # TODO: what is this doing here? pass def charref(self, name): @@ -703,9 +739,12 @@ return unifiable_n[c] else: try: - return unichr(c) - except NameError: # Python3 - return chr(c) + try: + return unichr(c) + except NameError: # Python3 + return chr(c) + except ValueError: # invalid unicode + return '' def entityref(self, c): if not self.unicode_snob and c in config.UNIFIABLE.keys(): @@ -763,9 +802,14 @@ assert wrap, "Requires Python 2.3." result = '' newlines = 0 + # I cannot think of a better solution for now. + # To avoid the non-wrap behaviour for entire paras + # because of the presence of a link in it + if not self.wrap_links: + self.inline_links = False for para in text.split("\n"): if len(para) > 0: - if not skipwrap(para): + if not skipwrap(para, self.wrap_links): result += "\n".join(wrap(para, self.body_width)) if para.endswith(' '): result += " \n" @@ -788,7 +832,9 @@ return result -def html2text(html, baseurl='', bodywidth=config.BODY_WIDTH): +def html2text(html, baseurl='', bodywidth=None): + if bodywidth is None: + bodywidth = config.BODY_WIDTH h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) return h.handle(html) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/html2text/cli.py new/html2text-2015.11.4/html2text/cli.py --- old/html2text-2015.4.14/html2text/cli.py 2015-04-14 19:05:05.000000000 +0200 +++ new/html2text-2015.11.4/html2text/cli.py 2015-11-04 15:32:38.000000000 +0100 @@ -8,8 +8,27 @@ def main(): baseurl = '' - p = optparse.OptionParser('%prog [(filename|url) [encoding]]', - version='%prog ' + __version__) + class bcolors: # pragma: no cover + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + + p = optparse.OptionParser( + '%prog [(filename|url) [encoding]]', + version='%prog ' + ".".join(map(str, __version__)) + ) + p.add_option( + "--no-wrap-links", + dest="wrap_links", + action="store_false", + default=config.WRAP_LINKS, + help="wrap links during conversion" + ) p.add_option( "--ignore-emphasis", dest="ignore_emphasis", @@ -18,6 +37,13 @@ help="don't include any formatting for emphasis" ) p.add_option( + "--reference-links", + dest="inline_links", + action="store_false", + default=config.INLINE_LINKS, + help="use reference style links instead of inline links" + ) + p.add_option( "--ignore-links", dest="ignore_links", action="store_true", @@ -122,11 +148,54 @@ "line breaks. NOTE: Requires --body-width=0" ) ) + p.add_option( + "--unicode-snob", + action="store_true", + dest="unicode_snob", + default=config.UNICODE_SNOB, + help="Use unicode throughout document" + ) + p.add_option( + "--no-automatic-links", + action="store_false", + dest="use_automatic_links", + default=config.USE_AUTOMATIC_LINKS, + help="Do not use automatic links wherever applicable" + ) + p.add_option( + "--no-skip-internal-links", + action="store_false", + dest="skip_internal_links", + default=config.SKIP_INTERNAL_LINKS, + help="Do not skip internal links" + ) + p.add_option( + "--links-after-para", + action="store_true", + dest="links_each_paragraph", + default=config.LINKS_EACH_PARAGRAPH, + help="Put links after each paragraph instead of document" + ) + p.add_option( + "--mark-code", + action="store_true", + dest="mark_code", + default=config.MARK_CODE, + help="Mark program code blocks with [code]...[/code]" + ) + p.add_option( + "--decode-errors", + dest="decode_errors", + action="store", + type="string", + default=config.DECODE_ERRORS, + help="What to do in case of decode errors.'ignore', 'strict' and 'replace' are acceptable values" + ) (options, args) = p.parse_args() # process input encoding = "utf-8" - if len(args) > 0 and args[0] != '-': + if len(args) > 0 and args[0] != '-': # pragma: no cover file_ = args[0] if len(args) == 2: encoding = args[1] @@ -157,7 +226,18 @@ data = wrap_read() if hasattr(data, 'decode'): - data = data.decode(encoding) + try: + try: + data = data.decode(encoding, errors=options.decode_errors) + except TypeError: + # python 2.6.x does not have the errors option + data = data.decode(encoding) + except UnicodeDecodeError as err: + warning = bcolors.WARNING + "Warning:" + bcolors.ENDC + warning += ' Use the ' + bcolors.OKGREEN + warning += '--decode-errors=ignore' + bcolors.ENDC + 'flag.' + print(warning) + raise err h = HTML2Text(baseurl=baseurl) # handle options @@ -168,7 +248,7 @@ h.strong_mark = '__' h.body_width = options.body_width - h.list_indent = options.list_indent + h.google_list_indent = options.list_indent h.ignore_emphasis = options.ignore_emphasis h.ignore_links = options.ignore_links h.protect_links = options.protect_links @@ -180,5 +260,12 @@ h.escape_snob = options.escape_snob h.bypass_tables = options.bypass_tables h.single_line_break = options.single_line_break + h.inline_links = options.inline_links + h.unicode_snob = options.unicode_snob + h.use_automatic_links = options.use_automatic_links + h.skip_internal_links = options.skip_internal_links + h.links_each_paragraph = options.links_each_paragraph + h.mark_code = options.mark_code + h.wrap_links = options.wrap_links wrapwrite(h.handle(data)) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/html2text/compat.py new/html2text-2015.11.4/html2text/compat.py --- old/html2text-2015.4.14/html2text/compat.py 2014-12-03 14:53:22.000000000 +0100 +++ new/html2text-2015.11.4/html2text/compat.py 2015-11-04 15:32:38.000000000 +0100 @@ -1,12 +1,13 @@ -try: +import sys + + +if sys.version_info[0] == 2: import htmlentitydefs import urlparse import HTMLParser -except ImportError: # Python3 - import html.entities as htmlentitydefs + import urllib +else: import urllib.parse as urlparse + import html.entities as htmlentitydefs import html.parser as HTMLParser -try: # Python3 import urllib.request as urllib -except ImportError: - import urllib diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/html2text/config.py new/html2text-2015.11.4/html2text/config.py --- old/html2text-2015.4.14/html2text/config.py 2015-04-14 19:05:05.000000000 +0200 +++ new/html2text-2015.11.4/html2text/config.py 2015-11-04 15:32:38.000000000 +0100 @@ -23,6 +23,8 @@ # Protect links from line breaks surrounding them with angle brackets (in # addition to their square brackets) PROTECT_LINKS = False +# WRAP_LINKS = True +WRAP_LINKS = True # Number of pixels Google indents nested lists GOOGLE_LIST_INDENT = 36 @@ -32,6 +34,11 @@ IMAGES_TO_ALT = False IMAGES_WITH_SIZE = False IGNORE_EMPHASIS = False +MARK_CODE = False +DECODE_ERRORS = 'strict' + +# Convert links with same href and text to <href> format if they are absolute links +USE_AUTOMATIC_LINKS = True # For checking space-only lines on line 771 RE_SPACE = re.compile(r'\s\+') @@ -41,6 +48,7 @@ RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s') RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])") RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])") +RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)") # to find links in the text RE_MD_DOT_MATCHER = re.compile(r""" ^ # start of line (\s*\d+) # optional whitespace and a number diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/html2text/utils.py new/html2text-2015.11.4/html2text/utils.py --- old/html2text-2015.4.14/html2text/utils.py 2014-12-05 18:59:05.000000000 +0100 +++ new/html2text-2015.11.4/html2text/utils.py 2015-11-04 15:32:38.000000000 +0100 @@ -1,10 +1,11 @@ import sys -from html2text import config +from html2text import config from html2text.compat import htmlentitydefs def name2cp(k): + """Return sname to codepoint""" if k == 'apos': return ord("'") return htmlentitydefs.name2codepoint[k] @@ -20,7 +21,7 @@ if tag[0] == 'h' and len(tag) == 2: try: n = int(tag[1]) - if n in range(1, 10): + if n in range(1, 10): # pragma: no branch return n except ValueError: return 0 @@ -32,7 +33,10 @@ """ out = dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in - style.split(';') if ':' in z]]) + style.split(';') if ':' in z + ] + ] + ) return out @@ -58,7 +62,7 @@ try: elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements]) - except ValueError: + except ValueError: # pragma: no cover elements = {} # not that important return elements @@ -168,7 +172,11 @@ return 0 -def skipwrap(para): +def skipwrap(para, wrap_links): + # If it appears to contain a link + # don't wrap + if (len(config.RE_LINK.findall(para)) > 0) and not wrap_links: + return True # If the text begins with four spaces or one tab, it's a code block; # don't wrap if para[0:4] == ' ' or para[0] == '\t': @@ -204,7 +212,7 @@ sys.stdout.write(text) -def wrap_read(): +def wrap_read(): # pragma: no cover """ :rtype: str """ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/html2text.egg-info/PKG-INFO new/html2text-2015.11.4/html2text.egg-info/PKG-INFO --- old/html2text-2015.4.14/html2text.egg-info/PKG-INFO 2015-04-14 19:09:30.000000000 +0200 +++ new/html2text-2015.11.4/html2text.egg-info/PKG-INFO 2015-11-04 16:23:02.000000000 +0100 @@ -1,6 +1,6 @@ Metadata-Version: 1.1 Name: html2text -Version: 2015.4.14 +Version: 2015.11.4 Summary: Turn HTML into equivalent Markdown-structured text. Home-page: https://github.com/Alir3z4/html2text/ Author: Alireza Savand @@ -23,3 +23,4 @@ Classifier: Programming Language :: Python :: 3.1 Classifier: Programming Language :: Python :: 3.2 Classifier: Programming Language :: Python :: 3.3 +Classifier: Programming Language :: Python :: 3.4 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/html2text.egg-info/SOURCES.txt new/html2text-2015.11.4/html2text.egg-info/SOURCES.txt --- old/html2text-2015.4.14/html2text.egg-info/SOURCES.txt 2015-04-14 19:09:30.000000000 +0200 +++ new/html2text-2015.11.4/html2text.egg-info/SOURCES.txt 2015-11-04 16:23:02.000000000 +0100 @@ -20,13 +20,25 @@ test/GoogleDocMassDownload.md test/GoogleDocSaved.html test/GoogleDocSaved.md +test/GoogleDocSaved_two.html +test/GoogleDocSaved_two.md test/__init__.py -test/anchor-undefined-href.html -test/anchor-undefined-href.md +test/abbr_tag.html +test/abbr_tag.md +test/anchors.html +test/anchors.md +test/apos_element.html +test/apos_element.md +test/blockquote_example.html +test/blockquote_example.md test/bodywidth_newline.html test/bodywidth_newline.md +test/bold_inside_link.html +test/bold_inside_link.md test/css_import_no_semicolon.html test/css_import_no_semicolon.md +test/decript_tage.html +test/decript_tage.md test/doc_with_table.html test/doc_with_table.md test/doc_with_table_bypass.html @@ -35,8 +47,16 @@ test/emdash-para.md test/empty-link.html test/empty-link.md +test/flip_emphasis.html +test/flip_emphasis.md +test/header_tags.html +test/header_tags.md +test/horizontal_rule.html +test/horizontal_rule.md test/html-escaping.html test/html-escaping.md +test/html_entities_out_of_text.html +test/html_entities_out_of_text.md test/images_to_alt.html test/images_to_alt.md test/images_with_size.html @@ -45,10 +65,28 @@ test/img-tag-with-link.md test/invalid_start.html test/invalid_start.md +test/invalid_unicode.html +test/invalid_unicode.md +test/link_titles.html +test/link_titles.md +test/list_tags_example.html +test/list_tags_example.md +test/mark_code.html +test/mark_code.md test/nbsp.html test/nbsp.md test/nbsp_unicode.html test/nbsp_unicode.md +test/no_inline_links_example.html +test/no_inline_links_example.md +test/no_inline_links_images_to_alt.html +test/no_inline_links_images_to_alt.md +test/no_inline_links_nested.html +test/no_inline_links_nested.md +test/no_wrap_links.html +test/no_wrap_links.md +test/no_wrap_links_no_inline_links.html +test/no_wrap_links_no_inline_links.md test/normal.html test/normal.md test/normal_escape_snob.html diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/setup.py new/html2text-2015.11.4/setup.py --- old/html2text-2015.4.14/setup.py 2015-04-14 19:08:44.000000000 +0200 +++ new/html2text-2015.11.4/setup.py 2015-11-04 15:58:13.000000000 +0100 @@ -34,7 +34,7 @@ setup( name="html2text", - version="2015.4.14", + version=".".join(map(str, __import__('html2text').__version__)), description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="m...@aaronsw.com", @@ -58,7 +58,8 @@ 'Programming Language :: Python :: 3.0', 'Programming Language :: Python :: 3.1', 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3' + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', ], entry_points=""" [console_scripts] diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/GoogleDocMassDownload.html new/html2text-2015.11.4/test/GoogleDocMassDownload.html --- old/html2text-2015.4.14/test/GoogleDocMassDownload.html 2014-07-12 09:53:40.000000000 +0200 +++ new/html2text-2015.11.4/test/GoogleDocMassDownload.html 2015-11-04 15:32:38.000000000 +0100 @@ -6,7 +6,7 @@ @import url(https://themes.googleusercontent.com/fonts/css?kit=lhDjYqiy3mZ0x6ROQEUoUw);</STYLE> </HEAD> <BODY style="width:468pt;background-color:#ffffff;padding:72pt 72pt 72pt 72pt"> - <H1 style="padding-left:0;padding-right:0;padding-top:24pt;color:#000000;direction:ltr;font-size:24pt;margin:0;font-family:Arial;font-weight:bold;padding-bottom:6pt"> + <H1 style="padding-left:0;padding-right:0;padding-top:24pt;color:#000000;direction:ltr;font-size:24pt;margin:0;font-family:Arial;font-weight:bold;padding-bottom:6pt;text-decoration:underline"> <A name="h.xdvi2xfx4hkq"> </A> <SPAN> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/GoogleDocSaved.html new/html2text-2015.11.4/test/GoogleDocSaved.html --- old/html2text-2015.4.14/test/GoogleDocSaved.html 2014-07-12 09:53:40.000000000 +0200 +++ new/html2text-2015.11.4/test/GoogleDocSaved.html 2015-11-04 15:32:38.000000000 +0100 @@ -3,7 +3,7 @@ <TITLE> Sandbox</TITLE> <STYLE type="text/css"> - @import url('https://themes.googleusercontent.com/fonts/css?kit=lhDjYqiy3mZ0x6ROQEUoUw');ol{margin:0;padding:0}p{margin:0}.c12{list-style-type:disc;margin:0;padding:0}.c8{width:468pt;background-color:#ffffff;padding:72pt 72pt 72pt 72pt}.c2{padding-left:0pt;direction:ltr;margin-left:36pt}.c11{list-style-type:lower-latin;margin:0;padding:0}.c4{list-style-type:circle;margin:0;padding:0}.c1{padding-left:0pt;direction:ltr;margin-left:72pt}.c7{list-style-type:decimal;margin:0;padding:0}.c3{font-style:italic;font-family:Courier New}.c0{height:11pt;direction:ltr}.c5{font-weight:bold}.c9{font-family:Consolas}.c13{font-family:Courier New}.c6{direction:ltr}.c10{font-style:italic}body{color:#000000;font-size:11pt;font-family:Arial}h1{padding-top:24pt;color:#000000;font-size:24pt;font-family:Arial;font-weight:bold;padding-bottom:6pt}h2{padding-top:18pt;color:#000000;font-size:18pt;font-family:Arial;font-weight:bold;padding-bottom:4pt}h3{padding-top:14pt;color:#000000;font-size:14pt;font-family:Arial;font-weight:bold;padding-bottom:4pt}h4{padding-top:12pt;color:#000000;font-size:12pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}h5{padding-top:11pt;color:#000000;font-size:11pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}h6{padding-top:10pt;color:#000000;font-size:10pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}</STYLE> + @import url('https://themes.googleusercontent.com/fonts/css?kit=lhDjYqiy3mZ0x6ROQEUoUw');ol{margin:0;padding:0}p{margin:0}.c12{list-style-type:disc;margin:0;padding:0;text-decoration:none;}.c8{width:468pt;background-color:#ffffff;padding:72pt 72pt 72pt 72pt}.c2{padding-left:0pt;direction:ltr;margin-left:36pt}.c11{list-style-type:lower-latin;margin:0;padding:0}.c4{list-style-type:circle;margin:0;padding:0}.c1{padding-left:0pt;direction:ltr;margin-left:72pt}.c7{;margin:0;padding:0}.c3{font-style:italic;font-family:Courier New}.c0{height:11pt;direction:ltr}.c5{font-weight:bold}.c9{font-family:Consolas}.c13{font-family:Courier New}.c6{direction:ltr}.c10{font-style:italic}body{color:#000000;font-size:11pt;font-family:Arial}h1{padding-top:24pt;color:#000000;font-size:24pt;font-family:Arial;font-weight:bold;padding-bottom:6pt}h2{padding-top:18pt;color:#000000;font-size:18pt;font-family:Arial;font-weight:bold;padding-bottom:4pt}h3{padding-top:14pt;color:#000000;font-size:14pt;font-family:Arial;font-weight:bold;padding-bottom:4pt}h4{padding-top:12pt;color:#000000;font-size:12pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}h5{padding-top:11pt;color:#000000;font-size:11pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}h6{padding-top:10pt;color:#000000;font-size:10pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}</STYLE> </HEAD> <BODY class="c8"> <H1 class="c6"> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/GoogleDocSaved_two.html new/html2text-2015.11.4/test/GoogleDocSaved_two.html --- old/html2text-2015.4.14/test/GoogleDocSaved_two.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/GoogleDocSaved_two.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,147 @@ +<HTML> + <HEAD> + <TITLE> + Sandbox</TITLE> + <STYLE type="text/css"> + @import url('https://themes.googleusercontent.com/fonts/css?kit=lhDjYqiy3mZ0x6ROQEUoUw');ol{margin:0;padding:0}p{margin:0}.c12{list-style-type:disc;margin:0;padding:0;text-decoration:none;}.c8{text-decoration:line-through;width:468pt;background-color:#ffffff;padding:72pt 72pt 72pt 72pt}.c2{padding-left:0pt;direction:ltr;margin-left:36pt}.c11{list-style-type:lower-latin;margin:0;padding:0}.c4{list-style-type:circle;margin:0;padding:0}.c1{padding-left:0pt;direction:ltr;margin-left:72pt}.c7{;margin:0;padding:0}.c3{font-style:italic;font-family:Courier New}.c0{height:11pt;direction:ltr}.c5{font-weight:bold}.c9{font-family:Consolas}.c13{font-family:Courier New}.c6{direction:ltr}.c10{font-style:italic}body{color:#000000;font-size:11pt;font-family:Arial}h1{padding-top:24pt;color:#000000;font-size:24pt;font-family:Arial;font-weight:bold;padding-bottom:6pt}h2{padding-top:18pt;color:#000000;font-size:18pt;font-family:Arial;font-weight:bold;padding-bottom:4pt}h3{padding-top:14pt;color:#000000;font-size:14pt;font-family:Arial;font-weight:bold;padding-bottom:4pt}h4{padding-top:12pt;color:#000000;font-size:12pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}h5{padding-top:11pt;color:#000000;font-size:11pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}h6{padding-top:10pt;color:#000000;font-size:10pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}</STYLE> + </HEAD> + <BODY class="c8"> + <H1 class="c6"> + <A name="h.xdvi2xfx4hkq"> + </A> + <SPAN> + test doc</SPAN> + </H1> + <P class="c6"> + <SPAN> + first issue</SPAN> + </P> + <P class="c0"> + <SPAN> + </SPAN> + </P> + <OL class="c12" start="1"> + <LI class="c2"> + <SPAN> + bit</SPAN> + </LI> + <LI class="c2"> + <SPAN class="c5 c10"> + bold italic</SPAN> + </LI> + </OL> + <OL class="c4" start="1"> + <LI class="c1"> + <SPAN> + orange</SPAN> + </LI> + <LI class="c1"> + <SPAN> + apple</SPAN> + </LI> + </OL> + <OL class="c12" start="3"> + <LI class="c2"> + <SPAN> + final</SPAN> + </LI> + </OL> + <P class="c0"> + <SPAN> + </SPAN> + </P> + <P class="c6"> + <SPAN> + text to separate lists</SPAN> + </P> + <P class="c0"> + <SPAN> + </SPAN> + </P> + <OL class="c7" start="1"> + <LI class="c2"> + <SPAN> + now with numbers</SPAN> + </LI> + <LI class="c2"> + <SPAN> + the prisoner</SPAN> + </LI> + </OL> + <OL class="c11" start="1"> + <LI class="c1"> + <SPAN> + not an </SPAN> + <SPAN class="c10"> + italic number</SPAN> + </LI> + <LI class="c1"> + <SPAN> + a </SPAN> + <SPAN class="c5"> + bold human</SPAN> + <SPAN> + being</SPAN> + </LI> + </OL> + <OL class="c7" start="3"> + <LI class="c2"> + <SPAN> + end</SPAN> + </LI> + </OL> + <P class="c0"> + <SPAN> + </SPAN> + </P> + <P class="c6"> + <SPAN class="c5"> + bold</SPAN> + </P> + <P class="c6"> + <SPAN class="c10"> + italic</SPAN> + </P> + <P class="c0"> + <SPAN> + </SPAN> + </P> + <P class="c6"> + <SPAN class="c9"> + def func(x):</SPAN> + </P> + <P class="c6"> + <SPAN class="c9"> + if x < 1:</SPAN> + </P> + <P class="c6"> + <SPAN class="c9"> + return 'a'</SPAN> + </P> + <P class="c6"> + <SPAN class="c9"> + return 'b'</SPAN> + </P> + <P class="c0"> + <SPAN> + </SPAN> + </P> + <P class="c6"> + <SPAN> + Some </SPAN> + <SPAN class="c13"> + fixed width text</SPAN> + <SPAN> + here</SPAN> + </P> + <P class="c6"> + <SPAN class="c3"> + italic fixed width text</SPAN> + </P> + <P class="c0"> + <SPAN> + </SPAN> + </P> + </BODY> +</HTML> + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/abbr_tag.html new/html2text-2015.11.4/test/abbr_tag.html --- old/html2text-2015.4.14/test/abbr_tag.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/abbr_tag.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,2 @@ +<abbr title="Three Letter Acronym">TLA</abbr> +<abbr>xyz</abbr> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/abbr_tag.md new/html2text-2015.11.4/test/abbr_tag.md --- old/html2text-2015.4.14/test/abbr_tag.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/abbr_tag.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,4 @@ +TLA xyz + + *[TLA]: Three Letter Acronym + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/anchor-undefined-href.html new/html2text-2015.11.4/test/anchor-undefined-href.html --- old/html2text-2015.4.14/test/anchor-undefined-href.html 2014-07-12 09:56:14.000000000 +0200 +++ new/html2text-2015.11.4/test/anchor-undefined-href.html 1970-01-01 01:00:00.000000000 +0100 @@ -1,5 +0,0 @@ -<html> - <body> - <a href class="nolink">anchor</a> - </body> -</html> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/anchor-undefined-href.md new/html2text-2015.11.4/test/anchor-undefined-href.md --- old/html2text-2015.4.14/test/anchor-undefined-href.md 2014-07-12 09:56:14.000000000 +0200 +++ new/html2text-2015.11.4/test/anchor-undefined-href.md 1970-01-01 01:00:00.000000000 +0100 @@ -1,2 +0,0 @@ -anchor - diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/anchors.html new/html2text-2015.11.4/test/anchors.html --- old/html2text-2015.4.14/test/anchors.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/anchors.html 2015-06-04 09:58:16.000000000 +0200 @@ -0,0 +1,7 @@ +<h1>Processing hyperlinks</h1> + +<p>Additional hyperlink tests!</p> + +<a href="http://some.link"><b>Bold Link</b></a> +<a href="http://some.link/filename.py"><code>filename.py</code></a> +<a href="http://some.link/magicsources.py">The source code is called <code>magic.py</code></a> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/anchors.md new/html2text-2015.11.4/test/anchors.md --- old/html2text-2015.4.14/test/anchors.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/anchors.md 2015-06-04 09:58:16.000000000 +0200 @@ -0,0 +1,8 @@ +# Processing hyperlinks + +Additional hyperlink tests! + +[**Bold Link**](http://some.link) +[`filename.py`](http://some.link/filename.py) [The source code is called +`magic.py`](http://some.link/magicsources.py) + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/apos_element.html new/html2text-2015.11.4/test/apos_element.html --- old/html2text-2015.4.14/test/apos_element.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/apos_element.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,5 @@ +<html> + <body> + ' + </body> +</html> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/apos_element.md new/html2text-2015.11.4/test/apos_element.md --- old/html2text-2015.4.14/test/apos_element.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/apos_element.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,2 @@ +' + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/blockquote_example.html new/html2text-2015.11.4/test/blockquote_example.html --- old/html2text-2015.4.14/test/blockquote_example.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/blockquote_example.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,3 @@ +<blockquote> +The time has come, the Walrus said, to speak of many things. +</blockquote> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/blockquote_example.md new/html2text-2015.11.4/test/blockquote_example.md --- old/html2text-2015.4.14/test/blockquote_example.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/blockquote_example.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,2 @@ +> The time has come, the Walrus said, to speak of many things. + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/bold_inside_link.html new/html2text-2015.11.4/test/bold_inside_link.html --- old/html2text-2015.4.14/test/bold_inside_link.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/bold_inside_link.html 2015-11-04 15:38:27.000000000 +0100 @@ -0,0 +1,2 @@ +<a href="link.htm"><b>Text</b></a> +<a href='/nothing/'><b>sample</b></a> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/bold_inside_link.md new/html2text-2015.11.4/test/bold_inside_link.md --- old/html2text-2015.4.14/test/bold_inside_link.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/bold_inside_link.md 2015-11-04 15:39:00.000000000 +0100 @@ -0,0 +1,2 @@ +[**Text**](link.htm) [**sample**](/nothing/) + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/decript_tage.html new/html2text-2015.11.4/test/decript_tage.html --- old/html2text-2015.4.14/test/decript_tage.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/decript_tage.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,3 @@ +<del>something</del> +<strike>something</strike> +<s>something</s> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/decript_tage.md new/html2text-2015.11.4/test/decript_tage.md --- old/html2text-2015.4.14/test/decript_tage.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/decript_tage.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,2 @@ +<del>something</del> <strike>something</strike> <s>something</s> + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/flip_emphasis.html new/html2text-2015.11.4/test/flip_emphasis.html --- old/html2text-2015.4.14/test/flip_emphasis.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/flip_emphasis.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,2 @@ +<i>Something</i> +<b>else</b> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/flip_emphasis.md new/html2text-2015.11.4/test/flip_emphasis.md --- old/html2text-2015.4.14/test/flip_emphasis.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/flip_emphasis.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,2 @@ +*Something* __else__ + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/header_tags.html new/html2text-2015.11.4/test/header_tags.html --- old/html2text-2015.4.14/test/header_tags.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/header_tags.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,17 @@ +<html> + <body> + <h1>H1</h1> + <h2>H2</h2> + <h3>H3</h3> + <h4>H4</h4> + <h5>H5</h5> + <h6>H6</h6> + <h7>H7</h7> + <h8>H8</h8> + <h9>H9</h9> + <h10>H10</h10> + <h11>H11</h11> + <h12>H12</h12> + <h>NO number</h> + </body> +</html> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/header_tags.md new/html2text-2015.11.4/test/header_tags.md --- old/html2text-2015.4.14/test/header_tags.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/header_tags.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,20 @@ +# H1 + +## H2 + +### H3 + +#### H4 + +##### H5 + +###### H6 + +####### H7 + +######## H8 + +######### H9 + +H10 H11 H12 NO number + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/horizontal_rule.html new/html2text-2015.11.4/test/horizontal_rule.html --- old/html2text-2015.4.14/test/horizontal_rule.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/horizontal_rule.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,5 @@ +<html> + <body> + <hr> + </body> +</html> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/horizontal_rule.md new/html2text-2015.11.4/test/horizontal_rule.md --- old/html2text-2015.4.14/test/horizontal_rule.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/horizontal_rule.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,2 @@ +* * * + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/html_entities_out_of_text.html new/html2text-2015.11.4/test/html_entities_out_of_text.html --- old/html2text-2015.4.14/test/html_entities_out_of_text.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/html_entities_out_of_text.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1 @@ +<a href="http://thth">állás: Country Manager</a> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/html_entities_out_of_text.md new/html2text-2015.11.4/test/html_entities_out_of_text.md --- old/html2text-2015.4.14/test/html_entities_out_of_text.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/html_entities_out_of_text.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,2 @@ +[allas: Country Manager](http://thth) + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/images_with_size.html new/html2text-2015.11.4/test/images_with_size.html --- old/html2text-2015.4.14/test/images_with_size.html 2015-04-14 19:05:05.000000000 +0200 +++ new/html2text-2015.11.4/test/images_with_size.html 2015-11-04 15:32:38.000000000 +0100 @@ -4,4 +4,7 @@ <img src='image_with_width.jpg' alt='An image with a height attr' height='300' data-ignored='ignored data' /> -<img src='image_with_width_and_height.jpg' alt='An image with width and height' width='300' height='300' id='ignored-id' /> \ No newline at end of file +<img src='image_with_width_and_height.jpg' alt='An image with width and height' width='300' height='300' id='ignored-id' /> +<img src='image_with_width_and_height.jpg' width='300' height='300' id='ignored-id' /> +<img src='image_with_width_and_height.jpg' id='ignored-id' /> +<img id='ignored-id' /> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/images_with_size.md new/html2text-2015.11.4/test/images_with_size.md --- old/html2text-2015.4.14/test/images_with_size.md 2015-04-14 19:05:05.000000000 +0200 +++ new/html2text-2015.11.4/test/images_with_size.md 2015-11-04 15:32:38.000000000 +0100 @@ -2,5 +2,6 @@ src='image_with_width.jpg' width='300' alt='An image with a width attr' /> <img src='image_with_width.jpg' height='300' alt='An image with a height attr' /> <img src='image_with_width_and_height.jpg' width='300' height='300' alt='An -image with width and height' /> +image with width and height' /> <img src='image_with_width_and_height.jpg' +width='300' height='300' /> ![](image_with_width_and_height.jpg) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/invalid_unicode.html new/html2text-2015.11.4/test/invalid_unicode.html --- old/html2text-2015.4.14/test/invalid_unicode.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/invalid_unicode.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1 @@ +B�r diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/invalid_unicode.md new/html2text-2015.11.4/test/invalid_unicode.md --- old/html2text-2015.4.14/test/invalid_unicode.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/invalid_unicode.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,2 @@ +Br + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/link_titles.html new/html2text-2015.11.4/test/link_titles.html --- old/html2text-2015.4.14/test/link_titles.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/link_titles.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,3 @@ +<a href="http://example.com" title="MyTitle"> first example</a> +<br> +<a href="http://example.com" > second example</a> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/link_titles.md new/html2text-2015.11.4/test/link_titles.md --- old/html2text-2015.4.14/test/link_titles.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/link_titles.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,3 @@ +[ first example](http://example.com "MyTitle" ) +[ second example](http://example.com) + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/list_tags_example.html new/html2text-2015.11.4/test/list_tags_example.html --- old/html2text-2015.4.14/test/list_tags_example.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/list_tags_example.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,39 @@ +<dl> +<dt>Definition List</dt> +<dd>A list of terms and their definitions/descriptions.</dd> +<dt>Ordered List</dt> +<dd>A numbered list.</dd> +<dt>Unordered List</dt> +<dd>An unnumbered list.</dd> +</dl> + +<h4>Example 2</h4> +<dl> +<dt>Vocals</dt> +<dd>Bruce Dickinson</dd> +<dt>Guitar</dt> +<dd>Adrian Smith</dd> +<dd>Dave Murray</dd> +<dd>Janick Gers</dd> +<dt>Bass</dt> +<dd>Steve Harris</dd> +<dt>Drums</dt> +<dd>Nicko McBrain</dd> +</dl> + +<ul> +<li>some item</li> +<li>Some other item</li> +<li>some item</li> +</ul> + +<ol> +<li>Some other item</li> +<li>some item</li> +<li>some item</li> +</ol> + +<ul style="list-style-type:ordered;"> +<li>somthing else here</li> +<li>some item</li> +</ul> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/list_tags_example.md new/html2text-2015.11.4/test/list_tags_example.md --- old/html2text-2015.4.14/test/list_tags_example.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/list_tags_example.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,36 @@ +Definition List + + A list of terms and their definitions/descriptions. +Ordered List + + A numbered list. +Unordered List + + An unnumbered list. + +#### Example 2 + +Vocals + + Bruce Dickinson +Guitar + + Adrian Smith + Dave Murray + Janick Gers +Bass + + Steve Harris +Drums + + Nicko McBrain + + * some item + * Some other item + * some item + 1. Some other item + 2. some item + 3. some item + * somthing else here + * some item + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/mark_code.html new/html2text-2015.11.4/test/mark_code.html --- old/html2text-2015.4.14/test/mark_code.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/mark_code.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,12 @@ +<html> + <body> +<p>Normal text with 'pre' code block.</p> +<pre> +import os + +def function(): + a = 1 +</pre> +<p>Normal text continues.</p> +</body> +</html> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/mark_code.md new/html2text-2015.11.4/test/mark_code.md --- old/html2text-2015.4.14/test/mark_code.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/mark_code.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,13 @@ +Normal text with 'pre' code block. + +[code] + + import os + + def function(): + a = 1 + +[/code] + +Normal text continues. + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/no_inline_links_example.html new/html2text-2015.11.4/test/no_inline_links_example.html --- old/html2text-2015.4.14/test/no_inline_links_example.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/no_inline_links_example.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,9 @@ +<a href='http://google.com'>Googler</a> +<a> No href</a> +<a title="some title"> No href but title available</a> +<a href='http://example.com' title="Example title"> Example</a> +<a href="http://example.com" title="abc"> +<a href="http://example.com" title="abc"> +<a href="http://example.com" title="abc"> +link text +</a></a></a> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/no_inline_links_example.md new/html2text-2015.11.4/test/no_inline_links_example.md --- old/html2text-2015.4.14/test/no_inline_links_example.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/no_inline_links_example.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,9 @@ +[Googler][1] No href No href but title available [ Example][2] [ [ [ link text +][3]][3]][3] + + [1]: http://google.com + + [2]: http://example.com (Example title) + + [3]: http://example.com (abc) + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/no_inline_links_images_to_alt.html new/html2text-2015.11.4/test/no_inline_links_images_to_alt.html --- old/html2text-2015.4.14/test/no_inline_links_images_to_alt.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/no_inline_links_images_to_alt.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,7 @@ +<a href="http://example.com"> +<img src="http://example.com/img.png" alt="ALT TEXT" /> +</a> +<br> +<a href="http://example.com"><img src="http://example.com/img.png" alt="ALT TEXT" /></a> +<br> +<a href="http://example.com"><img src="http://example.com/img.png" alt="http://example.com" /></a> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/no_inline_links_images_to_alt.md new/html2text-2015.11.4/test/no_inline_links_images_to_alt.md --- old/html2text-2015.4.14/test/no_inline_links_images_to_alt.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/no_inline_links_images_to_alt.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,8 @@ +[ ![ALT TEXT][1] ][2] +[![ALT TEXT][1]][2] +[![http://example.com][1]][2] + + [1]: http://example.com/img.png + + [2]: http://example.com + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/no_inline_links_nested.html new/html2text-2015.11.4/test/no_inline_links_nested.html --- old/html2text-2015.4.14/test/no_inline_links_nested.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/no_inline_links_nested.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1 @@ +<a href='http://google.com'><a href='/test2/'>this</a>that</a> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/no_inline_links_nested.md new/html2text-2015.11.4/test/no_inline_links_nested.md --- old/html2text-2015.4.14/test/no_inline_links_nested.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/no_inline_links_nested.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,6 @@ +[[this][1]that][2] + + [1]: /test2/ + + [2]: http://google.com + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/no_wrap_links.html new/html2text-2015.11.4/test/no_wrap_links.html --- old/html2text-2015.4.14/test/no_wrap_links.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/no_wrap_links.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1 @@ +And <a href="http://bugs.debian.org/cgi-bin/pkgreport.cgi?tag=multiarch;users=debian-d...@lists.debian.org">here</a> is a long link I had at hand.</p> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/no_wrap_links.md new/html2text-2015.11.4/test/no_wrap_links.md --- old/html2text-2015.4.14/test/no_wrap_links.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/no_wrap_links.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,2 @@ +And [here](http://bugs.debian.org/cgi-bin/pkgreport.cgi?tag=multiarch;users=debian-d...@lists.debian.org) is a long link I had at hand. + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/no_wrap_links_no_inline_links.html new/html2text-2015.11.4/test/no_wrap_links_no_inline_links.html --- old/html2text-2015.4.14/test/no_wrap_links_no_inline_links.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/no_wrap_links_no_inline_links.html 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1 @@ +And <a href="http://bugs.debian.org/cgi-bin/pkgreport.cgi?tag=multiarch;users=debian-d...@lists.debian.org">here</a> is a long link I had at hand. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/no_wrap_links_no_inline_links.md new/html2text-2015.11.4/test/no_wrap_links_no_inline_links.md --- old/html2text-2015.4.14/test/no_wrap_links_no_inline_links.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2015.11.4/test/no_wrap_links_no_inline_links.md 2015-11-04 15:32:38.000000000 +0100 @@ -0,0 +1,2 @@ +And [here](http://bugs.debian.org/cgi-bin/pkgreport.cgi?tag=multiarch;users=debian-d...@lists.debian.org) is a long link I had at hand. + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2015.4.14/test/test_html2text.py new/html2text-2015.11.4/test/test_html2text.py --- old/html2text-2015.4.14/test/test_html2text.py 2015-04-14 19:05:05.000000000 +0200 +++ new/html2text-2015.11.4/test/test_html2text.py 2015-11-04 15:32:38.000000000 +0100 @@ -65,6 +65,13 @@ return result, actual +def test_function(fn, **kwargs): + with open(fn) as inf: + actual = html2text.html2text(inf.read(), **kwargs) + result = get_baseline(fn) + return result, actual + + def get_dump_name(fn, suffix): return '%s-%s_output.md' % (os.path.splitext(fn)[0], suffix) @@ -93,13 +100,18 @@ def test_cmd(self): # Because there is no command-line option to control unicode_snob - if not 'unicode_snob' in module_args: + if 'unicode_snob' not in module_args: self.maxDiff = None result, actual = test_command(fn, *cmdline_args) self.assertEqual(result, actual) + def test_func(self): + result, actual = test_function(fn, **func_args) + self.assertEqual(result, actual) + module_args = {} cmdline_args = [] + func_args = {} base_fn = os.path.basename(fn).lower() if base_fn.startswith('google'): @@ -123,9 +135,10 @@ cmdline_args.append('--bypass-tables') if base_fn.startswith('bodywidth'): - #module_args['unicode_snob'] = True + # module_args['unicode_snob'] = True module_args['body_width'] = 0 cmdline_args.append('--body-width=0') + func_args['bodywidth'] = 0 if base_fn.startswith('protect_links'): module_args['protect_links'] = True @@ -145,17 +158,34 @@ module_args['single_line_break'] = True cmdline_args.append('--single-line-break') - return test_mod, test_cmd + if base_fn.startswith('no_inline_links'): + module_args['inline_links'] = False + cmdline_args.append('--reference-links') + + if base_fn.startswith('no_wrap_links'): + module_args['wrap_links'] = False + cmdline_args.append('--no-wrap-links') + + if base_fn.startswith('mark_code'): + module_args['mark_code'] = True + cmdline_args.append('--mark-code') + + if base_fn not in ['bodywidth_newline.html', 'abbr_tag.html']: + test_func = None + + return test_mod, test_cmd, test_func # Originally from http://stackoverflow.com/questions/32899/\ # how-to-generate-dynamic-parametrized-unit-tests-in-python test_dir_name = os.path.dirname(os.path.realpath(__file__)) for fn in glob.glob("%s/*.html" % test_dir_name): test_name = 'test_%s' % os.path.splitext(os.path.basename(fn))[0].lower() - test_m, test_c = generate_test(fn) + test_m, test_c, test_func = generate_test(fn) setattr(TestHTML2Text, test_name + "_mod", test_m) if test_c: setattr(TestHTML2Text, test_name + "_cmd", test_c) + if test_func: + setattr(TestHTML2Text, test_name + "_func", test_func) if __name__ == "__main__": unittest.main()