converters/html2text add utf8 and other fixes

Davide Gerhard Sat, 21 Jan 2017 14:10:11 -0800

hello,
just some fixes.

changes (most of them from debian[1]):
- close every file after processing, not at the end of the program.
- remove limited built-in http support.
- support UTF-8 encoding when processing input.
- don't use backspaces.
- accept digits (not only letters) in html tag attribute names.
- recognize all <meta> tags, not just one.
- recode input according to 'meta http-equiv' in html document.
  (needs iconv)
- convert output to user's locale charset (needs iconv)
- correctly specify NULLs for 64-bit architectures.
- replaced 'char*' with 'const char*' where needed to avoid
  'deprecated conversion from string constant to ‘char*’' warnings.
- validate --width parameter input.
- fix the CXX variable in configure; it should now compile with clang.


for the charset conversion, I added iconv to WANTLIB (and
converters/libiconv to LIB_DEPENDS); we could put this behind a FLAVOR
instead, but I think an unconditional dependency is acceptable for
everyday usage.

two questions:
- how can I test the port with clang/llvm? is there a mk.conf variable
  for that?
- it would be nice to document -utf8 in the man page: do I need to edit
  it by hand with mg/vi, or is there another tool for that?

tested on amd64.

thanks
/davide

[1] http://sources.debian.net/patches/html2text/1.3.2a-18/

diff -Nrua -x CVS /usr/ports/converters/html2text/Makefile ./Makefile
--- /usr/ports/converters/html2text/Makefile	Sat Apr 25 23:58:01 2015
+++ ./Makefile	Sat Jan 21 21:48:09 2017
@@ -5,16 +5,16 @@
 COMMENT=		advanced HTML-to-text converter
 
 DISTNAME=		html2text-1.3.2a
-REVISION=		1
+REVISION=		2
 CATEGORIES=		converters textproc
 
-
 HOMEPAGE=		http://www.mbayer.de/html2text/
 
-# GPL
+# GPLv2
 PERMIT_PACKAGE_CDROM=	Yes
 
-WANTLIB += c m stdc++
+WANTLIB +=		c m stdc++ iconv
+LIB_DEPENDS +=		converters/libiconv
 
 MASTER_SITES=		http://www.mbayer.de/html2text/downloads/ \
 			ftp://ftp.ibiblio.org/pub/linux/apps/www/converters/
@@ -23,7 +23,9 @@
 
 MAKE_FLAGS=	PREFIX="${PREFIX}" \
 		CXXFLAGS="-DVERSION=1.3.2a -DAUTO_PTR_BROKEN ${CXXFLAGS}" \
-		CXX="${CXX}"
+		CXXFLAGS+="-I${LOCALBASE}/include" \
+		CXX="${CXX}" \
+		LDFLAGS="-L${LOCALBASE}/lib -liconv"
 
 NO_TEST=	Yes
 
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-Area_C ./patches/patch-Area_C
--- /usr/ports/converters/html2text/patches/patch-Area_C	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-Area_C	Sat Jan 21 20:58:10 2017
@@ -0,0 +1,109 @@
+$OpenBSD$
+--- Area.C.orig	Sun Nov 23 12:05:29 2003
++++ Area.C	Sat Jan 21 20:57:57 2017
+@@ -36,10 +36,13 @@
+ #include <iostream>
+ 
+ #include "Area.h"
++#include "html.h"
+ #include "string.h"
+ 
+ #define LATIN1_nbsp 160
+ 
++extern int use_encoding;
++
+ /* ------------------------------------------------------------------------- */
+ 
+ #define malloc_array(type, size)\
+@@ -81,6 +84,53 @@ Line::~Line()
+ 
+ /* ------------------------------------------------------------------------- */
+ 
++/*           utf_length() and utf_width()       
++ *
++ *     Very simplified algorithm of calculating length of UTF-8
++ *   string. No check for errors. Counting only ASCII bytes and
++ *   leading bytes of UTF-8 multibyte sequences. All bytes like
++ *   10xxxxxx are dropped. If USE_UTF8 is false then returns
++ *   usual length.               --YS
++ */
++
++size_t utf8_aux_count(char ch)
++{
++	if((ch & 0xe0) == 0xc0)
++	{
++		return 1;
++	}
++	else if((ch & 0xf0) == 0xe0)
++	{
++		return 2;
++	}
++	else if ((ch & 0xf8) == 0xf0)
++	{
++		return 3;
++	}
++	else
++	{
++		return 0;
++	}
++}
++
++unsigned int
++Line::utf_length(size_type f, size_type t) const
++{
++	size_type m = (t < length_ ? t : length_);
++	size_type r = m - f;
++	if(USE_UTF8)
++	{
++		for (int i = f; i < m; i++)
++		{
++			char& ch = cells_[i].character;
++			size_type aux_count = utf8_aux_count(ch);
++			r -= aux_count;
++			i += aux_count;
++		}
++	}
++	return r;
++}
++
+ void
+ Line::resize(size_type l)
+ {
+@@ -236,6 +286,28 @@ Area::operator>>=(size_type rs)
+   return *this;
+ }
+ 
++unsigned int
++Area::utf_width()
++{
++  size_type r = width_;
++  if(USE_UTF8) { r = 0;
++    for (size_type yy = 0; yy < height_; yy++) {
++	  int i = width_ - 1;
++      while((i >= 0) && isspace(cells_[yy][i].character))
++	  {
++		  --i;
++	  }
++      size_type aux_count_sum = 0;
++      for (; i >= 0; i--) {
++		aux_count_sum += utf8_aux_count(cells_[yy][i].character);
++      }
++	  size_type r1 = width_ - aux_count_sum;
++      if(r < r1) r = r1;
++    }
++  }
++  return r;
++}
++
+ void
+ Area::resize(size_type w, size_type h)
+ {
+@@ -439,7 +511,7 @@ operator<<(ostream &os, const Area &a)
+       char c = p->character;
+       char a = p->attribute;
+ 
+-      if (c == (char) LATIN1_nbsp) c = ' ';
++      if (c == (char) LATIN1_nbsp && !USE_UTF8) c = ' ';
+ 
+       if (a == Cell::NONE) {
+         os << c;
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-Area_h ./patches/patch-Area_h
--- /usr/ports/converters/html2text/patches/patch-Area_h	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-Area_h	Sat Jan 21 20:58:10 2017
@@ -0,0 +1,21 @@
+$OpenBSD$
+--- Area.h.orig	Sun Nov 23 12:05:29 2003
++++ Area.h	Sat Jan 21 20:57:57 2017
+@@ -81,6 +81,8 @@ class Line { (public)
+   Cell       &operator[](size_type x)       { return cells_[x]; }
+   const Cell *cells() const { return cells_; }
+ 
++  unsigned int utf_length(size_type f, size_type t) const;
++
+   void resize(size_type l);
+   void enlarge(size_type l) { if (l > length_) resize(l); }
+ 
+@@ -133,6 +135,8 @@ class Area { (public)
+   const Cell *operator[](size_type y) const { return cells_[y]; }
+   Cell       *operator[](size_type y)       { return cells_[y]; }
+   const Area &operator>>=(size_type rs);
++
++  unsigned int utf_width();
+ 
+   void resize(size_type w, size_type h);
+   void enlarge(size_type w, size_type h);
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-HTMLControl_C ./patches/patch-HTMLControl_C
--- /usr/ports/converters/html2text/patches/patch-HTMLControl_C	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-HTMLControl_C	Sat Jan 21 20:58:10 2017
@@ -0,0 +1,12 @@
+$OpenBSD$
+--- HTMLControl.C.orig	Sun Nov 23 12:05:29 2003
++++ HTMLControl.C	Sat Jan 21 20:57:57 2017
+@@ -372,7 +372,7 @@ HTMLControl::yylex2(yy_HTMLParser_stype *value_return,
+             attribute.first = c;
+             for (;;) {
+               c = get_char();
+-              if (!isalpha(c) && c != '-' && c != '_' && c != ':') break;
++              if (!isalnum(c) && c != '-' && c != '_' && c != ':') break;
+ 	      // Same as in line 352 - Arno
+               attribute.first += c;
+             }
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-HTMLControl_h ./patches/patch-HTMLControl_h
--- /usr/ports/converters/html2text/patches/patch-HTMLControl_h	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-HTMLControl_h	Sat Jan 21 20:58:10 2017
@@ -0,0 +1,29 @@
+$OpenBSD$
+--- HTMLControl.h.orig	Sun Nov 23 12:05:29 2003
++++ HTMLControl.h	Sat Jan 21 20:57:57 2017
+@@ -38,7 +38,6 @@
+ /* ------------------------------------------------------------------------- */
+ 
+ #include "HTMLParser.h"
+-#include "urlistream.h"
+ #include <istream>
+ 
+ using std::istream;
+@@ -48,7 +47,7 @@ using std::istream;
+ class HTMLControl : public HTMLParser {
+ 
+ public:
+-  HTMLControl(urlistream &is_, bool debug_scanner_, bool debug_parser_) :
++  HTMLControl(istream &is_, bool debug_scanner_, bool debug_parser_) :
+     HTMLParser(),
+     current_line(1),
+     current_column(0),
+@@ -84,7 +83,7 @@ class HTMLControl : public HTMLParser { (private)
+ 
+   bool debug_scanner;
+ 
+-  urlistream &is;
++  istream &is;
+   int     ungotten_chars[5];
+   int     number_of_ungotten_chars;
+ };
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-HTMLParser_C ./patches/patch-HTMLParser_C
--- /usr/ports/converters/html2text/patches/patch-HTMLParser_C	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-HTMLParser_C	Sat Jan 21 20:58:10 2017
@@ -0,0 +1,1090 @@
+$OpenBSD$
+--- HTMLParser.C.orig	Sun Nov 23 12:05:29 2003
++++ HTMLParser.C	Sat Jan 21 20:57:57 2017
+@@ -600,7 +600,7 @@ static const int END_VAR;
+  /* decl const */
+ public:
+  int YY_HTMLParser_PARSE (YY_HTMLParser_PARSE_PARAM);
+- virtual void YY_HTMLParser_ERROR(char *msg) YY_HTMLParser_ERROR_BODY;
++ virtual void YY_HTMLParser_ERROR(const char *msg) YY_HTMLParser_ERROR_BODY;
+ #ifdef YY_HTMLParser_PURE
+ #ifdef YY_HTMLParser_LSP_NEEDED
+  virtual int  YY_HTMLParser_LEX (YY_HTMLParser_STYPE *YY_HTMLParser_LVAL,YY_HTMLParser_LTYPE *YY_HTMLParser_LLOC) YY_HTMLParser_LEX_BODY;
+@@ -911,28 +911,28 @@ static const short yyrhs[] = {   132,
+ #if YY_HTMLParser_DEBUG != 0
+ static const short yyrline[] = { 0,
+    273,   304,   309,   312,   315,   319,   322,   326,   329,   333,
+-   336,   339,   342,   345,   353,   361,   365,   368,   373,   376,
+-   379,   384,   392,   396,   399,   407,   415,   420,   423,   426,
+-   431,   442,   446,   454,   458,   461,   466,   471,   475,   478,
+-   481,   487,   493,   499,   505,   510,   519,   520,   527,   527,
+-   534,   534,   541,   541,   550,   554,   557,   563,   570,   575,
+-   582,   591,   600,   604,   607,   611,   617,   623,   631,   637,
+-   645,   650,   653,   658,   662,   665,   670,   678,   686,   690,
+-   693,   701,   705,   708,   714,   721,   727,   737,   742,   747,
+-   749,   750,   751,   752,   753,   761,   763,   764,   765,   766,
+-   767,   768,   769,   770,   773,   775,   776,   777,   778,   779,
+-   780,   781,   784,   795,   800,   808,   814,   819,   824,   832,
+-   836,   840,   848,   852,   855,   861,   867,   873,   881,   886,
+-   891,   901,   903,   904,   905,   906,   907,   910,   912,   913,
+-   914,   915,   916,   921,   921,   922,   922,   923,   923,   924,
+-   924,   926,   926,   927,   927,   929,   929,   930,   930,   931,
+-   931,   932,   932,   933,   933,   934,   934,   935,   935,   936,
+-   936,   937,   937,   938,   938,   939,   939,   940,   940,   941,
+-   941,   942,   942,   943,   943,   944,   944,   945,   945,   946,
+-   946,   947,   947,   948,   948,   949,   949,   950,   950,   951,
+-   951,   952,   952,   953,   953,   954,   954,   955,   955,   956,
+-   956,   957,   957,   958,   958,   959,   959,   960,   960,   961,
+-   961,   963,   963
++   336,   339,   344,   347,   355,   363,   367,   370,   375,   378,
++   381,   386,   394,   398,   401,   409,   417,   422,   425,   428,
++   433,   444,   448,   456,   460,   463,   468,   473,   477,   480,
++   483,   489,   495,   501,   507,   512,   521,   522,   529,   529,
++   536,   536,   543,   543,   552,   556,   559,   565,   572,   577,
++   584,   593,   602,   606,   609,   613,   619,   625,   633,   639,
++   647,   652,   655,   660,   664,   667,   672,   680,   688,   692,
++   695,   703,   707,   710,   716,   723,   729,   739,   744,   749,
++   751,   752,   753,   754,   755,   763,   765,   766,   767,   768,
++   769,   770,   771,   772,   775,   777,   778,   779,   780,   781,
++   782,   783,   786,   797,   802,   810,   816,   821,   826,   834,
++   838,   842,   850,   854,   857,   863,   869,   875,   883,   888,
++   893,   903,   905,   906,   907,   908,   909,   912,   914,   915,
++   916,   917,   918,   923,   923,   924,   924,   925,   925,   926,
++   926,   928,   928,   929,   929,   931,   931,   932,   932,   933,
++   933,   934,   934,   935,   935,   936,   936,   937,   937,   938,
++   938,   939,   939,   940,   940,   941,   941,   942,   942,   943,
++   943,   944,   944,   945,   945,   946,   946,   947,   947,   948,
++   948,   949,   949,   950,   950,   951,   951,   952,   952,   953,
++   953,   954,   954,   955,   955,   956,   956,   957,   957,   958,
++   958,   959,   959,   960,   960,   961,   961,   962,   962,   963,
++   963,   965,   965
+ };
+ 
+ static const char * const yytname[] = {   "$","error","$illegal.","DOCTYPE",
+@@ -2044,17 +2044,19 @@ case 11:
+ case 12:
+ #line 339 "HTMLParser.y"
+ {
+-    (yyval.document = yyvsp[-1].document)->head.meta_attributes.reset(yyvsp[0].tag_attributes);
++    auto_ptr<Meta> s(new Meta);
++    s->attributes.reset(yyvsp[0].tag_attributes);
++    (yyval.document = yyvsp[-1].document)->head.metas.push_back(s);
+   ;
+     break;}
+ case 13:
+-#line 342 "HTMLParser.y"
++#line 344 "HTMLParser.y"
+ {
+     (yyval.document = yyvsp[-1].document)->head.link_attributes.reset(yyvsp[0].tag_attributes);
+   ;
+     break;}
+ case 14:
+-#line 345 "HTMLParser.y"
++#line 347 "HTMLParser.y"
+ {
+     auto_ptr<Script> s(new Script);
+     s->attributes.reset(yyvsp[0].tag_attributes);
+@@ -2065,7 +2067,7 @@ case 14:
+   ;
+     break;}
+ case 15:
+-#line 353 "HTMLParser.y"
++#line 355 "HTMLParser.y"
+ {
+     auto_ptr<Style> s(new Style);
+     s->attributes.reset(yyvsp[0].tag_attributes);
+@@ -2076,20 +2078,20 @@ case 15:
+   ;
+     break;}
+ case 16:
+-#line 361 "HTMLParser.y"
++#line 363 "HTMLParser.y"
+ {
+     delete yyvsp[0].tag_attributes;
+     yyval.document = yyvsp[-1].document;
+   ;
+     break;}
+ case 17:
+-#line 365 "HTMLParser.y"
++#line 367 "HTMLParser.y"
+ {
+     yyval.document = yyvsp[-1].document;
+   ;
+     break;}
+ case 18:
+-#line 368 "HTMLParser.y"
++#line 370 "HTMLParser.y"
+ {
+     Paragraph *p = new Paragraph;
+     p->texts.reset(yyvsp[0].element_list);
+@@ -2097,25 +2099,25 @@ case 18:
+   ;
+     break;}
+ case 19:
+-#line 373 "HTMLParser.y"
++#line 375 "HTMLParser.y"
+ {
+     (yyval.document = yyvsp[-1].document)->body.content->push_back(auto_ptr<Element>(yyvsp[0].heading));
+   ;
+     break;}
+ case 20:
+-#line 376 "HTMLParser.y"
++#line 378 "HTMLParser.y"
+ {
+     (yyval.document = yyvsp[-1].document)->body.content->push_back(auto_ptr<Element>(yyvsp[0].element));
+   ;
+     break;}
+ case 21:
+-#line 379 "HTMLParser.y"
++#line 381 "HTMLParser.y"
+ {
+     (yyval.document = yyvsp[-1].document)->body.content->push_back(auto_ptr<Element>(yyvsp[0].address));
+   ;
+     break;}
+ case 22:
+-#line 385 "HTMLParser.y"
++#line 387 "HTMLParser.y"
+ {
+     yyval.pcdata = new PCData;
+     yyval.pcdata->text = *yyvsp[0].strinG;
+@@ -2123,19 +2125,19 @@ case 22:
+   ;
+     break;}
+ case 23:
+-#line 393 "HTMLParser.y"
++#line 395 "HTMLParser.y"
+ {
+     yyval.element_list = new list<auto_ptr<Element> >;
+   ;
+     break;}
+ case 24:
+-#line 396 "HTMLParser.y"
++#line 398 "HTMLParser.y"
+ {
+     yyval.element_list = yyvsp[-1].element_list;
+   ;
+     break;}
+ case 25:
+-#line 399 "HTMLParser.y"
++#line 401 "HTMLParser.y"
+ {
+     auto_ptr<Script> s(new Script);
+     s->attributes.reset(yyvsp[0].tag_attributes);
+@@ -2146,7 +2148,7 @@ case 25:
+   ;
+     break;}
+ case 26:
+-#line 407 "HTMLParser.y"
++#line 409 "HTMLParser.y"
+ {
+     auto_ptr<Style> s(new Style);
+     s->attributes.reset(yyvsp[0].tag_attributes);
+@@ -2157,7 +2159,7 @@ case 26:
+   ;
+     break;}
+ case 27:
+-#line 415 "HTMLParser.y"
++#line 417 "HTMLParser.y"
+ {
+     Paragraph *p = new Paragraph;
+     p->texts = auto_ptr<list<auto_ptr<Element> > >(yyvsp[0].element_list);
+@@ -2165,25 +2167,25 @@ case 27:
+   ;
+     break;}
+ case 28:
+-#line 420 "HTMLParser.y"
++#line 422 "HTMLParser.y"
+ {
+     (yyval.element_list = yyvsp[-1].element_list)->push_back(auto_ptr<Element>(yyvsp[0].heading));
+   ;
+     break;}
+ case 29:
+-#line 423 "HTMLParser.y"
++#line 425 "HTMLParser.y"
+ {
+     (yyval.element_list = yyvsp[-1].element_list)->push_back(auto_ptr<Element>(yyvsp[0].element));
+   ;
+     break;}
+ case 30:
+-#line 426 "HTMLParser.y"
++#line 428 "HTMLParser.y"
+ {
+     (yyval.element_list = yyvsp[-1].element_list)->push_back(auto_ptr<Element>(yyvsp[0].address));
+   ;
+     break;}
+ case 31:
+-#line 432 "HTMLParser.y"
++#line 434 "HTMLParser.y"
+ {
+             /* EXTENSION: Allow paragraph content in heading, not only texts */
+     if (yyvsp[-2].heading->level != yyvsp[0].inT) {
+@@ -2194,13 +2196,13 @@ case 31:
+   ;
+     break;}
+ case 32:
+-#line 443 "HTMLParser.y"
++#line 445 "HTMLParser.y"
+ {
+     yyval.element = yyvsp[0].element;
+   ;
+     break;}
+ case 33:
+-#line 446 "HTMLParser.y"
++#line 448 "HTMLParser.y"
+ {
+     Paragraph *p = new Paragraph;
+     p->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2209,19 +2211,19 @@ case 33:
+   ;
+     break;}
+ case 34:
+-#line 455 "HTMLParser.y"
++#line 457 "HTMLParser.y"
+ {
+     yyval.element_list = new list<auto_ptr<Element> >;
+   ;
+     break;}
+ case 35:
+-#line 458 "HTMLParser.y"
++#line 460 "HTMLParser.y"
+ {
+     yyval.element_list = yyvsp[-1].element_list;
+   ;
+     break;}
+ case 36:
+-#line 461 "HTMLParser.y"
++#line 463 "HTMLParser.y"
+ {
+     yyval.element_list = yyvsp[-1].element_list;
+     yyval.element_list->splice(yyval.element_list->end(), *yyvsp[0].element_list);
+@@ -2229,31 +2231,31 @@ case 36:
+   ;
+     break;}
+ case 37:
+-#line 466 "HTMLParser.y"
++#line 468 "HTMLParser.y"
+ {
+     (yyval.element_list = yyvsp[-1].element_list)->push_back(auto_ptr<Element>(yyvsp[0].element));
+   ;
+     break;}
+ case 38:
+-#line 472 "HTMLParser.y"
++#line 474 "HTMLParser.y"
+ {
+     yyval.element = yyvsp[0].element;
+   ;
+     break;}
+ case 39:
+-#line 475 "HTMLParser.y"
++#line 477 "HTMLParser.y"
+ {
+     yyval.element = yyvsp[0].preformatted;
+   ;
+     break;}
+ case 40:
+-#line 478 "HTMLParser.y"
++#line 480 "HTMLParser.y"
+ {
+     yyval.element = yyvsp[0].definition_list;
+   ;
+     break;}
+ case 41:
+-#line 481 "HTMLParser.y"
++#line 483 "HTMLParser.y"
+ {
+     Division *p = new Division;
+     p->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2262,7 +2264,7 @@ case 41:
+   ;
+     break;}
+ case 42:
+-#line 487 "HTMLParser.y"
++#line 489 "HTMLParser.y"
+ {
+     Center *p = new Center;
+     delete yyvsp[-2].tag_attributes;       // CENTER has no attributes.
+@@ -2271,7 +2273,7 @@ case 42:
+   ;
+     break;}
+ case 43:
+-#line 493 "HTMLParser.y"
++#line 495 "HTMLParser.y"
+ {
+     delete yyvsp[-2].tag_attributes; // BLOCKQUOTE has no attributes!
+     BlockQuote *bq = new BlockQuote;
+@@ -2280,7 +2282,7 @@ case 43:
+   ;
+     break;}
+ case 44:
+-#line 499 "HTMLParser.y"
++#line 501 "HTMLParser.y"
+ {
+     Form *f = new Form;
+     f->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2289,7 +2291,7 @@ case 44:
+   ;
+     break;}
+ case 45:
+-#line 505 "HTMLParser.y"
++#line 507 "HTMLParser.y"
+ {
+     HorizontalRule *h = new HorizontalRule;
+     h->attributes.reset(yyvsp[0].tag_attributes);
+@@ -2297,7 +2299,7 @@ case 45:
+   ;
+     break;}
+ case 46:
+-#line 510 "HTMLParser.y"
++#line 512 "HTMLParser.y"
+ {
+     Table *t = new Table;
+     t->attributes.reset(yyvsp[-3].tag_attributes);
+@@ -2307,11 +2309,11 @@ case 46:
+   ;
+     break;}
+ case 47:
+-#line 520 "HTMLParser.y"
++#line 522 "HTMLParser.y"
+ { ++list_nesting; ;
+     break;}
+ case 48:
+-#line 520 "HTMLParser.y"
++#line 522 "HTMLParser.y"
+ {
+     OrderedList *ol = new OrderedList;
+     ol->attributes.reset(yyvsp[-3].tag_attributes);
+@@ -2321,11 +2323,11 @@ case 48:
+   ;
+     break;}
+ case 49:
+-#line 527 "HTMLParser.y"
++#line 529 "HTMLParser.y"
+ { ++list_nesting; ;
+     break;}
+ case 50:
+-#line 527 "HTMLParser.y"
++#line 529 "HTMLParser.y"
+ {
+     UnorderedList *ul = new UnorderedList;
+     ul->attributes.reset(yyvsp[-3].tag_attributes);
+@@ -2335,11 +2337,11 @@ case 50:
+   ;
+     break;}
+ case 51:
+-#line 534 "HTMLParser.y"
++#line 536 "HTMLParser.y"
+ { ++list_nesting; ;
+     break;}
+ case 52:
+-#line 534 "HTMLParser.y"
++#line 536 "HTMLParser.y"
+ {
+     Dir *d = new Dir;
+     d->attributes.reset(yyvsp[-3].tag_attributes);
+@@ -2349,11 +2351,11 @@ case 52:
+   ;
+     break;}
+ case 53:
+-#line 541 "HTMLParser.y"
++#line 543 "HTMLParser.y"
+ { ++list_nesting; ;
+     break;}
+ case 54:
+-#line 541 "HTMLParser.y"
++#line 543 "HTMLParser.y"
+ {
+     Menu *m = new Menu;
+     m->attributes.reset(yyvsp[-3].tag_attributes);
+@@ -2363,26 +2365,26 @@ case 54:
+   ;
+     break;}
+ case 55:
+-#line 551 "HTMLParser.y"
++#line 553 "HTMLParser.y"
+ {
+     yyval.list_items = 0;
+   ;
+     break;}
+ case 56:
+-#line 554 "HTMLParser.y"
++#line 556 "HTMLParser.y"
+ {
+     yyval.list_items = yyvsp[-1].list_items;
+   ;
+     break;}
+ case 57:
+-#line 557 "HTMLParser.y"
++#line 559 "HTMLParser.y"
+ {
+     yyval.list_items = yyvsp[-1].list_items ? yyvsp[-1].list_items : new list<auto_ptr<ListItem> >;
+     yyval.list_items->push_back(auto_ptr<ListItem>(yyvsp[0].list_item));
+   ;
+     break;}
+ case 58:
+-#line 564 "HTMLParser.y"
++#line 566 "HTMLParser.y"
+ {
+     ListNormalItem *lni = new ListNormalItem;
+     lni->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2391,7 +2393,7 @@ case 58:
+   ;
+     break;}
+ case 59:
+-#line 570 "HTMLParser.y"
++#line 572 "HTMLParser.y"
+ {   /* EXTENSION: Handle a "block" in a list as an indented block. */
+     ListBlockItem *lbi = new ListBlockItem;
+     lbi->block.reset(yyvsp[0].element);
+@@ -2399,7 +2401,7 @@ case 59:
+   ;
+     break;}
+ case 60:
+-#line 575 "HTMLParser.y"
++#line 577 "HTMLParser.y"
+ {              /* EXTENSION: Treat "texts" in a list as an "<LI>". */
+     ListNormalItem *lni = new ListNormalItem;
+     lni->flow.reset(yyvsp[0].element_list);
+@@ -2407,7 +2409,7 @@ case 60:
+   ;
+     break;}
+ case 61:
+-#line 585 "HTMLParser.y"
++#line 587 "HTMLParser.y"
+ {
+     delete yyvsp[-4].tag_attributes;
+     delete yyvsp[-3].element_list; /* Kludge */
+@@ -2415,7 +2417,7 @@ case 61:
+   ;
+     break;}
+ case 62:
+-#line 591 "HTMLParser.y"
++#line 593 "HTMLParser.y"
+ {
+     DefinitionList *dl = new DefinitionList;
+     dl->attributes.reset(yyvsp[-4].tag_attributes);
+@@ -2425,33 +2427,33 @@ case 62:
+   ;
+     break;}
+ case 63:
+-#line 601 "HTMLParser.y"
++#line 603 "HTMLParser.y"
+ {
+     yyval.definition_list_item_list = 0;
+   ;
+     break;}
+ case 64:
+-#line 604 "HTMLParser.y"
++#line 606 "HTMLParser.y"
+ {
+     yyval.definition_list_item_list = yyvsp[0].definition_list_item_list;
+   ;
+     break;}
+ case 65:
+-#line 607 "HTMLParser.y"
++#line 609 "HTMLParser.y"
+ {
+     yyval.definition_list_item_list = yyvsp[-1].definition_list_item_list ? yyvsp[-1].definition_list_item_list : new list<auto_ptr<DefinitionListItem> >;
+     yyval.definition_list_item_list->push_back(auto_ptr<DefinitionListItem>(yyvsp[0].term_name));
+   ;
+     break;}
+ case 66:
+-#line 611 "HTMLParser.y"
++#line 613 "HTMLParser.y"
+ {
+     yyval.definition_list_item_list = yyvsp[-1].definition_list_item_list ? yyvsp[-1].definition_list_item_list : new list<auto_ptr<DefinitionListItem> >;
+     yyval.definition_list_item_list->push_back(auto_ptr<DefinitionListItem>(yyvsp[0].term_definition));
+   ;
+     break;}
+ case 67:
+-#line 618 "HTMLParser.y"
++#line 620 "HTMLParser.y"
+ {      /* EXTENSION: Allow "flow" instead of "texts" */
+     delete yyvsp[-2].tag_attributes;
+     yyval.term_name = new TermName;
+@@ -2459,7 +2461,7 @@ case 67:
+   ;
+     break;}
+ case 68:
+-#line 623 "HTMLParser.y"
++#line 625 "HTMLParser.y"
+ {/* EXTENSION: Ignore <P> after </DT> */
+     delete yyvsp[-4].tag_attributes;
+     delete yyvsp[-1].tag_attributes;
+@@ -2468,7 +2470,7 @@ case 68:
+   ;
+     break;}
+ case 69:
+-#line 632 "HTMLParser.y"
++#line 634 "HTMLParser.y"
+ {
+     delete yyvsp[-2].tag_attributes;
+     yyval.term_definition = new TermDefinition;
+@@ -2476,7 +2478,7 @@ case 69:
+   ;
+     break;}
+ case 70:
+-#line 637 "HTMLParser.y"
++#line 639 "HTMLParser.y"
+ {/* EXTENSION: Ignore <P> after </DD> */
+     delete yyvsp[-4].tag_attributes;
+     delete yyvsp[-1].tag_attributes;
+@@ -2485,44 +2487,44 @@ case 70:
+   ;
+     break;}
+ case 71:
+-#line 646 "HTMLParser.y"
++#line 648 "HTMLParser.y"
+ {
+     yyval.element_list = new list<auto_ptr<Element> >;
+     yyval.element_list->push_back(auto_ptr<Element>(yyvsp[0].element));
+   ;
+     break;}
+ case 72:
+-#line 650 "HTMLParser.y"
++#line 652 "HTMLParser.y"
+ {
+     yyval.element_list = yyvsp[-1].element_list;
+   ;
+     break;}
+ case 73:
+-#line 653 "HTMLParser.y"
++#line 655 "HTMLParser.y"
+ {
+     (yyval.element_list = yyvsp[-1].element_list)->push_back(auto_ptr<Element>(yyvsp[0].element));
+   ;
+     break;}
+ case 74:
+-#line 659 "HTMLParser.y"
++#line 661 "HTMLParser.y"
+ {
+     yyval.element = yyvsp[0].element;
+   ;
+     break;}
+ case 75:
+-#line 662 "HTMLParser.y"
++#line 664 "HTMLParser.y"
+ {          /* EXTENSION: Allow headings in "flow", i.e. in lists */
+     yyval.element = yyvsp[0].heading;
+   ;
+     break;}
+ case 76:
+-#line 665 "HTMLParser.y"
++#line 667 "HTMLParser.y"
+ {
+     yyval.element = yyvsp[0].element;
+   ;
+     break;}
+ case 77:
+-#line 671 "HTMLParser.y"
++#line 673 "HTMLParser.y"
+ {
+     yyval.preformatted = new Preformatted;
+     yyval.preformatted->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2530,7 +2532,7 @@ case 77:
+   ;
+     break;}
+ case 78:
+-#line 679 "HTMLParser.y"
++#line 681 "HTMLParser.y"
+ {
+     yyval.caption = new Caption;
+     yyval.caption->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2538,19 +2540,19 @@ case 78:
+   ;
+     break;}
+ case 79:
+-#line 687 "HTMLParser.y"
++#line 689 "HTMLParser.y"
+ {
+     yyval.table_rows = new list<auto_ptr<TableRow> >;
+   ;
+     break;}
+ case 80:
+-#line 690 "HTMLParser.y"
++#line 692 "HTMLParser.y"
+ {
+     yyval.table_rows = yyvsp[-1].table_rows;
+   ;
+     break;}
+ case 81:
+-#line 693 "HTMLParser.y"
++#line 695 "HTMLParser.y"
+ {
+     TableRow *tr = new TableRow;
+     tr->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2559,19 +2561,19 @@ case 81:
+   ;
+     break;}
+ case 82:
+-#line 702 "HTMLParser.y"
++#line 704 "HTMLParser.y"
+ {
+     yyval.table_cells = new list<auto_ptr<TableCell> >;
+   ;
+     break;}
+ case 83:
+-#line 705 "HTMLParser.y"
++#line 707 "HTMLParser.y"
+ {
+     yyval.table_cells = yyvsp[-1].table_cells;
+   ;
+     break;}
+ case 84:
+-#line 708 "HTMLParser.y"
++#line 710 "HTMLParser.y"
+ {
+     TableCell *tc = new TableCell;
+     tc->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2580,7 +2582,7 @@ case 84:
+   ;
+     break;}
+ case 85:
+-#line 714 "HTMLParser.y"
++#line 716 "HTMLParser.y"
+ {
+                             /* EXTENSION: Allow "</TD>" in place of "</TH>". */
+     TableHeadingCell *thc = new TableHeadingCell;
+@@ -2590,14 +2592,14 @@ case 85:
+   ;
+     break;}
+ case 86:
+-#line 721 "HTMLParser.y"
++#line 723 "HTMLParser.y"
+ {    /* EXTENSION: Ignore <INPUT> between table cells. */
+     delete yyvsp[0].tag_attributes;
+     yyval.table_cells = yyvsp[-1].table_cells;
+   ;
+     break;}
+ case 87:
+-#line 728 "HTMLParser.y"
++#line 730 "HTMLParser.y"
+ { /* Should be "address_content"... */
+     delete yyvsp[-2].tag_attributes;
+     yyval.address = new Address;
+@@ -2605,40 +2607,40 @@ case 87:
+   ;
+     break;}
+ case 88:
+-#line 738 "HTMLParser.y"
++#line 740 "HTMLParser.y"
+ {
+     yyval.element_list = new list<auto_ptr<Element> >;
+     yyval.element_list->push_back(auto_ptr<Element>(yyvsp[0].element));
+   ;
+     break;}
+ case 89:
+-#line 742 "HTMLParser.y"
++#line 744 "HTMLParser.y"
+ {
+     (yyval.element_list = yyvsp[-1].element_list)->push_back(auto_ptr<Element>(yyvsp[0].element));
+   ;
+     break;}
+ case 90:
+-#line 748 "HTMLParser.y"
++#line 750 "HTMLParser.y"
+ { yyval.element = yyvsp[-1].pcdata; ;
+     break;}
+ case 91:
+-#line 749 "HTMLParser.y"
++#line 751 "HTMLParser.y"
+ { yyval.element = yyvsp[-1].element; ;
+     break;}
+ case 92:
+-#line 750 "HTMLParser.y"
++#line 752 "HTMLParser.y"
+ { yyval.element = yyvsp[-1].element; ;
+     break;}
+ case 93:
+-#line 751 "HTMLParser.y"
++#line 753 "HTMLParser.y"
+ { yyval.element = yyvsp[-1].element; ;
+     break;}
+ case 94:
+-#line 752 "HTMLParser.y"
++#line 754 "HTMLParser.y"
+ { yyval.element = yyvsp[-1].element; ;
+     break;}
+ case 95:
+-#line 753 "HTMLParser.y"
++#line 755 "HTMLParser.y"
+ { /* EXTENSION: NS 1.1 / IE 2.0 */
+     NoBreak *nb = new NoBreak;
+     delete yyvsp[-3].tag_attributes;
+@@ -2647,75 +2649,75 @@ case 95:
+   ;
+     break;}
+ case 96:
+-#line 762 "HTMLParser.y"
++#line 764 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Font(TT,     yyvsp[-1].element_list); ;
+     break;}
+ case 97:
+-#line 763 "HTMLParser.y"
++#line 765 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Font(I,      yyvsp[-1].element_list); ;
+     break;}
+ case 98:
+-#line 764 "HTMLParser.y"
++#line 766 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Font(B,      yyvsp[-1].element_list); ;
+     break;}
+ case 99:
+-#line 765 "HTMLParser.y"
++#line 767 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Font(U,      yyvsp[-1].element_list); ;
+     break;}
+ case 100:
+-#line 766 "HTMLParser.y"
++#line 768 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Font(STRIKE, yyvsp[-1].element_list); ;
+     break;}
+ case 101:
+-#line 767 "HTMLParser.y"
++#line 769 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Font(BIG,    yyvsp[-1].element_list); ;
+     break;}
+ case 102:
+-#line 768 "HTMLParser.y"
++#line 770 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Font(SMALL,  yyvsp[-1].element_list); ;
+     break;}
+ case 103:
+-#line 769 "HTMLParser.y"
++#line 771 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Font(SUB,    yyvsp[-1].element_list); ;
+     break;}
+ case 104:
+-#line 770 "HTMLParser.y"
++#line 772 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Font(SUP,    yyvsp[-1].element_list); ;
+     break;}
+ case 105:
+-#line 774 "HTMLParser.y"
++#line 776 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Phrase(EM,     yyvsp[-1].element_list); ;
+     break;}
+ case 106:
+-#line 775 "HTMLParser.y"
++#line 777 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Phrase(STRONG, yyvsp[-1].element_list); ;
+     break;}
+ case 107:
+-#line 776 "HTMLParser.y"
++#line 778 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Phrase(DFN,    yyvsp[-1].element_list); ;
+     break;}
+ case 108:
+-#line 777 "HTMLParser.y"
++#line 779 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Phrase(CODE,   yyvsp[-1].element_list); ;
+     break;}
+ case 109:
+-#line 778 "HTMLParser.y"
++#line 780 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Phrase(SAMP,   yyvsp[-1].element_list); ;
+     break;}
+ case 110:
+-#line 779 "HTMLParser.y"
++#line 781 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Phrase(KBD,    yyvsp[-1].element_list); ;
+     break;}
+ case 111:
+-#line 780 "HTMLParser.y"
++#line 782 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Phrase(VAR,    yyvsp[-1].element_list); ;
+     break;}
+ case 112:
+-#line 781 "HTMLParser.y"
++#line 783 "HTMLParser.y"
+ { delete yyvsp[-2].tag_attributes; yyval.element = new Phrase(CITE,   yyvsp[-1].element_list); ;
+     break;}
+ case 113:
+-#line 788 "HTMLParser.y"
++#line 790 "HTMLParser.y"
+ {
+     delete yyvsp[-2].tag_attributes;
+     Anchor *a = new Anchor;
+@@ -2725,7 +2727,7 @@ case 113:
+   ;
+     break;}
+ case 114:
+-#line 795 "HTMLParser.y"
++#line 797 "HTMLParser.y"
+ {
+     Image *i = new Image;
+     i->attributes.reset(yyvsp[0].tag_attributes);
+@@ -2733,7 +2735,7 @@ case 114:
+   ;
+     break;}
+ case 115:
+-#line 800 "HTMLParser.y"
++#line 802 "HTMLParser.y"
+ {
+     Applet *a = new Applet;
+     a->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2742,7 +2744,7 @@ case 115:
+   ;
+     break;}
+ case 116:
+-#line 808 "HTMLParser.y"
++#line 810 "HTMLParser.y"
+ {
+     Font2 *f2 = new Font2;
+     f2->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2751,7 +2753,7 @@ case 116:
+   ;
+     break;}
+ case 117:
+-#line 814 "HTMLParser.y"
++#line 816 "HTMLParser.y"
+ {
+     BaseFont *bf = new BaseFont;
+     bf->attributes.reset(yyvsp[0].tag_attributes);
+@@ -2759,7 +2761,7 @@ case 117:
+   ;
+     break;}
+ case 118:
+-#line 819 "HTMLParser.y"
++#line 821 "HTMLParser.y"
+ {
+     LineBreak *lb = new LineBreak;
+     lb->attributes.reset(yyvsp[0].tag_attributes);
+@@ -2767,7 +2769,7 @@ case 118:
+   ;
+     break;}
+ case 119:
+-#line 824 "HTMLParser.y"
++#line 826 "HTMLParser.y"
+ {
+     Map *m = new Map;
+     m->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2776,20 +2778,20 @@ case 119:
+   ;
+     break;}
+ case 120:
+-#line 833 "HTMLParser.y"
++#line 835 "HTMLParser.y"
+ {
+     yyval.element_list = 0;
+   ;
+     break;}
+ case 121:
+-#line 836 "HTMLParser.y"
++#line 838 "HTMLParser.y"
+ {
+     yyval.element_list = yyvsp[-1].element_list ? yyvsp[-1].element_list : new list<auto_ptr<Element> >;
+     yyval.element_list->push_back(auto_ptr<Element>(yyvsp[0].element));
+   ;
+     break;}
+ case 122:
+-#line 840 "HTMLParser.y"
++#line 842 "HTMLParser.y"
+ {
+     yyval.element_list = yyvsp[-1].element_list ? yyvsp[-1].element_list : new list<auto_ptr<Element> >;
+     Param *p = new Param;
+@@ -2798,26 +2800,26 @@ case 122:
+   ;
+     break;}
+ case 123:
+-#line 849 "HTMLParser.y"
++#line 851 "HTMLParser.y"
+ {
+     yyval.tag_attributes_list = 0;
+   ;
+     break;}
+ case 124:
+-#line 852 "HTMLParser.y"
++#line 854 "HTMLParser.y"
+ {
+     yyval.tag_attributes_list = yyvsp[-1].tag_attributes_list;
+   ;
+     break;}
+ case 125:
+-#line 855 "HTMLParser.y"
++#line 857 "HTMLParser.y"
+ {
+     yyval.tag_attributes_list = yyvsp[-1].tag_attributes_list ? yyvsp[-1].tag_attributes_list : new list<auto_ptr<list<TagAttribute> > >;
+     yyval.tag_attributes_list->push_back(auto_ptr<list<TagAttribute> >(yyvsp[0].tag_attributes));
+   ;
+     break;}
+ case 126:
+-#line 862 "HTMLParser.y"
++#line 864 "HTMLParser.y"
+ {
+     Input *i = new Input;
+     i->attributes.reset(yyvsp[0].tag_attributes);
+@@ -2825,7 +2827,7 @@ case 126:
+   ;
+     break;}
+ case 127:
+-#line 867 "HTMLParser.y"
++#line 869 "HTMLParser.y"
+ {
+     Select *s = new Select;
+     s->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2834,7 +2836,7 @@ case 127:
+   ;
+     break;}
+ case 128:
+-#line 873 "HTMLParser.y"
++#line 875 "HTMLParser.y"
+ {
+     TextArea *ta = new TextArea;
+     ta->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2843,20 +2845,20 @@ case 128:
+   ;
+     break;}
+ case 129:
+-#line 882 "HTMLParser.y"
++#line 884 "HTMLParser.y"
+ {
+     yyval.option_list = new list<auto_ptr<Option> >;
+     yyval.option_list->push_back(auto_ptr<Option>(yyvsp[0].option));
+   ;
+     break;}
+ case 130:
+-#line 886 "HTMLParser.y"
++#line 888 "HTMLParser.y"
+ {
+     (yyval.option_list = yyvsp[-1].option_list)->push_back(auto_ptr<Option>(yyvsp[0].option));
+   ;
+     break;}
+ case 131:
+-#line 892 "HTMLParser.y"
++#line 894 "HTMLParser.y"
+ {
+     yyval.option = new Option;
+     yyval.option->attributes.reset(yyvsp[-2].tag_attributes);
+@@ -2864,99 +2866,99 @@ case 131:
+   ;
+     break;}
+ case 132:
+-#line 902 "HTMLParser.y"
++#line 904 "HTMLParser.y"
+ { yyval.heading = new Heading; yyval.heading->level = 1; yyval.heading->attributes.reset(yyvsp[0].tag_attributes); ;
+     break;}
+ case 133:
+-#line 903 "HTMLParser.y"
++#line 905 "HTMLParser.y"
+ { yyval.heading = new Heading; yyval.heading->level = 2; yyval.heading->attributes.reset(yyvsp[0].tag_attributes); ;
+     break;}
+ case 134:
+-#line 904 "HTMLParser.y"
++#line 906 "HTMLParser.y"
+ { yyval.heading = new Heading; yyval.heading->level = 3; yyval.heading->attributes.reset(yyvsp[0].tag_attributes); ;
+     break;}
+ case 135:
+-#line 905 "HTMLParser.y"
++#line 907 "HTMLParser.y"
+ { yyval.heading = new Heading; yyval.heading->level = 4; yyval.heading->attributes.reset(yyvsp[0].tag_attributes); ;
+     break;}
+ case 136:
+-#line 906 "HTMLParser.y"
++#line 908 "HTMLParser.y"
+ { yyval.heading = new Heading; yyval.heading->level = 5; yyval.heading->attributes.reset(yyvsp[0].tag_attributes); ;
+     break;}
+ case 137:
+-#line 907 "HTMLParser.y"
++#line 909 "HTMLParser.y"
+ { yyval.heading = new Heading; yyval.heading->level = 6; yyval.heading->attributes.reset(yyvsp[0].tag_attributes); ;
+     break;}
+ case 138:
+-#line 911 "HTMLParser.y"
++#line 913 "HTMLParser.y"
+ { yyval.inT = 1; ;
+     break;}
+ case 139:
+-#line 912 "HTMLParser.y"
++#line 914 "HTMLParser.y"
+ { yyval.inT = 2; ;
+     break;}
+ case 140:
+-#line 913 "HTMLParser.y"
++#line 915 "HTMLParser.y"
+ { yyval.inT = 3; ;
+     break;}
+ case 141:
+-#line 914 "HTMLParser.y"
++#line 916 "HTMLParser.y"
+ { yyval.inT = 4; ;
+     break;}
+ case 142:
+-#line 915 "HTMLParser.y"
++#line 917 "HTMLParser.y"
+ { yyval.inT = 5; ;
+     break;}
+ case 143:
+-#line 916 "HTMLParser.y"
++#line 918 "HTMLParser.y"
+ { yyval.inT = 6; ;
+     break;}
+ case 144:
+-#line 921 "HTMLParser.y"
++#line 923 "HTMLParser.y"
+ { yyval.pcdata = 0; ;
+     break;}
+ case 145:
+-#line 921 "HTMLParser.y"
++#line 923 "HTMLParser.y"
+ { yyval.pcdata = yyvsp[0].pcdata; ;
+     break;}
+ case 146:
+-#line 922 "HTMLParser.y"
++#line 924 "HTMLParser.y"
+ { yyval.caption = 0; ;
+     break;}
+ case 147:
+-#line 922 "HTMLParser.y"
++#line 924 "HTMLParser.y"
+ { yyval.caption = yyvsp[0].caption; ;
+     break;}
+ case 148:
+-#line 923 "HTMLParser.y"
++#line 925 "HTMLParser.y"
+ { yyval.element_list = 0; ;
+     break;}
+ case 149:
+-#line 923 "HTMLParser.y"
++#line 925 "HTMLParser.y"
+ { yyval.element_list = yyvsp[0].element_list; ;
+     break;}
+ case 150:
+-#line 924 "HTMLParser.y"
++#line 926 "HTMLParser.y"
+ { yyval.element_list = 0; ;
+     break;}
+ case 151:
+-#line 924 "HTMLParser.y"
++#line 926 "HTMLParser.y"
+ { yyval.element_list = yyvsp[0].element_list; ;
+     break;}
+ case 152:
+-#line 926 "HTMLParser.y"
++#line 928 "HTMLParser.y"
+ { yyval.tag_attributes = 0; ;
+     break;}
+ case 153:
+-#line 926 "HTMLParser.y"
++#line 928 "HTMLParser.y"
+ { yyval.tag_attributes = yyvsp[0].tag_attributes; ;
+     break;}
+ case 154:
+-#line 927 "HTMLParser.y"
++#line 929 "HTMLParser.y"
+ { yyval.tag_attributes = 0; ;
+     break;}
+ case 155:
+-#line 927 "HTMLParser.y"
++#line 929 "HTMLParser.y"
+ { yyval.tag_attributes = yyvsp[0].tag_attributes; ;
+     break;}
+ }
+@@ -3158,7 +3160,7 @@ yyerrhandle:
+ /* END */
+ 
+ /* #line 891 "/usr/local/lib/bison.cc" */
+-#line 965 "HTMLParser.y"
++#line 967 "HTMLParser.y"
+  /* } */
+ 
+ /*
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-HTMLParser_h ./patches/patch-HTMLParser_h
--- /usr/ports/converters/html2text/patches/patch-HTMLParser_h	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-HTMLParser_h	Sat Jan 21 20:58:10 2017
@@ -0,0 +1,12 @@
+$OpenBSD$
+--- HTMLParser.h.orig	Mon Aug 12 19:19:43 2002
++++ HTMLParser.h	Sat Jan 21 20:57:57 2017
+@@ -487,7 +487,7 @@ static const int END_VAR;
+  /* decl const */
+ public:
+  int YY_HTMLParser_PARSE(YY_HTMLParser_PARSE_PARAM);
+- virtual void YY_HTMLParser_ERROR(char *) YY_HTMLParser_ERROR_BODY;
++ virtual void YY_HTMLParser_ERROR(const char *) YY_HTMLParser_ERROR_BODY;
+ #ifdef YY_HTMLParser_PURE
+ #ifdef YY_HTMLParser_LSP_NEEDED
+  virtual int  YY_HTMLParser_LEX(YY_HTMLParser_STYPE *YY_HTMLParser_LVAL,YY_HTMLParser_LTYPE *YY_HTMLParser_LLOC) YY_HTMLParser_LEX_BODY;
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-HTMLParser_y ./patches/patch-HTMLParser_y
--- /usr/ports/converters/html2text/patches/patch-HTMLParser_y	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-HTMLParser_y	Sat Jan 21 20:58:10 2017
@@ -0,0 +1,14 @@
+$OpenBSD$
+--- HTMLParser.y.orig	Sun Nov 23 12:05:29 2003
++++ HTMLParser.y	Sat Jan 21 20:57:57 2017
+@@ -337,7 +337,9 @@ document_:
+     ($$ = $1)->head.base_attributes.reset($2);
+   }
+   | document_ META {
+-    ($$ = $1)->head.meta_attributes.reset($2);
++    auto_ptr<Meta> s(new Meta);
++    s->attributes.reset($2);
++    ($$ = $1)->head.metas.push_back(s);
+   }
+   | document_ LINK {
+     ($$ = $1)->head.link_attributes.reset($2);
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-Makefile_in ./patches/patch-Makefile_in
--- /usr/ports/converters/html2text/patches/patch-Makefile_in	Mon Sep 17 21:21:21 2007
+++ ./patches/patch-Makefile_in	Sat Jan 21 21:27:57 2017
@@ -1,6 +1,6 @@
-$OpenBSD: patch-Makefile_in,v 1.1.1.1 2007/09/17 19:21:21 jasper Exp $
+$OpenBSD$
 --- Makefile.in.orig	Wed Jan 14 14:47:02 2004
-+++ Makefile.in	Thu Sep 13 21:36:17 2007
++++ Makefile.in	Sat Jan 21 21:27:53 2017
 @@ -29,9 +29,9 @@ BISONXX  = bison++
  YFLAGS   =
  
@@ -23,3 +23,12 @@
  LDFLAGS   = $(DEBUG)
  LOADLIBES = $(LIBSTDCXX_LIBS) $(SOCKET_LIBRARIES)
  
+@@ -68,7 +68,7 @@ all : html2text
+ 	@echo '"/usr/local/bin", "/usr/local/man/man1" and "/usr/local/man/man5").';
+ 	@echo
+ 
+-OBJS = html2text.o html.o HTMLControl.o HTMLParser.o Area.o format.o sgml.o table.o urlistream.o Properties.o cmp_nocase.o
++OBJS = html2text.o html.o HTMLControl.o HTMLParser.o Area.o format.o sgml.o table.o Properties.o cmp_nocase.o
+ 
+ html2text : $(OBJS) $(LIBSTDCXX_LIBS)
+ 	$(CXX) $(LDFLAGS) $(OBJS) $(LOADLIBES) $(LDLIBS) -o $@
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-configure ./patches/patch-configure
--- /usr/ports/converters/html2text/patches/patch-configure	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-configure	Sat Jan 21 21:17:41 2017
@@ -0,0 +1,35 @@
+$OpenBSD$
+--- configure.orig	Mon Jan 12 16:47:18 2004
++++ configure	Sat Jan 21 21:17:36 2017
+@@ -38,8 +38,7 @@ int main(int, char **) {
+   return 0;
+ }
+ EOF
+-CXX=unknown;
+-for i in "CC" "g++" "cc" "$CC"; do
++for i in "$CXX" "c++" "g++" "cc" "$CC"; do
+   if $i -c $tmp_file.C 2>/dev/null; then
+     CXX="$i";
+     break;
+@@ -205,12 +204,19 @@ cat <<EOF >$tmp_file.C;
+ #include <new>
+ #include <vector>
+ using namespace std;
+-void func() { map<string, string> x; }
++int main(void) {
++  map<string, string> x;
++  return 0;
++}
+ EOF
+-if $CXX -c $tmp_file.C 2>/dev/null; then
++if $CXX $tmp_file.C 2>/dev/null; then
+   LIBSTDCXX_INCLUDES="";
+   LIBSTDCXX_LIBS="";
+   $echo 'works; no need to make "./libstd"';
++elif $CXX $tmp_file.C -lstdc++ 2>/dev/null; then
++  LIBSTDCXX_INCLUDES="";
++  LIBSTDCXX_LIBS="-lstdc++";
++  $echo 'works with libstdc++; no need to make "./libstd"';
+ else
+   LIBSTDCXX_INCLUDES='-Ilibstd/include';
+   LIBSTDCXX_LIBS='libstd/libstd.a';
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-format_C ./patches/patch-format_C
--- /usr/ports/converters/html2text/patches/patch-format_C	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-format_C	Sat Jan 21 20:58:10 2017
@@ -0,0 +1,113 @@
+$OpenBSD$
+--- format.C.orig	Sun Nov 23 12:05:29 2003
++++ format.C	Sat Jan 21 20:57:57 2017
+@@ -56,6 +56,8 @@ using std::flush;
+ #define nelems(array) (sizeof(array) / sizeof((array)[0]))
+ #endif
+ 
++extern int use_encoding;
++
+ /* ------------------------------------------------------------------------- */
+ 
+ static Line *line_format(const list<auto_ptr<Element> > *elements);
+@@ -560,7 +562,7 @@ Heading::format(Area::size_type w, int halign) const
+     "LEFT",   Area::LEFT,
+     "CENTER", Area::CENTER,
+     "RIGHT",  Area::RIGHT,
+-    0
++    NULL
+   );
+ 
+   static char cell_attributes[7];
+@@ -682,7 +684,7 @@ Paragraph::format(Area::size_type w, int halign) const
+     "LEFT",   Area::LEFT,
+     "CENTER", Area::CENTER,
+     "RIGHT",  Area::RIGHT,
+-    0
++    NULL
+   );
+ 
+   static BlockFormat bf("P");
+@@ -752,7 +754,7 @@ Applet::format(Area::size_type w, int /*halign*/ ) con
+       "LEFT",   Area::LEFT,
+       "MIDDLE", Area::CENTER,
+       "RIGHT",  Area::RIGHT,
+-      0
++      NULL
+     );
+     Area *a = ::format(content.get(), w, halign);
+     if (a) return a;
+@@ -802,7 +804,7 @@ Division::format(Area::size_type w, int halign) const
+     "LEFT",   Area::LEFT,
+     "CENTER", Area::CENTER,
+     "RIGHT",  Area::RIGHT,
+-    0
++    NULL
+   ));
+ }
+ 
+@@ -882,7 +884,7 @@ Input::line_format() const
+     res = '[' + string(size, '*') + ']';
+   } else
+   if (cmp_nocase(type, "CHECKBOX") == 0) {
+-    res = checked ? '*' : LATIN1_ordm; // "ordm" looks like a superscript zero.
++    res = checked ? string("*") : (USE_UTF8 ? string("\u2070") : string(1, LATIN1_ordm)); // "ordm" looks like a superscript zero.
+   } else
+   if (cmp_nocase(type, "RADIO") == 0) {
+     res = checked ? '#' : 'o';
+@@ -1168,10 +1170,13 @@ NoBreak::line_format() const
+   Line *l(::line_format(content.get()));
+   if (!l) return 0;
+ 
++  // don't insert ISO-8859-1 non-breaking spaces, it breaks UTF-8 processing
++  /*
+   for (Line::size_type i = 0; i < l->length(); ++i) {
+     Cell &c((*l)[i]);
+     if (c.character == ' ') c.character = LATIN1_nbsp;
+   }
++  */
+   return l;
+ }
+ 
+@@ -1210,6 +1215,7 @@ make_up(const Line &line, Area::size_type w, int halig
+     }
+ 
+     Line::size_type to = from + 1;
++    int to_from;
+ 
+     Line::size_type lbp = (Line::size_type) -1; // "Last break position".
+ 
+@@ -1238,18 +1244,20 @@ make_up(const Line &line, Area::size_type w, int halig
+         to++;
+       }
+ 
+-      if (to - from > w && lbp != (Area::size_type) -1) { to = lbp; break; }
++      if (line.utf_length(from,to) > w && lbp != (Area::size_type) -1) 
++                    { to = lbp; break; }
+     }
+ 
++    to_from = line.utf_length(from,to);
+     /*
+      * Copy the "from...to" range from the "line" to the bottom of the "res"
+      * Area.
+      */
+     Area::size_type x = 0;
+     Area::size_type len = to - from;
+-    if (halign == Area::LEFT || len >= w) { ;                   } else
+-    if (halign == Area::CENTER)           { x += (w - len) / 2; } else
+-    if (halign == Area::RIGHT)            { x += w - len;       }
++    if (halign == Area::LEFT || to_from >= w) { ;                   } else
++    if (halign == Area::CENTER)           { x += (w - to_from) / 2; } else
++    if (halign == Area::RIGHT)            { x += w - to_from;       }
+     res->insert(line.cells() + from, len, x, res->height());
+ 
+     /*
+@@ -1632,7 +1640,7 @@ ListFormat::get_type(
+     "A",         UPPER_ALPHA,
+     "i",         LOWER_ROMAN,
+     "I",         UPPER_ROMAN,
+-    0
++    NULL
+   );
+ }
+ 
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-html2text_C ./patches/patch-html2text_C
--- /usr/ports/converters/html2text/patches/patch-html2text_C	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-html2text_C	Sat Jan 21 20:58:10 2017
@@ -0,0 +1,400 @@
+$OpenBSD$
+--- html2text.C.orig	Sun Nov 23 12:05:29 2003
++++ html2text.C	Sat Jan 21 20:57:57 2017
+@@ -36,26 +36,41 @@
+ 
+ 
+ #include <iostream>
++#include <fstream>
++#include <sstream>
++#include <algorithm>
++#include <iterator>
+ #include <string.h>
+ #include <stdlib.h>
+ 
++#include <iconv.h>
++#include <errno.h>
++#include <unistd.h>
++#include <langinfo.h>
++
+ #include "html.h"
+ #include "HTMLControl.h"
+-#include "urlistream.h"
++//#include "urlistream.h"
+ #include "format.h"
+ 
+ #define stringify(x) stringify2(x)
+ #define stringify2(x) #x
+ 
+ /* ------------------------------------------------------------------------- */
++using std::ifstream;
++using std::stringstream;
++using std::istream_iterator;
++using std::ostream_iterator;
++using std::noskipws;
+ 
+ class MyParser : public HTMLControl {
+ 
+ public:
+   enum { PRINT_AS_ASCII, UNPARSE, SYNTAX_CHECK };
++  string meta_encoding;
+ 
+   MyParser(
+-    urlistream &is_,
++    istream &is_,
+     bool       debug_scanner_,
+     bool       debug_parser_,
+     ostream    &os_,
+@@ -71,7 +86,7 @@ class MyParser : public HTMLControl { (public)
+   {}
+ 
+ private:
+-  /*virtual*/ void yyerror(char *);
++  /*virtual*/ void yyerror(const char *);
+   /*virtual*/ void process(const Document &);
+ 
+   ostream &os;
+@@ -81,7 +96,7 @@ class MyParser : public HTMLControl { (public)
+ };
+ 
+ /*virtual*/ void
+-MyParser::yyerror(char *p)
++MyParser::yyerror(const char *p)
+ {
+ 
+   /*
+@@ -104,6 +119,23 @@ MyParser::yyerror(char *p)
+ /*virtual*/ void
+ MyParser::process(const Document &document)
+ {
++  // Find the document charset: scan every <meta http-equiv=...> tag and stop
++  // at the first one whose CONTENT attribute carries a "charset=" parameter.
++  list<auto_ptr<Meta> >::const_iterator i;
++  for(i = document.head.metas.begin(); i != document.head.metas.end(); ++i) {
++    bool exists = false;
++    get_attribute(i->get()->attributes.get(), "http-equiv", &exists);
++    if (!exists) continue; // only http-equiv metas are examined
++
++    string content = get_attribute(i->get()->attributes.get(), "content", "");
++    const char to_find[] = "charset=";
++    string::size_type found_pos = content.find(to_find);
++    if (found_pos == string::npos) continue; // e.g. X-UA-Compatible: keep looking
++
++    this->meta_encoding = content.substr(found_pos + sizeof(to_find) - 1);
++    break; // stop only once a charset has actually been found
++  }
++
+   switch (mode) {
+ 
+   case PRINT_AS_ASCII:
+@@ -124,6 +156,72 @@ MyParser::process(const Document &document)
+   }
+ }
+ 
++/*
++ * Recode all of "stream" from "from_encoding" to "to_encoding" with iconv(3).
++ * Returns true and replaces the stream contents on success; on failure prints
++ * a diagnostic to std::cerr, leaves the stream contents alone, returns false.
++ */
++bool recode(stringstream& stream, const char* to_encoding, const char* from_encoding)
++{
++	iconv_t iconv_handle = iconv_open(to_encoding, from_encoding);
++	if (iconv_handle == iconv_t(-1))
++	{
++		if (errno == EINVAL)
++		{
++			std::cerr << "Recoding from '" << from_encoding
++				<< "' to '" << to_encoding << "' is not available." << std::endl;
++			std::cerr << "Check that '" << from_encoding
++				<< "' is a valid encoding." << std::endl;
++		}
++		else
++		{
++			std::cerr << "Error: cannot setup recoding." << std::endl;
++		}
++		return false;
++	}
++
++	stream.seekg(0);
++	const string input_string = stream.str();
++	size_t input_size = input_string.size();
++	// Copy by length, not with strcpy(): the document may contain NUL bytes and
++	// string::data() is not guaranteed to be NUL-terminated before C++11.
++	char* raw_input = new char[input_size+1];
++	char* const orig_raw_input = raw_input;
++	memcpy(raw_input, input_string.data(), input_size);
++	raw_input[input_size] = '\0'; // lets the unconverted tail be printed below
++	size_t max_output_size = input_size * 4; // maximum possible overhead
++	char* raw_output = new char[max_output_size+1];
++	char* const orig_raw_output = raw_output;
++	const size_t iconv_value =
++		iconv(iconv_handle, &raw_input, &input_size, &raw_output, &max_output_size);
++	const int iconv_errno = errno; // save it before stream I/O can clobber errno
++
++	if (iconv_value != (size_t)-1)
++	{
++		// iconv() advanced raw_output; keep exactly the bytes it produced.
++		stream.str(string(orig_raw_output, raw_output - orig_raw_output));
++	}
++	else
++	{
++		std::cerr << "Input recoding failed due to ";
++		if (iconv_errno == EILSEQ)
++		{
++			std::cerr << "invalid input sequence. Unconverted part of text follows." << std::endl;
++			std::cerr << raw_input;
++		}
++		else
++		{
++			std::cerr << "unknown reason.";
++		}
++		std::cerr << std::endl;
++	}
++
++	delete [] orig_raw_input;
++	delete [] orig_raw_output;
++	iconv_close(iconv_handle);
++	return iconv_value != (size_t)-1;
++}
++
+ /* ------------------------------------------------------------------------- */
+ 
+ static const char *usage = "\
+@@ -132,7 +230,7 @@ Usage:\n\
+   html2text -version\n\
+   html2text [ -unparse | -check ] [ -debug-scanner ] [ -debug-parser ] \\\n\
+      [ -rcfile <file> ] [ -style ( compact | pretty ) ] [ -width <w> ] \\\n\
+-     [ -o <file> ] [ -nobs ] [ -ascii ] [ <input-url> ] ...\n\
++     [ -o <file> ] [ -nobs ] [ -ascii | -utf8 ] [ <input-url> ] ...\n\
+ Formats HTML document(s) read from <input-url> or STDIN and generates ASCII\n\
+ text.\n\
+   -help          Print this text and exit\n\
+@@ -148,9 +246,11 @@ text.\n\
+   -o <file>      Redirect output into <file>\n\
+   -nobs          Do not use backspaces for boldface and underlining\n\
+   -ascii         Use plain ASCII for output instead of ISO-8859-1\n\
++  -utf8          Assume both terminal and input stream are in UTF-8 mode\n\
++  -nometa        Don't try to recode input using 'meta' tag\n\
+ ";
+ 
+-int use_iso8859 = 1;
++int use_encoding = ISO8859;
+ 
+ int
+ main(int argc, char **argv)
+@@ -184,22 +284,25 @@ main(int argc, char **argv)
+   const char *style            = "compact";
+   int        width             = 79;
+   const char *output_file_name = "-";
+-  bool       use_backspaces    = true;
++  bool       use_backspaces    = false;
++  bool       use_meta          = true;
+ 
+   int i;
+   for (i = 1; i < argc && argv[i][0] == '-' && argv[i][1]; i++) {
+     const char *arg = argv[i];
+ 
+-    if (!strcmp(arg, "-unparse"      )) { mode = MyParser::UNPARSE;      } else
+-    if (!strcmp(arg, "-check"        )) { mode = MyParser::SYNTAX_CHECK; } else
+-    if (!strcmp(arg, "-debug-scanner")) { debug_scanner = true;          } else
+-    if (!strcmp(arg, "-debug-parser" )) { debug_parser = true;           } else
+-    if (!strcmp(arg, "-rcfile"       )) { rcfile = argv[++i];            } else
+-    if (!strcmp(arg, "-style"        )) { style = argv[++i];             } else
+-    if (!strcmp(arg, "-width"        )) { width = atoi(argv[++i]);       } else
+-    if (!strcmp(arg, "-o"            )) { output_file_name = argv[++i];  } else
+-    if (!strcmp(arg, "-nobs"         )) { use_backspaces = false;        } else
+-    if (!strcmp(arg, "-ascii"        )) { use_iso8859 = false;           } else
++    if (!strcmp(arg, "-unparse"      )) { mode = MyParser::UNPARSE;      } else
++    if (!strcmp(arg, "-check"        )) { mode = MyParser::SYNTAX_CHECK; } else
++    if (!strcmp(arg, "-debug-scanner")) { debug_scanner = true;          } else
++    if (!strcmp(arg, "-debug-parser" )) { debug_parser = true;           } else
++    if (!strcmp(arg, "-rcfile") && i + 1 < argc) { rcfile = argv[++i];           } else // Options that take a value
++    if (!strcmp(arg, "-style" ) && i + 1 < argc) { style = argv[++i];            } else // match only when a value
++    if (!strcmp(arg, "-width" ) && i + 1 < argc) { if (atoi(argv[++i]) > 0) width = atoi(argv[i]); } else // follows:
++    if (!strcmp(arg, "-o"     ) && i + 1 < argc) { output_file_name = argv[++i]; } else // argv[argc] is NULL, so a
++    if (!strcmp(arg, "-nobs"         )) { use_backspaces = false;        } else // bare trailing option must fall
++    if (!strcmp(arg, "-ascii"        )) { use_encoding = ASCII;          } else // through to the error branch
++    if (!strcmp(arg, "-utf8"         )) { use_encoding = UTF8;           } else // below instead of being used.
++    if (!strcmp(arg, "-nometa"       )) { use_meta = false;              } else
+     {
+       std::cerr
+ 	<< "Unrecognized command line option \""
+@@ -329,8 +432,13 @@ main(int argc, char **argv)
+   ostream  *osp;
+   std::ofstream ofs;
+ 
++  bool output_is_tty = false;
+   if (!strcmp(output_file_name, "-")) {
+     osp = &std::cout;
++	if (isatty(1 /* stdout */))
++	{
++		output_is_tty = true;
++	}
+   } else {
+     ofs.open(output_file_name, std::ios::out);
+     if (!ofs) {
+@@ -352,30 +460,145 @@ main(int argc, char **argv)
+     }
+ 
+     istream    *isp;
+-    urlistream uis;
++    istream    *uis;
++	ifstream* infile = NULL;
++	stringstream input_stream;
+ 
+-    uis.open(input_url);
+-    if (!uis.is_open()) {
+-      std::cerr
+-        << "Opening input URL \""
+-	<< input_url
+-        << "\": "
+-        << uis.open_error()
+-        << std::endl;
+-      exit(1);
++	if (strcmp(input_url, "-") == 0)
++	{
++		uis = &std::cin;
++	}
++	else
++	{
++		infile = new ifstream(input_url);
++		if (!infile->is_open())
++		{
++		  delete infile;
++		  std::cerr
++			<< "Cannot open input file \""
++			<< input_url
++			<< "\"."
++			<< std::endl;
++		  exit(1);
++		}
++		uis = infile;
+     }
+ 
+-    MyParser parser(
+-      uis,
+-      debug_scanner,
+-      debug_parser,
+-      *osp,
+-      mode,
+-      width,
+-      input_url
+-    );
++	*uis >> noskipws;
++	std::copy(istream_iterator<char>(*uis), istream_iterator<char>(), ostream_iterator<char>(input_stream));
+ 
++	if (infile)
++	{
++		infile->close();
++		delete infile;
++	}
++
++	string from_encoding;
++	if (use_meta)
++	{
++		std::ofstream fake_osp("/dev/null");
++		// fake parsing to determine meta
++		MyParser parser(
++		  input_stream,
++		  debug_scanner,
++		  debug_parser,
++		  fake_osp,
++		  mode,
++		  width,
++		  input_url
++        );
++		if (parser.yyparse() != 0) exit(1);
++
++		from_encoding = parser.meta_encoding;
++
++		// don't need to debug twice ...
++		debug_scanner = false;
++		debug_parser = false;
++
++		/*
++		 * It will be good to show warning in this case. But there are too many
++		 * html documents without encoding info, so this branch is commented by
++		 * now.
++		if (parser.meta_encoding.empty())
++		{
++			std::cerr << "Warning: cannot determine encoding from html file." << std::endl;
++			std::cerr << "To remove this warning, use '-nometa' option with, optionally, '-utf8' or '-ascii' options" << std::endl;
++			std::cerr << "to process file \"" << input_url << "\"." << std::endl;
++		}
++		*/
++	}
++	// The real parse below switches the global use_encoding to UTF8, so latch
++	// the user's choice the first time through the loop; otherwise every file
++	// after the first one would be assumed to be UTF-8 when it has no charset.
++	static const int requested_encoding = use_encoding;
++	if (from_encoding.empty()) // -nometa supplied or no appropriate tag
++	{
++		if (requested_encoding == UTF8)
++		{
++			from_encoding = "UTF-8";
++		}
++		else if (requested_encoding != ASCII)
++		{
++			from_encoding = "ISO_8859-1";
++		}
++		// in ASCII mode we don't need recoding at all: leave it empty
++	}
++
++	bool result = true;
++	if (!from_encoding.empty())
++	{
++		// recode input
++		result = recode(input_stream, "UTF-8", from_encoding.data());
++	}
++	if (!result)
++	{
++		continue;
++	}
++
++    if (number_of_input_urls != 1) {
++      *osp << "###### " << input_url << " ######" << std::endl;
++    }
++
++	// real parsing now always process UTF-8 (except for ASCII mode)
++	if (use_encoding != ASCII)
++	{
++		use_encoding = UTF8;
++	}
++
++	stringstream output_stream;
++
++	// real parsing
++	input_stream.clear();
++	input_stream.seekg(0);
++	MyParser parser(
++	  input_stream,
++	  debug_scanner,
++	  debug_parser,
++	  output_stream,
++	  mode,
++	  width,
++	  input_url
++	);
+     if (parser.yyparse() != 0) exit(1);
++
++	// recode output if output is terminal
++	if (output_is_tty)
++	{
++		setlocale(LC_CTYPE,"");
++		// Build the target name in a std::string: the codeset name comes from
++		// the environment, so don't strcpy()/strcat() it into a fixed buffer.
++		string output_encoding(nl_langinfo(CODESET));
++		output_encoding += "//translit";
++		result = recode(output_stream, output_encoding.c_str(), "UTF-8");
++		if (!result)
++		{
++			continue;
++		}
++	}
++	output_stream.clear();
++	output_stream.seekg(0);
++	output_stream >> noskipws;
++	std::copy(istream_iterator<char>(output_stream), istream_iterator<char>(), ostream_iterator<char>(*osp));
+   }
+ 
+   return 0;
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-html_C ./patches/patch-html_C
--- /usr/ports/converters/html2text/patches/patch-html_C	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-html_C	Sat Jan 21 20:58:10 2017
@@ -0,0 +1,28 @@
+$OpenBSD$
+--- html.C.orig	Sun Nov 23 12:05:29 2003
++++ html.C	Sat Jan 21 20:57:57 2017
+@@ -68,6 +68,7 @@ static pack(Option)
+ static pack(DefinitionListItem)
+ static pack(Script)
+ static pack(Style)
++static pack(Meta)
+ 
+ #undef pack
+ 
+@@ -131,9 +132,15 @@ Head::unparse(ostream &os, ostream_manipulator separat
+   if (base_attributes.get()) os << "<BASE" << base_attributes << ">" << std::endl;
+   foreach(scripts, os, separator);
+   foreach(styles, os, separator);
+-  if (meta_attributes.get()) os << "<META" << meta_attributes << ">" << std::endl;
++  foreach(metas, os, separator);
+   if (link_attributes.get()) os << "<LINK" << link_attributes << ">" << std::endl;
+   os << "</HEAD>" << separator;
++}
++
++void
++Meta::unparse(ostream &os, ostream_manipulator separator) const
++{
++  os << "<META" << attributes << ">" << separator;
+ }
+ 
+ void
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-html_h ./patches/patch-html_h
--- /usr/ports/converters/html2text/patches/patch-html_h	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-html_h	Sat Jan 21 20:58:10 2017
@@ -0,0 +1,36 @@
+$OpenBSD$
+--- html.h.orig	Thu Oct  4 22:03:54 2001
++++ html.h	Sat Jan 21 20:57:57 2017
+@@ -61,6 +61,11 @@
+ 
+ /* ------------------------------------------------------------------------- */
+ 
++enum {ASCII, ISO8859, UTF8};
++#define USE_ISO8859 (use_encoding == ISO8859)
++#define USE_ASCII (use_encoding == ASCII)
++#define USE_UTF8 (use_encoding == UTF8)
++
+ #define LATIN1_nbsp   160
+ #define LATIN1_iexcl  161
+ #define LATIN1_cent   162
+@@ -431,13 +436,19 @@ struct Style {
+   void unparse(ostream &, ostream_manipulator separator) const;
+ };
+ 
++struct Meta {
++  auto_ptr<list<TagAttribute> > attributes;    // HTTP-EQUIV NAME CONTENT
++
++  void unparse(ostream &, ostream_manipulator separator) const;
++};
++
+ struct Head {
+   auto_ptr<PCData>              title;
+   auto_ptr<list<TagAttribute> > isindex_attributes; // PROMPT
+   auto_ptr<list<TagAttribute> > base_attributes;    // HREF
+   list<auto_ptr<Script> >       scripts;
+   list<auto_ptr<Style> >        styles;
+-  auto_ptr<list<TagAttribute> > meta_attributes;    // HTTP-EQUIV NAME CONTENT
++  list<auto_ptr<Meta> >         metas;
+   auto_ptr<list<TagAttribute> > link_attributes;    // HREF REL REV TITLE
+ 
+   void unparse(ostream &, ostream_manipulator separator) const;
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-sgml_C ./patches/patch-sgml_C
--- /usr/ports/converters/html2text/patches/patch-sgml_C	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-sgml_C	Sat Jan 21 20:58:10 2017
@@ -0,0 +1,573 @@
+$OpenBSD$
+--- sgml.C.orig	Sun Nov 23 12:09:11 2003
++++ sgml.C	Sat Jan 21 20:57:57 2017
+@@ -61,262 +61,281 @@
+ static const struct TextToInt {
+   char name[8];
+   int  iso8859code;
+-  char *asciistr;
++  const char *asciistr;
++  unsigned long unicode;
+ } entities[] = {
+-  { "AElig",   LATIN1_AElig,  "AE"         },
+-  { "AMP",     0,             "&"          },
+-  { "Aacute",  LATIN1_Aacute, "A'"         },
+-  { "Acirc",   LATIN1_Acirc,  "A^"         },
+-  { "Agrave",  LATIN1_Agrave, "A`"         },
+-  { "Alpha",   0,             "A"          },
+-  { "Aring",   LATIN1_Aring,  "AA"         },
+-  { "Atilde",  LATIN1_Atilde, "A~"         },
+-  { "Auml",    LATIN1_Auml,   "A\""        },
+-  { "Beta",    0,             "B"          },
+-  { "Ccedil",  LATIN1_Ccedil, "C,"         },
+-  { "Chi",     0,             "H"          },
+-  { "Dagger",  0,             "++"         },
+-  { "Delta",   0,             "D"          },
+-  { "ETH",     LATIN1_ETH,    "D-"         },
+-  { "Eacute",  LATIN1_Eacute, "E'"         },
+-  { "Ecirc",   LATIN1_Ecirc,  "E^"         },
+-  { "Egrave",  LATIN1_Egrave, "E`"         },
+-  { "Epsilon", 0,             "E"          },
+-  { "Eta",     0,             "E"          },
+-  { "Euml",    LATIN1_Euml,   "E\""        },
+-  { "GT",      0,             ">"          },
+-  { "Gamma",   0,             "G"          },
+-  { "Iacute",  LATIN1_Iacute, "I'"         },
+-  { "Icirc",   LATIN1_Icirc,  "I^"         },
+-  { "Igrave",  LATIN1_Igrave, "I`"         },
+-  { "Iota",    0,             "I"          },
+-  { "Iuml",    LATIN1_Iuml,   "I\""        },
+-  { "Kappa",   0,             "K"          },
+-  { "LT",      0,             "<"          },
+-  { "Lambda",  0,             "L"          },
+-  { "Mu",      0,             "M"          },
+-  { "Ntilde",  LATIN1_Ntilde, "N~"         },
+-  { "Nu",      0,             "N"          },
+-  { "OElig",   0,             "OE"         },
+-  { "Oacute",  LATIN1_Oacute, "O'"         },
+-  { "Ocirc",   LATIN1_Ocirc,  "O^"         },
+-  { "Ograve",  LATIN1_Ograve, "O`"         },
+-  { "Omega",   0,             "O"          },
+-  { "Omicron", 0,             "O"          },
+-  { "Oslash",  LATIN1_Oslash, "O/"         },
+-  { "Otilde",  LATIN1_Otilde, "O~"         },
+-  { "Ouml",    LATIN1_Ouml,   "O\""        },
+-  { "Phi",     0,             "F"          },
+-  { "Pi",      0,             "P"          },
+-  { "Prime",   0,             "''"         },
+-  { "Psi",     0,             "PS"         },
+-  { "QUOT",    0,             "\""         },
+-  { "Rho",     0,             "R"          },
+-  { "Scaron",  0,             "S"          },
+-  { "Sigma",   0,             "S"          },
+-  { "THORN",   LATIN1_THORN,  "TH"         },
+-  { "Tau",     0,             "T"          },
+-  { "Theta",   0,             "TH"         },
+-  { "Uacute",  LATIN1_Uacute, "U'"         },
+-  { "Ucirc",   LATIN1_Ucirc,  "U^"         },
+-  { "Ugrave",  LATIN1_Ugrave, "U`"         },
+-  { "Upsilon", 0,             "U"          },
+-  { "Uuml",    LATIN1_Uuml,   "U\""        },
+-  { "Xi",      0,             "X"          },
+-  { "Yacute",  LATIN1_Yacute, "Y'"         },
+-  { "Yuml",    0,             "Y\""        },
+-  { "Zeta",    0,             "Z"          },
+-  { "aacute",  LATIN1_aacute, "a'"         },
+-  { "acirc",   LATIN1_acirc,  "a^"         },
+-  { "acute",   LATIN1_acute,  "'"          },
+-  { "aelig",   LATIN1_aelig,  "ae"         },
+-  { "agrave",  LATIN1_agrave, "a`"         },
+-  { "alefsym", 0,             "Aleph"      },
+-  { "alpha",   0,             "a"          },
+-  { "amp",     0,             "&"          },
+-  { "and",     0,             "AND"        },
+-  { "ang",     0,             "-V"         },
+-  { "apos",    0,             "'"          },
+-  { "aring",   LATIN1_aring,  "aa"         },
+-  { "asymp",   0,             "~="         },
+-  { "atilde",  LATIN1_atilde, "a~"         },
+-  { "auml",    LATIN1_auml,   "a\""        },
+-  { "bdquo",   0,             "\""         },
+-  { "beta",    0,             "b"          },
+-  { "brvbar",  LATIN1_brvbar, "|"          },
+-  { "bull",    0,             " o "        },
+-  { "cap",     0,             "(U"         },
+-  { "ccedil",  LATIN1_ccedil, "c,"         },
+-  { "cedil",   LATIN1_cedil,  ","          },
+-  { "cent",    LATIN1_cent,   "-c-"        },
+-  { "chi",     0,             "h"          },
+-  { "circ",    0,             "^"          },
++  { "AElig",   LATIN1_AElig,  "AE",  0x00c6},
++  { "AMP",     0,             "&",   0x0026},
++  { "Aacute",  LATIN1_Aacute, "A'",  0x00c1},
++  { "Acirc",   LATIN1_Acirc,  "A^",  0x00c2},
++  { "Agrave",  LATIN1_Agrave, "A`",  0x00c0},
++  { "Alpha",   0,             "A",   0x0391},
++  { "Aring",   LATIN1_Aring,  "AA",  0x00c5},
++  { "Atilde",  LATIN1_Atilde, "A~",  0x00c3},
++  { "Auml",    LATIN1_Auml,   "A\"", 0x00c4},
++  { "Beta",    0,             "B",   0x0392},
++  { "Ccedil",  LATIN1_Ccedil, "C,",  0x00c7},
++  { "Chi",     0,             "H",   0x03a7},
++  { "Dagger",  0,             "++",  0x2021},
++  { "Delta",   0,             "D",   0x0394},
++  { "ETH",     LATIN1_ETH,    "D-",  0x00d0},
++  { "Eacute",  LATIN1_Eacute, "E'",  0x00c9},
++  { "Ecirc",   LATIN1_Ecirc,  "E^",  0x00ca},
++  { "Egrave",  LATIN1_Egrave, "E`",  0x00c8},
++  { "Epsilon", 0,             "E",   0x0395},
++  { "Eta",     0,             "E",   0x0397},
++  { "Euml",    LATIN1_Euml,   "E\"", 0x00cb},
++  { "GT",      0,             ">",   0x003e},
++  { "Gamma",   0,             "G",   0x0393},
++  { "Iacute",  LATIN1_Iacute, "I'",  0x00cd},
++  { "Icirc",   LATIN1_Icirc,  "I^",  0x00ce},
++  { "Igrave",  LATIN1_Igrave, "I`",  0x00cc},
++  { "Iota",    0,             "I",   0x0399},
++  { "Iuml",    LATIN1_Iuml,   "I\"", 0x00cf},
++  { "Kappa",   0,             "K",   0x039a},
++  { "LT",      0,             "<",   0x003c},
++  { "Lambda",  0,             "L",   0x039b},
++  { "Mu",      0,             "M",   0x039c},
++  { "Ntilde",  LATIN1_Ntilde, "N~",  0x00d1},
++  { "Nu",      0,             "N",   0x039d},
++  { "OElig",   0,             "OE",  0x0152},
++  { "Oacute",  LATIN1_Oacute, "O'",  0x00d3},
++  { "Ocirc",   LATIN1_Ocirc,  "O^",  0x00d4},
++  { "Ograve",  LATIN1_Ograve, "O`",  0x00d2},
++  { "Omega",   0,             "O",   0x03a9},
++  { "Omicron", 0,             "O",   0x039f},
++  { "Oslash",  LATIN1_Oslash, "O/",  0x00d8},
++  { "Otilde",  LATIN1_Otilde, "O~",  0x00d5},
++  { "Ouml",    LATIN1_Ouml,   "O\"", 0x00d6},
++  { "Phi",     0,             "F",   0x03a6},
++  { "Pi",      0,             "P",   0x03a0},
++  { "Prime",   0,             "''",  0x2033},
++  { "Psi",     0,             "PS",  0x03a8},
++  { "QUOT",    0,             "\"",  0x0022},
++  { "Rho",     0,             "R",   0x03a1},
++  { "Scaron",  0,             "S",   0x0160},
++  { "Sigma",   0,             "S",   0x03a3},
++  { "THORN",   LATIN1_THORN,  "TH",  0x00de},
++  { "Tau",     0,             "T",   0x03a4},
++  { "Theta",   0,             "TH",  0x0398},
++  { "Uacute",  LATIN1_Uacute, "U'",  0x00da},
++  { "Ucirc",   LATIN1_Ucirc,  "U^",  0x00db},
++  { "Ugrave",  LATIN1_Ugrave, "U`",  0x00d9},
++  { "Upsilon", 0,             "U",   0x03a5},
++  { "Uuml",    LATIN1_Uuml,   "U\"", 0x00dc},
++  { "Xi",      0,             "X",   0x039e},
++  { "Yacute",  LATIN1_Yacute, "Y'",  0x00dd},
++  { "Yuml",    0,             "Y\"", 0x0178},
++  { "Zeta",    0,             "Z",   0x0396},
++  { "aacute",  LATIN1_aacute, "a'",  0x00e1},
++  { "acirc",   LATIN1_acirc,  "a^",  0x00e2},
++  { "acute",   LATIN1_acute,  "'",   0x00b4},
++  { "aelig",   LATIN1_aelig,  "ae",  0x00e6},
++  { "agrave",  LATIN1_agrave, "a`",  0x00e0},
++  { "alefsym", 0,             "Aleph",0x2135},
++  { "alpha",   0,             "a",   0x03b1},
++  { "amp",     0,             "&",   0x0026},
++  { "and",     0,             "AND", 0x2227},
++  { "ang",     0,             "-V",  0x2220},
++  { "apos",    0,             "'",   0x0027},
++  { "aring",   LATIN1_aring,  "aa",  0x00e5},
++  { "asymp",   0,             "~=",  0x2248},
++  { "atilde",  LATIN1_atilde, "a~",  0x00e3},
++  { "auml",    LATIN1_auml,   "a\"", 0x00e4},
++  { "bdquo",   0,             "\"",  0x201e},
++  { "beta",    0,             "b",   0x03b2},
++  { "brvbar",  LATIN1_brvbar, "|",   0x00a6},
++  { "bull",    0,             " o ", 0x2022},
++  { "cap",     0,             "(U",  0x2229},
++  { "ccedil",  LATIN1_ccedil, "c,",  0x00e7},
++  { "cedil",   LATIN1_cedil,  ",",   0x00b8},
++  { "cent",    LATIN1_cent,   "-c-", 0x00a2},
++  { "chi",     0,             "h",   0x03c7},
++  { "circ",    0,             "^",   0x02c6},
+ //  { "clubs",   0,             "[clubs]"    },
+-  { "cong",    0,             "?="         },
+-  { "copy",    LATIN1_copy,   "(c)"        },
+-  { "crarr",   0,             "<-'"        },
+-  { "cup",     0,             ")U"         },
+-  { "curren",  LATIN1_curren, "CUR"        },
+-  { "dArr",    0,             "vv"         },
+-  { "dagger",  0,             "+"          },
+-  { "darr",    0,             "v"          },
+-  { "deg",     LATIN1_deg,    "DEG"        },
+-  { "delta",   0,             "d"          },
++  { "cong",    0,             "?=",  0x2245},
++  { "copy",    LATIN1_copy,   "(c)", 0x00a9},
++  { "crarr",   0,             "<-'", 0x21b5},
++  { "cup",     0,             ")U",  0x222a},
++  { "curren",  LATIN1_curren, "CUR", 0x00a4},
++  { "dArr",    0,             "vv",  0x21d3},
++  { "dagger",  0,             "+",   0x2020},
++  { "darr",    0,             "v",   0x2193},
++  { "deg",     LATIN1_deg,    "DEG", 0x00b0},
++  { "delta",   0,             "d",   0x03b4},
+ //  { "diams",   0,             "[diamonds]" },
+-  { "divide",  LATIN1_divide, "/"          },
+-  { "eacute",  LATIN1_eacute, "e'"         },
+-  { "ecirc",   LATIN1_ecirc,  "e^"         },
+-  { "egrave",  LATIN1_egrave, "e`"         },
+-  { "empty",   0,             "{}"         },
+-  { "epsilon", 0,             "e"          },
+-  { "equiv",   0,             "=="         },
+-  { "eta",     0,             "e"          },
+-  { "eth",     LATIN1_eth,    "d-"         },
+-  { "euml",    LATIN1_euml,   "e\""        },
+-  { "euro",    0,             "EUR"        },
+-  { "exist",   0,             "TE"         },
+-  { "fnof",    0,             "f"          },
+-  { "forall",  0,             "FA"         },
+-  { "frac12",  LATIN1_frac12, " 1/2"       },
+-  { "frac14",  LATIN1_frac14, " 1/4"       },
+-  { "frac34",  LATIN1_frac34, " 3/4"       },
+-  { "frasl",   0,             "/"          },
+-  { "gamma",   0,             "g"          },
+-  { "ge",      0,             ">="         },
+-  { "gt",      0,             ">"          },
+-  { "hArr",    0,             "<=>"        },
+-  { "harr",    0,             "<->"        },
++  { "divide",  LATIN1_divide, "/",   0x00f7},
++  { "eacute",  LATIN1_eacute, "e'",  0x00e9},
++  { "ecirc",   LATIN1_ecirc,  "e^",  0x00ea},
++  { "egrave",  LATIN1_egrave, "e`",  0x00e8},
++  { "empty",   0,             "{}",  0x2205},
++  { "epsilon", 0,             "e",   0x03b5},
++  { "equiv",   0,             "==",  0x2261},
++  { "eta",     0,             "e",   0x03b7},
++  { "eth",     LATIN1_eth,    "d-",  0x00f0},
++  { "euml",    LATIN1_euml,   "e\"", 0x00eb},
++  { "euro",    0,             "EUR", 0x20ac},
++  { "exist",   0,             "TE",  0x2203},
++  { "fnof",    0,             "f",   0x0192},
++  { "forall",  0,             "FA",  0x2200},
++  { "frac12",  LATIN1_frac12, " 1/2",0x00bd},
++  { "frac14",  LATIN1_frac14, " 1/4",0x00bc},
++  { "frac34",  LATIN1_frac34, " 3/4",0x00be},
++  { "frasl",   0,             "/" ,  0x2044},
++  { "gamma",   0,             "g",   0x03b3},
++  { "ge",      0,             ">=",  0x2265},
++  { "gt",      0,             ">",   0x003e},
++  { "hArr",    0,             "<=>", 0x21d4},
++  { "harr",    0,             "<->", 0x2194},
+ //  { "hearts",  0,             "[hearts]"   },
+-  { "hellip",  0,             "..."        },
+-  { "iacute",  LATIN1_iacute, "i'"         },
+-  { "icirc",   LATIN1_icirc,  "i^"         },
+-  { "iexcl",   LATIN1_iexcl,  "!"          },
+-  { "igrave",  LATIN1_igrave, "i`"         },
+-  { "image",   0,             "Im"         },
+-  { "infin",   0,             "oo"         },
+-  { "int",     0,             "INT"        },
+-  { "iota",    0,             "i"          },
+-  { "iquest",  LATIN1_iquest, "?"          },
+-  { "isin",    0,             "(-"         },
+-  { "iuml",    LATIN1_iuml,   "i\""        },
+-  { "kappa",   0,             "k"          },
+-  { "lArr",    0,             "<="         },
+-  { "lambda",  0,             "l"          },
+-  { "lang",    0,             "</"         },
+-  { "laquo",   LATIN1_laquo,  "<<"         },
+-  { "larr",    0,             "<-"         },
++  { "hellip",  0,             "...", 0x2026},
++  { "iacute",  LATIN1_iacute, "i'",  0x00ed},
++  { "icirc",   LATIN1_icirc,  "i^",  0x00ee},
++  { "iexcl",   LATIN1_iexcl,  "!",   0x00a1},
++  { "igrave",  LATIN1_igrave, "i`",  0x00ec},
++  { "image",   0,             "Im",  0x2111},
++  { "infin",   0,             "oo",  0x221e},
++  { "int",     0,             "INT", 0x222b},
++  { "iota",    0,             "i",   0x03b9},
++  { "iquest",  LATIN1_iquest, "?",   0x00bf},
++  { "isin",    0,             "(-",  0x2208},
++  { "iuml",    LATIN1_iuml,   "i\"", 0x00ef},
++  { "kappa",   0,             "k",   0x03ba},
++  { "lArr",    0,             "<=",  0x21d0},
++  { "lambda",  0,             "l",   0x03bb},
++  { "lang",    0,             "</",  0x2329},
++  { "laquo",   LATIN1_laquo,  "<<",  0x00ab},
++  { "larr",    0,             "<-",  0x2190},
+ //  { "lceil",   0,             "<|"         },
+-  { "ldquo",   0,             "\""         },
+-  { "le",      0,             "<="         },
++  { "ldquo",   0,             "\"",  0x201c},
++  { "le",      0,             "<=",  0x2264},
+ //  { "lfloor",  0,             "|<"         },
+-  { "lowast",  0,             "*"          },
+-  { "loz",     0,             "<>"         },
+-  { "lsaquo",  0,             "<"          },
+-  { "lsquo",   0,             "`"          },
+-  { "lt",      0,             "<"          },
+-  { "macr",    LATIN1_macr,   "-"          },
+-  { "mdash",   0,             "--"         },
+-  { "micro",   LATIN1_micro,  "my"         },
+-  { "middot",  LATIN1_middot, "."          },
+-  { "minus",   0,             "-"          },
+-  { "mu",      0,             "m"          },
+-  { "nabla",   0,             "Nabla"      },
+-  { "nbsp",    LATIN1_nbsp,   " "          },
+-  { "ndash",   0,             "-"          },
+-  { "ne",      0,             "!="         },
+-  { "ni",      0,             "-)"         },
+-  { "not",     LATIN1_not,    "NOT"        },
+-  { "notin",   0,             "!(-"        },
+-  { "nsub",    0,             "!(C"        },
+-  { "ntilde",  LATIN1_ntilde, "n~"         },
+-  { "nu",      0,             "n"          },
+-  { "oacute",  LATIN1_oacute, "o'"         },
+-  { "ocirc",   LATIN1_ocirc,  "o^"         },
+-  { "oelig",   0,             "oe"         },
+-  { "ograve",  LATIN1_ograve, "o`"         },
+-  { "oline",   LATIN1_macr,   "-"          },
+-  { "omega",   0,             "o"          },
+-  { "omicron", 0,             "o"          },
+-  { "oplus",   0,             "(+)"        },
+-  { "or",      0,             "OR"         },
+-  { "ordf",    LATIN1_ordf,   "-a"         },
+-  { "ordm",    LATIN1_ordm,   "-o"         },
+-  { "oslash",  LATIN1_oslash, "o/"         },
+-  { "otilde",  LATIN1_otilde, "o~"         },
+-  { "otimes",  0,             "(x)"        },
+-  { "ouml",    LATIN1_ouml,   "o\""        },
+-  { "para",    LATIN1_para,   "P:"         },
+-  { "part",    0,             "PART"       },
+-  { "permil",  0,             " 0/00"      },
+-  { "perp",    0,             "-T"         },
+-  { "phi",     0,             "f"          },
+-  { "pi",      0,             "p"          },
+-  { "piv",     0,             "Pi"         },
+-  { "plusmn",  LATIN1_plusmn, "+/-"        },
+-  { "pound",   LATIN1_pound,  "-L-"        },
+-  { "prime",   0,             "'"          },
+-  { "prod",    0,             "PROD"       },
+-  { "prop",    0,             "0("         },
+-  { "psi",     0,             "ps"         },
+-  { "quot",    0,             "\""         },
+-  { "rArr",    0,             "=>"         },
+-  { "radic",   0,             "SQRT"       },
+-  { "rang",    0,             "/>"         },
+-  { "raquo",   LATIN1_raquo,  ">>"         },
+-  { "rarr",    0,             "->"         },
++  { "lowast",  0,             "*",   0x2217},
++  { "loz",     0,             "<>",  0x25ca},
++  { "lsaquo",  0,             "<",   0x2039},
++  { "lsquo",   0,             "`",   0x2018},
++  { "lt",      0,             "<",   0x003c},
++  { "macr",    LATIN1_macr,   "-",   0x00af},
++  { "mdash",   0,             "--",  0x2014},
++  { "micro",   LATIN1_micro,  "my",  0x00b5},
++  { "middot",  LATIN1_middot, ".",   0x00b7},
++  { "minus",   0,             "-",   0x2212},
++  { "mu",      0,             "m",   0x03bc},
++  { "nabla",   0,             "Nabla",0x2207},
++  { "nbsp",    LATIN1_nbsp,   " ",   0x00a0},
++  { "ndash",   0,             "-",   0x2013},
++  { "ne",      0,             "!=",  0x2260},
++  { "ni",      0,             "-)",  0x220b},
++  { "not",     LATIN1_not,    "NOT", 0x00ac},
++  { "notin",   0,             "!(-", 0x2209},
++  { "nsub",    0,             "!(C", 0x2284},
++  { "ntilde",  LATIN1_ntilde, "n~",  0x00f1},
++  { "nu",      0,             "n",   0x03bd},
++  { "oacute",  LATIN1_oacute, "o'",  0x00f3},
++  { "ocirc",   LATIN1_ocirc,  "o^",  0x00f4},
++  { "oelig",   0,             "oe",  0x0153},
++  { "ograve",  LATIN1_ograve, "o`",  0x00f2},
++  { "oline",   LATIN1_macr,   "-",   0x203e},
++  { "omega",   0,             "o",   0x03c9},
++  { "omicron", 0,             "o",   0x03bf},
++  { "oplus",   0,             "(+)", 0x2295},
++  { "or",      0,             "OR",  0x2228},
++  { "ordf",    LATIN1_ordf,   "-a",  0x00aa},
++  { "ordm",    LATIN1_ordm,   "-o",  0x00ba},
++  { "oslash",  LATIN1_oslash, "o/",  0x00f8},
++  { "otilde",  LATIN1_otilde, "o~",  0x00f5},
++  { "otimes",  0,             "(x)", 0x2297},
++  { "ouml",    LATIN1_ouml,   "o\"", 0x00f6},
++  { "para",    LATIN1_para,   "P:",  0x00b6},
++  { "part",    0,             "PART",0x2202},
++  { "permil",  0,             " 0/00",0x2030},
++  { "perp",    0,             "-T",  0x22a5},
++  { "phi",     0,             "f",   0x03c6},
++  { "pi",      0,             "p",   0x03c0},
++  { "piv",     0,             "Pi",  0x03d6},
++  { "plusmn",  LATIN1_plusmn, "+/-", 0x00b1},
++  { "pound",   LATIN1_pound,  "-L-", 0x00a3},
++  { "prime",   0,             "'",   0x2032},
++  { "prod",    0,             "PROD",0x220f},
++  { "prop",    0,             "0(",  0x221d},
++  { "psi",     0,             "ps",  0x03c8},
++  { "quot",    0,             "\"",  0x0022},
++  { "rArr",    0,             "=>",  0x21d2},
++  { "radic",   0,             "SQRT",0x221a},
++  { "rang",    0,             "/>",  0x232a},
++  { "raquo",   LATIN1_raquo,  ">>",  0x00bb},
++  { "rarr",    0,             "->",  0x2192},
+ //  { "rceil",   0,             ">|"         },
+-  { "rdquo",   0,             "\""         },
+-  { "real",    0,             "Re"         },
+-  { "reg",     LATIN1_reg,    "(R)"        },
++  { "rdquo",   0,             "\"",  0x201d},
++  { "real",    0,             "Re",  0x211c},
++  { "reg",     LATIN1_reg,    "(R)", 0x00ae},
+ //  { "rfloor",  0,             "|>"         },
+-  { "rho",     0,             "r"          },
+-  { "rsaquo",  0,             ">"          },
+-  { "rsquo",   0,             "'"          },
+-  { "sbquo",   0,             "'"          },
+-  { "scaron",  0,             "s"          },
+-  { "sdot",    0,             "DOT"        },
+-  { "sect",    LATIN1_sect,   "S:"         },
+-  { "shy",     LATIN1_shy,    ""           },
+-  { "sigma",   0,             "s"          },
+-  { "sigmaf",  0,             "s"          },
+-  { "sim",     0,             "~"          },
++  { "rho",     0,             "r",   0x03c1},
++  { "rsaquo",  0,             ">",   0x203a},
++  { "rsquo",   0,             "'",   0x2019},
++  { "sbquo",   0,             "'",   0x201a},
++  { "scaron",  0,             "s",   0x0161},
++  { "sdot",    0,             "DOT", 0x22c5},
++  { "sect",    LATIN1_sect,   "S:",  0x00a7},
++  { "shy",     LATIN1_shy,    "",    0x00ad},
++  { "sigma",   0,             "s",   0x03c3},
++  { "sigmaf",  0,             "s",   0x03c2},
++  { "sim",     0,             "~",   0x223c},
+ //  { "spades",  0,             "[spades]"   },
+-  { "sub",     0,             "(C"         },
+-  { "sube",    0,             "(_"         },
+-  { "sum",     0,             "SUM"        },
+-  { "sup",     0,             ")C"         },
+-  { "sup1",    LATIN1_sup1,   "^1"         },
+-  { "sup2",    LATIN1_sup2,   "^2"         },
+-  { "sup3",    LATIN1_sup3,   "^3"         },
+-  { "supe",    0,             ")_"         },
+-  { "szlig",   LATIN1_szlig,  "ss"         },
+-  { "tau",     0,             "t"          },
+-  { "there4",  0,             ".:"         },
+-  { "theta",   0,             "th"         },
+-  { "thorn",   LATIN1_thorn,  "th"         },
+-  { "tilde",   0,             "~"          },
+-  { "times",   LATIN1_times,  "x"          },
+-  { "trade",   0,             "[TM]"       },
+-  { "uArr",    0,             "^^"         },
+-  { "uacute",  LATIN1_uacute, "u'"         },
+-  { "uarr",    0,             "^"          },
+-  { "ucirc",   LATIN1_ucirc,  "u^"         },
+-  { "ugrave",  LATIN1_ugrave, "u`"         },
+-  { "uml",     LATIN1_uml,    "\""         },
+-  { "upsilon", 0,             "u"          },
+-  { "uuml",    LATIN1_uuml,   "u\""        },
+-  { "weierp",  0,             "P"          },
+-  { "xi",      0,             "x"          },
+-  { "yacute",  LATIN1_yacute, "y'"         },
+-  { "yen",     LATIN1_yen,    "YEN"        },
+-  { "yuml",    LATIN1_yuml,   "y\""        },
+-  { "zeta",    0,             "z"          },
++  { "sub",     0,             "(C",  0x2282},
++  { "sube",    0,             "(_",  0x2286},
++  { "sum",     0,             "SUM", 0x2211},
++  { "sup",     0,             ")C",  0x2283},
++  { "sup1",    LATIN1_sup1,   "^1",  0x00b9},
++  { "sup2",    LATIN1_sup2,   "^2",  0x00b2},
++  { "sup3",    LATIN1_sup3,   "^3",  0x00b3},
++  { "supe",    0,             ")_",  0x2287},
++  { "szlig",   LATIN1_szlig,  "ss",  0x00df},
++  { "tau",     0,             "t",   0x03c4},
++  { "there4",  0,             ".:",  0x2234},
++  { "theta",   0,             "th",  0x03b8},
++  { "thorn",   LATIN1_thorn,  "th",  0x00fe},
++  { "tilde",   0,             "~",   0x02dc},
++  { "times",   LATIN1_times,  "x",   0x00d7},
++  { "trade",   0,             "[TM]",0x2122},
++  { "uArr",    0,             "^^",  0x21d1},
++  { "uacute",  LATIN1_uacute, "u'",  0x00fa},
++  { "uarr",    0,             "^",   0x2191},
++  { "ucirc",   LATIN1_ucirc,  "u^",  0x00fb},
++  { "ugrave",  LATIN1_ugrave, "u`",  0x00f9},
++  { "uml",     LATIN1_uml,    "\"",  0x00a8},
++  { "upsilon", 0,             "u",   0x03c5},
++  { "uuml",    LATIN1_uuml,   "u\"", 0x00fc},
++  { "weierp",  0,             "P",   0x2118},
++  { "xi",      0,             "x",   0x03be},
++  { "yacute",  LATIN1_yacute, "y'",  0x00fd},
++  { "yen",     LATIN1_yen,    "YEN", 0x00a5},
++  { "yuml",    LATIN1_yuml,   "y\"", 0x00ff},
++  { "zeta",    0,             "z",   0x03b6},
+ };
+ 
+-extern int use_iso8859;
++extern int use_encoding;
+ 
+ /* ------------------------------------------------------------------------- */
+ 
++char ubuf[5];   /* one UTF-8 sequence (at most 4 bytes) plus terminating NUL */
++/* Encode code point x as NUL-terminated UTF-8 in ubuf (reused on every call). */
++char *mkutf(unsigned long x)
++{
++  int n, i;      /* n: length of the sequence in bytes */
++  if(x > 0x10ffff || (x >= 0xd800 && x < 0xe000)) x = 0xfffd; /* not encodable */
++  n = (x < 0x80) ? 1 : (x < 0x800) ? 2 : (x < 0x10000) ? 3 : 4;
++  memset(ubuf, 0, sizeof(ubuf));
++  /* continuation bytes, last one first: six payload bits each */
++  for(i = n - 1; i > 0; i--) {
++     ubuf[i] = (0x80 | (x & 0x3f));
++     x >>= 6;
++  }
++  /* lead byte: marker 0xc0/0xe0/0xf0 for 2/3/4 bytes, plain value for ASCII */
++  ubuf[0] = (n == 1) ? x : (((0xf00 >> n) & 0xf0) | x);
++  return ubuf;
++}
++
+ void
+ replace_sgml_entities(string *s)
+ {
+@@ -330,9 +349,9 @@ replace_sgml_entities(string *s)
+      */
+     while (j < l && s->at(j) != '&') ++j;
+     /*
+-     * We could convert high-bit chars to "&#233;" here if use_iso8859
+-     * is off, then let them be translated or not.  Is the purpose of
+-     * !use_iso8859 to allow SGML entities to be seen, or to strongly
++     * We could convert high-bit chars to "&#233;" here if USE_ASCII
++     * is on, then let them be translated or not.  Is the purpose of
++     * USE_ASCII to allow SGML entities to be seen, or to strongly
+      * filter against high-ASCII chars that might blow up a terminal
+      * that doesn't speak ISO8859?  For the moment, "allow SGML entities
+      * to be seen" -- no filtering here.
+@@ -370,7 +389,11 @@ replace_sgml_entities(string *s)
+           if (!isdigit(c)) break;
+           x = 10 * x + c - '0';
+         }
+-        if (use_iso8859 || (x < 128)) {
++        if (USE_UTF8) {
++          s->replace(beg, j - beg, mkutf(x));
++          j = beg + 1;
++        }
++        else if (USE_ISO8859 && (x < 256) || USE_ASCII && (x < 128)) {
+         s->replace(beg, j - beg, 1, (char) x);
+         j = beg + 1;
+         } else {
+@@ -408,13 +431,17 @@ replace_sgml_entities(string *s)
+         (int (*)(const void *, const void *)) strcmp
+       );
+       if (entity != NULL) {
+-        if (use_iso8859 && entity->iso8859code) {
++        if (USE_ISO8859 && entity->iso8859code) {
+           s->replace(beg, j - beg, 1, (char) entity->iso8859code);
+           j = beg + 1;
+-        } else if (entity->asciistr) {
++        } else if (USE_ASCII && entity->asciistr) {
+           s->replace(beg, j - beg, entity->asciistr);
+         j = beg + 1;
+         } /* else don't replace it at all, we don't have a translation */
++        else if(USE_UTF8 && entity->unicode) {
++        s->replace(beg, j - beg, mkutf(entity->unicode));
++        j = beg + 1;
++        }
+       }
+     } else {
+       ;                         /* EXTENSION: Allow literal '&' sometimes. */
diff -Nrua -x CVS /usr/ports/converters/html2text/patches/patch-table_C ./patches/patch-table_C
--- /usr/ports/converters/html2text/patches/patch-table_C	Thu Jan  1 01:00:00 1970
+++ ./patches/patch-table_C	Sat Jan 21 20:58:11 2017
@@ -0,0 +1,64 @@
+$OpenBSD$
+--- table.C.orig	Mon Jul 22 13:32:50 2002
++++ table.C	Sat Jan 21 20:57:57 2017
+@@ -122,14 +122,14 @@ create_lcs(
+       "LEFT",   Area::LEFT,
+       "CENTER", Area::CENTER,
+       "RIGHT",  Area::RIGHT,
+-      0
++      NULL
+     );
+     int row_valign = get_attribute(
+       row.attributes.get(), "VALIGN", Area::MIDDLE,
+       "TOP",    Area::LEFT,
+       "MIDDLE", Area::MIDDLE,
+       "BOTTOM", Area::BOTTOM,
+-      0
++      NULL
+     );
+ 
+     const list<auto_ptr<TableCell> >           &cl(*row.cells);
+@@ -158,14 +158,14 @@ create_lcs(
+         "LEFT",   Area::LEFT,
+         "CENTER", Area::CENTER,
+         "RIGHT",  Area::RIGHT,
+-        0
++        NULL
+       );
+       p->valign    = get_attribute(
+         cell.attributes.get(), "VALIGN", row_valign,
+         "TOP",    Area::TOP,
+         "MIDDLE", Area::MIDDLE,
+         "BOTTOM", Area::BOTTOM,
+-        0
++        NULL
+       );
+       {
+ 	auto_ptr<Area> tmp(cell.format(
+@@ -175,7 +175,7 @@ create_lcs(
+           - (*number_of_columns_return - 1) * (column_spacing + 0),
+           Area::LEFT // Yields better results than "p->halign"!
+         ));
+-	p->width = tmp.get() ? tmp->width() : 0;
++	p->width = tmp.get() ? tmp->utf_width() : 0;
+       }
+       p->minimized = false;
+ 
+@@ -308,7 +308,7 @@ narrow_table(
+ 	left_of_column + old_column_width - 1,
+ 	Area::LEFT // Yields better results than "lc.halign"!
+       ));
+-      w = tmp->width();
++      w = tmp->utf_width();
+       if (w >= left_of_column + old_column_width) lc.minimized = true;
+     }
+     if (w > left_of_column + new_column_width) {
+@@ -386,7 +386,7 @@ Table::format(Area::size_type w, int halign) const
+     "LEFT",   Area::LEFT,
+     "CENTER", Area::CENTER,
+     "RIGHT",  Area::RIGHT,
+-    0
++    NULL
+   );
+ 
+   // <TABLE>          => default => no border
diff -Nrua -x CVS /usr/ports/converters/html2text/pkg/PLIST ./pkg/PLIST
--- /usr/ports/converters/html2text/pkg/PLIST	Sat Nov 22 13:06:22 2014
+++ ./pkg/PLIST	Sat Jan 21 21:45:17 2017
@@ -1,5 +1,5 @@
 @comment $OpenBSD: PLIST,v 1.3 2014/11/22 12:06:22 landry Exp $
-bin/html2text
+@bin bin/html2text
 @man man/man1/html2text.1
 @man man/man5/html2textrc.5
 share/doc/html2text/

converters/html2text add utf8 and other fixes

Reply via email to