[MediaWiki-commits] [Gerrit] openzim[master]: Add xapian indexer.

Kelson (Code Review) Sat, 25 Mar 2017 10:17:04 -0700

Kelson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/296913 )


Change subject: Add xapian indexer.
......................................................................


Add xapian indexer.

Xapian is optional.
Build your index inside zim by adding "-i" or "--createFullTextIndex"
to zimwriterfs' command line.

Change-Id: I52c255e8335d0b6763c1c59eeb1549300d5f6f81
---
M zimwriterfs/Makefile.am
M zimwriterfs/configure.ac
M zimwriterfs/tools.cpp
M zimwriterfs/tools.h
A zimwriterfs/xapian/htmlparse.cc
A zimwriterfs/xapian/htmlparse.h
A zimwriterfs/xapian/myhtmlparse.cc
A zimwriterfs/xapian/myhtmlparse.h
A zimwriterfs/xapian/namedentities.h
A zimwriterfs/xapianIndexer.cpp
A zimwriterfs/xapianIndexer.h
M zimwriterfs/zimwriterfs.cpp
12 files changed, 1,490 insertions(+), 0 deletions(-)

Approvals:
  Kelson: Verified; Looks good to me, approved



diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am
index 628b74c..1d40174 100644
--- a/zimwriterfs/Makefile.am
+++ b/zimwriterfs/Makefile.am
@@ -10,3 +10,15 @@
         resourceTools.cpp \
         pathTools.cpp \
         mimetypecounter.cpp
+
+# ICU flags are added unconditionally: configure aborts when icu-config is
+# missing and tools.cpp (removeAccents) includes the ICU headers.
+zimwriterfs_CXXFLAGS = $(ICU_CFLAGS)
+# NOTE(review): ICU_LDFLAGS/XAPIAN_LDFLAGS carry library options; automake
+# normally expects libraries in _LDADD rather than _LDFLAGS -- confirm the
+# resulting link order works with the linkers you target.
+zimwriterfs_LDFLAGS = $(ICU_LDFLAGS)
+
+# The Xapian indexer and its bundled HTML parser are compiled only when
+# configure set the HAVE_XAPIAN conditional.
+if HAVE_XAPIAN
+zimwriterfs_CXXFLAGS += $(XAPIAN_CFLAGS)
+zimwriterfs_LDFLAGS += $(XAPIAN_LDFLAGS)
+zimwriterfs_SOURCES += \
+        xapianIndexer.cpp \
+        xapian/myhtmlparse.cc \
+        xapian/htmlparse.cc
+endif
diff --git a/zimwriterfs/configure.ac b/zimwriterfs/configure.ac
index fb12c8f..795d3b1 100644
--- a/zimwriterfs/configure.ac
+++ b/zimwriterfs/configure.ac
@@ -71,6 +71,121 @@
 AC_DEFINE_UNQUOTED(LZMA_MEMORY_SIZE, 128, [set lzma uncompress memory size to number of MB])
 AC_DEFINE(ENABLE_LZMA, [1], [defined if lzma compression is enabled])
 
+
+# findLibrary NAME
+#   Print the path of the first regular file called NAME found below the
+#   colon-separated directories listed in LIBS_ROOT.  Paths that match ARCH
+#   are preferred; "no" is printed when nothing is found.
+#   Written with the POSIX "name() {" form and "test" so that it also works
+#   when configure is run by a shell without the "function" keyword.
+findLibrary() {
+       found=0
+       # First pass: only accept paths that mention the target architecture.
+       for f in $(echo "$LIBS_ROOT" | tr ":" "\n") ; do
+               sf=`find "$f" -name "$1" 2> /dev/null | grep "$ARCH" | head -n 1`
+               if test -f "$sf" && test $found -eq 0
+               then
+                       found=1
+                       echo "$sf"
+               fi
+       done
+       # Second pass: fall back to a match for any architecture.
+       if test $found -eq 0
+       then
+               for f in $(echo "$LIBS_ROOT" | tr ":" "\n") ; do
+                       sf=`find "$f" -name "$1" 2> /dev/null | head -n 1`
+                       if test -f "$sf" && test $found -eq 0
+                       then
+                               found=1
+                               echo "$sf"
+                       fi
+               done
+       fi
+       if test $found -eq 0
+       then
+               echo "no"
+       fi
+}
+
+
+####################################################
+############ ICU
+####################################################
+
+
+ICU_CFLAGS=""
+ICU_LDFLAGS="-licui18n -licuuc -licudata" # replaced by icu-config
+ICU_STATIC_LDFLAGS=""
+
+# --with-icu=DIR: prepend DIR to LIBRARY_PATH and remember it in ICU_PATH so
+# that icu-config is looked up there first.
+AC_ARG_WITH(icu,
+            AC_HELP_STRING([--with-icu=DIR], [alternate location for icu-config]),
+            export LIBRARY_PATH="${withval}:${LIBRARY_PATH}";ICU_PATH=${withval}
+           )
+
+# look for shared library.
+# AC_CHECK_HEADER([zlib.h],, [AC_MSG_ERROR([[cannot find zlib header]])])
+# AC_CHECK_LIB([z], [zlibVersion],, [AC_MSG_ERROR([[cannot find zlib]]);COMPILE_ICU=1])
+# ICU_FILES=`findLibrary "libicuuc.${SHARED_EXT}"`
+
+# ICU is mandatory: all flags and the version are obtained from icu-config.
+AC_CHECK_TOOL(HAVE_ICU_CONFIG, icu-config,, "${ICU_PATH}:${PATH}")
+if test [ ! "$HAVE_ICU_CONFIG" ]
+then
+     AC_MSG_ERROR([[cannot find icu-config]])
+else
+    # Run icu-config with ICU_PATH in front, then restore the caller's PATH.
+    OLDPATH=$PATH
+    PATH="${ICU_PATH}:${PATH}"
+    ICU_CFLAGS=`icu-config --cxxflags`;
+    ICU_LDFLAGS=`icu-config --ldflags`;
+    ICU_VER=`icu-config --version`;
+    ICU_FILES="`findLibrary "libicuuc.${SHARED_EXT}"` `findLibrary "libicudata.${SHARED_EXT}"` `findLibrary "libicui18n.${SHARED_EXT}"`"
+    PATH=$OLDPATH
+    # Compare as version numbers: a plain string comparison would order
+    # a release such as 100.1 before 4.2 and reject it.
+    AS_VERSION_COMPARE([$ICU_VER], [4.2],
+       [AC_MSG_ERROR([[You need a version of libicu >= 4.2]])])
+fi
+
+
+AC_SUBST(ICU_CFLAGS)
+AC_SUBST(ICU_LDFLAGS)
+AC_SUBST(ICU_STATIC_LDFLAGS)
+AC_SUBST(ICU_FILES)
+AC_SUBST(COMPILED_ICUDATA_DAT)
+
+####################################################
+############ XAPIAN
+####################################################
+
+XAPIAN_CFLAGS=""
+XAPIAN_LDFLAGS=""
+XAPIAN_STATIC_LDFLAGS=""
+XAPIAN_ENABLE=0
+
+# --with-xapian=DIR looks for xapian-config in DIR first; --without-xapian
+# disables the indexer.  Without the option Xapian support is requested.
+AC_ARG_WITH([xapian],
+       [AS_HELP_STRING([--with-xapian=DIR], [alternat location for xapian-config] @@)],
+       [xapian_dir=$withval],
+       [with_xapian=yes])
+
+
+# "test" is used with the POSIX "=" operator: "==" is rejected by some
+# /bin/sh implementations, which would silently select the wrong branch.
+AS_IF([test "x$with_xapian" = xno],
+        [AM_CONDITIONAL(HAVE_XAPIAN, false)],
+       [OLDPATH=$PATH
+        AS_IF([test "x$with_xapian" != xyes],
+              PATH="$with_xapian:$PATH")
+        # Both candidate names go in the program list; the third argument
+        # (value-if-not-found) stays empty so a missing tool is detected.
+        AC_CHECK_TOOLS(XAPIAN_CONFIG, [xapian-config-1.3 xapian-config], [], [$PATH])
+        AS_IF([test "x$XAPIAN_CONFIG" = x ],
+               AC_MSG_ERROR([[cannot find xapian-config file]])
+             )
+        XAPIAN_VERSION=`$XAPIAN_CONFIG --version`
+        good_version=yes
+        # The literal mirrors the full "--version" output so that only the
+        # trailing version number decides the comparison.
+        AS_VERSION_COMPARE($XAPIAN_VERSION, "xapian-config - xapian-core 1.3.4", [good_version=no], [], [])
+        AS_IF([test "x$good_version" = xno],
+               AC_MSG_ERROR([[xapian version must be >= 1.3.4]])
+             )
+        AM_CONDITIONAL(HAVE_XAPIAN, true)
+        AC_DEFINE(HAVE_XAPIAN)
+        XAPIAN_CFLAGS=`$XAPIAN_CONFIG --cxxflags`;
+        XAPIAN_LDFLAGS=`$XAPIAN_CONFIG --ltlibs`;
+        PATH=$OLDPATH
+       ])
+
+AC_SUBST(XAPIAN_CFLAGS)
+AC_SUBST(XAPIAN_LDFLAGS)
+
 # Configure the output files
 AC_CONFIG_FILES([
   Makefile
diff --git a/zimwriterfs/tools.cpp b/zimwriterfs/tools.cpp
index 019b22c..868f32c 100644
--- a/zimwriterfs/tools.cpp
+++ b/zimwriterfs/tools.cpp
@@ -32,6 +32,10 @@
 #include <sys/stat.h>
 #include <magic.h>
 
+#include <unicode/translit.h>
+#include <unicode/ucnv.h>
+
+
 #ifdef _WIN32
 #define SEPARATOR "\\"
 #else
@@ -523,3 +527,14 @@
   return computeRelativePath(baseUrl, newUrl);
 }
 
+// Returns a lower-cased copy of `text` with combining marks (accents)
+// stripped, using the ICU transliteration
+// "Lower; NFD; [:M:] remove; NFC".
+// `text` is read through ICU's default converter, which this function
+// sets to UTF-8 as a process-wide side effect.
+// If ICU cannot build the transliterator, `text` is returned unchanged.
+std::string removeAccents(const std::string &text) {
+  ucnv_setDefaultName("UTF-8");
+  UErrorCode status = U_ZERO_ERROR;
+  Transliterator *removeAccentsTrans = Transliterator::createInstance("Lower; NFD; [:M:] remove; NFC", UTRANS_FORWARD, status);
+  // createInstance() reports failure through `status` and may hand back
+  // NULL; never dereference the result without checking both.
+  if (U_FAILURE(status) || removeAccentsTrans == NULL) {
+    delete removeAccentsTrans;
+    return text;
+  }
+  UnicodeString ustring = UnicodeString(text.c_str());
+  removeAccentsTrans->transliterate(ustring);
+  delete removeAccentsTrans;
+  std::string unaccentedText;
+  ustring.toUTF8String(unaccentedText);
+  return unaccentedText;
+}
diff --git a/zimwriterfs/tools.h b/zimwriterfs/tools.h
index 8b43da4..d85b292 100644
--- a/zimwriterfs/tools.h
+++ b/zimwriterfs/tools.h
@@ -45,4 +45,6 @@
 std::string extractRedirectUrlFromHtml(const GumboVector* head_children);
 void getLinks(GumboNode* node, std::map<std::string, bool> &links);
 
+// Returns a lower-cased copy of text with combining marks (accents) removed
+// (ICU transliteration "Lower; NFD; [:M:] remove; NFC").
+std::string removeAccents(const std::string &text);
+
 #endif // OPENZIM_ZIMWRITERFS_TOOLS_H
diff --git a/zimwriterfs/xapian/htmlparse.cc b/zimwriterfs/xapian/htmlparse.cc
new file mode 100644
index 0000000..39b49ae
--- /dev/null
+++ b/zimwriterfs/xapian/htmlparse.cc
@@ -0,0 +1,373 @@
+/* htmlparse.cc: simple HTML parser for omega indexer
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2001 Ananova Ltd
+ * Copyright 2002,2006,2007,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+// #include <config.h>
+
+#include "htmlparse.h"
+
+#include <xapian.h>
+
+// #include "utf8convert.h"
+
+#include <algorithm>
+
+#include <ctype.h>
+#include <cstring>
+#include <stdio.h>
+#include <stdlib.h>
+
+using namespace std;
+
+// Convert every character of str to lower case, in place, via tolower().
+inline void
+lowercase_string(string &str)
+{
+    const string::size_type len = str.size();
+    for (string::size_type pos = 0; pos < len; ++pos) {
+       // Go through unsigned char so that bytes >= 0x80 are valid
+       // arguments for tolower().
+       const unsigned char uc = static_cast<unsigned char>(str[pos]);
+       str[pos] = tolower(uc);
+    }
+}
+
+// Entity name -> code point table shared by all HtmlParser instances; it is
+// filled by the HtmlParser constructor the first time it finds it empty.
+map<string, unsigned int> HtmlParser::named_ents;
+
+// Predicate: true when c is not a decimal digit.
+inline static bool
+p_notdigit(char c)
+{
+    const unsigned char uc = static_cast<unsigned char>(c);
+    return isdigit(uc) == 0;
+}
+
+// Predicate: true when c is not a hexadecimal digit.
+inline static bool
+p_notxdigit(char c)
+{
+    const unsigned char uc = static_cast<unsigned char>(c);
+    return isxdigit(uc) == 0;
+}
+
+// Predicate: true when c is neither a letter nor a digit per isalnum().
+inline static bool
+p_notalnum(char c)
+{
+    const unsigned char uc = static_cast<unsigned char>(c);
+    return isalnum(uc) == 0;
+}
+
+// Predicate: true when c is not whitespace per isspace().
+inline static bool
+p_notwhitespace(char c)
+{
+    const unsigned char uc = static_cast<unsigned char>(c);
+    return isspace(uc) == 0;
+}
+
+// Predicate: true when c cannot be part of a tag name.  Tag names consist
+// of alphanumerics plus '.', '-' and ':' (the latter for XML namespaces).
+inline static bool
+p_nottag(char c)
+{
+    const bool tag_char = isalnum(static_cast<unsigned char>(c)) ||
+       c == '.' || c == '-' || c == ':';
+    return !tag_char;
+}
+
+// Predicate: true when c is whitespace or '>'.
+inline static bool
+p_whitespacegt(char c)
+{
+    if (c == '>') return true;
+    return isspace(static_cast<unsigned char>(c)) != 0;
+}
+
+// Predicate: true when c is whitespace, '=' or '>'.
+inline static bool
+p_whitespaceeqgt(char c)
+{
+    if (c == '=' || c == '>') return true;
+    return isspace(static_cast<unsigned char>(c)) != 0;
+}
+
+// Look up attribute `param` among the attributes collected for the opening
+// tag being processed.  On success copies its value into `value` and
+// returns true; otherwise leaves `value` untouched and returns false.
+bool
+HtmlParser::get_parameter(const string & param, string & value)
+{
+    const map<string, string>::const_iterator hit = parameters.find(param);
+    const bool present = (hit != parameters.end());
+    if (present) value = hit->second;
+    return present;
+}
+
+// Constructor: fills the shared named_ents table from namedentities.h the
+// first time a parser is created; later constructions find the table
+// populated and do nothing.
+HtmlParser::HtmlParser()
+{
+    static const struct ent { const char *n; unsigned int v; } ents[] = {
+#include "namedentities.h"
+       { NULL, 0 }
+    };
+    if (!named_ents.empty()) return;
+    // The list is terminated by the { NULL, 0 } sentinel above.
+    for (const struct ent *e = ents; e->n; ++e) {
+       named_ents[string(e->n)] = e->v;
+    }
+}
+
+// Replace HTML character references in s, in place, with the characters
+// they denote: decimal (&#NNN;), hexadecimal (&#xHHH;) and named (&name;)
+// forms are handled, and the trailing ';' is optional.  Values below 0x80
+// become a single byte, larger ones are encoded as UTF-8.  References
+// that are unknown or evaluate to 0 are left untouched.
+void
+HtmlParser::decode_entities(string &s)
+{
+    // We need a const_iterator version of s.end() - otherwise the
+    // find() and find_if() templates don't work...
+    string::const_iterator amp = s.begin(), s_end = s.end();
+    while ((amp = find(amp, s_end, '&')) != s_end) {
+       unsigned int val = 0;
+       string::const_iterator end, p = amp + 1;
+       if (p != s_end && *p == '#') {
+           p++;
+           int base = 10;
+           if (p != s_end && (*p == 'x' || *p == 'X')) {
+               // hex
+               p++;
+               base = 16;
+               end = find_if(p, s_end, p_notxdigit);
+           } else {
+               // number
+               end = find_if(p, s_end, p_notdigit);
+           }
+           // strtoul() is well defined for arbitrarily long digit runs
+           // (atoi() and sscanf("%x") are not); saturate so that the
+           // result always fits in val.
+           const unsigned int max_val = static_cast<unsigned int>(-1);
+           const unsigned long n =
+               strtoul(s.substr(p - s.begin(), end - p).c_str(), NULL, base);
+           val = (n > max_val) ? max_val : static_cast<unsigned int>(n);
+       } else {
+           end = find_if(p, s_end, p_notalnum);
+           string code = s.substr(p - s.begin(), end - p);
+           map<string, unsigned int>::const_iterator i;
+           i = named_ents.find(code);
+           if (i != named_ents.end()) val = i->second;
+       }
+       if (end < s_end && *end == ';') end++;
+       if (val) {
+           string::size_type amp_pos = amp - s.begin();
+           // Number of bytes that replace the reference.
+           string::size_type inserted = 1;
+           if (val < 0x80) {
+               s.replace(amp_pos, end - amp, 1u, char(val));
+           } else {
+               // Convert unicode value val to UTF-8.
+               char seq[4];
+               unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
+               s.replace(amp_pos, end - amp, seq, len);
+               inserted = len;
+           }
+           s_end = s.end();
+           // We've modified the string, so the iterators are no longer
+           // valid...  Resume right after what was inserted: a fixed "+ 1"
+           // would step past the end of s when nothing was written
+           // (len == 0) for a reference at the very end of the string.
+           amp = s.begin() + amp_pos + inserted;
+       } else {
+           amp = end;
+       }
+    }
+}
+
+// Tokenise `body` as HTML and report it through the virtual hooks:
+// process_text() for character data (entities already decoded),
+// opening_tag() / closing_tag() for tags (names lower-cased).  While
+// opening_tag() runs, the tag's attributes are available through
+// get_parameter(); they are cleared afterwards.  Comments, SGML
+// declarations and <? ... ?> sections are skipped.  An XML declaration at
+// the very start of the document sets `charset`.
+void
+HtmlParser::parse_html(const string &body)
+{
+    in_script = false;
+
+    parameters.clear();
+    string::const_iterator start = body.begin();
+
+    while (true) {
+       // Skip through until we find an HTML tag, a comment, or the end of
+       // document.  Ignore isolated occurrences of `<' which don't start
+       // a tag or comment.
+       string::const_iterator p = start;
+       while (true) {
+           p = find(p, body.end(), '<');
+           if (p == body.end()) break;
+           // The '<' may be the last character of the document: never
+           // dereference the end iterator, treat it as "no tag follows".
+           unsigned char ch = (p + 1 != body.end()) ? *(p + 1) : '\0';
+
+           // Tag, closing tag, or comment (or SGML declaration).
+           if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
+
+           if (ch == '?') {
+               // PHP code or XML declaration.
+               // XML declaration is only valid at the start of the first line.
+               // FIXME: need to deal with BOMs...
+               if (p != body.begin() || body.size() < 20) break;
+
+               // XML declaration looks something like this:
+               // <?xml version="1.0" encoding="UTF-8"?>
+               if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
+               if (strchr(" \t\r\n", p[5]) == NULL) break;
+
+               string::const_iterator decl_end = find(p + 6, body.end(), '?');
+               if (decl_end == body.end()) break;
+
+               // Default charset for XML is UTF-8.
+               charset = "UTF-8";
+
+               string decl(p + 6, decl_end);
+               size_t enc = decl.find("encoding");
+               if (enc == string::npos) break;
+
+               enc = decl.find_first_not_of(" \t\r\n", enc + 8);
+               if (enc == string::npos || enc == decl.size()) break;
+
+               if (decl[enc] != '=') break;
+
+               enc = decl.find_first_not_of(" \t\r\n", enc + 1);
+               if (enc == string::npos || enc == decl.size()) break;
+
+               if (decl[enc] != '"' && decl[enc] != '\'') break;
+
+               char quote = decl[enc++];
+               size_t enc_end = decl.find(quote, enc);
+
+               // Only accept the value when its closing quote was found;
+               // otherwise keep the UTF-8 default set above.
+               if (enc_end != string::npos)
+                   charset = decl.substr(enc, enc_end - enc);
+
+               break;
+           }
+           p++;
+       }
+
+       // Process text up to start of tag.
+       if (p > start) {
+           string text = body.substr(start - body.begin(), p - start);
+           // convert_to_utf8(text, charset);
+           decode_entities(text);
+           process_text(text);
+       }
+
+       if (p == body.end()) break;
+
+       start = p + 1;
+
+       if (start == body.end()) break;
+
+       if (*start == '!') {
+           if (++start == body.end()) break;
+           if (++start == body.end()) break;
+           // comment or SGML declaration
+           if (*(start - 1) == '-' && *start == '-') {
+               ++start;
+               string::const_iterator close = find(start, body.end(), '>');
+               // An unterminated comment swallows rest of document
+               // (like Netscape, but unlike MSIE IIRC)
+               if (close == body.end()) break;
+
+               p = close;
+               // look for -->
+               while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
+                   p = find(p + 1, body.end(), '>');
+
+               if (p != body.end()) {
+                   // Check for htdig's "ignore this bit" comments.
+                   if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
+                       string::size_type i;
+                       i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
+                       if (i == string::npos) break;
+                       start = body.begin() + i + 21;
+                       continue;
+                   }
+                   // If we found --> skip to there.
+                   start = p;
+               } else {
+                   // Otherwise skip to the first > we found (as Netscape does).
+                   start = close;
+               }
+           } else {
+               // just an SGML declaration, perhaps giving the DTD - ignore it
+               start = find(start - 1, body.end(), '>');
+               if (start == body.end()) break;
+           }
+           ++start;
+       } else if (*start == '?') {
+           if (++start == body.end()) break;
+           // PHP - swallow until ?> or EOF
+           start = find(start + 1, body.end(), '>');
+
+           // look for ?>
+           while (start != body.end() && *(start - 1) != '?')
+               start = find(start + 1, body.end(), '>');
+
+           // unterminated PHP swallows rest of document (rather arbitrarily
+           // but it avoids polluting the database when things go wrong)
+           if (start != body.end()) ++start;
+       } else {
+           // opening or closing tag
+           int closing = 0;
+
+           if (*start == '/') {
+               closing = 1;
+               start = find_if(start + 1, body.end(), p_notwhitespace);
+           }
+
+           p = start;
+           start = find_if(start, body.end(), p_nottag);
+           string tag = body.substr(p - body.begin(), start - p);
+           // convert tagname to lowercase
+           lowercase_string(tag);
+
+           if (closing) {
+               closing_tag(tag);
+               if (in_script && tag == "script") in_script = false;
+
+               /* ignore any bogus parameters on closing tags */
+               p = find(start, body.end(), '>');
+               if (p == body.end()) break;
+               start = p + 1;
+           } else {
+               // FIXME: parse parameters lazily.
+               while (start < body.end() && *start != '>') {
+                   string name, value;
+
+                   p = find_if(start, body.end(), p_whitespaceeqgt);
+
+                   name.assign(body, start - body.begin(), p - start);
+
+                   p = find_if(p, body.end(), p_notwhitespace);
+
+                   start = p;
+                   if (start != body.end() && *start == '=') {
+                       start = find_if(start + 1, body.end(), p_notwhitespace);
+
+                       p = body.end();
+
+                       // The document may end right after '=' (and any
+                       // following spaces): do not dereference the end.
+                       int quote = (start != body.end()) ? *start : 0;
+                       if (quote == '"' || quote == '\'') {
+                           start++;
+                           p = find(start, body.end(), quote);
+                       }
+
+                       if (p == body.end()) {
+                           // unquoted or no closing quote
+                           p = find_if(start, body.end(), p_whitespacegt);
+                       }
+                       value.assign(body, start - body.begin(), p - start);
+                       start = find_if(p, body.end(), p_notwhitespace);
+
+                       if (!name.empty()) {
+                           // convert parameter name to lowercase
+                           lowercase_string(name);
+                           // in case of multiple entries, use the first
+                           // (as Netscape does)
+                           parameters.insert(make_pair(name, value));
+                       }
+                   }
+               }
+#if 0
+               cout << "<" << tag;
+               map<string, string>::const_iterator x;
+               for (x = parameters.begin(); x != parameters.end(); x++) {
+                   cout << " " << x->first << "=\"" << x->second << "\"";
+               }
+               cout << ">\n";
+#endif
+               opening_tag(tag);
+               parameters.clear();
+
+               // In <script> tags we ignore opening tags to avoid problems
+               // with "a<b".
+               if (tag == "script") in_script = true;
+
+               if (start != body.end() && *start == '>') ++start;
+           }
+       }
+    }
+}
diff --git a/zimwriterfs/xapian/htmlparse.h b/zimwriterfs/xapian/htmlparse.h
new file mode 100644
index 0000000..79e96ec
--- /dev/null
+++ b/zimwriterfs/xapian/htmlparse.h
@@ -0,0 +1,49 @@
+/* htmlparse.h: simple HTML parser for omega indexer
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2002,2006,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+#ifndef OMEGA_INCLUDED_HTMLPARSE_H
+#define OMEGA_INCLUDED_HTMLPARSE_H
+
+#include <string>
+#include <map>
+
+using std::string;
+using std::map;
+
+// Simple callback-driven HTML parser.  parse_html() walks a document and
+// calls the virtual hooks below; subclasses override the hooks to collect
+// whatever they need.  The default hooks do nothing.
+class HtmlParser {
+       // Attributes (lower-cased name -> value) of the opening tag that is
+       // currently being reported; cleared after each opening_tag() call.
+       map<string, string> parameters;
+    protected:
+       // Replace character references (&#N;, &#xH;, &name;) in s, in place.
+       void decode_entities(string &s);
+       // True between an opening <script> tag and its closing tag; other
+       // opening tags are not recognised while it is set.
+       bool in_script;
+       // Character set name; set to the XML declaration's encoding (or
+       // "UTF-8") when parse_html() meets an XML declaration.
+       string charset;
+       // Entity name -> code point table shared by all instances.
+       static map<string, unsigned int> named_ents;
+
+       // Fetch attribute `param` of the current opening tag into `value`;
+       // returns false (leaving `value` unchanged) when it is absent.
+       bool get_parameter(const string & param, string & value);
+    public:
+       // Hook: called with each run of text found between tags.
+       virtual void process_text(const string &/*text*/) { }
+       // Hook: called with the lower-cased name of each opening tag.
+       virtual void opening_tag(const string &/*tag*/) { }
+       // Hook: called with the lower-cased name of each closing tag.
+       virtual void closing_tag(const string &/*tag*/) { }
+       // Parse `text`, invoking the hooks above as elements are found.
+       virtual void parse_html(const string &text);
+       HtmlParser();
+       virtual ~HtmlParser() { }
+};
+
+#endif // OMEGA_INCLUDED_HTMLPARSE_H
diff --git a/zimwriterfs/xapian/myhtmlparse.cc b/zimwriterfs/xapian/myhtmlparse.cc
new file mode 100644
index 0000000..e1098af
--- /dev/null
+++ b/zimwriterfs/xapian/myhtmlparse.cc
@@ -0,0 +1,302 @@
+/* myhtmlparse.cc: subclass of HtmlParser for extracting text.
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2002,2003,2004,2006,2007,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+// #include <config.h>
+
+#include "myhtmlparse.h"
+
+// #include "utf8convert.h"
+
+#include <ctype.h>
+#include <string.h>
+
+// Convert every character of str to lower case, in place, via tolower().
+inline void
+lowercase_string(string &str)
+{
+    string::iterator cur = str.begin();
+    const string::iterator last = str.end();
+    while (cur != last) {
+       // The unsigned char cast keeps bytes >= 0x80 valid for tolower().
+       *cur = tolower(static_cast<unsigned char>(*cur));
+       ++cur;
+    }
+}
+
+// Parse `text` with the base-class parser after recording the character
+// set the caller assumes (`charset_`) and whether that character set was
+// itself taken from a meta tag (`charset_from_meta_`), in which case
+// opening_tag() does not request another charset switch.
+void
+MyHtmlParser::parse_html(const string &text, const string &charset_,
+                        bool charset_from_meta_)
+{
+    charset_from_meta = charset_from_meta_;
+    charset = charset_;
+    HtmlParser::parse_html(text);
+}
+
+// Append the words of `text` to `dump`, collapsing each run of WHITESPACE
+// into a single space.  Text inside <script> or <style> is ignored.
+// `pending_space` carries across calls whether a separator is owed before
+// the next word.
+void
+MyHtmlParser::process_text(const string &text)
+{
+    if (text.empty() || in_script_tag || in_style_tag) return;
+
+    string::size_type word = text.find_first_not_of(WHITESPACE);
+    // Leading whitespace (or nothing but whitespace) owes a separator.
+    if (word != 0) pending_space = true;
+    while (word != string::npos) {
+       if (pending_space && !dump.empty()) dump += ' ';
+       const string::size_type stop = text.find_first_of(WHITESPACE, word);
+       if (stop == string::npos) {
+           // Last word runs to the end of the text: no separator owed.
+           pending_space = false;
+           dump.append(text, word, string::npos);
+           return;
+       }
+       pending_space = true;
+       dump.append(text, word, stop - word);
+       word = text.find_first_not_of(WHITESPACE, stop + 1);
+    }
+}
+
+// Hook called for every opening tag (name already lower-cased).
+//  - Block-level and similar tags set pending_space so that the text on
+//    either side is separated by a space in `dump`.
+//  - <body> discards the text collected so far.
+//  - <style> / <script> switch text collection off.
+//  - <meta> feeds `sample` (description), `keywords`, the robots
+//    directive and charset detection.
+// Throws `true` when a robots meta tag forbids indexing, and throws the
+// new charset name (a string) when a meta tag announces a charset that
+// differs from the current one.
+void
+MyHtmlParser::opening_tag(const string &tag)
+{
+    if (tag.empty()) return;
+    // Dispatch on the first letter to keep the string comparisons few.
+    switch (tag[0]) {
+       case 'a':
+           if (tag == "address") pending_space = true;
+           break;
+       case 'b':
+           if (tag == "body") {
+               // Drop whatever text was collected before <body>.
+               dump.resize(0);
+               break;
+           }
+           if (tag == "blockquote" || tag == "br") pending_space = true;
+           break;
+       case 'c':
+           if (tag == "center") pending_space = true;
+           break;
+       case 'd':
+           if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
+               tag == "dt") pending_space = true;
+           break;
+       case 'e':
+           if (tag == "embed") pending_space = true;
+           break;
+       case 'f':
+           if (tag == "fieldset" || tag == "form") pending_space = true;
+           break;
+       case 'h':
+           // hr, and h1, ..., h6
+           if (tag.length() == 2 && strchr("r123456", tag[1]))
+               pending_space = true;
+           break;
+       case 'i':
+           if (tag == "iframe" || tag == "img" || tag == "isindex" ||
+               tag == "input") pending_space = true;
+           break;
+       case 'k':
+           if (tag == "keygen") pending_space = true;
+           break;
+       case 'l':
+           if (tag == "legend" || tag == "li" || tag == "listing")
+               pending_space = true;
+           break;
+       case 'm':
+           if (tag == "meta") {
+               string content;
+               if (get_parameter("content", content)) {
+                   // <meta name="..." content="...">
+                   string name;
+                   if (get_parameter("name", name)) {
+                       lowercase_string(name);
+                       if (name == "description") {
+                           // Only the first description is kept.
+                           if (sample.empty()) {
+                               swap(sample, content);
+                               // convert_to_utf8(sample, charset);
+                               decode_entities(sample);
+                           }
+                       } else if (name == "keywords") {
+                           // Keywords from several tags are joined by spaces.
+                           if (!keywords.empty()) keywords += ' ';
+                           // convert_to_utf8(content, charset);
+                           decode_entities(content);
+                           keywords += content;
+                       } else if (name == "robots") {
+                           decode_entities(content);
+                           lowercase_string(content);
+                           if (content.find("none") != string::npos ||
+                               content.find("noindex") != string::npos) {
+                               // Abort parsing: the page asks not to be
+                               // indexed.
+                               indexing_allowed = false;
+                               throw true;
+                           }
+                       }
+                       break;
+                   }
+                   // If the current charset came from a meta tag, don't
+                   // force reparsing again!
+                   if (charset_from_meta) break;
+                   // <meta http-equiv="Content-Type" content="...charset=X">
+                   string hdr;
+                   if (get_parameter("http-equiv", hdr)) {
+                       lowercase_string(hdr);
+                       if (hdr == "content-type") {
+                           lowercase_string(content);
+                           size_t start = content.find("charset=");
+                           if (start == string::npos) break;
+                           start += 8;
+                           if (start == content.size()) break;
+                           size_t end = start;
+                           if (content[start] != '"') {
+                               // Unquoted value: ends at the first control,
+                               // non-ASCII or separator character.
+                               while (end < content.size()) {
+                                   unsigned char ch = content[end];
+                                   if (ch <= 32 || ch >= 127 ||
+                                       strchr(";()<>@,:\\\"/[]?={}", ch))
+                                       break;
+                                   ++end;
+                               }
+                           } else {
+                               // Quoted value: ends at the closing quote; a
+                               // backslash is removed and the character
+                               // after it is taken literally.
+                               ++start;
+                               ++end;
+                               while (end < content.size()) {
+                                   unsigned char ch = content[end];
+                                   if (ch == '"') break;
+                                   if (ch == '\\') content.erase(end, 1);
+                                   ++end;
+                               }
+                           }
+                           string newcharset(content, start, end - start);
+                           if (charset != newcharset) {
+                               // Hand the announced charset to the caller.
+                               throw newcharset;
+                           }
+                       }
+                   }
+                   break;
+               }
+               if (charset_from_meta) break;
+               string newcharset;
+               if (get_parameter("charset", newcharset)) {
+                   // HTML5 added: <meta charset="...">
+                   lowercase_string(newcharset);
+                   if (charset != newcharset) {
+                       throw newcharset;
+                   }
+               }
+               break;
+           }
+           if (tag == "marquee" || tag == "menu" || tag == "multicol")
+               pending_space = true;
+           break;
+       case 'o':
+           if (tag == "ol" || tag == "option") pending_space = true;
+           break;
+       case 'p':
+           if (tag == "p" || tag == "pre" || tag == "plaintext")
+               pending_space = true;
+           break;
+       case 'q':
+           if (tag == "q") pending_space = true;
+           break;
+       case 's':
+           if (tag == "style") {
+               // process_text() ignores text while this flag is set.
+               in_style_tag = true;
+               break;
+           }
+           if (tag == "script") {
+               // process_text() ignores text while this flag is set.
+               in_script_tag = true;
+               break;
+           }
+           if (tag == "select") pending_space = true;
+           break;
+       case 't':
+           if (tag == "table" || tag == "td" || tag == "textarea" ||
+               tag == "th") pending_space = true;
+           break;
+       case 'u':
+           if (tag == "ul") pending_space = true;
+           break;
+       case 'x':
+           if (tag == "xmp") pending_space = true;
+           break;
+    }
+}
+
+// Hook called for every closing tag (name already lower-cased).
+//  - </body> throws `true`, which ends parsing.
+//  - </style> and </script> switch text collection back on.
+//  - </title> moves the text collected so far into `title` (first title
+//    only).
+//  - Closing block-level and similar tags set pending_space so that the
+//    surrounding text is separated by a space.
+void
+MyHtmlParser::closing_tag(const string &tag)
+{
+    if (tag.empty()) return;
+
+    // Tags with dedicated handling come first.
+    if (tag == "body") {
+       throw true;
+    }
+    if (tag == "style") {
+       in_style_tag = false;
+       return;
+    }
+    if (tag == "script") {
+       in_script_tag = false;
+       return;
+    }
+    if (tag == "title") {
+       if (title.empty()) swap(title, dump);
+       return;
+    }
+
+    // hr, and h1, ..., h6
+    if (tag[0] == 'h' && tag.length() == 2 && strchr("r123456", tag[1])) {
+       pending_space = true;
+       return;
+    }
+
+    // Every other tag whose end separates words.
+    static const char * const spacing_tags[] = {
+       "address",
+       "blockquote", "br",
+       "center",
+       "dd", "dir", "div", "dl", "dt",
+       "fieldset", "form",
+       "iframe",
+       "legend", "li", "listing",
+       "marquee", "menu",
+       "ol", "option",
+       "p", "pre",
+       "q",
+       "select",
+       "table", "td", "textarea", "th",
+       "ul",
+       "xmp"
+    };
+    const size_t count = sizeof(spacing_tags) / sizeof(spacing_tags[0]);
+    for (size_t k = 0; k < count; ++k) {
+       if (tag == spacing_tags[k]) {
+           pending_space = true;
+           return;
+       }
+    }
+}
diff --git a/zimwriterfs/xapian/myhtmlparse.h b/zimwriterfs/xapian/myhtmlparse.h
new file mode 100644
index 0000000..f221cb5
--- /dev/null
+++ b/zimwriterfs/xapian/myhtmlparse.h
@@ -0,0 +1,65 @@
+/* myhtmlparse.h: subclass of HtmlParser for extracting text
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2002,2003,2004,2006,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+#ifndef OMEGA_INCLUDED_MYHTMLPARSE_H
+#define OMEGA_INCLUDED_MYHTMLPARSE_H
+
+#include "htmlparse.h"
+
+// FIXME: Should we include \xa0 which is non-breaking space in iso-8859-1, but
+// not in all charsets and perhaps spans of all \xa0 should become a single
+// \xa0?
+#define WHITESPACE " \t\n\r"
+
+/**
+ * HtmlParser subclass which accumulates the text found in an HTML page.
+ *
+ * Every field is public: callers run parse_html() and then read the
+ * collected strings directly. reset() puts the object back into the state
+ * produced by the constructor so that one instance can be reused.
+ */
+class MyHtmlParser : public HtmlParser {
+    public:
+        // Tag-tracking flags. The tag handling in myhtmlparse.cc sets
+        // in_style_tag / in_script_tag back to false on the matching end
+        // tag and raises pending_space after block-level elements.
+        bool in_script_tag;
+        bool in_style_tag;
+        bool pending_space;
+        // NOTE(review): presumably cleared by a robots "noindex" meta tag -
+        // confirm in myhtmlparse.cc.
+        bool indexing_allowed;
+        // Mirrors the charset_from_meta_ argument of parse_html()
+        // (presumably - confirm in myhtmlparse.cc).
+        bool charset_from_meta;
+
+        // Collected output. `dump` is the running text buffer; when a
+        // </title> is closed and `title` is still empty the two are swapped.
+        string title;
+        string sample;
+        string keywords;
+        string dump;
+
+        MyHtmlParser()
+            : in_script_tag(false),
+              in_style_tag(false),
+              pending_space(false),
+              indexing_allowed(true),
+              charset_from_meta(false)
+        {
+        }
+
+        void process_text(const string &text);
+        void opening_tag(const string &tag);
+        void closing_tag(const string &tag);
+        void parse_html(const string &text, const string &charset_,
+                        bool charset_from_meta_);
+
+        /** Restore the freshly-constructed state (flags and all strings). */
+        void reset() {
+            in_script_tag = in_style_tag = pending_space = false;
+            indexing_allowed = true;
+            charset_from_meta = false;
+            title.clear();
+            sample.clear();
+            keywords.clear();
+            dump.clear();
+        }
+};
+
+#endif // OMEGA_INCLUDED_MYHTMLPARSE_H
diff --git a/zimwriterfs/xapian/namedentities.h b/zimwriterfs/xapian/namedentities.h
new file mode 100644
index 0000000..8b7f03e
--- /dev/null
+++ b/zimwriterfs/xapian/namedentities.h
@@ -0,0 +1,279 @@
+/* namedentities.h: named HTML entities.
+ *
+ * Copyright (C) 2006,2007 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ */
+
+#ifndef OMEGA_INCLUDED_NAMEDENTITIES_H
+#define OMEGA_INCLUDED_NAMEDENTITIES_H
+
+// Names and values from: "Character entity references in HTML 4"
+// http://www.w3.org/TR/html4/sgml/entities.html
+{ "quot", 34 },
+{ "amp", 38 },
+{ "apos", 39 }, // Not in HTML 4 list but used in OpenOffice XML.
+{ "lt", 60 },
+{ "gt", 62 },
+{ "nbsp", 160 },
+{ "iexcl", 161 },
+{ "cent", 162 },
+{ "pound", 163 },
+{ "curren", 164 },
+{ "yen", 165 },
+{ "brvbar", 166 },
+{ "sect", 167 },
+{ "uml", 168 },
+{ "copy", 169 },
+{ "ordf", 170 },
+{ "laquo", 171 },
+{ "not", 172 },
+{ "shy", 173 },
+{ "reg", 174 },
+{ "macr", 175 },
+{ "deg", 176 },
+{ "plusmn", 177 },
+{ "sup2", 178 },
+{ "sup3", 179 },
+{ "acute", 180 },
+{ "micro", 181 },
+{ "para", 182 },
+{ "middot", 183 },
+{ "cedil", 184 },
+{ "sup1", 185 },
+{ "ordm", 186 },
+{ "raquo", 187 },
+{ "frac14", 188 },
+{ "frac12", 189 },
+{ "frac34", 190 },
+{ "iquest", 191 },
+{ "Agrave", 192 },
+{ "Aacute", 193 },
+{ "Acirc", 194 },
+{ "Atilde", 195 },
+{ "Auml", 196 },
+{ "Aring", 197 },
+{ "AElig", 198 },
+{ "Ccedil", 199 },
+{ "Egrave", 200 },
+{ "Eacute", 201 },
+{ "Ecirc", 202 },
+{ "Euml", 203 },
+{ "Igrave", 204 },
+{ "Iacute", 205 },
+{ "Icirc", 206 },
+{ "Iuml", 207 },
+{ "ETH", 208 },
+{ "Ntilde", 209 },
+{ "Ograve", 210 },
+{ "Oacute", 211 },
+{ "Ocirc", 212 },
+{ "Otilde", 213 },
+{ "Ouml", 214 },
+{ "times", 215 },
+{ "Oslash", 216 },
+{ "Ugrave", 217 },
+{ "Uacute", 218 },
+{ "Ucirc", 219 },
+{ "Uuml", 220 },
+{ "Yacute", 221 },
+{ "THORN", 222 },
+{ "szlig", 223 },
+{ "agrave", 224 },
+{ "aacute", 225 },
+{ "acirc", 226 },
+{ "atilde", 227 },
+{ "auml", 228 },
+{ "aring", 229 },
+{ "aelig", 230 },
+{ "ccedil", 231 },
+{ "egrave", 232 },
+{ "eacute", 233 },
+{ "ecirc", 234 },
+{ "euml", 235 },
+{ "igrave", 236 },
+{ "iacute", 237 },
+{ "icirc", 238 },
+{ "iuml", 239 },
+{ "eth", 240 },
+{ "ntilde", 241 },
+{ "ograve", 242 },
+{ "oacute", 243 },
+{ "ocirc", 244 },
+{ "otilde", 245 },
+{ "ouml", 246 },
+{ "divide", 247 },
+{ "oslash", 248 },
+{ "ugrave", 249 },
+{ "uacute", 250 },
+{ "ucirc", 251 },
+{ "uuml", 252 },
+{ "yacute", 253 },
+{ "thorn", 254 },
+{ "yuml", 255 },
+{ "OElig", 338 },
+{ "oelig", 339 },
+{ "Scaron", 352 },
+{ "scaron", 353 },
+{ "Yuml", 376 },
+{ "fnof", 402 },
+{ "circ", 710 },
+{ "tilde", 732 },
+{ "Alpha", 913 },
+{ "Beta", 914 },
+{ "Gamma", 915 },
+{ "Delta", 916 },
+{ "Epsilon", 917 },
+{ "Zeta", 918 },
+{ "Eta", 919 },
+{ "Theta", 920 },
+{ "Iota", 921 },
+{ "Kappa", 922 },
+{ "Lambda", 923 },
+{ "Mu", 924 },
+{ "Nu", 925 },
+{ "Xi", 926 },
+{ "Omicron", 927 },
+{ "Pi", 928 },
+{ "Rho", 929 },
+{ "Sigma", 931 },
+{ "Tau", 932 },
+{ "Upsilon", 933 },
+{ "Phi", 934 },
+{ "Chi", 935 },
+{ "Psi", 936 },
+{ "Omega", 937 },
+{ "alpha", 945 },
+{ "beta", 946 },
+{ "gamma", 947 },
+{ "delta", 948 },
+{ "epsilon", 949 },
+{ "zeta", 950 },
+{ "eta", 951 },
+{ "theta", 952 },
+{ "iota", 953 },
+{ "kappa", 954 },
+{ "lambda", 955 },
+{ "mu", 956 },
+{ "nu", 957 },
+{ "xi", 958 },
+{ "omicron", 959 },
+{ "pi", 960 },
+{ "rho", 961 },
+{ "sigmaf", 962 },
+{ "sigma", 963 },
+{ "tau", 964 },
+{ "upsilon", 965 },
+{ "phi", 966 },
+{ "chi", 967 },
+{ "psi", 968 },
+{ "omega", 969 },
+{ "thetasym", 977 },
+{ "upsih", 978 },
+{ "piv", 982 },
+{ "ensp", 8194 },
+{ "emsp", 8195 },
+{ "thinsp", 8201 },
+{ "zwnj", 8204 },
+{ "zwj", 8205 },
+{ "lrm", 8206 },
+{ "rlm", 8207 },
+{ "ndash", 8211 },
+{ "mdash", 8212 },
+{ "lsquo", 8216 },
+{ "rsquo", 8217 },
+{ "sbquo", 8218 },
+{ "ldquo", 8220 },
+{ "rdquo", 8221 },
+{ "bdquo", 8222 },
+{ "dagger", 8224 },
+{ "Dagger", 8225 },
+{ "bull", 8226 },
+{ "hellip", 8230 },
+{ "permil", 8240 },
+{ "prime", 8242 },
+{ "Prime", 8243 },
+{ "lsaquo", 8249 },
+{ "rsaquo", 8250 },
+{ "oline", 8254 },
+{ "frasl", 8260 },
+{ "euro", 8364 },
+{ "image", 8465 },
+{ "weierp", 8472 },
+{ "real", 8476 },
+{ "trade", 8482 },
+{ "alefsym", 8501 },
+{ "larr", 8592 },
+{ "uarr", 8593 },
+{ "rarr", 8594 },
+{ "darr", 8595 },
+{ "harr", 8596 },
+{ "crarr", 8629 },
+{ "lArr", 8656 },
+{ "uArr", 8657 },
+{ "rArr", 8658 },
+{ "dArr", 8659 },
+{ "hArr", 8660 },
+{ "forall", 8704 },
+{ "part", 8706 },
+{ "exist", 8707 },
+{ "empty", 8709 },
+{ "nabla", 8711 },
+{ "isin", 8712 },
+{ "notin", 8713 },
+{ "ni", 8715 },
+{ "prod", 8719 },
+{ "sum", 8721 },
+{ "minus", 8722 },
+{ "lowast", 8727 },
+{ "radic", 8730 },
+{ "prop", 8733 },
+{ "infin", 8734 },
+{ "ang", 8736 },
+{ "and", 8743 },
+{ "or", 8744 },
+{ "cap", 8745 },
+{ "cup", 8746 },
+{ "int", 8747 },
+{ "there4", 8756 },
+{ "sim", 8764 },
+{ "cong", 8773 },
+{ "asymp", 8776 },
+{ "ne", 8800 },
+{ "equiv", 8801 },
+{ "le", 8804 },
+{ "ge", 8805 },
+{ "sub", 8834 },
+{ "sup", 8835 },
+{ "nsub", 8836 },
+{ "sube", 8838 },
+{ "supe", 8839 },
+{ "oplus", 8853 },
+{ "otimes", 8855 },
+{ "perp", 8869 },
+{ "sdot", 8901 },
+{ "lceil", 8968 },
+{ "rceil", 8969 },
+{ "lfloor", 8970 },
+{ "rfloor", 8971 },
+{ "lang", 9001 },
+{ "rang", 9002 },
+{ "loz", 9674 },
+{ "spades", 9824 },
+{ "clubs", 9827 },
+{ "hearts", 9829 },
+{ "diams", 9830 },
+
+#endif // OMEGA_INCLUDED_NAMEDENTITIES_H
diff --git a/zimwriterfs/xapianIndexer.cpp b/zimwriterfs/xapianIndexer.cpp
new file mode 100644
index 0000000..33666c7
--- /dev/null
+++ b/zimwriterfs/xapianIndexer.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright 2011 Emmanuel Engelhart <kel...@kiwix.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include "xapianIndexer.h"
+#include "tools.h"
+
+#include <unistd.h>
+
+/*
+ * Constructor: records the verbosity flag and calls readStopWords() for the
+ * given language. The Xapian database itself is only opened later, in
+ * indexingPrelude().
+ */
+XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) {
+    this->setVerboseFlag(verbose);
+    this->readStopWords(language);
+
+    // Stemming is not enabled; the disabled attempt was:
+    //   stemmer(Xapian::Stem("french")) {
+    //   this->indexer.set_stemmer(this->stemmer);
+}
+
+/*
+ * Prepare for indexing: remember the target path, create (or overwrite) the
+ * working database at "<indexPath>.tmp", open a first transaction and hand
+ * the stopwords, if any, to the term generator.
+ */
+void XapianIndexer::indexingPrelude(const string indexPath_) {
+    this->indexPath = indexPath_;
+    this->writableDatabase = Xapian::WritableDatabase(this->indexPath + ".tmp", Xapian::DB_CREATE_OR_OVERWRITE);
+    this->writableDatabase.begin_transaction(true);
+
+    /* Without stopwords there is no stopper to install */
+    if (this->stopWords.empty()) {
+        return;
+    }
+
+    /* Insert the stopwords */
+    typedef std::vector<std::string>::iterator StopWordIterator;
+    for (StopWordIterator word = this->stopWords.begin(); word != this->stopWords.end(); ++word) {
+        this->stopper.add(*word);
+    }
+    this->indexer.set_stopper(&(this->stopper));
+}
+
+/*
+ * Add one article to the database.
+ *
+ * Value slots of the stored document: 0 = title, 1 = snippet, 2 = size,
+ * 3 = wordCount; the document data is the url. The unaccented title, the
+ * keywords and the content are indexed without positions, the first two with
+ * their own boost factor. Empty texts are not indexed.
+ */
+void XapianIndexer::index(const string &url,
+                          const string &title,
+                          const string &unaccentedTitle,
+                          const string &keywords,
+                          const string &content,
+                          const string &snippet,
+                          const string &size,
+                          const string &wordCount) {
+
+    /* Fill the document with the stored fields */
+    Xapian::Document doc;
+    doc.clear_values();
+    doc.add_value(0, title);
+    doc.add_value(1, snippet);
+    doc.add_value(2, size);
+    doc.add_value(3, wordCount);
+    doc.set_data(url);
+    this->indexer.set_document(doc);
+
+    /* Title: boost depends on the size of the content */
+    if (!unaccentedTitle.empty()) {
+        this->indexer.index_text_without_positions(unaccentedTitle, this->getTitleBoostFactor(content.size()));
+    }
+
+    /* Keywords */
+    if (!keywords.empty()) {
+        this->indexer.index_text_without_positions(keywords, keywordsBoostFactor);
+    }
+
+    /* Content */
+    if (!content.empty()) {
+        this->indexer.index_text_without_positions(content);
+    }
+
+    /* Store the document in the database */
+    this->writableDatabase.add_document(doc);
+}
+
+/* Commit the running transaction and directly open the next one. */
+void XapianIndexer::flush() {
+    Xapian::WritableDatabase& database = this->writableDatabase;
+    database.commit_transaction();
+    database.begin_transaction(true);
+}
+
+/*
+ * Finish indexing: commit whatever is still pending and compact the working
+ * database into a single file written to indexPath.
+ */
+void XapianIndexer::indexingPostlude() {
+    // flush() commits the running transaction but also opens a new one...
+    this->flush();
+    // ...so that freshly opened transaction has to be committed as well.
+    this->writableDatabase.commit_transaction();
+    this->writableDatabase.commit();
+    this->writableDatabase.compact(indexPath, Xapian::DBCOMPACT_SINGLE_FILE);
+
+    // NOTE(review): commit() is said not to be available in old versions of
+    // xapian (and did not seem mandatory there), yet it is called
+    // unconditionally above - confirm which Xapian versions must be supported.
+    // this->writableDatabase.commit();
+}
+
+/*
+ * Turn an article into an indexerToken and queue it for indexing.
+ *
+ * Redirects and articles whose mime type does not start with "text/html"
+ * are ignored, as are pages whose extracted text contains "NOINDEX".
+ */
+void XapianIndexer::handleArticle(Article* article)
+{
+    indexerToken token;
+    MyHtmlParser htmlParser;
+
+    if ( article->isRedirect() )
+        return;
+    if ( article->getMimeType().find("text/html") != 0 )
+        return;
+
+    token.title = article->getTitle();
+    token.url = article->getUrl();
+    zim::Blob blob = article->getData();
+    token.content = std::string(blob.data(), blob.size());
+
+    /* The parser throws a lot of exceptions; they are deliberately ignored */
+    try {
+        htmlParser.parse_html(token.content, "UTF-8", true);
+    } catch (...) {
+    }
+
+    /* Skip content carrying the noindex meta tag */
+    /* (it seems the parser raises an exception in that case) */
+    if (htmlParser.dump.find("NOINDEX") != string::npos)
+        return;
+
+    /* Accented title: prefer the one found in the HTML */
+    if (htmlParser.title.empty())
+        token.accentedTitle = token.title;
+    else
+        token.accentedTitle = htmlParser.title;
+
+    /* Word count, as a string */
+    stringstream wordCountStream;
+    wordCountStream << countWords(htmlParser.dump);
+    token.wordCount = wordCountStream.str();
+
+    /* Snippet: at most the first 300 bytes of text, cut back to the last */
+    /* '.' or, failing that, the last space */
+    std::string snippet = htmlParser.dump.substr(0, 300);
+    std::string::size_type cut = snippet.find_last_of('.');
+    if (cut == std::string::npos)
+        cut = snippet.find_last_of(' ');
+    if (cut != std::string::npos)
+        snippet.erase(cut);
+    token.snippet = snippet;
+
+    /* Size of the raw content divided by 1024, as a string */
+    stringstream sizeStream;
+    sizeStream << token.content.size() / 1024;
+    token.size = sizeStream.str();
+
+    /* Strip the accents from what gets indexed */
+    token.title = removeAccents(token.accentedTitle);
+    token.keywords = removeAccents(htmlParser.keywords);
+    token.content = removeAccents(htmlParser.dump);
+    pushToIndexQueue(token);
+}
+
+/*
+ * Create the article which exposes the finished index. The object is
+ * allocated with new and handed to the caller as a raw pointer.
+ */
+XapianMetaArticle* XapianIndexer::getMetaArticle()
+{
+    XapianMetaArticle* metaArticle = new XapianMetaArticle(this);
+    return metaArticle;
+}
+
+/*
+ * Return the content of the index file. On the first call (while the cached
+ * data is still empty) an empty token is queued, the indexer is polled until
+ * it is no longer running, and the file at getIndexPath() is read and cached.
+ */
+zim::Blob XapianMetaArticle::getData() const
+{
+    if ( data.empty() )
+    {
+        /* NOTE(review): the empty token presumably marks the end of the */
+        /* queue - confirm in Indexer */
+        indexerToken emptyToken;
+        indexer->pushToIndexQueue(emptyToken);
+
+        /* Wait until everything is indexed, polling every 500 microseconds */
+        const int pollDelay = 500;
+        while ( indexer->isRunning() )
+            usleep(pollDelay);
+
+        data = getFileContent(indexer->getIndexPath());
+    }
+    return zim::Blob(data.data(), data.size());
+}
+
diff --git a/zimwriterfs/xapianIndexer.h b/zimwriterfs/xapianIndexer.h
new file mode 100644
index 0000000..71dfe64
--- /dev/null
+++ b/zimwriterfs/xapianIndexer.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2011 Emmanuel Engelhart <kel...@kiwix.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_ZIMWRITERFS_XAPIANINDEXER_H
+#define OPENZIM_ZIMWRITERFS_XAPIANINDEXER_H
+
+
+#include "indexer.h"
+#include "articlesource.h"
+#include "article.h"
+
+#include <xapian.h>
+#include "xapian/myhtmlparse.h"
+#include <zim/blob.h>
+
+class XapianIndexer;
+
+/*
+ * Article under which the Xapian fulltext index is exposed
+ * (namespace 'Z', url "/Z/fulltextIndex/xapian"). The content is produced
+ * lazily by getData() and cached in `data`.
+ */
+class XapianMetaArticle : public Article {
+    private:
+        XapianIndexer* indexer;      // indexer whose index file is served
+        mutable std::string data;    // cache filled by the const getData()
+
+    public:
+        XapianMetaArticle(XapianIndexer* indexer)
+          : indexer(indexer)
+        {
+            ns = 'Z';
+            url = "/Z/fulltextIndex/xapian";
+            aid = url;
+            title = "Xapian Fulltext Index";
+            mimeType = "application/octet-stream+xapian";
+        }
+
+        virtual zim::Blob getData() const;
+};
+
+/*
+ * Indexer writing a Xapian fulltext index. It is registered as a custom
+ * article handler (IHandler) and, through getMetaArticle(), provides the
+ * article which exposes the resulting index file.
+ */
+class XapianIndexer : public Indexer, public IHandler {
+    public:
+        XapianIndexer(const std::string& language, bool verbose);
+        /* Path given to indexingPrelude(); the compacted index is written there. */
+        std::string getIndexPath() { return indexPath; }
+
+    protected:
+        /* NOTE(review): the functions below are presumably hooks declared by */
+        /* Indexer / IHandler - confirm against indexer.h. */
+        void indexingPrelude(const string indexPath);
+        void index(const string &url,
+                   const string &title,
+                   const string &unaccentedTitle,
+                   const string &keywords,
+                   const string &content,
+                   const string &snippet,
+                   const string &size,
+                   const string &wordCount);
+        void flush();
+        void indexingPostlude();
+        void handleArticle(Article* article);
+        XapianMetaArticle* getMetaArticle();
+        /* NOTE(review): no definition of XapianIndexer::getData() is part of */
+        /* xapianIndexer.cpp in this change - confirm it is needed. */
+        zim::Blob getData();
+
+        /* Working database, opened by indexingPrelude() at indexPath + ".tmp". */
+        Xapian::WritableDatabase writableDatabase;
+        /* Not used by xapianIndexer.cpp in this change (stemming code is commented out). */
+        Xapian::Stem stemmer;
+        /* Filled with the stopwords and installed on `indexer` by indexingPrelude(). */
+        Xapian::SimpleStopper stopper;
+        /* Term generator used by index() to index each document. */
+        Xapian::TermGenerator indexer;
+        std::string indexPath;
+};
+
+#endif // OPENZIM_ZIMWRITERFS_XAPIANINDEXER_H
diff --git a/zimwriterfs/zimwriterfs.cpp b/zimwriterfs/zimwriterfs.cpp
index 52ed7ea..b48a1ad 100644
--- a/zimwriterfs/zimwriterfs.cpp
+++ b/zimwriterfs/zimwriterfs.cpp
@@ -38,6 +38,9 @@
 #include "queue.h"
 #include "mimetypecounter.h"
 
+#if HAVE_XAPIAN
+#include "xapianIndexer.h"
+#endif
 
 std::string language;
 std::string creator;
@@ -229,6 +232,9 @@
 
 int main(int argc, char** argv) {
   ArticleSource source(filenameQueue);
+#if HAVE_XAPIAN
+  XapianIndexer* xapianIndexer = NULL;
+#endif
   int minChunkSize = 2048;
 
 
@@ -368,6 +374,17 @@
   pthread_create(&(directoryVisitor), NULL, visitDirectoryPath, (void*)NULL);
   pthread_detach(directoryVisitor);
 
+  /* Indexor */
+  if (createFullTextIndex)
+  {
+#if HAVE_XAPIAN
+       xapianIndexer = new XapianIndexer(language, isVerbose());
+       xapianIndexer->start(zimPath + ".indexdb");
+       source.add_customHandler(xapianIndexer);
+#else
+       std::cerr << "Zimwriterfs is compiled without xapian. Indexing is not 
available" << std::endl;
+#endif
+  }
 
   MimetypeCounter mimetypeCounter;
   source.add_customHandler(&mimetypeCounter);
@@ -381,6 +398,9 @@
     std::cerr << e.what() << std::endl;
   }
 
+#if HAVE_XAPIAN
+  delete xapianIndexer;
+#endif
   /* Destroy mutex */
   pthread_mutex_destroy(&directoryVisitorRunningMutex);
   pthread_mutex_destroy(&verboseMutex);

-- 
To view, visit https://gerrit.wikimedia.org/r/296913
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I52c255e8335d0b6763c1c59eeb1549300d5f6f81
Gerrit-PatchSet: 1
Gerrit-Project: openzim
Gerrit-Branch: master
Gerrit-Owner: Mgautierfr <mgaut...@kymeria.fr>
Gerrit-Reviewer: Kelson <kel...@kiwix.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] openzim[master]: Add xapian indexer.

Reply via email to