Kelson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/296913 )
Change subject: Add xapian indexer. ...................................................................... Add xapian indexer. Xapian is optional. Build your index inside zim by adding "-i" or "--createFullTextIndex" to zimwriterfs' command line. Change-Id: I52c255e8335d0b6763c1c59eeb1549300d5f6f81 --- M zimwriterfs/Makefile.am M zimwriterfs/configure.ac M zimwriterfs/tools.cpp M zimwriterfs/tools.h A zimwriterfs/xapian/htmlparse.cc A zimwriterfs/xapian/htmlparse.h A zimwriterfs/xapian/myhtmlparse.cc A zimwriterfs/xapian/myhtmlparse.h A zimwriterfs/xapian/namedentities.h A zimwriterfs/xapianIndexer.cpp A zimwriterfs/xapianIndexer.h M zimwriterfs/zimwriterfs.cpp 12 files changed, 1,490 insertions(+), 0 deletions(-) Approvals: Kelson: Verified; Looks good to me, approved diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am index 628b74c..1d40174 100644 --- a/zimwriterfs/Makefile.am +++ b/zimwriterfs/Makefile.am @@ -10,3 +10,15 @@ resourceTools.cpp \ pathTools.cpp \ mimetypecounter.cpp + +zimwriterfs_CXXFLAGS = $(ICU_CFLAGS) +zimwriterfs_LDFLAGS = $(ICU_LDFLAGS) + +if HAVE_XAPIAN +zimwriterfs_CXXFLAGS += $(XAPIAN_CFLAGS) +zimwriterfs_LDFLAGS += $(XAPIAN_LDFLAGS) +zimwriterfs_SOURCES += \ + xapianIndexer.cpp \ + xapian/myhtmlparse.cc \ + xapian/htmlparse.cc +endif diff --git a/zimwriterfs/configure.ac b/zimwriterfs/configure.ac index fb12c8f..795d3b1 100644 --- a/zimwriterfs/configure.ac +++ b/zimwriterfs/configure.ac @@ -71,6 +71,121 @@ AC_DEFINE_UNQUOTED(LZMA_MEMORY_SIZE, 128, [set lzma uncompress memory size to number of MB]) AC_DEFINE(ENABLE_LZMA, [1], [defined if lzma compression is enabled]) + +function findLibrary { + found=0 + for f in $(echo $LIBS_ROOT|tr ":" "\n") ; do + sf=`find $f -name $1 | grep $ARCH | head -1 2> /dev/null` + if [[ -f "$sf" -a $found -eq 0 ]] + then + found=1 + echo $sf + fi + done + if [[ $found -eq 0 ]] + then + for f in $(echo $LIBS_ROOT|tr ":" "\n") ; do + sf=`find $f -name $1 | head -1 2> /dev/null` + if [[ -f "$sf" -a 
$found -eq 0 ]] + then + found=1 + echo $sf + fi + done + fi + if [[ $found -eq 0 ]] + then + echo "no" + fi +} + + +#################################################### +############ ICU +#################################################### + + +ICU_CFLAGS="" +ICU_LDFLAGS="-licui18n -licuuc -licudata" # replaced by icu-config +ICU_STATIC_LDFLAGS="" + +# if --with-x, add path to LIBRARY_PATH +AC_ARG_WITH(icu, + AC_HELP_STRING([--with-icu=DIR], [alternate location for icu-config]), + export LIBRARY_PATH="${withval}:${LIBRARY_PATH}";ICU_PATH=${withval} + ) + +# look for shared library. +# AC_CHECK_HEADER([zlib.h],, [AC_MSG_ERROR([[cannot find zlib header]])]) +# AC_CHECK_LIB([z], [zlibVersion],, [AC_MSG_ERROR([[cannot find zlib]]);COMPILE_ICU=1]) +# ICU_FILES=`findLibrary "libicuuc.${SHARED_EXT}"` + +AC_CHECK_TOOL(HAVE_ICU_CONFIG, icu-config,, "${ICU_PATH}:${PATH}") +if test [ ! "$HAVE_ICU_CONFIG" ] +then + AC_MSG_ERROR([[cannot find icu-config]]) +else + OLDPATH=$PATH + PATH="${ICU_PATH}:${PATH}" + ICU_CFLAGS=`icu-config --cxxflags`; + ICU_LDFLAGS=`icu-config --ldflags`; + ICU_VER=`icu-config --version`; + ICU_FILES="`findLibrary "libicuuc.${SHARED_EXT}"` `findLibrary "libicudata.${SHARED_EXT}"` `findLibrary "libicui18n.${SHARED_EXT}"`" + PATH=$OLDPATH + if [[ $ICU_VER \< "4.2" ]] + then + AC_MSG_ERROR([[You need a version of libicu >= 4.2]]) + fi +fi + + +AC_SUBST(ICU_CFLAGS) +AC_SUBST(ICU_LDFLAGS) +AC_SUBST(ICU_STATIC_LDFLAGS) +AC_SUBST(ICU_FILES) +AC_SUBST(COMPILED_ICUDATA_DAT) + +#################################################### +############ XAPIAN +#################################################### + +XAPIAN_CFLAGS="" +XAPIAN_LDFLAGS="" +XAPIAN_STATIC_LDFLAGS="" +XAPIAN_ENABLE=0 + +# if --with-x, add path to LIBRARY_PATH +AC_ARG_WITH([xapian], + [AS_HELP_STRING([--with-xapian=DIR], [alternat location for xapian-config] @@)], + [xapian_dir=$withval], + [with_xapian=yes]) + + +AS_IF([test "x$with_xapian" == xno], + [AM_CONDITIONAL(HAVE_XAPIAN, false)], + 
[OLDPATH=$PATH + AS_IF([test "x$with_xapian" != xyes], + PATH="$with_xapian:$PATH") + AC_CHECK_TOOLS(XAPIAN_CONFIG, xapian-config-1.3, xapian-config,[],$PATH) + AS_IF([test "x$XAPIAN_CONFIG" == x ], + AC_MSG_ERROR([[cannot find xapian-config file]]) + ) + XAPIAN_VERSION=`$XAPIAN_CONFIG --version` + good_version=yes + AS_VERSION_COMPARE($XAPIAN_VERSION, "xapian-config - xapian-core 1.3.4", [good_version=no], [], []) + AS_IF([test "x$good_version" == xno], + AC_MSG_ERROR([[xapian version must be >= 1.3.4]]) + ) + AM_CONDITIONAL(HAVE_XAPIAN, true) + AC_DEFINE(HAVE_XAPIAN) + XAPIAN_CFLAGS=`$XAPIAN_CONFIG --cxxflags`; + XAPIAN_LDFLAGS=`$XAPIAN_CONFIG --ltlibs`; + PATH=$OLDPATH + ]) + +AC_SUBST(XAPIAN_CFLAGS) +AC_SUBST(XAPIAN_LDFLAGS) + # Configure the output files AC_CONFIG_FILES([ Makefile diff --git a/zimwriterfs/tools.cpp b/zimwriterfs/tools.cpp index 019b22c..868f32c 100644 --- a/zimwriterfs/tools.cpp +++ b/zimwriterfs/tools.cpp @@ -32,6 +32,10 @@ #include <sys/stat.h> #include <magic.h> +#include <unicode/translit.h> +#include <unicode/ucnv.h> + + #ifdef _WIN32 #define SEPARATOR "\\" #else @@ -523,3 +527,14 @@ return computeRelativePath(baseUrl, newUrl); } +std::string removeAccents(const std::string &text) { + ucnv_setDefaultName("UTF-8"); + UErrorCode status = U_ZERO_ERROR; + Transliterator *removeAccentsTrans = Transliterator::createInstance("Lower; NFD; [:M:] remove; NFC", UTRANS_FORWARD, status); + UnicodeString ustring = UnicodeString(text.c_str()); + removeAccentsTrans->transliterate(ustring); + delete removeAccentsTrans; + std::string unaccentedText; + ustring.toUTF8String(unaccentedText); + return unaccentedText; +} diff --git a/zimwriterfs/tools.h b/zimwriterfs/tools.h index 8b43da4..d85b292 100644 --- a/zimwriterfs/tools.h +++ b/zimwriterfs/tools.h @@ -45,4 +45,6 @@ std::string extractRedirectUrlFromHtml(const GumboVector* head_children); void getLinks(GumboNode* node, std::map<std::string, bool> &links); +std::string removeAccents(const std::string 
&text); + #endif // OPENZIM_ZIMWRITERFS_TOOLS_H diff --git a/zimwriterfs/xapian/htmlparse.cc b/zimwriterfs/xapian/htmlparse.cc new file mode 100644 index 0000000..39b49ae --- /dev/null +++ b/zimwriterfs/xapian/htmlparse.cc @@ -0,0 +1,373 @@ +/* htmlparse.cc: simple HTML parser for omega indexer + * + * Copyright 1999,2000,2001 BrightStation PLC + * Copyright 2001 Ananova Ltd + * Copyright 2002,2006,2007,2008 Olly Betts + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +// #include <config.h> + +#include "htmlparse.h" + +#include <xapian.h> + +// #include "utf8convert.h" + +#include <algorithm> + +#include <ctype.h> +#include <cstring> +#include <stdio.h> +#include <stdlib.h> + +using namespace std; + +inline void +lowercase_string(string &str) +{ + for (string::iterator i = str.begin(); i != str.end(); ++i) { + *i = tolower(static_cast<unsigned char>(*i)); + } +} + +map<string, unsigned int> HtmlParser::named_ents; + +inline static bool +p_notdigit(char c) +{ + return !isdigit(static_cast<unsigned char>(c)); +} + +inline static bool +p_notxdigit(char c) +{ + return !isxdigit(static_cast<unsigned char>(c)); +} + +inline static bool +p_notalnum(char c) +{ + return !isalnum(static_cast<unsigned char>(c)); +} + +inline static bool +p_notwhitespace(char c) +{ + return 
!isspace(static_cast<unsigned char>(c)); +} + +inline static bool +p_nottag(char c) +{ + return !isalnum(static_cast<unsigned char>(c)) && + c != '.' && c != '-' && c != ':'; // ':' for XML namespaces. +} + +inline static bool +p_whitespacegt(char c) +{ + return isspace(static_cast<unsigned char>(c)) || c == '>'; +} + +inline static bool +p_whitespaceeqgt(char c) +{ + return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>'; +} + +bool +HtmlParser::get_parameter(const string & param, string & value) +{ + map<string, string>::const_iterator i = parameters.find(param); + if (i == parameters.end()) return false; + value = i->second; + return true; +} + +HtmlParser::HtmlParser() +{ + static const struct ent { const char *n; unsigned int v; } ents[] = { +#include "namedentities.h" + { NULL, 0 } + }; + if (named_ents.empty()) { + const struct ent *i = ents; + while (i->n) { + named_ents[string(i->n)] = i->v; + ++i; + } + } +} + +void +HtmlParser::decode_entities(string &s) +{ + // We need a const_iterator version of s.end() - otherwise the + // find() and find_if() templates don't work... 
+ string::const_iterator amp = s.begin(), s_end = s.end(); + while ((amp = find(amp, s_end, '&')) != s_end) { + unsigned int val = 0; + string::const_iterator end, p = amp + 1; + if (p != s_end && *p == '#') { + p++; + if (p != s_end && (*p == 'x' || *p == 'X')) { + // hex + p++; + end = find_if(p, s_end, p_notxdigit); + sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); + } else { + // number + end = find_if(p, s_end, p_notdigit); + val = atoi(s.substr(p - s.begin(), end - p).c_str()); + } + } else { + end = find_if(p, s_end, p_notalnum); + string code = s.substr(p - s.begin(), end - p); + map<string, unsigned int>::const_iterator i; + i = named_ents.find(code); + if (i != named_ents.end()) val = i->second; + } + if (end < s_end && *end == ';') end++; + if (val) { + string::size_type amp_pos = amp - s.begin(); + if (val < 0x80) { + s.replace(amp_pos, end - amp, 1u, char(val)); + } else { + // Convert unicode value val to UTF-8. + char seq[4]; + unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq); + s.replace(amp_pos, end - amp, seq, len); + } + s_end = s.end(); + // We've modified the string, so the iterators are no longer + // valid... + amp = s.begin() + amp_pos + 1; + } else { + amp = end; + } + } +} + +void +HtmlParser::parse_html(const string &body) +{ + in_script = false; + + parameters.clear(); + string::const_iterator start = body.begin(); + + while (true) { + // Skip through until we find an HTML tag, a comment, or the end of + // document. Ignore isolated occurrences of `<' which don't start + // a tag or comment. + string::const_iterator p = start; + while (true) { + p = find(p, body.end(), '<'); + if (p == body.end()) break; + unsigned char ch = *(p + 1); + + // Tag, closing tag, or comment (or SGML declaration). + if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break; + + if (ch == '?') { + // PHP code or XML declaration. + // XML declaration is only valid at the start of the first line. 
+ // FIXME: need to deal with BOMs... + if (p != body.begin() || body.size() < 20) break; + + // XML declaration looks something like this: + // <?xml version="1.0" encoding="UTF-8"?> + if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break; + if (strchr(" \t\r\n", p[5]) == NULL) break; + + string::const_iterator decl_end = find(p + 6, body.end(), '?'); + if (decl_end == body.end()) break; + + // Default charset for XML is UTF-8. + charset = "UTF-8"; + + string decl(p + 6, decl_end); + size_t enc = decl.find("encoding"); + if (enc == string::npos) break; + + enc = decl.find_first_not_of(" \t\r\n", enc + 8); + if (enc == string::npos || enc == decl.size()) break; + + if (decl[enc] != '=') break; + + enc = decl.find_first_not_of(" \t\r\n", enc + 1); + if (enc == string::npos || enc == decl.size()) break; + + if (decl[enc] != '"' && decl[enc] != '\'') break; + + char quote = decl[enc++]; + size_t enc_end = decl.find(quote, enc); + + if (enc != string::npos) + charset = decl.substr(enc, enc_end - enc); + + break; + } + p++; + } + + // Process text up to start of tag. + if (p > start) { + string text = body.substr(start - body.begin(), p - start); + // convert_to_utf8(text, charset); + decode_entities(text); + process_text(text); + } + + if (p == body.end()) break; + + start = p + 1; + + if (start == body.end()) break; + + if (*start == '!') { + if (++start == body.end()) break; + if (++start == body.end()) break; + // comment or SGML declaration + if (*(start - 1) == '-' && *start == '-') { + ++start; + string::const_iterator close = find(start, body.end(), '>'); + // An unterminated comment swallows rest of document + // (like Netscape, but unlike MSIE IIRC) + if (close == body.end()) break; + + p = close; + // look for --> + while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-')) + p = find(p + 1, body.end(), '>'); + + if (p != body.end()) { + // Check for htdig's "ignore this bit" comments. 
+ if (p - start == 15 && string(start, p - 2) == "htdig_noindex") { + string::size_type i; + i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin()); + if (i == string::npos) break; + start = body.begin() + i + 21; + continue; + } + // If we found --> skip to there. + start = p; + } else { + // Otherwise skip to the first > we found (as Netscape does). + start = close; + } + } else { + // just an SGML declaration, perhaps giving the DTD - ignore it + start = find(start - 1, body.end(), '>'); + if (start == body.end()) break; + } + ++start; + } else if (*start == '?') { + if (++start == body.end()) break; + // PHP - swallow until ?> or EOF + start = find(start + 1, body.end(), '>'); + + // look for ?> + while (start != body.end() && *(start - 1) != '?') + start = find(start + 1, body.end(), '>'); + + // unterminated PHP swallows rest of document (rather arbitrarily + // but it avoids polluting the database when things go wrong) + if (start != body.end()) ++start; + } else { + // opening or closing tag + int closing = 0; + + if (*start == '/') { + closing = 1; + start = find_if(start + 1, body.end(), p_notwhitespace); + } + + p = start; + start = find_if(start, body.end(), p_nottag); + string tag = body.substr(p - body.begin(), start - p); + // convert tagname to lowercase + lowercase_string(tag); + + if (closing) { + closing_tag(tag); + if (in_script && tag == "script") in_script = false; + + /* ignore any bogus parameters on closing tags */ + p = find(start, body.end(), '>'); + if (p == body.end()) break; + start = p + 1; + } else { + // FIXME: parse parameters lazily. 
+ while (start < body.end() && *start != '>') { + string name, value; + + p = find_if(start, body.end(), p_whitespaceeqgt); + + name.assign(body, start - body.begin(), p - start); + + p = find_if(p, body.end(), p_notwhitespace); + + start = p; + if (start != body.end() && *start == '=') { + start = find_if(start + 1, body.end(), p_notwhitespace); + + p = body.end(); + + int quote = *start; + if (quote == '"' || quote == '\'') { + start++; + p = find(start, body.end(), quote); + } + + if (p == body.end()) { + // unquoted or no closing quote + p = find_if(start, body.end(), p_whitespacegt); + } + value.assign(body, start - body.begin(), p - start); + start = find_if(p, body.end(), p_notwhitespace); + + if (!name.empty()) { + // convert parameter name to lowercase + lowercase_string(name); + // in case of multiple entries, use the first + // (as Netscape does) + parameters.insert(make_pair(name, value)); + } + } + } +#if 0 + cout << "<" << tag; + map<string, string>::const_iterator x; + for (x = parameters.begin(); x != parameters.end(); x++) { + cout << " " << x->first << "=\"" << x->second << "\""; + } + cout << ">\n"; +#endif + opening_tag(tag); + parameters.clear(); + + // In <script> tags we ignore opening tags to avoid problems + // with "a<b". + if (tag == "script") in_script = true; + + if (start != body.end() && *start == '>') ++start; + } + } + } +} diff --git a/zimwriterfs/xapian/htmlparse.h b/zimwriterfs/xapian/htmlparse.h new file mode 100644 index 0000000..79e96ec --- /dev/null +++ b/zimwriterfs/xapian/htmlparse.h @@ -0,0 +1,49 @@ +/* htmlparse.h: simple HTML parser for omega indexer + * + * Copyright 1999,2000,2001 BrightStation PLC + * Copyright 2002,2006,2008 Olly Betts + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +#ifndef OMEGA_INCLUDED_HTMLPARSE_H +#define OMEGA_INCLUDED_HTMLPARSE_H + +#include <string> +#include <map> + +using std::string; +using std::map; + +class HtmlParser { + map<string, string> parameters; + protected: + void decode_entities(string &s); + bool in_script; + string charset; + static map<string, unsigned int> named_ents; + + bool get_parameter(const string & param, string & value); + public: + virtual void process_text(const string &/*text*/) { } + virtual void opening_tag(const string &/*tag*/) { } + virtual void closing_tag(const string &/*tag*/) { } + virtual void parse_html(const string &text); + HtmlParser(); + virtual ~HtmlParser() { } +}; + +#endif // OMEGA_INCLUDED_HTMLPARSE_H diff --git a/zimwriterfs/xapian/myhtmlparse.cc b/zimwriterfs/xapian/myhtmlparse.cc new file mode 100644 index 0000000..e1098af --- /dev/null +++ b/zimwriterfs/xapian/myhtmlparse.cc @@ -0,0 +1,302 @@ +/* myhtmlparse.cc: subclass of HtmlParser for extracting text. + * + * Copyright 1999,2000,2001 BrightStation PLC + * Copyright 2002,2003,2004,2006,2007,2008 Olly Betts + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +// #include <config.h> + +#include "myhtmlparse.h" + +// #include "utf8convert.h" + +#include <ctype.h> +#include <string.h> + +inline void +lowercase_string(string &str) +{ + for (string::iterator i = str.begin(); i != str.end(); ++i) { + *i = tolower(static_cast<unsigned char>(*i)); + } +} + +void +MyHtmlParser::parse_html(const string &text, const string &charset_, + bool charset_from_meta_) +{ + charset = charset_; + charset_from_meta = charset_from_meta_; + HtmlParser::parse_html(text); +} + +void +MyHtmlParser::process_text(const string &text) +{ + if (!text.empty() && !in_script_tag && !in_style_tag) { + string::size_type b = text.find_first_not_of(WHITESPACE); + if (b) pending_space = true; + while (b != string::npos) { + if (pending_space && !dump.empty()) dump += ' '; + string::size_type e = text.find_first_of(WHITESPACE, b); + pending_space = (e != string::npos); + if (!pending_space) { + dump.append(text.data() + b, text.size() - b); + return; + } + dump.append(text.data() + b, e - b); + b = text.find_first_not_of(WHITESPACE, e + 1); + } + } +} + +void +MyHtmlParser::opening_tag(const string &tag) +{ + if (tag.empty()) return; + switch (tag[0]) { + case 'a': + if (tag == "address") pending_space = true; + break; + case 'b': + if (tag == "body") { + dump.resize(0); + break; + } + if (tag == "blockquote" || tag == "br") pending_space = true; + break; + case 'c': + if (tag == "center") pending_space = true; + break; + case 'd': + if (tag == "dd" || tag == "dir" || tag == "div" || tag 
== "dl" || + tag == "dt") pending_space = true; + break; + case 'e': + if (tag == "embed") pending_space = true; + break; + case 'f': + if (tag == "fieldset" || tag == "form") pending_space = true; + break; + case 'h': + // hr, and h1, ..., h6 + if (tag.length() == 2 && strchr("r123456", tag[1])) + pending_space = true; + break; + case 'i': + if (tag == "iframe" || tag == "img" || tag == "isindex" || + tag == "input") pending_space = true; + break; + case 'k': + if (tag == "keygen") pending_space = true; + break; + case 'l': + if (tag == "legend" || tag == "li" || tag == "listing") + pending_space = true; + break; + case 'm': + if (tag == "meta") { + string content; + if (get_parameter("content", content)) { + string name; + if (get_parameter("name", name)) { + lowercase_string(name); + if (name == "description") { + if (sample.empty()) { + swap(sample, content); + // convert_to_utf8(sample, charset); + decode_entities(sample); + } + } else if (name == "keywords") { + if (!keywords.empty()) keywords += ' '; + // convert_to_utf8(content, charset); + decode_entities(content); + keywords += content; + } else if (name == "robots") { + decode_entities(content); + lowercase_string(content); + if (content.find("none") != string::npos || + content.find("noindex") != string::npos) { + indexing_allowed = false; + throw true; + } + } + break; + } + // If the current charset came from a meta tag, don't + // force reparsing again! 
+ if (charset_from_meta) break; + string hdr; + if (get_parameter("http-equiv", hdr)) { + lowercase_string(hdr); + if (hdr == "content-type") { + lowercase_string(content); + size_t start = content.find("charset="); + if (start == string::npos) break; + start += 8; + if (start == content.size()) break; + size_t end = start; + if (content[start] != '"') { + while (end < content.size()) { + unsigned char ch = content[end]; + if (ch <= 32 || ch >= 127 || + strchr(";()<>@,:\\\"/[]?={}", ch)) + break; + ++end; + } + } else { + ++start; + ++end; + while (end < content.size()) { + unsigned char ch = content[end]; + if (ch == '"') break; + if (ch == '\\') content.erase(end, 1); + ++end; + } + } + string newcharset(content, start, end - start); + if (charset != newcharset) { + throw newcharset; + } + } + } + break; + } + if (charset_from_meta) break; + string newcharset; + if (get_parameter("charset", newcharset)) { + // HTML5 added: <meta charset="..."> + lowercase_string(newcharset); + if (charset != newcharset) { + throw newcharset; + } + } + break; + } + if (tag == "marquee" || tag == "menu" || tag == "multicol") + pending_space = true; + break; + case 'o': + if (tag == "ol" || tag == "option") pending_space = true; + break; + case 'p': + if (tag == "p" || tag == "pre" || tag == "plaintext") + pending_space = true; + break; + case 'q': + if (tag == "q") pending_space = true; + break; + case 's': + if (tag == "style") { + in_style_tag = true; + break; + } + if (tag == "script") { + in_script_tag = true; + break; + } + if (tag == "select") pending_space = true; + break; + case 't': + if (tag == "table" || tag == "td" || tag == "textarea" || + tag == "th") pending_space = true; + break; + case 'u': + if (tag == "ul") pending_space = true; + break; + case 'x': + if (tag == "xmp") pending_space = true; + break; + } +} + +void +MyHtmlParser::closing_tag(const string &tag) +{ + if (tag.empty()) return; + switch (tag[0]) { + case 'a': + if (tag == "address") pending_space = 
true; + break; + case 'b': + if (tag == "body") { + throw true; + } + if (tag == "blockquote" || tag == "br") pending_space = true; + break; + case 'c': + if (tag == "center") pending_space = true; + break; + case 'd': + if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" || + tag == "dt") pending_space = true; + break; + case 'f': + if (tag == "fieldset" || tag == "form") pending_space = true; + break; + case 'h': + // hr, and h1, ..., h6 + if (tag.length() == 2 && strchr("r123456", tag[1])) + pending_space = true; + break; + case 'i': + if (tag == "iframe") pending_space = true; + break; + case 'l': + if (tag == "legend" || tag == "li" || tag == "listing") + pending_space = true; + break; + case 'm': + if (tag == "marquee" || tag == "menu") pending_space = true; + break; + case 'o': + if (tag == "ol" || tag == "option") pending_space = true; + break; + case 'p': + if (tag == "p" || tag == "pre") pending_space = true; + break; + case 'q': + if (tag == "q") pending_space = true; + break; + case 's': + if (tag == "style") { + in_style_tag = false; + break; + } + if (tag == "script") { + in_script_tag = false; + break; + } + if (tag == "select") pending_space = true; + break; + case 't': + if (tag == "title") { + if (title.empty()) swap(title, dump); + break; + } + if (tag == "table" || tag == "td" || tag == "textarea" || + tag == "th") pending_space = true; + break; + case 'u': + if (tag == "ul") pending_space = true; + break; + case 'x': + if (tag == "xmp") pending_space = true; + break; + } +} diff --git a/zimwriterfs/xapian/myhtmlparse.h b/zimwriterfs/xapian/myhtmlparse.h new file mode 100644 index 0000000..f221cb5 --- /dev/null +++ b/zimwriterfs/xapian/myhtmlparse.h @@ -0,0 +1,65 @@ +/* myhtmlparse.h: subclass of HtmlParser for extracting text + * + * Copyright 1999,2000,2001 BrightStation PLC + * Copyright 2002,2003,2004,2006,2008 Olly Betts + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU 
General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +#ifndef OMEGA_INCLUDED_MYHTMLPARSE_H +#define OMEGA_INCLUDED_MYHTMLPARSE_H + +#include "htmlparse.h" + +// FIXME: Should we include \xa0 which is non-breaking space in iso-8859-1, but +// not in all charsets and perhaps spans of all \xa0 should become a single +// \xa0? +#define WHITESPACE " \t\n\r" + +class MyHtmlParser : public HtmlParser { + public: + bool in_script_tag; + bool in_style_tag; + bool pending_space; + bool indexing_allowed; + bool charset_from_meta; + string title, sample, keywords, dump; + void process_text(const string &text); + void opening_tag(const string &tag); + void closing_tag(const string &tag); + void parse_html(const string &text, const string &charset_, + bool charset_from_meta_); + MyHtmlParser() : + in_script_tag(false), + in_style_tag(false), + pending_space(false), + indexing_allowed(true), + charset_from_meta(false) { } + + void reset() { + in_script_tag = false; + in_style_tag = false; + pending_space = false; + indexing_allowed = true; + charset_from_meta = false; + title.resize(0); + sample.resize(0); + keywords.resize(0); + dump.resize(0); + } +}; + +#endif // OMEGA_INCLUDED_MYHTMLPARSE_H diff --git a/zimwriterfs/xapian/namedentities.h b/zimwriterfs/xapian/namedentities.h new file mode 100644 index 0000000..8b7f03e --- /dev/null +++ b/zimwriterfs/xapian/namedentities.h @@ -0,0 +1,279 @@ +/* namedentities.h: 
named HTML entities. + * + * Copyright (C) 2006,2007 Olly Betts + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef OMEGA_INCLUDED_NAMEDENTITIES_H +#define OMEGA_INCLUDED_NAMEDENTITIES_H + +// Names and values from: "Character entity references in HTML 4" +// http://www.w3.org/TR/html4/sgml/entities.html +{ "quot", 34 }, +{ "amp", 38 }, +{ "apos", 39 }, // Not in HTML 4 list but used in OpenOffice XML. 
+{ "lt", 60 }, +{ "gt", 62 }, +{ "nbsp", 160 }, +{ "iexcl", 161 }, +{ "cent", 162 }, +{ "pound", 163 }, +{ "curren", 164 }, +{ "yen", 165 }, +{ "brvbar", 166 }, +{ "sect", 167 }, +{ "uml", 168 }, +{ "copy", 169 }, +{ "ordf", 170 }, +{ "laquo", 171 }, +{ "not", 172 }, +{ "shy", 173 }, +{ "reg", 174 }, +{ "macr", 175 }, +{ "deg", 176 }, +{ "plusmn", 177 }, +{ "sup2", 178 }, +{ "sup3", 179 }, +{ "acute", 180 }, +{ "micro", 181 }, +{ "para", 182 }, +{ "middot", 183 }, +{ "cedil", 184 }, +{ "sup1", 185 }, +{ "ordm", 186 }, +{ "raquo", 187 }, +{ "frac14", 188 }, +{ "frac12", 189 }, +{ "frac34", 190 }, +{ "iquest", 191 }, +{ "Agrave", 192 }, +{ "Aacute", 193 }, +{ "Acirc", 194 }, +{ "Atilde", 195 }, +{ "Auml", 196 }, +{ "Aring", 197 }, +{ "AElig", 198 }, +{ "Ccedil", 199 }, +{ "Egrave", 200 }, +{ "Eacute", 201 }, +{ "Ecirc", 202 }, +{ "Euml", 203 }, +{ "Igrave", 204 }, +{ "Iacute", 205 }, +{ "Icirc", 206 }, +{ "Iuml", 207 }, +{ "ETH", 208 }, +{ "Ntilde", 209 }, +{ "Ograve", 210 }, +{ "Oacute", 211 }, +{ "Ocirc", 212 }, +{ "Otilde", 213 }, +{ "Ouml", 214 }, +{ "times", 215 }, +{ "Oslash", 216 }, +{ "Ugrave", 217 }, +{ "Uacute", 218 }, +{ "Ucirc", 219 }, +{ "Uuml", 220 }, +{ "Yacute", 221 }, +{ "THORN", 222 }, +{ "szlig", 223 }, +{ "agrave", 224 }, +{ "aacute", 225 }, +{ "acirc", 226 }, +{ "atilde", 227 }, +{ "auml", 228 }, +{ "aring", 229 }, +{ "aelig", 230 }, +{ "ccedil", 231 }, +{ "egrave", 232 }, +{ "eacute", 233 }, +{ "ecirc", 234 }, +{ "euml", 235 }, +{ "igrave", 236 }, +{ "iacute", 237 }, +{ "icirc", 238 }, +{ "iuml", 239 }, +{ "eth", 240 }, +{ "ntilde", 241 }, +{ "ograve", 242 }, +{ "oacute", 243 }, +{ "ocirc", 244 }, +{ "otilde", 245 }, +{ "ouml", 246 }, +{ "divide", 247 }, +{ "oslash", 248 }, +{ "ugrave", 249 }, +{ "uacute", 250 }, +{ "ucirc", 251 }, +{ "uuml", 252 }, +{ "yacute", 253 }, +{ "thorn", 254 }, +{ "yuml", 255 }, +{ "OElig", 338 }, +{ "oelig", 339 }, +{ "Scaron", 352 }, +{ "scaron", 353 }, +{ "Yuml", 376 }, +{ "fnof", 402 }, +{ "circ", 710 }, +{ 
"tilde", 732 }, +{ "Alpha", 913 }, +{ "Beta", 914 }, +{ "Gamma", 915 }, +{ "Delta", 916 }, +{ "Epsilon", 917 }, +{ "Zeta", 918 }, +{ "Eta", 919 }, +{ "Theta", 920 }, +{ "Iota", 921 }, +{ "Kappa", 922 }, +{ "Lambda", 923 }, +{ "Mu", 924 }, +{ "Nu", 925 }, +{ "Xi", 926 }, +{ "Omicron", 927 }, +{ "Pi", 928 }, +{ "Rho", 929 }, +{ "Sigma", 931 }, +{ "Tau", 932 }, +{ "Upsilon", 933 }, +{ "Phi", 934 }, +{ "Chi", 935 }, +{ "Psi", 936 }, +{ "Omega", 937 }, +{ "alpha", 945 }, +{ "beta", 946 }, +{ "gamma", 947 }, +{ "delta", 948 }, +{ "epsilon", 949 }, +{ "zeta", 950 }, +{ "eta", 951 }, +{ "theta", 952 }, +{ "iota", 953 }, +{ "kappa", 954 }, +{ "lambda", 955 }, +{ "mu", 956 }, +{ "nu", 957 }, +{ "xi", 958 }, +{ "omicron", 959 }, +{ "pi", 960 }, +{ "rho", 961 }, +{ "sigmaf", 962 }, +{ "sigma", 963 }, +{ "tau", 964 }, +{ "upsilon", 965 }, +{ "phi", 966 }, +{ "chi", 967 }, +{ "psi", 968 }, +{ "omega", 969 }, +{ "thetasym", 977 }, +{ "upsih", 978 }, +{ "piv", 982 }, +{ "ensp", 8194 }, +{ "emsp", 8195 }, +{ "thinsp", 8201 }, +{ "zwnj", 8204 }, +{ "zwj", 8205 }, +{ "lrm", 8206 }, +{ "rlm", 8207 }, +{ "ndash", 8211 }, +{ "mdash", 8212 }, +{ "lsquo", 8216 }, +{ "rsquo", 8217 }, +{ "sbquo", 8218 }, +{ "ldquo", 8220 }, +{ "rdquo", 8221 }, +{ "bdquo", 8222 }, +{ "dagger", 8224 }, +{ "Dagger", 8225 }, +{ "bull", 8226 }, +{ "hellip", 8230 }, +{ "permil", 8240 }, +{ "prime", 8242 }, +{ "Prime", 8243 }, +{ "lsaquo", 8249 }, +{ "rsaquo", 8250 }, +{ "oline", 8254 }, +{ "frasl", 8260 }, +{ "euro", 8364 }, +{ "image", 8465 }, +{ "weierp", 8472 }, +{ "real", 8476 }, +{ "trade", 8482 }, +{ "alefsym", 8501 }, +{ "larr", 8592 }, +{ "uarr", 8593 }, +{ "rarr", 8594 }, +{ "darr", 8595 }, +{ "harr", 8596 }, +{ "crarr", 8629 }, +{ "lArr", 8656 }, +{ "uArr", 8657 }, +{ "rArr", 8658 }, +{ "dArr", 8659 }, +{ "hArr", 8660 }, +{ "forall", 8704 }, +{ "part", 8706 }, +{ "exist", 8707 }, +{ "empty", 8709 }, +{ "nabla", 8711 }, +{ "isin", 8712 }, +{ "notin", 8713 }, +{ "ni", 8715 }, +{ "prod", 8719 }, +{ "sum", 
8721 }, +{ "minus", 8722 }, +{ "lowast", 8727 }, +{ "radic", 8730 }, +{ "prop", 8733 }, +{ "infin", 8734 }, +{ "ang", 8736 }, +{ "and", 8743 }, +{ "or", 8744 }, +{ "cap", 8745 }, +{ "cup", 8746 }, +{ "int", 8747 }, +{ "there4", 8756 }, +{ "sim", 8764 }, +{ "cong", 8773 }, +{ "asymp", 8776 }, +{ "ne", 8800 }, +{ "equiv", 8801 }, +{ "le", 8804 }, +{ "ge", 8805 }, +{ "sub", 8834 }, +{ "sup", 8835 }, +{ "nsub", 8836 }, +{ "sube", 8838 }, +{ "supe", 8839 }, +{ "oplus", 8853 }, +{ "otimes", 8855 }, +{ "perp", 8869 }, +{ "sdot", 8901 }, +{ "lceil", 8968 }, +{ "rceil", 8969 }, +{ "lfloor", 8970 }, +{ "rfloor", 8971 }, +{ "lang", 9001 }, +{ "rang", 9002 }, +{ "loz", 9674 }, +{ "spades", 9824 }, +{ "clubs", 9827 }, +{ "hearts", 9829 }, +{ "diams", 9830 }, + +#endif // OMEGA_INCLUDED_NAMEDENTITIES_H diff --git a/zimwriterfs/xapianIndexer.cpp b/zimwriterfs/xapianIndexer.cpp new file mode 100644 index 0000000..33666c7 --- /dev/null +++ b/zimwriterfs/xapianIndexer.cpp @@ -0,0 +1,180 @@ +/* + * Copyright 2011 Emmanuel Engelhart <kel...@kiwix.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. 
+ */ + +#include "xapianIndexer.h" +#include "tools.h" + +#include <unistd.h> + +/* Constructor */ +XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) { + setVerboseFlag(verbose); + readStopWords(language); + /* + stemmer(Xapian::Stem("french")) { + this->indexer.set_stemmer(this->stemmer); + */ +} + +void XapianIndexer::indexingPrelude(const string indexPath_) { + indexPath = indexPath_; + this->writableDatabase = Xapian::WritableDatabase(indexPath + ".tmp", Xapian::DB_CREATE_OR_OVERWRITE); + this->writableDatabase.begin_transaction(true); + + /* Insert the stopwords */ + if (!this->stopWords.empty()) { + std::vector<std::string>::iterator it = this->stopWords.begin(); + for( ; it != this->stopWords.end(); ++it) { + this->stopper.add(*it); + } + + this->indexer.set_stopper(&(this->stopper)); + } +} + +void XapianIndexer::index(const string &url, + const string &title, + const string &unaccentedTitle, + const string &keywords, + const string &content, + const string &snippet, + const string &size, + const string &wordCount) { + + /* Put the data in the document */ + Xapian::Document currentDocument; + currentDocument.clear_values(); + currentDocument.add_value(0, title); + currentDocument.add_value(1, snippet); + currentDocument.add_value(2, size); + currentDocument.add_value(3, wordCount); + currentDocument.set_data(url); + indexer.set_document(currentDocument); + + /* Index the title */ + if (!unaccentedTitle.empty()) { + this->indexer.index_text_without_positions(unaccentedTitle, this->getTitleBoostFactor(content.size())); + } + + /* Index the keywords */ + if (!keywords.empty()) { + this->indexer.index_text_without_positions(keywords, keywordsBoostFactor); + } + + /* Index the content */ + if (!content.empty()) { + this->indexer.index_text_without_positions(content); + } + + /* add to the database */ + this->writableDatabase.add_document(currentDocument); +} + +void XapianIndexer::flush() { + 
this->writableDatabase.commit_transaction(); + this->writableDatabase.begin_transaction(true); +} + +void XapianIndexer::indexingPostlude() { + this->flush(); + this->writableDatabase.commit_transaction(); + this->writableDatabase.commit(); + this->writableDatabase.compact(indexPath, Xapian::DBCOMPACT_SINGLE_FILE); + + // commit is not available is old version of xapian and seems not mandatory there + // this->writableDatabase.commit(); +} + +void XapianIndexer::handleArticle(Article* article) +{ + indexerToken token; + size_t found; + MyHtmlParser htmlParser; + + if ( article->isRedirect() || article->getMimeType().find("text/html") != 0 ) + return; + + token.title = article->getTitle(); + token.url = article->getUrl(); + zim::Blob article_content = article->getData(); + token.content = std::string(article_content.data(), article_content.size()); + + /* The parser generate a lot of exceptions which should be avoided */ + try { + htmlParser.parse_html(token.content, "UTF-8", true); + } catch (...) { + } + + /* If content does not have the noindex meta tag */ + /* Seems that the parser generates an exception in such case */ + found = htmlParser.dump.find("NOINDEX"); + + if (found == string::npos) { + /* Get the accented title */ + token.accentedTitle = (htmlParser.title.empty() ? 
token.title : htmlParser.title); + + /* count words */ + stringstream countWordStringStream; + countWordStringStream << countWords(htmlParser.dump); + token.wordCount = countWordStringStream.str(); + + /* snippet */ + std::string snippet = std::string(htmlParser.dump, 0, 300); + std::string::size_type last = snippet.find_last_of('.'); + if (last == snippet.npos) + last = snippet.find_last_of(' '); + if (last != snippet.npos) + snippet = snippet.substr(0, last); + token.snippet = snippet; + + /* size */ + stringstream sizeStringStream; + sizeStringStream << token.content.size() / 1024; + token.size = sizeStringStream.str(); + + /* Remove accent */ + token.title = removeAccents(token.accentedTitle); + token.keywords = removeAccents(htmlParser.keywords); + token.content = removeAccents(htmlParser.dump); + pushToIndexQueue(token); + } +} + +XapianMetaArticle* XapianIndexer::getMetaArticle() +{ + return new XapianMetaArticle(this); +} + +zim::Blob XapianMetaArticle::getData() const +{ + if ( data.size() == 0 ) + { + indexerToken token; + indexer->pushToIndexQueue(token); + /* Wait it index everything */ + int wait = 500; + while ( indexer->isRunning() ) + { + usleep(wait); + } + data = getFileContent(indexer->getIndexPath()); + } + return zim::Blob(data.data(), data.size()); +} + diff --git a/zimwriterfs/xapianIndexer.h b/zimwriterfs/xapianIndexer.h new file mode 100644 index 0000000..71dfe64 --- /dev/null +++ b/zimwriterfs/xapianIndexer.h @@ -0,0 +1,78 @@ +/* + * Copyright 2011 Emmanuel Engelhart <kel...@kiwix.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_ZIMWRITERFS_XAPIANINDEXER_H +#define OPENZIM_ZIMWRITERFS_XAPIANINDEXER_H + + +#include "indexer.h" +#include "articlesource.h" +#include "article.h" + +#include <xapian.h> +#include "xapian/myhtmlparse.h" +#include <zim/blob.h> + +class XapianIndexer; + +class XapianMetaArticle : public Article { + private: + XapianIndexer* indexer; + mutable std::string data; + public: + XapianMetaArticle(XapianIndexer* indexer): + indexer(indexer) + { + ns = 'Z'; + aid = url = "/Z/fulltextIndex/xapian"; + title = "Xapian Fulltext Index"; + mimeType = "application/octet-stream+xapian"; + }; + virtual zim::Blob getData() const; +}; + +class XapianIndexer : public Indexer, public IHandler { + public: + XapianIndexer(const std::string& language, bool verbose); + std::string getIndexPath() { return indexPath; } + + protected: + void indexingPrelude(const string indexPath); + void index(const string &url, + const string &title, + const string &unaccentedTitle, + const string &keywords, + const string &content, + const string &snippet, + const string &size, + const string &wordCount); + void flush(); + void indexingPostlude(); + void handleArticle(Article* article); + XapianMetaArticle* getMetaArticle(); + zim::Blob getData(); + + Xapian::WritableDatabase writableDatabase; + Xapian::Stem stemmer; + Xapian::SimpleStopper stopper; + Xapian::TermGenerator indexer; + std::string indexPath; +}; + +#endif // OPENZIM_ZIMWRITERFS_XAPIANINDEXER_H diff --git a/zimwriterfs/zimwriterfs.cpp b/zimwriterfs/zimwriterfs.cpp index 52ed7ea..b48a1ad 100644 --- a/zimwriterfs/zimwriterfs.cpp +++ b/zimwriterfs/zimwriterfs.cpp @@ -38,6 +38,9 @@ #include "queue.h" #include "mimetypecounter.h" +#if HAVE_XAPIAN 
+#include "xapianIndexer.h" +#endif std::string language; std::string creator; @@ -229,6 +232,9 @@ int main(int argc, char** argv) { ArticleSource source(filenameQueue); +#if HAVE_XAPIAN + XapianIndexer* xapianIndexer = NULL; +#endif int minChunkSize = 2048; @@ -368,6 +374,17 @@ pthread_create(&(directoryVisitor), NULL, visitDirectoryPath, (void*)NULL); pthread_detach(directoryVisitor); + /* Indexer */ + if (createFullTextIndex) + { +#if HAVE_XAPIAN + xapianIndexer = new XapianIndexer(language, isVerbose()); + xapianIndexer->start(zimPath + ".indexdb"); + source.add_customHandler(xapianIndexer); +#else + std::cerr << "Zimwriterfs is compiled without xapian. Indexing is not available" << std::endl; +#endif + } MimetypeCounter mimetypeCounter; source.add_customHandler(&mimetypeCounter); @@ -381,6 +398,9 @@ std::cerr << e.what() << std::endl; } +#if HAVE_XAPIAN + delete xapianIndexer; +#endif /* Destroy mutex */ pthread_mutex_destroy(&directoryVisitorRunningMutex); pthread_mutex_destroy(&verboseMutex); -- To view, visit https://gerrit.wikimedia.org/r/296913 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I52c255e8335d0b6763c1c59eeb1549300d5f6f81 Gerrit-PatchSet: 1 Gerrit-Project: openzim Gerrit-Branch: master Gerrit-Owner: Mgautierfr <mgaut...@kymeria.fr> Gerrit-Reviewer: Kelson <kel...@kiwix.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits