[MediaWiki-commits] [Gerrit] Add xapian indexer. - change (openzim)

2016-07-03 Thread Kelson (Code Review)
Kelson has submitted this change and it was merged.

Change subject: Add xapian indexer.
..


Add xapian indexer.

Xapian is optional.
Build your index inside zim by adding "-i" or "--createFullTextIndex"
to zimwriterfs' command line.

Change-Id: I52c255e8335d0b6763c1c59eeb1549300d5f6f81
---
M zimwriterfs/Makefile.am
M zimwriterfs/configure.ac
M zimwriterfs/tools.cpp
M zimwriterfs/tools.h
A zimwriterfs/xapian/htmlparse.cc
A zimwriterfs/xapian/htmlparse.h
A zimwriterfs/xapian/myhtmlparse.cc
A zimwriterfs/xapian/myhtmlparse.h
A zimwriterfs/xapian/namedentities.h
A zimwriterfs/xapianIndexer.cpp
A zimwriterfs/xapianIndexer.h
M zimwriterfs/zimwriterfs.cpp
12 files changed, 1,490 insertions(+), 0 deletions(-)

Approvals:
  Kelson: Verified; Looks good to me, approved



diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am
index 628b74c..1d40174 100644
--- a/zimwriterfs/Makefile.am
+++ b/zimwriterfs/Makefile.am
@@ -10,3 +10,15 @@
 resourceTools.cpp \
 pathTools.cpp \
 mimetypecounter.cpp
+
+zimwriterfs_CXXFLAGS = $(ICU_CFLAGS)
+zimwriterfs_LDFLAGS = $(ICU_LDFLAGS)
+
+if HAVE_XAPIAN
+zimwriterfs_CXXFLAGS += $(XAPIAN_CFLAGS)
+zimwriterfs_LDFLAGS += $(XAPIAN_LDFLAGS)
+zimwriterfs_SOURCES += \
+xapianIndexer.cpp \
+xapian/myhtmlparse.cc \
+xapian/htmlparse.cc
+endif
diff --git a/zimwriterfs/configure.ac b/zimwriterfs/configure.ac
index fb12c8f..795d3b1 100644
--- a/zimwriterfs/configure.ac
+++ b/zimwriterfs/configure.ac
@@ -71,6 +71,121 @@
 AC_DEFINE_UNQUOTED(LZMA_MEMORY_SIZE, 128, [set lzma uncompress memory size to number of MB])
 AC_DEFINE(ENABLE_LZMA, [1], [defined if lzma compression is enabled])
 
+
+function findLibrary {
+   found=0
+   for f in $(echo $LIBS_ROOT|tr ":" "\n") ; do
+   sf=`find $f -name $1 | grep $ARCH | head -1 2> /dev/null`
+   if [[ -f "$sf" -a $found -eq 0 ]]
+   then
+   found=1
+   echo $sf
+   fi
+   done
+   if [[ $found -eq 0 ]]
+   then
+   for f in $(echo $LIBS_ROOT|tr ":" "\n") ; do
+   sf=`find $f -name $1 | head -1 2> /dev/null`
+   if [[ -f "$sf" -a $found -eq 0 ]]
+   then
+   found=1
+   echo $sf
+   fi
+   done
+   fi
+   if [[ $found -eq 0 ]]
+   then
+   echo "no"
+   fi
+}
+
+
+
+#### ICU
+
+
+
+ICU_CFLAGS=""
+ICU_LDFLAGS="-licui18n -licuuc -licudata" # replaced by icu-config
+ICU_STATIC_LDFLAGS=""
+
+# if --with-x, add path to LIBRARY_PATH
+AC_ARG_WITH(icu,
+AC_HELP_STRING([--with-icu=DIR], [alternate location for icu-config]),
+export LIBRARY_PATH="${withval}:${LIBRARY_PATH}";ICU_PATH=${withval}
+   )
+
+# look for shared library.
+# AC_CHECK_HEADER([zlib.h],, [AC_MSG_ERROR([[cannot find zlib header]])])
+# AC_CHECK_LIB([z], [zlibVersion],, [AC_MSG_ERROR([[cannot find zlib]]);COMPILE_ICU=1])
+# ICU_FILES=`findLibrary "libicuuc.${SHARED_EXT}"`
+
+AC_CHECK_TOOL(HAVE_ICU_CONFIG, icu-config,, "${ICU_PATH}:${PATH}")
+if test [ ! "$HAVE_ICU_CONFIG" ]
+then
+ AC_MSG_ERROR([[cannot find icu-config]])
+else
+OLDPATH=$PATH
+PATH="${ICU_PATH}:${PATH}"
+ICU_CFLAGS=`icu-config --cxxflags`;
+ICU_LDFLAGS=`icu-config --ldflags`;
+ICU_VER=`icu-config --version`;
+ICU_FILES="`findLibrary "libicuuc.${SHARED_EXT}"` `findLibrary "libicudata.${SHARED_EXT}"` `findLibrary "libicui18n.${SHARED_EXT}"`"
+PATH=$OLDPATH
+if [[ $ICU_VER \< "4.2" ]]
+   then
+AC_MSG_ERROR([[You need a version of libicu >= 4.2]])
+   fi
+fi
+
+
+AC_SUBST(ICU_CFLAGS)
+AC_SUBST(ICU_LDFLAGS)
+AC_SUBST(ICU_STATIC_LDFLAGS)
+AC_SUBST(ICU_FILES)
+AC_SUBST(COMPILED_ICUDATA_DAT)
+
+
+#### XAPIAN
+
+
+XAPIAN_CFLAGS=""
+XAPIAN_LDFLAGS=""
+XAPIAN_STATIC_LDFLAGS=""
+XAPIAN_ENABLE=0
+
+# if --with-x, add path to LIBRARY_PATH
+AC_ARG_WITH([xapian],
+   [AS_HELP_STRING([--with-xapian=DIR], [alternat location for xapian-config] @@)],
+   [xapian_dir=$withval],
+   [with_xapian=yes])
+
+
+AS_IF([test "x$with_xapian" == xno],
+[AM_CONDITIONAL(HAVE_XAPIAN, false)],
+   [OLDPATH=$PATH
+AS_IF([test "x$with_xapian" != xyes],
+  PATH="$with_xapian:$PATH")
+AC_CHECK_TOOLS(XAPIAN_CONFIG, xapian-config-1.3, xapian-config,[],$PATH)
+AS_IF([test "x$XAPIAN_CONFIG" == x ],
+   AC_MSG_ERROR([[cannot find xapian-config file]])
+ )
+XAPIAN_VERSION=`$XAPIAN_CONFIG --version`
+good_version=yes
+AS_VERSION_COMPARE($XAPIAN_

[MediaWiki-commits] [Gerrit] Add xapian indexer. - change (openzim)

2016-06-22 Thread Mgautierfr (Code Review)
Mgautierfr has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/295521

Change subject: Add xapian indexer.
..

Add xapian indexer.

Xapian is optional.
Build your index inside zim by adding "-i" to zimwriterfs' command line.

Change-Id: I866fe525deaf6cabddc41b9ea73bd5101c84ea7a
---
M zimwriterfs/Makefile.am
M zimwriterfs/configure.ac
A zimwriterfs/xapian/htmlparse.cc
A zimwriterfs/xapian/htmlparse.h
A zimwriterfs/xapian/myhtmlparse.cc
A zimwriterfs/xapian/myhtmlparse.h
A zimwriterfs/xapian/namedentities.h
A zimwriterfs/xapianIndexer.cpp
A zimwriterfs/xapianIndexer.h
M zimwriterfs/zimwriterfs.cpp
10 files changed, 1,368 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/openzim refs/changes/21/295521/1

diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am
index e54c64d..c6d6406 100644
--- a/zimwriterfs/Makefile.am
+++ b/zimwriterfs/Makefile.am
@@ -12,3 +12,12 @@
 
 zimwriterfs_CXXFLAGS = $(ICU_CFLAGS)
 zimwriterfs_LDFLAGS = $(ICU_LDFLAGS)
+
+if HAVE_XAPIAN
+zimwriterfs_CXXFLAGS += $(XAPIAN_CFLAGS)
+zimwriterfs_LDFLAGS += $(XAPIAN_LDFLAGS)
+zimwriterfs_SOURCES += \
+xapianIndexer.cpp \
+xapian/myhtmlparse.cc \
+xapian/htmlparse.cc
+endif
diff --git a/zimwriterfs/configure.ac b/zimwriterfs/configure.ac
index ba23cd9..795d3b1 100644
--- a/zimwriterfs/configure.ac
+++ b/zimwriterfs/configure.ac
@@ -145,6 +145,46 @@
 AC_SUBST(ICU_FILES)
 AC_SUBST(COMPILED_ICUDATA_DAT)
 
+
+#### XAPIAN
+
+
+XAPIAN_CFLAGS=""
+XAPIAN_LDFLAGS=""
+XAPIAN_STATIC_LDFLAGS=""
+XAPIAN_ENABLE=0
+
+# if --with-x, add path to LIBRARY_PATH
+AC_ARG_WITH([xapian],
+   [AS_HELP_STRING([--with-xapian=DIR], [alternat location for xapian-config] @@)],
+   [xapian_dir=$withval],
+   [with_xapian=yes])
+
+
+AS_IF([test "x$with_xapian" == xno],
+[AM_CONDITIONAL(HAVE_XAPIAN, false)],
+   [OLDPATH=$PATH
+AS_IF([test "x$with_xapian" != xyes],
+  PATH="$with_xapian:$PATH")
+AC_CHECK_TOOLS(XAPIAN_CONFIG, xapian-config-1.3, xapian-config,[],$PATH)
+AS_IF([test "x$XAPIAN_CONFIG" == x ],
+   AC_MSG_ERROR([[cannot find xapian-config file]])
+ )
+XAPIAN_VERSION=`$XAPIAN_CONFIG --version`
+good_version=yes
+AS_VERSION_COMPARE($XAPIAN_VERSION, "xapian-config - xapian-core 1.3.4", [good_version=no], [], [])
+AS_IF([test "x$good_version" == xno],
+   AC_MSG_ERROR([[xapian version must be >= 1.3.4]])
+ )
+AM_CONDITIONAL(HAVE_XAPIAN, true)
+AC_DEFINE(HAVE_XAPIAN)
+XAPIAN_CFLAGS=`$XAPIAN_CONFIG --cxxflags`;
+XAPIAN_LDFLAGS=`$XAPIAN_CONFIG --ltlibs`;
+PATH=$OLDPATH
+   ])
+
+AC_SUBST(XAPIAN_CFLAGS)
+AC_SUBST(XAPIAN_LDFLAGS)
 
 # Configure the output files
 AC_CONFIG_FILES([
diff --git a/zimwriterfs/xapian/htmlparse.cc b/zimwriterfs/xapian/htmlparse.cc
new file mode 100644
index 0000000..483b03f
--- /dev/null
+++ b/zimwriterfs/xapian/htmlparse.cc
@@ -0,0 +1,373 @@
+/* htmlparse.cc: simple HTML parser for omega indexer
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2001 Ananova Ltd
+ * Copyright 2002,2006,2007,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+// #include <config.h>
+
+#include "htmlparse.h"
+
+#include <xapian.h>
+
+// #include "utf8convert.h"
+
+#include <algorithm>
+
+#include <ctype.h>
+#include <cstring>
+#include <stdio.h>
+#include <stdlib.h>
+
+using namespace std;
+
+inline void
+lowercase_string(string &str)
+{
+for (string::iterator i = str.begin(); i != str.end(); ++i) {
+   *i = tolower(static_cast<unsigned char>(*i));
+}
+}
+
+map<string, unsigned int> HtmlParser::named_ents;
+
+inline static bool
+p_notdigit(char c)
+{
+return !isdigit(static_cast<unsigned char>(c));
+}
+
+inline static bool
+p_notxdigit(char c)
+{
+return !isxdigit(static_cast<unsigned char>(c));
+}
+
+inline static bool
+p_notalnum(char c)
+{
+return !isalnum(static_cast<unsigned char>(c));
+}
+
+inline static bool
+p_notwhitespace(char c)
+{
+return !isspace(static_cast<unsigned char>(c));
+}
+
+inline static bool
+p_nottag(char c)
+{
+return !isalnum(static_cast<unsigned char>(c)) &&
+   c != '.' && c != '-' && c != ':'; // ':' for XML namespaces.
+}
+
+inline s