Mgautierfr has uploaded a new change for review.
https://gerrit.wikimedia.org/r/295521
Change subject: Add xapian indexer.
..
Add xapian indexer.
Xapian is optional.
Build your index inside zim by adding "-i" to zimwriterfs' command line.
Change-Id: I866fe525deaf6cabddc41b9ea73bd5101c84ea7a
---
M zimwriterfs/Makefile.am
M zimwriterfs/configure.ac
A zimwriterfs/xapian/htmlparse.cc
A zimwriterfs/xapian/htmlparse.h
A zimwriterfs/xapian/myhtmlparse.cc
A zimwriterfs/xapian/myhtmlparse.h
A zimwriterfs/xapian/namedentities.h
A zimwriterfs/xapianIndexer.cpp
A zimwriterfs/xapianIndexer.h
M zimwriterfs/zimwriterfs.cpp
10 files changed, 1,368 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/openzim refs/changes/21/295521/1
diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am
index e54c64d..c6d6406 100644
--- a/zimwriterfs/Makefile.am
+++ b/zimwriterfs/Makefile.am
@@ -12,3 +12,12 @@
zimwriterfs_CXXFLAGS = $(ICU_CFLAGS)
zimwriterfs_LDFLAGS = $(ICU_LDFLAGS)
+
+if HAVE_XAPIAN
+zimwriterfs_CXXFLAGS += $(XAPIAN_CFLAGS)
+zimwriterfs_LDFLAGS += $(XAPIAN_LDFLAGS)
+zimwriterfs_SOURCES += \
+xapianIndexer.cpp \
+xapian/myhtmlparse.cc \
+xapian/htmlparse.cc
+endif
diff --git a/zimwriterfs/configure.ac b/zimwriterfs/configure.ac
index ba23cd9..795d3b1 100644
--- a/zimwriterfs/configure.ac
+++ b/zimwriterfs/configure.ac
@@ -145,6 +145,46 @@
AC_SUBST(ICU_FILES)
AC_SUBST(COMPILED_ICUDATA_DAT)
+
+# XAPIAN
+
+
+XAPIAN_CFLAGS=""
+XAPIAN_LDFLAGS=""
+XAPIAN_STATIC_LDFLAGS=""
+XAPIAN_ENABLE=0
+
+# if --with-x, add path to LIBRARY_PATH
+AC_ARG_WITH([xapian],
+ [AS_HELP_STRING([--with-xapian=DIR], [alternate location for xapian-config] @@)],
+ [xapian_dir=$withval],
+ [with_xapian=yes])
+
+
+AS_IF([test "x$with_xapian" == xno],
+[AM_CONDITIONAL(HAVE_XAPIAN, false)],
+ [OLDPATH=$PATH
+AS_IF([test "x$with_xapian" != xyes],
+ PATH="$with_xapian:$PATH")
+AC_CHECK_TOOLS(XAPIAN_CONFIG, xapian-config-1.3, xapian-config,[],$PATH)
+AS_IF([test "x$XAPIAN_CONFIG" == x ],
+ AC_MSG_ERROR([[cannot find xapian-config file]])
+ )
+XAPIAN_VERSION=`$XAPIAN_CONFIG --version`
+good_version=yes
+AS_VERSION_COMPARE($XAPIAN_VERSION, "xapian-config - xapian-core 1.3.4", [good_version=no], [], [])
+AS_IF([test "x$good_version" == xno],
+ AC_MSG_ERROR([[xapian version must be >= 1.3.4]])
+ )
+AM_CONDITIONAL(HAVE_XAPIAN, true)
+AC_DEFINE(HAVE_XAPIAN)
+XAPIAN_CFLAGS=`$XAPIAN_CONFIG --cxxflags`;
+XAPIAN_LDFLAGS=`$XAPIAN_CONFIG --ltlibs`;
+PATH=$OLDPATH
+ ])
+
+AC_SUBST(XAPIAN_CFLAGS)
+AC_SUBST(XAPIAN_LDFLAGS)
# Configure the output files
AC_CONFIG_FILES([
diff --git a/zimwriterfs/xapian/htmlparse.cc b/zimwriterfs/xapian/htmlparse.cc
new file mode 100644
index 0000000..483b03f
--- /dev/null
+++ b/zimwriterfs/xapian/htmlparse.cc
@@ -0,0 +1,373 @@
+/* htmlparse.cc: simple HTML parser for omega indexer
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2001 Ananova Ltd
+ * Copyright 2002,2006,2007,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+// #include <config.h>
+
+#include "htmlparse.h"
+
+#include <xapian.h>
+
+// #include "utf8convert.h"
+
+#include <algorithm>
+
+#include <ctype.h>
+#include <cstring>
+#include <stdio.h>
+#include <stdlib.h>
+
+using namespace std;
+
+inline void
+lowercase_string(string &str)
+{
+for (string::iterator i = str.begin(); i != str.end(); ++i) {
+ *i = tolower(static_cast<unsigned char>(*i));
+}
+}
+
+map<string, unsigned int> HtmlParser::named_ents;
+
+inline static bool
+p_notdigit(char c)
+{
+return !isdigit(static_cast<unsigned char>(c));
+}
+
+inline static bool
+p_notxdigit(char c)
+{
+return !isxdigit(static_cast<unsigned char>(c));
+}
+
+inline static bool
+p_notalnum(char c)
+{
+return !isalnum(static_cast<unsigned char>(c));
+}
+
+inline static bool
+p_notwhitespace(char c)
+{
+return !isspace(static_cast<unsigned char>(c));
+}
+
+inline static bool
+p_nottag(char c)
+{
+return !isalnum(static_cast<unsigned char>(c)) &&
+ c != '.' && c != '-' && c != ':'; // ':' for XML namespaces.
+}
+
+inline s