RE: FuzzyOCR Warnings and General Questions

2007-04-10 Thread Leon Kolchinsky
 
 I'm running Spamassassin on OpenSuse 10.2 and have just installed
 FuzzyOCR.
 
 It appears to be working in that it scans/detects words in the supplied
 test files.
 
 I noticed spamassassin --lint gives:
 
 [25313] warn: FuzzyOcr: Cannot find executable for pamthreshold
 [25313] warn: FuzzyOcr: Cannot find executable for tesseract
 
 Which seems fair enough as I don't have them.
 
 Is it just a spurious warning though or do I need to be concerned?
 
 Also as a general question other than adding words to the wordlist as
 and when, are there any Must Know tips n tricks for FuzzyOCR?
 
 cheers,

Hi,

Take a look here (http://www200.pair.com/mecham/spam/image_spam2.html) and use 
patches for netpbm  10.34


Or do the following (works for me):

1) Download latest stable version:
# svn checkout https://netpbm.svn.sourceforge.net/svnroot/netpbm/stable netpbm

2) Apply this patch:
diff -Naur netpbm-10.35.21/Makefile.config.in 
netpbm-10.35.21-patched/Makefile.config.in
--- netpbm-10.35.21/Makefile.config.in  2007-01-14 16:18:25.0 +0200
+++ netpbm-10.35.21-patched/Makefile.config.in  2007-01-14 16:33:59.304432096 
+0200
@@ -108,7 +108,7 @@
 #OSF1:
 #INSTALL = $(SRCDIR)/buildtools/installosf
 #Red Hat Linux:
-#INSTALL = install
+INSTALL = install

 # STRIPFLAG is the option you pass to the above install program to make it
 # strip unnecessary information out of binaries.
@@ -280,9 +280,9 @@
 # compiler/linker).  Build-time linking fails without it.  I don't
 # know why -- history seems to be repeating itself.  2005.02.23.

-CFLAGS_SHLIB =
+# CFLAGS_SHLIB =
 # Solaris or SunOS with gcc, and NetBSD:
-#CFLAGS_SHLIB = -fpic
+CFLAGS_SHLIB = -fPIC
 #CFLAGS_SHLIB = -fPIC
 # Sun compiler:
 #CFLAGS_SHLIB = -Kpic
@@ -350,7 +350,7 @@
 # The TIFF library.  See above.  If you want to build the tiff
 # converters, you must have the tiff library already installed.

-TIFFLIB = NONE
+TIFFLIB = libtiff.so
 TIFFHDR_DIR =

 #TIFFLIB = libtiff.so
@@ -382,7 +382,7 @@
 # JPEG stuff statically linked in, in which case you won't need
 # JPEGLIB in order to build the Tiff converters.

-JPEGLIB = NONE
+JPEGLIB = libjpeg.so
 JPEGHDR_DIR =
 #JPEGLIB = libjpeg.so
 #JPEGHDR_DIR = /usr/include/jpeg
@@ -413,7 +413,7 @@
 # case, PNGLIB and PNGHDR_DIR are irrelevant, but PNGVER is still meaningful,
 # because the make file runs 'libpng$(PNGVER)-config'.

-PNGLIB = NONE
+PNGLIB = libpng.so
 PNGHDR_DIR =
 PNGVER =
 #PNGLIB = libpng$(PNGVER).so
@@ -432,7 +432,7 @@
 #
 # If you have 'libpng-config' (see above), these are irrelevant.

-ZLIB = NONE
+ZLIB = libz.so
 ZHDR_DIR =
 #ZLIB = libz.so

diff -Naur netpbm-10.35.21/converter/other/fiasco/codec/dfiasco.c 
netpbm-10.35.21-patched/converter/other/fiasco/codec/dfiasco.c
--- netpbm-10.35.21/converter/other/fiasco/codec/dfiasco.c  2007-01-14 
16:18:03.0 +0200
+++ netpbm-10.35.21-patched/converter/other/fiasco/codec/dfiasco.c  
2007-01-14 16:37:35.780522728 +0200
@@ -15,7 +15,7 @@
  */

 #include <string.h>
-
+#include <stdlib.h>
 #include "config.h"

 #include "types.h"
diff -Naur netpbm-10.35.21/converter/other/fiasco/config.h 
netpbm-10.35.21-patched/converter/other/fiasco/config.h
--- netpbm-10.35.21/converter/other/fiasco/config.h 2007-01-14 
16:18:03.0 +0200
+++ netpbm-10.35.21-patched/converter/other/fiasco/config.h 2007-01-14 
16:36:00.265043288 +0200
@@ -25,6 +25,12 @@
byte first (like Motorola and SPARC, unlike Intel and VAX).  */
 /* #undef WORDS_BIGENDIAN */

+/* since we don't have autoconf... */
+#include <endian.h>
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define WORDS_BIGENDIAN 1
+#endif
+
 /* Define if the X Window System is missing or not being used.  */
 #define X_DISPLAY_MISSING 1

diff -Naur netpbm-10.35.21/converter/other/fiasco/input/basis.c 
netpbm-10.35.21-patched/converter/other/fiasco/input/basis.c
--- netpbm-10.35.21/converter/other/fiasco/input/basis.c2007-01-14 
16:18:00.0 +0200
+++ netpbm-10.35.21-patched/converter/other/fiasco/input/basis.c
2007-01-14 16:38:10.711212456 +0200
@@ -13,7 +13,7 @@
  *  $Revision: 5.3 $
  *  $State: Exp $
  */
-
+#include <string.h>
 #include "config.h"

 #include "types.h"
diff -Naur netpbm-10.35.21/converter/pbm/icontopbm.c 
netpbm-10.35.21-patched/converter/pbm/icontopbm.c
--- netpbm-10.35.21/converter/pbm/icontopbm.c   2007-01-14 16:18:22.0 
+0200
+++ netpbm-10.35.21-patched/converter/pbm/icontopbm.c   2007-01-14 
16:43:50.478559968 +0200
@@ -13,6 +13,7 @@
 #include <string.h>

 #include "nstring.h"
+#include <limits.h>
 #include "pbm.h"

 /* size in bytes of a bitmap */
diff -Naur netpbm-10.35.21/converter/ppm/ppmtowinicon.c 
netpbm-10.35.21-patched/converter/ppm/ppmtowinicon.c
--- netpbm-10.35.21/converter/ppm/ppmtowinicon.c2007-01-14 
16:18:20.0 +0200
+++ netpbm-10.35.21-patched/converter/ppm/ppmtowinicon.c2007-01-14 
16:46:54.505583608 +0200
@@ -12,7 +12,7 @@

 #include <math.h>
 #include <string.h>
-
+#include <stdlib.h>
 #include "winico.h"
 #include "ppm.h"
 #include 

Re: FuzzyOCR Warnings and General Questions

2007-04-07 Thread René Berber
Paul Hutchings wrote:

 I'm running Spamassassin on OpenSuse 10.2 and have just installed
 FuzzyOCR.
 
 It appears to be working in that it scans/detects words in the supplied
 test files.
 
 I noticed spamassassin --lint gives:
 
 [25313] warn: FuzzyOcr: Cannot find executable for pamthreshold

This one means you don't have a recent version of Netpbm, pamthreshold appeared
around version 10.34 (I'm using 10.35.21).  Some tests will not work, either
install it or use a workaround (there are some posts about this, I don't
use/know one).

 [25313] warn: FuzzyOcr: Cannot find executable for tesseract

Tesseract is optional, I just comment out line 100 of FuzzyOcr.cf :
#focr_bin_helper tesseract

 Which seems fair enough as I don't have them.
 
 Is it just a spurious warning though or do I need to be concerned?
 
 Also as a general question other than adding words to the wordlist as
 and when, are there any Must Know tips n tricks for FuzzyOCR?

I would recommend at least reading FuzzyOcr.cf so you can see what can be
controlled and get an idea of how things work.

The default parameters, as you have seen, work fine... I would only check
focr_enable_image_hashing (disabled by default, recommended setting is 2), and
focr_base_score (which is too high in my opinion, 5 is the default and there's a
known bug that counts the same word as several repetitions so the count is not
very reliable).
-- 
René Berber