I don't know where on the bug-fix/new-feature spectrum this fits, but I
patched the PDF processing code to handle some PDFs we have here. They
were generated by Acrobat PDFwriter from some Corel Draw files, and the
PostScript files that acroread made from them handled character spacing
strangely: they would commonly crank up the character spacing before
putting out the last letter in a word, rather than using a space character
or a positioning command. As a result, all the words were getting stuck
together when indexing. This patch fixes this.
I'd really appreciate it if others could test this out with their PDF files
to see if it breaks anything for them.
--- ./htdig/PDF.h.spacebug Thu Jul 23 11:18:54 1998
+++ ./htdig/PDF.h Mon Feb 1 14:48:15 1999
@@ -64,6 +64,13 @@
// appended to _parsedString instead of parsing it.
int _continueString;
+ // Sometimes the character spacing, as set by the Tc command, is set
+ // to a very high value, and is used to treat the characters in the next
+ // Tj as separate words. When this variable is true, text is appended
+ // to _parsedString with a space after each character, instead of as
+ // a single word.
+ int _bigSpacing;
+
// String beeing read
String _parsedString;
--- ./htdig/PDF.cc.spacebug Tue Jan 26 18:27:52 1999
+++ ./htdig/PDF.cc Mon Feb 1 17:15:13 1999
@@ -14,6 +14,7 @@
#include "htdig.h"
#include <htString.h>
#include <StringList.h>
+#include <stdlib.h>
#include <ctype.h>
@@ -24,6 +25,7 @@
{
_data = 0;
_dataLength = 0;
+ _bigSpacing = 0;
initParser();
}
@@ -361,10 +363,17 @@
else if (!strcmp(cmd, "Td") || !strcmp(cmd, "TD") ||
!strcmp(cmd, "Tm") || !strcmp(cmd, "T*"))
{
- // Text positionning commands Td, TD, Tm and T* are condidered
+ // Text positioning commands Td, TD, Tm and T* are considered
// as a word break (see PDF 1.2 spec, chapter 8.7.3)
parseString();
}
+ else if (!strcmp(cmd, "Tc"))
+ {
+ // Character spacing command Tc, with operand of 3 or more, seems
+ // sometimes to act as a word break between or after characters in
+ // the following Tj command. (E.g. PDFs generated from .cdr files.)
+ _bigSpacing = (atof(position) >= 3.0);
+ }
else
{
// Other commands are not considered as a word break
@@ -415,6 +424,8 @@
default:
_parsedString << (char)val;
}
+ if (_bigSpacing)
+ _parsedString << ' ';
// To do : handle more special characters
}
@@ -436,6 +447,8 @@
default :
// Add the escaped character
_parsedString << *pos;
+ if (_bigSpacing)
+ _parsedString << ' ';
pos++;
}
}
@@ -444,6 +457,8 @@
{
// Add character to the string
_parsedString << *pos;
+ if (_bigSpacing)
+ _parsedString << ' ';
pos++;
}
}
@@ -507,7 +522,7 @@
//
// Characters that are not part of a word
//
- if (!*position && isspace(*position))
+ if (*position && isspace(*position))
{
//
// Reduce all multiple whitespace to a single space
@@ -555,5 +570,6 @@
// Flush parsed string
_parsedString = 0;
+ _bigSpacing = 0;
}
--
Gilles R. Detillieux E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre WWW: http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba Phone: (204)789-3766
Winnipeg, MB R3E 3J7 (Canada) Fax: (204)789-3930
------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
[EMAIL PROTECTED] containing the single word "unsubscribe" in
the SUBJECT of the message.