Hi,
Now that CVS is back up -- or at least communicating at more than 50 bytes
a second -- I finally got a patch made. The code added allows wv to import
an MS Word 2 document, and output it in legible form. It appears (based
on what I have seen, half a dozen docs from all over the world) that
Word 2 docs are structured quite a lot more simply, with a linear text layout,
no OLE.
Of course nothing is perfect, and this outputs the text only in text form,
without "properties" or even basic formatting such as paragraphs (<p> or
\par). Doing that requires getting the FIB right (does anybody know the
FIB layout for Word 2?) and understanding what is happening on lines
188-399 of decode_simple.c (not very legible to an amateur :-( )
Any help or info would be appreciated.
As this patch adds functionality that was completely lacking earlier, it
should IMHO be applied after (necessary) regression testing. I don't
think there are very many Word 2 users any more, but still this may be
good to have.
--
Martin Vermeer [EMAIL PROTECTED]
:wq
Index: clx.c
===================================================================
RCS file: /cvsroot/wv/clx.c,v
retrieving revision 1.17
diff -u -r1.17 clx.c
--- clx.c 2000/08/28 04:41:53 1.17
+++ clx.c 2000/09/02 19:40:13
@@ -103,7 +103,7 @@
wvGetPCD_PLCF(&clx->pcd,&clx->pos,&clx->nopcd,wvStream_tell(fd),lcb,fd);
j+=lcb;
- if (ver == WORD7)
+ if (ver <= WORD7) /* MV 28.8.2000 Appears to be valid */
{
#if 0
/* DANGER !!, this is a completely mad attempt to
differenciate
@@ -115,18 +115,6 @@
/* I think that this is the correct reason for this
behaviour */
if (fExtChar == 0)
#endif
- for (i=0;i<clx->nopcd;i++)
- {
- clx->pcd[i].fc *= 2;
- clx->pcd[i].fc |= 0x40000000UL;
- }
- }
- if (ver == WORD6)
- {
- /* Copy the above ;-) MV 27.8.2000
- Note: worth trying for WORD2 etc. also (?)
- */
- if (fExtChar == 0)
for (i=0;i<clx->nopcd;i++)
{
clx->pcd[i].fc *= 2;
Index: decode_simple.c
===================================================================
RCS file: /cvsroot/wv/decode_simple.c,v
retrieving revision 1.48
diff -u -r1.48 decode_simple.c
--- decode_simple.c 2000/06/06 00:55:16 1.48
+++ decode_simple.c 2000/09/02 19:40:15
@@ -167,6 +167,20 @@
beginning at fib.fcMin up to (but not including) fib.fcMac.
*/
+ if ( (ver == WORD2) && !ps->fib.fComplex)
+ {
+ wvHandleDocument(ps,DOCBEGIN);
+ wvStream_goto(ps->mainfd,ps->fib.fcMin);
+ for (i = ps->fib.fcMin; i < ps->fib.fcMac; i++)
+ {
+ eachchar = wvGetChar(ps->mainfd, 1);
+ wvOutputTextChar(eachchar, 1, ps, &achp);
+ /* Formatting still lacking. How? This is just a start. */
+ }
+ return;
+ }
+
+
#ifdef DEBUG
if ( ps->fib.fcMac != wvGetEndFCPiece(ps->clx.nopcd-1,&ps->clx) )
wvTrace(("fcMac is not the same as the piecetable %x
%x!\n",ps->fib.fcMac,wvGetEndFCPiece(ps->clx.nopcd-1,&ps->clx)));
Index: fib.c
===================================================================
RCS file: /cvsroot/wv/fib.c,v
retrieving revision 1.25
diff -u -r1.25 fib.c
--- fib.c 2000/06/06 00:55:16 1.25
+++ fib.c 2000/09/02 19:40:17
@@ -275,6 +275,14 @@
item->wIdent = read_16ubit(fd);
item->nFib = read_16ubit(fd);
+ if ( (wvQuerySupported(item,NULL) == WORD2) )
+ {
+ wvInitFIB(item);
+ wvStream_offset(fd,-4);
+ wvGetFIB2(item,fd);
+ return;
+ }
+
if ( (wvQuerySupported(item,NULL) == WORD5) || (wvQuerySupported(item,NULL) ==
WORD6) || (wvQuerySupported(item,NULL) == WORD7) )
{
wvInitFIB(item);
@@ -618,6 +626,126 @@
return(ret);
}
+void wvGetFIB2(FIB *item,wvStream *fd)
+ {
+ U16 temp16;
+ U8 temp8;
+
+ item->wIdent = read_16ubit(fd);
+ item->nFib = read_16ubit(fd);
+
+ item->nProduct = read_16ubit(fd);
+ item->lid = read_16ubit(fd);
+ wvTrace(("lid is %x\n",item->lid));
+ item->pnNext = (S16)read_16ubit(fd);
+ temp16 = read_16ubit(fd);
+
+ item->fDot = (temp16 & 0x0001);
+ item->fGlsy = (temp16 & 0x0002) >> 1;
+ item->fComplex = (temp16 & 0x0004) >> 2;
+ item->fHasPic = (temp16 & 0x0008) >> 3;
+ item->cQuickSaves = (temp16 & 0x00F0) >> 4;
+ item->fEncrypted = (temp16 & 0x0100) >> 8;
+ item->fWhichTblStm = 0; /* word 6 files only have one table stream*/
+ item->fReadOnlyRecommended = (temp16 & 0x0400) >> 10;
+ item->fWriteReservation = (temp16 & 0x0800) >> 11;
+ item->fExtChar = (temp16 & 0x1000) >> 12;
+ wvTrace(("fExtChar is %d\n",item->fExtChar));
+ item->fLoadOverride = 0;
+ item->fFarEast = 0;
+ item->fCrypto = 0;
+ item->nFibBack = read_16ubit(fd);
+ wvTrace(("nFibBack is %d\n",item->nFibBack));
+
+ item->lKey = read_32ubit(fd);
+ item->envr = read_8ubit(fd);
+ temp8 = read_8ubit(fd);
+ item->fMac = 0;
+ item->fEmptySpecial = 0;
+ item->fLoadOverridePage = 0;
+ item->fFutureSavedUndo = 0;
+ item->fWord97Saved = 0;
+ item->fSpare0 = 0;
+ item->chse = read_16ubit(fd);
+ item->chsTables = read_16ubit(fd);
+ item->fcMin = read_32ubit(fd); /* These appear correct MV 29.8.2000 */
+ item->fcMac = read_32ubit(fd);
+ wvTrace(("fc from %o to %o", item->fcMin, item->fcMac));
+
+ item->csw = 14;
+ item->wMagicCreated = 0xCA0; /*this is the unique id of the creater, so its
+me :-)*/
+ /* Dunno what more there is here... if nFib = 45 (?) Try anyway. */
+
+ item->cbMac = read_32ubit(fd);
+
+ read_16ubit(fd);
+ read_16ubit(fd);
+ read_16ubit(fd);
+ read_16ubit(fd);
+ read_16ubit(fd);
+ read_16ubit(fd);
+ read_16ubit(fd);
+ read_16ubit(fd);
+
+ item->ccpText = read_32ubit(fd);
+ wvTrace(("length %d == %d", item->fcMac - item->fcMin, item->ccpText));
+ item->ccpFtn = (S32)read_32ubit(fd);
+ item->ccpHdr = (S32)read_32ubit(fd);
+ item->ccpMcr = (S32)read_32ubit(fd);
+ item->ccpAtn = (S32)read_32ubit(fd);
+ item->ccpEdn = (S32)read_32ubit(fd);
+ item->ccpTxbx = (S32)read_32ubit(fd);
+ item->ccpHdrTxbx = (S32)read_32ubit(fd);
+
+ read_32ubit(fd);
+
+ item->fcStshfOrig = (S32)read_32ubit(fd);
+ item->lcbStshfOrig = read_32ubit(fd);
+ item->fcStshf = (S32)read_32ubit(fd);
+ item->lcbStshf = read_32ubit(fd);
+ item->fcPlcffndRef = (S32)read_32ubit(fd);
+ item->lcbPlcffndRef = read_32ubit(fd);
+ item->fcPlcffndTxt = (S32)read_32ubit(fd);
+ item->lcbPlcffndTxt = read_32ubit(fd);
+ item->fcPlcfandRef = (S32)read_32ubit(fd);
+ item->lcbPlcfandRef = read_32ubit(fd);
+ item->fcPlcfandTxt = (S32)read_32ubit(fd);
+ item->lcbPlcfandTxt = read_32ubit(fd);
+ item->fcPlcfsed = (S32)read_32ubit(fd);
+ item->lcbPlcfsed = read_32ubit(fd);
+ item->fcPlcpad = (S32)read_32ubit(fd);
+ item->lcbPlcpad = read_32ubit(fd);
+ item->fcPlcfphe = (S32)read_32ubit(fd);
+ item->lcbPlcfphe = read_32ubit(fd);
+ item->fcSttbfglsy = (S32)read_32ubit(fd);
+ item->lcbSttbfglsy = read_32ubit(fd);
+ item->fcPlcfglsy = (S32)read_32ubit(fd);
+ item->lcbPlcfglsy = read_32ubit(fd);
+ item->fcPlcfhdd = (S32)read_32ubit(fd);
+ item->lcbPlcfhdd = read_32ubit(fd);
+ item->fcPlcfbteChpx = (S32)read_32ubit(fd);
+ item->lcbPlcfbteChpx = read_32ubit(fd);
+ item->fcPlcfbtePapx = (S32)read_32ubit(fd);
+ item->lcbPlcfbtePapx = read_32ubit(fd);
+ item->fcPlcfsea = (S32)read_32ubit(fd);
+ item->lcbPlcfsea = read_32ubit(fd);
+ item->fcSttbfffn = (S32)read_32ubit(fd);
+ item->lcbSttbfffn = read_32ubit(fd);
+ item->fcPlcffldMom = (S32)read_32ubit(fd);
+ item->lcbPlcffldMom = read_32ubit(fd);
+ item->fcPlcffldHdr = (S32)read_32ubit(fd);
+ item->lcbPlcffldHdr = read_32ubit(fd);
+ item->fcPlcffldFtn = (S32)read_32ubit(fd);
+ item->lcbPlcffldFtn = read_32ubit(fd);
+ item->fcPlcffldAtn = (S32)read_32ubit(fd);
+ item->lcbPlcffldAtn = read_32ubit(fd);
+ item->fcPlcffldMcr = (S32)read_32ubit(fd);
+ item->lcbPlcffldMcr = read_32ubit(fd);
+ item->fcSttbfbkmk = (S32)read_32ubit(fd);
+
+ /* item->lcbSttbfbkmk = read_32ubit(fd); This one is too much. */
+
+ }
void wvGetFIB6(FIB *item,wvStream *fd)
{
Index: wv.h
===================================================================
RCS file: /cvsroot/wv/wv.h,v
retrieving revision 1.66
diff -u -r1.66 wv.h
--- wv.h 2000/08/02 23:27:12 1.66
+++ wv.h 2000/09/02 19:40:38
@@ -468,6 +468,7 @@
} FIB;
void wvGetFIB(FIB *item,wvStream *fd);
+void wvGetFIB2(FIB *item,wvStream *fd);
void wvGetFIB6(FIB *item,wvStream *fd);
void wvInitFIB(FIB *item);
Index: wvHtml.c
===================================================================
RCS file: /cvsroot/wv/wvHtml.c,v
retrieving revision 1.45
diff -u -r1.45 wvHtml.c
--- wvHtml.c 2000/08/28 04:41:53 1.45
+++ wvHtml.c 2000/09/02 19:40:40
@@ -251,7 +251,7 @@
ps.filename = argv[optind];
ps.dir = dir;
- if (ret & 0x8000)
+ if (ret & 0x8000) /* Password protected? */
{
if ( (ret & 0x7fff) == WORD8)
{
Index: wvparse.c
===================================================================
RCS file: /cvsroot/wv/wvparse.c,v
retrieving revision 1.26
diff -u -r1.26 wvparse.c
--- wvparse.c 2000/06/06 00:55:16 1.26
+++ wvparse.c 2000/09/02 19:40:40
@@ -62,7 +62,8 @@
if ( (ret&0x7fff) != WORD8 )
ps->data = ps->mainfd;
- if ( (ret != WORD8) && (ret != WORD7) && (ret!= WORD6) )
+ if ( (ret != WORD8) && (ret != WORD7) && (ret!= WORD6) && (ret!= WORD2) )
+ /* WORD2 test */
{
/* return the errors and the encrypted files*/
if (!(ret & 0x8000))
@@ -119,7 +120,8 @@
{
wvError(("Theres a good chance that this is a word 2 doc of nFib
%d\n",read_16ubit(*mainfd)));
wvStream_rewind(*mainfd);
- return(-1);
+// return(-1);
+ return(0);
}
else if (0x37fe == magic)
{
Index: oledecod/oledecod.c
===================================================================
RCS file: /cvsroot/wv/oledecod/oledecod.c,v
retrieving revision 1.8
diff -u -r1.8 oledecod.c
--- oledecod/oledecod.c 2000/03/30 09:01:43 1.8
+++ oledecod/oledecod.c 2000/09/02 19:40:44
@@ -94,13 +94,14 @@
root_list = sbd_list = NULL;
/* open input file */
- test (input != NULL, 4, ends ());
+ test (input != NULL, 4, ends ()); /* cannot be opened */
/* fast check type of file */
verbose ("fast testing type of file");
test ((c = getc (input)) != EOF, 5, ends ());
test (ungetc (c, input) != EOF, 5, ends ());
- test ( (c < 32 || c > 126) , 8, ends ());
+ test ( (c < 32 || c > 126) , 8, ends ()); /* We have a legible character, not good
+*/
+ test (c != 0xdb, 2, ends ()); /* probably non-ole Word 2 file */
test (c == 0xd0, 9, ends ());
/* read header block */