wordleaker

grothoff Fri, 10 Mar 2006 11:12:51 -0800

Author: grothoff
Date: 2006-03-08 18:26:01 -0800 (Wed, 08 Mar 2006)
New Revision: 2467


Modified:
   Extractor/src/include/extractor.h
   Extractor/src/main/extractor.c
   Extractor/src/plugins/ole2/ole2extractor.c
   Extractor/src/plugins/wordleaker/wordextractor.cc
   Extractor/src/plugins/wordleaker/wordleaker.cpp
   Extractor/src/plugins/wordleaker/wordleaker.h
Log:
more wordleaker hacking

Modified: Extractor/src/include/extractor.h
===================================================================
--- Extractor/src/include/extractor.h   2006-03-08 13:52:16 UTC (rev 2466)
+++ Extractor/src/include/extractor.h   2006-03-09 02:26:01 UTC (rev 2467)
@@ -140,8 +140,17 @@
   EXTRACTOR_ORIENTATION = 87,
   EXTRACTOR_TEMPLATE = 88,
   EXTRACTOR_SPLIT = 89,
-
-  EXTRACTOR_PRODUCTVERSION = 90,
+  EXTRACTOR_PRODUCTVERSION = 90,  
+  EXTRACTOR_LAST_SAVED_BY = 91,
+  EXTRACTOR_LAST_PRINTED = 92,  
+  EXTRACTOR_WORD_COUNT = 93,
+  EXTRACTOR_CHARACTER_COUNT = 94,
+  EXTRACTOR_TOTAL_EDITING_TIME = 95,
+  EXTRACTOR_THUMBNAILS = 96,
+  EXTRACTOR_SECURITY = 97,
+  EXTRACTOR_CREATED_BY_SOFTWARE = 98,
+  EXTRACTOR_MODIFIED_BY_SOFTWARE = 99,
+  EXTRACTOR_REVISION_HISTORY = 100,
 } EXTRACTOR_KeywordType;
 
 /**

Modified: Extractor/src/main/extractor.c
===================================================================
--- Extractor/src/main/extractor.c      2006-03-08 13:52:16 UTC (rev 2466)
+++ Extractor/src/main/extractor.c      2006-03-09 02:26:01 UTC (rev 2467)
@@ -132,11 +132,21 @@
   gettext_noop("template"),
   gettext_noop("split"),
   gettext_noop("product version"),
+  gettext_noop("last saved by"),
+  gettext_noop("last printed"),
+  gettext_noop("word count"),
+  gettext_noop("character count"),
+  gettext_noop("total editing time"),
+  gettext_noop("thumbnails"),
+  gettext_noop("security"),
+  gettext_noop("created by software"),
+  gettext_noop("modified by software"),
+  gettext_noop("revision history"),
   NULL,
 };
 
 /* the number of keyword types (for bounds-checking) */
-#define HIGHEST_TYPE_NUMBER 91
+#define HIGHEST_TYPE_NUMBER 101
 
 #ifdef HAVE_LIBOGG
 #if HAVE_VORBIS

Modified: Extractor/src/plugins/ole2/ole2extractor.c
===================================================================
--- Extractor/src/plugins/ole2/ole2extractor.c  2006-03-08 13:52:16 UTC (rev 2466)
+++ Extractor/src/plugins/ole2/ole2extractor.c  2006-03-09 02:26:01 UTC (rev 2467)
@@ -1627,7 +1627,7 @@
       g_warning ("error: %s", error->message);
       g_error_free (error);
     } else {
-      g_warning ("unknown error converting string property, using blank");
+      // g_warning ("unknown error converting string property, using blank");
     }
     *data += 4 + len * section->char_size;
     break;

Modified: Extractor/src/plugins/wordleaker/wordextractor.cc
===================================================================
--- Extractor/src/plugins/wordleaker/wordextractor.cc   2006-03-08 13:52:16 UTC (rev 2466)
+++ Extractor/src/plugins/wordleaker/wordextractor.cc   2006-03-09 02:26:01 UTC (rev 2467)
@@ -30,8 +30,41 @@
 #include "wordleaker.h"
 #include "pole.h"
 
+
+
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+#include <list>
+#include <ctime>
+
+
 extern "C" {
 
+  static EXTRACTOR_KeywordType 
+  SummaryProperties[] = {
+    EXTRACTOR_UNKNOWN,
+    EXTRACTOR_UNKNOWN,
+    EXTRACTOR_TITLE,
+    EXTRACTOR_SUBJECT,
+    EXTRACTOR_AUTHOR,
+    EXTRACTOR_KEYWORDS,
+    EXTRACTOR_COMMENT,
+    EXTRACTOR_TEMPLATE,
+    EXTRACTOR_LAST_SAVED_BY,
+    EXTRACTOR_VERSIONNUMBER,
+    EXTRACTOR_TOTAL_EDITING_TIME,
+    EXTRACTOR_LAST_PRINTED,
+    EXTRACTOR_CREATION_DATE,
+    EXTRACTOR_MODIFICATION_DATE,
+    EXTRACTOR_PAGE_COUNT,
+    EXTRACTOR_WORD_COUNT,
+    EXTRACTOR_CHARACTER_COUNT,
+    EXTRACTOR_THUMBNAILS,
+    EXTRACTOR_SOFTWARE,
+    EXTRACTOR_SECURITY,
+  };
+
   static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordType type,
                                                const char * keyword,
                                                struct EXTRACTOR_Keywords * next) {
@@ -46,9 +79,151 @@
     return result;
   }
 
+  static char * dateToString( unsigned long date ) {
+    char f[16];
+    sprintf(f, "%d/%d/%d", (date / 10000 % 100), (date / 100 % 100), (date % 100));
+    return strdup(f);
+  }
+  
+  static const char * idToProduct( unsigned int id ) {
+    // TODO: find the rest of ids
+    switch ( id ) {
+    case  0x6A62:
+      return "Word 97";
+    case 0x626A:
+      return "Word 98 (Mac)";
+    default:
+      return "Unknown";
+    }      
+  }
+
+  static const char * lidToLanguage( unsigned int lid ) {
+    switch ( lid ) {
+    case 0x0400: 
+      return _("No Proofing");
+    case 0x0401: 
+      return _("Arabic");
+    case 0x0402:
+      return _("Bulgarian");
+    case 0x0403:
+      return _("Catalan");
+    case 0x0404:
+      return _("Traditional Chinese");
+    case 0x0804:
+      return _("Simplified Chinese");
+    case 0x0405:
+      return _("Czech");
+    case 0x0406:
+      return _("Danish");
+    case 0x0407:
+      return _("German");
+    case 0x0807:
+      return _("Swiss German");
+    case 0x0408:
+      return _("Greek");
+    case 0x0409:
+      return _("U.S. English");
+    case 0x0809:
+      return _("U.K. English");
+    case 0x0c09:
+      return _("Australian English");
+    case 0x040a:
+      return _("Castilian Spanish");
+    case 0x080a:
+      return _("Mexican Spanish");
+    case 0x040b:
+      return _("Finnish");
+    case 0x040c:
+      return _("French");
+    case 0x080c:
+      return _("Belgian French");
+    case 0x0c0c:
+      return _("Canadian French");
+    case 0x100c:
+      return _("Swiss French");
+    case 0x040d:
+      return _("Hebrew");
+    case 0x040e:
+      return _("Hungarian");
+    case 0x040f:
+      return _("Icelandic");
+    case 0x0410:
+      return _("Italian");
+    case 0x0810:
+      return _("Swiss Italian");
+    case 0x0411:
+      return _("Japanese");
+    case 0x0412:
+      return _("Korean");
+    case 0x0413:
+      return _("Dutch");
+    case 0x0813:
+      return _("Belgian Dutch");
+    case 0x0414:
+      return _("Norwegian - Bokmal");
+    case 0x0814:
+      return _("Norwegian - Nynorsk");
+    case 0x0415:
+      return _("Polish");
+    case 0x0416:
+      return _("Brazilian Portuguese");
+    case 0x0816:
+      return _("Portuguese");
+    case 0x0417:
+      return _("Rhaeto-Romanic");
+    case 0x0418:
+      return _("Romanian");
+    case 0x0419:
+      return _("Russian");
+    case 0x041a:
+      return _("Croato-Serbian (Latin)");
+    case 0x081a:
+      return _("Serbo-Croatian (Cyrillic)");
+    case 0x041b:
+      return _("Slovak");
+    case 0x041c:
+      return _("Albanian");
+    case 0x041d:
+      return _("Swedish");
+    case 0x041e:
+      return _("Thai");
+    case 0x041f:
+      return _("Turkish");
+    case 0x0420:
+      return _("Urdu");
+    case 0x0421:
+      return _("Bahasa"); 
+    case 0x0422:
+      return _("Ukrainian");
+    case 0x0423:
+      return _("Byelorussian");
+    case 0x0424:
+      return _("Slovenian");
+    case 0x0425:
+      return _("Estonian");
+    case 0x0426:
+      return _("Latvian");
+    case 0x0427:
+      return _("Lithuanian");
+    case 0x0429:
+      return _("Farsi");
+    case 0x042D:
+      return _("Basque");
+    case 0x042F:
+      return _("Macedonian");
+    case 0x0436:
+      return _("Afrikaans");
+    case 0x043E:
+      return _("Malaysian");  
+    default:
+      return _("Unknown");
+    }
+  }
+
+
  
   // read the type of the property and displays its value
-  char * getProperty( POLE::Stream* stream ) {
+  static char * getProperty( POLE::Stream* stream ) {
     unsigned long read, type;
     unsigned char buffer[256];
     unsigned char c;
@@ -88,6 +263,8 @@
       j = 0;
       while ( ((c = stream->getch()) != 0) && (i > j) )
        s[j++] = c;
+      if ( (j > 0) && (s[j-1] == '\n') )
+       s[--j] = '\0';
       if (j != i) {
        free(s);
        return NULL;
@@ -98,7 +275,9 @@
       t1 = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
       t2 = buffer[4]  + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24);
       t = filetime_to_unixtime(t1, t2);
-      return ctime_r((time_t *) &t, (char*)malloc(32));
+      char * ret = ctime_r((time_t *) &t, (char*)malloc(32));
+      ret[strlen(ret)-1] = '\0'; /* kill newline */
+      return ret;
     }
     return NULL;
   }
@@ -109,6 +288,7 @@
                                                        size_t size,
                                                        struct EXTRACTOR_Keywords * prev) {
     char ver[16];
+    char product[128];
     if (size < 512 + 898)
       return prev;
     const unsigned char * buffer = (const unsigned char*) &data[512];
@@ -130,10 +310,23 @@
     prev = addKeyword(EXTRACTOR_LANGUAGE,
                      lidToLanguage(lid),
                      prev);
+    char * date = dateToString(lProductCreated);
+    snprintf(product, 128, _("%s (Build %s)"),
+            idToProduct(wMagicCreated),
+            date);
+    free(date);
+    prev = addKeyword(EXTRACTOR_CREATED_BY_SOFTWARE,
+                     product,
+                     prev);
+    date = dateToString(lProductRevised);
+    snprintf(product, 128, _("%s (Build %s)"),
+            idToProduct(wMagicRevised),
+            date);
+    free(date);
+    prev = addKeyword(EXTRACTOR_MODIFIED_BY_SOFTWARE,
+                     product,
+                     prev);
     
-    // cout << "Created by: " << idToProduct(wMagicCreated) << " (Build " << dateToString(lProductCreated) << ")" << endl;
-    // cout << "Revised by: " << idToProduct(wMagicRevised) << " (Build " << dateToString(lProductRevised) << ")" << endl;
-    
     POLE::Storage* storage = new POLE::Storage( filename );
     storage->open();
     if( storage->result() != POLE::Storage::Ok )
@@ -159,11 +352,12 @@
        unsigned int propertyID = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
        unsigned int offsetProp = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24);
        if (propertyID > 1 && propertyID < 20) {
-         // cout << SummaryProperties[propertyID] << ": ";
          unsigned long offsetCur = stream->tell();
          stream->seek(offsetProp + begin);
-         // read and show the property
          char * prop = getProperty(stream);  
+         prev = addKeyword(SummaryProperties[propertyID],
+                           prop,
+                           prev);
          free(prop);
          stream->seek(offsetCur);
        }
@@ -173,7 +367,9 @@
     unsigned int where = 0;
     
     // FIXME: should look if using 0Table or 1Table
-    stream = storage->stream( "1Table" );
+    stream = storage->stream("1Table");
+    if (! stream) 
+      stream = storage->stream("0Table");
     if (stream) {
       unsigned char * buffer = new unsigned char[lcbSttbSavedBy];
       unsigned char buffer2[1024];
@@ -181,34 +377,40 @@
       // goto offset of revision
       stream->seek(fcSttbSavedBy);
       // read all the revision history
-      stream->read(buffer, lcbSttbSavedBy);
+      if (lcbSttbSavedBy == stream->read(buffer, lcbSttbSavedBy)) {
       
-      // there are n strings, so n/2 revisions (author & file)
-      unsigned int nRev = (buffer[2] + (buffer[3] << 8)) / 2;
-      where = 6;
-      
-      for (unsigned int i=0; i < nRev; i++) {
-       // cout << "Rev #" << i << ": Author \"";
-       unsigned int length = buffer[where++];
-       // it's unicode, for now we only get the low byte
-       for (unsigned int j=0; j < length; j++) {
-         where++;
-         // cout << buffer[where];
-         where++;
+       // there are n strings, so n/2 revisions (author & file)
+       unsigned int nRev = (buffer[2] + (buffer[3] << 8)) / 2;
+       where = 6;
+       for (unsigned int i=0; i < nRev; i++) { 
+         if (where >= lcbSttbSavedBy)
+           break;
+         unsigned int length = buffer[where++];
+         if (where + 2 * length + 2 >= lcbSttbSavedBy)
+           break;
+         char * author = convertToUtf8((const char*) &buffer[where],
+                                       length * 2,
+                                       "UTF-16BE");
+         where += length * 2 + 1;
+         length = buffer[where++];
+         if (where + 2 * length >= lcbSttbSavedBy)
+           break;
+         char * filename = convertToUtf8((const char*) &buffer[where],
+                                         length * 2,
+                                         "UTF-16BE");  
+         where += length * 2 + 1;
+         char * rbuf = (char*) malloc(strlen(author) + strlen(filename) + 512);
+         snprintf(rbuf, 512 + strlen(author) + strlen(filename),
+                  _("Revision #%u: Author '%s' worked on '%s'"),
+                  i, author, filename);
+         free(author);
+         free(filename);
+         prev = addKeyword(EXTRACTOR_REVISION_HISTORY,
+                           rbuf, 
+                           prev);
+         free(rbuf);
        }
-       where++;
-       // cout << "\" worked on file \"";
-       length = buffer[where++];
-       // it's unicode, for now we only get the low byte
-       for (unsigned int j=0; j < length; j++) {
-         where++;
-         // cout << buffer[where];
-         where++;
-       }
-       where++;
-       // cout << "\"" << endl;    
       }
-      
       delete buffer;
     
     }

Modified: Extractor/src/plugins/wordleaker/wordleaker.cpp
===================================================================
--- Extractor/src/plugins/wordleaker/wordleaker.cpp     2006-03-08 13:52:16 UTC (rev 2466)
+++ Extractor/src/plugins/wordleaker/wordleaker.cpp     2006-03-09 02:26:01 UTC (rev 2467)
@@ -37,6 +37,8 @@
 
 unsigned long fcSttbSavedBy;
 unsigned long lcbSttbSavedBy;
+
+
   
 // read the type of the property and displays its value
 void showProperty( POLE::Stream* stream ) {
@@ -273,7 +275,6 @@
     
 }
 
-#if HAVE_MAIN
 int main(int argc, char *argv[]) {
   cout << endl << "WordLeaker v.0.1" << endl;
   cout << " by Madelman (http://elligre.tk/madelman/)" << endl << endl;
@@ -308,4 +309,3 @@
   
   return 0;
 }
-#endif

Modified: Extractor/src/plugins/wordleaker/wordleaker.h
===================================================================
--- Extractor/src/plugins/wordleaker/wordleaker.h       2006-03-08 13:52:16 UTC (rev 2466)
+++ Extractor/src/plugins/wordleaker/wordleaker.h       2006-03-09 02:26:01 UTC (rev 2467)
@@ -27,30 +27,8 @@
 
 using namespace std;
 
-static char* SummaryProperties[] = {
-"Unknown", 
-"Unknown",
-"Title",
-"Subject",
-"Author",
-"Keywords",
-"Comments",
-"Template",
-"Last Saved By",
-"Revision Number",
-"Total Editing Time",
-"Last Printed",
-"Create Time/Date",
-"Last Saved Time/Date",
-"Number of Pages",
-"Number of Words",
-"Number of Characters",
-"Thumbnails",
-"Creating Application",
-"Security"
-};
-
-static char* DocumentSummaryProperties[] = {
+static char* 
+DocumentSummaryProperties[] = {
 "Dictionary",
 "Code page",
 "Category",
@@ -70,147 +48,6 @@
 "LinksUpTo"
 };
 
-string dateToString( unsigned long date ) {
-  char f[9];
-  sprintf(f, "%d/%d/%d", (date / 10000 % 100), (date / 100 % 100), (date % 100));
-  return f;
-}
-
-string idToProduct( unsigned int id ) {
-  // TODO: find the rest of ids
-  switch ( id ) {
-    case  0x6A62:
-        return "Word 97";
-    case 0x626A:
-        return "Word 98 (Mac)";
-    default:
-        return "Unknown";
-  }      
-}
-
-const char * lidToLanguage( unsigned int lid ) {
-  switch ( lid ) {
-    case 0x0400: 
-        return "No Proofing";
-    case 0x0401: 
-        return "Arabic";
-    case 0x0402:
-        return "Bulgarian";
-    case 0x0403:
-        return "Catalan";
-    case 0x0404:
-        return "Traditional Chinese";
-    case 0x0804:
-        return "Simplified Chinese";
-    case 0x0405:
-        return "Czech";
-    case 0x0406:
-        return "Danish";
-    case 0x0407:
-        return "German";
-    case 0x0807:
-        return "Swiss German";
-    case 0x0408:
-        return "Greek";
-    case 0x0409:
-        return "U.S. English";
-    case 0x0809:
-        return "U.K. English";
-    case 0x0c09:
-        return "Australian English";
-    case 0x040a:
-        return "Castilian Spanish";
-    case 0x080a:
-        return "Mexican Spanish";
-    case 0x040b:
-        return "Finnish";
-    case 0x040c:
-        return "French";
-    case 0x080c:
-        return "Belgian French";
-    case 0x0c0c:
-        return "Canadian French";
-    case 0x100c:
-        return "Swiss French";
-    case 0x040d:
-        return "Hebrew";
-    case 0x040e:
-        return "Hungarian";
-    case 0x040f:
-        return "Icelandic";
-    case 0x0410:
-        return "Italian";
-    case 0x0810:
-        return "Swiss Italian";
-    case 0x0411:
-        return "Japanese";
-    case 0x0412:
-        return "Korean";
-    case 0x0413:
-        return "Dutch";
-    case 0x0813:
-        return "Belgian Dutch";
-    case 0x0414:
-        return "Norwegian - Bokmal";
-    case 0x0814:
-        return "Norwegian - Nynorsk";
-    case 0x0415:
-        return "Polish";
-    case 0x0416:
-        return "Brazilian Portuguese";
-    case 0x0816:
-        return "Portuguese";
-    case 0x0417:
-        return "Rhaeto-Romanic";
-    case 0x0418:
-        return "Romanian";
-    case 0x0419:
-        return "Russian";
-    case 0x041a:
-        return "Croato-Serbian (Latin)";
-    case 0x081a:
-        return "Serbo-Croatian (Cyrillic)";
-    case 0x041b:
-        return "Slovak";
-    case 0x041c:
-        return "Albanian";
-    case 0x041d:
-        return "Swedish";
-    case 0x041e:
-        return "Thai";
-    case 0x041f:
-        return "Turkish";
-    case 0x0420:
-        return "Urdu";
-    case 0x0421:
-        return "Bahasa"; 
-    case 0x0422:
-        return "Ukrainian";
-    case 0x0423:
-        return "Byelorussian";
-    case 0x0424:
-        return "Slovenian";
-    case 0x0425:
-        return "Estonian";
-    case 0x0426:
-        return "Latvian";
-    case 0x0427:
-        return "Lithuanian";
-    case 0x0429:
-        return "Farsi";
-    case 0x042D:
-        return "Basque";
-    case 0x042F:
-        return "Macedonian";
-    case 0x0436:
-        return "Afrikaans";
-    case 0x043E:
-        return "Malaysian";  
-    default:
-        return "Unknown";
-  }
-}
-
 /*
  *  filetime_to_unixtime
  *



_______________________________________________
GNUnet-SVN mailing list
GNUnet-SVN@gnu.org
http://lists.gnu.org/mailman/listinfo/gnunet-svn

[GNUnet-SVN] r2467 - in Extractor/src: include main plugins/ole2 plugins/wordleaker

Reply via email to