[htdig] Latin1.patch

Alexey Rodriguez Fri, 30 Jun 2000 08:49:34 -0700
        Hello diggers!! It took a while because of lack of time, but I
have finished the patch. This corrects the "Weird endings" problem that I
reported previously on this mailing list. It can now work with ispell
files "out of the box", although the only problem was with the Spanish one.
It also corrects the latin1 character encoding, which previously was coded
for German. It now supports every latin1 ispell aff file. I tested with
Spanish, German and Portuguese, but it should work for others.
        The mungeWord code is horribly slow but works for now; maybe I'll
recode it using regexes to make it faster. E-mail me if you have trouble.
        Geoff, I had problems providing the dictionary since mungeWord
is a static function, so I did what I did. I hope it is OK.

                                        Alexey Rodriguez

---8<---- cut here ------8<------

diff -rc htdig-3.1.5/htfuzzy/Endings.h mod/htdig-3.1.5/htfuzzy/Endings.h
*** htdig-3.1.5/htfuzzy/Endings.h       Fri Feb 25 02:29:10 2000
--- mod/htdig-3.1.5/htfuzzy/Endings.h   Thu Jun 29 14:37:09 2000
***************
*** 39,52 ****
      //
      int                       createDB(Configuration &config);
        
!     static void               mungeWord(char *, String &);
      
  private:
      Database          *root2word;
      Database          *word2root;
  
!     int                       createRoot(Dictionary &, char *, char *, char *);
!     int                       readRules(Dictionary &, char *);
      void              expandWord(String &, List &, Dictionary &, char *, char *);
  };
  
--- 39,52 ----
      //
      int                       createDB(Configuration &config);
        
!     static void               mungeWord(char *, String &, Dictionary &);
      
  private:
      Database          *root2word;
      Database          *word2root;
  
!     int                       createRoot(Dictionary &, char *, char *, char *, Dictionary &);
!     int                       readRules(Dictionary &, char *, Dictionary &);
      void              expandWord(String &, List &, Dictionary &, char *, char *);
  };
  
diff -rc htdig-3.1.5/htfuzzy/EndingsDB.cc mod/htdig-3.1.5/htfuzzy/EndingsDB.cc
*** htdig-3.1.5/htfuzzy/EndingsDB.cc    Fri Feb 25 02:29:10 2000
--- mod/htdig-3.1.5/htfuzzy/EndingsDB.cc        Fri Jun 30 10:01:28 2000
***************
*** 25,31 ****
  int
  Endings::createDB(Configuration &config)
  {
!     Dictionary        rules;
      String      tmpdir = getenv("TMPDIR");
      String      word2root, root2word;
      if (tmpdir.length())
--- 25,31 ----
  int
  Endings::createDB(Configuration &config)
  {
!     Dictionary        rules, lat_encoding;
      String      tmpdir = getenv("TMPDIR");
      String      word2root, root2word;
      if (tmpdir.length())
***************
*** 45,58 ****
      if (debug)
        cout << "htfuzzy/endings: Reading rules\n";
        
!     if (readRules(rules, config["endings_affix_file"]) == NOTOK)
        return NOTOK;
  
      if (debug)
        cout << "htfuzzy/endings: Creating databases\n";
        
      if (createRoot(rules, word2root, root2word,
!                  config["endings_dictionary"]) == NOTOK)
        return NOTOK;
  
      //
--- 45,65 ----
      if (debug)
        cout << "htfuzzy/endings: Reading rules\n";
        
!     if (readRules(rules, config["endings_affix_file"], lat_encoding) == NOTOK)
        return NOTOK;
  
+     lat_encoding.Start_Get();
+     char *s;
+     while(s=lat_encoding.Get_Next())
+     {
+         cout<<s<<" "<< * (String*) lat_encoding[s] <<endl;
+     }
+ 
      if (debug)
        cout << "htfuzzy/endings: Creating databases\n";
        
      if (createRoot(rules, word2root, root2word,
!                  config["endings_dictionary"], lat_encoding) == NOTOK)
        return NOTOK;
  
      //
***************
*** 69,75 ****
  
  //*****************************************************************************
  int
! Endings::readRules(Dictionary &rules, char *rulesFile)
  {
      FILE      *fl = fopen(rulesFile, "r");
  
--- 76,82 ----
  
  //*****************************************************************************
  int
! Endings::readRules(Dictionary &rules, char *rulesFile, Dictionary &lat_encoding)
  {
      FILE      *fl = fopen(rulesFile, "r");
  
***************
*** 77,106 ****
        return NOTOK;
  
      int               inSuffixes = 0;
      char      currentSuffix[2] = " ";
!     char      *p;
      char      input[1024];
      String    line;
        
      while (fgets(input, sizeof(input), fl))
      {
        if (input[0] == '\n' || input[0] == '#')
            continue;
  
        if (mystrncasecmp(input, "suffixes", 8) == 0)
        {
            inSuffixes = 1;
            continue;
        }
        else if (mystrncasecmp(input, "prefixes", 8) == 0)
        {
            inSuffixes = 0;
            continue;
        }
!       if (!inSuffixes)
            continue;
  
!       if (mystrncasecmp(input, "flag ", 5) == 0)
        {
            p = input + 5;
            while (*p == '*' || *p == ' ' || *p == '\t')
--- 84,179 ----
        return NOTOK;
  
      int               inSuffixes = 0;
+     int         inLatin1 = 0;
      char      currentSuffix[2] = " ";
!     char      *p, *t;
      char      input[1024];
      String    line;
+     String      Target;
+     String      Origin;
        
      while (fgets(input, sizeof(input), fl))
      {
+                               // As code gets more complex it should be considered
+                               // to redesign the parser or to move to yacc as ispell
+                               // does.
        if (input[0] == '\n' || input[0] == '#')
            continue;
  
        if (mystrncasecmp(input, "suffixes", 8) == 0)
        {
            inSuffixes = 1;
+           inLatin1=0;
            continue;
        }
        else if (mystrncasecmp(input, "prefixes", 8) == 0)
        {
            inSuffixes = 0;
+           inLatin1=0;
            continue;
        }
!       else if (mystrncasecmp(input, "altstringtype", 13) == 0)
!       {
!           if(mystrcasestr(input,"latin1")) inLatin1=1;
!           else inLatin1=0;
!           inSuffixes=0;
            continue;
+       }
+       if (!inSuffixes)
+       {
+           if(!inLatin1 || (mystrncasecmp(input, "altstringchar", 13) != 0 ) )
+           {
+               continue;
+           }
+           else
+           {
+               p = input;
+               p += 13;                // Skip "altstringchar" thingy
+               Target="";
+               Origin="";
+               while(*p == ' ' || *p == '\t')
+                   p++;
+             
+               // Parse the latin1 encoded character
+               while(*p != ' ' && *p != '\t')
+               {
+                   if(*p=='\\')        // I am considering only two posibilities hex char or octal char
+                   {
+                       *p='0';
+                       Target << (char) strtol(p,&t,0);
+                       p=t;
+                   }
+                   else                // Read an ordinary character
+                   {
+                       Target << *p;
+                       p++;
+                   }
+               }
+ 
+               // Skip blank spaces
+               while(*p == ' ' || *p == '\t')
+                   p++;
+ 
+               // Parse the character as it is encoded in standard ispell files
+               while(*p != ' ' && *p != '\t' && *p != '\n')
+               {
+                   if(*p=='\\') { // backslash is quoting the next character
+                       Origin << *(p+1);
+                       p+=2;
+                   }
+                   else if ((*p=='\'') || (*p=='\"'))
+                       p++;            // only skip and forget about it 
+                                       // NOTE: this could be erroneous if there's a space or \t in the quoting
+                   else {
+                       Origin << *p;   // ordinary characters make into Origin
+                       p++;
+                   }
+               }
+               lat_encoding.Add(Origin,new String(Target));
+           }
+       }  // End of latin1 encoding
  
!       else if (mystrncasecmp(input, "flag ", 5) == 0)
        {
            p = input + 5;
            while (*p == '*' || *p == ' ' || *p == '\t')
***************
*** 114,120 ****
            if (line.indexOf('>') > 0)
            {
                List            *list;
!               SuffixEntry     *se = new SuffixEntry(line);
                        
                if (rules.Exists(currentSuffix))
                {
--- 187,193 ----
            if (line.indexOf('>') > 0)
            {
                List            *list;
!               SuffixEntry     *se = new SuffixEntry(line, lat_encoding);
                        
                if (rules.Exists(currentSuffix))
                {
***************
*** 138,144 ****
  
  //*****************************************************************************
  int
! Endings::createRoot(Dictionary &rules, char *word2root, char *root2word, char *dictFile)
  {
      FILE      *fl = fopen(dictFile, "r");
      if (fl == NULL)
--- 211,217 ----
  
  //*****************************************************************************
  int
! Endings::createRoot(Dictionary &rules, char *word2root, char *root2word, char *dictFile, Dictionary & lat_encoding)
  {
      FILE      *fl = fopen(dictFile, "r");
      if (fl == NULL)
***************
*** 173,179 ****
  
        *p++ = '\0';
  
!       mungeWord(input, word);
        expandWord(words, wordList, rules, word, p);
  
        if (debug > 1)
--- 246,252 ----
  
        *p++ = '\0';
  
!       mungeWord(input, word, lat_encoding);
        expandWord(words, wordList, rules, word, p);
  
        if (debug > 1)
***************
*** 212,281 ****
  // any accents will be combined into single characters.
  //
  void
! Endings::mungeWord(char *input, String &word)
  {
!     char      *p = input + 1;
      
      word = 0;
      while (*input)
      {
!       p = input + 1;
!       switch (*p)
        {
!           case '"':   // The previous character needs to get an umlaut
!               switch (*input)
!               {
!                   case 'a':
!                   case 'A':
!                       word << char(228);
!                       input += 2;
!                       continue;
!                       break;
!                   case 'e':
!                   case 'E':
!                       word << char(235);
!                       input += 2;
!                       continue;
!                       break;
!                   case 'i':
!                   case 'I':
!                       word << char(239);
!                       input += 2;
!                       continue;
!                       break;
!                   case 'o':
!                   case 'O':
!                       word << char(246);
!                       input += 2;
!                       continue;
!                       break;
!                   case 'u':
!                   case 'U':
!                       word << char(252);
!                       input += 2;
!                       continue;
!                       break;
!               }
!               break;
!               
!           case 'S':   // See if the previous character needs to be an sz
!               if (*input == 's')
!               {
!                   word << char(223);
!                   input += 2;
!                   continue;
!               }
!               else
!               {
!                   word << *input;
!               }
!               break;
!               
!           default:
!               word << *input;
!               break;
        }
-       input++;
      }
      word.lowercase();
  }
--- 285,315 ----
  // any accents will be combined into single characters.
  //
  void
! Endings::mungeWord(char *input, String &word, Dictionary &lat_encoding)
  {
!     char      *p = input + 1 , *s;
!     int len;
      
      word = 0;
      while (*input)
      {
!         lat_encoding.Start_Get();
!                               // Replace ispell codification with latin1 codification
!                               // Slow, maybe in a next time this will be regexp'd
!       while(s = lat_encoding.Get_Next())
        {
!         if(mystrncasecmp(input ,s ,strlen(s) ) == 0)
!           {
!             word << (String*) lat_encoding[s];
!             input += strlen( s );
!             break;
!           }
!       }
!       if(!s)                  // No matches
!       {
!           word << *input;
!           input ++;
        }
      }
      word.lowercase();
  }
diff -rc htdig-3.1.5/htfuzzy/SuffixEntry.cc mod/htdig-3.1.5/htfuzzy/SuffixEntry.cc
*** htdig-3.1.5/htfuzzy/SuffixEntry.cc  Fri Feb 25 02:29:10 2000
--- mod/htdig-3.1.5/htfuzzy/SuffixEntry.cc      Thu Jun 29 14:44:43 2000
***************
*** 19,27 ****
  //*****************************************************************************
  // SuffixEntry::SuffixEntry()
  //
! SuffixEntry::SuffixEntry(char *str)
  {
!     parse(str);
  }
  
  
--- 19,27 ----
  //*****************************************************************************
  // SuffixEntry::SuffixEntry()
  //
! SuffixEntry::SuffixEntry(char *str, Dictionary &lat_encoding)
  {
!     parse(str, lat_encoding);
  }
  
  
***************
*** 38,44 ****
  //   Parse a string in the format <expr> '>' <rule> into ourselves.
  //
  void
! SuffixEntry::parse(char *str)
  {
      String    temp = 0;
      
--- 38,44 ----
  //   Parse a string in the format <expr> '>' <rule> into ourselves.
  //
  void
! SuffixEntry::parse(char *str, Dictionary &lat_encoding)
  {
      String    temp = 0;
      
***************
*** 56,70 ****
      while (*str == ' ' || *str == '\t' || *str == '>')
        str++;
  
!     Endings::mungeWord(temp, expression);
      
      temp = 0;
!     while (*str != ' ' && *str != '\t' && *str != '\n' && *str != '\r' && *str)
      {
!       temp << *str;
        str++;
      }
!     Endings::mungeWord(temp, rule);
  }
- 
- 
--- 56,70 ----
      while (*str == ' ' || *str == '\t' || *str == '>')
        str++;
  
!     Endings::mungeWord(temp, expression, lat_encoding);
      
      temp = 0;
!     while (*str != '#' && *str != '\n' && *str != '\r' && *str)
      {
!         if(*str!= ' ' && *str!= '\t') {
!           temp << *str;
!         }
        str++;
      }
!     Endings::mungeWord(temp, rule, lat_encoding);
  }
diff -rc htdig-3.1.5/htfuzzy/SuffixEntry.h mod/htdig-3.1.5/htfuzzy/SuffixEntry.h
*** htdig-3.1.5/htfuzzy/SuffixEntry.h   Fri Feb 25 02:29:10 2000
--- mod/htdig-3.1.5/htfuzzy/SuffixEntry.h       Thu Jun 29 14:43:08 2000
***************
*** 15,20 ****
--- 15,21 ----
  #define _SuffixEntry_h_
  
  #include "Object.h"
+ #include <Dictionary.h>
  #include <htString.h>
  
  
***************
*** 24,36 ****
        //
        // Construction/Destruction
        //
!                                       SuffixEntry(char *);
                                        ~SuffixEntry();
  
        String                  expression;
        String                  rule;
  
!       void                    parse(char *str);
        
  private:
  };
--- 25,37 ----
        //
        // Construction/Destruction
        //
!                                       SuffixEntry(char *, Dictionary &lat_encoding);
                                        ~SuffixEntry();
  
        String                  expression;
        String                  rule;
  
!       void                    parse(char *str, Dictionary &lat_encoding);
        
  private:
  };


------------------------------------
To unsubscribe from the htdig mailing list, send a message to
[EMAIL PROTECTED]
You will receive a message to confirm this.
[htdig] Latin1.patch

Reply via email to