The attached patch adds a new mechanism to transfer rules: <exception>

An <exception> contains a single <test>. If the test evaluates to
'true', the current rule is ignored and the previously matched rule is
applied instead (the implication being that <exception> should only be
used in rules whose <pattern> contains more than one <pattern-item>).

Simple example:

<?xml version="1.0" encoding="UTF-8"?>
<transfer default="chunk">
  <section-def-cats>
    <def-cat n="adj">
      <cat-item tags="adj"/>
    </def-cat>

    <def-cat n="n">
      <cat-item tags="n"/>
    </def-cat>
  </section-def-cats>
  <section-def-attrs>
    <def-attr n="a_nom">
      <attr-item tags="n"/>
    </def-attr>
    <def-attr n="a_adj">
      <attr-item tags="adj"/>
    </def-attr>
  </section-def-attrs>
  <section-def-vars>
    <def-var n="dummy"/>
  </section-def-vars>
  <section-rules>
    <rule>
      <pattern>
        <pattern-item n="adj"/>
        <pattern-item n="n"/>
      </pattern>
      <action>
        <exception>
          <test>
            <equal caseless="yes">
              <clip pos="2" side="sl" part="lem"/>
              <lit v="bar"/>
            </equal>
          </test>
        </exception>
        <out>
          <chunk name="adj_nom">
            <tags>
              <tag><lit-tag v="SN"/></tag>
            </tags>
            <lu>
              <clip pos="1" side="sl" part="whole"/>
            </lu>
            <b pos="1"/>
            <lu>
              <clip pos="2" side="sl" part="whole"/>
            </lu>
          </chunk>
        </out>
      </action>
    </rule>
    <!-- Defaults -->
    <rule>
      <pattern>
        <pattern-item n="adj"/>
      </pattern>
      <action>
        <out>
          <chunk name="adj">
            <tags>
              <tag><lit-tag v="SN"/></tag>
            </tags>
            <lu>
              <clip pos="1" side="sl" part="whole"/>
            </lu>
          </chunk>
        </out>
      </action>
    </rule>
    <rule>
      <pattern>
        <pattern-item n="n"/>
      </pattern>
      <action>
        <out>
          <chunk name="n">
            <tags>
              <tag><lit-tag v="SN"/></tag>
            </tags>
            <lu>
              <clip pos="1" side="sl" part="whole"/>
            </lu>
          </chunk>
        </out>
      </action>
    </rule>
  </section-rules>
</transfer>

$ echo '^fooish<adj>$[1]^bars<n>$[2]^nom<n>$' | apertium-transfer -n simple.t1x simple.bin
^adj_nom<SN>{^fooish<adj>$[1]^bars<n>$}$[2]^n<SN>{^nom<n>$}$
$ echo '^fooish<adj>$[1]^bar<n>$[2]^nom<n>$' | apertium-transfer -n simple.t1x simple.bin
^adj<SN>{^fooish<adj>$}$[1]^n<SN>{^bar<n>$}$[2]^n<SN>{^nom<n>$}$

Motivation:

The primary motivation was dealing with Polish, which is highly
inflected (so it has few 'markers') and in which adjectives can come
before or after the noun. Inflection *usually* gives enough information
for proper segmentation, but handling it properly without exceptions
would mean having individual rules for each gender, case, and number,
for each combination of words (i.e., multiplying the number of NP rules
by 70). I've recently seen that it would also help in less inflected
languages, so it's probably generally useful.

Caveats:

* Sanity checks are almost nonexistent -- more are needed
* Only in apertium-transfer (I'll add this to interchunk when transfer
is tested)
* Only a single level of backoff is supported: if the rule being backed
off to also triggers an exception, breakage will happen.
* Very little testing (see the example above? that's it)
* Changes the signature of some methods (all private).

Comments? Anybody interested in taking it for a whirl?

-- 
<Leftmost> jimregan, that's because deep inside you, you are evil.
<Leftmost> Also not-so-deep inside you.
Index: transfer.h
===================================================================
--- transfer.h	(revision 23790)
+++ transfer.h	(working copy)
@@ -70,6 +70,7 @@
   int any_tag;
 
   xmlNode *lastrule;
+  xmlNode *prevrule;
   unsigned int nwords;
   
   map<xmlNode *, TransferInstr> evalStringCache;
@@ -98,6 +99,7 @@
   void processModifyCase(xmlNode *localroot);
   bool processLogical(xmlNode *localroot);
   bool processTest(xmlNode *localroot);
+  bool processException(xmlNode *localroot);
   bool processAnd(xmlNode *localroot);
   bool processOr(xmlNode *localroot);
   bool processEqual(xmlNode *localroot);
@@ -108,7 +110,7 @@
   bool processContainsSubstring(xmlNode *localroot);
   bool processNot(xmlNode *localroot);
   bool processIn(xmlNode *localroot);
-  void processRule(xmlNode *localroot);
+  bool processRule(xmlNode *localroot);
   string evalString(xmlNode *localroot);
   void processInstruction(xmlNode *localroot);
   void processChoose(xmlNode *localroot);
@@ -123,7 +125,7 @@
   wstring readBlank(FILE *in);
   wstring readUntil(FILE *in, int const symbol) const;
   void applyWord(wstring const &word_str);
-  void applyRule();
+  bool applyRule();
   TransferToken & readToken(FILE *in);
   bool checkIndex(xmlNode *element, int index, int limit);
   void transfer_wrapper_null_flush(FILE *in, FILE *out);
Index: transfer.dtd
===================================================================
--- transfer.dtd	(revision 23790)
+++ transfer.dtd	(working copy)
@@ -156,12 +156,18 @@
        Each attribute to be activated is referred to by its name in the def-cats section 
 -->
 
-<!ELEMENT action (%sentence;)*>
+<!ELEMENT action (exception?, (%sentence;)*)>
 <!ATTLIST action c CDATA #IMPLIED>
 <!-- 
        Encloses the procedural part of a rule
 -->
 
+<!ELEMENT exception (test)>
+<!ATTLIST exception c CDATA #IMPLIED>
+<!-- 
+       An exception to a rule
+-->
+
 <!ELEMENT choose (when+,otherwise?)>
 <!ATTLIST choose c CDATA #IMPLIED>
 <!-- 
Index: transfer.cc
===================================================================
--- transfer.cc	(revision 23790)
+++ transfer.cc	(working copy)
@@ -1210,6 +1210,19 @@
 }
 
 bool
+Transfer::processException(xmlNode *localroot)
+{
+  for(xmlNode *i = localroot->children; i != NULL; i = i->next)
+  {
+    if(i->type == XML_ELEMENT_NODE)
+    {
+      return !processTest(i);
+    }
+  }  
+  return false;
+}
+
+bool
 Transfer::processAnd(xmlNode *localroot)
 {
   bool val = true;
@@ -1641,17 +1654,26 @@
   return result;
 }
 
-void
+bool
 Transfer::processRule(xmlNode *localroot)
 {
   // localroot is suposed to be an 'action' tag
   for(xmlNode *i = localroot->children; i != NULL; i = i->next)
   {
+    if(!xmlStrcmp(i->name, (const xmlChar *) "exception"))
+    {
+      if(!processException(i))
+      {
+        return false;
+      }
+    }
+
     if(i->type == XML_ELEMENT_NODE)
     {
       processInstruction(i);
     }
   }
+  return true;
 }
 
 TransferToken &
@@ -1754,6 +1776,7 @@
   }
   
   int last = 0;
+  int prev = 0;
 
   output = out;
   ms.init(me->getInitial());
@@ -1764,8 +1787,16 @@
     {
       if(lastrule != NULL)
       {
-	applyRule();
-	input_buffer.setPos(last);
+	if (applyRule())
+        {
+	  input_buffer.setPos(last);
+        }
+        else
+        {
+          lastrule = prevrule;
+          input_buffer.setPos(prev);
+          applyRule();
+        }
       }
       else
       {
@@ -1820,7 +1851,8 @@
 	  }
 	  tmpword.clear();
 	  input_buffer.setPos(last);
-	  input_buffer.next();       
+	  input_buffer.next();
+          prev = last;
 	  last = input_buffer.getPos();
 	  ms.init(me->getInitial());
 	}
@@ -1828,6 +1860,7 @@
 	{
 	  fputws_unlocked(tmpblank[0]->c_str(), output);
 	  tmpblank.clear();
+          prev = last;
 	  last = input_buffer.getPos();
 	  ms.init(me->getInitial());
 	}
@@ -1836,7 +1869,9 @@
     int val = ms.classifyFinals(me->getFinals());
     if(val != -1)
     {
-      lastrule = rule_map[val-1];      
+      prevrule = lastrule;
+      lastrule = rule_map[val-1];
+      prev = last;      
       last = input_buffer.getPos();
     }
 
@@ -1874,9 +1909,10 @@
   }
 }
 
-void
+bool
 Transfer::applyRule()
 {
+  bool ret = true;
   unsigned int limit = tmpword.size();
   
   for(unsigned int i = 0; i != limit; i++)
@@ -1915,7 +1951,11 @@
 			       UtfConverter::toUtf8(tr.first), tr.second);
   }
 
-  processRule(lastrule);
+  if(!processRule(lastrule))
+  {
+    return false;
+  }
+  
   lastrule = NULL;
 
   if(word)
@@ -1939,6 +1979,8 @@
   tmpword.clear();
   tmpblank.clear();
   ms.init(me->getInitial());
+
+  return ret;
 }
 
 void
------------------------------------------------------------------------------
This SF.net email is sponsored by Sprint
What will you do first with EVO, the first 4G phone?
Visit sprint.com/first -- http://p.sf.net/sfu/sprint-com-first
_______________________________________________
Apertium-stuff mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/apertium-stuff

Reply via email to