The attached patch adds a new mechanism to transfer rules: <exception>.
An <exception> contains a single <test> -- if the test evaluates to
'true', the current rule is ignored, and the last applicable rule is
used instead (the implication being that it should only be used in
rules whose <pattern> contains more than one <pattern-item>).
Simple example:
<?xml version="1.0" encoding="UTF-8"?>
<transfer default="chunk">
<section-def-cats>
<def-cat n="adj">
<cat-item tags="adj"/>
</def-cat>
<def-cat n="n">
<cat-item tags="n"/>
</def-cat>
</section-def-cats>
<section-def-attrs>
<def-attr n="a_nom">
<attr-item tags="n"/>
</def-attr>
<def-attr n="a_adj">
<attr-item tags="adj"/>
</def-attr>
</section-def-attrs>
<section-def-vars>
<def-var n="dummy"/>
</section-def-vars>
<section-rules>
<rule>
<pattern>
<pattern-item n="adj"/>
<pattern-item n="n"/>
</pattern>
<action>
<exception>
<test>
<equal caseless="yes">
<clip pos="2" side="sl" part="lem"/>
<lit v="bar"/>
</equal>
</test>
</exception>
<out>
<chunk name="adj_nom">
<tags>
<tag><lit-tag v="SN"/></tag>
</tags>
<lu>
<clip pos="1" side="sl" part="whole"/>
</lu>
<b pos="1"/>
<lu>
<clip pos="2" side="sl" part="whole"/>
</lu>
</chunk>
</out>
</action>
</rule>
<!-- Defaults -->
<rule>
<pattern>
<pattern-item n="adj"/>
</pattern>
<action>
<out>
<chunk name="adj">
<tags>
<tag><lit-tag v="SN"/></tag>
</tags>
<lu>
<clip pos="1" side="sl" part="whole"/>
</lu>
</chunk>
</out>
</action>
</rule>
<rule>
<pattern>
<pattern-item n="n"/>
</pattern>
<action>
<out>
<chunk name="n">
<tags>
<tag><lit-tag v="SN"/></tag>
</tags>
<lu>
<clip pos="1" side="sl" part="whole"/>
</lu>
</chunk>
</out>
</action>
</rule>
</section-rules>
</transfer>
$ echo '^fooish<adj>$[1]^bars<n>$[2]^nom<n>$' | apertium-transfer -n \
    simple.t1x simple.bin
^adj_nom<SN>{^fooish<adj>$[1]^bars<n>$}$[2]^n<SN>{^nom<n>$}$
$ echo '^fooish<adj>$[1]^bar<n>$[2]^nom<n>$' | apertium-transfer -n \
    simple.t1x simple.bin
^adj<SN>{^fooish<adj>$}$[1]^n<SN>{^bar<n>$}$[2]^n<SN>{^nom<n>$}$
Motivation:
The primary motivation was dealing with Polish, which is highly inflected
(few 'markers') and allows adjectives to come before or after the noun.
Inflection *usually* gives enough information for proper segmentation,
but handling it properly would be a matter of having individual rules
for each gender, case, and number + each combination of words (i.e.,
multiply the number of NP rules by 70). I've seen recently that it would
also help in less inflected languages, so it's probably generally useful.
Caveats:
* Sanity checks are mostly missing -- more are needed
* Only in apertium-transfer (I'll add this to interchunk when transfer
is tested)
* Only a single level of backoff (if the rule that's being backed off
to also triggers an exception... breakage will happen).
* Very little testing (see the example above? that's it)
* Changes the signature of some methods (all private).
Comments? Anybody interested in taking it for a whirl?
--
<Leftmost> jimregan, that's because deep inside you, you are evil.
<Leftmost> Also not-so-deep inside you.
Index: transfer.h
===================================================================
--- transfer.h (revision 23790)
+++ transfer.h (working copy)
@@ -70,6 +70,7 @@
int any_tag;
xmlNode *lastrule;
+ xmlNode *prevrule;
unsigned int nwords;
map<xmlNode *, TransferInstr> evalStringCache;
@@ -98,6 +99,7 @@
void processModifyCase(xmlNode *localroot);
bool processLogical(xmlNode *localroot);
bool processTest(xmlNode *localroot);
+ bool processException(xmlNode *localroot);
bool processAnd(xmlNode *localroot);
bool processOr(xmlNode *localroot);
bool processEqual(xmlNode *localroot);
@@ -108,7 +110,7 @@
bool processContainsSubstring(xmlNode *localroot);
bool processNot(xmlNode *localroot);
bool processIn(xmlNode *localroot);
- void processRule(xmlNode *localroot);
+ bool processRule(xmlNode *localroot);
string evalString(xmlNode *localroot);
void processInstruction(xmlNode *localroot);
void processChoose(xmlNode *localroot);
@@ -123,7 +125,7 @@
wstring readBlank(FILE *in);
wstring readUntil(FILE *in, int const symbol) const;
void applyWord(wstring const &word_str);
- void applyRule();
+ bool applyRule();
TransferToken & readToken(FILE *in);
bool checkIndex(xmlNode *element, int index, int limit);
void transfer_wrapper_null_flush(FILE *in, FILE *out);
Index: transfer.dtd
===================================================================
--- transfer.dtd (revision 23790)
+++ transfer.dtd (working copy)
@@ -156,12 +156,18 @@
Each attribute to be activated is referred to by its name in the def-cats section
-->
-<!ELEMENT action (%sentence;)*>
+<!ELEMENT action (exception?, (%sentence;)*)>
<!ATTLIST action c CDATA #IMPLIED>
<!--
Encloses the procedural part of a rule
-->
+<!ELEMENT exception (test)>
+<!ATTLIST exception c CDATA #IMPLIED>
+<!--
+ An exception to a rule
+-->
+
<!ELEMENT choose (when+,otherwise?)>
<!ATTLIST choose c CDATA #IMPLIED>
<!--
Index: transfer.cc
===================================================================
--- transfer.cc (revision 23790)
+++ transfer.cc (working copy)
@@ -1210,6 +1210,19 @@
}
bool
+Transfer::processException(xmlNode *localroot)
+{
+ for(xmlNode *i = localroot->children; i != NULL; i = i->next)
+ {
+ if(i->type == XML_ELEMENT_NODE)
+ {
+ return !processTest(i);
+ }
+ }
+ return false;
+}
+
+bool
Transfer::processAnd(xmlNode *localroot)
{
bool val = true;
@@ -1641,17 +1654,26 @@
return result;
}
-void
+bool
Transfer::processRule(xmlNode *localroot)
{
// localroot is suposed to be an 'action' tag
for(xmlNode *i = localroot->children; i != NULL; i = i->next)
{
+ if(!xmlStrcmp(i->name, (const xmlChar *) "exception"))
+ {
+ if(!processException(i))
+ {
+ return false;
+ }
+ }
+
if(i->type == XML_ELEMENT_NODE)
{
processInstruction(i);
}
}
+ return true;
}
TransferToken &
@@ -1754,6 +1776,7 @@
}
int last = 0;
+ int prev = 0;
output = out;
ms.init(me->getInitial());
@@ -1764,8 +1787,16 @@
{
if(lastrule != NULL)
{
- applyRule();
- input_buffer.setPos(last);
+ if (applyRule())
+ {
+ input_buffer.setPos(last);
+ }
+ else
+ {
+ lastrule = prevrule;
+ input_buffer.setPos(prev);
+ applyRule();
+ }
}
else
{
@@ -1820,7 +1851,8 @@
}
tmpword.clear();
input_buffer.setPos(last);
- input_buffer.next();
+ input_buffer.next();
+ prev = last;
last = input_buffer.getPos();
ms.init(me->getInitial());
}
@@ -1828,6 +1860,7 @@
{
fputws_unlocked(tmpblank[0]->c_str(), output);
tmpblank.clear();
+ prev = last;
last = input_buffer.getPos();
ms.init(me->getInitial());
}
@@ -1836,7 +1869,9 @@
int val = ms.classifyFinals(me->getFinals());
if(val != -1)
{
- lastrule = rule_map[val-1];
+ prevrule = lastrule;
+ lastrule = rule_map[val-1];
+ prev = last;
last = input_buffer.getPos();
}
@@ -1874,9 +1909,10 @@
}
}
-void
+bool
Transfer::applyRule()
{
+ bool ret = true;
unsigned int limit = tmpword.size();
for(unsigned int i = 0; i != limit; i++)
@@ -1915,7 +1951,11 @@
UtfConverter::toUtf8(tr.first), tr.second);
}
- processRule(lastrule);
+ if(!processRule(lastrule))
+ {
+ return false;
+ }
+
lastrule = NULL;
if(word)
@@ -1939,6 +1979,8 @@
tmpword.clear();
tmpblank.clear();
ms.init(me->getInitial());
+
+ return ret;
}
void
------------------------------------------------------------------------------
This SF.net email is sponsored by Sprint
What will you do first with EVO, the first 4G phone?
Visit sprint.com/first -- http://p.sf.net/sfu/sprint-com-first
_______________________________________________
Apertium-stuff mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/apertium-stuff