TokenizerPTB.java

tmill Wed, 13 Nov 2013 07:27:16 -0800

Author: tmill
Date: Wed Nov 13 15:26:29 2013
New Revision: 1541553

URL: http://svn.apache.org/r1541553
Log:
Fixes CTAKES-266. Checks that the word token is not zero-length before 
creating the token that precedes a contraction.


Modified:
    
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java

Modified: 
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
URL: 
http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java?rev=1541553&r1=1541552&r2=1541553&view=diff
==============================================================================
--- 
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
 (original)
+++ 
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
 Wed Nov 13 15:26:29 2013
@@ -343,12 +343,13 @@ public class TokenizerPTB {
                                        char c = 
lowerCasedText.charAt(currentPosition+len);
                                        if (c=='n' || c==APOSTROPHE) { // if a 
"n't" contraction or a contraction where contraction token starts with '
                                            if (tokenLen < 0) throw new 
RuntimeException("c = " + c + "tokenLen = " + tokenLen + " currentPosition = " 
+ currentPosition);
-                                           // First create the WordToken (no 
apostrophe)
-                                           bta = createToken(tokenClass, 
textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
-                                           //System.out.println("bta = " + bta 
+ " class = " + bta.getClass() + " tokenLen = " + tokenLen + " currentPosition 
= " + currentPosition);
-                                           tokens.add(bta);
-                                           currentPosition+=tokenLen; // 
currentPosition
-
+                                           // First create the WordToken (no 
apostrophe)
+                                           if(tokenLen > 0){
+                                             bta = createToken(tokenClass, 
textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
+                                             //System.out.println("bta = " + 
bta + " class = " + bta.getClass() + " tokenLen = " + tokenLen + " 
currentPosition = " + currentPosition);
+                                             tokens.add(bta);
+                                             currentPosition+=tokenLen; // 
currentPosition
+                                           }
                                            // Set up to create the second 
token, for other contractions, the next token will start with an 
                                            // apostrophe and be handled 
above... but for "n't" contractions, next token won't start with apostrophe
                                            // so just go ahead and handle it 
here instead of having to keep track of previous

svn commit: r1541553 - /ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java

Reply via email to