Author: tmill
Date: Wed Nov 13 15:26:29 2013
New Revision: 1541553
URL: http://svn.apache.org/r1541553
Log:
Fixes CTAKES-266. Checks for a zero-length word token before creating the
token that precedes a contraction.
Modified:
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
Modified:
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
URL:
http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java?rev=1541553&r1=1541552&r2=1541553&view=diff
==============================================================================
---
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
(original)
+++
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
Wed Nov 13 15:26:29 2013
@@ -343,12 +343,13 @@ public class TokenizerPTB {
char c =
lowerCasedText.charAt(currentPosition+len);
if (c=='n' || c==APOSTROPHE) { // if a
"n't" contraction or a contraction where contraction token starts with '
if (tokenLen < 0) throw new
RuntimeException("c = " + c + "tokenLen = " + tokenLen + " currentPosition = "
+ currentPosition);
- // First create the WordToken (no
apostrophe)
- bta = createToken(tokenClass,
textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
- //System.out.println("bta = " + bta
+ " class = " + bta.getClass() + " tokenLen = " + tokenLen + " currentPosition
= " + currentPosition);
- tokens.add(bta);
- currentPosition+=tokenLen; //
currentPosition
-
+ // First create the WordToken (no
apostrophe)
+ if(tokenLen > 0){
+ bta = createToken(tokenClass,
textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
+ //System.out.println("bta = " +
bta + " class = " + bta.getClass() + " tokenLen = " + tokenLen + "
currentPosition = " + currentPosition);
+ tokens.add(bta);
+ currentPosition+=tokenLen; //
currentPosition
+ }
// Set up to create the second
token, for other contractions, the next token will start with an
// apostrophe and be handled
above... but for "n't" contractions, next token won't start with apostrophe
// so just go ahead and handle it
here instead of having to keep track of previous