Author: rezan
Date: Wed Jul 29 21:47:32 2015
New Revision: 1693352
URL: http://svn.apache.org/r1693352
Log:
changed tokenization from byte to char
Modified:
devicemap/trunk/clients/2.0/reference/src/DeviceMapClient.java
Modified: devicemap/trunk/clients/2.0/reference/src/DeviceMapClient.java
URL: http://svn.apache.org/viewvc/devicemap/trunk/clients/2.0/reference/src/DeviceMapClient.java?rev=1693352&r1=1693351&r2=1693352&view=diff
==============================================================================
--- devicemap/trunk/clients/2.0/reference/src/DeviceMapClient.java (original)
+++ devicemap/trunk/clients/2.0/reference/src/DeviceMapClient.java Wed Jul 29 21:47:32 2015
@@ -33,7 +33,7 @@ public class DeviceMapClient {
private String domainVersion;
private List<Transformer> transformers;
- private List<byte[]> tokenSeperators;
+ private List<String> tokenSeperators;
private int ngramConcatSize;
private String defaultId;
@@ -100,15 +100,17 @@ public class DeviceMapClient {
}
for(Iterator<JsonNode> i =
inputParser.get("tokenSeperators").iterator(); i.hasNext();) {
- JsonNode tokenSeperator = i.next();
+ JsonNode tokenSeperatorNode = i.next();
- if(tokenSeperator.asText().isEmpty()) {
+ if(tokenSeperatorNode.asText().isEmpty()) {
throw new Exception("Empty tokenSeperator not allowed");
}
- tokenSeperators.add(tokenSeperator.asText().getBytes());
+ String tokenSeperator = tokenSeperatorNode.asText();
- Main.log("Found tokenSeperator: '" + tokenSeperator.asText() + "'",
2);
+ tokenSeperators.add(tokenSeperator);
+
+ Main.log("Found tokenSeperator: '" + tokenSeperator + "'", 2);
}
}
@@ -258,47 +260,47 @@ public class DeviceMapClient {
Main.log("Transformed: '" + transformed + "'", 3);
- //tokenization using bytes
+ //tokenization
List<String> tokens = new ArrayList<>();
- byte[] source = transformed.getBytes();
+ String source = transformed;
int sourcePos = 0;
-
- byte[] dest = new byte[source.length];
- int destPos = 0;
+ int destStart = 0;
+ int destEnd = 0;
source:
- while(sourcePos < source.length) {
+ while(sourcePos < source.length()) {
seperator:
- for(byte[] seperator : tokenSeperators) {
+ for(String seperator : tokenSeperators) {
int i;
- for(i = 0; i < seperator.length; i++) {
- if(source[sourcePos + i] != seperator[i]) {
+ for(i = 0; i < seperator.length(); i++) {
+ if(sourcePos + i >= source.length() || source.charAt(sourcePos + i)
!= seperator.charAt(i)) {
continue seperator;
}
}
- if(destPos > 0) {
- tokens.add(new String(dest, 0, destPos));
- destPos = 0;
+ if(destEnd - destStart > 0) {
+ tokens.add(source.substring(destStart, destEnd));
}
-
+
sourcePos += i;
+ destStart = destEnd = sourcePos;
continue source;
}
- dest[destPos++] = source[sourcePos++];
+ sourcePos++;
+ destEnd++;
}
- if(destPos > 0) {
- tokens.add(new String(dest, 0, destPos));
+ if(destEnd - destStart > 0) {
+ tokens.add(source.substring(destStart, destEnd));
}
Main.log("Tokens: " + tokens, 3);
- List<String> ngrams = new ArrayList<>();
+ List<String> ngramTokenStream = new ArrayList<>();
for(int i = 0; i < tokens.size(); i++) {
String ngram = "";
@@ -310,12 +312,12 @@ public class DeviceMapClient {
ngramParts.add(0, ngram);
}
- ngrams.addAll(ngramParts);
+ ngramTokenStream.addAll(ngramParts);
ngramParts.clear();
}
- Main.log("Ngrams: " + ngrams, 3);
+ Main.log("Ngrams: " + ngramTokenStream, 3);
return "";
}