Author: ssc
Date: Mon Mar 11 17:51:29 2013
New Revision: 1455258
URL: http://svn.apache.org/r1455258
Log:
MAHOUT-1150 ARFF Integration does not support quoted identifiers
Added:
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java?rev=1455258&r1=1455257&r2=1455258&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
Mon Mar 11 17:51:29 2013
@@ -38,6 +38,25 @@ public enum ARFFType {
public String getLabel(String line) {
int idx = line.lastIndexOf(indicator);
- return line.substring(ARFFModel.ATTRIBUTE.length(), idx).trim();
+ return removeQuotes(line.substring(ARFFModel.ATTRIBUTE.length(), idx));
+ }
+
+ /**
+ * Trims leading/trailing whitespace, then strips one enclosing pair of matching single or double quotes.
+ * @param str the string to clean; may be {@code null}
+ * @return the trimmed string with any enclosing quote pair removed, or {@code null} if {@code str} is {@code null}
+ */
+ public static String removeQuotes(String str) {
+ String cleaned = str;
+ if (cleaned != null) {
+ cleaned = cleaned.trim();
+ boolean isQuoted = cleaned.length() > 1 && // a lone quote character is not a quoted pair
+ (cleaned.startsWith("\"") && cleaned.endsWith("\"") ||
+ cleaned.startsWith("'") && cleaned.endsWith("'"));
+ if (isQuoted) {
+ cleaned = cleaned.substring(1, cleaned.length() - 1);
+ }
+ }
+ return cleaned;
}
}
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=1455258&r1=1455257&r2=1455258&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
Mon Mar 11 17:51:29 2013
@@ -85,7 +85,7 @@ public class ARFFVectorIterable implemen
if (lower.startsWith(ARFFModel.ARFF_COMMENT)) {
continue;
} else if (lower.startsWith(ARFFModel.RELATION)) {
- model.setRelation(line.substring(ARFFModel.RELATION.length()).trim());
+ model.setRelation(ARFFType.removeQuotes(line.substring(ARFFModel.RELATION.length())));
} else if (lower.startsWith(ARFFModel.ATTRIBUTE)) {
String label;
ARFFType type;
@@ -108,7 +108,7 @@ public class ARFFVectorIterable implemen
int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator());
String[] classes = COMMA_PATTERN.split(line.substring(classIdx + 1, line.length() - 1));
for (int i = 0; i < classes.length; i++) {
- model.addNominal(label, classes[i].trim(), i + 1);
+ model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1);
}
} else if (lower.contains(ARFFType.DATE.getIndicator())) {
label = ARFFType.DATE.getLabel(lower);
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java?rev=1455258&r1=1455257&r2=1455258&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
Mon Mar 11 17:51:29 2013
@@ -117,7 +117,7 @@ public class MapBackedARFFModel implemen
double result;
Map<String,Integer> classes = nominalMap.get(label);
if (classes != null) {
- Integer ord = classes.get(data);
+ Integer ord = classes.get(ARFFType.removeQuotes(data));
if (ord != null) {
result = ord;
} else {
Added:
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java?rev=1455258&view=auto
==============================================================================
---
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
(added)
+++
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
Mon Mar 11 17:51:29 2013
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import org.apache.mahout.utils.MahoutTestCase;
+import org.junit.Test;
+
+public class ARFFTypeTest extends MahoutTestCase{
+
+ @Test
+ public void removeQuotes() {
+ // Covers null pass-through, empty and blank input, both quote styles, and whitespace trimming.
+ assertEquals(null, ARFFType.removeQuotes(null));
+ assertEquals("", ARFFType.removeQuotes("\"\""));
+ assertEquals("", ARFFType.removeQuotes("''"));
+ assertEquals("", ARFFType.removeQuotes(""));
+ assertEquals("", ARFFType.removeQuotes(" "));
+ assertEquals("single", ARFFType.removeQuotes("'single'"));
+ assertEquals("double", ARFFType.removeQuotes("\"double\""));
+ assertEquals("trim", ARFFType.removeQuotes(" trim "));
+ }
+}
Modified:
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java?rev=1455258&r1=1455257&r2=1455258&view=diff
==============================================================================
---
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
(original)
+++
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
Mon Mar 11 17:51:29 2013
@@ -246,6 +246,55 @@ public final class ARFFVectorIterableTes
assertEquals(3.0, vector.get(2), EPSILON);
}
+ @Test
+ public void testQuotes() throws Exception {
+
+ // ARFF allows quotes on identifiers
+ String arff = "@RELATION 'quotes'\n"
+ + "@ATTRIBUTE 'theNumeric' NUMERIC\n"
+ + "@ATTRIBUTE \"theInteger\" INTEGER\n"
+ + "@ATTRIBUTE theReal REAL\n"
+ + "@ATTRIBUTE theNominal {\"double-quote\", 'single-quote', no-quote}\n"
+ + "@DATA\n"
+ + "1.0,2,3.0,\"no-quote\"\n"
+ + "4.0,5,6.0,single-quote\n"
+ + "7.0,8,9.0,'double-quote'\n"
+ ;
+ ARFFModel model = new MapBackedARFFModel();
+ ARFFVectorIterable iterable = new ARFFVectorIterable(arff, model);
+ model = iterable.getModel();
+ assertNotNull(model);
+ assertEquals("quotes", model.getRelation());
+
+ // check attribute labels
+ assertEquals(4, model.getLabelSize());
+ assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
+ assertEquals(ARFFType.INTEGER, model.getARFFType(1));
+ assertEquals(ARFFType.REAL, model.getARFFType(2));
+ assertEquals(ARFFType.NOMINAL, model.getARFFType(3));
+
+ Map<String, Integer> labelBindings = model.getLabelBindings();
+ assertTrue(labelBindings.keySet().contains("thenumeric"));
+ assertTrue(labelBindings.keySet().contains("theinteger"));
+ assertTrue(labelBindings.keySet().contains("thereal"));
+ assertTrue(labelBindings.keySet().contains("thenominal"));
+
+ // check nominal values
+ Map<String, Integer> nominalMap = model.getNominalMap().get("thenominal");
+ assertNotNull(nominalMap);
+ assertEquals(3, nominalMap.size());
+ assertTrue(nominalMap.keySet().contains("double-quote"));
+ assertTrue(nominalMap.keySet().contains("single-quote"));
+ assertTrue(nominalMap.keySet().contains("no-quote"));
+
+ // check data values
+ Iterator<Vector> it = iterable.iterator();
+ Vector vector = it.next();
+ assertEquals(nominalMap.get("no-quote"), vector.get(3), EPSILON);
+ assertEquals(nominalMap.get("single-quote"), it.next().get(3), EPSILON);
+ assertEquals(nominalMap.get("double-quote"), it.next().get(3), EPSILON);
+ }
+
private static final String SAMPLE_DENSE_ARFF = " % Comments\n" + " % \n" + " % Comments go here"
+ " % \n" + " @RELATION golf\n" + '\n'
+ " @ATTRIBUTE outlook {sunny,overcast, rain}\n"