Author: siren Date: Mon Nov 13 11:46:56 2006 New Revision: 474464 URL: http://svn.apache.org/viewvc?view=rev&rev=474464 Log: NUTCH-395 Increase fetching speed
Added: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=474464&r1=474463&r2=474464 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Nov 13 11:46:56 2006 @@ -71,6 +71,7 @@ 22. NUTCH-399 - Change CommandRunner to use concurrent api from jdk (siren) +23. NUTCH-395 - Increase fetching speed (siren) Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?view=diff&rev=474464&r1=474463&r2=474464 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Mon Nov 13 11:46:56 2006 @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
@@ -16,103 +16,58 @@ */ package org.apache.nutch.metadata; -// JDK imports import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import java.lang.reflect.Field; -import java.lang.reflect.Modifier; -import java.text.SimpleDateFormat; -import java.util.ArrayList; import java.util.Enumeration; import java.util.HashMap; -import java.util.Iterator; -import java.util.List; import java.util.Map; import java.util.Properties; - -// Commons Lang imports -import org.apache.commons.lang.StringUtils; - -// Hadoop imports import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; /** - * A syntax tolerant and multi-valued metadata container. - * - * All the static String fields declared by this class are used as reference - * names for syntax correction on meta-data naming. + * A multi-valued metadata container. * * @author Chris Mattmann * @author Jérôme Charron + * */ -public class Metadata implements CreativeCommons, - DublinCore, - HttpHeaders, - Nutch, - Office, - Writable { - - - /** Used to format DC dates for the DATE metadata field */ - public final static SimpleDateFormat DATE_FORMAT = - new SimpleDateFormat("yyyy-MM-dd"); - - - private final static Map NAMES_IDX = new HashMap(); - private static String[] normalized = null; - - // Uses self introspection to fill the metanames index and the - // metanames list. - static { - Field[] fields = Metadata.class.getFields(); - for (int i=0; i<fields.length; i++) { - int mods = fields[i].getModifiers(); - if (Modifier.isFinal(mods) && - Modifier.isPublic(mods) && - Modifier.isStatic(mods) && - fields[i].getType().equals(String.class)) { - try { - String val = (String) fields[i].get(null); - NAMES_IDX.put(normalize(val), val); - } catch (Exception e) { - // Simply ignore... 
- } - } - } - normalized = (String[]) NAMES_IDX.keySet().toArray(new String[NAMES_IDX.size()]); - } - - - /** A map of all metadata attributes */ - private Map metadata = null; +public class Metadata implements Writable, CreativeCommons, +DublinCore, HttpHeaders, Nutch, Office { + + /** + * A map of all metadata attributes. + */ + private Map<String, String[]> metadata = null; + - - /** Constructs a new, empty metadata. */ + /** + * Constructs a new, empty metadata. + */ public Metadata() { - metadata = new HashMap(); + metadata = new HashMap<String, String[]>(); } /** + * Returns true if named value is multivalued. + * @param name name of metadata + * @return true is named value is multivalued, false if single + * value or null */ - public boolean isMultiValued(String name) { - return getValues(name).length > 1; + public boolean isMultiValued(final String name) { + return metadata.get(name) != null && metadata.get(name).length > 1; } /** * Returns an array of the names contained in the metadata. + * @return Metadata names */ public String[] names() { - Iterator iter = metadata.keySet().iterator(); - List names = new ArrayList(); - while(iter.hasNext()) { - names.add(getNormalizedName((String) iter.next())); - } - return (String[]) names.toArray(new String[names.size()]); + return metadata.keySet().toArray(new String[metadata.keySet().size()]); } - + /** * Get the value associated to a metadata name. * If many values are assiociated to the specified name, then the first @@ -121,12 +76,12 @@ * @param name of the metadata. * @return the value associated to the specified metadata name. 
*/ - public String get(String name) { - Object values = metadata.get(getNormalizedName(name)); - if ((values != null) && (values instanceof List)) { - return (String) ((List) values).get(0); + public String get(final String name) { + String[] values = metadata.get(name); + if (values == null) { + return null; } else { - return (String) values; + return values[0]; } } @@ -135,19 +90,14 @@ * @param name of the metadata. * @return the values associated to a metadata name. */ - public String[] getValues(String name) { - Object values = metadata.get(getNormalizedName(name)); - if (values != null) { - if (values instanceof List) { - List list = (List) values; - return (String[]) list.toArray(new String[list.size()]); - } else { - return new String[] { (String) values }; - } + public String[] getValues(final String name) { + String[] values = metadata.get(name); + if (values == null) { + values = new String[0]; } - return new String[0]; + return values; } - + /** * Add a metadata name/value mapping. * Add the specified value to the list of values associated to the @@ -156,31 +106,30 @@ * @param name the metadata name. * @param value the metadata value. */ - public void add(String name, String value) { - String normalized = getNormalizedName(name); - Object values = metadata.get(normalized); - if (values != null) { - if (values instanceof String) { - List list = new ArrayList(); - list.add(values); - list.add(value); - metadata.put(normalized, list); - } else if (values instanceof List) { - ((List) values).add(value); - } + public void add(final String name, final String value) { + String[] values = metadata.get(name); + if (values == null) { + set(name, value); } else { - metadata.put(normalized, value); + String[] newValues = new String[values.length + 1]; + System.arraycopy(values, 0, newValues, 0, values.length); + newValues[newValues.length - 1] = value; + metadata.put(name, newValues); } } + /** + * Copy All key-value pairs from properties. 
+ * @param properties properties to copy from + */ public void setAll(Properties properties) { Enumeration names = properties.propertyNames(); while (names.hasMoreElements()) { String name = (String) names.nextElement(); - set(name, properties.getProperty(name)); + metadata.put(name, new String[]{properties.getProperty(name)}); } } - + /** * Set metadata name/value. * Associate the specified value to the specified metadata name. If some @@ -190,46 +139,46 @@ * @param value the metadata value. */ public void set(String name, String value) { - remove(name); - add(name, value); + metadata.put(name, new String[]{value}); } /** * Remove a metadata and all its associated values. + * @param name metadata name to remove */ public void remove(String name) { - metadata.remove(getNormalizedName(name)); + metadata.remove(name); } - + /** * Returns the number of metadata names in this metadata. + * @return number of metadata names */ public int size() { return metadata.size(); } - - // Inherited Javadoc + public boolean equals(Object o) { - + if (o == null) { return false; } - + Metadata other = null; try { other = (Metadata) o; } catch (ClassCastException cce) { return false; } - + if (other.size() != size()) { return false; } - + String[] names = names(); - for (int i=0; i<names.length; i++) { + for (int i = 0; i < names.length; i++) { String[] otherValues = other.getValues(names[i]); String[] thisValues = getValues(names[i]); if (otherValues.length != thisValues.length) { return false; } - for (int j=0; j<otherValues.length; j++) { + for (int j = 0; j < otherValues.length; j++) { if (!otherValues[j].equals(thisValues[j])) { return false; } @@ -238,13 +187,12 @@ return true; } - // Inherited Javadoc public String toString() { StringBuffer buf = new StringBuffer(); String[] names = names(); - for (int i=0; i<names.length; i++) { + for (int i = 0; i < names.length; i++) { String[] values = getValues(names[i]); - for (int j=0; j<values.length; j++) { + for (int j = 0; j < 
values.length; j++) { buf.append(names[i]) .append("=") .append(values[j]) @@ -253,83 +201,31 @@ } return buf.toString(); } - - - /** - * Get the normalized name of metadata attribute name. - * This method tries to find a well-known metadata name (one of the - * metadata names defined in this class) that matches the specified name. - * The matching is error tolerent. For instance, - * <ul> - * <li>content-type gives Content-Type</li> - * <li>CoNtEntType gives Content-Type</li> - * <li>ConTnTtYpe gives Content-Type</li> - * </ul> - * If no matching with a well-known metadata name is found, then the original - * name is returned. - */ - public static String getNormalizedName(String name) { - String searched = normalize(name); - String value = (String) NAMES_IDX.get(searched); - - if ((value == null) && (normalized != null)) { - int threshold = searched.length() / 3; - for (int i=0; i<normalized.length && value == null; i++) { - if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) { - value = (String) NAMES_IDX.get(normalized[i]); - } - } - } - return (value != null) ? 
value : name; - } - - private final static String normalize(String str) { - char c; - StringBuffer buf = new StringBuffer(); - for (int i=0; i<str.length(); i++) { - c = str.charAt(i); - if (Character.isLetter(c)) { - buf.append(Character.toLowerCase(c)); - } - } - return buf.toString(); - } - - /* ------------------------- * - * <implementation:Writable> * - * ------------------------- */ - - // Inherited Javadoc public final void write(DataOutput out) throws IOException { out.writeInt(size()); String[] values = null; String[] names = names(); - for (int i=0; i<names.length; i++) { + for (int i = 0; i < names.length; i++) { Text.writeString(out, names[i]); values = getValues(names[i]); out.writeInt(values.length); - for (int j=0; j<values.length; j++) { + for (int j = 0; j < values.length; j++) { Text.writeString(out, values[j]); } } } - // Inherited Javadoc public final void readFields(DataInput in) throws IOException { int keySize = in.readInt(); String key; - for (int i=0; i<keySize; i++) { + for (int i = 0; i < keySize; i++) { key = Text.readString(in); int valueSize = in.readInt(); - for (int j=0; j<valueSize; j++) { + for (int j = 0; j < valueSize; j++) { add(key, Text.readString(in)); } } } - /* -------------------------- * - * </implementation:Writable> * - * -------------------------- */ - } Added: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java?view=auto&rev=474464 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java Mon Nov 13 11:46:56 2006 @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; + +/** + * A decorator to Metadata that adds spellchecking capabilities to property + * names. + * + * All the static String fields declared by this class are used as reference + * names for syntax correction on meta-data naming. + */ +public class SpellCheckedMetadata extends Metadata { + + /** + * Treshold divider. + * + * <code>threshold = searched.length() / TRESHOLD_DIVIDER;</code> + */ + private static final int TRESHOLD_DIVIDER = 3; + + /** + * Normalized name to name mapping. + */ + private final static Map<String, String> NAMES_IDX = new HashMap<String, String>(); + + /** + * Array holding map keys. + */ + private static String[] normalized = null; + + // Uses self introspection to fill the metanames index and the + // metanames list. 
+ static { + for (Field field : SpellCheckedMetadata.class.getFields()) { + int mods = field.getModifiers(); + if (Modifier.isFinal(mods) && Modifier.isPublic(mods) + && Modifier.isStatic(mods) && field.getType().equals(String.class)) { + try { + String val = (String) field.get(null); + NAMES_IDX.put(normalize(val), val); + } catch (Exception e) { + // Simply ignore... + } + } + } + normalized = NAMES_IDX.keySet().toArray(new String[NAMES_IDX.size()]); + } + + /** + * Normalizes String. + * + * @param str + * the string to normalize + * @return normalized String + */ + private static String normalize(final String str) { + char c; + StringBuffer buf = new StringBuffer(); + for (int i = 0; i < str.length(); i++) { + c = str.charAt(i); + if (Character.isLetter(c)) { + buf.append(Character.toLowerCase(c)); + } + } + return buf.toString(); + } + + /** + * Get the normalized name of metadata attribute name. This method tries to + * find a well-known metadata name (one of the metadata names defined in this + * class) that matches the specified name. The matching is error tolerent. For + * instance, + * <ul> + * <li>content-type gives Content-Type</li> + * <li>CoNtEntType gives Content-Type</li> + * <li>ConTnTtYpe gives Content-Type</li> + * </ul> + * If no matching with a well-known metadata name is found, then the original + * name is returned. + * + * @param name + * Name to normalize + * @return normalized name + */ + public static String getNormalizedName(final String name) { + String searched = normalize(name); + String value = NAMES_IDX.get(searched); + + if ((value == null) && (normalized != null)) { + int threshold = searched.length() / TRESHOLD_DIVIDER; + for (int i = 0; i < normalized.length && value == null; i++) { + if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) { + value = NAMES_IDX.get(normalized[i]); + } + } + } + return (value != null) ? 
value : name; + } + + @Override + public void remove(final String name) { + super.remove(getNormalizedName(name)); + } + + @Override + public void add(final String name, final String value) { + String normalized = getNormalizedName(name); + super.add(normalized, value); + } + + @Override + public String[] getValues(final String name) { + return super.getValues(getNormalizedName(name)); + } + + @Override + public String get(final String name) { + return super.get(getNormalizedName(name)); + } + + @Override + public void set(final String name, final String value) { + super.set(getNormalizedName(name), value); + } + +} Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?view=diff&rev=474464&r1=474463&r2=474464 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Mon Nov 13 11:46:56 2006 @@ -17,17 +17,25 @@ package org.apache.nutch.protocol; -import java.util.*; -import java.io.*; - -import org.apache.hadoop.io.*; -import org.apache.hadoop.fs.*; -import org.apache.hadoop.conf.*; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.ArrayFile; +import org.apache.hadoop.io.CompressedWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.UTF8; +import org.apache.hadoop.io.VersionMismatchException; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.SpellCheckedMetadata; +import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.mime.MimeType; -import org.apache.nutch.util.mime.MimeTypes; import 
org.apache.nutch.util.mime.MimeTypeException; -import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.mime.MimeTypes; public final class Content extends CompressedWritable { @@ -36,23 +44,38 @@ private final static byte VERSION = 2; private byte version; + private String url; + private String base; + private byte[] content; + private String contentType; + private Metadata metadata; + private boolean mimeTypeMagic; + private MimeTypes mimeTypes; - public Content() {} - + private boolean inflated; + + public Content() { + inflated = false; + } + public Content(String url, String base, byte[] content, String contentType, - Metadata metadata, Configuration conf) { + Metadata metadata, Configuration conf) { - if (url == null) throw new IllegalArgumentException("null url"); - if (base == null) throw new IllegalArgumentException("null base"); - if (content == null) throw new IllegalArgumentException("null content"); - if (metadata == null) throw new IllegalArgumentException("null metadata"); + if (url == null) + throw new IllegalArgumentException("null url"); + if (base == null) + throw new IllegalArgumentException("null base"); + if (content == null) + throw new IllegalArgumentException("null content"); + if (metadata == null) + throw new IllegalArgumentException("null metadata"); this.url = url; this.base = base; @@ -61,21 +84,30 @@ this.mimeTypeMagic = conf.getBoolean("mime.type.magic", true); this.mimeTypes = MimeTypes.get(conf.get("mime.types.file")); this.contentType = getContentType(contentType, url, content); + inflated = true; + } + + public void ensureInflated() { + if (inflated) { + return; + } + super.ensureInflated(); + inflated = true; } protected final void readFieldsCompressed(DataInput in) throws IOException { version = in.readByte(); - metadata = new Metadata(); + metadata = new SpellCheckedMetadata(); switch (version) { case 0: case 1: - url = UTF8.readString(in); // read url - base = UTF8.readString(in); // read base + url = 
UTF8.readString(in); // read url + base = UTF8.readString(in); // read base - content = new byte[in.readInt()]; // read content + content = new byte[in.readInt()]; // read content in.readFully(content); - contentType = UTF8.readString(in); // read contentType + contentType = UTF8.readString(in); // read contentType // reconstruct metadata int keySize = in.readInt(); String key; @@ -88,33 +120,33 @@ } break; case VERSION: - url = Text.readString(in); // read url - base = Text.readString(in); // read base + url = Text.readString(in); // read url + base = Text.readString(in); // read base - content = new byte[in.readInt()]; // read content + content = new byte[in.readInt()]; // read content in.readFully(content); - contentType = Text.readString(in); // read contentType - metadata.readFields(in); // read meta data + contentType = Text.readString(in); // read contentType + metadata.readFields(in); // read meta data break; default: throw new VersionMismatchException(VERSION, version); } - + } protected final void writeCompressed(DataOutput out) throws IOException { out.writeByte(VERSION); - Text.writeString(out, url); // write url - Text.writeString(out, base); // write base + Text.writeString(out, url); // write url + Text.writeString(out, base); // write base - out.writeInt(content.length); // write content + out.writeInt(content.length); // write content out.write(content); - Text.writeString(out, contentType); // write contentType - - metadata.write(out); // write metadata + Text.writeString(out, contentType); // write contentType + + metadata.write(out); // write metadata } public static Content read(DataInput in) throws IOException { @@ -146,6 +178,7 @@ ensureInflated(); return content; } + public void setContent(byte[] content) { ensureInflated(); this.content = content; @@ -159,6 +192,7 @@ ensureInflated(); return contentType; } + public void setContentType(String contentType) { ensureInflated(); this.contentType = contentType; @@ -178,30 +212,28 @@ public 
boolean equals(Object o) { ensureInflated(); - if (!(o instanceof Content)){ + if (!(o instanceof Content)) { return false; } - Content that = (Content)o; + Content that = (Content) o; that.ensureInflated(); - return - this.url.equals(that.url) && - this.base.equals(that.base) && - Arrays.equals(this.getContent(), that.getContent()) && - this.contentType.equals(that.contentType) && - this.metadata.equals(that.metadata); + return this.url.equals(that.url) && this.base.equals(that.base) + && Arrays.equals(this.getContent(), that.getContent()) + && this.contentType.equals(that.contentType) + && this.metadata.equals(that.metadata); } public String toString() { ensureInflated(); StringBuffer buffer = new StringBuffer(); - buffer.append("Version: " + version + "\n" ); - buffer.append("url: " + url + "\n" ); - buffer.append("base: " + base + "\n" ); - buffer.append("contentType: " + contentType + "\n" ); - buffer.append("metadata: " + metadata + "\n" ); + buffer.append("Version: " + version + "\n"); + buffer.append("url: " + url + "\n"); + buffer.append("base: " + base + "\n"); + buffer.append("contentType: " + contentType + "\n"); + buffer.append("metadata: " + metadata + "\n"); buffer.append("Content:\n"); - buffer.append(new String(content)); // try default encoding + buffer.append(new String(content)); // try default encoding return buffer.toString(); @@ -210,7 +242,7 @@ public static void main(String argv[]) throws Exception { String usage = "Content (-local | -dfs <namenode:port>) recno segment"; - + if (argv.length < 3) { System.out.println("usage:" + usage); return; @@ -224,7 +256,8 @@ Path file = new Path(segment, DIR_NAME); System.out.println("Reading from file: " + file); - ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(), conf); + ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(), + conf); Content content = new Content(); contents.get(recno, content); @@ -241,10 +274,10 @@ private String getContentType(String typeName, 
String url, byte[] data) { MimeType type = null; try { - typeName = MimeType.clean(typeName); - type = typeName == null ? null : this.mimeTypes.forName(typeName); + typeName = MimeType.clean(typeName); + type = typeName == null ? null : this.mimeTypes.forName(typeName); } catch (MimeTypeException mte) { - // Seems to be a malformed mime type name... + // Seems to be a malformed mime type name... } if (typeName == null || type == null || !type.matches(url)) { @@ -254,8 +287,8 @@ type = this.mimeTypes.getMimeType(url); typeName = type == null ? typeName : type.getName(); } - if (typeName == null || type == null || - (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) { + if (typeName == null || type == null + || (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) { // If no mime-type already found, or the one found doesn't match // the magic bytes it should be, then, guess a mime-type from the // document content (magic bytes) Modified: lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java?view=diff&rev=474464&r1=474463&r2=474464 ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java Mon Nov 13 11:46:56 2006 @@ -16,7 +16,6 @@ */ package org.apache.nutch.metadata; -// JDK imports import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; @@ -24,71 +23,52 @@ import java.io.IOException; import java.util.Properties; import junit.framework.Test; - -// JUnit imports import junit.framework.TestCase; import junit.framework.TestSuite; import junit.textui.TestRunner; -// Nutch imports -import org.apache.nutch.metadata.Metadata; - - /** * JUnit based tests of class {@link 
org.apache.nutch.metadata.Metadata}. - * - * @author Chris Mattmann - * @author Jérôme Charron */ public class TestMetadata extends TestCase { - + private static final String CONTENTTYPE = "contenttype"; + public TestMetadata(String testName) { super(testName); } - + public static Test suite() { return new TestSuite(TestMetadata.class); } - + public static void main(String[] args) { TestRunner.run(suite()); } - - - /** Test for the <code>getNormalizedName(String)</code> method. */ - public void testGetNormalizedName() { - assertEquals("Content-Type", Metadata.getNormalizedName("Content-Type")); - assertEquals("Content-Type", Metadata.getNormalizedName("ContentType")); - assertEquals("Content-Type", Metadata.getNormalizedName("Content-type")); - assertEquals("Content-Type", Metadata.getNormalizedName("contenttype")); - assertEquals("Content-Type", Metadata.getNormalizedName("contentype")); - assertEquals("Content-Type", Metadata.getNormalizedName("contntype")); - } /** Test for the <code>add(String, String)</code> method. */ public void testAdd() { String[] values = null; Metadata meta = new Metadata(); - values = meta.getValues("contentype"); + values = meta.getValues(CONTENTTYPE); assertEquals(0, values.length); - meta.add("contentype", "value1"); - values = meta.getValues("contentype"); + meta.add(CONTENTTYPE, "value1"); + values = meta.getValues(CONTENTTYPE); assertEquals(1, values.length); assertEquals("value1", values[0]); - - meta.add("Content-Type", "value2"); - values = meta.getValues("contentype"); + + meta.add(CONTENTTYPE, "value2"); + values = meta.getValues(CONTENTTYPE); assertEquals(2, values.length); assertEquals("value1", values[0]); assertEquals("value2", values[1]); // NOTE : For now, the same value can be added many times. - // Should it be changed? - meta.add("ContentType", "value1"); - values = meta.getValues("Content-Type"); + // Should it be changed? 
+ meta.add(CONTENTTYPE, "value1"); + values = meta.getValues(CONTENTTYPE); assertEquals(3, values.length); assertEquals("value1", values[0]); assertEquals("value2", values[1]); @@ -100,43 +80,43 @@ String[] values = null; Metadata meta = new Metadata(); - values = meta.getValues("contentype"); + values = meta.getValues(CONTENTTYPE); assertEquals(0, values.length); - meta.set("contentype", "value1"); - values = meta.getValues("contentype"); + meta.set(CONTENTTYPE, "value1"); + values = meta.getValues(CONTENTTYPE); assertEquals(1, values.length); assertEquals("value1", values[0]); - - meta.set("Content-Type", "value2"); - values = meta.getValues("contentype"); + + meta.set(CONTENTTYPE, "value2"); + values = meta.getValues(CONTENTTYPE); assertEquals(1, values.length); assertEquals("value2", values[0]); - - meta.set("contenttype", "new value 1"); + + meta.set(CONTENTTYPE, "new value 1"); meta.add("contenttype", "new value 2"); - values = meta.getValues("contentype"); + values = meta.getValues(CONTENTTYPE); assertEquals(2, values.length); assertEquals("new value 1", values[0]); assertEquals("new value 2", values[1]); } - - /** Test for <code>setAll(Properties)</code> method */ + + /** Test for <code>setAll(Properties)</code> method. */ public void testSetProperties() { String[] values = null; Metadata meta = new Metadata(); Properties props = new Properties(); - + meta.setAll(props); assertEquals(0, meta.size()); - + props.setProperty("name-one", "value1.1"); meta.setAll(props); assertEquals(1, meta.size()); values = meta.getValues("name-one"); assertEquals(1, values.length); assertEquals("value1.1", values[0]); - + props.setProperty("name-two", "value2.1"); meta.setAll(props); assertEquals(2, meta.size()); @@ -147,19 +127,18 @@ assertEquals(1, values.length); assertEquals("value2.1", values[0]); } - - /** Test for <code>get(String)</code> method */ + + /** Test for <code>get(String)</code> method. 
*/ public void testGet() { Metadata meta = new Metadata(); assertNull(meta.get("a-name")); - meta.add("a-name", "value-1"); assertEquals("value-1", meta.get("a-name")); meta.add("a-name", "value-2"); assertEquals("value-1", meta.get("a-name")); } - - /** Test for <code>isMultiValued()</code> method */ + + /** Test for <code>isMultiValued()</code> method. */ public void testIsMultiValued() { Metadata meta = new Metadata(); assertFalse(meta.isMultiValued("key")); @@ -169,13 +148,13 @@ assertTrue(meta.isMultiValued("key")); } - /** Test for <code>names</code> method */ + /** Test for <code>names</code> method. */ public void testNames() { String[] names = null; Metadata meta = new Metadata(); names = meta.names(); assertEquals(0, names.length); - + meta.add("name-one", "value"); names = meta.names(); assertEquals(1, names.length); @@ -184,8 +163,8 @@ names = meta.names(); assertEquals(2, names.length); } - - /** Test for <code>remove(String)</code> method */ + + /** Test for <code>remove(String)</code> method. */ public void testRemove() { Metadata meta = new Metadata(); meta.remove("name-one"); @@ -206,7 +185,7 @@ assertNull(meta.get("name-two")); } - /** Test for <code>equals(Object)</code> method */ + /** Test for <code>equals(Object)</code> method. */ public void testObject() { Metadata meta1 = new Metadata(); Metadata meta2 = new Metadata(); @@ -230,8 +209,8 @@ meta2.add("name-two", "value-2.x"); assertFalse(meta1.equals(meta2)); } - - /** Test for <code>Writable</code> implementation */ + + /** Test for <code>Writable</code> implementation. 
*/ public void testWritable() { Metadata result = null; Metadata meta = new Metadata(); @@ -252,17 +231,19 @@ assertEquals("value-2.1", result.getValues("name-two")[0]); assertEquals("value-2.2", result.getValues("name-two")[1]); } - + private Metadata writeRead(Metadata meta) { Metadata readed = new Metadata(); try { ByteArrayOutputStream out = new ByteArrayOutputStream(); meta.write(new DataOutputStream(out)); - readed.readFields(new DataInputStream(new ByteArrayInputStream(out.toByteArray()))); + readed.readFields(new DataInputStream(new ByteArrayInputStream(out + .toByteArray()))); } catch (IOException ioe) { fail(ioe.toString()); } return readed; } - + } + Added: lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java?view=auto&rev=474464 ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java Mon Nov 13 11:46:56 2006 @@ -0,0 +1,267 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.Properties; +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + +/** + * JUnit based tests of class + * {@link org.apache.nutch.metadata.SpellCheckedMetadata}. + * + * @author Chris Mattmann + * @author Jérôme Charron + */ +public class TestSpellCheckedMetadata extends TestCase { + + public TestSpellCheckedMetadata(String testName) { + super(testName); + } + + public static Test suite() { + return new TestSuite(TestSpellCheckedMetadata.class); + } + + public static void main(String[] args) { + TestRunner.run(suite()); + } + + /** Test for the <code>getNormalizedName(String)</code> method. */ + public void testGetNormalizedName() { + assertEquals("Content-Type", SpellCheckedMetadata + .getNormalizedName("Content-Type")); + assertEquals("Content-Type", SpellCheckedMetadata + .getNormalizedName("ContentType")); + assertEquals("Content-Type", SpellCheckedMetadata + .getNormalizedName("Content-type")); + assertEquals("Content-Type", SpellCheckedMetadata + .getNormalizedName("contenttype")); + assertEquals("Content-Type", SpellCheckedMetadata + .getNormalizedName("contentype")); + assertEquals("Content-Type", SpellCheckedMetadata + .getNormalizedName("contntype")); + } + + /** Test for the <code>add(String, String)</code> method. 
*/ + public void testAdd() { + String[] values = null; + SpellCheckedMetadata meta = new SpellCheckedMetadata(); + + values = meta.getValues("contentype"); + assertEquals(0, values.length); + + meta.add("contentype", "value1"); + values = meta.getValues("contentype"); + assertEquals(1, values.length); + assertEquals("value1", values[0]); + + meta.add("Content-Type", "value2"); + values = meta.getValues("contentype"); + assertEquals(2, values.length); + assertEquals("value1", values[0]); + assertEquals("value2", values[1]); + + // NOTE : For now, the same value can be added many times. + // Should it be changed? + meta.add("ContentType", "value1"); + values = meta.getValues("Content-Type"); + assertEquals(3, values.length); + assertEquals("value1", values[0]); + assertEquals("value2", values[1]); + assertEquals("value1", values[2]); + } + + /** Test for the <code>set(String, String)</code> method. */ + public void testSet() { + String[] values = null; + SpellCheckedMetadata meta = new SpellCheckedMetadata(); + + values = meta.getValues("contentype"); + assertEquals(0, values.length); + + meta.set("contentype", "value1"); + values = meta.getValues("contentype"); + assertEquals(1, values.length); + assertEquals("value1", values[0]); + + meta.set("Content-Type", "value2"); + values = meta.getValues("contentype"); + assertEquals(1, values.length); + assertEquals("value2", values[0]); + + meta.set("contenttype", "new value 1"); + meta.add("contenttype", "new value 2"); + values = meta.getValues("contentype"); + assertEquals(2, values.length); + assertEquals("new value 1", values[0]); + assertEquals("new value 2", values[1]); + } + + /** Test for <code>setAll(Properties)</code> method. 
*/ + public void testSetProperties() { + String[] values = null; + SpellCheckedMetadata meta = new SpellCheckedMetadata(); + Properties props = new Properties(); + + meta.setAll(props); + assertEquals(0, meta.size()); + + props.setProperty("name-one", "value1.1"); + meta.setAll(props); + assertEquals(1, meta.size()); + values = meta.getValues("name-one"); + assertEquals(1, values.length); + assertEquals("value1.1", values[0]); + + props.setProperty("name-two", "value2.1"); + meta.setAll(props); + assertEquals(2, meta.size()); + values = meta.getValues("name-one"); + assertEquals(1, values.length); + assertEquals("value1.1", values[0]); + values = meta.getValues("name-two"); + assertEquals(1, values.length); + assertEquals("value2.1", values[0]); + } + + /** Test for <code>get(String)</code> method. */ + public void testGet() { + SpellCheckedMetadata meta = new SpellCheckedMetadata(); + assertNull(meta.get("a-name")); + + meta.add("a-name", "value-1"); + assertEquals("value-1", meta.get("a-name")); + meta.add("a-name", "value-2"); + assertEquals("value-1", meta.get("a-name")); + } + + /** Test for <code>isMultiValued()</code> method. */ + public void testIsMultiValued() { + SpellCheckedMetadata meta = new SpellCheckedMetadata(); + assertFalse(meta.isMultiValued("key")); + meta.add("key", "value1"); + assertFalse(meta.isMultiValued("key")); + meta.add("key", "value2"); + assertTrue(meta.isMultiValued("key")); + } + + /** Test for <code>names</code> method. */ + public void testNames() { + String[] names = null; + SpellCheckedMetadata meta = new SpellCheckedMetadata(); + names = meta.names(); + assertEquals(0, names.length); + + meta.add("name-one", "value"); + names = meta.names(); + assertEquals(1, names.length); + assertEquals("name-one", names[0]); + meta.add("name-two", "value"); + names = meta.names(); + assertEquals(2, names.length); + } + + /** Test for <code>remove(String)</code> method. 
*/ + public void testRemove() { + SpellCheckedMetadata meta = new SpellCheckedMetadata(); + meta.remove("name-one"); + assertEquals(0, meta.size()); + meta.add("name-one", "value-1.1"); + meta.add("name-one", "value-1.2"); + meta.add("name-two", "value-2.2"); + assertEquals(2, meta.size()); + assertNotNull(meta.get("name-one")); + assertNotNull(meta.get("name-two")); + meta.remove("name-one"); + assertEquals(1, meta.size()); + assertNull(meta.get("name-one")); + assertNotNull(meta.get("name-two")); + meta.remove("name-two"); + assertEquals(0, meta.size()); + assertNull(meta.get("name-one")); + assertNull(meta.get("name-two")); + } + + /** Test for <code>equals(Object)</code> method. */ + public void testObject() { + SpellCheckedMetadata meta1 = new SpellCheckedMetadata(); + SpellCheckedMetadata meta2 = new SpellCheckedMetadata(); + assertFalse(meta1.equals(null)); + assertFalse(meta1.equals("String")); + assertTrue(meta1.equals(meta2)); + meta1.add("name-one", "value-1.1"); + assertFalse(meta1.equals(meta2)); + meta2.add("name-one", "value-1.1"); + assertTrue(meta1.equals(meta2)); + meta1.add("name-one", "value-1.2"); + assertFalse(meta1.equals(meta2)); + meta2.add("name-one", "value-1.2"); + assertTrue(meta1.equals(meta2)); + meta1.add("name-two", "value-2.1"); + assertFalse(meta1.equals(meta2)); + meta2.add("name-two", "value-2.1"); + assertTrue(meta1.equals(meta2)); + meta1.add("name-two", "value-2.2"); + assertFalse(meta1.equals(meta2)); + meta2.add("name-two", "value-2.x"); + assertFalse(meta1.equals(meta2)); + } + + /** Test for <code>Writable</code> implementation. 
*/ + public void testWritable() { + SpellCheckedMetadata result = null; + SpellCheckedMetadata meta = new SpellCheckedMetadata(); + result = writeRead(meta); + assertEquals(0, result.size()); + meta.add("name-one", "value-1.1"); + result = writeRead(meta); + assertEquals(1, result.size()); + assertEquals(1, result.getValues("name-one").length); + assertEquals("value-1.1", result.get("name-one")); + meta.add("name-two", "value-2.1"); + meta.add("name-two", "value-2.2"); + result = writeRead(meta); + assertEquals(2, result.size()); + assertEquals(1, result.getValues("name-one").length); + assertEquals("value-1.1", result.getValues("name-one")[0]); + assertEquals(2, result.getValues("name-two").length); + assertEquals("value-2.1", result.getValues("name-two")[0]); + assertEquals("value-2.2", result.getValues("name-two")[1]); + } + + private SpellCheckedMetadata writeRead(SpellCheckedMetadata meta) { + SpellCheckedMetadata readed = new SpellCheckedMetadata(); + try { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + meta.write(new DataOutputStream(out)); + readed.readFields(new DataInputStream(new ByteArrayInputStream(out + .toByteArray()))); + } catch (IOException ioe) { + fail(ioe.toString()); + } + return readed; + } + +} Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?view=diff&rev=474464&r1=474463&r2=474464 ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Mon Nov 13 11:46:56 2006 @@ -18,6 +18,7 @@ package org.apache.nutch.protocol; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.SpellCheckedMetadata; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; import 
org.apache.nutch.util.WritableTestUtils; @@ -28,9 +29,9 @@ /** Unit tests for Content. */ public class TestContent extends TestCase { - + private static Configuration conf = NutchConfiguration.create(); - + public TestContent(String name) { super(name); } public void testContent() throws Exception { @@ -39,13 +40,13 @@ String url = "http://www.foo.com/"; - Metadata metaData = new Metadata(); + SpellCheckedMetadata metaData = new SpellCheckedMetadata(); metaData.add("Host", "www.foo.com"); metaData.add("Content-Type", "text/html"); Content r = new Content(url, url, page.getBytes("UTF8"), "text/html", metaData, conf); - + WritableTestUtils.testWritable(r); assertEquals("text/html", r.getMetadata().get("Content-Type")); assertEquals("text/html", r.getMetadata().get("content-type")); @@ -105,5 +106,5 @@ null, p, conf); assertNull(c.getContentType()); } - + }