Added: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java?view=auto&rev=449088 ============================================================================== --- lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java (added) +++ lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java Fri Sep 22 14:05:33 2006 @@ -0,0 +1,176 @@ +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.regex; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileFilter; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +/** Unit tests for RegexUrlNormalizer. */ +public class TestRegexURLNormalizer extends TestCase { + private static final Log LOG = LogFactory.getLog(TestRegexURLNormalizer.class); + + private RegexURLNormalizer normalizer; + private Configuration conf; + private HashMap testData = new HashMap(); + + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/urlnormalizer-regex/build.xml during plugin compilation. + + public TestRegexURLNormalizer(String name) throws IOException { + super(name); + normalizer = new RegexURLNormalizer(); + conf = NutchConfiguration.create(); + normalizer.setConf(conf); + File[] configs = new File(sampleDir).listFiles(new FileFilter() { + public boolean accept(File f) { + if (f.getName().endsWith(".xml") && f.getName().startsWith("regex-normalize-")) + return true; + return false; + } + }); + for (int i = 0; i < configs.length; i++) { + try { + FileInputStream fis = new FileInputStream(configs[i]); + String cname = configs[i].getName(); + cname = cname.substring(16, cname.indexOf(".xml")); + normalizer.setConfiguration(fis, cname); + NormalizedURL[] urls = readTestFile(cname); + testData.put(cname, urls); + } catch (Exception e) { + LOG.warn("Could load config from '" + configs[i] + "': " + e.toString()); + } + } + } + + public void testNormalizerDefault() throws Exception { + normalizeTest((NormalizedURL[])testData.get(URLNormalizers.SCOPE_DEFAULT), + URLNormalizers.SCOPE_DEFAULT); + } + + public void testNormalizerScope() throws Exception { + Iterator it = testData.keySet().iterator(); + while (it.hasNext()) { + String scope = (String)it.next(); + normalizeTest((NormalizedURL[])testData.get(scope), scope); + } + } + + private void normalizeTest(NormalizedURL[] urls, String scope) throws Exception { + for (int i = 0; i < urls.length; i++) { + assertEquals(urls[i].expectedURL, + normalizer.normalize(urls[i].url, scope)); + } + } + + private void bench(int loops, String scope) { + long start = System.currentTimeMillis(); + try { + NormalizedURL[] expected = (NormalizedURL[])testData.get(scope); + if (expected == null) return; + for (int i = 0; i < loops; i++) { + normalizeTest(expected, scope); + } + } catch (Exception e) { + fail(e.toString()); + } + LOG.info("bench time (" + loops + ") " + + (System.currentTimeMillis() - start) + "ms"); + } + + private static class NormalizedURL { + String url; + String expectedURL; + + public NormalizedURL(String line) { + String[] fields = line.split("\\s+"); + url = fields[0]; + expectedURL = fields[1]; + } + } + + private NormalizedURL[] readTestFile(String scope) throws IOException { + File f = new File(sampleDir, "regex-normalize-" + scope + ".test"); + BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")); + List list = new ArrayList(); + String line; + while((line = in.readLine()) != null) { + if ( line.trim().length() == 0 || + line.startsWith("#") || + line.startsWith(" ")) continue; + list.add(new NormalizedURL(line)); + } + return (NormalizedURL[]) list.toArray(new NormalizedURL[list.size()]); + } + + public static void main(String[] args) throws Exception { + if (args.length == 0) { + System.err.println("TestRegexURLNormalizer [-bench <iter>] <scope>"); + System.exit(-1); + } + boolean bench = false; + int iter = -1; + String scope = null; + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-bench")) { + bench = true; + iter = Integer.parseInt(args[++i]); + } else scope = args[i]; + } + if (scope == null) { + System.err.println("Missing required scope name."); + System.exit(-1); + } + if (bench && iter < 0) { + System.err.println("Invalid number of iterations: " + iter); + System.exit(-1); + } + TestRegexURLNormalizer test = new TestRegexURLNormalizer("test"); + NormalizedURL[] urls = (NormalizedURL[])test.testData.get(scope); + if (urls == null) { + LOG.warn("Missing test data for scope '" + scope + "', using default scope."); + scope = URLNormalizers.SCOPE_DEFAULT; + urls = (NormalizedURL[])test.testData.get(scope); + } + if (bench) { + test.bench(iter, scope); + } else { + test.normalizeTest(urls, scope); + } + } + + + +}
Propchange: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?view=diff&rev=449088&r1=449087&r2=449088 ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Fri Sep 22 14:05:33 2006 @@ -51,7 +51,8 @@ urlPath=new Path(testdir,"urls"); crawldbPath=new Path(testdir,"crawldb"); fs=FileSystem.get(conf); - + if (fs.exists(urlPath)) fs.delete(urlPath); + if (fs.exists(crawldbPath)) fs.delete(crawldbPath); } protected void tearDown() throws IOException{ Added: lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java?view=auto&rev=449088 ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java Fri Sep 22 14:05:33 2006 @@ -0,0 +1,52 @@ +/* + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestURLNormalizers extends TestCase { + + public void testURLNormalizers() { + Configuration conf = NutchConfiguration.create(); + String clazz1 = "org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer"; + String clazz2 = "org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer"; + conf.set("urlnormalizer.order", clazz1 + " " + clazz2); + + URLNormalizers normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT); + + assertNotNull(normalizers); + try { + normalizers.normalize("http://www.example.com/", URLNormalizers.SCOPE_DEFAULT); + } catch (MalformedURLException mue) { + fail(mue.toString()); + } + // check the order + int pos1 = -1, pos2 = -1; + URLNormalizer[] impls = normalizers.getURLNormalizers(URLNormalizers.SCOPE_DEFAULT); + for (int i = 0; i < impls.length; i++) { + if (impls[i].getClass().getName().equals(clazz1)) pos1 = i; + if (impls[i].getClass().getName().equals(clazz2)) pos2 = i; + } + if (pos1 != -1 && pos2 != -1) { + assertTrue("RegexURLNormalizer before BasicURLNormalizer", pos1 < pos2); + } + } +} Propchange: lucene/nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java ------------------------------------------------------------------------------ svn:eol-style = native