Author: ab
Date: Tue Mar 21 08:43:09 2006
New Revision: 387578

URL: http://svn.apache.org/viewcvs?rev=387578&view=rev
Log:
Cleanup and JUnit test for Carrot2. Contributed by Dawid Weiss (NUTCH-234).

Added:
    
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/test-input.xml
   (with props)
Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/clustering/HitsCluster.java
    lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java
    
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
    
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/ClustererTest.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/HitsCluster.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/clustering/HitsCluster.java?rev=387578&r1=387577&r2=387578&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/clustering/HitsCluster.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/clustering/HitsCluster.java 
Tue Mar 21 08:43:09 2006
@@ -19,13 +19,13 @@
 import org.apache.nutch.searcher.HitDetails;
 
 /**
- * An interface representing a group of hits.
+ * An interface representing a group (cluster) of related hits.
  * 
  * <p>If [EMAIL PROTECTED] #isJunkCluster()} method returns <code>true</code>
  * then this cluster contains documents that are grouped together,
- * but no clear semantic relation has been detected; this is mostly
- * the case with "Other topics" clusters. Such clusters may
- * be discarded by the user interface layer.</p>
+ * but no clear semantic relation has been detected. Such clusters may
+ * be hidden in the user interface layer, unless someone wishes to
+ * see an explicit group of documents that didn't belong anywhere else.</p>
  *
  * @author Dawid Weiss
  * @version $Id: HitsCluster.java,v 1.1 2004/08/09 23:23:52 johnnx Exp $
@@ -52,7 +52,7 @@
    * set a cutoff threshold and display only the topmost labels. 
    */
   public String[] getDescriptionLabels();
-  
+
   /**
    * Returns <code>true</code> if this cluster constains documents
    * that did not fit anywhere else (presentation layer may

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java?rev=387578&r1=387577&r2=387578&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java 
Tue Mar 21 08:43:09 2006
@@ -23,9 +23,9 @@
  * algorithms.
  *
  * <p>By the term <b>online</b> search results clustering we will understand
- * a clusterer that works on a set of [EMAIL PROTECTED] HitDetails} retrieved 
for a user's
- * query and produces a set of [EMAIL PROTECTED] HitsCluster} that can be 
displayed to help
- * the user gain insight in the topics found in the result.</p>
+ * a clusterer that works on a set of [EMAIL PROTECTED] HitDetails} retrieved 
for a
+ * query and able to produce a set of [EMAIL PROTECTED] HitsCluster} that can 
be displayed
+ * to help the user gain more insight in the topics found in the result.</p>
  *
  * <p>Other clustering options include predefined categories and off-line
  * preclustered groups, but I do not investigate those any further here.</p>

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java?rev=387578&r1=387577&r2=387578&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
 (original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
 Tue Mar 21 08:43:09 2006
@@ -30,19 +30,38 @@
 public class OnlineClustererFactory {
   public static final Logger LOG = LogFormatter
     .getLogger(OnlineClustererFactory.class.getName());
+  
+  /**
+   * Nutch configuration key specifying a particular clustering extension
+   * to use. 
+   */
+  private final static String CONFIG_FIELD_NAME = 
"extension.clustering.extension-name";
+
+  /**
+   * An [EMAIL PROTECTED] ExtensionPoint} pointing to [EMAIL PROTECTED] 
OnlineClusterer}. 
+   */
   private ExtensionPoint extensionPoint;
+  
+  /**
+   * Default clustering extension implementation retrieved from the
+   * configuration file or <code>null</code> if the default (first encountered 
extension)
+   * is to be used.
+   */
   private String extensionName;
 
+  /**
+   * Create an instance of the clustering factory bound to
+   * a given configuration.
+   */
   public OnlineClustererFactory(Configuration conf) {
       this.extensionPoint = 
PluginRepository.get(conf).getExtensionPoint(OnlineClusterer.X_POINT_ID);
-      this.extensionName = conf.get("extension.clustering.extension-name");
+      this.extensionName = conf.get(CONFIG_FIELD_NAME);
   }
 
   /**
   * @return Returns the online clustering extension specified
-  * in nutch configuration's key
-  * <code>extension.clustering.extension-name</code>. If the name is
-  * empty (no preference), the first available clustering extension is
+  * in nutch configuration (key name in this field: [EMAIL PROTECTED] 
#CONFIG_FIELD_NAME}). 
+  * If the name is empty (no preference), the first available clustering 
extension is
   * returned.
   */
   public OnlineClusterer getOnlineClusterer()
@@ -64,7 +83,7 @@
       // not found, fallback to the default, if available.
     }
 
-    Extension[] extensions = this.extensionPoint.getExtensions();
+    final Extension[] extensions = this.extensionPoint.getExtensions();
     if (extensions.length > 0) {
       LOG.info("Using the first clustering extension found: "
         + extensions[0].getId());
@@ -77,11 +96,9 @@
   private Extension findExtension(String name)
     throws PluginRuntimeException {
 
-    Extension[] extensions = this.extensionPoint.getExtensions();
-
+    final Extension[] extensions = this.extensionPoint.getExtensions();
     for (int i = 0; i < extensions.length; i++) {
-      Extension extension = extensions[i];
-
+      final Extension extension = extensions[i];
       if (name.equals(extension.getId()))
         return extension;
     }

Added: 
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/test-input.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/test-input.xml?rev=387578&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/test-input.xml
 (added)
+++ 
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/test-input.xml
 Tue Mar 21 08:43:09 2006
@@ -0,0 +1,303 @@
+<searchresult>
+<query requested-results="100">data mining</query>
+<document id="0">      <url>http://www.kdnuggets.com/</url>
+       <title>KD Nuggets</title>
+       <snippet>Newsletter on the data mining and knowledge industries, 
offering information on data mining, knowledge discovery, text mining, and web 
mining software, courses, jobs, publications, and meetings.</snippet>
+</document><document id="1">   
<url>http://en.wikipedia.org/wiki/Data_mining</url>
+       <title>Data Mining - Wikipedia</title>
+       <snippet>Article about knowledge-discovery in databases (KDD), the 
practice of automatically searching large stores of data for patterns.</snippet>
+</document><document id="2">   <url>http://www.thearling.com/</url>
+       <title>Thearling.com</title>
+       <snippet>Kurt Thearling&apos;s site dedicated to sharing information 
about data mining, the automated extraction of hidden predictive information 
from databases, and other analytic technologies.</snippet>
+</document><document id="3">   <url>http://www.the-data-mine.com/</url>
+       <title>The Data Mine</title>
+       <snippet>Provides information about data mining also known as knowledge 
discovery in databases (KDD) or simply knowledge discovery. List software, 
events, organizations, and people working in data mining.</snippet>
+</document><document id="4">   <url>http://www.data-miners.com/</url>
+       <title>Data Miners</title>
+       <snippet>Data mining consultancy; services include predictive modeling, 
consulting, and seminars.</snippet>
+</document><document id="5">   <url>http://www.dmg.org/</url>
+       <title>DMG</title>
+       <snippet>The Laboratory for Advanced Computing develops technologies 
for high performance computing, high performance networking, internet 
computing, data mining and related areas. ... Data Mining Group. DMG. DMG Menu 
... The Data Mining Group (DMG) is an independent, vendor led group which 
develops data mining standards, such as the ...</snippet>
+</document><document id="6">   <url>http://www.twocrows.com/glossary.htm</url>
+       <title>Two Crows: Data mining glossary</title>
+       <snippet>Data mining terms concisely defined. ... factor in assessing 
the success of data mining. When applied to data, accuracy refers to the rate 
of ... For example, a data mining software system may have an API which 
...</snippet>
+</document><document id="7">   
<url>http://www.monografias.com/trabajos/datamining/datamining.shtml</url>
+       <title>Data Mining - Monografias.com</title>
+       <snippet>... Data Mining, la extracción de información oculta y 
predecible de grandes bases de ... de Información (Data Warehouse). Las 
herramientas de Data Mining predicen futuras tendencias y ...</snippet>
+</document><document id="8">   
<url>http://www.ccsu.edu/datamining/resources.html</url>
+       <title>CCSU - Data Mining</title>
+       <snippet>Data Mining Resources. Resources. Groups. Data Sets. Papers on 
Data Mining. Commercial. Register at</snippet>
+</document><document id="9">   
<url>http://www-db.stanford.edu/~ullman/mining/mining.html</url>
+       <title>Jeff Ullman&apos;s Data Mining Lecture Notes</title>
+       <snippet>Offers an introduction to various data mining applications and 
techniques: association-rule mining, low-support/high correlation, query 
flocks, searching the Web, web mining, and clustering.</snippet>
+</document><document id="10">  
<url>http://www.statsoft.com/textbook/stdatmin.html</url>
+       <title>Electronic Statistics Textbook: Data Mining Techniques</title>
+       <snippet>Outlines the crucial concepts in data mining, defines the data 
warehousing process, and offers examples of computational and graphical 
exploratory data analysis techniques.</snippet>
+</document><document id="11">  <url>http://www.autonlab.org/tutorials</url>
+       <title>Statistical Data Mining Tutorials</title>
+       <snippet>Includes a set of tutorials on many aspects of statistical 
data mining, including the foundations of probability, the foundations of 
statistical data analysis, and most of the classic machine learning and data 
mining algorithms.</snippet>
+</document><document id="12">  
<url>http://www.sas.com/technologies/data_mining</url>
+       <title>SAS | Data and Text Mining</title>
+       <snippet>... of information, the potential would be enormous. With data 
mining, the possibilities are endless ... almost upon its introduction, our 
data mining technology continues to receive rave ...</snippet>
+</document><document id="13">  <url>http://www.almaden.ibm.com/cs/quest</url>
+       <title>IBM Research | Almaden Research Center | test</title>
+       <snippet>... Privacy-preserving data mining - preserves privacy at the 
individual level, while still allowing accurate data mining models at the 
aggregate level ...</snippet>
+</document><document id="14">  
<url>http://www.oracle.com/technology/products/bi/odm/</url>
+       <title>Oracle Data Mining</title>
+       <snippet>... user interface for Oracle Data Mining that helps data 
analysts mine their Oracle data to find valuable ... With Oracle Data Miner and 
Oracle Data Mining, the data never leaves the ...</snippet>
+</document><document id="15">  
<url>http://www.cs.waikato.ac.nz/~ml/weka/book.html</url>
+       <title>Data Mining: Practical Machine Learning Tools and 
Techniques</title>
+       <snippet>Data Mining: Practical Machine Learning Tools and Techniques 
(Second Edition) Morgan Kaufmann. June 2005. 525 pages. Paper. ISBN 
0-12-088407-0. Comments ... What&apos;s it all about? 1.1 Data mining and 
machine learning ...</snippet>
+</document><document id="16">  <url>http://www.ccsu.edu/datamining</url>
+       <title>Data Mining @ CCSU</title>
+       <snippet>Offers degrees and certificates in data mining. Allows 
students to explore cutting-edge data mining techniques and applications: 
market basket analysis, decision trees, neural networks, machine learning, web 
mining, and data modeling.</snippet>
+</document><document id="17">  
<url>http://searchcrm.techtarget.com/sDefinition/0,,sid11_gci211901,00.html</url>
+       <title>data mining - a Whatis.com definition - see also: data miner, 
data analysis</title>
+       <snippet>... whatis.com: searchCRM.com Definitions - data mining ... 
about the future (This area of data mining is known as predictive analytics.) 
Data mining techniques are used in ...</snippet>
+</document><document id="18">  
<url>http://www.ccsu.edu/datamining/master.html</url>
+       <title>CCSU - Data Mining</title>
+       <snippet>Master of Science Degree. Accredited by the State of 
Connecticut Department of Higher Education. ... Details on how to apply to the 
Master of Science in data mining may be found here ... the Master of Science in 
Data Mining should download the revised Planned Program ...</snippet>
+</document><document id="19">  
<url>http://www.statserv.com/datamining.html</url>
+       <title>[EMAIL PROTECTED] - About Data Mining</title>
+       <snippet>... What is Data Mining ? &quot; Data mining is the process of 
discovering meaningful new correlations, patterns ... Gartner Group). &quot; 
Data mining is the exploration and analysis, by automatic ...</snippet>
+</document><document id="20">  <url>http://www.data-mine.com/</url>
+       <title>Data Mining Technologies, Inc.</title>
+       <snippet>Provides software and consulting for data mining.</snippet>
+</document><document id="21">  
<url>http://www.the-data-mine.com/bin/view/Misc/DataMiningBooksAndPapers</url>
+       <title>Data Mining - Data Mining Books And Papers</title>
+       <snippet>... Mastering Data Mining Michael J. A. Berry, Gordon S ... 
method=&quot;POST&quot; 
action=&quot;http://buybox.amazon.com/o/dt/assoc/handle-buy-box=0471331236&quot;&amp;gt;
 Data Mining Techniques Michael J ...</snippet>
+</document><document id="22">  
<url>http://www.computerworld.com/databasetopics/businessintelligence/datamining</url>
+       <title>Computerworld Data Mining</title>
+       <snippet>This special topic page focuses on data mining software and 
business intelligence tools. ... Latest on Data Mining. Q&amp;A: CA&apos;s new 
CTO discusses development, recruiting ... View more on Data Mining. Data Mining 
Feature. Group files complaint against &apos;adware&apos; firm ...</snippet>
+</document><document id="23">  
<url>http://datamining.typepad.com/data_mining/</url>
+       <title>Data Mining</title>
+       <snippet>Current Reading. On the Stack. January 29, 2006. The Strength 
of BlogAnalytics. A while back, I wrote about how dangerous trend mining over 
blogs could be in the wrong hands. ... Data Mining. About. Weblogs ... company 
providing non-trivial analytics over blog data - or any other data for that 
mater - has already solved this ...</snippet>
+</document><document id="24">  
<url>http://www.wessex.ac.uk/conferences/2002/datamining02</url>
+       <title>DATA MINING 2002 - Post Conference Report</title>
+       <snippet>... Third International Conference on Data Mining Methods and 
Databases for Engineering, Finance and ... The third international conference 
on Data Mining took place recently in Bologna ...</snippet>
+</document><document id="25">  
<url>http://www.thearling.com/text/dmwhite/dmwhite.htm</url>
+       <title>An Introduction to Data Mining</title>
+       <snippet>... Data mining, the extraction of hidden predictive 
information from large databases, is a ... important information in their data 
warehouses. Data mining tools predict future trends ...</snippet>
+</document><document id="26">  <url>http://www.spss.com/datamine</url>
+       <title>Data Mining Software, Data Mining Applications and Data Mining 
Solutions</title>
+       <snippet>Data Mining at SPSS. Your source for data mining software, 
data mining tools, data mining applications and data mining solutions ... Most 
analysts separate data mining software into two groups: data mining tools and 
data mining applications. Data mining tools provide ...</snippet>
+</document><document id="27">  
<url>http://www.onlamp.com/pub/a/onlamp/2004/04/08/datamining_email.html</url>
+       <title>ONLamp.com: Data Mining Email</title>
+       <snippet>Robert Bernier demonstrates how to store data from emails into 
a database, where you can use data-mining techniques to analyze it. ... What is 
data mining anyway? Data mining is a class of database applications that look 
for hidden patterns in a group of data ...</snippet>
+</document><document id="28">  
<url>http://www.aaai.org/AITopics/html/mining.html</url>
+       <title>Data Mining and Discovery</title>
+       <snippet>AI Topics provides basic, understandable information and 
helpful resources concerning artificial intelligence, with an emphasis on 
material available online. ... Data Mining and Discovery. (a subtopic of 
Machine Learning ... Data mining is an AI powered tool that can discover useful 
information within a database that can then be used ...</snippet>
+</document><document id="29">  
<url>http://www.research.microsoft.com/dmx/</url>
+       <title>Data Management, Exploration and Mining- Home</title>
+       <snippet>The Data Management Exploration and Mining Group (DMX). ... 
break down with massive data sets. Therefore, we aim at exploiting data mining 
techniques, i.e ... Our research effort in data mining focuses on ensuring that 
traditional techniques ...</snippet>
+</document><document id="30">  <url>http://www.dmreview.com/</url>
+       <title>DMReview</title>
+       <snippet>An issues and solutions publication that focuses on data 
warehousing as well as client/server and object technology for the 
enterprise.</snippet>
+</document><document id="31">  <url>http://www.megaputer.com/</url>
+       <title>Megaputer Intelligence</title>
+       <snippet>Manufactures multi-strategy data mining and text mining 
software solutions.</snippet>
+</document><document id="32">  
<url>http://databases.about.com/od/datamining</url>
+       <title>Data Mining and Data Warehousing</title>
+       <snippet>The Net&apos;s best collection of data mining and data 
warehousing links from your About.com guide. From data mining tutorials to data 
warehousing techniques, you&apos;ll find it all! ... Benefits of Outsourcing 
Data Warehouse and Data Mining. Many organizations are seeking ...</snippet>
+</document><document id="33">  
<url>http://www.pcc.qub.ac.uk/tec/courses/datamining/stu_notes/dm_book_1.html</url>
+       <title>Data Mining Student Notes, QUB</title>
+       <snippet>Data Mining. An Introduction. Student Notes. Ruth Dilly. 
Parallel Computer Centre. Queens University Belfast. Version 2.0. December1995 
... 1 - Data mining. 1.1 - What is data mining? 1.2 - Data mining background. 
1.2.1 - Inductive learning ...</snippet>
+</document><document id="34">  
<url>http://itmanagement.webopedia.com/TERM/D/data_mining.html</url>
+       <title>data mining - Webopedia.com</title>
+       <snippet>Search for more IT management terms . . . data mining. A class 
of database applications that look for hidden patterns in a group of data that 
can be used to predict future behavior. ... For example, data mining software 
can help retail companies find customers with common interests ... that 
presents data in new ways. True data mining software doesn&apos;t just change 
the ...</snippet>
+</document><document id="35">  <url>http://www.twocrows.com/</url>
+       <title>Two Crows Corporation</title>
+       <snippet>Dedicated to the development, marketing, sales and support of 
tools for knowledge discovery to make data mining accessible and easy to 
use.</snippet>
+</document><document id="36">  
<url>http://databases.about.com/library/weekly/aa100700a.htm</url>
+       <title>Data Mining: An Introduction</title>
+       <snippet>Data mining allows you to find the needles hidden in your 
haystacks of data. Learn how to use these advanced techniques to meet your 
business objectives. ... heard a good deal about data mining -- the database 
industry&apos;s latest buzzword ... of automated statistical analysis (or 
&quot;data mining&quot;) techniques, businesses are discovering new 
...</snippet>
+</document><document id="37">  <url>http://www.kdnuggets.com/software</url>
+       <title>Software for Data Mining and Knowledge Discovery</title>
+       <snippet>This is a directory of general-purpose data mining software. 
To suggest an entry, email to . See also domain-specific data-mining 
solutions.</snippet>
+</document><document id="38">  
<url>http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm</url>
+       <title>Data Mining: What is Data Mining?</title>
+       <snippet>Outlines what knowledge discovery, the process of analyzing 
data from different perspectives and summarizing it into useful information, 
can do and how it works.</snippet>
+</document><document id="39">  
<url>http://www.megaputer.com/products/pa/index.php3</url>
+       <title>Data Mining Software</title>
+       <snippet>Megaputer offers data mining, text mining, and web data mining 
software tools for e-commerce, database marketing, and CRM; seminars, training 
and consulting on data mining. Customer ... and versatile suite of advanced 
data mining tools. PolyAnalyst incorporates the latest ... discovery to analyze 
both structured and unstructured data. The PolyAnalyst platform offers 
...</snippet>
+</document><document id="40">  
<url>http://www.sims.berkeley.edu/~hearst/papers/acl99/acl99-tdm.html</url>
+       <title>Untangling Text Data Mining</title>
+       <snippet>... Untangling Text Data Mining. Marti A. Hearst. School of 
Information Management &amp;amp; Systems ... The possibilities for data mining 
from large text collections are virtually untapped ...</snippet>
+</document><document id="41">  
<url>http://www.megaputer.com/dm/dm101.php3</url>
+       <title>What is Data Mining</title>
+       <snippet>Megaputer offers data mining, text mining, and web data mining 
software tools for e-commerce, database marketing, and CRM; seminars, training 
and consulting on data mining. Customer ... in order to make informed business 
decisions. Data mining automates the process of finding relationships and 
patterns in ... In these situations data mining is your only real option 
...</snippet>
+</document><document id="42">  <url>http://www.ncbi.nih.gov/Tools</url>
+       <title>NCBI Tools for Bioinformatics Research</title>
+       <snippet>... Tools for Data Mining. PubMed. Entrez. BLAST. OMIM. Books 
... results of analyses that have been done on the sequence data. The amount 
and type of information presented depend ...</snippet>
+</document><document id="43">  
<url>http://www.computerworld.com/databasetopics/businessintelligence/story/0,10801,103726,00.html?source=x10</url>
+       <title>Explainer: Data mining - Computerworld</title>
+       <snippet>Often used for predictive modeling, data mining is a subset of 
business intelligence that can help organizations better understand 
relationships among variables. ... into usable shape, however, requires 
sophisticated data mining tools. The same technology that police ... retailers, 
are ideal candidates for data mining technology. Wal-Mart Stores Inc 
...</snippet>
+</document><document id="44">  <url>http://www.dmbenchmarking.com/</url>
+       <title>Data Mining Benchmarking Association (DMBA)</title>
+       <snippet>Association of companies and organizations working to identify 
&quot;best in class&quot; data mining processes through benchmarking 
studies.</snippet>
+</document><document id="45">  <url>http://datamining.typepad.com/</url>
+       <title>Data Mining</title>
+       <snippet>Current Reading. On the Stack. January 30, 2006. Fact versus 
Opinion. Information overload overload is becoming a serious problem for me. 
... Data Mining. About. Weblogs ... company providing non-trivial analytics 
over blog data - or any other data for that mater - has already solved this 
...</snippet>
+</document><document id="46">  
<url>http://www.wessex.ac.uk/conferences/2005/data05</url>
+       <title>DATA MINING 2005</title>
+       <snippet>... International Conference on Data Mining, Text Mining and 
their Business Applications ... Conference on Data Mining, Text Mining and 
their Business Applications (Data Mining ...</snippet>
+</document><document id="47">  
<url>http://www.galaxy.gmu.edu/stats/syllabi/DMLIST.html</url>
+       <title>URL&apos;s for Data Mining</title>
+       <snippet>URL&apos;s for Data Mining. The following URL&apos;s are some 
links to a variety of Data Mining webpages. They are not in any particular 
order. Actually, they are in the order I discovered (mined) them.</snippet>
+</document><document id="48">  
<url>http://www.pcai.com/web/ai_info/data_warehouse_mining.html</url>
+       <title>PC AI - Data Warehouse and Data Mining</title>
+       <snippet>... Data Mining. Overview: Data mining or knowledge discovery 
is becoming more important as more and ... To Distributed Computing. Data 
Warehouse and Data Mining Information on the Internet ...</snippet>
+</document><document id="49">  <url>http://www.gr-fx.com/graf-fx.htm</url>
+       <title>Data Mining</title>
+       <snippet>... databases with graphs and queries using a technique called 
Data Mining. It is also a quick way to ... learn how to use another data mining 
product. All you have to ...</snippet>
+</document><document id="50">  <url>http://www.dwinfocenter.org/</url>
+       <title>Data Warehousing Information Center</title>
+       <snippet>Provides information on tools and techniques to design, build, 
maintain, and retrieve information from a data warehouse.</snippet>
+</document><document id="51">  <url>http://www.siam.org/meetings/sdm02</url>
+       <title>SIAM International Conference on Data Mining</title>
+       <snippet>Co-Sponsored by AHPCRC and University of Illinois at Chicago 
... The field of data mining draws upon extensive work in areas such as; 
statistics ... presentation of recent results in data mining, including; 
applications, algorithms, software, and ...</snippet>
+</document><document id="52">  
<url>http://www.oclc.org/research/projects/mining/</url>
+       <title>Data mining [OCLC - Projects]</title>
+       <snippet>Describes the goals, methodology, and timing of the Data 
mining project. ... Data mining. DCMI Registry DSpace Harvesting Economics of 
Digital Preservation Electronic Theses and Dissertations ... this end, the OCLC 
Research Data-Mining Research Area will focus on ...</snippet>
+</document><document id="53">  
<url>http://www.stat.rutgers.edu/~madigan/datamining</url>
+       <title>Data Mining</title>
+       <snippet>... DATA MINING SPECIAL TOPICS CLASS ... will be using a draft 
version of Principles of Data Mining , by Hand, Mannila, and Smyth (MIT Press, 
forthcoming), as ...</snippet>
+</document><document id="54">  
<url>http://dmoz.org/Computers/Software/Databases/Data_Mining</url>
+       <title>Open Directory - Computers: Software: Databases: Data 
Mining</title>
+       <snippet>the entire directory only in Databases/Data_Mining. See also: 
... About.com on Data Mining - About.com presents a collection of original 
feature articles, net ... room dedicated to data mining and data warehousing 
topics. The Data Mine - Launched ...</snippet>
+</document><document id="55">  
<url>http://www.investorhome.com/mining.htm</url>
+       <title>Investor Home - Data Mining</title>
+       <snippet>... Data Mining. The rapid evolution of computer technology in 
the last few decades has provided ... and consequences of &quot;data 
mining.&quot; Data mining involves searching through databases for ...</snippet>
+</document><document id="56">  
<url>http://www.sas.com/technologies/analytics/datamining</url>
+       <title>SAS | Data and Text Mining</title>
+       <snippet>... of information, the potential would be enormous. With data 
mining, the possibilities are endless ... almost upon its introduction, our 
data mining technology continues to receive rave ...</snippet>
+</document><document id="57">  
<url>http://www.wessex.ac.uk/conferences/2003/datamining03</url>
+       <title>Data Mining 2003</title>
+       <snippet>... Data Mining 2003. Fourth International Conference on Data 
Mining Including Building Applications for CRM ...</snippet>
+</document><document id="58">  <url>http://datamining.itsc.uah.edu/</url>
+       <title>ITSC Data Mining Solutions Center</title>
+       <snippet>... The ITSC Data Mining Solutions Center is the focal point 
for data mining research, development and services at ...</snippet>
+</document><document id="59">  
<url>http://www.webopedia.com/TERM/D/data_mining.html</url>
+       <title>What is data mining? - A Word Definition From the Webopedia 
Computer Dictionary</title>
+       <snippet>This page describes the term data mining and lists other pages 
on the Web where you can find additional information. ... For example, data 
mining software can help retail companies find customers with common interests 
... that presents data in new ways. True data mining software doesn&apos;t just 
change the ...</snippet>
+</document><document id="60">  
<url>http://research.microsoft.com/dmx/DataMining/default.aspx</url>
+       <title>Data Mining Project</title>
+       <snippet>Search: All Research OnlineAll Microsoft.com. Data Mining: 
Efficient Data Exploration and Modeling. Overview. Goal ... will focus on 
exploiting data mining for advanced data summarization and also enable tighter 
... database querying and data mining. Scalable Data Mining Algorithms: We are 
exploring ...</snippet>
+</document><document id="61">  
<url>http://www.fas.org/sgp/crs/intel/RL31798.pdf</url>
+       <title>Data Mining: An Overview</title>
+       <snippet>... assessing risk, and product. retailing, data mining 
involves the use of data analysis tools to discover ... homeland security, data 
mining is often viewed as a potential means to ...</snippet>
+</document><document id="62">  <url>http://www.statsoftinc.com/</url>
+       <title>Data Mining, Statistical Analysis, Quality Control - STATISTICA 
Software</title>
+       <snippet>Statsoft is the creator of STATISTICA, the most comprehensive 
suite of data mining and statistical analysis software. ... StatSoft logo, 
STATISTICA, SEWSS, SEDAS, Data Miner, SEPATH and GTrees are trademarks ... more 
information on STATISTICA, data mining, data analysis, statistical analysis 
&amp;amp; enterprise ...</snippet>
+</document><document id="63">  <url>http://www.insightful.com/</url>
+       <title>Insightful Corporation</title>
+       <snippet>The developer of the technical calculation application 
Mathcad, as well as developer and provider of a variety of other software tools 
for users of PCs, Macintosh computers, and UNIX workstations.</snippet>
+</document><document id="64">  <url>http://www.ncdm.uic.edu/</url>
+       <title>National Center for Data Mining (NCDM) - University of Illinois 
at Chicago</title>
+       <snippet>Conducts research in: scaling algorithms, applications and 
systems to massive data sets, developing algorithms, applications, and systems 
for mining distributed data, and establishing standard languages, protocols, 
and services for data mining and predictive modeling.</snippet>
+</document><document id="65">  
<url>http://www.computerworld.com/hardwaretopics/hardware/desktops/story/0,10801,43509,00.html</url>
+       <title>Data Mining - Computerworld</title>
+       <snippet>Data mining is a process that finds relationships and patterns 
within a large amount of data stored in a database. The process uses tools 
based on algorithms to sift through mounds of data to find relationships. ... 
What has data mining done for Dick&apos;s Supermarkets ... What&apos;s the 
basis of a good data mining program? You have to establish the integrity of 
your data because that&apos;s ...</snippet>
+</document><document id="66">  
<url>http://www.the-data-mine.com/bin/view/Software/WebHome</url>
+       <title>Data Mining - Web Home (Software)</title>
+       <snippet>... To find Data Mining Software, check the Web Index, use Web 
Search or check the most recent changes (Web Changes ... Misc. General Data 
Mining Information - Introductions, Tutorials etc ...</snippet>
+</document><document id="67">  <url>http://www.rulequest.com/</url>
+       <title>Rulequest Research</title>
+       <snippet>Provides software tools for data mining and knowledge 
discovery in databases.</snippet>
+</document><document id="68">  
<url>http://www.bos.frb.org/economic/nerr/rr2000/q3/mining.htm</url>
+       <title>Regional Review: Mining Data</title>
+       <snippet>Mining Data. Quarter 3, 2000. by Miriam Wasserman. SCENE 1: 
It&apos;s late November 1999. The Celtics are struggling with their second 
lineup. ... They both include the use of data-mining computer technology to 
search for patterns in data ... player&apos;s potential is maximized. Although 
data mining by itself is not going to get ...</snippet>
+</document><document id="69">  
<url>http://www.cisl.ucar.edu/hps/GROUPS/dm/dm.html</url>
+       <title>Data Mining Resources</title>
+       <snippet>... and Zantige, D. Data Mining, Harlow, UK: Addison-Wesley, 
1996. Berry, M.J.A. and Linoff, G., Data Mining Techniques for Marketing, 
Sales, and Customer Support, New York, NY: John ...</snippet>
+</document><document id="70">  
<url>http://www.wessex.ac.uk/conferences/2004/datamining04</url>
+       <title>DATA MINING 2004</title>
+       <snippet>... Fifth International Conference on Data Mining, Text Mining 
and their Business Applications ... 5th International Conference on Data 
Mining, Text Mining and their Business Applications ...</snippet>
+</document><document id="71">  
<url>http://www.amazon.com/exec/obidos/tg/detail/-/1558605525?v=glance</url>
+       <title>Amazon.com: Data Mining: Practical Machine Learning Tools and 
Techniques with Java Implementations (The Morgan ... </title>
+       <snippet>... Topics covered: Data mining and machine learning basics, 
sample datasets and applications for data mining ... in the synthesis of data 
mining, data analysis, information theory and ...</snippet>
+</document><document id="72">  
<url>http://www.sas.com/technologies/analytics/datamining/miner</url>
+       <title>SAS | SAS Enterprise Miner</title>
+       <snippet>... Miner streamlines the entire data mining process from data 
access to model deployment by ... It provides a powerful, complete data mining 
solution with unparalleled model development ...</snippet>
+</document><document id="73">  
<url>http://ocw.mit.edu/OcwWeb/Sloan-School-of-Management/15-062Data-MiningSpring2003/CourseHome</url>
+       <title>MIT OpenCourseWare | Sloan School of Management | 15.062 Data 
Mining, Spring 2003 | Home</title>
+       <snippet>... marts specifically intended for management decision 
support. Data mining is a rapidly growing field that is ... The field of data 
mining has evolved from the disciplines of statistics ...</snippet>
+</document><document id="74">  <url>http://www.data-mining-guide.net/</url>
+       <title>Data Mining Software | Guide to Data Mining Software &amp; 
Concepts</title>
+       <snippet>What is Data Mining? Data Mining is the process of analyzing 
large data sets in order to find patterns that can help to isolate key 
variables to build predictive models for management decision making. ... In 
essence, data mining helps businesses to optimize their processes so that 
...</snippet>
+</document><document id="75">  
<url>http://www.cse.ohio-state.edu/~srini/694Z</url>
+       <title>CIS 694Z: Introduction to Data Mining</title>
+       <snippet>... discovery process, key data mining techniques, efficient 
high performance mining algorithms, exposure to applications of data mining 
(bioinformatics and intrusion detection ...</snippet>
+</document><document id="76">  
<url>http://www.gao.gov/new.items/d05866.pdf</url>
+       <title>GAO-05-866 Data Mining: Agencies Have Taken Key Steps to Protect 
Privacy in Selected Efforts, but Significant ... </title>
+       <snippet>... The five data mining efforts we reviewed are used by 
federal agencies to ... individual privacy rights are being appropriately 
protected. Data mining—a technique for ...</snippet>
+</document><document id="77">  
<url>http://datamining.typepad.com/data_mining/2005/08/rumour_mull.html</url>
+       <title>Data Mining: Rumour Mull</title>
+       <snippet>... Data Mining. About. Weblogs ... for 2005-08-15 from 
Emergence Marketing. Data Mining: Rumour Mull Interesting analysis of the 
Technorati takeover rumour ...</snippet>
+</document><document id="78">  <url>http://www.crm2day.com/data_mining</url>
+       <title>CRM Today - Data Mining &amp; Data Warehousing</title>
+       <snippet>... Abstract: The field of data mining, like statistics, 
concerns itself with ... at the connection between data mining and statistics, 
and ask ourselves whether data mining is &quot;statistical ...</snippet>
+</document><document id="79">  <url>http://www.kdnuggets.com/meetings</url>
+       <title>Meetings and Conferences in Data Mining and Knowledge 
Discovery</title>
+       <snippet>Meetings and Conferences in Data Mining, Knowledge Discovery, 
Genomic Mining, and Web Mining. March 7: Proposals due for. March 7: Proposals 
due for. 23-24 Oct, M2006, SAS 9th annual Data Mining Technology Conference, 
Las Vegas, NV, USA. ... with The second workshop on Algorithmic Techniques for 
Data Mining 2006 (ATDM 2006 ...</snippet>
+</document><document id="80">  <url>http://www.siam.org/meetings/sdm01</url>
+       <title>First SIAM International Conference on Data Mining</title>
+       <snippet>Registration. is Closed. Advances in information technology 
and data collection methods have led to the availability of large data sets in 
commercial enterprises and in a wide variety of scientific and engineering 
disciplines. ... The field of data mining draws upon extensive work in areas 
such as statistics ... presentation of recent results in data mining, including 
applications, algorithms, software, and ...</snippet>
+</document><document id="81">  
<url>http://crm.ittoolbox.com/topics/t.asp?t=520&amp;p=520&amp;h1=520</url>
+       <title>CRM Analytical Data Mining</title>
+       <snippet>... Quality&apos; Model (Line56)- Learning from the past; data 
mining and Service Quality provide roadmaps, but CRM ... trade-off analysis. 
Data Mining in Depth: Data Mining and Privacy (DM ...</snippet>
+</document><document id="82">  
<url>http://www.statoo.com/sections/Datamining/</url>
+       <title>Statoo Consulting, Statistical Consulting + Data Analysis + Data 
Mining Services, Lausanne, Switzerland</title>
+       <snippet>Statoo Consulting is a vendor independent Swiss consulting 
firm specialized in statistical consulting and training, data analysis, data 
mining, analytical CRM and bioinformatics services. ... Statistical Consulting 
+ Data Analysis + Data Mining Services. Lausanne, Switzerland. Séminaire de 
méthodologie en data mining statistique, 6-8 Mars, 2006, Paris, France 
...</snippet>
+</document><document id="83">  <url>http://www.cio.com/research/data/</url>
+       <title>Knowledge Management - Data Storage &amp; Mining - Warehouse, 
OLAP, glossary resources - Knowledge Management RC - CIO</title>
+       <snippet>CIO Data Storage &amp;amp; Mining Research Center is a 
compilation of articles, case studies, organizations, conferences, glossary of 
terms, and white papers related to data storage, mining/OLAP, and data 
warehousing.</snippet>
+</document><document id="84">  
<url>http://www.thearling.com/dmintro/dmintro.htm</url>
+       <title>An Introduction to Data Mining by Kurt Thearling</title>
+       <snippet>7-Mar-03: An Introduction to Data Mining</snippet>
+</document><document id="85">  
<url>http://www.stayfreemagazine.org/archives/14/datamining.html</url>
+       <title>Data Mining</title>
+       <snippet>... is arguably at the cutting edge of &quot;data 
mining&quot;: a new kind of information analysis that ... positively timid by 
comparison. Data mining uses artificial intelligence software to hunt 
...</snippet>
+</document><document id="86">  <url>http://www.siam.org/meetings/sdm05</url>
+       <title>SIAM 2005 Data Mining Conference</title>
+       <snippet>... The field of data mining draws upon extensive work in 
areas ... and high-performance data mining. Distributed data mining. Scalable 
algorithms. Integration: mining, warehousing and OLAP ...</snippet>
+</document><document id="87">  
<url>http://www.jcp.org/en/jsr/detail?id=73</url>
+       <title>The Java Community Process(SM) Program - JSRs: Java 
Specification Requests - detail JSR# 73</title>
+       <snippet>... and maintain data and metadata supporting data mining 
models, data scoring, and data mining results serving J2EE ... agreed upon, 
standard API for data mining. By using JDMAPI ...</snippet>
+</document><document id="88">  
<url>http://www.megaputer.com/dm/index.php3</url>
+       <title>Data Mining Introduction</title>
+       <snippet>Megaputer offers data mining, text mining, and web data mining 
software tools for e-commerce, database marketing, and CRM; seminars, training 
and consulting on data mining. Customer ... Data Mining. What is data mining? 
PolyAnalyst Machine Learning Algorithms ... &quot;Data Mining is the process of 
identifying valid, novel, potentially useful, and ultimately comprehensible 
...</snippet>
+</document><document id="89">  
<url>http://www.healthcare-informatics.com/issues/2004/04_04/hagland.htm</url>
+       <title>Healthcare Informatics: Data Mining</title>
+       <snippet>... Data Mining. Stronger computer tools allow deeper analysis 
of medical research, patient care and ... well the tremendous potential of data 
mining--using software programs for pattern ...</snippet>
+</document><document id="90">  
<url>http://www.dmreview.com/article_sub.cfm?articleId=1010449</url>
+       <title>Volume Analytics: Duo-Mining: Combining Data and Text 
Mining</title>
+       <snippet>... As standalone capabilities, the pattern-finding 
technologies of data mining and text mining have been around for years ... of 
all, what are data mining and text mining? They are similar ...</snippet>
+</document><document id="91">  
<url>http://www.itworld.com/App/110/050805datamining</url>
+       <title>ITworld.com - Data mining</title>
+       <snippet>... it into usable shape, however, requires sophisticated data 
mining tools. The same technology that police departments ... How does data 
mining work? Data mining is a subset of business ...</snippet>
+</document><document id="92">  
<url>http://www.statsoft.com/textbook/glosd.html</url>
+       <title>daniell (or equal weight) window. in time series, the daniell 
...</title>
+       <snippet>... Data Mining. StatSoft defines data mining as an analytic 
process designed to ... information, see Data Mining. Data Preparation Phase. 
In Data Mining, the input data are often &quot;noisy ...</snippet>
+</document><document id="93">  
<url>http://oracle.ittoolbox.com/topics/t.asp?t=427&amp;p=427&amp;h1=427</url>
+       <title>Oracle Business Intelligence Data Mining</title>
+       <snippet>... Sub-topic definition: Data Mining is a method of searching 
data with mathematical algorithms to identify ... the product evaluation 
process for Data Mining software. Oracle-BI-l - The ...</snippet>
+</document><document id="94">  
<url>http://www.time.com/time/globalbusiness/article/0,9171,1101021223-400017,00.html?cnn=yes</url>
+       <title>TIME.com: Data Miners -- Dec. 23, 2002 -- Page 1</title>
+       <snippet>New software instantly connects key bits of data that once 
eluded teams of researchers ... The data-mining algorithms of ClearForest, 
based in New York City, are at work within both ... And these days, data-mining 
software, combined with technologies that connect disparate ...</snippet>
+</document><document id="95">  <url>http://www.sqlserverdatamining.com/</url>
+       <title>SQL Server Data Mining</title>
+       <snippet>sql server | data mining. Happy Birthday to 
SQLServerDataMining.com! ... .com with the mission to let the world know about 
the data mining functionality in SQL Server and help them use it ...</snippet>
+</document><document id="96">  <url>http://www.kdd.org/</url>
+       <title>Knowledge Discovery and Data Mining Foundation</title>
+       <snippet>Have you heard about ACM SIGKDD, the newly formed society for 
knowledge discovery and data mining? Click here to see the brand new ACM SIGKDD 
web page. KnowledgeDiscovery &amp;amp; Data Mining ... starting point for 
exploring Internet resources in knowledge discovery and data mining 
...</snippet>
+</document><document id="97">  
<url>http://www.knightsbridge.com/solutions/client/professional/requirements/mining.php</url>
+       <title>Data Mining</title>
+       <snippet>... Data mining is a powerful data warehousing technology to 
assist users with the abundance ... that they have collected. Data mining uses 
sophisticated statistical analyses and modeling ...</snippet>
+</document><document id="98">  <url>http://www.comp.nus.edu.sg/~dm2</url>
+       <title>DM II - Data Mining II</title>
+       <snippet>The DM-II system has two downloadable tools: CBA (v2.1) and 
IAS. CBA (v2.1) (Last Modify June, 25, 2001) is a data mining tool developed at 
School of Computing, National University of Singapore. ... Integrating 
Classification and Association Rule Mining&quot; (KDD-98). Further improvements 
were made ...</snippet>
+</document><document id="99">  
<url>http://www.thearling.com/text/dmviz/modelviz.htm</url>
+       <title>Visualizing Data Mining Models</title>
+       <snippet>... Visualizing Data Mining Models. by Kurt Thearling, Barry 
Becker, Dennis DeCoste, Bill Mawby ... is going on. Since data mining usually 
involves extracting &quot;hidden&quot; information from ...</snippet>
+</document></searchresult>
\ No newline at end of file

Propchange: 
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/test-input.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/ClustererTest.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/ClustererTest.java?rev=387578&r1=387577&r2=387578&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/ClustererTest.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/ClustererTest.java
 Tue Mar 21 08:43:09 2006
@@ -1,118 +1,167 @@
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.clustering.carrot2;
-
-import java.io.File;
-
-import org.apache.nutch.clustering.HitsCluster;
-import org.apache.nutch.searcher.Hit;
-import org.apache.nutch.searcher.HitDetails;
-import org.apache.nutch.searcher.Hits;
-import org.apache.nutch.searcher.NutchBean;
-import org.apache.nutch.searcher.Query;
-import junit.framework.TestCase;
-
-/**
- * A test case for the Carrot2-based clusterer plugin to Nutch.
- *
- * <p><b>This test case is mostly commented-out because I don't know
- * how to integrate a test that requires an existing Nutch index.</b></p>
- *
- * @author Dawid Weiss
- * @version $Id: ClustererTest.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
- */
-public class ClustererTest extends TestCase {
-
-  public ClustererTest(String s) {
-    super(s);
-  }
-  
-  public ClustererTest() {
-    super();
-  }
-
-  public void testEmptyInput() {
-    Clusterer c = new Clusterer();
-    
-    HitDetails [] hitDetails = new HitDetails[0];
-    String [] descriptions = new String [0];
-
-    HitsCluster [] clusters = c.clusterHits(hitDetails, descriptions);
-    assertTrue( clusters != null && clusters.length == 0 );
-  }
-
-  /*
-  
-  UNCOMMENT THIS IF YOU HAVE A NUTCH INDEX AVAILABLE. REPLACE
-  THE HARDCODED PATH TO IT.
-  
-  public void testSomeInput() throws Exception {
-    Clusterer c = new Clusterer();
-
-    NutchBean bean = new NutchBean(
-      new File("c:\\dweiss\\data\\mozdex-nutch\\nutch-mozdex\\resin"));
-    Query q = Query.parse( "blog" );
-    Hits hits = bean.search(q, 100);
-
-    Hit[] show = hits.getHits(0, 100);
-    HitDetails[] details = bean.getDetails(show);
-    String[] summaries = bean.getSummary(details, q);
-
-    HitsCluster [] clusters = c.clusterHits(details, summaries);
-    assertTrue( clusters != null );
-    
-    for (int i=0;i<clusters.length;i++) {
-        HitsCluster cluster = clusters[i];
-        dump(0, cluster);
-    }
-  }    
-  */
-  
-  private void dump(int level, HitsCluster cluster) {
-    String [] labels = cluster.getDescriptionLabels();
-    for (int indent = 0; indent<level; indent++) {
-      System.out.print( "   " );
-    }
-    System.out.print(">> ");
-    if (cluster.isJunkCluster()) System.out.print("(Junk) ");
-    System.out.print("CLUSTER: ");
-    for (int i=0;i<labels.length;i++) {
-      System.out.print( labels[i] + "; " );
-    }
-    System.out.println();
-    
-    HitsCluster [] subclusters = cluster.getSubclusters();
-    if (subclusters != null) {
-      for (int i=0;i<subclusters.length;i++) {
-        dump(level + 1, subclusters[i]);
-      }
-    }
-    
-    // dump documents.
-    HitDetails [] hits = cluster.getHits();
-    if (hits != null) {
-      for (int i=0;i<hits.length;i++ ) {
-        for (int indent = 0; indent<level; indent++) {
-          System.out.print( "   " );
-        }
-        System.out.print( hits[i].getValue("url") );
-        System.out.print( "; " );
-        System.out.println( hits[i].getValue("title") );
-      }
-    }
-  }
-}
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.clustering.carrot2;
+
+import java.util.ArrayList;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import junit.framework.TestCase;
+
+import org.apache.nutch.clustering.HitsCluster;
+import org.apache.nutch.searcher.HitDetails;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * A test case for the Carrot2-based clusterer plugin to Nutch.
+ *
+ * @author Dawid Weiss
+ * @version $Id: ClustererTest.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
+ */
+public class ClustererTest extends TestCase {
+
+  public ClustererTest(String s) {
+    super(s);
+  }
+  
+  public ClustererTest() {
+    super();
+  }
+
+  /**
+   * The clusterer should not fail on empty input, returning
+   * an empty array of [EMAIL PROTECTED] HitsCluster}.
+   */
+  public void testEmptyInput() {
+    final Clusterer c = new Clusterer();
+    final HitDetails [] hitDetails = new HitDetails[0];
+    final String [] descriptions = new String [0];
+    final HitsCluster [] clusters = c.clusterHits(hitDetails, descriptions);
+    assertTrue(clusters != null && clusters.length == 0);
+  }
+
+  /**
+   * Tests the clusterer on some cached data.
+   */
+  public void testOnCachedData() throws Exception {
+    final DocumentBuilderFactory factory = 
DocumentBuilderFactory.newInstance();
+    final DocumentBuilder parser = factory.newDocumentBuilder();
+    final Document document = parser.parse(
+        getClass().getResourceAsStream("test-input.xml"));
+
+    final Element data = document.getDocumentElement();
+    final NodeList docs = data.getElementsByTagName("document");
+    
+    final ArrayList summaries = new ArrayList();
+    final ArrayList hitDetails = new ArrayList();
+
+    assertTrue(docs.getLength() > 0);
+    for (int i = 0; i < docs.getLength(); i++) {
+      final Element doc = (Element) docs.item(i);
+      assertTrue(doc.getNodeType() == Node.ELEMENT_NODE);
+      final Element urlElement = (Element) 
doc.getElementsByTagName("url").item(0);
+      final Element snippetElement = (Element) 
doc.getElementsByTagName("snippet").item(0);
+      final Element titleElement = (Element) 
doc.getElementsByTagName("title").item(0);
+
+      summaries.add(toText(titleElement) + " " + toText(snippetElement));
+      hitDetails.add(new HitDetails(
+          new String [] {"url"}, 
+          new String [] {toText(urlElement)}));
+    }
+
+    final Clusterer c = new Clusterer();
+    HitsCluster [] clusters = c.clusterHits(
+        (HitDetails[]) hitDetails.toArray(new HitDetails[hitDetails.size()]),
+        (String[]) summaries.toArray(new String[summaries.size()]));
+    
+    // There should be SOME clusters in the input... words distribution
+    // should not be random because some words have higher probability.
+    assertTrue(clusters != null);
+    assertTrue("Clusters expected, but not found.", clusters.length > 0);
+
+    // Check hit references inside clusters.
+    for (int i = 0; i < clusters.length; i++) {
+      assertTrue(clusters[i].getHits().length > 0);
+    }
+
+    /*
+    // Dump cluster content if you need to.
+    System.out.println("Clusters: " + clusters.length);
+    for (int i = 0; i < clusters.length; i++) {
+      dump(0, clusters[i]);
+    }
+    */
+  }
+  
+  /**
+   * Converts a [EMAIL PROTECTED] Element} to plain text.
+   */
+  private String toText(Element snippetElement) {
+    final StringBuffer buffer = new StringBuffer();
+    final NodeList list = snippetElement.getChildNodes();
+    for (int i = 0; i < list.getLength(); i++) {
+      Node n = list.item(i);
+      if (n.getNodeType() == Node.TEXT_NODE) {
+        buffer.append(n.getNodeValue());
+      } else if (n.getNodeType() == Node.CDATA_SECTION_NODE) {
+        n.getNodeValue();
+      } else throw new RuntimeException("Unexpected nested element when 
converting to text.");
+    }
+    return buffer.toString();
+  }
+
+  /**
+   * Dumps the content of [EMAIL PROTECTED] HitsCluster} to system output 
stream. 
+   */
+  private void dump(int level, HitsCluster cluster) {
+    String [] labels = cluster.getDescriptionLabels();
+    for (int indent = 0; indent<level; indent++) {
+      System.out.print( "   " );
+    }
+    System.out.print(">> ");
+    if (cluster.isJunkCluster()) System.out.print("(Junk) ");
+    System.out.print("CLUSTER: ");
+    for (int i=0;i<labels.length;i++) {
+      System.out.print( labels[i] + "; " );
+    }
+    System.out.println();
+    
+    HitsCluster [] subclusters = cluster.getSubclusters();
+    if (subclusters != null) {
+      for (int i=0;i<subclusters.length;i++) {
+        dump(level + 1, subclusters[i]);
+      }
+    }
+    
+    // dump documents.
+    HitDetails [] hits = cluster.getHits();
+    if (hits != null) {
+      for (int i=0;i<hits.length;i++ ) {
+        for (int indent = 0; indent<level; indent++) {
+          System.out.print( "   " );
+        }
+        System.out.print( hits[i].getValue("url") );
+        System.out.print( "; " );
+        System.out.println( hits[i].getValue("title") );
+      }
+    }
+  }
+}


Reply via email to