I am using this code which is pretty basic.  And it won't index the documents.
 I run the index code and print the document to make sure that it gets
indexed, but when I looked at the output "gen" and "segments" file, there are
only like 20bytes of data in the files.  I am indexing about 300k of text
data.  I am using scala but I dont think that is an issue as I have used
similar code before.  When I do a search 0 documents for everything.

I am using Lucene 2.2.0 (just downloaded).

Here is the core of my code:

  def indexData(writer:IndexWriter, file: File) {
    val doc = new LucDocument()
        
        // Read the content from the file
        val contentReader = new ContentReader(file.getAbsolutePath)
    val (title, content) = contentReader.readFile()

        // Extract data from the java File class
        val link = new DocumentLink(file.getAbsolutePath, file.getName,
                                                        content, 
file.getAbsolutePath)
        
        // Index the document and data.
    doc.add(new Field(LUC_KEY_FULL_PATH, link.fullPath, 
                                          Field.Store.YES, 
Field.Index.TOKENIZED))
    doc.add(new Field(LUC_KEY_FILE_NAME, link.filename, 
                                          Field.Store.YES, 
Field.Index.TOKENIZED))
        doc.add(new Field(LUC_KEY_CONTENT, link.content, 
                                          Field.Store.YES, 
Field.Index.TOKENIZED))
    doc.add(new Field(LUC_KEY_IDENTITY, link.id, 
                                          Field.Store.YES, 
Field.Index.UN_TOKENIZED))
    writer.addDocument(doc)
  }


But if you want the full scala source, here it is.  Think of it as pseudo code:

object BotlistIndexDocuments {
   
  val LUC_KEY_FULL_PATH = "full_path"
  val LUC_KEY_FILE_NAME = "file_name"
  val LUC_KEY_CONTENT = "content"
  val LUC_KEY_IDENTITY = "id"
  
    //
  // Read the content file.  The first line should contain
  // a "#title summary" line and the rest of the document
  // will contain the "wiki" document.
  class ContentReader(filename: String) {
    def readFile(): (String, String) = {
      val file = Source.fromFile(filename)
      var counted = file.getLines.counted
      val fileData = new StringBuilder()
      var title = ""
      counted.foreach { (line: String) =>
                if (counted.count == 0) {
              //title = line.substring(6).trim()
                  title = line
                } else { 
              fileData.append(line)
                }
      }
      (title, fileData.toString())
    }
  } // End of Class //

  case class DocumentLink(abs_path: String, file: String, data: String,
                                                  unique_id:String) {
    val fullPath = abs_path
    val filename = file
        val content = data
    val id = unique_id
  }
  def indexData(writer:IndexWriter, file: File) {
    val doc = new LucDocument()
        
        // Read the content from the file
        val contentReader = new ContentReader(file.getAbsolutePath)
    val (title, content) = contentReader.readFile()

        // Extract data from the java File class
        val link = new DocumentLink(file.getAbsolutePath, file.getName,
                                                        content, 
file.getAbsolutePath)
        
        // Index the document and data.
    doc.add(new Field(LUC_KEY_FULL_PATH, link.fullPath, 
                                          Field.Store.YES, 
Field.Index.TOKENIZED))
    doc.add(new Field(LUC_KEY_FILE_NAME, link.filename, 
                                          Field.Store.YES, 
Field.Index.TOKENIZED))
        doc.add(new Field(LUC_KEY_CONTENT, link.content, 
                                          Field.Store.YES, 
Field.Index.TOKENIZED))
    doc.add(new Field(LUC_KEY_IDENTITY, link.id, 
                                          Field.Store.YES, 
Field.Index.UN_TOKENIZED))
    writer.addDocument(doc)
  }
  
  //
  // Utility for recursively walking directory tree
  // See:
  // override final def flatMap  [B](f : (A) => Iterable[B]) : List[B]
  class DocWalkFile(file: File) {  
        def children = new Iterable[File] {
    def elements = 
      if (file.isDirectory) file.listFiles.elements else Iterator.empty;
        }
        def andTree : Iterable[File] = (
      Seq.single(file) ++ children.flatMap(child => new
DocWalkFile(child).andTree))
  }
  def listDocuments(dir: File): List[File] =
        (new DocWalkFile(dir)).andTree.toList filter (f =>
(f.getName.endsWith(".java") || f.getName.endsWith(".txt")))
          
  def indexDocuments(index_dir: File, files: List[File]) {
        Console.println("INFO: number of files to index=" + files.length)
    val writer = new IndexWriter(index_dir, new StandardAnalyzer(), true)
    for (val file <- files) {
      indexData(writer, file)
    }
  } 
  def main(args: Array[String]): Unit = {
    
    if (args.length != 2) {
      Console.println("usage: java BotlistIndexDocuments parent-index-dir
input-doc-dir")
          Console.println("\n")
          Console.println("\nRun the BotlistIndexDocuments index tool on the 
provided
index directory.")
          Console.println("\nFor bug reporting instructions, please see:")
          Console.println("<URL:http://code.google.com/p/openbotlist>.")
      return
    }

    Console.println("INFO: Indexing Document Data <standby> ...")
    val index = new File(args(0) + "/index")
        val doc_dir = new File(args(1)) 
    if (!index.exists()) {
      index.mkdir();
      Console.println("Creating index directory.")
    } else {
      Console.println("WARN: Index already exists (remove directory to 
continue)")
      Console.println("DIR: " + index.getAbsolutePath())
      //return
    }
        
        // Calculate the processing time to run application
    val timeStart = System.currentTimeMillis()
    indexDocuments(index, (listDocuments(doc_dir)))
    val timeEnd = System.currentTimeMillis()
    Console.println("Done...")
    Console.println("Completed processing in " + (timeEnd - timeStart) + " ms.")
  }
}

--
Berlin Brown
[berlin dot brown at gmail dot com]
http://botspiritcompany.com/botlist/?


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to