I'm not sure whether I'm doing this right.
Here is the code of the program:
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopwordAnalyzerBase.*;
import org.apache.lucene.document.Field.*;
import org.apache.lucene.document.*;
import org.apache.lucene.store.*;
import org.apache.lucene.index.*;
import org.apache.lucene.util.Version;
import java.io.*;
import java.io.FileOutputStream.*;
public class JavaApplication1 {
public static File dataDir = new File("C:/filestoindex");
public static File indexDir = new File("C:/fileindex");
public static void index(File indexDir,File dataDir) throws IOException
{
if (!dataDir.exists() || !dataDir.isDirectory())
{
throw new IOException(dataDir + " does not exist or is not a
directory");
}
Analyzer ac=new StandardAnalyzer(Version.LUCENE_30);
IndexWriter indexWriter = new
IndexWriter(FSDirectory.open(indexDir),ac, true,
IndexWriter.MaxFieldLength.UNLIMITED);
indexDirectory(indexWriter, dataDir);
indexWriter.close();
}
private static void indexDirectory(IndexWriter writer, File dir)
throws IOException {
File[] files = dir.listFiles();
for (int i = 0; i < files.length; i++)
{ File f = files[i];
if (f.isDirectory())
{ indexDirectory(writer, f);
}
indexFile(writer, f);
}
}
private static void indexFile(IndexWriter writer, File f) throws
IOException
{
System.out.println("Индексация " + f.getName());
Document doc = new Document();
doc.add(new Field("contents" , new FileReader(f),
Field.TermVector.YES));
doc.add(new Field("filename", f.getName(), Field.Store.YES,
Field.Index.NOT_ANALYZED));
doc.add(new Field("path", f.getCanonicalPath(), Field.Store.YES,
Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
}
public static void main(String[] args) throws Exception
{
index(indexDir, dataDir);
IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
FileOutputStream fr = new FileOutputStream("C:/fileout/f.txt");
for (int docNum=0; docNum<reader.numDocs(); docNum++) {
TermFreqVector tfv = reader.getTermFreqVector(docNum, "contents");
if (tfv == null) {
continue;
}
String terms[] = tfv.getTerms();
int termCount = terms.length;
int freqs[] = tfv.getTermFrequencies();
for (int t=0; t < termCount; t++) {
String
st=reader.document(docNum).getField("filename").stringValue()+" "+ terms[t]
+ " " +freqs[t]+ "\r\n";
fr.write(st.getBytes("UTF-8") );
System.out.println(
reader.document(docNum).getField("filename").stringValue()+" "+ terms[t] + "
" + freqs[t]);
}
}fr.close();
}
}
--
View this message in context:
http://lucene.472066.n3.nabble.com/sorting-frequencies-tp4045197p4045249.html
Sent from the Lucene - General mailing list archive at Nabble.com.