Re: about .frq file format in doc

wangjing Wed, 27 Jun 2012 20:15:30 -0700

thanks

could you help me to solve another problem,


why lucene will reset lastDocID = 0 when finish add one doc?

in source code FormatPostingsDocsWriter.java

        @Override
        void finish() throws IOException {
                long skipPointer = skipListWriter.writeSkip(out);
                // TODO: this is abstraction violation -- we should not
                // peek up into parents terms encoding format
                termInfo.set(df, parent.freqStart, parent.proxStart,
                                (int) (skipPointer - parent.freqStart));
                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(parent.currentTerm, 
parent.currentTermStart,
                                utf8);
                if (df > 0) {
                        parent.termsOut.add(fieldInfo.number, utf8.result, 
utf8.length,
                                        termInfo);
                }
                lastDocID = 0;
                df = 0;
        }

@Override
        FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq)
                        throws IOException {
                System.out.println("docID:" + docID);
                System.out.println("termDocFreq:" + termDocFreq);
                
                final int delta = docID - lastDocID;
                System.out.println("delta:" + delta);
                if (docID < 0 || (df > 0 && delta <= 0))
                        throw new CorruptIndexException("docs out of order (" + 
docID
                                        + " <= " + lastDocID + " ) (out: " + 
out + ")");
                
                if ((++df % skipInterval) == 0) {
                        // TODO: abstraction violation
                        skipListWriter.setSkipData(lastDocID, storePayloads,
                                        posWriter.lastPayloadLength);
                        skipListWriter.bufferSkip(df);
                }
                assert docID < totalNumDocs : "docID=" + docID + " 
totalNumDocs="
                                + totalNumDocs;
                lastDocID = docID;
                if (omitTermFreqAndPositions)
                        out.writeVInt(delta);
                else if (1 == termDocFreq)
                        
                        out.writeVInt((delta << 1) | 1);
                else {
                        out.writeVInt(delta << 1);
                        out.writeVInt(termDocFreq);
                }
                System.out.println("------------");
                return posWriter;
        }



because final int delta = docID - lastDocID;  and  lastDocID set to 0;

it will let  deltaDoc   always equals docID .

docId is 0,1,2 but the delta is 0,1,2, not 0,1,1.

It left me rather confused.


docID:0
termDocFreq:4
delta:0
------------
docID:0
termDocFreq:1
delta:0
------------
docID:0
termDocFreq:1
delta:0
------------
docID:0
termDocFreq:1
delta:0
------------
docID:0
termDocFreq:1
delta:0
------------
docID:0
termDocFreq:1
delta:0
------------
docID:1
termDocFreq:1
delta:1
------------
docID:1
termDocFreq:1
delta:1
------------
docID:1
termDocFreq:1
delta:1
------------
docID:1
termDocFreq:1
delta:1
------------
docID:2
termDocFreq:1
delta:2
------------
docID:2
termDocFreq:1
delta:2
------------
docID:2
termDocFreq:1
delta:2
------------
docID:2
termDocFreq:1
delta:2
------------
docID:2
termDocFreq:1
delta:2
------------
docID:2
termDocFreq:1
delta:2
------------
docID:2
termDocFreq:1
delta:2
------------
docID:2
termDocFreq:1
delta:2




On Thu, Jun 28, 2012 at 12:09 AM, Simon Willnauer
<simon.willna...@googlemail.com> wrote:
> see definitions:
> http://lucene.apache.org/core/3_6_0/fileformats.html#Definitions
>
> simon
>
> On Wed, Jun 27, 2012 at 6:08 PM, Simon Willnauer
> <simon.willna...@googlemail.com> wrote:
>> a term in this context is a (field,text) tuple - does this make sense?
>> simon
>>
>> On Wed, Jun 27, 2012 at 11:40 AM, wangjing <ppm10...@gmail.com> wrote:
>>> http://lucene.apache.org/core/3_6_0/fileformats.html#Frequencies
>>>
>>> The .frq file contains the lists of documents which contain each term,
>>> along with the frequency of the term in that document (except when
>>> frequencies are omitted: IndexOptions.DOCS_ONLY).
>>>
>>> BUT in my example,the frq record the frequency of the term in FIELD?
>>>
>>> my app is
>>> public void createLuceneIndex() {
>>>                try {
>>>                        File file = new File(path);
>>>                        FSDirectory directory = FSDirectory.open(file);
>>>                        // 用来创建索引
>>>                        IndexWriterConfig conf = new 
>>> IndexWriterConfig(Version.LUCENE_36,
>>>                                        new 
>>> StandardAnalyzer(Version.LUCENE_36));
>>>                        // 避免重复运用时每次都附加index导致查询出多条记录
>>>                        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
>>>                        IndexWriter iw = new IndexWriter(directory, conf);
>>>                        iw.setInfoStream(System.out);
>>>                        Document doc = new Document();
>>>                        // Store.YES 保存 可以查询 可以打印内容
>>>                        Field storeYes = new Field("content1", "my my my my 
>>> storeyes test",
>>>                                        Store.YES, Index.ANALYZED);
>>>                        // Store.NO 不保存 可以查询 不可打印内容 由于不保存内容所以节省空间
>>>                        Field storeNo = new Field("content2", "my storeno 
>>> test", Store.YES,
>>>                                        Index.ANALYZED);
>>>
>>>                        doc.add(storeYes);
>>>                        doc.add(storeNo);
>>>
>>>                        Document doc1 = new Document();
>>>                        Field storeYes1 = new Field("doc1content1",
>>>                                        "doc1 my storeyes test", Store.YES, 
>>> Index.ANALYZED);
>>>                        doc1.add(storeYes1);
>>>
>>>                        Document doc2 = new Document();
>>>                        Field storeYesdoc2 = new Field("doc2content1",
>>>                                        "doc2 my storeyes test", Store.YES, 
>>> Index.ANALYZED);
>>>                        Field storeNodoc2 = new Field("doc2content2",
>>>                                        "doc2 my storeno test", Store.YES, 
>>> Index.ANALYZED);
>>>                        doc2.add(storeYesdoc2);
>>>                        doc2.add(storeNodoc2);
>>>
>>>                        iw.addDocument(doc);
>>>                        iw.addDocument(doc1);
>>>                        iw.addDocument(doc2);
>>>                        System.out.println("doc :" + doc);
>>>                        System.out.println("doc1 :" + doc1);
>>>                        System.out.println("doc2 :" + doc2);
>>>                        iw.forceMerge(1);
>>>                        iw.close();
>>>                } catch (CorruptIndexException e) {
>>>                        // TODO Auto-generated catch block
>>>                        e.printStackTrace();
>>>                } catch (LockObtainFailedException e) {
>>>                        // TODO Auto-generated catch block
>>>                        e.printStackTrace();
>>>                } catch (IOException e) {
>>>                        // TODO Auto-generated catch block
>>>                        e.printStackTrace();
>>>                }
>>>        }
>>>
>>>
>>> THE HEX in .frq file is
>>> 00 04 01 01 01 01 01 03  03 03 03 05 05 05 05 05
>>> 05 05 05
>>>
>>> in my  analysis
>>> 1   [0,4] [0,1] [0,1] [0,1] [0,1]
>>> [0,1][1,1][1,1][1,1][1,1][2,1][2,1][2,1][2,1][2,1][2,1][2,1][2,1]
>>> 2   [0,4] [0,1] [0,1] [0,1] [0,1] [0,1]
>>> [1,1][1,1][1,1][1,1][2,1][2,1][2,1][2,1][2,1][2,1][2,1][2,1]
>>> 3   [0,4][1][1][1][1][1][3][3][3][3][5] [5][5][5][5][5][5][5]
>>>
>>> ---------------------------------------------------------------------
>>> To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
>>> For additional commands, e-mail: java-user-h...@lucene.apache.org
>>>

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
For additional commands, e-mail: java-user-h...@lucene.apache.org

Re: about .frq file format in doc

Reply via email to