This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/clucene by this push:
     new f9476125876 [feature](clucene) Introduce extra statistics for 
calculating Doris BM25. (#321)
f9476125876 is described below

commit f9476125876e1cd845f8d36cadc3bae7bd5fc2d5
Author: zzzxl <[email protected]>
AuthorDate: Wed Jun 18 17:38:19 2025 +0800

    [feature](clucene) Introduce extra statistics for calculating Doris BM25. 
(#321)
    
    * [feature](clucene): Introduce extra statistics for calculating Doris 
BM25. (#308)
    
    * fix
    
    ---------
    
    Co-authored-by: Zephyr Guo <[email protected]>
---
 src/core/CLucene/index/DocRange.h                |   3 +
 src/core/CLucene/index/IndexReader.cpp           |   8 +-
 src/core/CLucene/index/IndexReader.h             |  15 ++-
 src/core/CLucene/index/IndexWriter.cpp           | 128 ++++++++++++++++++++++-
 src/core/CLucene/index/IndexWriter.h             |   2 +
 src/core/CLucene/index/MultiReader.cpp           |  32 ++++++
 src/core/CLucene/index/MultiReader.h             |   7 ++
 src/core/CLucene/index/MultiSegmentReader.cpp    |  69 +++++++++++-
 src/core/CLucene/index/MultipleTermPositions.cpp |  14 ++-
 src/core/CLucene/index/MultipleTermPositions.h   |  12 ++-
 src/core/CLucene/index/SegmentMerger.cpp         |   2 +-
 src/core/CLucene/index/SegmentReader.cpp         | 117 ++++++++++++++++++---
 src/core/CLucene/index/SegmentTermDocs.cpp       | 102 ++++++++++++++++--
 src/core/CLucene/index/SegmentTermPositions.cpp  |   8 +-
 src/core/CLucene/index/Terms.h                   |  13 ++-
 src/core/CLucene/index/_MultiSegmentReader.h     |  15 ++-
 src/core/CLucene/index/_SegmentHeader.h          |  72 +++++++++++--
 src/core/CLucene/search/IndexSearcher.cpp        |  15 +++
 src/core/CLucene/search/IndexSearcher.h          |   4 +
 src/core/CLucene/search/MultiSearcher.cpp        |  26 +++++
 src/core/CLucene/search/MultiSearcher.h          |   4 +
 src/core/CLucene/search/Searchable.h             |   8 +-
 src/core/CLucene/search/Similarity.cpp           |   8 ++
 src/core/CLucene/search/Similarity.h             |   9 ++
 src/core/CLucene/search/query/TermIterator.h     |   8 ++
 25 files changed, 643 insertions(+), 58 deletions(-)

diff --git a/src/core/CLucene/index/DocRange.h 
b/src/core/CLucene/index/DocRange.h
index ef7906a24fb..ab417ce5877 100644
--- a/src/core/CLucene/index/DocRange.h
+++ b/src/core/CLucene/index/DocRange.h
@@ -23,8 +23,11 @@ class DocRange {
 
   uint32_t doc_many_size_ = 0;
   uint32_t freq_many_size_ = 0;
+  uint32_t norm_many_size_ = 0;
+
   std::vector<uint32_t>* doc_many = nullptr;
   std::vector<uint32_t>* freq_many = nullptr;
+  std::vector<uint32_t>* norm_many = nullptr;
 
   std::pair<uint32_t, uint32_t> doc_range;
 };
\ No newline at end of file
diff --git a/src/core/CLucene/index/IndexReader.cpp 
b/src/core/CLucene/index/IndexReader.cpp
index 69c66b4f735..77df957f75e 100644
--- a/src/core/CLucene/index/IndexReader.cpp
+++ b/src/core/CLucene/index/IndexReader.cpp
@@ -261,7 +261,7 @@ CL_NS_DEF(index)
     return SegmentInfos::getCurrentSegmentGeneration(directory) != -1;
   }
 
-  TermDocs* IndexReader::termDocs(Term* term, const void* io_ctx) {
+  TermDocs* IndexReader::termDocs(Term* term, bool load_stats, const void* 
io_ctx) {
   //Func - Returns an enumeration of all the documents which contain
   //       term. For each document, the document number, the frequency of
   //       the term in that document is also provided, for use in search 
scoring.
@@ -280,12 +280,12 @@ CL_NS_DEF(index)
       //Reference an instantiated TermDocs instance
       TermDocs* _termDocs = termDocs(io_ctx);
       //Seek all documents containing term
-      _termDocs->seek(term);
+      _termDocs->seek(term, load_stats);
       //return the enumaration
       return _termDocs;
   }
 
-  TermPositions* IndexReader::termPositions(Term* term, const void* io_ctx){
+  TermPositions* IndexReader::termPositions(Term* term, bool load_stats, const 
void* io_ctx){
   //Func - Returns an enumeration of all the documents which contain  term. 
For each
   //       document, in addition to the document number and frequency of the 
term in
   //       that document, a list of all of the ordinal positions of the term 
in the document
@@ -306,7 +306,7 @@ CL_NS_DEF(index)
       //Reference an instantiated termPositions instance
       TermPositions* _termPositions = termPositions(io_ctx);
          //Seek all documents containing term
-      _termPositions->seek(term);
+      _termPositions->seek(term, load_stats);
          //return the enumeration
       return _termPositions;
   }
diff --git a/src/core/CLucene/index/IndexReader.h 
b/src/core/CLucene/index/IndexReader.h
index 29449840c17..da73d64dcab 100644
--- a/src/core/CLucene/index/IndexReader.h
+++ b/src/core/CLucene/index/IndexReader.h
@@ -15,6 +15,8 @@
 #include "CLucene/index/IndexVersion.h"
 #include "CLucene/index/_FieldInfos.h"
 
+#include <optional>
+
 CL_CLASS_DEF(store,Directory)
 CL_CLASS_DEF(store,LuceneLock)
 CL_CLASS_DEF(document,Document)
@@ -59,7 +61,6 @@ class CLUCENE_EXPORT IndexReader: public 
CL_NS(util)::NamedObject{
   bool closed;
 protected:
   bool hasChanges;
-
   /**
   * Legacy Constructor for backwards compatibility.
   *
@@ -562,6 +563,14 @@ public:
    */
        virtual int32_t docFreq(const Term* t) = 0;
 
+    /** Returns the norm of document whose id is <code>doc</code> in the 
<code>field</code>.
+   */
+        virtual int32_t docNorm(const TCHAR* field, int32_t doc) = 0;
+
+    /** Returns the total norm of all terms appearing in all documents
+   */
+        virtual std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field) = 
0;
+
        /* Returns an unpositioned TermPositions enumerator.
    * @throws IOException if there is a low-level IO error
         * @memory Caller must clean up
@@ -586,7 +595,7 @@ public:
   * @throws IOException if there is a low-level IO error
   * @memory Caller must clean up
        */
-       TermPositions* termPositions(Term* term, const void* io_ctx = nullptr);
+       TermPositions* termPositions(Term* term, bool load_stats = false, const 
void* io_ctx = nullptr);
 
        /** Returns an unpositioned {@link TermDocs} enumerator.
    * @throws IOException if there is a low-level IO error
@@ -604,7 +613,7 @@ public:
   * @throws IOException if there is a low-level IO error
   * @memory Caller must clean up
        */
-       TermDocs* termDocs(Term* term, const void* io_ctx = nullptr);
+       TermDocs* termDocs(Term* term, bool load_stats = false, const void* 
io_ctx = nullptr);
 
        /** Deletes the document numbered <code>docNum</code>.  Once a document 
is
        * deleted it will not appear in TermDocs or TermPostitions enumerations.
diff --git a/src/core/CLucene/index/IndexWriter.cpp 
b/src/core/CLucene/index/IndexWriter.cpp
index fd099731a6f..06bfc437407 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -1338,6 +1338,12 @@ void 
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
 
     std::vector<lucene::index::IndexWriter *> destIndexWriterList;
     std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList;
+    std::vector<lucene::store::IndexOutput *> normsOutputList;
+
+    // first level vector index is src_index_id
+    // <TCHAR, ValueArray<uint8_t>> key is field name, value is the norm of 
src_doc_id
+    std::vector<map<TCHAR, std::vector<uint8_t>>> 
srcFieldNormsMapValues(numIndices);
+
     try {
         // check hasProx, indexVersion
         bool hasProx = false;
@@ -1366,6 +1372,42 @@ void 
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
         /// merge fields
         mergeFields(hasProx, indexVersion);
 
+        // check if field has norms
+        bool hasNorms = false;
+        {
+            for (size_t i = 0; i < fieldInfos->size(); i++) {
+                //Get the i-th FieldInfo
+                FieldInfo* fi = fieldInfos->fieldInfo(i);
+                // Is this Field indexed and field need norms ?
+                if (fi->isIndexed && !fi->omitNorms) {
+                    hasNorms = true;
+                }
+            }
+        }
+
+        if (hasNorms) {
+            for (int srcIndex = 0; srcIndex < numIndices; srcIndex++) {
+                auto reader = readers[srcIndex];
+                for (size_t i = 0; i < fieldInfos->size(); i++) {
+                    //Get the i-th FieldInfo
+                    FieldInfo* fi = fieldInfos->fieldInfo(i);
+                    // Is this Field indexed and field need norms ?
+                    if (fi->isIndexed && !fi->omitNorms) {
+                        CL_NS(util)::ValueArray<uint8_t> normBuffer;
+                        size_t maxDoc = reader->maxDoc();
+                        if ( normBuffer.length < maxDoc){
+                            normBuffer.resize(maxDoc);
+                            memset(normBuffer.values, 0, sizeof(uint8_t) * 
maxDoc);
+                        }
+                        reader->norms(fi->name, normBuffer.values);
+                        for (int j = 0; j < normBuffer.length; j++) {
+                            
srcFieldNormsMapValues[srcIndex][*fi->name].emplace_back(normBuffer.values[j]);
+                        }
+                    }
+                }
+            }
+        }
+
         /// write fields and create files writers
         for (int j = 0; j < numDestIndexes; j++) {
             auto dest_dir = dest_dirs[j];
@@ -1406,6 +1448,13 @@ void 
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
             maxSkipLevels = termInfosWriter->maxSkipLevels;
             skipListWriterList.push_back(_CLNEW 
DefaultSkipListWriter(skipInterval, maxSkipLevels, (int) dest_index_docs[j], 
freqOutputList[j], proxOutputList[j]));
 
+            if (hasNorms) {
+                // create norms output
+                auto* norms_out = 
dest_dir->createOutput(Misc::segmentname(segment.c_str(), ".nrm").c_str());
+                norms_out->writeBytes(SegmentMerger::NORMS_HEADER, 
SegmentMerger::NORMS_HEADER_length);
+                normsOutputList.push_back(norms_out);
+            }
+
             // create null_bitmap index output
             auto* null_bitmap_out = 
dest_dir->createOutput(NULL_BITMAP_FILE_NAME);
             nullBitmapIndexOutputList.push_back(null_bitmap_out);
@@ -1414,6 +1463,11 @@ void 
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
         /// merge terms
         mergeTerms(hasProx, indexVersion);
 
+        /// merge norms if have
+        if (hasNorms){
+            mergeNorms(dest_index_docs, srcFieldNormsMapValues, 
normsOutputList);
+        }
+
         /// merge null_bitmap
         mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList);
     }
@@ -1451,7 +1505,14 @@ void 
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
                     r->close();
                     _CLDELETE(r);
                 }
-            } readers.clear(););
+                 } readers.clear(););
+            for (auto* norms_out
+                 : normsOutputList) {
+                if (norms_out != nullptr) {
+                    norms_out->close();
+                    _CLDELETE(norms_out);
+                }
+         } normsOutputList.clear();
             for (auto* null_bitmap_out
                  : nullBitmapIndexOutputList) {
                 if (null_bitmap_out != nullptr) {
@@ -1915,6 +1976,71 @@ void IndexWriter::mergeTerms(bool hasProx, IndexVersion 
indexVersion) {
     }
 }
 
+void IndexWriter::mergeNorms(std::vector<uint32_t> dest_index_docs,
+                                std::vector<std::map<TCHAR, 
std::vector<uint8_t>>> srcFieldNormsMapValues,
+                                std::vector<lucene::store::IndexOutput *> 
normsOutputList) {
+    //Func - Merges the norms for all fields
+    //Pre  - fieldInfos != NULL
+    //Post - The norms for all fields have been merged
+    CND_PRECONDITION(fieldInfos != NULL, "fieldInfos is NULL");
+
+    std::vector<std::map<TCHAR, std::vector<uint8_t>>> 
destFieldNormsMapValues(numDestIndexes);
+
+    // iterate srcFieldNormsValues to construct destFieldNormsMapValues
+    for (size_t srcIndex = 0; srcIndex < srcFieldNormsMapValues.size(); 
++srcIndex) {
+        std::map<TCHAR, std::vector<uint8_t>> &srcFieldNormsMap = 
srcFieldNormsMapValues[srcIndex];
+        if (srcFieldNormsMap.empty()) {
+            // empty indicates there is no nrm file in this index
+            continue;
+        }
+        // find field has norms
+        for (int j =0; j < fieldInfos->size(); j++) {
+            FieldInfo* fi = fieldInfos->fieldInfo(j);
+            TCHAR fieldName = *fi->name;
+            // Is this Field indexed and field need norms ?
+            if (fi->isIndexed && !fi->omitNorms) {
+                auto& srcFieldNorms = srcFieldNormsMap[fieldName];
+                // construct srcFieldNorms to destFieldNorms
+                for (int srcDocId = 0; srcDocId < srcFieldNorms.size(); 
srcDocId++) {
+                    auto destIdx = _trans_vec[srcIndex][srcDocId].first;
+                    auto destDocId = _trans_vec[srcIndex][srcDocId].second;
+                    if (destIdx == UINT32_MAX || destDocId == UINT32_MAX) {
+                        continue;
+                    }
+                    auto destDocCount = dest_index_docs[destIdx];
+                    auto& destFieldNormsMap = destFieldNormsMapValues[destIdx];
+                    if (destFieldNormsMap.find(fieldName) == 
destFieldNormsMap.end()) {
+                        destFieldNormsMap[fieldName].resize(destDocCount);
+                        
std::fill(destFieldNormsMap[fieldName].begin(),destFieldNormsMap[fieldName].end(),
 0);
+                    }
+                    auto& destFieldNorms = destFieldNormsMap[fieldName];
+                    destFieldNorms[destDocId] = srcFieldNorms[srcDocId];
+                    destFieldNormsMap[fieldName] = destFieldNorms;
+                }
+            }
+        }
+    }
+
+    // construct nrm and write nrm to dest index
+    for (size_t i = 0; i < destFieldNormsMapValues.size(); ++i) {
+        auto& destFieldNormsMap = destFieldNormsMapValues[i];
+        for (int j =0; j < fieldInfos->size(); j++) {
+            FieldInfo* fi = fieldInfos->fieldInfo(j);
+            TCHAR fieldName = *fi->name;
+            auto destDocCount = dest_index_docs[i];
+            if (fi->isIndexed && !fi->omitNorms) {
+                // if not find then norm is zero
+                if (destFieldNormsMap.find(fieldName) == 
destFieldNormsMap.end()) {
+                    destFieldNormsMap[fieldName].resize(destDocCount);
+                    
std::fill(destFieldNormsMap[fieldName].begin(),destFieldNormsMap[fieldName].end(),
 0);
+                }
+                auto& destFieldNorms = destFieldNormsMap[fieldName];
+                normsOutputList[i]->writeBytes(destFieldNorms.data(), 
destDocCount);
+            }
+        }
+    }
+}
+
 void IndexWriter::mergeNullBitmap(std::vector<std::vector<uint32_t>> 
srcNullBitmapValues, std::vector<lucene::store::IndexOutput *> 
nullBitmapIndexOutputList) {
     // first level vector index is dest_index_id
     // second level vector index is dest_doc_id
diff --git a/src/core/CLucene/index/IndexWriter.h 
b/src/core/CLucene/index/IndexWriter.h
index e890b1ed33c..1fef37e5d7c 100644
--- a/src/core/CLucene/index/IndexWriter.h
+++ b/src/core/CLucene/index/IndexWriter.h
@@ -329,6 +329,8 @@ public:
     void writeFields(lucene::store::Directory* d, std::string segment);
     // merge terms and write files
     void mergeTerms(bool hasProx, IndexVersion indexVersion);
+    // merge norms and write files
+    void mergeNorms(std::vector<uint32_t> dest_index_docs, 
std::vector<std::map<TCHAR, std::vector<uint8_t>>> srcFieldNormsMapValues, 
std::vector<lucene::store::IndexOutput *> normsOutputList);
     // merge null_bitmap
     void mergeNullBitmap(std::vector<std::vector<uint32_t>> srcBitmapValues, 
std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList);
 
diff --git a/src/core/CLucene/index/MultiReader.cpp 
b/src/core/CLucene/index/MultiReader.cpp
index 963169d3eb2..3ac68b4af8c 100644
--- a/src/core/CLucene/index/MultiReader.cpp
+++ b/src/core/CLucene/index/MultiReader.cpp
@@ -275,6 +275,38 @@ int32_t MultiReader::docFreq(const Term* t) {
        return total;
 }
 
+int32_t MultiReader::docNorm(const TCHAR* field, int32_t n) {
+    ensureOpen();
+    if (hasNorms(field)) {
+        int32_t i = readerIndex(n);
+        return (*subReaders)[i]->docNorm(field, n - starts[i]);
+    }
+    return 0;
+};
+
+std::optional<uint64_t> MultiReader::sumTotalTermFreq(const TCHAR* field) {
+    ensureOpen();
+
+    if (hasNorms(field)) {
+        int64_t sum = 0;
+        bool hasTotalNorm = false;
+        for (size_t i = 0; i < subReaders->length; i++) {
+            if(!isDeleted(i)) {
+                std::optional<int64_t> totalNorm = 
(*subReaders)[i]->sumTotalTermFreq(field);
+                if (totalNorm != std::nullopt) {
+                    hasTotalNorm = true;
+                    sum += totalNorm.value();
+                }
+            }
+        }
+        if (hasTotalNorm) {
+            return sum;
+        }
+    }
+
+    return std::nullopt;
+}
+
 TermDocs* MultiReader::termDocs(const void* io_ctx) {
     ensureOpen();
        TermDocs* ret =  _CLNEW MultiTermDocs(subReaders, starts);
diff --git a/src/core/CLucene/index/MultiReader.h 
b/src/core/CLucene/index/MultiReader.h
index aa9c440d72c..362415501c2 100644
--- a/src/core/CLucene/index/MultiReader.h
+++ b/src/core/CLucene/index/MultiReader.h
@@ -100,6 +100,13 @@ public:
 
        //Returns the document frequency of the current term in the set
        int32_t docFreq(const Term* t=NULL);
+
+  // Returns the document norm
+  int32_t docNorm(const TCHAR* field, int32_t n);
+
+  // Returns the total norm of all terms appearing in all documents in this 
field
+  std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field);
+
        TermDocs* termDocs(const void* io_ctx = nullptr);
        TermPositions* termPositions(const void* io_ctx = nullptr);
 
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp 
b/src/core/CLucene/index/MultiSegmentReader.cpp
index e5987023c94..910f7c45c67 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -363,6 +363,34 @@ int32_t MultiSegmentReader::docFreq(const Term* t) {
        return total;
 }
 
+int32_t MultiSegmentReader::docNorm(const TCHAR* field, int32_t n) {
+    if (hasNorms(field)) {
+        int32_t i = readerIndex(n);                           // find segment 
num
+        return (*subReaders)[i]->docNorm(field,n - starts[i]);
+    }
+    return 0;
+}
+
+std::optional<uint64_t> MultiSegmentReader::sumTotalTermFreq(const TCHAR* 
field) {
+    if (hasNorms(field)) {
+        int64_t sum = 0;
+        bool hasTotalNorm = false;
+        for (size_t i = 0; i < subReaders->length; i++) {
+            if (!isDeleted(i)) {
+                std::optional<int64_t> totalNorm = 
(*subReaders)[i]->sumTotalTermFreq(field);
+                if (totalNorm != std::nullopt) {
+                    sum += totalNorm.value();
+                    hasTotalNorm = true;
+                }
+            }
+        }
+        if (hasTotalNorm) {
+            return sum;
+        }
+    }
+    return std::nullopt;
+}
+
 TermDocs* MultiSegmentReader::termDocs(const void* io_ctx) {
     ensureOpen();
        TermDocs* ret =  _CLNEW MultiTermDocs(subReaders, starts);
@@ -573,6 +601,10 @@ void MultiTermDocs::setIoContext(const void* io_ctx) {
        io_ctx_ = io_ctx;
 }
 
+int32_t MultiTermDocs::docNorm() {
+    return current->docNorm();
+}
+
 int32_t MultiTermDocs::doc() const {
   CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was 
called");
   // if not found term, current will return INT_MAX, we could not add base, 
otherwise it will overflow.
@@ -586,11 +618,16 @@ int32_t MultiTermDocs::freq() const {
        return current->freq();
 }
 
-void MultiTermDocs::seek(TermEnum* termEnum){
-       seek(termEnum->term(false));
+int32_t MultiTermDocs::norm() const {
+    CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was 
called");
+    return current->norm();
 }
 
-void MultiTermDocs::seek( Term* tterm) {
+void MultiTermDocs::seek(TermEnum* termEnum, bool load_stats){
+       seek(termEnum->term(false), load_stats);
+}
+
+void MultiTermDocs::seek( Term* tterm, bool load_stats) {
 //Func - Resets the instance for a new search
 //Pre  - tterm != NULL
 //Post - The instance has been reset for a new search
@@ -659,6 +696,28 @@ int32_t MultiTermDocs::read(int32_t* docs, int32_t* freqs, 
int32_t length) {
        }
 }
 
+int32_t MultiTermDocs::read(int32_t* docs, int32_t* freqs, int32_t* norms, 
int32_t length) {
+    while (true) {
+        while (current == NULL) {
+            if (pointer < subReaders->length) {                  // try next 
segment
+                base = starts[pointer];
+                current = termDocs(pointer++);
+            } else {
+                return 0;
+            }
+        }
+        int32_t end = current->read(docs, freqs, norms, length);
+        if (end == 0) {                                  // none left in 
segment
+            current = NULL;
+        } else {                                         // got some
+            int32_t b = base;                    // adjust doc numbers
+            for (int32_t i = 0; i < end; i++)
+                docs[i] += b;
+            return end;
+        }
+    }
+}
+
 bool MultiTermDocs::readRange(DocRange* docRange) {
        while (true) {
                while (current == NULL) {
@@ -741,7 +800,7 @@ TermDocs* MultiTermDocs::termDocs(IndexReader* reader) {
        return reader->termDocs(io_ctx_);
 }
 
-TermDocs* MultiTermDocs::termDocs(const int32_t i) {
+TermDocs* MultiTermDocs::termDocs(const int32_t i, bool local_stats) {
        if (term == NULL)
          return NULL;
        TermDocs* result = (*readerTermDocs)[i];
@@ -750,7 +809,7 @@ TermDocs* MultiTermDocs::termDocs(const int32_t i) {
          readerTermDocs->values[i] = termDocs((*subReaders)[i]);
          result = (*readerTermDocs)[i];
        }
-       result->seek(term);
+       result->seek(term, local_stats);
 
        return result;
 }
diff --git a/src/core/CLucene/index/MultipleTermPositions.cpp 
b/src/core/CLucene/index/MultipleTermPositions.cpp
index e5bfa5ac24a..b5846516f76 100644
--- a/src/core/CLucene/index/MultipleTermPositions.cpp
+++ b/src/core/CLucene/index/MultipleTermPositions.cpp
@@ -14,11 +14,11 @@ CL_NS_USE(util)
 
 CL_NS_DEF(index)
 
-void MultipleTermPositions::seek(Term*) {
+void MultipleTermPositions::seek(Term*, bool) {
        _CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation: 
MultipleTermPositions::seek");
 }
 
-void MultipleTermPositions::seek(TermEnum*) {
+void MultipleTermPositions::seek(TermEnum*, bool) {
        _CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation: 
MultipleTermPositions::seek");
 }
 
@@ -26,6 +26,10 @@ int32_t MultipleTermPositions::read(int32_t*, 
int32_t*,int32_t) {
        _CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation: 
MultipleTermPositions::read");
 }
 
+int32_t MultipleTermPositions::read(int32_t*, int32_t*, int32_t*, int32_t) {
+    _CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation: 
MultipleTermPositions::read");
+}
+
 bool MultipleTermPositions::readRange(DocRange* docRange) {
        _CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation: 
MultipleTermPositions::readRange");
 }
@@ -144,6 +148,7 @@ bool MultipleTermPositions::next() {
 
        _posList->clear();
        _doc = _termPositionsQueue->peek()->doc();
+        _norm = _termPositionsQueue->peek()->norm();
 
        TermPositions* tp;
        do {
@@ -163,7 +168,6 @@ bool MultipleTermPositions::next() {
 
        _posList->sort();
        _freq = _posList->size();
-
        return true;
 }
 
@@ -192,6 +196,10 @@ int32_t MultipleTermPositions::freq() const {
        return _freq;
 }
 
+int32_t MultipleTermPositions::norm() const {
+    return _norm;
+}
+
 void MultipleTermPositions::close() {
        while (_termPositionsQueue->size() > 0) {
                TermPositions* tp = _termPositionsQueue->pop();
diff --git a/src/core/CLucene/index/MultipleTermPositions.h 
b/src/core/CLucene/index/MultipleTermPositions.h
index 67d03615f62..8ef7be1ac56 100644
--- a/src/core/CLucene/index/MultipleTermPositions.h
+++ b/src/core/CLucene/index/MultipleTermPositions.h
@@ -21,8 +21,9 @@ private:
        class IntQueue;
 
        int32_t _doc;
-       int32_t _freq;
-       TermPositionsQueue* _termPositionsQueue;
+        int32_t _freq;
+        int32_t _norm;
+        TermPositionsQueue* _termPositionsQueue;
        IntQueue* _posList;
 
 public:
@@ -44,25 +45,28 @@ public:
 
        int32_t freq() const;
 
+        int32_t norm() const;
+
        void close();
 
        /**
        * Not implemented.
        * @throws UnsupportedOperationException
        */
-       void seek(Term*);
+       void seek(Term*, bool);
 
        /**
        * Not implemented.
        * @throws UnsupportedOperationException
        */
-       void seek(TermEnum*);
+       void seek(TermEnum*, bool);
 
        /**
        * Not implemented.
        * @throws UnsupportedOperationException
        */
        int32_t read(int32_t*, int32_t*,int32_t);
+        int32_t read(int32_t*, int32_t*, int32_t*, int32_t);
        bool readRange(DocRange* docRange) override;
 
        /**
diff --git a/src/core/CLucene/index/SegmentMerger.cpp 
b/src/core/CLucene/index/SegmentMerger.cpp
index cc910b02c88..f0988ed4c9c 100644
--- a/src/core/CLucene/index/SegmentMerger.cpp
+++ b/src/core/CLucene/index/SegmentMerger.cpp
@@ -739,7 +739,7 @@ void SegmentMerger::mergeNorms() {
     for (size_t i = 0; i < fieldInfos->size(); i++) {
       //Get the i-th FieldInfo
       FieldInfo* fi = fieldInfos->fieldInfo(i);
-      //Is this Field indexed?
+      // Is this Field indexed and field need norms ?
       if (fi->isIndexed && !fi->omitNorms){
         //Instantiate  an IndexOutput to that norm file
         if (output == NULL) {
diff --git a/src/core/CLucene/index/SegmentReader.cpp 
b/src/core/CLucene/index/SegmentReader.cpp
index 2257ff9fc7c..374322761ed 100644
--- a/src/core/CLucene/index/SegmentReader.cpp
+++ b/src/core/CLucene/index/SegmentReader.cpp
@@ -4,6 +4,8 @@
 * Distributable under the terms of either the Apache License (Version 2.0) or
 * the GNU Lesser General Public License, as specified in the COPYING file.
 
------------------------------------------------------------------------------*/
+#include <assert.h>
+
 #include "CLucene/_ApiHeader.h"
 #include "CLucene/search/Similarity.h"
 #include "CLucene/store/FSDirectory.h"
@@ -17,7 +19,6 @@
 #include "_SegmentHeader.h"
 #include "_SegmentMerger.h"
 #include "_TermInfosReader.h"
-#include <assert.h>
 
 CL_NS_USE(util)
 CL_NS_USE(store)
@@ -199,8 +200,8 @@ void SegmentReader::initialize(SegmentInfo *si, int32_t 
readBufferSize, bool doO
         if (_fieldInfos->hasProx()) {
             proxStream = cfsDir->openInput((segment + ".prx").c_str(), 
readBufferSize);
         }
-        // we do not need norms, so we don't read it at all.
-        //openNorms(cfsDir, readBufferSize);
+
+        openNorms(cfsDir, readBufferSize);
 
         if (doOpenStores && _fieldInfos->hasVectors()) {// open term vector 
files only as needed
             string vectorsSegment;
@@ -550,6 +551,31 @@ int32_t SegmentReader::docFreq(const Term *t) {
         return 0;
 }
 
+int32_t SegmentReader::docNorm(const TCHAR* field, int32_t doc) {
+    //Func - Returns the norm of document whose id is doc in this field
+    //Pre  - field has norm file
+    //Post - The norm of document whose id is doc in this field has been 
returned, otherwise -1.0f;
+
+    ensureOpen();
+
+    if (hasNorms(field)) {
+        SCOPED_LOCK_MUTEX(THIS_LOCK)
+        uint8_t* field_norms = norms(field);
+        return search::Similarity::decodeNorm(field_norms[doc]);
+    }
+    return 0;
+}
+
+std::optional<uint64_t> SegmentReader::sumTotalTermFreq(const TCHAR* field) {
+    //Func - Returns the sum number of all terms in all docs
+    //Pre  - field has norm file;
+    //Post - The sum number of all terms in all docs has been returned, 
otherwise -1.0f;
+    if (hasNorms(field)) {
+        return sum_total_term_freq[*field];
+    }
+    return std::nullopt;
+}
+
 int32_t SegmentReader::numDocs() {
     //Func - Returns the actual number of documents in the segment
     //Pre  - true
@@ -658,6 +684,53 @@ void SegmentReader::norms(const TCHAR *field, uint8_t 
*bytes) {
     }
 
 
+    {
+        SCOPED_LOCK_MUTEX(norm->THIS_LOCK)
+        if (norm->bytes != NULL) {// can copy from cache
+            memcpy(bytes, norm->bytes, maxDoc());
+            return;
+        }
+
+        // Read from disk.  norm.in may be shared across  multiple norms and
+        // should only be used in a synchronized context.
+        IndexInput *normStream;
+        if (norm->useSingleNormStream) {
+            normStream = singleNormStream;
+        } else {
+            normStream = norm->in;
+        }
+        normStream->seek(norm->normSeek);
+        normStream->readBytes(bytes, maxDoc());
+    }
+}
+uint8_t* SegmentReader::norms(const TCHAR *field) const {
+    CND_PRECONDITION(field != NULL, "field is NULL");
+    Norm *norm = _norms.get(field);
+    if (norm == NULL) {
+        return NULL;
+    }
+    {
+        SCOPED_LOCK_MUTEX(norm->THIS_LOCK)
+        if (norm->bytes == NULL) {// value not yet read
+            uint8_t *bytes = _CL_NEWARRAY(uint8_t, maxDoc());
+            norms(field, bytes);
+            norm->bytes = bytes;// cache it
+            // it's OK to close the underlying IndexInput as we have cached the
+            // norms and will never read them again.
+            norm->close();
+        }
+
+        return norm->bytes;
+    }
+}
+
+void SegmentReader::norms(const TCHAR *field, uint8_t* bytes) const {
+    CND_PRECONDITION(field != NULL, "field is NULL");
+    Norm *norm = _norms.get(field);
+    if (norm == NULL) {
+        return;
+    }
+
     {
         SCOPED_LOCK_MUTEX(norm->THIS_LOCK)
         if (norm->bytes != NULL) {// can copy from cache
@@ -681,14 +754,15 @@ void SegmentReader::norms(const TCHAR *field, uint8_t 
*bytes) {
 uint8_t *SegmentReader::createFakeNorms(int32_t size) {
     uint8_t *ones = _CL_NEWARRAY(uint8_t, size);
     if (size > 0)
-        memset(ones, DefaultSimilarity::encodeNorm(1.0f), size);
+        memset(ones, Similarity::encodeNorm(0), size);
     return ones;
 }
 
 uint8_t *SegmentReader::fakeNorms() {
     if (ones == NULL)
-        // ones = createFakeNorms(maxDoc());
-        ones = createFakeNorms(1);
+        // TODO: this is origin clucene norms
+        ones = createFakeNorms(maxDoc());
+        // ones = createFakeNorms(1);
     return ones;
 }
 // can return NULL if norms aren't stored
@@ -752,12 +826,11 @@ uint8_t *SegmentReader::norms(const TCHAR *field) {
     //       and returned containing the norms for that field. If the named 
field is unknown NULL is returned.
 
     CND_PRECONDITION(field != NULL, "field is NULL");
-    // SCOPED_LOCK_MUTEX(THIS_LOCK)
-    // ensureOpen();
-    // uint8_t *bytes = getNorms(field);
-    // if (bytes == NULL)
-    //     bytes = fakeNorms();
-    uint8_t *bytes = fakeNorms();
+    SCOPED_LOCK_MUTEX(THIS_LOCK)
+    ensureOpen();
+    uint8_t *bytes = getNorms(field);
+    if (bytes == NULL)
+        bytes = fakeNorms();
     return bytes;
 }
 
@@ -830,6 +903,26 @@ void SegmentReader::openNorms(Directory *cfsDir, int32_t 
readBufferSize) {
             }
 
             _norms[fi->name] = _CLNEW Norm(normInput, singleNormFile, 
fi->number, normSeek, this, segment.c_str());
+
+            // read total norm info into cache
+            std::vector<uint8_t> bytes(_maxDoc);
+            IndexInput *normStream;
+            if (_norms[fi->name]->useSingleNormStream) {
+                normStream = singleNormStream;
+            } else {
+                normStream = _norms[fi->name]->in;
+            }
+
+            ensureOpen();
+            SCOPED_LOCK_MUTEX(_norms[fi->name]->THIS_LOCK);
+            normStream->seek(_norms[fi->name]->normSeek);
+            normStream->readBytes(bytes.data(), _maxDoc);
+            uint64_t sum = 0;
+            for (int doc = 0; doc < _maxDoc; doc++) {
+                sum += Similarity::decodeNorm(bytes[doc]);
+            }
+            sum_total_term_freq[*fi->name] = sum;
+
             nextNormSeek += _maxDoc;// increment also if some norms are 
separate
         }
     }
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp 
b/src/core/CLucene/index/SegmentTermDocs.cpp
index 5f5df366f91..35ffc8fd615 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -11,6 +11,7 @@
 #include "CLucene/index/CodeMode.h"
 #include "CLucene/util/PFORUtil.h"
 #include "Term.h"
+#include "CLucene/search/Similarity.h"
 
 #include <assert.h>
 #include <memory>
@@ -19,10 +20,10 @@
 CL_NS_DEF(index)
 
 SegmentTermDocs::SegmentTermDocs(const SegmentReader *_parent) : 
parent(_parent), freqStream(_parent->freqStream->clone()),
-                                                                 count(0), 
df(0), deletedDocs(_parent->deletedDocs), _doc(-1), _freq(0), 
skipInterval(_parent->tis->getSkipInterval()),
+                                                                 count(0), 
df(0), maxDoc(_parent->maxDoc()), deletedDocs(_parent->deletedDocs), _doc(-1), 
_freq(0), skipInterval(_parent->tis->getSkipInterval()),
                                                                  
maxSkipLevels(_parent->tis->getMaxSkipLevels()), skipListReader(NULL), 
freqBasePointer(0), proxBasePointer(0),
                                                                  
skipPointer(0), haveSkipped(false), pointer(0), pointerMax(0), 
indexVersion_(_parent->_fieldInfos->getIndexVersion()),
-                                                                 
hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx, 
indexVersion_, _parent->getCompatibleRead()) {
+                                                                 
hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx, 
indexVersion_, _parent->getCompatibleRead(), maxDoc) {
     CND_CONDITION(_parent != NULL, "Parent is NULL");
     memset(docs,0,PFOR_BLOCK_SIZE*sizeof(int32_t));
     memset(freqs,0,PFOR_BLOCK_SIZE*sizeof(int32_t));
@@ -47,13 +48,23 @@ int32_t SegmentTermDocs::docFreq() {
     return df;
 }
 
-void SegmentTermDocs::seek(Term *term) {
+int32_t SegmentTermDocs::docNorm() {
+    if (_doc < 0 || _doc >= LUCENE_INT32_MAX_SHOULDBE) {
+        return 0;
+    }
+    if (_doc < maxDoc) {
+        return norms[_doc];
+    }
+    return 0;
+}
+
+void SegmentTermDocs::seek(Term *term, bool load_stats) {
     TermInfo *ti = parent->tis->get(term, io_ctx_);
-    seek(ti, term);
+    seek(ti, term, load_stats);
     _CLDELETE(ti);
 }
 
-void SegmentTermDocs::seek(TermEnum *termEnum) {
+void SegmentTermDocs::seek(TermEnum *termEnum, bool load_stats) {
     TermInfo *ti = NULL;
     Term *term = NULL;
 
@@ -68,13 +79,19 @@ void SegmentTermDocs::seek(TermEnum *termEnum) {
         ti = parent->tis->get(term);
     }
 
-    seek(ti, term);
+    seek(ti, term, load_stats);
     _CLDELETE(ti);
 }
-void SegmentTermDocs::seek(const TermInfo *ti, Term *term) {
+void SegmentTermDocs::seek(const TermInfo *ti, Term *term, bool load_stats) {
     count = 0;
     FieldInfo *fi = parent->_fieldInfos->fieldInfo(term->field());
     currentFieldStoresPayloads = (fi != NULL) ? fi->storePayloads : false;
+    buffer_.needLoadStats(load_stats);
+    if (load_stats && fi != NULL && fi->isIndexed && !fi->omitNorms) {
+        const TCHAR *curField = fi->name;
+        norms = parent->norms(curField);
+        buffer_.setAllDocNorms(norms);
+    }
     // hasProx = (fi != nullptr) && fi->hasProx;
     if (ti == NULL) {
         df = 0;
@@ -100,6 +117,9 @@ int32_t SegmentTermDocs::doc() const {
 int32_t SegmentTermDocs::freq() const {
     return _freq;
 }
+int32_t SegmentTermDocs::norm() const {
+    return _norm;
+}
 
 bool SegmentTermDocs::next()  {
     if (count == df) {
@@ -111,6 +131,7 @@ bool SegmentTermDocs::next()  {
     if (hasProx) {
         _freq = buffer_.getFreq();
     }
+    _norm = buffer_.getNorm();
 
     count++;
 
@@ -132,6 +153,7 @@ int32_t SegmentTermDocs::read(int32_t *docs, int32_t 
*freqs, int32_t length) {
             _freq = buffer_.getFreq();
             freqs[i] = _freq;
         }
+        _norm = buffer_.getNorm();
 
         count++;
         i++;
@@ -140,6 +162,31 @@ int32_t SegmentTermDocs::read(int32_t *docs, int32_t 
*freqs, int32_t length) {
     return i;
 }
 
+int32_t SegmentTermDocs::read(int32_t *docs, int32_t *freqs, int32_t *norms, 
int32_t length) {
+    int32_t i = 0;
+
+    if (count == df) {
+        return i;
+    }
+
+    while (i < length && count < df) {
+        _doc = buffer_.getDoc();
+        docs[i] = _doc;
+
+        if (hasProx) {
+            _freq = buffer_.getFreq();
+            freqs[i] = _freq;
+        }
+
+        _norm = buffer_.getNorm();
+        norms[i] = _norm;
+
+        count++;
+        i++;
+    }
+
+    return i;
+}
 bool SegmentTermDocs::readRange(DocRange* docRange) {
     if (count >= df) {
         return false;
@@ -198,7 +245,7 @@ bool SegmentTermDocs::skipTo(const int32_t target) {
 void TermDocsBuffer::refill() {
     cur_doc_ = 0;
     cur_freq_ = 0;
-
+    cur_norm_ = 0;
     if (indexVersion_ >= IndexVersion::kV1) {
         size_ = refillV1();
     } else {
@@ -220,8 +267,26 @@ void TermDocsBuffer::readRange(DocRange* docRange) {
         docRange->freq_many = &freqs_;
         docRange->freq_many_size_ = size;
     }
+
+    if (load_stats_) {
+        docRange->norm_many = &norms_;
+        docRange->norm_many_size_ = size;
+    }
+
+
+}
+
+void TermDocsBuffer::setAllDocNorms(uint8_t* norms) {
+    if(load_stats_ && norms) {
+        all_doc_norms_ = norms;
+    }
+}
+
+void TermDocsBuffer::needLoadStats(bool load_stats) {
+    load_stats_ = load_stats;
 }
 
+
 int32_t TermDocsBuffer::refillV0() {
     if (hasProx_) {
         char mode = freqStream_->readByte();
@@ -252,6 +317,7 @@ int32_t TermDocsBuffer::refillV0() {
                 }
             }
         }
+        refillNorm(arraySize);
         return arraySize;
     } else {
         uint32_t arraySize = freqStream_->readVInt();
@@ -270,12 +336,30 @@ int32_t TermDocsBuffer::refillV0() {
                 P4DEC(buf.data(), arraySize, docs_.data());
             }
         }
+        refillNorm(arraySize);
         return arraySize;
     }
 }
 
 int32_t TermDocsBuffer::refillV1() {
-    return PforUtil::pfor_decode(freqStream_, docs_, freqs_, hasProx_, 
compatibleRead_);
+    auto arraySize = PforUtil::pfor_decode(freqStream_, docs_, freqs_, 
hasProx_, compatibleRead_);
+    refillNorm(arraySize);
+    return arraySize;
 }
 
+void TermDocsBuffer::refillNorm(int32_t size) {
+    if (!load_stats_) {
+        return;
+    }
+
+    for (int i = 0 ;i < size; i++) {
+        auto doc = docs_[i];
+        // avoid doc norms not set
+        if (doc < maxDoc && all_doc_norms_) {
+            norms_[i] = search::Similarity::decodeNorm(all_doc_norms_[doc]);
+        } else {
+            norms_[i] = 0;
+        }
+    }
+}
 CL_NS_END
diff --git a/src/core/CLucene/index/SegmentTermPositions.cpp 
b/src/core/CLucene/index/SegmentTermPositions.cpp
index 7ddb1a2ad18..7a17496b71f 100644
--- a/src/core/CLucene/index/SegmentTermPositions.cpp
+++ b/src/core/CLucene/index/SegmentTermPositions.cpp
@@ -38,8 +38,8 @@ TermPositions* SegmentTermPositions::__asTermPositions(){
     return (TermPositions*) this;
 }
 
-void SegmentTermPositions::seek(const TermInfo* ti, Term* term) {
-    SegmentTermDocs::seek(ti, term);
+void SegmentTermPositions::seek(const TermInfo* ti, Term* term, bool 
local_stats) {
+    SegmentTermDocs::seek(ti, term, local_stats);
     if (ti != NULL)
        lazySkipPointer = ti->proxPointer;
     
@@ -107,6 +107,10 @@ int32_t SegmentTermPositions::read(int32_t* /*docs*/, 
int32_t* /*freqs*/, int32_
     _CLTHROWA(CL_ERR_UnsupportedOperation,"TermPositions does not support 
processing multiple documents in one call. Use TermDocs instead.");
 }
 
+int32_t SegmentTermPositions::read(int32_t* /*docs*/, int32_t* /*freqs*/, 
int32_t*  /*norms*/, int32_t /*length*/) {
+    _CLTHROWA(CL_ERR_UnsupportedOperation,"TermPositions does not support 
processing multiple documents in one call. Use TermDocs instead.");
+}
+
 bool SegmentTermPositions::readRange(DocRange* docRange) {
     return SegmentTermDocs::readRange(docRange);
 }
diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h
index 0af1102874c..771eff51873 100644
--- a/src/core/CLucene/index/Terms.h
+++ b/src/core/CLucene/index/Terms.h
@@ -31,12 +31,12 @@ public:
 
        // Sets this to the data for a term.
        // The enumeration is reset to the start of the data for this term.
-       virtual void seek(Term* term)=0;
+       virtual void seek(Term* term, bool load_stats = false) = 0;
 
        /** Sets this to the data for the current term in a {@link TermEnum}.
        * This may be optimized in some implementations.
        */
-       virtual void seek(TermEnum* termEnum)=0;
+       virtual void seek(TermEnum* termEnum,  bool load_stats = false) = 0;
 
        // Returns the current document number.  <p> This is invalid until 
{@link
        //      #next()} is called for the first time.
@@ -46,6 +46,10 @@ public:
        //      is invalid until {@link #next()} is called for the first time.
        virtual int32_t freq() const=0;
 
+        // Returns the current document norm.  <p> This is invalid until {@link
+        //     #next()} is called for the first time.
+        virtual int32_t norm() const=0;
+
        // Moves to the next pair in the enumeration.  <p> Returns true iff 
there is
        //      such a next pair in the enumeration.
        virtual bool next() =0;
@@ -58,6 +62,7 @@ public:
        // <p>Returns the number of entries read.  Zero is only returned when 
the
        // stream has been exhausted.
        virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t length)=0;
+        virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t* norms, 
int32_t length)=0;
        virtual bool readRange(DocRange* docRange) = 0;
 
        // Skips entries to the first beyond the current whose document number 
is
@@ -88,6 +93,10 @@ public:
        virtual int32_t docFreq() {
                _CLTHROWA(CL_ERR_UnsupportedOperation, "TermDocs::docFreq does 
not support this method.");
        }
+
+        virtual int32_t docNorm() {
+           return 0;
+       }
 };
 
 
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h 
b/src/core/CLucene/index/_MultiSegmentReader.h
index 830315208c2..d3dc7c70486 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -107,6 +107,12 @@ public:
        TermDocs* termDocs(const void* io_ctx = nullptr);
        TermPositions* termPositions(const void* io_ctx = nullptr);
 
+  // Returns the document norm
+  int32_t docNorm(const TCHAR* field, int32_t n);
+
+  // Returns the total norm of all terms that appear in all documents in this field
+  std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field);
+
   void getFieldNames (FieldOption fldOption, StringArrayWithDeletor& retarray);
        static void getFieldNames(FieldOption fldOption, 
StringArrayWithDeletor& retarray, CL_NS(util)::ArrayBase<IndexReader*>* 
subReaders);
 
@@ -146,7 +152,7 @@ protected:
   size_t pointer;
 
   TermDocs* current;              // == segTermDocs[pointer]
-  TermDocs* termDocs(const int32_t i); //< internal use only
+  TermDocs* termDocs(const int32_t i, bool local_stats = false); //< internal 
use only
   virtual TermDocs* termDocs(IndexReader* reader);
   void init(CL_NS(util)::ArrayBase<IndexReader*>* subReaders, const int32_t* 
starts);
 public:
@@ -156,13 +162,15 @@ public:
 
   int32_t doc() const;
   int32_t freq() const;
+  int32_t norm() const;
 
-  void seek(TermEnum* termEnum);
-  void seek(Term* tterm);
+  void seek(TermEnum* termEnum, bool load_stats = false);
+  void seek(Term* tterm, bool load_stats = false);
   bool next();
 
   /** Optimized implementation. */
   int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
+  int32_t read(int32_t* docs, int32_t* freqs, int32_t* norms , int32_t length);
   bool readRange(DocRange* docRange) override;
 
    /* A Possible future optimization could skip entire segments */
@@ -173,6 +181,7 @@ public:
   virtual TermPositions* __asTermPositions();
 
   int32_t docFreq() override;
+  int32_t docNorm() override;
 
   void setIoContext(const void* io_ctx) override;
 
diff --git a/src/core/CLucene/index/_SegmentHeader.h 
b/src/core/CLucene/index/_SegmentHeader.h
index 54e84ad4ffd..e09ed00969e 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -33,9 +33,11 @@ class SegmentReader;
 
 class TermDocsBuffer {
 public:
-  TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx, 
IndexVersion indexVersion, bool compatibleRead)
+  TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx, 
IndexVersion indexVersion, bool compatibleRead, uint32_t maxDoc)
       : docs_(PFOR_BLOCK_SIZE + 3),
         freqs_(PFOR_BLOCK_SIZE + 3),
+        norms_(PFOR_BLOCK_SIZE + 3),
+        maxDoc(maxDoc),
         freqStream_(freqStream),
         hasProx_(hasProx),
         indexVersion_(indexVersion),
@@ -45,9 +47,11 @@ public:
   ~TermDocsBuffer() {
     cur_doc_ = 0;
     cur_freq_ = 0;
+    cur_norm_ = 0;
 
     docs_.clear();
     freqs_.clear();
+    norms_.clear();
 
     freqStream_ = nullptr;
   }
@@ -66,12 +70,29 @@ public:
     return freqs_[cur_freq_++];
   }
 
+  inline int32_t getNorm() {
+      if (cur_norm_ >= size_) {
+          refill();
+      }
+      if(cur_norm_ >= maxDoc) {
+          return 0;
+      }
+      return norms_[cur_norm_++];
+  }
+
   void refill();
   void readRange(DocRange* docRange);
 
+  // Set all document norms; must be called before readRange() or refill()
+  void setAllDocNorms(uint8_t* norms);
+
+  // whether statistics (norms) need to be loaded
+  void needLoadStats(bool load_stats = false);
+
 private:
   int32_t refillV0();
   int32_t refillV1();
+  void refillNorm(int32_t size);
 
 private:
   uint32_t size_ = 0;
@@ -82,8 +103,19 @@ private:
   uint32_t cur_freq_ = 0;
   std::vector<uint32_t> freqs_;
 
+  //cur doc norm
+  uint32_t cur_norm_ = 0;
+  std::vector<uint32_t> norms_;
+
   CL_NS(store)::IndexInput* freqStream_ = nullptr;
 
+  // whether statistic info needs to be loaded
+  bool load_stats_ = false;
+
+  // save all doc norms in this term's field
+  uint32_t maxDoc = 0;
+  uint8_t* all_doc_norms_;
+
   bool hasProx_ = false;
   bool compatibleRead_ = false;
   IndexVersion indexVersion_ = IndexVersion::kV0; 
@@ -151,14 +183,19 @@ protected:
   CL_NS(store)::IndexInput* freqStream;
   int32_t count;
   int32_t df;
+  int32_t maxDoc;
+
   CL_NS(util)::BitSet* deletedDocs;
   int32_t _doc = -1;
   int32_t _freq = 0;
+  int32_t _norm = 0;
+
   int32_t docs[PFOR_BLOCK_SIZE];         // buffered doc numbers
   int32_t freqs[PFOR_BLOCK_SIZE];        // buffered term freqs
   int32_t pointer;
   int32_t pointerMax;
 
+  uint8_t* norms;
 private:
   int32_t skipInterval;
   int32_t maxSkipLevels;
@@ -181,18 +218,22 @@ public:
   SegmentTermDocs( const SegmentReader* Parent);
   virtual ~SegmentTermDocs();
 
-  virtual void seek(Term* term);
-  virtual void seek(TermEnum* termEnum);
-  virtual void seek(const TermInfo* ti,Term* term);
+  virtual void seek(Term* term, bool load_stats = false);
+  virtual void seek(TermEnum* termEnum, bool load_stats = false);
+  virtual void seek(const TermInfo* ti,Term* term, bool load_stats = false);
 
   virtual void close();
   virtual int32_t doc()const;
   virtual int32_t freq()const;
+  virtual int32_t norm()const;
 
   virtual bool next();
 
   /** Optimized implementation. */
   virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
+
+  virtual int32_t read(int32_t* docs, int32_t* freqs,int32_t* norms, int32_t 
length);
+
   bool readRange(DocRange* docRange) override;
 
   /** Optimized implementation. */
@@ -204,6 +245,8 @@ public:
 
   int32_t docFreq() override;
 
+  int32_t docNorm() override;
+
 protected:
   virtual void skippingDoc(){}
   virtual void skipProx(const int64_t /*proxPointer*/, const int32_t 
/*payloadLength*/){}
@@ -242,7 +285,7 @@ public:
   void setIoContext(const void* io_ctx) override;
 
 private:
-  void seek(const TermInfo* ti, Term* term);
+  void seek(const TermInfo* ti, Term* term, bool load_stats = false);
 
 public:
   void close();
@@ -257,6 +300,7 @@ protected:
 public:
   bool next();
   int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
+  int32_t read(int32_t* docs, int32_t* freqs, int32_t* norms, int32_t length);
   bool readRange(DocRange* docRange) override;
 
 protected:
@@ -291,10 +335,11 @@ private:
   virtual TermPositions* __asTermPositions();
 
   //resolve SegmentTermDocs/TermPositions ambiguity
-  void seek(Term* term){ SegmentTermDocs::seek(term); }
-  void seek(TermEnum* termEnum){ SegmentTermDocs::seek(termEnum); }
+  void seek(Term* term, bool load_stats = false){ SegmentTermDocs::seek(term, 
load_stats); }
+  void seek(TermEnum* termEnum, bool load_stats = false){ 
SegmentTermDocs::seek(termEnum, load_stats); }
   int32_t doc() const{ return SegmentTermDocs::doc(); }
   int32_t freq() const{ return SegmentTermDocs::freq(); }
+  int32_t norm() const{ return SegmentTermDocs::norm(); }
   bool skipTo(const int32_t target){ return SegmentTermDocs::skipTo(target); }
 
 private:
@@ -370,6 +415,7 @@ class SegmentReader: public DirectoryIndexReader {
     CL_NS(util)::Deletor::Dummy,
     Norm > NormsType;
   NormsType _norms;
+  std::unordered_map<TCHAR, std::optional<int64_t>> sum_total_term_freq;
 
   uint8_t* ones;
   uint8_t* fakeNorms();
@@ -487,13 +533,18 @@ public:
   ///Returns the number of documents which contain the term t
   int32_t docFreq(const Term* t);
 
+  ///Returns the norm of the document whose id is doc in this field
+  int32_t docNorm(const TCHAR* field, int32_t doc);
+
+  ///Returns the total norm of all terms that appear in all documents in this field
+  std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field);
+
   ///Returns the actual number of documents in the segment
   int32_t numDocs();
   ///Returns the number of  all the documents in the segment including the 
ones that have
   ///been marked deleted
   int32_t maxDoc() const;
 
-
   void setTermInfosIndexDivisor(int32_t indexDivisor);
 
   int32_t getTermInfosIndexDivisor();
@@ -502,6 +553,11 @@ public:
   ///Returns fake norms if norms aren't available
   uint8_t* norms(const TCHAR* field);
 
+  uint8_t* norms(const TCHAR* field) const;
+
+  ///Returns the bytes array that holds the norms of a named field.
+  void norms(const TCHAR* field, uint8_t* bytes) const;
+
   ///Reads the Norms for field from disk
   void norms(const TCHAR* field, uint8_t* bytes);
 
diff --git a/src/core/CLucene/search/IndexSearcher.cpp 
b/src/core/CLucene/search/IndexSearcher.cpp
index f5b313a3b26..e20d6f44239 100644
--- a/src/core/CLucene/search/IndexSearcher.cpp
+++ b/src/core/CLucene/search/IndexSearcher.cpp
@@ -200,6 +200,21 @@ CL_NS_DEF(search)
       return reader->docFreq(term);
   }
 
+  // doc norm
+  int32_t IndexSearcher::docNorm(const TCHAR* field, int32_t doc) const {
+
+      CND_PRECONDITION(reader != NULL, "reader is NULL");
+
+      return reader->docNorm(field, doc);
+  }
+
+  std::optional<uint64_t> IndexSearcher::sumTotalTermFreq(const TCHAR* field) 
const {
+
+      CND_PRECONDITION(reader != NULL, "reader is NULL");
+
+      return reader->sumTotalTermFreq(field);
+  }
+
   _CL_DEPRECATED( doc(i, document) ) CL_NS(document)::Document* 
IndexSearcher::doc(int32_t i){
        CL_NS(document)::Document* ret = _CLNEW CL_NS(document)::Document;
        if (!doc(i,ret) )
diff --git a/src/core/CLucene/search/IndexSearcher.h 
b/src/core/CLucene/search/IndexSearcher.h
index 8f0b2000aff..6969b9ed409 100644
--- a/src/core/CLucene/search/IndexSearcher.h
+++ b/src/core/CLucene/search/IndexSearcher.h
@@ -77,6 +77,10 @@ public:
 
        int32_t docFreq(const CL_NS(index)::Term* term) const;
 
+        int32_t docNorm(const TCHAR* field, int32_t doc) const;
+
+        std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field) const;
+
        bool doc(int32_t i, CL_NS(document)::Document& document);
        bool doc(int32_t i, CL_NS(document)::Document* document);
        _CL_DEPRECATED( doc(i, document) ) CL_NS(document)::Document* 
doc(int32_t i);
diff --git a/src/core/CLucene/search/MultiSearcher.cpp 
b/src/core/CLucene/search/MultiSearcher.cpp
index 0f2a6862706..872179ae7f3 100644
--- a/src/core/CLucene/search/MultiSearcher.cpp
+++ b/src/core/CLucene/search/MultiSearcher.cpp
@@ -5,8 +5,10 @@
 * the GNU Lesser General Public License, as specified in the COPYING file.
 
------------------------------------------------------------------------------*/
 #include "CLucene/_ApiHeader.h"
+#include <optional>
 #include "CLucene/index/IndexReader.h"
 #include "MultiSearcher.h"
+
 #include "SearchHeader.h"
 #include "Query.h"
 #include "_HitQueue.h"
@@ -74,6 +76,30 @@ CL_NS_DEF(search)
     return docFreq;
   }
 
+// doc norm
+int32_t MultiSearcher::docNorm(const TCHAR* field, int32_t n) const {
+
+      CND_PRECONDITION(reader != NULL, "reader is NULL");
+      int32_t i = subSearcher(n);                        // find searcher index
+      return searchables[i]->docNorm(field, n - starts[i]);
+  }
+
+std::optional<uint64_t> MultiSearcher::sumTotalTermFreq(const TCHAR* field) 
const {
+      bool fieldHasNorm = false;
+      int64_t sum = 0;
+      for (int32_t i = 0; i < searchablesLen; ++i) {
+           std::optional<int64_t> norm = 
searchables[i]->sumTotalTermFreq(field);
+          if (norm != std::nullopt) {
+              fieldHasNorm = true;
+              sum += norm.value();
+          }
+      }
+      if (fieldHasNorm) {
+           return sum;
+      }
+      return std::nullopt;
+  }
+
   /** For use by {@link HitCollector} implementations. */
   bool MultiSearcher::doc(int32_t n, Document* d) {
     int32_t i = subSearcher(n);                          // find searcher index
diff --git a/src/core/CLucene/search/MultiSearcher.h 
b/src/core/CLucene/search/MultiSearcher.h
index 17adba01e8d..1815b0fc575 100644
--- a/src/core/CLucene/search/MultiSearcher.h
+++ b/src/core/CLucene/search/MultiSearcher.h
@@ -40,6 +40,10 @@ CL_NS_DEF(search)
 
          int32_t docFreq(const CL_NS(index)::Term* term) const ;
 
+    int32_t docNorm(const TCHAR* field, int32_t n) const;
+
+    std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field) const;
+
       /** For use by {@link HitCollector} implementations. */
          bool doc(int32_t n, CL_NS(document)::Document* document);
 
diff --git a/src/core/CLucene/search/Searchable.h 
b/src/core/CLucene/search/Searchable.h
index cb32a88579e..dabd7d45928 100644
--- a/src/core/CLucene/search/Searchable.h
+++ b/src/core/CLucene/search/Searchable.h
@@ -9,6 +9,7 @@
 
 
 //#include "CLucene/index/IndexReader.h"
+#include <optional>
 CL_CLASS_DEF(index,Term)
 //#include "Filter.h"
 CL_CLASS_DEF(document,Document)
@@ -67,7 +68,12 @@ CL_NS_DEF(search)
       * @see IndexReader#docFreq(Term).
       */
       virtual int32_t docFreq(const CL_NS(index)::Term* term) const = 0;
-
+      /** Expert: Returns the norm of the document whose id is <code>doc</code> in the <code>field</code>.
+      */
+      virtual int32_t docNorm(const TCHAR* field, int32_t doc) const = 0;
+      /** Expert: Returns the total norm of all terms that appear in all documents in this field
+      */
+      virtual std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field) 
const = 0;
       /** Expert: Returns one greater than the largest possible document 
number.
       * Called by search code to compute term weights.
       * @see IndexReader#maxDoc().
diff --git a/src/core/CLucene/search/Similarity.cpp 
b/src/core/CLucene/search/Similarity.cpp
index b78ce677533..922d644ec8f 100644
--- a/src/core/CLucene/search/Similarity.cpp
+++ b/src/core/CLucene/search/Similarity.cpp
@@ -247,4 +247,12 @@ CL_NS_DEF(search)
                return 0.0f;
     return overlap / (float_t)maxOverlap;
   }
+
+  LengthSimilarity::LengthSimilarity(){
+     }
+  LengthSimilarity::~LengthSimilarity(){
+     }
+  float_t LengthSimilarity::lengthNorm(const TCHAR* /*fieldName*/, int32_t 
numTerms) {
+       return numTerms;
+  }
 CL_NS_END
diff --git a/src/core/CLucene/search/Similarity.h 
b/src/core/CLucene/search/Similarity.h
index 388898aba23..74b7a819d06 100644
--- a/src/core/CLucene/search/Similarity.h
+++ b/src/core/CLucene/search/Similarity.h
@@ -275,5 +275,14 @@ public:
   float_t coord(int32_t overlap, int32_t maxOverlap);
 };
 
+/** Expert: Length scoring implementation. */
+class CLUCENE_EXPORT LengthSimilarity: public DefaultSimilarity {
+public:
+    LengthSimilarity();
+    ~LengthSimilarity();
+    /** Implemented as <code>numTerms</code> (the raw term count, unlike DefaultSimilarity's 1/sqrt(numTerms)). */
+    float_t lengthNorm(const TCHAR* fieldName, int32_t numTerms) override;
+};
+
 CL_NS_END
 #endif
diff --git a/src/core/CLucene/search/query/TermIterator.h 
b/src/core/CLucene/search/query/TermIterator.h
index 3eb22a254de..82c5c71027d 100644
--- a/src/core/CLucene/search/query/TermIterator.h
+++ b/src/core/CLucene/search/query/TermIterator.h
@@ -27,6 +27,10 @@ public:
     return termDocs_->freq();
   }
 
+  inline int32_t norm() const {
+      return termDocs_->norm();
+  }
+
   inline int32_t nextDoc() const {
     if (termDocs_->next()) {
       return termDocs_->doc();
@@ -45,6 +49,10 @@ public:
     return termDocs_->docFreq();
   }
 
+  inline int32_t docNorm() const {
+      return termDocs_->docNorm();
+  }
+
   inline bool readRange(DocRange* docRange) const {
     return termDocs_->readRange(docRange);
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to