This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new e15a89a5 [Optimize](search) Optimize multiple terms conjunction query
(#117)
e15a89a5 is described below
commit e15a89a5627b1a2e88ef924bee6e0a3c6c6495a3
Author: zzzxl <[email protected]>
AuthorDate: Mon Sep 4 16:38:32 2023 +0800
[Optimize](search) Optimize multiple terms conjunction query (#117)
---
.../CLucene/analysis/jieba/ChineseTokenizer.cpp | 5 +-
src/core/CLucene/document/Field.h | 7 +
src/core/CLucene/index/CodeMode.h | 3 +-
src/core/CLucene/index/DocRange.h | 7 +-
src/core/CLucene/index/FieldInfos.cpp | 31 +-
src/core/CLucene/index/IndexReader.cpp | 4 +
src/core/CLucene/index/IndexReader.h | 3 +
src/core/CLucene/index/IndexVersion.h | 8 +
src/core/CLucene/index/IndexWriter.cpp | 53 ++--
src/core/CLucene/index/MultiSegmentReader.cpp | 24 +-
src/core/CLucene/index/SDocumentWriter.cpp | 50 ++-
src/core/CLucene/index/SDocumentWriter.h | 1 +
src/core/CLucene/index/SegmentReader.cpp | 5 +
src/core/CLucene/index/SegmentTermDocs.cpp | 348 +++++++++------------
src/core/CLucene/index/Terms.h | 4 +
src/core/CLucene/index/_FieldInfos.h | 8 +
src/core/CLucene/index/_MultiSegmentReader.h | 4 +
src/core/CLucene/index/_SegmentHeader.h | 69 ++++
src/core/CLucene/search/query/DcoIdSetIterator.h | 16 +
src/core/CLucene/search/query/TermIterator.h | 51 +++
src/core/CMakeLists.txt | 2 +
21 files changed, 435 insertions(+), 268 deletions(-)
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
index ef126c97..c72e2451 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -50,8 +50,9 @@ CL_NS(analysis)::Token
*ChineseTokenizer::next(lucene::analysis::Token *token) {
dataLen = tokens_text.size();
}
if (bufferIndex < dataLen) {
- auto token_text = tokens_text[bufferIndex++];
- token->setNoCopy(token_text.data(), 0, token_text.size());
+ std::string& token_text = tokens_text[bufferIndex++];
+ size_t size = std::min(token_text.size(),
static_cast<size_t>(LUCENE_MAX_WORD_LEN));
+ token->setNoCopy(token_text.data(), 0, size);
return token;
}
return nullptr;
diff --git a/src/core/CLucene/document/Field.h
b/src/core/CLucene/document/Field.h
index 970d06ee..23c0ad17 100644
--- a/src/core/CLucene/document/Field.h
+++ b/src/core/CLucene/document/Field.h
@@ -9,6 +9,8 @@
#include "CLucene/util/Array.h"
#include "CLucene/util/Equators.h"
+#include "CLucene/index/IndexVersion.h"
+
/*
Fieldable reading:
https://issues.apache.org/jira/browse/LUCENE-1219?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-
tabpanel&focusedCommentId=12578199#action_12578199
@@ -310,6 +312,9 @@ public:
virtual const char* getObjectName() const;
static const char* getClassName();
+ void setIndexVersion(IndexVersion indexVersion) { indexVersion_ =
indexVersion; }
+ IndexVersion getIndexVersion() { return indexVersion_; }
+
protected:
/**
* Set configs using XOR. This resets all the settings
@@ -327,6 +332,8 @@ protected:
const TCHAR* _name;
uint32_t config;
float_t boost;
+
+ IndexVersion indexVersion_ = IndexVersion::kV1;
};
CL_NS_END
#endif
diff --git a/src/core/CLucene/index/CodeMode.h
b/src/core/CLucene/index/CodeMode.h
index 5ac3718f..3c39e94e 100644
--- a/src/core/CLucene/index/CodeMode.h
+++ b/src/core/CLucene/index/CodeMode.h
@@ -4,7 +4,8 @@ CL_NS_DEF(index)
enum class CodeMode {
kDefault = 0,
- kPfor = 1
+ kPfor = 1,
+ kRange = 2
};
CL_NS_END
\ No newline at end of file
diff --git a/src/core/CLucene/index/DocRange.h
b/src/core/CLucene/index/DocRange.h
index 81b27047..ef7906a2 100644
--- a/src/core/CLucene/index/DocRange.h
+++ b/src/core/CLucene/index/DocRange.h
@@ -15,15 +15,16 @@ enum class DocRangeType {
class DocRange {
public:
- DocRange() : doc_many(PFOR_BLOCK_SIZE + 3), freq_many(PFOR_BLOCK_SIZE + 3) {}
+ DocRange() = default;
+ ~DocRange() = default;
public:
DocRangeType type_ = DocRangeType::kNone;
uint32_t doc_many_size_ = 0;
uint32_t freq_many_size_ = 0;
- std::vector<uint32_t> doc_many;
- std::vector<uint32_t> freq_many;
+ std::vector<uint32_t>* doc_many = nullptr;
+ std::vector<uint32_t>* freq_many = nullptr;
std::pair<uint32_t, uint32_t> doc_range;
};
\ No newline at end of file
diff --git a/src/core/CLucene/index/FieldInfos.cpp
b/src/core/CLucene/index/FieldInfos.cpp
index b3260e6d..00e0c427 100644
--- a/src/core/CLucene/index/FieldInfos.cpp
+++ b/src/core/CLucene/index/FieldInfos.cpp
@@ -103,6 +103,17 @@ bool FieldInfos::hasProx() {
return false;
}
+IndexVersion FieldInfos::getIndexVersion() {
+ int numFields = byNumber.size();
+ for (int i = 0; i < numFields; i++) {
+ FieldInfo* fi = fieldInfo(i);
+ if (fi->indexVersion_ > IndexVersion::kV0) {
+ return fi->indexVersion_;
+ }
+ }
+ return IndexVersion::kV0;
+}
+
void FieldInfos::addIndexed(const TCHAR** names, const bool storeTermVectors,
const bool storePositionWithTermVector,
const bool
storeOffsetWithTermVector) {
size_t i = 0;
@@ -228,8 +239,16 @@ void FieldInfos::write(IndexOutput* output) const{
if (fi->storePayloads) bits |= STORE_PAYLOADS;
if (fi->hasProx) bits |= TERM_FREQ_AND_POSITIONS;
+ if (fi->getIndexVersion() > IndexVersion::kV0) {
+ bits |= 0x80;
+ }
+
output->writeString(fi->name,_tcslen(fi->name));
output->writeByte(bits);
+
+ if (fi->getIndexVersion() > IndexVersion::kV0) {
+
output->writeVInt(static_cast<int32_t>(fi->getIndexVersion()));
+ }
}
}
@@ -239,7 +258,7 @@ void FieldInfos::read(IndexInput* input) {
bool
isIndexed,storeTermVector,storePositionsWithTermVector,storeOffsetWithTermVector,omitNorms,hasProx,storePayloads;
for (int32_t i = 0; i < size; ++i){
TCHAR* name = input->readString(); //we could read name into a
string buffer, but we can't be sure what the maximum field length will be.
- bits = input->readByte();
+ bits = input->readByte();
isIndexed = (bits & IS_INDEXED) != 0;
storeTermVector = (bits & STORE_TERMVECTOR) != 0;
storePositionsWithTermVector = (bits &
STORE_POSITIONS_WITH_TERMVECTOR) != 0;
@@ -247,8 +266,14 @@ void FieldInfos::read(IndexInput* input) {
omitNorms = (bits & OMIT_NORMS) != 0;
storePayloads = (bits & STORE_PAYLOADS) != 0;
hasProx = (bits & TERM_FREQ_AND_POSITIONS) != 0;
-
- addInternal(name, isIndexed, storeTermVector,
storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, hasProx,
storePayloads);
+
+ FieldInfo* fi = addInternal(name, isIndexed, storeTermVector,
storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, hasProx,
storePayloads);
+ if ((bits & 0x80) == 0) {
+ fi->setIndexVersion(IndexVersion::kV0);
+ } else {
+ IndexVersion indexVersion =
(IndexVersion)input->readVInt();
+ fi->setIndexVersion(indexVersion);
+ }
_CLDELETE_CARRAY(name);
}
}
diff --git a/src/core/CLucene/index/IndexReader.cpp
b/src/core/CLucene/index/IndexReader.cpp
index 91ca18ed..0fb2fe59 100644
--- a/src/core/CLucene/index/IndexReader.cpp
+++ b/src/core/CLucene/index/IndexReader.cpp
@@ -529,4 +529,8 @@ CL_NS(store)::Directory* IndexReader::getDirectory() {
_internal->closeCallbacks.put(callback, parameter);
}
+IndexVersion IndexReader::getIndexVersion() {
+ _CLTHROWA(CL_ERR_UnsupportedOperation, "IndexReader This reader does not
support this method.");
+}
+
CL_NS_END
diff --git a/src/core/CLucene/index/IndexReader.h
b/src/core/CLucene/index/IndexReader.h
index 9079025a..defb6e75 100644
--- a/src/core/CLucene/index/IndexReader.h
+++ b/src/core/CLucene/index/IndexReader.h
@@ -11,6 +11,7 @@
#include "CLucene/util/Array.h"
#include "CLucene/util/VoidList.h"
#include "CLucene/LuceneThreads.h"
+#include "CLucene/index/IndexVersion.h"
CL_CLASS_DEF(store,Directory)
CL_CLASS_DEF(store,LuceneLock)
@@ -676,6 +677,8 @@ public:
*/
void addCloseCallback(CloseCallback callback, void* parameter);
+ virtual IndexVersion getIndexVersion();
+
friend class SegmentReader;
friend class MultiReader;
friend class IndexWriter;
diff --git a/src/core/CLucene/index/IndexVersion.h
b/src/core/CLucene/index/IndexVersion.h
new file mode 100644
index 00000000..448b1872
--- /dev/null
+++ b/src/core/CLucene/index/IndexVersion.h
@@ -0,0 +1,8 @@
+#pragma once
+
+enum class IndexVersion {
+ kV0 = 0,
+ kV1 = 1,
+
+ kNone
+};
\ No newline at end of file
diff --git a/src/core/CLucene/index/IndexWriter.cpp
b/src/core/CLucene/index/IndexWriter.cpp
index 24ac2b08..6abd2716 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -280,10 +280,10 @@ void IndexWriter::init(Directory *d, Analyzer *a, const
bool create, const bool
if (analyzer->isSDocOpt()) {
docWriter = _CLNEW SDocumentsWriter<char>(directory, this);
} else {
- docWriter = _CLNEW DocumentsWriter(directory, this);
+ _CLTHROWA(CL_ERR_IllegalArgument, "IndexWriter::init: Only
support SDocumentsWriter");
}
} else {
- docWriter = _CLNEW DocumentsWriter(directory, this);
+ _CLTHROWA(CL_ERR_IllegalArgument, "IndexWriter::init: Only support
SDocumentsWriter");
}
// Default deleter (for backwards compatibility) is
// KeepOnlyLastCommitDeleter:
@@ -1702,14 +1702,13 @@ void IndexWriter::mergeTerms(bool hasProx) {
}
if ((++df % skipInterval) == 0) {
+ freqOut->writeByte((char)CodeMode::kPfor);
+ freqOut->writeVInt(docDeltaBuffer.size());
+ encode(freqOut, docDeltaBuffer, true);
if (hasProx) {
- freqOut->writeByte((char)CodeMode::kPfor);
- freqOut->writeVInt(docDeltaBuffer.size());
- // doc
- encode(freqOut, docDeltaBuffer, true);
- // freq
encode(freqOut, freqBuffer, false);
}
+
skipWriter->setSkipData(lastDoc, false, -1);
skipWriter->bufferSkip(df);
}
@@ -1717,8 +1716,8 @@ void IndexWriter::mergeTerms(bool hasProx) {
assert(destDocId > lastDoc || df == 1);
lastDoc = destDocId;
+ docDeltaBuffer.push_back(destDocId);
if (hasProx) {
- // position
int32_t lastPosition = 0;
for (int32_t i = 0; i < descPositions.size(); i++) {
int32_t position = descPositions[i];
@@ -1726,15 +1725,7 @@ void IndexWriter::mergeTerms(bool hasProx) {
proxOut->writeVInt(delta);
lastPosition = position;
}
-
- docDeltaBuffer.push_back(destDocId);
freqBuffer.push_back(destFreq);
- } else {
- docDeltaBuffer.push_back(destDocId);
- if (docDeltaBuffer.size() == PFOR_BLOCK_SIZE) {
- freqOut->writeVInt(docDeltaBuffer.size());
- encode(freqOut, docDeltaBuffer, true);
- }
}
smi = match[destDoc->srcIdx];
@@ -1775,7 +1766,7 @@ void IndexWriter::mergeTerms(bool hasProx) {
proxPointer = proxPointers[i];
}
- if (hasProx) {
+ {
auto& docDeltaBuffer = docDeltaBuffers[i];
auto& freqBuffer = freqBuffers[i];
@@ -1783,26 +1774,24 @@ void IndexWriter::mergeTerms(bool hasProx) {
freqOutput->writeVInt(docDeltaBuffer.size());
uint32_t lastDoc = 0;
for (int32_t i = 0; i < docDeltaBuffer.size(); i++) {
- uint32_t newDocCode = (docDeltaBuffer[i] - lastDoc) << 1;
- lastDoc = docDeltaBuffer[i];
- uint32_t freq = freqBuffer[i];
- if (1 == freq) {
- freqOutput->writeVInt(newDocCode | 1);
+ uint32_t curDoc = docDeltaBuffer[i];
+ if (hasProx) {
+ uint32_t newDocCode = (docDeltaBuffer[i] - lastDoc) <<
1;
+ lastDoc = curDoc;
+ uint32_t freq = freqBuffer[i];
+ if (1 == freq) {
+ freqOutput->writeVInt(newDocCode | 1);
+ } else {
+ freqOutput->writeVInt(newDocCode);
+ freqOutput->writeVInt(freq);
+ }
} else {
- freqOutput->writeVInt(newDocCode);
- freqOutput->writeVInt(freq);
+ freqOutput->writeVInt(curDoc - lastDoc);
+ lastDoc = curDoc;
}
}
docDeltaBuffer.resize(0);
freqBuffer.resize(0);
- } else {
- freqOutput->writeVInt(docDeltaBuffers[i].size());
- uint32_t lDoc = 0;
- for (auto &docDelta: docDeltaBuffers[i]) {
- freqOutput->writeVInt(docDelta - lDoc);
- lDoc = docDelta;
- }
- docDeltaBuffers[i].resize(0);
}
int64_t skipPointer = skipListWriter->writeSkip(freqOutput);
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp
b/src/core/CLucene/index/MultiSegmentReader.cpp
index 01438080..d4e8c8ea 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -485,6 +485,12 @@ const char* MultiSegmentReader::getObjectName() const{
return getClassName();
}
+IndexVersion MultiSegmentReader::getIndexVersion() {
+ for (size_t i = 0; i < subReaders->length; i++) {
+ return (*subReaders)[i]->getIndexVersion();
+ }
+ _CLTHROWA(CL_ERR_IllegalState, "MultiSegmentReader::getIndexVersion
index open failed.");
+}
@@ -540,6 +546,14 @@ TermPositions* MultiTermDocs::__asTermPositions(){
return NULL;
}
+int32_t MultiTermDocs::docFreq() {
+ int32_t docFreq = 0;
+ for (size_t i = 0; i < readerTermDocs->length; i++) {
+ docFreq += readerTermDocs->values[i]->docFreq();
+ }
+ return docFreq;
+}
+
int32_t MultiTermDocs::doc() const {
CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was
called");
return base + current->doc();
@@ -577,6 +591,12 @@ void MultiTermDocs::seek( Term* tterm) {
base = 0;
pointer = 0;
current = NULL;
+
+ for (int32_t i = 0; i < readerTermDocs->length; i++) {
+ termDocs(i);
+ }
+ base = starts[pointer];
+ current = termDocs(pointer++);
}
bool MultiTermDocs::next() {
@@ -628,8 +648,8 @@ bool MultiTermDocs::readRange(DocRange* docRange) {
current = nullptr;
} else {
if (docRange->type_ == DocRangeType::kMany) {
- auto begin = docRange->doc_many.begin();
- auto end = docRange->doc_many.begin() +
docRange->doc_many_size_;
+ auto begin = docRange->doc_many->begin();
+ auto end = docRange->doc_many->begin() +
docRange->doc_many_size_;
std::transform(begin, end, begin,
[this](int32_t val) { return val + base; });
} else if (docRange->type_ == DocRangeType::kRange) {
docRange->doc_range.first += base;
diff --git a/src/core/CLucene/index/SDocumentWriter.cpp
b/src/core/CLucene/index/SDocumentWriter.cpp
index 5d6de917..c76b31fb 100644
--- a/src/core/CLucene/index/SDocumentWriter.cpp
+++ b/src/core/CLucene/index/SDocumentWriter.cpp
@@ -166,6 +166,7 @@ void SDocumentsWriter<T>::ThreadState::init(Document *doc,
int32_t doc_id) {
FieldInfo *fi = _parent->fieldInfos->add(field->name(),
field->isIndexed(), field->isTermVectorStored(),
field->isStorePositionWithTermVector(), field->isStoreOffsetWithTermVector(),
field->getOmitNorms(),
!field->getOmitTermFreqAndPositions(), false);
+ fi->setIndexVersion(field->getIndexVersion());
if (fi->isIndexed && !fi->omitNorms) {
// Maybe grow our buffered norms
if (_parent->norms.length <= fi->number) {
@@ -240,6 +241,7 @@ void SDocumentsWriter<T>::ThreadState::init(Document *doc,
int32_t doc_id) {
fp->docFields.values[fp->fieldCount++] = field;
}
_parent->hasProx_ = _parent->fieldInfos->hasProx();
+ _parent->indexVersion_ = _parent->fieldInfos->getIndexVersion();
}
template<typename T>
@@ -1172,12 +1174,10 @@ void
SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa
while (numToMerge > 0) {
if ((++df % skipInterval) == 0) {
+ freqOut->writeByte((char)CodeMode::kPfor);
+ freqOut->writeVInt(docDeltaBuffer.size());
+ encode(freqOut, docDeltaBuffer, true);
if (hasProx_) {
- freqOut->writeByte((char)CodeMode::kPfor);
- freqOut->writeVInt(docDeltaBuffer.size());
- // doc
- encode(freqOut, docDeltaBuffer, true);
- // freq
encode(freqOut, freqBuffer, false);
}
@@ -1205,22 +1205,14 @@ void
SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa
// changing the format to match Lucene's segment
// format.
+ docDeltaBuffer.push_back(doc);
if (hasProx_) {
- // position
for (int32_t j = 0; j < termDocFreq; j++) {
const int32_t code = prox.readVInt();
assert(0 == (code & 1));
proxOut->writeVInt(code >> 1);
}
-
- docDeltaBuffer.push_back(doc);
freqBuffer.push_back(termDocFreq);
- } else {
- docDeltaBuffer.push_back(doc);
- if (docDeltaBuffer.size() == PFOR_BLOCK_SIZE) {
- freqOut->writeVInt(docDeltaBuffer.size());
- encode(freqOut, docDeltaBuffer, true);
- }
}
if (!minState->nextDoc()) {
@@ -1253,13 +1245,14 @@ void
SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa
// Done merging this term
{
- if (hasProx_) {
- freqOut->writeByte((char)CodeMode::kDefault);
- freqOut->writeVInt(docDeltaBuffer.size());
- uint32_t lastDoc = 0;
- for (int32_t i = 0; i < docDeltaBuffer.size(); i++) {
- uint32_t newDocCode = (docDeltaBuffer[i] - lastDoc) << 1;
- lastDoc = docDeltaBuffer[i];
+ freqOut->writeByte((char)CodeMode::kDefault);
+ freqOut->writeVInt(docDeltaBuffer.size());
+ uint32_t lastDoc = 0;
+ for (int32_t i = 0; i < docDeltaBuffer.size(); i++) {
+ uint32_t curDoc = docDeltaBuffer[i];
+ if (hasProx_) {
+ uint32_t newDocCode = (curDoc - lastDoc) << 1;
+ lastDoc = curDoc;
uint32_t freq = freqBuffer[i];
if (1 == freq) {
freqOut->writeVInt(newDocCode | 1);
@@ -1267,18 +1260,13 @@ void
SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa
freqOut->writeVInt(newDocCode);
freqOut->writeVInt(freq);
}
+ } else {
+ freqOut->writeVInt(curDoc - lastDoc);
+ lastDoc = curDoc;
}
- docDeltaBuffer.resize(0);
- freqBuffer.resize(0);
- } else {
- freqOut->writeVInt(docDeltaBuffer.size());
- uint32_t lDoc = 0;
- for (auto& docDelta : docDeltaBuffer) {
- freqOut->writeVInt(docDelta - lDoc);
- lDoc = docDelta;
- }
- docDeltaBuffer.resize(0);
}
+ docDeltaBuffer.resize(0);
+ freqBuffer.resize(0);
}
int64_t skipPointer = skipListWriter->writeSkip(freqOut);
diff --git a/src/core/CLucene/index/SDocumentWriter.h
b/src/core/CLucene/index/SDocumentWriter.h
index 5163c2a1..7e250be3 100644
--- a/src/core/CLucene/index/SDocumentWriter.h
+++ b/src/core/CLucene/index/SDocumentWriter.h
@@ -54,6 +54,7 @@ private:
std::ostream* infoStream{};
int64_t ramBufferSize;
bool hasProx_ = false;
+ IndexVersion indexVersion_ = IndexVersion::kV1;
public:
class FieldMergeState;
diff --git a/src/core/CLucene/index/SegmentReader.cpp
b/src/core/CLucene/index/SegmentReader.cpp
index 0b53de82..e0ec0c14 100644
--- a/src/core/CLucene/index/SegmentReader.cpp
+++ b/src/core/CLucene/index/SegmentReader.cpp
@@ -1123,4 +1123,9 @@ bool SegmentReader::normsClosed() {
}
return true;
}
+
+IndexVersion SegmentReader::getIndexVersion() {
+ return _fieldInfos->getIndexVersion();
+}
+
CL_NS_END
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp
b/src/core/CLucene/index/SegmentTermDocs.cpp
index 765f6424..9108f1df 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -21,7 +21,8 @@ CL_NS_DEF(index)
SegmentTermDocs::SegmentTermDocs(const SegmentReader *_parent) :
parent(_parent), freqStream(_parent->freqStream->clone()),
count(0),
df(0), deletedDocs(_parent->deletedDocs), _doc(0), _freq(0),
skipInterval(_parent->tis->getSkipInterval()),
maxSkipLevels(_parent->tis->getMaxSkipLevels()), skipListReader(NULL),
freqBasePointer(0), proxBasePointer(0),
-
skipPointer(0), haveSkipped(false), pointer(0), pointerMax(0) {
+
skipPointer(0), haveSkipped(false), pointer(0), pointerMax(0),
indexVersion_(_parent->_fieldInfos->getIndexVersion()),
+
hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx,
indexVersion_) {
CND_CONDITION(_parent != NULL, "Parent is NULL");
memset(docs,0,PFOR_BLOCK_SIZE*sizeof(int32_t));
memset(freqs,0,PFOR_BLOCK_SIZE*sizeof(int32_t));
@@ -35,6 +36,10 @@ TermPositions *SegmentTermDocs::__asTermPositions() {
return NULL;
}
+int32_t SegmentTermDocs::docFreq() {
+ return df;
+}
+
void SegmentTermDocs::seek(Term *term) {
TermInfo *ti = parent->tis->get(term);
seek(ti, term);
@@ -63,7 +68,7 @@ void SegmentTermDocs::seek(const TermInfo *ti, Term *term) {
count = 0;
FieldInfo *fi = parent->_fieldInfos->fieldInfo(term->field());
currentFieldStoresPayloads = (fi != NULL) ? fi->storePayloads : false;
- hasProx = (fi != nullptr) && fi->hasProx;
+ // hasProx = (fi != nullptr) && fi->hasProx;
if (ti == NULL) {
df = 0;
} else {// punt case
@@ -90,141 +95,39 @@ int32_t SegmentTermDocs::freq() const {
}
bool SegmentTermDocs::next() {
- pointer++;
- if (pointer >= pointerMax) {
- pointerMax = SegmentTermDocs::read(docs, freqs, PFOR_BLOCK_SIZE);
// refill buffer
- if (pointerMax != 0) {
- pointer = 0;
- } else {
- // NOTE: do not close here, try to close by upper caller or
destruct.
- //SegmentTermDocs::close(); // close
stream
- _doc = LUCENE_INT32_MAX_SHOULDBE; // set to sentinel
value
- return false;
- }
+ if (count == df) {
+ _doc = LUCENE_INT32_MAX_SHOULDBE;
+ return false;
}
- _doc = docs[pointer];
- _freq = freqs[pointer];
- return true;
-}
-/*bool SegmentTermDocs::next() {
- while (true) {
- if (count == df)
- return false;
+ _doc = buffer_.getDoc();
+ if (hasProx) {
+ _freq = buffer_.getFreq();
+ }
- uint32_t docCode = freqStream->readVInt();
- _doc += docCode >> 1; //unsigned shift
- if ((docCode & 1) != 0)// if low bit is set
- _freq = 1; // _freq is one
- else
- _freq = freqStream->readVInt();// else read _freq
- count++;
+ count++;
- if ((deletedDocs == NULL) || (_doc >= 0 && deletedDocs->get(_doc) ==
false))
- break;
- skippingDoc();
- }
return true;
-}*/
+}
int32_t SegmentTermDocs::read(int32_t *docs, int32_t *freqs, int32_t length) {
int32_t i = 0;
+
+ if (count == df) {
+ return i;
+ }
- if (hasProx) {
- if (count == df) {
- return i;
- }
-
- char mode = freqStream->readByte();
-
- uint32_t arraySize = freqStream->readVInt();
- std::vector<uint32_t> _docs(arraySize);
- std::vector<uint32_t> _freqs(arraySize);
-
- if (mode == (char)CodeMode::kDefault) {
- uint32_t docDelta = 0;
- for (uint32_t i = 0; i < arraySize; i++) {
- uint32_t docCode = freqStream->readVInt();
- docDelta += (docCode >> 1);
- _docs[i] = docDelta;
- if ((docCode & 1) != 0) {
- _freqs[i] = 1;
- } else {
- _freqs[i] = freqStream->readVInt();
- }
- }
- } else {
- // NOTE: Pad arraySize from 511 to 512 for alignment since the
first block size is 511, and add one more extra space to prevent overflow.
- auto paddingSize = (arraySize / PFOR_BLOCK_SIZE) * PFOR_BLOCK_SIZE
+ PFOR_BLOCK_SIZE;
- _docs.resize(paddingSize + 1);
- _freqs.resize(paddingSize + 1);
- {
- uint32_t SerializedSize = freqStream->readVInt();
- std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
- freqStream->readBytes(buf.data(), SerializedSize);
- P4DEC(buf.data(), arraySize, _docs.data());
- }
- {
- uint32_t SerializedSize = freqStream->readVInt();
- std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
- freqStream->readBytes(buf.data(), SerializedSize);
- P4NZDEC(buf.data(), arraySize, _freqs.data());
- }
- }
+ while (i < length && count < df) {
+ _doc = buffer_.getDoc();
+ docs[i] = _doc;
- while (i < arraySize && count < df) {
- _doc = _docs[i];
- _freq = _freqs[i];
- count++;
- if (deletedDocs == NULL || (_doc >= 0 && !deletedDocs->get(_doc)))
{
- docs[i] = _doc;
- freqs[i] = _freq;
- i++;
- }
+ if (hasProx) {
+ _freq = buffer_.getFreq();
+ freqs[i] = _freq;
}
- } else {
- //todo: one optimization would be to get the pointer buffer for ram or
mmap dirs
- //and iterate over them instead of using readByte() intensive
functions.
- if (i < length && count < df) {
- uint32_t arraySize = freqStream->readVInt();
- if (arraySize < PFOR_BLOCK_SIZE) {
- int32_t _docDelta{0};
- while (i < length && count < df && i < arraySize) {
- // manually inlined call to next() for speed
- uint32_t docCode = freqStream->readVInt();
- _docDelta += docCode;
- _doc = _docDelta;
- _freq = 1; // _freq is one
- count++;
-
- if (deletedDocs == NULL || (_doc >= 0 &&
!deletedDocs->get(_doc))) {
- docs[i] = _doc;
- freqs[i] = _freq;
- i++;
- }
- }
- } else {
- // need one more space for decode, otherwise will buffer
overflow when decoding
- std::vector<uint32_t> arr(arraySize + 1);
- uint32_t serializedSize = freqStream->readVInt();
- // need more space for decode, otherwise will buffer overflow
when decoding
- std::vector<uint8_t> buf(serializedSize + PFOR_BLOCK_SIZE);
- freqStream->readBytes(buf.data(), serializedSize);
- P4DEC(buf.data(), arraySize, arr.data());
-
- while (i < length && count < df && i < arraySize) {
- _doc = arr[i];
- _freq = 1;// _freq is one
- if (deletedDocs == NULL || (_doc >= 0 &&
!deletedDocs->get(_doc))) {
- docs[i] = _doc;
- freqs[i] = _freq;
- i++;
- }
- count++;
- }
- }
- }
+ count++;
+ i++;
}
return i;
@@ -235,75 +138,17 @@ bool SegmentTermDocs::readRange(DocRange* docRange) {
return false;
}
- if (hasProx) {
- char mode = freqStream->readByte();
- uint32_t arraySize = freqStream->readVInt();
- if (mode == (char)CodeMode::kDefault) {
- uint32_t docDelta = 0;
- for (uint32_t i = 0; i < arraySize; i++) {
- uint32_t docCode = freqStream->readVInt();
- docDelta += (docCode >> 1);
- docRange->doc_many[i] = docDelta;
- if ((docCode & 1) != 0) {
- docRange->freq_many[i] = 1;
- } else {
- docRange->freq_many[i] = freqStream->readVInt();
- }
- }
- docRange->type_ = DocRangeType::kMany;
- docRange->doc_many_size_ = arraySize;
- docRange->freq_many_size_ = arraySize;
- } else {
- {
- uint32_t SerializedSize = freqStream->readVInt();
- std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
- freqStream->readBytes(buf.data(), SerializedSize);
- P4DEC(buf.data(), arraySize, docRange->doc_many.data());
- }
- {
- uint32_t SerializedSize = freqStream->readVInt();
- std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
- freqStream->readBytes(buf.data(), SerializedSize);
- P4NZDEC(buf.data(), arraySize, docRange->freq_many.data());
- }
- docRange->type_ = DocRangeType::kMany;
- docRange->doc_many_size_ = arraySize;
- docRange->freq_many_size_ = arraySize;
- }
- count += arraySize;
- } else {
- uint32_t arraySize = freqStream->readVInt();
- if (arraySize < PFOR_BLOCK_SIZE) {
- uint32_t docDelta = 0;
- for (uint32_t i = 0; i < arraySize; i++) {
- uint32_t docCode = freqStream->readVInt();
- docDelta += docCode;
- docRange->doc_many[i] = docDelta;
- }
- docRange->type_ = DocRangeType::kMany;
- docRange->doc_many_size_ = arraySize;
- } else {
- {
- uint32_t serializedSize = freqStream->readVInt();
- std::vector<uint8_t> buf(serializedSize + PFOR_BLOCK_SIZE);
- freqStream->readBytes(buf.data(), serializedSize);
- P4DEC(buf.data(), arraySize, docRange->doc_many.data());
- }
- docRange->type_ = DocRangeType::kMany;
- docRange->doc_many_size_ = arraySize;
- }
- count += arraySize;
- }
+ buffer_.readRange(docRange);
- if (docRange->type_ == DocRangeType::kMany) {
- if (docRange->doc_many_size_ > 0) {
- uint32_t start = docRange->doc_many[0];
- uint32_t end = docRange->doc_many[docRange->doc_many_size_ - 1];
- if ((end - start) == docRange->doc_many_size_ - 1) {
- docRange->doc_range.first = start;
- docRange->doc_range.second = start + docRange->doc_many_size_;
- docRange->type_ = DocRangeType::kRange;
- }
+ count += docRange->doc_many_size_;
+
+ if (docRange->doc_many_size_ > 0) {
+ uint32_t start = (*docRange->doc_many)[0];
+ uint32_t end = (*docRange->doc_many)[docRange->doc_many_size_ - 1];
+ if ((end - start) == docRange->doc_many_size_ - 1) {
+ docRange->doc_range.first = start;
+ docRange->doc_range.second = start + docRange->doc_many_size_;
+ docRange->type_ = DocRangeType::kRange;
}
}
@@ -329,7 +174,7 @@ bool SegmentTermDocs::skipTo(const int32_t target) {
_doc = skipListReader->getDoc();
count = newCount;
- pointer = pointerMax;
+ buffer_.refill();
}
}
@@ -341,5 +186,120 @@ bool SegmentTermDocs::skipTo(const int32_t target) {
return true;
}
+void TermDocsBuffer::refill() {
+ cur_doc_ = 0;
+ cur_freq_ = 0;
+
+ if (indexVersion_ == IndexVersion::kV1) {
+ size_ = refillV1();
+ } else {
+ size_ = refillV0();
+ }
+}
+
+void TermDocsBuffer::readRange(DocRange* docRange) {
+ int32_t size = 0;
+ if (indexVersion_ == IndexVersion::kV1) {
+ size = refillV1();
+ } else {
+ size = refillV0();
+ }
+ docRange->type_ = DocRangeType::kMany;
+ docRange->doc_many = &docs_;
+ docRange->doc_many_size_ = size;
+ if (hasProx_) {
+ docRange->freq_many = &freqs_;
+ docRange->freq_many_size_ = size;
+ }
+}
+
+int32_t TermDocsBuffer::refillV0() {
+ if (hasProx_) {
+ char mode = freqStream_->readByte();
+ uint32_t arraySize = freqStream_->readVInt();
+ if (mode == (char)CodeMode::kPfor) {
+ {
+ uint32_t SerializedSize = freqStream_->readVInt();
+ std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+ freqStream_->readBytes(buf.data(), SerializedSize);
+ P4DEC(buf.data(), arraySize, docs_.data());
+ }
+ {
+ uint32_t SerializedSize = freqStream_->readVInt();
+ std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+ freqStream_->readBytes(buf.data(), SerializedSize);
+ P4NZDEC(buf.data(), arraySize, freqs_.data());
+ }
+ } else if (mode == (char)CodeMode::kDefault) {
+ uint32_t docDelta = 0;
+ for (uint32_t i = 0; i < arraySize; i++) {
+ uint32_t docCode = freqStream_->readVInt();
+ docDelta += (docCode >> 1);
+ docs_[i] = docDelta;
+ if ((docCode & 1) != 0) {
+ freqs_[i] = 1;
+ } else {
+ freqs_[i] = freqStream_->readVInt();
+ }
+ }
+ }
+ return arraySize;
+ } else {
+ uint32_t arraySize = freqStream_->readVInt();
+ if (arraySize < PFOR_BLOCK_SIZE) {
+ uint32_t docDelta = 0;
+ for (uint32_t i = 0; i < arraySize; i++) {
+ uint32_t docCode = freqStream_->readVInt();
+ docDelta += docCode;
+ docs_[i] = docDelta;
+ }
+ } else {
+ {
+ uint32_t serializedSize = freqStream_->readVInt();
+ std::vector<uint8_t> buf(serializedSize + PFOR_BLOCK_SIZE);
+ freqStream_->readBytes(buf.data(), serializedSize);
+ P4DEC(buf.data(), arraySize, docs_.data());
+ }
+ }
+ return arraySize;
+ }
+}
+
+int32_t TermDocsBuffer::refillV1() {
+ char mode = freqStream_->readByte();
+ uint32_t arraySize = freqStream_->readVInt();
+ if (mode == (char)CodeMode::kPfor) {
+ {
+ uint32_t SerializedSize = freqStream_->readVInt();
+ std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+ freqStream_->readBytes(buf.data(), SerializedSize);
+ P4DEC(buf.data(), arraySize, docs_.data());
+ }
+ if (hasProx_) {
+ uint32_t SerializedSize = freqStream_->readVInt();
+ std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+ freqStream_->readBytes(buf.data(), SerializedSize);
+ P4NZDEC(buf.data(), arraySize, freqs_.data());
+ }
+ } else if (mode == (char)CodeMode::kDefault) {
+ uint32_t docDelta = 0;
+ for (uint32_t i = 0; i < arraySize; i++) {
+ uint32_t docCode = freqStream_->readVInt();
+ if (hasProx_) {
+ docDelta += (docCode >> 1);
+ docs_[i] = docDelta;
+ if ((docCode & 1) != 0) {
+ freqs_[i] = 1;
+ } else {
+ freqs_[i] = freqStream_->readVInt();
+ }
+ } else {
+ docDelta += docCode;
+ docs_[i] = docDelta;
+ }
+ }
+ }
+ return arraySize;
+}
CL_NS_END
diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h
index 4d701ca7..620105fd 100644
--- a/src/core/CLucene/index/Terms.h
+++ b/src/core/CLucene/index/Terms.h
@@ -82,6 +82,10 @@ public:
* No dynamic casting is required and no RTTI data is needed to do this
*/
virtual TermPositions* __asTermPositions()=0;
+
+ virtual int32_t docFreq() {
+ _CLTHROWA(CL_ERR_UnsupportedOperation, "TermDocs::docFreq does
not support this method.");
+ }
};
diff --git a/src/core/CLucene/index/_FieldInfos.h
b/src/core/CLucene/index/_FieldInfos.h
index 414dcc90..ed142c44 100644
--- a/src/core/CLucene/index/_FieldInfos.h
+++ b/src/core/CLucene/index/_FieldInfos.h
@@ -8,6 +8,8 @@
#define _lucene_index_FieldInfos_
#include "CLucene/store/Directory.h"
+#include "CLucene/index/IndexVersion.h"
+
CL_CLASS_DEF(document,Document)
CL_CLASS_DEF(document,Field)
@@ -33,6 +35,7 @@ class FieldInfo :LUCENE_BASE{
bool omitNorms; // omit norms associated with indexed fields
bool hasProx = false;
+ IndexVersion indexVersion_ = IndexVersion::kV1;
bool storePayloads; // whether this field stores payloads together with
term positions
@@ -67,6 +70,10 @@ class FieldInfo :LUCENE_BASE{
* @memory - caller is responsible for deleting the returned object
*/
FieldInfo* clone();
+
+public:
+ void setIndexVersion(IndexVersion indexVersion) { indexVersion_ =
indexVersion; }
+ IndexVersion getIndexVersion() { return indexVersion_; }
};
/** Access to the Field Info file that describes document fields and whether or
@@ -126,6 +133,7 @@ public:
void addIndexed(const TCHAR** names, const bool storeTermVectors, const
bool storePositionWithTermVector, const bool storeOffsetWithTermVector);
bool hasProx();
+ IndexVersion getIndexVersion();
/**
* Assumes the fields are not storing term vectors.
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h
b/src/core/CLucene/index/_MultiSegmentReader.h
index c8460273..46b32caa 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -119,6 +119,8 @@ public:
static const char* getClassName();
const char* getObjectName() const;
+
+ IndexVersion getIndexVersion() override;
};
@@ -159,6 +161,8 @@ public:
void close();
virtual TermPositions* __asTermPositions();
+
+ int32_t docFreq() override;
};
diff --git a/src/core/CLucene/index/_SegmentHeader.h b/src/core/CLucene/index/_SegmentHeader.h
index 8cbb8959..e177f576 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -25,10 +25,67 @@
#include "DirectoryIndexReader.h"
#include "_SkipListReader.h"
#include "CLucene/util/_ThreadLocal.h"
+#include "CLucene/index/IndexVersion.h"
CL_NS_DEF(index)
class SegmentReader;
+/**
+ * Block buffer of doc values and frequencies handed out one at a time.
+ *
+ * Both arrays are allocated once with PFOR_BLOCK_SIZE + 3 slots. getDoc() and
+ * getFreq() each keep their own cursor and call refill() when that cursor has
+ * reached size_. The freq stream pointer is borrowed: this class never deletes it.
+ */
+class TermDocsBuffer {
+public:
+    /**
+     * @param freqStream   stream the buffer is bound to (not owned)
+     * @param hasProx      stored as-is for the refill/readRange implementations
+     * @param indexVersion stored as-is; presumably selects refillV0 vs refillV1 - TODO confirm
+     */
+    TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx, IndexVersion indexVersion)
+            : docs_(PFOR_BLOCK_SIZE + 3),
+              freqs_(PFOR_BLOCK_SIZE + 3),
+              freqStream_(freqStream),
+              hasProx_(hasProx),
+              indexVersion_(indexVersion) {}
+
+    /** Resets cursors, empties both arrays and drops the borrowed stream pointer. */
+    ~TermDocsBuffer() {
+        freqStream_ = nullptr;
+
+        freqs_.clear();
+        docs_.clear();
+
+        cur_freq_ = 0;
+        cur_doc_ = 0;
+    }
+
+    /** Returns the entry under the doc cursor and advances it, refilling first when the cursor is at or past size_. */
+    int32_t getDoc() {
+        if (size_ <= cur_doc_) {
+            refill();
+        }
+        const uint32_t slot = cur_doc_++;
+        return docs_[slot];
+    }
+
+    /** Returns the entry under the freq cursor and advances it, refilling first when the cursor is at or past size_. */
+    int32_t getFreq() {
+        if (size_ <= cur_freq_) {
+            refill();
+        }
+        const uint32_t slot = cur_freq_++;
+        return freqs_[slot];
+    }
+
+    // Defined out of line; presumably reloads docs_/freqs_ and resets the cursors - TODO confirm.
+    void refill();
+    // Defined out of line.
+    void readRange(DocRange* docRange);
+
+private:
+    // Version-specific refill helpers, defined out of line.
+    int32_t refillV0();
+    int32_t refillV1();
+
+    // Upper bound both cursors are compared against before a refill.
+    uint32_t size_ = 0;
+
+    uint32_t cur_doc_ = 0;
+    std::vector<uint32_t> docs_;
+
+    uint32_t cur_freq_ = 0;
+    std::vector<uint32_t> freqs_;
+
+    // Borrowed; only ever nulled, never deleted, by this class.
+    CL_NS(store)::IndexInput* freqStream_ = nullptr;
+
+    bool hasProx_ = false;
+    IndexVersion indexVersion_ = IndexVersion::kV0;
+};
+
class SegmentTermDocs:public virtual TermDocs {
protected:
const SegmentReader* parent;
@@ -57,6 +114,7 @@ private:
protected:
bool currentFieldStoresPayloads;
bool hasProx = false;
+ IndexVersion indexVersion_ = IndexVersion::kV0;
public:
///\param Parent must be a segment reader
@@ -82,9 +140,18 @@ public:
virtual TermPositions* __asTermPositions();
+ int32_t docFreq() override;
+
protected:
virtual void skippingDoc(){}
virtual void skipProx(const int64_t /*proxPointer*/, const int32_t
/*payloadLength*/){}
+
+private:
+ bool readRangeV0(DocRange* docRange);
+ bool readRangeV1(DocRange* docRange);
+
+private:
+ TermDocsBuffer buffer_;
};
@@ -439,6 +506,8 @@ private:
void startCommit();
void rollbackCommit();
+ IndexVersion getIndexVersion() override;
+
//allow various classes to access the internals of this. this allows us to have
//a more tight idea of the package
friend class IndexReader;
diff --git a/src/core/CLucene/search/query/DcoIdSetIterator.h b/src/core/CLucene/search/query/DcoIdSetIterator.h
new file mode 100644
index 00000000..88aa4313
--- /dev/null
+++ b/src/core/CLucene/search/query/DcoIdSetIterator.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "CLucene/index/DocRange.h"
+
+/**
+ * Abstract cursor over a sequence of document ids.
+ *
+ * The precise contract of each method is set by the implementations. The
+ * TermIterator adapter in TermIterator.h, for example, returns INT_MAX from
+ * nextDoc() and advance() once its underlying TermDocs has no more entries.
+ */
+class DocIdSetIterator {
+public:
+    DocIdSetIterator() = default;
+    virtual ~DocIdSetIterator() = default;
+
+    /** Id of the document the iterator is currently positioned on. */
+    virtual int32_t docID() = 0;
+    /** Moves to the following document and returns its id. */
+    virtual int32_t nextDoc() = 0;
+    /**
+     * Moves forward using `target` and returns the id landed on
+     * (presumably the first id >= target - confirm with implementations).
+     */
+    virtual int32_t advance(int32_t target) = 0;
+
+    /** Document frequency reported by the underlying source. */
+    virtual int32_t docFreq() const = 0;
+    /**
+     * Bulk read into `docRange`. NOTE(review): the meaning of the boolean
+     * result is not defined here - presumably "data was produced"; confirm.
+     */
+    virtual bool readRange(DocRange* docRange) const = 0;
+};
diff --git a/src/core/CLucene/search/query/TermIterator.h b/src/core/CLucene/search/query/TermIterator.h
new file mode 100644
index 00000000..e0cf23a4
--- /dev/null
+++ b/src/core/CLucene/search/query/TermIterator.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include "CLucene/search/query/DcoIdSetIterator.h"
+#include "CLucene/index/Terms.h"
+
+#include <limits.h>
+
+CL_NS_USE(index)
+
+/**
+ * DocIdSetIterator adapter over a TermDocs enumerator.
+ *
+ * The TermDocs pointer is borrowed: this class never deletes it. A
+ * default-constructed iterator holds no TermDocs; every method except
+ * isEmpty() dereferences the pointer without checking, so callers must test
+ * isEmpty() before using such an instance.
+ */
+class TermIterator : public DocIdSetIterator {
+public:
+    TermIterator() = default;
+    // Left implicit (not `explicit`) so existing conversions from TermDocs* keep compiling.
+    TermIterator(TermDocs* termDocs) : termDocs_(termDocs) {
+    }
+
+    virtual ~TermIterator() = default;
+
+    /** True when no TermDocs is attached. */
+    bool isEmpty() const {
+        return termDocs_ == nullptr;
+    }
+
+    /**
+     * Current document id, clamped to INT_MAX. The value is inspected as
+     * unsigned, so a negative doc() result also maps to INT_MAX.
+     */
+    int32_t docID() override {
+        const uint32_t docId = termDocs_->doc();
+        // Explicit cast keeps the original unsigned comparison without a sign-compare warning.
+        return docId >= static_cast<uint32_t>(INT_MAX) ? INT_MAX : static_cast<int32_t>(docId);
+    }
+
+    /** Steps to the next entry; returns its doc id, or INT_MAX when next() reports false. */
+    int32_t nextDoc() override {
+        if (termDocs_->next()) {
+            return termDocs_->doc();
+        }
+        return INT_MAX;
+    }
+
+    /** Calls skipTo(target); returns the resulting doc id, or INT_MAX when skipTo() reports false. */
+    int32_t advance(int32_t target) override {
+        if (termDocs_->skipTo(target)) {
+            return termDocs_->doc();
+        }
+        return INT_MAX;
+    }
+
+    /** Forwards to TermDocs::docFreq(), whose base implementation throws CL_ERR_UnsupportedOperation. */
+    int32_t docFreq() const override {
+        return termDocs_->docFreq();
+    }
+
+    /** Forwards to TermDocs::readRange() and returns its result unchanged. */
+    bool readRange(DocRange* docRange) const override {
+        return termDocs_->readRange(docRange);
+    }
+
+private:
+    TermDocs* termDocs_ = nullptr;
+};
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index e2264120..e1c13305 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -199,6 +199,8 @@ SET(clucene_core_Files
./CLucene/search/spans/SpanWeight.cpp
./CLucene/search/spans/SpanWeight.h
./CLucene/search/spans/TermSpans.cpp
+ ./CLucene/search/query/DcoIdSetIterator.h
+ ./CLucene/search/query/TermIterator.h
)
#if USE_SHARED_OBJECT_FILES then we link directly to the object files (means rebuilding them for the core)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]