stiga-huang commented on a change in pull request #1056: URL: https://github.com/apache/orc/pull/1056#discussion_r822592473
########## File path: c++/src/RleDecoderV2.cc ########## @@ -703,80 +680,61 @@ uint64_t RleDecoderV2::nextDelta(int64_t* const data, runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; runLength |= readByte(); ++runLength; // account for first value - runRead = deltaBase = 0; + runRead = 0; + int64_t prevValue; // read the first value stored as vint if (isSigned) { - firstValue = static_cast<int64_t>(readVslong()); + prevValue = readVslong(); } else { - firstValue = static_cast<int64_t>(readVulong()); + prevValue = static_cast<int64_t>(readVulong()); } - prevValue = firstValue; + literals[0] = prevValue; // read the fixed delta value stored as vint (deltas can be negative even // if all number are positive) - deltaBase = static_cast<int64_t>(readVslong()); - } - - uint64_t nRead = std::min(runLength - runRead, numValues); - - uint64_t pos = offset; - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (!notNull || notNull[pos]) break; - } - if (runRead == 0 && pos < offset + nRead) { - data[pos++] = firstValue; - ++runRead; - } - - if (bitSize == 0) { - // add fixed deltas to adjacent values - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; - } - prevValue = data[pos] = prevValue + deltaBase; - ++runRead; - } - } else { - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (!notNull || notNull[pos]) break; - } - if (runRead < 2 && pos < offset + nRead) { - // add delta base and first value - prevValue = data[pos++] = firstValue + deltaBase; - ++runRead; - } + int64_t deltaBase = readVslong(); - // write the unpacked values, add it to previous value and store final - // value to result buffer. if the delta base value is negative then it - // is a decreasing sequence else an increasing sequence - uint64_t remaining = (offset + nRead) - pos; - runRead += readLongs(data, pos, remaining, bitSize, notNull); - - if (deltaBase < 0) { - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; - } - prevValue = data[pos] = prevValue - data[pos]; + if (bitSize == 0) { + // add fixed deltas to adjacent values + for (uint64_t i = 1; i < runLength; ++i) { + prevValue = literals[i] = prevValue + deltaBase; } } else { - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; + prevValue = literals[1] = prevValue + deltaBase; + // write the unpacked values, add it to previous value and store final + // value to result buffer. if the delta base value is negative then it + // is a decreasing sequence else an increasing sequence + readLongs(literals.data(), 2, runLength - 2, bitSize); + if (deltaBase < 0) { + for (uint64_t i = 2; i < runLength; ++i) { + prevValue = literals[i] = prevValue - literals[i]; + } + } else { + for (uint64_t i = 2; i < runLength; ++i) { + prevValue = literals[i] = prevValue + literals[i]; } - prevValue = data[pos] = prevValue + data[pos]; } } } - return nRead; + + return copyDataFromBuffer(data, offset, numValues, notNull); } +uint64_t RleDecoderV2::copyDataFromBuffer(int64_t* data, uint64_t offset, + uint64_t numValues, const char* notNull) { + uint64_t nRead = std::min(runLength - runRead, numValues); + if (notNull) { + for (uint64_t i = offset; i < (offset + nRead); ++i) { + if (notNull[i]) { + data[i] = literals[runRead++]; + } + } + } else { + memcpy(data + offset, literals.data() + runRead, nRead * sizeof(int64_t)); + runRead += nRead; + } + return nRead; +} Review comment: Done. ########## File path: c++/src/RleDecoderV2.cc ########## @@ -703,80 +680,61 @@ uint64_t RleDecoderV2::nextDelta(int64_t* const data, runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; runLength |= readByte(); ++runLength; // account for first value - runRead = deltaBase = 0; + runRead = 0; + int64_t prevValue; // read the first value stored as vint if (isSigned) { - firstValue = static_cast<int64_t>(readVslong()); + prevValue = readVslong(); } else { - firstValue = static_cast<int64_t>(readVulong()); + prevValue = static_cast<int64_t>(readVulong()); } - prevValue = firstValue; + literals[0] = prevValue; // read the fixed delta value stored as vint (deltas can be negative even // if all number are positive) - deltaBase = static_cast<int64_t>(readVslong()); - } - - uint64_t nRead = std::min(runLength - runRead, numValues); - - uint64_t pos = offset; - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (!notNull || notNull[pos]) break; - } - if (runRead == 0 && pos < offset + nRead) { - data[pos++] = firstValue; - ++runRead; - } - - if (bitSize == 0) { - // add fixed deltas to adjacent values - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; - } - prevValue = data[pos] = prevValue + deltaBase; - ++runRead; - } - } else { - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (!notNull || notNull[pos]) break; - } - if (runRead < 2 && pos < offset + nRead) { - // add delta base and first value - prevValue = data[pos++] = firstValue + deltaBase; - ++runRead; - } + int64_t deltaBase = readVslong(); - // write the unpacked values, add it to previous value and store final - // value to result buffer. if the delta base value is negative then it - // is a decreasing sequence else an increasing sequence - uint64_t remaining = (offset + nRead) - pos; - runRead += readLongs(data, pos, remaining, bitSize, notNull); - - if (deltaBase < 0) { - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; - } - prevValue = data[pos] = prevValue - data[pos]; + if (bitSize == 0) { + // add fixed deltas to adjacent values + for (uint64_t i = 1; i < runLength; ++i) { + prevValue = literals[i] = prevValue + deltaBase; } } else { - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; + prevValue = literals[1] = prevValue + deltaBase; + // write the unpacked values, add it to previous value and store final + // value to result buffer. if the delta base value is negative then it + // is a decreasing sequence else an increasing sequence + readLongs(literals.data(), 2, runLength - 2, bitSize); Review comment: Good point! I think it's true for legal files. Based on the spec (https://orc.apache.org/specification/ORCv1/#delta), the run will at least has the base value and the delta base which encode the 1st and 2nd literal. However, malformed files could have random bytes. We should add a check before this. Added at line 683. ########## File path: c++/src/RleDecoderV2.cc ########## @@ -96,7 +95,7 @@ void RleDecoderV2::readLongsWithoutNulls(int64_t *data, uint64_t offset, uint64_ return; default: // Fallback to the default implementation for deprecated bit size. - readLongs(data, offset, len, fbs); + readLongsSlow(data, offset, len, fbs); Review comment: Yeah, it sounds better than the current name. The "rolled" prefix seems strange that we usually say unrolling a loop but we don't say rolling a loop.. What about `plainUnpackLongs`, `directUnpackLongs`, `naiveUnpackLongs` or `unpackGeneralLongs`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@orc.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org