Quanlong Huang created ORC-322: ---------------------------------- Summary: c++ writer should not adjust gmtOffset when writing timestamps Key: ORC-322 URL: https://issues.apache.org/jira/browse/ORC-322 Project: ORC Issue Type: Bug Components: C++ Reporter: Quanlong Huang
The C++ TimestampColumnWriter adjusts timestamps by gmtOffset: {code:c++} void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues) { ...... int64_t *secs = tsBatch->data.data() + offset; int64_t *nanos = tsBatch->nanoseconds.data() + offset; ...... bool hasNull = false; for (uint64_t i = 0; i < numValues; ++i) { if (notNull == nullptr || notNull[i]) { // TimestampVectorBatch already stores data in UTC int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000; tsStats->increase(1); tsStats->update(millsUTC); secs[i] -= timezone.getVariant(secs[i]).gmtOffset; <-- should not adjust with gmtOffset here secs[i] -= timezone.getEpoch(); nanos[i] = formatNano(nanos[i]); } else if (!hasNull) { hasNull = true; } } tsStats->setHasNull(hasNull); secRleEncoder->add(secs, numValues, notNull); nanoRleEncoder->add(nanos, numValues, notNull); } {code} The Java writer does not apply this adjustment: {code:java} public void writeBatch(ColumnVector vector, int offset, int length) throws IOException { super.writeBatch(vector, offset, length); TimestampColumnVector vec = (TimestampColumnVector) vector; if (vector.isRepeating) { ...... } else { for (int i = 0; i < length; ++i) { if (vec.noNulls || !vec.isNull[i + offset]) { // ignore the bottom three digits from the vec.time field final long secs = vec.time[i + offset] / MILLIS_PER_SECOND; final int newNanos = vec.nanos[i + offset]; // set the millis based on the top three digits of the nanos long millis = secs * MILLIS_PER_SECOND + newNanos / 1_000_000; if (millis < 0 && newNanos > 999_999) { millis -= MILLIS_PER_SECOND; } long utc = SerializationUtils.convertToUtc(localTimezone, millis); seconds.write(secs - baseEpochSecsLocalTz); <-- only adjust with ORC epoch nanos.write(formatNanos(newNanos)); indexStatistics.updateTimestamp(utc); if (createBloomFilter) { if (bloomFilter != null) { bloomFilter.addLong(millis); } bloomFilterUtf8.addLong(utc); } } } } } {code} This is a follow-up to ORC-320. 
I think there is a wrong assumption in the C++ code that the timestamps given to the writer's TimestampVectorBatch are equal to the timestamps obtained from the reader's TimestampVectorBatch. -- This message was sent by Atlassian JIRA (v7.6.3#76005)