This is an automated email from the ASF dual-hosted git repository.

jackietien pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/tsfile.git


The following commit(s) were added to refs/heads/develop by this push:
     new 9f8ab1266 Correct the retained size calculation for BinaryColumn and BinaryColumnBuilder
9f8ab1266 is described below

commit 9f8ab1266de753667d01d87af8cca33280dee7c9
Author: Jackie Tien <[email protected]>
AuthorDate: Fri Feb 7 09:02:51 2025 +0800

    Correct the retained size calculation for BinaryColumn and BinaryColumnBuilder
---
 .../org/apache/tsfile/block/column/Column.java     |  7 ++++
 .../org/apache/tsfile/utils/RamUsageEstimator.java | 12 ++++++
 .../tsfile/file/header/ChunkGroupHeader.java       |  2 -
 .../apache/tsfile/read/common/block/TsBlock.java   | 24 ++++++++++++
 .../tsfile/read/common/block/TsBlockBuilder.java   |  4 --
 .../read/common/block/column/BinaryColumn.java     | 45 +++++++++++++++++++---
 .../common/block/column/BinaryColumnBuilder.java   |  5 +--
 .../read/common/block/column/BooleanColumn.java    |  5 +++
 .../read/common/block/column/DictionaryColumn.java |  5 +++
 .../read/common/block/column/DoubleColumn.java     |  5 +++
 .../read/common/block/column/FloatColumn.java      |  5 +++
 .../tsfile/read/common/block/column/IntColumn.java |  5 +++
 .../read/common/block/column/LongColumn.java       |  5 +++
 .../read/common/block/column/NullColumn.java       |  5 +++
 .../block/column/RunLengthEncodedColumn.java       |  5 +++
 .../read/common/block/column/TimeColumn.java       |  8 +++-
 .../read/common/block/column/TsBlockSerde.java     |  7 ++--
 17 files changed, 133 insertions(+), 21 deletions(-)

diff --git 
a/java/common/src/main/java/org/apache/tsfile/block/column/Column.java 
b/java/common/src/main/java/org/apache/tsfile/block/column/Column.java
index e45664634..b5105ed6c 100644
--- a/java/common/src/main/java/org/apache/tsfile/block/column/Column.java
+++ b/java/common/src/main/java/org/apache/tsfile/block/column/Column.java
@@ -143,6 +143,13 @@ public interface Column {
    */
   long getRetainedSizeInBytes();
 
+  /**
+   * Returns the size of this Column as if it was compacted, ignoring any over-allocations and any
+   * unloaded nested Columns. For example, in dictionary blocks, this only counts each dictionary
+   * entry once, rather than each time a value is referenced.
+   */
+  long getSizeInBytes();
+
   /**
    * Returns a column starting at the specified position and extends for the 
specified length. The
    * specified region must be entirely contained within this column.
diff --git 
a/java/common/src/main/java/org/apache/tsfile/utils/RamUsageEstimator.java 
b/java/common/src/main/java/org/apache/tsfile/utils/RamUsageEstimator.java
index 3a35d2269..d357a42ea 100644
--- a/java/common/src/main/java/org/apache/tsfile/utils/RamUsageEstimator.java
+++ b/java/common/src/main/java/org/apache/tsfile/utils/RamUsageEstimator.java
@@ -271,6 +271,18 @@ public final class RamUsageEstimator {
         : alignObjectSize(NUM_BYTES_ARRAY_HEADER + (long) Double.BYTES * 
arr.length);
   }
 
+  public static long sizeOf(Accountable[] arr) {
+    if (arr == null) {
+      return 0;
+    } else {
+      long size = shallowSizeOf(arr);
+      for (Accountable obj : arr) {
+        size += obj != null ? obj.ramBytesUsed() : 0;
+      }
+      return size;
+    }
+  }
+
   /** Returns the size in bytes of the String[] object. */
   public static long sizeOf(String[] arr) {
     long size = shallowSizeOf(arr);
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/file/header/ChunkGroupHeader.java 
b/java/tsfile/src/main/java/org/apache/tsfile/file/header/ChunkGroupHeader.java
index b46f0e0c3..ba3371e9d 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/file/header/ChunkGroupHeader.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/file/header/ChunkGroupHeader.java
@@ -53,7 +53,6 @@ public class ChunkGroupHeader {
   }
 
   private int getSerializedSize(IDeviceID deviceID) {
-    // TODO: add an interface in IDeviceID
     int length = deviceID.serializedSize();
     return Byte.BYTES + ReadWriteForEncodingUtils.varIntSize(length) + length;
   }
@@ -73,7 +72,6 @@ public class ChunkGroupHeader {
       }
     }
 
-    // TODO: add an interface in IDeviceID
     final IDeviceID deviceID = deserializeDeviceID(inputStream, versionNumber);
     return new ChunkGroupHeader(deviceID);
   }
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/TsBlock.java 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/TsBlock.java
index 5f5edc6bb..b5224580e 100644
--- a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/TsBlock.java
+++ b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/TsBlock.java
@@ -69,6 +69,8 @@ public class TsBlock {
 
   private volatile long retainedSizeInBytes = -1;
 
+  private volatile long sizeInBytes = -1;
+
   public TsBlock(int positionCount) {
     this(false, positionCount, null, EMPTY_COLUMNS);
   }
@@ -122,6 +124,18 @@ public class TsBlock {
     return retainedSizeInBytes;
   }
 
+  /**
+   * Returns the size of this block as if it was compacted, ignoring any over-allocations and any
+   * unloaded nested blocks. For example, in dictionary blocks, this only counts each dictionary
+   * entry once, rather than each time a value is referenced.
+   */
+  public long getSizeInBytes() {
+    if (sizeInBytes < 0) {
+      return updateSize();
+    }
+    return sizeInBytes;
+  }
+
   /**
    * @param positionOffset start offset
    * @param length slice length
@@ -508,6 +522,16 @@ public class TsBlock {
     return newRetainedSizeInBytes;
   }
 
+  private long updateSize() {
+    long newSizeInBytes = INSTANCE_SIZE;
+    newSizeInBytes += timeColumn.getSizeInBytes();
+    for (Column column : valueColumns) {
+      newSizeInBytes += column.getSizeInBytes();
+    }
+    this.sizeInBytes = newSizeInBytes;
+    return newSizeInBytes;
+  }
+
   public int getTotalInstanceSize() {
     int totalInstanceSize = INSTANCE_SIZE;
     totalInstanceSize += timeColumn.getInstanceSize();
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/TsBlockBuilder.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/TsBlockBuilder.java
index 918551d9b..028bffc99 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/TsBlockBuilder.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/TsBlockBuilder.java
@@ -104,8 +104,6 @@ public class TsBlockBuilder {
     valueColumnBuilders = new ColumnBuilder[types.size()];
 
     for (int i = 0; i < valueColumnBuilders.length; i++) {
-      // TODO use Type interface to encapsulate createColumnBuilder to each 
concrete type class
-      // instead of switch-case
       switch (types.get(i)) {
         case BOOLEAN:
           valueColumnBuilders[i] =
@@ -176,8 +174,6 @@ public class TsBlockBuilder {
     valueColumnBuilders = new ColumnBuilder[types.size()];
     int initialExpectedEntries = timeColumnBuilder.getPositionCount();
     for (int i = 0; i < valueColumnBuilders.length; i++) {
-      // TODO use Type interface to encapsulate createColumnBuilder to each 
concrete type class
-      // instead of switch-case
       switch (types.get(i)) {
         case BOOLEAN:
           valueColumnBuilders[i] =
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/BinaryColumn.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/BinaryColumn.java
index ec36fc4f1..8a794508a 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/BinaryColumn.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/BinaryColumn.java
@@ -32,8 +32,8 @@ import java.util.Optional;
 import static 
org.apache.tsfile.read.common.block.column.ColumnUtil.checkArrayRange;
 import static 
org.apache.tsfile.read.common.block.column.ColumnUtil.checkReadablePosition;
 import static 
org.apache.tsfile.read.common.block.column.ColumnUtil.checkValidRegion;
+import static org.apache.tsfile.utils.RamUsageEstimator.sizeOf;
 import static org.apache.tsfile.utils.RamUsageEstimator.sizeOfBooleanArray;
-import static org.apache.tsfile.utils.RamUsageEstimator.sizeOfObjectArray;
 
 public class BinaryColumn implements Column {
 
@@ -46,6 +46,7 @@ public class BinaryColumn implements Column {
   private final Binary[] values;
 
   private final long retainedSizeInBytes;
+  private final long sizeInBytes;
 
   public BinaryColumn(int initialCapacity) {
     this(0, 0, null, new Binary[initialCapacity]);
@@ -75,9 +76,37 @@ public class BinaryColumn implements Column {
     }
     this.valueIsNull = valueIsNull;
 
-    // TODO we need to sum up all the Binary's retainedSize here
-    retainedSizeInBytes =
-        INSTANCE_SIZE + sizeOfBooleanArray(positionCount) + 
sizeOfObjectArray(positionCount);
+    retainedSizeInBytes = INSTANCE_SIZE + sizeOfBooleanArray(positionCount) + 
sizeOf(values);
+    sizeInBytes = values.length > 0 ? retainedSizeInBytes * positionCount / 
values.length : 0L;
+  }
+
+  // called by getRegion which already knows the underlying retainedSizeInBytes
+  private BinaryColumn(
+      int arrayOffset,
+      int positionCount,
+      boolean[] valueIsNull,
+      Binary[] values,
+      long retainedSizeInBytes) {
+    if (arrayOffset < 0) {
+      throw new IllegalArgumentException("arrayOffset is negative");
+    }
+    this.arrayOffset = arrayOffset;
+    if (positionCount < 0) {
+      throw new IllegalArgumentException("positionCount is negative");
+    }
+    this.positionCount = positionCount;
+
+    if (values.length - arrayOffset < positionCount) {
+      throw new IllegalArgumentException("values length is less than 
positionCount");
+    }
+    this.values = values;
+
+    if (valueIsNull != null && valueIsNull.length - arrayOffset < 
positionCount) {
+      throw new IllegalArgumentException("isNull length is less than 
positionCount");
+    }
+    this.valueIsNull = valueIsNull;
+    this.retainedSizeInBytes = retainedSizeInBytes;
+    this.sizeInBytes = values.length > 0 ? retainedSizeInBytes * positionCount 
/ values.length : 0L;
   }
 
   @Override
@@ -140,10 +169,16 @@ public class BinaryColumn implements Column {
     return retainedSizeInBytes;
   }
 
+  @Override
+  public long getSizeInBytes() {
+    return sizeInBytes;
+  }
+
   @Override
   public Column getRegion(int positionOffset, int length) {
     checkValidRegion(getPositionCount(), positionOffset, length);
-    return new BinaryColumn(positionOffset + arrayOffset, length, valueIsNull, 
values);
+    return new BinaryColumn(
+        positionOffset + arrayOffset, length, valueIsNull, values, 
getRetainedSizeInBytes());
   }
 
   @Override
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/BinaryColumnBuilder.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/BinaryColumnBuilder.java
index a82d82fc0..d9c560dc6 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/BinaryColumnBuilder.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/BinaryColumnBuilder.java
@@ -32,7 +32,6 @@ import java.util.Arrays;
 
 import static java.lang.Math.max;
 import static 
org.apache.tsfile.read.common.block.column.ColumnUtil.calculateBlockResetSize;
-import static org.apache.tsfile.utils.RamUsageEstimator.shallowSizeOf;
 import static org.apache.tsfile.utils.RamUsageEstimator.sizeOf;
 
 public class BinaryColumnBuilder implements ColumnBuilder {
@@ -129,7 +128,6 @@ public class BinaryColumnBuilder implements ColumnBuilder {
 
   @Override
   public long getRetainedSizeInBytes() {
-    // TODO we need to sum up all the Binary's retainedSize here
     long size = INSTANCE_SIZE + arraysRetainedSizeInBytes;
     if (columnBuilderStatus != null) {
       size += ColumnBuilderStatus.INSTANCE_SIZE;
@@ -139,7 +137,6 @@ public class BinaryColumnBuilder implements ColumnBuilder {
 
   @Override
   public ColumnBuilder newColumnBuilderLike(ColumnBuilderStatus 
columnBuilderStatus) {
-    // TODO we should take retain size into account here
     return new BinaryColumnBuilder(columnBuilderStatus, 
calculateBlockResetSize(positionCount));
   }
 
@@ -158,6 +155,6 @@ public class BinaryColumnBuilder implements ColumnBuilder {
   }
 
   private void updateArraysDataSize() {
-    arraysRetainedSizeInBytes = sizeOf(valueIsNull) + shallowSizeOf(values);
+    arraysRetainedSizeInBytes = sizeOf(valueIsNull) + sizeOf(values);
   }
 }
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/BooleanColumn.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/BooleanColumn.java
index 76b3fb6d7..7b9aca747 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/BooleanColumn.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/BooleanColumn.java
@@ -138,6 +138,11 @@ public class BooleanColumn implements Column {
     return retainedSizeInBytes;
   }
 
+  @Override
+  public long getSizeInBytes() {
+    return (long) positionCount * SIZE_IN_BYTES_PER_POSITION;
+  }
+
   @Override
   public Column getRegion(int positionOffset, int length) {
     checkValidRegion(getPositionCount(), positionOffset, length);
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/DictionaryColumn.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/DictionaryColumn.java
index 13dbd2261..50a2dd1ef 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/DictionaryColumn.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/DictionaryColumn.java
@@ -187,6 +187,11 @@ public final class DictionaryColumn implements Column {
     return retainedSizeInBytes + dictionary.getRetainedSizeInBytes();
   }
 
+  @Override
+  public long getSizeInBytes() {
+    return ids.length > 0 ? getRetainedSizeInBytes() * positionCount / 
ids.length : 0L;
+  }
+
   @Override
   public Column getRegion(int positionOffset, int length) {
     checkValidRegion(positionCount, positionOffset, length);
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/DoubleColumn.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/DoubleColumn.java
index afc78208e..e0aff8f7a 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/DoubleColumn.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/DoubleColumn.java
@@ -139,6 +139,11 @@ public class DoubleColumn implements Column {
     return retainedSizeInBytes;
   }
 
+  @Override
+  public long getSizeInBytes() {
+    return (long) positionCount * SIZE_IN_BYTES_PER_POSITION;
+  }
+
   @Override
   public Column getRegion(int positionOffset, int length) {
     checkValidRegion(getPositionCount(), positionOffset, length);
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/FloatColumn.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/FloatColumn.java
index c008d9353..8a576c0ce 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/FloatColumn.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/FloatColumn.java
@@ -154,6 +154,11 @@ public class FloatColumn implements Column {
     return retainedSizeInBytes;
   }
 
+  @Override
+  public long getSizeInBytes() {
+    return (long) positionCount * SIZE_IN_BYTES_PER_POSITION;
+  }
+
   @Override
   public Column getRegion(int positionOffset, int length) {
     checkValidRegion(getPositionCount(), positionOffset, length);
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/IntColumn.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/IntColumn.java
index c2065ee32..6820a83eb 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/IntColumn.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/IntColumn.java
@@ -182,6 +182,11 @@ public class IntColumn implements Column {
     return retainedSizeInBytes;
   }
 
+  @Override
+  public long getSizeInBytes() {
+    return (long) positionCount * SIZE_IN_BYTES_PER_POSITION;
+  }
+
   @Override
   public Column getRegion(int positionOffset, int length) {
     checkValidRegion(getPositionCount(), positionOffset, length);
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/LongColumn.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/LongColumn.java
index ecba9013d..03d8af0e6 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/LongColumn.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/LongColumn.java
@@ -154,6 +154,11 @@ public class LongColumn implements Column {
     return retainedSizeInBytes;
   }
 
+  @Override
+  public long getSizeInBytes() {
+    return (long) positionCount * SIZE_IN_BYTES_PER_POSITION;
+  }
+
   @Override
   public Column getRegion(int positionOffset, int length) {
     checkValidRegion(getPositionCount(), positionOffset, length);
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/NullColumn.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/NullColumn.java
index 9b999c412..d91359cae 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/NullColumn.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/NullColumn.java
@@ -85,6 +85,11 @@ public class NullColumn implements Column {
     return retainedSizeInBytes;
   }
 
+  @Override
+  public long getSizeInBytes() {
+    return retainedSizeInBytes;
+  }
+
   @Override
   public Column getRegion(int positionOffset, int length) {
     checkValidRegion(getPositionCount(), positionOffset, length);
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/RunLengthEncodedColumn.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/RunLengthEncodedColumn.java
index 620378182..148407295 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/RunLengthEncodedColumn.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/RunLengthEncodedColumn.java
@@ -193,6 +193,11 @@ public class RunLengthEncodedColumn implements Column {
     return INSTANCE_SIZE + value.getRetainedSizeInBytes();
   }
 
+  @Override
+  public long getSizeInBytes() {
+    return value.getSizeInBytes();
+  }
+
   @Override
   public Column getRegion(int positionOffset, int length) {
     checkValidRegion(positionCount, positionOffset, length);
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/TimeColumn.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/TimeColumn.java
index 3b1880a96..e7108fc85 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/TimeColumn.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/TimeColumn.java
@@ -100,8 +100,7 @@ public class TimeColumn implements Column {
 
   @Override
   public boolean[] isNull() {
-    // todo
-    return null;
+    throw new UnsupportedOperationException("isNull is not supported for 
TimeColumn");
   }
 
   @Override
@@ -114,6 +113,11 @@ public class TimeColumn implements Column {
     return retainedSizeInBytes;
   }
 
+  @Override
+  public long getSizeInBytes() {
+    return (long) positionCount * SIZE_IN_BYTES_PER_POSITION;
+  }
+
   @Override
   public Column getRegion(int positionOffset, int length) {
     ColumnUtil.checkValidRegion(getPositionCount(), positionOffset, length);
diff --git 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/TsBlockSerde.java
 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/TsBlockSerde.java
index e887a5773..a891553b5 100644
--- 
a/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/TsBlockSerde.java
+++ 
b/java/tsfile/src/main/java/org/apache/tsfile/read/common/block/column/TsBlockSerde.java
@@ -67,7 +67,6 @@ public class TsBlockSerde {
     }
 
     // Time column.
-    // TODO: a TimeColumn will be deserialized as a LongColumn
     Column timeColumn =
         ColumnEncoderFactory.get(columnEncodings.get(0))
             .readColumn(byteBuffer, TSDataType.INT64, positionCount);
@@ -91,12 +90,12 @@ public class TsBlockSerde {
    * @return Serialized tsblock.
    */
   public ByteBuffer serialize(TsBlock tsBlock) throws IOException {
-    if (tsBlock.getRetainedSizeInBytes() > Integer.MAX_VALUE) {
+    if (tsBlock.getSizeInBytes() > Integer.MAX_VALUE) {
       throw new IllegalStateException(
-          "TsBlock should not be that large: " + 
tsBlock.getRetainedSizeInBytes());
+          "TsBlock should not be that large: " + tsBlock.getSizeInBytes());
     }
     ByteArrayOutputStream byteArrayOutputStream =
-        new ByteArrayOutputStream((int) tsBlock.getRetainedSizeInBytes());
+        new ByteArrayOutputStream((int) tsBlock.getSizeInBytes());
     DataOutputStream dataOutputStream = new 
DataOutputStream(byteArrayOutputStream);
 
     // Value column count.

Reply via email to