zeroshade commented on code in PR #38367:
URL: https://github.com/apache/arrow/pull/38367#discussion_r1376372506
##########
go/parquet/internal/encoding/boolean_decoder.go:
##########
@@ -109,3 +114,76 @@ func (dec *PlainBooleanDecoder) DecodeSpaced(out []bool,
nullCount int, validBit
}
return dec.Decode(out)
}
+
+type RleBooleanDecoder struct {
+ decoder
+
+ rleDec *utils.RleDecoder
+}
+
+func (RleBooleanDecoder) Type() parquet.Type {
+ return parquet.Types.Boolean
+}
+
+func (dec *RleBooleanDecoder) SetData(nvals int, data []byte) error {
+ dec.nvals = nvals
+
+ if len(data) < 4 {
+ return fmt.Errorf("invalid length - %d (corrupt data page?)",
len(data))
+ }
+
+ // load the first 4 bytes in little-endian which indicates the length
+ nbytes := binary.LittleEndian.Uint32(data[:4])
+ if nbytes > uint32(len(data)-4) {
+ return fmt.Errorf("received invalid number of bytes - %d
(corrupt data page?)", nbytes)
+ }
+
+ dec.data = data[4:]
+ if dec.rleDec == nil {
+ dec.rleDec = utils.NewRleDecoder(bytes.NewReader(dec.data), 1)
+ } else {
+ dec.rleDec.Reset(bytes.NewReader(dec.data), 1)
+ }
+ return nil
+}
+
+func (dec *RleBooleanDecoder) Decode(out []bool) (int, error) {
+ max := shared_utils.MinInt(len(out), dec.nvals)
+
+ var (
+ buf [1024]uint64
+ n = max
+ )
+
+ for n > 0 {
+ batch := shared_utils.MinInt(len(buf), n)
+ decoded := dec.rleDec.GetBatch(buf[:batch])
+ if decoded != batch {
+ return max - n, io.ErrUnexpectedEOF
Review Comment:
> No, max is not the real maximum of the underlying data. For rep-def
existing scenario, it means "rep-def" size, rather than the actual size. So it
may be greater than existing value. (At least this is the case in C++)
This is a level above where we are here. This is *solely* the value decoder.
In this function `max` is the maximum number of *values* that are decoded by a
single call to this function (i.e. the minimum of the length of the output
slice and the number of values left to decode).
The handling of rep and def levels is taken care of a level above this by
the actual page reader / column chunk reader, which determines how many
physical values it wants to read from the decoder. The intent of this function
is that the return value is the *number of physical values populated into the
output slice*.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]