zeroshade commented on a change in pull request #10071: URL: https://github.com/apache/arrow/pull/10071#discussion_r631954773
########## File path: go/parquet/schema/logical_types.go ########## @@ -0,0 +1,1089 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package schema + +import ( + "encoding/json" + "fmt" + "math" + + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/debug" + format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" +) + +// DecimalMetadata is a struct for managing scale and precision information between +// converted and logical types. +type DecimalMetadata struct { + IsSet bool + Scale int32 + Precision int32 +} + +func getLogicalType(l *format.LogicalType) LogicalType { + switch { + case l.IsSetSTRING(): + return StringLogicalType{} + case l.IsSetMAP(): + return MapLogicalType{} + case l.IsSetLIST(): + return ListLogicalType{} + case l.IsSetENUM(): + return EnumLogicalType{} + case l.IsSetDECIMAL(): + return &DecimalLogicalType{typ: l.DECIMAL} + case l.IsSetDATE(): + return DateLogicalType{} + case l.IsSetTIME(): + if timeUnitFromThrift(l.TIME.Unit) == TimeUnitUnknown { + panic("parquet: TimeUnit must be one of MILLIS, MICROS, or NANOS for Time logical type") + } + return &TimeLogicalType{typ: l.TIME} + case l.IsSetTIMESTAMP(): + if timeUnitFromThrift(l.TIMESTAMP.Unit) == TimeUnitUnknown { + panic("parquet: TimeUnit must be one of MILLIS, MICROS, or NANOS for Timestamp logical type") + } + return &TimestampLogicalType{typ: l.TIMESTAMP} + case l.IsSetINTEGER(): + return &IntLogicalType{typ: l.INTEGER} + case l.IsSetUNKNOWN(): + return NullLogicalType{} + case l.IsSetJSON(): + return JSONLogicalType{} + case l.IsSetBSON(): + return BSONLogicalType{} + case l.IsSetUUID(): + return UUIDLogicalType{} + case l == nil: + return NoLogicalType{} + default: + panic("invalid logical type") + } +} + +// TimeUnitType is an enum for denoting whether a time based logical type +// is using milliseconds, microseconds or nanoseconds. +type TimeUnitType int + +// Constants for the TimeUnitType +const ( + TimeUnitMillis TimeUnitType = iota + TimeUnitMicros + TimeUnitNanos + TimeUnitUnknown +) + +// LogicalType is the descriptor that defines the usage of a physical primitive +// type in the schema, such as an Interval, Date, etc. +type LogicalType interface { + // Returns true if a nested type like List or Map + IsNested() bool + // Returns true if this type can be serialized, ie: not Unknown/NoType/Interval + IsSerialized() bool + // Returns true if not NoLogicalType + IsValid() bool + // Returns true if it is NoType + IsNone() bool + // returns a string representation of the Logical Type + String() string + toThrift() *format.LogicalType + // Return the equivalent ConvertedType for legacy Parquet systems + ToConvertedType() (ConvertedType, DecimalMetadata) + // Returns true if the specified ConvertedType is compatible with this + // logical type + IsCompatible(ConvertedType, DecimalMetadata) bool + // Returns true if this logical type can be used with the provided physical type + IsApplicable(t parquet.Type, tlen int32) bool + // Returns true if the logical types are the same + Equals(LogicalType) bool + // Returns the default stat sort order for this logical type + SortOrder() SortOrder +} + +// TemporalLogicalType is a smaller interface for Time based logical types +// like Time / Timestamp +type TemporalLogicalType interface { + LogicalType + IsAdjustedToUTC() bool + TimeUnit() TimeUnitType +} + +// SortOrder mirrors the parquet.thrift sort order type +type SortOrder int8 + +// Constants for the Stat sort order definitions +const ( + SortSIGNED SortOrder = iota + SortUNSIGNED + SortUNKNOWN +) + +// DefaultSortOrder returns the default stat sort order for the given physical type +func DefaultSortOrder(primitive format.Type) SortOrder { + switch primitive { + case format.Type_BOOLEAN, format.Type_INT32, format.Type_INT64, format.Type_FLOAT, format.Type_DOUBLE: + return SortSIGNED + case format.Type_BYTE_ARRAY, format.Type_FIXED_LEN_BYTE_ARRAY: + return SortUNSIGNED + case format.Type_INT96: + fallthrough + default: + return SortUNKNOWN + } +} + +// GetLogicalSortOrder returns the default sort order for this logical type +// or falls back to the default sort order for the physical type if not valid +func GetLogicalSortOrder(logical LogicalType, primitive format.Type) SortOrder { + switch { + case logical == nil || !logical.IsValid(): + return SortUNKNOWN + case logical.Equals(NoLogicalType{}): + return DefaultSortOrder(primitive) + default: + return logical.SortOrder() + } +} + +type baseLogicalType struct{} + +func (baseLogicalType) IsSerialized() bool { + return true +} + +func (baseLogicalType) IsValid() bool { + return true +} + +func (baseLogicalType) IsNested() bool { + return false +} + +func (baseLogicalType) IsNone() bool { return false } + +// StringLogicalType is a UTF8 string, only usable with ByteArray and FixedLenByteArray +type StringLogicalType struct{ baseLogicalType } + +func (StringLogicalType) SortOrder() SortOrder { + return SortUNSIGNED +} + +func (StringLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]string{"Type": StringLogicalType{}.String()}) +} + +func (StringLogicalType) String() string { + return "String" +} + +func (StringLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + return ConvertedTypes.UTF8, DecimalMetadata{} +} + +func (StringLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool { + return t == ConvertedTypes.UTF8 && !dec.IsSet +} + +func (StringLogicalType) IsApplicable(t parquet.Type, _ int32) bool { + return t == parquet.Types.ByteArray +} + +func (StringLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{STRING: format.NewStringType()} +} + +func (StringLogicalType) Equals(rhs LogicalType) bool { + _, ok := rhs.(StringLogicalType) + return ok +} + +// MapLogicalType represents a mapped type +type MapLogicalType struct{ baseLogicalType } + +func (MapLogicalType) SortOrder() SortOrder { + return SortUNKNOWN +} + +func (MapLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]string{"Type": MapLogicalType{}.String()}) +} + +func (MapLogicalType) String() string { + return "Map" +} + +func (MapLogicalType) IsNested() bool { + return true +} + +func (MapLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + return ConvertedTypes.Map, DecimalMetadata{} +} + +func (MapLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool { + return (t == ConvertedTypes.Map || t == ConvertedTypes.MapKeyValue) && !dec.IsSet +} + +func (MapLogicalType) IsApplicable(parquet.Type, int32) bool { + return false +} + +func (MapLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{MAP: format.NewMapType()} +} + +func (MapLogicalType) Equals(rhs LogicalType) bool { + _, ok := rhs.(MapLogicalType) + return ok +} + +func NewListLogicalType() LogicalType { + return ListLogicalType{} +} + +// ListLogicalType is used for columns which are themselves nested lists +type ListLogicalType struct{ baseLogicalType } + +func (ListLogicalType) SortOrder() SortOrder { + return SortUNKNOWN +} + +func (ListLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]string{"Type": ListLogicalType{}.String()}) +} + +func (ListLogicalType) String() string { + return "List" +} + +func (ListLogicalType) IsNested() bool { + return true +} + +func (ListLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + return ConvertedTypes.List, DecimalMetadata{} +} + +func (ListLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool { + return t == ConvertedTypes.List && !dec.IsSet +} + +func (ListLogicalType) IsApplicable(parquet.Type, int32) bool { + return false +} + +func (ListLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{LIST: format.NewListType()} +} + +func (ListLogicalType) Equals(rhs LogicalType) bool { + _, ok := rhs.(ListLogicalType) + return ok +} + +// EnumLogicalType is for representing an enum, which should be a byte array type +type EnumLogicalType struct{ baseLogicalType } + +func (EnumLogicalType) SortOrder() SortOrder { + return SortUNSIGNED +} + +func (EnumLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]string{"Type": EnumLogicalType{}.String()}) +} + +func (EnumLogicalType) String() string { + return "Enum" +} + +func (EnumLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + return ConvertedTypes.Enum, DecimalMetadata{} +} + +func (EnumLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool { + return t == ConvertedTypes.Enum && !dec.IsSet +} + +func (EnumLogicalType) IsApplicable(t parquet.Type, _ int32) bool { + return t == parquet.Types.ByteArray +} + +func (EnumLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{ENUM: format.NewEnumType()} +} + +func (EnumLogicalType) Equals(rhs LogicalType) bool { + _, ok := rhs.(EnumLogicalType) + return ok +} + +// NewDecimalLogicalType returns a Decimal logical type with the given +// precision and scale. +// +// Panics if precision < 1 or scale is not in the range (0, precision) +func NewDecimalLogicalType(precision int32, scale int32) LogicalType { + if precision < 1 { + panic("parquet: precision must be greater than or equal to 1 for decimal logical type") + } + if scale < 0 || scale > precision { + panic("parquet: scale must be a non-negative integer that does not exceed precision for decimal logical type") + } + return &DecimalLogicalType{typ: &format.DecimalType{Precision: precision, Scale: scale}} +} + +// DecimalLogicalType is used to represent a decimal value of a given +// precision and scale +type DecimalLogicalType struct { + baseLogicalType + typ *format.DecimalType +} + +func (t DecimalLogicalType) Precision() int32 { + return t.typ.Precision +} + +func (t DecimalLogicalType) Scale() int32 { + return t.typ.Scale +} + +func (DecimalLogicalType) SortOrder() SortOrder { + return SortSIGNED +} + +func (t DecimalLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]interface{}{"Type": "Decimal", "precision": t.typ.Precision, "scale": t.typ.Scale}) +} + +func (t DecimalLogicalType) String() string { + return fmt.Sprintf("Decimal(precision=%d, scale=%d)", t.typ.Precision, t.typ.Scale) +} + +func (t DecimalLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + return ConvertedTypes.Decimal, DecimalMetadata{IsSet: true, Scale: t.typ.GetScale(), Precision: t.typ.GetPrecision()} +} + +func (t DecimalLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool { + return c == ConvertedTypes.Decimal && + dec.IsSet && dec.Scale == t.typ.Scale && dec.Precision == t.typ.Precision +} + +func (t DecimalLogicalType) IsApplicable(typ parquet.Type, tlen int32) bool { + switch typ { + case parquet.Types.Int32: + return 1 <= t.typ.Precision && t.typ.Precision <= 9 + case parquet.Types.Int64: + if t.typ.Precision < 10 { + debug.Log("int64 used for decimal logical, precision is small enough to use int32") + } + return 1 <= t.typ.Precision && t.typ.Precision <= 18 + case parquet.Types.FixedLenByteArray: + return t.typ.Precision <= int32(math.Floor(math.Log10(math.Pow(2.0, (8.0*float64(tlen)-1.0))))) + case parquet.Types.ByteArray: + return true + } + return false +} + +func (t DecimalLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{DECIMAL: t.typ} +} + +func (t DecimalLogicalType) Equals(rhs LogicalType) bool { + other, ok := rhs.(*DecimalLogicalType) + if !ok { + return false + } + return t.typ.Precision == other.typ.Precision && t.typ.Scale == other.typ.Scale +} + +// DateLogicalType is an int32 representing the number of days since the Unix Epoch +// 1 January 1970 +type DateLogicalType struct{ baseLogicalType } + +func (DateLogicalType) SortOrder() SortOrder { + return SortSIGNED +} + +func (DateLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]string{"Type": DateLogicalType{}.String()}) +} + +func (DateLogicalType) String() string { + return "Date" +} + +func (DateLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + return ConvertedTypes.Date, DecimalMetadata{} +} + +func (DateLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool { + return t == ConvertedTypes.Date && !dec.IsSet +} + +func (DateLogicalType) IsApplicable(t parquet.Type, _ int32) bool { + return t == parquet.Types.Int32 +} + +func (DateLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{DATE: format.NewDateType()} +} + +func (DateLogicalType) Equals(rhs LogicalType) bool { + _, ok := rhs.(DateLogicalType) + return ok +} + +func timeUnitFromThrift(unit *format.TimeUnit) TimeUnitType { + switch { + case unit == nil: + return TimeUnitUnknown + case unit.IsSetMILLIS(): + return TimeUnitMillis + case unit.IsSetMICROS(): + return TimeUnitMicros + case unit.IsSetNANOS(): + return TimeUnitNanos + default: + return TimeUnitUnknown + } +} + +func timeUnitToString(unit *format.TimeUnit) string { + switch { + case unit == nil: + return "unknown" + case unit.IsSetMILLIS(): + return "milliseconds" + case unit.IsSetMICROS(): + return "microseconds" + case unit.IsSetNANOS(): + return "nanoseconds" + default: + return "unknown" + } +} + +func timeUnitFromString(v string) TimeUnitType { + switch v { + case "millis": + return TimeUnitMillis + case "micros": + return TimeUnitMicros + case "nanos": + return TimeUnitNanos + default: + return TimeUnitUnknown + } +} + +func createTimeUnit(unit TimeUnitType) *format.TimeUnit { + tunit := format.NewTimeUnit() + switch unit { + case TimeUnitMicros: + tunit.MICROS = format.NewMicroSeconds() + case TimeUnitMillis: + tunit.MILLIS = format.NewMilliSeconds() + case TimeUnitNanos: + tunit.NANOS = format.NewNanoSeconds() + default: + panic("parquet: time unit must be one of MILLIS, MICROS, or NANOS for Time logical type") + } + return tunit +} + +// NewTimeLogicalType returns a time type of the given unit. +func NewTimeLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalType { + return &TimeLogicalType{typ: &format.TimeType{ + IsAdjustedToUTC: isAdjustedToUTC, + Unit: createTimeUnit(unit), + }} +} + +// TimeLogicalType is a time type without a date and must be an +// int32 for milliseconds, or an int64 for micro or nano seconds. +type TimeLogicalType struct { + baseLogicalType + typ *format.TimeType +} + +func (t TimeLogicalType) IsAdjustedToUTC() bool { + return t.typ.IsAdjustedToUTC +} + +func (t TimeLogicalType) TimeUnit() TimeUnitType { + return timeUnitFromThrift(t.typ.Unit) +} + +func (TimeLogicalType) SortOrder() SortOrder { + return SortSIGNED +} + +func (t TimeLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]interface{}{ + "Type": "Time", "isAdjustedToUTC": t.typ.IsAdjustedToUTC, "timeUnit": timeUnitToString(t.typ.GetUnit())}) +} + +func (t TimeLogicalType) String() string { + return fmt.Sprintf("Time(isAdjustedToUTC=%t, timeUnit=%s)", t.typ.GetIsAdjustedToUTC(), timeUnitToString(t.typ.GetUnit())) +} + +func (t TimeLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + unit := timeUnitFromThrift(t.typ.Unit) + if t.typ.IsAdjustedToUTC { + switch unit { + case TimeUnitMillis: + return ConvertedTypes.TimeMillis, DecimalMetadata{} + case TimeUnitMicros: + return ConvertedTypes.TimeMicros, DecimalMetadata{} + } + } + return ConvertedTypes.None, DecimalMetadata{} +} + +func (t TimeLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool { + if dec.IsSet { + return false + } + unit := timeUnitFromThrift(t.typ.Unit) + if t.typ.IsAdjustedToUTC { + switch unit { + case TimeUnitMillis: + return c == ConvertedTypes.TimeMillis + case TimeUnitMicros: + return c == ConvertedTypes.TimeMicros + } + } + + return c == ConvertedTypes.None || c == ConvertedTypes.NA +} + +func (t TimeLogicalType) IsApplicable(typ parquet.Type, _ int32) bool { + return (typ == parquet.Types.Int32 && t.typ.GetUnit().IsSetMILLIS()) || + (typ == parquet.Types.Int64 && + (t.typ.GetUnit().IsSetMICROS() || t.typ.GetUnit().IsSetNANOS())) +} + +func (t TimeLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{TIME: t.typ} +} + +func (t TimeLogicalType) Equals(rhs LogicalType) bool { + other, ok := rhs.(*TimeLogicalType) + if !ok { + return false + } + return t.typ.IsAdjustedToUTC == other.typ.IsAdjustedToUTC && + timeUnitFromThrift(t.typ.Unit) == timeUnitFromThrift(other.typ.Unit) +} + +// NewTimestampLogicalType returns a logical timestamp type with "forceConverted" +// set to false +func NewTimestampLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalType { + return &TimestampLogicalType{ + typ: &format.TimestampType{ + IsAdjustedToUTC: isAdjustedToUTC, + Unit: createTimeUnit(unit), + }, + forceConverted: false, + fromConverted: false, + } +} + +// NewTimestampLogicalTypeForce returns a timestamp logical type with +// "forceConverted" set to true +func NewTimestampLogicalTypeForce(isAdjustedToUTC bool, unit TimeUnitType) LogicalType { + return &TimestampLogicalType{ + typ: &format.TimestampType{ + IsAdjustedToUTC: isAdjustedToUTC, + Unit: createTimeUnit(unit), + }, + forceConverted: true, + fromConverted: false, + } +} + +// TimestampLogicalType represents an int64 number that can be decoded +// into a year, month, day, hour, minute, second, and subsecond +type TimestampLogicalType struct { + baseLogicalType + typ *format.TimestampType + forceConverted bool + fromConverted bool +} + +func (t TimestampLogicalType) IsFromConvertedType() bool { + return t.fromConverted +} + +func (t TimestampLogicalType) IsAdjustedToUTC() bool { + return t.typ.IsAdjustedToUTC +} + +func (t TimestampLogicalType) TimeUnit() TimeUnitType { + return timeUnitFromThrift(t.typ.Unit) +} + +func (TimestampLogicalType) SortOrder() SortOrder { + return SortSIGNED +} + +func (t TimestampLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]interface{}{ + "Type": "Timestamp", + "isAdjustedToUTC": t.typ.IsAdjustedToUTC, + "timeUnit": timeUnitToString(t.typ.GetUnit()), + "is_from_converted_type": t.fromConverted, + "force_set_converted_type": t.forceConverted, + }) +} + +func (t TimestampLogicalType) IsSerialized() bool { + return !t.fromConverted +} + +func (t TimestampLogicalType) String() string { + return fmt.Sprintf("Timestamp(isAdjustedToUTC=%t, timeUnit=%s, is_from_converted_type=%t, force_set_converted_type=%t)", + t.typ.GetIsAdjustedToUTC(), timeUnitToString(t.typ.GetUnit()), t.fromConverted, t.forceConverted) +} + +func (t TimestampLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + unit := timeUnitFromThrift(t.typ.Unit) + if t.typ.IsAdjustedToUTC || t.forceConverted { + switch unit { + case TimeUnitMillis: + return ConvertedTypes.TimestampMillis, DecimalMetadata{} + case TimeUnitMicros: + return ConvertedTypes.TimestampMicros, DecimalMetadata{} + } + } + return ConvertedTypes.None, DecimalMetadata{} +} + +func (t TimestampLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool { + if dec.IsSet { + return false + } + + switch timeUnitFromThrift(t.typ.Unit) { + case TimeUnitMillis: + if t.typ.GetIsAdjustedToUTC() || t.forceConverted { + return c == ConvertedTypes.TimestampMillis + } + case TimeUnitMicros: + if t.typ.GetIsAdjustedToUTC() || t.forceConverted { + return c == ConvertedTypes.TimestampMicros + } + } + + return c == ConvertedTypes.None || c == ConvertedTypes.NA +} + +func (TimestampLogicalType) IsApplicable(t parquet.Type, _ int32) bool { + return t == parquet.Types.Int64 +} + +func (t TimestampLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{TIMESTAMP: t.typ} +} + +func (t TimestampLogicalType) Equals(rhs LogicalType) bool { + other, ok := rhs.(*TimestampLogicalType) + if !ok { + return false + } + return t.typ.IsAdjustedToUTC == other.typ.IsAdjustedToUTC && + timeUnitFromThrift(t.typ.Unit) == timeUnitFromThrift(other.typ.Unit) +} + +// NewIntLogicalType creates an integer logical type of the desired bitwidth +// and whether it is signed or not. +// +// Bit width must be exactly 8, 16, 32 or 64 for an integer logical type +func NewIntLogicalType(bitWidth int8, signed bool) LogicalType { + switch bitWidth { + case 8, 16, 32, 64: + default: + panic("parquet: bit width must be exactly 8, 16, 32, or 64 for Int logical type") + } + return &IntLogicalType{ + typ: &format.IntType{ + BitWidth: bitWidth, + IsSigned: signed, + }, + } +} + +// IntLogicalType represents an integer type of a specific bit width and +// is either signed or unsigned. +type IntLogicalType struct { + baseLogicalType + typ *format.IntType +} + +func (t IntLogicalType) BitWidth() int8 { + return t.typ.BitWidth +} + +func (t IntLogicalType) IsSigned() bool { + return t.typ.IsSigned +} + +func (t IntLogicalType) SortOrder() SortOrder { + if t.typ.IsSigned { + return SortSIGNED + } + return SortUNSIGNED +} + +func (t IntLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]interface{}{ + "Type": "Int", "bitWidth": t.typ.BitWidth, "isSigned": t.typ.IsSigned, + }) +} + +func (t IntLogicalType) String() string { + return fmt.Sprintf("Int(bitWidth=%d, isSigned=%t)", t.typ.GetBitWidth(), t.typ.GetIsSigned()) +} + +func (t IntLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + var d DecimalMetadata + if t.typ.IsSigned { + switch t.typ.BitWidth { + case 8: + return ConvertedTypes.Int8, d + case 16: + return ConvertedTypes.Int16, d + case 32: + return ConvertedTypes.Int32, d + case 64: + return ConvertedTypes.Int64, d + } + } else { + switch t.typ.BitWidth { + case 8: + return ConvertedTypes.Uint8, d + case 16: + return ConvertedTypes.Uint16, d + case 32: + return ConvertedTypes.Uint32, d + case 64: + return ConvertedTypes.Uint64, d + } + } + return ConvertedTypes.None, d +} + +func (t IntLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool { + if dec.IsSet { + return false + } + v, _ := t.ToConvertedType() + return c == v +} + +func (t IntLogicalType) IsApplicable(typ parquet.Type, _ int32) bool { + return (typ == parquet.Types.Int32 && t.typ.GetBitWidth() <= 32) || + (typ == parquet.Types.Int64 && t.typ.GetBitWidth() == 64) +} + +func (t IntLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{INTEGER: t.typ} +} + +func (t IntLogicalType) Equals(rhs LogicalType) bool { + other, ok := rhs.(*IntLogicalType) + if !ok { + return false + } + + return t.typ.GetIsSigned() == other.typ.GetIsSigned() && + t.typ.GetBitWidth() == other.typ.GetBitWidth() +} + +// UnknownLogicalType is a type that is essentially a placeholder for when +// we don't know the type. +type UnknownLogicalType struct{ baseLogicalType } + +func (UnknownLogicalType) SortOrder() SortOrder { + return SortUNKNOWN +} + +func (UnknownLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]string{"Type": UnknownLogicalType{}.String()}) +} + +func (UnknownLogicalType) IsValid() bool { return false } + +func (UnknownLogicalType) IsSerialized() bool { return false } + +func (UnknownLogicalType) String() string { + return "Unknown" +} + +func (UnknownLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + return ConvertedTypes.NA, DecimalMetadata{} +} + +func (UnknownLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool { + return c == ConvertedTypes.NA && !dec.IsSet +} + +func (UnknownLogicalType) IsApplicable(parquet.Type, int32) bool { return true } + +func (UnknownLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{UNKNOWN: format.NewNullType()} +} + +func (UnknownLogicalType) Equals(rhs LogicalType) bool { + _, ok := rhs.(UnknownLogicalType) + return ok +} + +// JSONLogicalType represents a byte array column which is to be interpreted +// as a JSON string. +type JSONLogicalType struct{ baseLogicalType } + +func (JSONLogicalType) SortOrder() SortOrder { + return SortUNSIGNED +} + +func (JSONLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]string{"Type": JSONLogicalType{}.String()}) +} + +func (JSONLogicalType) String() string { + return "JSON" +} + +func (JSONLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + return ConvertedTypes.JSON, DecimalMetadata{} +} + +func (JSONLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool { + return c == ConvertedTypes.JSON && !dec.IsSet +} + +func (JSONLogicalType) IsApplicable(t parquet.Type, _ int32) bool { + return t == parquet.Types.ByteArray +} + +func (JSONLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{JSON: format.NewJsonType()} +} + +func (JSONLogicalType) Equals(rhs LogicalType) bool { + _, ok := rhs.(JSONLogicalType) + return ok +} + +// BSONLogicalType represents a binary JSON string in the byte array +type BSONLogicalType struct{ baseLogicalType } + +func (BSONLogicalType) SortOrder() SortOrder { + return SortUNSIGNED +} + +func (BSONLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]string{"Type": BSONLogicalType{}.String()}) +} + +func (BSONLogicalType) String() string { + return "BSON" +} + +func (BSONLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + return ConvertedTypes.BSON, DecimalMetadata{} +} + +func (BSONLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool { + return c == ConvertedTypes.BSON && !dec.IsSet +} + +func (BSONLogicalType) IsApplicable(t parquet.Type, _ int32) bool { + return t == parquet.Types.ByteArray +} + +func (BSONLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{BSON: format.NewBsonType()} +} + +func (BSONLogicalType) Equals(rhs LogicalType) bool { + _, ok := rhs.(BSONLogicalType) + return ok +} + +// UUIDLogicalType can only be used with a FixedLength byte array column +// that is exactly 16 bytes long +type UUIDLogicalType struct{ baseLogicalType } + +func (UUIDLogicalType) SortOrder() SortOrder { + return SortUNSIGNED +} + +func (UUIDLogicalType) MarshalJSON() ([]byte, error) { + return json.Marshal(map[string]string{"Type": UUIDLogicalType{}.String()}) +} + +func (UUIDLogicalType) String() string { + return "UUID" +} + +func (UUIDLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata) { + return ConvertedTypes.None, DecimalMetadata{} +} + +func (UUIDLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool { + if dec.IsSet { + return false + } + switch c { + case ConvertedTypes.None, ConvertedTypes.NA: + return true + } + return false +} + +func (UUIDLogicalType) IsApplicable(t parquet.Type, tlen int32) bool { + return t == parquet.Types.FixedLenByteArray && tlen == 16 +} + +func (UUIDLogicalType) toThrift() *format.LogicalType { + return &format.LogicalType{UUID: format.NewUUIDType()} +} + +func (UUIDLogicalType) Equals(rhs LogicalType) bool { + _, ok := rhs.(UUIDLogicalType) + return ok +} + +// IntervalLogicalType is not yet in the thrift spec, but represents +// an interval time and needs to be a fixed length byte array of 12 bytes +type IntervalLogicalType struct{ baseLogicalType } + +func (IntervalLogicalType) SortOrder() SortOrder { + return SortUNKNOWN Review comment: are you saying that the comment in parquet.thrift needs to be updated? Or the docs I pointed at need to be updated and I should change this? Sorry, it's just unclear from your comment and I'm not sure if parquet.thrift is considered the source of truth or if the parquet-format repo is lol -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org