zeroshade commented on a change in pull request #9817: URL: https://github.com/apache/arrow/pull/9817#discussion_r610837844
########## File path: go/parquet/writer_properties.go ########## @@ -0,0 +1,510 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package parquet + +import ( + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet/compress" +) + +// Constants for default property values used for the default reader, writer and column props. +const ( + // Default Buffer size used for the Reader + DefaultBufSize int64 = 4096 * 4 + // Default data page size limit is 1MB; it's not guaranteed, but we will try to + // cut data pages off at this size where possible. + DefaultDataPageSize int64 = 1024 * 1024 + // Default is for dictionary encoding to be turned on, use WithDictionaryDefault + // writer property to change that. + DefaultDictionaryEnabled = true + // If the dictionary reaches the size of this limitation, the writer will use + // the fallback encoding (usually plain) instead of continuing to build the + // dictionary index. + DefaultDictionaryPageSizeLimit = DefaultDataPageSize + // In order to attempt to facilitate data page size limits for writing, + // data is written in batches. 
Increasing the batch size may improve performance + // but the larger the batch size, the easier it is to overshoot the datapage limit. + DefaultWriteBatchSize int64 = 1024 + // Default maximum number of rows for a single row group + DefaultMaxRowGroupLen int64 = 64 * 1024 * 1024 + // Default is to have stats enabled for all columns, use writer properties to + // change the default, or to enable/disable for specific columns. + DefaultStatsEnabled = true + // If the stats are larger than 4K the writer will skip writing them out anyways. + DefaultMaxStatsSize int64 = 4096 + DefaultCreatedBy = "parquet-go version 1.0.0" +) + +// ColumnProperties defines the encoding, codec, and so on for a given column. +type ColumnProperties struct { + Encoding Encoding + Codec compress.Compression + DictionaryEnabled bool + StatsEnabled bool + MaxStatsSize int64 + CompressionLevel int +} + +// DefaultColumnProperties returns the default properties which get utilized for writing. +// +// The default column properties are the following constants: +// Encoding: Encodings.Plain +// Codec: compress.Codecs.Uncompressed +// DictionaryEnabled: DefaultDictionaryEnabled +// StatsEnabled: DefaultStatsEnabled +// MaxStatsSize: DefaultMaxStatsSize +// CompressionLevel: compress.DefaultCompressionLevel +func DefaultColumnProperties() ColumnProperties { + return ColumnProperties{ + Encoding: Encodings.Plain, + Codec: compress.Codecs.Uncompressed, + DictionaryEnabled: DefaultDictionaryEnabled, + StatsEnabled: DefaultStatsEnabled, + MaxStatsSize: DefaultMaxStatsSize, + CompressionLevel: compress.DefaultCompressionLevel, + } +} + +type writerPropConfig struct { + wr *WriterProperties + encodings map[string]Encoding + codecs map[string]compress.Compression + compressLevel map[string]int + dictEnabled map[string]bool + statsEnabled map[string]bool +} + +// WriterProperty is used as the options for building a writer properties instance +type WriterProperty func(*writerPropConfig) + +// WithAllocator 
specifies the writer to use the given allocator +func WithAllocator(mem memory.Allocator) WriterProperty { + return func(cfg *writerPropConfig) { + cfg.wr.mem = mem + } +} + +// WithDictionaryDefault sets the default value for whether to enable dictionary encoding +func WithDictionaryDefault(dict bool) WriterProperty { + return func(cfg *writerPropConfig) { + cfg.wr.defColumnProps.DictionaryEnabled = dict + } +} + +// WithDictionaryFor allows enabling or disabling dictionary encoding for a given column path string +func WithDictionaryFor(path string, dict bool) WriterProperty { + return func(cfg *writerPropConfig) { + cfg.dictEnabled[path] = dict + } +} + +// WithDictionaryPath is like WithDictionaryFor, but takes a ColumnPath type +func WithDictionaryPath(path ColumnPath, dict bool) WriterProperty { + return WithDictionaryFor(path.String(), dict) +} + +// WithDictionaryPageSizeLimit is the limit of the dictionary at which the writer +// will fallback to plain encoding instead +func WithDictionaryPageSizeLimit(limit int64) WriterProperty { + return func(cfg *writerPropConfig) { + cfg.wr.dictPagesize = limit + } +} + +// WithBatchSize specifies the number of rows to use for batch writes to columns +func WithBatchSize(batch int64) WriterProperty { + return func(cfg *writerPropConfig) { + cfg.wr.batchSize = batch + } +} + +// WithMaxRowGroupLength specifies the number of rows as the maximum number of rows for a given row group in the writer. +func WithMaxRowGroupLength(nrows int64) WriterProperty { Review comment: The C++ implementation uses a Builder pattern that constructs an object with a bunch of "Set...." functions on it and then eventually calls "Build" to generate the resulting object. This follows the idiomatic Go pattern for taking options, which is the same pattern used in the Arrow Go library for things like the ipc.NewWriter etc. 
Essentially it ends up looking like: ```go parquet.NewWriterProperties(parquet.WithMaxRowGroupLength(nrows), parquet.With.........) ``` where `NewWriterProperties` can take any number of these `WriterProperty` options including none which would just produce the default WriterProperties. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
