Re: [PR] feat(table): clustered partitioned write path [iceberg-go]

via GitHub Thu, 30 Apr 2026 01:32:54 -0700


twuebi commented on code in PR #948:
URL: https://github.com/apache/iceberg-go/pull/948#discussion_r3166616140



##########
table/clustered_writer.go:
##########
@@ -0,0 +1,246 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package table
+
+import (
+       "cmp"
+       "context"
+       "errors"
+       "fmt"
+       "iter"
+       "slices"
+
+       "github.com/apache/arrow-go/v18/arrow"
+       "github.com/apache/iceberg-go"
+)
+
+// clusteredPartitionedWrite writes records to partitioned data files,
+// keeping at most one partition writer open at a time. When the
+// partition changes, the current writer is closed before opening a
+// new one. This is the memory-efficient write path for pre-clustered
+// input (e.g. compaction reads where each source file belongs to a
+// single partition).
+//
+// The input must be clustered by partition across batches: once a
+// partition's writer has been closed, encountering further records
+// for that partition returns an error. Within a single batch the
+// writer reclusters rows by partition. Use the fanout writer if the
+// input is not clustered across batches.
+//
+// Breaking out of the returned iterator early cancels the producer
+// so it stops opening new writers; in-flight writers finish cleanly.
+func clusteredPartitionedWrite(
+       ctx context.Context,
+       spec iceberg.PartitionSpec,
+       schema *iceberg.Schema,
+       factory *writerFactory,
+       records iter.Seq2[arrow.RecordBatch, error],
+) iter.Seq2[iceberg.DataFile, error] {
+       ctx, cancel := context.WithCancel(ctx)
+
+       outputCh := make(chan iceberg.DataFile, 1)
+       errCh := make(chan error, 1)
+
+       go func() {
+               defer close(outputCh)
+               defer close(errCh)
+               defer factory.stopCount()
+
+               var (
+                       currentRec          partitionRecord
+                       currentWriter       *RollingDataWriter
+                       completedPartitions = &closedPartitionSet{}
+               )
+
+               closeCurrentWriter := func() error {
+                       if currentWriter == nil {
+                               return nil
+                       }
+                       w := currentWriter
+                       currentWriter = nil
+                       completedPartitions.add(currentRec)
+                       close(w.recordCh)
+                       w.wg.Wait()
+
+                       // stream's deferred close(errorCh) runs before its
+                       // deferred wg.Done, so by the time Wait returns the
+                       // channel is closed; this read never blocks and yields
+                       // either the buffered error or nil.
+                       return <-w.errorCh
+               }
+
+               sendErr := func(err error) {
+                       select {
+                       case errCh <- err:
+                       default:
+                       }
+               }
+
+               fail := func(err error) {
+                       sendErr(errors.Join(err, closeCurrentWriter()))
+               }
+
+               // Recover any panic so the consumer is not left blocking on
+               // errCh forever. Declared last so it runs first on goroutine
+               // exit, before the close(errCh) and close(outputCh) defers.
+               defer func() {
+                       if r := recover(); r != nil {
+                               fail(fmt.Errorf("clustered write panic: %v", r))
+                       }
+               }()
+
+               takeFn := partitionBatchByKey(ctx)
+
+               for rec, err := range records {
+                       if ctxErr := ctx.Err(); ctxErr != nil {
+                               fail(context.Cause(ctx))
+
+                               return
+                       }
+                       if err != nil {
+                               fail(err)
+
+                               return
+                       }
+
+                       partitions, err := getRecordPartitions(spec, schema, rec)
+                       if err != nil {
+                               fail(err)
+
+                               return
+                       }
+
+                       // Process partitions in input row order so the revisit
+                       // check is deterministic; getRecordPartitions returns
+                       // them in arbitrary (Go map iteration) order.
+                       slices.SortFunc(partitions, func(a, b *partitionInfo) int {
+                               return cmp.Compare(a.rows[0], b.rows[0])
+                       })
+
+                       for _, part := range partitions {
+                               select {
+                               case <-ctx.Done():
+                                       fail(context.Cause(ctx))
+
+                                       return
+                               default:
+                               }
+
+                               subBatch, err := takeFn(rec, part.rows)
+                               if err != nil {
+                                       fail(err)
+
+                                       return
+                               }
+
+                               if currentWriter == nil || !slices.Equal(currentRec, part.partitionRec) {
+                                       if err := closeCurrentWriter(); err != nil {
+                                               subBatch.Release()

Review Comment:
   added a function



##########
table/clustered_writer.go:
##########
@@ -0,0 +1,246 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package table
+
+import (
+       "cmp"
+       "context"
+       "errors"
+       "fmt"
+       "iter"
+       "slices"
+
+       "github.com/apache/arrow-go/v18/arrow"
+       "github.com/apache/iceberg-go"
+)
+
+// clusteredPartitionedWrite writes records to partitioned data files,
+// keeping at most one partition writer open at a time. When the
+// partition changes, the current writer is closed before opening a
+// new one. This is the memory-efficient write path for pre-clustered
+// input (e.g. compaction reads where each source file belongs to a
+// single partition).
+//
+// The input must be clustered by partition across batches: once a
+// partition's writer has been closed, encountering further records
+// for that partition returns an error. Within a single batch the
+// writer reclusters rows by partition. Use the fanout writer if the
+// input is not clustered across batches.
+//
+// Breaking out of the returned iterator early cancels the producer
+// so it stops opening new writers; in-flight writers finish cleanly.
+func clusteredPartitionedWrite(
+       ctx context.Context,
+       spec iceberg.PartitionSpec,
+       schema *iceberg.Schema,
+       factory *writerFactory,
+       records iter.Seq2[arrow.RecordBatch, error],
+) iter.Seq2[iceberg.DataFile, error] {
+       ctx, cancel := context.WithCancel(ctx)
+
+       outputCh := make(chan iceberg.DataFile, 1)
+       errCh := make(chan error, 1)
+
+       go func() {
+               defer close(outputCh)
+               defer close(errCh)
+               defer factory.stopCount()
+
+               var (
+                       currentRec          partitionRecord
+                       currentWriter       *RollingDataWriter
+                       completedPartitions = &closedPartitionSet{}
+               )
+
+               closeCurrentWriter := func() error {
+                       if currentWriter == nil {
+                               return nil
+                       }
+                       w := currentWriter
+                       currentWriter = nil
+                       completedPartitions.add(currentRec)
+                       close(w.recordCh)
+                       w.wg.Wait()
+
+                       // stream's deferred close(errorCh) runs before its
+                       // deferred wg.Done, so by the time Wait returns the
+                       // channel is closed; this read never blocks and yields
+                       // either the buffered error or nil.
+                       return <-w.errorCh
+               }
+
+               sendErr := func(err error) {
+                       select {
+                       case errCh <- err:
+                       default:
+                       }
+               }
+
+               fail := func(err error) {
+                       sendErr(errors.Join(err, closeCurrentWriter()))
+               }
+
+               // Recover any panic so the consumer is not left blocking on
+               // errCh forever. Declared last so it runs first on goroutine
+               // exit, before the close(errCh) and close(outputCh) defers.
+               defer func() {
+                       if r := recover(); r != nil {
+                               fail(fmt.Errorf("clustered write panic: %v", r))
+                       }
+               }()
+
+               takeFn := partitionBatchByKey(ctx)
+
+               for rec, err := range records {
+                       if ctxErr := ctx.Err(); ctxErr != nil {
+                               fail(context.Cause(ctx))
+
+                               return
+                       }
+                       if err != nil {
+                               fail(err)
+
+                               return
+                       }
+
+                       partitions, err := getRecordPartitions(spec, schema, rec)
+                       if err != nil {
+                               fail(err)
+
+                               return
+                       }
+
+                       // Process partitions in input row order so the revisit
+                       // check is deterministic; getRecordPartitions returns
+                       // them in arbitrary (Go map iteration) order.
+                       slices.SortFunc(partitions, func(a, b *partitionInfo) int {
+                               return cmp.Compare(a.rows[0], b.rows[0])
+                       })
+
+                       for _, part := range partitions {
+                               select {
+                               case <-ctx.Done():
+                                       fail(context.Cause(ctx))
+
+                                       return
+                               default:
+                               }
+
+                               subBatch, err := takeFn(rec, part.rows)
+                               if err != nil {
+                                       fail(err)
+
+                                       return
+                               }
+
+                               if currentWriter == nil || !slices.Equal(currentRec, part.partitionRec) {
+                                       if err := closeCurrentWriter(); err != nil {
+                                               subBatch.Release()
+                                               sendErr(err)
+
+                                               return
+                                       }
+                                       if completedPartitions.contains(part.partitionRec) {
+                                               partitionPath := spec.PartitionToPath(part.partitionRec, schema)
+                                               subBatch.Release()
+                                               fail(fmt.Errorf("clustered write: incoming records violate the clustering assumption; "+
+                                                       "partition %q has records arriving after its writer was already closed", partitionPath))
+
+                                               return
+                                       }
+                                       partitionPath := spec.PartitionToPath(part.partitionRec, schema)
+                                       currentWriter = factory.newRollingDataWriter(
+                                               ctx, nil, partitionPath, part.partitionValues, outputCh)
+                                       currentRec = part.partitionRec
+                               }
+
+                               addErr := currentWriter.Add(subBatch)
+                               subBatch.Release()
+                               if addErr != nil {
+                                       fail(addErr)
+
+                                       return
+                               }
+                       }
+               }
+
+               if err := closeCurrentWriter(); err != nil {
+                       sendErr(err)
+               }
+       }()
+
+       return func(yield func(iceberg.DataFile, error) bool) {
+               // LIFO defer order matters: cancel signals the producer first
+               // (synchronous, instant), then the drain pulls outputCh so
+               // any in-flight stream send can complete and the producer's
+               // closeCurrentWriter / wg.Wait paths unblock.
+               defer func() {
+                       for range outputCh {
+                       }
+               }()
+               defer cancel()
+
+               for df := range outputCh {
+                       if !yield(df, nil) {
+                               return
+                       }
+               }
+
+               if err := <-errCh; err != nil {
+                       yield(nil, err)
+               }
+       }
+}
+
+// closedPartitionSet tracks already-closed partitions by walking a
+// tree keyed on each partition field's value, mirroring the layout of
+// partitionMapNode. Go's any-equality at each level distinguishes SQL
+// NULL from the literal string "null" — the same property that
+// PartitionToPath drops via Transform.ToHumanStr — so this is the
+// structural key the revisit check needs.
+type closedPartitionSet struct {
+       children map[any]*closedPartitionSet
+}

Review Comment:
   changed, thanks



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] feat(table): clustered partitioned write path [iceberg-go]

Reply via email to