Dirt-Nasty commented on issue #744:
URL: https://github.com/apache/arrow-go/issues/744#issuecomment-4183992478
@amoeba
Hard to reproduce without access to a Snowflake environment.
My writer on the Go side is pretty large, so
this is a small example code block of how I use the Go writer:
```
package main
import (
"os"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
"github.com/apache/arrow-go/v18/arrow/memory"
"github.com/apache/arrow-go/v18/parquet"
"github.com/apache/arrow-go/v18/parquet/compress"
"github.com/apache/arrow-go/v18/parquet/pqarrow"
)
// main builds a three-row Arrow record whose third column is a
// list<struct<id, token, amount>> and writes it to go_nested.parquet
// through pqarrow. Any error from creating the file or from the Parquet
// writer results in a panic.
func main() {
	// must panics with err when it is non-nil.
	must := func(err error) {
		if err != nil {
			panic(err)
		}
	}

	alloc := memory.NewGoAllocator()

	// Element type of the "ops" list column.
	opFields := []arrow.Field{
		{Name: "id", Type: arrow.BinaryTypes.String, Nullable: false},
		{Name: "token", Type: arrow.BinaryTypes.String, Nullable: true},
		{Name: "amount", Type: arrow.BinaryTypes.String, Nullable: true},
	}
	opType := arrow.StructOf(opFields...)

	columns := []arrow.Field{
		{Name: "block_num", Type: arrow.PrimitiveTypes.Uint64, Nullable: false},
		{Name: "tx_id", Type: arrow.BinaryTypes.String, Nullable: false},
		// The column under investigation: array<struct<...>>.
		{Name: "ops", Type: arrow.ListOf(opType), Nullable: true},
	}
	schema := arrow.NewSchema(columns, nil)

	builder := array.NewRecordBuilder(alloc, schema)
	defer builder.Release()

	// Flat columns: three values each, no validity slice supplied.
	blockNums := builder.Field(0).(*array.Uint64Builder)
	blockNums.AppendValues([]uint64{100, 101, 102}, nil)
	txIDs := builder.Field(1).(*array.StringBuilder)
	txIDs.AppendValues([]string{"tx-a", "tx-b", "tx-c"}, nil)

	// Nested column: the list builder, its struct child, and the three
	// string builders for the struct's fields.
	opsList := builder.Field(2).(*array.ListBuilder)
	opEntry := opsList.ValueBuilder().(*array.StructBuilder)
	idCol := opEntry.FieldBuilder(0).(*array.StringBuilder)
	tokenCol := opEntry.FieldBuilder(1).(*array.StringBuilder)
	amountCol := opEntry.FieldBuilder(2).(*array.StringBuilder)

	// First row: a list holding two structs.
	opsList.Append(true)
	opEntry.Append(true)
	idCol.Append("op-1")
	tokenCol.Append("USDC")
	amountCol.Append("10")
	opEntry.Append(true)
	idCol.Append("op-2")
	tokenCol.Append("ETH")
	amountCol.Append("1.5")

	// Second row: a valid list slot with no elements appended.
	opsList.Append(true)

	// Third row: a single struct whose token field is null.
	opsList.Append(true)
	opEntry.Append(true)
	idCol.Append("op-3")
	tokenCol.AppendNull()
	amountCol.Append("42")

	record := builder.NewRecord()
	defer record.Release()

	out, err := os.Create("go_nested.parquet")
	must(err)
	defer out.Close()

	writerProps := parquet.NewWriterProperties(
		parquet.WithCompression(compress.Codecs.Zstd),
		parquet.WithDictionaryDefault(true),
		parquet.WithStats(true),
		parquet.WithMaxRowGroupLength(10000),
	)
	arrowWriterProps := pqarrow.NewArrowWriterProperties(pqarrow.WithStoreSchema())

	fileWriter, err := pqarrow.NewFileWriter(schema, out, writerProps, arrowWriterProps)
	must(err)

	must(fileWriter.Write(record))
	must(fileWriter.Close())
}
```
The Python side, which runs after the Go writer has completed, is only a few lines.
```
def rewrite_repack(a_path, out_path, batch_size=65536):
    """Copy the Parquet file at ``a_path`` to ``out_path`` batch by batch.

    The output writer is created with the Arrow schema reported by the
    source file, and every batch yielded by ``iter_batches`` is written
    unchanged. The writer is closed even if reading or writing fails.

    Args:
        a_path: Location of the Parquet file to read.
        out_path: Location of the Parquet file to write.
        batch_size: Value passed to ``iter_batches`` as ``batch_size``.

    NOTE(review): ``pq`` is presumably ``pyarrow.parquet``; the import is
    not part of this snippet, so confirm against the full script.
    """
    source = pq.ParquetFile(a_path)
    # Leaving the with-block closes the writer on both success and error.
    with pq.ParquetWriter(out_path, source.schema_arrow) as sink:
        for record_batch in source.iter_batches(batch_size=batch_size):
            sink.write_batch(record_batch)
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]