Fokko commented on code in PR #369:
URL: https://github.com/apache/iceberg-go/pull/369#discussion_r2021749556
##########
table/internal/parquet_files.go:
##########
@@ -170,6 +172,86 @@ func (parquetFormat) PrimitiveTypeToPhysicalType(typ
iceberg.PrimitiveType) stri
}
}
+func (parquetFormat) GetWriteProperties(props iceberg.Properties) any {
+ writerProps := []parquet.WriterProperty{
+ parquet.WithDictionaryDefault(false),
+
parquet.WithMaxRowGroupLength(int64(props.GetInt("write.parquet.row-group-limit",
1048576))),
+
parquet.WithDataPageSize(int64(props.GetInt("write.parquet.page-size-bytes",
1024*1024))),
+ parquet.WithDataPageVersion(parquet.DataPageV2),
+
parquet.WithBatchSize(int64(props.GetInt("write.parquet.page-row-limit",
20000))),
+
parquet.WithDictionaryPageSizeLimit(int64(props.GetInt("write.parquet.dict-size-bytes",
2*1024*1024))),
Review Comment:
Should we move these in constants? Is that common in Go?
##########
table/table_test.go:
##########
@@ -835,6 +839,271 @@ func (t *TableWritingTestSuite) TestReplaceDataFiles() {
}, staged.CurrentSnapshot().Summary)
}
+func (t *TableWritingTestSuite) TestWriteSpecialCharacterColumn() {
+ ident := table.Identifier{"default", "write_special_character_column"}
+ colNameWithSpecialChar := "letter/abc"
+
+ s := iceberg.NewSchema(0,
+ iceberg.NestedField{ID: 1, Name: colNameWithSpecialChar, Type:
iceberg.PrimitiveTypes.String},
+ iceberg.NestedField{ID: 2, Name: "id", Type:
iceberg.PrimitiveTypes.Int32},
+ iceberg.NestedField{ID: 3, Name: "name", Type:
iceberg.PrimitiveTypes.String, Required: true},
+ iceberg.NestedField{ID: 4, Name: "address", Required: true,
Type: &iceberg.StructType{
+ FieldList: []iceberg.NestedField{
+ {ID: 5, Name: "street", Type:
iceberg.PrimitiveTypes.String, Required: true},
+ {ID: 6, Name: "city", Type:
iceberg.PrimitiveTypes.String, Required: true},
+ {ID: 7, Name: "zip", Type:
iceberg.PrimitiveTypes.Int32, Required: true},
+ {ID: 8, Name: colNameWithSpecialChar, Type:
iceberg.PrimitiveTypes.String, Required: true},
+ },
+ }})
+
+ arrowSchema := arrow.NewSchema([]arrow.Field{
+ {Name: colNameWithSpecialChar, Type: arrow.BinaryTypes.String,
Nullable: true},
+ {Name: "id", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
+ {Name: "name", Type: arrow.BinaryTypes.String},
+ {Name: "address", Type: arrow.StructOf(
+ arrow.Field{Name: "street", Type:
arrow.BinaryTypes.String},
+ arrow.Field{Name: "city", Type:
arrow.BinaryTypes.String},
+ arrow.Field{Name: "zip", Type:
arrow.PrimitiveTypes.Int32},
+ arrow.Field{Name: colNameWithSpecialChar, Type:
arrow.BinaryTypes.String},
+ )},
+ }, nil)
+
+ arrowTable, err := array.TableFromJSON(memory.DefaultAllocator,
arrowSchema, []string{
+ `[
+ {
+ "letter/abc": "a",
+ "id": 1,
+ "name": "AB",
+ "address": {"street": "123", "city": "SFO",
"zip": 12345, "letter/abc": "a"}
+ },
+ {
+ "letter/abc": null,
+ "id": 2,
+ "name": "CD",
+ "address": {"street": "456", "city": "SW",
"zip": 67890, "letter/abc": "b"}
+ },
+ {
+ "letter/abc": "z",
+ "id": 3,
+ "name": "EF",
+ "address": {"street": "789", "city": "Random",
"zip": 10112, "letter/abc": "c"}
+ }
+ ]`,
+ })
+ t.Require().NoError(err)
+ defer arrowTable.Release()
+
+ tbl := t.createTable(ident, t.formatVersion,
*iceberg.UnpartitionedSpec, s)
+ rdr := array.NewTableReader(arrowTable, 1)
+ defer rdr.Release()
+
+ tx := tbl.NewTransaction()
+ t.Require().NoError(tx.Append(t.ctx, rdr, nil))
+
+ scan, err := tx.Scan()
+ t.Require().NoError(err)
+
+ result, err := scan.ToArrowTable(t.ctx)
+ t.Require().NoError(err)
+ defer result.Release()
+
+ t.True(array.TableEqual(arrowTable, result), "expected:\n %s\ngot:\n
%s", arrowTable, result)
+}
+
+func (t *TableWritingTestSuite) getInMemCatalog() catalog.Catalog {
+ cat, err := catalog.Load(context.Background(), "default",
iceberg.Properties{
+ "uri": ":memory:",
+ "type": "sql",
+ sql.DriverKey: sqliteshim.ShimName,
+ sql.DialectKey: string(sql.SQLite),
+ "warehouse": "file://" + t.location,
+ })
+ t.Require().NoError(err)
+
+ return cat
+}
+
+func (t *TableWritingTestSuite) createTableWithProps(identifier
table.Identifier, props iceberg.Properties, sc *iceberg.Schema) *table.Table {
+ cat := t.getInMemCatalog()
+ cat.DropTable(t.ctx, identifier)
+ cat.DropNamespace(t.ctx, catalog.NamespaceFromIdent(identifier))
+
+ t.Require().NoError(cat.CreateNamespace(t.ctx,
catalog.NamespaceFromIdent(identifier), nil))
+ tbl, err := cat.CreateTable(t.ctx, identifier, sc,
catalog.WithProperties(props),
+ catalog.WithLocation(t.location))
+
+ t.Require().NoError(err)
+
+ return tbl
+}
+
+func tableSchema() *iceberg.Schema {
+ return iceberg.NewSchema(0,
+ iceberg.NestedField{ID: 1, Name: "bool", Type:
iceberg.PrimitiveTypes.Bool},
+ iceberg.NestedField{ID: 2, Name: "string", Type:
iceberg.PrimitiveTypes.String},
+ iceberg.NestedField{ID: 3, Name: "string_long", Type:
iceberg.PrimitiveTypes.String},
+ iceberg.NestedField{ID: 4, Name: "int", Type:
iceberg.PrimitiveTypes.Int32},
+ iceberg.NestedField{ID: 5, Name: "long", Type:
iceberg.PrimitiveTypes.Int64},
+ iceberg.NestedField{ID: 6, Name: "float", Type:
iceberg.PrimitiveTypes.Float32},
+ iceberg.NestedField{ID: 7, Name: "double", Type:
iceberg.PrimitiveTypes.Float64},
+ iceberg.NestedField{ID: 8, Name: "time", Type:
iceberg.PrimitiveTypes.Time},
+ iceberg.NestedField{ID: 9, Name: "timestamp", Type:
iceberg.PrimitiveTypes.Timestamp},
+ iceberg.NestedField{ID: 10, Name: "timestamptz", Type:
iceberg.PrimitiveTypes.TimestampTz},
+ iceberg.NestedField{ID: 11, Name: "date", Type:
iceberg.PrimitiveTypes.Date},
+ iceberg.NestedField{ID: 12, Name: "uuid", Type:
iceberg.PrimitiveTypes.UUID},
+ iceberg.NestedField{ID: 13, Name: "binary", Type:
iceberg.PrimitiveTypes.Binary},
+ iceberg.NestedField{ID: 14, Name: "fixed", Type:
iceberg.FixedTypeOf(16)},
+ )
Review Comment:
Should we add decimal as well?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]