caldempsey commented on issue #30: URL: https://github.com/apache/arrow-go/issues/30#issuecomment-3071145315
@loicalleyne nice work, but even though it gets the schema looking right it fails a test of ``` package main import ( "bytes" "fmt" "log" "os" "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/array" "github.com/apache/arrow-go/v18/arrow/memory" "github.com/apache/arrow-go/v18/parquet/pqarrow" "github.com/loicalleyne/bodkin" ) func main() { // Complex nested JSON data (NDJSON format) // Note: Changed phone country to string format to avoid type issues jsonData := `{"id": 1, "user": {"name": "Alice Johnson", "age": 30, "contact": {"email": "al...@example.com", "phone": {"country": "+1", "number": "555-0123"}, "address": {"street": "123 Main St", "city": "NYC", "state": "NY", "coordinates": {"latitude": 40.7128, "longitude": -74.0060}}}}, "employment": {"company": {"name": "TechCorp", "industry": "Software", "location": {"building": "Tower A", "floor": 15}}, "position": {"title": "Senior Engineer", "department": "Backend", "level": 4}, "compensation": {"base": 150000, "bonus": {"target": 20000, "multiplier": 1.5}, "equity": {"vested": 5000, "unvested": 15000}}}, "projects": [{"id": "proj-001", "name": "API Gateway", "status": "completed", "metrics": {"linesOfCode": 25000, "coverage": 87.5}}, {"id": "proj-002", "name": "Data Pipeline", "status": "in-progress", "metrics": {"linesOfCode": 12000, "coverage": 92.1}}]} {"id": 2, "user": {"name": "Bob Smith", "age": 28, "contact": {"email": "b...@example.com", "phone": {"country": "+1", "number": "555-0456"}, "address": {"street": "456 Oak Ave", "city": "LA", "state": "CA", "coordinates": {"latitude": 34.0522, "longitude": -118.2437}}}}, "employment": {"company": {"name": "DataFlow", "industry": "Analytics", "location": {"building": "Campus B", "floor": 3}}, "position": {"title": "Data Scientist", "department": "ML", "level": 3}, "compensation": {"base": 130000, "bonus": {"target": 15000, "multiplier": 1.2}, "equity": {"vested": 2000, "unvested": 8000}}}, "projects": [{"id": "proj-003", "name": 
"ML Model v2", "status": "completed", "metrics": {"linesOfCode": 8000, "coverage": 94.2}}]} {"id": 3, "user": {"name": "Charlie Davis", "age": 35, "contact": {"email": "char...@example.com", "phone": {"country": "+44", "number": "7700-900123"}, "address": {"street": "789 High St", "city": "London", "state": "UK", "coordinates": {"latitude": 51.5074, "longitude": -0.1278}}}}, "employment": {"company": {"name": "GlobalTech", "industry": "Consulting", "location": {"building": "HQ", "floor": 22}}, "position": {"title": "Tech Lead", "department": "Architecture", "level": 5}, "compensation": {"base": 180000, "bonus": {"target": 30000, "multiplier": 2.0}, "equity": {"vested": 10000, "unvested": 20000}}}, "projects": [{"id": "proj-004", "name": "Cloud Migration", "status": "planning", "metrics": {"linesOfCode": 0, "coverage": 0}}, {"id": "proj-005", "name": "Security Audit", "status": "completed", "metrics": {"linesOfCode": 5000, "coverage": 99.1}}, {"id": "proj-006", "name": "Performance Optimization", "status": "in-progress", "metrics": {"linesOfCode": 3500, "coverage": 88.7}} ]}` pool := memory.NewGoAllocator() // Create bodkin instance with options u := bodkin.NewBodkin( bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion(), bodkin.WithQuotedValuesAreStrings(), // This helps with "+1" being treated as string ) // Unify to infer schema err := u.Unify(jsonData) if err != nil { log.Fatal("Failed to unify:", err) } schema, err := u.Schema() if err != nil { log.Fatal("Failed to get schema:", err) } // Print schema summary fmt.Println("=== Inferred Schema Summary ===") for i, field := range schema.Fields() { fmt.Printf("%d: %s (%s)\n", i, field.Name, field.Type) } fmt.Println() // Create JSON reader reader := array.NewJSONReader(bytes.NewReader([]byte(jsonData)), schema) defer reader.Release() // Read all records var records []arrow.Record for reader.Next() { rec := reader.Record() rec.Retain() records = append(records, rec) } if err := reader.Err(); err != nil { 
log.Fatal("JSON reader error:", err) } fmt.Printf("Read %d records successfully\n", len(records)) // Method 1: Write individual records with added column fmt.Println("\n=== Method 1: Individual Records with Score ===") for i, rec := range records { // Add a score column to each record newRec := addScoreColumn(rec, schema, float64(90+i*5), pool) writeToParquet(newRec, newRec.Schema(), fmt.Sprintf("output_individual_%d.parquet", i+1)) newRec.Release() } // Clean up for _, rec := range records { rec.Release() } } func addScoreColumn(rec arrow.Record, schema *arrow.Schema, score float64, pool memory.Allocator) arrow.Record { // Create score array with same number of rows as the record scoreBuilder := array.NewFloat64Builder(pool) defer scoreBuilder.Release() for i := 0; i < int(rec.NumRows()); i++ { scoreBuilder.Append(score) } scoreArray := scoreBuilder.NewArray() defer scoreArray.Release() // Create new schema with score field newFields := make([]arrow.Field, len(schema.Fields())+1) copy(newFields, schema.Fields()) newFields[len(schema.Fields())] = arrow.Field{ Name: "performance_score", Type: arrow.PrimitiveTypes.Float64, } newSchema := arrow.NewSchema(newFields, nil) // Create new columns array newColumns := make([]arrow.Array, rec.NumCols()+1) for i := 0; i < int(rec.NumCols()); i++ { newColumns[i] = rec.Column(i) } newColumns[rec.NumCols()] = scoreArray return array.NewRecord(newSchema, newColumns, rec.NumRows()) } func writeToParquet(record arrow.Record, schema *arrow.Schema, filename string) { file, err := os.Create(filename) if err != nil { log.Fatal("Failed to create file:", err) } defer file.Close() writer, err := pqarrow.NewFileWriter(schema, file, nil, // use default parquet properties pqarrow.NewArrowWriterProperties()) if err != nil { log.Fatal("Failed to create parquet writer:", err) } defer writer.Close() err = writer.Write(record) if err != nil { log.Fatal("Failed to write parquet:", err) } fmt.Printf("Created %s (%d rows, %d columns)\n", filename, 
record.NumRows(), record.NumCols()) } ``` when data is written out using the combination of bodkin and arrow; as you can see, the email is botched. I'm not sure if this is a bodkin issue or an arrow issue. Unfortunately, this is a bit out of scope for me too, so I'll look away from arrow for a solution to my needs here (going from JSON -> Arrow format when the JSON is unstructured data). I'm not sure if this is something you will want to look into, but it's something to try. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org