caldempsey commented on issue #30:
URL: https://github.com/apache/arrow-go/issues/30#issuecomment-3071145315

   @loicalleyne nice work — but even though it gets the schema looking right, it 
fails the following test:
   ```
   package main
   
   import (
        "bytes"
        "fmt"
        "log"
        "os"
   
        "github.com/apache/arrow-go/v18/arrow"
        "github.com/apache/arrow-go/v18/arrow/array"
        "github.com/apache/arrow-go/v18/arrow/memory"
        "github.com/apache/arrow-go/v18/parquet/pqarrow"
        "github.com/loicalleyne/bodkin"
   )
   
   func main() {
        // Complex nested JSON data (NDJSON format: one JSON object per line)
        // Note: Changed phone country to string format to avoid type issues
        jsonData := `{"id": 1, "user": {"name": "Alice Johnson", "age": 30, 
"contact": {"email": "al...@example.com", "phone": {"country": "+1", "number": 
"555-0123"}, "address": {"street": "123 Main St", "city": "NYC", "state": "NY", 
"coordinates": {"latitude": 40.7128, "longitude": -74.0060}}}}, "employment": 
{"company": {"name": "TechCorp", "industry": "Software", "location": 
{"building": "Tower A", "floor": 15}}, "position": {"title": "Senior Engineer", 
"department": "Backend", "level": 4}, "compensation": {"base": 150000, "bonus": 
{"target": 20000, "multiplier": 1.5}, "equity": {"vested": 5000, "unvested": 
15000}}}, "projects": [{"id": "proj-001", "name": "API Gateway", "status": 
"completed", "metrics": {"linesOfCode": 25000, "coverage": 87.5}}, {"id": 
"proj-002", "name": "Data Pipeline", "status": "in-progress", "metrics": 
{"linesOfCode": 12000, "coverage": 92.1}}]}
   {"id": 2, "user": {"name": "Bob Smith", "age": 28, "contact": {"email": 
"b...@example.com", "phone": {"country": "+1", "number": "555-0456"}, 
"address": {"street": "456 Oak Ave", "city": "LA", "state": "CA", 
"coordinates": {"latitude": 34.0522, "longitude": -118.2437}}}}, "employment": 
{"company": {"name": "DataFlow", "industry": "Analytics", "location": 
{"building": "Campus B", "floor": 3}}, "position": {"title": "Data Scientist", 
"department": "ML", "level": 3}, "compensation": {"base": 130000, "bonus": 
{"target": 15000, "multiplier": 1.2}, "equity": {"vested": 2000, "unvested": 
8000}}}, "projects": [{"id": "proj-003", "name": "ML Model v2", "status": 
"completed", "metrics": {"linesOfCode": 8000, "coverage": 94.2}}]}
   {"id": 3, "user": {"name": "Charlie Davis", "age": 35, "contact": {"email": 
"char...@example.com", "phone": {"country": "+44", "number": "7700-900123"}, 
"address": {"street": "789 High St", "city": "London", "state": "UK", 
"coordinates": {"latitude": 51.5074, "longitude": -0.1278}}}}, "employment": 
{"company": {"name": "GlobalTech", "industry": "Consulting", "location": 
{"building": "HQ", "floor": 22}}, "position": {"title": "Tech Lead", 
"department": "Architecture", "level": 5}, "compensation": {"base": 180000, 
"bonus": {"target": 30000, "multiplier": 2.0}, "equity": {"vested": 10000, 
"unvested": 20000}}}, "projects": [{"id": "proj-004", "name": "Cloud 
Migration", "status": "planning", "metrics": {"linesOfCode": 0, "coverage": 
0}}, {"id": "proj-005", "name": "Security Audit", "status": "completed", 
"metrics": {"linesOfCode": 5000, "coverage": 99.1}}, {"id": "proj-006", "name": 
"Performance Optimization", "status": "in-progress", "metrics": {"linesOfCode": 
3500, "coverage": 88.7}}
 ]}`
        // NOTE(review): the raw-string literal above appears to have been
        // line-wrapped by the mail archive (and the e-mail addresses truncated
        // to "al...@", "b...@", "char...@"). As pasted, the injected newlines
        // fall inside JSON objects, which would break per-line NDJSON parsing
        // — confirm against the original, unwrapped program before reproducing.

        // Single allocator shared by every Arrow buffer built below.
        pool := memory.NewGoAllocator()

        // Create bodkin instance with options
        u := bodkin.NewBodkin(
                bodkin.WithInferTimeUnits(),
                bodkin.WithTypeConversion(),
                bodkin.WithQuotedValuesAreStrings(), // so "+1" stays a string rather than a number
        )

        // Unify to infer schema
        err := u.Unify(jsonData)
        if err != nil {
                log.Fatal("Failed to unify:", err)
        }

        schema, err := u.Schema()
        if err != nil {
                log.Fatal("Failed to get schema:", err)
        }

        // Print schema summary
        fmt.Println("=== Inferred Schema Summary ===")
        for i, field := range schema.Fields() {
                fmt.Printf("%d: %s (%s)\n", i, field.Name, field.Type)
        }
        fmt.Println()

        // Create JSON reader: re-parse the same raw bytes, now under the
        // bodkin-inferred schema.
        reader := array.NewJSONReader(bytes.NewReader([]byte(jsonData)), schema)
        defer reader.Release()

        // Read all records. Retain keeps each record alive after the reader
        // advances past it; the matching Release happens at the bottom of main.
        var records []arrow.Record
        for reader.Next() {
                rec := reader.Record()
                rec.Retain()
                records = append(records, rec)
        }

        if err := reader.Err(); err != nil {
                log.Fatal("JSON reader error:", err)
        }

        fmt.Printf("Read %d records successfully\n", len(records))

        // Method 1: Write individual records with added column
        fmt.Println("\n=== Method 1: Individual Records with Score ===")
        for i, rec := range records {
                // Add a score column to each record; addScoreColumn returns a
                // fresh record reference that must be released after writing.
                newRec := addScoreColumn(rec, schema, float64(90+i*5), pool)
                writeToParquet(newRec, newRec.Schema(), 
fmt.Sprintf("output_individual_%d.parquet", i+1))
                newRec.Release()
        }

        // Clean up
        for _, rec := range records {
                rec.Release()
        }
   }
   
   // addScoreColumn returns a new record that contains every column of rec
   // plus a trailing Float64 "performance_score" column holding score in all
   // rows. The returned record carries its own reference and must be released
   // by the caller; rec itself is left untouched.
   func addScoreColumn(rec arrow.Record, schema *arrow.Schema, score float64, 
pool memory.Allocator) arrow.Record {
        // Build the constant score column with the same row count as rec.
        scoreBuilder := array.NewFloat64Builder(pool)
        defer scoreBuilder.Release()

        n := int(rec.NumRows())
        scoreBuilder.Reserve(n) // one allocation instead of growth-on-append
        for i := 0; i < n; i++ {
                scoreBuilder.Append(score)
        }
        // NewRecord (below) retains the arrays it is handed, so releasing our
        // reference via defer does not invalidate the returned record.
        scoreArray := scoreBuilder.NewArray()
        defer scoreArray.Release()

        // Extend the schema with the score field. BUG FIX: the original passed
        // nil metadata to NewSchema, silently discarding any metadata on the
        // inferred schema; carry it over instead.
        newFields := make([]arrow.Field, len(schema.Fields())+1)
        copy(newFields, schema.Fields())
        newFields[len(schema.Fields())] = arrow.Field{
                Name: "performance_score",
                Type: arrow.PrimitiveTypes.Float64,
        }
        md := schema.Metadata()
        newSchema := arrow.NewSchema(newFields, &md)

        // Reuse rec's existing columns (retained by NewRecord) and append the
        // new score column at the end, matching the field order above.
        newColumns := make([]arrow.Array, 0, rec.NumCols()+1)
        for i := 0; i < int(rec.NumCols()); i++ {
                newColumns = append(newColumns, rec.Column(i))
        }
        newColumns = append(newColumns, scoreArray)

        return array.NewRecord(newSchema, newColumns, rec.NumRows())
   }
   
   func writeToParquet(record arrow.Record, schema *arrow.Schema, filename 
string) {
        file, err := os.Create(filename)
        if err != nil {
                log.Fatal("Failed to create file:", err)
        }
        defer file.Close()
   
        writer, err := pqarrow.NewFileWriter(schema, file,
                nil, // use default parquet properties
                pqarrow.NewArrowWriterProperties())
        if err != nil {
                log.Fatal("Failed to create parquet writer:", err)
        }
        defer writer.Close()
   
        err = writer.Write(record)
        if err != nil {
                log.Fatal("Failed to write parquet:", err)
        }
   
        fmt.Printf("Created %s (%d rows, %d columns)\n", filename, 
record.NumRows(), record.NumCols())
   }
   
   ```
   
   when data is written out using the combination of bodkin and arrow. As you 
can see, the email field is mangled; I'm not sure whether this is a bodkin 
issue or an arrow issue. Unfortunately this is a bit out of scope for me too, 
so I'll look outside arrow for a solution to my needs here (going from JSON to 
Arrow format when the JSON is unstructured data). I'm not sure whether this is 
something you will want to look into, but it's something to try.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to