This is an automated email from the ASF dual-hosted git repository. sbinet pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new cc0a9cb ARROW-4735: [Go] Optimize CSV writer CPU/Mem performances cc0a9cb is described below commit cc0a9cb620a2f866e6b6d4ad61721ea1e880605e Author: Sebastien Binet <bi...@cern.ch> AuthorDate: Tue Mar 12 22:34:53 2019 +0100 ARROW-4735: [Go] Optimize CSV writer CPU/Mem performances This CL replaces all the naive calls to fmt.Sprintf with strconv.FormatXYZ. ``` $> benchstat old.txt new.txt name old time/op new time/op delta Write-4 2.21ms ±12% 1.15ms ± 1% -47.78% (p=0.000 n=30+30) name old alloc/op new alloc/op delta Write-4 335kB ± 0% 308kB ± 0% -8.21% (p=0.000 n=30+29) name old allocs/op new allocs/op delta Write-4 21.8k ± 0% 11.6k ± 0% -46.96% (p=0.000 n=30+30) ``` Author: Sebastien Binet <bi...@cern.ch> Closes #3879 from sbinet/issue-4735 and squashes the following commits: 48f53222 <Sebastien Binet> go/arrow/csv: fix record index display 26665af2 <Sebastien Binet> ARROW-4735: Optimize CSV writer CPU/Mem performances --- go/arrow/csv/reader_test.go | 248 ++++++++++++++++++++++---------------------- go/arrow/csv/writer.go | 26 ++--- go/arrow/csv/writer_test.go | 60 +++++++++++ 3 files changed, 197 insertions(+), 137 deletions(-) diff --git a/go/arrow/csv/reader_test.go b/go/arrow/csv/reader_test.go index 0c9507e..35bc4bb 100644 --- a/go/arrow/csv/reader_test.go +++ b/go/arrow/csv/reader_test.go @@ -192,7 +192,7 @@ func TestCSVReader(t *testing.T) { for r.Next() { rec := r.Record() for i, col := range rec.Columns() { - fmt.Fprintf(out, "rec[%d][%q]: %v\n", i, rec.ColumnName(i), col) + fmt.Fprintf(out, "rec[%d][%q]: %v\n", n, rec.ColumnName(i), col) } n++ } @@ -202,29 +202,29 @@ func TestCSVReader(t *testing.T) { } want := `rec[0]["bool"]: [true] -rec[1]["i8"]: [-1] -rec[2]["i16"]: [-1] -rec[3]["i32"]: [-1] -rec[4]["i64"]: [-1] -rec[5]["u8"]: [1] -rec[6]["u16"]: [1] -rec[7]["u32"]: [1] -rec[8]["u64"]: [1] -rec[9]["f32"]: [1.1] -rec[10]["f64"]: [1.1] -rec[11]["str"]: ["str-1"] -rec[0]["bool"]: [false] +rec[0]["i8"]: [-1] +rec[0]["i16"]: [-1] +rec[0]["i32"]: [-1] +rec[0]["i64"]: [-1] +rec[0]["u8"]: [1] +rec[0]["u16"]: [1] +rec[0]["u32"]: [1] +rec[0]["u64"]: [1] +rec[0]["f32"]: [1.1] +rec[0]["f64"]: [1.1] +rec[0]["str"]: ["str-1"] +rec[1]["bool"]: [false] rec[1]["i8"]: [-2] -rec[2]["i16"]: [-2] -rec[3]["i32"]: [-2] -rec[4]["i64"]: [-2] -rec[5]["u8"]: [2] -rec[6]["u16"]: [2] -rec[7]["u32"]: [2] -rec[8]["u64"]: [2] -rec[9]["f32"]: [2.2] -rec[10]["f64"]: [2.2] -rec[11]["str"]: ["str-2"] +rec[1]["i16"]: [-2] +rec[1]["i32"]: [-2] +rec[1]["i64"]: [-2] +rec[1]["u8"]: [2] +rec[1]["u16"]: [2] +rec[1]["u32"]: [2] +rec[1]["u64"]: [2] +rec[1]["f32"]: [2.2] +rec[1]["f64"]: [2.2] +rec[1]["str"]: ["str-2"] ` if got, want := out.String(), want; got != want { @@ -278,35 +278,35 @@ func TestCSVReaderWithChunk(t *testing.T) { opts: []csv.Option{csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';')}, records: 10, want: `rec[0]["i64"]: [0] -rec[1]["f64"]: [0] -rec[2]["str"]: ["str-0"] -rec[0]["i64"]: [1] +rec[0]["f64"]: [0] +rec[0]["str"]: ["str-0"] +rec[1]["i64"]: [1] rec[1]["f64"]: [1] -rec[2]["str"]: ["str-1"] -rec[0]["i64"]: [2] -rec[1]["f64"]: [2] +rec[1]["str"]: ["str-1"] +rec[2]["i64"]: [2] +rec[2]["f64"]: [2] rec[2]["str"]: ["str-2"] -rec[0]["i64"]: [3] -rec[1]["f64"]: [3] -rec[2]["str"]: ["str-3"] -rec[0]["i64"]: [4] -rec[1]["f64"]: [4] -rec[2]["str"]: ["str-4"] -rec[0]["i64"]: [5] -rec[1]["f64"]: [5] -rec[2]["str"]: ["str-5"] -rec[0]["i64"]: [6] -rec[1]["f64"]: [6] -rec[2]["str"]: ["str-6"] -rec[0]["i64"]: [7] -rec[1]["f64"]: [7] -rec[2]["str"]: ["str-7"] -rec[0]["i64"]: [8] -rec[1]["f64"]: [8] -rec[2]["str"]: ["str-8"] -rec[0]["i64"]: [9] -rec[1]["f64"]: [9] -rec[2]["str"]: ["str-9"] +rec[3]["i64"]: [3] +rec[3]["f64"]: [3] +rec[3]["str"]: ["str-3"] +rec[4]["i64"]: [4] +rec[4]["f64"]: [4] +rec[4]["str"]: ["str-4"] +rec[5]["i64"]: [5] +rec[5]["f64"]: [5] +rec[5]["str"]: ["str-5"] +rec[6]["i64"]: [6] +rec[6]["f64"]: [6] +rec[6]["str"]: ["str-6"] +rec[7]["i64"]: [7] +rec[7]["f64"]: [7] +rec[7]["str"]: ["str-7"] +rec[8]["i64"]: [8] +rec[8]["f64"]: [8] +rec[8]["str"]: ["str-8"] +rec[9]["i64"]: [9] +rec[9]["f64"]: [9] +rec[9]["str"]: ["str-9"] `, }, { @@ -317,35 +317,35 @@ rec[2]["str"]: ["str-9"] }, records: 10, want: `rec[0]["i64"]: [0] -rec[1]["f64"]: [0] -rec[2]["str"]: ["str-0"] -rec[0]["i64"]: [1] +rec[0]["f64"]: [0] +rec[0]["str"]: ["str-0"] +rec[1]["i64"]: [1] rec[1]["f64"]: [1] -rec[2]["str"]: ["str-1"] -rec[0]["i64"]: [2] -rec[1]["f64"]: [2] +rec[1]["str"]: ["str-1"] +rec[2]["i64"]: [2] +rec[2]["f64"]: [2] rec[2]["str"]: ["str-2"] -rec[0]["i64"]: [3] -rec[1]["f64"]: [3] -rec[2]["str"]: ["str-3"] -rec[0]["i64"]: [4] -rec[1]["f64"]: [4] -rec[2]["str"]: ["str-4"] -rec[0]["i64"]: [5] -rec[1]["f64"]: [5] -rec[2]["str"]: ["str-5"] -rec[0]["i64"]: [6] -rec[1]["f64"]: [6] -rec[2]["str"]: ["str-6"] -rec[0]["i64"]: [7] -rec[1]["f64"]: [7] -rec[2]["str"]: ["str-7"] -rec[0]["i64"]: [8] -rec[1]["f64"]: [8] -rec[2]["str"]: ["str-8"] -rec[0]["i64"]: [9] -rec[1]["f64"]: [9] -rec[2]["str"]: ["str-9"] +rec[3]["i64"]: [3] +rec[3]["f64"]: [3] +rec[3]["str"]: ["str-3"] +rec[4]["i64"]: [4] +rec[4]["f64"]: [4] +rec[4]["str"]: ["str-4"] +rec[5]["i64"]: [5] +rec[5]["f64"]: [5] +rec[5]["str"]: ["str-5"] +rec[6]["i64"]: [6] +rec[6]["f64"]: [6] +rec[6]["str"]: ["str-6"] +rec[7]["i64"]: [7] +rec[7]["f64"]: [7] +rec[7]["str"]: ["str-7"] +rec[8]["i64"]: [8] +rec[8]["f64"]: [8] +rec[8]["str"]: ["str-8"] +rec[9]["i64"]: [9] +rec[9]["f64"]: [9] +rec[9]["str"]: ["str-9"] `, }, { @@ -356,35 +356,35 @@ rec[2]["str"]: ["str-9"] }, records: 10, want: `rec[0]["i64"]: [0] -rec[1]["f64"]: [0] -rec[2]["str"]: ["str-0"] -rec[0]["i64"]: [1] +rec[0]["f64"]: [0] +rec[0]["str"]: ["str-0"] +rec[1]["i64"]: [1] rec[1]["f64"]: [1] -rec[2]["str"]: ["str-1"] -rec[0]["i64"]: [2] -rec[1]["f64"]: [2] +rec[1]["str"]: ["str-1"] +rec[2]["i64"]: [2] +rec[2]["f64"]: [2] rec[2]["str"]: ["str-2"] -rec[0]["i64"]: [3] -rec[1]["f64"]: [3] -rec[2]["str"]: ["str-3"] -rec[0]["i64"]: [4] -rec[1]["f64"]: [4] -rec[2]["str"]: ["str-4"] -rec[0]["i64"]: [5] -rec[1]["f64"]: [5] -rec[2]["str"]: ["str-5"] -rec[0]["i64"]: [6] -rec[1]["f64"]: [6] -rec[2]["str"]: ["str-6"] -rec[0]["i64"]: [7] -rec[1]["f64"]: [7] -rec[2]["str"]: ["str-7"] -rec[0]["i64"]: [8] -rec[1]["f64"]: [8] -rec[2]["str"]: ["str-8"] -rec[0]["i64"]: [9] -rec[1]["f64"]: [9] -rec[2]["str"]: ["str-9"] +rec[3]["i64"]: [3] +rec[3]["f64"]: [3] +rec[3]["str"]: ["str-3"] +rec[4]["i64"]: [4] +rec[4]["f64"]: [4] +rec[4]["str"]: ["str-4"] +rec[5]["i64"]: [5] +rec[5]["f64"]: [5] +rec[5]["str"]: ["str-5"] +rec[6]["i64"]: [6] +rec[6]["f64"]: [6] +rec[6]["str"]: ["str-6"] +rec[7]["i64"]: [7] +rec[7]["f64"]: [7] +rec[7]["str"]: ["str-7"] +rec[8]["i64"]: [8] +rec[8]["f64"]: [8] +rec[8]["str"]: ["str-8"] +rec[9]["i64"]: [9] +rec[9]["f64"]: [9] +rec[9]["str"]: ["str-9"] `, }, { @@ -395,17 +395,17 @@ rec[2]["str"]: ["str-9"] }, records: 4, want: `rec[0]["i64"]: [0 1 2] -rec[1]["f64"]: [0 1 2] -rec[2]["str"]: ["str-0" "str-1" "str-2"] -rec[0]["i64"]: [3 4 5] +rec[0]["f64"]: [0 1 2] +rec[0]["str"]: ["str-0" "str-1" "str-2"] +rec[1]["i64"]: [3 4 5] rec[1]["f64"]: [3 4 5] -rec[2]["str"]: ["str-3" "str-4" "str-5"] -rec[0]["i64"]: [6 7 8] -rec[1]["f64"]: [6 7 8] +rec[1]["str"]: ["str-3" "str-4" "str-5"] +rec[2]["i64"]: [6 7 8] +rec[2]["f64"]: [6 7 8] rec[2]["str"]: ["str-6" "str-7" "str-8"] -rec[0]["i64"]: [9] -rec[1]["f64"]: [9] -rec[2]["str"]: ["str-9"] +rec[3]["i64"]: [9] +rec[3]["f64"]: [9] +rec[3]["str"]: ["str-9"] `, }, { @@ -416,11 +416,11 @@ rec[2]["str"]: ["str-9"] }, records: 2, want: `rec[0]["i64"]: [0 1 2 3 4 5] -rec[1]["f64"]: [0 1 2 3 4 5] -rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5"] -rec[0]["i64"]: [6 7 8 9] +rec[0]["f64"]: [0 1 2 3 4 5] +rec[0]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5"] +rec[1]["i64"]: [6 7 8 9] rec[1]["f64"]: [6 7 8 9] -rec[2]["str"]: ["str-6" "str-7" "str-8" "str-9"] +rec[1]["str"]: ["str-6" "str-7" "str-8" "str-9"] `, }, { @@ -431,8 +431,8 @@ rec[2]["str"]: ["str-6" "str-7" "str-8" "str-9"] }, records: 1, want: `rec[0]["i64"]: [0 1 2 3 4 5 6 7 8 9] -rec[1]["f64"]: [0 1 2 3 4 5 6 7 8 9] -rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" "str-8" "str-9"] +rec[0]["f64"]: [0 1 2 3 4 5 6 7 8 9] +rec[0]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" "str-8" "str-9"] `, }, { @@ -443,8 +443,8 @@ rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" }, records: 1, want: `rec[0]["i64"]: [0 1 2 3 4 5 6 7 8 9] -rec[1]["f64"]: [0 1 2 3 4 5 6 7 8 9] -rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" "str-8" "str-9"] +rec[0]["f64"]: [0 1 2 3 4 5 6 7 8 9] +rec[0]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" "str-8" "str-9"] `, }, { @@ -455,8 +455,8 @@ rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" }, records: 1, want: `rec[0]["i64"]: [0 1 2 3 4 5 6 7 8 9] -rec[1]["f64"]: [0 1 2 3 4 5 6 7 8 9] -rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" "str-8" "str-9"] +rec[0]["f64"]: [0 1 2 3 4 5 6 7 8 9] +rec[0]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" "str-8" "str-9"] `, }, } { @@ -478,7 +478,7 @@ rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" for r.Next() { rec := r.Record() for i, col := range rec.Columns() { - fmt.Fprintf(out, "rec[%d][%q]: %v\n", i, rec.ColumnName(i), col) + fmt.Fprintf(out, "rec[%d][%q]: %v\n", n, rec.ColumnName(i), col) } n++ } diff --git a/go/arrow/csv/writer.go b/go/arrow/csv/writer.go index ded3fa2..c42635f 100644 --- a/go/arrow/csv/writer.go +++ b/go/arrow/csv/writer.go @@ -18,8 +18,8 @@ package csv import ( "encoding/csv" - "fmt" "io" + "strconv" "github.com/apache/arrow/go/arrow" "github.com/apache/arrow/go/arrow/array" @@ -65,62 +65,62 @@ func (w *Writer) Write(record array.Record) error { case *arrow.BooleanType: arr := col.(*array.Boolean) for i := 0; i < arr.Len(); i++ { - recs[i][j] = fmt.Sprintf("%v", arr.Value(i)) + recs[i][j] = strconv.FormatBool(arr.Value(i)) } case *arrow.Int8Type: arr := col.(*array.Int8) for i := 0; i < arr.Len(); i++ { - recs[i][j] = fmt.Sprintf("%v", arr.Value(i)) + recs[i][j] = strconv.FormatInt(int64(arr.Value(i)), 10) } case *arrow.Int16Type: arr := col.(*array.Int16) for i := 0; i < arr.Len(); i++ { - recs[i][j] = fmt.Sprintf("%v", arr.Value(i)) + recs[i][j] = strconv.FormatInt(int64(arr.Value(i)), 10) } case *arrow.Int32Type: arr := col.(*array.Int32) for i := 0; i < arr.Len(); i++ { - recs[i][j] = fmt.Sprintf("%v", arr.Value(i)) + recs[i][j] = strconv.FormatInt(int64(arr.Value(i)), 10) } case *arrow.Int64Type: arr := col.(*array.Int64) for i := 0; i < arr.Len(); i++ { - recs[i][j] = fmt.Sprintf("%v", arr.Value(i)) + recs[i][j] = strconv.FormatInt(int64(arr.Value(i)), 10) } case *arrow.Uint8Type: arr := col.(*array.Uint8) for i := 0; i < arr.Len(); i++ { - recs[i][j] = fmt.Sprintf("%v", arr.Value(i)) + recs[i][j] = strconv.FormatUint(uint64(arr.Value(i)), 10) } case *arrow.Uint16Type: arr := col.(*array.Uint16) for i := 0; i < arr.Len(); i++ { - recs[i][j] = fmt.Sprintf("%v", arr.Value(i)) + recs[i][j] = strconv.FormatUint(uint64(arr.Value(i)), 10) } case *arrow.Uint32Type: arr := col.(*array.Uint32) for i := 0; i < arr.Len(); i++ { - recs[i][j] = fmt.Sprintf("%v", arr.Value(i)) + recs[i][j] = strconv.FormatUint(uint64(arr.Value(i)), 10) } case *arrow.Uint64Type: arr := col.(*array.Uint64) for i := 0; i < arr.Len(); i++ { - recs[i][j] = fmt.Sprintf("%v", arr.Value(i)) + recs[i][j] = strconv.FormatUint(uint64(arr.Value(i)), 10) } case *arrow.Float32Type: arr := col.(*array.Float32) for i := 0; i < arr.Len(); i++ { - recs[i][j] = fmt.Sprintf("%v", arr.Value(i)) + recs[i][j] = strconv.FormatFloat(float64(arr.Value(i)), 'g', -1, 32) } case *arrow.Float64Type: arr := col.(*array.Float64) for i := 0; i < arr.Len(); i++ { - recs[i][j] = fmt.Sprintf("%v", arr.Value(i)) + recs[i][j] = strconv.FormatFloat(float64(arr.Value(i)), 'g', -1, 64) } case *arrow.StringType: arr := col.(*array.String) for i := 0; i < arr.Len(); i++ { - recs[i][j] = fmt.Sprintf("%v", arr.Value(i)) + recs[i][j] = arr.Value(i) } } } diff --git a/go/arrow/csv/writer_test.go b/go/arrow/csv/writer_test.go index d5cb326..5aa2b6f 100644 --- a/go/arrow/csv/writer_test.go +++ b/go/arrow/csv/writer_test.go @@ -19,6 +19,7 @@ package csv_test import ( "bytes" "fmt" + "io/ioutil" "log" "strings" "testing" @@ -180,3 +181,62 @@ true;1;1;1;1;2;2;2;2;0.2;0.2;str-2 t.Fatalf("invalid output:\ngot=%s\nwant=%s\n", got, want) } } + +func BenchmarkWrite(b *testing.B) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(b, 0) + + schema := arrow.NewSchema( + []arrow.Field{ + {Name: "bool", Type: arrow.FixedWidthTypes.Boolean}, + {Name: "i8", Type: arrow.PrimitiveTypes.Int8}, + {Name: "i16", Type: arrow.PrimitiveTypes.Int16}, + {Name: "i32", Type: arrow.PrimitiveTypes.Int32}, + {Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + {Name: "u8", Type: arrow.PrimitiveTypes.Uint8}, + {Name: "u16", Type: arrow.PrimitiveTypes.Uint16}, + {Name: "u32", Type: arrow.PrimitiveTypes.Uint32}, + {Name: "u64", Type: arrow.PrimitiveTypes.Uint64}, + {Name: "f32", Type: arrow.PrimitiveTypes.Float32}, + {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + {Name: "str", Type: arrow.BinaryTypes.String}, + }, + nil, + ) + + bldr := array.NewRecordBuilder(pool, schema) + defer bldr.Release() + + const N = 1000 + for i := 0; i < N; i++ { + bldr.Field(0).(*array.BooleanBuilder).Append(i%10 == 0) + bldr.Field(1).(*array.Int8Builder).Append(int8(i)) + bldr.Field(2).(*array.Int16Builder).Append(int16(i)) + bldr.Field(3).(*array.Int32Builder).Append(int32(i)) + bldr.Field(4).(*array.Int64Builder).Append(int64(i)) + bldr.Field(5).(*array.Uint8Builder).Append(uint8(i)) + bldr.Field(6).(*array.Uint16Builder).Append(uint16(i)) + bldr.Field(7).(*array.Uint32Builder).Append(uint32(i)) + bldr.Field(8).(*array.Uint64Builder).Append(uint64(i)) + bldr.Field(9).(*array.Float32Builder).Append(float32(i)) + bldr.Field(10).(*array.Float64Builder).Append(float64(i)) + bldr.Field(11).(*array.StringBuilder).Append(fmt.Sprintf("str-%d", i)) + } + + rec := bldr.NewRecord() + defer rec.Release() + + w := csv.NewWriter(ioutil.Discard, schema, csv.WithComma(';'), csv.WithCRLF(false)) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + err := w.Write(rec) + if err != nil { + b.Fatal(err) + } + err = w.Flush() + if err != nil { + b.Fatal(err) + } + } +}