GeneTinderholm commented on issue #279:
URL: https://github.com/apache/arrow-go/issues/279#issuecomment-2658365514

   That seemed to drop performance rather significantly, which is primary in my 
use-case. I don't mind managing the os thread locking personally, but it felt 
hacky enough that I thought double checking might be a good idea.
   
   I'm unfortunately not at liberty to give the parquet files away without 
checking with some other people first. There are nested columns in some cases, 
but the leaves are the only things that are nullable.
   
   I wrote a simplified version of the code that's being used that does 
essentially the same thing.
   
   ```go
   // batchSize is the number of values requested per ReadBatch call; it also
   // sizes the scratch buffers used by ParseFloatCol.
   const batchSize = 4096
   // errWrongType is returned when the requested column is not a float64 column.
   var errWrongType = errors.New("encountered column of wrong type")
   
   // ParseFloatCol reads the float64 column named colName from every row group
   // of rdr and returns the values in row-group order, with math.NaN() standing
   // in for entries whose definition level is below the column's maximum.
   //
   // It returns errWrongType if the column chunk reader is not a
   // *file.Float64ColumnChunkReader, and propagates any error returned while
   // opening or reading a column chunk.
   //
   // NOTE(review): result is sized from rdr.NumRows(), which assumes one
   // definition level per row (i.e. the column is not repeated) — confirm.
   func ParseFloatCol(rdr *file.Reader, colName string) ([]float64, error) {
       idx := rdr.MetaData().Schema.ColumnIndexByName(colName)
       floatBatch := [batchSize]float64{}
       defLevels := [batchSize]int16{}

       // current is the offset in result of the next entry to fill. It must
       // carry across batches and row groups so later data does not overwrite
       // earlier data.
       current := int64(0)
       result := make([]float64, rdr.NumRows())
       for rg := range rdr.NumRowGroups() {
           col, err := rdr.RowGroup(rg).Column(idx)
           if err != nil {
               return nil, err
           }
           f64Col, ok := col.(*file.Float64ColumnChunkReader)
           if !ok {
               return nil, errWrongType
           }
           maxDefLevel := col.Descriptor().MaxDefinitionLevel()

           // One ReadBatch call is asked for at most batchSize entries, so
           // keep reading until the chunk reports no more data.
           for f64Col.HasNext() {
               total, _, err := f64Col.ReadBatch(
                   batchSize, floatBatch[:], defLevels[:], nil)
               if err != nil {
                   return nil, err
               }
               if total == 0 {
                   // Defensive: avoid spinning if no progress was made.
                   break
               }
               // floatBatch is indexed separately (batchIdx) because it is
               // only advanced for entries at the maximum definition level.
               batchIdx := 0
               for j := range total {
                   if defLevels[j] == maxDefLevel {
                       result[current+j] = floatBatch[batchIdx]
                       batchIdx++
                   } else {
                       result[current+j] = math.NaN()
                   }
               }
               current += total
           }
       }
       return result, nil
   }
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to