Hello,

 I have been trying to do some CSV-style processing. My code works
fine for small input (up to 10MB), but performs poorly for moderate to
large input (it can't seem to finish 100MB of input with 700MB heap
space). I have gone through several optimization passes with profiler
help, and now I am hoping someone else can point out some other
approaches to improving the code's performance (both space and time).

 The code breaks a large file into smaller files all of whose entries
have the same date.

thanks,
 Jeff

------------------------

module Main where

import Debug.Trace
import Control.Monad
import Data.List
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.Map as M
import System.Environment (getArgs)

dataDir = "dataH/"

myRead file = do
v <- B.readFile file
let (cols' : rows) = map (B.split ',') $ B.lines v
let cols = foldl' (\mp (k,v) -> M.insert k v mp) M.empty (zip cols' [0 ..])
return (cols, rows)

getColId cols col = M.lookup col cols

getCol cols col row = do
i <- getColId cols col
return $! row!!i

dates file nRows = do
  (cols, rows) <- myRead file
  let addDate mp row | mp `seq` row `seq` False = undefined
                               | otherwise = do
          date <- getCol cols (B.pack "\"Date\"") row
          let old = M.findWithDefault [] date mp
          return $ M.insert date (row:old) mp
  res <- foldM addDate M.empty $ take nRows rows
  mapM_ writeDate $ M.toList res
where
  fmt = B.unpack . B.map (\x -> if x == '-' then '_' else x) .
B.takeWhile (/= ' ')
  writeDate (date,rows) =
      B.writeFile (dataDir++fmt date)
                  (B.unlines $ map (B.join (B.pack ",")) rows)

main = do
  args <- getArgs
  case args of
    ["dates",file,nRows] -> dates file (read nRows)
_______________________________________________
Haskell-Cafe mailing list
Haskell-Cafe@haskell.org
http://www.haskell.org/mailman/listinfo/haskell-cafe

Reply via email to