Attoparsec does not have something like the Stream class, so I do not
see how I could do UTF8 parsing easily.
On Jan 17, 2009, at 11:50 PM, Don Stewart wrote:
It occurs to me you could also use attoparsec, which is specifically
optimised for bytestring processing.
sjoerd:
Hi,
Somebody told me about Parsec 3, which uses a Stream type class so it
can parse any data type. This sounded like the right way to do
encoding independent parsing, so I decided to see how it would work
to
parse UTF8 JSON.
Sadly I could not use Text.JSON.Parsec directly, because it uses the
old Parsec CharParser type. So I copied to code, and also replaced
p_number with the "floating" parser from Text.Parsec.Token, because
Text.JSON.Parsec uses readFloat (a dirty hack imho) which works only
on String.
If Text.JSON.Parsec was written for Parsec 3, the only thing to write
to get UTF8 JSON parsing would be:
instance (Monad m, U.UTF8Bytes string index) => Stream (U.UTF8
string)
m Char where
uncons = return . U.uncons
I did not do any performance measuring yet, I was glad I got it
working. Any comments on the code is appreciated!
greetings,
Sjoerd Visscher
{-# LANGUAGE FlexibleInstances, MultiParamTypeClasses,
UndecidableInstances #-}
import qualified Data.String.UTF8 as U
import qualified Data.ByteString as B
import Text.Parsec hiding (many, optional, (<|>))
import Control.Applicative
import Text.JSON.Types
import Control.Monad
import Data.Char
import Numeric
instance (Monad m, U.UTF8Bytes string index) => Stream (U.UTF8
string)
m Char where
uncons = return . U.uncons
type CharParser st = Parsec (U.UTF8 B.ByteString) st
parseFile :: FilePath -> IO (Either ParseError JSValue)
parseFile fileName = do
bs <- B.readFile fileName
return $ runParser json () fileName (U.fromRep bs)
parseString :: String -> Either ParseError JSValue
parseString s = runParser json () "(unknown)" (U.fromString s)
json :: CharParser () JSValue
json = spaces *> p_value
tok :: CharParser () a -> CharParser () a
tok p = p <* spaces
p_value :: CharParser () JSValue
p_value = (JSNull <$ p_null)
<|> (JSBool <$> p_boolean)
<|> (JSArray <$> p_array)
<|> (JSString <$> p_js_string)
<|> (JSObject <$> p_js_object)
<|> (JSRational False <$> p_number)
<?> "JSON value"
p_null :: CharParser () ()
p_null = tok (string "null") >> return ()
p_boolean :: CharParser () Bool
p_boolean = tok
( (True <$ string "true")
<|> (False <$ string "false")
)
p_array :: CharParser () [JSValue]
p_array = between (tok (char '[')) (tok (char ']'))
$ p_value `sepBy` tok (char ',')
p_string :: CharParser () String
p_string = between (tok (char '"')) (char '"') (many p_char)
where p_char = (char '\\' >> p_esc)
<|> (satisfy (\x -> x /= '"' && x /= '\\'))
p_esc = ('"' <$ char '"')
<|> ('\\' <$ char '\\')
<|> ('/' <$ char '/')
<|> ('\b' <$ char 'b')
<|> ('\f' <$ char 'f')
<|> ('\n' <$ char 'n')
<|> ('\r' <$ char 'r')
<|> ('\t' <$ char 't')
<|> (char 'u' *> p_uni)
<?> "escape character"
p_uni = check =<< count 4 (satisfy isHexDigit)
where check x | code <= max_char = pure (toEnum code)
| otherwise = empty
where code = fst $ head $ readHex x
max_char = fromEnum (maxBound :: Char)
p_object :: CharParser () [(String,JSValue)]
p_object = between (tok (char '{')) (tok (char '}'))
$ p_field `sepBy` tok (char ',')
where p_field = (,) <$> (p_string <* tok (char ':')) <*> p_value
p_number :: CharParser () Rational
p_number = tok floating where
floating :: CharParser () Rational
floating = do{ n <- decimal
; fract <- option 0 fraction
; expo <- option 1 exponent'
; return ((fromInteger n + fract)*expo)
}
fraction = do{ char '.'
; digits <- many1 digit <?> "fraction"
; return (foldr op 0 digits)
}
<?> "fraction"
where
op d f = (f + fromIntegral (digitToInt d))/10
exponent' = do{ oneOf "eE"
; f <- sign
; e <- decimal <?> "exponent"
; return (power (f e))
}
<?> "exponent"
where
power e | e < 0 = 1/power(-e)
| otherwise = fromInteger (10^e)
sign = (char '-' >> return negate)
<|> (char '+' >> return id)
<|> return id
decimal = number 10 digit
number base baseDigit
= do{ digits <- many1 baseDigit
; let n = foldl (\x d -> base*x + toInteger (digitToInt
d)) 0 digits
; seq n (return n)
}
p_js_string :: CharParser () JSString
p_js_string = toJSString <$> p_string
p_js_object :: CharParser () (JSObject JSValue)
p_js_object = toJSObject <$> p_object
_______________________________________________
Haskell-Cafe mailing list
Haskell-Cafe@haskell.org
http://www.haskell.org/mailman/listinfo/haskell-cafe
--
Sjoerd Visscher
sjo...@w3future.com
_______________________________________________
Haskell-Cafe mailing list
Haskell-Cafe@haskell.org
http://www.haskell.org/mailman/listinfo/haskell-cafe