[Haskell-cafe] Slow Text.JSON parser

Sjoerd Visscher sjoerd at w3future.com
Sat Jan 17 16:07:57 EST 2009


Hi,

Somebody told me about Parsec 3, which uses a Stream type class so it  
can parse any data type. This sounded like the right way to do  
encoding independent parsing, so I decided to see how it would work to  
parse UTF8 JSON.

Sadly I could not use Text.JSON.Parsec directly, because it uses the  
old Parsec CharParser type. So I copied to code, and also replaced  
p_number with the "floating" parser from Text.Parsec.Token, because  
Text.JSON.Parsec uses readFloat (a dirty hack imho) which works only  
on String.

If Text.JSON.Parsec was written for Parsec 3, the only thing to write  
to get UTF8 JSON parsing would be:

instance (Monad m, U.UTF8Bytes string index) => Stream (U.UTF8 string)  
m Char where
     uncons = return . U.uncons

I did not do any performance measuring yet, I was glad I got it  
working. Any comments on the code is appreciated!

greetings,
Sjoerd Visscher

{-# LANGUAGE FlexibleInstances, MultiParamTypeClasses,  
UndecidableInstances #-}
import qualified Data.String.UTF8 as U
import qualified Data.ByteString as B

import Text.Parsec hiding (many, optional, (<|>))
import Control.Applicative

import Text.JSON.Types
import Control.Monad
import Data.Char
import Numeric

instance (Monad m, U.UTF8Bytes string index) => Stream (U.UTF8 string)  
m Char where
     uncons = return . U.uncons

type CharParser st = Parsec (U.UTF8 B.ByteString) st

parseFile :: FilePath -> IO (Either ParseError JSValue)
parseFile fileName = do
   bs <- B.readFile fileName
   return $ runParser json () fileName (U.fromRep bs)

parseString      :: String -> Either ParseError JSValue
parseString s     = runParser json () "(unknown)" (U.fromString s)

json             :: CharParser () JSValue
json              = spaces *> p_value

tok              :: CharParser () a -> CharParser () a
tok p             = p <* spaces

p_value          :: CharParser () JSValue
p_value           =  (JSNull      <$  p_null)
                  <|> (JSBool      <$> p_boolean)
                  <|> (JSArray     <$> p_array)
                  <|> (JSString    <$> p_js_string)
                  <|> (JSObject    <$> p_js_object)
                  <|> (JSRational False <$> p_number)
                  <?> "JSON value"

p_null           :: CharParser () ()
p_null            = tok (string "null") >> return ()

p_boolean        :: CharParser () Bool
p_boolean         = tok
                       (  (True  <$ string "true")
                      <|> (False <$ string "false")
                       )

p_array          :: CharParser () [JSValue]
p_array           = between (tok (char '[')) (tok (char ']'))
                   $ p_value `sepBy` tok (char ',')

p_string         :: CharParser () String
p_string          = between (tok (char '"')) (char '"') (many p_char)
   where p_char    =  (char '\\' >> p_esc)
                  <|> (satisfy (\x -> x /= '"' && x /= '\\'))

         p_esc     =  ('"'   <$ char '"')
                  <|> ('\\'  <$ char '\\')
                  <|> ('/'   <$ char '/')
                  <|> ('\b'  <$ char 'b')
                  <|> ('\f'  <$ char 'f')
                  <|> ('\n'  <$ char 'n')
                  <|> ('\r'  <$ char 'r')
                  <|> ('\t'  <$ char 't')
                  <|> (char 'u' *> p_uni)
                  <?> "escape character"

         p_uni     = check =<< count 4 (satisfy isHexDigit)
           where check x | code <= max_char  = pure (toEnum code)
                         | otherwise         = empty
                   where code      = fst $ head $ readHex x
                         max_char  = fromEnum (maxBound :: Char)

p_object         :: CharParser () [(String,JSValue)]
p_object          = between (tok (char '{')) (tok (char '}'))
                   $ p_field `sepBy` tok (char ',')
   where p_field   = (,) <$> (p_string <* tok (char ':')) <*> p_value

p_number         :: CharParser () Rational
p_number          = tok floating where

     floating       :: CharParser () Rational
     floating        = do{ n <- decimal
                         ; fract <- option 0 fraction
                         ; expo  <- option 1 exponent'
                         ; return ((fromInteger n + fract)*expo)
                         }

     fraction        = do{ char '.'
                         ; digits <- many1 digit <?> "fraction"
                         ; return (foldr op 0 digits)
                         }
                       <?> "fraction"
                     where
                       op d f    = (f + fromIntegral (digitToInt d))/10

     exponent'       = do{ oneOf "eE"
                         ; f <- sign
                         ; e <- decimal <?> "exponent"
                         ; return (power (f e))
                         }
                       <?> "exponent"
                     where
                        power e  | e < 0      = 1/power(-e)
                                 | otherwise  = fromInteger (10^e)

     sign            =   (char '-' >> return negate)
                     <|> (char '+' >> return id)
                     <|> return id

     decimal         = number 10 digit

     number base baseDigit
         = do{ digits <- many1 baseDigit
             ; let n = foldl (\x d -> base*x + toInteger (digitToInt  
d)) 0 digits
             ; seq n (return n)
             }


p_js_string      :: CharParser () JSString
p_js_string       = toJSString <$> p_string

p_js_object      :: CharParser () (JSObject JSValue)
p_js_object       = toJSObject <$> p_object




More information about the Haskell-Cafe mailing list