diff --git a/Data/Aeson/Parser/Internal.hs b/Data/Aeson/Parser/Internal.hs index a9b2aa1ac..04daa0d93 100644 --- a/Data/Aeson/Parser/Internal.hs +++ b/Data/Aeson/Parser/Internal.hs @@ -1,9 +1,6 @@ {-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} {-# LANGUAGE OverloadedStrings #-} -#if MIN_VERSION_ghc_prim(0,3,1) -{-# LANGUAGE MagicHash #-} -#endif -- | -- Module: Data.Aeson.Parser.Internal @@ -37,29 +34,21 @@ import Prelude () import Prelude.Compat import Data.Aeson.Types.Internal (IResult(..), JSONPath, Result(..), Value(..)) -import Data.Attoparsec.ByteString.Char8 (Parser, char, endOfInput, scientific, skipSpace, string) -import Data.Attoparsec.ByteString.Char8 (Parser, char, endOfInput, scientific, skipSpace, string) -import Data.Bits ((.|.), shiftL) -import Data.ByteString.Internal (ByteString(..)) -import Data.Char (chr) +import Data.Binary.Parser (Parser, endOfInput, scientific, skipSpaces, string) import Data.Text (Text) -import Data.Vector as Vector (Vector, empty, fromListN, reverse) -import qualified Data.Attoparsec.ByteString as A -import qualified Data.Attoparsec.Lazy as L +import qualified Data.Vector as Vector (Vector, empty, fromListN, reverse) +import qualified Data.Binary.Parser as BP import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as L import qualified Data.HashMap.Strict as H import Data.Aeson.Parser.Unescape -#if MIN_VERSION_ghc_prim(0,3,1) -import GHC.Base (Int#, (==#), isTrue#, word2Int#) -import GHC.Word (Word8(W8#)) -#endif #define BACKSLASH 92 #define CLOSE_CURLY 125 #define CLOSE_SQUARE 93 #define COMMA 44 +#define COLON 58 #define DOUBLE_QUOTE 34 #define OPEN_CURLY 123 #define OPEN_SQUARE 91 @@ -111,21 +100,21 @@ object_' = {-# SCC "object_'" #-} do objectValues :: Parser Text -> Parser Value -> Parser (H.HashMap Text Value) objectValues str val = do - skipSpace - w <- A.peekWord8' + skipSpaces + w <- BP.peek if w == CLOSE_CURLY - then A.anyWord8 >> return H.empty + then BP.skipN 1 >> return H.empty else loop [] where -- Why use acc pattern here, you may ask? because 'H.fromList' use 'unsafeInsert' -- and it's much faster because it's doing in place update to the 'HashMap'! loop acc = do - k <- str <* skipSpace <* char ':' - v <- val <* skipSpace - ch <- A.satisfy $ \w -> w == COMMA || w == CLOSE_CURLY + k <- str <* skipSpaces <* BP.word8 COLON + v <- val <* skipSpaces + ch <- BP.satisfy $ \w -> w == COMMA || w == CLOSE_CURLY let acc' = (k, v) : acc if ch == COMMA - then skipSpace >> loop acc' + then skipSpaces >> loop acc' else return (H.fromList acc') {-# INLINE objectValues #-} @@ -137,19 +126,19 @@ array_' = {-# SCC "array_'" #-} do !vals <- arrayValues value' return (Array vals) -arrayValues :: Parser Value -> Parser (Vector Value) +arrayValues :: Parser Value -> Parser (Vector.Vector Value) arrayValues val = do - skipSpace - w <- A.peekWord8' + skipSpaces + w <- BP.peek if w == CLOSE_SQUARE - then A.anyWord8 >> return Vector.empty + then BP.skipN 1 >> return Vector.empty else loop [] 1 where loop acc !len = do - v <- val <* skipSpace - ch <- A.satisfy $ \w -> w == COMMA || w == CLOSE_SQUARE + v <- val <* skipSpaces + ch <- BP.satisfy $ \w -> w == COMMA || w == CLOSE_SQUARE if ch == COMMA - then skipSpace >> loop (v:acc) (len+1) + then skipSpaces >> loop (v:acc) (len+1) else return (Vector.reverse (Vector.fromListN len (v:acc))) {-# INLINE arrayValues #-} @@ -165,12 +154,12 @@ arrayValues val = do -- to preserve interoperability and security. value :: Parser Value value = do - skipSpace - w <- A.peekWord8' + skipSpaces + w <- BP.peek case w of - DOUBLE_QUOTE -> A.anyWord8 *> (String <$> jstring_) - OPEN_CURLY -> A.anyWord8 *> object_ - OPEN_SQUARE -> A.anyWord8 *> array_ + DOUBLE_QUOTE -> BP.skipN 1 *> (String <$> jstring_) + OPEN_CURLY -> BP.skipN 1 *> object_ + OPEN_SQUARE -> BP.skipN 1 *> array_ C_f -> string "false" *> pure (Bool False) C_t -> string "true" *> pure (Bool True) C_n -> string "null" *> pure Null @@ -181,14 +170,14 @@ value = do -- | Strict version of 'value'. See also 'json''. value' :: Parser Value value' = do - skipSpace - w <- A.peekWord8' + skipSpaces + w <- BP.peek case w of DOUBLE_QUOTE -> do - !s <- A.anyWord8 *> jstring_ + !s <- BP.skipN 1 *> jstring_ return (String s) - OPEN_CURLY -> A.anyWord8 *> object_' - OPEN_SQUARE -> A.anyWord8 *> array_' + OPEN_CURLY -> BP.skipN 1 *> object_' + OPEN_SQUARE -> BP.skipN 1 *> array_' C_f -> string "false" *> pure (Bool False) C_t -> string "true" *> pure (Bool True) C_n -> string "null" *> pure Null @@ -200,40 +189,22 @@ value' = do -- | Parse a quoted JSON string. jstring :: Parser Text -jstring = A.word8 DOUBLE_QUOTE *> jstring_ +jstring = BP.word8 DOUBLE_QUOTE *> jstring_ +{-# INLINE jstring #-} -- | Parse a string without a leading quote. jstring_ :: Parser Text {-# INLINE jstring_ #-} jstring_ = {-# SCC "jstring_" #-} do - s <- A.scan startState go <* A.anyWord8 + s <- BP.scanChunks (-1) unescapeTextScanner <* BP.skipN 1 case unescapeText s of Right r -> return r Left err -> fail $ show err - where -#if MIN_VERSION_ghc_prim(0,3,1) - startState = S 0# - go (S a) (W8# c) - | isTrue# a = Just (S 0#) - | isTrue# (word2Int# c ==# 34#) = Nothing -- double quote - | otherwise = let a' = word2Int# c ==# 92# -- backslash - in Just (S a') - -data S = S Int# -#else - startState = False - go a c - | a = Just False - | c == DOUBLE_QUOTE = Nothing - | otherwise = let a' = c == backslash - in Just a' - where backslash = BACKSLASH -#endif decodeWith :: Parser Value -> (Value -> Result a) -> L.ByteString -> Maybe a decodeWith p to s = - case L.parse p s of - L.Done _ v -> case to v of + case BP.parseLazy p s of + Right v -> case to v of Success a -> Just a _ -> Nothing _ -> Nothing @@ -242,7 +213,7 @@ decodeWith p to s = decodeStrictWith :: Parser Value -> (Value -> Result a) -> B.ByteString -> Maybe a decodeStrictWith p to s = - case either Error to (A.parseOnly p s) of + case either Error to (BP.parseOnly p s) of Success a -> Just a _ -> Nothing {-# INLINE decodeStrictWith #-} @@ -250,17 +221,17 @@ decodeStrictWith p to s = eitherDecodeWith :: Parser Value -> (Value -> IResult a) -> L.ByteString -> Either (JSONPath, String) a eitherDecodeWith p to s = - case L.parse p s of - L.Done _ v -> case to v of + case BP.parseLazy p s of + Right v -> case to v of ISuccess a -> Right a IError path msg -> Left (path, msg) - L.Fail _ _ msg -> Left ([], msg) + Left msg -> Left ([], msg) {-# INLINE eitherDecodeWith #-} eitherDecodeStrictWith :: Parser Value -> (Value -> IResult a) -> B.ByteString -> Either (JSONPath, String) a eitherDecodeStrictWith p to s = - case either (IError []) to (A.parseOnly p s) of + case either (IError []) to (BP.parseOnly p s) of ISuccess a -> Right a IError path msg -> Left (path, msg) {-# INLINE eitherDecodeStrictWith #-} @@ -287,9 +258,9 @@ eitherDecodeStrictWith p to s = -- | Parse a top-level JSON value followed by optional whitespace and -- end-of-input. See also: 'json'. jsonEOF :: Parser Value -jsonEOF = json <* skipSpace <* endOfInput +jsonEOF = json <* skipSpaces <* endOfInput -- | Parse a top-level JSON value followed by optional whitespace and -- end-of-input. See also: 'json''. jsonEOF' :: Parser Value -jsonEOF' = json' <* skipSpace <* endOfInput +jsonEOF' = json' <* skipSpaces <* endOfInput diff --git a/Data/Aeson/Parser/Unescape.hs b/Data/Aeson/Parser/Unescape.hs index d3682fd16..ebaa59285 100644 --- a/Data/Aeson/Parser/Unescape.hs +++ b/Data/Aeson/Parser/Unescape.hs @@ -3,13 +3,13 @@ {-# LANGUAGE UnliftedFFITypes #-} module Data.Aeson.Parser.Unescape ( - unescapeText + unescapeTextScanner +, unescapeText ) where import Control.Exception (evaluate, throw, try) import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) -import Data.ByteString as B -import Data.ByteString.Internal as B hiding (c2w) +import Data.ByteString.Internal (ByteString(..)) import qualified Data.Text.Array as A import Data.Text.Encoding.Error (UnicodeException (..)) import Data.Text.Internal (Text (..)) @@ -27,6 +27,19 @@ foreign import ccall unsafe "_js_decode_string" c_js_decode :: MutableByteArray# s -> Ptr CSize -> Ptr Word8 -> Ptr Word8 -> IO CInt +foreign import ccall unsafe "_js_find_string_end" c_js_find_string_end + :: CInt -> Ptr Word8 -> Ptr Word8 -> IO CInt + +unescapeTextScanner :: CInt -> ByteString -> Either CInt (ByteString, ByteString) +unescapeTextScanner backslashed bs@(PS fp off len) = unsafeDupablePerformIO $ + withForeignPtr fp $ \ptr -> do + s <- c_js_find_string_end backslashed (ptr `plusPtr` off) (ptr `plusPtr` (off + len)) + if s >= 0 + then let s' = fromIntegral s + in return $ Right (PS fp off s', PS fp (off + s') (len - s')) + else return (Left s) +{-# INLINE unescapeTextScanner #-} + unescapeText' :: ByteString -> Text unescapeText' (PS fp off len) = runText $ \done -> do let go dest = withForeignPtr fp $ \ptr -> diff --git a/aeson.cabal b/aeson.cabal index 9ce97750f..315fca39f 100644 --- a/aeson.cabal +++ b/aeson.cabal @@ -102,12 +102,12 @@ library build-depends: attoparsec >= 0.13.0.1, + binary-parsers >= 0.2.3, base >= 4.5 && < 5, base-compat >= 0.9.1 && < 0.10, containers >= 0.2.4.1, deepseq >= 1.3, dlist >= 0.2, - ghc-prim >= 0.2, hashable >= 1.1.2.0, scientific >= 0.3.4.7 && < 0.4, tagged >=0.8.3 && <0.9, @@ -176,13 +176,13 @@ test-suite tests QuickCheck >= 2.7 && <2.9.3, aeson, attoparsec, + binary-parsers, base, base-compat, base-orphans >= 0.5.3 && <0.6, containers, dlist, generic-deriving >= 1.10 && < 1.12, - ghc-prim >= 0.2, hashable >= 1.2.4.0, scientific, tagged, diff --git a/benchmarks/AesonEncode.hs b/benchmarks/AesonEncode.hs index dae95b2c2..154575aaf 100644 --- a/benchmarks/AesonEncode.hs +++ b/benchmarks/AesonEncode.hs @@ -10,7 +10,7 @@ import Control.DeepSeq import Control.Exception import Control.Monad import Data.Aeson -import Data.Attoparsec.ByteString (IResult(..), parseWith) +import Data.Binary.Parser (parse, Parser, Decoder(..)) import Data.Char (isDigit) import Data.Time.Clock import System.Environment (getArgs) @@ -30,7 +30,7 @@ main = do let refill = B.hGet h 16384 result0 <- parseWith refill json =<< refill r0 <- case result0 of - Done _ r -> return r + Done _ _ r -> return r _ -> fail $ "failed to read " ++ show arg start <- getCurrentTime let loop !n r @@ -42,3 +42,22 @@ main = do let rate = fromIntegral count / realToFrac delta :: Double putStrLn $ " " ++ cnt ++ " good, " ++ show delta putStrLn $ " " ++ show (round rate :: Int) ++ " per second" + +-- | Run a parser with an initial input string, and a monadic action +-- that can supply more input if needed. +parseWith :: Monad m => + (m B.ByteString) + -- ^ An action that will be executed to provide the parser + -- with more input, if necessary. The action must return an + -- 'B.empty' string when there is no more input available. + -> Parser a + -> B.ByteString + -- ^ Initial input for the parser. + -> m (Decoder a) +parseWith refill p s = step $ parse p s + where step (Partial k) = do + bs <- refill + if B.null bs then step (k Nothing) + else step (k (Just bs)) + step r = return r +{-# INLINE parseWith #-} diff --git a/benchmarks/AesonParse.hs b/benchmarks/AesonParse.hs index b437d03a2..9e0ed2b39 100644 --- a/benchmarks/AesonParse.hs +++ b/benchmarks/AesonParse.hs @@ -10,7 +10,7 @@ import Prelude.Compat import "aeson-benchmarks" Data.Aeson import Control.Exception import Control.Monad -import Data.Attoparsec.ByteString (IResult(..), parseWith) +import Data.Binary.Parser (parse, Parser, Decoder(..)) import Data.Time.Clock import System.Environment (getArgs) import System.IO @@ -20,7 +20,7 @@ main :: IO () main = do (bs:cnt:args) <- getArgs let count = read cnt :: Int - blkSize = read bs + blkSize = read bs :: Int forM_ args $ \arg -> bracket (openFile arg ReadMode) hClose $ \h -> do putStrLn $ arg ++ ":" start <- getCurrentTime @@ -31,10 +31,29 @@ main = do let refill = B.hGet h blkSize result <- parseWith refill json =<< refill case result of - Done _ _ -> loop (good+1) bad - _ -> loop good (bad+1) + Done _ _ _ -> loop (good+1) bad + _ -> loop good (bad+1) (good, _) <- loop 0 0 delta <- flip diffUTCTime start `fmap` getCurrentTime putStrLn $ " " ++ show good ++ " good, " ++ show delta let rate = fromIntegral count / realToFrac delta :: Double putStrLn $ " " ++ show (round rate :: Int) ++ " per second" + +-- | Run a parser with an initial input string, and a monadic action +-- that can supply more input if needed. +parseWith :: Monad m => + (m B.ByteString) + -- ^ An action that will be executed to provide the parser + -- with more input, if necessary. The action must return an + -- 'B.empty' string when there is no more input available. + -> Parser a + -> B.ByteString + -- ^ Initial input for the parser. + -> m (Decoder a) +parseWith refill p s = step $ parse p s + where step (Partial k) = do + bs <- refill + if B.null bs then step (k Nothing) + else step (k (Just bs)) + step r = return r +{-# INLINE parseWith #-} diff --git a/benchmarks/aeson-benchmarks.cabal b/benchmarks/aeson-benchmarks.cabal index 00e477c9c..909d7ff11 100644 --- a/benchmarks/aeson-benchmarks.cabal +++ b/benchmarks/aeson-benchmarks.cabal @@ -35,6 +35,7 @@ library build-depends: attoparsec >= 0.13.0.1, + binary-parsers >= 0.2.3, base == 4.*, base-compat >= 0.9.1 && <0.10, time-locale-compat >=0.1.1 && <0.2, @@ -42,7 +43,6 @@ library deepseq, dlist >= 0.2, fail == 4.9.*, - ghc-prim >= 0.2, hashable >= 1.1.2.0, mtl, scientific >= 0.3.4.7 && < 0.4, @@ -85,7 +85,6 @@ executable aeson-benchmark-compare bytestring, criterion >= 1.0, deepseq, - ghc-prim, json-builder, text @@ -100,7 +99,6 @@ executable aeson-benchmark-micro bytestring, criterion >= 1.0, deepseq, - ghc-prim, text executable aeson-benchmark-typed @@ -117,7 +115,6 @@ executable aeson-benchmark-typed base-compat, criterion >= 1.0, deepseq, - ghc-prim, text, time @@ -147,6 +144,7 @@ executable aeson-benchmark-aeson-encode build-depends: aeson-benchmarks, attoparsec, + binary-parsers, base, base-compat, bytestring, @@ -159,6 +157,7 @@ executable aeson-benchmark-aeson-parse build-depends: aeson-benchmarks, attoparsec, + binary-parsers, base, base-compat, bytestring, diff --git a/cbits/unescape_string.c b/cbits/unescape_string.c index 2d3120946..97cc174d7 100644 --- a/cbits/unescape_string.c +++ b/cbits/unescape_string.c @@ -148,3 +148,26 @@ int _js_decode_string(uint16_t *const dest, size_t *destoff, DISPATCH_ASCII(unicode1) } +// if return >= 0, it's the split offset +// if return == -1, then string ends without a tailing backslash +// if return == -2, then string ends with a tailing backslash +int _js_find_string_end(const int bs, uint8_t *const start, uint8_t *const end){ + uint8_t *s = start; + int backslash = bs; + while (s < end) { + if (backslash == -2){ + backslash = -1; + s++; + if (s == end) break; + } + if (*s == 92) { // backslash + backslash = -2; + s++; + } + else if (*s == 34) { // double quote + return (s - start); + } + else { s++; } + } + return backslash; +} diff --git a/stack-bench.yaml b/stack-bench.yaml index 3a96f94fd..876d1c9f8 100644 --- a/stack-bench.yaml +++ b/stack-bench.yaml @@ -3,6 +3,7 @@ packages: - benchmarks extra-deps: - semigroups-0.18.2 +- binary-parsers-0.2.3 flags: semigroups: bytestring-builder: false diff --git a/stack-lts6.yaml b/stack-lts6.yaml index f4bbb7c81..eb28d6fd4 100644 --- a/stack-lts6.yaml +++ b/stack-lts6.yaml @@ -3,6 +3,7 @@ packages: - '.' extra-deps: - semigroups-0.18.2 +- binary-parsers-0.2.3 flags: aeson: fast: true diff --git a/stack-lts7.yaml b/stack-lts7.yaml index 92a1898c3..92859c31d 100644 --- a/stack-lts7.yaml +++ b/stack-lts7.yaml @@ -1,6 +1,8 @@ resolver: lts-7.0 packages: - '.' +extra-deps: +- binary-parsers-0.2.3 flags: aeson: fast: true diff --git a/tests/Properties.hs b/tests/Properties.hs index 1ad68345b..3319a8528 100644 --- a/tests/Properties.hs +++ b/tests/Properties.hs @@ -35,7 +35,7 @@ import Test.Framework (Test, testGroup) import Test.Framework.Providers.QuickCheck2 (testProperty) import Test.QuickCheck (Arbitrary(..), Property, (===), (.&&.), counterexample) import Types -import qualified Data.Attoparsec.Lazy as L +import qualified Data.Binary.Parser as BP import qualified Data.ByteString.Lazy.Char8 as L import qualified Data.HashMap.Strict as H import qualified Data.Map as Map @@ -73,10 +73,10 @@ toParseJSON1 parsejson1 tojson1 = toParseJSON parsejson tojson roundTripEnc :: (FromJSON a, ToJSON a, Show a) => (a -> a -> Property) -> a -> a -> Property roundTripEnc eq _ i = - case fmap ifromJSON . L.parse value . encode $ i of - L.Done _ (ISuccess v) -> v `eq` i - L.Done _ (IError path err) -> failure "fromJSON" (formatError path err) i - L.Fail _ _ err -> failure "parse" err i + case fmap ifromJSON . BP.parseLazy value . encode $ i of + Right (ISuccess v) -> v `eq` i + Right (IError path err) -> failure "fromJSON" (formatError path err) i + Left err -> failure "parse" err i roundTripNoEnc :: (FromJSON a, ToJSON a, Show a) => (a -> a -> Property) -> a -> a -> Property