Introduction / refresher of parser combinators & transormers
Introduce the basics of parser combinators
Implement parsers in two combinator libraries and our own toy library
Popularized / introduced to Haskell by the parsec library around 2001
Latter there was the attoparsec with slightly different focus
Latter still there was the megaparsec as an advanced fork / successor to parsec
We will use attoparsec and megaparsec
We will define the minimum parsers primitives common to attoparsec and megaparsec which we will implement ourselves
This is all we actually need.
-- replace the result of a parser with a constant
(<$) :: Functor f => b -> f a -> f b
($>) :: Functor f => f a -> b -> f b
-- Combine a parser producing a function and
-- one producing a value. Run them sequentially
-- and produce the value applied to the function.
(<*>) :: Applicative f => f (a -> b) -> f a -> f b
-- Combine two parsers running them sequentially
-- ignoring the result of the second and producing
-- the result of the first.
(<*) :: Applicative f => f a -> f b -> f a
-- Create a parser that never fails and always
-- produces the provided value
pure :: Applicative f => a -> f a
-- Combine two parsers taking the result
-- of the first if it succeeds ignoring and
-- on failure running the second.
(<|>) :: Alternative f => f a -> f a -> f a
-- Combine a list of parsers attempting them
-- each in order returning the result of the
-- first one that does not fail ignoring the rest.
asum, choice :: Alternative f => [f a] -> f a
-- Repeatedly run a parser until it fails
-- producing a list with the result of
-- each successful parse
many :: Alternative f => f a -> f [a]
A list of dot-separated DNS labels, each label consisting of
parseDNS :: Parser (NonEmpty Text)
parseDNS = do
labels <- (:|) <$> parseDNSLabel <*> many (char '.' >> parseDNSLabel)
if Text.all Char.isDigit . NonEmpty.last $ labels
then fail "TLD can not be all digits"
else pure labels
parseDNSLabel :: Parser Text
parseDNSLabel = do
label <- (:|) <$> parseAlphaNum <*> many (char '-' <|> parseAlphaNum)
if NonEmpty.last label == '-'
then
fail "label can't end in hyphen"
else
pure $ Text.pack $ NonEmpty.toList label
parseAlphaNum :: Parser Char
parseAlphaNum = satisfy (\c -> Char.isAlphaNum c && Char.isAscii c)
testCase "DNS" do
assertEqual "" (Right $ ["example", "com"]) (NonEmpty.toList <$> parseOnly parseDNS "example.com")
assertEqual "" (Right $ ["com"]) (NonEmpty.toList <$> parseOnly parseDNS "com")
assertEqual "" (Right $ ["jimmiy-james", "1235", "no"])
(NonEmpty.toList <$> parseOnly parseDNS "jimmiy-james.1235.no")
assertEqual "" (Left ()) (first (const ()) $ parseOnly parseDNS "jimmiy-.1235.no")
assertEqual "" (Left ()) (first (const ()) $ parseOnly parseDNS "blah.456")
data Root
= RootA A
| RootB B
| RootC C
deriving stock (Show, Read, Eq, Ord, Generic)
deriving anyclass (NFData)
data A = A
{ id :: Char
, children :: [Child]
}
deriving stock (Show, Read, Eq, Ord, Generic)
deriving anyclass (NFData)
data Child
= ChildB B
| ChildC C
deriving stock (Show, Read, Eq, Ord, Generic)
deriving anyclass (NFData)
data B = B
{ id :: Int
, mchild :: Maybe A
}
deriving stock (Show, Read, Eq, Ord, Generic)
deriving anyclass (NFData)
data C = C
{ happy :: Bool
, name :: Text
}
deriving stock (Show, Read, Eq, Ord, Generic)
deriving anyclass (NFData)
prettyRoot :: Root -> Doc ann
prettyRoot = \case
RootA x -> prettyA x
RootB x -> prettyB x
RootC x -> prettyC x
prettyA :: A -> Doc ann
prettyA x =
vsep
[ "A["
, indent 2 $
vsep
[ fillSep [ "id" , "=" , squotes (pretty $ escapedId)]
, fillSep [ "children" , "="
, braces (align (sep (map prettyChild x.children))) ]
]
, "]A"
]
where
escapedId
| x.id `elem` ['\'', '\\'] = ['\\', x.id]
| otherwise =[x.id]
prettyChild :: Child -> Doc ann
prettyChild = \case
ChildB x -> prettyB x
ChildC x -> prettyC x
prettyB :: B -> Doc ann
prettyB x =
Pretty.vsep
[ "B["
, indent 2 $
vsep
[ fillSep ["id", "=", pretty x.id]
, maybe
mempty
(\child -> fillSep ["mchild", "=", align (prettyA child)])
x.mchild
]
, "]B"
]
prettyC :: C -> Doc ann
prettyC x =
vsep
[ "C["
, indent 2 $
vsep
[ fillSep ["happy", "=", if x.happy then "yes" else "no"]
, fillSep [ "name" , "=" , dquotes (pretty escapeName)]
]
, "]C"
]
where
escapeName =
Text.replace "\"" "\\\"" $ Text.replace "\\" "\\\\" x.name
parseRoot :: Parser Root
parseRoot = choice [RootA <$> parseA, RootB <$> parseB, RootC <$> parseC]
anyExceptOrEscaped :: Char -> Parser Char
anyExceptOrEscaped ec = do
nc <- anyChar
guard (nc /= ec)
choice
[ do
guard (nc == '\\')
nnc <- anyChar
guard (nnc `elem` ['\\', ec])
pure nnc
, pure nc
]
skipSpaces :: Parser ()
skipSpaces = skipMany (satisfy Char.isSpace)
parseA :: Parser A
parseA = do
_ <- skipSpaces >> string "A["
id_ <- do
skipSpaces >> string "id" >> skipSpaces >> string "=" >> skipSpaces
char '\'' *> some (anyExceptOrEscaped '\'') <* char '\''
>>= \case
"\\'" -> pure '\''
[x] -> pure x
[] -> fail "expecting single character but got none"
xs -> fail ("expecting single character but got many: " <> xs)
children <- do
skipSpaces >> string "children" >> skipSpaces >> string "=" >> skipSpaces
char '{' *> many (choice [ChildB <$> parseB, ChildC <$> parseC]) <* char '}'
_ <- skipSpaces >> string "]A"
pure A{id = id_, ..}
parseB :: Parser B
parseB = do
_ <- skipSpaces >> string "B["
id_ <- do
skipSpaces >> string "id" >> skipSpaces >> string "=" >> skipSpaces
(:) <$> char '-' <*> some (satisfy Char.isDigit) <|> some (satisfy Char.isDigit)
>>= maybe (fail "expecting an integer") pure . readMaybe
mchild <- optional do
skipSpaces >> string "mchild" >> skipSpaces >> string "=" >> skipSpaces
parseA
_ <- skipSpaces >> string "]B"
pure B{id = id_, ..}
parseC :: Parser C
parseC = do
_ <- skipSpaces >> string "C["
happy <- do
skipSpaces >> string "happy" >> skipSpaces >> string "=" >> skipSpaces
(True <$ string "yes") <|> (False <$ string "no")
name <- do
skipSpaces >> string "name" >> skipSpaces >> string "=" >> skipSpaces
Text.pack <$> (char '"' *> many (anyExceptOrEscaped '"') <* char '"')
_ <- skipSpaces >> string "]C"
pure C{..}
Same code as for attoparsec except for
When we run it on the first example we get the following error.
2:11:
|
2 | id = 'w'
| ^
unknown parse error
<|>
and
try
The parser try p
behaves like the parser p
,
except that it backtracks the parser state when p
fails
(either consuming input or not).
This combinator is used whenever arbitrary look ahead is needed.
Since it pretends that it hasn’t consumed any input when p
fails, the (<|>)
combinator will try its second
alternative even if the first parser failed while consuming input.
For example, here is a parser that is supposed to parse the word “let” or the word “lexical”:
>>> parseTest (string "let" <|> string "lexical") "lexical"
1:1:
unexpected "lex"
expecting "let"
What happens here? The first parser consumes “le” and fails (because it doesn’t see a “t”). The second parser, however, isn’t tried, since the first parser has already consumed some input! try fixes this behavior and allows backtracking to work:
>>> parseTest (try (string "let") <|> string "lexical") "lexical"
"lexical"
empty <|> a == a
a <|> empty == a
Technically no since
it depends on your definition of empty
empty is a parser that fails without consuming input
There are good reasons for it but it is still surprising with respect to other alternative instances.
Sprinkle
try
’s wherever <|>
, many
,
optional
or choice
is used
@@ -43,7 +43,7 @@
anyChar = anySingle
parseRoot :: Parser Root
-parseRoot = choice [RootA <$> parseA, RootB <$> parseB, RootC <$> parseC]
+parseRoot = choice [RootA <$> try parseA, RootB <$> try parseB, RootC <$> try parseC]
parseA :: Parser A
parseA = do
@@ -51,7 +51,7 @@
id_ <- do
skipSpaces >> string "id" >> skipSpaces >> string "=" >> skipSpaces
- char '\'' *> some (anyExceptOrEscaped '\'') <* char '\''
+ char '\'' *> some (try $ anyExceptOrEscaped '\'') <* char '\''
>>= \case
"\\'" -> pure '\''
[x] -> pure x
@@ -60,7 +60,7 @@
children <- do
skipSpaces >> string "children" >> skipSpaces >> string "=" >> skipSpaces
- char '{' *> many (choice [ChildB <$> parseB, ChildC <$> parseC]) <* char '}'
+ char '{' *> many (choice [ChildB <$> try parseB, ChildC <$> try parseC]) <* char '}'
_ <- skipSpaces >> string "]A"
pure A{id = id_, ..}
@@ -74,7 +74,8 @@
(:) <$> char '-' <*> some (satisfy Char.isDigit) <|> some (satisfy Char.isDigit)
>>= maybe (fail "expecting an integer") pure . readMaybe
- mchild <- optional do
+ mchild <-
+ (optional . try) do
skipSpaces >> string "mchild" >> skipSpaces >> string "=" >> skipSpaces
parseA
@@ -92,7 +93,7 @@
name <- do
skipSpaces >> string "name" >> skipSpaces >> string "=" >> skipSpaces
- Text.pack <$> (char '"' *> many (anyExceptOrEscaped '"') <* char '"')
+ Text.pack <$> (char '"' *> many (try $ anyExceptOrEscaped '"') <* char '"')
_ <- skipSpaces >> string "]C"
@@ -103,7 +104,7 @@
nc <- anyChar
guard (nc /= ec)
choice
- [ do
+ [ try do
guard (nc == '\\')
nnc <- anyChar
guard (nnc `elem` ['\\', ec])
@@ -112,4 +113,4 @@
]
skipSpaces :: Parser ()
-skipSpaces = skipMany (satisfy Char.isSpace)
+skipSpaces = skipMany (try $ satisfy Char.isSpace)
data Parser a
instance Functor Parser
instance Applicative Parser
instance Monad Parser
instance Alternative Parser
instance MonadPlus Parser
instance MonadFail Parser
parseOnly :: Parser a -> Text -> Either String a
parseOnly = undefined
anyChar :: Parser Char
anyChar = undefined
char :: Char -> Parser Char
char = undefined
string :: Text -> Parser Text
string = undefined
satisfy :: (Char -> Bool) -> Parser Char
satisfy = undefined
This looks like quite a few instances to implement and get wrong.
This is where
transformers come in. We will get all the instances except
MonadFail
for free.
MonadTrans
and the lift
function.lift :: (MonadTrans t, Monad m) => m a -> t m a
lift
MaybeT
MonadFail
and Alternative
ExceptT
MonadError
and Alternative
StateT
MonadState
ReaderT
MonadReader
WriterT
MonadWriter
Depending on which tranformers you combine the order in which you layer them changes runtime semantics
StateT s (ExceptT e Identity) a
ExceptT e (StateT s Identity) a
<|>
I can never remember which way is the “correct” way
Luckily Haskell is succinct so you can just look at the
implementation
For
StateT
newtype Parser a = Parser (Parser' a)
deriving newtype (Functor, Applicative, Monad
, Alternative, MonadPlus, MonadFail)
newtype State = State
{ getText :: Text
}
newtype Parser' a =
Parser' (StateT State (ExceptT String Identity) a)
deriving newtype (Functor, Applicative, Monad
, Alternative, MonadPlus, MonadState State
, MonadError String)
instance MonadFail Parser' where
fail = Parser' . throwError
parseOnly :: Parser a -> Text -> Either String a
parseOnly (Parser (Parser' p')) text =
runIdentity (runExceptT (evalStateT p' initS))
where
initS = State{getText = text}
satisfy :: (Char -> Bool) -> Parser Char
satisfy = satisfy' ""
satisfy' :: String -> (Char -> Bool) -> Parser Char
satisfy' msg condition = Parser do
s <- get
let onNextChar (nc, t)
| condition nc = nc <$ put (s{getText = t})
| otherwise = fail msg
maybe (fail "empty") onNextChar (Text.uncons s.getText)
anyChar :: Parser Char
anyChar = satisfy' "" (const True)
char :: Char -> Parser Char
char c = satisfy' ['\'', c, '\''] (c ==)
string :: Text -> Parser Text
string txt = Parser do
s <- get
let onFail = fail $ "expecting '" ++ Text.unpack txt ++ "'"
maybe onFail (\t -> txt <$ put (s{getText = t}))
(Text.stripPrefix txt s.getText)
The second A.id
is not valid here.
A[
id = 'w'
children = {B[
id = 2721420254829773030
mchild = A[
id = no
children = {}
]A
]B}
]A
The error reported is strange and unhelpful.
'}'expecting 'B['expecting 'C['
There are a couple of reasons for this
newtype Parser a = Parser (Parser' a)
deriving newtype (Functor, Applicative
, Monad, Alternative, MonadPlus, MonadFail)
data State = State
{ getText :: !Text
, getStack :: ![String]
}
data Error = Error
{ stack :: [String]
, msg :: String
}
newtype Parser' a =
Parser' (StateT State (ExceptT [Error] Identity) a)
deriving newtype (Functor, Applicative, Monad, Alternative
, MonadPlus, MonadState State, MonadError [Error])
instance MonadFail Parser' where
fail msg = Parser' do
s <- get
throwError [Error{stack = s.getStack, msg}]
errorToString :: [Error] -> String
errorToString = unlines .
map (\err -> err.msg ++
" (" ++ List.intercalate " > " (reverse err.stack) ++ ")")
parseOnly :: Parser a -> Text -> Either String a
parseOnly (Parser (Parser' p')) text =
first errorToString $
runIdentity (runExceptT (evalStateT p' initS))
where
initS = State{getText = text, getStack = []}
satisfy :: (Char -> Bool) -> Parser Char
satisfy = (<?> "satisfy") . satisfy' ""
satisfy' :: String -> (Char -> Bool) -> Parser Char
satisfy' msg condition = Parser do
s <- get
let onNextChar (nc, t)
| condition nc = nc <$ put (s{getText = t})
| otherwise = fail msg
maybe (fail "empty") onNextChar (Text.uncons s.getText)
anyChar :: Parser Char
anyChar = satisfy' "" (const True) <?> "anyChar"
char :: Char -> Parser Char
char c = satisfy' ['\'', c, '\''] (c ==) <?> "char"
string :: Text -> Parser Text
string txt = (<?> "string") $ Parser do
s <- get
let onFail = fail $ "expecting '" ++ Text.unpack txt ++ "'"
maybe onFail (\t -> txt <$ put (s{getText = t}))
(Text.stripPrefix txt s.getText)
infix 0 <?>
(<?>) :: Parser a -> String -> Parser a
Parser p <?> tag = Parser $ pushStack *> p <* popStack
where
pushStack = modify' (\s -> s{getStack = tag : s.getStack})
popStack = modify' (\s -> s{getStack = drop 1 s.getStack})
The second A.id
is not valid here.
A[
id = 'w'
children = {B[
id = 2721420254829773030
mchild = A[
id = no
children = {}
]A
]B}
]A
Much clearer error reported
'}' (parseRoot > parserA > children > char)
expecting 'B[' (parseRoot > parserB > start > string)
expecting 'C[' (parseRoot > parserC > begin > string)
We can now clearly see the problem is with the scope over which apply
<|>
, choice
etc.
In our toyparsec and attoparsec
based parsers we would need to change the scope over which we apply
<|>
In our megaparsec based parsers we can play with
moving try
around.
But we want a general solution for both.
parseRoot :: Parser Root
parseRoot =
(<?> "parseRoot") do
choice [parseAStart, parseBStart, parseCStart]
>>= \case
StartA -> RootA <$> parseABody
StartB -> RootB <$> parseBBody
StartC -> RootC <$> parseCBody
data Start
= StartA
| StartB
| StartC
parseAStart :: Parser Start
parseAStart = StartA <$ (skipSpaces >> string "A[" <?> "start A")
parseABody :: Parser A
parseABody = undefined
parseBStart :: Parser Start
parseBStart = StartB <$ (skipSpaces >> string "B[" <?> "start")
parseBBody :: Parser B
parseBBody = undefined
parseCStart :: Parser Start
parseCStart = StartC <$ (skipSpaces >> string "C[" <?> "begin")
parseCBody :: Parser C
parseCBody = undefined
parseRoot :: Parser Root
parseRoot = (<?> "parseRoot") do
join $ choice [fmap RootA <$> parseAStart
, fmap RootB <$> parseBStart, fmap RootC <$> parseCStart]
parseAStart :: Parser (Parser A)
parseAStart = (skipSpaces >> string "A[" <?> "start A") >> pure parseABody
parseABody :: Parser A
parseABody = undefined
parseBStart :: Parser (Parser B)
parseBStart = (skipSpaces >> string "B[" <?> "start") >> pure parseBBody
parseBBody :: Parser B
parseBBody = undefined
parseCStart :: Parser (Parser C)
parseCStart = (skipSpaces >> string "C[" <?> "begin") >> pure parseCBody
parseCBody :: Parser C
parseCBody = undefined
skipSpaces :: Parser ()
skipSpaces = skipMany (satisfy Char.isSpace) <?> "skipSpaces"
parseA :: Parser A
parseA = (<?> "parserA") do
_ <- skipSpaces >> string "A[" <?> "start"
id_ <- (<?> "id") do
skipSpaces >> string "id" >> skipSpaces >> string "=" >> skipSpaces
char '\'' *> some (anyExceptOrEscaped '\'') <* char '\''
>>= \case
"\\'" -> pure '\''
[x] -> pure x
[] ->
fail "expecting single character but got none"
xs ->
fail ("expecting single character but got many: " <> xs)
children <- (<?> "children") do
skipSpaces >> string "children" >> skipSpaces >> string "=" skipSpaces
char '{' *>
many (choice [ChildB <$> parseB, ChildC <$> parseC])
<* char '}'
_ <- skipSpaces >> string "]A" <?> "end"
pure A{id = id_, ..}
parseAStart :: Parser (Parser A)
parseAStart =
(skipSpaces >> string "A[" <?> "start A") >> pure parseABody
parseABody :: Parser A
parseABody = (<?> "body A") do
id_ <- (<?> "id") do
skipSpaces >> string "id" >> skipSpaces >> string "=" >> skipSpaces
char '\'' *> some (anyExceptOrEscaped '\'') <* char '\''
>>= \case
"\\'" -> pure '\''
[x] -> pure x
[] ->
fail "expecting single character but got none"
xs ->
fail ("expecting single character but got many: " <> xs)
children <- (<?> "children") do
skipSpaces >> string "children" >> skipSpaces >> string "=" >> skipSpaces
_ <- char '{'
choice
[ [] <$ skipSpaces <* char '}' <?> "no children"
, some (join $ choice [ fmap ChildB <$> parseBStart
, fmap ChildC <$> parseCStart])
<* char '}'
]
_ <- skipSpaces >> string "]A" <?> "end A"
pure A{id = id_, ..}
parseB :: Parser B
parseB = (<?> "parserB") do
_ <- skipSpaces >> string "B[" <?> "start"
id_ <- (<?> "id") do
skipSpaces >> string "id" >> skipSpaces >> string "=" >> skipSpaces
(:) <$> char '-' <*> some (satisfy Char.isDigit)
<|> some (satisfy Char.isDigit)
>>= maybe (fail "expecting an integer") pure . readMaybe
mchild <- optional do
skipSpaces >> string "mchild" >> skipSpaces >> string "=" >> skipSpaces
parseA
_ <- skipSpaces >> string "]B" <?> "end"
pure B{id = id_, ..}
parseBStart :: Parser (Parser B)
parseBStart =
(skipSpaces >> string "B[" <?> "start") >> pure parseBBody
parseBBody :: Parser B
parseBBody = (<?> "parserB") do
id_ <- (<?> "id") do
skipSpaces >> string "id" >> skipSpaces >> string "=" >> skipSpaces
(:) <$> char '-' <*> some (satisfy Char.isDigit)
<|> some (satisfy Char.isDigit)
>>= maybe (fail "expecting an integer") pure . readMaybe
mchild <- (join . fmap sequence . optional) do
skipSpaces >> string "mchild"
>> pure (skipSpaces >> string "=" >> skipSpaces >>
join parseAStart)
_ <- skipSpaces >> string "]B" <?> "end"
pure B{id = id_, ..}
parseC :: Parser C
parseC = (<?> "parserC") do
_ <- skipSpaces >> string "C[" <?> "begin"
happy <- (<?> "happy") do
skipSpaces >> string "happy" >> skipSpaces >> string "=" >> skipSpaces
(True <$ string "yes") <|> (False <$ string "no")
name <- (<?> "name") do
skipSpaces >> string "name" >> skipSpaces >> string "=" >> skipSpaces
Text.pack
<$> (char '"' *> many (anyExceptOrEscaped '"') <* char '"')
_ <- skipSpaces >> string "]C" <?> "end"
pure C{..}
parseCStart :: Parser (Parser C)
parseCStart =
(skipSpaces >> string "C[" <?> "begin") >> pure parseCBody
parseCBody :: Parser C
parseCBody = (<?> "parserC") do
happy <- (<?> "happy") do
skipSpaces >> string "happy" >> skipSpaces >> string "=" >> skipSpaces
(True <$ string "yes") <|> (False <$ string "no")
name <- (<?> "name") do
skipSpaces >> string "name" >> skipSpaces >> string "=" >> skipSpaces
Text.pack
<$> (char '"' *> many (anyExceptOrEscaped '"') <* char '"')
_ <- skipSpaces >> string "]C" <?> "end"
pure C{..}
The second A.id
is not valid here.
A[
id = 'w'
children = {B[
id = 2721420254829773030
mchild = A[
id = no
children = {}
]A
]B}
]A
Now we have a much more accurate error
'}' (parseRoot > body A > children > no children > char)
''' (parseRoot > body A > children > parserB > body A > id > char)
We didn’t have no children
When we tried to parse our children we found an issue with the
id
field inside the A
child of
B
.