[backend/parsing] Drastically improve MFM parser performance

This commit improves MFM parser performance by up to 22x, depending on input and platform.
This commit is contained in:
Laura Hausmann 2024-11-15 23:43:44 +01:00
parent 8b3e25e7db
commit 9ca2deba67
No known key found for this signature in database
GPG key ID: D044E84C5BE01605

View file

@ -197,45 +197,53 @@ module private MfmParser =
} }
// Patterns // Patterns
let italicPattern = ((notFollowedBy <| str "**") >>. skipChar '*') let italicPatternAsterisk = notFollowedByString "**" >>. skipChar '*'
let italicPatternAlt = ((notFollowedBy <| str "__") >>. skipChar '_') let italicPatternUnderscore = notFollowedByString "__" >>. skipChar '_'
let codePattern = (notFollowedBy <| str "```") >>. skipChar '`' let codePattern = notFollowedByString "```" >>. skipChar '`'
// Matchers // Matchers
let hashtagMatcher = letter <|> digit <|> anyOf "-_" let hashtagMatcher = letter <|> digit <|> anyOf "-_"
let hashtagSatisfier = attempt hashtagMatcher let hashtagSatisfier = attempt hashtagMatcher
// Node parsers // Node parsers
let italicAsteriskNode =
let italicNode1 =
previousCharSatisfiesNot isNotWhitespace previousCharSatisfiesNot isNotWhitespace
>>. italicPattern >>. italicPatternAsterisk
>>. pushLine >>. pushLine
>>. manyTill inlineNode italicPattern >>. manyTill inlineNode italicPatternAsterisk
.>> assertLine .>> assertLine
let italicNode2 =
previousCharSatisfiesNot isNotWhitespace
>>. italicPatternAlt
>>. pushLine
>>. manyTill inlineNode italicPatternAlt
.>> assertLine
let italicNode3 =
skipString "<i>" >>. pushLine >>. manyTill inlineNode (skipString "</i>")
.>> assertLine
let italicNode =
italicNode1 <|> italicNode2 <|> italicNode3
|>> fun c -> MfmItalicNode(aggregateTextInline c) :> MfmNode |>> fun c -> MfmItalicNode(aggregateTextInline c) :> MfmNode
let boldNode = let italicUnderscoreNode =
(skipString "**" >>. pushLine >>. manyTill inlineNode (skipString "**") previousCharSatisfiesNot isNotWhitespace
.>> assertLine) >>. italicPatternUnderscore
<|> (skipString "__" >>. pushLine >>. manyTill inlineNode (skipString "__") >>. pushLine
.>> assertLine) >>. manyTill inlineNode italicPatternUnderscore
<|> (skipString "<b>" >>. pushLine >>. manyTill inlineNode (skipString "</b>") .>> assertLine
.>> assertLine) |>> fun c -> MfmItalicNode(aggregateTextInline c) :> MfmNode
let italicTagNode =
skipString "<i>" >>. manyTill inlineNode (skipString "</i>")
|>> fun c -> MfmItalicNode(aggregateTextInline c) :> MfmNode
let boldAsteriskNode =
previousCharSatisfiesNot isNotWhitespace
>>. skipString "**"
>>. pushLine
>>. manyTill inlineNode (skipString "**")
.>> assertLine
|>> fun c -> MfmBoldNode(aggregateTextInline c) :> MfmNode
let boldUnderscoreNode =
previousCharSatisfiesNot isNotWhitespace
>>. skipString "__"
>>. pushLine
>>. manyTill inlineNode (skipString "__")
.>> assertLine
|>> fun c -> MfmBoldNode(aggregateTextInline c) :> MfmNode
let boldTagNode =
skipString "<b>" >>. manyTill inlineNode (skipString "</b>")
|>> fun c -> MfmBoldNode(aggregateTextInline c) :> MfmNode |>> fun c -> MfmBoldNode(aggregateTextInline c) :> MfmNode
let strikeNode = let strikeNode =
@ -250,6 +258,7 @@ module private MfmParser =
let codeBlockNode = let codeBlockNode =
opt skipNewline opt skipNewline
>>. opt skipNewline >>. opt skipNewline
>>. followedByString "```"
>>. previousCharSatisfiesNot isNotNewline >>. previousCharSatisfiesNot isNotNewline
>>. skipString "```" >>. skipString "```"
>>. opt (many1CharsTill asciiLetter (lookAhead newline)) >>. opt (many1CharsTill asciiLetter (lookAhead newline))
@ -317,7 +326,7 @@ module private MfmParser =
>>. many1CharsTill hashtagMatcher (notFollowedBy hashtagSatisfier) >>. many1CharsTill hashtagMatcher (notFollowedBy hashtagSatisfier)
|>> fun h -> MfmHashtagNode(h) :> MfmNode |>> fun h -> MfmHashtagNode(h) :> MfmNode
let urlNodePlain = let urlNode =
lookAhead (skipString "https://" <|> skipString "http://") lookAhead (skipString "https://" <|> skipString "http://")
>>. manyCharsTill anyChar (nextCharSatisfies isWhitespace <|> nextCharSatisfies (isAnyOf "()") <|> eof) //FIXME: this needs significant improvements >>. manyCharsTill anyChar (nextCharSatisfies isWhitespace <|> nextCharSatisfies (isAnyOf "()") <|> eof) //FIXME: this needs significant improvements
>>= fun uri -> >>= fun uri ->
@ -342,8 +351,6 @@ module private MfmParser =
| _ -> fail "invalid scheme" | _ -> fail "invalid scheme"
| _ -> fail "invalid url" | _ -> fail "invalid url"
let urlNode = urlNodePlain <|> urlNodeBrackets
let linkNode = let linkNode =
(opt (pchar '?')) (opt (pchar '?'))
.>>. (pchar '[' >>. manyCharsTill anyChar (pchar ']')) .>>. (pchar '[' >>. manyCharsTill anyChar (pchar ']'))
@ -380,35 +387,48 @@ module private MfmParser =
let charNode = anyChar |>> fun v -> MfmCharNode(v) :> MfmNode let charNode = anyChar |>> fun v -> MfmCharNode(v) :> MfmNode
// Node collection // Custom parser for higher throughput
let inlineNodeSeq = type ParseMode =
[ plainNode | Full
smallNode | Inline
italicNode | Simple
boldNode
strikeNode
hashtagNode
mentionNode
codeNode
urlNode
linkNode
mathNode
emojiCodeNode
fnNode
charNode ]
let simpleNodeSeq = [ plainNode; emojiCodeNode; charNode ] let parseNode (m: ParseMode) =
let prefixedNode (m: ParseMode) : Parser<MfmNode, int64> =
fun (stream: CharStream<_>) ->
match (stream.Peek(), m) with
// Block nodes, ordered by expected frequency
| '`', Full -> codeBlockNode <|> codeNode
| '\n', Full when stream.Match("\n```") -> codeBlockNode
| '\n', Full when stream.Match("\n\n```") -> codeBlockNode
| '>', Full -> quoteNode
| '<', Full when stream.Match "<center>" -> centerNode
| '\\', Full when stream.Match "\\[" -> mathBlockNode
// Inline nodes, ordered by expected frequency
| '*', (Full | Inline) -> italicAsteriskNode <|> boldAsteriskNode
| '_', (Full | Inline) -> italicUnderscoreNode <|> boldUnderscoreNode
| '@', (Full | Inline) -> mentionNode
| '#', (Full | Inline) -> hashtagNode
| '`', Inline -> codeNode
| 'h', (Full | Inline) when stream.Match "http" -> urlNode
| ':', (Full | Inline | Simple) -> emojiCodeNode
| '~', (Full | Inline) when stream.Match "~~" -> strikeNode
| '[', (Full | Inline) -> linkNode
| '<', (Full | Inline) -> choice [ plainNode; smallNode; italicTagNode; boldTagNode; urlNodeBrackets ]
| '<', Simple when stream.Match "<plain>" -> plainNode
| '\\', (Full | Inline) when stream.Match "\\(" -> mathNode
| '$', (Full | Inline) when stream.Match "$[" -> fnNode
| '?', (Full | Inline) when stream.Match "[" -> linkNode
// Fallback to char node
| _ -> charNode
<| stream
let blockNodeSeq = [ centerNode; codeBlockNode; mathBlockNode; quoteNode ] attempt <| prefixedNode m <|> charNode
let nodeSeq = [ blockNodeSeq; inlineNodeSeq ]
// Populate references // Populate references
do nodeRef.Value <- choice <| seqAttempt (seqFlatten <| nodeSeq) do nodeRef.Value <- parseNode Full
do inlineNodeRef.Value <- parseNode Inline |>> fun v -> v :?> MfmInlineNode
do inlineNodeRef.Value <- choice <| (seqAttempt inlineNodeSeq) |>> fun v -> v :?> MfmInlineNode do simpleRef.Value <- parseNode Simple
do simpleRef.Value <- choice <| seqAttempt simpleNodeSeq
// Final parse command // Final parse command
let parse = spaces >>. manyTill node eof .>> spaces let parse = spaces >>. manyTill node eof .>> spaces