From 9ca2deba67e071a9d26e81a1b4a04eebc170862d Mon Sep 17 00:00:00 2001 From: Laura Hausmann Date: Fri, 15 Nov 2024 23:43:44 +0100 Subject: [PATCH] [backend/parsing] Drastically improve MFM parser performance This commit improves MFM parser performance by up to 22x, depending on input and platform. --- Iceshrimp.Parsing/Mfm.fs | 132 ++++++++++++++++++++++----------------- 1 file changed, 76 insertions(+), 56 deletions(-) diff --git a/Iceshrimp.Parsing/Mfm.fs b/Iceshrimp.Parsing/Mfm.fs index 3665ac74..f9dbc51d 100644 --- a/Iceshrimp.Parsing/Mfm.fs +++ b/Iceshrimp.Parsing/Mfm.fs @@ -197,45 +197,53 @@ module private MfmParser = } // Patterns - let italicPattern = ((notFollowedBy <| str "**") >>. skipChar '*') - let italicPatternAlt = ((notFollowedBy <| str "__") >>. skipChar '_') - let codePattern = (notFollowedBy <| str "```") >>. skipChar '`' + let italicPatternAsterisk = notFollowedByString "**" >>. skipChar '*' + let italicPatternUnderscore = notFollowedByString "__" >>. skipChar '_' + let codePattern = notFollowedByString "```" >>. skipChar '`' // Matchers let hashtagMatcher = letter <|> digit <|> anyOf "-_" let hashtagSatisfier = attempt hashtagMatcher // Node parsers - - let italicNode1 = + let italicAsteriskNode = previousCharSatisfiesNot isNotWhitespace - >>. italicPattern + >>. italicPatternAsterisk >>. pushLine - >>. manyTill inlineNode italicPattern + >>. manyTill inlineNode italicPatternAsterisk .>> assertLine - - let italicNode2 = - previousCharSatisfiesNot isNotWhitespace - >>. italicPatternAlt - >>. pushLine - >>. manyTill inlineNode italicPatternAlt - .>> assertLine - - let italicNode3 = - skipString "" >>. pushLine >>. manyTill inlineNode (skipString "") - .>> assertLine - - let italicNode = - italicNode1 <|> italicNode2 <|> italicNode3 |>> fun c -> MfmItalicNode(aggregateTextInline c) :> MfmNode - let boldNode = - (skipString "**" >>. pushLine >>. manyTill inlineNode (skipString "**") - .>> assertLine) - <|> (skipString "__" >>. pushLine >>. manyTill inlineNode (skipString "__") - .>> assertLine) - <|> (skipString "" >>. pushLine >>. manyTill inlineNode (skipString "") - .>> assertLine) + let italicUnderscoreNode = + previousCharSatisfiesNot isNotWhitespace + >>. italicPatternUnderscore + >>. pushLine + >>. manyTill inlineNode italicPatternUnderscore + .>> assertLine + |>> fun c -> MfmItalicNode(aggregateTextInline c) :> MfmNode + + let italicTagNode = + skipString "" >>. manyTill inlineNode (skipString "") + |>> fun c -> MfmItalicNode(aggregateTextInline c) :> MfmNode + + let boldAsteriskNode = + previousCharSatisfiesNot isNotWhitespace + >>. skipString "**" + >>. pushLine + >>. manyTill inlineNode (skipString "**") + .>> assertLine + |>> fun c -> MfmBoldNode(aggregateTextInline c) :> MfmNode + + let boldUnderscoreNode = + previousCharSatisfiesNot isNotWhitespace + >>. skipString "__" + >>. pushLine + >>. manyTill inlineNode (skipString "__") + .>> assertLine + |>> fun c -> MfmBoldNode(aggregateTextInline c) :> MfmNode + + let boldTagNode = + skipString "" >>. manyTill inlineNode (skipString "") |>> fun c -> MfmBoldNode(aggregateTextInline c) :> MfmNode let strikeNode = @@ -250,6 +258,7 @@ module private MfmParser = let codeBlockNode = opt skipNewline >>. opt skipNewline + >>. followedByString "```" >>. previousCharSatisfiesNot isNotNewline >>. skipString "```" >>. opt (many1CharsTill asciiLetter (lookAhead newline)) @@ -317,7 +326,7 @@ module private MfmParser = >>. many1CharsTill hashtagMatcher (notFollowedBy hashtagSatisfier) |>> fun h -> MfmHashtagNode(h) :> MfmNode - let urlNodePlain = + let urlNode = lookAhead (skipString "https://" <|> skipString "http://") >>. manyCharsTill anyChar (nextCharSatisfies isWhitespace <|> nextCharSatisfies (isAnyOf "()") <|> eof) //FIXME: this needs significant improvements >>= fun uri -> @@ -342,8 +351,6 @@ module private MfmParser = | _ -> fail "invalid scheme" | _ -> fail "invalid url" - let urlNode = urlNodePlain <|> urlNodeBrackets - let linkNode = (opt (pchar '?')) .>>. (pchar '[' >>. manyCharsTill anyChar (pchar ']')) @@ -380,35 +387,48 @@ module private MfmParser = let charNode = anyChar |>> fun v -> MfmCharNode(v) :> MfmNode - // Node collection - let inlineNodeSeq = - [ plainNode - smallNode - italicNode - boldNode - strikeNode - hashtagNode - mentionNode - codeNode - urlNode - linkNode - mathNode - emojiCodeNode - fnNode - charNode ] + // Custom parser for higher throughput + type ParseMode = + | Full + | Inline + | Simple - let simpleNodeSeq = [ plainNode; emojiCodeNode; charNode ] + let parseNode (m: ParseMode) = + let prefixedNode (m: ParseMode) : Parser = + fun (stream: CharStream<_>) -> + match (stream.Peek(), m) with + // Block nodes, ordered by expected frequency + | '`', Full -> codeBlockNode <|> codeNode + | '\n', Full when stream.Match("\n```") -> codeBlockNode + | '\n', Full when stream.Match("\n\n```") -> codeBlockNode + | '>', Full -> quoteNode + | '<', Full when stream.Match "
" -> centerNode + | '\\', Full when stream.Match "\\[" -> mathBlockNode + // Inline nodes, ordered by expected frequency + | '*', (Full | Inline) -> italicAsteriskNode <|> boldAsteriskNode + | '_', (Full | Inline) -> italicUnderscoreNode <|> boldUnderscoreNode + | '@', (Full | Inline) -> mentionNode + | '#', (Full | Inline) -> hashtagNode + | '`', Inline -> codeNode + | 'h', (Full | Inline) when stream.Match "http" -> urlNode + | ':', (Full | Inline | Simple) -> emojiCodeNode + | '~', (Full | Inline) when stream.Match "~~" -> strikeNode + | '[', (Full | Inline) -> linkNode + | '<', (Full | Inline) -> choice [ plainNode; smallNode; italicTagNode; boldTagNode; urlNodeBrackets ] + | '<', Simple when stream.Match "" -> plainNode + | '\\', (Full | Inline) when stream.Match "\\(" -> mathNode + | '$', (Full | Inline) when stream.Match "$[" -> fnNode + | '?', (Full | Inline) when stream.Match "[" -> linkNode + // Fallback to char node + | _ -> charNode + <| stream - let blockNodeSeq = [ centerNode; codeBlockNode; mathBlockNode; quoteNode ] - - let nodeSeq = [ blockNodeSeq; inlineNodeSeq ] + attempt <| prefixedNode m <|> charNode // Populate references - do nodeRef.Value <- choice <| seqAttempt (seqFlatten <| nodeSeq) - - do inlineNodeRef.Value <- choice <| (seqAttempt inlineNodeSeq) |>> fun v -> v :?> MfmInlineNode - - do simpleRef.Value <- choice <| seqAttempt simpleNodeSeq + do nodeRef.Value <- parseNode Full + do inlineNodeRef.Value <- parseNode Inline |>> fun v -> v :?> MfmInlineNode + do simpleRef.Value <- parseNode Simple // Final parse command let parse = spaces >>. manyTill node eof .>> spaces