From 799d1054a9e1be686e9f808bc8bdf182e80e204f Mon Sep 17 00:00:00 2001 From: Laura Hausmann Date: Sun, 17 Mar 2024 20:52:18 +0100 Subject: [PATCH] [parsing/mfm] Reimplement MfmParser in F# (ISH-112) --- Iceshrimp.Parsing/Iceshrimp.Parsing.fsproj | 1 + Iceshrimp.Parsing/Mfm.fs | 337 +++++++++++++++++++++ Iceshrimp.Tests/Parsing/MfmTestsFParsec.cs | 195 ++++++++++++ 3 files changed, 533 insertions(+) create mode 100644 Iceshrimp.Parsing/Mfm.fs create mode 100644 Iceshrimp.Tests/Parsing/MfmTestsFParsec.cs diff --git a/Iceshrimp.Parsing/Iceshrimp.Parsing.fsproj b/Iceshrimp.Parsing/Iceshrimp.Parsing.fsproj index b6bd4a15..0860afdc 100644 --- a/Iceshrimp.Parsing/Iceshrimp.Parsing.fsproj +++ b/Iceshrimp.Parsing/Iceshrimp.Parsing.fsproj @@ -7,6 +7,7 @@ + diff --git a/Iceshrimp.Parsing/Mfm.fs b/Iceshrimp.Parsing/Mfm.fs new file mode 100644 index 00000000..97e3737c --- /dev/null +++ b/Iceshrimp.Parsing/Mfm.fs @@ -0,0 +1,337 @@ +namespace Iceshrimp.Parsing + +open System +open System.Collections.Generic +open System.Runtime.InteropServices.JavaScript +open FParsec + +module MfmNodeTypes = + [] + type MfmNode() = + member val Children: MfmNode list = [] with get, set + + [] + type MfmInlineNode(c: MfmInlineNode list) = + inherit MfmNode() + do base.Children <- c |> List.map (fun x -> x :> MfmNode) + + [] + type MfmBlockNode(c: MfmInlineNode list) = + inherit MfmNode() + do base.Children <- c |> List.map (fun x -> x :> MfmNode) + + type MfmTextNode(v: string) = + inherit MfmInlineNode([]) + member val Text = v + + type MfmItalicNode(c) = + inherit MfmInlineNode(c) + + type MfmBoldNode(c) = + inherit MfmInlineNode(c) + + type MfmStrikeNode(c) = + inherit MfmInlineNode(c) + + type MfmInlineCodeNode(v: string) = + inherit MfmInlineNode([]) + member val Code = v + + type MfmPlainNode(v: string) = + inherit MfmInlineNode([ MfmTextNode(v) ]) + + type MfmSmallNode(c) = + inherit MfmInlineNode(c) + + type MfmQuoteNode(c) = + inherit MfmBlockNode(c) + + type MfmSearchNode(content: string, query: string) = + inherit MfmBlockNode([]) + member val Content = content + member val Query = query + + type MfmCodeBlockNode(code, lang: string option) = + inherit MfmBlockNode([]) + member val Code = code + member val Language = lang + + type MfmMathBlockNode(f) = + inherit MfmBlockNode([]) + member val Formula = f + + type MfmCenterNode(c) = + inherit MfmBlockNode(c) + + type MfmUnicodeEmojiNode(e: string) = + inherit MfmInlineNode([]) + member val Emoji = e + + type MfmEmojiCodeNode(n) = + inherit MfmInlineNode([]) + member val Name = n + + type MfmMathInlineNode(f) = + inherit MfmInlineNode([]) + member val Formula = f + + type MfmMentionNode(acct, user, host: string option) = + inherit MfmInlineNode([]) + member val Acct = acct + member val Username = user + member val Host = host + + type MfmHashtagNode(h) = + inherit MfmInlineNode([]) + member val Hashtag = h + + type MfmUrlNode(url, brackets) = + inherit MfmInlineNode([]) + member val Url = url + member val Brackets = brackets + + type MfmLinkNode(url, text, silent) = + inherit MfmInlineNode([ MfmTextNode(text) ]) + member val Url = url + member val Silent = silent + + type MfmFnNode(args: Dictionary, name, children) = + inherit MfmInlineNode(children) + // (string, bool) args = (string, null as string?) + member val Args = args + member val Name = name + + type internal MfmCharNode(v: char) = + inherit MfmInlineNode([]) + member val Char = v + +open MfmNodeTypes + +module private MfmParser = + // Abstractions + let str s = pstring s + let seqAttempt s = s |> Seq.map attempt + let isWhitespace c = Char.IsWhiteSpace c + let isNotWhitespace c = Char.IsWhiteSpace c = false + + let isAsciiLetterOrNumber c = Char.IsAsciiLetter c || Char.IsDigit c + + let (|CharNode|MfmNode|) (x: MfmNode) = + if x :? MfmCharNode then + CharNode(x :?> MfmCharNode) + else + MfmNode x + + let folder (current, result) node = + match (node: MfmNode), (current: char list) with + | CharNode node, _ -> node.Char :: current, result + | MfmNode node, [] -> current, node :: result + | MfmNode node, _ -> [], node :: (MfmTextNode(current |> List.toArray |> String) :: result) + + let aggregateText nodes = + nodes + |> List.rev + |> List.fold folder ([], []) + |> function + | [], result -> result + | current, result -> MfmTextNode(current |> List.toArray |> String) :: result + + let aggregateTextInline nodes = + nodes |> aggregateText |> List.map (fun x -> x :?> MfmInlineNode) + + let domainFirstComponent = + many1Chars (satisfy isAsciiLetter <|> satisfy isDigit <|> anyOf "_-") + + let domainComponent = + many1Chars (satisfy isAsciiLetter <|> satisfy isDigit <|> anyOf "._-") + + let domainStart = (satisfy isAsciiLetter <|> satisfy isDigit) + + let domainFull = + domainStart .>>. domainFirstComponent .>>. pchar '.' .>>. many1 domainComponent + + let domainAggregate1 (a: char, b: string) = string a + b + let domainAggregate2 (a: char * string, b: char) = (domainAggregate1 a) + string b + + let domainAggregate (x: (char * string) * char, y: string list) = + domainAggregate2 x + (String.concat "" y) + + let domain = domainFull |>> domainAggregate + + let acct (user: string, host: string option) = + match host with + | None -> user + | Some v -> user + "@" + v + + // References + let node, nodeRef = createParserForwardedToRef () + let inlineNode, inlineNodeRef = createParserForwardedToRef () + + let seqFlatten items = + seq { + for item in items do + yield! item + } + + // Patterns + let italicPattern = (notFollowedBy <| str "**") >>. skipChar '*' + let codePattern = (notFollowedBy <| str "```") >>. skipChar '`' + + // Node parsers + + let italicNode = + italicPattern >>. manyTill inlineNode italicPattern + |>> fun c -> MfmItalicNode(aggregateTextInline c) :> MfmNode + + //TODO: https://github.com/pzp1997/harkdown/blob/master/src/InlineParser.hs#L173-L201 + + let boldNode = + skipString "**" >>. manyTill inlineNode (skipString "**") + |>> fun c -> MfmBoldNode(aggregateTextInline c) :> MfmNode + + let strikeNode = + skipString "~~" >>. manyTill inlineNode (skipString "~~") + |>> fun c -> MfmStrikeNode(aggregateTextInline c) :> MfmNode + + let codeNode = + codePattern >>. manyCharsTill anyChar codePattern + |>> fun v -> MfmInlineCodeNode(v) :> MfmNode + + let codeBlockNode = + previousCharSatisfiesNot isNotWhitespace + >>. skipString "```" + >>. opt (many1CharsTill asciiLetter (lookAhead newline)) + .>>. (skipNewline >>. manyCharsTill anyChar (skipNewline >>. skipString "```")) + |>> fun (lang: string option, code: string) -> MfmCodeBlockNode(code, lang) :> MfmNode + + let mathNode = + skipString "\(" >>. manyCharsTill anyChar (skipString "\)") + |>> fun f -> MfmMathInlineNode(f) :> MfmNode + + let mathBlockNode = + previousCharSatisfiesNot isNotWhitespace + >>. skipString "\[" + >>. many1CharsTill anyChar (skipString "\]") + |>> fun f -> MfmMathBlockNode(f) :> MfmNode + + let emojiCodeNode = + skipChar ':' >>. manyCharsTill letter (skipChar ':') + |>> fun e -> MfmEmojiCodeNode(e) :> MfmNode + + let plainNode = + skipString "" >>. manyCharsTill anyChar (skipString "") + |>> fun v -> MfmPlainNode(v) :> MfmNode + + let smallNode = + skipString "" >>. manyTill inlineNode (skipString "") + |>> fun c -> MfmSmallNode(aggregateTextInline c) :> MfmNode + + let centerNode = + skipString "
" >>. manyTill inlineNode (skipString "
") + |>> fun c -> MfmSmallNode(aggregateTextInline c) :> MfmNode + + let mentionNode = + previousCharSatisfiesNot isNotWhitespace + >>. skipString "@" + >>. manyChars (satisfy isAsciiLetter <|> satisfy isDigit <|> anyOf "._-") + .>>. opt (skipChar '@' >>. domain) + .>> (lookAhead + <| choice + [ spaces1 + eof + skipChar ')' + skipChar ',' + skipChar '\'' + skipChar ':' .>> nextCharSatisfiesNot isAsciiLetterOrNumber ]) + |>> fun (user: string, host: string option) -> MfmMentionNode(acct (user, host), user, host) :> MfmNode + + let hashtagNode = + previousCharSatisfiesNot isNotWhitespace + >>. skipChar '#' + >>. many1CharsTill letter (nextCharSatisfies isWhitespace <|> eof) + |>> fun h -> MfmHashtagNode(h) :> MfmNode + + let urlNodePlain = + lookAhead (skipString "https://" <|> skipString "http://") + >>. manyCharsTill anyChar (nextCharSatisfies isWhitespace <|> eof) //FIXME: this needs significant improvements + >>= fun uri -> + match Uri.TryCreate(uri, UriKind.Absolute) with + | true, finalUri -> + match finalUri.Scheme with + | "http" -> preturn (MfmUrlNode(uri, false) :> MfmNode) + | "https" -> preturn (MfmUrlNode(uri, false) :> MfmNode) + | _ -> fail "invalid scheme" + | _ -> fail "invalid url" + + let urlNodeBrackets = + skipChar '<' + >>. lookAhead (skipString "https://" <|> skipString "http://") + >>. manyCharsTill anyChar (skipChar '>') + >>= fun uri -> + match Uri.TryCreate(uri, UriKind.Absolute) with + | true, finalUri -> + match finalUri.Scheme with + | "http" -> preturn (MfmUrlNode(uri, true) :> MfmNode) + | "https" -> preturn (MfmUrlNode(uri, true) :> MfmNode) + | _ -> fail "invalid scheme" + | _ -> fail "invalid url" + + let urlNode = urlNodePlain <|> urlNodeBrackets + + let linkNode = + (opt (pchar '?')) + .>>. (pchar '[' >>. manyCharsTill anyChar (pchar ']')) + .>>. (pchar '(' + >>. lookAhead (skipString "https://" <|> skipString "http://") + >>. manyCharsTill anyChar (pchar ')')) + >>= fun ((silent, text), uri) -> + match Uri.TryCreate(uri, UriKind.Absolute) with + | true, finalUri -> + match finalUri.Scheme with + | "http" -> preturn (MfmLinkNode(uri, text, silent.IsSome) :> MfmNode) + | "https" -> preturn (MfmLinkNode(uri, text, silent.IsSome) :> MfmNode) + | _ -> fail "invalid scheme" + | _ -> fail "invalid url" + + + let charNode = anyChar |>> fun v -> MfmCharNode(v) :> MfmNode + + // Node collection + let inlineNodeSeq = + [ italicNode + boldNode + strikeNode + hashtagNode + mentionNode + codeNode + urlNode + linkNode + mathNode + emojiCodeNode + charNode ] + + //TODO: still missing: FnNode, MfmSearchNode, MfmQuoteNode + + // This intentionally doesn't implement the node type UnicodeEmojiNode, both for performance, + // and because it's not needed for backend processing + + let blockNodeSeq = + [ plainNode; centerNode; smallNode; codeBlockNode; mathBlockNode ] + + let nodeSeq = [ blockNodeSeq; inlineNodeSeq ] + + // Populate references + do nodeRef.Value <- choice <| seqAttempt (seqFlatten <| nodeSeq) + do inlineNodeRef.Value <- choice <| (seqAttempt inlineNodeSeq) |>> fun v -> v :?> MfmInlineNode + + // Final parse command + let parse = spaces >>. manyTill node eof .>> spaces + +open MfmParser + +module Mfm = + [] + let parse str = + match run parse str with + | Success(result, _, _) -> aggregateText result + | Failure(s, _, _) -> failwith $"Failed to parse MFM: {s}" diff --git a/Iceshrimp.Tests/Parsing/MfmTestsFParsec.cs b/Iceshrimp.Tests/Parsing/MfmTestsFParsec.cs new file mode 100644 index 00000000..30a33ff0 --- /dev/null +++ b/Iceshrimp.Tests/Parsing/MfmTestsFParsec.cs @@ -0,0 +1,195 @@ +using Iceshrimp.Parsing; +using Microsoft.FSharp.Collections; +using static Iceshrimp.Parsing.MfmNodeTypes; + +namespace Iceshrimp.Tests.Parsing; + +[TestClass] +public class MfmTestsFParsec +{ + [TestMethod] + public void TestParseBoldItalic() + { + List expected = + [ + new MfmItalicNode(ListModule.OfSeq([ + new MfmTextNode("italic "), + new MfmBoldNode(ListModule.OfSeq([new MfmTextNode("bold")])), + new MfmTextNode(" italic") + ])) + ]; + + var res = Mfm.parse("*italic **bold** italic*").ToList(); + AssertionOptions.FormattingOptions.MaxDepth = 100; + res.Should().Equal(expected, MfmNodeEqual); + } + + [TestMethod] + public void TestParseCode() + { + List expected = + [ + new MfmInlineCodeNode("test"), + new MfmTextNode("\n"), + new MfmCodeBlockNode("test", null), + new MfmTextNode("\n"), + new MfmCodeBlockNode("test", "lang") + ]; + + var res = Mfm.parse(""" + `test` + ``` + test + ``` + ```lang + test + ``` + """); + + AssertionOptions.FormattingOptions.MaxDepth = 100; + res.ToList().Should().Equal(expected, MfmNodeEqual); + } + + [TestMethod] + public void Benchmark() + { + const string mfm = + "*blabla* *test* #example @example @example@invalid @example@example.com @invalid:matrix.org https://hello.com http://test.de javascript://sdfgsdf [test](https://asdfg) ?[test](https://asdfg) `asd`"; + + double duration = 100; + for (var i = 0; i < 4; i++) duration = RunBenchmark(); + + duration.Should().BeLessThan(2); + + return; + + double RunBenchmark() + { + var pre = DateTime.Now; + Mfm.parse(mfm); + var post = DateTime.Now; + var ms = (post - pre).TotalMilliseconds; + Console.WriteLine($"Took {ms} ms"); + return ms; + } + } + + private class MfmNodeEquality : IEqualityComparer + { + public bool Equals(MfmNode? x, MfmNode? y) + { + if (x == null && y == null) return true; + if (x == null && y != null) return false; + if (x != null && y == null) return false; + + return MfmNodeEqual(x!, y!); + } + + public int GetHashCode(MfmNode obj) + { + return obj.GetHashCode(); + } + } + + private static bool MfmNodeEqual(MfmNode a, MfmNode b) + { + if (a.GetType() != b.GetType()) return false; + + if (!a.Children.IsEmpty || !b.Children.IsEmpty) + { + if (!a.Children.IsEmpty && b.Children.IsEmpty || a.Children.IsEmpty && !b.Children.IsEmpty) + return false; + if (!a.Children.SequenceEqual(b.Children, new MfmNodeEquality())) + return false; + } + + switch (a) + { + case MfmTextNode textNode when ((MfmTextNode)b).Text != textNode.Text: + return false; + case MfmMentionNode ax: + { + var bx = (MfmMentionNode)b; + if (bx.Acct != ax.Acct) return false; + if (bx.Username != ax.Username) return false; + if (bx.Host?.Value != ax.Host?.Value) return false; + break; + } + case MfmCodeBlockNode ax: + { + var bx = (MfmCodeBlockNode)b; + if (ax.Code != bx.Code) return false; + if (ax.Language?.Value != bx.Language?.Value) return false; + break; + } + case MfmInlineCodeNode ax: + { + var bx = (MfmInlineCodeNode)b; + if (ax.Code != bx.Code) return false; + break; + } + case MfmMathBlockNode ax: + { + var bx = (MfmMathBlockNode)b; + if (ax.Formula != bx.Formula) return false; + if (ax.Formula != bx.Formula) return false; + break; + } + case MfmMathInlineNode ax: + { + var bx = (MfmMathInlineNode)b; + if (ax.Formula != bx.Formula) return false; + if (ax.Formula != bx.Formula) return false; + break; + } + case MfmSearchNode searchNode: + { + var bx = (MfmSearchNode)b; + if (searchNode.Query != bx.Query) return false; + if (searchNode.Content != bx.Content) return false; + break; + } + case MfmUnicodeEmojiNode ax: + { + var bx = (MfmUnicodeEmojiNode)b; + if (ax.Emoji != bx.Emoji) return false; + break; + } + case MfmEmojiCodeNode ax: + { + var bx = (MfmEmojiCodeNode)b; + if (ax.Name != bx.Name) return false; + break; + } + case MfmHashtagNode ax: + { + var bx = (MfmHashtagNode)b; + if (ax.Hashtag != bx.Hashtag) return false; + break; + } + case MfmUrlNode ax: + { + var bx = (MfmUrlNode)b; + if (ax.Url != bx.Url) return false; + if (ax.Brackets != bx.Brackets) return false; + break; + } + case MfmLinkNode ax: + { + var bx = (MfmLinkNode)b; + if (ax.Url != bx.Url) return false; + if (ax.Silent != bx.Silent) return false; + break; + } + case MfmFnNode ax: + { + var bx = (MfmFnNode)b; + if (ax.Args != bx.Args) return false; + if (ax.Name != bx.Name) return false; + break; + } + } + + return true; + } +} \ No newline at end of file