[parsing/mfm] Reimplement MfmParser in F# (ISH-112)

This commit is contained in:
Laura Hausmann 2024-03-17 20:52:18 +01:00
parent 2065a73ad9
commit 799d1054a9
No known key found for this signature in database
GPG key ID: D044E84C5BE01605
3 changed files with 533 additions and 0 deletions

View file

@ -7,6 +7,7 @@
<ItemGroup> <ItemGroup>
<Compile Include="SearchQuery.fs"/> <Compile Include="SearchQuery.fs"/>
<Compile Include="Mfm.fs" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>

337
Iceshrimp.Parsing/Mfm.fs Normal file
View file

@ -0,0 +1,337 @@
namespace Iceshrimp.Parsing
open System
open System.Collections.Generic
open System.Runtime.InteropServices.JavaScript
open FParsec
module MfmNodeTypes =
[<AbstractClass>]
type MfmNode() =
member val Children: MfmNode list = [] with get, set
[<AbstractClass>]
type MfmInlineNode(c: MfmInlineNode list) =
inherit MfmNode()
do base.Children <- c |> List.map (fun x -> x :> MfmNode)
[<AbstractClass>]
type MfmBlockNode(c: MfmInlineNode list) =
inherit MfmNode()
do base.Children <- c |> List.map (fun x -> x :> MfmNode)
type MfmTextNode(v: string) =
inherit MfmInlineNode([])
member val Text = v
type MfmItalicNode(c) =
inherit MfmInlineNode(c)
type MfmBoldNode(c) =
inherit MfmInlineNode(c)
type MfmStrikeNode(c) =
inherit MfmInlineNode(c)
type MfmInlineCodeNode(v: string) =
inherit MfmInlineNode([])
member val Code = v
type MfmPlainNode(v: string) =
inherit MfmInlineNode([ MfmTextNode(v) ])
type MfmSmallNode(c) =
inherit MfmInlineNode(c)
type MfmQuoteNode(c) =
inherit MfmBlockNode(c)
type MfmSearchNode(content: string, query: string) =
inherit MfmBlockNode([])
member val Content = content
member val Query = query
type MfmCodeBlockNode(code, lang: string option) =
inherit MfmBlockNode([])
member val Code = code
member val Language = lang
type MfmMathBlockNode(f) =
inherit MfmBlockNode([])
member val Formula = f
type MfmCenterNode(c) =
inherit MfmBlockNode(c)
type MfmUnicodeEmojiNode(e: string) =
inherit MfmInlineNode([])
member val Emoji = e
type MfmEmojiCodeNode(n) =
inherit MfmInlineNode([])
member val Name = n
type MfmMathInlineNode(f) =
inherit MfmInlineNode([])
member val Formula = f
type MfmMentionNode(acct, user, host: string option) =
inherit MfmInlineNode([])
member val Acct = acct
member val Username = user
member val Host = host
type MfmHashtagNode(h) =
inherit MfmInlineNode([])
member val Hashtag = h
type MfmUrlNode(url, brackets) =
inherit MfmInlineNode([])
member val Url = url
member val Brackets = brackets
type MfmLinkNode(url, text, silent) =
inherit MfmInlineNode([ MfmTextNode(text) ])
member val Url = url
member val Silent = silent
type MfmFnNode(args: Dictionary<string, string>, name, children) =
inherit MfmInlineNode(children)
// (string, bool) args = (string, null as string?)
member val Args = args
member val Name = name
type internal MfmCharNode(v: char) =
inherit MfmInlineNode([])
member val Char = v
open MfmNodeTypes
module private MfmParser =
// Abstractions
let str s = pstring s
let seqAttempt s = s |> Seq.map attempt
let isWhitespace c = Char.IsWhiteSpace c
let isNotWhitespace c = Char.IsWhiteSpace c = false
let isAsciiLetterOrNumber c = Char.IsAsciiLetter c || Char.IsDigit c
let (|CharNode|MfmNode|) (x: MfmNode) =
if x :? MfmCharNode then
CharNode(x :?> MfmCharNode)
else
MfmNode x
let folder (current, result) node =
match (node: MfmNode), (current: char list) with
| CharNode node, _ -> node.Char :: current, result
| MfmNode node, [] -> current, node :: result
| MfmNode node, _ -> [], node :: (MfmTextNode(current |> List.toArray |> String) :: result)
let aggregateText nodes =
nodes
|> List.rev
|> List.fold folder ([], [])
|> function
| [], result -> result
| current, result -> MfmTextNode(current |> List.toArray |> String) :: result
let aggregateTextInline nodes =
nodes |> aggregateText |> List.map (fun x -> x :?> MfmInlineNode)
let domainFirstComponent =
many1Chars (satisfy isAsciiLetter <|> satisfy isDigit <|> anyOf "_-")
let domainComponent =
many1Chars (satisfy isAsciiLetter <|> satisfy isDigit <|> anyOf "._-")
let domainStart = (satisfy isAsciiLetter <|> satisfy isDigit)
let domainFull =
domainStart .>>. domainFirstComponent .>>. pchar '.' .>>. many1 domainComponent
let domainAggregate1 (a: char, b: string) = string a + b
let domainAggregate2 (a: char * string, b: char) = (domainAggregate1 a) + string b
let domainAggregate (x: (char * string) * char, y: string list) =
domainAggregate2 x + (String.concat "" y)
let domain = domainFull |>> domainAggregate
let acct (user: string, host: string option) =
match host with
| None -> user
| Some v -> user + "@" + v
// References
let node, nodeRef = createParserForwardedToRef ()
let inlineNode, inlineNodeRef = createParserForwardedToRef ()
let seqFlatten items =
seq {
for item in items do
yield! item
}
// Patterns
let italicPattern = (notFollowedBy <| str "**") >>. skipChar '*'
let codePattern = (notFollowedBy <| str "```") >>. skipChar '`'
// Node parsers
let italicNode =
italicPattern >>. manyTill inlineNode italicPattern
|>> fun c -> MfmItalicNode(aggregateTextInline c) :> MfmNode
//TODO: https://github.com/pzp1997/harkdown/blob/master/src/InlineParser.hs#L173-L201
let boldNode =
skipString "**" >>. manyTill inlineNode (skipString "**")
|>> fun c -> MfmBoldNode(aggregateTextInline c) :> MfmNode
let strikeNode =
skipString "~~" >>. manyTill inlineNode (skipString "~~")
|>> fun c -> MfmStrikeNode(aggregateTextInline c) :> MfmNode
let codeNode =
codePattern >>. manyCharsTill anyChar codePattern
|>> fun v -> MfmInlineCodeNode(v) :> MfmNode
let codeBlockNode =
previousCharSatisfiesNot isNotWhitespace
>>. skipString "```"
>>. opt (many1CharsTill asciiLetter (lookAhead newline))
.>>. (skipNewline >>. manyCharsTill anyChar (skipNewline >>. skipString "```"))
|>> fun (lang: string option, code: string) -> MfmCodeBlockNode(code, lang) :> MfmNode
let mathNode =
skipString "\(" >>. manyCharsTill anyChar (skipString "\)")
|>> fun f -> MfmMathInlineNode(f) :> MfmNode
let mathBlockNode =
previousCharSatisfiesNot isNotWhitespace
>>. skipString "\["
>>. many1CharsTill anyChar (skipString "\]")
|>> fun f -> MfmMathBlockNode(f) :> MfmNode
let emojiCodeNode =
skipChar ':' >>. manyCharsTill letter (skipChar ':')
|>> fun e -> MfmEmojiCodeNode(e) :> MfmNode
let plainNode =
skipString "<plain>" >>. manyCharsTill anyChar (skipString "</plain>")
|>> fun v -> MfmPlainNode(v) :> MfmNode
let smallNode =
skipString "<small>" >>. manyTill inlineNode (skipString "</small>")
|>> fun c -> MfmSmallNode(aggregateTextInline c) :> MfmNode
let centerNode =
skipString "<center>" >>. manyTill inlineNode (skipString "</center>")
|>> fun c -> MfmSmallNode(aggregateTextInline c) :> MfmNode
let mentionNode =
previousCharSatisfiesNot isNotWhitespace
>>. skipString "@"
>>. manyChars (satisfy isAsciiLetter <|> satisfy isDigit <|> anyOf "._-")
.>>. opt (skipChar '@' >>. domain)
.>> (lookAhead
<| choice
[ spaces1
eof
skipChar ')'
skipChar ','
skipChar '\''
skipChar ':' .>> nextCharSatisfiesNot isAsciiLetterOrNumber ])
|>> fun (user: string, host: string option) -> MfmMentionNode(acct (user, host), user, host) :> MfmNode
let hashtagNode =
previousCharSatisfiesNot isNotWhitespace
>>. skipChar '#'
>>. many1CharsTill letter (nextCharSatisfies isWhitespace <|> eof)
|>> fun h -> MfmHashtagNode(h) :> MfmNode
let urlNodePlain =
lookAhead (skipString "https://" <|> skipString "http://")
>>. manyCharsTill anyChar (nextCharSatisfies isWhitespace <|> eof) //FIXME: this needs significant improvements
>>= fun uri ->
match Uri.TryCreate(uri, UriKind.Absolute) with
| true, finalUri ->
match finalUri.Scheme with
| "http" -> preturn (MfmUrlNode(uri, false) :> MfmNode)
| "https" -> preturn (MfmUrlNode(uri, false) :> MfmNode)
| _ -> fail "invalid scheme"
| _ -> fail "invalid url"
let urlNodeBrackets =
skipChar '<'
>>. lookAhead (skipString "https://" <|> skipString "http://")
>>. manyCharsTill anyChar (skipChar '>')
>>= fun uri ->
match Uri.TryCreate(uri, UriKind.Absolute) with
| true, finalUri ->
match finalUri.Scheme with
| "http" -> preturn (MfmUrlNode(uri, true) :> MfmNode)
| "https" -> preturn (MfmUrlNode(uri, true) :> MfmNode)
| _ -> fail "invalid scheme"
| _ -> fail "invalid url"
let urlNode = urlNodePlain <|> urlNodeBrackets
let linkNode =
(opt (pchar '?'))
.>>. (pchar '[' >>. manyCharsTill anyChar (pchar ']'))
.>>. (pchar '('
>>. lookAhead (skipString "https://" <|> skipString "http://")
>>. manyCharsTill anyChar (pchar ')'))
>>= fun ((silent, text), uri) ->
match Uri.TryCreate(uri, UriKind.Absolute) with
| true, finalUri ->
match finalUri.Scheme with
| "http" -> preturn (MfmLinkNode(uri, text, silent.IsSome) :> MfmNode)
| "https" -> preturn (MfmLinkNode(uri, text, silent.IsSome) :> MfmNode)
| _ -> fail "invalid scheme"
| _ -> fail "invalid url"
let charNode = anyChar |>> fun v -> MfmCharNode(v) :> MfmNode
// Node collection
let inlineNodeSeq =
[ italicNode
boldNode
strikeNode
hashtagNode
mentionNode
codeNode
urlNode
linkNode
mathNode
emojiCodeNode
charNode ]
//TODO: still missing: FnNode, MfmSearchNode, MfmQuoteNode
// This intentionally doesn't implement the node type UnicodeEmojiNode, both for performance,
// and because it's not needed for backend processing
let blockNodeSeq =
[ plainNode; centerNode; smallNode; codeBlockNode; mathBlockNode ]
let nodeSeq = [ blockNodeSeq; inlineNodeSeq ]
// Populate references
do nodeRef.Value <- choice <| seqAttempt (seqFlatten <| nodeSeq)
do inlineNodeRef.Value <- choice <| (seqAttempt inlineNodeSeq) |>> fun v -> v :?> MfmInlineNode
// Final parse command
let parse = spaces >>. manyTill node eof .>> spaces
open MfmParser
module Mfm =
[<JSExport>]
let parse str =
match run parse str with
| Success(result, _, _) -> aggregateText result
| Failure(s, _, _) -> failwith $"Failed to parse MFM: {s}"

View file

@ -0,0 +1,195 @@
using Iceshrimp.Parsing;
using Microsoft.FSharp.Collections;
using static Iceshrimp.Parsing.MfmNodeTypes;
namespace Iceshrimp.Tests.Parsing;
[TestClass]
public class MfmTestsFParsec
{
[TestMethod]
public void TestParseBoldItalic()
{
List<MfmNode> expected =
[
new MfmItalicNode(ListModule.OfSeq<MfmInlineNode>([
new MfmTextNode("italic "),
new MfmBoldNode(ListModule.OfSeq<MfmInlineNode>([new MfmTextNode("bold")])),
new MfmTextNode(" italic")
]))
];
var res = Mfm.parse("*italic **bold** italic*").ToList();
AssertionOptions.FormattingOptions.MaxDepth = 100;
res.Should().Equal(expected, MfmNodeEqual);
}
[TestMethod]
public void TestParseCode()
{
List<MfmNode> expected =
[
new MfmInlineCodeNode("test"),
new MfmTextNode("\n"),
new MfmCodeBlockNode("test", null),
new MfmTextNode("\n"),
new MfmCodeBlockNode("test", "lang")
];
var res = Mfm.parse("""
`test`
```
test
```
```lang
test
```
""");
AssertionOptions.FormattingOptions.MaxDepth = 100;
res.ToList().Should().Equal(expected, MfmNodeEqual);
}
[TestMethod]
public void Benchmark()
{
const string mfm =
"<plain>*blabla*</plain> *test* #example @example @example@invalid @example@example.com @invalid:matrix.org https://hello.com http://test.de <https://大石泉すき.example.com> javascript://sdfgsdf [test](https://asdfg) ?[test](https://asdfg) `asd`";
double duration = 100;
for (var i = 0; i < 4; i++) duration = RunBenchmark();
duration.Should().BeLessThan(2);
return;
double RunBenchmark()
{
var pre = DateTime.Now;
Mfm.parse(mfm);
var post = DateTime.Now;
var ms = (post - pre).TotalMilliseconds;
Console.WriteLine($"Took {ms} ms");
return ms;
}
}
private class MfmNodeEquality : IEqualityComparer<MfmNode>
{
public bool Equals(MfmNode? x, MfmNode? y)
{
if (x == null && y == null) return true;
if (x == null && y != null) return false;
if (x != null && y == null) return false;
return MfmNodeEqual(x!, y!);
}
public int GetHashCode(MfmNode obj)
{
return obj.GetHashCode();
}
}
private static bool MfmNodeEqual(MfmNode a, MfmNode b)
{
if (a.GetType() != b.GetType()) return false;
if (!a.Children.IsEmpty || !b.Children.IsEmpty)
{
if (!a.Children.IsEmpty && b.Children.IsEmpty || a.Children.IsEmpty && !b.Children.IsEmpty)
return false;
if (!a.Children.SequenceEqual(b.Children, new MfmNodeEquality()))
return false;
}
switch (a)
{
case MfmTextNode textNode when ((MfmTextNode)b).Text != textNode.Text:
return false;
case MfmMentionNode ax:
{
var bx = (MfmMentionNode)b;
if (bx.Acct != ax.Acct) return false;
if (bx.Username != ax.Username) return false;
if (bx.Host?.Value != ax.Host?.Value) return false;
break;
}
case MfmCodeBlockNode ax:
{
var bx = (MfmCodeBlockNode)b;
if (ax.Code != bx.Code) return false;
if (ax.Language?.Value != bx.Language?.Value) return false;
break;
}
case MfmInlineCodeNode ax:
{
var bx = (MfmInlineCodeNode)b;
if (ax.Code != bx.Code) return false;
break;
}
case MfmMathBlockNode ax:
{
var bx = (MfmMathBlockNode)b;
if (ax.Formula != bx.Formula) return false;
if (ax.Formula != bx.Formula) return false;
break;
}
case MfmMathInlineNode ax:
{
var bx = (MfmMathInlineNode)b;
if (ax.Formula != bx.Formula) return false;
if (ax.Formula != bx.Formula) return false;
break;
}
case MfmSearchNode searchNode:
{
var bx = (MfmSearchNode)b;
if (searchNode.Query != bx.Query) return false;
if (searchNode.Content != bx.Content) return false;
break;
}
case MfmUnicodeEmojiNode ax:
{
var bx = (MfmUnicodeEmojiNode)b;
if (ax.Emoji != bx.Emoji) return false;
break;
}
case MfmEmojiCodeNode ax:
{
var bx = (MfmEmojiCodeNode)b;
if (ax.Name != bx.Name) return false;
break;
}
case MfmHashtagNode ax:
{
var bx = (MfmHashtagNode)b;
if (ax.Hashtag != bx.Hashtag) return false;
break;
}
case MfmUrlNode ax:
{
var bx = (MfmUrlNode)b;
if (ax.Url != bx.Url) return false;
if (ax.Brackets != bx.Brackets) return false;
break;
}
case MfmLinkNode ax:
{
var bx = (MfmLinkNode)b;
if (ax.Url != bx.Url) return false;
if (ax.Silent != bx.Silent) return false;
break;
}
case MfmFnNode ax:
{
var bx = (MfmFnNode)b;
if (ax.Args != bx.Args) return false;
if (ax.Name != bx.Name) return false;
break;
}
}
return true;
}
}