[parsing/mfm] Reimplement MfmParser in F# (ISH-112)
This commit is contained in:
parent
2065a73ad9
commit
799d1054a9
3 changed files with 533 additions and 0 deletions
|
@ -7,6 +7,7 @@
|
|||
|
||||
<ItemGroup>
|
||||
<Compile Include="SearchQuery.fs"/>
|
||||
<Compile Include="Mfm.fs" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
|
337
Iceshrimp.Parsing/Mfm.fs
Normal file
337
Iceshrimp.Parsing/Mfm.fs
Normal file
|
@ -0,0 +1,337 @@
|
|||
namespace Iceshrimp.Parsing
|
||||
|
||||
open System
|
||||
open System.Collections.Generic
|
||||
open System.Runtime.InteropServices.JavaScript
|
||||
open FParsec
|
||||
|
||||
module MfmNodeTypes =
|
||||
[<AbstractClass>]
|
||||
type MfmNode() =
|
||||
member val Children: MfmNode list = [] with get, set
|
||||
|
||||
[<AbstractClass>]
|
||||
type MfmInlineNode(c: MfmInlineNode list) =
|
||||
inherit MfmNode()
|
||||
do base.Children <- c |> List.map (fun x -> x :> MfmNode)
|
||||
|
||||
[<AbstractClass>]
|
||||
type MfmBlockNode(c: MfmInlineNode list) =
|
||||
inherit MfmNode()
|
||||
do base.Children <- c |> List.map (fun x -> x :> MfmNode)
|
||||
|
||||
type MfmTextNode(v: string) =
|
||||
inherit MfmInlineNode([])
|
||||
member val Text = v
|
||||
|
||||
type MfmItalicNode(c) =
|
||||
inherit MfmInlineNode(c)
|
||||
|
||||
type MfmBoldNode(c) =
|
||||
inherit MfmInlineNode(c)
|
||||
|
||||
type MfmStrikeNode(c) =
|
||||
inherit MfmInlineNode(c)
|
||||
|
||||
type MfmInlineCodeNode(v: string) =
|
||||
inherit MfmInlineNode([])
|
||||
member val Code = v
|
||||
|
||||
type MfmPlainNode(v: string) =
|
||||
inherit MfmInlineNode([ MfmTextNode(v) ])
|
||||
|
||||
type MfmSmallNode(c) =
|
||||
inherit MfmInlineNode(c)
|
||||
|
||||
type MfmQuoteNode(c) =
|
||||
inherit MfmBlockNode(c)
|
||||
|
||||
type MfmSearchNode(content: string, query: string) =
|
||||
inherit MfmBlockNode([])
|
||||
member val Content = content
|
||||
member val Query = query
|
||||
|
||||
type MfmCodeBlockNode(code, lang: string option) =
|
||||
inherit MfmBlockNode([])
|
||||
member val Code = code
|
||||
member val Language = lang
|
||||
|
||||
type MfmMathBlockNode(f) =
|
||||
inherit MfmBlockNode([])
|
||||
member val Formula = f
|
||||
|
||||
type MfmCenterNode(c) =
|
||||
inherit MfmBlockNode(c)
|
||||
|
||||
type MfmUnicodeEmojiNode(e: string) =
|
||||
inherit MfmInlineNode([])
|
||||
member val Emoji = e
|
||||
|
||||
type MfmEmojiCodeNode(n) =
|
||||
inherit MfmInlineNode([])
|
||||
member val Name = n
|
||||
|
||||
type MfmMathInlineNode(f) =
|
||||
inherit MfmInlineNode([])
|
||||
member val Formula = f
|
||||
|
||||
type MfmMentionNode(acct, user, host: string option) =
|
||||
inherit MfmInlineNode([])
|
||||
member val Acct = acct
|
||||
member val Username = user
|
||||
member val Host = host
|
||||
|
||||
type MfmHashtagNode(h) =
|
||||
inherit MfmInlineNode([])
|
||||
member val Hashtag = h
|
||||
|
||||
type MfmUrlNode(url, brackets) =
|
||||
inherit MfmInlineNode([])
|
||||
member val Url = url
|
||||
member val Brackets = brackets
|
||||
|
||||
type MfmLinkNode(url, text, silent) =
|
||||
inherit MfmInlineNode([ MfmTextNode(text) ])
|
||||
member val Url = url
|
||||
member val Silent = silent
|
||||
|
||||
type MfmFnNode(args: Dictionary<string, string>, name, children) =
|
||||
inherit MfmInlineNode(children)
|
||||
// (string, bool) args = (string, null as string?)
|
||||
member val Args = args
|
||||
member val Name = name
|
||||
|
||||
type internal MfmCharNode(v: char) =
|
||||
inherit MfmInlineNode([])
|
||||
member val Char = v
|
||||
|
||||
open MfmNodeTypes
|
||||
|
||||
module private MfmParser =
|
||||
// Abstractions
|
||||
let str s = pstring s
|
||||
let seqAttempt s = s |> Seq.map attempt
|
||||
let isWhitespace c = Char.IsWhiteSpace c
|
||||
let isNotWhitespace c = Char.IsWhiteSpace c = false
|
||||
|
||||
let isAsciiLetterOrNumber c = Char.IsAsciiLetter c || Char.IsDigit c
|
||||
|
||||
let (|CharNode|MfmNode|) (x: MfmNode) =
|
||||
if x :? MfmCharNode then
|
||||
CharNode(x :?> MfmCharNode)
|
||||
else
|
||||
MfmNode x
|
||||
|
||||
let folder (current, result) node =
|
||||
match (node: MfmNode), (current: char list) with
|
||||
| CharNode node, _ -> node.Char :: current, result
|
||||
| MfmNode node, [] -> current, node :: result
|
||||
| MfmNode node, _ -> [], node :: (MfmTextNode(current |> List.toArray |> String) :: result)
|
||||
|
||||
let aggregateText nodes =
|
||||
nodes
|
||||
|> List.rev
|
||||
|> List.fold folder ([], [])
|
||||
|> function
|
||||
| [], result -> result
|
||||
| current, result -> MfmTextNode(current |> List.toArray |> String) :: result
|
||||
|
||||
let aggregateTextInline nodes =
|
||||
nodes |> aggregateText |> List.map (fun x -> x :?> MfmInlineNode)
|
||||
|
||||
let domainFirstComponent =
|
||||
many1Chars (satisfy isAsciiLetter <|> satisfy isDigit <|> anyOf "_-")
|
||||
|
||||
let domainComponent =
|
||||
many1Chars (satisfy isAsciiLetter <|> satisfy isDigit <|> anyOf "._-")
|
||||
|
||||
let domainStart = (satisfy isAsciiLetter <|> satisfy isDigit)
|
||||
|
||||
let domainFull =
|
||||
domainStart .>>. domainFirstComponent .>>. pchar '.' .>>. many1 domainComponent
|
||||
|
||||
let domainAggregate1 (a: char, b: string) = string a + b
|
||||
let domainAggregate2 (a: char * string, b: char) = (domainAggregate1 a) + string b
|
||||
|
||||
let domainAggregate (x: (char * string) * char, y: string list) =
|
||||
domainAggregate2 x + (String.concat "" y)
|
||||
|
||||
let domain = domainFull |>> domainAggregate
|
||||
|
||||
let acct (user: string, host: string option) =
|
||||
match host with
|
||||
| None -> user
|
||||
| Some v -> user + "@" + v
|
||||
|
||||
// References
|
||||
let node, nodeRef = createParserForwardedToRef ()
|
||||
let inlineNode, inlineNodeRef = createParserForwardedToRef ()
|
||||
|
||||
let seqFlatten items =
|
||||
seq {
|
||||
for item in items do
|
||||
yield! item
|
||||
}
|
||||
|
||||
// Patterns
|
||||
let italicPattern = (notFollowedBy <| str "**") >>. skipChar '*'
|
||||
let codePattern = (notFollowedBy <| str "```") >>. skipChar '`'
|
||||
|
||||
// Node parsers
|
||||
|
||||
let italicNode =
|
||||
italicPattern >>. manyTill inlineNode italicPattern
|
||||
|>> fun c -> MfmItalicNode(aggregateTextInline c) :> MfmNode
|
||||
|
||||
//TODO: https://github.com/pzp1997/harkdown/blob/master/src/InlineParser.hs#L173-L201
|
||||
|
||||
let boldNode =
|
||||
skipString "**" >>. manyTill inlineNode (skipString "**")
|
||||
|>> fun c -> MfmBoldNode(aggregateTextInline c) :> MfmNode
|
||||
|
||||
let strikeNode =
|
||||
skipString "~~" >>. manyTill inlineNode (skipString "~~")
|
||||
|>> fun c -> MfmStrikeNode(aggregateTextInline c) :> MfmNode
|
||||
|
||||
let codeNode =
|
||||
codePattern >>. manyCharsTill anyChar codePattern
|
||||
|>> fun v -> MfmInlineCodeNode(v) :> MfmNode
|
||||
|
||||
let codeBlockNode =
|
||||
previousCharSatisfiesNot isNotWhitespace
|
||||
>>. skipString "```"
|
||||
>>. opt (many1CharsTill asciiLetter (lookAhead newline))
|
||||
.>>. (skipNewline >>. manyCharsTill anyChar (skipNewline >>. skipString "```"))
|
||||
|>> fun (lang: string option, code: string) -> MfmCodeBlockNode(code, lang) :> MfmNode
|
||||
|
||||
let mathNode =
|
||||
skipString "\(" >>. manyCharsTill anyChar (skipString "\)")
|
||||
|>> fun f -> MfmMathInlineNode(f) :> MfmNode
|
||||
|
||||
let mathBlockNode =
|
||||
previousCharSatisfiesNot isNotWhitespace
|
||||
>>. skipString "\["
|
||||
>>. many1CharsTill anyChar (skipString "\]")
|
||||
|>> fun f -> MfmMathBlockNode(f) :> MfmNode
|
||||
|
||||
let emojiCodeNode =
|
||||
skipChar ':' >>. manyCharsTill letter (skipChar ':')
|
||||
|>> fun e -> MfmEmojiCodeNode(e) :> MfmNode
|
||||
|
||||
let plainNode =
|
||||
skipString "<plain>" >>. manyCharsTill anyChar (skipString "</plain>")
|
||||
|>> fun v -> MfmPlainNode(v) :> MfmNode
|
||||
|
||||
let smallNode =
|
||||
skipString "<small>" >>. manyTill inlineNode (skipString "</small>")
|
||||
|>> fun c -> MfmSmallNode(aggregateTextInline c) :> MfmNode
|
||||
|
||||
let centerNode =
|
||||
skipString "<center>" >>. manyTill inlineNode (skipString "</center>")
|
||||
|>> fun c -> MfmSmallNode(aggregateTextInline c) :> MfmNode
|
||||
|
||||
let mentionNode =
|
||||
previousCharSatisfiesNot isNotWhitespace
|
||||
>>. skipString "@"
|
||||
>>. manyChars (satisfy isAsciiLetter <|> satisfy isDigit <|> anyOf "._-")
|
||||
.>>. opt (skipChar '@' >>. domain)
|
||||
.>> (lookAhead
|
||||
<| choice
|
||||
[ spaces1
|
||||
eof
|
||||
skipChar ')'
|
||||
skipChar ','
|
||||
skipChar '\''
|
||||
skipChar ':' .>> nextCharSatisfiesNot isAsciiLetterOrNumber ])
|
||||
|>> fun (user: string, host: string option) -> MfmMentionNode(acct (user, host), user, host) :> MfmNode
|
||||
|
||||
let hashtagNode =
|
||||
previousCharSatisfiesNot isNotWhitespace
|
||||
>>. skipChar '#'
|
||||
>>. many1CharsTill letter (nextCharSatisfies isWhitespace <|> eof)
|
||||
|>> fun h -> MfmHashtagNode(h) :> MfmNode
|
||||
|
||||
let urlNodePlain =
|
||||
lookAhead (skipString "https://" <|> skipString "http://")
|
||||
>>. manyCharsTill anyChar (nextCharSatisfies isWhitespace <|> eof) //FIXME: this needs significant improvements
|
||||
>>= fun uri ->
|
||||
match Uri.TryCreate(uri, UriKind.Absolute) with
|
||||
| true, finalUri ->
|
||||
match finalUri.Scheme with
|
||||
| "http" -> preturn (MfmUrlNode(uri, false) :> MfmNode)
|
||||
| "https" -> preturn (MfmUrlNode(uri, false) :> MfmNode)
|
||||
| _ -> fail "invalid scheme"
|
||||
| _ -> fail "invalid url"
|
||||
|
||||
let urlNodeBrackets =
|
||||
skipChar '<'
|
||||
>>. lookAhead (skipString "https://" <|> skipString "http://")
|
||||
>>. manyCharsTill anyChar (skipChar '>')
|
||||
>>= fun uri ->
|
||||
match Uri.TryCreate(uri, UriKind.Absolute) with
|
||||
| true, finalUri ->
|
||||
match finalUri.Scheme with
|
||||
| "http" -> preturn (MfmUrlNode(uri, true) :> MfmNode)
|
||||
| "https" -> preturn (MfmUrlNode(uri, true) :> MfmNode)
|
||||
| _ -> fail "invalid scheme"
|
||||
| _ -> fail "invalid url"
|
||||
|
||||
let urlNode = urlNodePlain <|> urlNodeBrackets
|
||||
|
||||
let linkNode =
|
||||
(opt (pchar '?'))
|
||||
.>>. (pchar '[' >>. manyCharsTill anyChar (pchar ']'))
|
||||
.>>. (pchar '('
|
||||
>>. lookAhead (skipString "https://" <|> skipString "http://")
|
||||
>>. manyCharsTill anyChar (pchar ')'))
|
||||
>>= fun ((silent, text), uri) ->
|
||||
match Uri.TryCreate(uri, UriKind.Absolute) with
|
||||
| true, finalUri ->
|
||||
match finalUri.Scheme with
|
||||
| "http" -> preturn (MfmLinkNode(uri, text, silent.IsSome) :> MfmNode)
|
||||
| "https" -> preturn (MfmLinkNode(uri, text, silent.IsSome) :> MfmNode)
|
||||
| _ -> fail "invalid scheme"
|
||||
| _ -> fail "invalid url"
|
||||
|
||||
|
||||
let charNode = anyChar |>> fun v -> MfmCharNode(v) :> MfmNode
|
||||
|
||||
// Node collection
|
||||
let inlineNodeSeq =
|
||||
[ italicNode
|
||||
boldNode
|
||||
strikeNode
|
||||
hashtagNode
|
||||
mentionNode
|
||||
codeNode
|
||||
urlNode
|
||||
linkNode
|
||||
mathNode
|
||||
emojiCodeNode
|
||||
charNode ]
|
||||
|
||||
//TODO: still missing: FnNode, MfmSearchNode, MfmQuoteNode
|
||||
|
||||
// This intentionally doesn't implement the node type UnicodeEmojiNode, both for performance,
|
||||
// and because it's not needed for backend processing
|
||||
|
||||
let blockNodeSeq =
|
||||
[ plainNode; centerNode; smallNode; codeBlockNode; mathBlockNode ]
|
||||
|
||||
let nodeSeq = [ blockNodeSeq; inlineNodeSeq ]
|
||||
|
||||
// Populate references
|
||||
do nodeRef.Value <- choice <| seqAttempt (seqFlatten <| nodeSeq)
|
||||
do inlineNodeRef.Value <- choice <| (seqAttempt inlineNodeSeq) |>> fun v -> v :?> MfmInlineNode
|
||||
|
||||
// Final parse command
|
||||
let parse = spaces >>. manyTill node eof .>> spaces
|
||||
|
||||
open MfmParser
|
||||
|
||||
module Mfm =
|
||||
[<JSExport>]
|
||||
let parse str =
|
||||
match run parse str with
|
||||
| Success(result, _, _) -> aggregateText result
|
||||
| Failure(s, _, _) -> failwith $"Failed to parse MFM: {s}"
|
195
Iceshrimp.Tests/Parsing/MfmTestsFParsec.cs
Normal file
195
Iceshrimp.Tests/Parsing/MfmTestsFParsec.cs
Normal file
|
@ -0,0 +1,195 @@
|
|||
using Iceshrimp.Parsing;
|
||||
using Microsoft.FSharp.Collections;
|
||||
using static Iceshrimp.Parsing.MfmNodeTypes;
|
||||
|
||||
namespace Iceshrimp.Tests.Parsing;
|
||||
|
||||
[TestClass]
|
||||
public class MfmTestsFParsec
|
||||
{
|
||||
[TestMethod]
|
||||
public void TestParseBoldItalic()
|
||||
{
|
||||
List<MfmNode> expected =
|
||||
[
|
||||
new MfmItalicNode(ListModule.OfSeq<MfmInlineNode>([
|
||||
new MfmTextNode("italic "),
|
||||
new MfmBoldNode(ListModule.OfSeq<MfmInlineNode>([new MfmTextNode("bold")])),
|
||||
new MfmTextNode(" italic")
|
||||
]))
|
||||
];
|
||||
|
||||
var res = Mfm.parse("*italic **bold** italic*").ToList();
|
||||
AssertionOptions.FormattingOptions.MaxDepth = 100;
|
||||
res.Should().Equal(expected, MfmNodeEqual);
|
||||
}
|
||||
|
||||
[TestMethod]
|
||||
public void TestParseCode()
|
||||
{
|
||||
List<MfmNode> expected =
|
||||
[
|
||||
new MfmInlineCodeNode("test"),
|
||||
new MfmTextNode("\n"),
|
||||
new MfmCodeBlockNode("test", null),
|
||||
new MfmTextNode("\n"),
|
||||
new MfmCodeBlockNode("test", "lang")
|
||||
];
|
||||
|
||||
var res = Mfm.parse("""
|
||||
`test`
|
||||
```
|
||||
test
|
||||
```
|
||||
```lang
|
||||
test
|
||||
```
|
||||
""");
|
||||
|
||||
AssertionOptions.FormattingOptions.MaxDepth = 100;
|
||||
res.ToList().Should().Equal(expected, MfmNodeEqual);
|
||||
}
|
||||
|
||||
[TestMethod]
|
||||
public void Benchmark()
|
||||
{
|
||||
const string mfm =
|
||||
"<plain>*blabla*</plain> *test* #example @example @example@invalid @example@example.com @invalid:matrix.org https://hello.com http://test.de <https://大石泉すき.example.com> javascript://sdfgsdf [test](https://asdfg) ?[test](https://asdfg) `asd`";
|
||||
|
||||
double duration = 100;
|
||||
for (var i = 0; i < 4; i++) duration = RunBenchmark();
|
||||
|
||||
duration.Should().BeLessThan(2);
|
||||
|
||||
return;
|
||||
|
||||
double RunBenchmark()
|
||||
{
|
||||
var pre = DateTime.Now;
|
||||
Mfm.parse(mfm);
|
||||
var post = DateTime.Now;
|
||||
var ms = (post - pre).TotalMilliseconds;
|
||||
Console.WriteLine($"Took {ms} ms");
|
||||
return ms;
|
||||
}
|
||||
}
|
||||
|
||||
private class MfmNodeEquality : IEqualityComparer<MfmNode>
|
||||
{
|
||||
public bool Equals(MfmNode? x, MfmNode? y)
|
||||
{
|
||||
if (x == null && y == null) return true;
|
||||
if (x == null && y != null) return false;
|
||||
if (x != null && y == null) return false;
|
||||
|
||||
return MfmNodeEqual(x!, y!);
|
||||
}
|
||||
|
||||
public int GetHashCode(MfmNode obj)
|
||||
{
|
||||
return obj.GetHashCode();
|
||||
}
|
||||
}
|
||||
|
||||
private static bool MfmNodeEqual(MfmNode a, MfmNode b)
|
||||
{
|
||||
if (a.GetType() != b.GetType()) return false;
|
||||
|
||||
if (!a.Children.IsEmpty || !b.Children.IsEmpty)
|
||||
{
|
||||
if (!a.Children.IsEmpty && b.Children.IsEmpty || a.Children.IsEmpty && !b.Children.IsEmpty)
|
||||
return false;
|
||||
if (!a.Children.SequenceEqual(b.Children, new MfmNodeEquality()))
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (a)
|
||||
{
|
||||
case MfmTextNode textNode when ((MfmTextNode)b).Text != textNode.Text:
|
||||
return false;
|
||||
case MfmMentionNode ax:
|
||||
{
|
||||
var bx = (MfmMentionNode)b;
|
||||
if (bx.Acct != ax.Acct) return false;
|
||||
if (bx.Username != ax.Username) return false;
|
||||
if (bx.Host?.Value != ax.Host?.Value) return false;
|
||||
break;
|
||||
}
|
||||
case MfmCodeBlockNode ax:
|
||||
{
|
||||
var bx = (MfmCodeBlockNode)b;
|
||||
if (ax.Code != bx.Code) return false;
|
||||
if (ax.Language?.Value != bx.Language?.Value) return false;
|
||||
break;
|
||||
}
|
||||
case MfmInlineCodeNode ax:
|
||||
{
|
||||
var bx = (MfmInlineCodeNode)b;
|
||||
if (ax.Code != bx.Code) return false;
|
||||
break;
|
||||
}
|
||||
case MfmMathBlockNode ax:
|
||||
{
|
||||
var bx = (MfmMathBlockNode)b;
|
||||
if (ax.Formula != bx.Formula) return false;
|
||||
if (ax.Formula != bx.Formula) return false;
|
||||
break;
|
||||
}
|
||||
case MfmMathInlineNode ax:
|
||||
{
|
||||
var bx = (MfmMathInlineNode)b;
|
||||
if (ax.Formula != bx.Formula) return false;
|
||||
if (ax.Formula != bx.Formula) return false;
|
||||
break;
|
||||
}
|
||||
case MfmSearchNode searchNode:
|
||||
{
|
||||
var bx = (MfmSearchNode)b;
|
||||
if (searchNode.Query != bx.Query) return false;
|
||||
if (searchNode.Content != bx.Content) return false;
|
||||
break;
|
||||
}
|
||||
case MfmUnicodeEmojiNode ax:
|
||||
{
|
||||
var bx = (MfmUnicodeEmojiNode)b;
|
||||
if (ax.Emoji != bx.Emoji) return false;
|
||||
break;
|
||||
}
|
||||
case MfmEmojiCodeNode ax:
|
||||
{
|
||||
var bx = (MfmEmojiCodeNode)b;
|
||||
if (ax.Name != bx.Name) return false;
|
||||
break;
|
||||
}
|
||||
case MfmHashtagNode ax:
|
||||
{
|
||||
var bx = (MfmHashtagNode)b;
|
||||
if (ax.Hashtag != bx.Hashtag) return false;
|
||||
break;
|
||||
}
|
||||
case MfmUrlNode ax:
|
||||
{
|
||||
var bx = (MfmUrlNode)b;
|
||||
if (ax.Url != bx.Url) return false;
|
||||
if (ax.Brackets != bx.Brackets) return false;
|
||||
break;
|
||||
}
|
||||
case MfmLinkNode ax:
|
||||
{
|
||||
var bx = (MfmLinkNode)b;
|
||||
if (ax.Url != bx.Url) return false;
|
||||
if (ax.Silent != bx.Silent) return false;
|
||||
break;
|
||||
}
|
||||
case MfmFnNode ax:
|
||||
{
|
||||
var bx = (MfmFnNode)b;
|
||||
if (ax.Args != bx.Args) return false;
|
||||
if (ax.Name != bx.Name) return false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
Loading…
Add table
Reference in a new issue