Iceshrimp.NET/Iceshrimp.Backend/Core/Helpers/LibMfm/Parsing/MfmParser.cs
Laura Hausmann 67d1d776c8
[backend/federation] Basic mentions handling (ISH-38)
This implementation adds handling of incoming mentions, including rewriting non-canonical mentions of split domain users into their canonical form when inserting notes into the database.
2024-02-11 18:50:28 +01:00

496 lines
No EOL
15 KiB
C#

using System.Collections.Immutable;
using System.Text.RegularExpressions;
using Iceshrimp.Backend.Core.Helpers.LibMfm.Types;
namespace Iceshrimp.Backend.Core.Helpers.LibMfm.Parsing;
/// <summary>
/// Entry point of the MFM parser: converts a string into a sequence of top-level <see cref="MfmNode" />s.
/// </summary>
public static class MfmParser {
    /// <summary>
    /// Node parsers in priority order. At every position the first parser whose IsValid returns true is used.
    /// </summary>
    private static readonly ImmutableList<INodeParser> Parsers = [
        new PlainNodeParser(),
        new ItalicNodeParser(),
        new BoldNodeParser(),
        new SmallNodeParser(),
        new StrikeNodeParser(),
        new CenterNodeParser(),
        new HashtagNodeParser(),
        new MentionNodeParser(),
        new UrlNodeParser(),
        new AltUrlNodeParser(),
        new LinkNodeParser(),
        new SilentLinkNodeParser(),
        new InlineCodeNodeParser(),
        new EmojiCodeNodeParser(),
        new MathInlineNodeParser(),
        new MathBlockNodeParser(),
        new CodeBlockParser()
    ];

    /// <summary>
    /// Parses <paramref name="buffer" /> starting at <paramref name="position" />. Characters that no node parser
    /// accepts are collected into <see cref="MfmTextNode" />s, one node per contiguous run.
    /// </summary>
    /// <param name="buffer">The text to parse.</param>
    /// <param name="position">Index of the first character to parse.</param>
    /// <param name="nestLimit">Remaining nesting depth; when it is zero or less, an empty result is returned.</param>
    /// <returns>The parsed nodes in document order.</returns>
    /// <remarks>
    /// This intentionally doesn't implement the node type UnicodeEmojiNode, both for performance and because it's not
    /// needed for backend processing
    /// </remarks>
    public static IEnumerable<MfmNode> Parse(string buffer, int position = 0, int nestLimit = 20) {
        if (nestLimit <= 0) return [];

        var nodes = new List<MfmNode>();
        // Start index of the plain-text run currently being collected, or -1 when there is none.
        // Tracking the run and slicing once avoids the quadratic cost of appending one character at a time.
        var textStart = -1;

        while (position < buffer.Length) {
            var current = position;
            var parser  = Parsers.FirstOrDefault(p => p.IsValid(buffer, current));
            if (parser == null) {
                if (textStart < 0) textStart = position;
                position++;
                continue;
            }

            // A node starts here, so the pending text run (if any) ends right before it
            FlushText(position);

            var result = parser.Parse(buffer, position, nestLimit);
            position += result.chars;
            nodes.Add(result.node);
        }

        // A pending run can only exist if the last step consumed a single text character,
        // in which case position is exactly buffer.Length
        FlushText(buffer.Length);
        return nodes;

        // Emits the pending text run [textStart, end) as a single text node
        void FlushText(int end) {
            if (textStart < 0) return;
            nodes.Add(new MfmTextNode { Text = buffer[textStart..end] });
            textStart = -1;
        }
    }
}
/// <summary>
/// Shared offset arithmetic for node parsers that look for a prefix followed by a terminator.
/// </summary>
internal static class NodeParserAbstractions {
    /// <summary>
    /// Locates the content between a prefix starting at <paramref name="position" /> and the next ordinal
    /// occurrence of <paramref name="post" />.
    /// </summary>
    /// <param name="pre">The prefix assumed to be present at <paramref name="position" />.</param>
    /// <param name="post">The terminator to search for after the prefix.</param>
    /// <param name="buffer">The text being parsed.</param>
    /// <param name="position">Index at which the prefix starts.</param>
    /// <returns>
    /// <c>start</c>: index of the first content character; <c>end</c>: index one past the last content character
    /// (the buffer length if the terminator is missing); <c>chars</c>: number of characters consumed from
    /// <paramref name="position" />, including the terminator when it was found.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="position" /> lies outside the buffer.</exception>
    public static (int start, int end, int chars) HandlePosition(string pre, string post, string buffer, int position) {
        if (position < 0 || position > buffer.Length)
            throw new ArgumentOutOfRangeException(nameof(position));

        // Clamp so an empty or too-short buffer yields an empty content span instead of IndexOf throwing
        var start = Math.Min(position + pre.Length, buffer.Length);
        var end   = buffer.IndexOf(post, start, StringComparison.Ordinal);

        // No terminator: the content runs to the end of the buffer and nothing beyond it is consumed
        if (end == -1)
            return (start, buffer.Length, buffer.Length - position);

        return (start, end, end - position + post.Length);
    }

    /// <summary>
    /// Variant for nodes whose opening and closing delimiter are the same string.
    /// </summary>
    public static (int start, int end, int chars) HandlePosition(string character, string buffer, int position) {
        return HandlePosition(character, character, buffer, position);
    }

    /// <summary>
    /// Locates the content between a prefix starting at <paramref name="position" /> and the first match of
    /// <paramref name="regex" /> in the text after the prefix. The matched terminator itself is not consumed.
    /// </summary>
    /// <param name="pre">The prefix assumed to be present at <paramref name="position" />.</param>
    /// <param name="regex">Pattern marking the end of the content.</param>
    /// <param name="buffer">The text being parsed.</param>
    /// <param name="position">Index at which the prefix starts.</param>
    /// <returns>
    /// <c>start</c>/<c>end</c>: the content bounds (end is the buffer length when the pattern does not match);
    /// <c>chars</c>: number of characters from <paramref name="position" /> up to <c>end</c>.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="position" /> lies outside the buffer.</exception>
    public static (int start, int end, int chars) HandlePosition(string pre, Regex regex, string buffer, int position) {
        if (position < 0 || position > buffer.Length)
            throw new ArgumentOutOfRangeException(nameof(position));

        var start = Math.Min(position + pre.Length, buffer.Length);

        // The pattern is deliberately run against the slice rather than Match(buffer, start), so that anchors
        // and lookbehinds only ever see the content
        var match = regex.Match(buffer[start..]);

        // A failed match reports Index 0, which must not be mistaken for "terminator right after the prefix";
        // treat it like the string overload treats a missing terminator
        var end = match.Success ? match.Index + start : buffer.Length;
        return (start, end, end - position);
    }
}
/// <summary>
/// Contract implemented by every node parser in this file.
/// </summary>
internal interface INodeParser {
    /// <summary>
    /// Reports whether this parser accepts the text found at <paramref name="position" /> in
    /// <paramref name="buffer" />.
    /// </summary>
    bool IsValid(string buffer, int position);

    /// <summary>
    /// Parses the node that starts at <paramref name="position" />.
    /// </summary>
    /// <returns>The parsed node and the number of characters of <paramref name="buffer" /> it spans.</returns>
    (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit);
}
/// <summary>
/// Parses italic text delimited by single asterisks (<c>*text*</c>).
/// </summary>
internal class ItalicNodeParser : INodeParser {
    private const string Char = "*";
    private const string DoubleChar = "**";

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with a single <c>*</c>.
    /// A leading <c>**</c> is rejected.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed in Parse
        var rest = buffer.AsSpan(position);
        return rest.StartsWith(Char, StringComparison.Ordinal) &&
               !rest.StartsWith(DoubleChar, StringComparison.Ordinal);
    }

    /// <summary>
    /// Builds an <see cref="MfmItalicNode" /> from the text between the delimiters. If there is no closing
    /// delimiter, the content runs to the end of the buffer.
    /// </summary>
    /// <returns>The node and the number of characters consumed, delimiters included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Char, buffer, position);
        var node = new MfmItalicNode {
            // Content is parsed recursively with a reduced nest limit; only inline nodes are kept
            Children = MfmParser.Parse(buffer[start..end], 0, nestLimit - 1).OfType<MfmInlineNode>()
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses inline code delimited by single backticks.
/// </summary>
internal class InlineCodeNodeParser : INodeParser {
    private const string Char = "`";
    private const string Fence = "```";

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with a backtick that is not the start of
    /// a triple-backtick fence.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed in Parse
        var rest = buffer.AsSpan(position);
        return rest.StartsWith(Char, StringComparison.Ordinal) &&
               !rest.StartsWith(Fence, StringComparison.Ordinal);
    }

    /// <summary>
    /// Builds an <see cref="MfmInlineCodeNode" /> holding the raw text between the backticks. If there is no
    /// closing backtick, the code runs to the end of the buffer.
    /// </summary>
    /// <returns>The node and the number of characters consumed, delimiters included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Char, buffer, position);
        var node = new MfmInlineCodeNode {
            Code = buffer[start..end]
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses bold text delimited by double asterisks (<c>**text**</c>).
/// </summary>
internal class BoldNodeParser : INodeParser {
    private const string Char = "**";

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with <c>**</c>.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed in Parse
        return buffer.AsSpan(position).StartsWith(Char, StringComparison.Ordinal);
    }

    /// <summary>
    /// Builds an <see cref="MfmBoldNode" /> from the text between the delimiters. If there is no closing
    /// delimiter, the content runs to the end of the buffer.
    /// </summary>
    /// <returns>The node and the number of characters consumed, delimiters included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Char, buffer, position);
        var node = new MfmBoldNode {
            // Content is parsed recursively with a reduced nest limit; only inline nodes are kept
            Children = MfmParser.Parse(buffer[start..end], 0, nestLimit - 1).OfType<MfmInlineNode>()
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses <c>&lt;plain&gt;...&lt;/plain&gt;</c> sections, whose content is kept as literal text.
/// </summary>
internal class PlainNodeParser : INodeParser {
    private const string Pre  = "<plain>";
    private const string Post = "</plain>";

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with the opening tag.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed in Parse
        return buffer.AsSpan(position).StartsWith(Pre, StringComparison.Ordinal);
    }

    /// <summary>
    /// Builds an <see cref="MfmPlainNode" /> with a single text child holding the unparsed content. If the
    /// closing tag is missing, the content runs to the end of the buffer.
    /// </summary>
    /// <returns>The node and the number of characters consumed, tags included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Pre, Post, buffer, position);
        var node = new MfmPlainNode {
            // The content is not parsed recursively; it is stored verbatim
            Children = [
                new MfmTextNode {
                    Text = buffer[start..end]
                }
            ]
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses <c>&lt;small&gt;...&lt;/small&gt;</c> sections.
/// </summary>
internal class SmallNodeParser : INodeParser {
    private const string Pre  = "<small>";
    private const string Post = "</small>";

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with the opening tag.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed in Parse
        return buffer.AsSpan(position).StartsWith(Pre, StringComparison.Ordinal);
    }

    /// <summary>
    /// Builds an <see cref="MfmSmallNode" /> from the text between the tags. If the closing tag is missing, the
    /// content runs to the end of the buffer.
    /// </summary>
    /// <returns>The node and the number of characters consumed, tags included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Pre, Post, buffer, position);
        var node = new MfmSmallNode {
            // Content is parsed recursively with a reduced nest limit; only inline nodes are kept
            Children = MfmParser.Parse(buffer[start..end], 0, nestLimit - 1).OfType<MfmInlineNode>()
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses <c>&lt;center&gt;...&lt;/center&gt;</c> sections.
/// </summary>
internal class CenterNodeParser : INodeParser {
    private const string Pre  = "<center>";
    private const string Post = "</center>";

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with the opening tag.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed in Parse
        return buffer.AsSpan(position).StartsWith(Pre, StringComparison.Ordinal);
    }

    /// <summary>
    /// Builds an <see cref="MfmCenterNode" /> from the text between the tags. If the closing tag is missing, the
    /// content runs to the end of the buffer.
    /// </summary>
    /// <returns>The node and the number of characters consumed, tags included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Pre, Post, buffer, position);
        var node = new MfmCenterNode {
            // Content is parsed recursively with a reduced nest limit; only inline nodes are kept
            Children = MfmParser.Parse(buffer[start..end], 0, nestLimit - 1).OfType<MfmInlineNode>()
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses strikethrough text delimited by double tildes (<c>~~text~~</c>).
/// </summary>
internal class StrikeNodeParser : INodeParser {
    private const string Char = "~~";

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with <c>~~</c>.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed in Parse
        return buffer.AsSpan(position).StartsWith(Char, StringComparison.Ordinal);
    }

    /// <summary>
    /// Builds an <see cref="MfmStrikeNode" /> from the text between the delimiters. If there is no closing
    /// delimiter, the content runs to the end of the buffer.
    /// </summary>
    /// <returns>The node and the number of characters consumed, delimiters included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Char, buffer, position);
        var node = new MfmStrikeNode {
            // Content is parsed recursively with a reduced nest limit; only inline nodes are kept
            Children = MfmParser.Parse(buffer[start..end], 0, nestLimit - 1).OfType<MfmInlineNode>()
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses hashtags: a <c>#</c> followed by everything up to the next whitespace character or the end of input.
/// </summary>
internal class HashtagNodeParser : INodeParser {
    private const           string Pre  = "#";
    private static readonly Regex  Post = new(@"\s|$");

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with <c>#</c>.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed in Parse
        return buffer.AsSpan(position).StartsWith(Pre, StringComparison.Ordinal);
    }

    /// <summary>
    /// Builds an <see cref="MfmHashtagNode" /> from the text after the <c>#</c>. The terminating whitespace is
    /// not consumed.
    /// </summary>
    /// <returns>The node and the number of characters consumed (the <c>#</c> plus the tag).</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Pre, Post, buffer, position);
        var node = new MfmHashtagNode {
            Hashtag = buffer[start..end]
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses mentions of the form <c>@user</c> or <c>@user@host.tld</c>.
/// </summary>
internal class MentionNodeParser : INodeParser {
    private const           string Pre        = "@";
    private static readonly Regex  Post       = new(@"\s|$");
    private static readonly Regex  Full       = new(@"^[a-zA-Z0-9._\-]+(?:@[a-zA-Z0-9._\-]+\.[a-zA-Z0-9._\-]+)?$");
    private static readonly Regex  Lookbehind = new(@"\s");

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with <c>@</c>, is either at the start
    /// of the buffer or preceded by whitespace, and the text up to the next whitespace is a well-formed account
    /// with at most one host part.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed below
        if (!buffer.AsSpan(position).StartsWith(Pre, StringComparison.Ordinal)) return false;

        // Reject an '@' in the middle of a word
        if (position != 0 && !Lookbehind.IsMatch(buffer[position - 1].ToString())) return false;

        var (start, end, _) = NodeParserAbstractions.HandlePosition(Pre, Post, buffer, position);
        var acct = buffer[start..end];
        return acct.Split("@").Length <= 2 && Full.IsMatch(acct);
    }

    /// <summary>
    /// Builds an <see cref="MfmMentionNode" />. <c>Host</c> is null when the mention has no host part, and
    /// <c>Acct</c> is the mention text including the leading <c>@</c>.
    /// </summary>
    /// <returns>The node and the number of characters consumed; the terminating whitespace is not consumed.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        //TODO: make sure this handles non-ascii/puny domains
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Pre, Post, buffer, position);
        var acct  = buffer[start..end];
        var split = acct.Split("@");
        var node = new MfmMentionNode {
            Username = split[0],
            Host     = split.Length == 2 ? split[1] : null,
            Acct     = $"@{acct}"
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses bare <c>https://</c> and <c>http://</c> URLs, which end at the next whitespace character or the end of
/// input.
/// </summary>
internal class UrlNodeParser : INodeParser {
    private const           string Pre    = "https://";
    private const           string PreAlt = "http://";
    private static readonly Regex  Post   = new(@"\s|$");

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with one of the supported schemes and
    /// the text up to the next whitespace is an absolute http(s) URI.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        var isHttps = StartsWithAt(buffer, position, Pre);
        if (!isHttps && !StartsWithAt(buffer, position, PreAlt))
            return false;

        var prefix = isHttps ? Pre : PreAlt;
        var (start, end, _) = NodeParserAbstractions.HandlePosition(prefix, Post, buffer, position);
        var result = Uri.TryCreate(prefix + buffer[start..end], UriKind.Absolute, out var uri);
        return result && uri?.Scheme is "http" or "https";
    }

    /// <summary>
    /// Builds an <see cref="MfmUrlNode" /> with <c>Brackets</c> set to false.
    /// </summary>
    /// <returns>The node and the number of characters consumed; the terminating whitespace is not consumed.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var prefix = StartsWithAt(buffer, position, Pre) ? Pre : PreAlt;
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(prefix, Post, buffer, position);
        var node = new MfmUrlNode {
            Url      = prefix + buffer[start..end],
            Brackets = false
        };
        return (node, chars);
    }

    // Ordinal prefix check without allocating a substring; culture-sensitive matching could skip ignorable
    // characters and disagree with the ordinal offsets used everywhere else
    private static bool StartsWithAt(string buffer, int position, string value) {
        return buffer.AsSpan(position).StartsWith(value, StringComparison.Ordinal);
    }
}
/// <summary>
/// Parses URLs wrapped in angle brackets (<c>&lt;https://...&gt;</c> or <c>&lt;http://...&gt;</c>).
/// </summary>
internal class AltUrlNodeParser : INodeParser {
    private const string Pre    = "<https://";
    private const string PreAlt = "<http://";
    private const string Post   = ">";

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with an opening bracket plus a supported
    /// scheme, a closing <c>&gt;</c> exists, and the enclosed text is an absolute http(s) URI.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        var isHttps = StartsWithAt(buffer, position, Pre);
        if (!isHttps && !StartsWithAt(buffer, position, PreAlt))
            return false;

        var prefix = isHttps ? Pre : PreAlt;
        var (start, end, _) = NodeParserAbstractions.HandlePosition(prefix, Post, buffer, position);

        // Without a closing bracket the "URL" would swallow the whole rest of the buffer, whitespace included.
        // Rejecting it here lets the text after '<' be picked up as a regular URL instead.
        if (end == buffer.Length)
            return false;

        // prefix[1..] drops the leading '<' so that only the URL itself is validated
        var result = Uri.TryCreate(prefix[1..] + buffer[start..end], UriKind.Absolute, out var uri);
        return result && uri?.Scheme is "http" or "https";
    }

    /// <summary>
    /// Builds an <see cref="MfmUrlNode" /> with <c>Brackets</c> set to true; the brackets themselves are not part
    /// of the stored URL.
    /// </summary>
    /// <returns>The node and the number of characters consumed, brackets included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var prefix = StartsWithAt(buffer, position, Pre) ? Pre : PreAlt;
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(prefix, Post, buffer, position);
        var node = new MfmUrlNode {
            Url      = prefix[1..] + buffer[start..end],
            Brackets = true
        };
        return (node, chars);
    }

    // Ordinal prefix check without allocating a substring; culture-sensitive matching could skip ignorable
    // characters and disagree with the ordinal offsets used everywhere else
    private static bool StartsWithAt(string buffer, int position, string value) {
        return buffer.AsSpan(position).StartsWith(value, StringComparison.Ordinal);
    }
}
/// <summary>
/// Parses links of the form <c>[label](url)</c>.
/// </summary>
internal class LinkNodeParser : INodeParser {
    private const           string Pre  = "[";
    private const           string Post = ")";
    private static readonly Regex  Full = new(@"^\[(.+?)\]\((.+?)\)$");

    /// <summary>
    /// Returns true when the text from <paramref name="position" /> up to the next <c>)</c> has the shape
    /// <c>[label](url)</c> and the url is an absolute http(s) URI.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed below
        if (!buffer.AsSpan(position).StartsWith(Pre, StringComparison.Ordinal))
            return false;

        var (_, end, _) = NodeParserAbstractions.HandlePosition(Pre, Post, buffer, position);
        // end == buffer.Length means no closing parenthesis was found
        if (end == buffer.Length)
            return false;

        var match = Full.Match(buffer[position..(end + 1)]);
        // Groups.Count reflects the pattern, not the match result, so Success is the meaningful check
        if (!match.Success)
            return false;

        var result = Uri.TryCreate(match.Groups[2].Value, UriKind.Absolute, out var uri);
        return result && uri?.Scheme is "http" or "https";
    }

    /// <summary>
    /// Builds an <see cref="MfmLinkNode" /> with <c>Silent</c> set to false. The label is parsed recursively and
    /// only inline nodes are kept.
    /// </summary>
    /// <returns>The node and the number of characters consumed, closing parenthesis included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (_, end, chars) = NodeParserAbstractions.HandlePosition(Pre, Post, buffer, position);
        var match = Full.Match(buffer[position..(end + 1)]);
        var node = new MfmLinkNode {
            Url = match.Groups[2].Value,
            // Use the label exactly as the pattern captured it; cutting at the first ']' would truncate labels
            // that themselves contain a ']'
            Children = MfmParser.Parse(match.Groups[1].Value, 0, nestLimit - 1).OfType<MfmInlineNode>(),
            Silent   = false
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses silent links of the form <c>?[label](url)</c>.
/// </summary>
internal class SilentLinkNodeParser : INodeParser {
    private const           string Pre  = "?[";
    private const           string Post = ")";
    private static readonly Regex  Full = new(@"^\?\[(.+?)\]\((.+?)\)$");

    /// <summary>
    /// Returns true when the text from <paramref name="position" /> up to the next <c>)</c> has the shape
    /// <c>?[label](url)</c> and the url is an absolute http(s) URI.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed below
        if (!buffer.AsSpan(position).StartsWith(Pre, StringComparison.Ordinal))
            return false;

        var (_, end, _) = NodeParserAbstractions.HandlePosition(Pre, Post, buffer, position);
        // end == buffer.Length means no closing parenthesis was found
        if (end == buffer.Length)
            return false;

        var match = Full.Match(buffer[position..(end + 1)]);
        // Groups.Count reflects the pattern, not the match result, so Success is the meaningful check
        if (!match.Success)
            return false;

        var result = Uri.TryCreate(match.Groups[2].Value, UriKind.Absolute, out var uri);
        return result && uri?.Scheme is "http" or "https";
    }

    /// <summary>
    /// Builds an <see cref="MfmLinkNode" /> with <c>Silent</c> set to true. The label is parsed recursively and
    /// only inline nodes are kept.
    /// </summary>
    /// <returns>The node and the number of characters consumed, closing parenthesis included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (_, end, chars) = NodeParserAbstractions.HandlePosition(Pre, Post, buffer, position);
        var match = Full.Match(buffer[position..(end + 1)]);
        var node = new MfmLinkNode {
            Url = match.Groups[2].Value,
            // Use the label exactly as the pattern captured it; cutting at the first ']' would truncate labels
            // that themselves contain a ']'
            Children = MfmParser.Parse(match.Groups[1].Value, 0, nestLimit - 1).OfType<MfmInlineNode>(),
            Silent   = true
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses custom emoji shortcodes of the form <c>:name:</c>.
/// </summary>
internal class EmojiCodeNodeParser : INodeParser {
    private const string Char = ":";

    // \z instead of $: '$' also matches right before a trailing newline, which let a name such as "30\n"
    // (from text like "12:30\n:smile:") pass validation
    private static readonly Regex Full = new(@"^[a-z0-9_+-]+\z");

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with <c>:</c>, a closing <c>:</c>
    /// exists, and the text in between consists only of the characters <c>a-z</c>, <c>0-9</c>, <c>_</c>,
    /// <c>+</c> and <c>-</c>.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed below
        if (!buffer.AsSpan(position).StartsWith(Char, StringComparison.Ordinal))
            return false;

        var (start, end, _) = NodeParserAbstractions.HandlePosition(Char, buffer, position);
        // end == buffer.Length means no closing colon was found
        return end != buffer.Length && Full.IsMatch(buffer[start..end]);
    }

    /// <summary>
    /// Builds an <see cref="MfmEmojiCodeNode" /> whose name is the text between the colons.
    /// </summary>
    /// <returns>The node and the number of characters consumed, colons included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Char, buffer, position);
        var node = new MfmEmojiCodeNode {
            Name = buffer[start..end]
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses inline math delimited by <c>\(</c> and <c>\)</c>.
/// </summary>
internal class MathInlineNodeParser : INodeParser {
    private const string Pre  = @"\(";
    private const string Post = @"\)";

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with <c>\(</c>.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed in Parse
        return buffer.AsSpan(position).StartsWith(Pre, StringComparison.Ordinal);
    }

    /// <summary>
    /// Builds an <see cref="MfmMathInlineNode" /> holding the raw formula text. If the closing delimiter is
    /// missing, the formula runs to the end of the buffer.
    /// </summary>
    /// <returns>The node and the number of characters consumed, delimiters included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Pre, Post, buffer, position);
        var node = new MfmMathInlineNode {
            Formula = buffer[start..end]
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses block math delimited by <c>\[</c> and <c>\]</c>.
/// </summary>
internal class MathBlockNodeParser : INodeParser {
    private const string Pre  = @"\[";
    private const string Post = @"\]";

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with <c>\[</c>.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed in Parse
        return buffer.AsSpan(position).StartsWith(Pre, StringComparison.Ordinal);
    }

    /// <summary>
    /// Builds an <see cref="MfmMathBlockNode" /> holding the raw formula text. If the closing delimiter is
    /// missing, the formula runs to the end of the buffer.
    /// </summary>
    /// <returns>The node and the number of characters consumed, delimiters included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Pre, Post, buffer, position);
        var node = new MfmMathBlockNode {
            Formula = buffer[start..end]
        };
        return (node, chars);
    }
}
/// <summary>
/// Parses fenced code blocks delimited by triple backticks, with an optional language on the opening line.
/// </summary>
internal class CodeBlockParser : INodeParser {
    private const string Char = "```";

    /// <summary>
    /// Returns true when the text at <paramref name="position" /> starts with a fence and the text between the
    /// fences (or up to the end of the buffer if there is no closing fence) ends with a newline.
    /// </summary>
    public bool IsValid(string buffer, int position) {
        // Ordinal span comparison: no substring allocation, and no culture-sensitive matching that could skip
        // ignorable characters and disagree with the ordinal offsets computed below
        if (!buffer.AsSpan(position).StartsWith(Char, StringComparison.Ordinal)) return false;

        var (start, end, _) = NodeParserAbstractions.HandlePosition(Char, buffer, position);
        return buffer[start..end].EndsWith('\n');
    }

    /// <summary>
    /// Builds an <see cref="MfmCodeBlockNode" />. The first line of the content is the language (null when
    /// empty), and the remaining lines form the code.
    /// </summary>
    /// <returns>The node and the number of characters consumed, fences included.</returns>
    public (MfmNode node, int chars) Parse(string buffer, int position, int nestLimit) {
        var (start, end, chars) = NodeParserAbstractions.HandlePosition(Char, buffer, position);
        var split = buffer[start..end].Split('\n');
        var lang  = split[0].Length > 0 ? split[0] : null;
        // [1..^1] skips the language line and the empty element produced by the trailing newline that
        // IsValid requires
        var code = string.Join('\n', split[1..^1]);
        var node = new MfmCodeBlockNode {
            Code     = code,
            Language = lang
        };
        return (node, chars);
    }
}
//TODO: still missing: FnNode, MfmSearchNode, MfmQuoteNode
//TODO: "*italic **bold** *" doesn't work yet