using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using Iceshrimp.Backend.Core.Database.Tables;
using Iceshrimp.Backend.Core.Helpers.LibMfm.Conversion;

namespace Iceshrimp.Backend.Core.Helpers.LibMfm.Parsing;

/// <summary>
/// Converts an AngleSharp DOM tree into a flat string, translating selected HTML elements into
/// textual markers: line breaks, <c>[text](href)</c> links, fenced / inline code, <c>&gt; </c> quotes
/// and <c>$[media src]</c> placeholders. Elements without a dedicated rule contribute only the
/// converted output of their children.
/// </summary>
/// <param name="mentions">
/// Known mentions. An anchor carrying both the <c>u-url</c> and <c>mention</c> classes is matched
/// against these entries by comparing its <c>href</c> with each entry's <c>Uri</c> and <c>Url</c>.
/// </param>
/// <param name="hashtags">
/// Known hashtags. A tag link's text (leading <c>#</c> removed, lower-cased) is looked up here.
/// </param>
/// <param name="media">
/// Output collection: every accepted VIDEO / AUDIO / IMG element is appended to it as an
/// <see cref="MfmInlineMedia"/>.
/// </param>
// NOTE(review): the collection parameters appear here without generic type arguments - confirm
// the intended element types against the callers before changing the signature.
internal class HtmlParser(
    IEnumerable mentions,
    IEnumerable hashtags,
    ICollection media
)
{
    /// <summary>
    /// Converts a single node (and, recursively, its descendants) to its textual representation.
    /// </summary>
    /// <param name="node">The DOM node to convert.</param>
    /// <returns>
    /// The converted text, or <c>null</c> when the node produces no output (comment and document
    /// nodes, and BLOCKQUOTE elements whose text content is empty).
    /// </returns>
    internal string? ParseNode(INode node)
    {
        if (node.NodeType is NodeType.Text)
            return node.TextContent;
        if (node.NodeType is NodeType.Comment or NodeType.Document)
            return null;

        switch (node.NodeName)
        {
            case "BR":
                return "\n";

            case "A":
                return ParseAnchor(node);

            case "H1":
                return $"【{ParseChildren(node)}】\n";

            // Inline formatting elements are currently emitted without any surrounding markers.
            case "B":
            case "STRONG":
                return $"{ParseChildren(node)}";

            case "SMALL":
                return $"{ParseChildren(node)}";

            case "S":
            case "DEL":
                return $"{ParseChildren(node)}";

            case "I":
            case "EM":
                return $"{ParseChildren(node)}";

            case "PRE":
                // A PRE whose only child is a CODE element becomes a fenced block built from the
                // raw text content; any other PRE is treated like a generic container.
                return node.ChildNodes is [{ NodeName: "CODE" } code]
                    ? $"\n```\n{code.TextContent}\n```\n"
                    : ParseChildren(node);

            case "CODE":
                return $"`{ParseChildren(node)}`";

            case "BLOCKQUOTE":
                // Quotes use the raw text content (nested markup is not converted) and prefix
                // every line with "> ".
                return node.TextContent.Length > 0
                    ? $"\n> {string.Join("\n> ", node.TextContent.Split("\n"))}"
                    : null;

            case "VIDEO":
            case "AUDIO":
            case "IMG":
                return ParseMedia(node);

            case "P":
            case "H2":
            case "H3":
            case "H4":
            case "H5":
            case "H6":
                return $"\n\n{ParseChildren(node)}";

            case "DIV":
            case "HEADER":
            case "FOOTER":
            case "ARTICLE":
            case "LI":
            case "DT":
            case "DD":
                return $"\n{ParseChildren(node)}";

            default:
                return ParseChildren(node);
        }
    }

    /// <summary>
    /// Converts an <c>A</c> element: resolves mentions, passes known hashtags and bare URLs through
    /// as plain text, and renders everything else as <c>[text](href)</c>.
    /// </summary>
    private string ParseAnchor(INode node)
    {
        if (node is not HtmlElement el)
            return node.TextContent;

        var href = el.GetAttribute("href");
        if (href is null)
            return $"{el.TextContent}";

        if (el.ClassList.Contains("u-url") && el.ClassList.Contains("mention"))
        {
            var mention = mentions.FirstOrDefault(p => p.Uri == href || p.Url == href);
            return mention != null
                ? $"@{mention.Username}@{mention.Host}"
                : $"{el.TextContent}";
        }

        // Hubzilla marks tags as class="zrl", so we have to account for that here.
        // (href is already known to be present at this point, so no extra attribute check is needed.)
        if (el.GetAttribute("rel") is "tag" || el.ClassList.Contains("zrl"))
        {
            var text = el.TextContent;
            var tag  = text.StartsWith('#') ? text[1..] : text;
            if (hashtags.Contains(tag.ToLowerInvariant()))
                return text;
        }

        // A link whose text is its own http(s) URL is emitted bare. The scheme prefix is a protocol
        // token, so it is compared ordinally rather than with culture-sensitive rules.
        var isHttpLink = href.StartsWith("http://", StringComparison.Ordinal) ||
                         href.StartsWith("https://", StringComparison.Ordinal);
        if (el.TextContent == href && isHttpLink)
            return href;

        return $"[{el.TextContent}]({href})";
    }

    /// <summary>
    /// Converts a VIDEO / AUDIO / IMG element into a <c>$[media src]</c> placeholder and records it in
    /// the media collection. Elements without an absolute http(s) <c>src</c> fall back to their text
    /// content and are not recorded.
    /// </summary>
    private string ParseMedia(INode node)
    {
        if (node is not HtmlElement el)
            return node.TextContent;

        var src = el.GetAttribute("src");

        // Reject a missing src, anything that is not an absolute URI, and any scheme other than
        // http/https. The previous guard combined the last two checks with `&&`, which could never
        // be true when parsing failed, so relative and non-http(s) sources were accepted.
        if (src is null ||
            !Uri.TryCreate(src, UriKind.Absolute, out var uri) ||
            uri is not { Scheme: "http" or "https" })
            return node.TextContent;

        var alt = el.GetAttribute("alt") ?? el.GetAttribute("title");

        var type = node.NodeName switch
        {
            "VIDEO" => MfmInlineMedia.MediaType.Video,
            "AUDIO" => MfmInlineMedia.MediaType.Audio,
            "IMG"   => MfmInlineMedia.MediaType.Image,
            _       => MfmInlineMedia.MediaType.Other,
        };

        media.Add(new MfmInlineMedia(type, src, alt));
        return $"$[media {src}]";
    }

    /// <summary>
    /// Converts every child of <paramref name="node"/> in document order and concatenates the
    /// results; children that yield <c>null</c> contribute nothing.
    /// </summary>
    private string ParseChildren(INode node) => string.Join(null, node.ChildNodes.Select(ParseNode));
}