[backend/core] Improve emoji detection regex, add unit tests

This commit is contained in:
Laura Hausmann 2024-08-16 20:54:24 +02:00
parent e342d6f010
commit 05f6546f48
No known key found for this signature in database
GPG key ID: D044E84C5BE01605
3 changed files with 63 additions and 9 deletions

File diff suppressed because one or more lines are too long

View file

@ -144,7 +144,7 @@ public partial class EmojiService(
public async Task<string> ResolveEmojiName(string name, string? host)
{
if (EmojiRegex().IsMatch(name))
if (EmojiHelpers.IsEmoji(name))
return name;
host = host?.ToPunycode();
@ -224,14 +224,6 @@ public partial class EmojiService(
return emoji;
}
// Generated for Unicode 15.1 by https://iceshrimp.dev/iceshrimp/UnicodeEmojiRegex
// Note to future maintainers: make sure to anchor the regex generated by the tool above!
// Example: ^(?:<generated regex>)$
// @formatter:off
[GeneratedRegex(@"^(?:\uD83C[\uDDE6-\uDDFF]\uD83C[\uDDE6-\uDDFF]|\uD83C[\uDC04\uDCCF\uDD70\uDD71\uDD7E\uDD7F\uDD8E\uDD91-\uDD9A\uDDE6-\uDDFF\uDE01\uDE02\uDE1A\uDE2F\uDE32-\uDE3A\uDE50\uDE51\uDF00-\uDF21\uDF24-\uDF93\uDF96\uDF97\uDF99-\uDF9B\uDF9E-\uDFF0\uDFF3-\uDFF5\uDFF7-\uDFFF]|\uD83D[\uDC00-\uDCFD\uDCFF-\uDD3D\uDD49-\uDD4E\uDD50-\uDD67\uDD6F\uDD70\uDD73-\uDD7A\uDD87\uDD8A-\uDD8D\uDD90\uDD95\uDD96\uDDA4\uDDA5\uDDA8\uDDB1\uDDB2\uDDBC\uDDC2-\uDDC4\uDDD1-\uDDD3\uDDDC-\uDDDE\uDDE1\uDDE3\uDDE8\uDDEF\uDDF3\uDDFA-\uDE4F\uDE80-\uDEC5\uDECB-\uDED2\uDED5-\uDED7\uDEDC-\uDEE5\uDEE9\uDEEB\uDEEC\uDEF0\uDEF3-\uDEFC\uDFE0-\uDFEB\uDFF0]|\uD83E[\uDD0C-\uDD3A\uDD3C-\uDD45\uDD47-\uDDFF\uDE70-\uDE7C\uDE80-\uDE88\uDE90-\uDEBD\uDEBF-\uDEC5\uDECE-\uDEDB\uDEE0-\uDEE8\uDEF0-\uDEF8]|[\#\*0-9\u00A9\u00AE\u203C\u2049\u2122\u2139\u2194-\u2199\u21A9\u21AA\u231A\u231B\u2328\u23CF\u23E9-\u23F3\u23F8-\u23FA\u24C2\u25AA\u25AB\u25B6\u25C0\u25FB-\u25FE\u2600-\u2604\u260E\u2611\u2614\u2615\u2618\u261D\u2620\u2622\u2623\u2626\u262A\u262E\u262F\u2638-\u263A\u2640\u2642\u2648-\u2653\u265F\u2660\u2663\u2665\u2666\u2668\u267B\u267E\u267F\u2692-\u2697\u2699\u269B\u269C\u26A0\u26A1\u26A7\u26AA\u26AB\u26B0\u26B1\u26BD\u26BE\u26C4\u26C5\u26C8\u26CE\u26CF\u26D1\u26D3\u26D4\u26E9\u26EA\u26F0-\u26F5\u26F7-\u26FA\u26FD\u2702\u2705\u2708-\u270D\u270F\u2712\u2714\u2716\u271D\u2721\u2728\u2733\u2734\u2744\u2747\u274C\u274E\u2753-\u2755\u2757\u2763\u2764\u2795-\u2797\u27A1\u27B0\u27BF\u2934\u2935\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55\u3030\u303D\u3297\u3299](?:\uD83C[\uDFFB-\uDFFF]|\uFE0F\u20E3?|(?:\uDB40[\uDC20-\uDC7E])+\uDB40\uDC7F)?(?:\u200D\uD83C[\uDC04\uDCCF\uDD70\uDD71\uDD7E\uDD7F\uDD8E\uDD91-\uDD9A\uDDE6-\uDDFF\uDE01\uDE02\uDE1A\uDE2F\uDE32-\uDE3A\uDE50\uDE51\uDF00-\uDF21\uDF24-\uDF93\uDF96\uDF97\uDF99-\uDF9B\uDF9E-\uDFF0\uDFF3-\uDFF5\uDFF7-\uDFFF]|\uD83D[\uDC00-\uDCFD\uDCFF-\uDD3D\uDD49-\uDD4E\uDD50-\uDD67\uDD6F\uDD70\uDD73-\uDD7A\uDD87\uDD8A-\uDD8D\uDD90\uDD95\uDD96\uDDA4\uDDA5\uDDA8\uDDB1\uDDB2\uDDBC\uDDC2-\u
DDC4\uDDD1-\uDDD3\uDDDC-\uDDDE\uDDE1\uDDE3\uDDE8\uDDEF\uDDF3\uDDFA-\uDE4F\uDE80-\uDEC5\uDECB-\uDED2\uDED5-\uDED7\uDEDC-\uDEE5\uDEE9\uDEEB\uDEEC\uDEF0\uDEF3-\uDEFC\uDFE0-\uDFEB\uDFF0]|\uD83E[\uDD0C-\uDD3A\uDD3C-\uDD45\uDD47-\uDDFF\uDE70-\uDE7C\uDE80-\uDE88\uDE90-\uDEBD\uDEBF-\uDEC5\uDECE-\uDEDB\uDEE0-\uDEE8\uDEF0-\uDEF8]|[\#\*0-9\u00A9\u00AE\u203C\u2049\u2122\u2139\u2194-\u2199\u21A9\u21AA\u231A\u231B\u2328\u23CF\u23E9-\u23F3\u23F8-\u23FA\u24C2\u25AA\u25AB\u25B6\u25C0\u25FB-\u25FE\u2600-\u2604\u260E\u2611\u2614\u2615\u2618\u261D\u2620\u2622\u2623\u2626\u262A\u262E\u262F\u2638-\u263A\u2640\u2642\u2648-\u2653\u265F\u2660\u2663\u2665\u2666\u2668\u267B\u267E\u267F\u2692-\u2697\u2699\u269B\u269C\u26A0\u26A1\u26A7\u26AA\u26AB\u26B0\u26B1\u26BD\u26BE\u26C4\u26C5\u26C8\u26CE\u26CF\u26D1\u26D3\u26D4\u26E9\u26EA\u26F0-\u26F5\u26F7-\u26FA\u26FD\u2702\u2705\u2708-\u270D\u270F\u2712\u2714\u2716\u271D\u2721\u2728\u2733\u2734\u2744\u2747\u274C\u274E\u2753-\u2755\u2757\u2763\u2764\u2795-\u2797\u27A1\u27B0\u27BF\u2934\u2935\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55\u3030\u303D\u3297\u3299](?:\uD83C[\uDFFB-\uDFFF]|\uFE0F\u20E3?|(?:\uDB40[\uDC20-\uDC7E])+\uDB40\uDC7F)?)*)$")]
private static partial Regex EmojiRegex();
// @formatter:on
// Source-generated regex; judging by its name it is meant for custom emoji names/shortcodes (confirm against callers).
// Pattern, left to right:
//   ^:?         optional leading ':'
//   ([\w+-]+)   capture group 1: one or more word characters, '+' or '-'
//   (?:@\.)?    optional literal "@." suffix (not captured)
//   :?$         optional trailing ':', then end of input
// NOTE(review): RegexOptions.Compiled is reportedly ignored by the regex source generator - confirm and drop if so.
[GeneratedRegex(@"^:?([\w+-]+)(?:@\.)?:?$", RegexOptions.Compiled)]
private static partial Regex CustomEmojiRegex();

View file

@ -0,0 +1,41 @@
using Iceshrimp.Backend.Core.Helpers;
namespace Iceshrimp.Tests.Parsing;
/// <summary>
/// Data-driven tests for <c>EmojiHelpers.IsEmoji</c>: each case feeds one string to the helper and
/// checks whether it is classified as an emoji.
/// </summary>
[TestClass]
public class EmojiTests
{
    /// <summary>
    /// Every row is an emoji sequence (the trailing comment names it and the emoji/Unicode version
    /// it was introduced in) and must be reported as an emoji.
    /// </summary>
    [TestMethod]
    [DataRow("\ud83d\ude84")] // high speed train (E1.0/U6.0)
    [DataRow("\ud83c\udfc2\ud83c\udffd")] // snowboarder: medium skin tone (E2.0)
    [DataRow("\ud83e\udd23")] // rolling on the floor laughing (E3.0/U9.0)
    [DataRow("\ud83c\uddfa\ud83c\uddf3")] // flag: united nations (E4.0)
    [DataRow("\ud83e\udddc\ud83c\udffc")] // merperson: medium-light skin tone (E5.0/U10.0)
    [DataRow("\ud83d\udc69\ud83c\udffe\u200d\ud83e\uddb1")] // woman: medium-dark skin tone, curly hair (E11.0)
    [DataRow("\ud83d\udc68\ud83c\udffe\u200d\ud83e\udd1d\u200d\ud83d\udc68\ud83c\udffd")] // men holding hands: medium-dark skin tone, medium skin tone (E12.0)
    [DataRow("\ud83e\uddd1\ud83c\udffc\u200d\ud83d\ude80")] // astronaut: medium-light skin tone (E12.1)
    [DataRow("\ud83e\udd72")] // smiling face with tear (E13.0)
    [DataRow("\ud83d\ude35\u200d\ud83d\udcab")] // face with spiral eyes (E13.1)
    [DataRow("\ud83e\udee0")] // melting face (E14.0)
    [DataRow("\ud83e\udebc")] // jellyfish (E15.0)
    [DataRow("\ud83e\ude75")] // light blue heart (E15.0)
    [DataRow("\ud83d\ude42\u200d\u2194\ufe0f")] // head shaking horizontally (E15.1)
    public void TestEmojiRegexEmoji(string input)
    {
        AssertIsEmoji(input, true);
    }

    /// <summary>
    /// Ordinary text, including a lone digit, must not be reported as an emoji.
    /// </summary>
    [TestMethod]
    [DataRow("test")]
    [DataRow("1")]
    public void TestEmojiRegexPlainText(string input)
    {
        AssertIsEmoji(input, false);
    }

    /// <summary>
    /// For these rows the bare character must be rejected, while the same character followed by
    /// <paramref name="selector"/> (U+FE0F in both rows) must be accepted.
    /// </summary>
    /// <param name="input">The character on its own, without a selector.</param>
    /// <param name="selector">The suffix appended to <paramref name="input"/> for the positive check.</param>
    [TestMethod]
    [DataRow("\u2122", "\ufe0f")] // trademark sign
    [DataRow("\ud83d\udd74", "\ufe0f")] // man in business suit levitating
    public void TestEmojiRegexEmojiSelector(string input, string selector)
    {
        // Bare form first, then the selector-qualified form.
        AssertIsEmoji(input, false);

        var withSelector = input + selector;
        AssertIsEmoji(withSelector, true);
    }

    /// <summary>
    /// Shared assertion: runs <c>EmojiHelpers.IsEmoji</c> on <paramref name="candidate"/> and
    /// compares the result with <paramref name="expected"/>.
    /// </summary>
    private static void AssertIsEmoji(string candidate, bool expected)
    {
        var actual = EmojiHelpers.IsEmoji(candidate);
        actual.Should().Be(expected);
    }
}