Use tokenizer for emote parsing :HideThePain:

2024-05-20 20:14:41 +02:00
parent aab4977ecf
commit 47a4c5ab8d
2 changed files with 147 additions and 14 deletions
@@ -202,26 +202,42 @@ internal partial class Message
                continue;
            }
-            var builder = new StringBuilder();
+            var wordBuilder = new StringBuilder();
-            foreach (var word in text.Content.Split(" "))
+            var sentenceBuilder = new StringBuilder();
            foreach (var token in Tokenizer.PrecedenceBasedRegexTokenizer.Tokenize(text.Content))
            {
-                if (checkForEmotes && EmoteCache.Exists(word) && !Plugin.Config.BlockedEmotes.Contains(word))
+                if (token.TokenType == Tokenizer.TokenType.StringValue)
                {
-                    // We add all the previous collected text parts
+                    wordBuilder.Append(token.Value);
                    AddContentAfterURLCheck(builder.ToString(), text, chunk);
                    builder.Clear();
                    AddChunkWithMessage(new TextChunk(chunk.Source, EmotePayload.ResolveEmote(word), word) { FallbackColour = text.FallbackColour });
                    builder.Append(' ');
                    continue;
                }
-                builder.Append($"{word} ");
+                var word = wordBuilder.ToString();
-            }
+                wordBuilder.Clear();
-            // We add the leftovers
+                if (checkForEmotes && EmoteCache.Exists(word) && !Plugin.Config.BlockedEmotes.Contains(word))
-            // Removing the last whitespace as it is set by us
+                {
-            AddContentAfterURLCheck(builder.ToString()[..^1], text, chunk);
+                    // Add the previous punctuation, including whitespaces
                    AddContentAfterURLCheck(sentenceBuilder.ToString(), text, chunk);
                    AddChunkWithMessage(new TextChunk(chunk.Source, EmotePayload.ResolveEmote(word), word) { FallbackColour = text.FallbackColour });
                    // Append our current match as it is a special split symbol
                    sentenceBuilder.Clear();
                    sentenceBuilder.Append(token.Value);
                    continue;
                }
                // Append match if we haven't reached end of string yet
                if (token.TokenType != Tokenizer.TokenType.SequenceTerminator)
                {
                    sentenceBuilder.Append(word);
                    sentenceBuilder.Append(token.Value);
                    continue;
                }
                // End of string reached, we add our leftover
                AddContentAfterURLCheck(sentenceBuilder.Append(word).ToString(), text, chunk);
            }
        }
        return newChunks;
@@ -0,0 +1,117 @@
 using System.Text.RegularExpressions;
 namespace ChatTwo.Util;
 // Modified from: https://jack-vanlightly.com/blog/2016/2/24/a-more-efficient-regex-tokenizer
 /// <summary>
 /// Regex-driven tokenizer that classifies every character of a string as
 /// punctuation, whitespace, a basic-Latin character or a leftover character.
 /// </summary>
 public static class Tokenizer
 {
    /// <summary>Categories a <see cref="Token"/> can be classified as.</summary>
    public enum TokenType
    {
        CloseParenthesis,
        Comma,
        Dot,
        QuestionMark,
        ExclamationMark,
        Semicolon,
        Whitespace,
        Equals,
        OpenParenthesis,
        /// <summary>A character matched by <c>\p{IsBasicLatin}</c> that is not one of the punctuation or whitespace types.</summary>
        StringValue,
        /// <summary>Any character matched only by the catch-all <c>.</c> pattern.</summary>
        Leftover,
        /// <summary>Emitted once, with an empty value, after the last real token.</summary>
        SequenceTerminator
    }

    /// <summary>A classified piece of the input text.</summary>
    /// <param name="tokenType">Category of the token.</param>
    /// <param name="value">The matched text.</param>
    public class Token(TokenType tokenType, string value)
    {
        /// <summary>Creates a token that carries no text (<see cref="Value"/> is <see cref="string.Empty"/>).</summary>
        public Token(TokenType tokenType) : this(tokenType, string.Empty) { }

        /// <summary>Category of the token.</summary>
        public TokenType TokenType { get; } = tokenType;

        /// <summary>The matched text.</summary>
        public string Value { get; } = value;
    }

    /// <summary>
    /// Tokenizer that runs every token pattern over the input and, for each start
    /// position, keeps the match whose definition has the lowest precedence number.
    /// </summary>
    public static class PrecedenceBasedRegexTokenizer
    {
        // Lower precedence number wins when several patterns match at the same index:
        // punctuation/whitespace (1) beats basic Latin (2), which beats the catch-all (3).
        private static readonly List<TokenDefinition> TokenDefinitions = new()
        {
            new(TokenType.CloseParenthesis, "\\)", 1),
            new(TokenType.Comma, ",", 1),
            new(TokenType.Dot, "\\.", 1),
            new(TokenType.QuestionMark, "\\?", 1),
            new(TokenType.ExclamationMark, "!", 1),
            new(TokenType.Semicolon, ";", 1),
            new(TokenType.Whitespace, "\\s", 1),
            new(TokenType.Equals, "=", 1),
            new(TokenType.OpenParenthesis, "\\(", 1),
            new(TokenType.StringValue, "\\p{IsBasicLatin}", 2),
            new(TokenType.Leftover, ".", 3)
        };

        /// <summary>
        /// Lazily splits <paramref name="lqlText"/> into tokens ordered by their position in the text.
        /// </summary>
        /// <param name="lqlText">The text to tokenize.</param>
        /// <returns>
        /// The tokens in input order, always followed by one final
        /// <see cref="TokenType.SequenceTerminator"/> token with an empty value.
        /// </returns>
        public static IEnumerable<Token> Tokenize(string lqlText)
        {
            var matchesByStart = FindTokenMatches(lqlText)
                .GroupBy(match => match.StartIndex)
                .OrderBy(group => group.Key);

            TokenMatch? lastMatch = null;
            foreach (var group in matchesByStart)
            {
                var bestMatch = group.OrderBy(match => match.Precedence).First();

                // Skip matches that start inside the token that was emitted last.
                if (lastMatch is not null && bestMatch.StartIndex < lastMatch.EndIndex)
                {
                    continue;
                }

                yield return new Token(bestMatch.TokenType, bestMatch.Value);
                lastMatch = bestMatch;
            }

            yield return new Token(TokenType.SequenceTerminator);
        }

        /// <summary>Collects the matches of every token definition, in definition order.</summary>
        private static List<TokenMatch> FindTokenMatches(string lqlText)
        {
            var tokenMatches = new List<TokenMatch>();
            foreach (var tokenDefinition in TokenDefinitions)
            {
                // AddRange enumerates the sequence itself; no intermediate list is needed.
                tokenMatches.AddRange(tokenDefinition.FindMatches(lqlText));
            }

            return tokenMatches;
        }
    }

    /// <summary>Pairs a regex pattern with the token type and precedence its matches receive.</summary>
    private sealed class TokenDefinition(TokenType returnsToken, string regexPattern, int precedence)
    {
        // NOTE(review): IgnoreCase looks unnecessary for the patterns defined in this file, but it is
        // kept because removing it might change which characters the patterns match - confirm first.
        private readonly Regex _regex = new(regexPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);

        /// <summary>Lazily yields one <see cref="TokenMatch"/> per regex match in <paramref name="inputString"/>.</summary>
        public IEnumerable<TokenMatch> FindMatches(string inputString)
        {
            foreach (Match match in _regex.Matches(inputString))
            {
                yield return new TokenMatch(
                    returnsToken,
                    match.Value,
                    match.Index,
                    match.Index + match.Length,
                    precedence);
            }
        }
    }

    /// <summary>
    /// Immutable description of a single regex match; <c>EndIndex</c> is exclusive
    /// (start index plus match length).
    /// </summary>
    private sealed record TokenMatch(TokenType TokenType, string Value, int StartIndex, int EndIndex, int Precedence);
 }