diff --git a/ChatTwo/Message.cs b/ChatTwo/Message.cs
index b21fee4..7fcd775 100755
--- a/ChatTwo/Message.cs
+++ b/ChatTwo/Message.cs
@@ -202,26 +202,42 @@ internal partial class Message
                 continue;
             }
 
-            var builder = new StringBuilder();
-            foreach (var word in text.Content.Split(" "))
+            var wordBuilder = new StringBuilder();
+            var sentenceBuilder = new StringBuilder();
+            foreach (var token in Tokenizer.PrecedenceBasedRegexTokenizer.Tokenize(text.Content))
             {
-                if (checkForEmotes && EmoteCache.Exists(word) && !Plugin.Config.BlockedEmotes.Contains(word))
+                if (token.TokenType == Tokenizer.TokenType.StringValue)
                 {
-                    // We add all the previous collected text parts
-                    AddContentAfterURLCheck(builder.ToString(), text, chunk);
-                    builder.Clear();
-
-                    AddChunkWithMessage(new TextChunk(chunk.Source, EmotePayload.ResolveEmote(word), word) { FallbackColour = text.FallbackColour });
-                    builder.Append(' ');
+                    wordBuilder.Append(token.Value);
                     continue;
                 }
 
-                builder.Append($"{word} ");
-            }
+                var word = wordBuilder.ToString();
+                wordBuilder.Clear();
 
-            // We add the leftovers
-            // Removing the last whitespace as it is set by us
-            AddContentAfterURLCheck(builder.ToString()[..^1], text, chunk);
+                if (checkForEmotes && EmoteCache.Exists(word) && !Plugin.Config.BlockedEmotes.Contains(word))
+                {
+                    // Add the previous punctuation, including whitespaces
+                    AddContentAfterURLCheck(sentenceBuilder.ToString(), text, chunk);
+                    AddChunkWithMessage(new TextChunk(chunk.Source, EmotePayload.ResolveEmote(word), word) { FallbackColour = text.FallbackColour });
+
+                    // Append our current match as it is a special split symbol
+                    sentenceBuilder.Clear();
+                    sentenceBuilder.Append(token.Value);
+                    continue;
+                }
+
+                // Append match if we haven't reached end of string yet
+                if (token.TokenType != Tokenizer.TokenType.SequenceTerminator)
+                {
+                    sentenceBuilder.Append(word);
+                    sentenceBuilder.Append(token.Value);
+                    continue;
+                }
+
+                // End of string reached, we add our leftover
+                AddContentAfterURLCheck(sentenceBuilder.Append(word).ToString(), text, chunk);
+            }
         }
 
         return newChunks;
diff --git a/ChatTwo/Util/Tokenizer.cs b/ChatTwo/Util/Tokenizer.cs
new file mode 100644
index 0000000..0b5ac1c
--- /dev/null
+++ b/ChatTwo/Util/Tokenizer.cs
@@ -0,0 +1,117 @@
+using System.Text.RegularExpressions;
+
+namespace ChatTwo.Util;
+
+// Modified from: https://jack-vanlightly.com/blog/2016/2/24/a-more-efficient-regex-tokenizer
+public static class Tokenizer
+{
+    public enum TokenType
+    {
+        CloseParenthesis,
+        Comma,
+        Dot,
+        QuestionMark,
+        ExclamationMark,
+        Semicolon,
+        Whitespace,
+        Equals,
+        OpenParenthesis,
+        StringValue,
+        Leftover,
+        SequenceTerminator
+    }
+
+    public class Token(TokenType tokenType, string value)
+    {
+        public Token(TokenType tokenType) : this(tokenType, string.Empty) { }
+
+        public TokenType TokenType { get; } = tokenType;
+        public string Value { get; } = value;
+    }
+
+    public static class PrecedenceBasedRegexTokenizer
+    {
+        private static readonly List<TokenDefinition> TokenDefinitions;
+
+        static PrecedenceBasedRegexTokenizer()
+        {
+            TokenDefinitions = new List<TokenDefinition>
+            {
+                new(TokenType.CloseParenthesis, "\\)", 1),
+                new(TokenType.Comma, ",", 1),
+                new(TokenType.Dot, "\\.", 1),
+                new(TokenType.QuestionMark, "\\?", 1),
+                new(TokenType.ExclamationMark, "!", 1),
+                new(TokenType.Semicolon, ";", 1),
+                new(TokenType.Whitespace, "\\s", 1),
+                new(TokenType.Equals, "=", 1),
+                new(TokenType.OpenParenthesis, "\\(", 1),
+                new(TokenType.StringValue, "\\p{IsBasicLatin}", 2),
+                new(TokenType.Leftover, ".", 3)
+            };
+        }
+
+        public static IEnumerable<Token> Tokenize(string lqlText)
+        {
+            var tokenMatches = FindTokenMatches(lqlText);
+
+            var groupedByIndex = tokenMatches.GroupBy(x => x.StartIndex)
+                .OrderBy(x => x.Key)
+                .ToList();
+
+            TokenMatch? lastMatch = null;
+            foreach (var t in groupedByIndex)
+            {
+                var bestMatch = t.OrderBy(x => x.Precedence).First();
+                if (lastMatch != null && bestMatch.StartIndex < lastMatch.EndIndex)
+                    continue;
+
+                yield return new Token(bestMatch.TokenType, bestMatch.Value);
+
+                lastMatch = bestMatch;
+            }
+
+            yield return new Token(TokenType.SequenceTerminator);
+        }
+
+        private static List<TokenMatch> FindTokenMatches(string lqlText)
+        {
+            var tokenMatches = new List<TokenMatch>();
+
+            foreach (var tokenDefinition in TokenDefinitions)
+                tokenMatches.AddRange(tokenDefinition.FindMatches(lqlText).ToList());
+
+            return tokenMatches;
+        }
+    }
+
+    private class TokenDefinition(TokenType returnsToken, string regexPattern, int precedence)
+    {
+        private readonly Regex Regex = new(regexPattern, RegexOptions.IgnoreCase|RegexOptions.Compiled);
+
+        public IEnumerable<TokenMatch> FindMatches(string inputString)
+        {
+            var matches = Regex.Matches(inputString);
+            for(var i = 0; i < matches.Count; i++)
+            {
+                yield return new TokenMatch
+                {
+                    StartIndex = matches[i].Index,
+                    EndIndex = matches[i].Index + matches[i].Length,
+                    TokenType = returnsToken,
+                    Value = matches[i].Value,
+                    Precedence = precedence
+                };
+            }
+        }
+    }
+
+    private class TokenMatch
+    {
+        public TokenType TokenType { get; set; }
+        public string Value { get; set; }
+        public int StartIndex { get; set; }
+        public int EndIndex { get; set; }
+        public int Precedence { get; set; }
+    }
+}
\ No newline at end of file