From c07cc8738b2100b5839fe03bf93c34920670ea7a Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Fri, 10 Jan 2025 20:50:10 -0700 Subject: [PATCH] Cache GB2312 encoding lookup, #1076 --- .../Analysis/Hunspell/Dictionary.cs | 2 +- .../Analysis/Hunspell/ISO8859_14Decoder.cs | 7 ++++++- .../Hhmm/AbstractDictionary.cs | 11 +++++++---- .../Hhmm/BigramDictionary.cs | 2 +- .../Hhmm/WordDictionary.cs | 2 +- 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs index ceeb7eb532..b790df5e67 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs @@ -746,7 +746,7 @@ private static Encoding GetSystemEncoding(string encoding) // LUCENENET: CA1822: } if ("ISO8859-14".Equals(encoding, StringComparison.OrdinalIgnoreCase)) { - return new ISO8859_14Encoding(); + return ISO8859_14Encoding.Default; } // .NET doesn't recognize the encoding without a dash between ISO and the number // https://msdn.microsoft.com/en-us/library/system.text.encodinginfo.getencoding(v=vs.110).aspx diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs index 6078954049..7b7eb59c1f 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs @@ -28,6 +28,11 @@ namespace Lucene.Net.Analysis.Hunspell [ExceptionToClassNameConvention] internal sealed class ISO8859_14Encoding : Encoding { + /// + /// The default singleton instance of the class. + /// + public static new ISO8859_14Encoding Default { get; } = new ISO8859_14Encoding(); + private static readonly Decoder decoder = new ISO8859_14Decoder(); public override Decoder GetDecoder() { @@ -119,4 +124,4 @@ public override int GetChars(byte[] bytesIn, int byteIndex, int byteCount, char[ return writeCount; } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs index 5e2139018e..1d5da6d3af 100644 --- a/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs +++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs @@ -32,6 +32,9 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm /// internal abstract class AbstractDictionary { + // LUCENENET specific: cached GB2312 encoding to avoid repeated calls to Encoding.GetEncoding("GB2312") + protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312"); + /// /// First Chinese Character in GB2312 (15 * 94) /// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation. @@ -39,7 +42,7 @@ internal abstract class AbstractDictionary public const int GB2312_FIRST_CHAR = 1410; /// - /// Last Chinese Character in GB2312 (87 * 94). + /// Last Chinese Character in GB2312 (87 * 94). /// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned. /// public const int GB2312_CHAR_NUM = 87 * 94; @@ -98,7 +101,7 @@ public virtual string GetCCByGB2312Id(int ccid) try { //String cchar = new String(buffer, "GB2312"); - string cchar = Encoding.GetEncoding("GB2312").GetString(buffer); + string cchar = gb2312Encoding.GetString(buffer); // LUCENENET specific: use cached encoding instance return cchar; } catch (Exception e) when (e.IsUnsupportedEncodingException()) // Encoding is not supported by the platform @@ -117,7 +120,7 @@ public virtual short GetGB2312Id(char ch) try { //byte[] buffer = Character.ToString(ch).getBytes("GB2312"); - byte[] buffer = Encoding.GetEncoding("GB2312").GetBytes(ch.ToString()); + byte[] buffer = gb2312Encoding.GetBytes(ch.ToString()); // LUCENENET specific: use cached encoding instance //byte[] buffer = Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString()); if (buffer.Length != 2) { @@ -125,7 +128,7 @@ public virtual short GetGB2312Id(char ch) return -1; } int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161 - int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol. + int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol. // Therefore, each code page only has 16*6-2=94 characters. return (short)(b0 * 94 + b1); } diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs index b9d16273ae..da712cb0de 100644 --- a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs +++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs @@ -302,7 +302,7 @@ public virtual void LoadFromFile(string dctFilePath) byte[] lchBuffer = new byte[length]; dctFile.Read(lchBuffer, 0, lchBuffer.Length); //tmpword = new String(lchBuffer, "GB2312"); - tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer); + tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer); if (i != 3755 + GB2312_FIRST_CHAR) { diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs index b8cd7cbbfa..b6e42be522 100644 --- a/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs +++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs @@ -395,7 +395,7 @@ private int LoadMainDataFromFile(string dctFilePath) { byte[] lchBuffer = new byte[length]; dctFile.Read(lchBuffer, 0, lchBuffer.Length); - tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer); + tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class wordItem_charArrayTable[i][j] = tmpword.ToCharArray(); } else