Skip to content

Commit

Permalink
Cache GB2312 encoding lookup, #1076
Browse files: browse the repository at this point in the history
  • Loading branch information
paulirwin committed Jan 11, 2025
1 parent 66f3e3c commit c07cc87
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -746,7 +746,7 @@ private static Encoding GetSystemEncoding(string encoding) // LUCENENET: CA1822:
}
if ("ISO8859-14".Equals(encoding, StringComparison.OrdinalIgnoreCase))
{
return new ISO8859_14Encoding();
return ISO8859_14Encoding.Default;
}
// .NET doesn't recognize the encoding without a dash between ISO and the number
// https://msdn.microsoft.com/en-us/library/system.text.encodinginfo.getencoding(v=vs.110).aspx
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ namespace Lucene.Net.Analysis.Hunspell
[ExceptionToClassNameConvention]
internal sealed class ISO8859_14Encoding : Encoding
{
/// <summary>
/// The default singleton instance of the <see cref="ISO8859_14Encoding"/> class.
/// </summary>
public static new ISO8859_14Encoding Default { get; } = new ISO8859_14Encoding();

private static readonly Decoder decoder = new ISO8859_14Decoder();
public override Decoder GetDecoder()
{
Expand Down Expand Up @@ -119,4 +124,4 @@ public override int GetChars(byte[] bytesIn, int byteIndex, int byteCount, char[
return writeCount;
}
}
}
}
11 changes: 7 additions & 4 deletions src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,17 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
/// </summary>
internal abstract class AbstractDictionary
{
// LUCENENET specific: cached GB2312 encoding to avoid repeated calls to Encoding.GetEncoding("GB2312")
protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312");

/// <summary>
/// First Chinese Character in GB2312 (15 * 94)
/// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation.
/// </summary>
public const int GB2312_FIRST_CHAR = 1410;

/// <summary>
/// Last Chinese Character in GB2312 (87 * 94).
/// Last Chinese Character in GB2312 (87 * 94).
/// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned.
/// </summary>
public const int GB2312_CHAR_NUM = 87 * 94;
Expand Down Expand Up @@ -98,7 +101,7 @@ public virtual string GetCCByGB2312Id(int ccid)
try
{
//String cchar = new String(buffer, "GB2312");
string cchar = Encoding.GetEncoding("GB2312").GetString(buffer);
string cchar = gb2312Encoding.GetString(buffer); // LUCENENET specific: use cached encoding instance
return cchar;
}
catch (Exception e) when (e.IsUnsupportedEncodingException()) // Encoding is not supported by the platform
Expand All @@ -117,15 +120,15 @@ public virtual short GetGB2312Id(char ch)
try
{
//byte[] buffer = Character.ToString(ch).getBytes("GB2312");
byte[] buffer = Encoding.GetEncoding("GB2312").GetBytes(ch.ToString());
byte[] buffer = gb2312Encoding.GetBytes(ch.ToString()); // LUCENENET specific: use cached encoding instance
//byte[] buffer = Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString());
if (buffer.Length != 2)
{
// Should be a two-byte character
return -1;
}
int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161
int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol.
int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol.
// Therefore, each code page only has 16*6-2=94 characters.
return (short)(b0 * 94 + b1);
}
Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ public virtual void LoadFromFile(string dctFilePath)
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
//tmpword = new String(lchBuffer, "GB2312");
tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer);
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
//tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
if (i != 3755 + GB2312_FIRST_CHAR)
{
Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ private int LoadMainDataFromFile(string dctFilePath)
{
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer);
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
}
else
Expand Down

0 comments on commit c07cc87

Please sign in to comment.