Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Use DecoderFallback.ExceptionFallback to match Java's CodingErrorAction.REPORT, #1076 #1089

Draft
wants to merge 10 commits into
base: master
Choose a base branch
from
Draft
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,5 @@ websites/apidocs/api/**/*.manifest
svn-*/

# vscode files
.vscode/
.vscode/
.idea/**/misc.xml
7 changes: 7 additions & 0 deletions Directory.Build.targets
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@

</PropertyGroup>

<!-- Features in .NET 8.x and .NET 9.x only -->
<PropertyGroup Condition=" $(TargetFramework.StartsWith('net8.')) Or $(TargetFramework.StartsWith('net9.')) ">

<DefineConstants>$(DefineConstants);FEATURE_UTF8_TOUTF16</DefineConstants>

</PropertyGroup>

<!-- Features in .NET 6.x, .NET 7.x, .NET 8.x, and .NET 9.x only -->
<PropertyGroup Condition=" $(TargetFramework.StartsWith('net6.')) Or $(TargetFramework.StartsWith('net7.')) Or $(TargetFramework.StartsWith('net8.')) Or $(TargetFramework.StartsWith('net9.')) ">

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -746,7 +746,7 @@ private static Encoding GetSystemEncoding(string encoding) // LUCENENET: CA1822:
}
if ("ISO8859-14".Equals(encoding, StringComparison.OrdinalIgnoreCase))
{
return new ISO8859_14Encoding();
return ISO8859_14Encoding.Default;
}
// .NET doesn't recognize the encoding without a dash between ISO and the number
// https://msdn.microsoft.com/en-us/library/system.text.encodinginfo.getencoding(v=vs.110).aspx
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ namespace Lucene.Net.Analysis.Hunspell
[ExceptionToClassNameConvention]
internal sealed class ISO8859_14Encoding : Encoding
{
/// <summary>
/// The default singleton instance of the <see cref="ISO8859_14Encoding"/> class.
/// </summary>
public static new ISO8859_14Encoding Default { get; } = new ISO8859_14Encoding();

private static readonly Decoder decoder = new ISO8859_14Decoder();
public override Decoder GetDecoder()
{
Expand Down Expand Up @@ -119,4 +124,4 @@ public override int GetChars(byte[] bytesIn, int byteIndex, int byteCount, char[
return writeCount;
}
}
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Lucene version compatibility level 4.8.1
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
Expand Down Expand Up @@ -117,7 +118,7 @@ public void Inform(IResourceLoader loader)
/// </summary>
private SynonymMap LoadSynonyms(IResourceLoader loader, string cname, bool dedup, Analyzer analyzer)
{
Encoding decoder = Encoding.UTF8;
Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();

SynonymMap.Parser parser;
Type clazz = loader.FindType(cname /*, typeof(SynonymMap.Parser) */);
Expand Down Expand Up @@ -165,4 +166,4 @@ private TokenizerFactory LoadTokenizerFactory(IResourceLoader loader, string cna
}
}
}
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Lucene version compatibility level 4.8.1
using Lucene.Net.Analysis.Core;
using Lucene.Net.Support;
using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
Expand Down Expand Up @@ -385,8 +386,9 @@ protected CharArraySet GetSnowballWordSet(IResourceLoader loader, string wordFil
words = new CharArraySet(m_luceneMatchVersion, files.Count * 10, ignoreCase);
foreach (string file in files)
{
Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();
using (Stream stream = loader.OpenResource(file.Trim()))
using (TextReader reader = new StreamReader(stream, Encoding.UTF8))
using (TextReader reader = new StreamReader(stream, decoder))
{
WordlistLoader.GetSnowballWordSet(reader, words);
}
Expand Down
3 changes: 2 additions & 1 deletion src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
Expand Down Expand Up @@ -88,7 +89,7 @@ public virtual void Inform(IResourceLoader loader)
{
encoding = Encoding.UTF8.WebName;
}
Encoding decoder = Encoding.GetEncoding(encoding);
Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
TextReader reader = new StreamReader(stream, decoder);
userDictionary = new UserDictionary(reader);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using J2N.Text;
using Lucene.Net.Diagnostics;
using Lucene.Net.Support.Text;
using System.Globalization;
using System.IO;
using System.Text;
Expand Down Expand Up @@ -31,7 +32,8 @@ public static class ConnectionCostsBuilder // LUCENENET specific: CA1052 Static
public static ConnectionCostsWriter Build(string filename)
{
using Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read);
using StreamReader streamReader = new StreamReader(inputStream, Encoding.ASCII, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement
Encoding decoder = Encoding.ASCII.WithDecoderExceptionFallback();
using StreamReader streamReader = new StreamReader(inputStream, decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement

string line = streamReader.ReadLine();
string[] dimensions = whiteSpaceRegex.Split(line).TrimEnd();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using J2N.Text;
using Lucene.Net.Support;
using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using Lucene.Net.Util.Packed;
Expand Down Expand Up @@ -71,7 +72,7 @@ public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
foreach (string file in csvFiles)
{
using Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read);
Encoding decoder = Encoding.GetEncoding(encoding);
Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
using TextReader reader = new StreamReader(inputStream, decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement

string line = null;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using J2N.Text;
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Support.Text;
using System;
using System.Collections.Generic;
using System.Globalization;
Expand Down Expand Up @@ -55,7 +56,7 @@ public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, strin
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);

JCG.List<string[]> lines = new JCG.List<string[]>();
Encoding decoder = Encoding.GetEncoding(encoding);
Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
using (TextReader reader = new StreamReader(inputStream, decoder))
{
Expand Down
11 changes: 7 additions & 4 deletions src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,17 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
/// </summary>
internal abstract class AbstractDictionary
{
// LUCENENET specific: cached GB2312 encoding to avoid repeated calls to Encoding.GetEncoding("GB2312")
protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312");

/// <summary>
/// First Chinese Character in GB2312 (15 * 94)
/// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation.
/// </summary>
public const int GB2312_FIRST_CHAR = 1410;

/// <summary>
/// Last Chinese Character in GB2312 (87 * 94).
/// Last Chinese Character in GB2312 (87 * 94).
/// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned.
/// </summary>
public const int GB2312_CHAR_NUM = 87 * 94;
Expand Down Expand Up @@ -98,7 +101,7 @@ public virtual string GetCCByGB2312Id(int ccid)
try
{
//String cchar = new String(buffer, "GB2312");
string cchar = Encoding.GetEncoding("GB2312").GetString(buffer);
string cchar = gb2312Encoding.GetString(buffer); // LUCENENET specific: use cached encoding instance
return cchar;
}
catch (Exception e) when (e.IsUnsupportedEncodingException()) // Encoding is not supported by the platform
Expand All @@ -117,15 +120,15 @@ public virtual short GetGB2312Id(char ch)
try
{
//byte[] buffer = Character.ToString(ch).getBytes("GB2312");
byte[] buffer = Encoding.GetEncoding("GB2312").GetBytes(ch.ToString());
byte[] buffer = gb2312Encoding.GetBytes(ch.ToString()); // LUCENENET specific: use cached encoding instance
//byte[] buffer = Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString());
if (buffer.Length != 2)
{
// Should be a two-byte character
return -1;
}
int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161
int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol.
int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol.
// Therefore, each code page only has 16*6-2=94 characters.
return (short)(b0 * 94 + b1);
}
Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ public virtual void LoadFromFile(string dctFilePath)
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
//tmpword = new String(lchBuffer, "GB2312");
tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer);
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
//tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
if (i != 3755 + GB2312_FIRST_CHAR)
{
Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ private int LoadMainDataFromFile(string dctFilePath)
{
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer);
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
}
else
Expand Down
6 changes: 3 additions & 3 deletions src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@ namespace Lucene.Net.Benchmarks.ByTask.Feeds
/// Base class for source of data for benchmarking.
/// </summary>
/// <remarks>
/// Keeps track of various statistics, such as how many data items were generated,
/// Keeps track of various statistics, such as how many data items were generated,
/// size in bytes etc.
/// <para/>
/// Supports the following configuration parameters:
/// <list type="bullet">
/// <item><term>content.source.forever</term><description>specifies whether to generate items forever (<b>default=true</b>).</description></item>
/// <item><term>content.source.verbose</term><description>specifies whether messages should be output by the content source (<b>default=false</b>).</description></item>
/// <item><term>content.source.encoding</term><description>
/// specifies which encoding to use when
/// specifies which encoding to use when
/// reading the files of that content source. Certain implementations may define
/// a default value if this parameter is not specified. (<b>default=null</b>).
/// </description></item>
Expand Down Expand Up @@ -199,7 +199,7 @@ public virtual void SetConfig(Config config)
}
else
{
m_encoding = Encoding.GetEncoding(0); // Default system encoding
m_encoding = Encoding.Default; // Default system encoding
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ public static IndexWriter ConfigureWriter(Config config, PerfRunData runData, Op
else
{
FileInfo f = new FileInfo(infoStreamVal);
iwc.SetInfoStream(new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.GetEncoding(0)));
iwc.SetInfoStream(new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.Default));
}
}
IndexWriter writer = new IndexWriter(runData.Directory, iwc);
Expand Down
9 changes: 6 additions & 3 deletions src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,13 @@ public static void Main(string[] args)
string fieldSpec = args.Length == 5 ? args[4] : "T"; // default to Title-only if not specified.
IndexSearcher searcher = new IndexSearcher(reader);

int maxResults = 1000;
string docNameField = "docname";
const int maxResults = 1000;
const string docNameField = "docname";

TextWriter logger = Console.Out; //new StreamWriter(Console, Encoding.GetEncoding(0));
using TextWriter logger = new StreamWriter(System.Console.OpenStandardOutput(), Encoding.Default)
{
AutoFlush = true,
};

// use trec utilities to read trec topics into quality queries
TrecTopicsReader qReader = new TrecTopicsReader();
Expand Down
6 changes: 4 additions & 2 deletions src/Lucene.Net.TestFramework/Util/LineFileDocs.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using J2N.Threading.Atomic;
using Lucene.Net.Documents;
using Lucene.Net.Support.IO;
using Lucene.Net.Support.Text;
using Lucene.Net.Support.Threading;
using RandomizedTesting.Generators;
using System;
Expand Down Expand Up @@ -236,7 +237,8 @@ private void Open(Random random)
} while (b >= 0 && b != 13 && b != 10);
}

reader = new StreamReader(@is, Encoding.UTF8, detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE);
Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();
reader = new StreamReader(@is, decoder, detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE);

if (seekTo > 0L)
{
Expand Down Expand Up @@ -399,4 +401,4 @@ internal static string MaybeCreateTempFile(bool removeAfterClass = true)
return result;
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
using System.Reflection;
using System.Resources;
using System.Security;
using System.Text;
using Assert = Lucene.Net.TestFramework.Assert;

namespace Lucene.Net.Support.ExceptionHandling
Expand Down Expand Up @@ -184,6 +185,8 @@ private static IEnumerable<Type> LoadKnownErrorExceptionTypes()
typeof(UnauthorizedAccessException),
typeof(ObjectDisposedException),
typeof(Lucene.AlreadyClosedException),
typeof(EncoderFallbackException), // In Java, CharacterCodingException subclasses IOException
typeof(DecoderFallbackException),
}.Union(AllIOExceptionTypes)
// .NET Framework only - Subclasses UnauthorizedAccessException
.Union(new[] { PrivilegeNotHeldExceptionType });
Expand Down Expand Up @@ -221,8 +224,6 @@ private static IEnumerable<Type> LoadKnownErrorExceptionTypes()
// Subclasses
typeof(System.DuplicateWaitObjectException),
typeof(System.Globalization.CultureNotFoundException),
typeof(System.Text.DecoderFallbackException),
typeof(System.Text.EncoderFallbackException),
};

public static readonly IEnumerable<Type> KnownIllegalArgumentExceptionTypes_TestEnvironment = new Type[] {
Expand All @@ -234,8 +235,6 @@ private static IEnumerable<Type> LoadKnownErrorExceptionTypes()
// Subclasses
typeof(System.DuplicateWaitObjectException),
typeof(System.Globalization.CultureNotFoundException),
typeof(System.Text.DecoderFallbackException),
typeof(System.Text.EncoderFallbackException),
};

public static readonly IEnumerable<Type> KnownRuntimeExceptionTypes = LoadKnownRuntimeExceptionTypes();
Expand Down Expand Up @@ -367,8 +366,6 @@ private static IEnumerable<Type> LoadKnownRuntimeExceptionTypes()
typeof(System.Runtime.Serialization.SerializationException),
typeof(System.Security.Cryptography.CryptographicException),
typeof(System.Security.VerificationException),
typeof(System.Text.DecoderFallbackException), // LUCENENET TODO: Need to be sure about this one
typeof(System.Text.EncoderFallbackException), // LUCENENET TODO: Need to be sure about this one
typeof(System.Threading.AbandonedMutexException),
typeof(System.Threading.SemaphoreFullException),
typeof(System.Threading.SynchronizationLockException),
Expand Down
Loading
Loading