Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use DecoderFallback.ExceptionFallback to match Java's CodingErrorAction.REPORT, #1076 #1089

Draft
wants to merge 10 commits into
base: master
Choose a base branch
from
Draft
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,5 @@ websites/apidocs/api/**/*.manifest
svn-*/

# vscode files
.vscode/
.vscode/
.idea/**/misc.xml
7 changes: 7 additions & 0 deletions Directory.Build.targets
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@

</PropertyGroup>

<!-- Features in .NET 8.x and .NET 9.x only -->
<PropertyGroup Condition=" $(TargetFramework.StartsWith('net8.')) Or $(TargetFramework.StartsWith('net9.')) ">

<DefineConstants>$(DefineConstants);FEATURE_UTF8_TOUTF16</DefineConstants>

</PropertyGroup>

<!-- Features in .NET 6.x, .NET 7.x, .NET 8.x, and .NET 9.x only -->
<PropertyGroup Condition=" $(TargetFramework.StartsWith('net6.')) Or $(TargetFramework.StartsWith('net7.')) Or $(TargetFramework.StartsWith('net8.')) Or $(TargetFramework.StartsWith('net9.')) ">

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Lucene version compatibility level 4.8.1
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
Expand Down Expand Up @@ -117,7 +118,7 @@ public void Inform(IResourceLoader loader)
/// </summary>
private SynonymMap LoadSynonyms(IResourceLoader loader, string cname, bool dedup, Analyzer analyzer)
{
Encoding decoder = Encoding.UTF8;
Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();

SynonymMap.Parser parser;
Type clazz = loader.FindType(cname /*, typeof(SynonymMap.Parser) */);
Expand Down Expand Up @@ -165,4 +166,4 @@ private TokenizerFactory LoadTokenizerFactory(IResourceLoader loader, string cna
}
}
}
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Lucene version compatibility level 4.8.1
using Lucene.Net.Analysis.Core;
using Lucene.Net.Support;
using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
Expand Down Expand Up @@ -385,8 +386,9 @@ protected CharArraySet GetSnowballWordSet(IResourceLoader loader, string wordFil
words = new CharArraySet(m_luceneMatchVersion, files.Count * 10, ignoreCase);
foreach (string file in files)
{
Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();
using (Stream stream = loader.OpenResource(file.Trim()))
using (TextReader reader = new StreamReader(stream, Encoding.UTF8))
using (TextReader reader = new StreamReader(stream, decoder))
{
WordlistLoader.GetSnowballWordSet(reader, words);
}
Expand Down
3 changes: 2 additions & 1 deletion src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
Expand Down Expand Up @@ -88,7 +89,7 @@ public virtual void Inform(IResourceLoader loader)
{
encoding = Encoding.UTF8.WebName;
}
Encoding decoder = Encoding.GetEncoding(encoding);
Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
TextReader reader = new StreamReader(stream, decoder);
userDictionary = new UserDictionary(reader);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using J2N.Text;
using Lucene.Net.Diagnostics;
using Lucene.Net.Support.Text;
using System.Globalization;
using System.IO;
using System.Text;
Expand Down Expand Up @@ -31,7 +32,8 @@ public static class ConnectionCostsBuilder // LUCENENET specific: CA1052 Static
public static ConnectionCostsWriter Build(string filename)
{
using Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read);
using StreamReader streamReader = new StreamReader(inputStream, Encoding.ASCII, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement
Encoding decoder = Encoding.ASCII.WithDecoderExceptionFallback();
using StreamReader streamReader = new StreamReader(inputStream, decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement

string line = streamReader.ReadLine();
string[] dimensions = whiteSpaceRegex.Split(line).TrimEnd();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using J2N.Text;
using Lucene.Net.Support;
using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using Lucene.Net.Util.Packed;
Expand Down Expand Up @@ -71,7 +72,7 @@ public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
foreach (string file in csvFiles)
{
using Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read);
Encoding decoder = Encoding.GetEncoding(encoding);
Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
using TextReader reader = new StreamReader(inputStream, decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement

string line = null;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using J2N.Text;
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Support.Text;
using System;
using System.Collections.Generic;
using System.Globalization;
Expand Down Expand Up @@ -55,7 +56,7 @@ public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, strin
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);

JCG.List<string[]> lines = new JCG.List<string[]>();
Encoding decoder = Encoding.GetEncoding(encoding);
Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
using (TextReader reader = new StreamReader(inputStream, decoder))
{
Expand Down
6 changes: 4 additions & 2 deletions src/Lucene.Net.TestFramework/Util/LineFileDocs.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using J2N.Threading.Atomic;
using Lucene.Net.Documents;
using Lucene.Net.Support.IO;
using Lucene.Net.Support.Text;
using Lucene.Net.Support.Threading;
using RandomizedTesting.Generators;
using System;
Expand Down Expand Up @@ -236,7 +237,8 @@ private void Open(Random random)
} while (b >= 0 && b != 13 && b != 10);
}

reader = new StreamReader(@is, Encoding.UTF8, detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE);
Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();
reader = new StreamReader(@is, decoder, detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE);

if (seekTo > 0L)
{
Expand Down Expand Up @@ -399,4 +401,4 @@ internal static string MaybeCreateTempFile(bool removeAfterClass = true)
return result;
}
}
}
}
60 changes: 59 additions & 1 deletion src/Lucene.Net.Tests/Index/TestTerm.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using Lucene.Net.Attributes;
using Lucene.Net.Util;
using NUnit.Framework;
using Assert = Lucene.Net.TestFramework.Assert;

Expand Down Expand Up @@ -39,5 +41,61 @@ public virtual void TestEquals()
Assert.IsFalse(@base.Equals(differentText));
Assert.IsFalse(@base.Equals(differentType));
}

[Test, LuceneNetSpecific]
public void TestToString_ValidUtf8Data()
{
    // Plain ASCII is well-formed UTF-8, so the term should decode to readable text.
    byte[] helloBytes = { 0x48, 0x65, 0x6C, 0x6C, 0x6F }; // "Hello"
    BytesRef term = new BytesRef(helloBytes, 0, helloBytes.Length);

    string actual = Term.ToString(term);

    Assert.AreEqual("Hello", actual);
}

[Test, LuceneNetSpecific]
public void TestToString_InvalidUtf8Data()
{
    // 0xC3 opens a two-byte sequence, but 0x28 is not a continuation byte,
    // so the input is not well-formed UTF-8.
    byte[] malformedBytes = { 0xC3, 0x28 };
    BytesRef term = new BytesRef(malformedBytes, 0, malformedBytes.Length);

    string actual = Term.ToString(term);

    // The raw-bytes rendering is expected here (should match BytesRef.ToString()).
    Assert.AreEqual("[c3 28]", actual);
}

[Test, LuceneNetSpecific]
public void TestToString_Utf8WithBom()
{
    // EF BB BF is the UTF-8 encoding of U+FEFF (the byte order mark), followed by "Hi".
    byte[] bomPrefixedBytes = { 0xEF, 0xBB, 0xBF, 0x48, 0x69 };
    BytesRef term = new BytesRef(bomPrefixedBytes, 0, bomPrefixedBytes.Length);

    string actual = Term.ToString(term);

    // The BOM must survive decoding as a leading U+FEFF character rather than being stripped.
    Assert.AreEqual("\uFEFFHi", actual);
}

[Test, LuceneNetSpecific]
public void TestToString_Utf8WithoutBom()
{
    // Same text as the BOM test, minus the BOM prefix: just the bytes for "Hi".
    byte[] plainBytes = { 0x48, 0x69 };
    BytesRef term = new BytesRef(plainBytes, 0, plainBytes.Length);

    string actual = Term.ToString(term);

    Assert.AreEqual("Hi", actual);
}
}
}
}
2 changes: 1 addition & 1 deletion src/Lucene.Net.Tests/Support/TestApiConsistency.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public override void TestProtectedFieldNames(Type typeFromTargetAssembly)
[TestCase(typeof(Lucene.Net.Analysis.Analyzer))]
public override void TestPrivateFieldNames(Type typeFromTargetAssembly)
{
    // Delegates to the base check exactly once. The second argument is a regex;
    // presumably it lists members exempt from the private-field naming rule
    // (TODO confirm against the base implementation). The trailing
    // "CharStackBufferSize$" alternative matches any member name ending in
    // CharStackBufferSize, such as the private const declared on Term.
    base.TestPrivateFieldNames(typeFromTargetAssembly, @"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)|CharStackBufferSize$");
}

[Test, LuceneNetSpecific]
Expand Down
42 changes: 42 additions & 0 deletions src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
using Lucene.Net.Attributes;
using Lucene.Net.Util;
using NUnit.Framework;
using System.Text;

namespace Lucene.Net.Support.Text
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

[TestFixture]
public class TestEncodingExtensions : LuceneTestCase
{
    /// <summary>
    /// Verifies that <c>WithDecoderExceptionFallback()</c> yields a distinct
    /// <see cref="Encoding"/> instance whose decoder fallback is
    /// <see cref="DecoderFallback.ExceptionFallback"/>, and that decoding
    /// malformed input with it throws instead of substituting characters.
    /// </summary>
    [Test, LuceneNetSpecific]
    public void TestWithDecoderExceptionFallback()
    {
        Encoding original = Encoding.UTF8;
        Encoding strict = original.WithDecoderExceptionFallback();

        // The shared Encoding.UTF8 instance must not be handed back as-is.
        Assert.AreNotSame(original, strict);
        Assert.AreEqual(DecoderFallback.ExceptionFallback, strict.DecoderFallback);

        // 0xF0 is the lead byte of a four-byte sequence; on its own it is incomplete.
        byte[] truncatedSequence = new byte[] { 0xF0 };
        Assert.Throws<DecoderFallbackException>(() =>
        {
            _ = strict.GetString(truncatedSequence);
        });
    }
}
}
52 changes: 49 additions & 3 deletions src/Lucene.Net/Index/Term.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
using J2N.Text;
using Lucene.Net.Support;
using Lucene.Net.Support.Buffers;
using Lucene.Net.Support.Text;
using System;
using System.Buffers;
using System.Text;

namespace Lucene.Net.Index
Expand Down Expand Up @@ -34,6 +37,8 @@ namespace Lucene.Net.Index
/// </summary>
public sealed class Term : IComparable<Term>, IEquatable<Term> // LUCENENET specific - class implements IEquatable<T>
{
private const int CharStackBufferSize = 64;

/// <summary>
/// Constructs a <see cref="Term"/> with the given field and bytes.
/// <para/>Note that a null field or null bytes value results in undefined
Expand Down Expand Up @@ -84,24 +89,65 @@ public Term(string fld)
/// </summary>
public string Text => ToString(Bytes); // LUCENENET: Changed to a property. While this calls a method internally, its expected usage is that it will return a deterministic value.

#nullable enable
/// <summary>
/// Returns human-readable form of the term text. If the term is not unicode,
/// the raw bytes will be printed instead.
/// </summary>
/// <param name="termText">The term bytes to render. Decoded as UTF-8 when they are well-formed.</param>
/// <returns>
/// The decoded text when the bytes are valid UTF-8; otherwise the result of
/// <c>termText.ToString()</c>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="termText"/> is <c>null</c>.</exception>
public static string ToString(BytesRef termText)
{
    if (termText is null)
        throw new ArgumentNullException(nameof(termText)); // LUCENENET: Added guard clause
#if FEATURE_UTF8_TOUTF16
    // View the relevant portion of the byte array
    ReadOnlySpan<byte> utf8Span = new ReadOnlySpan<byte>(termText.Bytes, termText.Offset, termText.Length);

    // Allocate a buffer for the maximum possible UTF-16 output.
    // A UTF-8 sequence never produces more UTF-16 code units than it has bytes
    // (1-3 bytes -> 1 char, 4 bytes -> 2 chars), so the byte count is a safe upper bound.
    int maxChars = utf8Span.Length; // Worst case: 1 byte -> 1 char (ASCII)
    char[]? arrayToReturnToPool = null;

    // Small inputs decode into stack memory; larger ones rent from the shared pool.
    Span<char> charBuffer = maxChars > CharStackBufferSize
        ? (arrayToReturnToPool = ArrayPool<char>.Shared.Rent(maxChars))
        : stackalloc char[CharStackBufferSize];
    try
    {
        // Decode the UTF-8 bytes to UTF-16 chars
        OperationStatus status = System.Text.Unicode.Utf8.ToUtf16(
            utf8Span,
            charBuffer,
            out int bytesConsumed,
            out int charsWritten,
            replaceInvalidSequences: false); // Causes OperationStatus.InvalidData to occur rather than replace

        // NOTE: Any status other than Done (e.g. OperationStatus.InvalidData)
        // is handled below in the fallback path.
        if (status == OperationStatus.Done)
        {
            // Successfully decoded the UTF-8 input
            return charBuffer.Slice(0, charsWritten).ToString();
        }
    }
    finally
    {
        // Return the buffer to the pool
        ArrayPool<char>.Shared.ReturnIfNotNull(arrayToReturnToPool);
    }

    // Fallback to the default string representation if decoding fails
    return termText.ToString();
#else
    // the term might not be text, but usually is. so we make a best effort
    // The exception fallback makes malformed input throw instead of being
    // silently replaced, so non-text terms reach the raw-bytes path below.
    Encoding decoder = StandardCharsets.UTF_8.WithDecoderExceptionFallback();
    try
    {
        return decoder.GetString(termText.Bytes, termText.Offset, termText.Length);
    }
    catch (DecoderFallbackException)
    {
        // Only decoding failures are treated as "not text"; other exceptions propagate.
        return termText.ToString();
    }
#endif
}
#nullable restore

/// <summary>
/// Returns the bytes of this term.
Expand Down
Loading
Loading