apache · paulirwin · Jan 6, 2025 · Jan 6, 2025 · Jan 6, 2025 · Jan 8, 2025
diff --git a/.gitignore b/.gitignore
@@ -65,4 +65,5 @@ websites/apidocs/api/**/*.manifest
 svn-*/
 
 # vscode files
-.vscode/
+.vscode/
+.idea/**/misc.xml
diff --git a/Directory.Build.targets b/Directory.Build.targets
@@ -37,6 +37,13 @@
 
   </PropertyGroup>
 
+  <!-- Features in .NET 8.x and .NET 9.x only -->
+  <PropertyGroup Condition=" $(TargetFramework.StartsWith('net8.')) Or $(TargetFramework.StartsWith('net9.')) ">
+
+    <DefineConstants>$(DefineConstants);FEATURE_UTF8_TOUTF16</DefineConstants>
+
+  </PropertyGroup>
+
   <!-- Features in .NET 6.x, .NET 7.x, .NET 8.x, and .NET 9.x only -->
   <PropertyGroup Condition=" $(TargetFramework.StartsWith('net6.')) Or $(TargetFramework.StartsWith('net7.')) Or $(TargetFramework.StartsWith('net8.')) Or $(TargetFramework.StartsWith('net9.')) ">
 

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs
@@ -1,6 +1,7 @@
 // Lucene version compatibility level 4.8.1
 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support.Text;
 using Lucene.Net.Util;
 using System;
 using System.Collections.Generic;
@@ -117,7 +118,7 @@ public void Inform(IResourceLoader loader)
         /// </summary>
         private SynonymMap LoadSynonyms(IResourceLoader loader, string cname, bool dedup, Analyzer analyzer)
         {
-            Encoding decoder = Encoding.UTF8;
+            Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();
 
             SynonymMap.Parser parser;
             Type clazz = loader.FindType(cname /*, typeof(SynonymMap.Parser) */);
@@ -165,4 +166,4 @@ private TokenizerFactory LoadTokenizerFactory(IResourceLoader loader, string cna
             }
         }
     }
-}
+}
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs
@@ -1,6 +1,7 @@
 // Lucene version compatibility level 4.8.1
 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Support;
+using Lucene.Net.Support.Text;
 using Lucene.Net.Util;
 using System;
 using System.Collections.Generic;
@@ -385,8 +386,9 @@ protected CharArraySet GetSnowballWordSet(IResourceLoader loader, string wordFil
                 words = new CharArraySet(m_luceneMatchVersion, files.Count * 10, ignoreCase);
                 foreach (string file in files)
                 {
+                    Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();
                     using (Stream stream = loader.OpenResource(file.Trim()))
-                    using (TextReader reader = new StreamReader(stream, Encoding.UTF8))
+                    using (TextReader reader = new StreamReader(stream, decoder))
                     {
                         WordlistLoader.GetSnowballWordSet(reader, words);
                     }

diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
@@ -1,5 +1,6 @@
 using Lucene.Net.Analysis.Ja.Dict;
 using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support.Text;
 using Lucene.Net.Util;
 using System;
 using System.Collections.Generic;
@@ -88,7 +89,7 @@ public virtual void Inform(IResourceLoader loader)
                 {
                     encoding = Encoding.UTF8.WebName;
                 }
-                Encoding decoder = Encoding.GetEncoding(encoding);
+                Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
                 TextReader reader = new StreamReader(stream, decoder);
                 userDictionary = new UserDictionary(reader);
             }

diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs
@@ -1,5 +1,6 @@
 using J2N.Text;
 using Lucene.Net.Diagnostics;
+using Lucene.Net.Support.Text;
 using System.Globalization;
 using System.IO;
 using System.Text;
@@ -31,7 +32,8 @@ public static class ConnectionCostsBuilder // LUCENENET specific: CA1052 Static
         public static ConnectionCostsWriter Build(string filename)
         {
             using Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read);
-            using StreamReader streamReader = new StreamReader(inputStream, Encoding.ASCII, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement
+            Encoding decoder = Encoding.ASCII.WithDecoderExceptionFallback();
+            using StreamReader streamReader = new StreamReader(inputStream, decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement
 
             string line = streamReader.ReadLine();
             string[] dimensions = whiteSpaceRegex.Split(line).TrimEnd();

diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs b/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs
@@ -1,5 +1,6 @@
 using J2N.Text;
 using Lucene.Net.Support;
+using Lucene.Net.Support.Text;
 using Lucene.Net.Util;
 using Lucene.Net.Util.Fst;
 using Lucene.Net.Util.Packed;
@@ -71,7 +72,7 @@ public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
             foreach (string file in csvFiles)
             {
                 using Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read);
-                Encoding decoder = Encoding.GetEncoding(encoding);
+                Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
                 using TextReader reader = new StreamReader(inputStream, decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement
 
                 string line = null;

diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs b/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs
@@ -1,5 +1,6 @@
 using J2N.Text;
 using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Support.Text;
 using System;
 using System.Collections.Generic;
 using System.Globalization;
@@ -55,7 +56,7 @@ public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, strin
             UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
 
             JCG.List<string[]> lines = new JCG.List<string[]>();
-            Encoding decoder = Encoding.GetEncoding(encoding);
+            Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
             using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
             using (TextReader reader = new StreamReader(inputStream, decoder))
             {

diff --git a/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs b/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs
@@ -2,6 +2,7 @@
 using J2N.Threading.Atomic;
 using Lucene.Net.Documents;
 using Lucene.Net.Support.IO;
+using Lucene.Net.Support.Text;
 using Lucene.Net.Support.Threading;
 using RandomizedTesting.Generators;
 using System;
@@ -236,7 +237,8 @@ private void Open(Random random)
                     } while (b >= 0 && b != 13 && b != 10);
                 }
 
-                reader = new StreamReader(@is, Encoding.UTF8, detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE);
+                Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();
+                reader = new StreamReader(@is, decoder, detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE);
 
                 if (seekTo > 0L)
                 {
@@ -399,4 +401,4 @@ internal static string MaybeCreateTempFile(bool removeAfterClass = true)
             return result;
         }
     }
-}
+}
diff --git a/src/Lucene.Net.Tests/Index/TestTerm.cs b/src/Lucene.Net.Tests/Index/TestTerm.cs
@@ -1,3 +1,5 @@
+using Lucene.Net.Attributes;
+using Lucene.Net.Util;
 using NUnit.Framework;
 using Assert = Lucene.Net.TestFramework.Assert;
 
@@ -39,5 +41,61 @@ public virtual void TestEquals()
             Assert.IsFalse(@base.Equals(differentText));
             Assert.IsFalse(@base.Equals(differentType));
         }
+
+        [Test, LuceneNetSpecific]
+        public void TestToString_ValidUtf8Data()
+        {
+            // Plain ASCII is well-formed UTF-8; these bytes spell "Hello".
+            byte[] helloBytes = { 0x48, 0x65, 0x6C, 0x6C, 0x6F };
+            BytesRef term = new BytesRef(helloBytes, 0, helloBytes.Length);
+
+            // Decode through the static helper under test.
+            string text = Term.ToString(term);
+
+            // Well-formed input must come back as the decoded string.
+            Assert.AreEqual("Hello", text);
+        }
+
+        [Test, LuceneNetSpecific]
+        public void TestToString_InvalidUtf8Data()
+        {
+            // 0xC3 opens a two-byte sequence, but 0x28 is not a continuation byte.
+            byte[] malformed = { 0xC3, 0x28 };
+            BytesRef term = new BytesRef(malformed, 0, malformed.Length);
+
+            // Decode through the static helper under test.
+            string text = Term.ToString(term);
+
+            // Undecodable input should be rendered the way BytesRef.ToString() renders it.
+            Assert.AreEqual("[c3 28]", text);
+        }
+
+        [Test, LuceneNetSpecific]
+        public void TestToString_Utf8WithBom()
+        {
+            // EF BB BF is the UTF-8 byte order mark; it is followed by "Hi".
+            byte[] bomThenHi = { 0xEF, 0xBB, 0xBF, 0x48, 0x69 };
+            BytesRef term = new BytesRef(bomThenHi, 0, bomThenHi.Length);
+
+            // Decode through the static helper under test.
+            string text = Term.ToString(term);
+
+            // The BOM is not stripped: it decodes to U+FEFF at the start of the string.
+            Assert.AreEqual("\uFEFFHi", text);
+        }
+
+        [Test, LuceneNetSpecific]
+        public void TestToString_Utf8WithoutBom()
+        {
+            // Two ASCII bytes spelling "Hi", with no byte order mark in front.
+            byte[] hiBytes = { 0x48, 0x69 };
+            BytesRef term = new BytesRef(hiBytes, 0, hiBytes.Length);
+
+            // Decode through the static helper under test.
+            string text = Term.ToString(term);
+
+            // Without a BOM the result is exactly the decoded text.
+            Assert.AreEqual("Hi", text);
+        }
     }
-}
+}
diff --git a/src/Lucene.Net.Tests/Support/TestApiConsistency.cs b/src/Lucene.Net.Tests/Support/TestApiConsistency.cs
@@ -38,7 +38,7 @@ public override void TestProtectedFieldNames(Type typeFromTargetAssembly)
         [TestCase(typeof(Lucene.Net.Analysis.Analyzer))]
         public override void TestPrivateFieldNames(Type typeFromTargetAssembly)
         {
-            base.TestPrivateFieldNames(typeFromTargetAssembly, @"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)");
+            base.TestPrivateFieldNames(typeFromTargetAssembly, @"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)|CharStackBufferSize$");
         }
 
         [Test, LuceneNetSpecific]

diff --git a/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs b/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs
@@ -0,0 +1,42 @@
+using Lucene.Net.Attributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System.Text;
+
+namespace Lucene.Net.Support.Text
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    [TestFixture]
+    public class TestEncodingExtensions : LuceneTestCase
+    {
+        [Test, LuceneNetSpecific]
+        public void TestWithDecoderExceptionFallback()
+        {
+            // 0xF0 opens a four-byte UTF-8 sequence that is never completed here.
+            byte[] truncated = { 0xF0 };
+            Encoding original = Encoding.UTF8;
+            Encoding strict = original.WithDecoderExceptionFallback();
+
+            // A distinct instance is returned, configured with the exception fallback.
+            Assert.AreNotSame(original, strict);
+            Assert.AreEqual(DecoderFallback.ExceptionFallback, strict.DecoderFallback);
+            Assert.Throws<DecoderFallbackException>(() => { _ = strict.GetString(truncated); });
+        }
+    }
+}
diff --git a/src/Lucene.Net/Index/Term.cs b/src/Lucene.Net/Index/Term.cs
@@ -1,6 +1,9 @@
 using J2N.Text;
 using Lucene.Net.Support;
+using Lucene.Net.Support.Buffers;
+using Lucene.Net.Support.Text;
 using System;
+using System.Buffers;
 using System.Text;
 
 namespace Lucene.Net.Index
@@ -34,6 +37,8 @@ namespace Lucene.Net.Index
     /// </summary>
     public sealed class Term : IComparable<Term>, IEquatable<Term> // LUCENENET specific - class implements IEquatable<T>
     {
+        private const int CharStackBufferSize = 64;
+
         /// <summary>
         /// Constructs a <see cref="Term"/> with the given field and bytes.
         /// <para/>Note that a null field or null bytes value results in undefined
@@ -84,24 +89,65 @@ public Term(string fld)
         /// </summary>
         public string Text => ToString(Bytes); // LUCENENET: Changed to a property. While this calls a method internally, its expected usage is that it will return a deterministic value.
 
+#nullable enable
         /// <summary>
         /// Returns human-readable form of the term text. If the term is not unicode,
         /// the raw bytes will be printed instead.
         /// </summary>
         public static string ToString(BytesRef termText)
         {
+            if (termText is null)
+                throw new ArgumentNullException(nameof(termText)); // LUCENENET: Added guard clause
+#if FEATURE_UTF8_TOUTF16
+            // View only the [Offset, Offset + Length) slice of the backing byte array
+            ReadOnlySpan<byte> utf8Span = new ReadOnlySpan<byte>(termText.Bytes, termText.Offset, termText.Length);
+
+            // Size the output for the worst case: UTF-8 never decodes to more UTF-16 chars than it has bytes
+            int maxChars = utf8Span.Length; // Worst case: 1 byte -> 1 char (ASCII)
+            char[]? arrayToReturnToPool = null; // stays null when the stack buffer below is used
+
+            Span<char> charBuffer = maxChars > CharStackBufferSize
+                ? (arrayToReturnToPool = ArrayPool<char>.Shared.Rent(maxChars))
+                : stackalloc char[CharStackBufferSize]; // input fits within CharStackBufferSize chars: no pool rental
+            try
+            {
+                // Decode the UTF-8 bytes to UTF-16 chars in a single pass
+                OperationStatus status = System.Text.Unicode.Utf8.ToUtf16(
+                    utf8Span,
+                    charBuffer,
+                    out int bytesConsumed,
+                    out int charsWritten,
+                    replaceInvalidSequences: false); // Causes OperationStatus.InvalidData to occur rather than replace
+
+                // NOTE: Any status other than Done (e.g. InvalidData) falls through to the fallback path below.
+                if (status == OperationStatus.Done)
+                {
+                    // Successfully decoded the UTF-8 input; copy only the chars actually written
+                    return charBuffer.Slice(0, charsWritten).ToString();
+                }
+            }
+            finally
+            {
+                // Hand the rented array (if any) back to the pool, on both the success and fallback paths
+                ArrayPool<char>.Shared.ReturnIfNotNull(arrayToReturnToPool);
+            }
+
+            // Fall back to BytesRef.ToString() when the bytes did not decode as UTF-8
+            return termText.ToString();
+#else
             // the term might not be text, but usually is. so we make a best effort
-            // LUCENENET TODO: determine if we should use DecoderFallback.ExceptionFallback here
-            Encoding decoder = StandardCharsets.UTF_8;
+            Encoding decoder = StandardCharsets.UTF_8.WithDecoderExceptionFallback(); // malformed input throws DecoderFallbackException
             try
             {
                 return decoder.GetString(termText.Bytes, termText.Offset, termText.Length);
             }
-            catch
+            catch (DecoderFallbackException) // bytes did not decode as UTF-8: use BytesRef.ToString() instead
             {
                 return termText.ToString();
             }
+#endif
         }
+#nullable restore
 
         /// <summary>
         /// Returns the bytes of this term.