diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift
index c85c2b3d1..be288491d 100644
--- a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift
+++ b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift
@@ -44,6 +44,9 @@ extension AST {
 
       // Swift-only default possessive quantifier
       case possessiveByDefault // t.b.d.
+
+      // NSRegularExpression compatibility: special-cases how `.` is compiled
+      case nsreCompatibleDot // no AST representation
     }
 
     public var kind: Kind
diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift
index ea541fba7..d2f7c622d 100644
--- a/Sources/_RegexParser/Regex/Parse/Sema.swift
+++ b/Sources/_RegexParser/Regex/Parse/Sema.swift
@@ -142,7 +142,8 @@ extension RegexValidator {
 
     case .caseInsensitive, .possessiveByDefault, .reluctantByDefault,
         .singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended,
-        .asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps:
+        .asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps,
+        .nsreCompatibleDot:
       break
     }
   }
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
index cb2e9ed04..8f2a52a3c 100644
--- a/Sources/_StringProcessing/ByteCodeGen.swift
+++ b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -67,7 +67,7 @@ fileprivate extension Compiler.ByteCodeGen {
       emitAnyNonNewline()
 
     case .dot:
-      emitDot()
+      try emitDot()
 
     case let .char(c):
       emitCharacter(c)
@@ -238,9 +238,15 @@ fileprivate extension Compiler.ByteCodeGen {
     }
   }
 
-  mutating func emitDot() {
+  mutating func emitDot() throws { // throws because emitAlternation is a `try` call
     if options.dotMatchesNewline {
-      emitAny()
+      if options.usesNSRECompatibleDot { // NSRE mode: `.` may take a whole newline sequence
+        try emitAlternation([
+          .atom(.characterClass(.newlineSequence)), // first alternative: newline sequence
+          .atom(.anyNonNewline)])                   // otherwise: any non-newline
+      } else {
+        emitAny()
+      }
     } else {
       emitAnyNonNewline()
     }
@@ -964,7 +970,7 @@ fileprivate extension Compiler.ByteCodeGen {
     case let .customCharacterClass(ccc):
       if ccc.containsDot {
         if !ccc.isInverted {
-          emitDot()
+          try emitDot()
         } else {
           throw Unsupported("Inverted any")
         }
diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift
index d511c9f7c..94e5769b9 100644
--- a/Sources/_StringProcessing/MatchingOptions.swift
+++ b/Sources/_StringProcessing/MatchingOptions.swift
@@ -120,6 +120,10 @@ extension MatchingOptions {
       ? .graphemeCluster
       : .unicodeScalar
   }
+
+  var usesNSRECompatibleDot: Bool {
+    stack.last!.contains(.nsreCompatibleDot) // reads the last entry of the option stack
+  }
 }
 
 // MARK: - Implementation
@@ -141,6 +145,7 @@ extension MatchingOptions {
     // Not available via regex literal flags
     case transparentBounds
     case withoutAnchoringBounds
+    case nsreCompatibleDot
 
     // Oniguruma options
     case asciiOnlyDigit
@@ -197,6 +202,8 @@ extension MatchingOptions {
         self = .byteSemantics
       case .possessiveByDefault:
         self = .possessiveByDefault
+      case .nsreCompatibleDot:
+        self = .nsreCompatibleDot
 
       // Whitespace options are only relevant during parsing, not compilation.
       case .extended, .extraExtended:
diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift
index 1e58e78d5..d8ca30e56 100644
--- a/Sources/_StringProcessing/Regex/Options.swift
+++ b/Sources/_StringProcessing/Regex/Options.swift
@@ -159,6 +159,18 @@ extension Regex {
       return wrapInOption(.unicodeScalarSemantics, addingIf: true)
     }
   }
+
+  /// Returns a regular expression that matches in an NSRegularExpression
+  /// compatibility mode.
+  ///
+  /// The mode applies Unicode scalar semantics and, when the separately
+  /// enabled dot-matches-newlines option is on, lets `.` match an entire
+  /// newline sequence (such as `\r\n`) as well as any single scalar.
+  @_spi(Foundation)
+  public var _nsreCompatibility: Regex {
+    wrapInOption(.nsreCompatibleDot, addingIf: true)
+      .wrapInOption(.unicodeScalarSemantics, addingIf: true)
+  }
 }
 
 /// A semantic level to use during regex matching.
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index 30087eac1..791c0850a 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -11,7 +11,7 @@
 
 import XCTest
 @testable import _RegexParser
-@testable @_spi(RegexBenchmark) import _StringProcessing
+@testable @_spi(RegexBenchmark) @_spi(Foundation) import _StringProcessing
 import TestSupport
 
 struct MatchError: Error {
@@ -2726,4 +2726,40 @@ extension RegexTests {
       XCTAssertNotNil(str.wholeMatch(of: possessiveRegex))
     }
   }
+
+  func testNSRECompatibility() throws {
+    // NSRE-compatibility includes scalar matching, so `[\r\n]` should match
+    // either `\r` or `\n`.
+    let text = #"""
+      y=sin(x)+sin(2x)+sin(3x);\#rText "This is a function of x.";\r
+      """#
+    let lineTerminationRegex = try Regex(#";[\r\n]"#)
+      ._nsreCompatibility
+
+    let afterLine = try XCTUnwrap(text.firstRange(of: "Text"))
+    let match = try lineTerminationRegex.firstMatch(in: text)
+    XCTAssertEqual(match?.range.upperBound, afterLine.lowerBound)
+
+    // NSRE-compatibility treats "dot" as special, in that it can match a
+    // newline sequence as well as a single Unicode scalar.
+    let aDotBRegex = try Regex(#"a.b"#)
+      ._nsreCompatibility
+      .dotMatchesNewlines()
+    for input in ["a\rb", "a\nb", "a\r\nb"] {
+      XCTAssertNotNil(try aDotBRegex.wholeMatch(in: input))
+    }
+
+    // NSRE-compatibility doesn't give special treatment to newline sequences
+    // when matching other "match everything" regex patterns, like `[[^z]z]`,
+    // so this pattern doesn't match "a\r\nb".
+    let aCCBRegex = try Regex(#"a[[^z]z]b"#)
+      ._nsreCompatibility
+    for input in ["a\rb", "a\nb", "a\r\nb"] {
+      if input.unicodeScalars.count == 3 {
+        XCTAssertNotNil(try aCCBRegex.wholeMatch(in: input))
+      } else {
+        XCTAssertNil(try aCCBRegex.wholeMatch(in: input))
+      }
+    }
+  }
 }