Skip to content

Commit

Permalink
m-m-m/base#8: finally fixed unicode support
Browse files Browse the repository at this point in the history
  • Loading branch information
hohwille committed Jan 2, 2025
1 parent 987a27b commit a35e0a9
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ protected boolean fill() {
public int next() {

if (hasNext()) {
return handleCodePoint(this.buffer.codePointAt(this.offset++));
return handleCodePoint(this.buffer.codePointAt(this.offset));
}
return EOS;
}
Expand All @@ -261,6 +261,11 @@ protected int handleCodePoint(int codePoint) {
} else {
this.column++;
}
if (codePoint >= 0x010000) {
this.offset += 2;
} else {
this.offset++;
}
return codePoint;
}

Expand All @@ -272,7 +277,7 @@ protected void setOffset(int newOffset) {
assert (newOffset >= this.offset);
assert (newOffset <= this.limit);
while (this.offset < newOffset) {
handleCodePoint(this.buffer.codePointAt(this.offset++));
handleCodePoint(this.buffer.codePointAt(this.offset));
}
}

Expand Down Expand Up @@ -312,7 +317,7 @@ public String readUntil(int stop, boolean acceptEot) {
while (true) {
int start = this.offset;
while (this.offset < this.limit) {
int codePoint = this.buffer.codePointAt(this.offset++);
int codePoint = this.buffer.codePointAt(this.offset);
handleCodePoint(codePoint);
if (codePoint == stop) {
return getAppended(builder, start, this.offset - 1);
Expand Down Expand Up @@ -349,7 +354,6 @@ public String readUntil(CharFilter filter, boolean acceptEot, CharScannerSyntax
return state.builder.toString();
}
handleCodePoint(codePoint);
this.offset++;
}
boolean eot = isEot();
if (!eot || acceptEot) {
Expand Down Expand Up @@ -389,7 +393,7 @@ public String readUntil(int stop, boolean acceptEot, int escape) {
while (true) {
int start = this.offset;
while (this.offset < this.limit) {
int codePoint = this.buffer.codePointAt(this.offset++);
int codePoint = this.buffer.codePointAt(this.offset);
handleCodePoint(codePoint);
if (codePoint == escape) {
builder = append(builder, start, this.offset - 1);
Expand All @@ -407,7 +411,6 @@ public String readUntil(int stop, boolean acceptEot, int escape) {
builder = builder(builder);
builder.appendCodePoint(codePoint);
handleCodePoint(codePoint);
this.offset++;
start = this.offset;
}
} else if (codePoint == stop) {
Expand Down Expand Up @@ -436,7 +439,6 @@ public String readUntil(CharFilter filter, boolean acceptEot) {
return getAppended(builder, start, this.offset);
}
handleCodePoint(codePoint);
this.offset++;
}
builder = append(builder, start, this.limit);
if (!fill()) {
Expand Down Expand Up @@ -561,7 +563,6 @@ public boolean expectOne(int expected, boolean warning) {

if (hasNext() && (this.buffer.codePointAt(this.offset) == expected)) {
handleCodePoint(expected);
this.offset++;
return true;
}
if (warning) {
Expand Down Expand Up @@ -602,7 +603,6 @@ public boolean expectUnsafe(String expected, boolean ignoreCase) {
}
}
handleCodePoint(codePoint);
this.offset++;
}
return true;
}
Expand All @@ -622,15 +622,13 @@ public String readLine(boolean trim) {
appender.trimEnd = this.offset;
while (this.offset < this.limit) {
int codePoint = this.buffer.codePointAt(this.offset);
handleCodePoint(codePoint);
if (codePoint == '\r') {
int end = this.offset;
this.offset++;
handleCodePoint(codePoint);
if (this.offset < this.limit) {
codePoint = this.buffer.codePointAt(this.offset);
if (codePoint == '\n') {
handleCodePoint(codePoint);
this.offset++;
}
return appender.getAppended(end);
} else { // EOL insanity...
Expand All @@ -639,19 +637,18 @@ public String readLine(boolean trim) {
codePoint = this.buffer.codePointAt(this.offset);
if (codePoint == '\n') {
handleCodePoint(codePoint);
this.offset++;
}
}
return appender.toString();
}
} else if (codePoint == '\n') {
String result = appender.getAppended();
this.offset++;
handleCodePoint(codePoint);
return result;
} else if (codePoint != ' ') {
appender.foundNonSpace();
}
this.offset++;
handleCodePoint(codePoint);
}
appender.append(this.limit);
if (!fill()) {
Expand All @@ -671,12 +668,11 @@ public String readJavaStringLiteral(TextFormatMessageType severity) {
return null;
}
handleCodePoint(codePoint);
this.offset++;
StringBuilder builder = null;
while (hasNext()) {
int start = this.offset;
while (this.offset < this.limit) {
codePoint = this.buffer.codePointAt(this.offset++);
codePoint = this.buffer.codePointAt(this.offset);
handleCodePoint(codePoint);
if (codePoint == '"') {
return getAppended(builder, start, this.offset - 1);
Expand Down Expand Up @@ -998,7 +994,6 @@ public int readDigit(int radix) {
if ((value >= 0) && (value < radix)) {
result = value;
handleCodePoint(codePoint);
this.offset++;
}
}
return result;
Expand Down Expand Up @@ -1313,7 +1308,6 @@ public int skipWhile(int c) {
return count + (this.offset - start);
}
handleCodePoint(c);
this.offset++;
}
count += (this.offset - start);
}
Expand Down Expand Up @@ -1344,7 +1338,6 @@ public int skipWhile(CharFilter filter, int max) {
break;
}
handleCodePoint(cp);
this.offset++;
}
int len = this.offset - start;
remain -= len;
Expand Down Expand Up @@ -1391,10 +1384,10 @@ public boolean skipOver(String substring, boolean ignoreCase, CharFilter stopFil
if (found) {
return true;
}
next();
} else {
handleCodePoint(cp);
}
this.offset++;
}
if (!fill()) {
// TODO reset text position
Expand Down Expand Up @@ -1434,7 +1427,6 @@ public String readWhile(CharFilter filter, int min, int max) {
return requireMin(getAppended(builder, start, this.offset), min, filter);
}
handleCodePoint(cp);
this.offset++;
}
int len = this.offset - start;
remain -= len;
Expand Down
12 changes: 10 additions & 2 deletions core/src/main/java/io/github/mmm/scanner/CharReaderScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ public CharReaderScanner(int capacity, Reader reader) {
public CharReaderScanner(int capacity, TextFormatMessageHandler messageHandler, Reader reader) {

super("", messageHandler);
this.charBuffer = new char[capacity];
this.charBuffer = new char[capacity + 1];
this.reader = reader;
}

Expand Down Expand Up @@ -252,14 +252,22 @@ protected boolean fill() {
try {
this.limit = 0;
while (this.limit == 0) {
this.limit = this.reader.read(this.charBuffer);
this.limit = this.reader.read(this.charBuffer, 0, this.charBuffer.length - 1);
}
if (this.limit == -1) {
close();
this.buffer = "";
this.limit = 0;
return false;
}
char last = this.charBuffer[this.limit - 1];
if (Character.isSurrogate(last)) {
int next = this.reader.read();
if (next >= 0) {
this.charBuffer[this.limit] = (char) next;
this.limit++;
}
}
this.buffer = new String(this.charBuffer, 0, this.limit);
this.limit = this.buffer.length();
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ public boolean hasNext() {
public int next() {

if (this.offset < this.limit) {
return handleCodePoint(this.buffer.codePointAt(this.offset++));
return handleCodePoint(this.buffer.codePointAt(this.offset));
} else {
return 0;
}
Expand Down Expand Up @@ -295,7 +295,6 @@ public String readUntil(CharFilter filter, boolean acceptEot) {
return this.buffer.substring(start, this.offset);
}
handleCodePoint(cp);
this.offset++;
}
if (acceptEot) {
if (this.offset > start) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1112,4 +1112,22 @@ private void readJavaCharLiteralInvalid(String string, String expectedErrorValue
assertThat(e).isInstanceOf(exception).hasMessageContaining(expectedErrorValue);
}
}

/**
 * Checks that {@code next()} hands out complete code points when the text mixes supplementary characters (each
 * written as a surrogate pair of two {@code char}s) with a plain single-{@code char} character, and that the
 * scanner is exhausted afterwards.
 */
@Test
public void testUnicode() {

  // arrange: three surrogate-pair encoded symbols with one single-char letter in between
  String gClef = "\uD834\uDD1E";
  String doubleBar = "\uD834\uDD01";
  String latinX = "x";
  String eighthNote = "\uD834\uDD60";
  int[] expectedCodePoints = { gClef.codePointAt(0), doubleBar.codePointAt(0), latinX.codePointAt(0),
      eighthNote.codePointAt(0) };
  CharStreamScanner scanner = scanner(gClef + doubleBar + latinX + eighthNote);
  // act + assert: every call to next() has to yield exactly one full code point, in input order
  for (int expected : expectedCodePoints) {
    assertThat(scanner.next()).isEqualTo(expected);
  }
  // nothing may be left over, e.g. no dangling low surrogate
  assertThat(scanner.hasNext()).isFalse();
}
}

0 comments on commit a35e0a9

Please sign in to comment.