From cdebea7278cdd52aeadbd7e960cef15bd788fbea Mon Sep 17 00:00:00 2001 From: Maxim Date: Fri, 27 Oct 2023 16:30:36 +0300 Subject: [PATCH] Add BaseParser and BaseParserInputStream --- .../as/filters/io/ASBufferedInFilter.java | 3 +- .../org/verapdf/io/SeekableInputStream.java | 3 +- .../java/org/verapdf/parser/BaseParser.java | 524 ++++++++++++++++++ .../verapdf/parser/BaseParserInputStream.java | 27 + .../parser/DecodedObjectStreamParser.java | 2 +- .../verapdf/parser/NotSeekableBaseParser.java | 503 +---------------- .../java/org/verapdf/parser/PDFParser.java | 46 +- .../org/verapdf/parser/PDFStreamParser.java | 10 +- .../verapdf/parser/SeekableBaseParser.java | 461 +-------------- .../org/verapdf/parser/SeekableCOSParser.java | 30 +- .../org/verapdf/parser/SignatureParser.java | 32 +- .../pd/font/type1/Type1FontProgram.java | 4 +- .../pd/font/type1/Type1PrivateParser.java | 10 +- 13 files changed, 656 insertions(+), 999 deletions(-) create mode 100644 src/main/java/org/verapdf/parser/BaseParser.java create mode 100644 src/main/java/org/verapdf/parser/BaseParserInputStream.java diff --git a/src/main/java/org/verapdf/as/filters/io/ASBufferedInFilter.java b/src/main/java/org/verapdf/as/filters/io/ASBufferedInFilter.java index 7161e97d..ea6daea8 100644 --- a/src/main/java/org/verapdf/as/filters/io/ASBufferedInFilter.java +++ b/src/main/java/org/verapdf/as/filters/io/ASBufferedInFilter.java @@ -23,6 +23,7 @@ import org.verapdf.as.filters.ASInFilter; import org.verapdf.as.io.ASInputStream; import org.verapdf.as.io.ASMemoryInStream; +import org.verapdf.parser.BaseParserInputStream; import org.verapdf.parser.NotSeekableBaseParser; import java.io.IOException; @@ -41,7 +42,7 @@ * * @author Sergey Shemyakov */ -public class ASBufferedInFilter extends ASInFilter { +public class ASBufferedInFilter extends ASInFilter implements BaseParserInputStream { public static final int START_BUFFER_SIZE = 10240; public static final int BF_BUFFER_SIZE = 2048; diff --git a/src/main/java/org/verapdf/io/SeekableInputStream.java b/src/main/java/org/verapdf/io/SeekableInputStream.java index 2cc1c285..c645439f 100644 --- a/src/main/java/org/verapdf/io/SeekableInputStream.java +++ b/src/main/java/org/verapdf/io/SeekableInputStream.java @@ -24,6 +24,7 @@ import org.verapdf.as.io.ASInputStream; import org.verapdf.as.io.ASMemoryInStream; import org.verapdf.exceptions.VeraPDFParserException; +import org.verapdf.parser.BaseParserInputStream; import java.io.IOException; import java.io.InputStream; @@ -35,7 +36,7 @@ * * @author Sergey Shemyakov */ -public abstract class SeekableInputStream extends ASInputStream { +public abstract class SeekableInputStream extends ASInputStream implements BaseParserInputStream { private static final int MAX_BUFFER_SIZE = 10240; diff --git a/src/main/java/org/verapdf/parser/BaseParser.java b/src/main/java/org/verapdf/parser/BaseParser.java new file mode 100644 index 00000000..3526749a --- /dev/null +++ b/src/main/java/org/verapdf/parser/BaseParser.java @@ -0,0 +1,524 @@ +package org.verapdf.parser; + +import org.verapdf.as.CharTable; +import org.verapdf.cos.filters.COSFilterASCIIHexDecode; + +import java.io.IOException; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * @author Maxim Plushchov + */ +public abstract class BaseParser { + + private static final Logger LOGGER = Logger.getLogger(BaseParser.class.getCanonicalName()); + + // max string length in bytes + private static final int MAX_STRING_LENGTH = 65535; + private static final byte ASCII_ZERO = 48; + private static final byte ASCII_NINE = 57; + + // indicates if this parser is a postscript parser + protected boolean isPSParser = false; + + protected Token token; + + protected BaseParserInputStream source; + + protected Token getToken() { + return this.token; + } + + protected void initializeToken() { + if (this.token == null) { + this.token = new Token(); + } + } + + protected void appendToToken(final int ch) { + this.token.append(ch); + } + + protected void clearToken() { + this.token.clearValue(); + } + + protected String readUntilDelimiter() throws IOException { + initializeToken(); + this.token.clearValue(); + byte ch = this.source.readByte(); + while (!CharTable.isSpace(ch) && !CharTable.isTokenDelimiter(ch)) { + appendToToken(ch); + if (!this.source.isEOF()) { + ch = this.source.readByte(); + } else { + break; + } + } + if (CharTable.isSpace(ch) || CharTable.isTokenDelimiter(ch)) { + this.source.unread(); + } + return this.token.getValue(); + } + + protected boolean findKeyword(final Token.Keyword keyword) throws IOException { + nextToken(); + while (this.token.type != Token.Type.TT_EOF && (this.token.type != Token.Type.TT_KEYWORD || this.token.keyword != keyword)) { + nextToken(); + } + return this.token.type == Token.Type.TT_KEYWORD && this.token.keyword == keyword; + } + + protected abstract boolean findKeyword(final Token.Keyword keyword, final int lookUpSize) throws IOException; + + protected void skipSpaces() throws IOException { + this.skipSpaces(false); + } + + public void skipSpaces(boolean skipComment) throws IOException { + while (skipSingleSpace(skipComment)); + } + + protected boolean skipSingleSpace(boolean skipComment) throws IOException { + if (this.source.isEOF()) { + return false; + } + byte ch = this.source.readByte(); + if (CharTable.isSpace(ch)) { + return true; + } + if (ch == '%' && skipComment) { + skipComment(); + return true; + } + this.source.unread(); + return false; + } + + protected abstract void skipComment() throws IOException; + + protected boolean isDigit() throws IOException { + return isDigit((byte) this.source.peek()); + } + + protected static boolean isDigit(byte c) { + return c >= ASCII_ZERO && c <= ASCII_NINE; + } + + protected static boolean isLF(int c) { + return CharTable.ASCII_LF == c; + } + + protected static boolean isCR(int c) { + return CharTable.ASCII_CR == c; + } + + protected static boolean isFF(int c) { + return CharTable.ASCII_FF == c; + } + + protected abstract void readASCII85() throws IOException; + + private void readToken() throws IOException { + this.token.clearValue(); + byte ch; + while (!this.source.isEOF()) { + ch = this.source.readByte(); + if (CharTable.isTokenDelimiter(ch)) { + this.source.unread(); + break; + } + + appendToToken(ch); + } + } + + protected void nextToken() throws IOException { + skipSpaces(true); + if (this.source.isEOF()) { + this.token.type = Token.Type.TT_EOF; + return; + } + + this.token.type = Token.Type.TT_NONE; + + byte ch = this.source.readByte(); + + switch (ch) { + case '(': + this.token.type = Token.Type.TT_LITSTRING; + readLitString(); + break; + case ')': + //error + break; + case '<': + ch = source.readByte(); + if (ch == '<') { + this.token.type = Token.Type.TT_OPENDICT; + } else if (ch == '~') { + this.token.type = Token.Type.TT_HEXSTRING; + readASCII85(); + } else { + this.source.unread(); + this.token.type = Token.Type.TT_HEXSTRING; + readHexString(); + } + break; + case '>': + ch = this.source.readByte(); + if (ch == '>') { + this.token.type = Token.Type.TT_CLOSEDICT; + } else { + throw new IOException(getErrorMessage("Unknown symbol " + ch + " after \'>\'")); + } + break; + case '[': + this.token.type = Token.Type.TT_OPENARRAY; + break; + case ']': + this.token.type = Token.Type.TT_CLOSEARRAY; + break; + case '{': // as delimiter in PostScript calculator functions 181 + if (isPSParser) { + this.token.type = Token.Type.TT_STARTPROC; + } + break; + case '}': + if (isPSParser) { + this.token.type = Token.Type.TT_ENDPROC; + } + break; + case '/': + this.token.type = Token.Type.TT_NAME; + readName(); + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '.': + this.source.unread(); + readNumber(); + break; + case '+': + readNumber(); + break; + case '-': + readNumber(); + this.token.integer = -this.token.integer; + this.token.real = -this.token.real; + break; + default: + this.source.unread(); + readToken(); + this.token.toKeyword(); + if (this.token.keyword == Token.Keyword.KW_NONE) { + this.token.type = Token.Type.TT_NONE; + } + break; + } + } + + protected String getLine() throws IOException { + initializeToken(); + this.token.clearValue(); + byte ch = this.source.readByte(); + while (!this.source.isEOF()) { + if (ch == CharTable.ASCII_LF || ch == CharTable.ASCII_CR) { + break; + } + appendToToken(ch); + ch = this.source.readByte(); + } + return this.token.getValue(); + } + + protected byte[] getLineBytes() throws IOException { + getLine(); + return this.token.getByteValue(); + } + + private void readHexString() throws IOException { + this.token.clearValue(); + byte ch; + int uc = 0; + int hex; + + //these are required for pdf/a validation + boolean containsOnlyHex = true; + long hexCount = 0; + + boolean odd = false; + while (!this.source.isEOF()) { + ch = this.source.readByte(); + if (ch == '>') { + if (odd) { + uc <<= 4; + appendToToken(uc); + } + this.token.setContainsOnlyHex(containsOnlyHex); + this.token.setHexCount(Long.valueOf(hexCount)); + return; + } else if (!CharTable.isSpace(ch)) { + hex = COSFilterASCIIHexDecode.decodeLoHex(ch); + hexCount++; + if (hex < 16 && hex > -1) { // skip all non-Hex characters + if (odd) { + uc = (uc << 4) + hex; + appendToToken(uc); + uc = 0; + } else { + uc = hex; + } + odd = !odd; + } else { + containsOnlyHex = false; + } + } + } + + this.token.setContainsOnlyHex(containsOnlyHex); + this.token.setHexCount(Long.valueOf(hexCount)); + } + + protected void readNumber() throws IOException { + try { + int radix = 10; + initializeToken(); + this.token.clearValue(); + this.token.type = Token.Type.TT_INTEGER; + byte ch; + while (!this.source.isEOF()) { + ch = this.source.readByte(); + if (CharTable.isTokenDelimiter(ch)) { + this.source.unread(); + break; + } + if (ch >= '0' && ch <= '9') { + appendToToken(ch); + } else if (ch == '.') { + this.token.type = Token.Type.TT_REAL; + appendToToken(ch); + } else if (ch == '#' && isPSParser) { + if (this.token.type == Token.Type.TT_INTEGER) { + radix = Integer.valueOf(this.token.getValue()); + } + token.clearValue(); + } else { + this.source.unread(); + break; + } + } + if (this.token.type == Token.Type.TT_INTEGER) { + long value = Long.valueOf(this.token.getValue(), radix).longValue(); + this.token.integer = value; + this.token.real = value; + } else { + double value = Double.valueOf(this.token.getValue()).doubleValue(); + this.token.integer = Math.round(value); + this.token.real = value; + } + } catch (NumberFormatException e) { + LOGGER.log(Level.FINE, getErrorMessage(""), e); + this.token.integer = Math.round(Double.MAX_VALUE); + this.token.real = Double.MAX_VALUE; + } + } + + private void readLitString() throws IOException { + this.token.clearValue(); + + int parenthesesDepth = 0; + + byte ch = this.source.readByte(); + while (!this.source.isEOF()) { + switch (ch) { + default: + appendToToken(ch); + break; + case '(': + parenthesesDepth++; + appendToToken(ch); + break; + case ')': + if (parenthesesDepth == 0) { + return; + } + + parenthesesDepth--; + appendToToken(ch); + break; + case '\\': { + ch = this.source.readByte(); + switch (ch) { + case '(': + appendToToken(CharTable.ASCII_LEFT_PAR); + break; + case ')': + appendToToken(CharTable.ASCII_RIGHT_PAR); + break; + case 'n': + appendToToken(CharTable.ASCII_LF); + break; + case 'r': + appendToToken(CharTable.ASCII_CR); + break; + case 't': + appendToToken(CharTable.ASCII_HT); + break; + case 'b': + appendToToken(CharTable.ASCII_BS); + break; + case 'f': + appendToToken(CharTable.ASCII_FF); + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': { + if (!isPSParser) { + // look for 1, 2, or 3 octal characters + char ch1 = (char) (ch - '0'); + for (int i = 1; i < 3; i++) { + ch = this.source.readByte(); + if (ch < '0' || ch > '7') { + this.source.unread(); + break; + } + ch1 = (char) ((ch1 << 3) + (ch - '0')); + } + appendToToken(ch1); + } + break; + } + case CharTable.ASCII_LF: + break; + case CharTable.ASCII_CR: + ch = this.source.readByte(); + if (ch != CharTable.ASCII_LF) { + this.source.unread(); + } + break; + default: + appendToToken(ch); + break; + } + break; + } + } + ch = source.readByte(); + if (token.getSize() > MAX_STRING_LENGTH) { + LOGGER.log(Level.WARNING, getErrorMessage("Content stream string token exceeds " + MAX_STRING_LENGTH + " bytes")); + break; + } + } + while (!this.source.isEOF()) { + switch (ch) { + default: + break; + case '(': + parenthesesDepth++; + break; + case ')': + if (parenthesesDepth == 0) { + return; + } + parenthesesDepth--; + break; + case '\\': { + ch = this.source.readByte(); + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': { + if (!isPSParser) { + // look for 1, 2, or 3 octal characters + for (int i = 1; i < 3; i++) { + ch = this.source.readByte(); + if (ch < '0' || ch > '7') { + this.source.unread(); + break; + } + } + } + break; + } + case CharTable.ASCII_CR: + ch = this.source.readByte(); + if (ch != CharTable.ASCII_LF) { + this.source.unread(); + } + break; + default: + break; + } + break; + } + } + + ch = source.readByte(); + } + } + + protected void readName() throws IOException { + this.token.clearValue(); + byte ch; + while (!this.source.isEOF()) { + ch = this.source.readByte(); + if (CharTable.isTokenDelimiter(ch)) { + this.source.unread(); + break; + } + + // if ch == # (0x23) + if (ch == '#' && !isPSParser) { + byte ch1; + byte ch2; + byte dc; + ch1 = this.source.readByte(); + if (!source.isEOF() && COSFilterASCIIHexDecode.decodeLoHex(ch1) != COSFilterASCIIHexDecode.ER) { + dc = COSFilterASCIIHexDecode.decodeLoHex(ch1); + ch2 = this.source.readByte(); + if (!this.source.isEOF() && COSFilterASCIIHexDecode.decodeLoHex(ch2) != COSFilterASCIIHexDecode.ER) { + dc = (byte) ((dc << 4) + COSFilterASCIIHexDecode.decodeLoHex(ch2)); + appendToToken(dc); + } else { + appendToToken(ch); + appendToToken(ch1); + this.source.unread(); + } + } else { + appendToToken(ch); + this.source.unread(); + } + } else { + appendToToken(ch); + } + } + } + + protected String getErrorMessage(String message) { + return message; + } + + protected BaseParserInputStream getSource() { + return source; + } +} diff --git a/src/main/java/org/verapdf/parser/BaseParserInputStream.java b/src/main/java/org/verapdf/parser/BaseParserInputStream.java new file mode 100644 index 00000000..29b0c10d --- /dev/null +++ b/src/main/java/org/verapdf/parser/BaseParserInputStream.java @@ -0,0 +1,27 @@ +package org.verapdf.parser; + +import java.io.IOException; + +/** + * @author Maxim Plushchov + */ +public interface BaseParserInputStream { + + int read() throws IOException; + + int read(byte[] buffer) throws IOException; + + byte readByte() throws IOException; + + void unread() throws IOException; + + void unread(int i) throws IOException; + + int peek() throws IOException; + + int skip(int size) throws IOException; + + void close() throws IOException; + + boolean isEOF() throws IOException; +} diff --git a/src/main/java/org/verapdf/parser/DecodedObjectStreamParser.java b/src/main/java/org/verapdf/parser/DecodedObjectStreamParser.java index 982e8228..dec41d97 100644 --- a/src/main/java/org/verapdf/parser/DecodedObjectStreamParser.java +++ b/src/main/java/org/verapdf/parser/DecodedObjectStreamParser.java @@ -108,7 +108,7 @@ public COSObject getObject(COSKey key) throws IOException { if (!this.internalOffsets.containsKey(objNum)) { return new COSObject(); } - this.source.seek(internalOffsets.get(objNum)); + this.getSource().seek(internalOffsets.get(objNum)); this.flag = true; this.objects.clear(); // In case if some COSInteger was read before. this.integers.clear(); diff --git a/src/main/java/org/verapdf/parser/NotSeekableBaseParser.java b/src/main/java/org/verapdf/parser/NotSeekableBaseParser.java index 1debfb09..65f4102c 100644 --- a/src/main/java/org/verapdf/parser/NotSeekableBaseParser.java +++ b/src/main/java/org/verapdf/parser/NotSeekableBaseParser.java @@ -20,12 +20,10 @@ */ package org.verapdf.parser; -import org.verapdf.as.CharTable; import org.verapdf.as.filters.io.ASBufferedInFilter; import org.verapdf.as.io.ASInputStream; import org.verapdf.as.io.ASMemoryInStream; import org.verapdf.cos.filters.COSFilterASCII85Decode; -import org.verapdf.cos.filters.COSFilterASCIIHexDecode; import java.io.Closeable; import java.io.IOException; @@ -41,22 +39,9 @@ * * @author Sergey Shemyakov */ -public class NotSeekableBaseParser implements Closeable { +public class NotSeekableBaseParser extends BaseParser implements Closeable { - private static final Logger LOGGER = Logger.getLogger( - NotSeekableBaseParser.class.getCanonicalName()); - - private static final byte ASCII_ZERO = 48; - private static final byte ASCII_NINE = 57; - - // max string length in bytes - private static final int MAX_STRING_LENGTH = 65535; - - // indicates if this parser is a postscript parser - protected boolean isPSParser = false; - - protected ASBufferedInFilter source; - private Token token; + private static final Logger LOGGER = Logger.getLogger(NotSeekableBaseParser.class.getCanonicalName()); /** * Constructor from stream. New buffered stream from given stream is created. @@ -68,7 +53,7 @@ public NotSeekableBaseParser(ASInputStream stream) throws IOException { } this.source = new ASBufferedInFilter(stream); try { - source.initialize(); + getSource().initialize(); } catch (IOException e) { // Someone have to close source in case of // initialization exception source.close(); @@ -76,13 +61,11 @@ public NotSeekableBaseParser(ASInputStream stream) throws IOException { } } - public NotSeekableBaseParser(ASInputStream fileStream, boolean isPSParser) throws IOException { this(fileStream); this.isPSParser = isPSParser; } - /** * Closes source stream. */ @@ -92,70 +75,12 @@ public void close() throws IOException { // PROTECTED METHODS - protected void initializeToken() { - if (this.token == null) { - this.token = new Token(); - } - } - - private void appendToToken(final int ch) { - this.token.append(ch); - } - - protected Token getToken() { - return this.token; - } - - protected void readLine() throws IOException { - initializeToken(); - this.token.clearValue(); - byte ch = this.source.readByte(); - while (!this.source.isEOF()) { - if (ch == ASCII_LF || ch == ASCII_CR) { - break; - } - appendToToken(ch); - ch = this.source.readByte(); - } - } - - protected byte[] getLineBytes() throws IOException { - readLine(); - return this.token.getByteValue(); - } - - protected String readUntilDelimiter() throws IOException { - initializeToken(); - this.token.clearValue(); - byte ch = this.source.readByte(); - while (!isSpace(ch) && !isTokenDelimiter(ch)) { - appendToToken(ch); - if (!this.source.isEOF()) { - ch = this.source.readByte(); - } else { - break; - } - } - if (isSpace(ch) || isTokenDelimiter(ch)) { - this.source.unread(); - } - return this.token.getValue(); - } - - protected boolean findKeyword(final Token.Keyword keyword) throws IOException { - nextToken(); - while (this.token.type != Token.Type.TT_EOF && ( - this.token.type != Token.Type.TT_KEYWORD || this.token.keyword != keyword)) { - nextToken(); - } - return this.token.type == Token.Type.TT_KEYWORD && this.token.keyword == keyword; - } - + @Override protected boolean findKeyword(final Token.Keyword keyword, final int lookUpSize) throws IOException { - source.resetReadCounter(); + getSource().resetReadCounter(); nextToken(); while (this.token.type != Token.Type.TT_EOF && (this.token.type != Token.Type.TT_KEYWORD || this.token.keyword != keyword)) { - if (this.source.getReadCounter() >= lookUpSize) { + if (this.getSource().getReadCounter() >= lookUpSize) { break; } nextToken(); @@ -163,117 +88,6 @@ protected boolean findKeyword(final Token.Keyword keyword, final int lookUpSize) return this.token.type == Token.Type.TT_KEYWORD && this.token.keyword == keyword; } - protected void nextToken() throws IOException { - skipSpaces(true); - if (this.source.isEOF()) { - this.token.type = Token.Type.TT_EOF; - return; - } - - this.token.type = Token.Type.TT_NONE; - - byte ch = this.source.readByte(); - - switch (ch) { - case '(': - this.token.type = Token.Type.TT_LITSTRING; - readLitString(); - break; - case ')': - //error - break; - case '<': - ch = source.readByte(); - if (ch == '<') { - this.token.type = Token.Type.TT_OPENDICT; - } else if (ch == '~') { - this.token.type = Token.Type.TT_HEXSTRING; - readASCII85(); - } else { - this.source.unread(); - this.token.type = Token.Type.TT_HEXSTRING; - readHexString(); - } - break; - case '>': - ch = this.source.readByte(); - if (ch == '>') { - this.token.type = Token.Type.TT_CLOSEDICT; - } else { - throw new IOException("Unknown symbol " + ch + " after \'>\'"); - } - break; - case '[': - this.token.type = Token.Type.TT_OPENARRAY; - break; - case ']': - this.token.type = Token.Type.TT_CLOSEARRAY; - break; - case '{': // as delimiter in PostScript calculator functions 181 - if (isPSParser) { - this.token.type = Token.Type.TT_STARTPROC; - } - break; - case '}': - if (isPSParser) { - this.token.type = Token.Type.TT_ENDPROC; - } - break; - case '/': - this.token.type = Token.Type.TT_NAME; - readName(); - break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case '.': - this.source.unread(); - readNumber(); - break; - case '-': - readNumber(); - this.token.integer = -this.token.integer; - this.token.real = -this.token.real; - break; - default: - this.source.unread(); - readToken(); - this.token.toKeyword(); - if (this.token.keyword == Token.Keyword.KW_NONE) { - this.token.type = Token.Type.TT_NONE; - } - break; - } - } - - protected void skipSpaces() throws IOException { - this.skipSpaces(false); - } - - protected void skipSpaces(boolean skipComment) throws IOException { - byte ch; - while (!this.source.isEOF()) { - ch = this.source.readByte(); - if (CharTable.isSpace(ch)) { - continue; - } - if (ch == '%' && skipComment) { - skipComment(); - continue; - } - - this.source.unread(); - break; - } - } - protected void skipStreamSpaces() throws IOException { byte space = this.source.readByte(); @@ -292,15 +106,8 @@ protected void skipStreamSpaces() throws IOException { } } - protected boolean isDigit() throws IOException { - return isDigit(this.source.peek()); - } - - protected static boolean isDigit(int c) { - return c >= ASCII_ZERO && c <= ASCII_NINE; - } - - private void skipComment() throws IOException { + @Override + protected void skipComment() throws IOException { // skips all characters till EOL == { CR, LF, CRLF } byte ch; while (!this.source.isEOF()) { @@ -324,206 +131,8 @@ protected boolean isEndOfComment(byte ch) { return isCR(ch); } - protected static boolean isFF(int c) { - return ASCII_FF == c; - } - - protected static boolean isLF(int c) { - return ASCII_LF == c; - } - - protected static boolean isCR(int c) { - return ASCII_CR == c; - } - - private void readLitString() throws IOException { - this.token.clearValue(); - - int parenthesesDepth = 0; - - byte ch = this.source.readByte(); - while (!this.source.isEOF()) { - switch (ch) { - default: - appendToToken(ch); - break; - case '(': - parenthesesDepth++; - appendToToken(ch); - break; - case ')': - if (parenthesesDepth == 0) { - return; - } - - parenthesesDepth--; - appendToToken(ch); - break; - case '\\': { - ch = this.source.readByte(); - switch (ch) { - case '(': - appendToToken(CharTable.ASCII_LEFT_PAR); - break; - case ')': - appendToToken(CharTable.ASCII_RIGHT_PAR); - break; - case 'n': - appendToToken(ASCII_LF); - break; - case 'r': - appendToToken(ASCII_CR); - break; - case 't': - appendToToken(CharTable.ASCII_HT); - break; - case 'b': - appendToToken(CharTable.ASCII_BS); - break; - case 'f': - appendToToken(CharTable.ASCII_FF); - break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': { - if (!isPSParser) { - // look for 1, 2, or 3 octal characters - char ch1 = (char) (ch - '0'); - for (int i = 1; i < 3; i++) { - ch = this.source.readByte(); - if (ch < '0' || ch > '7') { - this.source.unread(); - break; - } - ch1 = (char) ((ch1 << 3) + (ch - '0')); - } - appendToToken(ch1); - } - break; - } - case ASCII_LF: - break; - case ASCII_CR: - ch = this.source.readByte(); - if (ch != ASCII_LF) { - this.source.unread(); - } - break; - default: - appendToToken(ch); - break; - } - break; - } - } - ch = source.readByte(); - if (token.getSize() > MAX_STRING_LENGTH) { - LOGGER.log(Level.WARNING, "Content stream string token exceeds " + MAX_STRING_LENGTH + " bytes"); - break; - } - } - while (!this.source.isEOF()) { - switch (ch) { - default: - break; - case '(': - parenthesesDepth++; - break; - case ')': - if (parenthesesDepth == 0) { - return; - } - parenthesesDepth--; - break; - case '\\': { - ch = this.source.readByte(); - switch (ch) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': { - if (!isPSParser) { - // look for 1, 2, or 3 octal characters - for (int i = 1; i < 3; i++) { - ch = this.source.readByte(); - if (ch < '0' || ch > '7') { - this.source.unread(); - break; - } - } - } - break; - } - case ASCII_CR: - ch = this.source.readByte(); - if (ch != ASCII_LF) { - this.source.unread(); - } - break; - default: - break; - } - break; - } - } - - ch = source.readByte(); - } - } - - private void readHexString() throws IOException { - this.token.clearValue(); - byte ch; - int uc = 0; - int hex; - - //these are required for pdf/a validation - boolean containsOnlyHex = true; - long hexCount = 0; - - boolean odd = false; - while (!this.source.isEOF()) { - ch = this.source.readByte(); - if (ch == '>') { - if (odd) { - uc <<= 4; - appendToToken(uc); - } - this.token.setContainsOnlyHex(containsOnlyHex); - this.token.setHexCount(Long.valueOf(hexCount)); - return; - } else if (!CharTable.isSpace(ch)) { - hex = COSFilterASCIIHexDecode.decodeLoHex(ch); - hexCount++; - if (hex < 16 && hex > -1) { // skip all non-Hex characters - if (odd) { - uc = (uc << 4) + hex; - appendToToken(uc); - uc = 0; - } else { - uc = hex; - } - odd = !odd; - } else { - containsOnlyHex = false; - } - } - } - - this.token.setContainsOnlyHex(containsOnlyHex); - this.token.setHexCount(Long.valueOf(hexCount)); - } - - private void readASCII85() throws IOException { + @Override + protected void readASCII85() throws IOException { byte[] buf = new byte[ASBufferedInFilter.START_BUFFER_SIZE]; int pointer = 0; byte readByte = this.source.readByte(); @@ -557,95 +166,7 @@ public static byte[] extendArray(byte[] array) { return res; } - private void readName() throws IOException { - this.token.clearValue(); - byte ch; - while (!this.source.isEOF()) { - ch = this.source.readByte(); - if (CharTable.isTokenDelimiter(ch)) { - this.source.unread(); - break; - } - - if (ch == '#' && !isPSParser) { - byte ch1; - byte ch2; - byte dc; - ch1 = this.source.readByte(); - if (!source.isEOF() && COSFilterASCIIHexDecode.decodeLoHex(ch1) != COSFilterASCIIHexDecode.ER) { - dc = COSFilterASCIIHexDecode.decodeLoHex(ch1); - ch2 = this.source.readByte(); - if (!this.source.isEOF() && COSFilterASCIIHexDecode.decodeLoHex(ch2) != COSFilterASCIIHexDecode.ER) { - dc = (byte) ((dc << 4) + COSFilterASCIIHexDecode.decodeLoHex(ch2)); - appendToToken(dc); - } else { - appendToToken(ch); - appendToToken(ch1); - this.source.unread(); - } - } else { - appendToToken(ch); - this.source.unread(); - } - } else { - appendToToken(ch); - } - } - } - - private void readToken() throws IOException { - this.token.clearValue(); - byte ch; - while (!this.source.isEOF()) { - ch = this.source.readByte(); - if (CharTable.isTokenDelimiter(ch)) { - this.source.unread(); - break; - } - - appendToToken(ch); - } - } - - protected void readNumber() throws IOException { - try { - int radix = 10; - initializeToken(); - this.token.clearValue(); - this.token.type = Token.Type.TT_INTEGER; - byte ch; - while (!this.source.isEOF()) { - ch = this.source.readByte(); - if (CharTable.isTokenDelimiter(ch)) { - this.source.unread(); - break; - } - if (ch >= '0' && ch <= '9') { - appendToToken(ch); - } else if (ch == '.') { - this.token.type = Token.Type.TT_REAL; - appendToToken(ch); - } else if (ch == '#' && isPSParser) { - if (this.token.type == Token.Type.TT_INTEGER) { - radix = Integer.valueOf(this.token.getValue()); - } - token.clearValue(); - } else { - this.source.unread(); - break; - } - } - if (this.token.type == Token.Type.TT_INTEGER) { - long value = Long.valueOf(this.token.getValue(), radix).longValue(); - this.token.integer = value; - this.token.real = value; - } else { - double value = Double.valueOf(this.token.getValue()).doubleValue(); - this.token.integer = Math.round(value); - this.token.real = value; - } - } catch (NumberFormatException e) { - LOGGER.log(Level.FINE, "", e); - } + protected ASBufferedInFilter getSource() { + return (ASBufferedInFilter) source; } } diff --git a/src/main/java/org/verapdf/parser/PDFParser.java b/src/main/java/org/verapdf/parser/PDFParser.java index adfb612d..6211aa1e 100644 --- a/src/main/java/org/verapdf/parser/PDFParser.java +++ b/src/main/java/org/verapdf/parser/PDFParser.java @@ -81,7 +81,7 @@ public COSHeader getHeader() throws IOException { } public SeekableInputStream getPDFSource() { - return this.source; + return this.getSource(); } private COSHeader parseHeader() throws IOException { @@ -104,7 +104,7 @@ private COSHeader parseHeader() throws IOException { source.readByte(); final int headerStart = header.indexOf(HEADER_PATTERN); - final long headerOffset = source.getOffset() - header.length() + headerStart; + final long headerOffset = getSource().getOffset() - header.length() + headerStart; this.offsetShift = headerOffset; result.setHeaderOffset(headerOffset); @@ -151,7 +151,7 @@ private COSHeader parseHeader() throws IOException { checkComment(result); // rewind - source.seek(0); + getSource().seek(0); return result; } @@ -162,7 +162,7 @@ public boolean isLinearized() { if (isLinearizationDictionary(linDict)) { long length = linDict.getIntegerKey(ASAtom.L); if (length != 0) { - return length == this.source.getStreamLength() && this.source.getOffset() < LINEARIZATION_DICTIONARY_LOOKUP_SIZE; + return length == this.getSource().getStreamLength() && this.getSource().getOffset() < LINEARIZATION_DICTIONARY_LOOKUP_SIZE; } } } catch (IOException e) { @@ -189,7 +189,7 @@ private static boolean isLinearizationDictionary(COSObject object) { } private COSObject findFirstDictionary() throws IOException { - source.seek(0L); + getSource().seek(0L); if (findKeyword(Token.Keyword.KW_OBJ, LINEARIZATION_DICTIONARY_LOOKUP_SIZE)) { source.unread(7); @@ -199,7 +199,7 @@ private COSObject findFirstDictionary() throws IOException { while (!CharTable.isSpace(this.source.read())) { source.unread(2); } - return getObject(source.getOffset()); + return getObject(getSource().getOffset()); } return null; } @@ -232,14 +232,14 @@ private void checkComment(final COSHeader header) throws IOException { public void getXRefInfo(List infos) throws IOException { calculatePostEOFDataSize(); - document.setFileSize(source.getStreamLength()); + document.setFileSize(getSource().getStreamLength()); this.getXRefInfo(infos, new HashSet(), null); } public COSObject getObject(final long offset) throws IOException { clear(); - source.seek(offset); + getSource().seek(offset); final Token token = getToken(); @@ -250,7 +250,7 @@ public COSObject getObject(final long offset) throws IOException { //Check that if offset doesn't point to obj key there is eol character before obj key //pdf/a-1b spec, clause 6.1.8 skipSpaces(false); - source.seek(source.getOffset() - 1); + getSource().seek(getSource().getOffset() - 1); if (!isNextByteEOL()) { headerOfObjectComplyPDFA = false; } @@ -309,16 +309,16 @@ public COSObject getObject(final long offset) throws IOException { } } - long beforeSkip = this.source.getOffset(); + long beforeSkip = this.getSource().getOffset(); skipSpaces(); - if (this.source.getOffset() != beforeSkip) { + if (this.getSource().getOffset() != beforeSkip) { this.source.unread(); } if (!isNextByteEOL()) { endOfObjectComplyPDFA = false; } - long offsetBeforeEndobj = this.source.getOffset(); + long offsetBeforeEndobj = this.getSource().getOffset(); if (this.flag) { nextToken(); } @@ -328,7 +328,7 @@ public COSObject getObject(final long offset) throws IOException { token.keyword != Token.Keyword.KW_ENDOBJ) { // TODO : replace with ASException LOGGER.log(Level.WARNING, getErrorMessage("No endobj keyword" + offsetBeforeEndobj)); - this.source.seek(offsetBeforeEndobj); + this.getSource().seek(offsetBeforeEndobj); } if (!isNextByteEOL()) { @@ -349,29 +349,29 @@ private void clear() { } private Long findLastXRef() throws IOException { - source.seekFromEnd(STARTXREF.length); + getSource().seekFromEnd(STARTXREF.length); byte[] buf = new byte[STARTXREF.length]; - while (source.getStreamLength() - source.getOffset() < 1024) { + while (getSource().getStreamLength() - getSource().getOffset() < 1024) { source.read(buf); if (Arrays.equals(buf, STARTXREF)) { nextToken(); return this.getToken().integer; } - if (source.getOffset() <= STARTXREF.length) { + if (getSource().getOffset() <= STARTXREF.length) { throw new IOException("Document doesn't contain startxref keyword"); } - source.seekFromCurrentPosition(-STARTXREF.length - 1); + getSource().seekFromCurrentPosition(-STARTXREF.length - 1); } return null; } private void calculatePostEOFDataSize() throws IOException { - long size = source.getStreamLength(); + long size = getSource().getStreamLength(); final int lookupSize = 1024 > size ? (int) size : 1024; - source.seekFromEnd(lookupSize); + getSource().seekFromEnd(lookupSize); byte[] buffer = new byte[lookupSize]; - source.read(buffer, lookupSize); + getSource().read(buffer, lookupSize); byte postEOFDataSize = -1; @@ -426,7 +426,7 @@ private void calculatePostEOFDataSize() throws IOException { private void getXRefSectionAndTrailer(final COSXRefInfo section) throws IOException { if (this.lastTrailerOffset == 0) { - this.lastTrailerOffset = this.source.getOffset(); + this.lastTrailerOffset = this.getSource().getOffset(); } nextToken(); if ((getToken().type != Token.Type.TT_KEYWORD || @@ -492,7 +492,7 @@ protected void parseXrefTable(final COSXRefSection xrefs) throws IOException { } nextToken(); } - this.source.seekFromCurrentPosition(-7); + this.getSource().seekFromCurrentPosition(-7); } /** @@ -570,7 +570,7 @@ private void getXRefInfo(final List info, Set processedOffset } //we will skip eol marker in any case - source.seek(Math.max(0, offset - 1)); + getSource().seek(Math.max(0, offset - 1)); COSXRefInfo section = new COSXRefInfo(); info.add(0, section); diff --git a/src/main/java/org/verapdf/parser/PDFStreamParser.java b/src/main/java/org/verapdf/parser/PDFStreamParser.java index efcd8aad..0db16d5c 100644 --- a/src/main/java/org/verapdf/parser/PDFStreamParser.java +++ b/src/main/java/org/verapdf/parser/PDFStreamParser.java @@ -293,7 +293,7 @@ protected String nextOperator() throws IOException { } private ASInputStream readInlineImage() throws IOException { - source.resetReadCounter(); + getSource().resetReadCounter(); Long l = this.lastInlineImageDict == null ? Long.valueOf(0) : PDInlineImage.getInlineImageKey(lastInlineImageDict, ASAtom.LENGTH).getInteger(); ArrayList image = new ArrayList<>(INLINE_IMAGE_BUFFER_SIZE); byte previousByte = source.readByte(); @@ -319,11 +319,11 @@ private ASInputStream readInlineImage() throws IOException { LOGGER.log(Level.WARNING, "End of inline image not found"); } return new ASMemoryInStream(getByteArrayFromArrayList(image), - source.getReadCounter(), false); + getSource().getReadCounter(), false); } private boolean checkInlineImage() throws IOException { - int readCounter = source.getReadCounter(); + int readCounter = getSource().getReadCounter(); try { Object token = parseNextToken(); if (token instanceof Operator && !Operators.operators.contains(((Operator)token).getOperator())) { @@ -332,13 +332,13 @@ private boolean checkInlineImage() throws IOException { } catch (IOException e) { return false; } finally { - source.unread(source.getReadCounter() - readCounter); + source.unread(getSource().getReadCounter() - readCounter); } return true; } private boolean isSourceAfterImage(Long length) { - return length == null || source.getReadCounter() >= length; + return length == null || getSource().getReadCounter() >= length; } public List getImageDataStreams() { diff --git a/src/main/java/org/verapdf/parser/SeekableBaseParser.java b/src/main/java/org/verapdf/parser/SeekableBaseParser.java index 4b1546ec..14ec49b3 100644 --- a/src/main/java/org/verapdf/parser/SeekableBaseParser.java +++ b/src/main/java/org/verapdf/parser/SeekableBaseParser.java @@ -20,10 +20,8 @@ */ package org.verapdf.parser; -import org.verapdf.as.CharTable; import org.verapdf.as.io.ASInputStream; import org.verapdf.cos.filters.COSFilterASCII85Decode; -import org.verapdf.cos.filters.COSFilterASCIIHexDecode; import org.verapdf.io.InternalInputStream; import org.verapdf.io.SeekableInputStream; @@ -31,23 +29,13 @@ import java.io.IOException; import java.io.InputStream; import java.util.Arrays; -import java.util.logging.Level; -import java.util.logging.Logger; import static org.verapdf.as.CharTable.*; /** * @author Timur Kamalov */ -public class SeekableBaseParser { - - private static final Logger LOGGER = Logger.getLogger(SeekableBaseParser.class.getCanonicalName()); - - private static final byte ASCII_ZERO = 48; - private static final byte ASCII_NINE = 57; - - protected SeekableInputStream source; - private Token token; +public class SeekableBaseParser extends BaseParser { public SeekableBaseParser(SeekableInputStream stream) throws IOException { if (stream == null) { @@ -84,33 +72,10 @@ public void closeInputStream() throws IOException { // PROTECTED METHODS - protected Token getToken() { - return this.token; - } - - protected String getLine() throws IOException { - initializeToken(); - this.token.clearValue(); - byte ch = this.source.readByte(); - while (!this.source.isEOF()) { - if (ch == ASCII_LF || ch == ASCII_CR) { - break; - } - appendToToken(ch); - ch = this.source.readByte(); - } - return this.token.getValue(); - } - - protected byte[] getLineBytes() throws IOException { - getLine(); - return this.token.getByteValue(); - } - protected String getLine(final int offset) throws IOException { initializeToken(); this.token.clearValue(); - this.source.seek(offset); + this.getSource().seek(offset); byte ch = this.source.readByte(); while (!this.source.isEOF()) { if (ch == ASCII_LF || ch == ASCII_CR) { @@ -122,39 +87,14 @@ protected String getLine(final int offset) throws IOException { return this.token.getValue(); } - protected String readUntilDelimiter() throws IOException { - initializeToken(); - this.token.clearValue(); - byte ch = this.source.readByte(); - while (!isSpace(ch) && !isTokenDelimiter(ch)) { - appendToToken(ch); - if (!this.source.isEOF()) { - ch = this.source.readByte(); - } else { - break; - } - } - if (isSpace(ch) || isTokenDelimiter(ch)) { - this.source.unread(); - } - return this.token.getValue(); - } - - protected boolean findKeyword(final Token.Keyword keyword) throws IOException { - nextToken(); - while (this.token.type != Token.Type.TT_EOF && (this.token.type != Token.Type.TT_KEYWORD || this.token.keyword != keyword)) { - nextToken(); - } - return this.token.type == Token.Type.TT_KEYWORD && this.token.keyword == keyword; - } - // lookUpSize starts from current offset + @Override protected boolean findKeyword(final Token.Keyword keyword, final int lookUpSize) throws IOException { - long endOffset = Math.min(this.source.getOffset() + lookUpSize, this.source.getStreamLength()); + long endOffset = Math.min(this.getSource().getOffset() + lookUpSize, this.getSource().getStreamLength()); nextToken(); while (this.token.type != Token.Type.TT_EOF && (this.token.type != Token.Type.TT_KEYWORD || this.token.keyword != keyword)) { - if (this.source.getOffset() >= endOffset) { + if (this.getSource().getOffset() >= endOffset) { break; } nextToken(); @@ -162,97 +102,10 @@ protected boolean findKeyword(final Token.Keyword keyword, final int lookUpSize) return this.token.type == Token.Type.TT_KEYWORD && this.token.keyword == keyword; } - protected void nextToken() throws IOException { - skipSpaces(true); - if (this.source.isEOF()) { - this.token.type = Token.Type.TT_EOF; - return; - } - - this.token.type = Token.Type.TT_NONE; - - byte ch = this.source.readByte(); - - switch (ch) { - case '(': - this.token.type = Token.Type.TT_LITSTRING; - readLitString(); - break; - case ')': - //error - break; - case '<': - ch = source.readByte(); - if (ch == '<') { - this.token.type = Token.Type.TT_OPENDICT; - } else if (ch == '~') { - this.token.type = Token.Type.TT_HEXSTRING; - readASCII85(); - } else { - this.source.unread(); - this.token.type = Token.Type.TT_HEXSTRING; - readHexString(); - } - break; - case '>': - ch = this.source.readByte(); - if (ch == '>') { - this.token.type = Token.Type.TT_CLOSEDICT; - } else { - throw new IOException(getErrorMessage("Unknown symbol " + ch + " after \'>\'")); - } - break; - case '[': - this.token.type = Token.Type.TT_OPENARRAY; - break; - case ']': - this.token.type = Token.Type.TT_CLOSEARRAY; - break; - case '{': // as delimiter in PostScript calculator functions 181 - break; - case '}': - break; - case '/': - this.token.type = Token.Type.TT_NAME; - readName(); - break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case '.': - this.source.unread(); - readNumber(); - break; - case '+': - readNumber(); - break; - case '-': - readNumber(); - this.token.integer = -this.token.integer; - this.token.real = -this.token.real; - break; - default: - this.source.unread(); - readToken(); - this.token.toKeyword(); - if (this.token.keyword == Token.Keyword.KW_NONE) { - this.token.type = Token.Type.TT_NONE; - } - break; - } - } - public ASInputStream getRandomAccess(final long length) throws IOException { ASInputStream result = - this.source.getStream(this.source.getOffset(), length); - source.seekFromCurrentPosition(length); + this.getSource().getStream(this.getSource().getOffset(), length); + getSource().seekFromCurrentPosition(length); return result; } @@ -273,59 +126,16 @@ protected void skipSingleEol() throws IOException { } } - protected void skipSpaces() throws IOException { - this.skipSpaces(false); - } - protected void skipSingleSpace() throws IOException { this.skipSingleSpace(false); } - protected void skipSpaces(boolean skipComment) throws IOException { - while (skipSingleSpace(skipComment)); - } - - protected boolean skipSingleSpace(boolean skipComment) throws IOException { - if (this.source.isEOF()) { - return false; - } - byte ch = this.source.readByte(); - if (CharTable.isSpace(ch)) { - return true; - } - if (ch == '%' && skipComment) { - skipComment(); - return true; - } - this.source.unread(); - return false; - } - - protected boolean isDigit() throws IOException { - return isDigit((byte) this.source.peek()); - } - - protected static boolean isDigit(byte c) { - return c >= ASCII_ZERO && c <= ASCII_NINE; - } - protected static boolean isHexDigit(byte ch) { return isDigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'); } - protected static boolean isLF(int c) { - return ASCII_LF == c; - } - - protected static boolean isCR(int c) { - return ASCII_CR == c; - } - - protected static boolean isFF(int c) { - return ASCII_FF == c; - } // PRIVATE METHODS private void skipEOL() throws IOException { @@ -346,7 +156,8 @@ private void skipEOL() throws IOException { this.source.unread(); } - private void skipComment() throws IOException { + @Override + protected void skipComment() throws IOException { // skips all characters till EOL == { CR, LF, CRLF } byte ch; while (!this.source.isEOF()) { @@ -383,150 +194,20 @@ protected boolean isEOL(byte ch) throws IOException { } } - private void readLitString() throws IOException { - this.token.clearValue(); - - int parenthesesDepth = 0; - - byte ch = this.source.readByte(); - while (!this.source.isEOF()) { - switch (ch) { - default: - appendToToken(ch); - break; - case '(': - parenthesesDepth++; - appendToToken(ch); - break; - case ')': - if (parenthesesDepth == 0) { - return; - } - - parenthesesDepth--; - appendToToken(ch); - break; - case '\\': { - ch = this.source.readByte(); - switch (ch) { - case '(': - appendToToken(CharTable.ASCII_LEFT_PAR); - break; - case ')': - appendToToken(CharTable.ASCII_RIGHT_PAR); - break; - case 'n': - appendToToken(ASCII_LF); - break; - case 'r': - appendToToken(ASCII_CR); - break; - case 't': - appendToToken(CharTable.ASCII_HT); - break; - case 'b': - appendToToken(CharTable.ASCII_BS); - break; - case 'f': - appendToToken(CharTable.ASCII_FF); - break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': { - // look for 1, 2, or 3 octal characters - char ch1 = (char) (ch - '0'); - for (int i = 1; i < 3; i++) { - ch = this.source.readByte(); - if (ch < '0' || ch > '7') { - this.source.unread(); - break; - } - ch1 = (char) ((ch1 << 3) + (ch - '0')); - } - appendToToken(ch1); - break; - } - case ASCII_LF: - break; - case ASCII_CR: - ch = this.source.readByte(); - if (ch != ASCII_LF) { - this.source.unread(); - } - break; - default: - appendToToken(ch); - break; - } - break; - } - } - - ch = source.readByte(); - } - } - - private void readHexString() throws IOException { - this.token.clearValue(); - byte ch; - int uc = 0; - int hex; - - //these are required for pdf/a validation - boolean containsOnlyHex = true; - long hexCount = 0; - - boolean odd = false; - while (!this.source.isEOF()) { - ch = this.source.readByte(); - if (ch == '>') { - if (odd) { - uc <<= 4; - appendToToken(uc); - } - this.token.setContainsOnlyHex(containsOnlyHex); - this.token.setHexCount(Long.valueOf(hexCount)); - return; - } else if (!CharTable.isSpace(ch)) { - hex = COSFilterASCIIHexDecode.decodeLoHex(ch); - hexCount++; - if (hex < 16 && hex > -1) { // skip all non-Hex characters - if (odd) { - uc = (uc << 4) + hex; - appendToToken(uc); - uc = 0; - } else { - uc = hex; - } - odd = !odd; - } else { - containsOnlyHex = false; - } - } - } - - this.token.setContainsOnlyHex(containsOnlyHex); - this.token.setHexCount(Long.valueOf(hexCount)); - } - - private void readASCII85() throws IOException { - long ascii85Start = this.source.getOffset(); - long ascii85End = this.source.getStreamLength(); + @Override + protected void readASCII85() throws IOException { + long ascii85Start = this.getSource().getOffset(); + long ascii85End = this.getSource().getStreamLength(); byte b = this.source.readByte(); while (!source.isEOF()) { if (b == '~' && this.source.peek() == '>') { - ascii85End = this.source.getOffset() - 1; + ascii85End = this.getSource().getOffset() - 1; this.source.readByte(); // here we finished reading all ascii85 string break; } b = source.readByte(); } - ASInputStream ascii85 = this.source.getStream(ascii85Start, ascii85End - ascii85Start); + ASInputStream ascii85 = this.getSource().getStream(ascii85Start, ascii85End - ascii85Start); COSFilterASCII85Decode ascii85Decode = new COSFilterASCII85Decode(ascii85); byte[] buf = new byte[(int) (ascii85End - ascii85Start)]; int read = ascii85Decode.read(buf); @@ -537,109 +218,6 @@ private void readASCII85() throws IOException { this.token.setByteValue(buf); } - protected void readName() throws IOException { - this.token.clearValue(); - byte ch; - while (!this.source.isEOF()) { - ch = this.source.readByte(); - if (CharTable.isTokenDelimiter(ch)) { - this.source.unread(); - break; - } - - // if ch == # (0x23) - if (ch == 0x23) { - byte ch1; - byte ch2; - byte dc; - ch1 = this.source.readByte(); - if (!source.isEOF() && COSFilterASCIIHexDecode.decodeLoHex(ch1) != COSFilterASCIIHexDecode.ER) { - dc = COSFilterASCIIHexDecode.decodeLoHex(ch1); - ch2 = this.source.readByte(); - if (!this.source.isEOF() && COSFilterASCIIHexDecode.decodeLoHex(ch2) != COSFilterASCIIHexDecode.ER) { - dc = (byte) ((dc << 4) + COSFilterASCIIHexDecode.decodeLoHex(ch2)); - appendToToken(dc); - } else { - appendToToken(ch); - appendToToken(ch1); - this.source.unread(); - } - } else { - appendToToken(ch); - this.source.unread(); - } - } else { - appendToToken(ch); - } - } - } - - private void readToken() throws IOException { - this.token.clearValue(); - byte ch; - while (!this.source.isEOF()) { - ch = this.source.readByte(); - if (CharTable.isTokenDelimiter(ch)) { - this.source.unread(); - break; - } - - appendToToken(ch); - } - } - - protected void readNumber() throws IOException { - try { - initializeToken(); - this.token.clearValue(); - this.token.type = Token.Type.TT_INTEGER; - byte ch; - while (!this.source.isEOF()) { - ch = this.source.readByte(); - if (CharTable.isTokenDelimiter(ch)) { - this.source.unread(); - break; - } - if (ch >= '0' && ch <= '9') { - appendToToken(ch); - } else if (ch == '.') { - this.token.type = Token.Type.TT_REAL; - appendToToken(ch); - } else { - this.source.unread(); - break; - } - } - if (this.token.type == Token.Type.TT_INTEGER) { - long value = Long.valueOf(this.token.getValue()).longValue(); - this.token.integer = value; - this.token.real = value; - } else { - double value = Double.valueOf(this.token.getValue()).doubleValue(); - this.token.integer = Math.round(value); - this.token.real = value; - } - } catch (NumberFormatException e) { - LOGGER.log(Level.FINE, "", e); - this.token.integer = Math.round(Double.MAX_VALUE); - this.token.real = Double.MAX_VALUE; - } - } - - protected void initializeToken() { - if (this.token == null) { - this.token = new Token(); - } - } - - protected void clearToken() { - this.token.clearValue(); - } - - protected void appendToToken(final int ch) { - this.token.append(ch); - } - public static byte[] getRawBytes(String string) { byte[] res = new byte[string.length()]; for (int i = 0; i < string.length(); ++i) { @@ -652,15 +230,20 @@ protected void skipExpectedCharacter(char exp) throws IOException { char c = (char) this.source.readByte(); if (c != exp) { throw new IOException(getErrorMessage("Unexpected character: expected " + exp + " but got " + c, - this.source.getCurrentOffset() - 1)); + this.getSource().getCurrentOffset() - 1)); } } protected String getErrorMessage(String message) { - return getErrorMessage(message, source.getCurrentOffset()); + return getErrorMessage(message, getSource().getCurrentOffset()); } protected String getErrorMessage(String message, long offset) { return message + "(offset = " + offset + ")"; } + + @Override + protected SeekableInputStream getSource() { + return (SeekableInputStream) source; + } } diff --git a/src/main/java/org/verapdf/parser/SeekableCOSParser.java b/src/main/java/org/verapdf/parser/SeekableCOSParser.java index 92edf17d..cc675e60 100644 --- a/src/main/java/org/verapdf/parser/SeekableCOSParser.java +++ b/src/main/java/org/verapdf/parser/SeekableCOSParser.java @@ -254,7 +254,7 @@ protected COSObject getDictionary() throws IOException { throw new IOException(getErrorMessage(StringExceptions.INVALID_PDF_DICTONARY)); } - long reset = this.source.getOffset(); + long reset = this.getSource().getOffset(); if (this.flag) { nextToken(); } @@ -264,7 +264,7 @@ protected COSObject getDictionary() throws IOException { token.keyword == Token.Keyword.KW_STREAM) { return getStream(dict); } - this.source.seek(reset); + this.getSource().seek(reset); this.flag = true; return dict; @@ -285,7 +285,7 @@ protected COSObject getStream(COSObject dict) throws IOException { } checkStreamSpacings(dict); - long streamStartOffset = source.getOffset(); + long streamStartOffset = getSource().getOffset(); COSObject length = dict.getKey(ASAtom.LENGTH); if (this.keyOfCurrentObject != null && length.isIndirect() && this.keyOfCurrentObject.equals(length.getKey())) { @@ -293,7 +293,7 @@ protected COSObject getStream(COSObject dict) throws IOException { " which references to its own object key")); } Long size = length.getInteger(); - source.seek(streamStartOffset); + getSource().seek(streamStartOffset); boolean streamLengthValid = checkStreamLength(size); @@ -312,27 +312,27 @@ protected COSObject getStream(COSObject dict) throws IOException { int eolLength = 0; boolean isPrevCR = false; while (realStreamSize == -1 && !source.isEOF()) { - long bytesRead = source.read(buffer, bufferLength); + long bytesRead = getSource().read(buffer, bufferLength); for (int i = 0; i < bytesRead; i++) { if (buffer[i] == 101) { - long reset = source.getOffset(); + long reset = getSource().getOffset(); long possibleEndStreamOffset = reset - bytesRead + i - eolLength; - source.seek(possibleEndStreamOffset); + getSource().seek(possibleEndStreamOffset); nextToken(); if (token.type == Token.Type.TT_KEYWORD && token.keyword == Token.Keyword.KW_ENDSTREAM) { realStreamSize = possibleEndStreamOffset - streamStartOffset; dict.setRealStreamSize(realStreamSize); - source.seek(streamStartOffset); + getSource().seek(streamStartOffset); ASInputStream stm = super.getRandomAccess(realStreamSize); dict.setData(stm); - source.seek(possibleEndStreamOffset); + getSource().seek(possibleEndStreamOffset); if (stm instanceof InternalInputStream) { this.document.addFileResource(new ASFileStreamCloser(stm)); } break; } - source.seek(reset); + getSource().seek(reset); } //we need to subtract eol before endstream length from stream length @@ -382,13 +382,13 @@ private boolean checkStreamLength(Long streamLength) throws IOException { return false; } boolean validLength = true; - long start = source.getOffset(); + long start = getSource().getOffset(); long expectedEndstreamOffset = start + streamLength; - if (expectedEndstreamOffset > source.getStreamLength()) { + if (expectedEndstreamOffset > getSource().getStreamLength()) { validLength = false; LOGGER.log(Level.WARNING, getErrorMessage("Couldn't find expected endstream keyword", expectedEndstreamOffset)); } else { - source.seek(expectedEndstreamOffset); + getSource().seek(expectedEndstreamOffset); nextToken(); final Token token = getToken(); @@ -398,7 +398,7 @@ private boolean checkStreamLength(Long streamLength) throws IOException { LOGGER.log(Level.WARNING, getErrorMessage("Couldn't find expected endstream keyword", expectedEndstreamOffset)); } - source.seek(start); + getSource().seek(start); } return validLength; } @@ -407,7 +407,7 @@ private void checkEndstreamSpacings(COSObject stream, long streamStartOffset, Lo skipSpaces(); byte eolCount = 0; - long approximateLength = source.getOffset() - streamStartOffset; + long approximateLength = getSource().getOffset() - streamStartOffset; long expected = expectedLength == null ? 0 : expectedLength; long diff = approximateLength - expected; diff --git a/src/main/java/org/verapdf/parser/SignatureParser.java b/src/main/java/org/verapdf/parser/SignatureParser.java index 2f7a645b..f353a8c1 100644 --- a/src/main/java/org/verapdf/parser/SignatureParser.java +++ b/src/main/java/org/verapdf/parser/SignatureParser.java @@ -97,7 +97,7 @@ private void parseDictionary() * @throws IOException If there is an error parsing the dictionary object. */ private void passCOSDictionaryValue() throws IOException { - long numOffset = source.getOffset(); + long numOffset = getSource().getOffset(); COSObject number = nextObject(); skipSpaces(); if (!isDigit()) { @@ -120,7 +120,7 @@ private void passCOSDictionaryValue() throws IOException { * @return array of 4 longs, which is byte range array. */ public long[] getByteRangeBySignatureOffset(long signatureOffset) throws IOException { - source.seek(signatureOffset); + getSource().seek(signatureOffset); skipID(); byteRange[0] = 0; parseDictionary(); @@ -152,16 +152,16 @@ private boolean parseSignatureNameValuePair() throws IOException { private void parseSignatureValue() throws IOException { skipSpaces(); - long numOffset1 = source.getOffset(); + long numOffset1 = getSource().getOffset(); COSObject number = nextObject(); - long numOffset2 = source.getOffset(); + long numOffset2 = getSource().getOffset(); skipSpaces(); if (!isDigit()) { byteRange[1] = numOffset1; byteRange[2] = numOffset2; return; } - long genOffset = source.getOffset(); + long genOffset = getSource().getOffset(); COSObject generationNumber = nextObject(); skipSpaces(); int c = source.read(); @@ -175,16 +175,16 @@ private void parseSignatureValue() throws IOException { COSKey key = new COSKey(number.getInteger().intValue(), generationNumber.getInteger().intValue()); long keyOffset = this.document.getOffset(key).longValue(); - source.seek(keyOffset + document.getHeader().getHeaderOffset()); + getSource().seek(keyOffset + document.getHeader().getHeaderOffset()); parseSignatureValue(); // Recursive parsing to get to the contents hex string itself } if (c == 'o') { // Object itself skipExpectedCharacter('b'); skipExpectedCharacter('j'); skipSpaces(); - numOffset1 = source.getOffset(); + numOffset1 = getSource().getOffset(); nextObject(); - numOffset2 = source.getOffset(); + numOffset2 = getSource().getOffset(); byteRange[1] = numOffset1; byteRange[2] = numOffset2; } else { @@ -209,7 +209,7 @@ private long getOffsetOfNextXRef(long currentOffset) { */ private long getOffsetOfNextEOF(long currentOffset) throws IOException { byte[] buffer = new byte[EOF_STRING.length]; - source.seek(currentOffset + document.getHeader().getHeaderOffset()); + getSource().seek(currentOffset + document.getHeader().getHeaderOffset()); source.read(buffer); source.unread(buffer.length - 1); isStream = false; @@ -228,14 +228,14 @@ private long getOffsetOfNextEOF(long currentOffset) throws IOException { } if (source.isEOF()) { - source.seek(currentOffset + document.getHeader().getHeaderOffset()); - return source.getStreamLength(); + getSource().seek(currentOffset + document.getHeader().getHeaderOffset()); + return getSource().getStreamLength(); } if (unreadLength > 0) { source.unread(unreadLength); } } - long result = source.getOffset() - 1 + buffer.length; // byte right after '%%EOF' + long result = getSource().getOffset() - 1 + buffer.length; // byte right after '%%EOF' this.source.skip(EOF_STRING.length - 1); this.floatingBytesNumber = 0; this.isStreamEnd = false; @@ -257,7 +257,7 @@ private long getOffsetOfNextEOF(long currentOffset) throws IOException { if (nextByte == -1) { this.isStreamEnd = true; } - source.seek(currentOffset + document.getHeader().getHeaderOffset()); + getSource().seek(currentOffset + document.getHeader().getHeaderOffset()); return result; } @@ -265,18 +265,18 @@ private boolean isEOFFound(byte[] buffer) throws IOException { if (!Arrays.equals(buffer, EOF_STRING)) { return false; } - long pointer = this.source.getOffset(); + long pointer = this.getSource().getOffset(); this.source.unread(2); int byteBeforeEOF = this.source.peek(); while (!isLF(byteBeforeEOF)) { this.source.unread(); byteBeforeEOF = this.source.peek(); if (byteBeforeEOF != CharTable.ASCII_SPACE) { - this.source.seek(pointer); + this.getSource().seek(pointer); return false; } } - this.source.seek(pointer); + this.getSource().seek(pointer); return true; } diff --git a/src/main/java/org/verapdf/pd/font/type1/Type1FontProgram.java b/src/main/java/org/verapdf/pd/font/type1/Type1FontProgram.java index a347c6c7..4cc6496d 100644 --- a/src/main/java/org/verapdf/pd/font/type1/Type1FontProgram.java +++ b/src/main/java/org/verapdf/pd/font/type1/Type1FontProgram.java @@ -161,7 +161,7 @@ private void processObject(COSObject nextObject) throws IOException, PostScriptE nextObject.getString().equals(Type1StringConstants.EEXEC_STRING)) { this.skipSpacesExceptNullByte(); Type1PrivateParser parser = null; - try (ASInputStream eexecEncoded = this.source.getStreamUntilToken( + try (ASInputStream eexecEncoded = this.getSource().getStreamUntilToken( CLEAR_TO_MARK_BYTES)) { try (ASInputStream eexecDecoded = new EexecFilterDecode( eexecEncoded, false)) { @@ -332,7 +332,7 @@ private String getGlyph(int code) { * program. */ public ASFileStreamCloser getFontProgramResource() { - return new ASFileStreamCloser(this.source); + return new ASFileStreamCloser(this.getSource()); } @Override diff --git a/src/main/java/org/verapdf/pd/font/type1/Type1PrivateParser.java b/src/main/java/org/verapdf/pd/font/type1/Type1PrivateParser.java index a194eefb..ad25a480 100644 --- a/src/main/java/org/verapdf/pd/font/type1/Type1PrivateParser.java +++ b/src/main/java/org/verapdf/pd/font/type1/Type1PrivateParser.java @@ -146,9 +146,9 @@ private void processToken() throws IOException { long toSkip = this.getToken().integer; skipRD(); this.skipSpaces(); - long beginOffset = this.source.getOffset(); - this.source.skip(toSkip); - try (ASInputStream chunk = this.source.getStream(beginOffset, toSkip); + long beginOffset = this.getSource().getOffset(); + this.getSource().skip(toSkip); + try (ASInputStream chunk = this.getSource().getStream(beginOffset, toSkip); ASInputStream eexecDecode = new EexecFilterDecode( chunk, true, this.lenIV); ASInputStream decodedCharString = new ASMemoryInStream(eexecDecode)) { Type1CharStringParser parser = new Type1CharStringParser(decodedCharString, subrWidths); @@ -192,9 +192,9 @@ private boolean decodeCharString() throws IOException { long charstringLength = this.getToken().integer; this.skipRD(); this.skipSingleSpace(); - long beginOffset = this.source.getOffset(); + long beginOffset = this.getSource().getOffset(); this.source.skip((int) charstringLength); - try (ASInputStream chunk = this.source.getStream(beginOffset, charstringLength); + try (ASInputStream chunk = this.getSource().getStream(beginOffset, charstringLength); ASInputStream eexecDecode = new EexecFilterDecode( chunk, true, this.lenIV); ASInputStream decodedCharString = new ASMemoryInStream(eexecDecode)) { Type1CharStringParser parser = new Type1CharStringParser(decodedCharString, subrWidths);