// com.sun.xml.rpc.sp.InputEntity Maven / Gradle / Ivy
/*
* Copyright (c) 1997, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License v. 2.0, which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* This Source Code may also be made available under the following Secondary
* Licenses when the conditions for such availability set forth in the
* Eclipse Public License v. 2.0 are satisfied: GNU General Public License,
* version 2 with the GNU Classpath Exception, which is available at
* https://www.gnu.org/software/classpath/license.html.
*
* SPDX-License-Identifier: EPL-2.0 OR GPL-2.0 WITH Classpath-exception-2.0
*/
package com.sun.xml.rpc.sp;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Locale;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
/**
* This is how the parser talks to its input entities, of all kinds.
* The entities are in a stack.
*
* For internal entities, the character arrays are referenced here,
* and read from as needed (they're read-only). External entities have
* mutable buffers, that are read into as needed.
*
*
* <p>Note: This maps CRLF (and CR) to LF without regard for
* whether it's in an external (parsed) entity or not. The XML 1.0 spec
* is inconsistent in explaining EOL handling; this is the sensible way.
*
* @author David Brownell
* @author JAX-RPC RI Development Team
*/
final class InputEntity implements Locator {
// Read window into buf: index of the next unread character, and the index
// one past the last valid character.
private int start, finish;
// Backing characters: the caller-supplied array for internal entities, or a
// BUFSIZ-sized array that fillbuf() replenishes for entities with a reader.
private char buf[];
// Line counter used for diagnostics; starts at 1 and is only adjusted for
// entities that have a reader.
private int lineNumber = 1;
// True after getc() handed out the first half of a surrogate pair and
// therefore expects the second half next.
private boolean returnedFirstHalf = false;
// Set while getc() examines the character following a CR, so that an LF
// found there is not counted as a second line.
private boolean maybeInCRLF = false;
// name of entity (never main document or unnamed DTD PE)
private String name;
// Enclosing entity on the stack; null means this is the document entity
// (see isDocument()).
private InputEntity next;
// for system and public IDs in diagnostics
private InputSource input;
// this is a buffer; some buffers can be replenished.
// A null reader marks an internal entity (see isInternal()).
private Reader reader;
// Set by close(); fillbuf() stops reading once this is true.
private boolean isClosed;
// Locale handed to the message catalog when building diagnostics.
private Locale locale;
// Text collected by fillbuf() while "remembering" spans a buffer refill.
private StringBuffer rememberedText;
// Offset in buf where remembering began; 0 doubles as "not remembering".
private int startRemember;
// record if this is a PE, so endParsedEntity won't be called
private boolean isPE;
// InputStreamReader throws an internal per-read exception, so
// we minimize reads. We also add a byte to compensate for the
// "ungetc" byte we keep, so that our downstream reads are as
// nicely sized as we can make them.
final private static int BUFSIZ = 2 * 1024 + 1;
// NOTE(review): not referenced anywhere in this class as shown; presumably a
// leftover from a callback-based version -- confirm before removing.
final private static char newline[] = { '\n' };
// buffer used for storing unparsed data
// (written by unparsedContent(), read by getUnparsedContent())
private char[] cdataBuf = null;
/**
 * Factory method: creates a blank entity bound to the given locale.
 * One of the {@code init} methods must be called before the entity
 * can be read from.
 *
 * @param l locale used when formatting diagnostic messages
 * @return a new, uninitialized entity
 */
public static InputEntity getInputEntity(Locale l) {
    final InputEntity entity = new InputEntity();
    entity.locale = l;
    return entity;
}
// Private: instances are only created through getInputEntity(Locale).
private InputEntity() {
}
/**
 * Predicate: returns true iff this is an internal entity reader (one
 * with no underlying {@code Reader}), and so may safely be "popped" as
 * needed. External entities have syntax to uphold; internal parameter
 * entities have at most validity constraints to monitor. Also, only
 * external entities get decent location diagnostics.
 *
 * @return true when this entity reads from a fixed character array
 */
public boolean isInternal() {
return reader == null;
}
/**
 * Predicate: returns true iff this is the toplevel document, i.e. there
 * is no enclosing entity below it on the stack.
 *
 * @return true when this entity has no {@code next} entity
 */
public boolean isDocument() {
return next == null;
}
/**
 * Predicate: returns true iff this is a PE expansion. Per the original
 * author's note, callers use this so that the lexical event listener's
 * endParsedEntity callback is not invoked for parameter entities.
 *
 * @return the {@code isPE} flag passed to {@code init}
 */
public boolean isParameterEntity() {
return isPE;
}
/**
 * Returns the name of the current entity, as passed to {@code init}.
 *
 * @return the entity name; may be null
 */
public String getName() {
return name;
}
/**
 * Initializes this entity to read an external parsed entity.
 *
 * <p>The reader is chosen in this order: the source's character stream;
 * else its byte stream (using the source's declared encoding when there
 * is one); else a stream opened from the source's system ID.
 *
 * @param in source describing the entity; also kept for diagnostics
 * @param name entity name, used for recursion checking
 * @param stack the entity that references this one, or null
 * @param isPE true if this is a parameter entity
 * @throws ParseException if the entity recursively includes itself
 * @throws IOException if the input cannot be opened
 */
public void init(
    InputSource in,
    String name,
    InputEntity stack,
    boolean isPE)
    throws ParseException, IOException {
    input = in;
    this.isPE = isPE;
    reader = in.getCharacterStream();
    if (reader == null) {
        InputStream bytes = in.getByteStream();
        if (bytes == null) {
            // We opened this stream ourselves, so we must not leak it if
            // the reader cannot be created on top of it.
            InputStream opened = new URL(in.getSystemId()).openStream();
            boolean created = false;
            try {
                reader = XmlReader.createReader(opened);
                created = true;
            } finally {
                if (!created) {
                    try {
                        opened.close();
                    } catch (IOException ignored) {
                        // best effort; the original failure propagates
                    }
                }
            }
        } else {
            String encoding = in.getEncoding();
            if (encoding != null) {
                reader = XmlReader.createReader(bytes, encoding);
            } else {
                reader = XmlReader.createReader(bytes);
            }
        }
    }
    next = stack;
    buf = new char[BUFSIZ];
    this.name = name;
    checkRecursion(stack);
}
/**
 * Initializes this entity to read an internal parsed entity. The
 * supplied array is used directly as the buffer and is treated as
 * read-only; no reader is attached.
 *
 * @param b replacement text of the entity
 * @param name entity name, used for recursion checking
 * @param stack the entity that references this one, or null
 * @param isPE true if this is a parameter entity
 * @throws ParseException if the entity recursively includes itself
 */
public void init(char b[], String name, InputEntity stack, boolean isPE)
    throws ParseException {
    this.next = stack;
    this.buf = b;
    this.finish = b.length;
    this.name = name;
    this.isPE = isPE;
    checkRecursion(stack);
}
/**
 * Reports fatal error P-069 if an entity with this entity's name is
 * already on the stack. The scan begins at the entry below
 * {@code stack}; {@code stack} itself is not compared.
 *
 * @param stack the entity this one was pushed on top of, or null
 * @throws ParseException if a same-named ancestor is found
 */
private void checkRecursion(InputEntity stack) throws ParseException {
    if (stack == null) {
        return;
    }
    InputEntity ancestor = stack.next;
    while (ancestor != null) {
        final String ancestorName = ancestor.name;
        if (ancestorName != null && ancestorName.equals(name)) {
            fatal("P-069", new Object[] { name });
        }
        ancestor = ancestor.next;
    }
}
/**
 * Closes this entity and returns the one below it on the stack.
 *
 * @return the enclosing entity, or null if this was the bottom one
 */
public InputEntity pop() throws ParseException, IOException {
// caller has ensured there's nothing left to read
close();
return next;
}
/**
 * Returns true iff there's no more data to consume. When the buffer is
 * empty this first attempts to refill it, so calling it can advance the
 * underlying reader.
 *
 * <p>Called to ensure well-formedness of included entities and to pop
 * input entities appropriately; EOF is not always legal.
 */
public boolean isEOF() throws ParseException, IOException {
    if (start < finish) {
        return false;
    }
    fillbuf();
    return start >= finish;
}
/**
 * Returns the name of the encoding in use, else null; the name
 * returned is in as standard a form as we can get.
 *
 * @return the encoding reported by the reader, or null for internal
 *         entities and for readers that do not expose one
 */
public String getEncoding() {
    // instanceof is false for a null reader, which covers internal entities
    if (reader instanceof XmlReader) {
        return ((XmlReader) reader).getEncoding();
    }
    // prefer a java2std() call to normalize names...
    if (reader instanceof InputStreamReader) {
        return ((InputStreamReader) reader).getEncoding();
    }
    return null;
}
/**
 * Returns the next name char, or NUL if the next character is not a
 * name character (it is then left unread) or input is exhausted.
 * Faster than getc(), and the common "name or nmtoken must be next"
 * case won't need ungetc().
 */
public char getNameChar() throws ParseException, IOException {
    if (start >= finish) {
        fillbuf();
    }
    if (start >= finish) {
        return 0;
    }
    final char candidate = buf[start];
    if (!XmlChars.isNameChar(candidate)) {
        return 0;
    }
    start++;
    return candidate;
}
/**
 * Gets the next Java character -- might be part of an XML
 * text character represented by a surrogate pair, or be
 * the end of the entity.
 *
 * <p>For entities with a reader, CR and CRLF are returned as a single
 * LF and the line counter is advanced. Characters outside the XML
 * {@code Char} production are reported as fatal errors.
 *
 * @return the next character
 * @throws EndOfInputException if no character is available
 * @throws ParseException on an illegal character or unpaired surrogate
 */
public char getc() throws ParseException, IOException {
if (finish <= start)
fillbuf();
if (finish > start) {
char c = buf[start++];
// [2] Char ::= #x0009 | #x000A | #x000D
// | [#x0020-#xD7FF]
// | [#xE000-#xFFFD]
// plus surrogate _pairs_ representing [#x10000-#x10ffff]
// Previous call returned a high surrogate: this char must be the low half.
if (returnedFirstHalf) {
if (c >= 0xdc00 && c <= 0xdfff) {
returnedFirstHalf = false;
return c;
} else
fatal("P-070", new Object[] { Integer.toHexString(c)});
}
// Fast path: ordinary characters and TAB.
if ((c >= 0x0020 && c <= 0xD7FF)
|| c == 0x0009 // no surrogates!
|| (c >= 0xE000 && c <= 0xFFFD))
return c;
//
// CRLF and CR are both line ends; map both to LF, and
// keep line count correct.
//
else if (c == '\r' && !isInternal()) {
// Look one char ahead; the flag stops the nested call from
// counting an LF here as a separate line.
maybeInCRLF = true;
c = getc();
if (c != '\n')
ungetc();
maybeInCRLF = false;
lineNumber++;
return '\n';
} else if (c == '\n' || c == '\r') { // LF, or 2nd char in CRLF
if (!isInternal() && !maybeInCRLF)
lineNumber++;
return c;
}
// surrogates...
if (c >= 0xd800 && c < 0xdc00) {
returnedFirstHalf = true;
return c;
}
fatal("P-071", new Object[] { Integer.toHexString(c)});
}
throw new EndOfInputException();
}
/**
 * Consumes the next character iff it equals {@code c}.
 *
 * @param c the character expected next
 * @return true if the character matched and was consumed; false if it
 *         differs or no input is available
 */
public boolean peekc(char c) throws ParseException, IOException {
    if (start >= finish) {
        fillbuf();
    }
    if (start < finish && buf[start] == c) {
        start++;
        return true;
    }
    return false;
}
/**
 * Pushes the most recently read character back onto the input.
 * Two character pushback is guaranteed.
 *
 * <p>Un-reading a line end decrements the line counter (entities with a
 * reader only); un-reading anything else clears the pending-surrogate
 * state.
 *
 * @throws InternalError if there is nothing in the buffer to push back
 */
public void ungetc() {
    if (start == 0) {
        throw new InternalError("ungetc");
    }
    start--;
    final boolean lineEnd = buf[start] == '\n' || buf[start] == '\r';
    if (!lineEnd) {
        returnedFirstHalf = false;
        return;
    }
    if (!isInternal()) {
        lineNumber--;
    }
}
/**
 * Skips optional grammatical whitespace (discarded).
 *
 * <p>For entities with a reader the line counter is advanced once per
 * line end, where CR, LF and CRLF each count as a single line end.
 *
 * @return true if at least one whitespace character was skipped
 */
public boolean maybeWhitespace() throws ParseException, IOException {
    boolean isSpace = false;
    // true only while the immediately preceding character was a CR
    boolean sawCR = false;
    // [3] S ::= #20 | #09 | #0D | #0A
    for (;;) {
        if (finish <= start)
            fillbuf();
        if (finish <= start)
            return isSpace;
        char c = buf[start];
        if (c != 0x20 && c != 0x09 && c != '\n' && c != '\r')
            return isSpace; // leave the non-space character unread
        start++;
        isSpace = true;
        //
        // CR, LF are line endings ... CRLF is one, not two!
        //
        if (!isInternal()) {
            if (c == '\r')
                lineNumber++;
            else if (c == '\n' && !sawCR)
                lineNumber++;
        }
        // Reset on every character so that only an LF directly after a
        // CR is treated as the second half of a CRLF.
        sawCR = (c == '\r');
    }
}
/**
 * retrieve normal content
 */
// in certain cases, start will not mean the end of parsed
// content, so use this variable to record the actual end
// of input.
// -1 means "not set": the content ends at start. parsedContent() sets it
// when it skips characters (such as the LF of a CRLF) that must not be
// part of the returned text.
private int end = -1;
/**
 * Returns the next run of character data as a string.
 *
 * <p>In non-coalescing mode a single call to {@code parsedContent()} is
 * made and its run is returned. In coalescing mode runs are appended
 * until {@code parsedContent()} reports no more content or the entity
 * is at EOF.
 *
 * @param coalescing whether adjacent runs should be merged
 * @return the content, or null if no character data is next
 */
String getParsedContent(boolean coalescing)
    throws ParseException, IOException {
    // Refill an exhausted buffer before capturing the start offset, so
    // the offset refers to the buffer contents that will be scanned.
    // With nothing to read there is no content either way.
    if (isEOF())
        return null;
    int s = start;
    if (!coalescing) {
        // fast path: no StringBuffer needed for a single run
        if (!parsedContent())
            return null;
        // Honor the explicit end marker: after CRLF, start has already
        // skipped the LF, which must not be returned a second time.
        int stop = (end == -1) ? start : end;
        end = -1;
        // NOTE(review): s can still be stale if parsedContent() refilled
        // the buffer itself (its ']' lookahead path) -- not addressed here.
        return new String(buf, s, stop - s);
    }
    StringBuffer content = null;
    while (parsedContent()) {
        /* lazy initiating */
        if (content == null) {
            content = new StringBuffer();
        }
        /*
         * if it is not specially marked, use the default
         * start as our end pointer
         */
        if (end == -1)
            end = start;
        // bug fix for bug: 4780479
        // NOTE(review): after a refill with pushback, unread data appears
        // to begin at index 1 rather than 0 -- confirm this reset.
        if (start < s) // must have started new buffer
            s = 0;
        content.append(buf, s, end - s);
        end = -1;
        /*
         * calling isEOF has the side effect of fillbuf,
         * so start will be properly updated.
         */
        if (isEOF()) {
            break;
        }
        s = start;
    }
    return (content == null ? null : content.toString());
}
/**
 * Normal content; whitespace in markup may be handled
 * specially if the parser uses the content model.
 *
 * <p>Content terminates with markup delimiter characters,
 * namely ampersand ({@code &}) and left angle bracket ({@code <}).
 *
 * <p>This scans one run of character data in the buffer and advances
 * {@code start} past it; callers read the run from the buffer (see
 * {@code getParsedContent}). When characters at the tail of the run
 * must be excluded, the {@code end} field is set to the real end.
 * (The original comment describes delivery through the document
 * handler's characters() method; no handler is called in this code.)
 *
 * @return true if any content was found
 * @throws ParseException on illegal characters, a literal
 *         {@code ]]>}, or broken surrogate pairs
 */
public boolean parsedContent() throws ParseException, IOException {
// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
int first; // first char to return
int last; // last char to return
boolean sawContent; // sent any chars?
char c;
// deliver right out of the buffer, until delimiter, EOF,
// or error, refilling as we go
for (first = last = start, sawContent = false;; last++) {
// buffer empty?
if (last >= finish) {
// Return what was scanned so far before touching the buffer.
if (last > first) {
sawContent = true;
start = last;
return sawContent;
}
if (isEOF()) { // calls fillbuf
return sawContent;
}
// Buffer was refilled: restart the scan at the new start.
first = start;
last = first - 1; // incremented in loop
continue;
}
c = buf[last];
//
// pass most chars through ASAP; this inlines the code of
// [2] !XmlChars.isChar(c) leaving only characters needing
// special treatment ... line ends, surrogates, and:
// 0x0026 == '&'
// 0x003C == '<'
// 0x005D == ']'
// Comparisons ordered for speed on 'typical' text
//
if ((c > 0x005D && c <= 0xD7FF) // a-z and more
|| (c < 0x0026 && c >= 0x0020) // space & punct
|| (c > 0x003C && c < 0x005D) // A-Z & punct
|| (c > 0x0026 && c < 0x003C) // 0-9 & punct
|| c == 0x0009
|| (c >= 0xE000 && c <= 0xFFFD))
continue;
// terminate on markup delimiters
if (c == '<' || c == '&')
break;
// count lines
if (c == '\n') {
if (!isInternal())
lineNumber++;
continue;
}
// External entities get CR, CRLF --> LF mapping
// Internal ones got it already, and we can't repeat
// else we break char ref handling!!
if (c == '\r') {
if (isInternal())
continue;
sawContent = true;
lineNumber++;
if (finish > (last + 1)) {
if (buf[last + 1] == '\n') {
// CRLF: overwrite the CR with LF, mark the content end just
// before the original LF, and skip that LF below.
last++;
buf[last - 1] = '\n';
end = last;
} else {
buf[last] = '\n';
}
} else { // CR at end of buffer
// case not yet handled: CRLF here will look like two lines
buf[last] = '\n';
}
// The run ends right after the line end.
first = start = last + 1;
//continue;
return sawContent;
}
// ']]>' is a WF error -- must fail if we see it
if (c == ']') {
switch (finish - last) {
// for suspicious end-of-buffer cases, get more data
// into the buffer to rule out this sequence.
case 2 :
if (buf[last + 1] != ']')
continue;
// FALLTHROUGH
case 1 :
// No more data can arrive, so "]]>" cannot be completed.
if (reader == null || isClosed)
continue;
if (last == first)
throw new InternalError("fillbuf");
// Step back one char; return anything scanned before it first.
last--;
if (last > first) {
sawContent = true;
start = last;
return sawContent;
}
fillbuf();
first = last = start;
continue;
// otherwise any "]]>" would be buffered, and we can
// see right away if that's what we have
default :
if (buf[last + 1] == ']' && buf[last + 2] == '>')
fatal("P-072", null);
continue;
}
}
// correctly paired surrogates are OK
if (c >= 0xd800 && c <= 0xdfff) {
// Surrogate is the last buffered char: its partner is not visible yet.
if ((last + 1) >= finish) {
if (last > first) {
sawContent = true;
end = last;
start = last + 1;
return sawContent;
}
if (isEOF()) { // calls fillbuf
fatal("P-081", new Object[] { Integer.toHexString(c)});
}
first = start;
last = first;
continue;
}
if (checkSurrogatePair(last))
last++;
else {
last--;
// also terminate on surrogate pair oddities
break;
}
continue;
}
fatal("P-071", new Object[] { Integer.toHexString(c)});
}
// Stopped at a delimiter: report whatever precedes it.
if (last == first)
return sawContent;
start = last;
return true;
}
/**
 * Retrieves unparsed (CDATA section) content as a string.
 *
 * @param ignorableWhitespace passed through to {@code unparsedContent}
 * @param whitespaceInvalidMessage passed through to
 *        {@code unparsedContent}
 * @return the text of the CDATA section, or null if no CDATA section
 *         starts at the current position
 */
String getUnparsedContent(
    boolean ignorableWhitespace,
    String whitespaceInvalidMessage)
    throws ParseException, IOException {
    if (!unparsedContent(ignorableWhitespace, whitespaceInvalidMessage)) {
        return null;
    }
    // unparsedContent() leaves exactly the section's text in cdataBuf
    return new String(cdataBuf);
}
/**
 * CDATA -- character data, terminated by "]]>" and optionally
 * including unescaped markup delimiters (ampersand and left angle
 * bracket). This should otherwise be exactly like character data,
 * modulo differences in error report details.
 *
 * <p>The text of the section is accumulated in {@code cdataBuf},
 * across buffer refills if necessary.
 *
 * @param ignorableWhitespace if true, the section is expected to hold
 *        only whitespace; see {@code whitespaceInvalidMessage}
 * @param whitespaceInvalidMessage if non-null, the message ID reported
 *        as a fatal error when a scanned chunk of the section was all
 *        whitespace while {@code ignorableWhitespace} is true
 * @return false if the input does not continue with
 *         {@code ![CDATA[}; true once the whole section was consumed
 * @throws ParseException on illegal characters or if the input ends
 *         before the closing "]]>" (P-073)
 */
public boolean unparsedContent(
    boolean ignorableWhitespace,
    String whitespaceInvalidMessage)
    throws ParseException, IOException {
    // [18] CDSect  ::= CDStart CData CDEnd
    // [19] CDStart ::= '<![CDATA['
    // [20] CData   ::= (Char* - (Char* ']]>' Char*))
    // [21] CDEnd   ::= ']]>'
    // caller peeked the leading '<' ...
    if (!peek("![CDATA[", null))
        return false;
    // only a literal ']]>' stops this ...
    int last;
    char[] tempBuf = null;
    int cdataLast = 0; // number of chars of this section already in cdataBuf
    for (;;) { // until ']]>' seen
        boolean done = false;
        char c;
        int s = start; // first buffered char of this chunk
        // don't report ignorable whitespace as "text" for
        // validation purposes.
        boolean white = ignorableWhitespace;
        for (last = start; last < finish; last++) {
            c = buf[last];
            //
            // Reject illegal characters.
            //
            if (!XmlChars.isChar(c)) {
                white = false;
                if (c >= 0xd800 && c <= 0xdfff) {
                    if (checkSurrogatePair(last)) {
                        last++;
                        continue;
                    } else {
                        // partner not buffered yet: back up and refill
                        last--;
                        break;
                    }
                }
                fatal(
                    "P-071",
                    new Object[] { Integer.toHexString(buf[last])});
            }
            if (c == '\n') {
                if (!isInternal())
                    lineNumber++;
                continue;
            }
            if (c == '\r') {
                // As above, we can't repeat CR/CRLF --> LF mapping
                if (isInternal())
                    continue;
                if (white) {
                    if (whitespaceInvalidMessage != null)
                        fatal(
                            Parser.messages.getMessage(
                                locale,
                                whitespaceInvalidMessage));
                }
                lineNumber++;
                if (finish > (last + 1)) {
                    if (buf[last + 1] == '\n')
                        last++;
                } else { // CR at end of buffer
                    // case not yet handled ... as above
                }
                start = last + 1;
                continue;
            }
            if (c != ']') {
                if (c != ' ' && c != '\t')
                    white = false;
                continue;
            }
            if ((last + 2) < finish) {
                if (buf[last + 1] == ']' && buf[last + 2] == '>') {
                    done = true;
                    break;
                }
                white = false;
                continue;
            } else {
                // ']' too close to the end of the buffer to decide;
                // stop here and refill.
                //last--;
                break;
            }
        }
        if (white) {
            if (whitespaceInvalidMessage != null)
                fatal(
                    Parser.messages.getMessage(
                        locale,
                        whitespaceInvalidMessage));
        }
        if (done) {
            // fix #4798903
            // Trim the accumulated text to its exact length.
            if (cdataBuf != null) {
                tempBuf = new char[cdataLast + last - s];
                System.arraycopy(cdataBuf, 0, tempBuf, 0, cdataLast);
            } else
                tempBuf = new char[last - s];
            System.arraycopy(buf, s, tempBuf, cdataLast, last - s);
            cdataBuf = tempBuf;
            start = last + 3; // skip the "]]>"
            break;
        }
        // No terminator in what is buffered. If no further input can
        // arrive (internal entity, or reader already closed) the section
        // can never be completed: fail now. Without this check, leftover
        // characters such as a trailing ']' keep isEOF() false and the
        // loop would spin forever, growing cdataBuf on every pass.
        if (reader == null || isClosed)
            fatal("P-073", null);
        // buffers are read in 2K chunk and thus copied
        // over to cdataBuf before next buffer is read
        if (cdataBuf != null) {
            tempBuf = new char[cdataBuf.length + BUFSIZ];
            System.arraycopy(cdataBuf, 0, tempBuf, 0, cdataBuf.length);
        } else {
            tempBuf = new char[BUFSIZ];
        }
        System.arraycopy(buf, s, tempBuf, cdataLast, last - s);
        cdataBuf = tempBuf;
        cdataLast += last - s;
        start = last;
        fillbuf();
        if (isEOF())
            fatal("P-073", null);
    }
    return true;
}
/**
 * Checks that the characters at {@code offset} and {@code offset + 1}
 * form a high/low surrogate pair.
 *
 * @param offset buffer index of the first character of the pair
 * @return true for a valid pair; false when the second character is not
 *         in the buffer yet (the caller should back up and refill)
 * @throws ParseException (P-074) if both characters are present but do
 *         not form a valid pair
 */
private boolean checkSurrogatePair(int offset) throws ParseException {
    if (offset + 1 >= finish) {
        return false;
    }
    final char high = buf[offset];
    final char low = buf[offset + 1];
    final boolean highOk = high >= 0xd800 && high < 0xdc00;
    final boolean lowOk = low >= 0xdc00 && low <= 0xdfff;
    if (highOk && lowOk) {
        return true;
    }
    final Object[] params = {
        Integer.toHexString(high & 0x0ffff),
        Integer.toHexString(low & 0x0ffff)
    };
    fatal("P-074", params);
    return false;
}
/**
 * Skips whitespace in markup (flagged to app, discardable).
 *
 * <p>The line counter is advanced for LF, CR and CRLF when this entity
 * has a reader. A CR that is the last buffered character followed by an
 * LF in the next buffer is counted as two line ends, because the
 * lookahead for the LF only inspects the current buffer.
 *
 * @return true if at least one whitespace character was skipped
 */
public boolean ignorableWhitespace() throws ParseException, IOException {
    boolean isSpace = false;
    // [3] S ::= #20 | #09 | #0D | #0A
    for (;;) {
        if (finish <= start)
            fillbuf();
        if (finish <= start)
            return isSpace;
        char c = buf[start++];
        switch (c) {
            case '\n' :
                if (!isInternal())
                    lineNumber++;
                // fallthrough
            case 0x09 :
            case 0x20 :
                isSpace = true;
                continue;
            case '\r' :
                isSpace = true;
                if (!isInternal())
                    lineNumber++;
                // swallow the LF of a CRLF so it is not counted again
                if (start < finish && buf[start] == '\n')
                    ++start;
                continue;
            default :
                // not whitespace: push it back for the caller
                ungetc();
                return isSpace;
        }
    }
}
/**
 * Returns false iff the upcoming input isn't the given string,
 * else skips that text and returns true.
 *
 * <p>NOTE: two alternative string representations are
 * both passed in, since one is faster. When {@code chars} is non-null
 * it is used and {@code next} is ignored.
 *
 * @param next the expected text (used when {@code chars} is null);
 *        note that this parameter shadows the {@code next} field
 * @param chars the expected text as a char array, or null
 * @return true if the text matched and was consumed
 * @throws ParseException (P-077) if the text is longer than the buffer
 *         and could not be matched in one piece
 */
public boolean peek(String next, char chars[])
    throws ParseException, IOException {
    final int len = (chars != null) ? chars.length : next.length();
    int i;
    // buffer should hold the whole thing ... give it a
    // chance for the end-of-buffer case and cope with EOF
    // by letting fillbuf compact and fill
    if (finish <= start || (finish - start) < len)
        fillbuf();
    // can't peek past EOF
    if (finish <= start)
        return false;
    // compare the string; consume iff it matches
    if (chars != null) {
        for (i = 0; i < len && (start + i) < finish; i++) {
            if (buf[start + i] != chars[i])
                return false;
        }
    } else {
        for (i = 0; i < len && (start + i) < finish; i++) {
            if (buf[start + i] != next.charAt(i))
                return false;
        }
    }
    // if the first fillbuf didn't get enough data, give
    // fillbuf another chance to read
    if (i < len) {
        if (reader == null || isClosed)
            return false;
        //
        // This diagnostic "knows" that the only way big strings would
        // fail to be peeked is where it's a symbol ... e.g. for an
        // </EndTag> construct. That knowledge could also be applied
        // to get rid of the symbol length constraint, since having
        // the wrong symbol is a fatal error anyway ...
        //
        if (len > buf.length)
            fatal("P-077", new Object[] { Integer.valueOf(buf.length) });
        fillbuf();
        return peek(next, chars);
    }
    start += len;
    return true;
}
/**
 * Support for reporting the internal DTD subset, so
 * declarations can be recreated. This is collected as a single
 * string; such subsets are normally small, and many applications
 * don't even care about this.
 *
 * <p>Marks the current read position as the start of the text to
 * remember; {@code rememberText()} later returns everything read since.
 *
 * @throws InternalError if remembering is already in progress
 */
public void startRemembering() {
if (startRemember != 0)
throw new InternalError();
// NOTE(review): 0 doubles as the "not remembering" marker, so a call made
// while start == 0 would not register -- confirm callers never do that.
startRemember = start;
}
/**
 * Ends remembering and returns the text read since
 * {@code startRemembering()} was called.
 *
 * @return the remembered text
 */
public String rememberText() {
    final int length = start - startRemember;
    final String retval;
    if (rememberedText == null) {
        // everything is still in the current buffer
        retval = new String(buf, startRemember, length);
    } else {
        // If the internal subset crossed a buffer boundary, we
        // created a temporary buffer.
        rememberedText.append(buf, startRemember, length);
        retval = rememberedText.toString();
    }
    startRemember = 0;
    rememberedText = null;
    return retval;
}
// LOCATOR METHODS
/**
 * Finds the entity whose location should be reported: the nearest one
 * on the stack, starting with this one, that has an input source.
 * Locations within internal entities are not reported.
 *
 * @return that entity, or this entity if none on the stack has a source
 */
private Locator getLocator() {
    for (InputEntity candidate = this;
        candidate != null;
        candidate = candidate.next) {
        if (candidate.input != null) {
            return candidate;
        }
    }
    return this;
}
/**
 * Returns the public ID of this input source, if known.
 *
 * @return the public ID of the nearest entity with an input source, or
 *         null if it has none or no entity on the stack has a source
 */
public String getPublicId() {
    Locator where = getLocator();
    if (where == this) {
        // getLocator() also returns this when no entity on the stack has
        // a source; input is null then, so there is no ID to report.
        return input == null ? null : input.getPublicId();
    }
    return where.getPublicId();
}
/**
 * Returns the system ID of this input source, if known.
 *
 * @return the system ID of the nearest entity with an input source, or
 *         null if it has none or no entity on the stack has a source
 */
public String getSystemId() {
    Locator where = getLocator();
    if (where == this) {
        // getLocator() also returns this when no entity on the stack has
        // a source; input is null then, so there is no ID to report.
        return input == null ? null : input.getSystemId();
    }
    return where.getSystemId();
}
/**
 * Returns the current line number in this input source, taken from the
 * entity that {@code getLocator()} selects.
 */
public int getLineNumber() {
    final Locator where = getLocator();
    return (where == this) ? lineNumber : where.getLineNumber();
}
/**
 * Returns -1; maintaining column numbers hurts performance.
 *
 * @return always -1
 */
public int getColumnNumber() {
return -1; // not maintained (speed)
}
//
// Refills the buffer from the reader, keeping any unread characters
// (plus one already-read character for pushback) at the front.
//
// n.b. for non-EOF end-of-buffer cases, reader should return
// at least a handful of bytes so various lookaheads behave.
//
// two character pushback exists except at first; characters
// represented by surrogate pairs can't be pushed back (they'd
// only be in character data anyway).
//
// SAX exception thrown on char conversion problems; line number
// will be low, as a rule.
//
private void fillbuf() throws ParseException, IOException {
// don't touch fixed buffers, that'll usually
// change entity values (and isn't needed anyway)
// likewise, ignore closed streams
if (reader == null || isClosed)
return;
// if remembering DTD text, copy!
// (the buffer is about to be compacted, so save what was read so far)
if (startRemember != 0) {
if (rememberedText == null)
rememberedText = new StringBuffer(buf.length);
rememberedText.append(buf, startRemember, start - startRemember);
}
// Keep one already-consumed char in front of the unread data so that
// ungetc() still works after the refill.
boolean extra = (finish > 0) && (start > 0);
int len;
if (extra) // extra pushback
start--;
// Slide the retained chars down to the front of the buffer.
len = finish - start;
System.arraycopy(buf, start, buf, 0, len);
start = 0;
finish = len;
try {
// read into whatever room is left after the retained chars
len = buf.length - len;
len = reader.read(buf, finish, len);
} catch (UnsupportedEncodingException e) {
fatal("P-075", new Object[] { e.getMessage()});
} catch (CharConversionException e) {
fatal("P-076", new Object[] { e.getMessage()});
}
// a negative count means end of stream
if (len >= 0)
finish += len;
else
close();
if (extra) // extra pushback
start++;
// Remembering continues at index 1, just past the pushback char.
if (startRemember != 0)
// assert extra == true
startRemember = 1;
}
/**
 * Closes the underlying reader, if any, and marks this entity closed.
 * Safe to call more than once. A failure while closing the reader is
 * ignored, but the entity is still marked closed so that no further
 * reads are attempted on it.
 */
public void close() {
    if (isClosed) {
        return;
    }
    // Mark closed first so the flag is set even if close() throws.
    isClosed = true;
    if (reader != null) {
        try {
            reader.close();
        } catch (IOException ignored) {
            // best effort: nothing useful can be done with a failed close
        }
    }
}
/**
 * Reports a non-continuable error (e.g. a well-formedness error): builds
 * a ParseException carrying the current location, closes this entity,
 * and throws the exception.
 *
 * @param message the already-formatted diagnostic text
 * @throws ParseException always
 */
private void fatal(String message) throws ParseException {
    // capture the location before closing
    final String publicId = getPublicId();
    final String systemId = getSystemId();
    final int line = getLineNumber();
    final int column = getColumnNumber();
    final ParseException failure =
        new ParseException(message, publicId, systemId, line, column);
    close();
    throw failure;
}
/**
 * Looks up and formats the message with the given ID for this entity's
 * locale, then reports it as a fatal error.
 *
 * @param messageId key of the message in the parser's catalog
 * @param params values substituted into the message; may be null
 * @throws ParseException always
 */
private void fatal(String messageId, Object params[])
    throws ParseException {
    final String text =
        Parser.messages.getMessage(locale, messageId, params);
    fatal(text);
}
}