All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.sun.xml.dtdparser.InputEntity Maven / Gradle / Ivy

/**
Copyright (c) 2009, Sun Microsystems
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright notice, this
    list of conditions and the following disclaimer.
  * Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions and the following disclaimer in the documentation
    and/or other materials provided with the distribution.
  * Neither the name of the Sun Microsystems nor the names of its contributors
    may be used to endorse or promote products derived from this software
    without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package com.sun.xml.dtdparser;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Locale;

/**
 * This is how the parser talks to its input entities, of all kinds.
 * The entities are in a stack.
 * 

*

For internal entities, the character arrays are referenced here, * and read from as needed (they're read-only). External entities have * mutable buffers, that are read into as needed. *

*

Note: This maps CRLF (and CR) to LF without regard for * whether it's in an external (parsed) entity or not. The XML 1.0 spec * is inconsistent in explaining EOL handling; this is the sensible way. * * @author David Brownell * @author Janet Koenig * @version 1.4 00/08/05 */ public class InputEntity { private int start, finish; private char buf []; private int lineNumber = 1; private boolean returnedFirstHalf = false; private boolean maybeInCRLF = false; // name of entity (never main document or unnamed DTD PE) private String name; private InputEntity next; // for system and public IDs in diagnostics private InputSource input; // this is a buffer; some buffers can be replenished. private Reader reader; private boolean isClosed; private DTDEventListener errHandler; private Locale locale; private StringBuffer rememberedText; private int startRemember; // record if this is a PE, so endParsedEntity won't be called private boolean isPE; // InputStreamReader throws an internal per-read exception, so // we minimize reads. We also add a byte to compensate for the // "ungetc" byte we keep, so that our downstream reads are as // nicely sized as we can make them. final private static int BUFSIZ = 8 * 1024 + 1; final private static char newline [] = {'\n'}; public static InputEntity getInputEntity(DTDEventListener h, Locale l) { InputEntity retval = new InputEntity(); retval.errHandler = h; retval.locale = l; return retval; } private InputEntity() { } // // predicate: return true iff this is an internal entity reader, // and so may safely be "popped" as needed. external entities have // syntax to uphold; internal parameter entities have at most validity // constraints to monitor. also, only external entities get decent // location diagnostics. // public boolean isInternal() { return reader == null; } // // predicate: return true iff this is the toplevel document // public boolean isDocument() { return next == null; } // // predicate: return true iff this is a PE expansion (so that // LexicalEventListner.endParsedEntity won't be called) // public boolean isParameterEntity() { return isPE; } // // return name of current entity // public String getName() { return name; } // // use this for an external parsed entity // public void init(InputSource in, String name, InputEntity stack, boolean isPE) throws IOException, SAXException { input = in; this.isPE = isPE; reader = in.getCharacterStream(); if (reader == null) { InputStream bytes = in.getByteStream(); if (bytes == null) reader = XmlReader.createReader(new URL(in.getSystemId()) .openStream()); else if (in.getEncoding() != null) reader = XmlReader.createReader(in.getByteStream(), in.getEncoding()); else reader = XmlReader.createReader(in.getByteStream()); } next = stack; buf = new char[BUFSIZ]; this.name = name; checkRecursion(stack); } // // use this for an internal parsed entity; buffer is readonly // public void init(char b [], String name, InputEntity stack, boolean isPE) throws SAXException { next = stack; buf = b; finish = b.length; this.name = name; this.isPE = isPE; checkRecursion(stack); } private void checkRecursion(InputEntity stack) throws SAXException { if (stack == null) return; for (stack = stack.next; stack != null; stack = stack.next) { if (stack.name != null && stack.name.equals(name)) fatal("P-069", new Object[]{name}); } } public InputEntity pop() throws IOException { // caller has ensured there's nothing left to read close(); return next; } /** * returns true iff there's no more data to consume ... */ public boolean isEOF() throws IOException, SAXException { // called to ensure WF-ness of included entities and to pop // input entities appropriately ... EOF is not always legal. if (start >= finish) { fillbuf(); return start >= finish; } else return false; } /** * Returns the name of the encoding in use, else null; the name * returned is in as standard a form as we can get. */ public String getEncoding() { if (reader == null) return null; if (reader instanceof XmlReader) return ((XmlReader) reader).getEncoding(); // XXX prefer a java2std() call to normalize names... if (reader instanceof InputStreamReader) return ((InputStreamReader) reader).getEncoding(); return null; } /** * returns the next name char, or NUL ... faster than getc(), * and the common "name or nmtoken must be next" case won't * need ungetc(). */ public char getNameChar() throws IOException, SAXException { if (finish <= start) fillbuf(); if (finish > start) { char c = buf[start++]; if (XmlChars.isNameChar(c)) return c; start--; } return 0; } /** * gets the next Java character -- might be part of an XML * text character represented by a surrogate pair, or be * the end of the entity. */ public char getc() throws IOException, SAXException { if (finish <= start) fillbuf(); if (finish > start) { char c = buf[start++]; // [2] Char ::= #x0009 | #x000A | #x000D // | [#x0020-#xD7FF] // | [#xE000-#xFFFD] // plus surrogate _pairs_ representing [#x10000-#x10ffff] if (returnedFirstHalf) { if (c >= 0xdc00 && c <= 0xdfff) { returnedFirstHalf = false; return c; } else fatal("P-070", new Object[]{Integer.toHexString(c)}); } if ((c >= 0x0020 && c <= 0xD7FF) || c == 0x0009 // no surrogates! || (c >= 0xE000 && c <= 0xFFFD)) return c; // // CRLF and CR are both line ends; map both to LF, and // keep line count correct. // else if (c == '\r' && !isInternal()) { maybeInCRLF = true; c = getc(); if (c != '\n') ungetc(); maybeInCRLF = false; lineNumber++; return '\n'; } else if (c == '\n' || c == '\r') { // LF, or 2nd char in CRLF if (!isInternal() && !maybeInCRLF) lineNumber++; return c; } // surrogates... if (c >= 0xd800 && c < 0xdc00) { returnedFirstHalf = true; return c; } fatal("P-071", new Object[]{Integer.toHexString(c)}); } throw new EndOfInputException(); } /** * lookahead one character */ public boolean peekc(char c) throws IOException, SAXException { if (finish <= start) fillbuf(); if (finish > start) { if (buf[start] == c) { start++; return true; } else return false; } return false; } /** * two character pushback is guaranteed */ public void ungetc() { if (start == 0) throw new InternalError("ungetc"); start--; if (buf[start] == '\n' || buf[start] == '\r') { if (!isInternal()) lineNumber--; } else if (returnedFirstHalf) returnedFirstHalf = false; } /** * optional grammatical whitespace (discarded) */ public boolean maybeWhitespace() throws IOException, SAXException { char c; boolean isSpace = false; boolean sawCR = false; // [3] S ::= #20 | #09 | #0D | #0A for (; ;) { if (finish <= start) fillbuf(); if (finish <= start) return isSpace; c = buf[start++]; if (c == 0x20 || c == 0x09 || c == '\n' || c == '\r') { isSpace = true; // // CR, LF are line endings ... CLRF is one, not two! // if ((c == '\n' || c == '\r') && !isInternal()) { if (!(c == '\n' && sawCR)) { lineNumber++; sawCR = false; } if (c == '\r') sawCR = true; } } else { start--; return isSpace; } } } /** * normal content; whitespace in markup may be handled * specially if the parser uses the content model. *

*

content terminates with markup delimiter characters, * namely ampersand (&amp;) and left angle bracket (&lt;). *

*

the document handler's characters() method is called * on all the content found */ public boolean parsedContent(DTDEventListener docHandler /*ElementValidator validator*/) throws IOException, SAXException { // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) int first; // first char to return int last; // last char to return boolean sawContent; // sent any chars? char c; // deliver right out of the buffer, until delimiter, EOF, // or error, refilling as we go for (first = last = start, sawContent = false; ; last++) { // buffer empty? if (last >= finish) { if (last > first) { // validator.text (); docHandler.characters(buf, first, last - first); sawContent = true; start = last; } if (isEOF()) // calls fillbuf return sawContent; first = start; last = first - 1; // incremented in loop continue; } c = buf[last]; // // pass most chars through ASAP; this inlines the code of // [2] !XmlChars.isChar(c) leaving only characters needing // special treatment ... line ends, surrogates, and: // 0x0026 == '&' // 0x003C == '<' // 0x005D == ']' // Comparisons ordered for speed on 'typical' text // if ((c > 0x005D && c <= 0xD7FF) // a-z and more || (c < 0x0026 && c >= 0x0020) // space & punct || (c > 0x003C && c < 0x005D) // A-Z & punct || (c > 0x0026 && c < 0x003C) // 0-9 & punct || c == 0x0009 || (c >= 0xE000 && c <= 0xFFFD) ) continue; // terminate on markup delimiters if (c == '<' || c == '&') break; // count lines if (c == '\n') { if (!isInternal()) lineNumber++; continue; } // External entities get CR, CRLF --> LF mapping // Internal ones got it already, and we can't repeat // else we break char ref handling!! if (c == '\r') { if (isInternal()) continue; docHandler.characters(buf, first, last - first); docHandler.characters(newline, 0, 1); sawContent = true; lineNumber++; if (finish > (last + 1)) { if (buf[last + 1] == '\n') last++; } else { // CR at end of buffer // XXX case not yet handled: CRLF here will look like two lines } first = start = last + 1; continue; } // ']]>' is a WF error -- must fail if we see it if (c == ']') { switch (finish - last) { // for suspicious end-of-buffer cases, get more data // into the buffer to rule out this sequence. case 2: if (buf[last + 1] != ']') continue; // FALLTHROUGH case 1: if (reader == null || isClosed) continue; if (last == first) throw new InternalError("fillbuf"); last--; if (last > first) { // validator.text (); docHandler.characters(buf, first, last - first); sawContent = true; start = last; } fillbuf(); first = last = start; continue; // otherwise any "]]>" would be buffered, and we can // see right away if that's what we have default: if (buf[last + 1] == ']' && buf[last + 2] == '>') fatal("P-072", null); continue; } } // correctly paired surrogates are OK if (c >= 0xd800 && c <= 0xdfff) { if ((last + 1) >= finish) { if (last > first) { // validator.text (); docHandler.characters(buf, first, last - first); sawContent = true; start = last + 1; } if (isEOF()) { // calls fillbuf fatal("P-081", new Object[]{Integer.toHexString(c)}); } first = start; last = first; continue; } if (checkSurrogatePair(last)) last++; else { last--; // also terminate on surrogate pair oddities break; } continue; } fatal("P-071", new Object[]{Integer.toHexString(c)}); } if (last == first) return sawContent; // validator.text (); docHandler.characters(buf, first, last - first); start = last; return true; } /** * CDATA -- character data, terminated by "]]>" and optionally * including unescaped markup delimiters (ampersand and left angle * bracket). This should otherwise be exactly like character data, * modulo differences in error report details. *

*

The document handler's characters() or ignorableWhitespace() * methods are invoked on all the character data found * * @param docHandler gets callbacks for character data * @param ignorableWhitespace if true, whitespace characters will * be reported using docHandler.ignorableWhitespace(); implicitly, * non-whitespace characters will cause validation errors * @param whitespaceInvalidMessage if true, ignorable whitespace * causes a validity error report as well as a callback */ public boolean unparsedContent(DTDEventListener docHandler, /*ElementValidator validator,*/ boolean ignorableWhitespace, String whitespaceInvalidMessage) throws IOException, SAXException { // [18] CDSect ::= CDStart CData CDEnd // [19] CDStart ::= '' Char*)) // [21] CDEnd ::= ']]>' // caller peeked the leading '<' ... if (!peek("![CDATA[", null)) return false; docHandler.startCDATA(); // only a literal ']]>' stops this ... int last; for (; ;) { // until ']]>' seen boolean done = false; char c; // don't report ignorable whitespace as "text" for // validation purposes. boolean white = ignorableWhitespace; for (last = start; last < finish; last++) { c = buf[last]; // // Reject illegal characters. // if (!XmlChars.isChar(c)) { white = false; if (c >= 0xd800 && c <= 0xdfff) { if (checkSurrogatePair(last)) { last++; continue; } else { last--; break; } } fatal("P-071", new Object[] {Integer.toHexString(buf[last])}); } if (c == '\n') { if (!isInternal()) lineNumber++; continue; } if (c == '\r') { // As above, we can't repeat CR/CRLF --> LF mapping if (isInternal()) continue; if (white) { if (whitespaceInvalidMessage != null) errHandler.error(new SAXParseException(DTDParser.messages.getMessage(locale, whitespaceInvalidMessage), null)); docHandler.ignorableWhitespace(buf, start, last - start); docHandler.ignorableWhitespace(newline, 0, 1); } else { // validator.text (); docHandler.characters(buf, start, last - start); docHandler.characters(newline, 0, 1); } lineNumber++; if (finish > (last + 1)) { if (buf[last + 1] == '\n') last++; } else { // CR at end of buffer // XXX case not yet handled ... as above } start = last + 1; continue; } if (c != ']') { if (c != ' ' && c != '\t') white = false; continue; } if ((last + 2) < finish) { if (buf[last + 1] == ']' && buf[last + 2] == '>') { done = true; break; } white = false; continue; } else { //last--; break; } } if (white) { if (whitespaceInvalidMessage != null) errHandler.error(new SAXParseException(DTDParser.messages.getMessage(locale, whitespaceInvalidMessage), null)); docHandler.ignorableWhitespace(buf, start, last - start); } else { // validator.text (); docHandler.characters(buf, start, last - start); } if (done) { start = last + 3; break; } start = last; if (isEOF()) fatal("P-073", null); } docHandler.endCDATA(); return true; } // return false to backstep at end of buffer) private boolean checkSurrogatePair(int offset) throws SAXException { if ((offset + 1) >= finish) return false; char c1 = buf[offset++]; char c2 = buf[offset]; if ((c1 >= 0xd800 && c1 < 0xdc00) && (c2 >= 0xdc00 && c2 <= 0xdfff)) return true; fatal("P-074", new Object[]{ Integer.toHexString(c1 & 0x0ffff), Integer.toHexString(c2 & 0x0ffff) }); return false; } /** * whitespace in markup (flagged to app, discardable) *

*

the document handler's ignorableWhitespace() method * is called on all the whitespace found */ public boolean ignorableWhitespace(DTDEventListener handler) throws IOException, SAXException { char c; boolean isSpace = false; int first; // [3] S ::= #20 | #09 | #0D | #0A for (first = start; ;) { if (finish <= start) { if (isSpace) handler.ignorableWhitespace(buf, first, start - first); fillbuf(); first = start; } if (finish <= start) return isSpace; c = buf[start++]; switch (c) { case '\n': if (!isInternal()) lineNumber++; // XXX handles Macintosh line endings wrong // fallthrough case 0x09: case 0x20: isSpace = true; continue; case '\r': isSpace = true; if (!isInternal()) lineNumber++; handler.ignorableWhitespace(buf, first, (start - 1) - first); handler.ignorableWhitespace(newline, 0, 1); if (start < finish && buf[start] == '\n') ++start; first = start; continue; default: ungetc(); if (isSpace) handler.ignorableWhitespace(buf, first, start - first); return isSpace; } } } /** * returns false iff 'next' string isn't as provided, * else skips that text and returns true. *

*

NOTE: two alternative string representations are * both passed in, since one is faster. */ public boolean peek(String next, char chars []) throws IOException, SAXException { int len; int i; if (chars != null) len = chars.length; else len = next.length(); // buffer should hold the whole thing ... give it a // chance for the end-of-buffer case and cope with EOF // by letting fillbuf compact and fill if (finish <= start || (finish - start) < len) fillbuf(); // can't peek past EOF if (finish <= start) return false; // compare the string; consume iff it matches if (chars != null) { for (i = 0; i < len && (start + i) < finish; i++) { if (buf[start + i] != chars[i]) return false; } } else { for (i = 0; i < len && (start + i) < finish; i++) { if (buf[start + i] != next.charAt(i)) return false; } } // if the first fillbuf didn't get enough data, give // fillbuf another chance to read if (i < len) { if (reader == null || isClosed) return false; // // This diagnostic "knows" that the only way big strings would // fail to be peeked is where it's a symbol ... e.g. for an // construct. That knowledge could also be applied // to get rid of the symbol length constraint, since having // the wrong symbol is a fatal error anyway ... // if (len > buf.length) fatal("P-077", new Object[]{new Integer(buf.length)}); fillbuf(); return peek(next, chars); } start += len; return true; } // // Support for reporting the internal DTD subset, so // declarations can be recreated. This is collected as a single // string; such subsets are normally small, and many applications // don't even care about this. // public void startRemembering() { if (startRemember != 0) throw new InternalError(); startRemember = start; } public String rememberText() { String retval; // If the internal subset crossed a buffer boundary, we // created a temporary buffer. if (rememberedText != null) { rememberedText.append(buf, startRemember, start - startRemember); retval = rememberedText.toString(); } else retval = new String(buf, startRemember, start - startRemember); startRemember = 0; rememberedText = null; return retval; } private InputEntity getTopEntity() { InputEntity current = this; // don't report locations within internal entities! while (current != null && current.input == null) current = current.next; return current == null ? this : current; } /** * Returns the public ID of this input source, if known */ public String getPublicId() { InputEntity where = getTopEntity(); if (where == this) return input.getPublicId(); return where.getPublicId(); } /** * Returns the system ID of this input source, if known */ public String getSystemId() { InputEntity where = getTopEntity(); if (where == this) return input.getSystemId(); return where.getSystemId(); } /** * Returns the current line number in this input source */ public int getLineNumber() { InputEntity where = getTopEntity(); if (where == this) return lineNumber; return where.getLineNumber(); } /** * returns -1; maintaining column numbers hurts performance */ public int getColumnNumber() { return -1; // not maintained (speed) } // // n.b. for non-EOF end-of-buffer cases, reader should return // at least a handful of bytes so various lookaheads behave. // // two character pushback exists except at first; characters // represented by surrogate pairs can't be pushed back (they'd // only be in character data anyway). // // DTD exception thrown on char conversion problems; line number // will be low, as a rule. // private void fillbuf() throws IOException, SAXException { // don't touched fixed buffers, that'll usually // change entity values (and isn't needed anyway) // likewise, ignore closed streams if (reader == null || isClosed) return; // if remembering DTD text, copy! if (startRemember != 0) { if (rememberedText == null) rememberedText = new StringBuffer(buf.length); rememberedText.append(buf, startRemember, start - startRemember); } boolean extra = (finish > 0) && (start > 0); int len; if (extra) // extra pushback start--; len = finish - start; System.arraycopy(buf, start, buf, 0, len); start = 0; finish = len; try { len = buf.length - len; len = reader.read(buf, finish, len); } catch (UnsupportedEncodingException e) { fatal("P-075", new Object[]{e.getMessage()}); } catch (CharConversionException e) { fatal("P-076", new Object[]{e.getMessage()}); } if (len >= 0) finish += len; else close(); if (extra) // extra pushback start++; if (startRemember != 0) // assert extra == true startRemember = 1; } public void close() { try { if (reader != null && !isClosed) reader.close(); isClosed = true; } catch (IOException e) { /* NOTHING */ } } private void fatal(String messageId, Object params []) throws SAXException { SAXParseException x = new SAXParseException(DTDParser.messages.getMessage(locale, messageId, params), null); // not continuable ... e.g. WF errors close(); errHandler.fatalError(x); throw x; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy