// nu.validator.htmlparser.io.HtmlInputStreamReader.orig Maven / Gradle / Ivy
/*
* Copyright (c) 2007 Henri Sivonen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package nu.validator.htmlparser.io;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import nu.validator.htmlparser.common.ByteReadable;
import nu.validator.htmlparser.common.Heuristics;
import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.extra.ChardetSniffer;
import nu.validator.htmlparser.extra.IcuDetectorSniffer;
import nu.validator.htmlparser.impl.Tokenizer;
import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
/**
* Be very careful with this class. It is not a general-purpose subclass of
* <code>Reader</code>. Instead, it is the minimal implementation that does
* what <code>Tokenizer</code> needs while being an instance of
* <code>Reader</code>.
*
* The only reason why this is a public class is that it needs to be visible to
* test code in another package.
*
* @version $Id$
* @author hsivonen
*/
public final class HtmlInputStreamReader extends Reader implements
ByteReadable, Locator {
// Upper bound on the number of bytes readByte() hands out while sniffing;
// also bounds how far read(char[]) fills the buffer before the charset
// boundary has been passed.
private static final int SNIFFING_LIMIT = 1024;
// Source of the raw bytes.
private final InputStream inputStream;
// Receives decoding errors reported through err(); may be null.
private final ErrorHandler errorHandler;
// Queried for line/column state and public/system ids; may be null.
private final Tokenizer tokenizer;
// Told about the chosen encoding and about the meta boundary; may be null.
private final Driver driver;
// Decoder currently in use; replaced by switchEncoding().
private CharsetDecoder decoder = null;
// True only while the sniffing constructor is running; readByte() refuses
// to work once this is false.
private boolean sniffing = true;
// Number of valid bytes placed in byteArray by readByte().
private int limit = 0;
// Index of the next byte readByte() will return.
private int position = 0;
// Running count of bytes consumed by the decoder in read(char[]).
private int bytesRead = 0;
// Set when inputStream.read() has returned -1 inside read(char[]).
private boolean eofSeen = false;
// When true, read(char[]) fetches more bytes before decoding.
private boolean shouldReadBytes = false;
// Set once bytesRead plus the undecoded remainder reaches SNIFFING_LIMIT.
private boolean charsetBoundaryPassed = false;
// Backing storage for byteBuffer. Length must be >= SNIFFING_LIMIT.
private final byte[] byteArray = new byte[4096];
// View over byteArray that is fed to the decoder.
private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray);
// Defers driver.notifyAboutMetaBoundary() to the start of the next
// read(char[]) call.
private boolean needToNotifyTokenizer = false;
// Set when the final decode pass has started; later read(char[]) calls
// only flush the decoder.
private boolean flushing = false;
// Line and column computed by calculateLineAndCol(); exposed through the
// Locator getters when a tokenizer is present.
private int line = -1;
private int col = -1;
// Index into the current char buffer up to which line/col have already
// been computed; reset to 0 at the start of every read(char[]).
private int lineColPos;
// True when a U+FFFD could not be written because the output buffer was
// full; read(char[]) emits it on a later call.
private boolean hasPendingReplacementCharacter = false;
// Working copies of the tokenizer's line-tracking state, refreshed from
// the tokenizer in calculateLineAndCol() when lineColPos is 0.
private boolean nextCharOnNewLine;
private boolean prevWasCR;
/**
 * Creates a reader that determines the character encoding by sniffing the
 * beginning of the stream.
 *
 * A BOM sniffer runs first. If it finds nothing, a meta sniffer runs,
 * followed (depending on <code>heuristics</code>) by the chardet and ICU
 * sniffers, with windows-1252 as the last resort. The driver, if any, is
 * told the outcome with <code>CERTAIN</code> confidence for a BOM and
 * <code>TENTATIVE</code> confidence otherwise.
 *
 * @param inputStream the byte source
 * @param errorHandler receives errors; passed on to the meta sniffer
 * @param tokenizer the tokenizer used for location information
 * @param driver notified of the chosen encoding when non-null
 * @param heuristics selects which heuristic sniffers may run
 * @throws IOException declared for I/O failures while sniffing
 * @throws SAXException declared for failures reported by the sniffers
 */
public HtmlInputStreamReader(InputStream inputStream,
        ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver,
        Heuristics heuristics) throws SAXException, IOException {
    this.inputStream = inputStream;
    this.errorHandler = errorHandler;
    this.tokenizer = tokenizer;
    this.driver = driver;
    this.sniffing = true;

    final boolean chardetAllowed = heuristics == Heuristics.CHARDET
            || heuristics == Heuristics.ALL;
    final boolean icuAllowed = heuristics == Heuristics.ICU
            || heuristics == Heuristics.ALL;

    Encoding sniffed = new BomSniffer(this).sniff();
    if (sniffed != null) {
        // A BOM was found: anything other than UTF-8 is announced as UTF-16.
        if (driver != null) {
            Encoding announced = (sniffed == Encoding.UTF8) ? Encoding.UTF8
                    : Encoding.UTF16;
            driver.setEncoding(announced, Confidence.CERTAIN);
        }
    } else {
        // Rewind so the meta sniffer sees the buffered bytes from the start.
        position = 0;
        sniffed = new MetaSniffer(errorHandler, this).sniff(this);
        if (sniffed == null && chardetAllowed) {
            sniffed = new ChardetSniffer(byteArray, limit).sniff();
        }
        if (sniffed == null && icuAllowed) {
            position = 0;
            sniffed = new IcuDetectorSniffer(this).sniff();
        }
        sniffing = false;
        if (sniffed == null) {
            sniffed = Encoding.WINDOWS1252;
        }
        if (driver != null) {
            driver.setEncoding(sniffed, Confidence.TENTATIVE);
        }
    }

    this.decoder = sniffed.newDecoder();
    sniffing = false;
    // Hand the bytes buffered during sniffing over to the decoder.
    position = 0;
    bytesRead = 0;
    byteBuffer.position(position);
    byteBuffer.limit(limit);
    initDecoder();
}
/**
 * Configures the current decoder to report, rather than silently replace
 * or ignore, malformed input and unmappable characters.
 */
private void initDecoder() {
    final CodingErrorAction report = CodingErrorAction.REPORT;
    this.decoder.onMalformedInput(report).onUnmappableCharacter(report);
}
/**
 * Creates a reader for a stream whose encoding is already known. No
 * sniffing takes place; the byte buffer starts out empty and the first
 * call to <code>read(char[])</code> fetches bytes from the stream.
 *
 * @param inputStream the byte source
 * @param errorHandler receives decoding errors when non-null
 * @param tokenizer the tokenizer used for location information
 * @param driver stored for later notifications
 * @param encoding the encoding whose decoder is used
 * @throws IOException declared for signature parity with the sniffing constructor
 * @throws SAXException declared for signature parity with the sniffing constructor
 */
public HtmlInputStreamReader(InputStream inputStream,
        ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver,
        Encoding encoding) throws SAXException, IOException {
    this.inputStream = inputStream;
    this.errorHandler = errorHandler;
    this.tokenizer = tokenizer;
    this.driver = driver;
    this.decoder = encoding.newDecoder();
    this.sniffing = false;

    this.position = 0;
    this.bytesRead = 0;

    // Nothing has been buffered yet, so expose an empty window and ask
    // read(char[]) to fill it.
    this.byteBuffer.position(0);
    this.byteBuffer.limit(0);
    this.shouldReadBytes = true;

    initDecoder();
}
/**
 * Closes the underlying input stream.
 *
 * @throws IOException if closing the stream fails
 */
@Override public void close() throws IOException {
    this.inputStream.close();
}
/**
 * Decodes bytes from the underlying stream into <code>charArray</code>,
 * starting at index 0.
 *
 * Malformed and unmappable byte sequences are reported through the error
 * handler and replaced with U+FFFD. If the replacement character does not
 * fit into the buffer, it is remembered and emitted as the first character
 * of the next call.
 *
 * @param charArray the destination buffer; must hold at least two chars
 * @return the number of chars written, or -1 when the end of the stream
 *         has been reached and nothing was written
 * @throws IOException if reading from the stream fails or the error
 *         handler throws
 */
@Override public int read(char[] charArray) throws IOException {
    lineColPos = 0;
    assert !sniffing;
    assert charArray.length >= 2;
    if (needToNotifyTokenizer) {
        if (driver != null) {
            driver.notifyAboutMetaBoundary();
        }
        needToNotifyTokenizer = false;
    }
    // wrap() yields position 0 and limit charArray.length.
    CharBuffer charBuffer = CharBuffer.wrap(charArray);
    // Emit a replacement character left over from the previous call. This
    // must happen before the flushing shortcut below; otherwise a U+FFFD
    // produced by the very last bytes of the stream would be lost.
    if (hasPendingReplacementCharacter) {
        charBuffer.put('\uFFFD');
        hasPendingReplacementCharacter = false;
    }
    if (flushing) {
        decoder.flush(charBuffer);
        int cPos = charBuffer.position();
        return cPos == 0 ? -1 : cPos;
    }
    for (;;) {
        if (shouldReadBytes) {
            readMoreBytes();
        }
        boolean finalDecode = false;
        for (;;) {
            int oldBytePos = byteBuffer.position();
            CoderResult cr = decoder.decode(byteBuffer, charBuffer,
                    finalDecode);
            bytesRead += byteBuffer.position() - oldBytePos;
            if (cr.isOverflow()) {
                // Decoder will remember surrogates
                return charBuffer.position();
            } else if (cr.isUnderflow()) {
                int remaining = byteBuffer.remaining();
                if (!charsetBoundaryPassed
                        && bytesRead + remaining >= SNIFFING_LIMIT) {
                    needToNotifyTokenizer = true;
                    charsetBoundaryPassed = true;
                }
                // XXX what happens if the entire byte buffer consists of
                // a pathologically long malformed sequence?
                // If the buffer was not fully consumed, there may be an
                // incomplete byte sequence that needs to seed the next
                // buffer.
                if (remaining > 0) {
                    System.arraycopy(byteArray, byteBuffer.position(),
                            byteArray, 0, remaining);
                }
                byteBuffer.position(0);
                byteBuffer.limit(remaining);
                if (flushing) {
                    // The final decode pass completed without error.
                    int cPos = charBuffer.position();
                    return cPos == 0 ? -1 : cPos;
                } else if (eofSeen) {
                    // Whatever is left would not be consumed in the middle
                    // of the stream. Rerun the decode once in final mode.
                    shouldReadBytes = false;
                    finalDecode = true;
                    flushing = true;
                    continue;
                } else {
                    // The usual case: more bytes are wanted next time.
                    shouldReadBytes = true;
                    if (charBuffer.position() == 0) {
                        // No output yet. Read more bytes right away.
                        break;
                    }
                    return charBuffer.position();
                }
            } else {
                // The result is an error: consume and describe the bytes.
                String badBytes = consumeBadBytes(cr.length());
                if (charBuffer.hasRemaining()) {
                    charBuffer.put('\uFFFD');
                } else {
                    hasPendingReplacementCharacter = true;
                }
                calculateLineAndCol(charBuffer);
                if (cr.isMalformed()) {
                    err("Malformed byte sequence: " + badBytes + ".");
                } else if (cr.isUnmappable()) {
                    err("Unmappable byte sequence: " + badBytes + ".");
                } else {
                    throw new RuntimeException(
                            "CoderResult was none of overflow, underflow, malformed or unmappable.");
                }
                if (finalDecode) {
                    // These were the last bytes of input. Return without
                    // relooping.
                    int cPos = charBuffer.position();
                    return cPos == 0 ? -1 : cPos;
                }
                if (hasPendingReplacementCharacter) {
                    // The output buffer is full. Return now so that another
                    // error cannot overwrite the single pending slot.
                    return charBuffer.position();
                }
            }
        }
    }
}

/**
 * Appends bytes from the input stream after the undecoded remainder in the
 * byte buffer, or records end of stream and closes the stream.
 */
private void readMoreBytes() throws IOException {
    int oldLimit = byteBuffer.limit();
    // Until the charset boundary is passed, do not fill beyond SNIFFING_LIMIT.
    int ceiling = charsetBoundaryPassed ? byteArray.length : SNIFFING_LIMIT;
    int num = inputStream.read(byteArray, oldLimit, ceiling - oldLimit);
    if (num == -1) {
        eofSeen = true;
        inputStream.close();
    } else {
        byteBuffer.position(0);
        byteBuffer.limit(oldLimit + num);
    }
    shouldReadBytes = false;
}

/**
 * Consumes <code>count</code> erroneous bytes from the byte buffer, counting
 * them as read, and returns them as a comma-separated list of quoted hex values.
 */
private String consumeBadBytes(int count) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < count; i++) {
        if (i > 0) {
            sb.append(", ");
        }
        sb.append('\u201C');
        sb.append(Integer.toHexString(byteBuffer.get() & 0xFF));
        bytesRead++;
        sb.append('\u201D');
    }
    return sb.toString();
}
/**
 * Advances the line and column counters over the characters written to
 * <code>charBuffer</code> since the previous call within the same
 * <code>read(char[])</code> invocation. Does nothing without a tokenizer.
 *
 * On the first call after a read starts (<code>lineColPos == 0</code>) the
 * counters are seeded from the tokenizer. CR, LF and CRLF each count as a
 * single line break.
 *
 * @param charBuffer the output buffer; must be backed by an array
 */
private void calculateLineAndCol(CharBuffer charBuffer) {
    if (tokenizer == null) {
        return;
    }
    if (lineColPos == 0) {
        line = tokenizer.getLine();
        col = tokenizer.getCol();
        nextCharOnNewLine = tokenizer.isNextCharOnNewLine();
        prevWasCR = tokenizer.isPrevCR();
    }
    char[] charArray = charBuffer.array();
    int end = charBuffer.position();
    int i = lineColPos;
    for (; i < end; i++) {
        if (nextCharOnNewLine) {
            line++;
            col = 1;
            nextCharOnNewLine = false;
        } else {
            col++;
        }
        switch (charArray[i]) {
            case '\r':
                nextCharOnNewLine = true;
                prevWasCR = true;
                break;
            case '\n':
                if (prevWasCR) {
                    // Second half of CRLF: the break was already counted
                    // for the CR, so this LF takes up no column.
                    col--;
                } else {
                    nextCharOnNewLine = true;
                }
                // Clear the flag so a later lone LF still ends a line.
                prevWasCR = false;
                break;
            default:
                prevWasCR = false;
                break;
        }
    }
    lineColPos = i;
}
/**
 * Returns the next byte for an encoding sniffer, reading from the stream
 * into the shared byte array when the buffered bytes are exhausted.
 *
 * @return the byte as a value in 0..255, or -1 when SNIFFING_LIMIT bytes
 *         have been handed out or the stream has ended
 * @throws IOException if reading from the stream fails
 * @throws IllegalStateException if called when not in the sniffing state
 */
public int readByte() throws IOException {
    if (!sniffing) {
        throw new IllegalStateException(
                "readByte() called when not in the sniffing state.");
    }
    if (position == SNIFFING_LIMIT) {
        return -1;
    }
    if (position >= limit) {
        // Buffered bytes are used up; top up without exceeding the limit.
        int fetched = inputStream.read(byteArray, limit,
                SNIFFING_LIMIT - limit);
        if (fetched == -1) {
            return -1;
        }
        limit += fetched;
    }
    return byteArray[position++] & 0xFF;
}
/**
 * Small experiment that prints how a reporting UTF-8 decoder behaves when
 * a four-byte sequence is decoded into a one-char buffer, and then when it
 * is given a second buffer that starts with stray continuation bytes.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    CharsetDecoder utf8 = Charset.forName("UTF-8").newDecoder();
    utf8.onMalformedInput(CodingErrorAction.REPORT);
    utf8.onUnmappableCharacter(CodingErrorAction.REPORT);

    ByteBuffer fourByteSequence = ByteBuffer.wrap(new byte[] {
            (byte) 0xF0, (byte) 0x9D, (byte) 0x80, (byte) 0x80 });
    ByteBuffer strayContinuations = ByteBuffer.wrap(new byte[] {
            (byte) 0xB8, (byte) 0x80, 0x63, 0x64, 0x65 });
    CharBuffer singleChar = CharBuffer.wrap(new char[1]);

    CoderResult outcome = utf8.decode(fourByteSequence, singleChar, false);
    System.out.println(outcome);
    System.out.println(fourByteSequence);

    outcome = utf8.decode(strayContinuations, singleChar, false);
    System.out.println(outcome);
    System.out.println(strayContinuations);
}
/**
 * Returns the column last computed by this reader.
 *
 * @return the column, or -1 when there is no tokenizer
 */
public int getColumnNumber() {
    return tokenizer == null ? -1 : col;
}
/**
 * Returns the line last computed by this reader.
 *
 * @return the line, or -1 when there is no tokenizer
 */
public int getLineNumber() {
    return tokenizer == null ? -1 : line;
}
/**
 * Returns the public id as reported by the tokenizer.
 *
 * @return the tokenizer's public id, or null when there is no tokenizer
 */
public String getPublicId() {
    if (tokenizer == null) {
        return null;
    }
    return tokenizer.getPublicId();
}
/**
 * Returns the system id as reported by the tokenizer.
 *
 * @return the tokenizer's system id, or null when there is no tokenizer
 */
public String getSystemId() {
    if (tokenizer == null) {
        return null;
    }
    return tokenizer.getSystemId();
}
/**
 * Reports an error to the error handler, if there is one, using this
 * reader as the locator.
 *
 * @param message the error message
 * @throws IOException wrapping any SAXException thrown by the handler
 */
private void err(String message) throws IOException {
    if (errorHandler == null) {
        return;
    }
    // TODO remove wrapping when changing read() to take a CharBuffer
    try {
        errorHandler.error(new SAXParseException(message, this));
    } catch (SAXException e) {
        IOException wrapped = new IOException(e.getMessage());
        wrapped.initCause(e);
        throw wrapped;
    }
}
/**
 * Returns the charset of the decoder currently in use.
 *
 * @return the current decoder's charset
 */
public Charset getCharset() {
    return this.decoder.charset();
}
/**
 * Not supported; this reader only implements <code>read(char[])</code>.
 *
 * @throws UnsupportedOperationException always
 * @see java.io.Reader#read()
 */
@Override public int read() throws IOException {
    throw new UnsupportedOperationException();
}
/**
 * Not supported; this reader only implements <code>read(char[])</code>.
 *
 * @throws UnsupportedOperationException always
 * @see java.io.Reader#read(char[], int, int)
 */
@Override public int read(char[] cbuf, int off, int len) throws IOException {
    throw new UnsupportedOperationException();
}
/**
 * Not supported; this reader only implements <code>read(char[])</code>.
 *
 * @throws UnsupportedOperationException always
 * @see java.io.Reader#read(java.nio.CharBuffer)
 */
@Override public int read(CharBuffer target) throws IOException {
    throw new UnsupportedOperationException();
}
/**
 * Replaces the current decoder with a fresh one for the given encoding and
 * configures it to report decoding errors.
 *
 * @param newEnc the encoding to decode with from now on
 */
public void switchEncoding(Encoding newEnc) {
    final CharsetDecoder replacement = newEnc.newDecoder();
    this.decoder = replacement;
    initDecoder();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy