// com.googlecode.html.HTMLScanner Maven / Gradle / Ivy
/*
* Copyright 2002-2009 Andy Clark, Marc Guillemot
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.googlecode.html;
import com.googlecode.html.xercesbridge.XercesBridge;
import org.apache.xerces.util.*;
import org.apache.xerces.xni.*;
import org.apache.xerces.xni.parser.XMLComponentManager;
import org.apache.xerces.xni.parser.XMLConfigurationException;
import org.apache.xerces.xni.parser.XMLDocumentScanner;
import org.apache.xerces.xni.parser.XMLInputSource;
import java.io.*;
import java.net.URL;
import java.util.BitSet;
import java.util.Stack;
/**
* A simple HTML scanner. This scanner makes no attempt to balance tags or fix other problems in the
* source document — it just scans what it can and generates XNI document "events", ignoring
* errors of all kinds.
*
* This component recognizes the following features:
* <ul>
* <li>http://cyberneko.org/html/features/augmentations
* <li>http://cyberneko.org/html/features/report-errors
* <li>http://apache.org/xml/features/scanner/notify-char-refs
* <li>http://apache.org/xml/features/scanner/notify-builtin-refs
* <li>http://cyberneko.org/html/features/scanner/notify-builtin-refs
* <li>http://cyberneko.org/html/features/scanner/fix-mswindows-refs
* <li>http://cyberneko.org/html/features/scanner/script/strip-cdata-delims
* <li>http://cyberneko.org/html/features/scanner/script/strip-comment-delims
* <li>http://cyberneko.org/html/features/scanner/style/strip-cdata-delims
* <li>http://cyberneko.org/html/features/scanner/style/strip-comment-delims
* <li>http://cyberneko.org/html/features/scanner/ignore-specified-charset
* <li>http://cyberneko.org/html/features/scanner/cdata-sections
* <li>http://cyberneko.org/html/features/override-doctype
* <li>http://cyberneko.org/html/features/insert-doctype
* <li>http://cyberneko.org/html/features/parse-noscript-content
* <li>http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe
* </ul>
*
* This component recognizes the following properties:
* <ul>
* <li>http://cyberneko.org/html/properties/names/elems
* <li>http://cyberneko.org/html/properties/names/attrs
* <li>http://cyberneko.org/html/properties/default-encoding
* <li>http://cyberneko.org/html/properties/error-reporter
* <li>http://cyberneko.org/html/properties/doctype/pubid
* <li>http://cyberneko.org/html/properties/doctype/sysid
* </ul>
*
* @author Andy Clark
* @author Marc Guillemot
* @author Ahmed Ashour
* @version $Id: HTMLScanner.java,v 1.19 2005/06/14 05:52:37 andyc Exp $
* @see HTMLElements
* @see HTMLEntities
*/
public class HTMLScanner implements XMLDocumentScanner, XMLLocator, HTMLComponent {
//
// Constants
//
// doctype info: HTML 4.01 strict
/**
* The primary HTML document scanner.
*
* @author Andy Clark
*/
public class ContentScanner implements Scanner {
//
// Data
//
// temp vars
/**
* Attribute list reused across scans: it is emptied with {@code removeAllAttributes()} and
* refilled each time a start tag or an XML declaration is scanned.
*/
private final XMLAttributesImpl fAttributes = new XMLAttributesImpl();
/**
* Scratch qualified name, overwritten (via {@code setValues(...)} or
* {@code XMLAttributes.getName(int, QName)}) each time an element or attribute name is needed.
*/
private final QName fQName = new QName();
//
// Scanner methods
//
/**
 * Drives the content scanner's state machine.
 *
 * <p>Each pass through the loop handles the current {@code fScannerState}:
 * <ul>
 * <li>{@code STATE_CONTENT}: records the begin position, then dispatches on the next character
 *     to markup, entity reference or character data scanning;
 * <li>{@code STATE_MARKUP_BRACKET}: a {@code '<'} was just consumed; scans a comment, CDATA
 *     section, DOCTYPE, processing instruction, end tag or start tag;
 * <li>{@code STATE_START_DOCUMENT}: reports {@code startDocument} and, when
 *     {@code fInsertDoctype} is set, a synthesized doctype declaration;
 * <li>{@code STATE_END_DOCUMENT}: reports {@code endDocument} (only when {@code complete} is
 *     {@code true}) and stops.
 * </ul>
 *
 * <p>An {@code EOFException} raised while scanning pops the entity stack, or switches to
 * {@code STATE_END_DOCUMENT} when the stack is empty.
 *
 * @param complete {@code true} to keep scanning until the end of the document; {@code false} to
 *     return after one step (steps that set {@code next} are continued immediately)
 * @return {@code false} once the end-of-document state has been processed, {@code true}
 *     otherwise (including when control is handed to the special scanner)
 * @throws IOException if reading from the current entity fails
 */
public boolean scan(boolean complete) throws IOException {
    boolean next;
    do {
        try {
            next = false;
            switch (fScannerState) {
                case STATE_CONTENT: {
                    fBeginLineNumber = fCurrentEntity.getLineNumber();
                    fBeginColumnNumber = fCurrentEntity.getColumnNumber();
                    fBeginCharacterOffset = fCurrentEntity.getCharacterOffset();
                    int c = fCurrentEntity.read();
                    if (c == '<') {
                        setScannerState(STATE_MARKUP_BRACKET);
                        next = true;
                    } else if (c == '&') {
                        scanEntityRef(fStringBuffer, true);
                    } else if (c == -1) {
                        throw new EOFException();
                    } else {
                        // plain text: put the character back and scan the whole run
                        fCurrentEntity.rewind();
                        scanCharacters();
                    }
                    break;
                }
                case STATE_MARKUP_BRACKET: {
                    int c = fCurrentEntity.read();
                    if (c == '!') {
                        if (skip("--", false)) {
                            scanComment();
                        } else if (skip("[CDATA[", false)) {
                            scanCDATA();
                        } else if (skip("DOCTYPE", false)) {
                            scanDoctype();
                        } else {
                            if (fReportErrors) {
                                fErrorReporter.reportError("HTML1002", null);
                            }
                            skipMarkup(true);
                        }
                    } else if (c == '?') {
                        scanPI();
                    } else if (c == '/') {
                        scanEndElement();
                    } else if (c == -1) {
                        if (fReportErrors) {
                            fErrorReporter.reportError("HTML1003", null);
                        }
                        // a lone '<' at the end of input is reported as text
                        if (fDocumentHandler != null && fElementCount >= fElementDepth) {
                            fStringBuffer.clear();
                            fStringBuffer.append('<');
                            fDocumentHandler.characters(fStringBuffer, null);
                        }
                        throw new EOFException();
                    } else {
                        fCurrentEntity.rewind();
                        fElementCount++;
                        fSingleBoolean[0] = false;
                        final String ename = scanStartElement(fSingleBoolean);
                        // Lower-case with a fixed locale: with the default locale a Turkish
                        // environment maps 'I' to dotless 'ı', so "IFRAME"/"SCRIPT" would not
                        // match the literals below.
                        final String enameLC = ename == null
                                ? null : ename.toLowerCase(java.util.Locale.ENGLISH);
                        fBeginLineNumber = fCurrentEntity.getLineNumber();
                        fBeginColumnNumber = fCurrentEntity.getColumnNumber();
                        fBeginCharacterOffset = fCurrentEntity.getCharacterOffset();
                        if ("script".equals(enameLC)) {
                            scanScriptContent();
                        } else if (!fAllowSelfclosingIframe && "iframe".equals(enameLC)) {
                            scanUntilEndTag("iframe");
                        } else if (!fParseNoScriptContent && "noscript".equals(enameLC)) {
                            scanUntilEndTag("noscript");
                        } else if (!fParseNoFramesContent && "noframes".equals(enameLC)) {
                            scanUntilEndTag("noframes");
                        } else if (ename != null && !fSingleBoolean[0]
                                && HTMLElements.getElement(enameLC).isSpecial()
                                && (!ename.equalsIgnoreCase("TITLE") || isEnded(enameLC))) {
                            // non-empty special element: its content is handled by the
                            // special scanner
                            setScanner(fSpecialScanner.setElementName(ename));
                            setScannerState(STATE_CONTENT);
                            return true;
                        }
                    }
                    setScannerState(STATE_CONTENT);
                    break;
                }
                case STATE_START_DOCUMENT: {
                    if (fDocumentHandler != null && fElementCount >= fElementDepth) {
                        if (DEBUG_CALLBACKS) {
                            System.out.println("startDocument()");
                        }
                        XMLLocator locator = HTMLScanner.this;
                        String encoding = fIANAEncoding;
                        Augmentations augs = locationAugs();
                        NamespaceContext nscontext = new NamespaceSupport();
                        XercesBridge.getInstance().XMLDocumentHandler_startDocument(
                                fDocumentHandler, locator, encoding, nscontext, augs);
                    }
                    if (fInsertDoctype && fDocumentHandler != null) {
                        String root = HTMLElements.getElement(HTMLElements.HTML).name;
                        root = modifyName(root, fNamesElems);
                        String pubid = fDoctypePubid;
                        String sysid = fDoctypeSysid;
                        fDocumentHandler.doctypeDecl(root, pubid, sysid, synthesizedAugs());
                    }
                    setScannerState(STATE_CONTENT);
                    break;
                }
                case STATE_END_DOCUMENT: {
                    if (fDocumentHandler != null && fElementCount >= fElementDepth && complete) {
                        if (DEBUG_CALLBACKS) {
                            System.out.println("endDocument()");
                        }
                        fEndLineNumber = fCurrentEntity.getLineNumber();
                        fEndColumnNumber = fCurrentEntity.getColumnNumber();
                        fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
                        fDocumentHandler.endDocument(locationAugs());
                    }
                    return false;
                }
                default: {
                    throw new RuntimeException("unknown scanner state: " + fScannerState);
                }
            }
        } catch (EOFException e) {
            // end of the current entity: resume the enclosing one, or finish the document
            if (fCurrentEntityStack.empty()) {
                setScannerState(STATE_END_DOCUMENT);
            } else {
                fCurrentEntity = (CurrentEntity) fCurrentEntityStack.pop();
            }
            next = true;
        }
    } while (next || complete);
    return true;
} // scan(boolean):boolean
/**
 * Attaches a {@code LocationItem} to the augmentations of one attribute. The item spans from the
 * recorded begin position to the entity's current position, which is also stored as the new end
 * position.
 *
 * @param attributes the attribute list holding the attribute
 * @param index the index of the attribute to augment
 */
protected void addLocationItem(XMLAttributes attributes, int index) {
    // the attribute ends wherever the entity is positioned right now
    final int endLine = fCurrentEntity.getLineNumber();
    final int endColumn = fCurrentEntity.getColumnNumber();
    final int endOffset = fCurrentEntity.getCharacterOffset();
    fEndLineNumber = endLine;
    fEndColumnNumber = endColumn;
    fEndCharacterOffset = endOffset;

    final LocationItem item = new LocationItem();
    item.setValues(fBeginLineNumber, fBeginColumnNumber, fBeginCharacterOffset,
            endLine, endColumn, endOffset);
    attributes.getAugmentations(index).putItem(AUGMENTATIONS, item);
} // addLocationItem(XMLAttributes,int)
/**
 * Peeks at the upcoming characters: reads up to {@code len} characters and then restores the
 * entity's offset, column number and character offset to their values on entry.
 *
 * @param len the number of characters to read
 * @return the read string (shorter than {@code len} if the end of input is reached first)
 * @throws IOException if reading from the current entity fails
 */
protected String nextContent(int len) throws IOException {
    final int savedOffset = fCurrentEntity.offset;
    final int savedColumn = fCurrentEntity.getColumnNumber();
    final int savedCharOffset = fCurrentEntity.getCharacterOffset();

    final char[] peeked = new char[len];
    int count = 0;
    while (count < len) {
        if (fCurrentEntity.offset == fCurrentEntity.length) {
            if (fCurrentEntity.length != fCurrentEntity.buffer.length) {
                // buffer was not full, so everything was already loaded
                break;
            }
            // load explicitly here: read() should not clear the buffer
            fCurrentEntity.load(fCurrentEntity.buffer.length);
        }
        final int c = fCurrentEntity.read();
        if (c == -1) {
            break;
        }
        peeked[count] = (char) c;
        count++;
    }

    fCurrentEntity.restorePosition(savedOffset, savedColumn, savedCharOffset);
    return new String(peeked, 0, count);
}
/**
 * Scans a real (element) attribute by delegating to the three-argument overload with
 * {@code '/'} as the end character.
 *
 * @param attributes The list of attributes.
 * @param empty Is used for a second return value to indicate whether the start element tag is
 *     empty (e.g. "/>").
 * @return the result of the three-argument overload
 * @throws IOException if reading from the current entity fails
 */
protected boolean scanAttribute(XMLAttributesImpl attributes, boolean[] empty)
        throws IOException {
    final char elementEndChar = '/';
    return scanAttribute(attributes, empty, elementEndChar);
} // scanAttribute(XMLAttributesImpl,boolean[]):boolean
//
// Protected methods
//
/**
* Scans one attribute, pseudo or real, at the current position and appends it to
* {@code attributes}.
*
* <p>The attribute value is accumulated in {@code fStringBuffer}; the text as it appears in the
* source is accumulated in {@code fNonNormAttr} and stored as the non-normalized value for
* attributes that have a value. An attribute without a value, or with '=' directly followed by
* '>', is added with an empty value. The begin position fields are overwritten with the position
* of the attribute.
*
* @param attributes The list of attributes.
* @param empty Is used for a second return value to indicate whether the start element tag is
* empty (e.g. "/>"); element 0 receives the result of {@code skipMarkup(false)} when the rest of
* the tag is skipped.
* @param endc The end character that appears before the closing angle bracket ('>'). Not
* referenced in this method body, which tests for '/' directly.
* @return {@code true} if an attribute was added and scanning of the tag should continue;
* {@code false} when the tag is finished (an attribute may still have been added in that case)
* or no attribute name could be scanned
* @throws IOException on read failure; {@code EOFException} when the input ends after the
* attribute name, after '=', or inside an unquoted value
*/
protected boolean scanAttribute(XMLAttributesImpl attributes, boolean[] empty, char endc)
throws IOException {
// remember whether whitespace separated this attribute from what precedes it (see HTML1013)
boolean skippedSpaces = skipSpaces();
fBeginLineNumber = fCurrentEntity.getLineNumber();
fBeginColumnNumber = fCurrentEntity.getColumnNumber();
fBeginCharacterOffset = fCurrentEntity.getCharacterOffset();
int c = fCurrentEntity.read();
if (c == -1) {
if (fReportErrors) {
fErrorReporter.reportError("HTML1007", null);
}
return false;
} else if (c == '>') {
// end of tag
return false;
} else if (c == '<') {
// start of new markup inside an unterminated tag: leave it for the caller
fCurrentEntity.rewind();
return false;
}
fCurrentEntity.rewind();
String aname = scanName();
if (aname == null) {
if (fReportErrors) {
fErrorReporter.reportError("HTML1011", null);
}
// no name here: skip the rest of the tag
empty[0] = skipMarkup(false);
return false;
}
if (!skippedSpaces && fReportErrors) {
fErrorReporter.reportError("HTML1013", new Object[]{aname});
}
aname = modifyName(aname, fNamesAttrs);
skipSpaces();
c = fCurrentEntity.read();
if (c == -1) {
if (fReportErrors) {
fErrorReporter.reportError("HTML1007", null);
}
throw new EOFException();
}
// attribute without a value directly followed by the end of the tag
if (c == '/' || c == '>') {
fQName.setValues(null, aname, aname, null);
attributes.addAttribute(fQName, "CDATA", "");
attributes.setSpecified(attributes.getLength() - 1, true);
if (fAugmentations) {
addLocationItem(attributes, attributes.getLength() - 1);
}
if (c == '/') {
fCurrentEntity.rewind();
empty[0] = skipMarkup(false);
}
return false;
}
/***
* // REVISIT: [Q] Why is this still here? -Ac if (c == '/' || c == '>') { if (c == '/') {
* fCurrentEntity.offset--; fCurrentEntity.columnNumber--; empty[0] = skipMarkup(false); }
* fQName.setValues(null, aname, aname, null); attributes.addAttribute(fQName, "CDATA",
* ""); attributes.setSpecified(attributes.getLength()-1, true); if (fAugmentations) {
* addLocationItem(attributes, attributes.getLength() - 1); } return false; } /
***/
if (c == '=') {
skipSpaces();
c = fCurrentEntity.read();
if (c == -1) {
if (fReportErrors) {
fErrorReporter.reportError("HTML1007", null);
}
throw new EOFException();
}
// Xiaowei/Ac: Fix for ...
// '=' immediately followed by '>': empty value, tag is finished
if (c == '>') {
fQName.setValues(null, aname, aname, null);
attributes.addAttribute(fQName, "CDATA", "");
attributes.setSpecified(attributes.getLength() - 1, true);
if (fAugmentations) {
addLocationItem(attributes, attributes.getLength() - 1);
}
return false;
}
fStringBuffer.clear();
fNonNormAttr.clear();
// unquoted value: runs until whitespace or '>'
if (c != '\'' && c != '"') {
fCurrentEntity.rewind();
while (true) {
c = fCurrentEntity.read();
// Xiaowei/Ac: Fix for ...
if (Character.isWhitespace((char) c) || c == '>') {
// fCharOffset--;
fCurrentEntity.rewind();
break;
}
if (c == -1) {
if (fReportErrors) {
fErrorReporter.reportError("HTML1007", null);
}
throw new EOFException();
}
if (c == '&') {
// ce is the resolved character, or -1 when the reference text is kept as is
int ce = scanEntityRef(fStringBuffer2, false);
if (ce != -1) {
fStringBuffer.append((char) ce);
} else {
fStringBuffer.append(fStringBuffer2);
}
fNonNormAttr.append(fStringBuffer2);
} else {
fStringBuffer.append((char) c);
fNonNormAttr.append((char) c);
}
}
fQName.setValues(null, aname, aname, null);
String avalue = fStringBuffer.toString();
attributes.addAttribute(fQName, "CDATA", avalue);
int lastattr = attributes.getLength() - 1;
attributes.setSpecified(lastattr, true);
attributes.setNonNormalizedValue(lastattr, fNonNormAttr.toString());
if (fAugmentations) {
addLocationItem(attributes, attributes.getLength() - 1);
}
return true;
}
// quoted value: runs until the matching quote character (or end of input)
char quote = (char) c;
boolean isStart = true;
boolean prevSpace = false;
do {
// when normalizing, leading whitespace is dropped and whitespace runs collapse to one space
boolean acceptSpace = !fNormalizeAttributes || (!isStart && !prevSpace);
c = fCurrentEntity.read();
if (c == -1) {
if (fReportErrors) {
fErrorReporter.reportError("HTML1007", null);
}
// unterminated value: keep what was read instead of failing
break;
// throw new EOFException();
}
if (c == '&') {
isStart = false;
int ce = scanEntityRef(fStringBuffer2, false);
if (ce != -1) {
fStringBuffer.append((char) ce);
} else {
fStringBuffer.append(fStringBuffer2);
}
fNonNormAttr.append(fStringBuffer2);
} else if (c == ' ' || c == '\t') {
if (acceptSpace) {
fStringBuffer.append(fNormalizeAttributes ? ' ' : (char) c);
}
fNonNormAttr.append((char) c);
} else if (c == '\r' || c == '\n') {
// treat "\r\n" as a single line break
if (c == '\r') {
int c2 = fCurrentEntity.read();
if (c2 != '\n') {
fCurrentEntity.rewind();
} else {
fNonNormAttr.append('\r');
c = c2;
}
}
if (acceptSpace) {
fStringBuffer.append(fNormalizeAttributes ? ' ' : '\n');
}
fCurrentEntity.incLine();
fNonNormAttr.append((char) c);
} else if (c != quote) {
isStart = false;
fStringBuffer.append((char) c);
fNonNormAttr.append((char) c);
}
prevSpace = c == ' ' || c == '\t' || c == '\r' || c == '\n';
isStart = isStart && prevSpace;
} while (c != quote);
if (fNormalizeAttributes && fStringBuffer.length > 0) {
// trailing whitespace already normalized to single space
if (fStringBuffer.ch[fStringBuffer.length - 1] == ' ') {
fStringBuffer.length--;
}
}
fQName.setValues(null, aname, aname, null);
String avalue = fStringBuffer.toString();
attributes.addAttribute(fQName, "CDATA", avalue);
int lastattr = attributes.getLength() - 1;
attributes.setSpecified(lastattr, true);
attributes.setNonNormalizedValue(lastattr, fNonNormAttr.toString());
if (fAugmentations) {
addLocationItem(attributes, attributes.getLength() - 1);
}
} else {
// name followed by something other than '=', '/' or '>': attribute without a value;
// put the character back so it starts the next attribute
fQName.setValues(null, aname, aname, null);
attributes.addAttribute(fQName, "CDATA", "");
attributes.setSpecified(attributes.getLength() - 1, true);
fCurrentEntity.rewind();
if (fAugmentations) {
addLocationItem(attributes, attributes.getLength() - 1);
}
}
return true;
} // scanAttribute(XMLAttributesImpl):boolean
/**
 * Scans a CDATA section whose opening delimiter has already been consumed.
 *
 * <p>With {@code fCDATASections} enabled the content is reported as
 * {@code startCDATA} / {@code characters} / {@code endCDATA}; otherwise it is wrapped in
 * {@code "[CDATA["} and {@code "]]"} and reported as a comment.
 *
 * @throws IOException if reading fails; {@code EOFException} (after the content has been
 *     reported) when the input ends before the section is closed
 */
protected void scanCDATA() throws IOException {
    fCurrentEntity.debugBufferIfNeeded("(scanCDATA: ");
    fStringBuffer.clear();

    if (!fCDATASections) {
        // reported as a comment later on, so keep the delimiter text
        fStringBuffer.append("[CDATA[");
    } else if (fDocumentHandler != null && fElementCount >= fElementDepth) {
        fEndLineNumber = fCurrentEntity.getLineNumber();
        fEndColumnNumber = fCurrentEntity.getColumnNumber();
        fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
        if (DEBUG_CALLBACKS) {
            System.out.println("startCDATA()");
        }
        fDocumentHandler.startCDATA(locationAugs());
    }

    final boolean reachedEof = scanMarkupContent(fStringBuffer, ']');
    if (!fCDATASections) {
        fStringBuffer.append("]]");
    }

    if (fDocumentHandler != null && fElementCount >= fElementDepth) {
        fEndLineNumber = fCurrentEntity.getLineNumber();
        fEndColumnNumber = fCurrentEntity.getColumnNumber();
        fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
        if (!fCDATASections) {
            if (DEBUG_CALLBACKS) {
                System.out.println("comment(" + fStringBuffer + ")");
            }
            fDocumentHandler.comment(fStringBuffer, locationAugs());
        } else {
            if (DEBUG_CALLBACKS) {
                System.out.println("characters(" + fStringBuffer + ")");
            }
            fDocumentHandler.characters(fStringBuffer, locationAugs());
            if (DEBUG_CALLBACKS) {
                System.out.println("endCDATA()");
            }
            fDocumentHandler.endCDATA(locationAugs());
        }
    }

    fCurrentEntity.debugBufferIfNeeded(")scanCDATA: ");
    if (reachedEof) {
        throw new EOFException();
    }
} // scanCDATA()
/**
* Scans a run of character data and reports it to the document handler with a single
* {@code characters} call.
*
* <p>Text is collected in {@code fStringBuffer} up to the next '&lt;' or '&amp;' or until no more
* buffered input is available. Line breaks are written back into the entity buffer as '\n'
* before being copied, so the reported text only contains '\n' line breaks.
*
* @throws IOException if reading from the current entity fails
*/
protected void scanCharacters() throws IOException {
fCurrentEntity.debugBufferIfNeeded("(scanCharacters: ");
fStringBuffer.clear();
while (true) {
// presumably the number of line breaks consumed; used below as a count of '\n' to emit
int newlines = skipNewlines();
if (newlines == 0 && fCurrentEntity.offset == fCurrentEntity.length) {
fCurrentEntity.debugBufferIfNeeded(")scanCharacters: ");
break;
}
char c;
// overwrite the buffer positions just before the current offset with '\n' so the skipped
// line breaks are part of the slice copied below
int offset = fCurrentEntity.offset - newlines;
for (int i = offset; i < fCurrentEntity.offset; i++) {
fCurrentEntity.buffer[i] = '\n';
}
// advance to the next markup start, entity reference or line break
while (fCurrentEntity.hasNext()) {
c = fCurrentEntity.getNextChar();
if (c == '<' || c == '&' || c == '\n' || c == '\r') {
fCurrentEntity.rewind();
break;
}
}
if (fCurrentEntity.offset > offset && fDocumentHandler != null
&& fElementCount >= fElementDepth) {
if (DEBUG_CALLBACKS) {
final XMLString xmlString = new XMLString(fCurrentEntity.buffer, offset,
fCurrentEntity.offset - offset);
System.out.println("characters(" + xmlString + ")");
}
fEndLineNumber = fCurrentEntity.getLineNumber();
fEndColumnNumber = fCurrentEntity.getColumnNumber();
fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
fStringBuffer.append(fCurrentEntity.buffer, offset, fCurrentEntity.offset - offset);
}
fCurrentEntity.debugBufferIfNeeded(")scanCharacters: ");
// NOTE(review): this compares against the buffer capacity (buffer.length), not the number of
// loaded characters (fCurrentEntity.length) - confirm that this is intended
boolean hasNext = fCurrentEntity.offset < fCurrentEntity.buffer.length;
int next = hasNext ? fCurrentEntity.getCurrentChar() : -1;
// stop at markup, at an entity reference, or when nothing follows; otherwise a line break
// stopped the inner loop and the outer loop handles it
if (next == '&' || next == '<' || next == -1) {
break;
}
} // end while
// text is only accumulated above when a handler is set, so the handler is non-null here
// unless it was changed in between
if (fStringBuffer.length != 0) {
fDocumentHandler.characters(fStringBuffer, locationAugs());
}
} // scanCharacters()
/**
 * Scans a comment whose opening {@code <!--} has already been consumed and reports it to the
 * document handler.
 *
 * <p>If the input ends before {@code -->} is found, the text read so far is pushed back into
 * the entity (via {@code resetBuffer}) and the comment is re-scanned as ending at the first
 * {@code '>'}. Line breaks in that fallback are handled the same way as in
 * {@link #scanMarkupContent(XMLStringBuffer, char)}: they are consumed through
 * {@code skipNewlines()} and stored as {@code '\n'}.
 *
 * @throws IOException if reading fails; {@code EOFException} (after the comment has been
 *     reported) when no terminating {@code '>'} exists either
 */
protected void scanComment() throws IOException {
    fCurrentEntity.debugBufferIfNeeded("(scanComment: ");
    // remember where the comment content starts so it can be replayed if needed
    fEndLineNumber = fCurrentEntity.getLineNumber();
    fEndColumnNumber = fCurrentEntity.getColumnNumber();
    fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
    XMLStringBuffer buffer = new XMLStringBuffer();
    boolean eof = scanMarkupContent(buffer, '-');
    // no --> found, comment with end only with >
    if (eof) {
        fCurrentEntity.resetBuffer(buffer, fEndLineNumber, fEndColumnNumber,
                fEndCharacterOffset);
        buffer = new XMLStringBuffer(); // take a new one to avoid interactions
        while (true) {
            int c = fCurrentEntity.read();
            if (c == -1) {
                if (fReportErrors) {
                    fErrorReporter.reportError("HTML1007", null);
                }
                eof = true;
                break;
            }
            if (c == '>') {
                eof = false;
                break;
            }
            // The line-break test must come before the generic append: previously it was
            // placed after a "c != '>'" branch and could never be reached.
            if (c == '\n' || c == '\r') {
                fCurrentEntity.rewind();
                int newlines = skipNewlines();
                for (int i = 0; i < newlines; i++) {
                    buffer.append('\n');
                }
                continue;
            }
            buffer.append((char) c);
        }
    }
    if (fDocumentHandler != null && fElementCount >= fElementDepth) {
        if (DEBUG_CALLBACKS) {
            System.out.println("comment(" + buffer + ")");
        }
        fEndLineNumber = fCurrentEntity.getLineNumber();
        fEndColumnNumber = fCurrentEntity.getColumnNumber();
        fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
        fDocumentHandler.comment(buffer, locationAugs());
    }
    fCurrentEntity.debugBufferIfNeeded(")scanComment: ");
    if (eof) {
        throw new EOFException();
    }
} // scanComment()
/**
 * Scans an end tag (the leading {@code "</"} has already been consumed), skips the rest of the
 * markup and reports {@code endElement} for the scanned name. A missing name is reported as
 * error HTML1012 when error reporting is on, and no event is generated.
 *
 * @throws IOException if reading from the current entity fails
 */
protected void scanEndElement() throws IOException {
    final String scannedName = scanName();
    if (scannedName == null && fReportErrors) {
        fErrorReporter.reportError("HTML1012", null);
    }
    skipMarkup(false);
    if (scannedName == null) {
        return;
    }

    final String ename = modifyName(scannedName, fNamesElems);
    if (fDocumentHandler == null || fElementCount < fElementDepth) {
        return;
    }

    fQName.setValues(null, ename, ename, null);
    if (DEBUG_CALLBACKS) {
        System.out.println("endElement(" + fQName + ")");
    }
    fEndLineNumber = fCurrentEntity.getLineNumber();
    fEndColumnNumber = fCurrentEntity.getColumnNumber();
    fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
    fDocumentHandler.endElement(fQName, locationAugs());
} // scanEndElement()
/**
* Scans markup content into {@code buffer} until a terminator made of at least two
* {@code cend} characters followed by '>' is found (e.g. "--" + '>' for comments).
*
* <p>The two {@code cend} characters of the terminator and the '>' are not stored; any
* additional {@code cend} characters preceding them are. Line breaks are consumed through
* {@code skipNewlines()} and stored as '\n'.
*
* @param buffer receives the scanned content
* @param cend the character that must appear at least twice before the closing '>'
* @return {@code true} if the end of input was reached before the terminator was found
* @throws IOException if reading from the current entity fails
*/
protected boolean scanMarkupContent(XMLStringBuffer buffer, char cend) throws IOException {
int c = -1;
OUTER:
while (true) {
c = fCurrentEntity.read();
if (c == cend) {
// count the run of consecutive cend characters
int count = 1;
while (true) {
c = fCurrentEntity.read();
if (c == cend) {
count++;
continue;
}
break;
}
// c is now the first character after the run
if (c == -1) {
if (fReportErrors) {
fErrorReporter.reportError("HTML1007", null);
}
break OUTER;
}
// a single cend is ordinary content
if (count < 2) {
buffer.append(cend);
// if (c != -1) {
fCurrentEntity.rewind();
// }
continue;
}
// two or more cend not followed by '>': ordinary content as well
if (c != '>') {
for (int i = 0; i < count; i++) {
buffer.append(cend);
}
fCurrentEntity.rewind();
continue;
}
// terminator found: keep the cend characters in excess of the two that belong to it
for (int i = 0; i < count - 2; i++) {
buffer.append(cend);
}
break;
} else if (c == '\n' || c == '\r') {
fCurrentEntity.rewind();
int newlines = skipNewlines();
for (int i = 0; i < newlines; i++) {
buffer.append('\n');
}
continue;
} else if (c == -1) {
if (fReportErrors) {
fErrorReporter.reportError("HTML1007", null);
}
break;
}
buffer.append((char) c);
}
// c is '>' after a proper terminator and -1 when the input ended
return c == -1;
} // scanMarkupContent(XMLStringBuffer,char):boolean
/**
 * Scans a processing instruction or an XML/text declaration; the leading {@code "<?"} has
 * already been consumed. A warning (HTML1008) is reported first when error reporting is on.
 *
 * <p>If a target name other than "xml" (ignoring case) is found, the data up to {@code "?>"} or
 * {@code "/>"} is collected, with line breaks stored as {@code '\n'}, and reported through
 * {@code processingInstruction}. Otherwise the pseudo attributes are scanned and, unless the
 * declared encoding caused the input to be re-read with another encoding, reported through
 * {@code xmlDecl}.
 *
 * @throws IOException if reading from the current entity fails
 */
protected void scanPI() throws IOException {
    fCurrentEntity.debugBufferIfNeeded("(scanPI: ");
    if (fReportErrors) {
        fErrorReporter.reportWarning("HTML1008", null);
    }
    // scan processing instruction
    String target = scanName();
    if (target != null && !target.equalsIgnoreCase("xml")) {
        // skip the whitespace and line breaks between the target and its data
        while (true) {
            int c = fCurrentEntity.read();
            if (c == '\r' || c == '\n') {
                if (c == '\r') {
                    c = fCurrentEntity.read();
                    if (c != '\n') {
                        fCurrentEntity.offset--;
                        fCurrentEntity.characterOffset_--;
                    }
                }
                fCurrentEntity.incLine();
                continue;
            }
            if (c == -1) {
                break;
            }
            if (c != ' ' && c != '\t') {
                fCurrentEntity.rewind();
                break;
            }
        }
        // collect the data up to "?>" or "/>"
        fStringBuffer.clear();
        while (true) {
            int c = fCurrentEntity.read();
            if (c == '?' || c == '/') {
                char c0 = (char) c;
                c = fCurrentEntity.read();
                if (c == '>') {
                    break;
                } else {
                    // not a terminator: keep the character and re-read the one after it
                    fStringBuffer.append(c0);
                    fCurrentEntity.rewind();
                    continue;
                }
            } else if (c == '\r' || c == '\n') {
                fStringBuffer.append('\n');
                if (c == '\r') {
                    c = fCurrentEntity.read();
                    if (c != '\n') {
                        fCurrentEntity.offset--;
                        fCurrentEntity.characterOffset_--;
                    }
                }
                fCurrentEntity.incLine();
                continue;
            } else if (c == -1) {
                break;
            } else {
                fStringBuffer.append((char) c);
            }
        }
        XMLString data = fStringBuffer;
        if (fDocumentHandler != null) {
            fEndLineNumber = fCurrentEntity.getLineNumber();
            fEndColumnNumber = fCurrentEntity.getColumnNumber();
            fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
            fDocumentHandler.processingInstruction(target, data, locationAugs());
        }
    }
    // scan xml/text declaration
    else {
        // scanning attributes overwrites the begin position, so keep a copy
        int beginLineNumber = fBeginLineNumber;
        int beginColumnNumber = fBeginColumnNumber;
        int beginCharacterOffset = fBeginCharacterOffset;
        fAttributes.removeAllAttributes();
        int aindex = 0;
        while (scanPseudoAttribute(fAttributes)) {
            // if we haven't scanned a value, remove the entry as values have
            // special signification
            if (fAttributes.getValue(aindex).length() == 0) {
                fAttributes.removeAttributeAt(aindex);
            } else {
                fAttributes.getName(aindex, fQName);
                // Fixed locale: with the default locale a Turkish environment would turn
                // "VERSION"/"ENCODING" into names with a dotless 'ı' and the lookups below
                // would miss them.
                fQName.rawname = fQName.rawname.toLowerCase(java.util.Locale.ENGLISH);
                fAttributes.setName(aindex, fQName);
                aindex++;
            }
        }
        if (fDocumentHandler != null) {
            String version = fAttributes.getValue("version");
            String encoding = fAttributes.getValue("encoding");
            String standalone = fAttributes.getValue("standalone");
            // if the encoding is successfully changed, the stream will be
            // processed again with the right encoding and we will come here
            // again but without need to change the encoding
            final boolean xmlDeclNow = fIgnoreSpecifiedCharset || !changeEncoding(encoding);
            if (xmlDeclNow) {
                fBeginLineNumber = beginLineNumber;
                fBeginColumnNumber = beginColumnNumber;
                fBeginCharacterOffset = beginCharacterOffset;
                fEndLineNumber = fCurrentEntity.getLineNumber();
                fEndColumnNumber = fCurrentEntity.getColumnNumber();
                fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
                fDocumentHandler.xmlDecl(version, encoding, standalone, locationAugs());
            }
        }
    }
    fCurrentEntity.debugBufferIfNeeded(")scanPI: ");
} // scanPI()
/**
 * Scans a pseudo attribute (as found in an XML declaration) by delegating to the
 * three-argument {@code scanAttribute} with {@code '?'} as the end character and
 * {@code fSingleBoolean} as the "empty" holder.
 *
 * @param attributes The list of attributes.
 * @return the result of {@code scanAttribute}
 * @throws IOException if reading from the current entity fails
 */
protected boolean scanPseudoAttribute(XMLAttributesImpl attributes) throws IOException {
    final char declarationEndChar = '?';
    return scanAttribute(attributes, fSingleBoolean, declarationEndChar);
} // scanPseudoAttribute(XMLAttributesImpl):boolean
/**
* Scans a start element (the '&lt;' has been consumed and the name is at the current position)
* together with its attributes, and reports it as {@code startElement} or {@code emptyElement}.
*
* <p>If no name can be scanned, or the name does not start with an ASCII letter, error HTML1009
* is reported, the text ('&lt;' plus the scanned name, if any) is passed to {@code characters}
* and {@code null} is returned.
*
* <p>While the byte stream is still buffered ({@code fByteStream != null}) and
* {@code fElementDepth} is -1, a META element with {@code http-equiv="content-type"} may
* trigger a change of encoding, and a BODY element (or an element whose first declared parent is
* BODY) releases the buffered byte stream.
*
* @param empty Is used for a second return value to indicate whether the start element tag is
* empty (e.g. "/>").
* @return the element name after {@code modifyName}, or {@code null} if no valid name was found
* @throws IOException if reading from the current entity fails
*/
protected String scanStartElement(boolean[] empty) throws IOException {
String ename = scanName();
int length = ename != null ? ename.length() : 0;
int c = length > 0 ? ename.charAt(0) : -1;
// element names must start with an ASCII letter
if (length == 0 || !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
if (fReportErrors) {
fErrorReporter.reportError("HTML1009", null);
}
if (fDocumentHandler != null && fElementCount >= fElementDepth) {
fStringBuffer.clear();
fStringBuffer.append('<');
if (length > 0) {
fStringBuffer.append(ename);
}
fDocumentHandler.characters(fStringBuffer, null);
}
return null;
}
ename = modifyName(ename, fNamesElems);
fAttributes.removeAllAttributes();
// scanAttribute overwrites the begin position; restore the element's own afterwards
int beginLineNumber = fBeginLineNumber;
int beginColumnNumber = fBeginColumnNumber;
int beginCharacterOffset = fBeginCharacterOffset;
while (scanAttribute(fAttributes, empty)) {
// do nothing
}
fBeginLineNumber = beginLineNumber;
fBeginColumnNumber = beginColumnNumber;
fBeginCharacterOffset = beginCharacterOffset;
// encoding detection is only attempted while the raw bytes are still buffered
if (fByteStream != null && fElementDepth == -1) {
if (ename.equalsIgnoreCase("META")) {
if (DEBUG_CHARSET) {
System.out.println("+++ ");
}
String httpEquiv = getValue(fAttributes, "http-equiv");
if (httpEquiv != null && httpEquiv.equalsIgnoreCase("content-type")) {
if (DEBUG_CHARSET) {
System.out.println("+++ @content-type: \"" + httpEquiv + '"');
}
String content = getValue(fAttributes, "content");
if (content != null) {
content = removeSpaces(content);
int index1 = content.toLowerCase().indexOf("charset=");
if (index1 != -1 && !fIgnoreSpecifiedCharset) {
// the charset value runs up to the next ';' or to the end of the content
final int index2 = content.indexOf(';', index1);
final String charset = index2 != -1 ? content.substring(index1 + 8, index2)
: content.substring(index1 + 8);
changeEncoding(charset);
}
}
}
} else if (ename.equalsIgnoreCase("BODY")) {
// stop buffering the byte stream once BODY is reached
fByteStream.clear();
fByteStream = null;
} else {
HTMLElements.Element element = HTMLElements.getElement(ename);
if (element.parent != null && element.parent.length > 0) {
// an element whose first declared parent is BODY is handled like BODY itself
if (element.parent[0].code == HTMLElements.BODY) {
fByteStream.clear();
fByteStream = null;
}
}
}
}
if (fDocumentHandler != null && fElementCount >= fElementDepth) {
fQName.setValues(null, ename, ename, null);
if (DEBUG_CALLBACKS) {
System.out.println("startElement(" + fQName + ',' + fAttributes + ")");
}
fEndLineNumber = fCurrentEntity.getLineNumber();
fEndColumnNumber = fCurrentEntity.getColumnNumber();
fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
if (empty[0]) {
fDocumentHandler.emptyElement(fQName, fAttributes, locationAugs());
} else {
fDocumentHandler.startElement(fQName, fAttributes, locationAugs());
}
}
return ename;
} // scanStartElement():ename
/**
 * Tries to change the encoding used to read the input stream to the specified one.
 *
 * <p>Nothing happens when {@code charset} is {@code null} or the byte stream is no longer
 * buffered. On success the entity reads from a new reader over the played-back byte stream and
 * {@code fElementDepth} / {@code fElementCount} are reset so that already reported elements are
 * not reported again. If the encoding is not supported by the JVM, error HTML1010 is reported
 * and the buffered byte stream is released.
 *
 * @param charset the charset that should be used
 * @return {@code true} when the encoding has been changed
 */
private boolean changeEncoding(String charset) {
    if (charset == null || fByteStream == null) {
        return false;
    }
    charset = charset.trim();
    boolean encodingChanged = false;
    try {
        String ianaEncoding = charset;
        // Upper-case with a fixed locale: the default-locale form would turn "iso-8859-1"
        // into "İSO-8859-1" under a Turkish locale, so both the IANA lookup and the
        // ISO-8859 test below would fail.
        final String ianaUpper = ianaEncoding.toUpperCase(java.util.Locale.ENGLISH);
        String javaEncoding = EncodingMap.getIANA2JavaMapping(ianaUpper);
        if (DEBUG_CHARSET) {
            System.out.println("+++ ianaEncoding: " + ianaEncoding);
            System.out.println("+++ javaEncoding: " + javaEncoding);
        }
        if (javaEncoding == null) {
            // unknown IANA name: try it as a Java encoding name
            javaEncoding = ianaEncoding;
            if (fReportErrors) {
                fErrorReporter.reportError("HTML1001", new Object[]{ianaEncoding});
            }
        }
        // patch: Marc Guillemot
        if (!javaEncoding.equals(fJavaEncoding)) {
            if (!isEncodingCompatible(javaEncoding, fJavaEncoding)) {
                if (fReportErrors) {
                    fErrorReporter.reportError("HTML1015", new Object[]{
                        javaEncoding, fJavaEncoding});
                }
            }
            // change the charset
            else {
                fIso8859Encoding = ianaUpper.startsWith("ISO-8859")
                        || ianaEncoding.equalsIgnoreCase(fDefaultIANAEncoding);
                fJavaEncoding = javaEncoding;
                fCurrentEntity.setStream(new InputStreamReader(fByteStream, javaEncoding));
                fByteStream.playback();
                fElementDepth = fElementCount;
                fElementCount = 0;
                encodingChanged = true;
            }
        }
    } catch (UnsupportedEncodingException e) {
        if (fReportErrors) {
            fErrorReporter.reportError("HTML1010", new Object[]{charset});
        }
        // NOTE: If the encoding change doesn't work,
        // then there's no point in continuing to
        // buffer the input stream.
        fByteStream.clear();
        fByteStream = null;
    }
    return encodingChanged;
}
/**
 * Returns true if an end tag for the given element is present in the part of the current
 * entity's buffer that has not been read yet. The comparison ignores case; only content that is
 * already buffered is inspected.
 *
 * @param ename the element name
 * @return {@code true} if {@code "</" + ename + ">"} occurs in the remaining buffered content
 */
private boolean isEnded(String ename) {
    final String remaining = new String(fCurrentEntity.buffer, fCurrentEntity.offset,
            fCurrentEntity.length - fCurrentEntity.offset);
    // The search text had been reduced to "" (which every string contains), so the method
    // always answered true and never looked at ename.
    final String endTag = "</" + ename.toLowerCase(java.util.Locale.ENGLISH) + ">";
    return remaining.toLowerCase(java.util.Locale.ENGLISH).indexOf(endTag) != -1;
}
/**
 * Returns the given string without any whitespace characters (as defined by
 * {@code Character.isWhitespace}). Written without newer library helpers on purpose
 * (remember: JDK 1.3!).
 *
 * @param content the text to process
 * @return {@code content} itself when it holds no whitespace, otherwise a copy without it
 */
private String removeSpaces(final String content) {
    final int length = content.length();
    StringBuffer kept = null; // created lazily, on the first whitespace character found
    for (int i = 0; i < length; i++) {
        final char ch = content.charAt(i);
        if (Character.isWhitespace(ch)) {
            if (kept == null) {
                kept = new StringBuffer(length);
                kept.append(content.substring(0, i));
            }
        } else if (kept != null) {
            kept.append(ch);
        }
    }
    return (kept == null) ? content : kept.toString();
}
private void scanScriptContent() throws IOException {
final XMLStringBuffer buffer = new XMLStringBuffer();
boolean waitForEndComment = false;
while (true) {
int c = fCurrentEntity.read();
if (c == -1) {
break;
} else if (c == '-' && endsWith(buffer, "= 8 && "/script".equalsIgnoreCase(next.substring(0, 7))
&& ('>' == next.charAt(7) || Character.isWhitespace(next.charAt(7)))) {
fCurrentEntity.rewind();
break;
}
} else if (c == '>' && endsWith(buffer, "--")) {
waitForEndComment = false;
}
if (c == '\r' || c == '\n') {
fCurrentEntity.rewind();
int newlines = skipNewlines();
for (int i = 0; i < newlines; i++) {
buffer.append('\n');
}
} else {
buffer.append((char) c);
}
}
if (fScriptStripCommentDelims) {
reduceToContent(buffer, "");
}
if (fScriptStripCDATADelims) {
reduceToContent(buffer, "");
}
if (buffer.length > 0 && fDocumentHandler != null && fElementCount >= fElementDepth) {
if (DEBUG_CALLBACKS) {
System.out.println("characters(" + buffer + ")");
}
fEndLineNumber = fCurrentEntity.getLineNumber();
fEndColumnNumber = fCurrentEntity.getColumnNumber();
fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
fDocumentHandler.characters(buffer, locationAugs());
}
}
//
// Private methods
//
/**
* Scans the content of