All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.googlecode.html.HTMLScanner Maven / Gradle / Ivy

There is a newer version: 0.63
Show newest version
/*
 * Copyright 2002-2009 Andy Clark, Marc Guillemot
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.googlecode.html;

import com.googlecode.html.xercesbridge.XercesBridge;
import org.apache.xerces.util.*;
import org.apache.xerces.xni.*;
import org.apache.xerces.xni.parser.XMLComponentManager;
import org.apache.xerces.xni.parser.XMLConfigurationException;
import org.apache.xerces.xni.parser.XMLDocumentScanner;
import org.apache.xerces.xni.parser.XMLInputSource;

import java.io.*;
import java.net.URL;
import java.util.BitSet;
import java.util.Stack;

/**
 * A simple HTML scanner. This scanner makes no attempt to balance tags or fix other problems in the
 * source document — it just scans what it can and generates XNI document "events", ignoring
 * errors of all kinds.
 * 

 * <p>
 * This component recognizes the following features:
 * <ul>
 * <li>http://cyberneko.org/html/features/augmentations
 * <li>http://cyberneko.org/html/features/report-errors
 * <li>http://apache.org/xml/features/scanner/notify-char-refs
 * <li>http://apache.org/xml/features/scanner/notify-builtin-refs
 * <li>http://cyberneko.org/html/features/scanner/notify-builtin-refs
 * <li>http://cyberneko.org/html/features/scanner/fix-mswindows-refs
 * <li>http://cyberneko.org/html/features/scanner/script/strip-cdata-delims
 * <li>http://cyberneko.org/html/features/scanner/script/strip-comment-delims
 * <li>http://cyberneko.org/html/features/scanner/style/strip-cdata-delims
 * <li>http://cyberneko.org/html/features/scanner/style/strip-comment-delims
 * <li>http://cyberneko.org/html/features/scanner/ignore-specified-charset
 * <li>http://cyberneko.org/html/features/scanner/cdata-sections
 * <li>http://cyberneko.org/html/features/override-doctype
 * <li>http://cyberneko.org/html/features/insert-doctype
 * <li>http://cyberneko.org/html/features/parse-noscript-content
 * <li>http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe
 * </ul>
 * <p>
 * This component recognizes the following properties:
 * <ul>
 * <li>http://cyberneko.org/html/properties/names/elems
 * <li>http://cyberneko.org/html/properties/names/attrs
 * <li>http://cyberneko.org/html/properties/default-encoding
 * <li>http://cyberneko.org/html/properties/error-reporter
 * <li>http://cyberneko.org/html/properties/doctype/pubid
 * <li>http://cyberneko.org/html/properties/doctype/sysid
 * </ul>
* * @author Andy Clark * @author Marc Guillemot * @author Ahmed Ashour * @version $Id: HTMLScanner.java,v 1.19 2005/06/14 05:52:37 andyc Exp $ * @see HTMLElements * @see HTMLEntities */ public class HTMLScanner implements XMLDocumentScanner, XMLLocator, HTMLComponent { // // Constants // // doctype info: HTML 4.01 strict /** * The primary HTML document scanner. * * @author Andy Clark */ public class ContentScanner implements Scanner { // // Data // // temp vars /** * Attributes. */ private final XMLAttributesImpl fAttributes = new XMLAttributesImpl(); /** * A qualified name. */ private final QName fQName = new QName(); // // Scanner methods // /** * Scan. */ public boolean scan(boolean complete) throws IOException { boolean next; do { try { next = false; switch (fScannerState) { case STATE_CONTENT: { fBeginLineNumber = fCurrentEntity.getLineNumber(); fBeginColumnNumber = fCurrentEntity.getColumnNumber(); fBeginCharacterOffset = fCurrentEntity.getCharacterOffset(); int c = fCurrentEntity.read(); if (c == '<') { setScannerState(STATE_MARKUP_BRACKET); next = true; } else if (c == '&') { scanEntityRef(fStringBuffer, true); } else if (c == -1) { throw new EOFException(); } else { fCurrentEntity.rewind(); scanCharacters(); } break; } case STATE_MARKUP_BRACKET: { int c = fCurrentEntity.read(); if (c == '!') { if (skip("--", false)) { scanComment(); } else if (skip("[CDATA[", false)) { scanCDATA(); } else if (skip("DOCTYPE", false)) { scanDoctype(); } else { if (fReportErrors) { fErrorReporter.reportError("HTML1002", null); } skipMarkup(true); } } else if (c == '?') { scanPI(); } else if (c == '/') { scanEndElement(); } else if (c == -1) { if (fReportErrors) { fErrorReporter.reportError("HTML1003", null); } if (fDocumentHandler != null && fElementCount >= fElementDepth) { fStringBuffer.clear(); fStringBuffer.append('<'); fDocumentHandler.characters(fStringBuffer, null); } throw new EOFException(); } else { fCurrentEntity.rewind(); fElementCount++; fSingleBoolean[0] = 
false; final String ename = scanStartElement(fSingleBoolean); final String enameLC = ename == null ? null : ename.toLowerCase(); fBeginLineNumber = fCurrentEntity.getLineNumber(); fBeginColumnNumber = fCurrentEntity.getColumnNumber(); fBeginCharacterOffset = fCurrentEntity.getCharacterOffset(); if ("script".equals(enameLC)) { scanScriptContent(); } else if (!fAllowSelfclosingIframe && "iframe".equals(enameLC)) { scanUntilEndTag("iframe"); } else if (!fParseNoScriptContent && "noscript".equals(enameLC)) { scanUntilEndTag("noscript"); } else if (!fParseNoFramesContent && "noframes".equals(enameLC)) { scanUntilEndTag("noframes"); } else if (ename != null && !fSingleBoolean[0] && HTMLElements.getElement(enameLC).isSpecial() && (!ename.equalsIgnoreCase("TITLE") || isEnded(enameLC))) { setScanner(fSpecialScanner.setElementName(ename)); setScannerState(STATE_CONTENT); return true; } } setScannerState(STATE_CONTENT); break; } case STATE_START_DOCUMENT: { if (fDocumentHandler != null && fElementCount >= fElementDepth) { if (DEBUG_CALLBACKS) { System.out.println("startDocument()"); } XMLLocator locator = HTMLScanner.this; String encoding = fIANAEncoding; Augmentations augs = locationAugs(); NamespaceContext nscontext = new NamespaceSupport(); XercesBridge.getInstance().XMLDocumentHandler_startDocument( fDocumentHandler, locator, encoding, nscontext, augs); } if (fInsertDoctype && fDocumentHandler != null) { String root = HTMLElements.getElement(HTMLElements.HTML).name; root = modifyName(root, fNamesElems); String pubid = fDoctypePubid; String sysid = fDoctypeSysid; fDocumentHandler.doctypeDecl(root, pubid, sysid, synthesizedAugs()); } setScannerState(STATE_CONTENT); break; } case STATE_END_DOCUMENT: { if (fDocumentHandler != null && fElementCount >= fElementDepth && complete) { if (DEBUG_CALLBACKS) { System.out.println("endDocument()"); } fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = 
fCurrentEntity.getCharacterOffset(); fDocumentHandler.endDocument(locationAugs()); } return false; } default: { throw new RuntimeException("unknown scanner state: " + fScannerState); } } } catch (EOFException e) { if (fCurrentEntityStack.empty()) { setScannerState(STATE_END_DOCUMENT); } else { fCurrentEntity = (CurrentEntity) fCurrentEntityStack.pop(); } next = true; } } while (next || complete); return true; } // scan(boolean):boolean /** * Adds location augmentations to the specified attribute. */ protected void addLocationItem(XMLAttributes attributes, int index) { fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); LocationItem locationItem = new LocationItem(); locationItem.setValues(fBeginLineNumber, fBeginColumnNumber, fBeginCharacterOffset, fEndLineNumber, fEndColumnNumber, fEndCharacterOffset); Augmentations augs = attributes.getAugmentations(index); augs.putItem(AUGMENTATIONS, locationItem); } // addLocationItem(XMLAttributes,int) /** * Reads the next characters WITHOUT impacting the buffer content up to current offset. 
* * @param len the number of characters to read * @return the read string (length may be smaller if EOF is encountered) */ protected String nextContent(int len) throws IOException { final int originalOffset = fCurrentEntity.offset; final int originalColumnNumber = fCurrentEntity.getColumnNumber(); final int originalCharacterOffset = fCurrentEntity.getCharacterOffset(); char[] buff = new char[len]; int nbRead = 0; for (nbRead = 0; nbRead < len; ++nbRead) { // read() should not clear the buffer if (fCurrentEntity.offset == fCurrentEntity.length) { if (fCurrentEntity.length == fCurrentEntity.buffer.length) { fCurrentEntity.load(fCurrentEntity.buffer.length); } else { // everything was already loaded break; } } int c = fCurrentEntity.read(); if (c == -1) { break; } else { buff[nbRead] = (char) c; } } fCurrentEntity.restorePosition(originalOffset, originalColumnNumber, originalCharacterOffset); return new String(buff, 0, nbRead); } /** * Scans a real attribute. * * @param attributes The list of attributes. * @param empty Is used for a second return value to indicate whether the start element tag is * empty (e.g. "/>"). */ protected boolean scanAttribute(XMLAttributesImpl attributes, boolean[] empty) throws IOException { return scanAttribute(attributes, empty, '/'); } // scanAttribute(XMLAttributesImpl,boolean[]):boolean // // Protected methods // /** * Scans an attribute, pseudo or real. * * @param attributes The list of attributes. * @param empty Is used for a second return value to indicate whether the start element tag is * empty (e.g. "/>"). * @param endc The end character that appears before the closing angle bracket ('>'). 
*/ protected boolean scanAttribute(XMLAttributesImpl attributes, boolean[] empty, char endc) throws IOException { boolean skippedSpaces = skipSpaces(); fBeginLineNumber = fCurrentEntity.getLineNumber(); fBeginColumnNumber = fCurrentEntity.getColumnNumber(); fBeginCharacterOffset = fCurrentEntity.getCharacterOffset(); int c = fCurrentEntity.read(); if (c == -1) { if (fReportErrors) { fErrorReporter.reportError("HTML1007", null); } return false; } else if (c == '>') { return false; } else if (c == '<') { fCurrentEntity.rewind(); return false; } fCurrentEntity.rewind(); String aname = scanName(); if (aname == null) { if (fReportErrors) { fErrorReporter.reportError("HTML1011", null); } empty[0] = skipMarkup(false); return false; } if (!skippedSpaces && fReportErrors) { fErrorReporter.reportError("HTML1013", new Object[]{aname}); } aname = modifyName(aname, fNamesAttrs); skipSpaces(); c = fCurrentEntity.read(); if (c == -1) { if (fReportErrors) { fErrorReporter.reportError("HTML1007", null); } throw new EOFException(); } if (c == '/' || c == '>') { fQName.setValues(null, aname, aname, null); attributes.addAttribute(fQName, "CDATA", ""); attributes.setSpecified(attributes.getLength() - 1, true); if (fAugmentations) { addLocationItem(attributes, attributes.getLength() - 1); } if (c == '/') { fCurrentEntity.rewind(); empty[0] = skipMarkup(false); } return false; } /*** * // REVISIT: [Q] Why is this still here? 
-Ac if (c == '/' || c == '>') { if (c == '/') { * fCurrentEntity.offset--; fCurrentEntity.columnNumber--; empty[0] = skipMarkup(false); } * fQName.setValues(null, aname, aname, null); attributes.addAttribute(fQName, "CDATA", * ""); attributes.setSpecified(attributes.getLength()-1, true); if (fAugmentations) { * addLocationItem(attributes, attributes.getLength() - 1); } return false; } / ***/ if (c == '=') { skipSpaces(); c = fCurrentEntity.read(); if (c == -1) { if (fReportErrors) { fErrorReporter.reportError("HTML1007", null); } throw new EOFException(); } // Xiaowei/Ac: Fix for ... if (c == '>') { fQName.setValues(null, aname, aname, null); attributes.addAttribute(fQName, "CDATA", ""); attributes.setSpecified(attributes.getLength() - 1, true); if (fAugmentations) { addLocationItem(attributes, attributes.getLength() - 1); } return false; } fStringBuffer.clear(); fNonNormAttr.clear(); if (c != '\'' && c != '"') { fCurrentEntity.rewind(); while (true) { c = fCurrentEntity.read(); // Xiaowei/Ac: Fix for ... 
if (Character.isWhitespace((char) c) || c == '>') { // fCharOffset--; fCurrentEntity.rewind(); break; } if (c == -1) { if (fReportErrors) { fErrorReporter.reportError("HTML1007", null); } throw new EOFException(); } if (c == '&') { int ce = scanEntityRef(fStringBuffer2, false); if (ce != -1) { fStringBuffer.append((char) ce); } else { fStringBuffer.append(fStringBuffer2); } fNonNormAttr.append(fStringBuffer2); } else { fStringBuffer.append((char) c); fNonNormAttr.append((char) c); } } fQName.setValues(null, aname, aname, null); String avalue = fStringBuffer.toString(); attributes.addAttribute(fQName, "CDATA", avalue); int lastattr = attributes.getLength() - 1; attributes.setSpecified(lastattr, true); attributes.setNonNormalizedValue(lastattr, fNonNormAttr.toString()); if (fAugmentations) { addLocationItem(attributes, attributes.getLength() - 1); } return true; } char quote = (char) c; boolean isStart = true; boolean prevSpace = false; do { boolean acceptSpace = !fNormalizeAttributes || (!isStart && !prevSpace); c = fCurrentEntity.read(); if (c == -1) { if (fReportErrors) { fErrorReporter.reportError("HTML1007", null); } break; // throw new EOFException(); } if (c == '&') { isStart = false; int ce = scanEntityRef(fStringBuffer2, false); if (ce != -1) { fStringBuffer.append((char) ce); } else { fStringBuffer.append(fStringBuffer2); } fNonNormAttr.append(fStringBuffer2); } else if (c == ' ' || c == '\t') { if (acceptSpace) { fStringBuffer.append(fNormalizeAttributes ? ' ' : (char) c); } fNonNormAttr.append((char) c); } else if (c == '\r' || c == '\n') { if (c == '\r') { int c2 = fCurrentEntity.read(); if (c2 != '\n') { fCurrentEntity.rewind(); } else { fNonNormAttr.append('\r'); c = c2; } } if (acceptSpace) { fStringBuffer.append(fNormalizeAttributes ? 
' ' : '\n'); } fCurrentEntity.incLine(); fNonNormAttr.append((char) c); } else if (c != quote) { isStart = false; fStringBuffer.append((char) c); fNonNormAttr.append((char) c); } prevSpace = c == ' ' || c == '\t' || c == '\r' || c == '\n'; isStart = isStart && prevSpace; } while (c != quote); if (fNormalizeAttributes && fStringBuffer.length > 0) { // trailing whitespace already normalized to single space if (fStringBuffer.ch[fStringBuffer.length - 1] == ' ') { fStringBuffer.length--; } } fQName.setValues(null, aname, aname, null); String avalue = fStringBuffer.toString(); attributes.addAttribute(fQName, "CDATA", avalue); int lastattr = attributes.getLength() - 1; attributes.setSpecified(lastattr, true); attributes.setNonNormalizedValue(lastattr, fNonNormAttr.toString()); if (fAugmentations) { addLocationItem(attributes, attributes.getLength() - 1); } } else { fQName.setValues(null, aname, aname, null); attributes.addAttribute(fQName, "CDATA", ""); attributes.setSpecified(attributes.getLength() - 1, true); fCurrentEntity.rewind(); if (fAugmentations) { addLocationItem(attributes, attributes.getLength() - 1); } } return true; } // scanAttribute(XMLAttributesImpl):boolean /** * Scans a CDATA section. 
*/ protected void scanCDATA() throws IOException { fCurrentEntity.debugBufferIfNeeded("(scanCDATA: "); fStringBuffer.clear(); if (fCDATASections) { if (fDocumentHandler != null && fElementCount >= fElementDepth) { fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); if (DEBUG_CALLBACKS) { System.out.println("startCDATA()"); } fDocumentHandler.startCDATA(locationAugs()); } } else { fStringBuffer.append("[CDATA["); } boolean eof = scanMarkupContent(fStringBuffer, ']'); if (!fCDATASections) { fStringBuffer.append("]]"); } if (fDocumentHandler != null && fElementCount >= fElementDepth) { fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); if (fCDATASections) { if (DEBUG_CALLBACKS) { System.out.println("characters(" + fStringBuffer + ")"); } fDocumentHandler.characters(fStringBuffer, locationAugs()); if (DEBUG_CALLBACKS) { System.out.println("endCDATA()"); } fDocumentHandler.endCDATA(locationAugs()); } else { if (DEBUG_CALLBACKS) { System.out.println("comment(" + fStringBuffer + ")"); } fDocumentHandler.comment(fStringBuffer, locationAugs()); } } fCurrentEntity.debugBufferIfNeeded(")scanCDATA: "); if (eof) { throw new EOFException(); } } // scanCDATA() /** * Scans characters. 
*/ protected void scanCharacters() throws IOException { fCurrentEntity.debugBufferIfNeeded("(scanCharacters: "); fStringBuffer.clear(); while (true) { int newlines = skipNewlines(); if (newlines == 0 && fCurrentEntity.offset == fCurrentEntity.length) { fCurrentEntity.debugBufferIfNeeded(")scanCharacters: "); break; } char c; int offset = fCurrentEntity.offset - newlines; for (int i = offset; i < fCurrentEntity.offset; i++) { fCurrentEntity.buffer[i] = '\n'; } while (fCurrentEntity.hasNext()) { c = fCurrentEntity.getNextChar(); if (c == '<' || c == '&' || c == '\n' || c == '\r') { fCurrentEntity.rewind(); break; } } if (fCurrentEntity.offset > offset && fDocumentHandler != null && fElementCount >= fElementDepth) { if (DEBUG_CALLBACKS) { final XMLString xmlString = new XMLString(fCurrentEntity.buffer, offset, fCurrentEntity.offset - offset); System.out.println("characters(" + xmlString + ")"); } fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); fStringBuffer.append(fCurrentEntity.buffer, offset, fCurrentEntity.offset - offset); } fCurrentEntity.debugBufferIfNeeded(")scanCharacters: "); boolean hasNext = fCurrentEntity.offset < fCurrentEntity.buffer.length; int next = hasNext ? fCurrentEntity.getCurrentChar() : -1; if (next == '&' || next == '<' || next == -1) { break; } } // end while if (fStringBuffer.length != 0) { fDocumentHandler.characters(fStringBuffer, locationAugs()); } } // scanCharacters() /** * Scans a comment. 
*/ protected void scanComment() throws IOException { fCurrentEntity.debugBufferIfNeeded("(scanComment: "); fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); XMLStringBuffer buffer = new XMLStringBuffer(); boolean eof = scanMarkupContent(buffer, '-'); // no --> found, comment with end only with > if (eof) { fCurrentEntity.resetBuffer(buffer, fEndLineNumber, fEndColumnNumber, fEndCharacterOffset); buffer = new XMLStringBuffer(); // take a new one to avoid // interactions while (true) { int c = fCurrentEntity.read(); if (c == -1) { if (fReportErrors) { fErrorReporter.reportError("HTML1007", null); } eof = true; break; } else if (c != '>') { buffer.append((char) c); continue; } else if (c == '\n' || c == '\r') { fCurrentEntity.rewind(); int newlines = skipNewlines(); for (int i = 0; i < newlines; i++) { buffer.append('\n'); } continue; } eof = false; break; } } if (fDocumentHandler != null && fElementCount >= fElementDepth) { if (DEBUG_CALLBACKS) { System.out.println("comment(" + buffer + ")"); } fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); fDocumentHandler.comment(buffer, locationAugs()); } fCurrentEntity.debugBufferIfNeeded(")scanComment: "); if (eof) { throw new EOFException(); } } // scanComment() /** * Scans an end element. 
*/ protected void scanEndElement() throws IOException { String ename = scanName(); if (fReportErrors && ename == null) { fErrorReporter.reportError("HTML1012", null); } skipMarkup(false); if (ename != null) { ename = modifyName(ename, fNamesElems); if (fDocumentHandler != null && fElementCount >= fElementDepth) { fQName.setValues(null, ename, ename, null); if (DEBUG_CALLBACKS) { System.out.println("endElement(" + fQName + ")"); } fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); fDocumentHandler.endElement(fQName, locationAugs()); } } } // scanEndElement() /** * Scans markup content. */ protected boolean scanMarkupContent(XMLStringBuffer buffer, char cend) throws IOException { int c = -1; OUTER: while (true) { c = fCurrentEntity.read(); if (c == cend) { int count = 1; while (true) { c = fCurrentEntity.read(); if (c == cend) { count++; continue; } break; } if (c == -1) { if (fReportErrors) { fErrorReporter.reportError("HTML1007", null); } break OUTER; } if (count < 2) { buffer.append(cend); // if (c != -1) { fCurrentEntity.rewind(); // } continue; } if (c != '>') { for (int i = 0; i < count; i++) { buffer.append(cend); } fCurrentEntity.rewind(); continue; } for (int i = 0; i < count - 2; i++) { buffer.append(cend); } break; } else if (c == '\n' || c == '\r') { fCurrentEntity.rewind(); int newlines = skipNewlines(); for (int i = 0; i < newlines; i++) { buffer.append('\n'); } continue; } else if (c == -1) { if (fReportErrors) { fErrorReporter.reportError("HTML1007", null); } break; } buffer.append((char) c); } return c == -1; } // scanMarkupContent(XMLStringBuffer,char):boolean /** * Scans a processing instruction. 
*/ protected void scanPI() throws IOException { fCurrentEntity.debugBufferIfNeeded("(scanPI: "); if (fReportErrors) { fErrorReporter.reportWarning("HTML1008", null); } // scan processing instruction String target = scanName(); if (target != null && !target.equalsIgnoreCase("xml")) { while (true) { int c = fCurrentEntity.read(); if (c == '\r' || c == '\n') { if (c == '\r') { c = fCurrentEntity.read(); if (c != '\n') { fCurrentEntity.offset--; fCurrentEntity.characterOffset_--; } } fCurrentEntity.incLine(); continue; } if (c == -1) { break; } if (c != ' ' && c != '\t') { fCurrentEntity.rewind(); break; } } fStringBuffer.clear(); while (true) { int c = fCurrentEntity.read(); if (c == '?' || c == '/') { char c0 = (char) c; c = fCurrentEntity.read(); if (c == '>') { break; } else { fStringBuffer.append(c0); fCurrentEntity.rewind(); continue; } } else if (c == '\r' || c == '\n') { fStringBuffer.append('\n'); if (c == '\r') { c = fCurrentEntity.read(); if (c != '\n') { fCurrentEntity.offset--; fCurrentEntity.characterOffset_--; } } fCurrentEntity.incLine(); continue; } else if (c == -1) { break; } else { fStringBuffer.append((char) c); } } XMLString data = fStringBuffer; if (fDocumentHandler != null) { fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); fDocumentHandler.processingInstruction(target, data, locationAugs()); } } // scan xml/text declaration else { int beginLineNumber = fBeginLineNumber; int beginColumnNumber = fBeginColumnNumber; int beginCharacterOffset = fBeginCharacterOffset; fAttributes.removeAllAttributes(); int aindex = 0; while (scanPseudoAttribute(fAttributes)) { // if we haven't scanned a value, remove the entry as values have // special signification if (fAttributes.getValue(aindex).length() == 0) { fAttributes.removeAttributeAt(aindex); } else { fAttributes.getName(aindex, fQName); fQName.rawname = fQName.rawname.toLowerCase(); 
fAttributes.setName(aindex, fQName); aindex++; } } if (fDocumentHandler != null) { String version = fAttributes.getValue("version"); String encoding = fAttributes.getValue("encoding"); String standalone = fAttributes.getValue("standalone"); // if the encoding is successfully changed, the stream will be // processed again // with the right encoding an we will come here again but without // need to change the encoding final boolean xmlDeclNow = fIgnoreSpecifiedCharset || !changeEncoding(encoding); if (xmlDeclNow) { fBeginLineNumber = beginLineNumber; fBeginColumnNumber = beginColumnNumber; fBeginCharacterOffset = beginCharacterOffset; fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); fDocumentHandler.xmlDecl(version, encoding, standalone, locationAugs()); } } } fCurrentEntity.debugBufferIfNeeded(")scanPI: "); } // scanPI() /** * Scans a pseudo attribute. * * @param attributes The list of attributes. */ protected boolean scanPseudoAttribute(XMLAttributesImpl attributes) throws IOException { return scanAttribute(attributes, fSingleBoolean, '?'); } // scanPseudoAttribute(XMLAttributesImpl):boolean /** * Scans a start element. * * @param empty Is used for a second return value to indicate whether the start element tag is * empty (e.g. "/>"). */ protected String scanStartElement(boolean[] empty) throws IOException { String ename = scanName(); int length = ename != null ? ename.length() : 0; int c = length > 0 ? 
ename.charAt(0) : -1; if (length == 0 || !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) { if (fReportErrors) { fErrorReporter.reportError("HTML1009", null); } if (fDocumentHandler != null && fElementCount >= fElementDepth) { fStringBuffer.clear(); fStringBuffer.append('<'); if (length > 0) { fStringBuffer.append(ename); } fDocumentHandler.characters(fStringBuffer, null); } return null; } ename = modifyName(ename, fNamesElems); fAttributes.removeAllAttributes(); int beginLineNumber = fBeginLineNumber; int beginColumnNumber = fBeginColumnNumber; int beginCharacterOffset = fBeginCharacterOffset; while (scanAttribute(fAttributes, empty)) { // do nothing } fBeginLineNumber = beginLineNumber; fBeginColumnNumber = beginColumnNumber; fBeginCharacterOffset = beginCharacterOffset; if (fByteStream != null && fElementDepth == -1) { if (ename.equalsIgnoreCase("META")) { if (DEBUG_CHARSET) { System.out.println("+++ "); } String httpEquiv = getValue(fAttributes, "http-equiv"); if (httpEquiv != null && httpEquiv.equalsIgnoreCase("content-type")) { if (DEBUG_CHARSET) { System.out.println("+++ @content-type: \"" + httpEquiv + '"'); } String content = getValue(fAttributes, "content"); if (content != null) { content = removeSpaces(content); int index1 = content.toLowerCase().indexOf("charset="); if (index1 != -1 && !fIgnoreSpecifiedCharset) { final int index2 = content.indexOf(';', index1); final String charset = index2 != -1 ? 
content.substring(index1 + 8, index2) : content.substring(index1 + 8); changeEncoding(charset); } } } } else if (ename.equalsIgnoreCase("BODY")) { fByteStream.clear(); fByteStream = null; } else { HTMLElements.Element element = HTMLElements.getElement(ename); if (element.parent != null && element.parent.length > 0) { if (element.parent[0].code == HTMLElements.BODY) { fByteStream.clear(); fByteStream = null; } } } } if (fDocumentHandler != null && fElementCount >= fElementDepth) { fQName.setValues(null, ename, ename, null); if (DEBUG_CALLBACKS) { System.out.println("startElement(" + fQName + ',' + fAttributes + ")"); } fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); if (empty[0]) { fDocumentHandler.emptyElement(fQName, fAttributes, locationAugs()); } else { fDocumentHandler.startElement(fQName, fAttributes, locationAugs()); } } return ename; } // scanStartElement():ename /** * Tries to change the encoding used to read the input stream to the specified one * * @param charset the charset that should be used * @return true when the encoding has been changed */ private boolean changeEncoding(String charset) { if (charset == null || fByteStream == null) { return false; } charset = charset.trim(); boolean encodingChanged = false; try { String ianaEncoding = charset; String javaEncoding = EncodingMap.getIANA2JavaMapping(ianaEncoding.toUpperCase()); if (DEBUG_CHARSET) { System.out.println("+++ ianaEncoding: " + ianaEncoding); System.out.println("+++ javaEncoding: " + javaEncoding); } if (javaEncoding == null) { javaEncoding = ianaEncoding; if (fReportErrors) { fErrorReporter.reportError("HTML1001", new Object[]{ianaEncoding}); } } // patch: Marc Guillemot if (!javaEncoding.equals(fJavaEncoding)) { if (!isEncodingCompatible(javaEncoding, fJavaEncoding)) { if (fReportErrors) { fErrorReporter.reportError("HTML1015", new Object[]{ javaEncoding, fJavaEncoding}); } } 
// change the charset else { fIso8859Encoding = ianaEncoding == null || ianaEncoding.toUpperCase().startsWith("ISO-8859") || ianaEncoding.equalsIgnoreCase(fDefaultIANAEncoding); fJavaEncoding = javaEncoding; fCurrentEntity.setStream(new InputStreamReader(fByteStream, javaEncoding)); fByteStream.playback(); fElementDepth = fElementCount; fElementCount = 0; encodingChanged = true; } } } catch (UnsupportedEncodingException e) { if (fReportErrors) { fErrorReporter.reportError("HTML1010", new Object[]{charset}); } // NOTE: If the encoding change doesn't work, // then there's no point in continuing to // buffer the input stream. fByteStream.clear(); fByteStream = null; } return encodingChanged; } /** * Returns true if the given element has an end-tag. */ private boolean isEnded(String ename) { String content = new String(fCurrentEntity.buffer, fCurrentEntity.offset, fCurrentEntity.length - fCurrentEntity.offset); return content.toLowerCase().indexOf("") != -1; } /** * Removes all spaces for the string (remember: JDK 1.3!) */ private String removeSpaces(final String content) { StringBuffer sb = null; for (int i = content.length() - 1; i >= 0; --i) { if (Character.isWhitespace(content.charAt(i))) { if (sb == null) { sb = new StringBuffer(content); } sb.deleteCharAt(i); } } return (sb == null) ? 
content : sb.toString(); } private void scanScriptContent() throws IOException { final XMLStringBuffer buffer = new XMLStringBuffer(); boolean waitForEndComment = false; while (true) { int c = fCurrentEntity.read(); if (c == -1) { break; } else if (c == '-' && endsWith(buffer, "= 8 && "/script".equalsIgnoreCase(next.substring(0, 7)) && ('>' == next.charAt(7) || Character.isWhitespace(next.charAt(7)))) { fCurrentEntity.rewind(); break; } } else if (c == '>' && endsWith(buffer, "--")) { waitForEndComment = false; } if (c == '\r' || c == '\n') { fCurrentEntity.rewind(); int newlines = skipNewlines(); for (int i = 0; i < newlines; i++) { buffer.append('\n'); } } else { buffer.append((char) c); } } if (fScriptStripCommentDelims) { reduceToContent(buffer, ""); } if (fScriptStripCDATADelims) { reduceToContent(buffer, ""); } if (buffer.length > 0 && fDocumentHandler != null && fElementCount >= fElementDepth) { if (DEBUG_CALLBACKS) { System.out.println("characters(" + buffer + ")"); } fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); fDocumentHandler.characters(buffer, locationAugs()); } } // // Private methods // /** * Scans the content of




© 2015 - 2024 Weber Informatics LLC | Privacy Policy