Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* (c) Copyright 2007-2009 by Volker Bergmann. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, is permitted under the terms of the
* GNU General Public License.
*
* For redistributing this software or a derivative work under a license other
* than the GPL-compatible Free Software License as defined by the Free
* Software Foundation or approved by OSI, you must first obtain a commercial
* license to this software product from Volker Bergmann.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* WITHOUT A WARRANTY OF ANY KIND. ALL EXPRESS OR IMPLIED CONDITIONS,
* REPRESENTATIONS AND WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT, ARE
* HEREBY EXCLUDED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* (c) Copyright 2007 by Volker Bergmann. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, is permitted under the terms of the
* GNU General Public License.
*
* For redistributing this software or a derivative work under a license other
* than the GPL-compatible Free Software License as defined by the Free
* Software Foundation or approved by OSI, you must first obtain a commercial
* license to this software product from Volker Bergmann.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* WITHOUT A WARRANTY OF ANY KIND. ALL EXPRESS OR IMPLIED CONDITIONS,
* REPRESENTATIONS AND WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT, ARE
* HEREBY EXCLUDED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package org.databene.html;
import org.databene.commons.CharSet;
import org.databene.commons.OrderedMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Map;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.IOException;
import java.text.ParseException;
/**
* Default implementation of an {@link HTMLTokenizer}.
*
* Created: 15.06.2007 05:56:21
* @author Volker Bergmann
*/
public class DefaultHTMLTokenizer implements HTMLTokenizer {
private static Logger logger = LoggerFactory.getLogger(DefaultHTMLTokenizer.class);
private static final CharSet ELEMENT_NAME_CHARS = new CharSet('A','Z').addRange('a', 'z').addRange('0', '9').add('_').add(':').add('-');
private static final CharSet ATTR_NAME_CHARS = new CharSet('A','Z').addRange('a', 'z').addRange('0', '9').add('_').add('-').add(':');
// parser state
private PushbackReader reader;
private boolean script;
// token state
private int tokenType;
private int cursor;
private String text;
private int nameStart;
private int nameLength;
private String name;
private int attribCount;
private Map attributeMap;
// buffers
private char[] textBuffer;
private int[] attribNameFrom = new int[256];
private int[] attribNameUntil = new int[256];
private int[] attribValueFrom = new int[256];
private int[] attribValueUntil = new int[256];
public DefaultHTMLTokenizer(Reader reader) {
// create buffers
textBuffer = new char[65536];
attribNameFrom = new int[256];
attribNameUntil = new int[256];
attribValueFrom = new int[256];
attribValueUntil = new int[256];
// init parsing state
this.reader = new PushbackReader(reader, 256);
this.script = false;
}
public int nextToken() throws IOException, ParseException {
// init token state
cursor = 0;
nameStart = -1;
nameLength = 0;
name = null;
text = null;
attributeMap = null;
attribCount = 0;
// we've reached the EOF before - do nothing more!
if (this.tokenType == END)
return END;
if (script) {
// if the last start tag was to be script text
parseScript();
} else {
int nextChar = peek(reader);
switch (nextChar) {
case -1 : this.tokenType = END; break;
case '<' :
reader.read();
int c = peek(reader);
if (Character.isLetter(c) || c == '!' || c=='?' | c=='/') {
// it's the start of a tag
reader.unread(nextChar);
parseTag();
} else {
reader.unread(nextChar);
parseText();
}
break;
default : parseText(); break; // must be text
}
if (logger.isDebugEnabled())
logger.debug(text());
}
return this.tokenType;
}
/**
* @return if it's a kind of tag then the tag name, else null
*/
public int tokenType() {
return this.tokenType;
}
public String name() {
if (name == null && nameStart >= 0)
name = new String(textBuffer, nameStart, nameLength).intern();
return name;
}
/**
* @return the text that constitutes the current token as read from the source
*/
public String text() {
if (text == null)
text = new String(textBuffer, 0, cursor);
return text;
}
/**
* @return a map with all attributes of the token.
* In case of non-tag tokens or empty tags, an empty map is returned.
*/
public Map attributes() {
if (attributeMap == null) {
attributeMap = new OrderedMap();
for (int i = 0; i < attribCount; i++) {
String attribName = new String(textBuffer, attribNameFrom[i], attribNameUntil[i] - attribNameFrom[i]);
attribName = attribName.intern();
String attribValue = null;
if (attribValueFrom[i] >= 0)
attribValue = new String(textBuffer, attribValueFrom[i], attribValueUntil[i] - attribValueFrom[i]);
attributeMap.put(attribName, attribValue);
}
}
return attributeMap;
}
// parser implementation -------------------------------------------------------------------------------------------
/**
* parses anything that follows until it hits a </script>
* @throws IOException
*/
private void parseScript() throws IOException {
readUntil("", false, false);
this.script = false;
this.tokenType = SCRIPT;
}
private void parseTag() throws IOException, ParseException {
assertChar('<');
switch (peek(reader)) {
case '!' : // comment or doctype
assertChar('!');
if (peek(reader) == '-') {
// comment
assertChar('-');
assertChar('-');
readUntil("-->", true);
this.tokenType = COMMENT;
} else{
// doctype
parseElementName();
readUntil('>'); // no detailed paring for the doctype
assertChar('>');
this.tokenType = DOCUMENT_TYPE;
}
break;
case '/' :
assertChar('/');
parseElementName();
expectChar('>');
this.tokenType = END_TAG;
break;
case '?' : // processing instruction
assertChar('?');
parseElementName();
parseAttributes();
assertChar('?');
assertChar('>');
this.tokenType = PROCESSING_INSTRUCTION;
break;
default :
// regular tag
parseElementName();
parseAttributes();
skipWhitespace();
if (peek(reader) == '/') {
assertChar('/');
this.tokenType = CLOSED_TAG;
} else {
this.tokenType = START_TAG;
}
expectChar('>');
if ("SCRIPT".equalsIgnoreCase(name())) // if it's a script start tag,
// set a marker to parse the following stuff specially
script = true;
break;
}
}
private void parseAttributes() throws IOException, ParseException {
while (parseAttribute()) {
}
readUntilOneOf("?/>");
}
private void parseElementName() throws IOException {
nameStart = cursor;
parseString(ELEMENT_NAME_CHARS);
nameLength = cursor - nameStart;
}
private boolean parseAttribute() throws IOException, ParseException {
skipWhitespace();
attribNameFrom[attribCount] = cursor;
if (!parseAttributeName())
return false;
attribNameUntil[attribCount] = cursor;
while (textBuffer[attribNameUntil[attribCount] - 1] == ':') // fix for bad HTML: remove trailing colons
attribNameUntil[attribCount]--;
skipWhitespace();
if (peek(reader) == '=') {
parseAttributeValueAssignment();
} else {
attribValueFrom[attribCount] = -1;
attribValueUntil[attribCount] = -1;
}
attribCount++;
return true;
}
private void parseAttributeValueAssignment() throws IOException, ParseException {
assertChar('=', true);
skipWhitespace();
parseAttributeValue();
}
private void parseAttributeValue() throws IOException, ParseException {
char quoteChar;
int c = peek(reader);
if (c == '\'' || c == '"') {
quoteChar = (char) c;
textBuffer[cursor++] = (char)c;
parseQuotedAttributeValue(String.valueOf(quoteChar));
} else {
attribValueFrom[attribCount] = cursor;
readUntilOneOf(" >");
attribValueUntil[attribCount] = cursor;
}
}
private void parseQuotedAttributeValue(String quoteChars) throws IOException, ParseException {
int quoteChar = reader.read();
attribValueFrom[attribCount] = cursor;
if (quoteChars.indexOf(quoteChar) < 0)
throw new ParseException("Expected quotation like " + quoteChars + ", found: " + quoteChar, 0);
readUntil((char)quoteChar);
attribValueUntil[attribCount] = cursor;
assertChar((char)quoteChar);
}
private boolean parseAttributeName() throws IOException {
return parseString(ATTR_NAME_CHARS);
}
private boolean parseString(CharSet charSet) throws IOException {
boolean stringFound = false;
int c;
while ((c = reader.read()) != -1 && charSet.contains((char)c)) {
textBuffer[cursor++] = (char)c;
stringFound = true;
}
if (c != -1)
reader.unread(c);
return stringFound;
}
private void parseText() throws IOException {
this.tokenType = TEXT;
boolean end = false;
do {
readUntil('<');
int c = reader.read();
if (c == -1)
end = true;
else if (c == '<') {
int next = reader.read();
if (next == '/' || next == '!' || next == '?' || Character.isLetter(next)) {
reader.unread(next);
reader.unread(c);
end = true;
} else {
textBuffer[cursor++] = (char)c;
textBuffer[cursor++] = (char)next;
}
} else
throw new RuntimeException("Unexpected token: " + (char)c);
} while (!end);
}
public void readUntil(String endText) throws IOException {
readUntil(endText, false);
}
public void readUntil(String delimiter, boolean includeDelimiter) throws IOException {
readUntil(delimiter, true, includeDelimiter);
}
public void readUntil(String delimiter, boolean caseSensitive, boolean includeDelimiter) throws IOException {
String cmp = (caseSensitive ? delimiter : delimiter.toUpperCase());
char[] endChars = new char[cmp.length()];
cmp.getChars(0, delimiter.length(), endChars, 0);
do {
int c;
while((c = reader.read()) != -1 && (caseSensitive ? c : Character.toUpperCase(c)) != endChars[0]) {
textBuffer[cursor++] = (char)c;
}
if (c == -1)
return;
int tmpCursor = cursor;
textBuffer[tmpCursor++] = (char)c;
int i;
if (endChars.length == 1) {
if (includeDelimiter) {
cursor++;
} else
reader.unread(c);
return;
} else {
for (i = 1; i < endChars.length; i++) {
c = reader.read();
textBuffer[tmpCursor++] = (char)c;
if ((caseSensitive ? c : Character.toUpperCase(c)) != endChars[i]) {
cursor += i + 1;
break;
}
}
if (i == delimiter.length()) {
if (includeDelimiter) {
//System.arraycopy(endChars, 0, textBuffer, cursor, endChars.length);
cursor += endChars.length;
} else
reader.unread(textBuffer, cursor, delimiter.length());
return;
}
}
} while (true);
}
private int readUntilOneOf(String delimiters) throws IOException {
int c;
while ((c = reader.read()) != -1 && delimiters.indexOf(c) < 0)
textBuffer[cursor++] = (char)c;
if (c != -1)
reader.unread(c);
return c;
}
private void readUntil(char delimiter) throws IOException {
int c;
boolean escaped = false;
while((c = reader.read()) != -1 && (c != delimiter || escaped)) {
textBuffer[cursor++] = (char)c;
escaped = (c == '\\');
}
if (c != -1)
reader.unread(c);
}
private static int peek(PushbackReader reader) throws IOException {
int c = reader.read();
reader.unread(c);
return c;
}
private void assertChar(char expectedChar) throws ParseException, IOException {
int c = reader.read();
if (c != expectedChar)
throw new ParseException("Expected: '" + expectedChar + "', found: '" + (char)c + "'", 0);
textBuffer[cursor++] = expectedChar;
}
private void expectChar(char expectedChar) throws IOException {
int c = reader.read();
if (c != expectedChar) {
String message = "Expected: '" + expectedChar + "', found: '" + (char)c + "'";
logger.error(message, new ParseException(message, -1));
reader.unread(c);
}
textBuffer[cursor++] = expectedChar;
}
private void assertChar(char expectedChar, boolean skipSpace) throws ParseException, IOException {
int c;
do {
c = reader.read();
if (c != -1)
textBuffer[cursor++] = (char)c;
} while (c != -1 && skipSpace && Character.isWhitespace(c));
if (c != expectedChar)
throw new ParseException("Expected: '" + expectedChar + "', found: '" + (char)c + "'", 0);
}
private void skipWhitespace() throws IOException {
int c;
while ((c = reader.read()) != -1 && Character.isWhitespace(c)) {
textBuffer[cursor++] = (char)c;
}
if (c != -1)
reader.unread(c);
}
}