com.ximpleware.VTDGen Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of vtd-xml Show documentation
Show all versions of vtd-xml Show documentation
XimpleWare's VTD-XML is, far and away, the industry's most advanced and powerful XML processing model for SOA and Cloud Computing
/*
* Copyright (C) 2002-2015 XimpleWare, [email protected]
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
/*VTD-XML is protected by US patents 7133857, 7260652, and 7761459*/
/*All licenses to any parties in litigation with XimpleWare have been expressly terminated. No new license, and no renewal of any revoked license,
* is granted to those parties as a result of re-downloading software from this or any other website*/
package com.ximpleware;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import com.ximpleware.parser.ISO8859_10;
import com.ximpleware.parser.ISO8859_11;
import com.ximpleware.parser.ISO8859_2;
import com.ximpleware.parser.ISO8859_3;
import com.ximpleware.parser.ISO8859_4;
import com.ximpleware.parser.ISO8859_5;
import com.ximpleware.parser.ISO8859_6;
import com.ximpleware.parser.ISO8859_7;
import com.ximpleware.parser.ISO8859_8;
import com.ximpleware.parser.ISO8859_9;
import com.ximpleware.parser.UTF8Char;
import com.ximpleware.parser.WIN1250;
import com.ximpleware.parser.WIN1251;
import com.ximpleware.parser.WIN1252;
import com.ximpleware.parser.WIN1253;
import com.ximpleware.parser.WIN1254;
import com.ximpleware.parser.WIN1255;
import com.ximpleware.parser.WIN1256;
import com.ximpleware.parser.WIN1257;
import com.ximpleware.parser.WIN1258;
import com.ximpleware.parser.XMLChar;
//import com.ximpleware.parser.ISO8859_11;
import com.ximpleware.parser.ISO8859_13;
import com.ximpleware.parser.ISO8859_14;
import com.ximpleware.parser.ISO8859_15;
/**
* VTD Generator implementation.
* Currently supports built-in entities only.
* It parses the DTD, but doesn't resolve declared entities.
*/
public class VTDGen {
/**
 * IReader for ASCII input: each byte of the enclosing parser's XMLDoc buffer
 * is one character, and getChar rejects bytes whose value is negative.
 */
class ASCIIReader implements IReader {
    public ASCIIReader() {
    }

    /**
     * Returns the byte at the parser offset and advances by one.
     * Throws the parser's shared EOF exception once offset reaches endOffset,
     * and a ParseException when the byte read is negative.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            final int current = XMLDoc[offset++];
            if (current >= 0) {
                return current;
            }
            throw new ParseException(
                    "ASCII encoding error: invalid ASCII Char");
        }
        throw e;
    }

    /** Advances past the current byte only when it equals ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws ParseException, EOFException, EncodingException {
        final boolean matched = (XMLDoc[offset] == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int current = XMLDoc[offset];
        if (current != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | current;
        }
        return (2L << 32) | '\n';
    }

    /** Returns the byte at the given offset cast to char. */
    public final char decode(int offset) {
        return (char) XMLDoc[offset];
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_10.decode.
 */
class ISO8859_10Reader implements IReader {
    public ISO8859_10Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_10.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_10.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_10.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_10.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader for ISO-8859-1 input: each byte of the enclosing parser's XMLDoc
 * buffer is one character whose code is the unsigned byte value.
 */
class ISO8859_1Reader implements IReader {
    public ISO8859_1Reader() {
    }

    /**
     * Returns the unsigned value of the byte at the parser offset and advances
     * by one; throws the parser's shared EOF exception once offset reaches
     * endOffset.
     */
    final public int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset >= endOffset)
            throw e;
        return XMLDoc[offset++] & 0xff;
    }

    /**
     * Advances past the current byte only when its unsigned value equals ch;
     * reports whether it did.
     */
    final public boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        // Mask to the unsigned value, as getChar/_getChar/decode do. Comparing
        // the raw signed byte could never match a character in 0x80-0xFF.
        if (ch == (XMLDoc[offset] & 0xff)) {
            offset++;
            return true;
        } else {
            return false;
        }
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    final public long _getChar(int offset) {
        int c = 0xff & XMLDoc[offset];
        if (c == '\r' && XMLDoc[offset + 1] == '\n')
            return (2L << 32) | '\n';
        return (1L << 32) | c;
    }

    /** Returns the unsigned value of the byte at the given offset as a char. */
    final public char decode(int offset) {
        return (char) (XMLDoc[offset] & 0xff);
    }

    /** Returns the position one byte before the parser offset. */
    final public int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_2.decode.
 */
class ISO8859_2Reader implements IReader {
    public ISO8859_2Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_2.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_2.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_2.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_2.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_3.decode.
 */
class ISO8859_3Reader implements IReader {
    public ISO8859_3Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_3.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_3.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_3.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_3.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_4.decode.
 */
class ISO8859_4Reader implements IReader {
    public ISO8859_4Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_4.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_4.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_4.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_4.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_5.decode.
 */
class ISO8859_5Reader implements IReader {
    public ISO8859_5Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_5.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_5.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_5.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_5.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_6.decode.
 */
class ISO8859_6Reader implements IReader {
    public ISO8859_6Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_6.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_6.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_6.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_6.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_7.decode.
 */
class ISO8859_7Reader implements IReader {
    public ISO8859_7Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_7.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_7.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_7.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_7.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_8.decode.
 */
class ISO8859_8Reader implements IReader {
    public ISO8859_8Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_8.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_8.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_8.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_8.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_9.decode.
 */
class ISO8859_9Reader implements IReader {
    public ISO8859_9Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_9.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_9.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_9.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_9.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_11.decode.
 */
class ISO8859_11Reader implements IReader {
    public ISO8859_11Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_11.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_11.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_11.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_11.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_13.decode.
 */
class ISO8859_13Reader implements IReader {
    public ISO8859_13Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_13.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_13.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_13.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_13.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_14.decode.
 */
class ISO8859_14Reader implements IReader {
    public ISO8859_14Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_14.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_14.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_14.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_14.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through ISO8859_15.decode.
 */
class ISO8859_15Reader implements IReader {
    public ISO8859_15Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return ISO8859_15.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (ISO8859_15.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = ISO8859_15.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return ISO8859_15.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader for UTF-16BE input held in the enclosing parser's XMLDoc buffer.
 * A character is one 16-bit unit (2 bytes, high byte first) or a surrogate
 * pair (4 bytes: high surrogate 0xD800-0xDBFF followed by low surrogate
 * 0xDC00-0xDFFF).
 */
class UTF16BEReader implements IReader {
    public UTF16BEReader() {
    }

    /**
     * Decodes the character at the parser offset and advances past it
     * (2 bytes, or 4 for a surrogate pair).
     *
     * @return the decoded code point
     * @throws EOFException the parser's shared instance, when offset has reached endOffset
     * @throws EncodingException when the first unit is a low surrogate, or the
     *         unit following a high surrogate is not a low surrogate
     */
    final public int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset >= endOffset)
            throw e;
        int temp = (XMLDoc[offset] & 0xff) << 8 | (XMLDoc[offset + 1] & 0xff);
        if ((temp < 0xd800) || (temp > 0xdfff)) { // not a surrogate
            offset += 2;
            return temp;
        }
        // a pair must start with a high surrogate
        if (temp < 0xd800 || temp > 0xdbff)
            throw new EncodingException("UTF 16 BE encoding error: should never happen");
        int val = temp;
        temp = (XMLDoc[offset + 2] & 0xff) << 8 | (XMLDoc[offset + 3] & 0xff);
        if (temp < 0xdc00 || temp > 0xdfff) {
            // has to be a low surrogate here
            throw new EncodingException("UTF 16 BE encoding error: should never happen");
        }
        val = ((val - 0xd800) << 10) + (temp - 0xdc00) + 0x10000;
        offset += 4;
        return val;
    }

    /**
     * Advances past the character at the parser offset only when it equals ch.
     *
     * @return true when the character matched and was skipped
     * @throws EncodingException on the same malformed surrogate sequences as getChar
     */
    final public boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        // UTF-16BE to UCS4 conversion
        int temp = (XMLDoc[offset] & 0xff) << 8 | (XMLDoc[offset + 1] & 0xff);
        if ((temp < 0xd800) || (temp > 0xdfff)) { // not a surrogate
            if (temp == ch) {
                offset += 2;
                return true;
            }
            return false;
        }
        if (temp < 0xd800 || temp > 0xdbff)
            throw new EncodingException("UTF 16 BE encoding error: should never happen");
        int val = temp;
        temp = (XMLDoc[offset + 2] & 0xff) << 8 | (XMLDoc[offset + 3] & 0xff);
        if (temp < 0xdc00 || temp > 0xdfff) {
            // has to be a low surrogate here
            throw new EncodingException("UTF 16 BE encoding error: should never happen");
        }
        val = ((val - 0xd800) << 10) + (temp - 0xdc00) + 0x10000;
        if (val == ch) {
            offset += 4;
            return true;
        }
        return false;
    }

    /** Not implemented for this encoding; always returns 0. */
    final public char decode(int offset) {
        return 0;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character. A CR is reported as '\n': 4 bytes long
     * when it is immediately followed by LF, 2 bytes otherwise. No validation
     * of surrogate sequences is performed here.
     */
    final public long _getChar(int offset) {
        final int first =
                ((XMLDoc[offset] & 0xff) << 8) | (XMLDoc[offset + 1] & 0xff);
        if ((first < 0xd800) || (first > 0xdfff)) { // not a surrogate
            if (first == '\r') {
                if (XMLDoc[offset + 3] == '\n'
                        && XMLDoc[offset + 2] == 0) {
                    return '\n' | (4L << 32);
                } else {
                    return '\n' | (2L << 32);
                }
            }
            return first | (2L << 32);
        }
        final int second =
                ((XMLDoc[offset + 2] & 0xff) << 8) | (XMLDoc[offset + 3] & 0xff);
        // The first unit is the high surrogate and the second the low one,
        // exactly as in getChar/skipChar. The previous code swapped the two
        // operands and produced a wrong code point.
        final long val = ((first - 0xd800) << 10) + (second - 0xdc00) + 0x10000;
        return val | (4L << 32);
    }

    /**
     * Returns offset - 2, or offset - 4 when the unit read at the parser
     * offset is a surrogate.
     * NOTE(review): the decision uses the unit at offset, not the unit that
     * precedes it; confirm this is what callers expect.
     */
    final public int getPrevOffset() {
        int temp = (XMLDoc[offset] & 0xff) << 8 | (XMLDoc[offset + 1] & 0xff);
        if (temp < 0xd800 || temp > 0xdfff) {
            return offset - 2;
        } else
            return offset - 4;
    }
}
class UTF16LEReader implements IReader {
public UTF16LEReader() {
}
final public int getChar()
throws EOFException, ParseException, EncodingException {
int val = 0;
if (offset >= endOffset)
throw e;
int temp = (XMLDoc[offset + 1] &0xff) << 8 | (XMLDoc[offset]& 0xff);
if (temp < 0xd800 || temp > 0xdfff) { // check for low surrogate
offset += 2;
return temp;
} else {
if (temp<0xd800 || temp>0xdbff)
throw new EncodingException("UTF 16 LE encoding error: should never happen");
val = temp;
temp = (XMLDoc[offset + 3] &0xff) << 8 | (XMLDoc[offset + 2]&0xff);
if (temp < 0xdc00 || temp > 0xdfff) {
// has to be high surrogate
throw new EncodingException("UTF 16 LE encoding error: should never happen");
}
val = ((val - 0xd800) <<10) + (temp - 0xdc00) + 0x10000;
offset += 4;
return val;
}
}
final public boolean skipChar(int ch)
throws EOFException, EncodingException, ParseException {
int temp = (XMLDoc[offset + 1]&0xff) << 8 | (XMLDoc[offset]&0xff);
if (temp < 0xd800 ||temp > 0xdfff) { // check for low surrogate
if (temp == ch) {
offset += 2;
return true;
} else {
return false;
}
} else {
if (temp<0xd800 || temp>0xdbff)
throw new EncodingException("UTF 16 LE encoding error: should never happen");
int val = temp;
temp = (XMLDoc[offset + 3] &0xff)<< 8 | (XMLDoc[offset + 2]&0xff);
if (temp < 0xdc00 || temp > 0xdfff) {
// has to be high surrogate
throw new EncodingException("UTF 16 LE encoding error: should never happen");
}
val = ((val - 0xd800)<<10) + (temp - 0xdc00) + 0x10000;
if (val == ch) {
offset += 4;
return true;
} else
return false;
}
}
final public char decode(int offset){
return 0;
}
final public long _getChar(int offset){
// implement UTF-16LE to UCS4 conversion
int val, temp =
(XMLDoc[offset + 1 ] & 0xff)
<< 8 | (XMLDoc[offset] & 0xff);
if (temp < 0xdc00 || temp > 0xdfff) { // check for low surrogate
if (temp == '\r') {
if (XMLDoc[offset + 2] == '\n'
&& XMLDoc[offset + 3] == 0) {
return '\n' | (4L<<32) ;
} else {
return '\n' | (2L<<32);
}
}
return temp | (2L<<32);
} else {
val = temp;
temp =
(XMLDoc[offset + 3]&0xff)
<< 8 | (XMLDoc[offset + 2] & 0xff);
val = ((temp - 0xd800)<<10) + (val - 0xdc00) + 0x10000;
return val | (4L<<32);
}
}
final public int getPrevOffset(){
//int prevOffset = offset;
int temp;
temp =(XMLDoc[offset]&0xff) << 8 | (XMLDoc[offset + 1]&0xff);
if (temp < 0xd800 || temp > 0xdfff) {
return offset - 2;
} else
return offset - 4;
}
}
/**
 * IReader for UTF-8 input held in the enclosing parser's XMLDoc buffer.
 * Bytes with a non-negative value are single-byte characters. Any other lead
 * byte is handed to a multi-byte decoder that accepts sequences which
 * UTF8Char.byteCount classifies as 2 to 6 bytes long.
 */
class UTF8Reader implements IReader {
    public UTF8Reader() {
    }

    /**
     * Decodes the character at the parser offset and advances past it; throws
     * the parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset >= endOffset) {
            throw e;
        }
        final int lead = XMLDoc[offset];
        if (lead < 0) {
            return handleUTF8(lead);
        }
        ++offset;
        return lead;
    }

    /**
     * Decodes the multi-byte sequence whose (sign-extended) lead byte is temp
     * and advances the parser offset past it. Throws ParseException when the
     * byte count is outside 2..6 or a continuation byte is not of the form 10xxxxxx.
     */
    private int handleUTF8(int temp) throws EncodingException, ParseException {
        final int lead = temp & 0xff;
        final int count = UTF8Char.byteCount(lead);
        if (count < 2 || count > 6) {
            throw new ParseException(
                    "UTF 8 encoding error: should never happen");
        }
        final int trailing = count - 1; // number of continuation bytes
        // The lead byte keeps its low (7 - count) bits: 0x1f, 0x0f, 0x07, 0x03, 0x01.
        int codePoint = (lead & (0xff >> (count + 1))) << (6 * trailing);
        for (int k = 1; k <= trailing; k++) {
            final int cont = XMLDoc[offset + k];
            if ((cont & 0xc0) != 0x80) {
                throw new ParseException(
                        "UTF 8 encoding error: should never happen");
            }
            codePoint |= (cont & 0x3f) << (6 * (trailing - k));
        }
        offset += trailing + 1;
        return codePoint;
    }

    /** Advances past the character at the parser offset only when it equals ch. */
    public final boolean skipChar(int ch)
            throws EOFException, EncodingException, ParseException {
        final int lead = XMLDoc[offset];
        if (lead < 0) {
            return skipUTF8(lead, ch);
        }
        if (ch != lead) {
            return false;
        }
        offset++;
        return true;
    }

    /**
     * Decodes the multi-byte sequence at the parser offset like handleUTF8,
     * but moves the offset only when the decoded value equals ch.
     */
    private boolean skipUTF8(int temp, int ch) throws EncodingException, ParseException {
        final int lead = temp & 0xff;
        final int count = UTF8Char.byteCount(lead);
        if (count < 2 || count > 6) {
            throw new ParseException(
                    "UTF 8 encoding error: should never happen");
        }
        final int trailing = count - 1;
        int codePoint = (lead & (0xff >> (count + 1))) << (6 * trailing);
        for (int k = 1; k <= trailing; k++) {
            final int cont = XMLDoc[offset + k];
            if ((cont & 0xc0) != 0x80) {
                throw new ParseException(
                        "UTF 8 encoding error: should never happen");
            }
            codePoint |= (cont & 0x3f) << (6 * (trailing - k));
        }
        if (codePoint != ch) {
            return false;
        }
        offset += trailing + 1;
        return true;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character. A CR is reported as '\n': 2 bytes long
     * when it is immediately followed by LF, 1 byte otherwise.
     */
    public final long _getChar(int offset) {
        final int lead = XMLDoc[offset];
        if (lead < 0) {
            return handle_utf8(lead, offset);
        }
        if (lead != '\r') {
            return lead | (1L << 32);
        }
        if (XMLDoc[offset + 1] == '\n') {
            return '\n' | (2L << 32);
        }
        return '\n' | (1L << 32);
    }

    /**
     * Non-validating multi-byte decode used by _getChar. When
     * UTF8Char.byteCount reports a count outside 2..6 the result is
     * character 0 with a byte length of 1.
     */
    private long handle_utf8(int temp, int offset) {
        final int count = UTF8Char.byteCount((int) temp & 0xff);
        int mask = 0;
        int shift = 0;
        int trailing = 0;
        if (count >= 2 && count <= 6) {
            trailing = count - 1;
            mask = 0xff >> (count + 1);
            shift = 6 * trailing;
        }
        long val = (temp & mask) << shift;
        for (int k = 1; k <= trailing; k++) {
            final int cont = XMLDoc[offset + k];
            val |= (cont & 0x3f) << (6 * (trailing - k));
        }
        return val | (((long) (trailing + 1)) << 32);
    }

    /** Not implemented for this encoding; always returns 0. */
    public final char decode(int offset) {
        return 0;
    }

    /**
     * Walks backwards from the parser offset over continuation bytes
     * (10xxxxxx) and returns the position of the first byte that is not one.
     */
    public final int getPrevOffset() {
        int cursor = offset - 1;
        while (XMLDoc[cursor] < 0
                && (XMLDoc[cursor] & (byte) 0xc0) == (byte) 0x80) {
            cursor--;
        }
        return cursor;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through WIN1250.decode.
 */
class WIN1250Reader implements IReader {
    public WIN1250Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return WIN1250.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (WIN1250.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = WIN1250.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return WIN1250.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through WIN1251.decode.
 */
class WIN1251Reader implements IReader {
    public WIN1251Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return WIN1251.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (WIN1251.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = WIN1251.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return WIN1251.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through WIN1252.decode.
 */
class WIN1252Reader implements IReader {
    public WIN1252Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return WIN1252.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (WIN1252.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = WIN1252.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return WIN1252.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through WIN1253.decode.
 */
class WIN1253Reader implements IReader {
    public WIN1253Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return WIN1253.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (WIN1253.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = WIN1253.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return WIN1253.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through WIN1254.decode.
 */
class WIN1254Reader implements IReader {
    public WIN1254Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return WIN1254.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (WIN1254.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = WIN1254.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return WIN1254.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through WIN1255.decode.
 */
class WIN1255Reader implements IReader {
    public WIN1255Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return WIN1255.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (WIN1255.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = WIN1255.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return WIN1255.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through WIN1256.decode.
 */
class WIN1256Reader implements IReader {
    public WIN1256Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return WIN1256.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (WIN1256.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = WIN1256.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return WIN1256.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through WIN1257.decode.
 */
class WIN1257Reader implements IReader {
    public WIN1257Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return WIN1257.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (WIN1257.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = WIN1257.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return WIN1257.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
/**
 * IReader that maps each byte of the enclosing parser's XMLDoc buffer to a
 * character through WIN1258.decode.
 */
class WIN1258Reader implements IReader {
    public WIN1258Reader() {
    }

    /**
     * Decodes the byte at the parser offset and advances by one; throws the
     * parser's shared EOF exception once offset reaches endOffset.
     */
    public final int getChar()
            throws EOFException, ParseException, EncodingException {
        if (offset < endOffset) {
            return WIN1258.decode(XMLDoc[offset++]);
        }
        throw e;
    }

    /** Advances past the current byte only when it decodes to ch; reports whether it did. */
    public final boolean skipChar(int ch)
            throws EOFException, ParseException, EncodingException {
        final boolean matched = (WIN1258.decode(XMLDoc[offset]) == ch);
        if (matched) {
            offset++;
        }
        return matched;
    }

    /**
     * Peeks at the given offset without moving the parser and returns
     * (byte length << 32) | character; a CR LF pair is reported as a 2-byte '\n'.
     */
    public final long _getChar(int offset) {
        final int decoded = WIN1258.decode(XMLDoc[offset]);
        if (decoded != '\r' || XMLDoc[offset + 1] != '\n') {
            return (1L << 32) | decoded;
        }
        return (2L << 32) | '\n';
    }

    /** Decodes the byte stored at the given offset. */
    public final char decode(int offset) {
        return WIN1258.decode(XMLDoc[offset]);
    }

    /** Returns the position one byte before the parser offset. */
    public final int getPrevOffset() {
        return offset - 1;
    }
}
// Size the constructor uses when allocating attr_name_array,
// prefixed_attr_name_array and prefix_URL_array.
private final static int ATTR_NAME_ARRAY_SIZE = 16;
// ---- Encoding format identifiers (FORMAT_*) ----
public final static int FORMAT_ASCII = 0;
public final static int FORMAT_ISO_8859_1 = 1;
public final static int FORMAT_ISO_8859_10 = 11;
public final static int FORMAT_ISO_8859_11 = 12;
public final static int FORMAT_ISO_8859_12 = 13;
public final static int FORMAT_ISO_8859_13 = 14;
public final static int FORMAT_ISO_8859_14 = 15;
public final static int FORMAT_ISO_8859_15 = 16;
public final static int FORMAT_ISO_8859_16 = 17;
public final static int FORMAT_ISO_8859_2 = 3;
public final static int FORMAT_ISO_8859_3 = 4;
public final static int FORMAT_ISO_8859_4 = 5;
public final static int FORMAT_ISO_8859_5 = 6;
public final static int FORMAT_ISO_8859_6 = 7;
public final static int FORMAT_ISO_8859_7 = 8;
public final static int FORMAT_ISO_8859_8 = 9;
public final static int FORMAT_ISO_8859_9 = 10;
public final static int FORMAT_UTF_16BE = 63;
public final static int FORMAT_UTF_16LE = 64;
// encoding format
public final static int FORMAT_UTF8 = 2;
public final static int FORMAT_WIN_1250 = 18;
public final static int FORMAT_WIN_1251 = 19;
public final static int FORMAT_WIN_1252 = 20;
public final static int FORMAT_WIN_1253 = 21;
public final static int FORMAT_WIN_1254 = 22;
public final static int FORMAT_WIN_1255 = 23;
public final static int FORMAT_WIN_1256 = 24;
public final static int FORMAT_WIN_1257 = 25;
public final static int FORMAT_WIN_1258 = 26;
// ---- Limits ----
public final static int MAX_DEPTH = 254; // maximum depth value
// max prefix length (2^9 - 1 = 511)
public final static int MAX_PREFIX_LENGTH = (1<<9) -1;
// max Qname length (2^11 - 1 = 2047)
public final static int MAX_QNAME_LENGTH = (1<<11) -1;
// max Token length (2^20 - 1 = 1048575)
public final static int MAX_TOKEN_LENGTH = (1<<20) -1;
// ---- Internal parser states (STATE_*) ----
private final static int STATE_ATTR_NAME = 3;
private final static int STATE_ATTR_VAL = 4;
private final static int STATE_CDATA = 12;
private final static int STATE_COMMENT = 11;
private final static int STATE_DEC_ATTR_NAME = 10;
private final static int STATE_DOC_END = 7; // end of document
private final static int STATE_DOC_START = 6; // beginning of document
private final static int STATE_DOCTYPE = 13;
private final static int STATE_END_COMMENT = 14;
// comment appearing after the last ending tag
private final static int STATE_END_PI = 15;
private final static int STATE_END_TAG = 2;
// internal parser state
private final static int STATE_LT_SEEN = 0; // encountered the first <
private final static int STATE_PI_TAG =8;
private final static int STATE_PI_VAL = 9;
private final static int STATE_START_TAG = 1;
private final static int STATE_TEXT = 5;
// Size the constructor uses when allocating tag_stack.
private final static int TAG_STACK_SIZE = 256;
// ---- Token types (TOKEN_*) ----
public final static int TOKEN_ATTR_NAME = 2;
public final static int TOKEN_ATTR_NS = 3;
public final static int TOKEN_ATTR_VAL = 4;
public final static int TOKEN_CDATA_VAL = 11;
public final static int TOKEN_CHARACTER_DATA = 5;
public final static int TOKEN_COMMENT = 6;
public final static int TOKEN_DEC_ATTR_NAME = 9;
public final static int TOKEN_DEC_ATTR_VAL = 10;
public final static int TOKEN_DOCUMENT = 13;
public final static int TOKEN_DTD_VAL = 12;
public final static int TOKEN_ENDING_TAG = 1;
public final static int TOKEN_PI_NAME = 7;
public final static int TOKEN_PI_VAL = 8;
//private final static int STATE_END_PI_VAL = 17;
// token type
public final static int TOKEN_STARTING_TAG = 0;
// Raw document bytes; the inner IReader classes index into this array.
protected byte[] XMLDoc;
// Reader in use (one of the IReader implementations declared in this class).
protected IReader r;
private int ch;
private int ch_temp;
// Byte position in XMLDoc that the readers consume from and advance.
protected int offset;
protected int depth;
private int length1, length2;
protected boolean helper=false;
protected boolean default_ns = false; //true xmlns='abc'
protected boolean isXML = false;
protected boolean singleByteEncoding;
protected boolean shallowDepth; // true if lc depth is 3
protected boolean ns,is_ns;
protected boolean br; //buffer reuse
private int temp_offset;
// The readers' getChar throws the shared EOF exception once offset >= endOffset.
protected int endOffset;
protected int prev_offset;
private int increment;
// Allocated with TAG_STACK_SIZE entries by the constructor.
protected long[] tag_stack;
// The next three arrays are allocated with ATTR_NAME_ARRAY_SIZE entries by the constructor.
private long[] attr_name_array;
private int attr_count;
private long[] prefixed_attr_name_array;
private int[] prefix_URL_array;
private int prefixed_attr_count;
protected FastLongBuffer VTDBuffer;
//protected int offset_adj; // determine the byte length for ':' for various encoding types
// again, in terms of byte, not char as encoded in VTD
protected FastLongBuffer l1Buffer;
protected FastLongBuffer l2Buffer;
protected FastIntBuffer l3Buffer;
protected FastLongBuffer _l3Buffer;
protected FastLongBuffer _l4Buffer;
protected FastIntBuffer _l5Buffer;
protected FastIntBuffer nsBuffer1;
protected FastLongBuffer nsBuffer2;
protected FastLongBuffer nsBuffer3;
private int last_depth;
private int last_l1_index;
private int last_l2_index;
private int last_l3_index;
private int last_l4_index;
// Single pre-built EOF exception, created in the constructor and thrown by
// every reader's getChar.
protected EOFException e;
protected int docLen;
protected int docOffset;
// presumably one of the FORMAT_* constants -- TODO confirm against the code that sets it
protected int encoding;
//namespace aware flag
// this is byte offset, not char offset as encoded in VTD
protected boolean ws; // to preserve whitespace or not, default to false
protected int rootIndex;
protected int VTDDepth; // Maximum Depth of VTDs
// The constructor initializes this to 3.
protected short LcDepth;
protected long currentElementRecord;
private boolean must_utf_8;
private boolean BOM_detected;
/**
 * Creates a VTDGen with freshly allocated scratch arrays and namespace
 * buffers and with the default settings: location cache depth 3, no buffer
 * reuse, and ignorable white space not recorded.
 */
public VTDGen() {
    // default settings
    VTDDepth = 0;
    LcDepth = 3;
    shallowDepth = true;
    singleByteEncoding = true;
    br = false;
    ws = false;
    helper = false;
    default_ns = false; // becomes true for xmlns='abc'
    isXML = false;
    currentElementRecord = 0;

    // fixed-size scratch arrays
    tag_stack = new long[TAG_STACK_SIZE];
    attr_name_array = new long[ATTR_NAME_ARRAY_SIZE];
    prefixed_attr_name_array = new long[ATTR_NAME_ARRAY_SIZE];
    prefix_URL_array = new int[ATTR_NAME_ARRAY_SIZE];

    // namespace bookkeeping buffers
    nsBuffer1 = new FastIntBuffer(4);
    nsBuffer2 = new FastLongBuffer(4);
    nsBuffer3 = new FastLongBuffer(4);

    // exception instance built once up front
    e = new EOFException("permature EOF reached, XML document incomplete");
}
/**
 * Resets the per-document state so this VTDGen can process the next file.
 * The VTD and location-cache buffers are released as well, unless buffer
 * reuse (br) is switched on.
 */
public void clear() {
    if (!br) {
        // no buffer reuse: drop every output buffer
        VTDBuffer = null;
        l1Buffer = null;
        l2Buffer = null;
        l3Buffer = null;
        _l3Buffer = null;
        _l4Buffer = null;
        _l5Buffer = null;
    }
    XMLDoc = null;

    // scanning position and current character
    offset = 0;
    temp_offset = 0;
    ch = 0;
    ch_temp = 0;
    depth = -1;
    increment = 1;
    rootIndex = 0;

    // location-cache bookkeeping
    last_depth = 0;
    last_l1_index = 0;
    last_l2_index = 0;
    last_l3_index = 0;
    last_l4_index = 0;

    // encoding detection flags
    BOM_detected = false;
    must_utf_8 = false;

    // namespace buffers are emptied, not reallocated
    nsBuffer1.size = 0;
    nsBuffer2.size = 0;
    nsBuffer3.size = 0;
    currentElementRecord = 0;
}
/**
 * Turns the collection of all white spaces, including the trivial ones, on or off.
 * By default trivial white spaces are ignored.
 * @param b true to record white-space-only text, false to ignore it
 */
public void enableIgnoredWhiteSpace(boolean b){
    this.ws = b;
}
/**
 * Enable VTDGen to generate Location Cache of either depth 3 or 5.
 * This method is meant to be called before setDoc() or parseFile().
 * @param i the requested location cache depth; only 3 and 5 are accepted
 * @throws IllegalArgumentException if i is neither 3 nor 5
 * @throws ParseException kept in the signature for backward compatibility;
 *         this method itself does not throw it
 */
public void selectLcDepth(int i) throws ParseException{
    if (i != 3 && i != 5)
        throw new IllegalArgumentException("LcDepth can only take the value of 3 or 5");
    // Assign in both directions: previously only the switch to depth 5 took
    // effect, so a generator once set to 5 could never be set back to 3.
    shallowDepth = (i == 3);
}
/**
 * Writes a character-data record for the white space between temp_offset and
 * the character just read; such white space is ignored by default.
 * Nothing is written outside the root element (depth of -1) or when the span is empty.
 */
private void addWhiteSpaceRecord() {
    if (depth <= -1)
        return;
    final int spanBytes = offset - increment - temp_offset;
    if (spanBytes == 0)
        return;
    // for the non-single-byte encodings both values are halved before being written
    final int start = singleByteEncoding ? temp_offset : temp_offset >> 1;
    final int len = singleByteEncoding ? spanBytes : spanBytes >> 1;
    writeVTDText(start, len, TOKEN_CHARACTER_DATA, depth);
}
/**
 * Inspects the first bytes of the document to detect a byte order mark or a
 * UTF-16 encoded "&lt;?" and sets encoding, increment, the reader r and the
 * BOM flags accordingly. It then checks the document size against the
 * supported maximum.
 * <p>
 * Documents too short to hold the byte sequence being tested are reported
 * through EncodingException (or treated as not matching) instead of failing
 * with an ArrayIndexOutOfBoundsException.
 * @throws EncodingException if the document is empty or starts with an
 *         incomplete or unknown byte order mark
 * @throws ParseException if the document is too big
 */
private void decide_encoding() throws EncodingException,ParseException {
    // bytes that can be read starting at the current offset
    final int avail = XMLDoc.length - offset;
    if (XMLDoc.length == 0 || avail <= 0)
        throw new EncodingException("Document is zero sized ");
    final byte first = XMLDoc[offset];
    if (first == -2) { // 0xFE: must be followed by 0xFF (UTF-16BE BOM)
        increment = 2;
        if (avail >= 2 && XMLDoc[offset + 1] == -1) {
            offset += 2;
            encoding = FORMAT_UTF_16BE;
            BOM_detected = true;
            r = new UTF16BEReader();
        } else
            // message corrected: the expected byte order here is 0xfe 0xff
            throw new EncodingException("Unknown Character encoding: should be 0xfe 0xff");
    } else if (first == -1) { // 0xFF: must be followed by 0xFE (UTF-16LE BOM)
        increment = 2;
        if (avail >= 2 && XMLDoc[offset + 1] == -2) {
            offset += 2;
            encoding = FORMAT_UTF_16LE;
            BOM_detected = true;
            r = new UTF16LEReader();
        } else
            throw new EncodingException("Unknown Character encoding: not UTF-16LE");
    } else if (first == -17) { // 0xEF: must be followed by 0xBB 0xBF (UTF-8 BOM)
        if (avail >= 3 && XMLDoc[offset + 1] == -69 && XMLDoc[offset + 2] == -65) {
            offset += 3;
            must_utf_8 = true;
        } else
            throw new EncodingException("Unknown Character encoding: not UTF-8");
    } else if (first == 0) { // 00 3C 00 3F: "<?" in UTF-16BE without a BOM
        if (avail >= 4
                && XMLDoc[offset + 1] == 0x3c
                && XMLDoc[offset + 2] == 0
                && XMLDoc[offset + 3] == 0x3f) {
            encoding = FORMAT_UTF_16BE;
            increment = 2;
            r = new UTF16BEReader();
        } else
            throw new EncodingException("Unknown Character encoding: not UTF-16BE");
    } else if (first == 0x3c) { // 3C 00 3F 00: "<?" in UTF-16LE without a BOM
        if (avail >= 4
                && XMLDoc[offset + 1] == 0
                && XMLDoc[offset + 2] == 0x3f
                && XMLDoc[offset + 3] == 0) {
            increment = 2;
            encoding = FORMAT_UTF_16LE;
            r = new UTF16LEReader();
        }
        // otherwise the encoding is left as it was on entry
    }
    // check for max file size exception
    if (encoding < FORMAT_UTF_16BE) {
        if (ns) {
            if ((offset + (long) docLen) >= 1L << 30)
                throw new ParseException("Other error: file size too big >=1GB ");
        } else {
            if ((offset + (long) docLen) >= 1L << 31)
                throw new ParseException("Other error: file size too big >=2GB ");
        }
    } else {
        //offset_adj = 2;
        if ((offset + (long) docLen) >= 1L << 31)
            throw new ParseException("Other error: file size too large >= 2GB");
    }
    if (encoding >= FORMAT_UTF_16BE)
        singleByteEncoding = false;
}
/**
 * Reads what follows an '&amp;': either a numeric character reference
 * (decimal or hexadecimal) or one of the built-in entities amp, apos, quot,
 * lt and gt, and returns the character it stands for. The reader is advanced
 * past the terminating ';'.
 * <p>
 * Both the decimal and the hexadecimal form are validated with
 * XMLChar.isValidChar, and a reference whose value exceeds 0x10FFFF is
 * rejected as soon as it is seen, so the value cannot overflow and wrap
 * around into a valid code point.
 * @return int the referenced character
 * @throws com.ximpleware.EntityException if the reference is malformed or
 *         denotes a character that is not a valid XML character
 * @throws com.ximpleware.ParseException Super class for any exception during parsing.
 * @throws com.ximpleware.EncodingException UTF/native encoding exception.
 * @throws com.ximpleware.EOFException End of file exception.
 */
private int entityIdentifier() throws EntityException, EncodingException,EOFException, ParseException {
    int ch = r.getChar();
    int val = 0;
    switch (ch) {
        case '#':
            ch = r.getChar();
            if (ch == 'x') {
                // hexadecimal character reference
                while (true) {
                    ch = r.getChar();
                    final int digit;
                    if (ch >= '0' && ch <= '9') {
                        digit = ch - '0';
                    } else if (ch >= 'a' && ch <= 'f') {
                        digit = ch - 'a' + 10;
                    } else if (ch >= 'A' && ch <= 'F') {
                        digit = ch - 'A' + 10;
                    } else if (ch == ';') {
                        // fall through to the validity check below; the
                        // value used to be returned here unchecked
                        break;
                    } else
                        throw new EntityException("Errors in char reference: Illegal char following .");
                    val = (val << 4) + digit;
                    if (val > 0x10FFFF)
                        throw new EntityException("Errors in entity reference: Invalid XML char.");
                }
            } else {
                // decimal character reference; ch already holds the first digit
                while (true) {
                    if (ch >= '0' && ch <= '9') {
                        val = val * 10 + (ch - '0');
                        if (val > 0x10FFFF)
                            throw new EntityException("Errors in entity reference: Invalid XML char.");
                    } else if (ch == ';') {
                        break;
                    } else
                        throw new EntityException("Errors in char reference: Illegal char following .");
                    ch = r.getChar();
                }
            }
            if (!XMLChar.isValidChar(val)) {
                throw new EntityException("Errors in entity reference: Invalid XML char.");
            }
            return val;
        case 'a' :
            ch = r.getChar();
            if (ch == 'm') {
                // &amp;
                if (r.getChar() == 'p' && r.getChar() == ';') {
                    return '&';
                } else
                    throw new EntityException("Errors in Entity: Illegal builtin reference");
            } else if (ch == 'p') {
                // &apos;
                if (r.getChar() == 'o'
                        && r.getChar() == 's'
                        && r.getChar() == ';') {
                    return '\'';
                } else
                    throw new EntityException("Errors in Entity: Illegal builtin reference");
            } else
                throw new EntityException("Errors in Entity: Illegal builtin reference");
        case 'q' :
            // &quot;
            if (r.getChar() == 'u'
                    && r.getChar() == 'o'
                    && r.getChar() == 't'
                    && r.getChar() == ';') {
                return '"';
            } else
                throw new EntityException("Errors in Entity: Illegal builtin reference");
        case 'l' :
            // &lt;
            if (r.getChar() == 't' && r.getChar() == ';') {
                return '<';
            } else
                throw new EntityException("Errors in Entity: Illegal builtin reference");
        case 'g' :
            // &gt;
            if (r.getChar() == 't' && r.getChar() == ';') {
                return '>';
            } else
                throw new EntityException("Errors in Entity: Illegal builtin reference");
        default :
            throw new EntityException("Errors in Entity: Illegal entity char");
    }
}
/**
 * Writes the remaining portion of the location cache info: for the depth
 * recorded in last_depth, one final entry made of the matching last index in
 * the upper 32 bits and 0xffffffff in the lower 32 bits.
 */
private void finishUp(){
    final long lowerAllOnes = 0xffffffffL;
    switch (last_depth) {
        case 1:
            l1Buffer.append(((long) last_l1_index << 32) | lowerAllOnes);
            break;
        case 2:
            l2Buffer.append(((long) last_l2_index << 32) | lowerAllOnes);
            break;
        case 3:
            // depths 3 and 4 are only completed here when the cache depth is not 3
            if (!shallowDepth)
                _l3Buffer.append(((long) last_l3_index << 32) | lowerAllOnes);
            break;
        case 4:
            if (!shallowDepth)
                _l4Buffer.append(((long) last_l4_index << 32) | lowerAllOnes);
            break;
        default:
            break;
    }
}
/**
 * Builds the position string (line number and offset) for the current
 * parsing offset, for use in exception messages.
 * @return java.lang.String indicating the line number and offset of the exception
 */
private String formatLineNumber() {
    final int current = this.offset;
    return formatLineNumber(current);
}
/**
 * Builds the position string for the given byte offset by counting the
 * newline characters between docOffset and that offset.
 * @param os byte offset into XMLDoc
 * @return a string of the form "\nLine Number: N Offset: M"
 */
private String formatLineNumber(int os) {
    int newlines = 0;
    int lastBreak = 0; // byte position of the last newline seen, 0 if none
    final int column;
    if (encoding < FORMAT_UTF_16BE) {
        // one byte per step
        for (int pos = docOffset; pos < os; pos++) {
            if (XMLDoc[pos] == '\n') {
                newlines++;
                lastBreak = pos;
            }
        }
        column = os - lastBreak;
    } else {
        // two bytes per step; the '\n' byte is the second one for UTF-16BE
        // and the first one otherwise, the other byte must be 0
        final int nlIndex = (encoding == FORMAT_UTF_16BE) ? 1 : 0;
        for (int pos = docOffset; pos <= os - 2; pos += 2) {
            if (XMLDoc[pos + nlIndex] == '\n' && XMLDoc[pos + 1 - nlIndex] == 0) {
                newlines++;
                lastBreak = pos;
            }
        }
        column = (os - lastBreak) >> 1;
    }
    return "\nLine Number: " + (newlines + 1) + " Offset: " + (column - 1);
}
/**
 * The entity ignorant version of getCharAfterS: reads characters until one
 * that is not a space character (per XMLChar.isSpaceChar) and returns it.
 * @return int the first non-space character read
 * @throws ParseException
 * @throws EncodingException
 * @throws com.ximpleware.EOFException
 */
final private int getCharAfterS()
        throws ParseException, EncodingException, EOFException {
    int c = r.getChar();
    while (XMLChar.isSpaceChar(c)) {
        c = r.getChar();
    }
    return c;
}
/**
* The entity aware version of getCharAfterS
* @return int
* @throws ParseException Super class for any exception during parsing.
* @throws EncodingException UTF/native encoding exception.
* @throws com.ximpleware.EOFException End of file exception.
*/
// private int getCharAfterSe()
// throws ParseException, EncodingException, EOFException {
// int n = 0;
// int temp; //offset saver
// while (true) {
// n = r.getChar();
// if (!XMLChar.isSpaceChar(n)) {
// if (n != '&')
// return n;
// else {
// temp = offset;
// if (!XMLChar.isSpaceChar(entityIdentifier())) {
// offset = temp; // rewind
// return '&';
// }
// }
// }
// n = r.getChar();
// if (!XMLChar.isSpaceChar(n)) {
// if (n != '&')
// return n;
// else {
// temp = offset;
// if (!XMLChar.isSpaceChar(entityIdentifier())) {
// offset = temp; // rewind
// return '&';
// }
// }
// }
// }
// }
/**
 * Pre-compute the size of VTD+XML index.
 * <p>
 * The result is the document length rounded up to a multiple of 8, plus
 * 8 bytes per entry of VTDBuffer, l1Buffer and l2Buffer, plus 4 bytes per
 * entry of l3Buffer (entry count rounded up to an even number), plus 64.
 * The sum is accumulated as a long so that large documents or buffers do
 * not overflow int arithmetic.
 * <p>
 * NOTE(review): only the depth-3 location cache buffers are counted here;
 * confirm what should happen after selectLcDepth(5).
 * @return size of the index
 */
public long getIndexSize(){
    // document bytes, padded to the next multiple of 8
    long size = ((docLen & 7) == 0)
            ? (long) docLen
            : (((long) docLen >> 3) + 1) << 3;
    // 8 bytes for every long entry
    size += ((long) VTDBuffer.size << 3)
            + ((long) l1Buffer.size << 3)
            + ((long) l2Buffer.size << 3);
    // 4 bytes for every int entry, with an odd count padded by one entry
    long l3Entries = l3Buffer.size;
    if ((l3Entries & 1) != 0)
        l3Entries++;
    size += l3Entries << 2;
    return size + 64;
}
/**
 * Returns the VTDNav object after parsing: a VTDNav when the location cache
 * depth is 3, a VTDNav_L5 otherwise. Afterwards the internal state is
 * cleaned via clear() and the reader is reset to a UTF8Reader, so VTDGen can
 * process the next file.
 * @return com.ximpleware.VTDNav
 */
public VTDNav getNav() {
    final UniByteBuffer docBytes = new UniByteBuffer(XMLDoc);
    final VTDNav nav;
    if (!shallowDepth) {
        nav = new VTDNav_L5(rootIndex, encoding, ns, VTDDepth,
                docBytes, VTDBuffer, l1Buffer, l2Buffer,
                _l3Buffer, _l4Buffer, _l5Buffer, docOffset, docLen);
    } else {
        nav = new VTDNav(rootIndex, encoding, ns, VTDDepth,
                docBytes, VTDBuffer, l1Buffer, l2Buffer,
                l3Buffer, docOffset, docLen);
    }
    clear();
    r = new UTF8Reader();
    return nav;
}
/**
 * Get the offset value of previous character, i.e. the byte offset at which
 * the character that ends right before the current offset begins.
 * <p>
 * For UTF-8 the method steps back over continuation bytes (10xxxxxx). For
 * the single-byte encodings it steps back one byte. For UTF-16 it looks at
 * the code unit that ends at the current offset, assembled in the byte order
 * of the encoding: if that unit is a low surrogate the character is a
 * surrogate pair and begins four bytes back, otherwise two bytes back.
 * (Earlier the unit at the current offset was inspected instead, and the
 * little-endian case assembled it in big-endian order.)
 * @return int
 * @throws ParseException if the current encoding is not one of the known formats
 */
private int getPrevOffset() throws ParseException {
    int prevOffset = offset;
    int temp;
    switch (encoding) {
        case FORMAT_UTF8 :
            do {
                prevOffset--;
            } while ((XMLDoc[prevOffset] & 0xc0) == 0x80);
            return prevOffset;
        case FORMAT_ASCII :
        case FORMAT_ISO_8859_1:
        case FORMAT_ISO_8859_2:
        case FORMAT_ISO_8859_3:
        case FORMAT_ISO_8859_4:
        case FORMAT_ISO_8859_5:
        case FORMAT_ISO_8859_6:
        case FORMAT_ISO_8859_7:
        case FORMAT_ISO_8859_8:
        case FORMAT_ISO_8859_9:
        case FORMAT_ISO_8859_10:
        case FORMAT_ISO_8859_11:
        case FORMAT_ISO_8859_13:
        case FORMAT_ISO_8859_14:
        case FORMAT_ISO_8859_15:
        case FORMAT_WIN_1250:
        case FORMAT_WIN_1251:
        case FORMAT_WIN_1252:
        case FORMAT_WIN_1253:
        case FORMAT_WIN_1254:
        case FORMAT_WIN_1255:
        case FORMAT_WIN_1256:
        case FORMAT_WIN_1257:
        case FORMAT_WIN_1258:
            return offset - 1;
        case FORMAT_UTF_16LE :
            if (offset < 2)
                return offset - 2; // no complete code unit precedes the offset
            // little endian: low byte first, high byte second
            temp = (XMLDoc[offset - 1] & 0xff) << 8 | (XMLDoc[offset - 2] & 0xff);
            if (temp < 0xdc00 || temp > 0xdfff) {
                return offset - 2;
            } else
                return offset - 4;
        case FORMAT_UTF_16BE :
            if (offset < 2)
                return offset - 2; // no complete code unit precedes the offset
            // big endian: high byte first, low byte second
            temp = (XMLDoc[offset - 2] & 0xff) << 8 | (XMLDoc[offset - 1] & 0xff);
            if (temp < 0xdc00 || temp > 0xdfff) {
                return offset - 2;
            } else
                return offset - 4;
        default :
            throw new ParseException("Other Error: Should never happen");
    }
}
/**
 * Loads a VTD+XML index from a byte array into this VTDGen and returns the
 * navigator built from it.
 * @return VTDNav
 * @param ba the bytes of the VTD+XML index
 * @throws IOException
 * @throws IndexReadException
 */
public VTDNav loadIndex(byte[] ba)throws IOException,IndexReadException{
    IndexHandler.readIndex(ba, this);
    final VTDNav nav = getNav();
    return nav;
}
/**
 * Loads a VTD+XML index from an input stream into this VTDGen and returns
 * the navigator built from it. The stream is not closed by this method.
 * @return VTDNav
 * @param is the stream to read the VTD+XML index from
 * @throws IOException
 * @throws IndexReadException
 */
public VTDNav loadIndex(InputStream is) throws IOException,IndexReadException{
    IndexHandler.readIndex(is, this);
    final VTDNav nav = getNav();
    return nav;
}
/**
 * Loads a VTD+XML index from a file and returns the navigator built from it.
 * The file stream is closed before the method returns, whether or not
 * loading succeeds.
 * @return VTDNav
 * @param fileName name of the VTD+XML index file
 * @throws IOException
 * @throws IndexReadException
 */
public VTDNav loadIndex(String fileName)throws IOException,IndexReadException{
    // if opening fails there is nothing to close
    final FileInputStream in = new FileInputStream(fileName);
    try {
        return loadIndex(in);
    } finally {
        in.close();
    }
}
/**
 * Load the separate VTD index and XML file.
 * Refer to persistence model of separate vtd index
 * for more details.
 * <p>
 * Both file streams are closed before the method returns, whether or not
 * loading succeeds; previously they were left open.
 * @param XMLFileName name of xml file
 * @param VTDIndexName name of the vtd index file
 * @return VTDNav object
 * @throws IOException
 * @throws IndexReadException
 */
public VTDNav loadSeparateIndex(String XMLFileName, String VTDIndexName) throws IOException, IndexReadException{
    final FileInputStream xfis = new FileInputStream(XMLFileName);
    try {
        final int size = (int) (new File(XMLFileName)).length();
        final FileInputStream vfis = new FileInputStream(VTDIndexName);
        try {
            IndexHandler.readSeparateIndex(vfis, xfis, size, this);
            // build the navigator while the streams are still open, as
            // loadIndex(String) does
            return getNav();
        } finally {
            vfis.close();
        }
    } finally {
        xfis.close();
    }
}
/**
 * Matches the rest of a "cp125N" encoding name (N from 0 to 8) in the XML
 * declaration, switches encoding and reader r to the corresponding Windows
 * code page and writes the declaration value token.
 * NOTE(review): the leading 'c' is presumably consumed by the caller and
 * temp_offset presumably marks the start of the name - confirm.
 * @throws EncodingException if a UTF-8 byte order mark forbids the switch
 * @throws ParseException if the name is not a supported encoding, is not
 *         followed by the expected closing character (ch_temp), or the
 *         current encoding does not allow the switch
 */
private void matchCPEncoding()throws ParseException{
    if ((r.skipChar('p') || r.skipChar('P')) && r.skipChar('1')
            && r.skipChar('2') && r.skipChar('5')) {
        if (encoding > FORMAT_UTF_16LE)
            // message corrected: it used to name ISO-8859
            throw new ParseException(
                    "XML decl error: Can't switch encoding to CP125x"
                            + formatLineNumber());
        if (must_utf_8)
            throw new EncodingException(
                    "Can't switch from UTF-8"
                            + formatLineNumber());
        // try the last digit '0'..'8' in ascending order; at most one is consumed
        int digit = -1;
        for (int d = 0; d <= 8 && digit < 0; d++) {
            if (r.skipChar('0' + d))
                digit = d;
        }
        switch (digit) {
            case 0: encoding = FORMAT_WIN_1250; r = new WIN1250Reader(); break;
            case 1: encoding = FORMAT_WIN_1251; r = new WIN1251Reader(); break;
            case 2: encoding = FORMAT_WIN_1252; r = new WIN1252Reader(); break;
            case 3: encoding = FORMAT_WIN_1253; r = new WIN1253Reader(); break;
            case 4: encoding = FORMAT_WIN_1254; r = new WIN1254Reader(); break;
            case 5: encoding = FORMAT_WIN_1255; r = new WIN1255Reader(); break;
            case 6: encoding = FORMAT_WIN_1256; r = new WIN1256Reader(); break;
            case 7: encoding = FORMAT_WIN_1257; r = new WIN1257Reader(); break;
            case 8: encoding = FORMAT_WIN_1258; r = new WIN1258Reader(); break;
            default:
                throw new ParseException(
                        "XML decl error: Invalid Encoding"
                                + formatLineNumber());
        }
        // "cp125N" is 6 characters long
        _writeVTD(temp_offset, 6, TOKEN_DEC_ATTR_VAL, depth);
        if (r.skipChar(ch_temp))
            return;
    }
    throw new ParseException(
            "XML decl error: Invalid Encoding"
                    + formatLineNumber());
}
/**
 * Matches the rest of an "iso-8859-N" encoding name in the XML declaration
 * (N is 1 to 11 or 13 to 15), switches encoding and reader r accordingly and
 * writes the declaration value token.
 * <p>
 * The token length is 10 for the single-digit names and 11 for the
 * two-digit names; ISO-8859-15 used to be written with a length of 15.
 * NOTE(review): the leading 'i' is presumably consumed by the caller and
 * temp_offset presumably marks the start of the name - confirm.
 * @throws EncodingException if a UTF-8 byte order mark forbids the switch
 * @throws ParseException if the name is not a supported encoding, is not
 *         followed by the expected closing character (ch_temp), or the
 *         current encoding does not allow the switch
 */
private void matchISOEncoding()throws ParseException{
    if ((r.skipChar('s') || r.skipChar('S'))
            && (r.skipChar('o') || r.skipChar('O'))
            && r.skipChar('-') && r.skipChar('8')
            && r.skipChar('8') && r.skipChar('5')
            && r.skipChar('9') && r.skipChar('-'))
    {
        if (encoding > FORMAT_UTF_16LE)
            throw new ParseException(
                    "XML decl error: Can't switch encoding to ISO-8859"
                            + formatLineNumber());
        if (must_utf_8)
            throw new EncodingException(
                    "Can't switch from UTF-8"
                            + formatLineNumber());
        final int nameLength;
        if (r.skipChar('1')) {
            if (r.skipChar(ch_temp)) {
                // plain "iso-8859-1", already terminated
                encoding = FORMAT_ISO_8859_1;
                r = new ISO8859_1Reader();
                _writeVTD(temp_offset, 10, TOKEN_DEC_ATTR_VAL, depth);
                return;
            }
            // two-digit names 10, 11, 13, 14 and 15
            if (r.skipChar('0')) {
                encoding = FORMAT_ISO_8859_10;
                r = new ISO8859_10Reader();
            } else if (r.skipChar('1')) {
                encoding = FORMAT_ISO_8859_11;
                r = new ISO8859_11Reader();
            } else if (r.skipChar('3')) {
                encoding = FORMAT_ISO_8859_13;
                r = new ISO8859_13Reader();
            } else if (r.skipChar('4')) {
                encoding = FORMAT_ISO_8859_14;
                r = new ISO8859_14Reader();
            } else if (r.skipChar('5')) {
                encoding = FORMAT_ISO_8859_15;
                r = new ISO8859_15Reader();
            } else
                throw new ParseException(
                        "XML decl error: Invalid Encoding"
                                + formatLineNumber());
            nameLength = 11; // "iso-8859-NN"
        } else {
            // single-digit names 2 to 9
            if (r.skipChar('2')) {
                encoding = FORMAT_ISO_8859_2;
                r = new ISO8859_2Reader();
            } else if (r.skipChar('3')) {
                encoding = FORMAT_ISO_8859_3;
                r = new ISO8859_3Reader();
            } else if (r.skipChar('4')) {
                encoding = FORMAT_ISO_8859_4;
                r = new ISO8859_4Reader();
            } else if (r.skipChar('5')) {
                encoding = FORMAT_ISO_8859_5;
                r = new ISO8859_5Reader();
            } else if (r.skipChar('6')) {
                encoding = FORMAT_ISO_8859_6;
                r = new ISO8859_6Reader();
            } else if (r.skipChar('7')) {
                encoding = FORMAT_ISO_8859_7;
                r = new ISO8859_7Reader();
            } else if (r.skipChar('8')) {
                encoding = FORMAT_ISO_8859_8;
                r = new ISO8859_8Reader();
            } else if (r.skipChar('9')) {
                encoding = FORMAT_ISO_8859_9;
                r = new ISO8859_9Reader();
            } else
                throw new ParseException(
                        "XML decl error: Invalid Encoding"
                                + formatLineNumber());
            nameLength = 10; // "iso-8859-N"
        }
        _writeVTD(temp_offset, nameLength, TOKEN_DEC_ATTR_VAL, depth);
        if (r.skipChar(ch_temp))
            return;
    }
    throw new ParseException(
            "XML decl error: Invalid Encoding"
                    + formatLineNumber());
}
/**
 * Matches the rest of an encoding name that is either "US-ASCII" or one of
 * "UTF-8", "UTF-16", "UTF-16LE" and "UTF-16BE" in the XML declaration,
 * checks it against the encoding detected from the first bytes and writes
 * the declaration value token.
 * <p>
 * Any other name is rejected with a ParseException, as the sibling
 * match*Encoding() methods do; previously names such as "UTF-32" or
 * "UTF-8x" made the method return without writing a token or reporting an
 * error.
 * NOTE(review): the leading 'u' is presumably consumed by the caller and
 * temp_offset presumably marks the start of the name - confirm.
 * @throws EncodingException if a UTF-8 byte order mark forbids switching to
 *         US-ASCII, or UTF-16 is declared without a byte order mark
 * @throws ParseException if the name is invalid or contradicts the detected encoding
 */
private void matchUTFEncoding() throws ParseException{
    // "US-ASCII"
    if (r.skipChar('s') || r.skipChar('S')) {
        if (r.skipChar('-')
                && (r.skipChar('a') || r.skipChar('A'))
                && (r.skipChar('s') || r.skipChar('S'))
                && (r.skipChar('c') || r.skipChar('C'))
                && (r.skipChar('i') || r.skipChar('I'))
                && (r.skipChar('i') || r.skipChar('I'))
                && r.skipChar(ch_temp)) {
            if (singleByteEncoding) {
                if (must_utf_8)
                    throw new EncodingException(
                            "Can't switch from UTF-8"
                                    + formatLineNumber());
                encoding = FORMAT_ASCII;
                r = new ASCIIReader();
                _writeVTD(temp_offset, 8, TOKEN_DEC_ATTR_VAL, depth);
                return;
            }
            throw new ParseException(
                    "XML decl error: Can't switch encoding to US-ASCII"
                            + formatLineNumber());
        }
        throw new ParseException(
                "XML decl error: Invalid Encoding"
                        + formatLineNumber());
    }
    // "UTF-..."
    if ((r.skipChar('t') || r.skipChar('T'))
            && (r.skipChar('f') || r.skipChar('F'))
            && r.skipChar('-')) {
        if (r.skipChar('8') && r.skipChar(ch_temp)) {
            if (singleByteEncoding) {
                //encoding = FORMAT_UTF8;
                _writeVTD(temp_offset, 5, TOKEN_DEC_ATTR_VAL, depth);
                return;
            }
            throw new ParseException(
                    "XML decl error: Can't switch encoding to UTF-8"
                            + formatLineNumber());
        }
        if (r.skipChar('1') && r.skipChar('6')) {
            if (r.skipChar(ch_temp)) {
                // plain "UTF-16" requires a detected byte order mark
                if (!singleByteEncoding) {
                    if (!BOM_detected)
                        throw new EncodingException(
                                "BOM not detected for UTF-16"
                                        + formatLineNumber());
                    _writeVTD(temp_offset >> 1, 6, TOKEN_DEC_ATTR_VAL, depth);
                    return;
                }
                throw new ParseException(
                        "XML decl error: Can't switch encoding to UTF-16"
                                + formatLineNumber());
            } else if ((r.skipChar('l') || r.skipChar('L'))
                    && (r.skipChar('e') || r.skipChar('E'))
                    && r.skipChar(ch_temp)) {
                if (encoding == FORMAT_UTF_16LE) {
                    r = new UTF16LEReader();
                    _writeVTD(temp_offset >> 1, 8, TOKEN_DEC_ATTR_VAL, depth);
                    return;
                }
                throw new ParseException(
                        "XML del error: Can't switch encoding to UTF-16LE"
                                + formatLineNumber());
            } else if ((r.skipChar('b') || r.skipChar('B'))
                    && (r.skipChar('e') || r.skipChar('E'))
                    && r.skipChar(ch_temp)) {
                if (encoding == FORMAT_UTF_16BE) {
                    _writeVTD(temp_offset >> 1, 8, TOKEN_DEC_ATTR_VAL, depth);
                    return;
                }
                throw new ParseException(
                        "XML del error: Can't swtich encoding to UTF-16BE"
                                + formatLineNumber());
            }
            throw new ParseException(
                    "XML decl error: Invalid encoding"
                            + formatLineNumber());
        }
    }
    // nothing matched: report it instead of returning silently
    throw new ParseException(
            "XML decl error: Invalid Encoding"
                    + formatLineNumber());
}
/**
 * Matches the rest of a "windows-125N" encoding name (N from 0 to 8) in the
 * XML declaration, switches encoding and reader r to the corresponding
 * Windows code page and writes the declaration value token.
 * NOTE(review): the leading 'w' is presumably consumed by the caller and
 * temp_offset presumably marks the start of the name - confirm.
 * @throws EncodingException if a UTF-8 byte order mark forbids the switch
 * @throws ParseException if the name is not a supported encoding, is not
 *         followed by the expected closing character (ch_temp), or the
 *         current encoding does not allow the switch
 */
private void matchWindowsEncoding()throws ParseException{
    if ((r.skipChar('i') || r.skipChar('I'))
            &&(r.skipChar('n') || r.skipChar('N'))
            &&(r.skipChar('d') || r.skipChar('D'))
            &&(r.skipChar('o') || r.skipChar('O'))
            &&(r.skipChar('w') || r.skipChar('W'))
            &&(r.skipChar('s') || r.skipChar('S'))
            && r.skipChar('-')
            && r.skipChar('1')
            && r.skipChar('2')
            && r.skipChar('5')) {
        if (encoding > FORMAT_UTF_16LE)
            // message corrected: it used to name ISO-8859
            throw new ParseException(
                    "XML decl error: Can't switch encoding to Windows-125x"
                            + formatLineNumber());
        if (must_utf_8)
            throw new EncodingException(
                    "Can't switch from UTF-8"
                            + formatLineNumber());
        // try the last digit '0'..'8' in ascending order; at most one is consumed
        int digit = -1;
        for (int d = 0; d <= 8 && digit < 0; d++) {
            if (r.skipChar('0' + d))
                digit = d;
        }
        switch (digit) {
            case 0: encoding = FORMAT_WIN_1250; r = new WIN1250Reader(); break;
            case 1: encoding = FORMAT_WIN_1251; r = new WIN1251Reader(); break;
            case 2: encoding = FORMAT_WIN_1252; r = new WIN1252Reader(); break;
            case 3: encoding = FORMAT_WIN_1253; r = new WIN1253Reader(); break;
            case 4: encoding = FORMAT_WIN_1254; r = new WIN1254Reader(); break;
            case 5: encoding = FORMAT_WIN_1255; r = new WIN1255Reader(); break;
            case 6: encoding = FORMAT_WIN_1256; r = new WIN1256Reader(); break;
            case 7: encoding = FORMAT_WIN_1257; r = new WIN1257Reader(); break;
            case 8: encoding = FORMAT_WIN_1258; r = new WIN1258Reader(); break;
            default:
                throw new ParseException(
                        "XML decl error: Invalid Encoding"
                                + formatLineNumber());
        }
        // "windows-125N" is 12 characters long
        _writeVTD(temp_offset, 12, TOKEN_DEC_ATTR_VAL, depth);
        if (r.skipChar(ch_temp))
            return;
    }
    throw new ParseException(
            "XML decl error: Invalid Encoding"
                    + formatLineNumber());
}
/**
 * Generating VTD tokens and Location cache info. When set to true,
 * VTDGen conforms to XML namespace 1.0 spec.
 * <p>
 * The method resets its per-parse state, calls decide_encoding() and then
 * runs a finite state machine over the STATE_* constants. The loop has no
 * normal exit: it ends when an exception is thrown. An EOFException is
 * rethrown unless the machine had already reached STATE_DOC_END, in which
 * case finishUp() writes the last location cache entry and the method returns.
 * @param NS boolean Enable namespace or not
 * @throws ParseException Super class for any exceptions during parsing.
 * @throws EOFException End of file exception.
 * @throws EntityException Entity resolution exception.
 * @throws EncodingException UTF/native encoding exception.
 */
public void parse(boolean NS)
throws EncodingException, EOFException, EntityException, ParseException {
// define internal variables
ns = NS;
//String s1 = null,s2=null;
length1 = length2 = 0;
attr_count = prefixed_attr_count= 0 /*, ch = 0, ch_temp = 0*/;
int parser_state = STATE_DOC_START;
//boolean has_amp = false;
is_ns = false;
// UTF-8 is assumed until decide_encoding() or the XML declaration says otherwise
encoding = FORMAT_UTF8;
helper=false;
default_ns = false; //true xmlns='abc'
isXML = false; //true only for xmlns:xml
singleByteEncoding = true;
// first check first several bytes to figure out the encoding
decide_encoding();
// enter the main finite state machine
try {
_writeVTD(0,0,TOKEN_DOCUMENT,depth);
while (true) {
switch (parser_state) {
// a '<' has been read: decide what kind of markup follows
case STATE_LT_SEEN : //if (depth < -1)
// throw new ParseException("Other Errors: Invalid depth");
temp_offset = offset;
ch = r.getChar();
if (XMLChar.isNameStartChar(ch)) {
depth++;
parser_state = STATE_START_TAG;
} else {
if (ch=='/'){
parser_state = STATE_END_TAG;
}
else if (ch=='!'){
parser_state = process_ex_seen();
}else if (ch=='?'){
parser_state = process_qm_seen();
}else throw new ParseException(
"Other Error: Invalid char after <"
+ formatLineNumber());
}
break;
case STATE_START_TAG : //name space is handled by
// scan the rest of the element name; the loop body is written out twice per iteration.
// At a ':' length2 records the byte length of the prefix.
do {
ch = r.getChar();
if (XMLChar.isNameChar(ch)) {
if (ch == ':') {
length2 = offset - temp_offset - increment;
if (ns && checkPrefix2(temp_offset,length2))
throw new ParseException(
"xmlns can't be an element prefix "
+ formatLineNumber(offset));
}
} else
break;
ch = r.getChar();
if (XMLChar.isNameChar(ch)) {
if (ch == ':') {
length2 = offset - temp_offset - increment;
if (ns && checkPrefix2(temp_offset,length2))
throw new ParseException(
"xmlns can't be an element prefix "
+ formatLineNumber(offset));
}
} else
break;
}while (true);
// byte length of the whole qname (the char that ended the name is excluded)
length1 = offset - temp_offset - increment;
if (depth > MAX_DEPTH) {
throw new ParseException(
"Other Error: Depth exceeds MAX_DEPTH"
+ formatLineNumber());
}
//writeVTD(offset, TOKEN_STARTING_TAG, length2:length1, depth)
// remember the start tag for the end-tag comparison: length in the upper 32 bits, offset in the lower
long x = ((long) length1 << 32) + temp_offset;
tag_stack[depth] = x;
// System.out.println(
// " " + (temp_offset) + " " + length2 + ":" + length1 + " startingTag " + depth);
if (depth > VTDDepth)
VTDDepth = depth;
//if (encoding < FORMAT_UTF_16BE){
if (singleByteEncoding) {
if (length2 > MAX_PREFIX_LENGTH || length1 > MAX_QNAME_LENGTH)
throw new ParseException("Token Length Error: Starting tag prefix or qname length too long"
+ formatLineNumber());
if (this.shallowDepth)
writeVTD((temp_offset), (length2 << 11) | length1, TOKEN_STARTING_TAG, depth);
else
writeVTD_L5((temp_offset), (length2 << 11) | length1, TOKEN_STARTING_TAG, depth);
} else {
// non-single-byte encodings: the limits are doubled and offset and lengths are halved
// before being written; (length2 << 10) equals (length2 >> 1) << 11 for even length2
if (length2 > (MAX_PREFIX_LENGTH << 1) || length1 > (MAX_QNAME_LENGTH << 1))
throw new ParseException("Token Length Error: Starting tag prefix or qname length too long"
+ formatLineNumber());
if (this.shallowDepth)
writeVTD((temp_offset) >> 1, (length2 << 10) | (length1 >> 1), TOKEN_STARTING_TAG, depth);
else
writeVTD_L5((temp_offset) >> 1, (length2 << 10) | (length1 >> 1), TOKEN_STARTING_TAG,
depth);
}
if (ns) {
// a prefixed element is recorded so that qualifyElement() can be called once the tag is complete
if (length2!=0){
length2 += increment;
currentElementRecord = (((long)((length2<<16)|length1))<<32)
| temp_offset;
} else
currentElementRecord = 0;
// truncate the namespace buffers back to the state recorded for the enclosing depth
if (depth <= nsBuffer1.size - 1) {
nsBuffer1.size = depth ;
int t= nsBuffer1.intAt(depth-1)+1;
nsBuffer2.size=t;
nsBuffer3.size=t;
}
}
// offset += length1;
length2 = 0;
if (XMLChar.isSpaceChar(ch)) {
ch = getCharAfterS();
if (XMLChar.isNameStartChar(ch)) {
// seen an attribute here
temp_offset = r.getPrevOffset();
parser_state = STATE_ATTR_NAME;
break;
}
}
// helper stays true unless the tag is closed with "/>"
helper = true;
if (ch == '/') {
depth--;
helper = false;
ch = r.getChar();
}
if (ch == '>') {
if (ns){
nsBuffer1.append(nsBuffer3.size-1);
if (currentElementRecord !=0)
qualifyElement();
}
parser_state = processElementTail(helper);
break;
}
throw new ParseException(
"Starting tag Error: Invalid char in starting tag"
+ formatLineNumber());
case STATE_END_TAG :
// compare the end tag name byte by byte with the start tag saved in tag_stack
temp_offset = offset;
int sos = (int) tag_stack[depth];
int sl = (int) (tag_stack[depth] >> 32);
offset = temp_offset+sl;
if (offset>= endOffset)
throw new EOFException("permature EOF reached, XML document incomplete");
for (int i = 0; i < sl; i++) {
if (XMLDoc[sos + i] != XMLDoc[temp_offset + i])
throw new ParseException(
"Ending tag error: Start/ending tag mismatch"
+ formatLineNumber());
}
depth--;
ch = getCharAfterS();
if(ch != '>')
throw new ParseException(
"Ending tag error: Invalid char in ending tag "
+ formatLineNumber());
// depth of -1 means the root element has just been closed
if (depth != -1) {
temp_offset = offset;
ch = getCharAfterS();
if (ch == '<'){
if (ws)
addWhiteSpaceRecord();
parser_state = STATE_LT_SEEN;
}
else if (XMLChar.isContentChar(ch)) {
parser_state = STATE_TEXT;
}
else {
handleOtherTextChar2(ch);
parser_state = STATE_TEXT;
}
} else
parser_state = STATE_DOC_END;
break;
case STATE_ATTR_NAME :
process_attr_name();
parser_state = STATE_ATTR_VAL;
break;
case STATE_ATTR_VAL :
parser_state = process_attr_val();
break;
case STATE_TEXT :
if (depth == -1)
throw new ParseException(
"Error in text content: Char data at the wrong place"
+ formatLineNumber());
// consume character data up to the next '<'; the loop body is written out twice per iteration
do {
ch = r.getChar();
//System.out.println(""+(char)ch);
if (XMLChar.isContentChar(ch)) {
} else if (ch == '<') {
break;
}else
handleOtherTextChar(ch);
ch = r.getChar();
//System.out.println(""+(char)ch);
if (XMLChar.isContentChar(ch)) {
} else if (ch == '<') {
break;
}else
handleOtherTextChar(ch);
}while(true);
// byte length of the text, excluding the '<' that ended it
length1 = offset - increment - temp_offset;
if (singleByteEncoding) //if (encoding < FORMAT_UTF_16BE)
writeVTDText(
temp_offset,
length1,
TOKEN_CHARACTER_DATA,
depth);
else
writeVTDText(
temp_offset >> 1,
length1 >> 1,
TOKEN_CHARACTER_DATA,
depth);
//has_amp = true;
parser_state = STATE_LT_SEEN;
break;
// the remaining states delegate to a process_* method that returns the next state
case STATE_DOC_START :
parser_state = process_start_doc();
break;
case STATE_DOC_END :
//docEnd = true;
parser_state = process_end_doc();
break;
case STATE_PI_TAG :
parser_state = process_pi_tag();
break;
//throw new ParseException("Error in PI: Invalid char");
case STATE_PI_VAL :
parser_state = process_pi_val();
break;
case STATE_DEC_ATTR_NAME :
parser_state = process_dec_attr();
break;
case STATE_COMMENT :
parser_state = process_comment();
break;
case STATE_CDATA :
parser_state = process_cdata();
break;
case STATE_DOCTYPE :
parser_state = process_doc_type();
break;
case STATE_END_COMMENT :
parser_state = process_end_comment();
break;
case STATE_END_PI :
parser_state = process_end_pi();
break;
default :
throw new ParseException(
"Other error: invalid parser state"
+formatLineNumber());
}
}
} catch (EOFException e) {
// running out of input is only legitimate once the document end state has been reached
if (parser_state != STATE_DOC_END)
throw e;
finishUp();
}
}
/**
 * Compares the prefixed attribute names collected in prefixed_attr_name_array
 * pairwise. When two of them have byte-identical local parts (the part after
 * the prefix) and matchURL() reports that their namespace URLs, looked up in
 * nsBuffer2 through prefix_URL_array, match, a ParseException is thrown.
 * <p>
 * NOTE(review): the headers of the three for loops below are visibly
 * truncated in this copy of the file (text between a '&lt;' and a later
 * '&gt;' is missing); restore them from the upstream source before relying
 * on this method.
 * @throws ParseException with the message " qualified attribute names collide "
 *         when two qualified attribute names are the same
 */
private void checkQualifiedAttributeUniqueness() throws ParseException {
// preLen/postLen: byte lengths of prefix and local part; os: offset of the local part;
// URLLen/URLOs: the two halves of the nsBuffer2 entry, passed to matchURL() below
int preLen1,os1,postLen1,URLLen1,URLOs1,
preLen2, os2,postLen2, URLLen2, URLOs2,k;
for (int i=0;i>16);
postLen1 = (int) ((prefixed_attr_name_array[i] & 0xffffL))-preLen1-increment;
os1 = (int) (prefixed_attr_name_array[i]>>32) + preLen1+increment;
URLLen1 = nsBuffer2.lower32At(prefix_URL_array[i]);
URLOs1 = nsBuffer2.upper32At(prefix_URL_array[i]);
for (int j=i+1;j>16);
postLen2 = (int) ((prefixed_attr_name_array[j] & 0xffffL))-preLen2-increment;
os2 = (int)(prefixed_attr_name_array[j]>>32) + preLen2 + increment;
//System.out.println(new String(XMLDoc,os1, postLen1)
// +" "+ new String(XMLDoc, os2, postLen2));
// only local parts of equal length can be equal
if (postLen1 == postLen2){
k=0;
for (;k"+(char)(XMLDoc[preOs+k]));
if (XMLDoc[os1+k]!=XMLDoc[os2+k])
break;
}
// k reaching postLen1 means every byte of the two local parts matched
if (k==postLen1){
// found the match
URLLen2 = nsBuffer2.lower32At(prefix_URL_array[j]);
URLOs2 = nsBuffer2.upper32At(prefix_URL_array[j]);
//System.out.println(" URLOs1 ===>" + URLOs1);
//System.out.println("nsBuffer2 ===>"+nsBuffer2.longAt(i)+" i==>"+i);
//System.out.println("URLLen2 "+ URLLen2+" URLLen1 "+ URLLen1+" ");
if (matchURL(URLOs1, URLLen1, URLOs2, URLLen2))
throw new ParseException(" qualified attribute names collide "
+ formatLineNumber(os2));
}
}
}
//System.out.println("======");
}
}
private void qualifyAttributes() throws ParseException{
int i1= nsBuffer3.size-1;
int j= 0,i=0;
// two cases:
// 1. the current element has no prefix, look for xmlns
// 2. the current element has prefix, look for xmlns:something
while(j>16);
int preOs = (int) (prefixed_attr_name_array[j]>>32);
//System.out.println(new String(XMLDoc, preOs, preLen)+"===");
i = i1;
while(i>=0){
int t = nsBuffer3.upper32At(i);
// with prefix, get full length and prefix length
if ( (t&0xffff) - (t>>16) == preLen+increment){
// doing byte comparison here
int os = nsBuffer3.lower32At(i)+(t>>16)+increment;
//System.out.println(new String(XMLDoc, os, preLen)+"");
int k=0;
for (;k"+(char)(XMLDoc[preOs+k]));
if (XMLDoc[os+k]!=XMLDoc[preOs+k])
break;
}
if (k==preLen){
break; // found the match
}
}
/*if ( (nsBuffer3.upper32At(i) & 0xffff0000) == 0){
return;
}*/
i--;
}
if (i<0)
throw new ParseException("Name space qualification Exception: prefixed attribute not qualified\n"
+formatLineNumber(preOs));
else
prefix_URL_array[j] = i;
j++;
// no need to check if xml is the prefix
}
//for (int h=0;h"+(char)l);
if (URL2.charAt(i)!= (int)l)
return 0;
os += (int)(l>>32);
}
//store offset value
t = os;
for (i=0;i<11 && os>32);
}
if (os == g)
return 2;
//so far a match
os = t;
for (i=18;i<36 && os>32);
}
if (os==g)
return 1;
return 0;
}
private boolean matchXML(int byte_offset) {
// TODO Auto-generated method stub
if (encoding= 0)
{
offset += numRead;
if (b.length-offset 0) {
//System.out.println("len ===> " + len + " "
// + urlConnection.getContentType());
byte[] ba = new byte[len];
int k=len,offset=0;
while(offset0){
k=in.read(ba,offset,len-offset);
offset+=k;
}
this.setDoc(ba);
this.parse(ns);
return true;
} else {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte[] ba = new byte[4096];
int k=-1;
while((k=in.read(ba))>0){
baos.write(ba, 0, k);
}
this.setDoc(baos.toByteArray());
this.parse(ns);
return true;
//baos.w
}
}
}catch(IOException e){
}catch(ParseException e){
}finally{
try {
if (in != null)
in.close();
if (urlConnection != null)
urlConnection.disconnect();
} catch (Exception e) {
}
}
return false;
}
//private
/**
 * Consumes the body of a CDATA section, records it as a
 * {@code TOKEN_CDATA_VAL} token, and decides which state the parser loop
 * should enter based on the first non-whitespace character that follows.
 *
 * @return {@code STATE_LT_SEEN} when a '&lt;' follows, otherwise {@code STATE_TEXT}
 * @throws ParseException on an invalid character, or on "]]&gt;" appearing in
 *         the text right after the section
 * @throws EncodingException
 * @throws EOFException
 */
private int process_cdata() throws ParseException, EncodingException, EOFException {
    // Read until the terminating "]]>"; any run of extra ']' before '>' is tolerated.
    boolean closed = false;
    while (!closed) {
        ch = r.getChar();
        if (!XMLChar.isValidChar(ch)) {
            throw new ParseException(
                    "Error in CDATA: Invalid Char"
                            + formatLineNumber());
        }
        if (ch == ']' && r.skipChar(']')) {
            while (r.skipChar(']')) {
                // swallow additional closing brackets
            }
            closed = r.skipChar('>');
        }
    }

    // Exclude the three terminator units ("]]>") from the token length.
    length1 = offset - temp_offset - (increment << 1) - increment;
    final int tokenOffset = singleByteEncoding ? temp_offset : temp_offset >> 1;
    final int tokenLength = singleByteEncoding ? length1 : length1 >> 1;
    writeVTDText(tokenOffset, tokenLength, TOKEN_CDATA_VAL, depth);

    temp_offset = offset;
    ch = getCharAfterS();

    if (ch == '<') {
        if (ws) {
            addWhiteSpaceRecord();
        }
        return STATE_LT_SEEN;
    }
    if (XMLChar.isContentChar(ch)) {
        return STATE_TEXT;
    }
    if (ch == '&') {
        entityIdentifier();
        return STATE_TEXT;
    }
    if (ch != ']') {
        throw new ParseException(
                "Other Error: Invalid char in xml"
                        + formatLineNumber());
    }
    // A ']' starts the text; make sure it is not the beginning of "]]>".
    if (r.skipChar(']')) {
        while (r.skipChar(']')) {
            // swallow additional closing brackets
        }
        if (r.skipChar('>')) {
            throw new ParseException(
                    "Error in text content: ]]> in text content"
                            + formatLineNumber());
        }
    }
    return STATE_TEXT;
}
/**
 * Consumes a comment body, records it as a {@code TOKEN_COMMENT} token, and
 * picks the next parser state from the first non-whitespace character after
 * the closing "--&gt;".
 *
 * @return {@code STATE_LT_SEEN} when a '&lt;' follows, otherwise {@code STATE_TEXT}
 * @throws ParseException on an invalid character, on "--" not followed by
 *         '&gt;', or on "]]&gt;" appearing in the text right after the comment
 * @throws EncodingException
 * @throws EOFException
 */
private int process_comment() throws ParseException, EncodingException, EOFException {
    // Scan to the first "--"; the length excludes those two units.
    for (;;) {
        ch = r.getChar();
        if (!XMLChar.isValidChar(ch)) {
            throw new ParseException(
                    "Error in comment: Invalid Char"
                            + formatLineNumber());
        }
        if (ch == '-' && r.skipChar('-')) {
            length1 = offset - temp_offset - (increment << 1);
            break;
        }
    }

    // "--" must be immediately followed by '>'.
    if (r.getChar() != '>') {
        throw new ParseException(
                "Error in comment: Invalid terminating sequence"
                        + formatLineNumber());
    }

    final int tokenOffset = singleByteEncoding ? temp_offset : temp_offset >> 1;
    final int tokenLength = singleByteEncoding ? length1 : length1 >> 1;
    writeVTDText(tokenOffset, tokenLength, TOKEN_COMMENT, depth);

    temp_offset = offset;
    ch = getCharAfterS();

    if (ch == '<') {
        if (ws) {
            addWhiteSpaceRecord();
        }
        return STATE_LT_SEEN;
    }
    if (XMLChar.isContentChar(ch)) {
        return STATE_TEXT;
    }
    if (ch == '&') {
        entityIdentifier();
        return STATE_TEXT;
    }
    if (ch != ']') {
        throw new ParseException(
                "Error in text content: Invalid char"
                        + formatLineNumber());
    }
    // A ']' starts the text; make sure it is not the beginning of "]]>".
    if (r.skipChar(']')) {
        while (r.skipChar(']')) {
            // swallow additional closing brackets
        }
        if (r.skipChar('>')) {
            throw new ParseException(
                    "Error in text content: ]]> in text content"
                            + formatLineNumber());
        }
    }
    return STATE_TEXT;
}
/**
 * Parses the pseudo-attributes of the XML declaration and records each name
 * and value as TOKEN_DEC_ATTR_NAME / TOKEN_DEC_ATTR_VAL tokens.
 * <p>
 * The order handled is: a mandatory "version" (value 1.0 or 1.1), then an
 * optional "encoding", then an optional "standalone" (yes or no), followed
 * by the closing "?&gt;". On entry {@code ch} is expected to hold the first
 * character of the first attribute name. When an encoding is declared the
 * reader {@code r} may be replaced (directly for ASCII, or through the
 * match*Encoding helpers for the other families).
 *
 * @return STATE_LT_SEEN, the only state reachable on success
 * @throws ParseException if the declaration is malformed or is not followed by '&lt;'
 * @throws EncodingException if ASCII is declared while {@code must_utf_8} is set
 * @throws EOFException
 */
private int process_dec_attr() throws ParseException, EncodingException, EOFException{
int parser_state;
// --- version (mandatory): the name must be spelled exactly "version" ---
if (ch == 'v'
&& r.skipChar('e')
&& r.skipChar('r')
&& r.skipChar('s')
&& r.skipChar('i')
&& r.skipChar('o')
&& r.skipChar('n')) {
ch = getCharAfterS();
if (ch == '=') {
/*System.out.println(
" " + (temp_offset - 1) + " " + 7 + " dec attr name version " + depth);*/
// The recorded name offset steps back one unit from temp_offset (1 byte, or 2 bytes
// halved for double-byte encodings); the length 7 is that of "version".
if (singleByteEncoding)
_writeVTD(
temp_offset - 1,
7,
TOKEN_DEC_ATTR_NAME,
depth);
else
_writeVTD(
(temp_offset -2) >> 1,
7,
TOKEN_DEC_ATTR_NAME,
depth);
} else
throw new ParseException(
"XML decl error: Invalid char"
+ formatLineNumber());
} else
throw new ParseException(
"XML decl error: should be version"
+ formatLineNumber());
// ch_temp remembers the opening quote so the same character must close the value.
ch_temp = getCharAfterS();
if (ch_temp != '\'' && ch_temp != '"')
throw new ParseException(
"XML decl error: Invalid char to start attr name"
+ formatLineNumber());
temp_offset = offset;
// support 1.0 or 1.1
if (r.skipChar('1')
&& r.skipChar('.')
&& (r.skipChar('0') || r.skipChar('1'))) {
/*System.out.println(
" " + temp_offset + " " + 3 + " dec attr val (version)" + depth);*/
if (singleByteEncoding)
_writeVTD(
temp_offset,
3,
TOKEN_DEC_ATTR_VAL,
depth);
else
_writeVTD(
temp_offset >> 1,
3,
TOKEN_DEC_ATTR_VAL,
depth);
} else
throw new ParseException(
"XML decl error: Invalid version(other than 1.0 or 1.1) detected"
+ formatLineNumber());
if (!r.skipChar(ch_temp))
throw new ParseException(
"XML decl error: version not terminated properly"
+ formatLineNumber());
ch = r.getChar();
// After the version value: whitespace introduces the optional attributes;
// anything else falls through to the "?>" check at the end.
//? space or e
if (XMLChar.isSpaceChar(ch)) {
ch = getCharAfterS();
// ch has already been consumed, so step back one unit to point at the name start.
temp_offset = offset - increment;
// --- encoding (optional) ---
if (ch == 'e') {
if (r.skipChar('n')
&& r.skipChar('c')
&& r.skipChar('o')
&& r.skipChar('d')
&& r.skipChar('i')
&& r.skipChar('n')
&& r.skipChar('g')) {
ch = r.getChar();
if (XMLChar.isSpaceChar(ch))
ch = getCharAfterS();
if (ch == '=') {
/*System.out.println(
" " + (temp_offset) + " " + 8 + " dec attr name (encoding) " + depth);*/
// 8 is the length of "encoding".
if (singleByteEncoding)
_writeVTD(
temp_offset,
8,
TOKEN_DEC_ATTR_NAME,
depth);
else
_writeVTD(
temp_offset >> 1,
8,
TOKEN_DEC_ATTR_NAME,
depth);
} else
throw new ParseException(
"XML decl error: Invalid char"
+ formatLineNumber());
ch_temp = getCharAfterS();
if (ch_temp != '"' && ch_temp != '\'')
throw new ParseException(
"XML decl error: Invalid char to start attr name"
+ formatLineNumber());
temp_offset = offset;
ch = r.getChar();
// Dispatch on the first letter of the encoding name (case-insensitive).
switch (ch) {
case 'a' :
case 'A' :
// "ascii" in any letter case, immediately followed by the closing quote.
if ((r.skipChar('s')
|| r.skipChar('S'))
&& (r.skipChar('c')
|| r.skipChar('C'))
&& (r.skipChar('i')
|| r.skipChar('I'))
&& (r.skipChar('i')
|| r.skipChar('I'))
&& r.skipChar(ch_temp)) {
// ASCII is only accepted when the current encoding is not UTF-16 (LE or BE).
if (encoding != FORMAT_UTF_16LE
&& encoding
!= FORMAT_UTF_16BE) {
if (must_utf_8)
throw new EncodingException("Can't switch from UTF-8"
+ formatLineNumber());
encoding = FORMAT_ASCII;
r = new ASCIIReader();
/*System.out.println(
" " + (temp_offset) + " " + 5 + " dec attr val (encoding) " + depth);*/
// 5 is the length of "ascii"; no offset halving is needed on this non-UTF-16 path.
_writeVTD(
temp_offset,
5,
TOKEN_DEC_ATTR_VAL,
depth);
break;
} else
throw new ParseException(
"XML decl error: Can't switch encoding to ASCII"
+ formatLineNumber());
}
throw new ParseException(
"XML decl error: Invalid Encoding"
+ formatLineNumber());
// The remaining families are delegated to helpers defined elsewhere in this class;
// presumably each validates the rest of the name and records the value token -- TODO confirm.
case 'c':
case 'C':
matchCPEncoding();
break;
case 'i' :
case 'I' :
matchISOEncoding();
break;
case 'u' :
case 'U' :
matchUTFEncoding();
break;
// now deal with windows encoding
case 'w' :
case 'W' :
matchWindowsEncoding();
break;
default :
throw new ParseException(
"XML decl Error: invalid encoding"
+ formatLineNumber());
}
ch = r.getChar();
if (XMLChar.isSpaceChar(ch))
ch = getCharAfterS();
// NOTE(review): whitespace before a following "standalone" is optional here, so
// encoding="..."standalone="..." appears to be accepted -- confirm this is intended.
temp_offset = offset - increment;
} else
throw new ParseException(
"XML decl Error: Invalid char"
+ formatLineNumber());
}
// --- standalone (optional) ---
if (ch == 's') {
if (r.skipChar('t')
&& r.skipChar('a')
&& r.skipChar('n')
&& r.skipChar('d')
&& r.skipChar('a')
&& r.skipChar('l')
&& r.skipChar('o')
&& r.skipChar('n')
&& r.skipChar('e')) {
ch = getCharAfterS();
if (ch != '=')
throw new ParseException(
"XML decl error: Invalid char"
+ formatLineNumber());
/*System.out.println(
" " + temp_offset + " " + 3 + " dec attr name (standalone) " + depth);*/
// 10 is the length of "standalone".
if (singleByteEncoding)
_writeVTD(
temp_offset,
10,
TOKEN_DEC_ATTR_NAME,
depth);
else
_writeVTD(
temp_offset >> 1,
10,
TOKEN_DEC_ATTR_NAME,
depth);
ch_temp = getCharAfterS();
temp_offset = offset;
if (ch_temp != '"' && ch_temp != '\'')
throw new ParseException(
"XML decl error: Invalid char to start attr name"
+ formatLineNumber());
ch = r.getChar();
// The value must be exactly "yes" or "no", closed by the same quote that opened it.
if (ch == 'y') {
if (r.skipChar('e')
&& r.skipChar('s')
&& r.skipChar(ch_temp)) {
/*System.out.println(
" " + (temp_offset) + " " + 3 + " dec attr val (standalone) " + depth);*/
if (singleByteEncoding)
_writeVTD(
temp_offset,
3,
TOKEN_DEC_ATTR_VAL,
depth);
else
_writeVTD(
temp_offset >> 1,
3,
TOKEN_DEC_ATTR_VAL,
depth);
} else
throw new ParseException(
"XML decl error: invalid val for standalone"
+ formatLineNumber());
} else if (ch == 'n') {
if (r.skipChar('o')
&& r.skipChar(ch_temp)) {
/*System.out.println(
" " + (temp_offset) + " " + 2 + " dec attr val (standalone)" + depth);*/
if (singleByteEncoding)
_writeVTD(
temp_offset,
2,
TOKEN_DEC_ATTR_VAL,
depth);
else
_writeVTD(
temp_offset >> 1,
2,
TOKEN_DEC_ATTR_VAL,
depth);
} else
throw new ParseException(
"XML decl error: invalid val for standalone"
+ formatLineNumber());
} else
throw new ParseException(
"XML decl error: invalid val for standalone"
+ formatLineNumber());
} else
throw new ParseException(
"XML decl error" + formatLineNumber());
ch = r.getChar();
if (XMLChar.isSpaceChar(ch))
ch = getCharAfterS();
}
}
// --- terminator: "?>" and then the first '<' of the document proper ---
if (ch == '?' && r.skipChar('>')) {
temp_offset = offset;
ch = getCharAfterS();
if (ch == '<') {
parser_state = STATE_LT_SEEN;
} else
throw new ParseException(
"Other Error: Invalid Char in XML"
+ formatLineNumber());
} else
throw new ParseException(
"XML decl Error: Invalid termination sequence"
+ formatLineNumber());
return parser_state;
}
/**
 * Skips over a DOCTYPE declaration by balancing '&lt;' and '&gt;', records
 * its content as a single {@code TOKEN_DTD_VAL} token, and requires that
 * the next non-whitespace character is '&lt;'.
 *
 * @return {@code STATE_LT_SEEN}
 * @throws ParseException on an invalid character, an over-long DTD value, or
 *         when something other than '&lt;' follows the declaration
 * @throws EncodingException
 * @throws EOFException
 */
private int process_doc_type() throws ParseException, EncodingException, EOFException {
    // Starts at 1 for the already-consumed '<' of "<!DOCTYPE"; done when it drops to 0.
    int unclosed = 1;
    do {
        ch = r.getChar();
        if (!XMLChar.isValidChar(ch)) {
            throw new ParseException(
                    "Error in DOCTYPE: Invalid char"
                            + formatLineNumber());
        }
        if (ch == '>') {
            unclosed--;
        } else if (ch == '<') {
            unclosed++;
        }
    } while (unclosed != 0);

    // Exclude the final '>' from the token.
    length1 = offset - temp_offset - increment;

    if (singleByteEncoding) {
        if (length1 > MAX_TOKEN_LENGTH) {
            throw new ParseException("Token Length Error:"
                    + " DTD val too long (>0xfffff)"
                    + formatLineNumber());
        }
        _writeVTD(temp_offset, length1, TOKEN_DTD_VAL, depth);
    } else {
        // Byte counts are halved for the double-byte encodings.
        if (length1 > (MAX_TOKEN_LENGTH << 1)) {
            throw new ParseException("Token Length Error:"
                    + " DTD val too long (>0xfffff)"
                    + formatLineNumber());
        }
        _writeVTD(temp_offset >> 1, length1 >> 1, TOKEN_DTD_VAL, depth);
    }

    ch = getCharAfterS();
    if (ch != '<') {
        throw new ParseException(
                "Other Error: Invalid char in xml"
                        + formatLineNumber());
    }
    return STATE_LT_SEEN;
}
/**
 * Consumes a comment that appears after the root element and records it as
 * a {@code TOKEN_COMMENT} token.
 *
 * @return {@code STATE_DOC_END}
 * @throws ParseException on an invalid character or when "--" is not
 *         followed by '&gt;'
 */
private int process_end_comment() throws ParseException {
    // Scan to the first "--"; the length excludes those two units.
    for (;;) {
        ch = r.getChar();
        if (!XMLChar.isValidChar(ch)) {
            throw new ParseException(
                    "Error in comment: Invalid Char"
                            + formatLineNumber());
        }
        if (ch == '-' && r.skipChar('-')) {
            length1 = offset - temp_offset - (increment << 1);
            break;
        }
    }

    if (r.getChar() != '>') {
        throw new ParseException(
                "Error in comment: '-->' expected"
                        + formatLineNumber());
    }

    final int tokenOffset = singleByteEncoding ? temp_offset : temp_offset >> 1;
    final int tokenLength = singleByteEncoding ? length1 : length1 >> 1;
    writeVTDText(tokenOffset, tokenLength, TOKEN_COMMENT, depth);
    return STATE_DOC_END;
}
/**
 * Handles content after the root element's end tag. Only a processing
 * instruction ("&lt;?") or a comment ("&lt;!--") may start here; reaching
 * the end of input instead surfaces as an exception from the reader.
 *
 * @return {@code STATE_END_PI} or {@code STATE_END_COMMENT}
 * @throws ParseException if anything else follows the root element
 * @throws EncodingException
 * @throws EOFException
 */
private int process_end_doc() throws ParseException, EncodingException, EOFException {
    ch = getCharAfterS();
    if (ch == '<') {
        if (r.skipChar('?')) {
            // processing instruction after the root element
            temp_offset = offset;
            return STATE_END_PI;
        }
        final boolean commentOpens =
                r.skipChar('!') && r.skipChar('-') && r.skipChar('-');
        if (commentOpens) {
            // comment after the root element
            temp_offset = offset;
            return STATE_END_COMMENT;
        }
    }
    throw new ParseException(
            "Other Error: XML not terminated properly"
                    + formatLineNumber());
}
/**
 * Parses a processing instruction that appears after the root element and
 * records its target as {@code TOKEN_PI_NAME} and its data as
 * {@code TOKEN_PI_VAL} (zero length when the PI has no data).
 * <p>
 * A target spelled "xml" in any letter case is rejected, matching the check
 * applied in {@code process_qm_seen}. Inside the data a '?' only ends the
 * instruction when it is directly followed by '&gt;'; a lone '?' is ordinary
 * data, as in {@code process_pi_val}.
 *
 * @return {@code STATE_DOC_END}
 * @throws ParseException on an invalid target, an invalid character in the
 *         data, an over-long token, or a bad terminator
 * @throws EncodingException
 * @throws EOFException
 */
private int process_end_pi() throws ParseException, EncodingException, EOFException {
    int parser_state;
    ch = r.getChar();
    if (!XMLChar.isNameStartChar(ch)) {
        throw new ParseException("Error in PI: invalid char in PI target"
                + formatLineNumber());
    }

    // Reserved target check: each of the three letters may be either case.
    if ((ch == 'x' || ch == 'X')
            && (r.skipChar('m') || r.skipChar('M'))
            && (r.skipChar('l') || r.skipChar('L'))) {
        ch = r.getChar();
        if (XMLChar.isSpaceChar(ch) || ch == '?') {
            throw new ParseException(
                    "Error in PI: [xX][mM][lL] not a valid PI target"
                            + formatLineNumber());
        }
        // Otherwise "xml" is only a prefix of a longer, legal target name.
    }

    // Consume the remainder of the target name.
    while (XMLChar.isNameChar(ch)) {
        ch = r.getChar();
    }

    // ch is the first character past the name, so drop one unit from the length.
    length1 = offset - temp_offset - increment;
    if (singleByteEncoding) {
        if (length1 > MAX_TOKEN_LENGTH) {
            throw new ParseException("Token Length Error:"
                    + "PI name too long (>0xfffff)"
                    + formatLineNumber());
        }
        _writeVTD(temp_offset, length1, TOKEN_PI_NAME, depth);
    } else {
        if (length1 > (MAX_TOKEN_LENGTH << 1)) {
            throw new ParseException("Token Length Error:"
                    + "PI name too long (>0xfffff)"
                    + formatLineNumber());
        }
        _writeVTD(temp_offset >> 1, length1 >> 1, TOKEN_PI_NAME, depth);
    }

    temp_offset = offset;
    if (XMLChar.isSpaceChar(ch)) {
        // PI with data: scan until "?>".
        ch = getCharAfterS();
        while (true) {
            if (!XMLChar.isValidChar(ch)) {
                throw new ParseException(
                        "Error in PI: Invalid char in PI val"
                                + formatLineNumber());
            }
            if (ch == '?' && r.skipChar('>')) {
                parser_state = STATE_DOC_END;
                break;
            }
            ch = r.getChar();
        }
        // Exclude the two terminator units ("?>") from the data length.
        length1 = offset - temp_offset - (increment << 1);
        if (singleByteEncoding) {
            if (length1 > MAX_TOKEN_LENGTH) {
                throw new ParseException("Token Length Error:"
                        + "PI val too long (>0xfffff)"
                        + formatLineNumber());
            }
            _writeVTD(temp_offset, length1, TOKEN_PI_VAL, depth);
        } else {
            if (length1 > (MAX_TOKEN_LENGTH << 1)) {
                throw new ParseException("Token Length Error:"
                        + "PI val too long (>0xfffff)"
                        + formatLineNumber());
            }
            _writeVTD(temp_offset >> 1, length1 >> 1, TOKEN_PI_VAL, depth);
        }
    } else {
        // PI without data: record an empty value, then require "?>".
        if (singleByteEncoding) {
            _writeVTD(temp_offset, 0, TOKEN_PI_VAL, depth);
        } else {
            _writeVTD(temp_offset >> 1, 0, TOKEN_PI_VAL, depth);
        }
        if ((ch == '?') && r.skipChar('>')) {
            parser_state = STATE_DOC_END;
        } else {
            throw new ParseException(
                    "Error in PI: invalid termination sequence"
                            + formatLineNumber());
        }
    }
    return parser_state;
}
private int process_ex_seen()throws ParseException, EncodingException, EOFException {
int parser_state;
boolean hasDTD = false;
ch = r.getChar();
switch (ch) {
case '-' :
if (r.skipChar('-')) {
temp_offset = offset;
parser_state = STATE_COMMENT;
break;
} else
throw new ParseException(
"Error in comment: Invalid char sequence to start a comment"
+ formatLineNumber());
case '[' :
if (r.skipChar('C')
&& r.skipChar('D')
&& r.skipChar('A')
&& r.skipChar('T')
&& r.skipChar('A')
&& r.skipChar('[')
&& (depth != -1)) {
temp_offset = offset;
parser_state = STATE_CDATA;
break;
} else {
if (depth == -1)
throw new ParseException(
"Error in CDATA: Wrong place for CDATA"
+ formatLineNumber());
throw new ParseException(
"Error in CDATA: Invalid char sequence for CDATA"
+ formatLineNumber());
}
case 'D' :
if (r.skipChar('O')
&& r.skipChar('C')
&& r.skipChar('T')
&& r.skipChar('Y')
&& r.skipChar('P')
&& r.skipChar('E')
&& (depth == -1)
&& !hasDTD) {
hasDTD = true;
temp_offset = offset;
parser_state = STATE_DOCTYPE;
break;
} else {
if (hasDTD == true)
throw new ParseException(
"Error for DOCTYPE: Only DOCTYPE allowed"
+ formatLineNumber());
if (depth != -1)
throw new ParseException(
"Error for DOCTYPE: DTD at wrong place"
+ formatLineNumber());
throw new ParseException(
"Error for DOCTYPE: Invalid char sequence for DOCTYPE"
+ formatLineNumber());
}
default :
throw new ParseException(
"Other Error: Unrecognized char after MAX_PREFIX_LENGTH
|| length1 > MAX_QNAME_LENGTH)
throw new ParseException(
s1
+formatLineNumber());
_writeVTD(
temp_offset,
(length2 << 11) | length1,
TOKEN_ATTR_NS,
depth);
}
else{
if (length2>(MAX_PREFIX_LENGTH << 1)
|| length1 > (MAX_QNAME_LENGTH <<1))
throw new ParseException(
s2
+ formatLineNumber());
_writeVTD(
temp_offset >> 1,
(length2 << 10) | (length1 >> 1),
TOKEN_ATTR_NS,
depth);
}
// append to nsBuffer2
if (ns) {
//unprefixed xmlns are not recorded
if (length2 != 0 && !isXML) {
//nsBuffer2.append(VTDBuffer.size() - 1);
long l = ((long) ((length2 << 16) | length1)) << 32
| temp_offset;
nsBuffer3.append(l); // byte offset and byte
// length
}
}
} else {
//if (encoding < FORMAT_UTF_16BE){
s1="Token Length Error: Attr name prefix or qname length too long";
s2="Token Length overflow error: Attr name prefix or qname length too long" ;
if (singleByteEncoding) {
if (length2>MAX_PREFIX_LENGTH
|| length1 > MAX_QNAME_LENGTH)
throw new ParseException(
"Token Length Error: Attr name prefix or qname length too long"
+ formatLineNumber());
_writeVTD(
temp_offset,
(length2 << 11) | length1,
TOKEN_ATTR_NAME,
depth);
}
else{
if (length2>(MAX_PREFIX_LENGTH<<1)
|| length1 > (MAX_QNAME_LENGTH<<1))
throw new ParseException(
"Token Length overflow error: Attr name prefix or qname length too long"
+ formatLineNumber());
_writeVTD(
temp_offset >> 1,
(length2 << 10) | (length1 >> 1),
TOKEN_ATTR_NAME,
depth);
}
}
/*System.out.println(
" " + temp_offset + " " + length2 + ":" + length1 + " attr name " + depth);*/
length2 = 0;
if (XMLChar.isSpaceChar(ch)) {
ch = getCharAfterS();
}
if (ch != '=')
throw new ParseException(
"Error in attr: invalid char"
+ formatLineNumber());
ch_temp = getCharAfterS();
if (ch_temp != '"' && ch_temp != '\'')
throw new ParseException(
"Error in attr: invalid char (should be ' or \" )"
+ formatLineNumber());
temp_offset = offset;
}
/**
 * Consumes an attribute value up to its closing quote, records it as a
 * {@code TOKEN_ATTR_VAL} token and, when namespace awareness is on, applies
 * the namespace-declaration rules and element/attribute qualification.
 *
 * @return {@code STATE_ATTR_NAME} when another attribute follows, otherwise
 *         whatever {@code processElementTail} returns once the start tag closes
 * @throws ParseException on an invalid character, an illegal namespace
 *         declaration, an over-long value, or a malformed start tag
 * @throws EncodingException
 * @throws EOFException
 */
private int process_attr_val() throws ParseException, EncodingException, EOFException {
    // ch_temp holds the quote that opened the value; the same one must close it.
    for (;;) {
        ch = r.getChar();
        if (!XMLChar.isValidChar(ch) || ch == '<') {
            throw new ParseException(
                    "Error in attr: Invalid XML char"
                            + formatLineNumber());
        }
        if (ch == ch_temp) {
            break;
        }
        // An entity reference must resolve to a valid XML character.
        if (ch == '&' && !XMLChar.isValidChar(entityIdentifier())) {
            throw new ParseException(
                    "Error in attr: Invalid XML char"
                            + formatLineNumber());
        }
    }
    // Exclude the closing quote from the length.
    length1 = offset - temp_offset - increment;

    if (ns && is_ns) {
        if (!default_ns && length1 == 0) {
            throw new ParseException(" non-default ns URL can't be empty"
                    + formatLineNumber());
        }
        // 0: ordinary URL, 1: the reserved xml namespace, 2: the reserved xmlns namespace
        final int urlKind = identifyNsURL(temp_offset, length1);
        if (isXML) {
            // xmlns:xml may only be bound to the xml namespace
            if (urlKind != 1) {
                throw new ParseException("xmlns:xml can only point to"
                        + "\"http://www.w3.org/XML/1998/namespace\""
                        + formatLineNumber());
            }
        } else {
            if (!default_ns) {
                nsBuffer2.append(((long) temp_offset << 32) | length1);
            }
            if (urlKind == 1) {
                throw new ParseException("namespace declaration can't point to"
                        + " \"http://www.w3.org/XML/1998/namespace\""
                        + formatLineNumber());
            }
            if (urlKind != 0) {
                throw new ParseException("namespace declaration can't point to"
                        + " \"http://www.w3.org/2000/xmlns/\""
                        + formatLineNumber());
            }
        }
    }

    if (singleByteEncoding) {
        if (length1 > MAX_TOKEN_LENGTH) {
            throw new ParseException("Token Length Error:"
                    + " Attr val too long (>0xfffff)"
                    + formatLineNumber());
        }
        _writeVTD(temp_offset, length1, TOKEN_ATTR_VAL, depth);
    } else {
        if (length1 > (MAX_TOKEN_LENGTH << 1)) {
            throw new ParseException("Token Length Error:"
                    + " Attr val too long (>0xfffff)"
                    + formatLineNumber());
        }
        _writeVTD(temp_offset >> 1, length1 >> 1, TOKEN_ATTR_VAL, depth);
    }

    isXML = false;
    is_ns = false;

    ch = r.getChar();
    if (XMLChar.isSpaceChar(ch)) {
        ch = getCharAfterS();
        if (XMLChar.isNameStartChar(ch)) {
            // Another attribute begins; point temp_offset at its first character.
            temp_offset = offset - increment;
            return STATE_ATTR_NAME;
        }
    }

    // helper stays true unless the tag is self-closing ("/>").
    helper = true;
    if (ch == '/') {
        depth--;
        helper = false;
        ch = r.getChar();
    }
    if (ch != '>') {
        throw new ParseException(
                "Starting tag Error: Invalid char in starting tag"
                        + formatLineNumber());
    }

    if (ns) {
        nsBuffer1.append(nsBuffer3.size - 1);
        if (prefixed_attr_count > 0) {
            qualifyAttributes();
        }
        if (prefixed_attr_count > 1) {
            checkQualifiedAttributeUniqueness();
        }
        if (currentElementRecord != 0) {
            qualifyElement();
        }
        prefixed_attr_count = 0;
    }
    attr_count = 0;
    return processElementTail(helper);
}
/**
 * Consumes the target name of a processing instruction and records it as a
 * {@code TOKEN_PI_NAME} token. When the name is immediately followed by
 * "?&gt;" an empty {@code TOKEN_PI_VAL} is recorded as well and the next
 * state is chosen from the character after the instruction.
 *
 * @return {@code STATE_PI_VAL} when PI data may follow; otherwise
 *         {@code STATE_LT_SEEN} or {@code STATE_TEXT}
 * @throws ParseException on an over-long name, a bad terminator, or an
 *         invalid character after the instruction
 * @throws EncodingException
 * @throws EOFException
 */
private int process_pi_tag() throws ParseException, EncodingException, EOFException {
    // Read through the end of the target name; ch ends up on the first non-name char.
    do {
        ch = r.getChar();
    } while (XMLChar.isNameChar(ch));

    length1 = offset - temp_offset - increment;
    if (singleByteEncoding) {
        if (length1 > MAX_TOKEN_LENGTH) {
            throw new ParseException("Token Length Error:"
                    + " PI name too long (>0xfffff)"
                    + formatLineNumber());
        }
        _writeVTD(temp_offset, length1, TOKEN_PI_NAME, depth);
    } else {
        if (length1 > (MAX_TOKEN_LENGTH << 1)) {
            throw new ParseException("Token Length Error:"
                    + " PI name too long (>0xfffff)"
                    + formatLineNumber());
        }
        _writeVTD(temp_offset >> 1, length1 >> 1, TOKEN_PI_NAME, depth);
    }

    if (ch != '?') {
        // Data (or an error) follows; the PI-value state deals with it.
        return STATE_PI_VAL;
    }

    // No data: insert a zero-length PI value token.
    final int emptyValOffset = singleByteEncoding ? temp_offset : temp_offset >> 1;
    _writeVTD(emptyValOffset, 0, TOKEN_PI_VAL, depth);

    if (!r.skipChar('>')) {
        throw new ParseException(
                "Error in PI: invalid termination sequence"
                        + formatLineNumber());
    }

    temp_offset = offset;
    ch = getCharAfterS();

    if (ch == '<') {
        if (ws) {
            addWhiteSpaceRecord();
        }
        return STATE_LT_SEEN;
    }
    if (XMLChar.isContentChar(ch)) {
        return STATE_TEXT;
    }
    if (ch == '&') {
        entityIdentifier();
        return STATE_TEXT;
    }
    if (ch != ']') {
        throw new ParseException(
                "Error in text content: Invalid char"
                        + formatLineNumber());
    }
    // A ']' starts the text; make sure it is not the beginning of "]]>".
    if (r.skipChar(']')) {
        while (r.skipChar(']')) {
            // swallow additional closing brackets
        }
        if (r.skipChar('>')) {
            throw new ParseException(
                    "Error in text content: ]]> in text content"
                            + formatLineNumber());
        }
    }
    return STATE_TEXT;
}
/**
 * Consumes the data part of a processing instruction up to "?&gt;", records
 * it as a {@code TOKEN_PI_VAL} token, and picks the next parser state from
 * the first non-whitespace character after the instruction.
 *
 * @return {@code STATE_LT_SEEN} when a '&lt;' follows, otherwise {@code STATE_TEXT}
 * @throws ParseException if the target is not followed by whitespace, on an
 *         invalid character, an over-long value, or "]]&gt;" in the following text
 * @throws EncodingException
 * @throws EOFException
 */
private int process_pi_val() throws ParseException, EncodingException, EOFException {
    // The character that ended the target name must be whitespace.
    if (!XMLChar.isSpaceChar(ch)) {
        throw new ParseException(
                "Error in PI: invalid termination sequence"
                        + formatLineNumber());
    }
    temp_offset = offset;

    // A '?' ends the data only when directly followed by '>'.
    for (ch = r.getChar(); ; ch = r.getChar()) {
        if (!XMLChar.isValidChar(ch)) {
            throw new ParseException(
                    "Errors in PI: Invalid char in PI val"
                            + formatLineNumber());
        }
        if (ch == '?' && r.skipChar('>')) {
            break;
        }
    }

    // Exclude the two terminator units ("?>") from the length.
    length1 = offset - temp_offset - (increment << 1);
    if (singleByteEncoding) {
        if (length1 > MAX_TOKEN_LENGTH) {
            throw new ParseException("Token Length Error:"
                    + "PI VAL too long (>0xfffff)" + formatLineNumber());
        }
        _writeVTD(temp_offset, length1, TOKEN_PI_VAL, depth);
    } else {
        if (length1 > (MAX_TOKEN_LENGTH << 1)) {
            throw new ParseException("Token Length Error:"
                    + "PI VAL too long (>0xfffff)" + formatLineNumber());
        }
        _writeVTD(temp_offset >> 1, length1 >> 1, TOKEN_PI_VAL, depth);
    }

    temp_offset = offset;
    ch = getCharAfterS();

    if (ch == '<') {
        if (ws) {
            addWhiteSpaceRecord();
        }
        return STATE_LT_SEEN;
    }
    if (XMLChar.isContentChar(ch)) {
        return STATE_TEXT;
    }
    if (ch == '&') {
        entityIdentifier();
        return STATE_TEXT;
    }
    if (ch != ']') {
        throw new ParseException(
                "Error in text content: Invalid char"
                        + formatLineNumber());
    }
    // A ']' starts the text; make sure it is not the beginning of "]]>".
    if (r.skipChar(']')) {
        while (r.skipChar(']')) {
            // swallow additional closing brackets
        }
        if (r.skipChar('>')) {
            throw new ParseException(
                    "Error in text content: ]]> in text content"
                            + formatLineNumber());
        }
    }
    return STATE_TEXT;
}
/**
 * Handles the character right after "&lt;?": validates that a processing
 * instruction target starts here and that the target is not the reserved
 * name "xml" (in any letter case).
 *
 * @return {@code STATE_PI_TAG}
 * @throws ParseException if the character cannot start a name or the
 *         target is exactly [xX][mM][lL]
 * @throws EncodingException
 * @throws EOFException
 */
private int process_qm_seen() throws ParseException, EncodingException, EOFException {
    temp_offset = offset;
    ch = r.getChar();
    if (!XMLChar.isNameStartChar(ch)) {
        throw new ParseException(
                "Other Error: First char after <? invalid"
                        + formatLineNumber());
    }

    final boolean startsWithXml = (ch == 'x' || ch == 'X')
            && (r.skipChar('m') || r.skipChar('M'))
            && (r.skipChar('l') || r.skipChar('L'));
    if (startsWithXml) {
        ch = r.getChar();
        if (ch == '?' || XMLChar.isSpaceChar(ch)) {
            throw new ParseException(
                    "Error in PI: [xX][mM][lL] not a valid PI targetname"
                            + formatLineNumber());
        }
        // "xml" was only a prefix of a longer name: un-read the look-ahead character.
        offset = r.getPrevOffset();
    }
    return STATE_PI_TAG;
}
/**
 * Handles the very first character of the document. An XML declaration is
 * only recognized when "&lt;?xml" (any letter case) followed by whitespace
 * is the first thing in the input; otherwise the document must begin with
 * '&lt;', optionally preceded by whitespace.
 *
 * @return {@code STATE_DEC_ATTR_NAME} for an XML declaration, otherwise
 *         {@code STATE_LT_SEEN}
 * @throws ParseException if the document does not start properly or the
 *         declaration ends right after "&lt;?xml"
 * @throws EncodingException
 * @throws EOFException
 */
private int process_start_doc() throws ParseException, EncodingException, EOFException {
    final int first = r.getChar();

    if (first == '<') {
        temp_offset = offset;
        // xml decl has to be right after the start of the document
        final boolean declOpens = r.skipChar('?')
                && (r.skipChar('x') || r.skipChar('X'))
                && (r.skipChar('m') || r.skipChar('M'))
                && (r.skipChar('l') || r.skipChar('L'));
        if (declOpens) {
            final boolean spaceFollows = r.skipChar(' ')
                    || r.skipChar('\t')
                    || r.skipChar('\n')
                    || r.skipChar('\r');
            if (spaceFollows) {
                ch = getCharAfterS();
                temp_offset = offset;
                return STATE_DEC_ATTR_NAME;
            }
            if (r.skipChar('?')) {
                throw new ParseException(
                        "Error in XML decl: Premature ending"
                                + formatLineNumber());
            }
        }
        // Not a declaration: rewind to just past '<' and parse it as ordinary markup.
        offset = temp_offset;
        return STATE_LT_SEEN;
    }

    final boolean leadingSpace =
            first == ' ' || first == '\n' || first == '\r' || first == '\t';
    if (leadingSpace && getCharAfterS() == '<') {
        return STATE_LT_SEEN;
    }
    throw new ParseException(
            "Other Error: XML not starting properly"
                    + formatLineNumber());
}
/**
 * Set the XMLDoc container to the whole of the given byte array.
 *
 * @param ba byte[] holding the document; must be non-null and non-empty
 * @throws IllegalArgumentException if {@code ba} is null or empty (raised by
 *         the three-argument overload this method delegates to)
 */
public void setDoc(byte[] ba) {
    // Pass a null array through so the overload reports it as an illegal
    // argument instead of this method failing with a NullPointerException.
    setDoc(ba, 0, ba == null ? 0 : ba.length);
}
/**
 * Set the XMLDoc container. Also set the offset and len of the document
 * with respect to the container, reset the per-document parser state, and
 * allocate fresh VTD and location-cache buffers sized from the document
 * length.
 *
 * @param ba byte[] holding the document
 * @param os int (in byte) offset of the document within {@code ba}
 * @param len int (in byte) length of the document; must be positive
 * @throws IllegalArgumentException if {@code ba} is null, {@code os} is
 *         negative, {@code len} is not positive, or the range
 *         {@code [os, os + len)} does not fit inside {@code ba}
 */
public void setDoc(byte[] ba, int os, int len) {
    // "os > ba.length - len" is the overflow-safe form of "os + len > ba.length".
    if (ba == null || os < 0 || len <= 0 || os > ba.length - len) {
        throw new IllegalArgumentException("Illegal argument for setDoc");
    }
    int a;
    br = false;
    depth = -1;
    increment = 1;
    BOM_detected = false;
    must_utf_8 = false;
    ch = ch_temp = 0;
    temp_offset = 0;
    XMLDoc = ba;
    docOffset = offset = os;
    docLen = len;
    endOffset = os + len;
    last_l1_index = last_l2_index = last_l3_index = last_l4_index = last_depth = 0;
    currentElementRecord = 0;
    nsBuffer1.size = 0;
    nsBuffer2.size = 0;
    nsBuffer3.size = 0;
    // Parsing starts with a UTF-8 reader.
    r = new UTF8Reader();

    // a and i1..i5 are sizing hints handed to the buffer constructors; they
    // grow with the document length. Their exact meaning is defined by
    // FastLongBuffer / FastIntBuffer.
    if (shallowDepth) {
        int i1 = 8, i2 = 9, i3 = 11;
        if (docLen <= 1024) {
            a = 6;
            i1 = 5;
            i2 = 5;
            i3 = 5;
        } else if (docLen <= 4096) {
            a = 7;
            i1 = 6;
            i2 = 6;
            i3 = 6;
        } else if (docLen <= 1024 * 16) {
            a = 8;
            i1 = 7;
            i2 = 7;
            i3 = 7;
        } else if (docLen <= 1024 * 16 * 4) {
            a = 11;
        } else if (docLen <= 1024 * 256) {
            a = 12;
        } else {
            a = 15;
        }
        VTDBuffer = new FastLongBuffer(a, len >> (a + 1));
        l1Buffer = new FastLongBuffer(i1);
        l2Buffer = new FastLongBuffer(i2);
        l3Buffer = new FastIntBuffer(i3);
    } else {
        int i1 = 7, i2 = 9, i3 = 11, i4 = 11, i5 = 11;
        if (docLen <= 1024) {
            a = 6;
            i1 = 5;
            i2 = 5;
            i3 = 5;
            i4 = 5;
            i5 = 5;
        } else if (docLen <= 4096) {
            a = 7;
            i1 = 6;
            i2 = 6;
            i3 = 6;
            i4 = 6;
            i5 = 6;
        } else if (docLen <= 1024 * 16) {
            a = 8;
            i1 = 7;
            i2 = 7;
            i3 = 7;
            i4 = 7;
            i5 = 7;
        } else if (docLen <= 1024 * 16 * 4) {
            a = 11;
            i2 = 8;
            i3 = 8;
            i4 = 8;
            i5 = 8;
        } else if (docLen <= 1024 * 256) {
            a = 12;
            i1 = 8;
            i2 = 9;
            i3 = 9;
            i4 = 9;
            i5 = 9;
        } else {
            a = 15;
        }
        VTDBuffer = new FastLongBuffer(a, len >> (a + 1));
        l1Buffer = new FastLongBuffer(i1);
        l2Buffer = new FastLongBuffer(i2);
        _l3Buffer = new FastLongBuffer(i3);
        _l4Buffer = new FastLongBuffer(i4);
        _l5Buffer = new FastIntBuffer(i5);
    }
}
/**
 * The buffer-reuse version of setDoc.
 * The concept is to reuse LC and VTD buffer for
 * XML parsing, instead of allocating every time.
 *
 * @param ba byte[] holding the document; must be non-null and non-empty
 * @throws IllegalArgumentException if {@code ba} is null or empty (raised by
 *         the three-argument overload this method delegates to)
 */
public void setDoc_BR(byte[] ba) {
    // Pass a null array through so the overload reports it as an illegal
    // argument instead of this method failing with a NullPointerException.
    setDoc_BR(ba, 0, ba == null ? 0 : ba.length);
}
/**
 * Buffer-reuse version of setDoc for a segment of a byte array.
 * Resets the parsing state, records the document segment, and then either
 * allocates the VTD and LC buffers (when they do not exist yet) or keeps the
 * existing ones and resets their size to 0, instead of allocating new
 * buffers on every call.
 *
 * @param ba  byte[] holding the document; must not be null
 * @param os  int starting offset of the document within ba (in byte); must
 *            not be negative
 * @param len int length of the document (in byte); must be positive and
 *            os + len must not exceed ba.length
 * @throws IllegalArgumentException if any of the conditions above is violated
 */
public void setDoc_BR(byte[] ba, int os, int len) {
    // The upper bound is checked as "os > ba.length - len" so that a large
    // os + len cannot overflow int and slip past the test; len <= 0 also
    // rejects negative lengths, not just zero.
    if (ba == null || os < 0 || len <= 0 || os > ba.length - len) {
        throw new IllegalArgumentException("Illegal argument for setDoc_BR");
    }
    int a;
    br = true;
    depth = -1;
    increment = 1;
    BOM_detected = false;
    must_utf_8 = false;
    ch = ch_temp = 0;
    temp_offset = 0;
    XMLDoc = ba;
    docOffset = offset = os;
    docLen = len;
    endOffset = os + len;
    last_l1_index = last_l2_index = last_depth = last_l3_index = last_l4_index = 0;
    currentElementRecord = 0;
    nsBuffer1.size = 0;
    nsBuffer2.size = 0;
    nsBuffer3.size = 0;
    r = new UTF8Reader();

    // "a" and i1..i5 are the constructor arguments used when a buffer has to
    // be created; they grow with the document length.
    // NOTE(review): presumably these are log2 page sizes and len >> (a + 1)
    // is an initial capacity hint -- confirm against FastLongBuffer and
    // FastIntBuffer.
    if (shallowDepth) {
        int i1 = 8, i2 = 9, i3 = 11;
        if (docLen <= 1024) {
            a = 6;
            i1 = i2 = i3 = 5;
        } else if (docLen <= 4096) {
            a = 7;
            i1 = i2 = i3 = 6;
        } else if (docLen <= 1024 * 16) {
            a = 8;
            i1 = i2 = i3 = 7;
        } else if (docLen <= 1024 * 16 * 4) {
            a = 11;
            i2 = i3 = 8;
        } else if (docLen <= 1024 * 256) {
            a = 12;
        } else {
            a = 15;
        }
        // Each buffer is checked on its own: if an earlier call ran with the
        // other LC depth, VTDBuffer exists while l3Buffer may still be null.
        if (VTDBuffer == null) {
            VTDBuffer = new FastLongBuffer(a, len >> (a + 1));
        } else {
            VTDBuffer.size = 0;
        }
        if (l1Buffer == null) {
            l1Buffer = new FastLongBuffer(i1);
        } else {
            l1Buffer.size = 0;
        }
        if (l2Buffer == null) {
            l2Buffer = new FastLongBuffer(i2);
        } else {
            l2Buffer.size = 0;
        }
        if (l3Buffer == null) {
            l3Buffer = new FastIntBuffer(i3);
        } else {
            l3Buffer.size = 0;
        }
    } else {
        int i1 = 8, i2 = 9, i3 = 11, i4 = 11, i5 = 11;
        if (docLen <= 1024) {
            a = 6;
            i1 = i2 = i3 = i4 = i5 = 5;
        } else if (docLen <= 4096) {
            a = 7;
            i1 = i2 = i3 = i4 = i5 = 6;
        } else if (docLen <= 1024 * 16) {
            a = 8;
            // i4 and i5 are set here as well, as in the two smaller tiers;
            // previously they were left at the large default of 11.
            i1 = i2 = i3 = i4 = i5 = 7;
        } else if (docLen <= 1024 * 16 * 4) {
            a = 11;
            i2 = i3 = i4 = i5 = 8;
        } else if (docLen <= 1024 * 256) {
            a = 12;
            i1 = 8;
            i2 = i3 = i4 = i5 = 9;
        } else if (docLen <= 1024 * 1024) {
            a = 12;
            i1 = 8;
            i3 = i4 = i5 = 10;
        } else {
            a = 15;
            i1 = 8;
        }
        // Same per-buffer handling as above, for the five-level LC buffers.
        if (VTDBuffer == null) {
            VTDBuffer = new FastLongBuffer(a, len >> (a + 1));
        } else {
            VTDBuffer.size = 0;
        }
        if (l1Buffer == null) {
            l1Buffer = new FastLongBuffer(i1);
        } else {
            l1Buffer.size = 0;
        }
        if (l2Buffer == null) {
            l2Buffer = new FastLongBuffer(i2);
        } else {
            l2Buffer.size = 0;
        }
        if (_l3Buffer == null) {
            _l3Buffer = new FastLongBuffer(i3);
        } else {
            _l3Buffer.size = 0;
        }
        if (_l4Buffer == null) {
            _l4Buffer = new FastLongBuffer(i4);
        } else {
            _l4Buffer.size = 0;
        }
        if (_l5Buffer == null) {
            _l5Buffer = new FastIntBuffer(i5);
        } else {
            _l5Buffer.size = 0;
        }
    }
}
/**
 * Writes the index, including the XML bytes, to the given output stream.
 * Delegates to IndexHandler.writeIndex_L3 when shallowDepth is set and to
 * IndexHandler.writeIndex_L5 otherwise, passing the current encoding, ns
 * flag, VTD depth, root index, document bytes and the VTD/LC buffers.
 *
 * @param os the stream the index is written to
 * @throws IOException propagated from IndexHandler
 * @throws IndexWriteException propagated from IndexHandler
 */
public void writeIndex(OutputStream os) throws IOException, IndexWriteException {
    if (!shallowDepth) {
        // five LC levels
        IndexHandler.writeIndex_L5((byte) 1, encoding, ns, true, VTDDepth, 5,
                rootIndex, XMLDoc, docOffset, docLen,
                VTDBuffer, l1Buffer, l2Buffer, _l3Buffer, _l4Buffer, _l5Buffer,
                os);
        return;
    }
    // three LC levels
    IndexHandler.writeIndex_L3((byte) 1, encoding, ns, true, VTDDepth, 3,
            rootIndex, XMLDoc, docOffset, docLen,
            VTDBuffer, l1Buffer, l2Buffer, l3Buffer,
            os);
}
/**
 * Writes the VTD and LC buffers to the given output stream; unlike
 * writeIndex, the XML byte array itself is not passed along.
 * Delegates to IndexHandler.writeSeparateIndex_L3 when shallowDepth is set
 * and to IndexHandler.writeSeparateIndex_L5 otherwise.
 *
 * @param os the stream the index is written to
 * @throws IOException propagated from IndexHandler
 * @throws IndexWriteException propagated from IndexHandler
 */
public void writeSeparateIndex(OutputStream os) throws IOException, IndexWriteException {
    if (!shallowDepth) {
        // five LC levels
        IndexHandler.writeSeparateIndex_L5((byte) 2, encoding, ns, true, VTDDepth, 5,
                rootIndex, docOffset, docLen,
                VTDBuffer, l1Buffer, l2Buffer, _l3Buffer, _l4Buffer, _l5Buffer,
                os);
        return;
    }
    // three LC levels
    IndexHandler.writeSeparateIndex_L3((byte) 2, encoding, ns, true, VTDDepth, 3,
            rootIndex, docOffset, docLen,
            VTDBuffer, l1Buffer, l2Buffer, l3Buffer,
            os);
}
/**
 * Writes the VTD+XML index into a file of the given name.
 * Opens a FileOutputStream on fileName, hands it to
 * {@link #writeIndex(OutputStream)}, and always closes the stream afterwards.
 *
 * @param fileName name of the file to create or overwrite
 * @throws IOException if the file cannot be opened, written or closed
 * @throws IndexWriteException propagated from writeIndex(OutputStream)
 */
public void writeIndex(String fileName) throws IOException,IndexWriteException{
    FileOutputStream fos = new FileOutputStream(fileName);
    try {
        writeIndex(fos);
    } finally {
        // Close even when writing fails so the file handle is not leaked.
        fos.close();
    }
}
/**
 * Writes the VTDs and LCs into a file of the given name; the XML is not
 * part of the index. Please refer to the VTD-XML web site for the spec and
 * explanation.
 * Opens a FileOutputStream on fileName, hands it to
 * {@link #writeSeparateIndex(OutputStream)}, and always closes the stream
 * afterwards.
 *
 * @param fileName name of the file to create or overwrite
 * @throws IOException if the file cannot be opened, written or closed
 * @throws IndexWriteException propagated from writeSeparateIndex(OutputStream)
 */
public void writeSeparateIndex(String fileName) throws IOException,IndexWriteException{
    FileOutputStream fos = new FileOutputStream(fileName);
    try {
        writeSeparateIndex(fos);
    } finally {
        // Close even when writing fails so the file handle is not leaked.
        fos.close();
    }
}
/**
 * Appends one VTD record and updates the three-level LC buffers
 * (l1Buffer, l2Buffer, l3Buffer).
 * The record packs token_type (shifted left 28), the low 8 bits of depth
 * (shifted left 20) and length into the upper 32 bits, and offset into the
 * lower 32 bits. For depth 0 the record index becomes rootIndex; for depths
 * 1 to 3 the LC entry of the previously written element is completed and the
 * bookkeeping fields (last_l1_index, last_l2_index, last_depth) are updated.
 * Other depths only append the VTD record.
 *
 * @param offset int
 * @param length int
 * @param token_type int
 * @param depth int
 */
private void writeVTD(int offset, int length, int token_type, int depth) {
    final int header = (token_type << 28) | ((depth & 0xff) << 20) | length;
    VTDBuffer.append(((long) header << 32) | offset);
    // index of the record just appended
    final int recordIndex = VTDBuffer.size - 1;

    if (depth == 0) {
        rootIndex = recordIndex;
        return;
    }

    if (depth == 1) {
        // the previous element gets 0xffffffff in the lower half of its entry
        if (last_depth == 1) {
            l1Buffer.append(((long) last_l1_index << 32) | 0xffffffffL);
        } else if (last_depth == 2) {
            l2Buffer.append(((long) last_l2_index << 32) | 0xffffffffL);
        }
        last_l1_index = recordIndex;
        last_depth = 1;
        return;
    }

    if (depth == 2) {
        if (last_depth == 1) {
            // lower half: the current size of l2Buffer
            l1Buffer.append(((long) last_l1_index << 32) + l2Buffer.size);
        } else if (last_depth == 2) {
            l2Buffer.append(((long) last_l2_index << 32) | 0xffffffffL);
        }
        last_l2_index = recordIndex;
        last_depth = 2;
        return;
    }

    if (depth == 3) {
        // l3Buffer is appended first, so size - 1 below is this new entry
        l3Buffer.append(recordIndex);
        if (last_depth == 2) {
            l2Buffer.append(((long) last_l2_index << 32) + l3Buffer.size - 1);
        }
        last_depth = 3;
    }
}
/**
 * Appends one VTD record without touching any LC buffer.
 * The upper 32 bits hold token_type (shifted left 28), the low 8 bits of
 * depth (shifted left 20) and length; the lower 32 bits hold offset.
 *
 * @param offset int
 * @param length int
 * @param token_type int
 * @param depth int
 */
private void _writeVTD(int offset, int length, int token_type, int depth) {
    final int header = (token_type << 28) | ((depth & 0xff) << 20) | length;
    final long record = ((long) header << 32) | offset;
    VTDBuffer.append(record);
}
/**
 * Appends VTD records for a text token, splitting it into several records
 * when length exceeds MAX_TOKEN_LENGTH. Every record but the last carries
 * MAX_TOKEN_LENGTH as its length; the last one carries the remainder. Each
 * record's offset advances by the length of the record before it.
 *
 * @param offset int
 * @param length int
 * @param token_type int
 * @param depth int
 */
private void writeVTDText(int offset, int length, int token_type, int depth) {
    // token type and depth bits are the same for every record emitted here
    final int typeAndDepth = (token_type << 28) | ((depth & 0xff) << 20);
    int remaining = length;
    int position = offset;
    while (remaining > MAX_TOKEN_LENGTH) {
        VTDBuffer.append(((long) (typeAndDepth | MAX_TOKEN_LENGTH) << 32) | position);
        position += MAX_TOKEN_LENGTH;
        remaining -= MAX_TOKEN_LENGTH;
    }
    // final (or only) record
    VTDBuffer.append(((long) (typeAndDepth | remaining) << 32) | position);
}
/**
 * Appends one VTD record and updates the five-level LC buffers
 * (l1Buffer, l2Buffer, _l3Buffer, _l4Buffer, _l5Buffer).
 * The record layout is the same as in writeVTD: token_type, depth and
 * length in the upper 32 bits, offset in the lower 32 bits. For depth 0 the
 * record index becomes rootIndex. For depths 1 to 4 the LC entry of the
 * previously written element is completed: its lower half is the size of
 * the next-level buffer when the new element is exactly one level deeper,
 * and 0xffffffff otherwise. For depth 5 the record index is appended to
 * _l5Buffer. Other depths only append the VTD record.
 *
 * @param offset int
 * @param length int
 * @param token_type int
 * @param depth int
 */
private void writeVTD_L5(int offset, int length, int token_type, int depth) {
    final int header = (token_type << 28) | ((depth & 0xff) << 20) | length;
    VTDBuffer.append(((long) header << 32) | offset);
    // index of the record just appended
    final int recordIndex = VTDBuffer.size - 1;

    switch (depth) {
        case 0:
            rootIndex = recordIndex;
            return;

        case 1:
            switch (last_depth) {
                case 1:
                    l1Buffer.append(((long) last_l1_index << 32) | 0xffffffffL);
                    break;
                case 2:
                    l2Buffer.append(((long) last_l2_index << 32) | 0xffffffffL);
                    break;
                case 3:
                    _l3Buffer.append(((long) last_l3_index << 32) | 0xffffffffL);
                    break;
                case 4:
                    _l4Buffer.append(((long) last_l4_index << 32) | 0xffffffffL);
                    break;
                default:
                    break;
            }
            last_l1_index = recordIndex;
            last_depth = 1;
            return;

        case 2:
            switch (last_depth) {
                case 1:
                    l1Buffer.append(((long) last_l1_index << 32) + l2Buffer.size);
                    break;
                case 2:
                    l2Buffer.append(((long) last_l2_index << 32) | 0xffffffffL);
                    break;
                case 3:
                    _l3Buffer.append(((long) last_l3_index << 32) | 0xffffffffL);
                    break;
                case 4:
                    _l4Buffer.append(((long) last_l4_index << 32) | 0xffffffffL);
                    break;
                default:
                    break;
            }
            last_l2_index = recordIndex;
            last_depth = 2;
            return;

        case 3:
            switch (last_depth) {
                case 2:
                    l2Buffer.append(((long) last_l2_index << 32) + _l3Buffer.size);
                    break;
                case 3:
                    _l3Buffer.append(((long) last_l3_index << 32) | 0xffffffffL);
                    break;
                case 4:
                    _l4Buffer.append(((long) last_l4_index << 32) | 0xffffffffL);
                    break;
                default:
                    break;
            }
            last_l3_index = recordIndex;
            last_depth = 3;
            return;

        case 4:
            switch (last_depth) {
                case 3:
                    _l3Buffer.append(((long) last_l3_index << 32) + _l4Buffer.size);
                    break;
                case 4:
                    _l4Buffer.append(((long) last_l4_index << 32) | 0xffffffffL);
                    break;
                default:
                    break;
            }
            last_l4_index = recordIndex;
            last_depth = 4;
            return;

        case 5:
            // _l5Buffer is appended first, so size - 1 below is this new entry
            _l5Buffer.append(recordIndex);
            if (last_depth == 4) {
                _l4Buffer.append(((long) last_l4_index << 32) + _l5Buffer.size - 1);
            }
            last_depth = 5;
            return;

        default:
            return;
    }
}
/**
*
* @throws ParseException
*/
private void qualifyElement() throws ParseException{
int i= nsBuffer3.size-1;
// two cases:
// 1. the current element has no prefix, look for xmlns
// 2. the current element has prefix, look for xmlns:something
int preLen = (int)((currentElementRecord & 0xffff000000000000L)>>48);
int preOs = (int)currentElementRecord;
while(i>=0){
int t = nsBuffer3.upper32At(i);
// with prefix, get full length and prefix length
if ( (t&0xffff) - (t>>16) == preLen){
// doing byte comparison here
int os = nsBuffer3.lower32At(i)+(t>>16)+increment;
int k=0;
for (;k= '0' && ch <= '9') {
val = (val << 4) + (ch - '0');
} else if (ch >= 'a' && ch <= 'f') {
val = (val << 4) + (ch - 'a' + 10);
} else if (ch >= 'A' && ch <= 'F') {
val = (val << 4) + (ch - 'A' + 10);
} else if (ch == ';') {
inc+=increment;
break;
}
}
} else {
while (true) {
ch = getCharUnit(byte_offset);
byte_offset+=increment;
inc+=increment;
if (ch >= '0' && ch <= '9') {
val = val * 10 + (ch - '0');
} else if (ch == ';') {
break;
}
}
}
break;
case 'a' :
ch = getCharUnit(byte_offset);
if (encoding"+new String(XMLDoc, bos1, len1)+" "+new String(XMLDoc,bos2,len2));
while(i1>32);
i2 += (int)(l2>>32);
}
if (i1==i3 && i2==i4)
return true;
return false;
}
/**
 * Checks that the attribute name starting at temp_offset with length
 * length1 does not repeat a name already stored in attr_name_array, then
 * records it there (offset in the upper 32 bits, length in the lower 32).
 * When ns is set, is_ns is not set and length2 is non-zero, the name is
 * also recorded in prefixed_attr_name_array unless matchXML(temp_offset)
 * identifies an "xml" prefix.
 *
 * @throws ParseException if the same name bytes were already recorded
 */
private void checkAttributeUniqueness()
        throws ParseException
{
    // Compare the new name byte-by-byte against every recorded name of the
    // same length; all entries are scanned before the result is reported.
    boolean duplicateFound = false;
    for (int idx = 0; idx < attr_count; idx++) {
        final long recorded = attr_name_array[idx];
        final int recordedLen = (int) recorded;
        if (recordedLen != length1) {
            continue;
        }
        final int recordedOffset = (int) (recorded >> 32);
        boolean identical = true;
        for (int b = 0; b < recordedLen; b++) {
            if (XMLDoc[recordedOffset + b] != XMLDoc[temp_offset + b]) {
                identical = false;
                break;
            }
        }
        if (identical) {
            duplicateFound = true;
        }
    }
    if (duplicateFound) {
        throw new ParseException(
                "Error in attr: Attr name not unique"
                        + formatLineNumber());
    }

    // Grow attr_name_array by ATTR_NAME_ARRAY_SIZE when it is full, then
    // store the new name.
    if (attr_count >= attr_name_array.length) {
        final long[] enlarged = new long[attr_count + ATTR_NAME_ARRAY_SIZE];
        System.arraycopy(attr_name_array, 0, enlarged, 0, attr_count);
        attr_name_array = enlarged;
    }
    attr_name_array[attr_count] = ((long) temp_offset << 32) | length1;
    attr_count++;

    // Only prefixed, non-namespace-declaration attributes are tracked below.
    // NOTE(review): length2 is presumably the prefix length -- confirm.
    if (!ns || is_ns || length2 == 0) {
        return;
    }
    // xml:something is not inserted; 3 and 6 are the lengths tested for
    // increment 1 and increment 2 respectively
    final boolean xmlLengthMatches =
            (increment == 1 && length2 == 3) || (increment == 2 && length2 == 6);
    if (xmlLengthMatches && matchXML(temp_offset)) {
        return;
    }

    if (prefixed_attr_count >= prefixed_attr_name_array.length) {
        final long[] previous = prefixed_attr_name_array;
        prefixed_attr_name_array =
                new long[prefixed_attr_count + ATTR_NAME_ARRAY_SIZE];
        // prefix_URL_array is replaced by a fresh array of the same new
        // size; its old contents are not copied over.
        prefix_URL_array =
                new int[prefixed_attr_count + ATTR_NAME_ARRAY_SIZE];
        System.arraycopy(previous, 0, prefixed_attr_name_array, 0, prefixed_attr_count);
    }
    prefixed_attr_name_array[prefixed_attr_count] =
            ((long) temp_offset << 32) | (length2 << 16) | length1;
    prefixed_attr_count++;
}
/**
 * Handles a character in text content that needs special treatment.
 * An ampersand is resolved through entityIdentifier() and the result must
 * pass XMLChar.isValidChar; a run of two or more ']' must not be followed
 * by '>'; any other character is rejected.
 *
 * @param ch the character to handle
 * @throws ParseException on an invalid entity value, on "]]>" in text, or
 *         on any character other than '&' and ']'
 */
private void handleOtherTextChar(int ch) throws ParseException{
    switch (ch) {
        case '&':
            if (!XMLChar.isValidChar(entityIdentifier())) {
                throw new ParseException(
                        "Error in text content: Invalid char in text content "
                                + formatLineNumber());
            }
            return;

        case ']':
            if (!r.skipChar(']')) {
                // a lone ']' is fine
                return;
            }
            // consume the rest of the ']' run
            while (r.skipChar(']')) {
            }
            if (r.skipChar('>')) {
                throw new ParseException(
                        "Error in text content: ]]> in text content"
                                + formatLineNumber());
            }
            return;

        default:
            throw new ParseException(
                    "Error in text content: Invalid char in text content "
                            + formatLineNumber());
    }
}
/**
 * Variant of handleOtherTextChar that does not validate the entity value.
 * An ampersand is consumed through entityIdentifier() and its result is
 * ignored; a run of two or more ']' must not be followed by '>'; any other
 * character is rejected.
 *
 * @param ch the character to handle
 * @throws ParseException on "]]>" in text, on any character other than
 *         '&' and ']', or propagated from entityIdentifier()
 */
private void handleOtherTextChar2(int ch) throws ParseException{
    switch (ch) {
        case '&':
            entityIdentifier();
            return;

        case ']':
            if (!r.skipChar(']')) {
                // a lone ']' is fine
                return;
            }
            // consume the rest of the ']' run
            while (r.skipChar(']')) {
            }
            if (r.skipChar('>')) {
                throw new ParseException(
                        "Error in text content: ]]> in text content"
                                + formatLineNumber());
            }
            return;

        default:
            throw new ParseException(
                    "Error in text content: Invalid char"
                            + formatLineNumber());
    }
}
/**
 * Decides the next parser state from the character returned by
 * getCharAfterS().
 * Returns STATE_DOC_END when depth is -1. Otherwise: on '<' followed by '/'
 * it returns STATE_END_TAG (first writing a TOKEN_CHARACTER_DATA record for
 * the skipped span when helper is true); on '<' alone it returns
 * STATE_LT_SEEN; on anything else it returns STATE_TEXT, passing characters
 * that are not content characters through handleOtherTextChar2 first.
 *
 * @param helper whether to write a character-data record before an end tag
 * @return STATE_DOC_END, STATE_END_TAG, STATE_LT_SEEN or STATE_TEXT
 * @throws ParseException
 * @throws EncodingException
 * @throws EOFException
 */
private int processElementTail(boolean helper) throws ParseException, EncodingException, EOFException{
    if (depth == -1) {
        return STATE_DOC_END;
    }

    temp_offset = offset;
    ch = getCharAfterS();

    if (ch != '<') {
        if (!XMLChar.isContentChar(ch)) {
            handleOtherTextChar2(ch);
        }
        return STATE_TEXT;
    }

    if (ws) {
        addWhiteSpaceRecord();
    }
    if (!r.skipChar('/')) {
        return STATE_LT_SEEN;
    }

    if (helper) {
        // span between temp_offset and the "</" just consumed
        length1 = offset - temp_offset - (increment << 1);
        // offset and length are halved when the encoding is not single-byte
        final int textOffset = singleByteEncoding ? temp_offset : (temp_offset >> 1);
        final int textLength = singleByteEncoding ? length1 : (length1 >> 1);
        writeVTDText(textOffset, textLength, TOKEN_CHARACTER_DATA, depth);
    }
    return STATE_END_TAG;
}
}