com.hp.hpl.sparta.ParseByteStream Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of pinyin4j-multi Show documentation
Show all versions of pinyin4j-multi Show documentation
Supports conversion of Chinese characters (both Simplified and Traditional) to the most popular Pinyin systems, including
Hanyu Pinyin, Tongyong Pinyin, Wade-Giles, MPS2, Yale and Gwoyeu Romatzyh. Supports multiple pronunciations and
customized output.
The newest version!
package com.hp.hpl.sparta;
import java.io.*;
/**
* An XML byte stream that has been parsed into a DOM tree.
* Just like ParseCharStream except that it handles the Unicode encoding of the byte stream.
* Use rules in
* http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing to guess
* encoding -- if encoding declaration is different, restart parsing.
Copyright (C) 2002 Hewlett-Packard Company.
This file is part of Sparta, an XML Parser, DOM, and XPath library.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version. This library
is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.
@see GNU Lesser General Public License
@version $Date: 2003/07/28 04:33:04 $ $Revision: 1.5 $
@author Eamonn O'Brien-Strain
*/
class ParseByteStream implements ParseSource {

    /**
     * Parse an XML document from a byte stream, decoding the bytes to
     * characters with the supplied encoding or, when none is supplied, with
     * an encoding guessed from the first four bytes of the stream.
     *
     * <p>The stream is marked at its beginning so parsing can be restarted:
     * once with "euc-jp" if reading with the first encoding raises an
     * IOException, and once with the declared encoding if an
     * EncodingMismatchException is raised.
     *
     * @param systemId identifier of the document, passed on to the log, to
     *        ParseException and to ParseCharStream
     * @param istream is the source of bytes and must support mark so that
     *        we can peek ahead at its first four bytes and restart parsing
     * @param log receives notes and errors; if null, DEFAULT_LOG is used
     * @param guessedEncoding encoding to try first, or null to guess it
     *        from the start of the stream
     * @param handler passed through unchanged to ParseCharStream
     * @throws ParseException if a restart encoding ("euc-jp" or the declared
     *         encoding) is not supported, or if thrown by ParseCharStream
     * @throws IOException if reading or resetting the stream fails
     * @throws Error if istream does not support mark
     */
    public ParseByteStream(String systemId, InputStream istream, ParseLog log,
            String guessedEncoding, ParseHandler handler) throws ParseException, IOException {
        if (log == null) log = DEFAULT_LOG;

        //We need to be able to restart the stream if the declared encoding
        //is different than our guess, so the stream must support mark. We
        //also need to be able to peek ahead at the first 4 bytes.
        if (!istream.markSupported())
            throw new Error(
                    "Precondition violation: the InputStream passed to ParseByteStream must support mark");
        istream.mark(MAXLOOKAHEAD); //mark at beginning

        //A single read() may legally return fewer bytes than requested even
        //though more are pending, so keep reading until we have all 4 bytes
        //or hit end of stream.
        byte[] start = new byte[4];
        int n = readStart(istream, start);
        if (guessedEncoding == null) guessedEncoding = guessEncoding(systemId, start, n, log);

        try {
            //First try with guessed encoding
            istream.reset();
            InputStreamReader reader = new InputStreamReader(istream, fixEncoding(guessedEncoding));
            try {
                parseSource_ = new ParseCharStream(systemId, reader, log, guessedEncoding, handler);
            } catch (IOException e) {
                //This exception seems to be caused by reading euc-jp as utf-8
                String secondGuessEncoding = "euc-jp";
                log.note("Problem reading with assumed encoding of " + guessedEncoding
                        + " so restarting with " + secondGuessEncoding, systemId, 1);
                istream.reset();
                try {
                    reader = new InputStreamReader(istream, fixEncoding(secondGuessEncoding));
                } catch (UnsupportedEncodingException ee) {
                    throw new ParseException(log, systemId, 1, '\0', secondGuessEncoding, "\""
                            + secondGuessEncoding + "\" is not a supported encoding");
                }
                //null encoding: no guess is handed to the char-stream parser this time
                parseSource_ = new ParseCharStream(systemId, reader, log, null, handler);
            }
        } catch (EncodingMismatchException e) {
            //if that didn't work try declared encoding
            String declaredEncoding = e.getDeclaredEncoding();
            log.note("Encoding declaration of " + declaredEncoding + " is different that assumed "
                    + guessedEncoding + " so restarting the parsing with the new encoding",
                    systemId, 1);
            istream.reset();
            InputStreamReader reader;
            try {
                reader = new InputStreamReader(istream, fixEncoding(declaredEncoding));
            } catch (UnsupportedEncodingException ee) {
                throw new ParseException(log, systemId, 1, '\0', declaredEncoding, "\""
                        + declaredEncoding + "\" is not a supported encoding");
            }
            parseSource_ = new ParseCharStream(systemId, reader, log, null, handler);
        }
    }

    /** Delegates to the underlying character-stream parser. */
    public String toString() {
        return parseSource_.toString();
    }

    /** System identifier as reported by the underlying character-stream parser. */
    public String getSystemId() {
        return parseSource_.getSystemId();
    }

    /** Last line number read by parser. */
    public int getLineNumber() {
        return parseSource_.getLineNumber();
    }

    /**
     * The character-stream parser that does the actual work.
     * @link aggregationByValue
     */
    private ParseCharStream parseSource_;

    /////////////////////////////////////////////////////////////////////

    /**
     * Fill buffer from the stream, reading repeatedly until the buffer is
     * full or the stream has no more bytes to give.
     *
     * @return number of bytes actually stored in buffer (0 if the stream
     *         was already at its end)
     */
    static private int readStart(InputStream istream, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int count = istream.read(buffer, total, buffer.length - total);
            //-1 is end of stream; also stop on 0 so that a misbehaving
            //stream cannot make us spin forever
            if (count <= 0) break;
            total += count;
        }
        return total;
    }

    /**
     * Guess the character encoding from the first bytes of the stream
     * according to
     * http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing
     * .
     *
     * @param systemId identifier used when logging
     * @param start the first bytes of the stream (array of length 4)
     * @param n how many bytes of start were actually read
     * @param log receives an error if fewer than 4 bytes were read, and a
     *        note whenever the deduced encoding is not UTF-8
     * @return the guessed encoding name; "UTF-8" when nothing else matches
     *         or when fewer than 4 bytes are available
     */
    static private String guessEncoding(String systemId, byte[] start, int n, ParseLog log)
            throws IOException {
        String encoding;
        if (n != 4) {
            //Too little input to examine: report it and fall back to UTF-8
            String msg =
                    n <= 0 ? "no characters in input" : "less than 4 characters in input: \""
                            + new String(start, 0, n) + "\"";
            log.error(msg, systemId, 1);
            encoding = "UTF-8";
        } else if (equals(start, 0x0000FEFF) || equals(start, 0xFFFE0000)
                || equals(start, 0x0000FFFE) || equals(start, 0xFEFF0000)
                || equals(start, 0x0000003C) || equals(start, 0x3C000000)
                || equals(start, 0x00003C00) || equals(start, 0x003C0000))
            //4-byte byte-order mark, or '<' stored in 4 bytes, in any byte order
            encoding = "UCS-4";
        else if (equals(start, 0x003C003F))
            encoding = "UTF-16BE"; //or ISO-10646-UCS-2
        else if (equals(start, 0x3C003F00))
            encoding = "UTF-16LE"; //or ISO-10646-UCS-2
        else if (equals(start, 0x3C3F786D))
            encoding = "UTF-8"; //or ISO 646, ASCII, ISO 8859, Shift-JIS, EUC
        else if (equals(start, 0x4C6FA794))
            encoding = "EBCDIC";
        else if (equals(start, (short) 0xFFFE) || equals(start, (short) 0xFEFF))
            //Test for UTF-16 byte-order mark
            encoding = "UTF-16";
        else
            encoding = "UTF-8";

        if (!encoding.equals("UTF-8"))
            log.note("From start " + hex(start[0]) + " " + hex(start[1]) + " " + hex(start[2])
                    + " " + hex(start[3]) + " deduced encoding = " + encoding, systemId, 1);
        return encoding;
    }

    /** Format a byte as exactly two lower-case hexadecimal digits. */
    static private String hex(byte b) {
        //Mask to 0..255 so negative bytes do not sign-extend to "ffffffxx"
        String s = Integer.toHexString(b & 0xFF);
        return s.length() == 1 ? "0" + s : s;
    }

    /** True if the first four bytes equal the big-endian bytes of integer. */
    static private boolean equals(byte[] bytes, int integer) {
        return bytes[0] == (byte) (integer >>> 24)
                && bytes[1] == (byte) (integer >>> 16)
                && bytes[2] == (byte) (integer >>> 8)
                && bytes[3] == (byte) integer;
    }

    /** True if the first two bytes equal the big-endian bytes of integer. */
    static private boolean equals(byte[] bytes, short integer) {
        return bytes[0] == (byte) (integer >>> 8) && bytes[1] == (byte) integer;
    }

    /**
     * Normalize an encoding name before handing it to InputStreamReader:
     * "utf8" in any letter case becomes "UTF-8"; every other name is
     * returned unchanged.
     */
    static private String fixEncoding(String encoding) {
        return encoding.toLowerCase().equals("utf8") ? "UTF-8" : encoding;
    }
}
// $Log: ParseByteStream.java,v $
// Revision 1.5 2003/07/28 04:33:04 eobrain
// Fix bug that was removing dashes from unicode encoding names. We
// should do this only for UTF-8.
//
// Revision 1.4 2003/07/17 23:55:28 eobrain
// Make compatible with J2ME. For example do not use "new"
// java.util classes.
//
// Revision 1.3 2003/01/09 01:05:38 yuhongx
// added FixEncoding().
//
// Revision 1.2 2002/11/06 02:57:59 eobrain
// Organize imports to remove unused imports. Remove some unused local variables.
//
// Revision 1.1.1.1 2002/08/19 05:04:00 eobrain
// import from HP Labs internal CVS
//
// Revision 1.14 2002/08/18 04:36:25 eob
// Make interface package-private so as not to clutter up the javadoc.
//
// Revision 1.13 2002/08/17 00:54:14 sermarti
//
// Revision 1.12 2002/08/05 20:04:32 sermarti
//
// Revision 1.11 2002/07/25 21:10:15 sermarti
// Adding files that mysteriously weren't added from Sparta before.
//
// Revision 1.10 2002/05/23 22:00:19 eob
// Add better error handling.
//
// Revision 1.9 2002/05/09 17:02:26 eob
// Fix NullPointerException in error reporting.
//
// Revision 1.8 2002/05/09 16:49:52 eob
// Add history for better error reporting.
//
// Revision 1.7 2002/03/21 23:50:49 eob
// Deprecate functionality moved to Parser facade class.
//
// Revision 1.6 2002/02/15 21:30:38 eob
// Comment changes only.
//
// Revision 1.5 2002/02/01 21:55:15 eob
// Comment change only.
//
// Revision 1.4 2002/01/09 00:45:58 eob
// Formatting change only.
//
// Revision 1.3 2002/01/09 00:44:57 eob
// Handle CharConversionException caused by reading euc-jp characters
// before encoding has been established. Restart parsing.
//
// Revision 1.2 2002/01/08 19:53:43 eob
// Comment change only.
//
// Revision 1.1 2002/01/08 19:31:33 eob
// Factored out ParseSource functionality into ParseCharStream and
// ParseByteStream.