All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hp.hpl.sparta.ParseByteStream Maven / Gradle / Ivy

Go to download

Supports converting Chinese characters (both Simplified and Traditional) to the most popular Pinyin systems, including Hanyu Pinyin, Tongyong Pinyin, Wade-Giles, MPS2, Yale and Gwoyeu Romatzyh. Supports multiple pronunciations and customized output.

The newest version!
package com.hp.hpl.sparta;

import java.io.*;


/**
 * An XML byte stream that has been parsed into a DOM tree.
 * Just like ParseCharStream except handle Unicode encoding of byte stream.
 * Use rules in
 * http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing to guess
 * encoding -- if encoding declaration is different, restart parsing.

 
Copyright (C) 2002 Hewlett-Packard Company. This file is part of Sparta, an XML Parser, DOM, and XPath library. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
@see GNU Lesser General Public License @version $Date: 2003/07/28 04:33:04 $ $Revision: 1.5 $ @author Eamonn O'Brien-Strain */ class ParseByteStream implements ParseSource { /** Parse XML document from byte stream, converting to Unicode * characters as specifed by the initial byte-order-mark. * @param istream is the source of bytes and must support mark so that * we can peek ahead at its first two bytes */ public ParseByteStream(String systemId, InputStream istream, ParseLog log, String guessedEncoding, ParseHandler handler) throws ParseException, IOException { if (log == null) log = DEFAULT_LOG; //We need to be able to restart the stream if the declared encoding //is different than our guess, os buffer if necessary. We also need //to be able to peek ahead at the first 4 bytes if (!istream.markSupported()) throw new Error( "Precondition violation: the InputStream passed to ParseByteStream must support mark"); istream.mark(MAXLOOKAHEAD); //mark at begining byte[] start = new byte[4]; int n = istream.read(start); if (guessedEncoding == null) guessedEncoding = guessEncoding(systemId, start, n, log); try { //First try with guessed encoding istream.reset(); InputStreamReader reader = new InputStreamReader(istream, fixEncoding(guessedEncoding)); try { parseSource_ = new ParseCharStream(systemId, reader, log, guessedEncoding, handler); //}catch( CharConversionException e ){ } catch (IOException e) { //This exception seems to be caused by reading euc-jp as utf-8 String secondGuessEncoding = "euc-jp"; log.note("Problem reading with assumed encoding of " + guessedEncoding + " so restarting with " + secondGuessEncoding, systemId, 1); istream.reset(); try { reader = new InputStreamReader(istream, fixEncoding(secondGuessEncoding)); } catch (UnsupportedEncodingException ee) { throw new ParseException(log, systemId, 1, '\0', secondGuessEncoding, "\"" + secondGuessEncoding + "\" is not a supported encoding"); } parseSource_ = new ParseCharStream(systemId, reader, log, null, 
handler); } } catch (EncodingMismatchException e) { //if that didn't work try declared encoding String declaredEncoding = e.getDeclaredEncoding(); log.note("Encoding declaration of " + declaredEncoding + " is different that assumed " + guessedEncoding + " so restarting the parsing with the new encoding", systemId, 1); istream.reset(); InputStreamReader reader; try { reader = new InputStreamReader(istream, fixEncoding(declaredEncoding)); } catch (UnsupportedEncodingException ee) { throw new ParseException(log, systemId, 1, '\0', declaredEncoding, "\"" + declaredEncoding + "\" is not a supported encoding"); } parseSource_ = new ParseCharStream(systemId, reader, log, null, handler); } } public String toString() { return parseSource_.toString(); } public String getSystemId() { return parseSource_.getSystemId(); } /** Last line number read by parser. */ public int getLineNumber() { return parseSource_.getLineNumber(); } /** * @link aggregationByValue */ private ParseCharStream parseSource_; ///////////////////////////////////////////////////////////////////// /** Convert byte stream to Unicode character stream according to * http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing * . */ static private String guessEncoding(String systemId, byte[] start, int n, ParseLog log) throws IOException { //Test for UTF-16 byte-order mark String encoding; if (n != 4) { String msg = n <= 0 ? 
"no characters in input" : "less than 4 characters in input: \"" + new String(start, 0, n) + "\""; log.error(msg, systemId, 1); encoding = "UTF-8"; } else if (equals(start, 0x0000FEFF) || equals(start, 0xFFFE0000) || equals(start, 0x0000FFFE) || equals(start, 0xFEFF0000) || equals(start, 0x0000003C) || equals(start, 0x3C000000) || equals(start, 0x00003C00) || equals(start, 0x003C0000)) encoding = "UCS-4"; else if (equals(start, 0x003C003F)) encoding = "UTF-16BE"; //or ISO-10646-UCS-2 else if (equals(start, 0x3C003F00)) encoding = "UTF-16LE"; //or ISO-10646-UCS-2 else if (equals(start, 0x3C3F786D)) encoding = "UTF-8";//or ISO 646, ASCII, ISO 8859, Shift-JIS, EUC else if (equals(start, 0x4C6FA794)) encoding = "EBCDIC"; else if (equals(start, (short) 0xFFFE) || equals(start, (short) 0xFEFF)) encoding = "UTF-16"; else encoding = "UTF-8"; if (!encoding.equals("UTF-8")) log.note("From start " + hex(start[0]) + " " + hex(start[1]) + " " + hex(start[2]) + " " + hex(start[3]) + " deduced encoding = " + encoding, systemId, 1); return encoding; } static private String hex(byte b) { String s = Integer.toHexString(b); switch (s.length()) { case 1: return "0" + s; case 2: return s; default: return s.substring(s.length() - 2); } } static private boolean equals(byte[] bytes, int integer) { return bytes[0] == (byte) ((integer >>> 24)) && bytes[1] == (byte) ((integer >>> 16) & 0xFF) && bytes[2] == (byte) ((integer >>> 8) & 0xFF) && bytes[3] == (byte) ((integer) & 0xFF); } static private boolean equals(byte[] bytes, short integer) { return bytes[0] == (byte) ((integer >>> 8)) && bytes[1] == (byte) ((integer) & 0xFF); } static private String fixEncoding(String encoding) { return encoding.toLowerCase().equals("utf8") ? "UTF-8" : encoding; } } // $Log: ParseByteStream.java,v $ // Revision 1.5 2003/07/28 04:33:04 eobrain // Fix bug that was removing dashes from unicode encoding names. We // should do this only for UTF-8. 
// // Revision 1.4 2003/07/17 23:55:28 eobrain // Make compatiblie with J2ME. For example do not use "new" // java.util classes. // // Revision 1.3 2003/01/09 01:05:38 yuhongx // added FixEncoding(). // // Revision 1.2 2002/11/06 02:57:59 eobrain // Organize imputs to removed unused imports. Remove some unused local variables. // // Revision 1.1.1.1 2002/08/19 05:04:00 eobrain // import from HP Labs internal CVS // // Revision 1.14 2002/08/18 04:36:25 eob // Make interface package-private so as not to clutter up the javadoc. // // Revision 1.13 2002/08/17 00:54:14 sermarti // // Revision 1.12 2002/08/05 20:04:32 sermarti // // Revision 1.11 2002/07/25 21:10:15 sermarti // Adding files that mysteriously weren't added from Sparta before. // // Revision 1.10 2002/05/23 22:00:19 eob // Add better error handling. // // Revision 1.9 2002/05/09 17:02:26 eob // Fix NullPointerException in error reporting. // // Revision 1.8 2002/05/09 16:49:52 eob // Add history for better error reporting. // // Revision 1.7 2002/03/21 23:50:49 eob // Deprecate functionality moved to Parser facade class. // // Revision 1.6 2002/02/15 21:30:38 eob // Comment changes only. // // Revision 1.5 2002/02/01 21:55:15 eob // Comment change only. // // Revision 1.4 2002/01/09 00:45:58 eob // Formatting change only. // // Revision 1.3 2002/01/09 00:44:57 eob // Handle CharConversionException caused by reading euc-jp characters // before encoding has been established. Restart parsing. // // Revision 1.2 2002/01/08 19:53:43 eob // Comment change only. // // Revision 1.1 2002/01/08 19:31:33 eob // Factored out ParseSource functionality into ParseCharStream and // ParseByteStream.




© 2015 - 2024 Weber Informatics LLC | Privacy Policy