// info.monitorenter.cpdetector.io.ParsingDetector Maven / Gradle / Ivy
/*
* ParsingDetector.java, a facade to an ANTLR grammar based
* parser / lexer that searches for the "charset" attribute of an
* HTML page.
* Copyright (C) 2004 Achim Westermann, created on 20.07.2004, 10:35:46
*
* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this collection are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is the cpDetector code in [sub] packages info.monitorenter and
* cpdetector.
*
* The Initial Developer of the Original Code is
* Achim Westermann .
*
* Portions created by the Initial Developer are Copyright (c) 2007
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** *
*
* If you modify or optimize the code in a useful way please let me know.
* [email protected]
*/
package info.monitorenter.cpdetector.io;
import antlr.ANTLRException;
import info.monitorenter.cpdetector.io.parser.EncodingLexer;
import info.monitorenter.cpdetector.io.parser.EncodingParser;
import info.monitorenter.io.LimitedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
/**
 * Codepage detector that runs an ANTLR-generated lexer / parser over the
 * beginning of a document and looks for a declared character set (the verbose
 * trace describes it as the "html-charset/xml-encoding attribute").
 * <p>
 * The document bytes are decoded as US-ASCII for the purpose of parsing.
 */
public class ParsingDetector
    extends AbstractCodepageDetector {

  /** Generated serial version identifier. */
  private static final long serialVersionUID = 3618977875919778866L;

  /**
   * If <code>true</code>, progress and failure messages are written to
   * <code>System.out</code>.
   */
  private boolean m_verbose = false;

  /**
   * Creates a silent detector (verbose output switched off).
   */
  public ParsingDetector() {
    this(false);
  }

  /**
   * Creates a detector with the given verbosity.
   *
   * @param verbose
   *          <code>true</code> to print trace messages to
   *          <code>System.out</code> while detecting.
   */
  public ParsingDetector(boolean verbose) {
    super();
    this.m_verbose = verbose;
  }

  /**
   * Parses the given stream for a declared charset name and maps that name to
   * a {@link Charset}.
   * <p>
   * The stream is wrapped in a <code>LimitedInputStream</code> built from
   * <code>in</code> and <code>length</code> (presumably capping the number of
   * bytes read - confirm against that class). This method does not close the
   * stream.
   * <p>
   * Possible results:
   * <ul>
   * <li>the result of <code>Charset.forName(name)</code> if the parser
   * returned a name this JVM supports,</li>
   * <li><code>UnsupportedCharset.forName(name)</code> if the parser returned a
   * name and resolving it failed,</li>
   * <li><code>UnknownCharset.getInstance()</code> if the parser returned no
   * name, or a non-ANTLR exception occurred before a name was available,</li>
   * <li><code>null</code> if an <code>ANTLRException</code> was caught before
   * any result had been assigned (that handler only logs).</li>
   * </ul>
   *
   * @param in
   *          the stream to inspect.
   * @param length
   *          passed to the <code>LimitedInputStream</code> wrapper together
   *          with <code>in</code>.
   * @return the detected charset, one of the placeholder charsets described
   *         above, or <code>null</code>.
   * @throws IOException
   *           declared by the signature; note that exceptions raised inside
   *           the parsing try-block are caught here and not propagated.
   */
  public Charset detectCodepage(final InputStream in, final int length) throws IOException {
    // Constructed outside the try-block on purpose: failures here propagate.
    final InputStream bounded = new LimitedInputStream(in, length);
    if (this.m_verbose) {
      System.out
          .println(" parsing for html-charset/xml-encoding attribute with codepage: US-ASCII");
    }

    String declaredName = null;
    Charset result = null;
    try {
      final EncodingLexer tokens = new EncodingLexer(new InputStreamReader(bounded, "US-ASCII"));
      final EncodingParser grammar = new EncodingParser(tokens);
      declaredName = grammar.htmlDocument();
      if (declaredName == null) {
        // The parser found no charset declaration.
        result = UnknownCharset.getInstance();
      } else {
        try {
          result = Charset.forName(declaredName);
        } catch (UnsupportedCharsetException notInThisJvm) {
          // Keep the declared name even though it cannot be decoded here.
          result = UnsupportedCharset.forName(declaredName);
        }
      }
    } catch (ANTLRException parseFailure) {
      // Only logged; the result is intentionally left as it is (null if
      // nothing was assigned yet).
      if (this.m_verbose) {
        System.out.println(" ANTLR parser exception: " + parseFailure.getMessage());
      }
    } catch (Exception other) {
      if (this.m_verbose) {
        System.out.println(" Decoding Exception: " + other.getMessage()
            + " (unsupported java charset).");
      }
      // Fall back to a placeholder only if no result was produced so far.
      if (result == null) {
        if (declaredName == null) {
          result = UnknownCharset.getInstance();
        } else {
          result = UnsupportedCharset.forName(declaredName);
        }
      }
    }
    return result;
  }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy