info.monitorenter.cpdetector.io.JChardetFacade Maven / Gradle / Ivy
/*
* IClassFileFilter.java cpdetector
* Copyright (C) 2004 Achim Westermann, created on 03.06.2004
*
* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this collection are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is the cpDetector code in [sub] packages info.monitorenter and
* cpdetector.
*
* The Initial Developer of the Original Code is
* Achim Westermann .
*
* Portions created by the Initial Developer are Copyright (c) 2007
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** *
*
* If you modify or optimize the code in a useful way please let me know.
* [email protected]
*/
package info.monitorenter.cpdetector.io;
import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
/**
* A fac�ade for jchardet codepage detection. JChardet is the java port of Frank Yung-Fong Tang's
* Mozilla charset detector.
*
* This charset detector works on guessing the codepage. "The algorithm looks
* into the byte sequence and based on the values of each byte uses a
* elimination logic to narrow down to the final charset. If there is a tie
* between EUC charsets, it uses the second logic to narrow down. This logic
* uses the frequency statistics of characters in a given language." ( source of description ).
*
* It is a singleton for performance reasons (buffer allocation). Because it is
* stateful (internal buffer) the method
* {@link #detectCodepage(java.io.InputStream, int)}(delegated to by
* {@link #detectCodepage(java.net.URL)}has to be synchronized.
*
*
*
* @author Achim Westermann
*
*/
public final class JChardetFacade
extends AbstractCodepageDetector implements nsICharsetDetectionObserver {
private static JChardetFacade instance = null;
private static nsDetector det;
private byte[] buf = new byte[4096];
private Charset codpage = null;
private boolean m_guessing = true;
private int amountOfVerifiers = 0;
/**
*
*/
private JChardetFacade() {
super();
det = new nsDetector(nsPSMDetector.ALL);
det.Init(this);
this.amountOfVerifiers = det.getProbableCharsets().length;
}
public static JChardetFacade getInstance() {
if (instance == null) {
instance = new JChardetFacade();
}
return instance;
}
/*
* (non-Javadoc)
*
* @see cpdetector.io.ICodepageDetector#detectCodepage(java.io.InputStream)
*/
public synchronized Charset detectCodepage(InputStream in, int length) throws IOException {
this.Reset();
int len;
int read = 0;
boolean done = false;
boolean isAscii = true;
Charset ret = null;
do {
len = in.read(buf, 0, Math.min(buf.length, length - read));
if (len > 0) {
read += len;
}
if (!done)
done = det.DoIt(buf, len, false);
} while (len > 0 && !done);
det.DataEnd();
if (this.codpage == null) {
if (this.m_guessing) {
ret = guess();
} else {
ret = UnknownCharset.getInstance();
}
} else {
ret = this.codpage;
}
return ret;
}
/**
*
*/
private Charset guess() {
Charset ret = null;
String[] possibilities = det.getProbableCharsets();
/*
* Detect US-ASCII by the fact, that no exclusion of any Charset was
* possible.
*/
if (possibilities.length == this.amountOfVerifiers) {
ret = Charset.forName("US-ASCII");
} else {
// He should better return an Array of length zero!
String check = possibilities[0];
if (check.equalsIgnoreCase("nomatch")) {
ret = UnknownCharset.getInstance();
} else {
for (int i = 0; ret == null && i < possibilities.length; i++) {
try {
ret = Charset.forName(possibilities[i]);
} catch (UnsupportedCharsetException uce) {
ret = UnsupportedCharset.forName(possibilities[i]);
}
}
}
}
return ret;
}
/**
*
* @see org.mozilla.intl.chardet.nsICharsetDetectionObserver#Notify(String)
*/
public void Notify(final String charset) {
this.codpage = Charset.forName(charset);
}
public void Reset() {
det.Reset();
this.codpage = null;
}
/**
* @return Returns the m_guessing.
*/
public boolean isGuessing() {
return m_guessing;
}
/**
*
* If it was impossible to narrow down possible results to one, an internal
* set of possible character encodings exists. By setting guessing to true,
* the call to {@link #detectCodepage(java.io.InputStream, int)} and
* {@link #detectCodepage(java.net.URL)} will return an arbitrary possible Charset.
*
*
* Currently the following precedence is implemented to choose the possible
* Charset:
*
* - If US-ASCII is possible, it is chosen.
*
- If US-ASCII is not possible, the first supported one in the set of
* possible charsets is returned. No information about the semantics of the
* order in that list is available. If no possibility is supported, an
* instance of {@link UnsupportedCharset} is returned.
*
* ASCII indeed is never detected as possible: No internal verifier exists for
* ASCII, as all Charsets support ASCII. The possibility of ASCII is detected,
* when no Charset has been excluded: The amount of possible Charsets is equal
* to the amount of all detectable Charsets.
*
* @param guessing
* The guessing to set.
*/
public synchronized void setGuessing(final boolean guessing) {
this.m_guessing = guessing;
}
}