All Downloads are FREE. Search and download functionalities are using the official Maven repository.

info.monitorenter.cpdetector.io.JChardetFacade Maven / Gradle / Ivy

/*
 *  JChardetFacade.java  cpdetector
 *  Copyright (C) 2004 Achim Westermann, created on 03.06.2004 
 *
 * ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 * 
 * The contents of this collection are subject to the Mozilla Public License Version 
 * 1.1 (the "License"); you may not use this file except in compliance with 
 * the License. You may obtain a copy of the License at 
 * http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 * 
 * The Original Code is the cpDetector code in [sub] packages info.monitorenter and 
 * cpdetector. 
 * 
 * The Initial Developer of the Original Code is
 * Achim Westermann .
 * 
 * Portions created by the Initial Developer are Copyright (c) 2007 
 * the Initial Developer. All Rights Reserved.
 * 
 * Contributor(s):
 * 
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 * 
 * ***** END LICENSE BLOCK ***** * 
 *  
 * If you modify or optimize the code in a useful way please let me know.
 * [email protected] 
 */
package info.monitorenter.cpdetector.io;

import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;

/**
 * A facade for jchardet codepage detection. JChardet is the Java port of Frank Yung-Fong Tang's
 * Mozilla charset detector.
 *
 * <p>
 * This charset detector works on guessing the codepage. "The algorithm looks into the byte
 * sequence and based on the values of each byte uses a elimination logic to narrow down to the
 * final charset. If there is a tie between EUC charsets, it uses the second logic to narrow
 * down. This logic uses the frequency statistics of characters in a given language."
 * </p>
 *
 * <p>
 * It is a singleton for performance reasons (buffer allocation). Because it is stateful
 * (internal buffer) the method {@link #detectCodepage(java.io.InputStream, int)} (delegated to
 * by {@link #detectCodepage(java.net.URL)}) has to be synchronized.
 * </p>
 *
 * @author Achim Westermann
 */
public final class JChardetFacade extends AbstractCodepageDetector implements nsICharsetDetectionObserver {

  /** The lazily created singleton instance. */
  private static JChardetFacade instance = null;

  /** The jchardet detector all work is delegated to; created together with the singleton. */
  private static nsDetector det;

  /** Reusable read buffer; the reason why detection has to be synchronized. */
  private final byte[] buf = new byte[4096];

  /** The charset reported by the detector through {@link #Notify(String)}, or null. */
  private Charset codepage = null;

  /** Whether an ambiguous detection result is resolved by {@link #guess()}. */
  private boolean guessing = true;

  /** Number of charsets the freshly initialised detector considers possible. */
  private int amountOfVerifiers = 0;

  /**
   * Creates the detector for all languages, registers this instance as its observer and
   * remembers how many charsets are possible before any data has been fed.
   */
  private JChardetFacade() {
    super();
    det = new nsDetector(nsPSMDetector.ALL);
    det.Init(this);
    this.amountOfVerifiers = det.getProbableCharsets().length;
  }

  /**
   * Returns the sole instance, creating it on first use.
   *
   * <p>
   * Synchronized so that concurrent first callers cannot create two instances (the second
   * constructor call would replace the shared static detector).
   * </p>
   *
   * @return the singleton instance.
   */
  public static synchronized JChardetFacade getInstance() {
    if (instance == null) {
      instance = new JChardetFacade();
    }
    return instance;
  }

  /**
   * Feeds up to <code>length</code> bytes of the given stream to the detector and returns the
   * detected charset.
   *
   * <p>
   * If the detector reported a charset, that one is returned. Otherwise the result of
   * {@link #guess()} is returned if guessing is enabled, else the {@link UnknownCharset}
   * instance.
   * </p>
   *
   * @param in
   *          the stream to read the bytes to inspect from.
   * @param length
   *          the maximum amount of bytes to read from the stream.
   * @return the detected (or guessed) charset.
   * @throws IOException
   *           if reading from the stream fails.
   * @see cpdetector.io.ICodepageDetector#detectCodepage(java.io.InputStream)
   */
  public synchronized Charset detectCodepage(InputStream in, int length) throws IOException {
    this.Reset();
    int read = 0;
    int len;
    boolean done = false;
    do {
      len = in.read(this.buf, 0, Math.min(this.buf.length, length - read));
      // read() returns -1 at the end of the stream (and 0 once the byte budget is used up):
      // only hand the detector chunks that actually contain data.
      if (len > 0) {
        read += len;
        done = det.DoIt(this.buf, len, false);
      }
    } while (len > 0 && !done);
    // May trigger a final Notify(String) call.
    det.DataEnd();

    if (this.codepage != null) {
      return this.codepage;
    }
    if (this.guessing) {
      return guess();
    }
    return UnknownCharset.getInstance();
  }

  /**
   * Chooses a charset from the detector's remaining candidates.
   *
   * <ol>
   * <li>If no candidate has been excluded, US-ASCII is returned.</li>
   * <li>If the detector reports "nomatch" (or nothing at all), the {@link UnknownCharset}
   * instance is returned.</li>
   * <li>Otherwise the first candidate supported by this JVM is returned.</li>
   * <li>If no candidate is supported, an {@link UnsupportedCharset} for the first candidate is
   * returned.</li>
   * </ol>
   *
   * @return the guessed charset, never null.
   */
  private Charset guess() {
    String[] possibilities = det.getProbableCharsets();
    // Detect US-ASCII by the fact, that no exclusion of any Charset was possible.
    if (possibilities.length == this.amountOfVerifiers) {
      return Charset.forName("US-ASCII");
    }
    // The detector signals "nothing left" with a single "nomatch" entry; an empty array is
    // treated the same way instead of failing on the index access.
    if (possibilities.length == 0 || possibilities[0].equalsIgnoreCase("nomatch")) {
      return UnknownCharset.getInstance();
    }
    for (int i = 0; i < possibilities.length; i++) {
      try {
        return Charset.forName(possibilities[i]);
      } catch (UnsupportedCharsetException ignored) {
        // Not available in this JVM: keep looking for a supported candidate.
      }
    }
    return UnsupportedCharset.forName(possibilities[0]);
  }

  /**
   * Callback of the detector: stores the charset it has decided for.
   *
   * <p>
   * A charset name this JVM does not support is stored as {@link UnsupportedCharset} (the same
   * mapping {@link #guess()} uses) instead of throwing through the detector.
   * </p>
   *
   * @param charset
   *          the name of the detected charset.
   * @see org.mozilla.intl.chardet.nsICharsetDetectionObserver#Notify(String)
   */
  public void Notify(final String charset) {
    try {
      this.codepage = Charset.forName(charset);
    } catch (UnsupportedCharsetException uce) {
      this.codepage = UnsupportedCharset.forName(charset);
    }
  }

  /**
   * Resets the underlying detector and forgets the last reported charset.
   */
  public void Reset() {
    det.Reset();
    this.codepage = null;
  }

  /**
   * Returns whether ambiguous detection results are resolved by guessing.
   *
   * @return true if guessing is enabled.
   */
  public synchronized boolean isGuessing() {
    return this.guessing;
  }

  /**
   * <p>
   * If it was impossible to narrow down possible results to one, an internal set of possible
   * character encodings exists. By setting guessing to true, the call to
   * {@link #detectCodepage(java.io.InputStream, int)} and
   * {@link #detectCodepage(java.net.URL)} will return an arbitrary possible Charset.
   * </p>
   * <p>
   * Currently the following precedence is implemented to choose the possible Charset:
   * </p>
   * <ol>
   * <li>If US-ASCII is possible, it is chosen.</li>
   * <li>If US-ASCII is not possible, the first supported one in the set of possible charsets is
   * returned. No information about the semantics of the order in that list is available. If no
   * possibility is supported, an instance of {@link UnsupportedCharset} is returned.</li>
   * </ol>
   * <p>
   * ASCII indeed is never detected as possible: No internal verifier exists for ASCII, as all
   * Charsets support ASCII. The possibility of ASCII is detected, when no Charset has been
   * excluded: The amount of possible Charsets is equal to the amount of all detectable
   * Charsets.
   * </p>
   *
   * @param guessing
   *          The guessing to set.
   */
  public synchronized void setGuessing(final boolean guessing) {
    this.guessing = guessing;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy