info.monitorenter.cpdetector.io.JChardetFacade Maven / Gradle / Ivy

Go to download
/*
 *  JChardetFacade.java  cpdetector
 *  Copyright (C) 2004 Achim Westermann, created on 03.06.2004 
 *
 * ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 * 
 * The contents of this collection are subject to the Mozilla Public License Version 
 * 1.1 (the "License"); you may not use this file except in compliance with 
 * the License. You may obtain a copy of the License at 
 * http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 * 
 * The Original Code is the cpDetector code in [sub] packages info.monitorenter and 
 * cpdetector. 
 * 
 * The Initial Developer of the Original Code is
 * Achim Westermann .
 * 
 * Portions created by the Initial Developer are Copyright (c) 2007 
 * the Initial Developer. All Rights Reserved.
 * 
 * Contributor(s):
 * 
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 * 
 * ***** END LICENSE BLOCK ***** * 
 *  
 * If you modify or optimize the code in a useful way please let me know.
 * [email protected] 
 */
package info.monitorenter.cpdetector.io;

import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;

/**
 * A facade for jchardet codepage detection. JChardet is the Java port of Frank Yung-Fong Tang's
 * Mozilla charset detector.
 * 
 * This charset detector works on guessing the codepage. "The algorithm looks
 * into the byte sequence and based on the values of each byte uses a
 * elimination logic to narrow down to the final charset. If there is a tie
 * between EUC charsets, it uses the second logic to narrow down. This logic
 * uses the frequency statistics of characters in a given language." ( source of description ).
 * 

 * It is a singleton for performance reasons (buffer allocation). Because it is
 * stateful (internal buffer) the method
 * {@link #detectCodepage(java.io.InputStream, int)} (delegated to by
 * {@link #detectCodepage(java.net.URL)}) has to be synchronized.
 * 

 * 
 * 
 * @author Achim Westermann 
 * 
 */
public final class JChardetFacade
    extends AbstractCodepageDetector implements nsICharsetDetectionObserver {
  /** The lazily created singleton, handed out by {@link #getInstance()}. */
  private static JChardetFacade instance = null;

  /** The jchardet detector all work is delegated to; created in the constructor. */
  private static nsDetector det;

  /** Reusable read buffer; the reason why detection has to be synchronized. */
  private byte[] buf = new byte[4096];

  /** The charset reported through {@link #Notify(String)}, or null if none was reported. */
  private Charset codpage = null;

  /** Whether an ambiguous detection result is resolved by {@link #guess()}. */
  private boolean m_guessing = true;

  /**
   * Number of probable charsets reported by a fresh detector (nothing excluded
   * yet). Used by {@link #guess()} to recognize plain US-ASCII input.
   */
  private int amountOfVerifiers = 0;

  /**
   * Creates the detector for all languages, registers this instance as its
   * observer and remembers how many charsets are probable before any input
   * has been seen.
   */
  private JChardetFacade() {
    super();
    det = new nsDetector(nsPSMDetector.ALL);
    det.Init(this);
    this.amountOfVerifiers = det.getProbableCharsets().length;
  }

  /**
   * Returns the sole instance, creating it on first use.
   * 
   * Synchronized so that concurrent first callers cannot create two instances
   * (the constructor replaces the shared static detector).
   * 
   * @return the singleton instance.
   */
  public static synchronized JChardetFacade getInstance() {
    if (instance == null) {
      instance = new JChardetFacade();
    }
    return instance;
  }

  /**
   * Feeds at most <code>length</code> bytes of the given stream to the
   * detector and returns the detected charset.
   * 
   * If the detector did not report a charset, the result is {@link #guess()}
   * when guessing is enabled and the {@link UnknownCharset} instance otherwise.
   * 
   * @param in
   *          the stream to read the bytes to inspect from.
   * @param length
   *          the maximum amount of bytes to read from the stream.
   * @return the charset reported by the detector, a guessed charset or the
   *         {@link UnknownCharset} instance.
   * @throws IOException
   *           if reading from the given stream fails.
   */
  public synchronized Charset detectCodepage(InputStream in, int length) throws IOException {
    this.Reset();
    int len;
    int read = 0;
    boolean done = false;
    do {
      len = in.read(this.buf, 0, Math.min(this.buf.length, length - read));
      // Only hand real data to the detector: at end of stream len is -1 and
      // must not be passed on as a byte count.
      if (len > 0) {
        read += len;
        done = det.DoIt(this.buf, len, false);
      }
    } while (len > 0 && !done);
    det.DataEnd();

    Charset ret;
    if (this.codpage != null) {
      ret = this.codpage;
    } else if (this.m_guessing) {
      ret = this.guess();
    } else {
      ret = UnknownCharset.getInstance();
    }
    return ret;
  }

  /**
   * Picks a charset from the detector's list of probable charsets.
   * 
   * <ol>
   * <li>If no charset has been excluded, US-ASCII is returned.</li>
   * <li>If the detector reports nothing or "nomatch", the
   * {@link UnknownCharset} instance is returned.</li>
   * <li>Otherwise the first probable charset supported by this JVM is
   * returned.</li>
   * <li>If none of them is supported, an {@link UnsupportedCharset} for the
   * first probable charset is returned.</li>
   * </ol>
   * 
   * @return the guessed charset as described above.
   */
  private Charset guess() {
    Charset ret = null;
    String[] possibilities = det.getProbableCharsets();
    /*
     * Detect US-ASCII by the fact, that no exclusion of any Charset was
     * possible.
     */
    if (possibilities.length == this.amountOfVerifiers) {
      ret = Charset.forName("US-ASCII");
    } else if (possibilities.length == 0 || possibilities[0].equalsIgnoreCase("nomatch")) {
      // The detector signals "nothing left" with a single "nomatch" entry;
      // an empty array is treated the same way instead of failing on [0].
      ret = UnknownCharset.getInstance();
    } else {
      for (int i = 0; ret == null && i < possibilities.length; i++) {
        try {
          ret = Charset.forName(possibilities[i]);
        } catch (UnsupportedCharsetException uce) {
          // Not available in this JVM: keep looking for a supported candidate.
        }
      }
      if (ret == null) {
        // No candidate is supported: report the first one as unsupported.
        ret = UnsupportedCharset.forName(possibilities[0]);
      }
    }
    return ret;
  }

  /**
   * Observer callback of the detector (this instance is registered in the
   * constructor): stores the reported charset as the detection result.
   * 
   * A charset name that is unknown to this JVM is stored as
   * {@link UnsupportedCharset} instead of letting the exception escape into
   * the detector, consistent with {@link #guess()}.
   * 
   * @param charset
   *          the name of the charset reported by the detector.
   * 
   * @see org.mozilla.intl.chardet.nsICharsetDetectionObserver#Notify(String)
   */
  public void Notify(final String charset) {
    try {
      this.codpage = Charset.forName(charset);
    } catch (UnsupportedCharsetException uce) {
      this.codpage = UnsupportedCharset.forName(charset);
    }
  }

  /**
   * Resets the underlying detector and forgets the charset reported by the
   * previous detection.
   */
  public void Reset() {
    det.Reset();
    this.codpage = null;
  }

  /**
   * Returns whether guessing is enabled (see {@link #setGuessing(boolean)}).
   * 
   * Synchronized like the setter so that a value set by another thread is
   * visible.
   * 
   * @return true if an ambiguous detection result is resolved by guessing.
   */
  public synchronized boolean isGuessing() {
    return m_guessing;
  }

  /**
   * <p>
   * If it was impossible to narrow down possible results to one, an internal
   * set of possible character encodings exists. By setting guessing to true,
   * the call to {@link #detectCodepage(java.io.InputStream, int)} and
   * {@link #detectCodepage(java.net.URL)} will return an arbitrary possible Charset.
   * </p>
   * <p>
   * Currently the following precedence is implemented to choose the possible
   * Charset:
   * </p>
   * <ol>
   * <li>If US-ASCII is possible, it is chosen.</li>
   * <li>If US-ASCII is not possible, the first supported one in the set of
   * possible charsets is returned. No information about the semantics of the
   * order in that list is available. If no possibility is supported, an
   * instance of {@link UnsupportedCharset} is returned.</li>
   * </ol>
   * <p>
   * ASCII indeed is never detected as possible: No internal verifier exists for
   * ASCII, as all Charsets support ASCII. The possibility of ASCII is detected,
   * when no Charset has been excluded: The amount of possible Charsets is equal
   * to the amount of all detectable Charsets.
   * </p>
   *
   * @param guessing
   *          The guessing to set.
   */
  public synchronized void setGuessing(final boolean guessing) {
    this.m_guessing = guessing;
  }
}