All downloads are free. Search and download functionality uses the official Maven repository.

cicada.chardet.nsPSMDetector Maven / Gradle / Ivy

The newest version!
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

package cicada.chardet;
import java.lang.* ;

public abstract class nsPSMDetector {

   // Language-set selectors accepted by the int constructor / initVerifiers.
   // Each value picks which verifier table is installed into mVerifier.
   public static final int ALL                 =  0 ;
   public static final int JAPANESE            =  1 ;
   public static final int CHINESE             =  2 ;
   public static final int SIMPLIFIED_CHINESE  =  3 ;
   public static final int TRADITIONAL_CHINESE =  4 ;
   public static final int KOREAN              =  5 ;

   // Exclusive upper bound used when validating a language-set selector.
   public static final int NO_OF_LANGUAGES     =  6 ;
   // Capacity of the per-candidate arrays mState and mItemIdx.
   public static final int MAX_VERIFIERS       = 16 ;

   // Candidate verifiers for the selected language set.
   nsVerifier[] mVerifier ;
   // Frequency statistics indexed like mVerifier; entries are null for
   // verifiers that have no statistics, and the whole array may be null.
   nsEUCStatistics[] mStatisticsData ;

   // Byte-frequency sampler consulted by Sample().
   nsEUCSampler	mSampler = new nsEUCSampler() ;
   // Current state of each active candidate slot, as returned by nsVerifier.getNextState.
   byte[]    mState = new byte[MAX_VERIFIERS] ;
   // Maps an active candidate slot to its index in mVerifier / mStatisticsData.
   int[]     mItemIdx = new int[MAX_VERIFIERS] ;

   // Number of candidate slots still active.
   int     mItems ;
   // Initial candidate count; copied into mItems by Reset().
   int	   mClassItems ;
 
   // Set to true once HandleData or Sample has finished (normally after a Report call).
   boolean mDone ;
   // Whether the sampler should still be run for the current input.
   boolean mRunSampler ;
   // Initial value of mRunSampler: true when a statistics table is available.
   boolean mClassRunSampler ;

   /**
    * Creates a detector that uses the {@link #ALL} verifier set.
    * Delegates to the language-flag constructor, which installs the
    * verifiers and then resets the detector state.
    */
   public nsPSMDetector() {
	this( ALL ) ;
   }

   /**
    * Creates a detector for the verifier set selected by {@code langFlag}.
    *
    * @param langFlag one of the language-set constants declared on this
    *                 class ({@link #ALL}, {@link #JAPANESE}, ...); it is
    *                 passed unchanged to {@code initVerifiers}
    */
   public nsPSMDetector(int langFlag) {
	// Install the verifier tables first; Reset() reads the counts they define.
	this.initVerifiers( langFlag ) ;
	this.Reset() ;
   }

   /**
    * Creates a detector from caller-supplied tables instead of one of the
    * built-in language sets.
    *
    * @param aItems         number of candidates to start with; stored as the
    *                       initial item count
    * @param aVerifierSet   verifiers to run
    * @param aStatisticsSet statistics table, or {@code null}; sampling is
    *                       enabled only when this is non-null
    */
   public nsPSMDetector(int aItems, nsVerifier[] aVerifierSet,
					nsEUCStatistics[] aStatisticsSet)  {
	this.mVerifier = aVerifierSet ;
	this.mStatisticsData = aStatisticsSet ;
	this.mClassItems = aItems ;
	this.mClassRunSampler = ( null != aStatisticsSet ) ;

	// Must run last: Reset() copies the class-level values assigned above.
	this.Reset() ;
   }
   

   public void Reset() {
	mRunSampler = mClassRunSampler ;
	mDone = false ;
	mItems = mClassItems ;

	for(int i=0; i=0 && currVerSet < NO_OF_LANGUAGES ) {
	   currVerifierSet = currVerSet ;
	}
	else {
	   currVerifierSet = nsPSMDetector.ALL ;
	}

	mVerifier = null ;
	mStatisticsData = null ;

	if ( currVerifierSet == nsPSMDetector.TRADITIONAL_CHINESE ) {

	   mVerifier = new nsVerifier[] {
      		new nsUTF8Verifier(),
      		new nsBIG5Verifier(),
      		new nsISO2022CNVerifier(),
      		new nsEUCTWVerifier(),
      		new nsCP1252Verifier(),
      		new nsUCS2BEVerifier(),
      		new nsUCS2LEVerifier()
	   };

	   mStatisticsData = new nsEUCStatistics[] {
      		null,
      		new Big5Statistics(),
      		null,
      		new EUCTWStatistics(),
      		null,
      		null,
      		null
	   };
	}

	//==========================================================
	else if ( currVerifierSet == nsPSMDetector.KOREAN ) {

	   mVerifier = new nsVerifier[] {
      		new nsUTF8Verifier(),
      		new nsEUCKRVerifier(),
      		new nsISO2022KRVerifier(),
      		new nsCP1252Verifier(),
      		new nsUCS2BEVerifier(),
      		new nsUCS2LEVerifier()
	   };
	}

	//==========================================================
	else if ( currVerifierSet == nsPSMDetector.SIMPLIFIED_CHINESE ) {

	   mVerifier = new nsVerifier[] {
      		new nsUTF8Verifier(),
      		new nsGB2312Verifier(),
      		new nsGB18030Verifier(),
      		new nsISO2022CNVerifier(),
      		new nsHZVerifier(),
      		new nsCP1252Verifier(),
      		new nsUCS2BEVerifier(),
      		new nsUCS2LEVerifier()
	   };
	}

	//==========================================================
	else if ( currVerifierSet == nsPSMDetector.JAPANESE ) {

	   mVerifier = new nsVerifier[] {
      		new nsUTF8Verifier(),
      		new nsSJISVerifier(),
      		new nsEUCJPVerifier(),
      		new nsISO2022JPVerifier(),
      		new nsCP1252Verifier(),
      		new nsUCS2BEVerifier(),
      		new nsUCS2LEVerifier()
	   };
	}
	//==========================================================
	else if ( currVerifierSet == nsPSMDetector.CHINESE ) {

	   mVerifier = new nsVerifier[] {
      		new nsUTF8Verifier(),
      		new nsGB2312Verifier(),
      		new nsGB18030Verifier(),
      		new nsBIG5Verifier(),
      		new nsISO2022CNVerifier(),
      		new nsHZVerifier(),
      		new nsEUCTWVerifier(),
      		new nsCP1252Verifier(),
      		new nsUCS2BEVerifier(),
      		new nsUCS2LEVerifier()
	   };

	   mStatisticsData = new nsEUCStatistics[] {
      		null,
      		new GB2312Statistics(),
		null,
      		new Big5Statistics(),
      		null,
      		null,
      		new EUCTWStatistics(),
      		null,
      		null,
      		null
	   };
	}

	//==========================================================
	else if ( currVerifierSet == nsPSMDetector.ALL ) {

	   mVerifier = new nsVerifier[] {
      		new nsUTF8Verifier(),
      		new nsSJISVerifier(),
      		new nsEUCJPVerifier(),
      		new nsISO2022JPVerifier(),
      		new nsEUCKRVerifier(),
      		new nsISO2022KRVerifier(),
      		new nsBIG5Verifier(),
      		new nsEUCTWVerifier(),
      		new nsGB2312Verifier(),
      		new nsGB18030Verifier(),
      		new nsISO2022CNVerifier(),
      		new nsHZVerifier(),
      		new nsCP1252Verifier(),
      		new nsUCS2BEVerifier(),
      		new nsUCS2LEVerifier()
	   };

	   mStatisticsData = new nsEUCStatistics[] {
      		null,
      		null,
      		new EUCJPStatistics(),
      		null,
      		new EUCKRStatistics(),
      		null,
      		new Big5Statistics(),
      		new EUCTWStatistics(),
      		new GB2312Statistics(),
      		null,
      		null,
      		null,
      		null,
      		null,
      		null
	   };
	}

	mClassRunSampler = ( mStatisticsData != null ) ;
       	mClassItems = mVerifier.length ;

   }
	  
   public abstract void Report(String charset) ;

   public boolean HandleData(byte[] aBuf, int len) {


	int i,j;
	byte b, st;

 	for( i=0; i < len; i++) {
	   b = aBuf[i] ;

	   for (j=0; j < mItems; )
	   {
		st = nsVerifier.getNextState( mVerifier[mItemIdx[j]], 
						b, mState[j]) ;
//if (st != 0)
//System.out.println( "state(0x" + Integer.toHexString(0xFF&b) +") =>"+ Integer.toHexString(st&0xFF)+ " " + mVerifier[mItemIdx[j]].charset());

		if (st == nsVerifier.eItsMe) {

//System.out.println( "eItsMe(0x" + Integer.toHexString(0xFF&b) +") =>"+ mVerifier[mItemIdx[j]].charset());

		   Report( mVerifier[mItemIdx[j]].charset() );
		   mDone = true ;
		   return mDone ;

	        } else if (st == nsVerifier.eError ) {

//System.out.println( "eNotMe(0x" + Integer.toHexString(0xFF&b) +") =>"+ mVerifier[mItemIdx[j]].charset());
		   mItems--;
		   if (j < mItems ) {
			mItemIdx[j] = mItemIdx[mItems];	
			mState[j]   = mState[mItems];
		   }

		} else {
		  
		    mState[j++] = st ;

		}
	   }

	   if ( mItems <= 1 ) {

	        if( 1 == mItems) {
		   Report( mVerifier[mItemIdx[0]].charset() );
		}
		mDone = true ;
		return mDone ;

	   } 
	   else {
		
		int nonUCS2Num=0;
		int nonUCS2Idx=0;

		for(j=0; j 1) ;
	
     	if (mRunSampler) {
            mRunSampler = mSampler.Sample(aBuf, aLen);
            if(((aLastChance && mSampler.GetSomeData()) || 
                mSampler.EnoughData())
               && (eucNum == possibleCandidateNum)) {
              mSampler.CalFreq();

              int bestIdx = -1;
              int eucCnt=0;
              float bestScore = 0.0f;
              for(j = 0; j < mItems; j++) {
                 if((null != mStatisticsData[mItemIdx[j]])  &&
                   (!(mVerifier[mItemIdx[j]].charset()).equals("Big5")))
                 {
                    float score = mSampler.GetScore(
                       mStatisticsData[mItemIdx[j]].mFirstByteFreq(),
                       mStatisticsData[mItemIdx[j]].mFirstByteWeight(),
                       mStatisticsData[mItemIdx[j]].mSecondByteFreq(),
                       mStatisticsData[mItemIdx[j]].mSecondByteWeight() );
//System.out.println("FequencyScore("+mVerifier[mItemIdx[j]].charset()+")= "+ score);
                    if(( 0 == eucCnt++) || (bestScore > score )) {
                       bestScore = score;
                       bestIdx = j;
                    } // if(( 0 == eucCnt++) || (bestScore > score )) 
                } // if(null != ...)
             } // for
             if (bestIdx >= 0)
             {
               Report( mVerifier[mItemIdx[bestIdx]].charset());
               mDone = true;
             }
           } // if (eucNum == possibleCandidateNum)
         } // if(mRunSampler)
   }

   public String[] getProbableCharsets() {

	if (mItems <= 0) {
	   String[] nomatch = new String[1];
	   nomatch[0] = "nomatch" ;
	   return nomatch ;
	}

	String ret[] = new String[mItems] ;
	for (int i=0; i




© 2015 - 2024 Weber Informatics LLC | Privacy Policy