All Downloads are FREE. Search and download functionalities are using the official Maven repository.

bboss.org.mozilla.intl.chardet.nsPSMDetector Maven / Gradle / Ivy

Go to download

bboss is a J2EE framework that includes AOP/IoC, MVC, persistence, taglib, RPC, events, bean-XML serialization, and so on. http://www.bbossgroups.com

There is a newer version: 6.2.7
Show newest version
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 *
 * The contents of this file are subject to the Netscape Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is Netscape
 * Communications Corporation.  Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All
 * Rights Reserved.
 *
 * Contributor(s):
 */

package bboss.org.mozilla.intl.chardet ;
import java.lang.* ;

public abstract class nsPSMDetector {

   // Language flags. A flag is handed to the int constructor, which forwards
   // it to initVerifiers(); the verifier-table code compares it against these
   // values to pick which set of charset verifiers to install.
   public static final int ALL                 =  0 ;
   public static final int JAPANESE            =  1 ;
   public static final int CHINESE             =  2 ;
   public static final int SIMPLIFIED_CHINESE  =  3 ;
   public static final int TRADITIONAL_CHINESE =  4 ;
   public static final int KOREAN              =  5 ;

   // Number of language flags above; used as the exclusive upper bound when a
   // flag is range-checked.
   public static final int NO_OF_LANGUAGES     =  6 ;
   // Capacity of the per-candidate arrays mState and mItemIdx. The largest
   // verifier set built in this file (ALL) has 15 entries.
   public static final int MAX_VERIFIERS       = 16 ;

   // Candidate charset verifiers, and the statistics that go with them.
   // mStatisticsData is indexed with the same index as mVerifier; individual
   // entries may be null, and the whole array is null for sets that carry no
   // statistics.
   nsVerifier[] mVerifier ;
   nsEUCStatistics[] mStatisticsData ;

   // Sampler consulted by the frequency-scoring code while mRunSampler is true.
   nsEUCSampler	mSampler = new nsEUCSampler() ;
   // Per-active-candidate data: mState[j] is the state last returned by
   // nsVerifier.getNextState for slot j, and mItemIdx[j] is that slot's index
   // into mVerifier / mStatisticsData.
   byte[]    mState = new byte[MAX_VERIFIERS] ;
   int[]     mItemIdx = new int[MAX_VERIFIERS] ;

   // mItems: number of candidates still active (decremented when a verifier
   // reports eError). mClassItems: the starting count that Reset() copies
   // into mItems.
   int     mItems ;
   int	   mClassItems ;
 
   // mDone: set once a charset has been reported or at most one candidate is left.
   // mRunSampler: whether sampling is currently enabled; Reset() reloads it
   // from mClassRunSampler.
   // mClassRunSampler: true when a statistics table is available.
   boolean mDone ;
   boolean mRunSampler ;
   boolean mClassRunSampler ;

   /**
    * Creates a detector that considers every supported language group,
    * i.e. behaves as if constructed with the {@link #ALL} flag.
    */
   public nsPSMDetector() {
	this( nsPSMDetector.ALL ) ;
   }

   /**
    * Creates a detector for one language group.
    *
    * @param langFlag one of the language constants declared on this class
    *                 (for example {@link #JAPANESE}); it is passed unchanged
    *                 to initVerifiers, after which the detector is reset
    */
   public nsPSMDetector(int langFlag) {
	this.initVerifiers( langFlag ) ;
	this.Reset() ;
   }

   /**
    * Creates a detector from caller-supplied tables instead of one of the
    * built-in language sets.
    *
    * @param aItems         initial number of active candidates
    * @param aVerifierSet   verifiers to run
    * @param aStatisticsSet statistics table used by the sampler; when it is
    *                       {@code null} the sampler is disabled
    */
   public nsPSMDetector(int aItems, nsVerifier[] aVerifierSet, 
					nsEUCStatistics[] aStatisticsSet)  {
	this.mVerifier = aVerifierSet ;
	this.mStatisticsData = aStatisticsSet ;
	this.mClassItems = aItems ;

	// Sampling only makes sense when there is a statistics table to score against.
	this.mClassRunSampler = ( null != aStatisticsSet ) ;

	this.Reset() ;
   }
   

   /**
    * Restores the per-run detection state from the class-level defaults:
    * the sampler flag, the done flag and the active-candidate count.
    *
    * NOTE(review): the "for" header a few lines below is not valid Java. The
    * text between its "i" and the following "=0" appears to have been lost,
    * most likely when everything between a '<' and a '>' was stripped from a
    * web page. The missing text presumably held the rest of that loop, the
    * end of Reset() and the header of the method that builds the verifier
    * tables (the constructors call it as initVerifiers(int)). Restore it from
    * the upstream source rather than guessing.
    */
   public void Reset() {
	mRunSampler = mClassRunSampler ;
	mDone = false ;
	mItems = mClassItems ;

	for(int i=0; i=0 && currVerSet < NO_OF_LANGUAGES ) {
	   currVerifierSet = currVerSet ;
	}
	else {
	   // Taken when the range check above fails: fall back to the full set.
	   currVerifierSet = nsPSMDetector.ALL ;
	}

	// Start from empty tables; the branch matching currVerifierSet fills them in.
	mVerifier = null ;
	mStatisticsData = null ;

	if ( currVerifierSet == nsPSMDetector.TRADITIONAL_CHINESE ) {

	   mVerifier = new nsVerifier[] {
      		new nsUTF8Verifier(),
      		new nsBIG5Verifier(),
      		new nsISO2022CNVerifier(),
      		new nsEUCTWVerifier(),
      		new nsCP1252Verifier(),
      		new nsUCS2BEVerifier(),
      		new nsUCS2LEVerifier()
	   };

	   // One slot per verifier above, in the same order; null where no
	   // statistics are supplied for that verifier.
	   mStatisticsData = new nsEUCStatistics[] {
      		null,
      		new Big5Statistics(),
      		null,
      		new EUCTWStatistics(),
      		null,
      		null,
      		null
	   };
	}

	//==========================================================
	else if ( currVerifierSet == nsPSMDetector.KOREAN ) {

	   // No statistics table for this set: mStatisticsData stays null, so
	   // mClassRunSampler ends up false.
	   mVerifier = new nsVerifier[] {
      		new nsUTF8Verifier(),
      		new nsEUCKRVerifier(),
      		new nsISO2022KRVerifier(),
      		new nsCP1252Verifier(),
      		new nsUCS2BEVerifier(),
      		new nsUCS2LEVerifier()
	   };
	}

	//==========================================================
	else if ( currVerifierSet == nsPSMDetector.SIMPLIFIED_CHINESE ) {

	   // No statistics table for this set either.
	   mVerifier = new nsVerifier[] {
      		new nsUTF8Verifier(),
      		new nsGB2312Verifier(),
      		new nsGB18030Verifier(),
      		new nsISO2022CNVerifier(),
      		new nsHZVerifier(),
      		new nsCP1252Verifier(),
      		new nsUCS2BEVerifier(),
      		new nsUCS2LEVerifier()
	   };
	}

	//==========================================================
	else if ( currVerifierSet == nsPSMDetector.JAPANESE ) {

	   // No statistics table for this set either.
	   mVerifier = new nsVerifier[] {
      		new nsUTF8Verifier(),
      		new nsSJISVerifier(),
      		new nsEUCJPVerifier(),
      		new nsISO2022JPVerifier(),
      		new nsCP1252Verifier(),
      		new nsUCS2BEVerifier(),
      		new nsUCS2LEVerifier()
	   };
	}
	//==========================================================
	else if ( currVerifierSet == nsPSMDetector.CHINESE ) {

	   mVerifier = new nsVerifier[] {
      		new nsUTF8Verifier(),
      		new nsGB2312Verifier(),
      		new nsGB18030Verifier(),
      		new nsBIG5Verifier(),
      		new nsISO2022CNVerifier(),
      		new nsHZVerifier(),
      		new nsEUCTWVerifier(),
      		new nsCP1252Verifier(),
      		new nsUCS2BEVerifier(),
      		new nsUCS2LEVerifier()
	   };

	   // Same length and order as the verifier list above; null where no
	   // statistics are supplied for that verifier.
	   mStatisticsData = new nsEUCStatistics[] {
      		null,
      		new GB2312Statistics(),
		null,
      		new Big5Statistics(),
      		null,
      		null,
      		new EUCTWStatistics(),
      		null,
      		null,
      		null
	   };
	}

	//==========================================================
	else if ( currVerifierSet == nsPSMDetector.ALL ) {

	   mVerifier = new nsVerifier[] {
      		new nsUTF8Verifier(),
      		new nsSJISVerifier(),
      		new nsEUCJPVerifier(),
      		new nsISO2022JPVerifier(),
      		new nsEUCKRVerifier(),
      		new nsISO2022KRVerifier(),
      		new nsBIG5Verifier(),
      		new nsEUCTWVerifier(),
      		new nsGB2312Verifier(),
      		new nsGB18030Verifier(),
      		new nsISO2022CNVerifier(),
      		new nsHZVerifier(),
      		new nsCP1252Verifier(),
      		new nsUCS2BEVerifier(),
      		new nsUCS2LEVerifier()
	   };

	   // Same length and order as the verifier list above; null where no
	   // statistics are supplied for that verifier.
	   mStatisticsData = new nsEUCStatistics[] {
      		null,
      		null,
      		new EUCJPStatistics(),
      		null,
      		new EUCKRStatistics(),
      		null,
      		new Big5Statistics(),
      		new EUCTWStatistics(),
      		new GB2312Statistics(),
      		null,
      		null,
      		null,
      		null,
      		null,
      		null
	   };
	}

	// The sampler is enabled only when a statistics table was installed above.
	mClassRunSampler = ( mStatisticsData != null ) ;
	// Every verifier in the chosen set starts out as an active candidate;
	// Reset() copies this count into mItems.
       	mClassItems = mVerifier.length ;

   }
	  
   public abstract void Report(String charset) ;

   public boolean HandleData(byte[] aBuf, int len) {


	int i,j;
	byte b, st;

 	for( i=0; i < len; i++) {
	   b = aBuf[i] ;

	   for (j=0; j < mItems; )
	   {
		st = nsVerifier.getNextState( mVerifier[mItemIdx[j]], 
						b, mState[j]) ;
//if (st != 0)
//System.out.println( "state(0x" + Integer.toHexString(0xFF&b) +") =>"+ Integer.toHexString(st&0xFF)+ " " + mVerifier[mItemIdx[j]].charset());

		if (st == nsVerifier.eItsMe) {

//System.out.println( "eItsMe(0x" + Integer.toHexString(0xFF&b) +") =>"+ mVerifier[mItemIdx[j]].charset());

		   Report( mVerifier[mItemIdx[j]].charset() );
		   mDone = true ;
		   return mDone ;

	        } else if (st == nsVerifier.eError ) {

//System.out.println( "eNotMe(0x" + Integer.toHexString(0xFF&b) +") =>"+ mVerifier[mItemIdx[j]].charset());
		   mItems--;
		   if (j < mItems ) {
			mItemIdx[j] = mItemIdx[mItems];	
			mState[j]   = mState[mItems];
		   }

		} else {
		  
		    mState[j++] = st ;

		}
	   }

	   if ( mItems <= 1 ) {

	        if( 1 == mItems) {
		   Report( mVerifier[mItemIdx[0]].charset() );
		}
		mDone = true ;
		return mDone ;

	   } 
	   else {
		
		int nonUCS2Num=0;
		int nonUCS2Idx=0;

		for(j=0; j 1) ;
	
     	if (mRunSampler) {
            mRunSampler = mSampler.Sample(aBuf, aLen);
            if(((aLastChance && mSampler.GetSomeData()) || 
                mSampler.EnoughData())
               && (eucNum == possibleCandidateNum)) {
              mSampler.CalFreq();

              int bestIdx = -1;
              int eucCnt=0;
              float bestScore = 0.0f;
              for(j = 0; j < mItems; j++) {
                 if((null != mStatisticsData[mItemIdx[j]])  &&
                   (!(mVerifier[mItemIdx[j]].charset()).equals("Big5")))
                 {
                    float score = mSampler.GetScore(
                       mStatisticsData[mItemIdx[j]].mFirstByteFreq(),
                       mStatisticsData[mItemIdx[j]].mFirstByteWeight(),
                       mStatisticsData[mItemIdx[j]].mSecondByteFreq(),
                       mStatisticsData[mItemIdx[j]].mSecondByteWeight() );
//System.out.println("FequencyScore("+mVerifier[mItemIdx[j]].charset()+")= "+ score);
                    if(( 0 == eucCnt++) || (bestScore > score )) {
                       bestScore = score;
                       bestIdx = j;
                    } // if(( 0 == eucCnt++) || (bestScore > score )) 
                } // if(null != ...)
             } // for
             if (bestIdx >= 0)
             {
               Report( mVerifier[mItemIdx[bestIdx]].charset());
               mDone = true;
             }
           } // if (eucNum == possibleCandidateNum)
         } // if(mRunSampler)
   }

   public String[] getProbableCharsets() {

	if (mItems <= 0) {
	   String[] nomatch = new String[1];
	   nomatch[0] = "nomatch" ;
	   return nomatch ;
	}

	String ret[] = new String[mItems] ;
	for (int i=0; i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy