All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.scudata.expression.fn.CharEncodingDetect Maven / Gradle / Ivy

Go to download

SPL(Structured Process Language) A programming language specially for structured data computing.

There is a newer version: 20240823
Show newest version
package com.scudata.expression.fn;

/**
 * 
 *  Copyright (C) <2009>  
 *
 *   This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

public class CharEncodingDetect extends Encoding {
	/**
	 * Language hint for {@link #detectEncoding(byte[], int)}. The hint is passed
	 * as an int and compared against each constant's {@code ordinal()}, so the
	 * declaration order below is part of the API and must not change. ALL has
	 * ordinal 0, which is also the hint the no-hint overloads pass.
	 */
	public static enum LANG {
		ALL, CHINESE, SIMPLIFIED_CHINESE, TRADITIONAL_CHINESE, JAPANESE, KOREAN
	};

	/** Score of each encoding from the most recent detection, indexed by encoding constant. */
	int[] scores;
	/** When true, every positive score is echoed to System.err during detection. */
	public boolean debug;

	/** Creates a detector with debug output off and one score slot per encoding type. */
	public CharEncodingDetect() {
		this.scores = new int[TOTALTYPES];
		this.debug = false;
	}

	/**
	 * Command-line entry point: prints the detected encoding name for each
	 * argument. Arguments starting with "http://" or "https://" are fetched as
	 * URLs, "-d" switches on debug output for the arguments that follow, and
	 * anything else is treated as a file path.
	 *
	 * @param argc command-line arguments
	 */
	public static void main(String argc[]) {
		CharEncodingDetect sinodetector = new CharEncodingDetect();
		for (int i = 0; i < argc.length; i++) {
			String arg = argc[i];
			if (arg.equals("-d")) {
				sinodetector.debug = true;
				continue;
			}
			// Start from OTHER for every argument so a failed lookup never
			// reports the result of the previous argument.
			int result = OTHER;
			if (arg.startsWith("http://") || arg.startsWith("https://")) {
				try {
					result = sinodetector.detectEncoding(new URL(arg));
				} catch (Exception e) {
					System.err.println("Bad URL " + e.toString());
				}
			} else {
				result = sinodetector.detectEncoding(new File(arg));
			}
			// detectEncoding(URL) reports failure as -1, which is not a valid
			// index into nicename.
			if (result < 0 || result >= nicename.length) {
				result = OTHER;
			}
			System.out.println(nicename[result]);
		}
	}

	/**
	 * Reads up to 10000 bytes from the URL and guesses their encoding. Each
	 * candidate encoding is scored and the best-scoring one is returned.
	 *
	 * @param testurl location of the text to examine
	 * @return the index of the best-scoring encoding (OTHER when nothing scores
	 *         above the threshold), or -1 if the content could not be loaded
	 */
	public int detectEncoding(URL testurl) {
		// The buffer is handed to the detector at its full size, exactly as
		// before; unread positions stay zero.
		byte[] rawtext = new byte[10000];
		int byteoffset = 0;
		InputStream chinesestream = null;
		try {
			chinesestream = testurl.openStream();
			int bytesread;
			while ((bytesread = chinesestream.read(rawtext, byteoffset, rawtext.length - byteoffset)) > 0) {
				byteoffset += bytesread;
			}
			return detectEncodingLang(rawtext, 0);
		} catch (Exception e) {
			System.err.println("Error loading or using URL " + e.toString());
			return -1;
		} finally {
			// Close on every path so a failed read does not leak the connection.
			if (chinesestream != null) {
				try {
					chinesestream.close();
				} catch (Exception ignored) {
					// Nothing useful can be done if close itself fails.
				}
			}
		}
	}

	/**
	 * Reads the whole file into memory and guesses its encoding. Each candidate
	 * encoding is scored and the best-scoring one is returned.
	 * <p>
	 * If the file cannot be opened or read, the error is printed to System.err
	 * and detection still runs on whatever bytes were read (unread positions
	 * are zero).
	 *
	 * @param testfile file to examine
	 * @return the index of the best-scoring encoding, or OTHER when nothing
	 *         scores above the threshold
	 */
	public int detectEncoding(File testfile) {
		byte[] rawtext = new byte[(int) testfile.length()];
		FileInputStream chinesefile = null;
		try {
			chinesefile = new FileInputStream(testfile);
			// A single read() may return fewer bytes than requested, so keep
			// reading until the buffer is full or the stream ends.
			int filled = 0;
			int bytesread;
			while (filled < rawtext.length
					&& (bytesread = chinesefile.read(rawtext, filled, rawtext.length - filled)) > 0) {
				filled += bytesread;
			}
		} catch (Exception e) {
			System.err.println("Error: " + e);
		} finally {
			// Close on every path so a failed read does not leak the handle.
			if (chinesefile != null) {
				try {
					chinesefile.close();
				} catch (Exception ignored) {
					// Nothing useful can be done if close itself fails.
				}
			}
		}
		return detectEncodingLang(rawtext, 0);
	}

	/**
	 * Guesses the encoding of a byte array without a language hint. Delegates
	 * to detectEncodingLang with hint 0, which matches none of the
	 * language-specific branches and therefore scores every encoding.
	 *
	 * @param rawtext bytes to examine
	 * @return the index of the best-scoring encoding, or OTHER when no score
	 *         exceeds 50
	 */
	public int detectEncoding(byte[] rawtext) {
		return detectEncodingLang(rawtext, 0);
	}

	/**
	 * Guesses the encoding of a byte array, restricting the candidates by a
	 * language hint.
	 *
	 * @param rawtext bytes to examine
	 * @param lang    ordinal of a LANG constant; values that match no
	 *                language-specific branch score every encoding
	 * @return the index of the best-scoring encoding, or OTHER when no score
	 *         exceeds 50
	 */
	public int detectEncoding(byte[] rawtext, int lang) {
		return detectEncodingLang(rawtext, lang);
	}

	/**
	 * Scores the candidate encodings selected by the language hint and returns
	 * the best one.
	 * <p>
	 * The score table is cleared first. The language-specific branches fill
	 * only a subset of it, and without the reset a score left behind by an
	 * earlier call on this instance could win.
	 *
	 * @param rawtext bytes to examine
	 * @param lang    ordinal of a LANG constant; any value not matched below is
	 *                treated as LANG.ALL
	 * @return the index of the highest-scoring encoding (the first one on a
	 *         tie), or OTHER when the highest score is 50 or less
	 */
	private int detectEncodingLang(byte[] rawtext, int lang) {
		java.util.Arrays.fill(scores, 0);

		if (lang == LANG.CHINESE.ordinal()) {
			scores[GB2312] = gb2312_probability(rawtext);
			scores[GBK] = gbk_probability(rawtext);
			scores[GB18030] = gb18030_probability(rawtext);
			scores[HZ] = hz_probability(rawtext);
			scores[BIG5] = big5_probability(rawtext);
			scores[CNS11643] = euc_tw_probability(rawtext);
			scores[ISO2022CN] = iso_2022_cn_probability(rawtext);
			scores[UTF8] = utf8_probability(rawtext);
			scores[UNICODE] = utf16_probability(rawtext);
			scores[UNICODE_ESCAPE] = utf16_escape_probability(rawtext);
			scores[ASCII] = ascii_probability(rawtext);
		} else if (lang == LANG.SIMPLIFIED_CHINESE.ordinal()) {
			scores[GB2312] = gb2312_probability(rawtext);
			scores[GB18030] = gb18030_probability(rawtext);
			scores[ASCII] = ascii_probability(rawtext);
			scores[UTF8] = utf8_probability(rawtext);
			scores[UNICODE] = utf16_probability(rawtext);
			scores[UNICODE_ESCAPE] = utf16_escape_probability(rawtext);
		} else if (lang == LANG.TRADITIONAL_CHINESE.ordinal()) {
			scores[BIG5] = big5_probability(rawtext);
			scores[CNS11643] = euc_tw_probability(rawtext);
			scores[ASCII] = ascii_probability(rawtext);
			scores[UTF8] = utf8_probability(rawtext);
			scores[UNICODE] = utf16_probability(rawtext);
			scores[UNICODE_ESCAPE] = utf16_escape_probability(rawtext);
		} else if (lang == LANG.JAPANESE.ordinal()) {
			scores[SJIS] = sjis_probability(rawtext);
			scores[EUC_JP] = euc_jp_probability(rawtext);
			scores[ISO2022JP] = iso_2022_jp_probability(rawtext);
			scores[ASCII] = ascii_probability(rawtext);
			scores[UTF8] = utf8_probability(rawtext);
			scores[UNICODE] = utf16_probability(rawtext);
		} else if (lang == LANG.KOREAN.ordinal()) {
			scores[ISO2022KR] = iso_2022_kr_probability(rawtext);
			scores[EUC_KR] = euc_kr_probability(rawtext);
			scores[CP949] = cp949_probability(rawtext);
			scores[ASCII] = ascii_probability(rawtext);
			scores[UTF8] = utf8_probability(rawtext);
			scores[UNICODE] = utf16_probability(rawtext);
		} else { // LANG.ALL
			scores[GB2312] = gb2312_probability(rawtext);
			scores[GBK] = gbk_probability(rawtext);
			scores[GB18030] = gb18030_probability(rawtext);
			scores[HZ] = hz_probability(rawtext);
			scores[BIG5] = big5_probability(rawtext);
			scores[CNS11643] = euc_tw_probability(rawtext);
			scores[ISO2022CN] = iso_2022_cn_probability(rawtext);
			scores[UTF8] = utf8_probability(rawtext);
			scores[UNICODE] = utf16_probability(rawtext);
			scores[EUC_KR] = euc_kr_probability(rawtext);
			scores[CP949] = cp949_probability(rawtext);
			scores[JOHAB] = 0;
			scores[ISO2022KR] = iso_2022_kr_probability(rawtext);
			scores[ASCII] = ascii_probability(rawtext);
			scores[SJIS] = sjis_probability(rawtext);
			scores[EUC_JP] = euc_jp_probability(rawtext);
			scores[ISO2022JP] = iso_2022_jp_probability(rawtext);
			scores[UNICODET] = 0;
			scores[UNICODE_ESCAPE] = utf16_escape_probability(rawtext);
			scores[ISO2022CN_GB] = 0;
			scores[ISO2022CN_CNS] = 0;
			scores[OTHER] = 0;
		}

		// Tabulate scores: the first encoding with the strictly highest score wins.
		int maxscore = 0;
		int encoding_guess = OTHER;
		for (int index = 0; index < TOTALTYPES; index++) {
			if (debug && scores[index] > 0) {
				System.err.println("Encoding " + nicename[index] + " score " + scores[index]);
			}
			if (scores[index] > maxscore) {
				encoding_guess = index;
				maxscore = scores[index];
			}
		}
		// Return OTHER if nothing scored above 50
		if (maxscore <= 50) {
			encoding_guess = OTHER;
		}
		return encoding_guess;
	}

	/**
	 * Scores every encoding and returns the display names of all encodings that
	 * share the highest score.
	 *
	 * @param rawtext bytes to examine
	 * @return names (from nicename) of the top-scoring encodings in index
	 *         order, or a single "OTHER" entry when the top score is 50 or less
	 */
	public List autoDetectEncoding(byte[] rawtext) {
		scores[GB2312] = gb2312_probability(rawtext);
		scores[GBK] = gbk_probability(rawtext);
		scores[GB18030] = gb18030_probability(rawtext);
		scores[HZ] = hz_probability(rawtext);
		scores[BIG5] = big5_probability(rawtext);
		scores[CNS11643] = euc_tw_probability(rawtext);
		scores[ISO2022CN] = iso_2022_cn_probability(rawtext);
		scores[UTF8] = utf8_probability(rawtext);
		scores[UNICODE] = utf16_probability(rawtext);
		scores[EUC_KR] = euc_kr_probability(rawtext);
		scores[CP949] = cp949_probability(rawtext);
		scores[JOHAB] = 0;
		scores[ISO2022KR] = iso_2022_kr_probability(rawtext);
		scores[ASCII] = ascii_probability(rawtext);
		scores[SJIS] = sjis_probability(rawtext);
		scores[EUC_JP] = euc_jp_probability(rawtext);
		scores[ISO2022JP] = iso_2022_jp_probability(rawtext);
		scores[UNICODET] = 0;
		scores[UNICODE_ESCAPE] = utf16_escape_probability(rawtext);
		scores[ISO2022CN_GB] = 0;
		scores[ISO2022CN_CNS] = 0;
		scores[OTHER] = 0;

		// Collect the names tied for the best score seen so far.
		int best = 0;
		List winners = new ArrayList();
		for (int type = 0; type < TOTALTYPES; type++) {
			int current = scores[type];
			if (debug && current > 0) {
				System.err.println("Encoding " + nicename[type] + " score " + current);
			}
			if (current < best) {
				continue;
			}
			if (current > best) {
				// A new leader invalidates everything collected so far.
				winners.clear();
				best = current;
			}
			winners.add(nicename[type]);
		}

		// Report OTHER if nothing scored above 50
		if (best <= 50) {
			winners.clear();
			winners.add("OTHER");
		}
		return winners;
	}
	/**
	 * Scores how likely the bytes are GB-2312 text.
	 * <p>
	 * Half of the score is the share of high-bit lead bytes that start a pair
	 * in the GB-2312 range (lead 0xA1-0xF7, trail 0xA1-0xFE); the other half
	 * weights each matching pair by its row.
	 *
	 * @param rawtext bytes to examine
	 * @return score from 0 to 100
	 */
	int gb2312_probability(byte[] rawtext) {
		final int lastPairStart = rawtext.length - 1;
		// Counters start at 1 so the ratios below never divide by zero.
		int highLeads = 1;
		int accepted = 1;
		long weight = 0;
		long weightCeiling = 1;

		int pos = 0;
		while (pos < lastPairStart) {
			final int lead = rawtext[pos] & 0xFF;
			if (lead < 0x80) { // plain ASCII, not scored
				pos++;
				continue;
			}
			final int trail = rawtext[pos + 1] & 0xFF;
			highLeads++;
			if (lead >= 0xA1 && lead <= 0xF7 && trail >= 0xA1 && trail <= 0xFE) {
				accepted++;
				weightCeiling += 500;
				final int row = lead - 0xA1;
				if (row >= 15 && row < 55) {
					weight += 435;
				} else if (row >= 55 && row < 87) {
					weight += 234;
				} else {
					weight += 50;
				}
			}
			// A high-bit lead always consumes two bytes, matched or not.
			pos += 2;
		}

		final float rangeScore = 50 * ((float) accepted / (float) highLeads);
		final float freqScore = 50 * ((float) weight / (float) weightCeiling);
		return (int) (rangeScore + freqScore);
	}

	/**
	 * Scores how likely the bytes are GBK text.
	 * <p>
	 * Pairs in the original GB-2312 range are weighted by row; pairs that only
	 * fit the extended GBK range (lead 0x81-0xFE, trail 0x40-0x7E or 0x80-0xFE)
	 * get a flat weight. The result is reduced by one so that plain GB-2312
	 * text, which would otherwise tie, is reported as GB-2312.
	 *
	 * @param rawtext bytes to examine
	 * @return score, one less than the 0-100 range/frequency total
	 */
	int gbk_probability(byte[] rawtext) {
		final int lastPairStart = rawtext.length - 1;
		// Counters start at 1 so the ratios below never divide by zero.
		int highLeads = 1;
		int accepted = 1;
		long weight = 0;
		long weightCeiling = 1;

		int pos = 0;
		while (pos < lastPairStart) {
			final int lead = rawtext[pos] & 0xFF;
			if (lead < 0x80) { // plain ASCII, not scored
				pos++;
				continue;
			}
			final int trail = rawtext[pos + 1] & 0xFF;
			highLeads++;

			final boolean originalGb = lead >= 0xA1 && lead <= 0xF7 && trail >= 0xA1 && trail <= 0xFE;
			final boolean extendedGb = lead >= 0x81 && lead <= 0xFE
					&& trail >= 0x40 && trail <= 0xFE && trail != 0x7F;
			if (originalGb) {
				accepted++;
				weightCeiling += 500;
				final int row = lead - 0xA1;
				if (row >= 15 && row < 55) {
					weight += 435;
				} else if (row >= 55 && row < 87) {
					weight += 234;
				} else {
					weight += 50;
				}
			} else if (extendedGb) {
				accepted++;
				weightCeiling += 500;
				weight += 185;
			}
			// A high-bit lead always consumes two bytes, matched or not.
			pos += 2;
		}

		final float rangeScore = 50 * ((float) accepted / (float) highLeads);
		final float freqScore = 50 * ((float) weight / (float) weightCeiling);
		// Handicap by one: regular GB files would otherwise get the same score.
		return (int) (rangeScore + freqScore) - 1;
	}

	/**
	 * Scores how likely the bytes are GB18030 text.
	 * <p>
	 * Recognises the GB-2312 two-byte range (weighted by row), the extended
	 * two-byte range (flat weight) and the four-byte form (lead 0x81-0xFE,
	 * 0x30-0x39, 0x81-0xFE, 0x30-0x39). A matched four-byte sequence is
	 * consumed completely; previously only its first two bytes were skipped, so
	 * its second half was scanned again and counted as an invalid pair. The
	 * result is reduced by one so it does not outrank GB-2312 on plain GB text.
	 *
	 * @param rawtext bytes to examine
	 * @return score, one less than the 0-100 range/frequency total
	 */
	int gb18030_probability(byte[] rawtext) {
		final int rawtextlen = rawtext.length;
		// Counters start at 1 so the ratios below never divide by zero.
		int dbchars = 1, gbchars = 1;
		long gbfreq = 0, totalfreq = 1;

		for (int i = 0; i < rawtextlen - 1; i++) {
			final int lead = rawtext[i] & 0xFF;
			if (lead < 0x80) {
				continue; // plain ASCII, not scored
			}
			final int trail = rawtext[i + 1] & 0xFF;
			dbchars++;
			if (0xA1 <= lead && lead <= 0xF7 && 0xA1 <= trail && trail <= 0xFE) {
				// Original GB-2312 range
				gbchars++;
				totalfreq += 500;
				final int row = lead - 0xA1;
				if (15 <= row && row < 55) {
					gbfreq += 435;
				} else if (55 <= row && row < 87) {
					gbfreq += 234;
				} else {
					gbfreq += 50;
				}
			} else if (0x81 <= lead && lead <= 0xFE
					&& ((0x80 <= trail && trail <= 0xFE) || (0x40 <= trail && trail <= 0x7E))) {
				// Extended two-byte range
				gbchars++;
				totalfreq += 500;
				gbfreq += 185;
			} else if (i + 3 < rawtextlen
					&& 0x81 <= lead && lead <= 0xFE
					&& 0x30 <= trail && trail <= 0x39
					&& 0x81 <= (rawtext[i + 2] & 0xFF) && (rawtext[i + 2] & 0xFF) <= 0xFE
					&& 0x30 <= rawtext[i + 3] && rawtext[i + 3] <= 0x39) {
				// Four-byte form: skip bytes three and four as well.
				gbchars++;
				i += 2;
			}
			i++; // skip the trail byte of the pair just examined
		}

		final float rangeval = 50 * ((float) gbchars / (float) dbchars);
		final float freqval = 50 * ((float) gbfreq / (float) totalfreq);
		return (int) (rangeval + freqval) - 1;
	}

	/**
	 * Scores how likely the bytes are HZ-encoded text.
	 * <p>
	 * The range half of the score depends only on how many "~{" shift-in
	 * sequences occur; the frequency half weights each character pair found
	 * between "~{" and "~}" (or end of line) by its row.
	 * <p>
	 * A '~' as the very last byte is ignored instead of reading past the end of
	 * the array, and pairs with the high bit set are compared as unsigned
	 * values so that branch can actually match.
	 *
	 * @param rawtext bytes to examine
	 * @return score from 0 to 100
	 */
	int hz_probability(byte[] rawtext) {
		final int rawtextlen = rawtext.length;
		long hzfreq = 0, totalfreq = 1;
		int hzstart = 0;

		// Every action needs the byte after '~', so stop one short of the end.
		for (int i = 0; i + 1 < rawtextlen; i++) {
			if (rawtext[i] != '~') {
				continue;
			}
			final byte marker = rawtext[i + 1];
			if (marker == '{') {
				hzstart++;
				i += 2;
				while (i < rawtextlen - 1) {
					final int first = rawtext[i] & 0xFF;
					final int second = rawtext[i + 1] & 0xFF;
					if (first == 0x0A || first == 0x0D) {
						break; // a line break ends the shifted section
					}
					if (first == '~' && second == '}') {
						i++;
						break;
					}
					int row = -1;
					if (0x21 <= first && first <= 0x77 && 0x21 <= second && second <= 0x77) {
						row = first - 0x21; // 7-bit form
					} else if (0xA1 <= first && first <= 0xF7 && 0xA1 <= second && second <= 0xF7) {
						row = first - 0xA1; // 8-bit form
					}
					if (row >= 0) {
						totalfreq += 500;
						if (15 <= row && row < 55) {
							hzfreq += 435;
						} else if (55 <= row && row < 87) {
							hzfreq += 234;
						} else {
							hzfreq += 50;
						}
					}
					i += 2;
				}
			} else if (marker == '}' || marker == '~') {
				i++; // stray "~}" or escaped "~~": skip the second byte
			}
		}

		final float rangeval;
		if (hzstart > 4) {
			rangeval = 50;
		} else if (hzstart > 1) {
			rangeval = 41;
		} else if (hzstart > 0) {
			// Only 39 in case the sequence happened to occur in otherwise non-HZ text
			rangeval = 39;
		} else {
			rangeval = 0;
		}
		final float freqval = 50 * ((float) hzfreq / (float) totalfreq);
		return (int) (rangeval + freqval);
	}

	/**
	 * Scores how likely the bytes are Big5 text.
	 * <p>
	 * Half of the score is the share of high-bit lead bytes that start a valid
	 * Big5 pair (lead 0xA1-0xF9, trail 0x40-0x7E or 0xA1-0xFE); the other half
	 * weights each matching pair by its row.
	 *
	 * @param rawtext bytes to examine
	 * @return score from 0 to 100
	 */
	int big5_probability(byte[] rawtext) {
		final int lastPairStart = rawtext.length - 1;
		// Counters start at 1 so the ratios below never divide by zero.
		int highLeads = 1;
		int accepted = 1;
		long weight = 0;
		long weightCeiling = 1;

		int pos = 0;
		while (pos < lastPairStart) {
			final int lead = rawtext[pos] & 0xFF;
			if (lead < 0x80) { // plain ASCII, not scored
				pos++;
				continue;
			}
			final int trail = rawtext[pos + 1] & 0xFF;
			highLeads++;

			final boolean leadOk = lead >= 0xA1 && lead <= 0xF9;
			final boolean trailOk = (trail >= 0x40 && trail <= 0x7E) || (trail >= 0xA1 && trail <= 0xFE);
			if (leadOk && trailOk) {
				accepted++;
				weightCeiling += 500;
				final int row = lead - 0xA1;
				weight += (row >= 3 && row <= 37) ? 436 : 50;
			}
			// A high-bit lead always consumes two bytes, matched or not.
			pos += 2;
		}

		final float rangeScore = 50 * ((float) accepted / (float) highLeads);
		final float freqScore = 50 * ((float) weight / (float) weightCeiling);
		return (int) (rangeScore + freqScore);
	}

	/**
	 * Scores how likely the bytes are Big5+ text.
	 * <p>
	 * Pairs in the original Big5 range (lead 0xA1-0xF9, trail 0x40-0x7E or
	 * 0xA1-0xFE) are weighted by row; pairs that only fit the extended range
	 * (lead 0x81-0xFE, trail 0x40-0x7E or 0x80-0xFE) get a flat weight. The
	 * result is reduced by one so it does not outrank Big5 on plain Big5 text.
	 * <p>
	 * Bytes are compared as unsigned values. The previous signed comparisons
	 * against constants of 128 and above could never be true, so no pair was
	 * ever recognised and ASCII bytes were counted as double-byte leads.
	 *
	 * @param rawtext bytes to examine
	 * @return score, one less than the 0-100 range/frequency total
	 */
	int big5plus_probability(byte[] rawtext) {
		final int rawtextlen = rawtext.length;
		// Counters start at 1 so the ratios below never divide by zero.
		int dbchars = 1, bfchars = 1;
		long bffreq = 0, totalfreq = 1;

		for (int i = 0; i < rawtextlen - 1; i++) {
			final int lead = rawtext[i] & 0xFF;
			if (lead < 0x80) {
				continue; // plain ASCII, not scored
			}
			final int trail = rawtext[i + 1] & 0xFF;
			final boolean lowTrail = 0x40 <= trail && trail <= 0x7E;
			dbchars++;
			if (0xA1 <= lead && lead <= 0xF9 && (lowTrail || (0xA1 <= trail && trail <= 0xFE))) {
				// Original Big5 range
				bfchars++;
				totalfreq += 500;
				final int row = lead - 0xA1;
				if (3 <= row && row <= 37) {
					bffreq += 436;
				} else {
					bffreq += 50;
				}
			} else if (0x81 <= lead && lead <= 0xFE && (lowTrail || (0x80 <= trail && trail <= 0xFE))) {
				// Extended Big5 range
				bfchars++;
				totalfreq += 500;
				bffreq += 185;
			}
			i++; // skip the trail byte of the pair just examined
		}

		final float rangeval = 50 * ((float) bfchars / (float) dbchars);
		final float freqval = 50 * ((float) bffreq / (float) totalfreq);
		// Handicap by one: regular Big5 files would otherwise get the same score.
		return (int) (rangeval + freqval) - 1;
	}

	/**
	 * Scores how likely the bytes are EUC-TW (CNS 11643) text.
	 * <p>
	 * Four-byte sequences introduced by 0x8E count towards the range half of
	 * the score only; two-byte plane-1 pairs (both bytes 0xA1-0xFE) also
	 * contribute a row-based weight. Unlike the other double-byte scorers, a
	 * high-bit byte that starts no valid sequence advances the scan by one
	 * byte only.
	 *
	 * @param rawtext bytes to examine
	 * @return score from 0 to 100
	 */
	int euc_tw_probability(byte[] rawtext) {
		final int length = rawtext.length;
		// Counters start at 1 so the ratios below never divide by zero.
		int highLeads = 1;
		int accepted = 1;
		long weight = 0;
		long weightCeiling = 1;

		int pos = 0;
		while (pos < length - 1) {
			final int lead = rawtext[pos] & 0xFF;
			if (lead >= 0x80) {
				highLeads++;
				final int second = rawtext[pos + 1] & 0xFF;
				boolean fourByte = false;
				if (pos + 3 < length && lead == 0x8E && second >= 0xA1 && second <= 0xB0) {
					final int third = rawtext[pos + 2] & 0xFF;
					final int fourth = rawtext[pos + 3] & 0xFF;
					fourByte = third >= 0xA1 && third <= 0xFE && fourth >= 0xA1 && fourth <= 0xFE;
				}
				if (fourByte) {
					// Planes 1 - 16: all less frequent chars, so no frequency weight
					accepted++;
					pos += 3;
				} else if (lead >= 0xA1 && lead <= 0xFE && second >= 0xA1 && second <= 0xFE) {
					// Plane 1
					accepted++;
					weightCeiling += 500;
					final int row = lead - 0xA1;
					weight += (row >= 35 && row <= 92) ? 435 : 50;
					pos++;
				}
			}
			pos++;
		}

		final float rangeScore = 50 * ((float) accepted / (float) highLeads);
		final float freqScore = 50 * ((float) weight / (float) weightCeiling);
		return (int) (rangeScore + freqScore);
	}

	/**
	 * Scores how likely the bytes are ISO 2022-CN text. Works for basic cases
	 * but is not a complete implementation.
	 * <p>
	 * After an "ESC $ ) A" (GB) or "ESC $ ) G" (CNS) designator, byte pairs up
	 * to the next ESC are checked against the expected range and weighted by
	 * row. A following "ESC ( B" (back to ASCII) is skipped.
	 * <p>
	 * Every array access is bounds-checked: text that ends right after a
	 * designator, or inside a designated section without a closing ESC, no
	 * longer throws ArrayIndexOutOfBoundsException.
	 *
	 * @param rawtext bytes to examine
	 * @return score from 0 to 100
	 */
	int iso_2022_cn_probability(byte[] rawtext) {
		final int rawtextlen = rawtext.length;
		// Counters start at 1 so the ratios below never divide by zero.
		int dbchars = 1, isochars = 1;
		long isofreq = 0, totalfreq = 1;

		for (int i = 0; i < rawtextlen - 1; i++) {
			if (i + 3 >= rawtextlen || rawtext[i] != 0x1B) { // need ESC plus three more bytes
				continue;
			}
			final boolean designator = rawtext[i + 1] == 0x24 && rawtext[i + 2] == 0x29; // "$ )"
			if (designator && rawtext[i + 3] == 0x41) { // GB Escape $ ) A
				i += 4;
				// Check the bound first: i may already be at or past the end.
				while (i < rawtextlen - 1 && rawtext[i] != 0x1B) {
					dbchars++;
					if (0x21 <= rawtext[i] && rawtext[i] <= 0x77
							&& 0x21 <= rawtext[i + 1] && rawtext[i + 1] <= 0x77) {
						isochars++;
						final int row = rawtext[i] - 0x21;
						totalfreq += 500;
						if (15 <= row && row < 55) {
							isofreq += 435;
						} else if (55 <= row && row < 87) {
							isofreq += 234;
						}
						i++;
					}
					i++;
				}
			} else if (designator && rawtext[i + 3] == 0x47) { // CNS Escape $ ) G
				i += 4;
				while (i < rawtextlen - 1 && rawtext[i] != 0x1B) {
					dbchars++;
					if (0x21 <= rawtext[i] && rawtext[i] <= 0x7E
							&& 0x21 <= rawtext[i + 1] && rawtext[i + 1] <= 0x7E) {
						isochars++;
						totalfreq += 500;
						final int row = rawtext[i] - 0x21;
						if (35 <= row && row <= 92) {
							isofreq += 435;
						} else {
							isofreq += 150;
						}
						i++;
					}
					i++;
				}
			}
			if (i + 2 < rawtextlen && rawtext[i] == 0x1B
					&& rawtext[i + 1] == 0x28 && rawtext[i + 2] == 0x42) { // ASCII: ESC ( B
				i += 2;
			}
		}

		final float rangeval = 50 * ((float) isochars / (float) dbchars);
		final float freqval = 50 * ((float) isofreq / (float) totalfreq);
		return (int) (rangeval + freqval);
	}

	/**
	 * Scores how likely the bytes are UTF-8 text.
	 * <p>
	 * ASCII bytes are ignored; the score is the percentage of the remaining
	 * bytes that belong to well-formed two-, three- or four-byte sequences.
	 * Four-byte sequences (lead 0xF0-0xF4, used for characters outside the
	 * Basic Multilingual Plane) are now recognised; previously they were
	 * counted as malformed, which could drop valid UTF-8 text to a score of 0.
	 *
	 * @param rawtext bytes to examine
	 * @return 0 for pure ASCII or when too many bytes are malformed; otherwise
	 *         the percentage (above 98, or above 95 with more than 30 good
	 *         bytes)
	 */
	int utf8_probability(byte[] rawtext) {
		final int rawtextlen = rawtext.length;
		int goodbytes = 0, asciibytes = 0;
		// Maybe also use UTF8 Byte Order Mark: EF BB BF

		for (int i = 0; i < rawtextlen; i++) {
			final int lead = rawtext[i] & 0xFF;
			if (lead < 0x80) { // One byte
				asciibytes++; // Ignore ASCII, can throw off count
				continue;
			}
			// Number of continuation bytes (0x80-0xBF) this lead byte requires.
			final int needed;
			if (0xC0 <= lead && lead <= 0xDF) {
				needed = 1;
			} else if (0xE0 <= lead && lead <= 0xEF) {
				needed = 2;
			} else if (0xF0 <= lead && lead <= 0xF4) {
				needed = 3;
			} else {
				continue; // stray continuation byte or invalid lead
			}
			if (i + needed >= rawtextlen) {
				continue; // sequence is cut off by the end of the data
			}
			boolean wellFormed = true;
			for (int k = 1; k <= needed; k++) {
				final int cont = rawtext[i + k] & 0xFF;
				if (cont < 0x80 || cont > 0xBF) {
					wellFormed = false;
					break;
				}
			}
			if (wellFormed) {
				goodbytes += needed + 1;
				i += needed;
			}
		}

		if (asciibytes == rawtextlen) {
			return 0; // pure ASCII (or empty) is left to the ASCII scorer
		}
		final int score = (int) (100 * ((float) goodbytes / (float) (rawtextlen - asciibytes)));
		// If not above 98, reduce to zero to prevent coincidental matches;
		// allows for a few badly formed sequences in longer text.
		if (score > 98) {
			return score;
		} else if (score > 95 && goodbytes > 30) {
			return score;
		} else {
			return 0;
		}
	}

	/**
	 * Scores how likely the bytes are UTF-16 text, based solely on a leading
	 * byte order mark. Not very general.
	 * <p>
	 * The length check now guards both BOM comparisons. Previously it applied
	 * only to the big-endian test, so an empty array, or a single 0xFF byte,
	 * caused an ArrayIndexOutOfBoundsException.
	 *
	 * @param rawtext bytes to examine
	 * @return 100 if the data starts with FE FF or FF FE, otherwise 0
	 */
	int utf16_probability(byte[] rawtext) {
		if (rawtext.length < 2) {
			return 0;
		}
		final int first = rawtext[0] & 0xFF;
		final int second = rawtext[1] & 0xFF;
		final boolean bigEndianBom = first == 0xFE && second == 0xFF;
		final boolean littleEndianBom = first == 0xFF && second == 0xFE;
		return (bigEndianBom || littleEndianBom) ? 100 : 0;
	}

	/**
	 * Scores how likely the bytes are plain ASCII.
	 * <p>
	 * Starts at 75 and loses 5 points for every byte with the high bit set and
	 * for every ESC (0x1B, used by ISO 2022).
	 *
	 * @param rawtext bytes to examine
	 * @return score from 0 to 75
	 */
	int ascii_probability(byte[] rawtext) {
		int remaining = 75;
		for (byte current : rawtext) {
			if (current < 0 || current == (byte) 0x1B) {
				remaining -= 5;
			}
			if (remaining <= 0) {
				return 0; // cannot recover, stop scanning
			}
		}
		return remaining;
	}

	/**
	 * Scores how likely the bytes are EUC-KR text.
	 * <p>
	 * Half of the score is the share of high-bit lead bytes that start a pair
	 * with both bytes in 0xA1-0xFE; the other half weights each matching pair
	 * by its row.
	 *
	 * @param rawtext bytes to examine
	 * @return score from 0 to 100
	 */
	int euc_kr_probability(byte[] rawtext) {
		final int lastPairStart = rawtext.length - 1;
		// Counters start at 1 so the ratios below never divide by zero.
		int highLeads = 1;
		int accepted = 1;
		long weight = 0;
		long weightCeiling = 1;

		int pos = 0;
		while (pos < lastPairStart) {
			final int lead = rawtext[pos] & 0xFF;
			if (lead < 0x80) { // plain ASCII, not scored
				pos++;
				continue;
			}
			final int trail = rawtext[pos + 1] & 0xFF;
			highLeads++;
			if (lead >= 0xA1 && lead <= 0xFE && trail >= 0xA1 && trail <= 0xFE) {
				accepted++;
				weightCeiling += 500;
				final int row = lead - 0xA1;
				weight += (row >= 15 && row < 40) ? 436 : 50;
			}
			// A high-bit lead always consumes two bytes, matched or not.
			pos += 2;
		}

		final float rangeScore = 50 * ((float) accepted / (float) highLeads);
		final float freqScore = 50 * ((float) weight / (float) weightCeiling);
		return (int) (rangeScore + freqScore);
	}

	/*
	 * Function: cp949_probability Argument: byte array Returns : number from
	 * 0 to 100 representing probability that the text in the array uses
	 * Cp949 encoding
	 */
	int cp949_probability(byte[] rawtext) {
		// Denominator counters start at 1 so the ratios below are always defined.
		int highByteRuns = 1;
		int validPairs = 1;
		long weightEarned = 0;
		long weightPossible = 1;

		final int lastLead = rawtext.length - 1;
		for (int pos = 0; pos < lastLead; pos++) {
			final byte lead = rawtext[pos];
			if (lead >= 0) {
				continue; // 7-bit byte, not part of a double-byte character
			}
			// A high lead byte always swallows the following byte, valid or not.
			final byte trail = rawtext[++pos];
			highByteRuns++;

			// Accepted lead range is 0x81..0xFE (compared as signed bytes).
			final boolean leadOk = lead >= (byte) 0x81 && lead <= (byte) 0xFE;
			// Accepted trail ranges: 0x41..0x5A, 0x61..0x7A and 0x81..0xFE.
			final boolean trailUpper = trail >= (byte) 0x41 && trail <= (byte) 0x5A;
			final boolean trailLower = trail >= (byte) 0x61 && trail <= (byte) 0x7A;
			final boolean trailHigh = trail >= (byte) 0x81 && trail <= (byte) 0xFE;
			if (!leadOk || !(trailUpper || trailLower || trailHigh)) {
				continue;
			}

			validPairs++;
			weightPossible += 500;

			// Pairs with both bytes in 0xA1..0xFE are weighted by row, exactly
			// like euc_kr_probability does; every other accepted pair gets a
			// fixed weight.
			final boolean bothInUpperBlock = lead >= (byte) 0xA1 && lead <= (byte) 0xFE
					&& trail >= (byte) 0xA1 && trail <= (byte) 0xFE;
			if (!bothInUpperBlock) {
				weightEarned += 235;
			} else {
				final int row = (lead & 0xFF) - 0xA1;
				weightEarned += (row >= 15 && row < 40) ? 436 : 50;
			}
		}

		float rangeval = 50 * ((float) validPairs / (float) highByteRuns);
		float freqval = 50 * ((float) weightEarned / (float) weightPossible);
		return (int) (rangeval + freqval);
	}

	/**
	 * Scans the buffer for the four-byte escape sequence ESC '$' ')' 'C'.
	 *
	 * @param rawtext bytes to inspect
	 * @return 100 if the sequence occurs anywhere in the buffer, otherwise 0
	 */
	int iso_2022_kr_probability(byte[] rawtext) {
		// Last index at which a complete four-byte sequence can still begin.
		final int lastStart = rawtext.length - 4;
		for (int start = 0; start <= lastStart; start++) {
			if (rawtext[start] != 0x1b) {
				continue;
			}
			boolean tailMatches = rawtext[start + 1] == '$'
					&& rawtext[start + 2] == ')'
					&& rawtext[start + 3] == 'C';
			if (tailMatches) {
				return 100;
			}
		}
		return 0;
	}

	/*
	 * Function: euc_jp_probability Argument: pointer to byte array Returns :
	 * number from 0 to 100 representing probability text in array uses EUC-JP
	 * encoding
	 */
	int euc_jp_probability(byte[] rawtext) {
		// Denominator counters start at 1 so the ratios at the end are always defined.
		int highByteRuns = 1;
		int validPairs = 1;
		long weightEarned = 0;
		long weightPossible = 1;

		final int limit = rawtext.length - 1; // the last byte cannot start a pair
		for (int pos = 0; pos < limit; pos++) {
			if (rawtext[pos] >= 0) {
				continue; // 7-bit byte
			}
			highByteRuns++;
			// Work on unsigned values so the range checks read naturally.
			final int lead = rawtext[pos] & 0xFF;
			final int trail = rawtext[pos + 1] & 0xFF;
			pos++; // the byte after a high lead byte is always consumed

			if (lead < 0xA1 || lead > 0xFE || trail < 0xA1 || trail > 0xFE) {
				continue; // not a pair with both bytes in 0xA1..0xFE
			}
			validPairs++;
			weightPossible += 500;

			// Zero-based row derived from the lead byte selects the weight.
			final int row = lead - 0xA1;
			if (row < 3 || row >= 47) {
				weightEarned += 50;
			} else if (row < 15) {
				weightEarned += 500;
			} else {
				weightEarned += 435;
			}
		}

		float rangeval = 50 * ((float) validPairs / (float) highByteRuns);
		float freqval = 50 * ((float) weightEarned / (float) weightPossible);
		return (int) (rangeval + freqval);
	}

	/**
	 * Scans the buffer for the three-byte escape sequence ESC '$' 'B'.
	 *
	 * @param rawtext bytes to inspect
	 * @return 100 if the sequence occurs anywhere in the buffer, otherwise 0
	 */
	int iso_2022_jp_probability(byte[] rawtext) {
		// Last index at which a complete three-byte sequence can still begin.
		final int lastStart = rawtext.length - 3;
		int start = 0;
		while (start <= lastStart) {
			boolean found = rawtext[start] == 0x1b
					&& rawtext[start + 1] == '$'
					&& rawtext[start + 2] == 'B';
			if (found) {
				return 100;
			}
			start++;
		}
		return 0;
	}

	/*
	 * Function: sjis_probability Argument: pointer to byte array Returns :
	 * number from 0 to 100 representing probability text in array uses
	 * Shift-JIS encoding
	 */
	int sjis_probability(byte[] rawtext) {
		// Scores the buffer in two halves of 50 points each:
		//   rangeval - share of high-bit lead bytes that start a well-formed
		//              two-byte sequence
		//   freqval  - weight earned by those sequences, based on the row
		//              derived from the lead/trail bytes
		// The denominator counters start at 1 so both ratios are always defined.
		int dbchars = 1, jpchars = 1;
		long jpfreq = 0, totalfreq = 1;

		final int rawtextlen = rawtext.length;
		// The loop bound already guarantees that rawtext[i + 1] exists.
		for (int i = 0; i < rawtextlen - 1; i++) {
			if (rawtext[i] >= 0) {
				continue; // 7-bit byte
			}
			dbchars++;

			// Use unsigned values for BOTH bytes. The trail byte may be a
			// positive signed byte (0x40..0x7E); widening it with "+ 256"
			// would push it above 0x9F and select the wrong row adjustment.
			final int lead = rawtext[i] & 0xFF;
			final int trail = rawtext[i + 1] & 0xFF;

			final boolean leadOk = (0x81 <= lead && lead <= 0x9F) || (0xE0 <= lead && lead <= 0xEF);
			final boolean trailOk = (0x40 <= trail && trail <= 0x7E) || (0x80 <= trail && trail <= 0xFC);
			if (!leadOk || !trailOk) {
				// Not a two-byte sequence. This includes lead bytes in
				// 0xA1..0xDF, which are counted in dbchars but never scored;
				// only this one byte is consumed.
				continue;
			}

			jpchars++;
			totalfreq += 500;

			// Derive the row: trail bytes below 0x9F belong to the first row
			// of the pair of rows that share one lead byte.
			final int adjust = (trail < 0x9F) ? 1 : 0;
			final int base = (lead < 0xA0) ? 0x70 : 0xB0;
			final int row = (((lead - base) << 1) - adjust) - 0x20;

			// NOTE(review): these thresholds are identical to the ones in
			// euc_jp_probability, where row is (lead - 0xA1); here row is
			// offset by 0x20 only, which looks one higher - confirm intent.
			if (3 <= row && row < 15) {
				jpfreq += 500;
			} else if (15 <= row && row < 47) {
				jpfreq += 435;
			} else {
				jpfreq += 50;
			}

			i++; // the trail byte has been consumed
		}

		float rangeval = 50 * ((float) jpchars / (float) dbchars);
		float freqval = 50 * ((float) jpfreq / (float) totalfreq);
		return (int) (rangeval + freqval);
	}
	
	/**
	 * Scores how likely the buffer is text written with percent/backslash
	 * escapes ({@code %XX}, {@code %uXXXX}, {@code \XX}, {@code \\uXXXX}).
	 *
	 * <p>Bytes are sorted into two buckets: literal bytes (letters, digits
	 * and {@code + - _ . ! ~ * / ( )}) and bytes that belong to an escape
	 * (3 bytes, or 6 when the introducer is followed by {@code u}). The digits
	 * of an escape are not validated. An escape is only credited when all of
	 * its bytes are present, so a truncated escape at the end of the buffer
	 * neither reads past the array nor inflates the score.
	 *
	 * @param rawtext bytes to inspect
	 * @return 100 when every byte falls into one of the two buckets (an empty
	 *         array satisfies this trivially); otherwise the percentage of
	 *         bytes that belong to escapes, or 0 when the escape-byte count
	 *         equals the literal-byte count
	 */
	int utf16_escape_probability(byte[] rawtext) {
		final int rawtextlen = rawtext.length;
		int goodbytes = 0;
		int asciibytes = 0;

		int i = 0;
		while (i < rawtextlen) {
			final int ch = rawtext[i];
			final boolean literal = ch == '+'
					|| ('A' <= ch && ch <= 'Z')
					|| ('a' <= ch && ch <= 'z')
					|| ('0' <= ch && ch <= '9')
					|| ch == '-' || ch == '_' || ch == '.' || ch == '!' || ch == '~'
					|| ch == '*' || ch == '/' || ch == '(' || ch == ')';
			if (literal) {
				asciibytes++;
			} else if (ch == '%' || ch == '\\') {
				// %XX spans 3 bytes, %uXXXX spans 6.
				final boolean wide = i + 1 < rawtextlen && rawtext[i + 1] == 'u';
				final int escapeLen = wide ? 6 : 3;
				if (i + escapeLen <= rawtextlen) {
					goodbytes += escapeLen;
					i += escapeLen - 1; // skip the rest of the escape
				}
				// A truncated escape is left uncounted; the bytes after the
				// introducer are then classified on their own.
			}
			i++;
		}

		if (goodbytes + asciibytes == rawtextlen) {
			return 100;
		}
		// NOTE(review): zeroing the score when the two counters happen to be
		// equal is kept from the original; it also covers "no escapes and no
		// literals", but the intent for non-zero equal counts is unclear.
		if (goodbytes == asciibytes) {
			return 0;
		}
		return (int) (100 * ((float) goodbytes / (float) rawtextlen));
	}
}

/**
 * Numeric identifiers for the encodings the detector distinguishes, plus a
 * table of display names indexed by those identifiers.
 *
 * <p>The name table is filled in once, when the class is initialised, so
 * {@link #nicename} is usable before any instance has been created and is not
 * reallocated each time a subclass instance is constructed.
 */
class Encoding {
	// Supported Encoding Types
	public static final int GB2312 = 0;
	public static final int GBK = 1;
	public static final int GB18030 = 2;
	public static final int HZ = 3;
	public static final int BIG5 = 4;
	public static final int CNS11643 = 5;
	public static final int UTF8 = 6;
	public static final int UTF8T = 7;
	public static final int UTF8S = 8;
	public static final int UNICODE = 9;
	public static final int UNICODET = 10;
	public static final int UNICODE_ESCAPE = 11;
	public static final int ISO2022CN = 12;
	public static final int ISO2022CN_CNS = 13;
	public static final int ISO2022CN_GB = 14;
	public static final int EUC_KR = 15;
	public static final int CP949 = 16;
	public static final int ISO2022KR = 17;
	public static final int JOHAB = 18;
	public static final int SJIS = 19;
	public static final int EUC_JP = 20;
	public static final int ISO2022JP = 21;
	public static final int ASCII = 22;
	public static final int OTHER = 23;
	// Number of encoding identifiers above; also the length of nicename.
	public static final int TOTALTYPES = 24;
	public final static int SIMP = 0;
	public final static int TRAD = 1;

	// Names of the encodings for human viewing, indexed by the ids above
	public static String[] nicename;

	static {
		// Populate the table during class initialisation instead of in the
		// constructor, so it is never observed as null or half-filled.
		String[] names = new String[TOTALTYPES];
		names[GB2312] = "GB2312";
		names[GBK] = "GBK";
		names[GB18030] = "GB18030";
		names[HZ] = "HZ";
		names[ISO2022CN_GB] = "ISO2022CN-GB";
		names[BIG5] = "Big5";
		names[CNS11643] = "CNS11643";
		names[ISO2022CN_CNS] = "ISO2022CN-CNS";
		names[ISO2022CN] = "ISO2022CN";
		names[UTF8] = "UTF-8";
		names[UTF8T] = "UTF-8";
		names[UTF8S] = "UTF-8";
		names[UNICODE] = "Unicode";
		names[UNICODET] = "Unicode";
		names[UNICODE_ESCAPE] = "UnicodeEscape";
		names[EUC_KR] = "EUC-KR";
		names[CP949] = "CP949";
		names[ISO2022KR] = "ISO2022KR";
		names[JOHAB] = "Johab";
		names[SJIS] = "SJIS";
		names[EUC_JP] = "EUC-JP";
		names[ISO2022JP] = "ISO2022JP";
		names[ASCII] = "ASCII";
		names[OTHER] = "OTHER";
		nicename = names;
	}

	/**
	 * Kept so subclasses can continue to call {@code super()}; the name table
	 * is already populated by the static initialiser.
	 */
	public Encoding() {
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy