com.ibm.icu.text.CharsetMatch Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and globalization support.
/**
*******************************************************************************
* Copyright (C) 2005-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
/**
 * This class represents a charset that has been identified by a CharsetDetector
 * as a possible encoding for a set of input data. From an instance of this
 * class, you can ask for a confidence level in the charset identification,
 * or for a Java Reader or String to access the original byte data in Unicode form.
 *
 * Instances of this class are created only by CharsetDetectors.
 *
 * Note: this class has a natural ordering that is inconsistent with equals.
 * The natural ordering is based on the match confidence value.
 *
 * @stable ICU 3.4
 */
public class CharsetMatch implements Comparable<CharsetMatch> {

    /** Size, in chars, of the buffer used when draining a Reader into a String. */
    private static final int READ_BUFFER_SIZE = 1024;

    /**
     * Create a java.io.Reader for reading the Unicode character data corresponding
     * to the original byte data supplied to the Charset detect operation.
     *
     * CAUTION: if the source of the byte data was an InputStream, a Reader
     * can be created for only one matching char set using this method. If more
     * than one charset needs to be tried, the caller will need to reset
     * the InputStream and create InputStreamReaders itself, based on the charset name.
     *
     * @return the Reader for the Unicode character data, or null if the
     *         input stream could not be reset or the charset is not supported
     *         by the running JVM.
     *
     * @stable ICU 3.4
     */
    public Reader getReader() {
        try {
            return openReader();
        } catch (IOException e) {
            // Historical contract of this method: failure is reported as null,
            // not as an exception.
            return null;
        }
    }

    /**
     * Create a Java String from Unicode character data corresponding
     * to the original byte data supplied to the Charset detect operation.
     *
     * @return a String created from the converted input data.
     * @throws java.io.IOException if the input cannot be read, or the charset
     *         is not supported by the running JVM.
     *
     * @stable ICU 3.4
     */
    public String getString() throws java.io.IOException {
        return getString(-1);
    }

    /**
     * Create a Java String from Unicode character data corresponding
     * to the original byte data supplied to the Charset detect operation.
     * The length of the returned string is limited to the specified size;
     * the string will be truncated to this length if necessary. A limit value of
     * zero or less is ignored, and treated as no limit.
     *
     * @param maxLength The maximum length of the String to be created when the
     *                  source of the data is an input stream, or -1 for
     *                  unlimited length.
     * @return a String created from the converted input data.
     * @throws java.io.IOException if the input cannot be read, or the charset
     *         is not supported by the running JVM.
     *
     * @stable ICU 3.4
     */
    public String getString(int maxLength) throws java.io.IOException {
        if (fInputStream == null) {
            // Byte-array input: decode exactly the bytes getReader() would expose.
            // maxLength is documented as applying to stream input only.
            return new String(fRawInput, 0, fRawLength, javaCharsetName());
        }

        // Zero or negative means "no limit", as documented.
        int remaining = maxLength <= 0 ? Integer.MAX_VALUE : maxLength;
        StringBuilder sb = new StringBuilder();
        char[] buffer = new char[READ_BUFFER_SIZE];

        // openReader() propagates the real IOException instead of yielding null.
        Reader reader = openReader();
        try {
            // Stop as soon as the limit is reached: asking a Reader for zero
            // chars returns 0 rather than -1, so looping on ">= 0" alone would
            // never terminate once the budget is exhausted.
            while (remaining > 0) {
                int charsRead = reader.read(buffer, 0, Math.min(remaining, buffer.length));
                if (charsRead < 0) {
                    break; // end of stream
                }
                sb.append(buffer, 0, charsRead);
                remaining -= charsRead;
            }
        } finally {
            reader.close();
        }
        return sb.toString();
    }

    /**
     * Get an indication of the confidence in the charset detected.
     * Confidence values range from 0-100, with larger numbers indicating
     * a better match of the input data to the characteristics of the
     * charset.
     *
     * @return the confidence in the charset match
     *
     * @stable ICU 3.4
     */
    public int getConfidence() {
        return fConfidence;
    }

    /**
     * Get the name of the detected charset.
     * The name will be one that can be used with other APIs on the
     * platform that accept charset names. It is the "Canonical name"
     * as defined by the class java.nio.charset.Charset; for
     * charsets that are registered with the IANA charset registry,
     * this is the MIME-preferred registered name.
     *
     * @see java.nio.charset.Charset
     * @see java.io.InputStreamReader
     *
     * @return The name of the charset.
     *
     * @stable ICU 3.4
     */
    public String getName() {
        return fCharsetName;
    }

    /**
     * Get the ISO code for the language of the detected charset.
     *
     * @return The ISO code for the language or null
     *         if the language cannot be determined.
     *
     * @stable ICU 3.4
     */
    public String getLanguage() {
        return fLang;
    }

    /**
     * Compare to other CharsetMatch objects.
     * Comparison is based on the match confidence value, which
     * allows CharsetDetector.detectAll() to order its results.
     *
     * @param other the CharsetMatch object to compare against.
     * @return a negative integer, zero, or a positive integer as the
     *         confidence level of this CharsetMatch
     *         is less than, equal to, or greater than that of
     *         the argument.
     * @throws ClassCastException if the argument is not a CharsetMatch.
     * @stable ICU 4.4
     */
    public int compareTo(CharsetMatch other) {
        if (this.fConfidence > other.fConfidence) {
            return 1;
        }
        if (this.fConfidence < other.fConfidence) {
            return -1;
        }
        return 0;
    }

    /*
     * Constructor. Implementation internal.
     * Takes the charset name and language from the recognizer itself.
     */
    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
        this(det, rec, conf, rec.getName(), rec.getLanguage());
    }

    /*
     * Constructor. Implementation internal.
     * Uses the explicitly supplied charset name and language; rec is accepted
     * for signature compatibility but is not read here.
     */
    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) {
        fConfidence = conf;

        // The references to the original application input data must be copied out
        // of the charset recognizer to here, in case the application resets the
        // recognizer before using this CharsetMatch.
        if (det.fInputStream == null) {
            // We only want the existing input byte data if it came straight from the user,
            // not if it is just the head of a stream.
            fRawInput = det.fRawInput;
            fRawLength = det.fRawLength;
        } else {
            fRawInput = null;
            fRawLength = 0;
        }
        fInputStream = det.fInputStream;
        fCharsetName = csName;
        fLang = lang;
    }

    /*
     * Opens a Reader over the original input, decoding with the matched charset.
     * Unlike getReader(), failures surface as IOException.
     */
    private Reader openReader() throws IOException {
        InputStream inputStream = fInputStream;
        if (inputStream == null) {
            inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
        }
        // Rewind to the stream's mark. NOTE(review): presumably the detector
        // marked the user's stream before sampling it - confirm in CharsetDetector.
        inputStream.reset();
        return new InputStreamReader(inputStream, javaCharsetName());
    }

    /*
     * Returns the charset name in a form usable for opening a Java decoder.
     *
     * getName() may return a name with a suffix '_rtl' or '_ltr'. This cannot
     * be used to open a charset (e.g. IBM424_rtl), so the suffix is stripped.
     */
    private String javaCharsetName() {
        String name = getName();
        int suffixStart = name.indexOf("_rtl");
        if (suffixStart < 0) {
            suffixStart = name.indexOf("_ltr");
        }
        return suffixStart > 0 ? name.substring(0, suffixStart) : name;
    }

    //
    // Private Data
    //
    private final int fConfidence;
    private final byte[] fRawInput;          // Original, untouched input bytes.
                                             // If user gave us a byte array, this is it.
    private final int fRawLength;            // Length of data in fRawInput array.
    private final InputStream fInputStream;  // User's input stream, or null if the user
                                             // gave us a byte array.
    private final String fCharsetName;       // The name of the charset this CharsetMatch
                                             // represents. Filled in by the recognizer.
    private final String fLang;              // The language, if one was determined by
                                             // the recognizer during the detect operation.
}