// org.datacleaner.widgets.CharSetEncodingComboBox Maven / Gradle / Ivy
/**
* DataCleaner (community edition)
* Copyright (C) 2014 Free Software Foundation, Inc.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.widgets;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import org.datacleaner.connection.FixedWidthDatastore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
/**
 * An editable combobox with a default set of available character set
 * encodings.
 *
 * <p>
 * The EBCDIC entries are shown with a descriptive postfix; that postfix is
 * removed again by {@link #getSelectedItem()}, so callers receive the value
 * without the display-only decoration.
 * </p>
 *
 * @author Kasper Sørensen
 */
public class CharSetEncodingComboBox extends DCComboBox<String> {

    private static final long serialVersionUID = 1L;

    private static final Logger logger = LoggerFactory.getLogger(CharSetEncodingComboBox.class);

    /** Display-only postfix appended to the EBCDIC entries of the combobox. */
    private static final String EBCDIC_POSTFIX = " (EBCDIC)";

    /** The encodings offered as selectable items by default. */
    private static final String[] encodings = createDefaultEncodings();

    /**
     * Creates the combobox, makes it editable and pre-selects the name of
     * the JVM's default charset.
     */
    public CharSetEncodingComboBox() {
        super(encodings);
        setEditable(true);

        final String defaultCharset = Charset.defaultCharset().name();
        setSelectedItem(defaultCharset);
    }

    /**
     * Builds the default list of encoding names: the UTF variants, ASCII,
     * ISO-8859-1 to -16, Windows-1250 to -1258 and the EBCDIC code pages
     * 1140 to 1149 (prefixed with {@link FixedWidthDatastore#EBCDIC_PREFIX}
     * and decorated with the EBCDIC postfix).
     *
     * @return a new array holding the default encoding names, in display
     *         order
     */
    private static String[] createDefaultEncodings() {
        final List<String> list = new ArrayList<>();
        list.add("UTF-8");
        list.add("UTF-16");
        list.add("UTF-16BE");
        list.add("UTF-16LE");
        list.add("ASCII");
        for (int i = 1; i <= 16; i++) {
            list.add("ISO-8859-" + i);
        }
        for (int i = 1250; i <= 1258; i++) {
            list.add("Windows-" + i);
        }
        for (int i = 1140; i <= 1149; i++) {
            list.add(FixedWidthDatastore.EBCDIC_PREFIX + i + EBCDIC_POSTFIX);
        }
        return list.toArray(new String[0]);
    }

    /**
     * Tries to detect the character set encoding of the given bytes and, if
     * a match is found, selects it in the combobox.
     *
     * @param bytes
     *            the sample data to inspect
     * @return the name of the detected charset, or the currently selected
     *         item (possibly {@code null}) if no charset could be detected
     */
    public String autoDetectEncoding(final byte[] bytes) {
        final CharsetDetector detector = new CharsetDetector();
        detector.setText(bytes);

        // The detector yields no match (null) when none of its recognizers
        // accept the input; keep the current selection in that case instead
        // of failing with a NullPointerException.
        final CharsetMatch charsetMatch = detector.detect();
        if (charsetMatch == null) {
            logger.warn("No CharsetMatch found, keeping the current selection");
            return getSelectedItem();
        }

        final String charSet = charsetMatch.getName();
        final int confidence = charsetMatch.getConfidence();
        logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
        setSelectedItem(charSet);
        return charSet;
    }

    /**
     * Gets the selected (or manually entered) item with the display-only
     * EBCDIC postfix removed.
     *
     * @return the selected value without the EBCDIC postfix, or {@code null}
     *         if nothing is selected
     */
    @Override
    public String getSelectedItem() {
        final String selected = super.getSelectedItem();
        if (selected == null) {
            return null;
        }
        return selected.replace(EBCDIC_POSTFIX, "");
    }
}