
// com.google.refine.importing.EncodingGuesser Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of main Show documentation
// Show all versions of main Show documentation
// OpenRefine is a free, open source power tool for working with messy data and improving it.
package com.google.refine.importing;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.commons.lang3.StringUtils;
import org.mozilla.universalchardet.UnicodeBOMInputStream;
import org.mozilla.universalchardet.UniversalDetector;
import com.google.refine.util.JSONUtilities;
/**
* Guesses the character encoding of imported files using juniversalchardet
* (https://github.com/albfernandez/juniversalchardet), a Java port of Mozilla's universalchardet library
* (https://hg.mozilla.org/mozilla-central/file/tip/extensions/universalchardet/).
*
* @author Steffen Stundzig
*/
public final class EncodingGuesser {

    /** Encoding label stored for a file whose byte order mark is reported as UTF-8. */
    public static final String UTF_8_BOM = "UTF-8-BOM";

    /**
     * Fills in the {@code "encoding"} entry of every file record of the job that does not have one yet.
     * <p>
     * The file records are read from the {@code "files"} array of the job's retrieval record. Records whose
     * encoding is already non-blank, or that have no {@code "location"}, are left untouched. If the job has
     * no retrieval record or no {@code "files"} array, nothing happens.
     *
     * @param job
     *            the importing job whose file records are inspected; its raw data directory is used to
     *            resolve each record's {@code "location"}
     * @throws IOException
     *             if one of the files cannot be opened or read
     */
    public static void guess(final ImportingJob job)
            throws IOException {
        ObjectNode retrievalRecord = job.getRetrievalRecord();
        if (retrievalRecord == null) {
            return;
        }
        ArrayNode fileRecords = JSONUtilities.getArray(retrievalRecord, "files");
        if (fileRecords == null) {
            return;
        }

        // TODO: If different files have different encodings, we're only able to present a single
        // encoding to the user currently. Should we check for conflicts? Warn the user?
        for (int i = 0; i < fileRecords.size(); i++) {
            ObjectNode record = JSONUtilities.getObjectElement(fileRecords, i);
            String encoding = ImportingUtilities.getEncoding(record);
            if (!StringUtils.isBlank(encoding)) {
                // An encoding is already recorded for this file; keep it.
                continue;
            }
            String location = JSONUtilities.getString(record, "location", null);
            if (location == null) {
                continue;
            }
            String detected = detectEncoding(new File(job.getRawDataDir(), location));
            if (detected != null) {
                JSONUtilities.safePut(record, "encoding", detected);
            }
        }
    }

    /**
     * Runs charset detection on a single file.
     *
     * @param file
     *            the file to inspect
     * @return {@link #UTF_8_BOM} if the stream reports a UTF-8 byte order mark, otherwise whatever
     *         {@code UniversalDetector.detectCharset} returned (possibly {@code null})
     * @throws IOException
     *             if the file cannot be opened or read
     */
    private static String detectEncoding(final File file) throws IOException {
        // The file stream is a resource of its own so that it is closed even when wrapping it in
        // the UnicodeBOMInputStream fails.
        try (FileInputStream rawStream = new FileInputStream(file);
                UnicodeBOMInputStream is = new UnicodeBOMInputStream(rawStream)) {
            String detected = UniversalDetector.detectCharset(is);
            UnicodeBOMInputStream.BOM bom = is.getBOM();
            if (UnicodeBOMInputStream.BOM.UTF_8.equals(bom)) {
                // A UTF-8 byte order mark overrides the detector's result.
                detected = UTF_8_BOM;
            }
            return detected;
        }
    }

    /**
     * Copies the first non-blank {@code "encoding"} found in the file records into the options, under the
     * key {@code "encoding"}. Does nothing if the list is {@code null} or no record has a non-blank encoding.
     *
     * @param fileRecords
     *            the file records to scan, in order; may be {@code null}
     * @param options
     *            the options object that receives the initial encoding
     */
    public final static void guessInitialEncoding(final List<ObjectNode> fileRecords, final ObjectNode options) {
        if (fileRecords == null) {
            return;
        }
        for (ObjectNode record : fileRecords) {
            String encoding = JSONUtilities.getString(record, "encoding", null);
            if (!StringUtils.isBlank(encoding)) {
                JSONUtilities.safePut(options, "encoding", encoding);
                // Only the first encoding found is used.
                break;
            }
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy