
// prerna.util.FileEncoderDetector (Maven / Gradle / Ivy)
// The newest version!
package prerna.util;
import java.io.IOException;
import java.nio.charset.Charset;
import org.apache.commons.fileupload.FileItem;
import org.apache.commons.io.FilenameUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.mozilla.universalchardet.UniversalDetector;
/**
 * Decides whether an uploaded {@link FileItem} should be treated as text and,
 * if so, which character encoding it uses.
 *
 * <p>The check runs in two stages: Tika is used to rule out well-known
 * non-text formats (zip, images, Word, PowerPoint, Excel, PDF), then
 * juniversalchardet sniffs the bytes for a character encoding.
 */
public class FileEncoderDetector {

    private static final Logger classLogger = LogManager.getLogger(FileEncoderDetector.class);

    private final FileItem item;
    private Charset charset = null;

    /**
     * @param item the uploaded item to inspect
     */
    public FileEncoderDetector(FileItem item) {
        this.item = item;
    }

    /**
     * Determines whether the item looks like text content.
     *
     * <p>On success the detected charset is stored and can be read with
     * {@link #getCharset()}.
     *
     * @return {@code true} when the item is not one of the recognized
     *         non-text formats and a supported character encoding was
     *         detected; {@code false} otherwise
     * @throws IOException if the item's content cannot be read while
     *                     detecting the encoding
     */
    public boolean isTextContent() throws IOException {
        // getExtension returns null for a null name; the helpers below are null-safe
        String extension = FilenameUtils.getExtension(item.getName());
        String mimeType = detectMimeType();
        if (mimeType != null && isKnownNonTextType(mimeType, extension)) {
            return false;
        }

        String encoding = detectEncodingName();
        if (encoding == null) {
            return false;
        }
        try {
            this.charset = Charset.forName(encoding);
        } catch (IllegalArgumentException e) {
            // IllegalCharsetNameException / UnsupportedCharsetException: the detector
            // named a charset this JVM cannot provide, so the content is not usable as text
            classLogger.error(Constants.ERROR_MESSAGE, e);
            return false;
        }
        return true;
    }

    /**
     * @return the charset found by the last successful call to
     *         {@link #isTextContent()}, or {@code null} if none has been detected
     */
    public Charset getCharset() {
        return charset;
    }

    /**
     * Asks Tika for the item's mime type, using the item name as a hint.
     *
     * @return the detected mime type, or {@code null} if reading the item failed
     */
    private String detectMimeType() {
        Detector detector = TikaConfig.getDefaultConfig().getDetector();
        Metadata metadata = new Metadata();
        metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, item.getName());
        try (TikaInputStream stream = TikaInputStream.get(this.item.getInputStream())) {
            return detector.detect(stream, metadata).toString();
        } catch (IOException e) {
            // best effort: a failed mime sniff falls through to encoding detection
            classLogger.error(Constants.ERROR_MESSAGE, e);
            return null;
        }
    }

    /**
     * Feeds the item's bytes to juniversalchardet until it reaches a verdict
     * or the stream ends.
     *
     * @return the detected charset name, or {@code null} if none was detected
     * @throws IOException if the item's content cannot be read
     */
    private String detectEncodingName() throws IOException {
        byte[] buf = new byte[8192];
        try (java.io.InputStream in = this.item.getInputStream()) {
            UniversalDetector detector = new UniversalDetector();
            int nread;
            while ((nread = in.read(buf)) > 0 && !detector.isDone()) {
                detector.handleData(buf, 0, nread);
            }
            detector.dataEnd();
            return detector.getDetectedCharset();
        }
    }

    /**
     * Tells whether the mime type (combined with the file extension for the
     * generic container types) identifies a zip, image, Word, PowerPoint,
     * Excel, or PDF file.
     */
    private static boolean isKnownNonTextType(String mimeType, String extension) {
        if (mimeType.equals("application/zip") || mimeType.startsWith("image/")) {
            return true;
        }
        return isWordDocument(mimeType, extension)
                || isPowerPoint(mimeType, extension)
                || isExcel(mimeType, extension)
                || mimeType.equalsIgnoreCase("application/pdf");
    }

    /** Word: the specific docx type, or a generic office type with a doc/docx extension. */
    private static boolean isWordDocument(String mimeType, String extension) {
        if (mimeType.equalsIgnoreCase("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
            return true;
        }
        boolean genericOfficeType = mimeType.equalsIgnoreCase("application/x-tika-ooxml")
                || mimeType.equalsIgnoreCase("application/msword")
                || mimeType.equalsIgnoreCase("application/x-tika-msoffice");
        return genericOfficeType && hasExtension(extension, "doc", "docx");
    }

    /** PowerPoint: the specific pptx type, or a generic type with a ppt/pptx extension. */
    private static boolean isPowerPoint(String mimeType, String extension) {
        if (mimeType.equalsIgnoreCase("application/vnd.openxmlformats-officedocument.presentationml.presentation")) {
            return true;
        }
        boolean genericType = mimeType.equalsIgnoreCase("application/x-tika-ooxml")
                || mimeType.equalsIgnoreCase("application/vnd.ms-powerpoint");
        return genericType && hasExtension(extension, "ppt", "pptx");
    }

    /** Excel: any of the specific spreadsheet types, or generic ooxml with an xls/xlsx/xlsm extension. */
    private static boolean isExcel(String mimeType, String extension) {
        if (mimeType.equalsIgnoreCase("application/vnd.ms-excel.sheet.macroenabled.12")
                || mimeType.equalsIgnoreCase("application/vnd.ms-excel.sheet.binary.macroenabled.12")
                || mimeType.equalsIgnoreCase("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
                || mimeType.equalsIgnoreCase("application/vnd.ms-excel")) {
            return true;
        }
        return mimeType.equalsIgnoreCase("application/x-tika-ooxml")
                && hasExtension(extension, "xls", "xlsx", "xlsm");
    }

    /** Null-safe, case-insensitive check of the extension against the candidates. */
    private static boolean hasExtension(String extension, String... candidates) {
        for (String candidate : candidates) {
            // candidate is never null, so a null extension simply does not match
            if (candidate.equalsIgnoreCase(extension)) {
                return true;
            }
        }
        return false;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy