
prerna.util.FileAnalyzer Maven / Gradle / Ivy
The newest version!
package prerna.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.fileupload.FileItem;
import org.apache.commons.io.FilenameUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
/**
 * Inspects an uploaded {@link FileItem} to decide whether its content looks like plain text
 * and, when it does, records which of a small list of common encodings decoded it.
 *
 * <p>The check runs in two stages: a Tika MIME-type detection that rules out archives, images,
 * office documents and PDFs, followed by a sample of the first characters of the content decoded
 * with each candidate encoding in turn.
 */
public class FileAnalyzer {

    private static final Logger classLogger = LogManager.getLogger(FileAnalyzer.class);

    /** Encodings tried, in this order, when sampling the content. */
    private static final List<Charset> COMMON_ENCODINGS = Arrays.asList(
            StandardCharsets.UTF_8,
            StandardCharsets.ISO_8859_1, // same as latin1
            Charset.forName("Windows-1252") // same as cp1252
    );

    /** Maximum number of characters sampled from the start of the content. */
    private static final int SNIFF_BUFFER_SIZE = 4096;

    /** Character a decoder substitutes for bytes that are malformed in the chosen encoding. */
    private static final char REPLACEMENT_CHARACTER = '\uFFFD';

    private final FileItem item;
    private Charset charset = null;

    /**
     * @param item the uploaded item to analyze
     */
    public FileAnalyzer(FileItem item) {
        this.item = item;
    }

    /**
     * Determines whether the item's content is likely to be plain text.
     *
     * <p>On a {@code true} result the encoding that decoded the sample is available from
     * {@link #getCharset()}.
     *
     * @return {@code true} if the content sample looks like delimited/line-based text in one of
     *         the common encodings; {@code false} for recognized binary formats, empty content,
     *         or content that does not look like text
     * @throws IOException declared for callers; read failures are logged and handled internally
     */
    public boolean isTextContent() throws IOException {
        String fileName = item.getName();
        // getExtension(null) returns null; every extension comparison below is null-safe
        String filetype = FilenameUtils.getExtension(fileName);

        String mimeType = detectMimeType(fileName);
        if (mimeType != null && isKnownBinaryMimeType(mimeType, filetype)) {
            return false;
        }

        for (Charset candidate : COMMON_ENCODINGS) {
            try (InputStream is = item.getInputStream();
                    InputStreamReader isr = new InputStreamReader(is, candidate);
                    BufferedReader reader = new BufferedReader(isr)) {
                char[] buffer = new char[SNIFF_BUFFER_SIZE];
                int charsRead = reader.read(buffer);
                if (charsRead == -1) {
                    return false; // Empty file
                }
                String contentSnippet = new String(buffer, 0, charsRead);
                if (isLikelyText(contentSnippet)) {
                    this.charset = candidate;
                    return true;
                }
            } catch (IOException e) {
                // Best effort: a failed read with one encoding must not stop the others
                classLogger.debug("Could not sample content as {}; trying the next encoding", candidate, e);
            }
        }
        return false;
    }

    /**
     * Asks Tika for the MIME type of the item, using the file name as a hint when present.
     *
     * @param fileName the item's file name, may be {@code null}
     * @return the detected MIME type, or {@code null} if the content could not be read
     */
    private String detectMimeType(String fileName) {
        Detector detector = TikaConfig.getDefaultConfig().getDetector();
        Metadata metadata = new Metadata();
        if (fileName != null) {
            metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
        }
        try (TikaInputStream stream = TikaInputStream.get(this.item.getInputStream())) {
            return detector.detect(stream, metadata).toString();
        } catch (IOException e) {
            classLogger.error(Constants.ERROR_MESSAGE, e);
            return null;
        }
    }

    /**
     * Tells whether the MIME type (combined with the file extension for the generic Tika
     * container types) identifies a format that is never plain text.
     *
     * @param mimeType the detected MIME type, not {@code null}
     * @param filetype the file extension without the dot, may be {@code null}
     * @return {@code true} for zip, image, Word, PowerPoint, Excel and PDF content
     */
    private static boolean isKnownBinaryMimeType(String mimeType, String filetype) {
        // zip
        if (mimeType.equals("application/zip")) {
            return true;
        }
        // image
        if (mimeType.startsWith("image/")) {
            return true;
        }
        // application/x-tika-ooxml is shared by docx/pptx/xlsx, so the extension disambiguates
        boolean genericOoxml = mimeType.equalsIgnoreCase("application/x-tika-ooxml");

        // document
        if (mimeType.equalsIgnoreCase("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
                || ((genericOoxml
                        || mimeType.equalsIgnoreCase("application/msword")
                        || mimeType.equalsIgnoreCase("application/x-tika-msoffice"))
                    && hasExtension(filetype, "doc", "docx"))) {
            return true;
        }
        // powerpoint
        if (mimeType.equalsIgnoreCase("application/vnd.openxmlformats-officedocument.presentationml.presentation")
                || ((genericOoxml || mimeType.equalsIgnoreCase("application/vnd.ms-powerpoint"))
                    && hasExtension(filetype, "ppt", "pptx"))) {
            return true;
        }
        // excel
        if (mimeType.equalsIgnoreCase("application/vnd.ms-excel.sheet.macroenabled.12")
                || mimeType.equalsIgnoreCase("application/vnd.ms-excel.sheet.binary.macroenabled.12")
                || mimeType.equalsIgnoreCase("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
                || mimeType.equalsIgnoreCase("application/vnd.ms-excel")
                || (genericOoxml && hasExtension(filetype, "xls", "xlsx", "xlsm"))) {
            return true;
        }
        // pdf
        return mimeType.equalsIgnoreCase("application/pdf");
    }

    /**
     * Null-safe, case-insensitive extension match.
     *
     * @param filetype   the extension to test, may be {@code null}
     * @param candidates the extensions to compare against
     * @return {@code true} if {@code filetype} equals one of the candidates ignoring case
     */
    private static boolean hasExtension(String filetype, String... candidates) {
        for (String candidate : candidates) {
            if (candidate.equalsIgnoreCase(filetype)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Heuristic check that a decoded sample looks like text.
     *
     * <p>The sample is rejected when it contains the replacement character, which is what the
     * reader emits for bytes that are not valid in the encoding being tried. Any other decoded
     * character is accepted, so valid UTF-8 text outside the Latin-1 range is reported as UTF-8
     * instead of falling through to ISO-8859-1. The sample must also contain at least one line
     * break, comma or tab.
     *
     * <p>NOTE(review): control characters such as NUL are still accepted, so binary content that
     * happens to contain a delimiter can be classified as ISO-8859-1 text; confirm whether
     * callers rely on that before tightening it.
     *
     * @param contentSnippet the decoded sample of the content
     * @return {@code true} if the sample decoded cleanly and contains a common text delimiter
     */
    private boolean isLikelyText(String contentSnippet) {
        boolean hasNonTextCharacters = contentSnippet.indexOf(REPLACEMENT_CHARACTER) >= 0;
        if (hasNonTextCharacters) {
            return false;
        }
        return contentSnippet.contains("\n") || contentSnippet.contains("\r") ||
                contentSnippet.contains(",") || contentSnippet.contains("\t");
    }

    /**
     * Returns the encoding recorded by the last successful {@link #isTextContent()} call.
     *
     * @return the detected encoding, or {@code null} if no call has returned {@code true} yet
     */
    public Charset getCharset() {
        return charset;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy