All downloads are free. The search and download features use the official Maven repository.

prerna.util.FileAnalyzer Maven / Gradle / Ivy

The newest version!
package prerna.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.fileupload.FileItem;
import org.apache.commons.io.FilenameUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

/**
 * Inspects an uploaded {@link FileItem} and decides whether its content looks
 * like plain, delimited text (as opposed to an archive, image, office
 * document or PDF).
 *
 * <p>The check runs in two stages: a MIME type detection through Tika that
 * rejects well-known binary formats, followed by a heuristic that decodes the
 * first characters of the upload with a few common encodings. The encoding
 * that made the heuristic succeed is remembered and exposed through
 * {@link #getCharset()}.
 */
public class FileAnalyzer {

	private static final Logger classLogger = LogManager.getLogger(FileAnalyzer.class);

	/** Maximum number of characters sampled from the upload per encoding attempt. */
	private static final int SAMPLE_SIZE = 4096;

	/** Encodings tried, in order, when sampling the content. */
	private static final List<Charset> COMMON_ENCODINGS = Arrays.asList(
			StandardCharsets.UTF_8,
			StandardCharsets.ISO_8859_1, // same as latin1
			Charset.forName("Windows-1252") // same as cp1252
			);

	private final FileItem item;
	private Charset charset = null;

	/**
	 * @param item the uploaded item to analyze
	 */
	public FileAnalyzer(FileItem item) {
		this.item = item;
	}

	/**
	 * Determines whether the uploaded item looks like text content.
	 *
	 * <p>Detection failures are logged and do not abort the analysis; the
	 * method then falls back to the content heuristic alone. An empty upload
	 * is reported as not being text.
	 *
	 * @return {@code true} if the content was decoded with one of the common
	 *         encodings and looks like text; {@code false} for recognized
	 *         binary formats, empty uploads, or content that fails the heuristic
	 * @throws IOException declared for API compatibility with existing callers
	 */
	public boolean isTextContent() throws IOException {
		String fileName = item.getName();
		// getExtension returns null for a null name; the helpers below are null-safe
		String filetype = FilenameUtils.getExtension(fileName);

		String mimeType = detectMimeType(fileName);
		if (mimeType != null && isKnownBinaryType(mimeType, filetype)) {
			return false;
		}

		for (Charset candidate : COMMON_ENCODINGS) {
			try (InputStream is = item.getInputStream();
					InputStreamReader isr = new InputStreamReader(is, candidate);
					BufferedReader reader = new BufferedReader(isr)) {
				char[] buffer = new char[SAMPLE_SIZE];
				int charsRead = reader.read(buffer);
				if (charsRead == -1) {
					return false; // Empty file
				}
				String contentSnippet = new String(buffer, 0, charsRead);
				if (isLikelyText(contentSnippet)) {
					this.charset = candidate;
					return true;
				}
			} catch (IOException e) {
				// Best effort: record the failure and try the next encoding
				classLogger.debug("Unable to sample content using charset {}", candidate, e);
			}
		}
		return false;
	}

	/**
	 * Asks the default Tika detector for the MIME type of the upload, passing
	 * the file name (when available) as a detection hint.
	 *
	 * @param fileName the original name of the upload, may be {@code null}
	 * @return the detected MIME type, or {@code null} if reading the upload failed
	 */
	private String detectMimeType(String fileName) {
		TikaConfig config = TikaConfig.getDefaultConfig();
		Detector detector = config.getDetector();
		Metadata metadata = new Metadata();
		if (fileName != null) {
			metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
		}

		try (TikaInputStream stream = TikaInputStream.get(this.item.getInputStream())) {
			return detector.detect(stream, metadata).toString();
		} catch (IOException e) {
			classLogger.error(Constants.ERROR_MESSAGE, e);
			return null;
		}
	}

	/**
	 * Checks the detected MIME type (and, for generic container types, the
	 * file extension) against the binary formats this class rejects: zip,
	 * images, Word, PowerPoint, Excel and PDF.
	 *
	 * @param mimeType the detected MIME type, never {@code null}
	 * @param filetype the file extension without the dot, may be {@code null}
	 * @return {@code true} if the type is one of the recognized binary formats
	 */
	private static boolean isKnownBinaryType(String mimeType, String filetype) {
		// zip or image
		if (mimeType.equals("application/zip") || mimeType.startsWith("image/")) {
			return true;
		}

		// Generic OOXML container type; the extension decides which office format it is
		boolean genericOoxml = mimeType.equalsIgnoreCase("application/x-tika-ooxml");

		// document
		if (mimeType.equalsIgnoreCase("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
				|| ((genericOoxml
						|| mimeType.equalsIgnoreCase("application/msword")
						|| mimeType.equalsIgnoreCase("application/x-tika-msoffice"))
						&& hasExtension(filetype, "doc", "docx"))) {
			return true;
		}

		// powerpoint
		if (mimeType.equalsIgnoreCase("application/vnd.openxmlformats-officedocument.presentationml.presentation")
				|| ((genericOoxml
						|| mimeType.equalsIgnoreCase("application/vnd.ms-powerpoint"))
						&& hasExtension(filetype, "ppt", "pptx"))) {
			return true;
		}

		// excel
		if (mimeType.equalsIgnoreCase("application/vnd.ms-excel.sheet.macroenabled.12")
				|| mimeType.equalsIgnoreCase("application/vnd.ms-excel.sheet.binary.macroenabled.12")
				|| mimeType.equalsIgnoreCase("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
				|| mimeType.equalsIgnoreCase("application/vnd.ms-excel")
				|| (genericOoxml && hasExtension(filetype, "xls", "xlsx", "xlsm"))) {
			return true;
		}

		// pdf
		return mimeType.equalsIgnoreCase("application/pdf");
	}

	/**
	 * Null-safe, case-insensitive extension match, so that {@code "DOCX"} is
	 * treated like {@code "docx"} and a missing extension matches nothing.
	 *
	 * @param filetype   the extension to test, may be {@code null}
	 * @param candidates the lower-case extensions to compare against
	 * @return {@code true} if {@code filetype} equals one of the candidates
	 */
	private static boolean hasExtension(String filetype, String... candidates) {
		for (String candidate : candidates) {
			if (candidate.equalsIgnoreCase(filetype)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Heuristic text check on a decoded sample: every character must be
	 * whitespace, an ISO control character, or lie in the ranges 32-126 or
	 * 128-255, and the sample must contain at least one line break, comma or
	 * tab.
	 *
	 * <p>NOTE(review): every code point from 0 to 255 passes the character
	 * check, so a sample decoded as ISO-8859-1 is only rejected by the
	 * delimiter requirement - confirm this leniency is intended.
	 *
	 * @param contentSnippet the decoded sample to inspect
	 * @return {@code true} if the sample looks like delimited text
	 */
	private boolean isLikelyText(String contentSnippet) {
		// Check for non-text characters and common text patterns
		boolean hasNonTextCharacters = contentSnippet.chars().anyMatch(c ->
		!(Character.isWhitespace(c) || Character.isISOControl(c) || (c >= 32 && c <= 126) || (c >= 128 && c <= 255))
				);
		if (hasNonTextCharacters) {
			return false;
		}
		return contentSnippet.contains("\n") || contentSnippet.contains("\r") ||
				contentSnippet.contains(",") || contentSnippet.contains("\t");
	}

	/**
	 * Returns the encoding recorded by the last successful
	 * {@link #isTextContent()} call.
	 *
	 * @return the charset that made the text heuristic succeed, or
	 *         {@code null} if no successful check has happened yet
	 */
	public Charset getCharset() {
		return charset;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy