All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.strings.StringsParser Maven / Gradle / Ivy

There is a newer version: 3.0.0-BETA2
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.strings;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Parser that uses the "strings" (or strings-alternative) command to find the
 * printable strings in a object, or other binary, file
 * (application/octet-stream). Useful as "best-effort" parser for files detected
 * as application/octet-stream.
 * 
 * @author gtotaro
 *
 */
public class StringsParser extends AbstractParser {
	/**
	 * Serial version UID
	 */
	private static final long serialVersionUID = 802566634661575025L;

	private static final Set SUPPORTED_TYPES = Collections
			.singleton(MediaType.OCTET_STREAM);

	private static final StringsConfig DEFAULT_STRINGS_CONFIG = new StringsConfig();
	
	private static final FileConfig DEFAULT_FILE_CONFIG = new FileConfig();
	
	/*
	 * This map is organized as follows:
	 * command's pathname (String) -> is it present? (Boolean), does it support -e option? (Boolean)
	 * It stores check results for command and, if present, -e (encoding) option.
	 */
	private static Map STRINGS_PRESENT = new HashMap();

	@Override
	public Set getSupportedTypes(ParseContext context) {
		return SUPPORTED_TYPES;
	}

	@Override
	public void parse(InputStream stream, ContentHandler handler,
			Metadata metadata, ParseContext context) throws IOException,
			SAXException, TikaException {
		StringsConfig stringsConfig = context.get(StringsConfig.class, DEFAULT_STRINGS_CONFIG);
		FileConfig fileConfig = context.get(FileConfig.class, DEFAULT_FILE_CONFIG);

		if (!hasStrings(stringsConfig)) {
			return;
		}

		TikaInputStream tis = TikaInputStream.get(stream);
		File input = tis.getFile();

		// Metadata
		metadata.set("strings:min-len", "" + stringsConfig.getMinLength());
		metadata.set("strings:encoding", stringsConfig.toString());
		metadata.set("strings:file_output", doFile(input, fileConfig));

		int totalBytes = 0;

		// Content
		XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

		xhtml.startDocument();

		totalBytes = doStrings(input, stringsConfig, xhtml);

		xhtml.endDocument();

		// Metadata
		metadata.set("strings:length", "" + totalBytes);
	}

	/**
	 * Checks if the "strings" command is supported.
	 * 
	 * @param config
	 *            {@see StringsConfig} object used for testing the strings
	 *            command.
	 * @return Returns returns {@code true} if the strings command is supported.
	 */
	private boolean hasStrings(StringsConfig config) {
		String stringsProg = config.getStringsPath() + getStringsProg();
		
		if (STRINGS_PRESENT.containsKey(stringsProg)) {
			return STRINGS_PRESENT.get(stringsProg)[0];
		}

		String[] checkCmd = { stringsProg, "--version" };
		try {
			boolean hasStrings = ExternalParser.check(checkCmd);

			boolean encodingOpt = false;

			// Check if the -e option (encoding) is supported
			if (!System.getProperty("os.name").startsWith("Windows")) {
				String[] checkOpt = {stringsProg, "-e", "" + config.getEncoding().get(), "/dev/null"};
				int[] errorValues = {1, 2}; // Exit status code: 1 = general error; 2 = incorrect usage.
				encodingOpt = ExternalParser.check(checkOpt, errorValues);
			}
		
			Boolean[] values = {hasStrings, encodingOpt};
			STRINGS_PRESENT.put(stringsProg, values);

			return hasStrings;
		} catch (NoClassDefFoundError ncdfe) {
			// This happens under OSGi + Fork Parser - see TIKA-1507
			// As a workaround for now, just say we can't use strings
			// TODO Resolve it so we don't need this try/catch block
			Boolean[] values = {false, false};
			STRINGS_PRESENT.put(stringsProg, values);
			return false;
		}
	}

	/**
	 * Checks if the "file" command is supported.
	 * 
	 * @param config
	 * @return
	 */
	private boolean hasFile(FileConfig config) {
		String fileProg = config.getFilePath() + getFileProg();

		String[] checkCmd = { fileProg, "--version" };

		boolean hasFile = ExternalParser.check(checkCmd);

		return hasFile;
	}

	/**
	 * Runs the "strings" command on the given file.
	 * 
	 * @param input
	 *            {@see File} object that represents the file to parse.
	 * @param config
	 *            {@see StringsConfig} object including the strings
	 *            configuration.
	 * @param xhtml
	 *            {@see XHTMLContentHandler} object.
	 * @return the total number of bytes read using the strings command.
	 * @throws IOException
	 *             if any I/O error occurs.
	 * @throws TikaException
	 *             if the parsing process has been interrupted.
	 * @throws SAXException
	 */
	private int doStrings(File input, StringsConfig config,
			XHTMLContentHandler xhtml) throws IOException, TikaException,
			SAXException {
		
		String stringsProg = config.getStringsPath() + getStringsProg();
		
		// Builds the command array
		ArrayList cmdList = new ArrayList(4);
		cmdList.add(stringsProg);
		cmdList.add("-n");
		cmdList.add("" + config.getMinLength());;
		// Currently, encoding option is not supported by Windows (and other) versions
		if (STRINGS_PRESENT.get(stringsProg)[1]) {
			cmdList.add("-e");
			cmdList.add("" + config.getEncoding().get());
		}
		cmdList.add(input.getPath());
		
		String[] cmd = cmdList.toArray(new String[cmdList.size()]);
		
		ProcessBuilder pb = new ProcessBuilder(cmd);
		final Process process = pb.start();

		InputStream out = process.getInputStream();

		FutureTask waitTask = new FutureTask(
				new Callable() {
					public Integer call() throws Exception {
						return process.waitFor();
					}
				});

		Thread waitThread = new Thread(waitTask);
		waitThread.start();

		// Reads content printed out by "strings" command
		int totalBytes = 0;
		totalBytes = extractOutput(out, xhtml);		

		try {
			waitTask.get(config.getTimeout(), TimeUnit.SECONDS);

		} catch (InterruptedException ie) {
			waitThread.interrupt();
			process.destroy();
			Thread.currentThread().interrupt();
			throw new TikaException(StringsParser.class.getName()
					+ " interrupted", ie);

		} catch (ExecutionException ee) {
			// should not be thrown

		} catch (TimeoutException te) {
			waitThread.interrupt();
			process.destroy();
			throw new TikaException(StringsParser.class.getName() + " timeout",
					te);
		}

		return totalBytes;
	}

	/**
	 * Extracts ASCII strings using the "strings" command.
	 * 
	 * @param stream
	 *            {@see InputStream} object used for reading the binary file.
	 * @param xhtml
	 *            {@see XHTMLContentHandler} object.
	 * @return the total number of bytes read using the "strings" command.
	 * @throws SAXException
	 *             if the content element could not be written.
	 * @throws IOException
	 *             if any I/O error occurs.
	 */
	private int extractOutput(InputStream stream, XHTMLContentHandler xhtml)
			throws SAXException, IOException {

		char[] buffer = new char[1024];
		int totalBytes = 0;

		try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, UTF_8))) {
			int n = 0;
			while ((n = reader.read(buffer)) != -1) {
				if (n > 0) {
					xhtml.characters(buffer, 0, n);
				}
				totalBytes += n;
			}

		}

		return totalBytes;
	}

	/**
	 * Runs the "file" command on the given file that aims at providing an
	 * alternative way to determine the file type.
	 * 
	 * @param input
	 *            {@see File} object that represents the file to detect.
	 * @return the file type provided by the "file" command using the "-b"
	 *         option (it stands for "brief mode").
	 * @throws IOException
	 *             if any I/O error occurs.
	 */
	private String doFile(File input, FileConfig config) throws IOException {
		if (!hasFile(config)) {
			return null;
		}
		
		// Builds the command array
		ArrayList cmdList = new ArrayList(3);
		cmdList.add(config.getFilePath() + getFileProg());
		cmdList.add("-b");
		if (config.isMimetype()) {
			cmdList.add("-I");
		}
		cmdList.add(input.getPath());
		
		String[] cmd = cmdList.toArray(new String[cmdList.size()]);

		ProcessBuilder pb = new ProcessBuilder(cmd);
		final Process process = pb.start();

		InputStream out = process.getInputStream();

		String fileOutput = null;

		try (BufferedReader reader = new BufferedReader(new InputStreamReader(out, UTF_8))) {
			fileOutput = reader.readLine();
		} catch (IOException ioe) {
			// file output not available!
			fileOutput = "";
		}

		return fileOutput;
	}

	
	public static String getStringsProg() {
		return System.getProperty("os.name").startsWith("Windows") ? "strings.exe" : "strings";
	}
	
	public static String getFileProg() {
		return System.getProperty("os.name").startsWith("Windows") ? "file.exe" : "file";
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy