All downloads are free. The search and download features use the official Maven repository.

org.apache.tika.Tika Maven / Gradle / Ivy

Go to download

This is the core Apache Tika™ toolkit library from which all other modules inherit functionality. It also includes the core facades for the Tika API.

There is a newer version: 3.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URL;
import java.nio.file.Path;
import java.util.Properties;

import org.xml.sax.SAXException;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.language.translate.Translator;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParsingReader;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;

/**
 * Facade class for accessing Tika functionality. This class hides much of
 * the underlying complexity of the lower level Tika classes and provides
 * simple methods for many common parsing and type detection operations.
 *
 * @see Parser
 * @see Detector
 * @since Apache Tika 0.5
 */
public class Tika {

    /**
     * The detector instance used by this facade.
     */
    private final Detector detector;

    /**
     * The parser instance used by this facade.
     */
    private final Parser parser;

    /**
     * The Translator instance used by this facade.
     */
    private final Translator translator;

    /**
     * Maximum length of the strings returned by the parseToString methods.
     * Used to prevent out of memory problems with huge input documents.
     * The default setting is 100k characters.
     */
    private int maxStringLength = 100 * 1000;

    /**
     * Creates a Tika facade using the given detector and parser instances, but the default
     * Translator.
     *
     * @param detector type detector
     * @param parser   document parser
     * @since Apache Tika 0.8
     */
    public Tika(Detector detector, Parser parser) {
        this.detector = detector;
        this.parser = parser;
        this.translator = TikaConfig.getDefaultConfig().getTranslator();
    }

    /**
     * Creates a Tika facade using the given detector, parser, and translator instances.
     *
     * @param detector   type detector
     * @param parser     document parser
     * @param translator text translator
     * @since Apache Tika 1.6
     */
    public Tika(Detector detector, Parser parser, Translator translator) {
        this.detector = detector;
        this.parser = parser;
        this.translator = translator;
    }

    /**
     * Creates a Tika facade using the given configuration.
     *
     * @param config Tika configuration
     */
    public Tika(TikaConfig config) {
        this(config.getDetector(), new AutoDetectParser(config), config.getTranslator());
    }

    /**
     * Creates a Tika facade using the default configuration.
     */
    public Tika() {
        this(TikaConfig.getDefaultConfig());
    }

    /**
     * Creates a Tika facade using the given detector instance, the
     * default parser configuration, and the default Translator.
     *
     * @param detector type detector
     * @since Apache Tika 0.8
     */
    public Tika(Detector detector) {
        this(detector, new AutoDetectParser(detector));
    }


    /**
     * Detects the media type of the given document. The type detection is
     * based on the content of the given document stream and any given
     * document metadata. The document stream can be null,
     * in which case only the given document metadata is used for type
     * detection.
     * 

* If the document stream supports the * {@link InputStream#markSupported() mark feature}, then the stream is * marked and reset to the original position before this method returns. * Only a limited number of bytes are read from the stream. *

* The given document stream is not closed by this method. *

* Unlike in the {@link #parse(InputStream, Metadata)} method, the * given document metadata is not modified by this method. * * @param stream the document stream, or null * @param metadata document metadata * @return detected media type * @throws IOException if the stream can not be read */ public String detect(InputStream stream, Metadata metadata) throws IOException { if (stream == null || stream.markSupported()) { return detector.detect(stream, metadata).toString(); } else { return detector.detect(new BufferedInputStream(stream), metadata).toString(); } } /** * Detects the media type of the given document. The type detection is * based on the content of the given document stream and the name of the * document. *

* If the document stream supports the * {@link InputStream#markSupported() mark feature}, then the stream is * marked and reset to the original position before this method returns. * Only a limited number of bytes are read from the stream. *

* The given document stream is not closed by this method. * * @param stream the document stream * @param name document name * @return detected media type * @throws IOException if the stream can not be read * @since Apache Tika 0.9 */ public String detect(InputStream stream, String name) throws IOException { Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); return detect(stream, metadata); } /** * Detects the media type of the given document. The type detection is * based on the content of the given document stream. *

* If the document stream supports the * {@link InputStream#markSupported() mark feature}, then the stream is * marked and reset to the original position before this method returns. * Only a limited number of bytes are read from the stream. *

* The given document stream is not closed by this method. * * @param stream the document stream * @return detected media type * @throws IOException if the stream can not be read */ public String detect(InputStream stream) throws IOException { return detect(stream, new Metadata()); } /** * Detects the media type of the given document. The type detection is * based on the first few bytes of a document and the document name. *

* For best results at least a few kilobytes of the document data * are needed. See also the other detect() methods for better * alternatives when you have more than just the document prefix * available for type detection. * * @param prefix first few bytes of the document * @param name document name * @return detected media type * @since Apache Tika 0.9 */ public String detect(byte[] prefix, String name) { try { try (InputStream stream = TikaInputStream.get(prefix)) { return detect(stream, name); } } catch (IOException e) { throw new IllegalStateException("Unexpected IOException", e); } } /** * Detects the media type of the given document. The type detection is * based on the first few bytes of a document. *

* For best results at least a few kilobytes of the document data * are needed. See also the other detect() methods for better * alternatives when you have more than just the document prefix * available for type detection. * * @param prefix first few bytes of the document * @return detected media type * @since Apache Tika 0.9 */ public String detect(byte[] prefix) { try { try (InputStream stream = TikaInputStream.get(prefix)) { return detect(stream); } } catch (IOException e) { throw new IllegalStateException("Unexpected IOException", e); } } /** * Detects the media type of the file at the given path. The type * detection is based on the document content and a potential known * file extension. *

* Use the {@link #detect(String)} method when you want to detect the * type of the document without actually accessing the file. * * @param path the path of the file * @return detected media type * @throws IOException if the file can not be read */ public String detect(Path path) throws IOException { Metadata metadata = new Metadata(); try (InputStream stream = TikaInputStream.get(path, metadata)) { return detect(stream, metadata); } } /** * Detects the media type of the given file. The type detection is * based on the document content and a potential known file extension. *

* Use the {@link #detect(String)} method when you want to detect the * type of the document without actually accessing the file. * * @param file the file * @return detected media type * @throws IOException if the file can not be read * @see #detect(Path) */ public String detect(File file) throws IOException { Metadata metadata = new Metadata(); try (@SuppressWarnings("deprecation") InputStream stream = TikaInputStream .get(file, metadata)) { return detect(stream, metadata); } } /** * Detects the media type of the resource at the given URL. The type * detection is based on the document content and a potential known * file extension included in the URL. *

* Use the {@link #detect(String)} method when you want to detect the * type of the document without actually accessing the URL. * * @param url the URL of the resource * @return detected media type * @throws IOException if the resource can not be read */ public String detect(URL url) throws IOException { Metadata metadata = new Metadata(); try (InputStream stream = TikaInputStream.get(url, metadata)) { return detect(stream, metadata); } } /** * Detects the media type of a document with the given file name. * The type detection is based on known file name extensions. *

* The given name can also be a URL or a full file path. In such cases * only the file name part of the string is used for type detection. * * @param name the file name of the document * @return detected media type */ public String detect(String name) { try { return detect((InputStream) null, name); } catch (IOException e) { throw new IllegalStateException("Unexpected IOException", e); } } /** * Translate the given text String to and from the given languages. * * @param text The text to translate. * @param sourceLanguage The input text language (for example, "hi"). * @param targetLanguage The desired output language (for example, "fr"). * @return The translated text. If translation is unavailable (client keys not set), returns * the same text back. * @see org.apache.tika.language.translate.Translator */ public String translate(String text, String sourceLanguage, String targetLanguage) { try { return translator.translate(text, sourceLanguage, targetLanguage); } catch (Exception e) { throw new IllegalStateException("Error translating data.", e); } } /** * Translate the given text String to the given language, attempting to auto-detect the * source language. * * @param text The text to translate. * @param targetLanguage The desired output language (for example, "en"). * @return The translated text. If translation is unavailable (client keys not set), returns * the same text back. * @see org.apache.tika.language.translate.Translator */ public String translate(String text, String targetLanguage) { try { return translator.translate(text, targetLanguage); } catch (Exception e) { throw new IllegalStateException("Error translating data.", e); } } /** * Parses the given document and returns the extracted text content. * Input metadata like a file name or a content type hint can be passed * in the given metadata instance. Metadata information extracted from * the document is returned in that same metadata instance. *

* The returned reader will be responsible for closing the given stream. * The stream and any associated resources will be closed at or before * the time when the {@link Reader#close()} method is called. * * @param stream the document to be parsed * @param metadata where document's metadata will be populated * @return extracted text content * @throws IOException if the document can not be read or parsed */ public Reader parse(InputStream stream, Metadata metadata) throws IOException { ParseContext context = new ParseContext(); context.set(Parser.class, parser); return new ParsingReader(parser, stream, metadata, context); } /** * Parses the given document and returns the extracted text content. *

* The returned reader will be responsible for closing the given stream. * The stream and any associated resources will be closed at or before * the time when the {@link Reader#close()} method is called. * * @param stream the document to be parsed * @return extracted text content * @throws IOException if the document can not be read or parsed */ public Reader parse(InputStream stream) throws IOException { return parse(stream, new Metadata()); } /** * Parses the file at the given path and returns the extracted text content. *

* Metadata information extracted from the document is returned in * the supplied metadata instance. * * @param path the path of the file to be parsed * @param metadata where document's metadata will be populated * @return extracted text content * @throws IOException if the file can not be read or parsed */ public Reader parse(Path path, Metadata metadata) throws IOException { InputStream stream = TikaInputStream.get(path, metadata); return parse(stream, metadata); } /** * Parses the file at the given path and returns the extracted text content. * * @param path the path of the file to be parsed * @return extracted text content * @throws IOException if the file can not be read or parsed */ public Reader parse(Path path) throws IOException { return parse(path, new Metadata()); } /** * Parses the given file and returns the extracted text content. *

* Metadata information extracted from the document is returned in * the supplied metadata instance. * * @param file the file to be parsed * @param metadata where document's metadata will be populated * @return extracted text content * @throws IOException if the file can not be read or parsed * @see #parse(Path) */ public Reader parse(File file, Metadata metadata) throws IOException { @SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata); return parse(stream, metadata); } /** * Parses the given file and returns the extracted text content. * * @param file the file to be parsed * @return extracted text content * @throws IOException if the file can not be read or parsed * @see #parse(Path) */ public Reader parse(File file) throws IOException { return parse(file, new Metadata()); } /** * Parses the resource at the given URL and returns the extracted * text content. * * @param url the URL of the resource to be parsed * @return extracted text content * @throws IOException if the resource can not be read or parsed */ public Reader parse(URL url) throws IOException { Metadata metadata = new Metadata(); InputStream stream = TikaInputStream.get(url, metadata); return parse(stream, metadata); } /** * Parses the given document and returns the extracted text content. * The given input stream is closed by this method. *

* To avoid unpredictable excess memory use, the returned string contains * only up to {@link #getMaxStringLength()} first characters extracted * from the input document. Use the {@link #setMaxStringLength(int)} * method to adjust this limitation. *

* NOTE: Unlike most other Tika methods that take an * {@link InputStream}, this method will close the given stream for * you as a convenience. With other methods you are still responsible * for closing the stream or a wrapper instance returned by Tika. * * @param stream the document to be parsed * @param metadata document metadata * @return extracted text content * @throws IOException if the document can not be read * @throws TikaException if the document can not be parsed */ public String parseToString(InputStream stream, Metadata metadata) throws IOException, TikaException { return parseToString(stream, metadata, maxStringLength); } /** * Parses the given document and returns the extracted text content. * The given input stream is closed by this method. This method lets * you control the maxStringLength per call. *

* To avoid unpredictable excess memory use, the returned string contains * only up to maxLength (parameter) first characters extracted * from the input document. *

* NOTE: Unlike most other Tika methods that take an * {@link InputStream}, this method will close the given stream for * you as a convenience. With other methods you are still responsible * for closing the stream or a wrapper instance returned by Tika. * * @param stream the document to be parsed * @param metadata document metadata * @param maxLength maximum length of the returned string * @return extracted text content * @throws IOException if the document can not be read * @throws TikaException if the document can not be parsed */ public String parseToString(InputStream stream, Metadata metadata, int maxLength) throws IOException, TikaException { WriteOutContentHandler handler = new WriteOutContentHandler(maxLength); try { ParseContext context = new ParseContext(); context.set(Parser.class, parser); parser.parse(stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!WriteLimitReachedException.isWriteLimitReached(e)) { // This should never happen with BodyContentHandler... throw new TikaException("Unexpected SAX processing failure", e); } } finally { stream.close(); } return handler.toString(); } /** * Parses the given document and returns the extracted text content. * The given input stream is closed by this method. *

* To avoid unpredictable excess memory use, the returned string contains * only up to {@link #getMaxStringLength()} first characters extracted * from the input document. Use the {@link #setMaxStringLength(int)} * method to adjust this limitation. *

* NOTE: Unlike most other Tika methods that take an * {@link InputStream}, this method will close the given stream for * you as a convenience. With other methods you are still responsible * for closing the stream or a wrapper instance returned by Tika. * * @param stream the document to be parsed * @return extracted text content * @throws IOException if the document can not be read * @throws TikaException if the document can not be parsed */ public String parseToString(InputStream stream) throws IOException, TikaException { return parseToString(stream, new Metadata()); } /** * Parses the file at the given path and returns the extracted text content. *

* To avoid unpredictable excess memory use, the returned string contains * only up to {@link #getMaxStringLength()} first characters extracted * from the input document. Use the {@link #setMaxStringLength(int)} * method to adjust this limitation. * * @param path the path of the file to be parsed * @return extracted text content * @throws IOException if the file can not be read * @throws TikaException if the file can not be parsed */ public String parseToString(Path path) throws IOException, TikaException { Metadata metadata = new Metadata(); InputStream stream = TikaInputStream.get(path, metadata); return parseToString(stream, metadata); } /** * Parses the given file and returns the extracted text content. *

* To avoid unpredictable excess memory use, the returned string contains * only up to {@link #getMaxStringLength()} first characters extracted * from the input document. Use the {@link #setMaxStringLength(int)} * method to adjust this limitation. * * @param file the file to be parsed * @return extracted text content * @throws IOException if the file can not be read * @throws TikaException if the file can not be parsed * @see #parseToString(Path) */ public String parseToString(File file) throws IOException, TikaException { Metadata metadata = new Metadata(); @SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata); return parseToString(stream, metadata); } /** * Parses the resource at the given URL and returns the extracted * text content. *

* To avoid unpredictable excess memory use, the returned string contains * only up to {@link #getMaxStringLength()} first characters extracted * from the input document. Use the {@link #setMaxStringLength(int)} * method to adjust this limitation. * * @param url the URL of the resource to be parsed * @return extracted text content * @throws IOException if the resource can not be read * @throws TikaException if the resource can not be parsed */ public String parseToString(URL url) throws IOException, TikaException { Metadata metadata = new Metadata(); InputStream stream = TikaInputStream.get(url, metadata); return parseToString(stream, metadata); } /** * Returns the maximum length of strings returned by the * parseToString methods. * * @return maximum string length, or -1 if the limit has been disabled * @since Apache Tika 0.7 */ public int getMaxStringLength() { return maxStringLength; } /** * Sets the maximum length of strings returned by the parseToString * methods. * * @param maxStringLength maximum string length, * or -1 to disable this limit * @since Apache Tika 0.7 */ public void setMaxStringLength(int maxStringLength) { this.maxStringLength = maxStringLength; } /** * Returns the parser instance used by this facade. * * @return parser instance * @since Apache Tika 0.10 */ public Parser getParser() { return parser; } /** * Returns the detector instance used by this facade. * * @return detector instance * @since Apache Tika 0.10 */ public Detector getDetector() { return detector; } /** * Returns the translator instance used by this facade. 
* * @return translator instance * @since Tika 1.6 */ public Translator getTranslator() { return translator; } //--------------------------------------------------------------< Object > public String toString() { String version = null; try (InputStream stream = Tika.class .getResourceAsStream("/META-INF/maven/org.apache.tika/tika-core/pom.properties")) { if (stream != null) { Properties properties = new Properties(); properties.load(stream); version = properties.getProperty("version"); } } catch (Exception ignore) { } if (version != null) { return "Apache Tika " + version; } else { return "Apache Tika"; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy