org.apache.tika.example.ParsingExample Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of tika-example Show documentation
This module contains examples of how to use Apache Tika.
There is a newer version: 3.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.example;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

import org.xml.sax.SAXException;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;

/**
 * Collection of small, self-contained examples showing different ways to
 * parse a document with Apache Tika. Every example reads a test document
 * from the classpath, relative to this class.
 */
public class ParsingExample {

    /**
     * Opens a classpath resource that lives next to this class.
     * <p>
     * {@link Class#getResourceAsStream(String)} returns {@code null} when the
     * resource is missing; this helper turns that into an {@link IOException}
     * naming the resource, so the examples fail with a clear message instead
     * of handing a {@code null} stream to a parser.
     *
     * @param name resource name, resolved relative to this class
     * @return an open stream over the resource; the caller must close it
     * @throws IOException if the resource cannot be found
     */
    private static InputStream openResource(String name) throws IOException {
        InputStream stream = ParsingExample.class.getResourceAsStream(name);
        if (stream == null) {
            throw new IOException("Resource not found on classpath: " + name);
        }
        return stream;
    }

    /**
     * Example of how to use Tika's parseToString method to parse the content of a file,
     * and return any text found.
     * <p>
     * Note: Tika.parseToString() will extract content from the outer container
     * document and any embedded/attached documents.
     *
     * @return The content of a file.
     * @throws IOException   if the test resource cannot be found or read
     * @throws SAXException  if SAX event handling fails during parsing
     * @throws TikaException if Tika fails to parse the document
     */
    public String parseToStringExample() throws IOException, SAXException, TikaException {
        Tika tika = new Tika();
        try (InputStream stream = openResource("test.doc")) {
            return tika.parseToString(stream);
        }
    }

    /**
     * Example of how to use Tika to parse a file when you do not know its file type
     * ahead of time.
     * <p>
     * AutoDetectParser attempts to discover the file's type automatically, then call
     * the exact Parser built for that file type.
     * <p>
     * The stream to be parsed by the Parser. In this case, we get a file from the
     * resources folder of this project.
     * <p>
     * Handlers are used to get the exact information you want out of the host of
     * information gathered by Parsers. The body content handler, intuitively, extracts
     * everything that would go between HTML body tags.
     * <p>
     * The Metadata object will be filled by the Parser with Metadata discovered about
     * the file being parsed.
     * <p>
     * Note: This example will extract content from the outer document and all
     * embedded documents.  However, if you choose to use a {@link ParseContext},
     * make sure to set a {@link Parser} or else embedded content will not be
     * parsed.
     *
     * @return The content of a file.
     * @throws IOException   if the test resource cannot be found or read
     * @throws SAXException  if SAX event handling fails during parsing
     * @throws TikaException if Tika fails to parse the document
     */
    public String parseExample() throws IOException, SAXException, TikaException {
        AutoDetectParser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        try (InputStream stream = openResource("test.doc")) {
            parser.parse(stream, handler, metadata);
            return handler.toString();
        }
    }

    /**
     * If you don't want content from embedded documents, send in
     * a {@link org.apache.tika.parser.ParseContext} that contains an
     * {@link EmptyParser}.
     *
     * @return The content of a file.
     * @throws IOException   if the test resource cannot be found or read
     * @throws SAXException  if SAX event handling fails during parsing
     * @throws TikaException if Tika fails to parse the document
     */
    public String parseNoEmbeddedExample() throws IOException, SAXException, TikaException {
        AutoDetectParser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext parseContext = new ParseContext();
        // Registering an EmptyParser as the Parser in the context is what
        // suppresses the embedded documents (see the method description).
        parseContext.set(Parser.class, new EmptyParser());
        try (InputStream stream = openResource("test_recursive_embedded.docx")) {
            parser.parse(stream, handler, metadata, parseContext);
            return handler.toString();
        }
    }

    /**
     * This example shows how to extract content from the outer document and all
     * embedded documents.  The key is to specify a {@link Parser} in the {@link ParseContext}.
     *
     * @return content, including from embedded documents
     * @throws IOException   if the test resource cannot be found or read
     * @throws SAXException  if SAX event handling fails during parsing
     * @throws TikaException if Tika fails to parse the document
     */
    public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
        AutoDetectParser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        // The same auto-detecting parser is registered for embedded documents.
        context.set(Parser.class, parser);
        try (InputStream stream = openResource("test_recursive_embedded.docx")) {
            parser.parse(stream, handler, metadata, context);
            return handler.toString();
        }
    }

    /**
     * For documents that may contain embedded documents, it might be helpful
     * to create list of metadata objects, one for the container document and
     * one for each embedded document.  This allows easy access to both the
     * extracted content and the metadata of each embedded document.
     * Note that many document formats can contain embedded documents,
     * including traditional container formats -- zip, tar and others -- but also
     * common office document formats including: MSWord, MSExcel,
     * MSPowerPoint, RTF, PDF, MSG and several others.
     * <p>
     * The "content" format is determined by the ContentHandlerFactory, and
     * the content is stored in {@link org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT}
     * <p>
     * The drawback to the RecursiveParserWrapper is that it caches metadata and contents
     * in memory.  This should not be used on files whose contents are too big to be handled
     * in memory.
     *
     * @return a list of metadata object, one each for the container file and each embedded file
     * @throws IOException   if the test resource cannot be found or read
     * @throws SAXException  if SAX event handling fails during parsing
     * @throws TikaException if Tika fails to parse the document
     */
    public List<Metadata> recursiveParserWrapperExample()
            throws IOException, SAXException, TikaException {
        Parser p = new AutoDetectParser();
        // NOTE(review): the -1 arguments below presumably mean "no limit" --
        // confirm against the Tika API documentation.
        ContentHandlerFactory factory =
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);

        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
        Metadata metadata = new Metadata();
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
        ParseContext context = new ParseContext();
        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory, -1);
        try (InputStream stream = openResource("test_recursive_embedded.docx")) {
            wrapper.parse(stream, handler, metadata, context);
        }

        return handler.getMetadataList();
    }

    /**
     * We include a simple JSON serializer for a list of metadata with
     * {@link org.apache.tika.metadata.serialization.JsonMetadataList}.
     * That class also includes a deserializer to convert from JSON
     * back to a {@code List<Metadata>}.
     * <p>
     * This functionality is also available in tika-app's GUI, and
     * with the -J option on tika-app's commandline.  For tika-server
     * users, there is the "rmeta" service that will return this format.
     *
     * @return a JSON representation of a list of Metadata objects
     * @throws IOException   if the test resource cannot be found or read,
     *                       or the JSON cannot be written
     * @throws SAXException  if SAX event handling fails during parsing
     * @throws TikaException if Tika fails to parse the document
     */
    public String serializedRecursiveParserWrapperExample()
            throws IOException, SAXException, TikaException {
        List<Metadata> metadataList = recursiveParserWrapperExample();
        StringWriter writer = new StringWriter();
        JsonMetadataList.toJson(metadataList, writer);
        return writer.toString();
    }

    /**
     * Extracts the documents embedded in the test file into a directory and
     * then lists that directory.
     * <p>
     * Note that the returned list contains every entry found in
     * {@code outputPath} after extraction, so entries that were already
     * present in the directory are included as well.
     *
     * @param outputPath -- output directory to place files
     * @return the entries of {@code outputPath} after extraction
     * @throws IOException   if the test resource cannot be found or read,
     *                       or the output directory cannot be listed
     * @throws SAXException  if SAX event handling fails during parsing
     * @throws TikaException if Tika fails to parse the document
     */
    public List<Path> extractEmbeddedDocumentsExample(Path outputPath)
            throws IOException, SAXException, TikaException {
        ExtractEmbeddedFiles ex = new ExtractEmbeddedFiles();
        List<Path> ret = new ArrayList<>();
        try (TikaInputStream stream = TikaInputStream
                .get(openResource("test_recursive_embedded.docx"))) {
            ex.extract(stream, outputPath);
            // The directory stream holds an open handle and must be closed.
            try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(outputPath)) {
                for (Path entry : dirStream) {
                    ret.add(entry);
                }
            }
        }
        return ret;
    }
}