All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.example.ParsingExample Maven / Gradle / Ivy

There is a newer version: 3.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.example;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

import org.xml.sax.SAXException;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;

public class ParsingExample {

    /** Classpath resource (relative to this class) used by the simple parsing examples. */
    private static final String SIMPLE_DOC = "test.doc";

    /** Classpath resource (relative to this class) that contains embedded documents. */
    private static final String RECURSIVE_DOCX = "test_recursive_embedded.docx";

    /**
     * Opens a classpath resource located relative to this class.
     * <p>
     * {@link Class#getResourceAsStream(String)} returns {@code null} when the
     * resource cannot be found; this helper turns that into an
     * {@link IOException} naming the missing resource instead of handing a
     * {@code null} stream to the parser.
     *
     * @param name resource name, resolved relative to {@code ParsingExample}
     * @return an open stream that the caller is responsible for closing
     * @throws IOException if the resource is not on the classpath
     */
    private static InputStream openResource(String name) throws IOException {
        InputStream stream = ParsingExample.class.getResourceAsStream(name);
        if (stream == null) {
            throw new IOException("Test resource not found on classpath: " + name);
        }
        return stream;
    }

    /**
     * Example of how to use Tika's parseToString method to parse the content of a file,
     * and return any text found.
     * <p>
     * Note: Tika.parseToString() will extract content from the outer container
     * document and any embedded/attached documents.
     *
     * @return The content of a file.
     * @throws IOException   if the resource is missing or cannot be read
     * @throws SAXException  declared for API symmetry with the other examples
     * @throws TikaException if the document cannot be parsed
     */
    public String parseToStringExample() throws IOException, SAXException, TikaException {
        Tika tika = new Tika();
        try (InputStream stream = openResource(SIMPLE_DOC)) {
            return tika.parseToString(stream);
        }
    }

    /**
     * Example of how to use Tika to parse a file when you do not know its file type
     * ahead of time.
     * <p>
     * AutoDetectParser attempts to discover the file's type automatically, then call
     * the exact Parser built for that file type.
     * <p>
     * The stream to be parsed by the Parser. In this case, we get a file from the
     * resources folder of this project.
     * <p>
     * Handlers are used to get the exact information you want out of the host of
     * information gathered by Parsers. The body content handler, intuitively, extracts
     * everything that would go between HTML body tags.
     * <p>
     * The Metadata object will be filled by the Parser with Metadata discovered about
     * the file being parsed.
     * <p>
     * Note: This example will extract content from the outer document and all
     * embedded documents.  However, if you choose to use a {@link ParseContext},
     * make sure to set a {@link Parser} or else embedded content will not be
     * parsed.
     *
     * @return The content of a file.
     * @throws IOException   if the resource is missing or cannot be read
     * @throws SAXException  if the content handler reports an error
     * @throws TikaException if the document cannot be parsed
     */
    public String parseExample() throws IOException, SAXException, TikaException {
        AutoDetectParser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        try (InputStream stream = openResource(SIMPLE_DOC)) {
            parser.parse(stream, handler, metadata);
            return handler.toString();
        }
    }

    /**
     * If you don't want content from embedded documents, send in
     * a {@link org.apache.tika.parser.ParseContext} that contains an
     * {@link EmptyParser}.
     *
     * @return The content of a file.
     * @throws IOException   if the resource is missing or cannot be read
     * @throws SAXException  if the content handler reports an error
     * @throws TikaException if the document cannot be parsed
     */
    public String parseNoEmbeddedExample() throws IOException, SAXException, TikaException {
        AutoDetectParser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext parseContext = new ParseContext();
        // Registering an EmptyParser as the Parser for embedded documents
        // is what suppresses their content.
        parseContext.set(Parser.class, new EmptyParser());
        try (InputStream stream = openResource(RECURSIVE_DOCX)) {
            parser.parse(stream, handler, metadata, parseContext);
            return handler.toString();
        }
    }

    /**
     * This example shows how to extract content from the outer document and all
     * embedded documents.  The key is to specify a {@link Parser} in the
     * {@link ParseContext}.
     *
     * @return content, including from embedded documents
     * @throws IOException   if the resource is missing or cannot be read
     * @throws SAXException  if the content handler reports an error
     * @throws TikaException if the document cannot be parsed
     */
    public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
        AutoDetectParser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        // The same parser is registered for embedded documents.
        context.set(Parser.class, parser);
        try (InputStream stream = openResource(RECURSIVE_DOCX)) {
            parser.parse(stream, handler, metadata, context);
            return handler.toString();
        }
    }

    /**
     * For documents that may contain embedded documents, it might be helpful
     * to create list of metadata objects, one for the container document and
     * one for each embedded document.  This allows easy access to both the
     * extracted content and the metadata of each embedded document.
     * Note that many document formats can contain embedded documents,
     * including traditional container formats -- zip, tar and others -- but also
     * common office document formats including: MSWord, MSExcel,
     * MSPowerPoint, RTF, PDF, MSG and several others.
     * <p>
     * The "content" format is determined by the ContentHandlerFactory, and
     * the content is stored in
     * {@link org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT}
     * <p>
     * The drawback to the RecursiveParserWrapper is that it caches metadata and contents
     * in memory.  This should not be used on files whose contents are too big to be handled
     * in memory.
     *
     * @return a list of metadata object, one each for the container file and each embedded file
     * @throws IOException   if the resource is missing or cannot be read
     * @throws SAXException  if a content handler reports an error
     * @throws TikaException if the document cannot be parsed
     */
    public List<Metadata> recursiveParserWrapperExample()
            throws IOException, SAXException, TikaException {
        Parser p = new AutoDetectParser();
        ContentHandlerFactory factory =
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);

        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
        Metadata metadata = new Metadata();
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, RECURSIVE_DOCX);
        ParseContext context = new ParseContext();
        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory, -1);
        try (InputStream stream = openResource(RECURSIVE_DOCX)) {
            wrapper.parse(stream, handler, metadata, context);
        }

        return handler.getMetadataList();
    }

    /**
     * We include a simple JSON serializer for a list of metadata with
     * {@link org.apache.tika.metadata.serialization.JsonMetadataList}.
     * That class also includes a deserializer to convert from JSON
     * back to a List&lt;Metadata&gt;.
     * <p>
     * This functionality is also available in tika-app's GUI, and
     * with the -J option on tika-app's commandline.  For tika-server
     * users, there is the "rmeta" service that will return this format.
     *
     * @return a JSON representation of a list of Metadata objects
     * @throws IOException   if the resource is missing, cannot be read,
     *                       or the JSON cannot be written
     * @throws SAXException  if a content handler reports an error
     * @throws TikaException if the document cannot be parsed
     */
    public String serializedRecursiveParserWrapperExample()
            throws IOException, SAXException, TikaException {
        List<Metadata> metadataList = recursiveParserWrapperExample();
        StringWriter writer = new StringWriter();
        JsonMetadataList.toJson(metadataList, writer);
        return writer.toString();
    }

    /**
     * Hands the test document to {@code ExtractEmbeddedFiles#extract} with the
     * given output directory, then lists that directory.
     * <p>
     * The returned list is every entry found in {@code outputPath} after the
     * extraction call, so entries that were already present in the directory
     * are included as well.
     *
     * @param outputPath -- output directory to place files
     * @return list of entries found in {@code outputPath} after extraction
     * @throws IOException   if the resource is missing, cannot be read,
     *                       or the directory cannot be listed
     * @throws SAXException  if a content handler reports an error
     * @throws TikaException if the document cannot be parsed
     */
    public List<Path> extractEmbeddedDocumentsExample(Path outputPath)
            throws IOException, SAXException, TikaException {
        ExtractEmbeddedFiles ex = new ExtractEmbeddedFiles();
        List<Path> ret = new ArrayList<>();
        try (TikaInputStream stream = TikaInputStream.get(openResource(RECURSIVE_DOCX))) {
            ex.extract(stream, outputPath);
            // DirectoryStream must be closed; the element type is needed
            // for the typed for-each below to compile.
            try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(outputPath)) {
                for (Path entry : dirStream) {
                    ret.add(entry);
                }
            }
        }
        return ret;
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy