All downloads are FREE. The search and download functionality uses the official Maven repository.

org.apache.tika.extractor.EmbeddedDocumentUtil Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.extractor;


import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Map;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.utils.ExceptionUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Utility class to handle common issues with embedded documents.
 * 

* Use statically if all that is needed is getting the EmbeddedDocumentExtractor. * Otherwise, instantiate an instance. *

* Note: This is not thread safe. Make sure to instantiate one per thread. */ public class EmbeddedDocumentUtil implements Serializable { private final ParseContext context; private final EmbeddedDocumentExtractor embeddedDocumentExtractor; //these are lazily initialized and can be null private TikaConfig tikaConfig; private MimeTypes mimeTypes; private Detector detector; public EmbeddedDocumentUtil(ParseContext context) { this.context = context; this.embeddedDocumentExtractor = getEmbeddedDocumentExtractor(context); } /** * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext. * As of Tika 1.15, an AutoDetectParser will automatically be added to parse * embedded documents if no Parser.class is specified in the ParseContext. *

* If you'd prefer not to parse embedded documents, set Parser.class * to {@link org.apache.tika.parser.EmptyParser} in the ParseContext. * * @param context * @return EmbeddedDocumentExtractor */ public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) { EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class); if (extractor == null) { //ensure that an AutoDetectParser is //available for parsing embedded docs TIKA-2096 Parser embeddedParser = context.get(Parser.class); if (embeddedParser == null) { TikaConfig tikaConfig = context.get(TikaConfig.class); if (tikaConfig == null) { context.set(Parser.class, new AutoDetectParser()); } else { context.set(Parser.class, new AutoDetectParser(tikaConfig)); } } extractor = new ParsingEmbeddedDocumentExtractor(context); } return extractor; } public PasswordProvider getPasswordProvider() { return context.get(PasswordProvider.class); } public Detector getDetector() { //be as lazy as possible and cache Detector localDetector = context.get(Detector.class); if (localDetector != null) { return localDetector; } if (detector != null) { return detector; } detector = getTikaConfig().getDetector(); return detector; } public MimeTypes getMimeTypes() { MimeTypes localMimeTypes = context.get(MimeTypes.class); //be as lazy as possible and cache the mimeTypes if (localMimeTypes != null) { return localMimeTypes; } if (mimeTypes != null) { return mimeTypes; } mimeTypes = getTikaConfig().getMimeRepository(); return mimeTypes; } /** * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext * that was included during initialization, and then creating a new one from * via {@link TikaConfig#getDefaultConfig()} if it can't find one in the * ParseContext. This caches the default config so that it only has to be created once. 
*/ public TikaConfig getTikaConfig() { //be as lazy as possible and cache the TikaConfig if (tikaConfig == null) { tikaConfig = context.get(TikaConfig.class); if (tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); } } return tikaConfig; } public String getExtension(TikaInputStream is, Metadata metadata) { String mimeString = metadata.get(Metadata.CONTENT_TYPE); //use the buffered mimetypes as default MimeTypes localMimeTypes = getMimeTypes(); MimeType mimeType = null; boolean detected = false; if (mimeString != null) { try { mimeType = localMimeTypes.forName(mimeString); } catch (MimeTypeException e) { //swallow } } if (mimeType == null) { try { MediaType mediaType = getDetector().detect(is, metadata); mimeType = localMimeTypes.forName(mediaType.toString()); detected = true; is.reset(); } catch (IOException e) { //swallow } catch (MimeTypeException e) { //swallow } } if (mimeType != null) { if (detected) { //set or correct the mime type metadata.set(Metadata.CONTENT_TYPE, mimeType.toString()); } return mimeType.getExtension(); } return ".bin"; } /** * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext * that was included in the initialization, and then creating a new one from * via {@link TikaConfig#getDefaultConfig()} if it can't find one in the * ParseContext. 
* * @deprecated as of 1.17, use {@link #getTikaConfig()} instead */ @Deprecated public TikaConfig getConfig() { TikaConfig config = context.get(TikaConfig.class); if (config == null) { config = TikaConfig.getDefaultConfig(); } return config; } public static void recordException(Throwable t, Metadata m) { String ex = ExceptionUtils.getFilteredStackTrace(t); m.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ex); } public static void recordEmbeddedStreamException(Throwable t, Metadata m) { String ex = ExceptionUtils.getFilteredStackTrace(t); m.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM, ex); } public boolean shouldParseEmbedded(Metadata m) { return getEmbeddedDocumentExtractor().shouldParseEmbedded(m); } private EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() { return embeddedDocumentExtractor; } public void parseEmbedded(InputStream inputStream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws IOException, SAXException { embeddedDocumentExtractor.parseEmbedded(inputStream, handler, metadata, outputHtml); } /** * Tries to find an existing parser within the ParseContext. * It looks inside of CompositeParsers and ParserDecorators. * The use case is when a parser needs to parse an internal stream * that is _part_ of the document, e.g. rtf body inside an msg. *

* Can return null if the context contains no parser or * the correct parser can't be found. * * @param clazz parser class to search for * @param context * @return */ public static Parser tryToFindExistingLeafParser(Class clazz, ParseContext context) { Parser p = context.get(Parser.class); if (equals(p, clazz)) { return p; } Parser returnParser = null; if (p != null) { if (p instanceof ParserDecorator) { p = findInDecorated((ParserDecorator)p, clazz); } if (equals(p, clazz)) { return p; } if (p instanceof CompositeParser) { returnParser = findInComposite((CompositeParser) p, clazz, context); } } if (returnParser != null && equals(returnParser, clazz)) { return returnParser; } return null; } private static Parser findInDecorated(ParserDecorator p, Class clazz) { Parser candidate = p.getWrappedParser(); if (equals(candidate, clazz)) { return candidate; } if (candidate instanceof ParserDecorator) { candidate = findInDecorated((ParserDecorator)candidate, clazz); } return candidate; } private static Parser findInComposite(CompositeParser p, Class clazz, ParseContext context) { for (Parser candidate : p.getAllComponentParsers()) { if (equals(candidate, clazz)) { return candidate; } if (candidate instanceof ParserDecorator) { candidate = findInDecorated((ParserDecorator)candidate, clazz); } if (equals(candidate, clazz)) { return candidate; } if (candidate instanceof CompositeParser) { candidate = findInComposite((CompositeParser) candidate, clazz, context); } if (equals(candidate, clazz)) { return candidate; } } return null; } private static boolean equals(Parser parser, Class clazz) { if (parser == null) { return false; } return parser.getClass().equals(clazz); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy