All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.jackrabbit.oak.plugins.tika.TikaHelper Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.jackrabbit.oak.plugins.tika;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;

/**
 * Helper around a Tika {@link AutoDetectParser}: builds the parser from a
 * Tika configuration and answers questions about which media types that
 * parser supports.
 */
class TikaHelper {
    /** Classpath location of the Tika configuration used when no external file is given. */
    private static final String DEFAULT_TIKA_CONFIG = "/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml";
    private static final Logger log = LoggerFactory.getLogger(TikaHelper.class);

    /** Shared across instances so the supported-types list is logged a single time. */
    private static final AtomicBoolean supportedTypesLogged = new AtomicBoolean();

    private final AutoDetectParser parser;
    private final Set<MediaType> supportedMediaTypes;

    /**
     * Creates the helper and its parser from the given Tika configuration.
     *
     * @param tikaConfig external Tika config file; when {@code null} the
     *                   config at {@link #DEFAULT_TIKA_CONFIG} is used if it
     *                   can be found on the classpath, otherwise Tika's own
     *                   default config
     * @throws IOException if loading the configuration fails with an I/O error
     * @throws RuntimeException wrapping any {@link TikaException} or
     *                          {@link SAXException} raised while loading the
     *                          configuration
     */
    public TikaHelper(@Nullable File tikaConfig) throws IOException {
        try {
            parser = new AutoDetectParser(getTikaConfig(tikaConfig));
            supportedMediaTypes = parser.getSupportedTypes(new ParseContext());
            logSupportedTypesOnce(supportedMediaTypes);
        } catch (TikaException e) {
            throw new RuntimeException(e);
        } catch (SAXException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * @return the parser built from the configuration passed to the constructor
     */
    public Parser getParser() {
        return parser;
    }

    /**
     * Checks whether the configured parser reports the given type as supported.
     *
     * @param type media type string to check
     * @return {@code true} if the parsed type is among the parser's supported
     *         types; {@code false} otherwise, including when the string
     *         cannot be parsed into a media type
     */
    public boolean isSupportedMediaType(String type) {
        MediaType mediaType = MediaType.parse(type);
        // Guard against an unparseable type so that we never hand null to
        // Set.contains, which null-hostile Set implementations reject.
        return mediaType != null && supportedMediaTypes.contains(mediaType);
    }

    /**
     * This method should only be used for information purpose and not be relied
     * upon to determine if the given type is indexed or not. It relies on Tika
     * implementation detail to determine if a given type is meant to be indexed
     *
     * @param type mimeType to check
     * @return true if the given type is supported and indexed
     */
    public boolean isIndexed(String type) {
        MediaType mediaType = MediaType.parse(type);
        if (mediaType == null || !supportedMediaTypes.contains(mediaType)) {
            return false;
        }

        Parser p = getSupportingParser(parser, mediaType);
        if (p == null) {
            return false;
        }
        // A type mapped to EmptyParser (possibly behind decorators) is
        // treated as not indexed.
        return !(unwrap(p) instanceof EmptyParser);
    }

    /**
     * Loads the Tika configuration.
     *
     * @param tikaConfig external config file, or {@code null} to fall back to
     *                   the bundled config and then to Tika's default config
     * @return the loaded configuration
     */
    private static TikaConfig getTikaConfig(@Nullable File tikaConfig)
            throws TikaException, IOException, SAXException {
        if (tikaConfig != null) {
            log.info("Loading external Tika config from {}", tikaConfig);
            return new TikaConfig(tikaConfig);
        }

        URL configUrl = TextExtractor.class.getResource(DEFAULT_TIKA_CONFIG);
        if (configUrl != null) {
            log.info("Loading default Tika config from {}", configUrl);
            return new TikaConfig(configUrl);
        }

        log.info("Using default Tika config");
        return TikaConfig.getDefaultConfig();
    }

    /**
     * Descends through nested {@link CompositeParser}s, at each level picking
     * the parser registered for the given media type.
     *
     * @return the first non-composite parser reached, or {@code null} if some
     *         composite on the way has no entry for the media type
     */
    private static Parser getSupportingParser(Parser p, MediaType mediaType) {
        if (p instanceof CompositeParser) {
            Map<MediaType, Parser> parsers = ((CompositeParser) p).getParsers();
            return getSupportingParser(parsers.get(mediaType), mediaType);
        }
        return p;
    }

    /**
     * Strips any chain of {@link ParserDecorator}s and returns the innermost
     * wrapped parser.
     */
    private static Parser unwrap(Parser p) {
        if (p instanceof ParserDecorator) {
            return unwrap(((ParserDecorator) p).getWrappedParser());
        }
        return p;
    }

    /**
     * Logs the supported media types at info level, but only for the first
     * caller that flips the shared flag.
     */
    private static void logSupportedTypesOnce(Set<MediaType> supportedMediaTypes) {
        if (supportedTypesLogged.compareAndSet(false, true)) {
            log.info("Supported media types {}", supportedMediaTypes);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy