// org.apache.jackrabbit.oak.plugins.tika.TikaHelper
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.jackrabbit.oak.plugins.tika;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
/**
 * Wraps a Tika {@link AutoDetectParser} together with the set of media types
 * that parser reports as supported.
 *
 * <p>The parser is configured from one of three sources, in this order: an
 * external config file passed to the constructor, the bundled resource at
 * {@link #DEFAULT_TIKA_CONFIG}, or Tika's own default configuration.
 */
class TikaHelper {
    private static final String DEFAULT_TIKA_CONFIG = "/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml";
    private static final Logger log = LoggerFactory.getLogger(TikaHelper.class);

    /** Shared by all instances so the supported types are logged at most once. */
    private static final AtomicBoolean supportedTypesLogged = new AtomicBoolean();

    private final AutoDetectParser parser;
    private final Set<MediaType> supportedMediaTypes;

    /**
     * Creates a helper backed by a parser built from the given Tika config.
     *
     * @param tikaConfig external Tika config file, or {@code null} to fall back
     *                   to the bundled default config (or Tika's defaults if the
     *                   bundled resource is not found)
     * @throws IOException if reading the configuration fails
     * @throws RuntimeException wrapping a {@link TikaException} or
     *                          {@link SAXException} raised while loading the config
     */
    public TikaHelper(@Nullable File tikaConfig) throws IOException {
        try {
            parser = new AutoDetectParser(getTikaConfig(tikaConfig));
            supportedMediaTypes = parser.getSupportedTypes(new ParseContext());
            logSupportedTypesOnce(supportedMediaTypes);
        } catch (TikaException e) {
            throw new RuntimeException(e);
        } catch (SAXException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * @return the configured parser instance
     */
    public Parser getParser() {
        return parser;
    }

    /**
     * Checks whether the configured parser lists the given type as supported.
     *
     * @param type mimeType to check
     * @return true if the type parses to a media type contained in the
     *         parser's supported types; false otherwise, including when the
     *         string cannot be parsed into a media type
     */
    public boolean isSupportedMediaType(String type) {
        MediaType mediaType = MediaType.parse(type);
        // MediaType.parse yields null for unparseable input; treat that as unsupported
        return mediaType != null && supportedMediaTypes.contains(mediaType);
    }

    /**
     * This method should only be used for information purpose and not be relied
     * upon to determine if the given type is indexed or not. It relies on Tika
     * implementation detail to determine if a given type is meant to be indexed
     *
     * @param type mimeType to check
     * @return true if the given type is supported and indexed
     */
    public boolean isIndexed(String type) {
        // Parse once and reuse the result for both the support check and the lookup
        MediaType mediaType = MediaType.parse(type);
        if (mediaType == null || !supportedMediaTypes.contains(mediaType)) {
            return false;
        }

        Parser p = getSupportingParser(parser, mediaType);
        if (p == null) {
            return false;
        }

        // A type mapped (possibly through decorators) to EmptyParser is not indexed
        return !(unwrap(p) instanceof EmptyParser);
    }

    /**
     * Resolves the Tika configuration to use.
     *
     * @param tikaConfig external config file, or {@code null} to use the
     *                   bundled default resource, falling back to Tika's
     *                   default config when that resource is absent
     * @return the loaded configuration
     */
    private static TikaConfig getTikaConfig(@Nullable File tikaConfig) throws TikaException, IOException, SAXException {
        if (tikaConfig != null) {
            log.info("Loading external Tika config from {}", tikaConfig);
            return new TikaConfig(tikaConfig);
        }

        URL configUrl = TextExtractor.class.getResource(DEFAULT_TIKA_CONFIG);
        if (configUrl != null) {
            log.info("Loading default Tika config from {}", configUrl);
            return new TikaConfig(configUrl);
        }

        log.info("Using default Tika config");
        return TikaConfig.getDefaultConfig();
    }

    /**
     * Descends through nested {@link CompositeParser}s, following the entry
     * registered for the given media type at each level.
     *
     * @return the first non-composite parser reached, or {@code null} if a
     *         composite on the way has no entry for the media type
     */
    @Nullable
    private static Parser getSupportingParser(Parser p, MediaType mediaType) {
        if (p instanceof CompositeParser) {
            Map<MediaType, Parser> parsers = ((CompositeParser) p).getParsers();
            return getSupportingParser(parsers.get(mediaType), mediaType);
        }
        return p;
    }

    /**
     * Strips any number of {@link ParserDecorator} layers and returns the
     * innermost wrapped parser.
     */
    private static Parser unwrap(Parser p) {
        if (p instanceof ParserDecorator) {
            return unwrap(((ParserDecorator) p).getWrappedParser());
        }
        return p;
    }

    /**
     * Logs the supported media types the first time it is called; later calls
     * (from any instance) are no-ops.
     */
    private static void logSupportedTypesOnce(Set<MediaType> supportedMediaTypes) {
        boolean alreadyLogged = supportedTypesLogged.getAndSet(true);
        if (!alreadyLogged) {
            log.info("Supported media types {}", supportedMediaTypes);
        }
    }
}