All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.quarkus.tika.deployment.TikaProcessor Maven / Gradle / Ivy

package io.quarkus.tika.deployment;

import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;

import org.apache.tika.detect.Detector;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.parser.Parser;

import io.quarkus.arc.deployment.AdditionalBeanBuildItem;
import io.quarkus.arc.deployment.BeanContainerBuildItem;
import io.quarkus.deployment.Feature;
import io.quarkus.deployment.annotations.BuildProducer;
import io.quarkus.deployment.annotations.BuildStep;
import io.quarkus.deployment.annotations.ExecutionTime;
import io.quarkus.deployment.annotations.Record;
import io.quarkus.deployment.builditem.FeatureBuildItem;
import io.quarkus.deployment.builditem.nativeimage.NativeImageResourceBuildItem;
import io.quarkus.deployment.builditem.nativeimage.NativeImageResourceDirectoryBuildItem;
import io.quarkus.deployment.builditem.nativeimage.RuntimeInitializedClassBuildItem;
import io.quarkus.deployment.builditem.nativeimage.ServiceProviderBuildItem;
import io.quarkus.deployment.util.ServiceUtil;
import io.quarkus.tika.TikaParseException;
import io.quarkus.tika.runtime.TikaConfiguration;
import io.quarkus.tika.runtime.TikaParserProducer;
import io.quarkus.tika.runtime.TikaRecorder;

public class TikaProcessor {
    private static final Set NOT_NATIVE_READY_PARSERS = Arrays.stream(new String[] {
            "org.apache.tika.parser.mat.MatParser",
            "org.apache.tika.parser.journal.GrobidRESTParser",
            "org.apache.tika.parser.journal.JournalParser",
            "org.apache.tika.parser.jdbc.SQLite3Parser",
            "org.apache.tika.parser.mail.RFC822Parser",
            "org.apache.tika.parser.pkg.CompressorParser",
            "org.apache.tika.parser.geo.topic.GeoParser"
    }).collect(Collectors.toSet());

    private static final Map PARSER_ABBREVIATIONS = Arrays.stream(new String[][] {
            { "pdf", "org.apache.tika.parser.pdf.PDFParser" },
            { "odf", "org.apache.tika.parser.odf.OpenDocumentParser" }
    }).collect(Collectors.toMap(kv -> kv[0], kv -> kv[1]));

    @BuildStep
    AdditionalBeanBuildItem beans() {
        return AdditionalBeanBuildItem.unremovableOf(TikaParserProducer.class);
    }

    @BuildStep
    FeatureBuildItem feature() {
        return new FeatureBuildItem(Feature.TIKA);
    }

    @BuildStep
    public void registerRuntimeInitializedClasses(BuildProducer resource) {
        //org.apache.tika.parser.pdf.PDFParser (https://issues.apache.org/jira/browse/PDFBOX-4548)
        resource.produce(new RuntimeInitializedClassBuildItem("org.apache.pdfbox.pdmodel.font.PDType1Font"));
    }

    @BuildStep
    public void registerTikaCoreResources(BuildProducer resource) {
        resource.produce(new NativeImageResourceBuildItem("org/apache/tika/mime/tika-mimetypes.xml"));
        resource.produce(new NativeImageResourceBuildItem("org/apache/tika/parser/external/tika-external-parsers.xml"));
    }

    @BuildStep
    public void registerTikaParsersResources(BuildProducer resource) {
        resource.produce(new NativeImageResourceBuildItem("org/apache/tika/parser/pdf/PDFParser.properties"));
    }

    @BuildStep
    public void registerPdfBoxResources(BuildProducer resource) {
        resource.produce(new NativeImageResourceDirectoryBuildItem("org/apache/pdfbox/resources/afm"));
        resource.produce(new NativeImageResourceDirectoryBuildItem("org/apache/pdfbox/resources/glyphlist"));
        resource.produce(new NativeImageResourceDirectoryBuildItem("org/apache/fontbox/cmap"));
        resource.produce(new NativeImageResourceDirectoryBuildItem("org/apache/fontbox/unicode"));
    }

    @BuildStep
    @Record(ExecutionTime.STATIC_INIT)
    void initializeTikaParser(BeanContainerBuildItem beanContainer, TikaRecorder recorder,
            BuildProducer serviceProvider, TikaConfiguration configuration)
            throws Exception {
        Map> parsers = getSupportedParserConfig(configuration.tikaConfigPath,
                configuration.parsers,
                configuration.parserOptions, configuration.parser);
        String tikaXmlConfiguration = generateTikaXmlConfiguration(parsers);

        serviceProvider.produce(new ServiceProviderBuildItem(Parser.class.getName(), new ArrayList<>(parsers.keySet())));
        serviceProvider
                .produce(new ServiceProviderBuildItem(Detector.class.getName(), getProviderNames(Detector.class.getName())));
        serviceProvider.produce(new ServiceProviderBuildItem(EncodingDetector.class.getName(),
                getProviderNames(EncodingDetector.class.getName())));

        recorder.initTikaParser(beanContainer.getValue(), configuration, tikaXmlConfiguration);
    }

    private static List getProviderNames(String serviceProviderName) throws Exception {
        return new ArrayList<>(ServiceUtil.classNamesNamedIn(TikaProcessor.class.getClassLoader(),
                "META-INF/services/" + serviceProviderName));
    }

    public static Map> getSupportedParserConfig(Optional tikaConfigPath,
            Optional requiredParsers,
            Map> parserParamMaps,
            Map parserAbbreviations) throws Exception {
        Predicate pred = p -> !NOT_NATIVE_READY_PARSERS.contains(p);
        List providerNames = getProviderNames(Parser.class.getName());
        if (tikaConfigPath.isPresent() || !requiredParsers.isPresent()) {
            return providerNames.stream().filter(pred).collect(Collectors.toMap(Function.identity(),
                    p -> Collections. emptyList()));
        } else {
            List abbreviations = Arrays.stream(requiredParsers.get().split(",")).map(s -> s.trim())
                    .collect(Collectors.toList());
            Map fullNamesAndAbbreviations = abbreviations.stream()
                    .collect(Collectors.toMap(p -> getParserNameFromConfig(p, parserAbbreviations), Function.identity()));
            return providerNames.stream().filter(pred).filter(p -> fullNamesAndAbbreviations.containsKey(p))
                    .collect(Collectors.toMap(Function.identity(),
                            p -> getParserConfig(p, parserParamMaps.get(fullNamesAndAbbreviations.get(p)))));
        }
    }

    private static String generateTikaXmlConfiguration(Map> parserConfig) {
        StringBuilder tikaXmlConfigurationBuilder = new StringBuilder();
        tikaXmlConfigurationBuilder.append("");
        tikaXmlConfigurationBuilder.append("");
        for (Entry> parserEntry : parserConfig.entrySet()) {
            tikaXmlConfigurationBuilder.append("");
            if (!parserEntry.getValue().isEmpty()) {
                appendParserParameters(tikaXmlConfigurationBuilder, parserEntry.getValue());
            }
            tikaXmlConfigurationBuilder.append("");
        }
        tikaXmlConfigurationBuilder.append("");
        tikaXmlConfigurationBuilder.append("");
        return tikaXmlConfigurationBuilder.toString();
    }

    private static void appendParserParameters(StringBuilder tikaXmlConfigurationBuilder,
            List parserParams) {
        tikaXmlConfigurationBuilder.append("");
        for (TikaParserParameter parserParam : parserParams) {
            tikaXmlConfigurationBuilder.append("");
            tikaXmlConfigurationBuilder.append(parserParam.getValue());
            tikaXmlConfigurationBuilder.append("");
        }
        tikaXmlConfigurationBuilder.append("");
    }

    private static List getParserConfig(String parserName, Map parserParamMap) {
        List parserParams = new LinkedList<>();
        if (parserParamMap != null) {
            for (Map.Entry entry : parserParamMap.entrySet()) {
                String paramName = unhyphenate(entry.getKey());
                String paramType = getParserParamType(parserName, paramName);
                parserParams.add(new TikaParserParameter(paramName, entry.getValue(), paramType));
            }
        }
        return parserParams;
    }

    private static String getParserNameFromConfig(String abbreviation, Map parserAbbreviations) {
        if (PARSER_ABBREVIATIONS.containsKey(abbreviation)) {
            return PARSER_ABBREVIATIONS.get(abbreviation);
        }

        if (parserAbbreviations.containsKey(abbreviation)) {
            return parserAbbreviations.get(abbreviation);
        }

        throw new IllegalStateException("The custom abbreviation `" + abbreviation
                + "` can not be resolved to a parser class name, please set a "
                + "quarkus.tika.parser-name." + abbreviation + " property");
    }

    // Convert a property name such as "sort-by-position" to "sortByPosition"
    public static String unhyphenate(String paramName) {
        StringBuilder sb = new StringBuilder();
        String[] words = paramName.split("-");
        for (int i = 0; i < words.length; i++) {
            sb.append(i > 0 ? capitalize(words[i]) : words[i]);
        }
        return sb.toString();
    }

    private static String capitalize(String paramName) {
        char[] chars = paramName.toCharArray();
        chars[0] = Character.toUpperCase(chars[0]);
        return new String(chars);
    }

    // TODO: Remove the reflection code below once TikaConfig becomes capable
    // of loading the parameters without the type attribute: TIKA-2944

    private static Class loadParserClass(String parserName) {
        try {
            return TikaProcessor.class.getClassLoader().loadClass(parserName);
        } catch (Throwable t) {
            final String errorMessage = "Parser " + parserName + " can not be loaded";
            throw new TikaParseException(errorMessage);
        }
    }

    private static String getParserParamType(String parserName, String paramName) {
        try {
            Class parserClass = loadParserClass(parserName);
            Method[] methods = parserClass.getMethods();
            String setterMethodName = "set" + capitalize(paramName);
            String paramType = null;
            for (Method method : methods) {
                if (method.getName().equals(setterMethodName) && method.getParameterCount() == 1) {
                    paramType = method.getParameterTypes()[0].getSimpleName().toLowerCase();
                    if (paramType.equals(boolean.class.getSimpleName())) {
                        // TikaConfig Param class does not recognize 'boolean', only 'bool'
                        // This whole reflection code is temporary anyway
                        paramType = "bool";
                    }
                    return paramType;
                }
            }
        } catch (Throwable t) {
            throw new TikaParseException(String.format("Parser %s has no %s property", parserName, paramName));
        }
        throw new TikaParseException(String.format("Parser %s has no %s property", parserName, paramName));
    }

    public static class TikaParserParameter {
        private String name;
        private String value;
        private String type;

        public TikaParserParameter(String name, String value, String type) {
            this.name = name;
            this.value = value;
            this.type = type;
        }

        public String getName() {
            return name;
        }

        public String getType() {
            return type;
        }

        public String getValue() {
            return value;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy