// All downloads are free. Search and download functionality uses the official Maven repository.

// org.apache.tika.eval.tokens.AnalyzerDeserializer (Maven / Gradle / Ivy)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.tokens;


import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.Type;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import com.google.gson.JsonArray;
import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParseException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;

class AnalyzerDeserializer implements JsonDeserializer> {


    private static final String ANALYZERS = "analyzers";
    private static final String CHAR_FILTERS = "charfilters";
    private static final String TOKEN_FILTERS = "tokenfilters";
    private static final String TOKENIZER = "tokenizer";
    private static final String FACTORY = "factory";
    private static final String PARAMS = "params";
    private static final String COMMENT = "_comment";

    private final int maxTokens;

    AnalyzerDeserializer(int maxTokens) {
        this.maxTokens = maxTokens;
    }

    @Override
    public Map deserialize(JsonElement element, Type type,
                                             JsonDeserializationContext jsonDeserializationContext) throws JsonParseException {
        if (! element.isJsonObject()) {
            throw new IllegalArgumentException("Expecting top level 'analyzers:{}'");
        }

        JsonElement root = element.getAsJsonObject().get(ANALYZERS);
        if (root == null) {
            throw new IllegalArgumentException("Expecting top level 'analyzers:{}");
        }
        try {
            return buildAnalyzers(root, maxTokens);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

    }

    public static Map buildAnalyzers(JsonElement value, int maxTokens) throws IOException {
        if (! value.isJsonObject()) {
            throw new IllegalArgumentException("Expecting map with analyzer names/analyzer definitions");
        }
        Map analyzers = new HashMap<>();
        JsonObject root = (JsonObject)value;
        for (Map.Entry e : root.entrySet()) {
            String analyzerName = e.getKey();
            Analyzer analyzer = buildAnalyzer(analyzerName, e.getValue(), maxTokens);
            analyzers.put(analyzerName, analyzer);
        }
        return analyzers;
    }

    public static Analyzer buildAnalyzer(String analyzerName, JsonElement value, int maxTokens) throws IOException {
        if (! value.isJsonObject()) {
            throw new IllegalArgumentException("Expecting map of charfilter, tokenizer, tokenfilters");
        }
        JsonObject aRoot = (JsonObject)value;
        CharFilterFactory[] charFilters = new CharFilterFactory[0];
        TokenizerFactory tokenizerFactory = null;
        TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
        for ( Map.Entry e : aRoot.entrySet()) {
            String k = e.getKey();
            if (k.equals(CHAR_FILTERS)) {
                charFilters = buildCharFilters(e.getValue(), analyzerName);
            } else if (k.equals(TOKEN_FILTERS)) {
                tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName, maxTokens);
            } else if (k.equals(TOKENIZER)) {
                tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName);
            } else if (! k.equals(COMMENT)) {
                throw new IllegalArgumentException("Should have one of three values here:"+
                        CHAR_FILTERS + ", "+
                        TOKENIZER+", "+
                        TOKEN_FILTERS +
                        ". I don't recognize: "+k);
            }
        }
        if (tokenizerFactory == null) {
            throw new IllegalArgumentException("Must specify at least a tokenizer factory for an analyzer!");
        }
        return new MyTokenizerChain(charFilters, tokenizerFactory, tokenFilterFactories);
    }

    private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
        if (!(map instanceof JsonObject)) {
            throw new IllegalArgumentException("Expecting a map with \"factory\" string and " +
                    "\"params\" map in tokenizer factory;"+
                    " not: "+map.toString() + " in "+analyzerName);
        }
        JsonElement factoryEl = ((JsonObject)map).get(FACTORY);
        if (factoryEl == null || ! factoryEl.isJsonPrimitive()) {
            throw new IllegalArgumentException("Expecting value for factory in char filter factory builder in:"+
                    analyzerName);
        }
        String factoryName = factoryEl.getAsString();
        factoryName = factoryName.startsWith("oala.") ?
                factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") : factoryName;

        JsonElement paramsEl = ((JsonObject)map).get(PARAMS);
        Map params = mapify(paramsEl);
        String spiName = "";
        for (String s : TokenizerFactory.availableTokenizers()) {
            Class clazz = TokenizerFactory.lookupClass(s);
            if (clazz.getName().equals(factoryName)) {
                spiName = s;
                break;
            }
        }
        if (spiName.equals("")) {
            throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name"+
            "'"+factoryName+"' does not exist.");
        }
        try {
            TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
            if (tokenizerFactory instanceof ResourceLoaderAware) {
                ((ResourceLoaderAware) tokenizerFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
            }

            return tokenizerFactory;
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("While working on "+analyzerName, e);
        }
    }

    private static CharFilterFactory[] buildCharFilters(JsonElement el, String analyzerName) throws IOException {
        if (el == null || el.isJsonNull()) {
            return null;
        }
        if (! el.isJsonArray()) {
            throw new IllegalArgumentException("Expecting array for charfilters, but got:"+el.toString() +
                    " for "+analyzerName);
        }
        JsonArray jsonArray = (JsonArray)el;
        List ret = new LinkedList();
        for (JsonElement filterMap : jsonArray) {
            if (!(filterMap instanceof JsonObject)) {
                throw new IllegalArgumentException("Expecting a map with \"factory\" string and \"params\" map in char filter factory;"+
                        " not: "+filterMap.toString() + " in "+analyzerName);
            }
            JsonElement factoryEl = ((JsonObject)filterMap).get(FACTORY);
            if (factoryEl == null || ! factoryEl.isJsonPrimitive()) {
                throw new IllegalArgumentException(
                        "Expecting value for factory in char filter factory builder in:"+analyzerName);
            }
            String factoryName = factoryEl.getAsString();
            factoryName = factoryName.replaceAll("oala.", "org.apache.lucene.analysis.");

            JsonElement paramsEl = ((JsonObject)filterMap).get(PARAMS);
            Map params = mapify(paramsEl);
            String spiName = "";
            for (String s : CharFilterFactory.availableCharFilters()) {
                Class clazz = CharFilterFactory.lookupClass(s);
                if (clazz.getName().equals(factoryName)) {
                    spiName = s;
                    break;
                }
            }
            if (spiName.equals("")) {
                throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.CharFilterFactory with name"+
                        "'"+factoryName+"' does not exist.");
            }

            try {
                CharFilterFactory charFilterFactory = CharFilterFactory.forName(spiName, params);
                if (charFilterFactory instanceof ResourceLoaderAware) {
                    ((ResourceLoaderAware) charFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
                }
                ret.add(charFilterFactory);
            } catch (IllegalArgumentException e) {
                throw new IllegalArgumentException("While trying to load "+
                        analyzerName + ": "+ e.getMessage(), e);
            }
        }
        if (ret.size() == 0) {
            return new CharFilterFactory[0];
        }
        return ret.toArray(new CharFilterFactory[ret.size()]);
    }

    private static TokenFilterFactory[] buildTokenFilterFactories(JsonElement el,
                                                                  String analyzerName, int maxTokens) throws IOException {
        if (el == null || el.isJsonNull()) {
            return null;
        }
        if (! el.isJsonArray()) {
            throw new IllegalArgumentException(
                    "Expecting array for tokenfilters, but got:"+el.toString() + " in "+analyzerName);
        }
        JsonArray jsonArray = (JsonArray)el;
        List ret = new LinkedList<>();
        for (JsonElement filterMap : jsonArray) {
            if (!(filterMap instanceof JsonObject)) {
                throw new IllegalArgumentException("Expecting a map with \"factory\" string and \"params\" map in token filter factory;"+
                        " not: "+filterMap.toString() + " in "+ analyzerName);
            }
            JsonElement factoryEl = ((JsonObject)filterMap).get(FACTORY);
            if (factoryEl == null || ! factoryEl.isJsonPrimitive()) {
                throw new IllegalArgumentException("Expecting value for factory in token filter factory builder in "+analyzerName);
            }
            String factoryName = factoryEl.getAsString();
            factoryName = factoryName.startsWith("oala.") ?
                    factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") :
                    factoryName;

            JsonElement paramsEl = ((JsonObject)filterMap).get(PARAMS);
            Map params = mapify(paramsEl);
            String spiName = "";
            for (String s : TokenFilterFactory.availableTokenFilters()) {
                Class clazz = TokenFilterFactory.lookupClass(s);
                if (clazz.getName().equals(factoryName)) {
                    spiName = s;
                    break;
                }
            }
            if (spiName.equals("")) {
                throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenFilterFactory with name"+
                        "'"+factoryName+"' does not exist.");
            }

            try {
                TokenFilterFactory tokenFilterFactory = TokenFilterFactory.forName(spiName, params);
                if (tokenFilterFactory instanceof ResourceLoaderAware) {
                    ((ResourceLoaderAware) tokenFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
                }
                ret.add(tokenFilterFactory);
            } catch (IllegalArgumentException e) {
                throw new IllegalArgumentException("While loading "+analyzerName, e);
            }
        }

        if (maxTokens > -1) {
            Map m = new HashMap<>();
            m.put("maxTokenCount", Integer.toString(maxTokens));
            ret.add(new LimitTokenCountFilterFactory(m));
        }

        if (ret.size() == 0) {
            return new TokenFilterFactory[0];
        }
        return ret.toArray(new TokenFilterFactory[ret.size()]);
    }

    private static  Map mapify(JsonElement paramsEl) {
        if (paramsEl == null || paramsEl.isJsonNull()) {
            return Collections.EMPTY_MAP;
        }
        if (! paramsEl.isJsonObject()) {
            throw new IllegalArgumentException("Expecting map, not: "+paramsEl.toString());
        }
        Map params = new HashMap<>();
        for (Map.Entry e : ((JsonObject)paramsEl).entrySet()) {
            JsonElement value = e.getValue();
            if (! value.isJsonPrimitive()) {
                throw new IllegalArgumentException("Expecting parameter to have primitive value: "+value.toString());
            }
            String v = e.getValue().getAsString();
            params.put(e.getKey(), v);
        }
        return params;
    }

    /**
     * Plagiarized verbatim from Solr!
     */
    private static class MyTokenizerChain extends Analyzer {

        final private CharFilterFactory[] charFilters;
        final private TokenizerFactory tokenizer;
        final private TokenFilterFactory[] filters;

        public MyTokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
            this(null, tokenizer, filters);
        }

        public MyTokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
            this.charFilters = charFilters;
            this.tokenizer = tokenizer;
            this.filters = filters;
        }

        public CharFilterFactory[] getCharFilterFactories() {
            return charFilters;
        }

        public TokenizerFactory getTokenizerFactory() {
            return tokenizer;
        }

        public TokenFilterFactory[] getTokenFilterFactories() {
            return filters;
        }

        @Override
        public Reader initReader(String fieldName, Reader reader) {

            if (charFilters != null && charFilters.length > 0) {
                Reader cs = reader;
                for (CharFilterFactory charFilter : charFilters) {
                    cs = charFilter.create(cs);
                }
                reader = cs;
            }

            return reader;
        }

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tk = tokenizer.create();
            TokenStream ts = tk;
            for (TokenFilterFactory filter : filters) {
                ts = filter.create(ts);
            }

            return new TokenStreamComponents(tk, ts);
        }
    }

}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy