// docet.engine.SimpleDocetDocSearcher Maven / Gradle / Ivy
//
// Go to download
/*
 * Licensed to Diennea S.r.l. under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Diennea S.r.l. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package docet.engine;

import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.FSDirectory;

import docet.error.DocetDocumentSearchException;
import docet.model.DocetPackageDescriptor;
import docet.model.DocetPage;

/**
 * Simple implementation of a (Lucene-based) document searcher.
 *
 * <p>The searcher works on the Lucene index found at the path given at construction time. The index must
 * be opened with {@link #open()} before searching and released with {@link #close()} afterwards.
 * {@link #open()} and {@link #close()} are serialized through a fair {@link ReentrantLock}; searches read
 * the current reader without taking that lock.
 *
 * @author matteo.casadei
 *
 */
public class SimpleDocetDocSearcher implements DocetDocumentSearcher {

    /** Slop applied to the whole search text when it is searched as a phrase. */
    private static final int DEFAULT_MAX_TERMS_DISTANCE_IN_SEARCH = 6;
    /** Maximum edit distance applied to each single term searched in fuzzy mode. */
    private static final int DEFAULT_TERMS_MAX_DISTANCE_SIMILARITY = 1;
    /** Maximum number of highlighted fragments requested for each matching document. */
    private static final int MAX_NUM_FRAGMENTS = 3;
    /** Only terms strictly longer than this value are also searched as single fuzzy terms. */
    private static final int MIN_TERM_LENGTH_THRESHOLD = 3;
    /** Separator put around an excerpt and between the lines of a single fragment. */
    private static final String MATCHING_EXCERPTS_SEPARATOR = " ... ";
    /** Separator put between two distinct fragments of the same document. */
    private static final String FRAGMENTS_JOINER = "...";

    private static final String LUCENE_QUERY_CONTENT_PREFIX = "contents-";

    private final ReentrantLock lock;
    private final Path searchIndexPath;
    private final DocetPackageDescriptor descriptor;
    // Written only while holding the lock, but read by searches without it: volatile makes the
    // latest open/close visible to searching threads.
    private volatile IndexReader reader;
    private volatile FSDirectory index;

    /**
     * Creates a searcher over the given index. The index is not opened here, see {@link #open()}.
     *
     * @param searchIndexPath path of the directory holding the Lucene index
     * @param descriptor package descriptor queried for the fallback language of a requested language
     */
    public SimpleDocetDocSearcher(final Path searchIndexPath, final DocetPackageDescriptor descriptor) {
        this.searchIndexPath = searchIndexPath;
        this.descriptor = descriptor;
        this.lock = new ReentrantLock(true);
    }

    /**
     * Returns the fallback language the descriptor reports for the given language, or an empty string
     * when the descriptor reports none.
     */
    private String getFallbackLangForLang(final String lang) {
        final String fallbackLang = this.descriptor.getFallbackLangForLang(lang);
        return fallbackLang == null ? "" : fallbackLang;
    }

    /**
     * Searches the index for documents matching the given text.
     *
     * <p>The text is searched both as a sloppy phrase and as single fuzzy terms (see
     * {@link #constructLucenePhraseTermSearchQuery(String)}) on the content field of the fallback language
     * of {@code lang}, or of {@code lang} itself when no fallback is defined. Results are returned in the
     * order Lucene ranked them, each one with an excerpt built from the best highlighted fragments and a
     * relevance expressed as a percentage of the best score.
     *
     * <p>NOTE(review): the return type is declared raw in this file; the element type is whatever
     * {@code DocetPage.toDocetDocument} returns. The type argument was presumably lost, confirm against
     * {@code DocetDocumentSearcher}.
     *
     * @param searchText text to search for; {@code null} or blank text yields an empty result
     * @param lang language of the search
     * @param maxNumResults maximum number of hits asked to Lucene
     * @return the matching documents, best match first
     * @throws DocetDocumentSearchException if the index is not open, the query cannot be parsed or the
     *         index cannot be read
     */
    @Override
    public List searchForMatchingDocuments(final String searchText, final String lang, final int maxNumResults)
        throws DocetDocumentSearchException {
        final List<Object> results = new ArrayList<>();
        if (searchText == null || searchText.trim().isEmpty()) {
            // Nothing to search for: a blank phrase cannot match any document.
            return results;
        }
        final String fallbackLang = this.getFallbackLangForLang(lang);
        final String actualSearchLang = fallbackLang.isEmpty() ? lang : fallbackLang;
        final String contentField = LUCENE_QUERY_CONTENT_PREFIX + actualSearchLang;

        // Snapshot the reader once so that a concurrent close() cannot turn it into null halfway through.
        final IndexReader currentReader = this.reader;
        if (currentReader == null) {
            throw new DocetDocumentSearchException("Error on searching query " + searchText + " for lang "
                                                        + actualSearchLang,
                    new IllegalStateException("Search index is not open"));
        }

        // Analyzer is Closeable: release it as soon as the search is over.
        try (Analyzer analyzer = new AnalyzerBuilder().language(actualSearchLang).build()) {
            final IndexSearcher searcher = new IndexSearcher(currentReader);
            final QueryParser queryParser = new QueryParser(contentField, analyzer);
            final Query query = queryParser.parse(constructLucenePhraseTermSearchQuery(searchText));
            final QueryScorer queryScorer = new QueryScorer(query, contentField);

            final Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
            final Highlighter highlighter = new Highlighter(queryScorer);
            highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
            highlighter.setTextFragmenter(fragmenter);

            final TopDocs hits = searcher.search(query, maxNumResults);
            final float maxScore = hits.getMaxScore();
            // Hits are walked in the order returned by Lucene so that the result list keeps the ranking.
            for (final ScoreDoc hit : hits.scoreDocs) {
                final org.apache.lucene.document.Document doc = searcher.doc(hit.doc);
                final String excerpt = buildExcerpt(highlighter, analyzer, contentField, doc.get(contentField));
                final int relevance = Math.round((hit.score / maxScore) * 100);
                results.add(DocetPage.toDocetDocument(doc, excerpt, relevance));
            }
            return results;
        } catch (ParseException | IOException | InvalidTokenOffsetsException ex) {
            throw new DocetDocumentSearchException("Error on searching query " + searchText + " for lang "
                                                        + actualSearchLang, ex);
        }
    }

    /**
     * Builds the excerpt shown for a matching document: the best highlighted fragments joined together
     * and surrounded by the excerpt separator.
     *
     * @param contents stored value of the content field, {@code null} when the document has no stored
     *        value for it (the excerpt then contains no fragment)
     */
    private static String buildExcerpt(final Highlighter highlighter, final Analyzer analyzer,
        final String field, final String contents) throws IOException, InvalidTokenOffsetsException {
        final String[] fragments = contents == null
                ? new String[0]
                : highlighter.getBestFragments(analyzer, field, contents, MAX_NUM_FRAGMENTS);
        // collapseLines never returns an empty string, so no fragment needs to be filtered out here.
        final String joinedFragments = Arrays.stream(fragments)
                .map(SimpleDocetDocSearcher::collapseLines)
                .collect(Collectors.joining(FRAGMENTS_JOINER));
        return MATCHING_EXCERPTS_SEPARATOR + joinedFragments + MATCHING_EXCERPTS_SEPARATOR;
    }

    /**
     * Puts a multi-line fragment on a single line: blank lines are dropped and the remaining ones are
     * joined with the excerpt separator. A fragment made only of blank lines becomes the separator itself.
     */
    private static String collapseLines(final String fragment) {
        final String collapsed = Arrays.stream(fragment.trim().split("\n"))
                .filter(line -> !line.trim().isEmpty())
                .collect(Collectors.joining(MATCHING_EXCERPTS_SEPARATOR));
        return collapsed.isEmpty() ? MATCHING_EXCERPTS_SEPARATOR : collapsed;
    }

    /**
     * Opens the search index, if not already open.
     *
     * @return {@code true} if the index has been opened by this call, {@code false} if it was already open
     * @throws IOException if the index directory or its reader cannot be opened; in that case nothing is
     *         left open
     */
    @Override
    public boolean open() throws IOException {
        // Acquire outside the try block: unlock() must run only if lock() succeeded.
        this.lock.lock();
        try {
            if (isOpen()) {
                return false;
            }
            final FSDirectory directory = FSDirectory.open(searchIndexPath);
            final IndexReader openedReader;
            try {
                openedReader = DirectoryReader.open(directory);
            } catch (IOException | RuntimeException ex) {
                // Do not leak the directory (nor leave a half-open searcher) when the reader cannot be opened.
                try {
                    directory.close();
                } catch (IOException | RuntimeException closeEx) {
                    ex.addSuppressed(closeEx);
                }
                throw ex;
            }
            this.index = directory;
            this.reader = openedReader;
            return true;
        } finally {
            this.lock.unlock();
        }
    }

    /**
     * Closes the search index, if open.
     *
     * @return {@code true} if the index has been closed by this call, {@code false} if it was not open
     * @throws IOException if closing the reader or the directory fails; the searcher is considered closed
     *         anyway
     */
    @Override
    public boolean close() throws IOException {
        // Acquire outside the try block: unlock() must run only if lock() succeeded.
        this.lock.lock();
        try {
            if (!isOpen()) {
                return false;
            }
            final IndexReader openReader = this.reader;
            final FSDirectory openIndex = this.index;
            this.reader = null;
            this.index = null;
            // Resources are closed in reverse declaration order (reader first, then directory) and the
            // directory gets closed even when closing the reader fails.
            try (FSDirectory directoryToClose = openIndex; IndexReader readerToClose = openReader) {
                return true;
            }
        } finally {
            this.lock.unlock();
        }
    }

    /** Tells whether both the index directory and its reader are currently open. */
    private boolean isOpen() {
        return this.reader != null && this.index != null;
    }

    /**
     * Builds the Lucene query string for the given search text: the whole text as a phrase with slop
     * {@link #DEFAULT_MAX_TERMS_DISTANCE_IN_SEARCH}, OR-ed with every whitespace-separated term longer than
     * {@link #MIN_TERM_LENGTH_THRESHOLD} characters searched in fuzzy mode with edit distance
     * {@link #DEFAULT_TERMS_MAX_DISTANCE_SIMILARITY}.
     *
     * <p>The text is escaped so that characters having a meaning in the Lucene query syntax (quotes,
     * colons, parentheses...) are searched literally instead of breaking or altering the query.
     *
     * @param searchText text typed by the user, not {@code null}
     * @return the query string to hand to the query parser
     */
    private String constructLucenePhraseTermSearchQuery(final String searchText) {
        final String phraseSearch = "\"" + QueryParser.escape(searchText) + "\"~"
                + DEFAULT_MAX_TERMS_DISTANCE_IN_SEARCH;
        final String singleTermsQuery = Arrays.stream(searchText.split("\\s"))
                .map(String::trim)
                .filter(term -> term.length() > MIN_TERM_LENGTH_THRESHOLD)
                .map(term -> " OR " + QueryParser.escape(term) + "~" + DEFAULT_TERMS_MAX_DISTANCE_SIMILARITY)
                .collect(Collectors.joining());
        return phraseSearch + singleTermsQuery;
    }

    /**
     * Builder choosing the Lucene analyzer matching a language code: French for {@code "fr"}, Italian for
     * {@code "it"}, the standard analyzer for any other code.
     */
    private static class AnalyzerBuilder {

        private static final String DEFAULT_LANGUAGE = "it";

        private String lang;

        public AnalyzerBuilder() {
            this.lang = DEFAULT_LANGUAGE;
        }

        /**
         * Sets the language to build the analyzer for.
         *
         * @param lang language code; {@code null} selects the default language
         * @return this builder
         */
        public AnalyzerBuilder language(final String lang) {
            // A switch on a null String throws NullPointerException: treat null as "not specified".
            this.lang = lang == null ? DEFAULT_LANGUAGE : lang;
            return this;
        }

        /**
         * Creates the analyzer for the configured language.
         *
         * @return a new analyzer; the caller is in charge of closing it
         */
        public Analyzer build() {
            switch (this.lang) {
                case "fr":
                    return new FrenchAnalyzer();
                case "it":
                    return new ItalianAnalyzer();
                case "en":
                default:
                    return new StandardAnalyzer();
            }
        }
    }
}