All Downloads are FREE. The search and download functionality uses the official Maven repository.

de.digitalcollections.solrocr.solr.SolrOcrHighlighter Maven / Gradle / Ivy

Go to download

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

The newest version!
/*
 * Contains verbatim code and custom code based on code from the Solr
 * project, licensed under the following terms. All parts where this is
 * the case are clearly marked as such in a source code comment referring
 * to this header.
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE.upstream file distributed
 * with this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * For all parts where this is not the case, refer to the LICENSE file in the
 * repository root.
 */
package de.digitalcollections.solrocr.solr;

import de.digitalcollections.solrocr.lucene.OcrHighlighter;
import de.digitalcollections.solrocr.model.OcrHighlightResult;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.search.Query;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.highlight.UnifiedSolrHighlighter;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.DocList;
import org.apache.solr.util.SolrPluginUtils;

/**
 * Solr highlighter that produces highlighting results for OCR fields by delegating the actual
 * highlighting work to {@link OcrHighlighter}.
 */
public class SolrOcrHighlighter extends UnifiedSolrHighlighter {
  /**
   * Highlights the OCR fields requested via {@link OcrHighlightParams#OCR_FIELDS} for the given
   * documents.
   *
   * @param docs documents to highlight
   * @param query query that is handed to the OCR highlighter
   * @param req current request; supplies the parameters, the searcher and the schema
   * @param respHeader handed through unchanged to {@link OcrHighlighter#highlightOcrFields}
   * @return {@code null} if highlighting is disabled or no OCR fields were requested, an empty
   *     list if {@code docs} is empty, otherwise a list with one entry per document, named by the
   *     document's unique key
   * @throws IOException if one of the underlying searcher or highlighter calls fails with it
   */
  // NOTE(review): `respHeader` is a raw Map in the visible source; its type arguments could not
  // be determined from this file, so it is left untouched to stay call-compatible.
  public NamedList<Object> doHighlighting(
      DocList docs, Query query, SolrQueryRequest req, Map respHeader)
      throws IOException {
    // Copied from superclass
    // - *snip* -
    final SolrParams params = req.getParams();
    if (!isHighlightingEnabled(params)) {
      return null;
    }
    if (docs.size() == 0) {
      return new SimpleOrderedMap<>();
    }
    int[] docIDs = toDocIDs(docs);
    String[] keys = getUniqueKeys(req.getSearcher(), docIDs);
    // - *snap* -

    // query-time parameters
    String[] ocrFieldNames = getOcrHighlightFields(req);
    // No output if no fields were defined
    if (ocrFieldNames == null || ocrFieldNames.length == 0) {
      return null;
    }
    int[] maxPassagesOcr = getMaxPassages(ocrFieldNames, params);

    // Highlight OCR fields
    OcrHighlighter ocrHighlighter =
        new OcrHighlighter(req.getSearcher(), req.getSchema().getIndexAnalyzer(), params);
    OcrHighlightResult[] ocrSnippets =
        ocrHighlighter.highlightOcrFields(ocrFieldNames, query, docIDs, maxPassagesOcr, respHeader);

    // Assemble output data
    SimpleOrderedMap<Object> out = new SimpleOrderedMap<>();
    if (ocrSnippets != null) {
      this.addOcrSnippets(out, keys, ocrSnippets);
    }
    return out;
  }

  /**
   * Determines the maximum number of passages for every field.
   *
   * @param fieldNames fields to look up the limit for
   * @param params request parameters; {@link HighlightParams#SNIPPETS} is read per field
   * @return array parallel to {@code fieldNames}; a field without a configured value gets 1
   */
  private int[] getMaxPassages(String[] fieldNames, SolrParams params) {
    int[] maxPassages = new int[fieldNames.length];
    for (int i = 0; i < fieldNames.length; i++) {
      maxPassages[i] = params.getFieldInt(fieldNames[i], HighlightParams.SNIPPETS, 1);
    }
    return maxPassages;
  }

  /**
   * Adds the per-document highlighting results to the output list.
   *
   * <p>Every key gets an entry in {@code out}, even if there is no result for that document; in
   * that case the entry stays empty.
   *
   * @param out list that receives one entry per document key
   * @param keys document keys, used as entry names
   * @param ocrSnippets results, indexed in parallel to {@code keys}; elements may be {@code null}
   */
  private void addOcrSnippets(
      NamedList<Object> out, String[] keys, OcrHighlightResult[] ocrSnippets) {
    for (int k = 0; k < keys.length; k++) {
      String docId = keys[k];
      // The only values this class ever stores in `out` are the maps created right below.
      @SuppressWarnings("unchecked")
      SimpleOrderedMap<Object> docMap = (SimpleOrderedMap<Object>) out.get(docId);
      if (docMap == null) {
        docMap = new SimpleOrderedMap<>();
        out.add(docId, docMap);
      }
      if (ocrSnippets[k] == null) {
        continue;
      }
      docMap.addAll(ocrSnippets[k].toNamedList());
    }
  }

  /**
   * Obtain all fields among the requested fields that contain OCR data.
   *
   * @param req current request; {@link OcrHighlightParams#OCR_FIELDS} is read from its parameters
   * @return the requested field names with wildcards expanded and surrounding whitespace removed,
   *     or the unmodified parameter value ({@code null} or empty) if no fields were requested
   */
  private String[] getOcrHighlightFields(SolrQueryRequest req) {
    String[] fields = req.getParams().getParams(OcrHighlightParams.OCR_FIELDS);
    if (fields == null || fields.length == 0) {
      return fields;
    }

    // LinkedHashSet: drop duplicates but keep the order in which the fields were requested
    Set<String> expandedFields = new LinkedHashSet<>();
    Collection<String> storedHighlightFieldNames =
        req.getSearcher().getDocFetcher().getStoredHighlightFieldNames();
    for (String field : fields) {
      expandWildcardsInHighlightFields(
          expandedFields, storedHighlightFieldNames, SolrPluginUtils.split(field));
    }

    String[] expanded = expandedFields.toArray(new String[0]);
    // Trim them now in case they haven't been yet.  Not needed for all code-paths above but do it
    // here.
    for (int i = 0; i < expanded.length; i++) {
      expanded[i] = expanded[i].trim();
    }
    return expanded;
  }

  /**
   * Copied from {@link
   * org.apache.solr.highlight.SolrHighlighter#expandWildcardsInHighlightFields(java.util.Set,
   * java.util.Collection, java.lang.String...)} due to private access there. Please refer
   * to the file header for licensing information on the original code.
   *
   * <p>Field names without a {@code *} are added as they are. A field name containing {@code *}
   * is turned into a regular expression and every stored field name that matches it completely
   * is added instead.
   *
   * @param expandedFields set that receives the resulting field names
   * @param storedHighlightFieldNames candidate names a wildcard is matched against
   * @param fields requested field names, possibly containing {@code *} wildcards
   */
  private static void expandWildcardsInHighlightFields(
      Set<String> expandedFields, Collection<String> storedHighlightFieldNames, String... fields) {
    for (String field : fields) {
      if (field.contains("*")) {
        // create a Java regular expression from the wildcard string
        // String#replace substitutes literal text (not a regex), so the target has to be the
        // bare "*"; an escaped "\\*" would look for a backslash that is never there.
        String fieldRegex = field.replace("*", ".*");
        for (String storedFieldName : storedHighlightFieldNames) {
          if (storedFieldName.matches(fieldRegex)) {
            expandedFields.add(storedFieldName);
          }
        }
      } else {
        expandedFields.add(field);
      }
    }
  }
}