All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.handler.tagger.Tagger Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * This software was produced for the U. S. Government
 * under Contract No. W15P7T-11-C-F600, and is
 * subject to the Rights in Noncommercial Computer Software
 * and Noncommercial Computer Software Documentation
 * Clause 252.227-7014 (JUN 1995)
 *
 * Copyright 2013 The MITRE Corporation. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler.tagger;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.solr.common.util.CollectionUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tags maximum string of words in a corpus. This is a callback-style API in which you implement
 * {@link #tagCallback(int, int, Object)}.
 *
 * <p>This class should be independently usable outside Solr.
 */
public abstract class Tagger {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  // Attributes pulled from the token stream; populated per-token by incrementToken().
  private final TokenStream tokenStream;
  private final TermToBytesRefAttribute byteRefAtt;
  private final PositionIncrementAttribute posIncAtt;
  private final OffsetAttribute offsetAtt;
  private final TaggingAttribute taggingAtt;

  private final TagClusterReducer tagClusterReducer;
  private final Terms terms; // the dictionary terms index; may be null (process() no-ops)
  private final Bits liveDocs; // live-doc bits for the segment; passed through to the cursor
  private final boolean skipAltTokens; // if true, tokens with posInc == 0 are skipped, not fatal
  private final boolean ignoreStopWords; // if true, posInc > 1 gaps do NOT break a tag cluster

  // Optional cache shared by all TermPrefixCursor instances; null unless
  // enableDocIdsCache() was called. Raw type here — presumably keyed by term
  // bytes mapping to doc-id sets; TODO confirm against TermPrefixCursor.
  private Map docIdsCache;

  /** Whether the WARNING about skipped tokens was already logged. */
  private boolean loggedSkippedAltTokenWarning = false;

  /**
   * Creates a tagger over the given terms index and token stream.
   *
   * <p>Registers the required attributes on {@code tokenStream} and calls {@link
   * TokenStream#reset()}; the caller retains ownership of the stream and must close it
   * (see the note at the end of {@link #process()}).
   *
   * @throws IOException from {@link TokenStream#reset()}
   */
  public Tagger(
      Terms terms,
      Bits liveDocs,
      TokenStream tokenStream,
      TagClusterReducer tagClusterReducer,
      boolean skipAltTokens,
      boolean ignoreStopWords)
      throws IOException {
    this.terms = terms;
    this.liveDocs = liveDocs;
    this.tokenStream = tokenStream;
    this.skipAltTokens = skipAltTokens;
    this.ignoreStopWords = ignoreStopWords;
    byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
    tokenStream.reset();

    this.tagClusterReducer = tagClusterReducer;
  }

  /**
   * Enables the doc-ids cache shared by the term cursors.
   *
   * @param initSize initial capacity; values &lt;= 0 leave caching disabled
   */
  public void enableDocIdsCache(int initSize) {
    if (initSize > 0) docIdsCache = CollectionUtil.newHashMap(initSize);
  }

  /**
   * Consumes the token stream, builds tag clusters, and invokes {@link #tagCallback(int, int,
   * Object)} for each surviving tag. No-op when there is no terms index.
   *
   * @throws IOException from the token stream or the terms enumeration
   * @throws IllegalStateException if the analyzer emits alternate tokens (posInc == 0) and
   *     skipAltTokens is false
   * @throws IllegalArgumentException if a token analyzes to a zero-length term
   */
  public void process() throws IOException {
    if (terms == null) return;

    // a shared pointer to the head used by this method and each Tag instance.
    final TagLL[] head = new TagLL[1];

    TermPrefixCursor cursor = null; // re-used

    // boolean switch used to log warnings in case tokens where skipped during tagging.
    boolean skippedTokens = false;

    while (tokenStream.incrementToken()) {
      if (log.isTraceEnabled()) {
        log.trace(
            "Token: {}, posInc: {}, offset: [{},{}]",
            byteRefAtt,
            posIncAtt.getPositionIncrement(),
            offsetAtt.startOffset(),
            offsetAtt.endOffset());
      }
      // check for posInc < 1 (alternate Tokens, such as expanded Synonyms)
      if (posIncAtt.getPositionIncrement() < 1) {
        // (a) Deal with this as a configuration issue and throw an exception
        if (!skipAltTokens) {
          // TODO throw UnsupportedTokenException when PhraseBuilder is ported
          throw new IllegalStateException(
              "Query Analyzer generates alternate "
                  + "Tokens (posInc == 0). Please adapt your Analyzer configuration or "
                  + "enable '"
                  + TaggerRequestHandler.SKIP_ALT_TOKENS
                  + "' to skip such "
                  + "tokens. NOTE: enabling '"
                  + TaggerRequestHandler.SKIP_ALT_TOKENS
                  + "' might result in wrong tagging results if the index time analyzer "
                  + "is not configured accordingly. For detailed information see "
                  + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
        } else {
          // (b) In case the index time analyser had indexed all variants (users
          // need to ensure that) processing of alternate tokens can be skipped
          // as anyways all alternatives will be contained in the FST.
          skippedTokens = true;
          log.trace(" ... ignored token");
          continue;
        }
      }
      // -- If PositionIncrement > 1 (stopwords): a position gap ends any open cluster,
      // unless stop words are configured to be ignored for clustering purposes.
      if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) {
        log.trace(" - posInc > 1 ... mark cluster as done");
        advanceTagsAndProcessClusterIfDone(head, null);
      }

      final BytesRef term;
      // NOTE: we need to lookup tokens if
      // * the LookupAtt is true OR
      // * there are still advancing tags (to find the longest possible match)
      if (taggingAtt.isTaggable() || head[0] != null) {
        // -- Lookup the term id from the next token
        term = byteRefAtt.getBytesRef();
        if (term.length == 0) {
          throw new IllegalArgumentException(
              "term: " + term.utf8ToString() + " analyzed to a zero-length token");
        }
      } else { // no current cluster AND lookup == false ...
        term = null; // skip this token
      }

      // -- Process tag: advance all in-flight tags with this term (or null),
      // and flush the cluster via the reducer + tagCallback if none advanced.
      advanceTagsAndProcessClusterIfDone(head, term);

      // -- only create new Tags for Tokens we need to lookup
      if (taggingAtt.isTaggable() && term != null) {

        // determine if the terms index has a term starting with the provided term
        // TODO create a pool of these cursors to reuse them more? could be trivial impl
        if (cursor == null) { // (else the existing cursor will be re-used)
          cursor = new TermPrefixCursor(terms.iterator(), liveDocs, docIdsCache);
        }
        if (cursor.advance(term)) {
          TagLL newTail =
              new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null);
          cursor = null; // because the new tag now "owns" this instance
          // and add it to the end of the linked list of in-flight tags
          if (head[0] == null) {
            head[0] = newTail;
          } else {
            for (TagLL t = head[0]; true; t = t.nextTag) {
              if (t.nextTag == null) {
                t.addAfterLL(newTail);
                break;
              }
            }
          }
        }
      } // if termId >= 0
    } // end while(incrementToken())

    // -- Finish all tags: a null term forces every remaining tag to terminate,
    // so the final cluster (if any) is reduced and reported.
    advanceTagsAndProcessClusterIfDone(head, null);
    assert head[0] == null;

    if (!loggedSkippedAltTokenWarning && skippedTokens) {
      loggedSkippedAltTokenWarning = true; // only log once
      log.warn(
          "{}{}{}{}",
          "The Tagger skipped some alternate tokens (tokens with posInc == 0) ",
          "while processing text. This may cause problems with some Analyzer ",
          "configurations (e.g. query time synonym expansion). For details see ",
          "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
    }

    tokenStream.end();
    // tokenStream.close(); caller closes because caller acquired it
  }

  /**
   * Advances every in-flight tag in the linked list at {@code head[0]} with {@code term}
   * (null means "no more input for these tags"). If no tag advanced, the cluster is
   * complete: it is reduced via the {@link TagClusterReducer}, each surviving tag is
   * reported through {@link #tagCallback(int, int, Object)}, and the list is cleared.
   */
  private void advanceTagsAndProcessClusterIfDone(TagLL[] head, BytesRef term) throws IOException {
    // -- Advance tags
    final int endOffset = term != null ? offsetAtt.endOffset() : -1;
    boolean anyAdvance = false;
    for (TagLL t = head[0]; t != null; t = t.nextTag) {
      anyAdvance |= t.advance(term, endOffset);
    }

    // -- Process cluster if done
    if (!anyAdvance && head[0] != null) {
      tagClusterReducer.reduce(head);
      for (TagLL t = head[0]; t != null; t = t.nextTag) {
        assert t.value != null;
        tagCallback(t.startOffset, t.endOffset, t.value);
      }
      head[0] = null;
    }
  }

  /**
   * Invoked by {@link #process()} for each tag found. endOffset is always &gt;= the endOffset
   * given in the previous call.
   *
   * @param startOffset The character offset of the original stream where the tag starts.
   * @param endOffset One more than the character offset of the original stream where the tag ends.
   * @param docIdsKey A reference to the matching docIds that can be resolved via {@link
   *     #lookupDocIds(Object)}.
   */
  protected abstract void tagCallback(int startOffset, int endOffset, Object docIdsKey);

  /**
   * Returns a sorted array of integer docIds given the corresponding key.
   *
   * @param docIdsKey The lookup key.
   * @return Not null
   */
  protected IntsRef lookupDocIds(Object docIdsKey) {
    return (IntsRef) docIdsKey;
  }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy