All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.handler.tagger.Tagger Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * This software was produced for the U. S. Government
 * under Contract No. W15P7T-11-C-F600, and is
 * subject to the Rights in Noncommercial Computer Software
 * and Noncommercial Computer Software Documentation
 * Clause 252.227-7014 (JUN 1995)
 *
 * Copyright 2013 The MITRE Corporation. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler.tagger;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.solr.common.util.CollectionUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tags maximum string of words in a corpus. This is a callback-style API in which you implement
 * {@link #tagCallback(int, int, Object)}.
 *
 * <p>This class should be independently usable outside Solr.
 */
public abstract class Tagger {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  // Attributes pulled from the token stream; populated per-token by incrementToken().
  private final TokenStream tokenStream;
  private final TermToBytesRefAttribute byteRefAtt;
  private final PositionIncrementAttribute posIncAtt;
  private final OffsetAttribute offsetAtt;
  private final TaggingAttribute taggingAtt;

  private final TagClusterReducer tagClusterReducer;
  private final Terms terms; // the dictionary terms index; may be null (process() no-ops)
  private final Bits liveDocs; // live-doc bits for the segment; passed through to the cursor
  private final boolean skipAltTokens; // if true, tokens with posInc == 0 are skipped, not fatal
  private final boolean ignoreStopWords; // if true, posInc > 1 gaps do NOT break a tag cluster

  // Optional cache shared by all TermPrefixCursor instances; null unless
  // enableDocIdsCache() was called. Raw type here — presumably keyed by term
  // bytes mapping to doc-id sets; TODO confirm against TermPrefixCursor.
  private Map docIdsCache;

  /** Whether the WARNING about skipped tokens was already logged. */
  private boolean loggedSkippedAltTokenWarning = false;

  /**
   * Creates a tagger over the given terms index and token stream.
   *
   * <p>Registers the required attributes on {@code tokenStream} and calls {@link
   * TokenStream#reset()}; the caller retains ownership of the stream and must close it
   * (see the note at the end of {@link #process()}).
   *
   * @throws IOException from {@link TokenStream#reset()}
   */
  public Tagger(
      Terms terms,
      Bits liveDocs,
      TokenStream tokenStream,
      TagClusterReducer tagClusterReducer,
      boolean skipAltTokens,
      boolean ignoreStopWords)
      throws IOException {
    this.terms = terms;
    this.liveDocs = liveDocs;
    this.tokenStream = tokenStream;
    this.skipAltTokens = skipAltTokens;
    this.ignoreStopWords = ignoreStopWords;
    byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
    tokenStream.reset();

    this.tagClusterReducer = tagClusterReducer;
  }

  /**
   * Enables the doc-ids cache shared by the term cursors.
   *
   * @param initSize initial capacity; values &lt;= 0 leave caching disabled
   */
  public void enableDocIdsCache(int initSize) {
    if (initSize > 0) docIdsCache = CollectionUtil.newHashMap(initSize);
  }

  /**
   * Consumes the token stream, builds tag clusters, and invokes {@link #tagCallback(int, int,
   * Object)} for each surviving tag. No-op when there is no terms index.
   *
   * @throws IOException from the token stream or the terms enumeration
   * @throws IllegalStateException if the analyzer emits alternate tokens (posInc == 0) and
   *     skipAltTokens is false
   * @throws IllegalArgumentException if a token analyzes to a zero-length term
   */
  public void process() throws IOException {
    if (terms == null) return;

    // a shared pointer to the head used by this method and each Tag instance.
    final TagLL[] head = new TagLL[1];

    TermPrefixCursor cursor = null; // re-used

    // boolean switch used to log warnings in case tokens where skipped during tagging.
    boolean skippedTokens = false;

    while (tokenStream.incrementToken()) {
      if (log.isTraceEnabled()) {
        log.trace(
            "Token: {}, posInc: {}, offset: [{},{}]",
            byteRefAtt,
            posIncAtt.getPositionIncrement(),
            offsetAtt.startOffset(),
            offsetAtt.endOffset());
      }
      // check for posInc < 1 (alternate Tokens, such as expanded Synonyms)
      if (posIncAtt.getPositionIncrement() < 1) {
        // (a) Deal with this as a configuration issue and throw an exception
        if (!skipAltTokens) {
          // TODO throw UnsupportedTokenException when PhraseBuilder is ported
          throw new IllegalStateException(
              "Query Analyzer generates alternate "
                  + "Tokens (posInc == 0). Please adapt your Analyzer configuration or "
                  + "enable '"
                  + TaggerRequestHandler.SKIP_ALT_TOKENS
                  + "' to skip such "
                  + "tokens. NOTE: enabling '"
                  + TaggerRequestHandler.SKIP_ALT_TOKENS
                  + "' might result in wrong tagging results if the index time analyzer "
                  + "is not configured accordingly. For detailed information see "
                  + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
        } else {
          // (b) In case the index time analyser had indexed all variants (users
          // need to ensure that) processing of alternate tokens can be skipped
          // as anyways all alternatives will be contained in the FST.
          skippedTokens = true;
          log.trace(" ... ignored token");
          continue;
        }
      }
      // -- If PositionIncrement > 1 (stopwords): a position gap ends any open cluster,
      // unless stop words are configured to be ignored for clustering purposes.
      if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) {
        log.trace(" - posInc > 1 ... mark cluster as done");
        advanceTagsAndProcessClusterIfDone(head, null);
      }

      final BytesRef term;
      // NOTE: we need to lookup tokens if
      // * the LookupAtt is true OR
      // * there are still advancing tags (to find the longest possible match)
      if (taggingAtt.isTaggable() || head[0] != null) {
        // -- Lookup the term id from the next token
        term = byteRefAtt.getBytesRef();
        if (term.length == 0) {
          throw new IllegalArgumentException(
              "term: " + term.utf8ToString() + " analyzed to a zero-length token");
        }
      } else { // no current cluster AND lookup == false ...
        term = null; // skip this token
      }

      // -- Process tag: advance all in-flight tags with this term (or null),
      // and flush the cluster via the reducer + tagCallback if none advanced.
      advanceTagsAndProcessClusterIfDone(head, term);

      // -- only create new Tags for Tokens we need to lookup
      if (taggingAtt.isTaggable() && term != null) {

        // determine if the terms index has a term starting with the provided term
        // TODO create a pool of these cursors to reuse them more? could be trivial impl
        if (cursor == null) { // (else the existing cursor will be re-used)
          cursor = new TermPrefixCursor(terms.iterator(), liveDocs, docIdsCache);
        }
        if (cursor.advance(term)) {
          TagLL newTail =
              new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null);
          cursor = null; // because the new tag now "owns" this instance
          // and add it to the end of the linked list of in-flight tags
          if (head[0] == null) {
            head[0] = newTail;
          } else {
            for (TagLL t = head[0]; true; t = t.nextTag) {
              if (t.nextTag == null) {
                t.addAfterLL(newTail);
                break;
              }
            }
          }
        }
      } // if termId >= 0
    } // end while(incrementToken())

    // -- Finish all tags: a null term forces every remaining tag to terminate,
    // so the final cluster (if any) is reduced and reported.
    advanceTagsAndProcessClusterIfDone(head, null);
    assert head[0] == null;

    if (!loggedSkippedAltTokenWarning && skippedTokens) {
      loggedSkippedAltTokenWarning = true; // only log once
      log.warn(
          "{}{}{}{}",
          "The Tagger skipped some alternate tokens (tokens with posInc == 0) ",
          "while processing text. This may cause problems with some Analyzer ",
          "configurations (e.g. query time synonym expansion). For details see ",
          "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
    }

    tokenStream.end();
    // tokenStream.close(); caller closes because caller acquired it
  }

  /**
   * Advances every in-flight tag in the linked list at {@code head[0]} with {@code term}
   * (null means "no more input for these tags"). If no tag advanced, the cluster is
   * complete: it is reduced via the {@link TagClusterReducer}, each surviving tag is
   * reported through {@link #tagCallback(int, int, Object)}, and the list is cleared.
   */
  private void advanceTagsAndProcessClusterIfDone(TagLL[] head, BytesRef term) throws IOException {
    // -- Advance tags
    final int endOffset = term != null ? offsetAtt.endOffset() : -1;
    boolean anyAdvance = false;
    for (TagLL t = head[0]; t != null; t = t.nextTag) {
      anyAdvance |= t.advance(term, endOffset);
    }

    // -- Process cluster if done
    if (!anyAdvance && head[0] != null) {
      tagClusterReducer.reduce(head);
      for (TagLL t = head[0]; t != null; t = t.nextTag) {
        assert t.value != null;
        tagCallback(t.startOffset, t.endOffset, t.value);
      }
      head[0] = null;
    }
  }

  /**
   * Invoked by {@link #process()} for each tag found. endOffset is always &gt;= the endOffset
   * given in the previous call.
   *
   * @param startOffset The character offset of the original stream where the tag starts.
   * @param endOffset One more than the character offset of the original stream where the tag ends.
   * @param docIdsKey A reference to the matching docIds that can be resolved via {@link
   *     #lookupDocIds(Object)}.
   */
  protected abstract void tagCallback(int startOffset, int endOffset, Object docIdsKey);

  /**
   * Returns a sorted array of integer docIds given the corresponding key.
   *
   * @param docIdsKey The lookup key.
   * @return Not null
   */
  protected IntsRef lookupDocIds(Object docIdsKey) {
    return (IntsRef) docIdsKey;
  }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy