All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.solr.analysis.TokenizerChain Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.analysis;

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharFilterFactory;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.TokenizerFactory;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

/**
 * An analyzer that uses a tokenizer and a list of token filters to create a TokenStream.
 *
 * 

It should probably be replaced with {@link CustomAnalyzer}. * * @since 3.1 */ public final class TokenizerChain extends SolrAnalyzer { private static final CharFilterFactory[] EMPTY_CHAR_FITLERS = new CharFilterFactory[0]; private static final TokenFilterFactory[] EMPTY_TOKEN_FITLERS = new TokenFilterFactory[0]; private final CharFilterFactory[] charFilters; private final TokenizerFactory tokenizer; private final TokenFilterFactory[] filters; /** Copy from CustomAnalyzer. */ public TokenizerChain(CustomAnalyzer customAnalyzer) { this( customAnalyzer.getCharFilterFactories().toArray(new CharFilterFactory[0]), customAnalyzer.getTokenizerFactory(), customAnalyzer.getTokenFilterFactories().toArray(new TokenFilterFactory[0])); setPositionIncrementGap(customAnalyzer.getPositionIncrementGap(null)); assert customAnalyzer.getOffsetGap(null) == 1; // note: we don't support setting the offset gap } /** * Creates a new TokenizerChain w/o any CharFilterFactories. * * @param tokenizer Factory for the Tokenizer to use, must not be null. * @param filters Factories for the TokenFilters to use - if null, will be treated as if empty. */ public TokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) { this(null, tokenizer, filters); } /** * Creates a new TokenizerChain. * * @param charFilters Factories for the CharFilters to use, if any - if null, will be treated as * if empty. * @param tokenizer Factory for the Tokenizer to use, must not be null. * @param filters Factories for the TokenFilters to use if any- if null, will be treated as if * empty. */ public TokenizerChain( CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) { charFilters = null == charFilters ? EMPTY_CHAR_FITLERS : charFilters; filters = null == filters ? 
EMPTY_TOKEN_FITLERS : filters; if (null == tokenizer) { throw new NullPointerException("TokenizerFactory must not be null"); } this.charFilters = charFilters; this.tokenizer = tokenizer; this.filters = filters; } /** * @return array of CharFilterFactories, may be empty but never null */ public CharFilterFactory[] getCharFilterFactories() { return charFilters; } /** * @return the TokenizerFactory in use, will never be null */ public TokenizerFactory getTokenizerFactory() { return tokenizer; } /** * @return array of TokenFilterFactories, may be empty but never null */ public TokenFilterFactory[] getTokenFilterFactories() { return filters; } @Override public Reader initReader(String fieldName, Reader reader) { if (charFilters != null && charFilters.length > 0) { Reader cs = reader; for (CharFilterFactory charFilter : charFilters) { cs = charFilter.create(cs); } reader = cs; } return reader; } @Override protected Reader initReaderForNormalization(String fieldName, Reader reader) { if (charFilters != null && charFilters.length > 0) { for (CharFilterFactory charFilter : charFilters) { reader = charFilter.normalize(reader); } } return reader; } @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tk = tokenizer.create(attributeFactory(fieldName)); TokenStream ts = tk; for (TokenFilterFactory filter : filters) { ts = filter.create(ts); } return new TokenStreamComponents(tk, ts); } @Override protected TokenStream normalize(String fieldName, TokenStream in) { TokenStream result = in; for (TokenFilterFactory filter : filters) { result = filter.normalize(result); } return result; } @Override public String toString() { StringBuilder sb = new StringBuilder("TokenizerChain("); for (CharFilterFactory filter : charFilters) { sb.append(filter); sb.append(", "); } sb.append(tokenizer); for (TokenFilterFactory filter : filters) { sb.append(", "); sb.append(filter); } sb.append(')'); return sb.toString(); } public Analyzer getMultiTermAnalyzer() { 
return new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tk = new KeywordTokenizer(); TokenStream ts = tk; for (TokenFilterFactory filter : filters) { ts = filter.normalize(ts); } return new TokenStreamComponents(tk, ts); } @Override protected Reader initReader(String fieldName, Reader reader) { if (charFilters != null && charFilters.length > 0) { Reader cs = reader; for (CharFilterFactory charFilter : charFilters) { cs = charFilter.normalize(cs); } reader = cs; } return reader; } }; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy