/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.schema;

import static org.apache.solr.common.params.CommonParams.JSON;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.lang.invoke.MethodHandles;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.SortedSetFieldSource;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.State;
import org.apache.solr.analysis.SolrAnalyzer;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
import org.apache.solr.uninverting.UninvertingReader.Type;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Pre-analyzed field type provides a way to index a serialized token stream, optionally with an
 * independent stored value of a field.
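 *
 * <p>For example, the default {@link JsonPreAnalyzedParser} accepts a JSON value like the
 * following (an illustrative sketch with made-up token values; see {@link JsonPreAnalyzedParser}
 * for the authoritative format):
 *
 * <pre>{@code
 * {"v":"1","str":"Hello World","tokens":[
 *   {"t":"hello","s":0,"e":5,"i":1},
 *   {"t":"world","s":6,"e":11,"i":1}]}
 * }</pre>
 *
 * Here {@code v} is the format version, {@code str} is an optional stored value, and each token
 * carries its term text ({@code t}), start and end offsets ({@code s}, {@code e}), and a position
 * increment ({@code i}).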
 */
public class PreAnalyzedField extends TextField implements HasImplicitIndexAnalyzer {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Init argument name. Value is a fully-qualified class name of the parser that implements {@link
   * PreAnalyzedParser}.
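   *
   * <p>The short names {@code json} and {@code simple} are also accepted (see {@link #init}). A
   * sketch of a schema declaration using this argument, with an illustrative field type name:
   *
   * <pre>{@code
   * <fieldType name="preanalyzed" class="solr.PreAnalyzedField" parserImpl="json"/>
   * }</pre>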
   */
  public static final String PARSER_IMPL = "parserImpl";

  private static final String DEFAULT_IMPL = JsonPreAnalyzedParser.class.getName();

  private PreAnalyzedParser parser;
  private PreAnalyzedAnalyzer preAnalyzer;

  @Override
  public void init(IndexSchema schema, Map<String, String> args) {
    super.init(schema, args);
    String implName = args.get(PARSER_IMPL);
    if (implName == null) {
      parser = new JsonPreAnalyzedParser();
    } else {
      // short name
      if (JSON.equalsIgnoreCase(implName)) {
        parser = new JsonPreAnalyzedParser();
      } else if ("simple".equalsIgnoreCase(implName)) {
        parser = new SimplePreAnalyzedParser();
      } else {
        try {
          Class<? extends PreAnalyzedParser> implClazz =
              schema.getSolrClassLoader().findClass(implName, PreAnalyzedParser.class);
          Constructor<? extends PreAnalyzedParser> c = implClazz.getConstructor(new Class<?>[0]);
          parser = (PreAnalyzedParser) c.newInstance(new Object[0]);
        } catch (Exception e) {
          log.warn(
              "Can't use the configured PreAnalyzedParser class '{}', using defualt {}",
              implName,
              DEFAULT_IMPL,
              e);
          parser = new JsonPreAnalyzedParser();
        }
      }
      args.remove(PARSER_IMPL);
    }
    // create Analyzer instance for reuse:
    preAnalyzer = new PreAnalyzedAnalyzer(parser);
  }

  /**
   * Overridden to return an analyzer consisting of a {@link PreAnalyzedTokenizer}. NOTE: If an
   * index analyzer is specified in the schema, it will be ignored.
   */
  @Override
  public Analyzer getIndexAnalyzer() {
    return preAnalyzer;
  }

  /**
   * Returns the query analyzer defined via the schema, unless there is none, in which case the
   * index-time pre-analyzer is returned.
   *
   * <p>Note that if the schema specifies an index-time analyzer via either {@code <analyzer>} or
   * {@code <analyzer type="index">}, but no query-time analyzer, the query analyzer returned here
   * will be the index-time analyzer specified in the schema rather than the pre-analyzer.
   */
  @Override
  public Analyzer getQueryAnalyzer() {
    Analyzer queryAnalyzer = super.getQueryAnalyzer();
    return queryAnalyzer instanceof FieldType.DefaultAnalyzer ? getIndexAnalyzer() : queryAnalyzer;
  }

  @Override
  public IndexableField createField(SchemaField field, Object value) {
    IndexableField f = null;
    try {
      f = fromString(field, String.valueOf(value));
    } catch (Exception e) {
      log.warn("Error parsing pre-analyzed field '{}'", field.getName(), e);
      return null;
    }
    return f;
  }

  @Override
  public SortField getSortField(SchemaField field, boolean top) {
    return getSortedSetSortField(
        field, SortedSetSelector.Type.MIN, top, SortField.STRING_FIRST, SortField.STRING_LAST);
  }

  @Override
  public ValueSource getValueSource(SchemaField field, QParser parser) {
    return new SortedSetFieldSource(field.getName());
  }

  @Override
  public Type getUninversionType(SchemaField sf) {
    return Type.SORTED_SET_BINARY;
  }

  @Override
  public void write(TextResponseWriter writer, String name, IndexableField f) throws IOException {
    writer.writeStr(name, toExternal(f), true);
  }

  /**
   * Utility method to convert a field to a string that is parse-able by this class.
   *
   * @param f field to convert
   * @return string that is compatible with the serialization format
   * @throws IOException If there is a low-level I/O error.
   */
  public String toFormattedString(Field f) throws IOException {
    return parser.toFormattedString(f);
  }

  /**
   * Utility method to create a {@link org.apache.lucene.document.FieldType} based on the {@link
   * SchemaField}
   */
  public static org.apache.lucene.document.FieldType createFieldType(SchemaField field) {
    if (!field.indexed() && !field.stored()) {
      log.trace("Ignoring unindexed/unstored field: {}", field);
      return null;
    }
    org.apache.lucene.document.FieldType newType = new org.apache.lucene.document.FieldType();
    newType.setTokenized(field.isTokenized());
    newType.setStored(field.stored());
    newType.setOmitNorms(field.omitNorms());
    IndexOptions options = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    if (field.omitTermFreqAndPositions()) {
      options = IndexOptions.DOCS;
    } else if (field.omitPositions()) {
      options = IndexOptions.DOCS_AND_FREQS;
    } else if (field.storeOffsetsWithPositions()) {
      options = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    }
    newType.setIndexOptions(options);
    newType.setStoreTermVectors(field.storeTermVector());
    newType.setStoreTermVectorOffsets(field.storeTermOffsets());
    newType.setStoreTermVectorPositions(field.storeTermPositions());
    newType.setStoreTermVectorPayloads(field.storeTermPayloads());
    return newType;
  }

  /**
   * This is a simple holder of a stored part and the collected states (tokens with attributes).
   */
  public static class ParseResult {
    public String str;
    public byte[] bin;
    public List<State> states = new ArrayList<>();
  }

  /** Parse the input and return the stored part and the tokens with attributes. */
  public static interface PreAnalyzedParser {
    /**
     * Parse input.
     *
     * @param reader input to read from
     * @param parent parent who will own the resulting states (tokens with attributes)
     * @return parse result, with possibly null stored and/or states fields.
     * @throws IOException if a parsing error or IO error occurs
     */
    public ParseResult parse(Reader reader, AttributeSource parent) throws IOException;

    /**
     * Format a field so that the resulting String is valid for parsing with {@link #parse(Reader,
     * AttributeSource)}.
     *
     * @param f field instance
     * @return formatted string
     * @throws IOException If there is a low-level I/O error.
     */
    public String toFormattedString(Field f) throws IOException;
  }

  public IndexableField fromString(SchemaField field, String val) throws Exception {
    if (val == null || val.trim().length() == 0) {
      return null;
    }
    PreAnalyzedTokenizer parse = new PreAnalyzedTokenizer(parser);
    Reader reader = new StringReader(val);
    parse.setReader(reader);
    parse.decodeInput(reader); // consume
    parse.reset();
    org.apache.lucene.document.FieldType type = createFieldType(field);
    if (type == null) {
      parse.close();
      return null;
    }
    Field f = null;
    if (parse.getStringValue() != null) {
      if (field.stored()) {
        f = new Field(field.getName(), parse.getStringValue(), type);
      } else {
        type.setStored(false);
      }
    } else if (parse.getBinaryValue() != null) {
      if (field.isBinary()) {
        f = new Field(field.getName(), parse.getBinaryValue(), type);
      }
    } else {
      type.setStored(false);
    }

    if (parse.hasTokenStream()) {
      if (field.indexed()) {
        type.setTokenized(true);
        if (f != null) {
          f.setTokenStream(parse);
        } else {
          f = new Field(field.getName(), parse, type);
        }
      } else {
        if (f != null) {
          type.setIndexOptions(IndexOptions.NONE);
          type.setTokenized(false);
        }
      }
    }
    return f;
  }

  /** Token stream that works from a list of saved states. */
  private static class PreAnalyzedTokenizer extends Tokenizer {
    private final List<State> cachedStates = new ArrayList<>();
    private Iterator<State> it = null;
    private String stringValue = null;
    private byte[] binaryValue = null;
    private PreAnalyzedParser parser;
    private IOException readerConsumptionException;
    private int lastEndOffset;

    public PreAnalyzedTokenizer(PreAnalyzedParser parser) {
      // we don't pack attributes: since we are used for (de)serialization and don't want bloat.
      super(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
      this.parser = parser;
    }

    public boolean hasTokenStream() {
      return !cachedStates.isEmpty();
    }

    public String getStringValue() {
      return stringValue;
    }

    public byte[] getBinaryValue() {
      return binaryValue;
    }

    @Override
    public final boolean incrementToken() {
      if (!it.hasNext()) {
        return false;
      }
      AttributeSource.State state = it.next();
      restoreState(state.clone());
      // TODO: why can't I lookup the OffsetAttribute up in ctor instead?
      lastEndOffset = addAttribute(OffsetAttribute.class).endOffset();
      return true;
    }

    /**
     * Throws a delayed exception if one was thrown from decodeInput() while reading from the input
     * reader.
     */
    @Override
    public final void reset() throws IOException {
      super.reset();
      if (readerConsumptionException != null) {
        IOException e = new IOException(readerConsumptionException);
        readerConsumptionException = null;
        throw e;
      }
      it = cachedStates.iterator();
    }

    @Override
    public void end() throws IOException {
      super.end();
      // we must set the end offset correctly so multi-valued fields don't try to send offsets
      // backwards:
      addAttribute(OffsetAttribute.class).setOffset(lastEndOffset, lastEndOffset);
    }

    private void setReaderConsumptionException(IOException e) {
      readerConsumptionException = e;
    }

    /** Parses the input reader and adds attributes specified there. */
    private void decodeInput(Reader reader) throws IOException {
      removeAllAttributes(); // reset attributes to the empty set
      cachedStates.clear();
      stringValue = null;
      binaryValue = null;
      try {
        ParseResult res = parser.parse(reader, this);
        if (res != null) {
          stringValue = res.str;
          binaryValue = res.bin;
          if (res.states != null) {
            cachedStates.addAll(res.states);
          }
        }
      } catch (IOException e) {
        removeAllAttributes(); // reset attributes to the empty set
        throw e; // rethrow
      }
    }
  }

  private static class PreAnalyzedAnalyzer extends SolrAnalyzer {
    private PreAnalyzedParser parser;

    PreAnalyzedAnalyzer(PreAnalyzedParser parser) {
      this.parser = parser;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      final PreAnalyzedTokenizer tokenizer = new PreAnalyzedTokenizer(parser);
      return new TokenStreamComponents(
          r -> {
            try {
              tokenizer.decodeInput(r);
            } catch (IOException e) {
              tokenizer.setReaderConsumptionException(e);
            }
          },
          tokenizer);
    }
  }
}