org.apache.solr.handler.component.TermVectorComponent Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.component;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.TermVectorParams;
import org.apache.solr.common.util.Base64;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.ReturnFields;
import org.apache.solr.search.SolrDocumentFetcher;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SolrReturnFields;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.SolrCoreAware;

/**
 * Return term vectors for the documents in a query result set.
 * 
 * Info available:
 * term, frequency, position, offset, payloads, IDF.
 * 

 * Note Returning IDF can be expensive.
 * 
 * 
 * <searchComponent name="tvComponent" class="solr.TermVectorComponent"/>
 * 
 * <requestHandler name="/terms" class="solr.SearchHandler">
 *   <lst name="defaults">
 *     <bool name="tv">true</bool>
 *   </lst>
 *   <arr name="last-component">
 *     <str>tvComponent</str>
 *   </arr>
 * </requestHandler>
 *
 *
 */
public class TermVectorComponent extends SearchComponent implements SolrCoreAware {


  public static final String COMPONENT_NAME = "tv";

  public static final String TERM_VECTORS = "termVectors";

  private static final String TV_KEY_WARNINGS = "warnings";

  @SuppressWarnings({"rawtypes"})
  protected NamedList initParams;

  /**
   * Helper method for determining the list of fields that we should 
   * try to find term vectors on.  
   * 
   * Does simple (non-glob-supporting) parsing on the 
   * {@link TermVectorParams#FIELDS} param if specified, otherwise it returns 
   * the concrete field values specified in {@link CommonParams#FL} -- 
   * ignoring functions, transformers, or literals.  
   * 
   * 
   * If "fl=*" is used, or neither param is specified, then null 
   * will be returned.  If the empty set is returned, it means the "fl" 
   * specified consisted entirely of things that are not real fields 
   * (ie: functions, transformers, partial-globs, score, etc...) and not 
   * supported by this component. 
   * 
   */
  private Set getFields(ResponseBuilder rb) {
    SolrParams params = rb.req.getParams();
    String[] fldLst = params.getParams(TermVectorParams.FIELDS);
    if (null == fldLst || 0 == fldLst.length || 
        (1 == fldLst.length && 0 == fldLst[0].length())) {

      // no tv.fl, parse the main fl
      ReturnFields rf = new SolrReturnFields
        (params.getParams(CommonParams.FL), rb.req);

      if (rf.wantsAllFields()) {
        return null;
      }

      Set fieldNames = rf.getLuceneFieldNames();
      return (null != fieldNames) ?
        fieldNames :
        // return empty set indicating no fields should be used
        Collections.emptySet();
    }

    // otherwise us the raw fldList as is, no special parsing or globs
    Set fieldNames = new LinkedHashSet<>();
    for (String fl : fldLst) {
      fieldNames.addAll(Arrays.asList(SolrPluginUtils.split(fl)));
    }
    return fieldNames;
  }

  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
      return;
    }

    NamedList