org.apache.lucene.search.vectorhighlight.FieldTermStack Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-highlighter Show documentation
This is the highlighter for Apache Lucene (Java).
There is a newer version: 10.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.vectorhighlight;

import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

/**
 * FieldTermStack is a stack that keeps query terms in the specified field of the
 * document to be highlighted.
 */
public class FieldTermStack {

  private final String fieldName;

  // Matched term occurrences, ordered by position once the constructor has finished.
  // Kept package-private and non-final, exactly as before, so same-package code is unaffected.
  LinkedList<TermInfo> termList = new LinkedList<>();

  /**
   * Builds the stack from the term vector of one field of one document.
   *
   * <p>Every occurrence of a term contained in {@code fieldQuery.getTermSet(fieldName)} is
   * recorded as a {@link TermInfo}. The occurrences are then sorted by position, and occurrences
   * sharing a position are chained into a circular list (see {@link TermInfo#getNext()}), with
   * only the first of each group kept on the stack.
   *
   * <p>The stack is left empty when the query has no term set for the field, when the document
   * has no term vectors, or when the field's vector is missing or has no positions. Collection
   * also stops as soon as a position reports a negative start offset.
   *
   * @param reader IndexReader of the index
   * @param docId document id to be highlighted
   * @param fieldName field of the document to be highlighted
   * @param fieldQuery FieldQuery object
   * @throws IOException If there is a low-level I/O error
   */
  public FieldTermStack(
      IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery)
      throws IOException {
    this.fieldName = fieldName;

    Set<String> termSet = fieldQuery.getTermSet(fieldName);
    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null) {
      return;
    }

    final Fields vectors = reader.termVectors().get(docId);
    if (vectors == null) {
      // null snippet
      return;
    }

    final Terms vector = vectors.terms(fieldName);
    if (vector == null || vector.hasPositions() == false) {
      // null snippet
      return;
    }

    final CharsRefBuilder spare = new CharsRefBuilder();
    final TermsEnum termsEnum = vector.iterator();
    PostingsEnum dpEnum = null;
    BytesRef text;

    // maxDoc() is used as the document count in the weight formula below
    int numDocs = reader.maxDoc();

    while ((text = termsEnum.next()) != null) {
      spare.copyUTF8Bytes(text);
      final String term = spare.toString();
      if (!termSet.contains(term)) {
        continue;
      }
      // reuse the previous enum instance where possible
      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.POSITIONS);
      dpEnum.nextDoc();

      // For weight look here:
      // http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
      // weight = ln(numDocs / (docFreq + 1)) + 1
      final float weight =
          (float)
              (Math.log(numDocs / (double) (reader.docFreq(new Term(fieldName, text)) + 1)) + 1.0);

      final int freq = dpEnum.freq();

      for (int i = 0; i < freq; i++) {
        int pos = dpEnum.nextPosition();
        if (dpEnum.startOffset() < 0) {
          return; // no offsets, null snippet
        }
        termList.add(new TermInfo(term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight));
      }
    }

    // sort by position
    Collections.sort(termList);

    // now look for dups at the same position, linking them together
    int currentPos = -1;
    TermInfo previous = null;
    TermInfo first = null;
    Iterator<TermInfo> iterator = termList.iterator();
    while (iterator.hasNext()) {
      TermInfo current = iterator.next();
      if (current.position == currentPos) {
        // same position as the group head: append to the chain and drop it from the stack
        assert previous != null;
        previous.setNext(current);
        previous = current;
        iterator.remove();
      } else {
        // a new position starts: close the circle of the group that just ended
        if (previous != null) {
          previous.setNext(first);
        }
        previous = first = current;
        currentPos = current.position;
      }
    }
    // close the circle of the last group
    if (previous != null) {
      previous.setNext(first);
    }
  }

  /**
   * @return field name
   */
  public String getFieldName() {
    return fieldName;
  }

  /**
   * Removes and returns the head of the stack.
   *
   * @return the top TermInfo object of the stack, or {@code null} if the stack is empty
   */
  public TermInfo pop() {
    return termList.poll();
  }

  /**
   * @param termInfo the TermInfo object to be put on the top of the stack
   */
  public void push(TermInfo termInfo) {
    termList.push(termInfo);
  }

  /**
   * to know whether the stack is empty
   *
   * @return true if the stack is empty, false if not
   */
  public boolean isEmpty() {
    // termList is non-final and package-private, so the null guard is retained
    return termList == null || termList.isEmpty();
  }

  /**
   * Single term with its position/offsets in the document and IDF weight. It is Comparable but
   * considers only position; {@link #equals(Object)} and {@link #hashCode()} likewise look at the
   * position alone.
   */
  public static class TermInfo implements Comparable<TermInfo> {

    private final String text;
    private final int startOffset;
    private final int endOffset;
    private final int position;

    // IDF-weight of this term
    private final float weight;

    // pointer to other TermInfo's at the same position.
    // this is a circular list, so with no syns, just points to itself
    private TermInfo next;

    TermInfo(String text, int startOffset, int endOffset, int position, float weight) {
      this.text = text;
      this.startOffset = startOffset;
      this.endOffset = endOffset;
      this.position = position;
      this.weight = weight;
      this.next = this;
    }

    void setNext(TermInfo next) {
      this.next = next;
    }

    /** Returns the next TermInfo at this same position. This is a circular list! */
    public TermInfo getNext() {
      return next;
    }

    /** Returns the term text. */
    public String getText() {
      return text;
    }

    /** Returns the start offset recorded for this occurrence. */
    public int getStartOffset() {
      return startOffset;
    }

    /** Returns the end offset recorded for this occurrence. */
    public int getEndOffset() {
      return endOffset;
    }

    /** Returns the position of this occurrence. */
    public int getPosition() {
      return position;
    }

    /** Returns the weight assigned to this term. */
    public float getWeight() {
      return weight;
    }

    @Override
    public String toString() {
      return text + '(' + startOffset + ',' + endOffset + ',' + position + ')';
    }

    /** Orders by position only. */
    @Override
    public int compareTo(TermInfo o) {
      // Integer.compare instead of subtraction: same ordering, no overflow risk
      return Integer.compare(this.position, o.position);
    }

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + position;
      return result;
    }

    @Override
    public boolean equals(Object obj) {
      if (this == obj) {
        return true;
      }
      if (obj == null) {
        return false;
      }
      if (getClass() != obj.getClass()) {
        return false;
      }
      TermInfo other = (TermInfo) obj;
      return position == other.position;
    }
  }
}