All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.elasticsearch.action.termvector.TermVectorFields Maven / Gradle / Ivy

There is a newer version: 8.13.3
Show newest version
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.action.termvector;

import com.carrotsearch.hppc.ObjectLongOpenHashMap;
import com.carrotsearch.hppc.cursors.ObjectLongCursor;
import org.apache.lucene.index.*;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamInput;

import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator;

import static org.apache.lucene.util.ArrayUtil.grow;

/**
 * This class represents the result of a {@link TermVectorRequest}. It works
 * exactly like the {@link Fields} class except for one thing: It can return
 * offsets and payloads even if positions are not present. You must call
 * nextPosition() anyway to move the counter although this method only returns
 * -1,, if no positions were returned by the {@link TermVectorRequest}.
 * 

* The data is stored in two byte arrays ({@code headerRef} and * {@code termVectors}, both {@link ByteRef}) that have the following format: *

* {@code headerRef}: Stores offsets per field in the {@code termVectors} array * and some header information as {@link BytesRef}. Format is *

    *
  • String : "TV"
  • *
  • vint: version (=-1)
  • *
  • boolean: hasTermStatistics (are the term statistics stored?)
  • *
  • boolean: hasFieldStatitsics (are the field statistics stored?)
  • *
  • vint: number of fields
  • *
      *
    • String: field name 1
    • *
    • vint: offset in {@code termVectors} for field 1
    • *
    • ...
    • *
    • String: field name last field
    • *
    • vint: offset in {@code termVectors} for last field
    • *
    *
*

* termVectors: Stores the actual term vectors as a {@link BytesRef}. *

* Term vectors for each fields are stored in blocks, one for each field. The * offsets in {@code headerRef} are used to find where the block for a field * starts. Each block begins with a *

    *
  • vint: number of terms
  • *
  • boolean: positions (has it positions stored?)
  • *
  • boolean: offsets (has it offsets stored?)
  • *
  • boolean: payloads (has it payloads stored?)
  • *
* If the field statistics were requested ({@code hasFieldStatistics} is true, * see {@code headerRef}), the following numbers are stored: *
    *
  • vlong: sum of total term freqencies of the field (sumTotalTermFreq)
  • *
  • vlong: sum of document frequencies for each term (sumDocFreq)
  • *
  • vint: number of documents in the shard that has an entry for this field * (docCount)
  • *
*

* After that, for each term it stores *

    *
      *
    • vint: term lengths
    • *
    • BytesRef: term name
    • *
    *

    * If term statistics are requested ({@code hasTermStatistics} is true, see * {@code headerRef}): *

      *
    • vint: document frequency, how often does this term appear in documents?
    • *
    • vlong: total term frequency. Sum of terms in this field.
    • *
    * After that *
      *
    • vint: frequency (always returned)
    • *
        *
      • vint: position_1 (if positions == true)
      • *
      • vint: startOffset_1 (if offset == true)
      • *
      • vint: endOffset_1 (if offset == true)
      • *
      • BytesRef: payload_1 (if payloads == true)
      • *
      • ...
      • *
      • vint: endOffset_freqency (if offset == true)
      • *
      • BytesRef: payload_freqency (if payloads == true)
      • *
          *
      */ public final class TermVectorFields extends Fields { private final ObjectLongOpenHashMap fieldMap; private final BytesReference termVectors; final boolean hasTermStatistic; final boolean hasFieldStatistic; /** * @param headerRef Stores offsets per field in the {@code termVectors} and some * header information as {@link BytesRef}. * @param termVectors Stores the actual term vectors as a {@link BytesRef}. */ public TermVectorFields(BytesReference headerRef, BytesReference termVectors) throws IOException { BytesStreamInput header = new BytesStreamInput(headerRef); fieldMap = new ObjectLongOpenHashMap<>(); // here we read the header to fill the field offset map String headerString = header.readString(); assert headerString.equals("TV"); int version = header.readInt(); assert version == -1; hasTermStatistic = header.readBoolean(); hasFieldStatistic = header.readBoolean(); final int numFields = header.readVInt(); for (int i = 0; i < numFields; i++) { fieldMap.put((header.readString()), header.readVLong()); } header.close(); // reference to the term vector data this.termVectors = termVectors; } @Override public Iterator iterator() { final Iterator> iterator = fieldMap.iterator(); return new Iterator() { @Override public boolean hasNext() { return iterator.hasNext(); } @Override public String next() { return iterator.next().key; } @Override public void remove() { throw new UnsupportedOperationException(); } }; } @Override public Terms terms(String field) throws IOException { // first, find where in the termVectors bytes the actual term vector for // this field is stored if (!fieldMap.containsKey(field)) { return null; // we don't have it. } long offset = fieldMap.lget(); final BytesStreamInput perFieldTermVectorInput = new BytesStreamInput(this.termVectors); perFieldTermVectorInput.reset(); perFieldTermVectorInput.skip(offset); // read how many terms.... final long numTerms = perFieldTermVectorInput.readVLong(); // ...if positions etc. were stored.... final boolean hasPositions = perFieldTermVectorInput.readBoolean(); final boolean hasOffsets = perFieldTermVectorInput.readBoolean(); final boolean hasPayloads = perFieldTermVectorInput.readBoolean(); // read the field statistics final long sumTotalTermFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1; final long sumDocFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1; final int docCount = hasFieldStatistic ? readPotentiallyNegativeVInt(perFieldTermVectorInput) : -1; return new Terms() { @Override public TermsEnum iterator(TermsEnum reuse) throws IOException { // convert bytes ref for the terms to actual data return new TermsEnum() { int currentTerm = 0; int freq = 0; int docFreq = -1; long totalTermFrequency = -1; int[] positions = new int[1]; int[] startOffsets = new int[1]; int[] endOffsets = new int[1]; BytesRef[] payloads = new BytesRef[1]; final BytesRef spare = new BytesRef(); @Override public BytesRef next() throws IOException { if (currentTerm++ < numTerms) { // term string. first the size... int termVectorSize = perFieldTermVectorInput.readVInt(); spare.grow(termVectorSize); // ...then the value. perFieldTermVectorInput.readBytes(spare.bytes, 0, termVectorSize); spare.length = termVectorSize; if (hasTermStatistic) { docFreq = readPotentiallyNegativeVInt(perFieldTermVectorInput); totalTermFrequency = readPotentiallyNegativeVLong(perFieldTermVectorInput); } freq = readPotentiallyNegativeVInt(perFieldTermVectorInput); // grow the arrays to read the values. this is just // for performance reasons. Re-use memory instead of // realloc. growBuffers(); // finally, read the values into the arrays // curentPosition etc. so that we can just iterate // later writeInfos(perFieldTermVectorInput); return spare; } else { return null; } } private void writeInfos(final BytesStreamInput input) throws IOException { for (int i = 0; i < freq; i++) { if (hasPositions) { positions[i] = input.readVInt(); } if (hasOffsets) { startOffsets[i] = input.readVInt(); endOffsets[i] = input.readVInt(); } if (hasPayloads) { int payloadLength = input.readVInt(); if (payloads[i] == null) { payloads[i] = new BytesRef(payloadLength); } else { payloads[i].grow(payloadLength); } input.readBytes(payloads[i].bytes, 0, payloadLength); payloads[i].length = payloadLength; payloads[i].offset = 0; } } } private void growBuffers() { if (hasPositions) { positions = grow(positions, freq); } if (hasOffsets) { startOffsets = grow(startOffsets, freq); endOffsets = grow(endOffsets, freq); } if (hasPayloads) { if (payloads.length < freq) { final BytesRef[] newArray = new BytesRef[ArrayUtil.oversize(freq, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(payloads, 0, newArray, 0, payloads.length); payloads = newArray; } } } @Override public Comparator getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override public SeekStatus seekCeil(BytesRef text) throws IOException { throw new UnsupportedOperationException(); } @Override public void seekExact(long ord) throws IOException { throw new UnsupportedOperationException("Seek is not supported"); } @Override public BytesRef term() throws IOException { return spare; } @Override public long ord() throws IOException { throw new UnsupportedOperationException("ordinals are not supported"); } @Override public int docFreq() throws IOException { return docFreq; } @Override public long totalTermFreq() throws IOException { return totalTermFrequency; } @Override public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { return docsAndPositions(liveDocs, reuse instanceof DocsAndPositionsEnum ? (DocsAndPositionsEnum) reuse : null, 0); } @Override public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { final TermVectorsDocsAndPosEnum retVal = (reuse instanceof TermVectorsDocsAndPosEnum ? (TermVectorsDocsAndPosEnum) reuse : new TermVectorsDocsAndPosEnum()); return retVal.reset(hasPositions ? positions : null, hasOffsets ? startOffsets : null, hasOffsets ? endOffsets : null, hasPayloads ? payloads : null, freq); } }; } @Override public Comparator getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override public long size() throws IOException { return numTerms; } @Override public long getSumTotalTermFreq() throws IOException { return sumTotalTermFreq; } @Override public long getSumDocFreq() throws IOException { return sumDocFreq; } @Override public int getDocCount() throws IOException { return docCount; } @Override public boolean hasFreqs() { return true; } @Override public boolean hasOffsets() { return hasOffsets; } @Override public boolean hasPositions() { return hasPositions; } @Override public boolean hasPayloads() { return hasPayloads; } }; } @Override public int size() { return fieldMap.size(); } private final class TermVectorsDocsAndPosEnum extends DocsAndPositionsEnum { private boolean hasPositions; private boolean hasOffsets; private boolean hasPayloads; int curPos = -1; int doc = -1; private int freq; private int[] startOffsets; private int[] positions; private BytesRef[] payloads; private int[] endOffsets; private DocsAndPositionsEnum reset(int[] positions, int[] startOffsets, int[] endOffsets, BytesRef[] payloads, int freq) { curPos = -1; doc = -1; this.hasPositions = positions != null; this.hasOffsets = startOffsets != null; this.hasPayloads = payloads != null; this.freq = freq; this.startOffsets = startOffsets; this.endOffsets = endOffsets; this.payloads = payloads; this.positions = positions; return this; } @Override public int nextDoc() throws IOException { return doc = (doc == -1 ? 0 : NO_MORE_DOCS); } @Override public int docID() { return doc; } @Override public int advance(int target) throws IOException { while (nextDoc() < target && doc != NO_MORE_DOCS) { } return doc; } @Override public int freq() throws IOException { return freq; } // call nextPosition once before calling this one // because else counter is not advanced @Override public int startOffset() throws IOException { assert curPos < freq && curPos >= 0; return hasOffsets ? startOffsets[curPos] : -1; } @Override // can return -1 if posistions were not requested or // stored but offsets were stored and requested public int nextPosition() throws IOException { assert curPos + 1 < freq; ++curPos; // this is kind of cheating but if you don't need positions // we safe lots fo space on the wire return hasPositions ? positions[curPos] : -1; } @Override public BytesRef getPayload() throws IOException { assert curPos < freq && curPos >= 0; return hasPayloads ? payloads[curPos] : null; } @Override public int endOffset() throws IOException { assert curPos < freq && curPos >= 0; return hasOffsets ? endOffsets[curPos] : -1; } @Override public long cost() { return 1; } } // read a vInt. this is used if the integer might be negative. In this case, // the writer writes a 0 for -1 or value +1 and accordingly we have to // substract 1 again // adds one to mock not existing term freq int readPotentiallyNegativeVInt(BytesStreamInput stream) throws IOException { return stream.readVInt() - 1; } // read a vLong. this is used if the integer might be negative. In this // case, the writer writes a 0 for -1 or value +1 and accordingly we have to // substract 1 again // adds one to mock not existing term freq long readPotentiallyNegativeVLong(BytesStreamInput stream) throws IOException { return stream.readVLong() - 1; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy