All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.index.TermsEnum Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;


import java.io.IOException;

import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;

/** Iterator to seek ({@link #seekCeil(BytesRef)}, {@link
 * #seekExact(BytesRef)}) or step through ({@link
 * #next} terms to obtain frequency information ({@link
 * #docFreq}), {@link PostingsEnum} or {@link
 * PostingsEnum} for the current term ({@link
 * #postings}.
 * 
 * 

Term enumerations are always ordered by * BytesRef.compareTo, which is Unicode sort * order if the terms are UTF-8 bytes. Each term in the * enumeration is greater than the one before it.

* *

The TermsEnum is unpositioned when you first obtain it * and you must first successfully call {@link #next} or one * of the seek methods. * * @lucene.experimental */ public abstract class TermsEnum implements BytesRefIterator { private AttributeSource atts = null; /** Sole constructor. (For invocation by subclass * constructors, typically implicit.) */ protected TermsEnum() { } /** Returns the related attributes. */ public AttributeSource attributes() { if (atts == null) atts = new AttributeSource(); return atts; } /** Represents returned result from {@link #seekCeil}. */ public static enum SeekStatus { /** The term was not found, and the end of iteration was hit. */ END, /** The precise term was found. */ FOUND, /** A different term was found after the requested term */ NOT_FOUND }; /** Attempts to seek to the exact term, returning * true if the term is found. If this returns false, the * enum is unpositioned. For some codecs, seekExact may * be substantially faster than {@link #seekCeil}. */ public boolean seekExact(BytesRef text) throws IOException { return seekCeil(text) == SeekStatus.FOUND; } /** Seeks to the specified term, if it exists, or to the * next (ceiling) term. Returns SeekStatus to * indicate whether exact term was found, a different * term was found, or EOF was hit. The target term may * be before or after the current term. If this returns * SeekStatus.END, the enum is unpositioned. */ public abstract SeekStatus seekCeil(BytesRef text) throws IOException; /** Seeks to the specified term by ordinal (position) as * previously returned by {@link #ord}. The target ord * may be before or after the current ord, and must be * within bounds. */ public abstract void seekExact(long ord) throws IOException; /** * Expert: Seeks a specific position by {@link TermState} previously obtained * from {@link #termState()}. Callers should maintain the {@link TermState} to * use this method. Low-level implementations may position the TermsEnum * without re-seeking the term dictionary. *

* Seeking by {@link TermState} should only be used iff the state was obtained * from the same {@link TermsEnum} instance. *

* NOTE: Using this method with an incompatible {@link TermState} might leave * this {@link TermsEnum} in undefined state. On a segment level * {@link TermState} instances are compatible only iff the source and the * target {@link TermsEnum} operate on the same field. If operating on segment * level, TermState instances must not be used across segments. *

* NOTE: A seek by {@link TermState} might not restore the * {@link AttributeSource}'s state. {@link AttributeSource} states must be * maintained separately if this method is used. * @param term the term the TermState corresponds to * @param state the {@link TermState} * */ public void seekExact(BytesRef term, TermState state) throws IOException { if (!seekExact(term)) { throw new IllegalArgumentException("term=" + term + " does not exist"); } } /** Returns current term. Do not call this when the enum * is unpositioned. */ public abstract BytesRef term() throws IOException; /** Returns ordinal position for current term. This is an * optional method (the codec may throw {@link * UnsupportedOperationException}). Do not call this * when the enum is unpositioned. */ public abstract long ord() throws IOException; /** Returns the number of documents containing the current * term. Do not call this when the enum is unpositioned. * {@link SeekStatus#END}.*/ public abstract int docFreq() throws IOException; /** Returns the total number of occurrences of this term * across all documents (the sum of the freq() for each * doc that has this term). This will be -1 if the * codec doesn't support this measure. Note that, like * other term measures, this measure does not take * deleted documents into account. */ public abstract long totalTermFreq() throws IOException; /** Get {@link PostingsEnum} for the current term. Do not * call this when the enum is unpositioned. This method * will not return null. *

* NOTE: the returned iterator may return deleted documents, so * deleted documents have to be checked on top of the {@link PostingsEnum}. *

* Use this method if you only require documents and frequencies, * and do not need any proximity data. * This method is equivalent to * {@link #postings(PostingsEnum, int) postings(reuse, PostingsEnum.FREQS)} * * @param reuse pass a prior PostingsEnum for possible reuse * @see #postings(PostingsEnum, int) */ public final PostingsEnum postings(PostingsEnum reuse) throws IOException { return postings(reuse, PostingsEnum.FREQS); } /** Get {@link PostingsEnum} for the current term, with * control over whether freqs, positions, offsets or payloads * are required. Do not call this when the enum is * unpositioned. This method may return null if the postings * information required is not available from the index *

* NOTE: the returned iterator may return deleted documents, so * deleted documents have to be checked on top of the {@link PostingsEnum}. * * @param reuse pass a prior PostingsEnum for possible reuse * @param flags specifies which optional per-document values * you require; see {@link PostingsEnum#FREQS} */ public abstract PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException; /** * Expert: Returns the TermsEnums internal state to position the TermsEnum * without re-seeking the term dictionary. *

* NOTE: A seek by {@link TermState} might not capture the * {@link AttributeSource}'s state. Callers must maintain the * {@link AttributeSource} states separately * * @see TermState * @see #seekExact(BytesRef, TermState) */ public TermState termState() throws IOException { return new TermState() { @Override public void copyFrom(TermState other) { throw new UnsupportedOperationException(); } }; } /** An empty TermsEnum for quickly returning an empty instance e.g. * in {@link org.apache.lucene.search.MultiTermQuery} *

Please note: This enum should be unmodifiable, * but it is currently possible to add Attributes to it. * This should not be a problem, as the enum is always empty and * the existence of unused Attributes does not matter. */ public static final TermsEnum EMPTY = new TermsEnum() { @Override public SeekStatus seekCeil(BytesRef term) { return SeekStatus.END; } @Override public void seekExact(long ord) {} @Override public BytesRef term() { throw new IllegalStateException("this method should never be called"); } @Override public int docFreq() { throw new IllegalStateException("this method should never be called"); } @Override public long totalTermFreq() { throw new IllegalStateException("this method should never be called"); } @Override public long ord() { throw new IllegalStateException("this method should never be called"); } @Override public PostingsEnum postings(PostingsEnum reuse, int flags) { throw new IllegalStateException("this method should never be called"); } @Override public BytesRef next() { return null; } @Override // make it synchronized here, to prevent double lazy init public synchronized AttributeSource attributes() { return super.attributes(); } @Override public TermState termState() { throw new IllegalStateException("this method should never be called"); } @Override public void seekExact(BytesRef term, TermState state) { throw new IllegalStateException("this method should never be called"); } }; /** Get {@link DocsEnum} for the current term. Do not * call this when the enum is unpositioned. This method * will not return null. * * @param liveDocs unset bits are documents that should not * be returned * @param reuse pass a prior DocsEnum for possible reuse * @deprecated Use {@link #postings(PostingsEnum)} instead */ @Deprecated public final DocsEnum docs(Bits liveDocs, DocsEnum reuse) throws IOException { return docs(liveDocs, reuse, DocsEnum.FLAG_FREQS); } /** Get {@link DocsEnum} for the current term, with * control over whether freqs are required. Do not * call this when the enum is unpositioned. This method * will not return null. * * @param liveDocs unset bits are documents that should not * be returned * @param reuse pass a prior DocsEnum for possible reuse * @param flags specifies which optional per-document values * you require; see {@link DocsEnum#FLAG_FREQS} * @see #docs(Bits, DocsEnum, int) * @deprecated Use {@link #postings(PostingsEnum, int)} instead */ @Deprecated public final DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { final int newFlags; if (flags == DocsEnum.FLAG_FREQS) { newFlags = PostingsEnum.FREQS; } else if (flags == DocsEnum.FLAG_NONE) { newFlags = PostingsEnum.NONE; } else { throw new IllegalArgumentException("Invalid legacy docs flags: " + flags); } PostingsEnum actualReuse = DocsAndPositionsEnum.unwrap(reuse); PostingsEnum postings = postings(actualReuse, newFlags); if (postings == null) { throw new AssertionError(); } else if (postings == actualReuse && liveDocs == DocsAndPositionsEnum.unwrapliveDocs(reuse)) { return reuse; } else { return DocsAndPositionsEnum.wrap(postings, liveDocs); } }; /** Get {@link DocsAndPositionsEnum} for the current term. * Do not call this when the enum is unpositioned. This * method will return null if positions were not * indexed. * * @param liveDocs unset bits are documents that should not * be returned * @param reuse pass a prior DocsAndPositionsEnum for possible reuse * @see #docsAndPositions(Bits, DocsAndPositionsEnum, int) * @deprecated Use {@link #postings(PostingsEnum, int)} instead */ @Deprecated public final DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { return docsAndPositions(liveDocs, reuse, DocsAndPositionsEnum.FLAG_OFFSETS | DocsAndPositionsEnum.FLAG_PAYLOADS); } /** Get {@link DocsAndPositionsEnum} for the current term, * with control over whether offsets and payloads are * required. Some codecs may be able to optimize their * implementation when offsets and/or payloads are not required. * Do not call this when the enum is unpositioned. This * will return null if positions were not indexed. * @param liveDocs unset bits are documents that should not * be returned * @param reuse pass a prior DocsAndPositionsEnum for possible reuse * @param flags specifies which optional per-position values you * require; see {@link DocsAndPositionsEnum#FLAG_OFFSETS} and * {@link DocsAndPositionsEnum#FLAG_PAYLOADS}. * @deprecated Use {@link #postings(PostingsEnum, int)} instead */ @Deprecated public final DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { final int newFlags; if (flags == (DocsAndPositionsEnum.FLAG_OFFSETS | DocsAndPositionsEnum.FLAG_PAYLOADS)) { newFlags = PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS; } else if (flags == DocsAndPositionsEnum.FLAG_OFFSETS) { newFlags = PostingsEnum.OFFSETS; } else if (flags == DocsAndPositionsEnum.FLAG_PAYLOADS) { newFlags = PostingsEnum.PAYLOADS; } else if (flags == DocsAndPositionsEnum.FLAG_NONE) { newFlags = PostingsEnum.POSITIONS; } else { throw new IllegalArgumentException("Invalid legacy docsAndPositions flags: " + flags); } PostingsEnum actualReuse = DocsAndPositionsEnum.unwrap(reuse); PostingsEnum postings = postings(actualReuse, newFlags | DocsAndPositionsEnum.OLD_NULL_SEMANTICS); if (postings == null) { return null; // if no positions were indexed } else if (postings == actualReuse && liveDocs == DocsAndPositionsEnum.unwrapliveDocs(reuse)) { return reuse; } else { return DocsAndPositionsEnum.wrap(postings, liveDocs); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy