All downloads are free. Search and download functionality uses the official Maven repository.

org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat Maven / Gradle / Ivy

There is a newer version: 7.10.2_1
Show newest version
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.codec.postingsformat;

import org.apache.lucene.store.IndexInput;

import org.apache.lucene.codecs.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.util.BloomFilter;

import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;

/**
 * 

* A {@link PostingsFormat} useful for low doc-frequency fields such as primary * keys. Bloom filters are maintained in a ".blm" file which offers "fast-fail" * for reads in segments known to have no record of the key. A choice of * delegate PostingsFormat is used to record all other Postings data. *

*

* This is a special bloom filter version, based on {@link org.elasticsearch.common.util.BloomFilter} and inspired * by Lucene {@link org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat}. *

* @deprecated only for reading old segments */ @Deprecated public class BloomFilterPostingsFormat extends PostingsFormat { public static final String BLOOM_CODEC_NAME = "XBloomFilter"; // the Lucene one is named BloomFilter public static final int BLOOM_CODEC_VERSION = 1; public static final int BLOOM_CODEC_VERSION_CHECKSUM = 2; public static final int BLOOM_CODEC_VERSION_CURRENT = BLOOM_CODEC_VERSION_CHECKSUM; /** * Extension of Bloom Filters file */ static final String BLOOM_EXTENSION = "blm"; private BloomFilter.Factory bloomFilterFactory = BloomFilter.Factory.DEFAULT; private PostingsFormat delegatePostingsFormat; /** * Creates Bloom filters for a selection of fields created in the index. This * is recorded as a set of Bitsets held as a segment summary in an additional * "blm" file. This PostingsFormat delegates to a choice of delegate * PostingsFormat for encoding all other postings data. * * @param delegatePostingsFormat The PostingsFormat that records all the non-bloom filter data i.e. * postings info. * @param bloomFilterFactory The {@link BloomFilter.Factory} responsible for sizing BloomFilters * appropriately */ public BloomFilterPostingsFormat(PostingsFormat delegatePostingsFormat, BloomFilter.Factory bloomFilterFactory) { super(BLOOM_CODEC_NAME); this.delegatePostingsFormat = delegatePostingsFormat; this.bloomFilterFactory = bloomFilterFactory; } // Used only by core Lucene at read-time via Service Provider instantiation - // do not use at Write-time in application code. 
public BloomFilterPostingsFormat() { super(BLOOM_CODEC_NAME); } @Override public BloomFilteredFieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { throw new UnsupportedOperationException("this codec can only be used for reading"); } @Override public BloomFilteredFieldsProducer fieldsProducer(SegmentReadState state) throws IOException { return new BloomFilteredFieldsProducer(state); } public final class BloomFilteredFieldsProducer extends FieldsProducer { private FieldsProducer delegateFieldsProducer; HashMap bloomsByFieldName = new HashMap<>(); private final IndexInput data; // for internal use only FieldsProducer getDelegate() { return delegateFieldsProducer; } public BloomFilteredFieldsProducer(SegmentReadState state) throws IOException { String bloomFileName = IndexFileNames.segmentFileName( state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION); boolean success = false; try { data = state.directory.openChecksumInput(bloomFileName, state.context); int version = CodecUtil.checkHeader(data, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION, BLOOM_CODEC_VERSION_CURRENT); // // Load the hash function used in the BloomFilter // hashFunction = HashFunction.forName(bloomIn.readString()); // Load the delegate postings format final String delegatePostings = data.readString(); this.delegateFieldsProducer = PostingsFormat.forName(delegatePostings) .fieldsProducer(state); success = true; } finally { if (!success) { IOUtils.closeWhileHandlingException(this); } } } @Override public Iterator iterator() { return delegateFieldsProducer.iterator(); } @Override public void close() throws IOException { IOUtils.close(data, delegateFieldsProducer); } @Override public Terms terms(String field) throws IOException { BloomFilter filter = bloomsByFieldName.get(field); if (filter == null) { return delegateFieldsProducer.terms(field); } else { Terms result = delegateFieldsProducer.terms(field); if (result == null) { return null; } return new BloomFilteredTerms(result, 
filter); } } @Override public int size() { return delegateFieldsProducer.size(); } public long getUniqueTermCount() throws IOException { return delegateFieldsProducer.getUniqueTermCount(); } @Override public long ramBytesUsed() { long size = delegateFieldsProducer.ramBytesUsed(); for (BloomFilter bloomFilter : bloomsByFieldName.values()) { size += bloomFilter.getSizeInBytes(); } return size; } @Override public void checkIntegrity() throws IOException { delegateFieldsProducer.checkIntegrity(); } } public static final class BloomFilteredTerms extends FilterAtomicReader.FilterTerms { private BloomFilter filter; public BloomFilteredTerms(Terms terms, BloomFilter filter) { super(terms); this.filter = filter; } public BloomFilter getFilter() { return filter; } @Override public TermsEnum iterator(TermsEnum reuse) throws IOException { TermsEnum result; if ((reuse != null) && (reuse instanceof BloomFilteredTermsEnum)) { // recycle the existing BloomFilteredTermsEnum by asking the delegate // to recycle its contained TermsEnum BloomFilteredTermsEnum bfte = (BloomFilteredTermsEnum) reuse; if (bfte.filter == filter) { bfte.reset(this.in); return bfte; } reuse = bfte.reuse; } // We have been handed something we cannot reuse (either null, wrong // class or wrong filter) so allocate a new object result = new BloomFilteredTermsEnum(this.in, reuse, filter); return result; } } static final class BloomFilteredTermsEnum extends TermsEnum { private Terms delegateTerms; private TermsEnum delegateTermsEnum; private TermsEnum reuse; private BloomFilter filter; public BloomFilteredTermsEnum(Terms other, TermsEnum reuse, BloomFilter filter) { this.delegateTerms = other; this.reuse = reuse; this.filter = filter; } void reset(Terms others) { reuse = this.delegateTermsEnum; this.delegateTermsEnum = null; this.delegateTerms = others; } private TermsEnum getDelegate() throws IOException { if (delegateTermsEnum == null) { /* pull the iterator only if we really need it - * this can be a relatively 
heavy operation depending on the * delegate postings format and they underlying directory * (clone IndexInput) */ delegateTermsEnum = delegateTerms.iterator(reuse); } return delegateTermsEnum; } @Override public final BytesRef next() throws IOException { return getDelegate().next(); } @Override public final Comparator getComparator() { return delegateTerms.getComparator(); } @Override public final boolean seekExact(BytesRef text) throws IOException { // The magical fail-fast speed up that is the entire point of all of // this code - save a disk seek if there is a match on an in-memory // structure // that may occasionally give a false positive but guaranteed no false // negatives if (!filter.mightContain(text)) { return false; } return getDelegate().seekExact(text); } @Override public final SeekStatus seekCeil(BytesRef text) throws IOException { return getDelegate().seekCeil(text); } @Override public final void seekExact(long ord) throws IOException { getDelegate().seekExact(ord); } @Override public final BytesRef term() throws IOException { return getDelegate().term(); } @Override public final long ord() throws IOException { return getDelegate().ord(); } @Override public final int docFreq() throws IOException { return getDelegate().docFreq(); } @Override public final long totalTermFreq() throws IOException { return getDelegate().totalTermFreq(); } @Override public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { return getDelegate().docsAndPositions(liveDocs, reuse, flags); } @Override public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { return getDelegate().docs(liveDocs, reuse, flags); } } // TODO: would be great to move this out to test code, but the interaction between es090 and bloom is complex // at least it is not accessible via SPI public final class BloomFilteredFieldsConsumer extends FieldsConsumer { private FieldsConsumer delegateFieldsConsumer; private Map 
bloomFilters = new HashMap<>(); private SegmentWriteState state; // private PostingsFormat delegatePostingsFormat; public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer, SegmentWriteState state, PostingsFormat delegatePostingsFormat) { this.delegateFieldsConsumer = fieldsConsumer; // this.delegatePostingsFormat=delegatePostingsFormat; this.state = state; } // for internal use only public FieldsConsumer getDelegate() { return delegateFieldsConsumer; } @Override public TermsConsumer addField(FieldInfo field) throws IOException { BloomFilter bloomFilter = bloomFilterFactory.createFilter(state.segmentInfo.getDocCount()); if (bloomFilter != null) { assert bloomFilters.containsKey(field) == false; bloomFilters.put(field, bloomFilter); return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field), bloomFilter); } else { // No, use the unfiltered fieldsConsumer - we are not interested in // recording any term Bitsets. return delegateFieldsConsumer.addField(field); } } @Override public void close() throws IOException { delegateFieldsConsumer.close(); // Now we are done accumulating values for these fields List> nonSaturatedBlooms = new ArrayList<>(); for (Entry entry : bloomFilters.entrySet()) { nonSaturatedBlooms.add(entry); } String bloomFileName = IndexFileNames.segmentFileName( state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION); IndexOutput bloomOutput = null; try { bloomOutput = state.directory .createOutput(bloomFileName, state.context); CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION_CURRENT); // remember the name of the postings format we will delegate to bloomOutput.writeString(delegatePostingsFormat.getName()); // First field in the output file is the number of fields+blooms saved bloomOutput.writeInt(nonSaturatedBlooms.size()); for (Entry entry : nonSaturatedBlooms) { FieldInfo fieldInfo = entry.getKey(); BloomFilter bloomFilter = entry.getValue(); bloomOutput.writeInt(fieldInfo.number); 
saveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, fieldInfo); } CodecUtil.writeFooter(bloomOutput); } finally { IOUtils.close(bloomOutput); } //We are done with large bitsets so no need to keep them hanging around bloomFilters.clear(); } private void saveAppropriatelySizedBloomFilter(IndexOutput bloomOutput, BloomFilter bloomFilter, FieldInfo fieldInfo) throws IOException { // FuzzySet rightSizedSet = bloomFilterFactory.downsize(fieldInfo, // bloomFilter); // if (rightSizedSet == null) { // rightSizedSet = bloomFilter; // } // rightSizedSet.serialize(bloomOutput); BloomFilter.serilaize(bloomFilter, bloomOutput); } } class WrappedTermsConsumer extends TermsConsumer { private TermsConsumer delegateTermsConsumer; private BloomFilter bloomFilter; public WrappedTermsConsumer(TermsConsumer termsConsumer, BloomFilter bloomFilter) { this.delegateTermsConsumer = termsConsumer; this.bloomFilter = bloomFilter; } @Override public PostingsConsumer startTerm(BytesRef text) throws IOException { return delegateTermsConsumer.startTerm(text); } @Override public void finishTerm(BytesRef text, TermStats stats) throws IOException { // Record this term in our BloomFilter if (stats.docFreq > 0) { bloomFilter.put(text); } delegateTermsConsumer.finishTerm(text, stats); } @Override public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount); } @Override public Comparator getComparator() throws IOException { return delegateTermsConsumer.getComparator(); } } public PostingsFormat getDelegate() { return this.delegatePostingsFormat; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy