/*
 * @notice
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Modifications copyright (C) 2022 Elasticsearch B.V.
 */
package org.elasticsearch.index.codec.bloomfilter;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.lucene.store.IndexOutputOutputStream;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.ByteArray;
import org.elasticsearch.core.IOUtils;

import java.io.Closeable;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;

/**
 * This implementation is forked from Lucene's BloomFilterPosting to support on-disk bloom filters.
 * <p>
 * A {@link PostingsFormat} useful for low doc-frequency fields such as primary keys. Bloom filters
 * offer "fast-fail" for reads in segments known to have no record of the key.
 */
public class ES85BloomFilterPostingsFormat extends PostingsFormat {
    static final String BLOOM_CODEC_NAME = "ES85BloomFilter";
    static final int VERSION_START = 0;
    static final int VERSION_CURRENT = VERSION_START;
    static final String BLOOM_FILTER_META_FILE = "bfm";
    static final String BLOOM_FILTER_INDEX_FILE = "bfi";

    private Function<String, PostingsFormat> postingsFormats;
    private BigArrays bigArrays;

    public ES85BloomFilterPostingsFormat(BigArrays bigArrays, Function<String, PostingsFormat> postingsFormats) {
        this();
        this.bigArrays = Objects.requireNonNull(bigArrays);
        this.postingsFormats = Objects.requireNonNull(postingsFormats);
    }

    public ES85BloomFilterPostingsFormat() {
        super(BLOOM_CODEC_NAME);
    }

    @Override
    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
        if (postingsFormats == null || bigArrays == null) {
            assert false : BLOOM_CODEC_NAME + " was initialized with a wrong constructor";
            throw new UnsupportedOperationException(BLOOM_CODEC_NAME + " was initialized with a wrong constructor");
        }
        return new FieldsWriter(state);
    }

    @Override
    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
        return new FieldsReader(state);
    }

    @Override
    public String toString() {
        return BLOOM_CODEC_NAME;
    }

    private static String metaFile(SegmentInfo si, String segmentSuffix) {
        return IndexFileNames.segmentFileName(si.name, segmentSuffix, BLOOM_FILTER_META_FILE);
    }

    private static String indexFile(SegmentInfo si, String segmentSuffix) {
        return IndexFileNames.segmentFileName(si.name, segmentSuffix, BLOOM_FILTER_INDEX_FILE);
    }

    final class FieldsWriter extends FieldsConsumer {
        private final SegmentWriteState state;
        private final IndexOutput indexOut;
        private final List<BloomFilter> bloomFilters = new ArrayList<>();
        private final List<FieldsGroup> fieldsGroups = new ArrayList<>();
        private final List<Closeable> toCloses = new ArrayList<>();
        private boolean closed;

        FieldsWriter(SegmentWriteState state) throws IOException {
            this.state = state;
            boolean success = false;
            try {
                indexOut = state.directory.createOutput(indexFile(state.segmentInfo, state.segmentSuffix), state.context);
                toCloses.add(indexOut);
                CodecUtil.writeIndexHeader(indexOut, BLOOM_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
                success = true;
            } finally {
                if (success == false) {
                    IOUtils.closeWhileHandlingException(toCloses);
                }
            }
        }

        @Override
        public void write(Fields fields, NormsProducer norms) throws IOException {
            writePostings(fields, norms);
            writeBloomFilters(fields);
        }

        private void writePostings(Fields fields, NormsProducer norms) throws IOException {
            final Map<PostingsFormat, FieldsGroup> currentGroups = new HashMap<>();
            for (String field : fields) {
                final PostingsFormat postingsFormat = postingsFormats.apply(field);
                if (postingsFormat == null) {
                    throw new IllegalStateException("PostingsFormat for field [" + field + "] wasn't specified");
                }
                FieldsGroup group = currentGroups.get(postingsFormat);
                if (group == null) {
                    group = new FieldsGroup(postingsFormat, Integer.toString(fieldsGroups.size()), new ArrayList<>());
                    currentGroups.put(postingsFormat, group);
                    fieldsGroups.add(group);
                }
                group.fields.add(field);
            }
            for (FieldsGroup group : currentGroups.values()) {
                final FieldsConsumer writer = group.postingsFormat.fieldsConsumer(new SegmentWriteState(state, group.suffix));
                toCloses.add(writer);
                final Fields maskedFields = new FilterLeafReader.FilterFields(fields) {
                    @Override
                    public Iterator<String> iterator() {
                        return group.fields.iterator();
                    }
                };
                writer.write(maskedFields, norms);
            }
        }

        private void writeBloomFilters(Fields fields) throws IOException {
            for (String field : fields) {
                final Terms terms = fields.terms(field);
                if (terms == null) {
                    continue;
                }
                final int bloomFilterSize = bloomFilterSize(state.segmentInfo.maxDoc());
                final int numBytes = numBytesForBloomFilter(bloomFilterSize);
                try (ByteArray buffer = bigArrays.newByteArray(numBytes)) {
                    final TermsEnum termsEnum = terms.iterator();
                    while (true) {
                        final BytesRef term = termsEnum.next();
                        if (term == null) {
                            break;
                        }
                        final int hash = hashTerm(term) % bloomFilterSize;
                        final int pos = hash >> 3;
                        final int mask = 1 << (hash & 0x7);
                        final byte val = (byte) (buffer.get(pos) | mask);
                        buffer.set(pos, val);
                    }
                    bloomFilters.add(new BloomFilter(field, indexOut.getFilePointer(), bloomFilterSize));
                    final BytesReference bytes = BytesReference.fromByteArray(buffer, numBytes);
                    bytes.writeTo(new IndexOutputOutputStream(indexOut));
                }
            }
        }

        @Override
        public void close() throws IOException {
            if (closed) {
                return;
            }
            closed = true;
            try {
                CodecUtil.writeFooter(indexOut);
            } finally {
                IOUtils.close(toCloses);
            }
            try (IndexOutput metaOut = state.directory.createOutput(metaFile(state.segmentInfo, state.segmentSuffix), state.context)) {
                CodecUtil.writeIndexHeader(metaOut, BLOOM_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
                // write postings formats
                metaOut.writeVInt(fieldsGroups.size());
                for (FieldsGroup group : fieldsGroups) {
                    group.writeTo(metaOut, state.fieldInfos);
                }
                // write bloom filters
                metaOut.writeVInt(bloomFilters.size());
                for (BloomFilter bloomFilter : bloomFilters) {
                    bloomFilter.writeTo(metaOut, state.fieldInfos);
                }
                CodecUtil.writeFooter(metaOut);
            }
        }
    }

    private record BloomFilter(String field, long startFilePointer, int bloomFilterSize) {
        void writeTo(IndexOutput out, FieldInfos fieldInfos) throws IOException {
            out.writeVInt(fieldInfos.fieldInfo(field).number);
            out.writeVLong(startFilePointer);
            out.writeVInt(bloomFilterSize);
        }

        static BloomFilter readFrom(IndexInput in, FieldInfos fieldInfos) throws IOException {
            final String fieldName = fieldInfos.fieldInfo(in.readVInt()).name;
            final long startFilePointer = in.readVLong();
            final int bloomFilterSize = in.readVInt();
            return new BloomFilter(fieldName, startFilePointer, bloomFilterSize);
        }
    }

    private record FieldsGroup(PostingsFormat postingsFormat, String suffix, List<String> fields) {
        void writeTo(IndexOutput out, FieldInfos fieldInfos) throws IOException {
            out.writeString(postingsFormat.getName());
            out.writeString(suffix);
            out.writeVInt(fields.size());
            for (String field : fields) {
                out.writeVInt(fieldInfos.fieldInfo(field).number);
            }
        }

        static FieldsGroup readFrom(IndexInput in, FieldInfos fieldInfos) throws IOException {
            final PostingsFormat postingsFormat = forName(in.readString());
            final String suffix = in.readString();
            final int numFields = in.readVInt();
            final List<String> fields = new ArrayList<>();
            for (int i = 0; i < numFields; i++) {
                fields.add(fieldInfos.fieldInfo(in.readVInt()).name);
            }
            return new FieldsGroup(postingsFormat, suffix, fields);
        }
    }

    static final class FieldsReader extends FieldsProducer {
        private final Map<String, BloomFilter> bloomFilters;
        private final List<Closeable> toCloses = new ArrayList<>();
        private final Map<String, FieldsProducer> readerMap = new HashMap<>();
        private final IndexInput indexIn;

        FieldsReader(SegmentReadState state) throws IOException {
            boolean success = false;
            try (
                ChecksumIndexInput metaIn = state.directory.openChecksumInput(
                    metaFile(state.segmentInfo, state.segmentSuffix),
                    IOContext.READONCE
                )
            ) {
                CodecUtil.checkIndexHeader(
                    metaIn,
                    BLOOM_CODEC_NAME,
                    VERSION_START,
                    VERSION_CURRENT,
                    state.segmentInfo.getId(),
                    state.segmentSuffix
                );
                // read postings formats
                final int numFieldsGroups = metaIn.readVInt();
                for (int i = 0; i < numFieldsGroups; i++) {
                    final FieldsGroup group = FieldsGroup.readFrom(metaIn, state.fieldInfos);
                    final FieldsProducer reader = group.postingsFormat.fieldsProducer(new SegmentReadState(state, group.suffix));
                    toCloses.add(reader);
                    for (String field : group.fields) {
                        readerMap.put(field, reader);
                    }
                }
                // read bloom filters
                final int numBloomFilters = metaIn.readVInt();
                bloomFilters = new HashMap<>(numBloomFilters);
                for (int i = 0; i < numBloomFilters; i++) {
                    final BloomFilter bloomFilter = BloomFilter.readFrom(metaIn, state.fieldInfos);
                    bloomFilters.put(bloomFilter.field, bloomFilter);
                }
                CodecUtil.checkFooter(metaIn);
                indexIn = state.directory.openInput(indexFile(state.segmentInfo, state.segmentSuffix), state.context);
                toCloses.add(indexIn);
                CodecUtil.checkIndexHeader(
                    indexIn,
                    BLOOM_CODEC_NAME,
                    VERSION_START,
                    VERSION_CURRENT,
                    state.segmentInfo.getId(),
                    state.segmentSuffix
                );
                CodecUtil.retrieveChecksum(indexIn);
                assert assertBloomFilterSizes(state.segmentInfo);
                success = true;
            } finally {
                if (success == false) {
                    IOUtils.closeWhileHandlingException(toCloses);
                }
            }
        }

        private boolean assertBloomFilterSizes(SegmentInfo segmentInfo) {
            for (BloomFilter bloomFilter : bloomFilters.values()) {
                assert bloomFilter.bloomFilterSize == bloomFilterSize(segmentInfo.maxDoc())
                    : "bloom_filter=" + bloomFilter + ", max_docs=" + segmentInfo.maxDoc();
            }
            return true;
        }

        @Override
        public Iterator<String> iterator() {
            return readerMap.keySet().iterator();
        }

        @Override
        public void close() throws IOException {
            IOUtils.close(toCloses);
        }

        @Override
        public Terms terms(String field) throws IOException {
            final FieldsProducer reader = readerMap.get(field);
            if (reader == null) {
                return null;
            }
            final Terms terms = reader.terms(field);
            if (terms == null) {
                return null;
            }
            final BloomFilter bloomFilter = bloomFilters.get(field);
            if (bloomFilter != null) {
                final RandomAccessInput data = indexIn.randomAccessSlice(
                    bloomFilter.startFilePointer(),
                    numBytesForBloomFilter(bloomFilter.bloomFilterSize)
                );
                return new BloomFilterTerms(terms, data, bloomFilter.bloomFilterSize);
            } else {
                return terms;
            }
        }

        @Override
        public int size() {
            return readerMap.size();
        }

        @Override
        public void checkIntegrity() throws IOException {
            // already fully checked the meta file; let's fully check the index file.
            CodecUtil.checksumEntireFile(indexIn);
            // multiple fields can share the same reader
            final Set<FieldsProducer> seenReaders = new HashSet<>();
            for (FieldsProducer reader : readerMap.values()) {
                if (seenReaders.add(reader)) {
                    reader.checkIntegrity();
                }
            }
        }
    }

    private static class BloomFilterTerms extends FilterLeafReader.FilterTerms {
        private final RandomAccessInput data;
        private final int bloomFilterSize;

        BloomFilterTerms(Terms in, RandomAccessInput data, int bloomFilterSize) {
            super(in);
            this.data = data;
            this.bloomFilterSize = bloomFilterSize;
        }

        private boolean mayContainTerm(BytesRef term) throws IOException {
            final int hash = hashTerm(term) % bloomFilterSize;
            final int pos = hash >> 3;
            final int mask = 1 << (hash & 0x7);
            final byte bits = data.readByte(pos);
            return (bits & mask) != 0;
        }

        @Override
        public TermsEnum iterator() throws IOException {
            return new LazyFilterTermsEnum() {
                private TermsEnum delegate;

                @Override
                TermsEnum getDelegate() throws IOException {
                    if (delegate == null) {
                        delegate = in.iterator();
                    }
                    return delegate;
                }

                @Override
                public boolean seekExact(BytesRef term) throws IOException {
                    if (mayContainTerm(term)) {
                        return getDelegate().seekExact(term);
                    } else {
                        return false;
                    }
                }

                @Override
                public void seekExact(BytesRef term, TermState state) throws IOException {
                    getDelegate().seekExact(term, state);
                }

                @Override
                public TermState termState() throws IOException {
                    // TODO: return TermState that includes BloomFilter and fix _disk_usage API
                    return getDelegate().termState();
                }
            };
        }
    }

    private abstract static class LazyFilterTermsEnum extends BaseTermsEnum {
        abstract TermsEnum getDelegate() throws IOException;

        @Override
        public SeekStatus seekCeil(BytesRef text) throws IOException {
            return getDelegate().seekCeil(text);
        }

        @Override
        public void seekExact(long ord) throws IOException {
            getDelegate().seekExact(ord);
        }

        @Override
        public BytesRef term() throws IOException {
            return getDelegate().term();
        }

        @Override
        public long ord() throws IOException {
            return getDelegate().ord();
        }

        @Override
        public int docFreq() throws IOException {
            return getDelegate().docFreq();
        }

        @Override
        public long totalTermFreq() throws IOException {
            return getDelegate().totalTermFreq();
        }

        @Override
        public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
            return getDelegate().postings(reuse, flags);
        }

        @Override
        public ImpactsEnum impacts(int flags) throws IOException {
            return getDelegate().impacts(flags);
        }

        @Override
        public BytesRef next() throws IOException {
            return getDelegate().next();
        }

        @Override
        public AttributeSource attributes() {
            try {
                return getDelegate().attributes();
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        }
    }

    static int bloomFilterSize(int maxDocs) {
        // 10% saturation (i.e., 10 bits for each term)
        final long numBits = maxDocs * 10L;
        if (numBits > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        } else {
            return (int) numBits;
        }
    }

    static int numBytesForBloomFilter(int bloomFilterSize) {
        return Math.toIntExact((bloomFilterSize + 7L) / 8L);
    }

    static int hashTerm(BytesRef br) {
        final int hash = murmurhash3_x86_32(br.bytes, br.offset, br.length, 0x9747b28c);
        return hash & 0x7FFF_FFFF;
    }

    /**
     * Forked from Lucene's StringHelper#murmurhash3_x86_32 so that changes to the Lucene implementation
     * do not break the compatibility of this format.
     */
    @SuppressWarnings("fallthrough")
    private static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) {
        final int c1 = 0xcc9e2d51;
        final int c2 = 0x1b873593;

        int h1 = seed;
        int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block

        for (int i = offset; i < roundedEnd; i += 4) {
            // little endian load order
            int k1 = (int) BitUtil.VH_LE_INT.get(data, i);
            k1 *= c1;
            k1 = Integer.rotateLeft(k1, 15);
            k1 *= c2;

            h1 ^= k1;
            h1 = Integer.rotateLeft(h1, 13);
            h1 = h1 * 5 + 0xe6546b64;
        }

        // tail
        int k1 = 0;
        switch (len & 0x03) {
            case 3:
                k1 = (data[roundedEnd + 2] & 0xff) << 16;
                // fallthrough
            case 2:
                k1 |= (data[roundedEnd + 1] & 0xff) << 8;
                // fallthrough
            case 1:
                k1 |= (data[roundedEnd] & 0xff);
                k1 *= c1;
                k1 = Integer.rotateLeft(k1, 15);
                k1 *= c2;
                h1 ^= k1;
        }

        // finalization
        h1 ^= len;

        // fmix(h1);
        h1 ^= h1 >>> 16;
        h1 *= 0x85ebca6b;
        h1 ^= h1 >>> 13;
        h1 *= 0xc2b2ae35;
        h1 ^= h1 >>> 16;

        return h1;
    }
}
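
// --------------------------------------------------------------------------
// A minimal, hypothetical sketch (not part of the original file) showing how
// the package-private helpers above combine into the bloom-filter bit math
// shared by writeBloomFilters and BloomFilterTerms#mayContainTerm: hashTerm
// reduces a term to a non-negative hash, the modulo picks one bit in the
// filter, `hash >> 3` selects the byte holding it, and `1 << (hash & 0x7)`
// masks the bit within that byte. Class and method names here are
// illustrative only; the real writer stores the bits in a BigArrays buffer
// and persists them to the .bfi file rather than a plain byte[].
class BloomFilterBitMathDemo {
    public static void main(String[] args) {
        final int maxDocs = 1_000;
        // ~10 bits per doc, as computed by bloomFilterSize()
        final int bloomFilterSize = ES85BloomFilterPostingsFormat.bloomFilterSize(maxDocs);
        final byte[] filter = new byte[ES85BloomFilterPostingsFormat.numBytesForBloomFilter(bloomFilterSize)];

        // Record a term, mirroring what writeBloomFilters does per indexed term.
        setBit(filter, bloomFilterSize, new BytesRef("doc-42"));

        System.out.println(mayContain(filter, bloomFilterSize, new BytesRef("doc-42"))); // true
        System.out.println(mayContain(filter, bloomFilterSize, new BytesRef("doc-43"))); // false, with high probability
    }

    static void setBit(byte[] filter, int bloomFilterSize, BytesRef term) {
        final int hash = ES85BloomFilterPostingsFormat.hashTerm(term) % bloomFilterSize;
        filter[hash >> 3] |= (byte) (1 << (hash & 0x7)); // single hash function, one bit per term
    }

    static boolean mayContain(byte[] filter, int bloomFilterSize, BytesRef term) {
        final int hash = ES85BloomFilterPostingsFormat.hashTerm(term) % bloomFilterSize;
        return (filter[hash >> 3] & (1 << (hash & 0x7))) != 0; // false means the term is definitely absent
    }
}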