All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.io.BloomMapFile Maven / Gradle / Ivy

There is a newer version: 3.4.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in org.apache.hadoop.shaded.com.liance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org.apache.hadoop.shaded.org.licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.shaded.org.apache.hadoop.org.apache.hadoop.shaded.io.

import java.org.apache.hadoop.shaded.io.DataInputStream;
import java.org.apache.hadoop.shaded.io.DataOutputStream;
import java.org.apache.hadoop.shaded.io.IOException;

import org.apache.hadoop.shaded.org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.shaded.org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.shaded.org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.shaded.org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.shaded.org.apache.hadoop.fs.Path;
import org.apache.hadoop.shaded.org.apache.hadoop.org.apache.hadoop.shaded.io.SequenceFile.CompressionType;
import org.apache.hadoop.shaded.org.apache.hadoop.org.apache.hadoop.shaded.io.org.apache.hadoop.shaded.com.ress.CompressionCodec;
import org.apache.hadoop.shaded.org.apache.hadoop.util.Progressable;
import org.apache.hadoop.shaded.org.apache.hadoop.util.bloom.DynamicBloomFilter;
import org.apache.hadoop.shaded.org.apache.hadoop.util.bloom.Filter;
import org.apache.hadoop.shaded.org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.shaded.org.apache.hadoop.util.hash.Hash;
import org.apache.hadoop.shaded.org.slf4j.Logger;
import org.apache.hadoop.shaded.org.slf4j.LoggerFactory;

import static org.apache.hadoop.shaded.org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_MAPFILE_BLOOM_ERROR_RATE_DEFAULT;
import static org.apache.hadoop.shaded.org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_MAPFILE_BLOOM_ERROR_RATE_KEY;
import static org.apache.hadoop.shaded.org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_MAPFILE_BLOOM_SIZE_DEFAULT;
import static org.apache.hadoop.shaded.org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_MAPFILE_BLOOM_SIZE_KEY;

/**
 * This class extends {@link MapFile} and provides very much the same
 * functionality. However, it uses dynamic Bloom filters to provide
 * quick membership test for keys, and it offers a fast version of 
 * {@link Reader#get(WritableComparable, Writable)} operation, especially in
 * case of sparsely populated MapFile-s.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class BloomMapFile {
  private static final Logger LOG = LoggerFactory.getLogger(BloomMapFile.class);
  public static final String BLOOM_FILE_NAME = "bloom";
  public static final int HASH_COUNT = 5;
  
  public static void delete(FileSystem fs, String name) throws IOException {
    Path dir = new Path(name);
    Path data = new Path(dir, MapFile.DATA_FILE_NAME);
    Path index = new Path(dir, MapFile.INDEX_FILE_NAME);
    Path bloom = new Path(dir, BLOOM_FILE_NAME);

    fs.delete(data, true);
    fs.delete(index, true);
    fs.delete(bloom, true);
    fs.delete(dir, true);
  }

  private static byte[] byteArrayForBloomKey(DataOutputBuffer buf) {
    int cleanLength = buf.getLength();
    byte [] ba = buf.getData();
    if (cleanLength != ba.length) {
      ba = new byte[cleanLength];
      System.arraycopy(buf.getData(), 0, ba, 0, cleanLength);
    }
    return ba;
  }
  
  public static class Writer extends MapFile.Writer {
    private DynamicBloomFilter bloomFilter;
    private int numKeys;
    private int vectorSize;
    private Key bloomKey = new Key();
    private DataOutputBuffer buf = new DataOutputBuffer();
    private FileSystem fs;
    private Path dir;
    
    @Deprecated
    public Writer(Configuration conf, FileSystem fs, String dirName,
        Class keyClass,
        Class valClass, CompressionType org.apache.hadoop.shaded.com.ress,
        CompressionCodec codec, Progressable progress) throws IOException {
      this(conf, new Path(dirName), keyClass(keyClass), valueClass(valClass), 
           org.apache.hadoop.shaded.com.ression(org.apache.hadoop.shaded.com.ress, codec), progressable(progress));
    }

    @Deprecated
    public Writer(Configuration conf, FileSystem fs, String dirName,
        Class keyClass,
        Class valClass, CompressionType org.apache.hadoop.shaded.com.ress,
        Progressable progress) throws IOException {
      this(conf, new Path(dirName), keyClass(keyClass), valueClass(valClass), 
           org.apache.hadoop.shaded.com.ression(org.apache.hadoop.shaded.com.ress), progressable(progress));
    }

    @Deprecated
    public Writer(Configuration conf, FileSystem fs, String dirName,
        Class keyClass,
        Class valClass, CompressionType org.apache.hadoop.shaded.com.ress)
        throws IOException {
      this(conf, new Path(dirName), keyClass(keyClass), valueClass(valClass), 
           org.apache.hadoop.shaded.com.ression(org.apache.hadoop.shaded.com.ress));
    }

    @Deprecated
    public Writer(Configuration conf, FileSystem fs, String dirName,
        WritableComparator org.apache.hadoop.shaded.com.arator, Class valClass,
        CompressionType org.apache.hadoop.shaded.com.ress, CompressionCodec codec, Progressable progress)
        throws IOException {
      this(conf, new Path(dirName), org.apache.hadoop.shaded.com.arator(org.apache.hadoop.shaded.com.arator), 
           valueClass(valClass), org.apache.hadoop.shaded.com.ression(org.apache.hadoop.shaded.com.ress, codec), 
           progressable(progress));
    }

    @Deprecated
    public Writer(Configuration conf, FileSystem fs, String dirName,
        WritableComparator org.apache.hadoop.shaded.com.arator, Class valClass,
        CompressionType org.apache.hadoop.shaded.com.ress, Progressable progress) throws IOException {
      this(conf, new Path(dirName), org.apache.hadoop.shaded.com.arator(org.apache.hadoop.shaded.com.arator), 
           valueClass(valClass), org.apache.hadoop.shaded.com.ression(org.apache.hadoop.shaded.com.ress),
           progressable(progress));
    }

    @Deprecated
    public Writer(Configuration conf, FileSystem fs, String dirName,
        WritableComparator org.apache.hadoop.shaded.com.arator, Class valClass, CompressionType org.apache.hadoop.shaded.com.ress)
        throws IOException {
      this(conf, new Path(dirName), org.apache.hadoop.shaded.com.arator(org.apache.hadoop.shaded.com.arator), 
           valueClass(valClass), org.apache.hadoop.shaded.com.ression(org.apache.hadoop.shaded.com.ress));
    }

    @Deprecated
    public Writer(Configuration conf, FileSystem fs, String dirName,
        WritableComparator org.apache.hadoop.shaded.com.arator, Class valClass) throws IOException {
      this(conf, new Path(dirName), org.apache.hadoop.shaded.com.arator(org.apache.hadoop.shaded.com.arator), 
           valueClass(valClass));
    }

    @Deprecated
    public Writer(Configuration conf, FileSystem fs, String dirName,
                  Class keyClass,
                  Class valClass) throws IOException {
      this(conf, new Path(dirName), keyClass(keyClass), valueClass(valClass));
    }

    public Writer(Configuration conf, Path dir, 
                  SequenceFile.Writer.Option... options) throws IOException {
      super(conf, dir, options);
      this.fs = dir.getFileSystem(conf);
      this.dir = dir;
      initBloomFilter(conf);
    }

    private synchronized void initBloomFilter(Configuration conf) {
      numKeys = conf.getInt(
          IO_MAPFILE_BLOOM_SIZE_KEY, IO_MAPFILE_BLOOM_SIZE_DEFAULT);
      // vector size should be -kn / (ln(1 - c^(1/k))) bits for
      // single key, where  is the number of hash functions,
      // n is the number of keys and c is the desired
      // max. error rate.
      // Our desired error rate is by default 0.005, i.e. 0.5%
      float errorRate = conf.getFloat(
          IO_MAPFILE_BLOOM_ERROR_RATE_KEY, IO_MAPFILE_BLOOM_ERROR_RATE_DEFAULT);
      vectorSize = (int)Math.ceil((double)(-HASH_COUNT * numKeys) /
          Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT)));
      bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT,
          Hash.getHashType(conf), numKeys);
    }

    @Override
    public synchronized void append(WritableComparable key, Writable val)
        throws IOException {
      super.append(key, val);
      buf.reset();
      key.write(buf);
      bloomKey.set(byteArrayForBloomKey(buf), 1.0);
      bloomFilter.add(bloomKey);
    }

    @Override
    public synchronized void close() throws IOException {
      super.close();
      DataOutputStream out = fs.create(new Path(dir, BLOOM_FILE_NAME), true);
      try {
        bloomFilter.write(out);
        out.flush();
        out.close();
        out = null;
      } finally {
        IOUtils.closeStream(out);
      }
    }

  }
  
  public static class Reader extends MapFile.Reader {
    private DynamicBloomFilter bloomFilter;
    private DataOutputBuffer buf = new DataOutputBuffer();
    private Key bloomKey = new Key();

    public Reader(Path dir, Configuration conf,
                  SequenceFile.Reader.Option... options) throws IOException {
      super(dir, conf, options);
      initBloomFilter(dir, conf);
    }

    @Deprecated
    public Reader(FileSystem fs, String dirName, Configuration conf)
        throws IOException {
      this(new Path(dirName), conf);
    }

    @Deprecated
    public Reader(FileSystem fs, String dirName, WritableComparator org.apache.hadoop.shaded.com.arator,
        Configuration conf, boolean open) throws IOException {
      this(new Path(dirName), conf, org.apache.hadoop.shaded.com.arator(org.apache.hadoop.shaded.com.arator));
    }

    @Deprecated
    public Reader(FileSystem fs, String dirName, WritableComparator org.apache.hadoop.shaded.com.arator,
        Configuration conf) throws IOException {
      this(new Path(dirName), conf, org.apache.hadoop.shaded.com.arator(org.apache.hadoop.shaded.com.arator));
    }
    
    private void initBloomFilter(Path dirName, 
                                 Configuration conf) {
      
      DataInputStream in = null;
      try {
        FileSystem fs = dirName.getFileSystem(conf);
        in = fs.open(new Path(dirName, BLOOM_FILE_NAME));
        bloomFilter = new DynamicBloomFilter();
        bloomFilter.readFields(in);
        in.close();
        in = null;
      } catch (IOException org.apache.hadoop.shaded.io.) {
        LOG.warn("Can't open BloomFilter: " + org.apache.hadoop.shaded.io. + " - fallback to MapFile.");
        bloomFilter = null;
      } finally {
        IOUtils.closeStream(in);
      }
    }
    
    /**
     * Checks if this MapFile has the indicated key. The membership test is
     * performed using a Bloom filter, so the result has always non-zero
     * probability of false positives.
     * @param key key to check
     * @return  false iff key doesn't exist, true if key probably exists.
     * @throws IOException
     */
    public boolean probablyHasKey(WritableComparable key) throws IOException {
      if (bloomFilter == null) {
        return true;
      }
      buf.reset();
      key.write(buf);
      bloomKey.set(byteArrayForBloomKey(buf), 1.0);
      return bloomFilter.membershipTest(bloomKey);
    }
    
    /**
     * Fast version of the
     * {@link MapFile.Reader#get(WritableComparable, Writable)} method. First
     * it checks the Bloom filter for the existence of the key, and only if
     * present it performs the real get operation. This yields significant
     * performance improvements for get operations on sparsely populated files.
     */
    @Override
    public synchronized Writable get(WritableComparable key, Writable val)
        throws IOException {
      if (!probablyHasKey(key)) {
        return null;
      }
      return super.get(key, val);
    }
    
    /**
     * Retrieve the Bloom filter used by this instance of the Reader.
     * @return a Bloom filter (see {@link Filter})
     */
    public Filter getBloomFilter() {
      return bloomFilter;
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy