com.splout.db.hadoop.TupleSampler

Go to download

Splout is a read only, horizontally scalable SQL database that plays well with Hadoop.

package com.splout.db.hadoop;

/*
 * #%L
 * Splout SQL Hadoop library
 * %%
 * Copyright (C) 2012 Datasalt Systems S.L.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.io.TupleFile;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.mockito.Mockito;

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.tuplemr.MapOnlyJobBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;
import com.datasalt.pangool.tuplemr.mapred.MapOnlyMapper;
import com.splout.db.common.PartitionMap;

/**
 * This class samples a list of {@link TableInput} files that produce a certain Table Schema. There are two sampling
 * methods supported:
 * 
 * <ul>
 * <li>DEFAULT: Inspired by Hadoop's TeraInputFormat. A Hadoop Job is not needed. Consecutive records are read from each
 * InputSplit.</li>
 * <li>RESERVOIR: It uses a Map-Only Pangool Job for performing Reservoir Sampling over the dataset.</li>
 * </ul>
 * Sampling can be used by {@link TablespaceGenerator} for determining a {@link PartitionMap} based on the approximated
 * distribution of the keys.
 */
@SuppressWarnings("serial")
public class TupleSampler implements Serializable {

  private final static Log logger = LogFactory.getLog(TupleSampler.class);

  private final SamplingType samplingType;
  private final SamplingOptions options;

  public enum SamplingType {
    DEFAULT, RESERVOIR
  }

  public static class TupleSamplerException extends Exception {

    public TupleSamplerException(String reason) {
      super(reason);
    }

    public TupleSamplerException(Exception e) {
      super(e);
    }
  }

  // Each sampling algorithm may have its own options but there are some which are common to both
  public static abstract class SamplingOptions extends HashMap<String, Object> {

    public Long getMaxInputSplitSize() {
      return (Long) this.get("maxInputSplitSize");
    }

    public void setMaxInputSplitSize(Long maxInputSplitSize) {
      this.put("maxInputSplitSize", maxInputSplitSize);
    }
  }

  // Options for DEFAULT sampling
  public static class DefaultSamplingOptions extends SamplingOptions {

    public DefaultSamplingOptions() {
      super();
      setMaxSplitsToVisit(10);
    }

    public int getMaxSplitsToVisit() {
      return (Integer) this.get("maxSplitsToVisit");
    }

    public void setMaxSplitsToVisit(int maxSplitsToVisit) {
      this.put("maxSplitsToVisit", maxSplitsToVisit);
    }
  }

  public TupleSampler(SamplingType samplingType, SamplingOptions options) {
    this.samplingType = samplingType;
    this.options = options;
  }

  public void sample(List<TableInput> inputFiles, Schema tableSchema, Configuration hadoopConf, long sampleSize,
      Path outFile) throws TupleSamplerException {
    try {
      List<InputSplit> splits = new ArrayList<InputSplit>();
      Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat = new HashMap<InputSplit, InputFormat<ITuple, NullWritable>>();
      Map<InputSplit, RecordProcessor> recordProcessorPerSplit = new HashMap<InputSplit, RecordProcessor>();

      // Iterate over all {@link TableInput} and collect information about the InputSplits derived from them
      for(TableInput tableFile : inputFiles) {
        Job job = new Job(hadoopConf);
        FileInputFormat.setInputPaths(job, tableFile.getPaths());
        if(options.getMaxInputSplitSize() != null) {
          logger.info("Using max input split size: " + options.getMaxInputSplitSize());
          FileInputFormat.setMaxInputSplitSize(job, options.getMaxInputSplitSize());
        }
        job.setInputFormatClass(FileInputFormat.class);
        for(InputSplit split : tableFile.getFormat().getSplits(job)) {
          splitToFormat.put(split, tableFile.getFormat());
          recordProcessorPerSplit.put(split, tableFile.getRecordProcessor());
          splits.add(split);
        }
      }

      FileSystem outFs = outFile.getFileSystem(hadoopConf);
      if(outFs.exists(outFile)) {
        outFs.delete(outFile, false);
      }

      if(samplingType.equals(SamplingType.DEFAULT)) {
        try {
          DefaultSamplingOptions defOptions = (DefaultSamplingOptions) options;
          // Default sampling method
          defaultSampling(tableSchema, sampleSize, hadoopConf, outFile, splits, splitToFormat,
              recordProcessorPerSplit, defOptions.getMaxSplitsToVisit());
        } catch(ClassCastException e) {
          throw new RuntimeException("Invalid options class: " + options.getClass() + " Expected:"
              + DefaultSamplingOptions.class);
        }
      } else {
        // Reservoir sampling
        reservoirSampling(tableSchema, sampleSize, hadoopConf, outFile, splits.size(), inputFiles);
      }
    } catch(Exception e) {
      throw new TupleSamplerException(e);
    }
  }
  /*
   * Reservoir sampling, to be used in datasets where default method is not enough.
   */
  private void reservoirSampling(Schema tableSchema, final long sampleSize, Configuration hadoopConf,
      Path outputPath, final int nSplits, List<TableInput> inputFiles) throws IOException, InterruptedException,
      ClassNotFoundException, TupleMRException, URISyntaxException, TupleSamplerException {

    MapOnlyJobBuilder builder = new MapOnlyJobBuilder(hadoopConf, "Reservoir Sampling");
    for(TableInput inputFile : inputFiles) {
      final RecordProcessor processor = inputFile.getRecordProcessor();
      for(Path path : inputFile.getPaths()) {
        builder.addInput(path, inputFile.getFormat(), new MapOnlyMapper<ITuple, NullWritable, ITuple, NullWritable>() {

          final int nSamples = (int) (sampleSize / nSplits);
          final ITuple[] samples = new ITuple[nSamples];

          CounterInterface counterInterface;
          long recordCounter = 0;

          protected void setup(Context context) throws IOException, InterruptedException {
            counterInterface = new CounterInterface(context);
          };

          // Collect Tuples with decreasing probability
          // (http://en.wikipedia.org/wiki/Reservoir_sampling)
          protected void map(ITuple key, NullWritable value, Context context) throws IOException,
              InterruptedException {
            ITuple uTuple;
            try {
              uTuple = processor.process(key, counterInterface);
            } catch(Throwable e) {
              throw new RuntimeException(e);
            }
            if(uTuple == null) { // user may have filtered the record
              return;
            }

            long reservoirIndex;
            if(recordCounter < nSamples) {
              reservoirIndex = recordCounter;
            } else {
              reservoirIndex = (long) (Math.random() * recordCounter);
            }

            if(reservoirIndex < nSamples) {
              samples[(int) reservoirIndex] = new NullableTuple(uTuple, true); // deep copy the Tuple
            }

            recordCounter++;
          }

          // Write the in-memory sampled Tuples
          protected void cleanup(Context context) throws IOException, InterruptedException {
            for(ITuple tuple : samples) {
              if(tuple != null) {
                context.write(tuple, NullWritable.get());
              }
            }
          }
        });
      }
    }

    // Set output path
    Path outReservoirPath = new Path(outputPath + "-reservoir");
    builder.setTupleOutput(outReservoirPath, new NullableSchema(tableSchema));

    Job job = builder.createJob();
    if(!job.waitForCompletion(true)) {
      throw new TupleSamplerException("Reservoir Sampling failed!");
    }

    FileSystem outFs = outReservoirPath.getFileSystem(hadoopConf);

    // Instantiate the writer we will write samples to
    TupleFile.Writer writer = new TupleFile.Writer(outFs, hadoopConf, outputPath, new NullableSchema(tableSchema));

    if(outFs.listStatus(outReservoirPath) == null) {
      throw new IOException("Output folder not created: the Job failed!");
    }

    // Aggregate the output into a single file for being consistent with the other sampling methods
    for(FileStatus fileStatus : outFs.listStatus(outReservoirPath)) {
      Path thisPath = fileStatus.getPath();
      if(thisPath.getName().startsWith("part-m-")) {
        TupleFile.Reader reader = new TupleFile.Reader(outFs, hadoopConf, thisPath);
        Tuple tuple = new Tuple(reader.getSchema());
        while(reader.next(tuple)) {
          writer.append(tuple);
        }
        reader.close();
      }
    }

    writer.close();
    outFs.delete(outReservoirPath, true);
  }
  /*
   * Default sampling method a-la-TeraSort, getting some consecutive samples from each InputSplit.
   */
  private void defaultSampling(Schema tableSchema, long sampleSize, Configuration hadoopConf, Path outFile,
      List<InputSplit> splits, Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat,
      Map<InputSplit, RecordProcessor> recordProcessorPerSplit, int maxSplitsToVisit) throws IOException,
      InterruptedException {

    // Instantiate the writer we will write samples to
    FileSystem fs = FileSystem.get(outFile.toUri(), hadoopConf);
    TupleFile.Writer writer = new TupleFile.Writer(fs, hadoopConf, outFile, new NullableSchema(tableSchema));

    if(splits.size() == 0) {
      throw new IllegalArgumentException("There are no splits to sample from!");
    }

    logger.info("Sampling from input splits > " + splits);

    int samples = Math.min(maxSplitsToVisit, splits.size());
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.size() / samples;

    long records = 0;

    CounterInterface counterInterface = new CounterInterface(null) {

      public Counter getCounter(String group, String name) {
        return Mockito.mock(Counter.class);
      };
    };

    // Take N samples from different parts of the input
    for(int i = 0; i < samples; ++i) {
      TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
      TaskAttemptContext attemptContext = new TaskAttemptContext(hadoopConf, attemptId);

      InputSplit split = splits.get(sampleStep * i);
      logger.info("Sampling split: " + split);
      RecordReader<ITuple, NullWritable> reader = splitToFormat.get(split).createRecordReader(split, attemptContext);
      reader.initialize(split, attemptContext);
      RecordProcessor processor = recordProcessorPerSplit.get(split);

      while(reader.nextKeyValue()) {
        ITuple tuple = reader.getCurrentKey();
        ITuple uTuple;
        try {
          uTuple = processor.process(tuple, counterInterface);
        } catch(Throwable e) {
          throw new RuntimeException(e);
        }
        if(uTuple != null) { // user may have filtered the record
          writer.append(new NullableTuple(uTuple));
          records += 1;
          if((i + 1) * recordsPerSample <= records) {
            break;
          }
        }
      }
    }

    writer.close();
  }
}
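
For reference, below is a minimal usage sketch based only on the constructor, the options classes and the sample() signature shown above. The tableInputs list and tableSchema are hypothetical placeholders (how they are built depends on how the tablespace is defined), and the output path and sizes are arbitrary example values.

// A sketch, not part of the library: assumes tableInputs (List<TableInput>) and
// tableSchema (Schema) have been built elsewhere.
Configuration conf = new Configuration();

// DEFAULT sampling: read consecutive records from up to 20 of the input splits.
TupleSampler.DefaultSamplingOptions samplingOptions = new TupleSampler.DefaultSamplingOptions();
samplingOptions.setMaxSplitsToVisit(20);                 // defaults to 10
samplingOptions.setMaxInputSplitSize(64L * 1024 * 1024); // optional: cap the input split size

TupleSampler sampler = new TupleSampler(TupleSampler.SamplingType.DEFAULT, samplingOptions);

// Writes roughly 10000 sampled keys as a Pangool TupleFile to the given output path,
// which TablespaceGenerator can then use to derive a PartitionMap.
sampler.sample(tableInputs, tableSchema, conf, 10000, new Path("/tmp/tuple-sampler-out"));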



