
package com.splout.db.hadoop;
/*
* #%L
* Splout SQL Hadoop library
* %%
* Copyright (C) 2012 Datasalt Systems S.L.
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.io.TupleFile;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.mockito.Mockito;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.tuplemr.MapOnlyJobBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;
import com.datasalt.pangool.tuplemr.mapred.MapOnlyMapper;
import com.splout.db.common.PartitionMap;
/**
* This class samples a list of {@link TableInput} files that produce a certain Table Schema. There are two sampling
* methods supported:
*
* - DEFAULT: Inspired by Hadoop's TeraInputFormat. A Hadoop Job is not needed: consecutive records are read
* directly from a subset of the InputSplits.
* - RESERVOIR: It uses a Map-Only Pangool Job for performing Reservoir Sampling over the dataset.
*
* Sampling can be used by {@link TablespaceGenerator} for determining a {@link PartitionMap} based on the approximate
* distribution of the keys.
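*
* A minimal usage sketch (the input list, the schema and the paths are illustrative placeholders):
*
* <pre>
* DefaultSamplingOptions options = new DefaultSamplingOptions();
* options.setMaxSplitsToVisit(10);
* TupleSampler sampler = new TupleSampler(SamplingType.DEFAULT, options);
* // "inputs" is a List&lt;TableInput&gt; and "schema" their common table Schema
* sampler.sample(inputs, schema, new Configuration(), 10000, new Path("/tmp/sample-keys"));
* </pre>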
*/
@SuppressWarnings("serial")
public class TupleSampler implements Serializable {
private final static Log logger = LogFactory.getLog(TupleSampler.class);
private final SamplingType samplingType;
private final SamplingOptions options;
public enum SamplingType {
DEFAULT, RESERVOIR
}
public static class TupleSamplerException extends Exception {
public TupleSamplerException(String reason) {
super(reason);
}
public TupleSamplerException(Exception e) {
super(e);
}
}
// Each sampling algorithm may have its own options, but some options are common to both
public static abstract class SamplingOptions extends HashMap<String, Object> {
public Long getMaxInputSplitSize() {
return (Long) this.get("maxInputSplitSize");
}
public void setMaxInputSplitSize(Long maxInputSplitSize) {
this.put("maxInputSplitSize", maxInputSplitSize);
}
}
// Options for DEFAULT sampling
public static class DefaultSamplingOptions extends SamplingOptions {
public DefaultSamplingOptions() {
super();
setMaxSplitsToVisit(10);
}
public int getMaxSplitsToVisit() {
return (Integer) this.get("maxSplitsToVisit");
}
public void setMaxSplitsToVisit(int maxSplitsToVisit) {
this.put("maxSplitsToVisit", maxSplitsToVisit);
}
}
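// Illustrative configuration sketch (values are examples, not defaults; only
// maxSplitsToVisit is preset to 10 by the constructor above):
//
// DefaultSamplingOptions options = new DefaultSamplingOptions();
// options.setMaxSplitsToVisit(20);                 // visit at most 20 splits
// options.setMaxInputSplitSize(64L * 1024 * 1024); // cap each InputSplit at 64 MB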
public TupleSampler(SamplingType samplingType, SamplingOptions options) {
this.samplingType = samplingType;
this.options = options;
}
public void sample(List<TableInput> inputFiles, Schema tableSchema, Configuration hadoopConf,
long sampleSize, Path outFile) throws TupleSamplerException {
try {
List<InputSplit> splits = new ArrayList<InputSplit>();
Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat = new HashMap<InputSplit, InputFormat<ITuple, NullWritable>>();
Map<InputSplit, RecordProcessor> recordProcessorPerSplit = new HashMap<InputSplit, RecordProcessor>();
// Iterate over all {@link TableInput} and collect information about the InputSplits derived from them
for(TableInput tableFile : inputFiles) {
Job job = new Job(hadoopConf);
FileInputFormat.setInputPaths(job, tableFile.getPaths());
if(options.getMaxInputSplitSize() != null) {
logger.info("Using max input split size: " + options.getMaxInputSplitSize());
FileInputFormat.setMaxInputSplitSize(job, options.getMaxInputSplitSize());
}
job.setInputFormatClass(FileInputFormat.class);
for(InputSplit split : tableFile.getFormat().getSplits(job)) {
splitToFormat.put(split, tableFile.getFormat());
recordProcessorPerSplit.put(split, tableFile.getRecordProcessor());
splits.add(split);
}
}
FileSystem outFs = outFile.getFileSystem(hadoopConf);
if(outFs.exists(outFile)) {
outFs.delete(outFile, false);
}
if(samplingType.equals(SamplingType.DEFAULT)) {
try {
DefaultSamplingOptions defOptions = (DefaultSamplingOptions) options;
// Default sampling method
defaultSampling(tableSchema, sampleSize, hadoopConf, outFile, splits, splitToFormat,
recordProcessorPerSplit, defOptions.getMaxSplitsToVisit());
} catch (ClassCastException e) {
throw new RuntimeException("Invalid options class: " + options.getClass() + ". Expected: " +
DefaultSamplingOptions.class);
}
} else {
// Reservoir sampling
reservoirSampling(tableSchema, sampleSize, hadoopConf, outFile, splits.size(), inputFiles);
}
} catch(Exception e) {
throw new TupleSamplerException(e);
}
}
/*
* Reservoir sampling, to be used on datasets where the default method is not enough.
*/
private void reservoirSampling(Schema tableSchema, final long sampleSize, Configuration hadoopConf,
Path outputPath, final int nSplits, List<TableInput> inputFiles) throws IOException,
InterruptedException, ClassNotFoundException, TupleMRException, URISyntaxException, TupleSamplerException {
MapOnlyJobBuilder builder = new MapOnlyJobBuilder(hadoopConf, "Reservoir Sampling");
for(TableInput inputFile : inputFiles) {
final RecordProcessor processor = inputFile.getRecordProcessor();
for(Path path : inputFile.getPaths()) {
builder.addInput(path, inputFile.getFormat(),
new MapOnlyMapper<ITuple, NullWritable, ITuple, NullWritable>() {
final int nSamples = (int) (sampleSize / nSplits);
final ITuple[] samples = new ITuple[nSamples];
CounterInterface counterInterface;
long recordCounter = 0;
protected void setup(Context context) throws IOException, InterruptedException {
counterInterface = new CounterInterface(context);
}
// Collect Tuples with decreasing probability
// (http://en.wikipedia.org/wiki/Reservoir_sampling)
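// The reservoir holds nSamples tuples: the first nSamples records fill it directly;
// once full, the record with 0-based index recordCounter replaces a uniformly chosen
// slot with probability nSamples / recordCounter, so every record keeps a roughly
// uniform chance of ending up in the sample.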
protected void map(ITuple key, NullWritable value, Context context) throws IOException,
InterruptedException {
ITuple uTuple;
try {
uTuple = processor.process(key,
counterInterface);
} catch(Throwable e) {
throw new RuntimeException(e);
}
if(uTuple == null) { // user may have filtered the record
return;
}
long reservoirIndex;
if(recordCounter < nSamples) {
reservoirIndex = recordCounter;
} else {
reservoirIndex = (long) (Math.random() * recordCounter);
}
if(reservoirIndex < nSamples) {
samples[(int) reservoirIndex] = new NullableTuple(uTuple, true); // deep copy the Tuple
}
recordCounter++;
}
// Write the in-memory sampled Tuples
protected void cleanup(Context context) throws IOException, InterruptedException {
for(ITuple tuple : samples) {
if(tuple != null) {
context.write(tuple, NullWritable.get());
}
}
}
});
}
}
// Set output path
Path outReservoirPath = new Path(outputPath + "-reservoir");
builder.setTupleOutput(outReservoirPath, new NullableSchema(tableSchema));
Job job = builder.createJob();
if(!job.waitForCompletion(true)) {
throw new TupleSamplerException("Reservoir Sampling failed!");
}
FileSystem outFs = outReservoirPath.getFileSystem(hadoopConf);
// Instantiate the writer we will write samples to
TupleFile.Writer writer = new TupleFile.Writer(outFs, hadoopConf, outputPath, new NullableSchema(
tableSchema));
if(outFs.listStatus(outReservoirPath) == null) {
throw new IOException("Output folder not created: the Job failed!");
}
// Aggregate the output into a single file for being consistent with the other sampling methods
for(FileStatus fileStatus : outFs.listStatus(outReservoirPath)) {
Path thisPath = fileStatus.getPath();
if(thisPath.getName().startsWith("part-m-")) {
TupleFile.Reader reader = new TupleFile.Reader(outFs, hadoopConf, thisPath);
Tuple tuple = new Tuple(reader.getSchema());
while(reader.next(tuple)) {
writer.append(tuple);
}
reader.close();
}
}
writer.close();
outFs.delete(outReservoirPath, true);
}
/*
* Default sampling method à la TeraSort: read a few consecutive samples from each visited InputSplit.
*/
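/*
* The code below visits up to maxSplitsToVisit evenly spaced splits and reads roughly
* sampleSize / samples consecutive records from each. For example, with 100 splits,
* maxSplitsToVisit = 10 and sampleSize = 10000, every 10th split is opened and about
* 1000 records are taken from it.
*/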
private void defaultSampling(Schema tableSchema, long sampleSize, Configuration hadoopConf,
Path outFile, List<InputSplit> splits,
Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat,
Map<InputSplit, RecordProcessor> recordProcessorPerSplit,
int maxSplitsToVisit) throws IOException, InterruptedException {
// Instantiate the writer we will write samples to
FileSystem fs = FileSystem.get(outFile.toUri(), hadoopConf);
TupleFile.Writer writer = new TupleFile.Writer(fs, hadoopConf, outFile, new NullableSchema(
tableSchema));
if(splits.size() == 0) {
throw new IllegalArgumentException("There are no splits to sample from!");
}
logger.info("Sampling from input splits > " + splits);
int samples = Math.min(maxSplitsToVisit, splits.size());
long recordsPerSample = sampleSize / samples;
int sampleStep = splits.size() / samples;
long records = 0;
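// Default sampling runs in the calling process without a MapReduce job, so there is
// no real task context; counters requested by RecordProcessors are backed by a
// Mockito mock and effectively discarded.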
CounterInterface counterInterface = new CounterInterface(null) {
public Counter getCounter(String group, String name) {
return Mockito.mock(Counter.class);
}
};
// Take N samples from different parts of the input
for(int i = 0; i < samples; ++i) {
TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
TaskAttemptContext attemptContext = new TaskAttemptContext(hadoopConf, attemptId);
InputSplit split = splits.get(sampleStep * i);
logger.info("Sampling split: " + split);
RecordReader<ITuple, NullWritable> reader = splitToFormat.get(split).createRecordReader(split,
attemptContext);
reader.initialize(split, attemptContext);
RecordProcessor processor = recordProcessorPerSplit.get(split);
while(reader.nextKeyValue()) {
ITuple tuple = reader.getCurrentKey();
ITuple uTuple;
try {
uTuple = processor.process(tuple, counterInterface);
} catch(Throwable e) {
throw new RuntimeException(e);
}
if(uTuple != null) { // user may have filtered the record
writer.append(new NullableTuple(uTuple));
records += 1;
if((i + 1) * recordsPerSample <= records) {
break;
}
}
}
}
writer.close();
}
}