com.bazaarvoice.emodb.hadoop.io.StashFileSystem Maven / Gradle / Ivy
package com.bazaarvoice.emodb.hadoop.io;
import com.amazonaws.auth.AWSCredentialsProvider;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.auth.DefaultAWSCredentialsProviderChain;
import com.amazonaws.internal.StaticCredentialsProvider;
import com.amazonaws.regions.Region;
import com.amazonaws.regions.Regions;
import com.amazonaws.services.s3.AmazonS3Client;
import com.bazaarvoice.emodb.common.stash.FixedStashReader;
import com.bazaarvoice.emodb.common.stash.StandardStashReader;
import com.bazaarvoice.emodb.common.stash.StashReader;
import com.bazaarvoice.emodb.common.stash.StashRowIterator;
import com.bazaarvoice.emodb.common.stash.StashSplit;
import com.bazaarvoice.emodb.common.stash.StashTable;
import com.bazaarvoice.emodb.hadoop.ConfigurationParameters;
import com.bazaarvoice.emodb.sor.api.TableNotStashedException;
import com.google.common.base.Objects;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Range;
import com.google.common.io.ByteStreams;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.util.Progressable;
import javax.ws.rs.core.UriBuilder;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.bazaarvoice.emodb.hadoop.io.FileSystemUtil.getEmptySplitFileName;
import static com.bazaarvoice.emodb.hadoop.io.FileSystemUtil.getEmptySplitRecordReader;
import static com.bazaarvoice.emodb.hadoop.io.FileSystemUtil.getRootFileStatus;
import static com.bazaarvoice.emodb.hadoop.io.FileSystemUtil.getSplitFileStatus;
import static com.bazaarvoice.emodb.hadoop.io.FileSystemUtil.getSplitName;
import static com.bazaarvoice.emodb.hadoop.io.FileSystemUtil.getSplitPath;
import static com.bazaarvoice.emodb.hadoop.io.FileSystemUtil.getTableFileStatus;
import static com.bazaarvoice.emodb.hadoop.io.FileSystemUtil.getTableName;
import static com.bazaarvoice.emodb.hadoop.io.FileSystemUtil.isEmptySplit;
import static com.google.common.base.Preconditions.checkArgument;
/**
* FileSystem implementation backed by EmoStash. Once the file system has been initialized with a root URI,
* the immediate S3 subdirectories of that root are treated as tables, and any leaf files under a table
* are treated as splits.
*/
public class StashFileSystem extends FileSystem implements EmoInputSplittable {
/** AWS region used for the S3 client when the configuration does not name one. */
private static final Regions DEFAULT_REGION = Regions.US_EAST_1;
/** Block size, in bytes, passed along with every split file status (10 * 1024 * 1024). */
private static final int BLOCK_SIZE = 10 * 1024 * 1024;
/** Prefix of every file name generated for a split by {@code toSplitFile}. */
private static final String SPLIT_PREFIX = "S_";
/**
 * Recognizes split file names of the form {@code S_<split>.gz}. The named group {@code split} captures the
 * string form of the split, which {@code fromSplitFile} reads back via {@code matcher.group("split")}.
 * The group must therefore be a named capturing group; without the name the expression is not a valid regex.
 */
private static final Pattern SPLIT_PATTERN = Pattern.compile("^S_(?<split>[0-9a-zA-Z_\\-]+)\\.gz$");
/** Root URI of this file system; assigned in {@code initialize}. */
private URI _uri;
/** Root path derived from {@link #_uri}; assigned in {@code initialize}. */
private Path _rootPath;
/** S3 client created in {@code initialize} and shut down once its reference count drops to zero. */
private AmazonS3Client _s3;
/** Reader used to list tables and splits; assigned in {@code initialize}. */
private StashReader _stashReader;
/** Reference count guarding the shutdown of the S3 client (see {@code maybeCloseS3Client}). */
private final AtomicInteger _stashReaderRefCount = new AtomicInteger(0);
/** Set on the first call to {@code close()} so that later calls are no-ops. */
private final AtomicBoolean _closed = new AtomicBoolean(false);
/**
 * Creates an instance with no root URI, S3 client or stash reader; those fields are assigned
 * later by {@code initialize(URI, Configuration)}.
 */
public StashFileSystem() {
    // Intentionally empty: all state is set up in initialize().
}
/**
 * Returns the URI scheme handled by this file system.
 *
 * @return the constant string {@code "emostash"}
 */
@Override
public String getScheme() {
    final String scheme = "emostash";
    return scheme;
}
/**
 * Initializes this file system from a location that names a table or a split.
 * <p>
 * The root URI is the location with its last path segment removed (the last two segments when the
 * final segment is a split file name). After the root is known, the S3 client is created, the
 * "instance" reference to it is registered, and a stash reader is chosen based on whether the stash
 * location uses a "latest" directory.
 *
 * @param location URI of a table or split inside the stash; a missing, empty or {@code "/"} path
 *                 resolves to the root path {@code "/"}
 * @param conf     Hadoop configuration used to build the S3 client
 * @throws IOException declared by the overridden method
 */
@Override
public void initialize(URI location, Configuration conf)
        throws IOException {
    // The location is either a table or a split.  Get the root path depending on which it is.

    // Strip the trailing slash if present.  A URI without a path is treated like "/".
    String locationPath = Objects.firstNonNull(location.getPath(), "/");
    if (locationPath.endsWith("/")) {
        locationPath = locationPath.substring(0, locationPath.length() - 1);
    }

    // Get the parent directory.  All index arithmetic is done on the normalized path so that the
    // slash position and the string being sliced always agree, and a null path cannot cause an NPE.
    String basePath = locationPath;
    int lastSlash = basePath.lastIndexOf('/');
    // If it's a split go to the parent's parent.
    boolean isSplit = isSplitFile(basePath.substring(lastSlash + 1));
    for (int i = 0; i < (isSplit ? 2 : 1); i++) {
        // lastSlash <= 0 means there is no parent segment left (including the "no slash at all" case,
        // where lastIndexOf returns -1), so the parent is the root itself.
        basePath = lastSlash > 0 ? basePath.substring(0, lastSlash) : "/";
        lastSlash = basePath.lastIndexOf('/');
    }

    _uri = UriBuilder.fromUri(location).replacePath(basePath).build();
    _rootPath = new Path(_uri);
    _s3 = createS3Client(conf);
    addS3ClientReference("instance");

    StashLocation stashLocation = LocationUtil.getStashLocation(_uri);

    /*
     * Some locations are fixed in that the root directory directly contains the table directories.
     * Other locations, such as "emostash://ci.us", are dynamic in that the actual root directory is
     * in a directory beneath the root directory.  Which subdirectory to use is determined by reading the
     * content of a signal file called "_LATEST".  This is handled by the StandardStashReader.
     */
    boolean useLatestDirectory = stashLocation.isUseLatestDirectory();
    if (useLatestDirectory) {
        _stashReader = StandardStashReader.getInstance(stashLocation.getUri(), _s3);
    } else {
        _stashReader = FixedStashReader.getInstance(stashLocation.getUri(), _s3);
    }

    super.initialize(_uri, conf);
}
/**
 * Builds the S3 client described by the Hadoop configuration.
 * <p>
 * When neither an access key nor a secret key is configured the default AWS credentials provider
 * chain is used; when both are configured they are used as static credentials. The region comes
 * from the region parameter, falling back to {@code DEFAULT_REGION} when it is absent.
 *
 * @param conf configuration to read the access key, secret key and region parameters from
 * @return a new client with its region set
 * @throws IllegalArgumentException if only one of the access key and secret key is configured
 */
private AmazonS3Client createS3Client(Configuration conf) {
    final String configuredAccessKey = conf.get(ConfigurationParameters.ACCESS_KEY_PARAM);
    final String configuredSecretKey = conf.get(ConfigurationParameters.SECRET_KEY_PARAM);

    final AWSCredentialsProvider credentialsProvider;
    if (configuredAccessKey == null && configuredSecretKey == null) {
        // Nothing configured explicitly: defer to the default provider chain
        credentialsProvider = new DefaultAWSCredentialsProviderChain();
    } else {
        // At least one key was configured, so both are required
        checkArgument(configuredAccessKey != null && configuredSecretKey != null, "Access and secret keys must both be provided");
        BasicAWSCredentials staticCredentials = new BasicAWSCredentials(configuredAccessKey, configuredSecretKey);
        credentialsProvider = new StaticCredentialsProvider(staticCredentials);
    }

    final String configuredRegion = conf.get(ConfigurationParameters.REGION_PARAM);
    final Regions regionName;
    if (configuredRegion == null) {
        regionName = DEFAULT_REGION;
    } else {
        regionName = Regions.fromName(configuredRegion);
    }
    // Resolve the region before constructing the client, as the client is only created once the region is known
    final Region resolvedRegion = Region.getRegion(regionName);

    AmazonS3Client client = new AmazonS3Client(credentialsProvider);
    client.setRegion(resolvedRegion);
    return client;
}
/**
 * Returns the root URI computed by {@code initialize}.
 *
 * @return the value of {@code _uri}; {@code null} if it has not been assigned yet
 */
@Override
public URI getUri() {
    final URI rootUri = _uri;
    return rootUri;
}
/**
 * Closes this file system once. The first call closes the superclass and then releases the
 * "instance" reference to the S3 client, even if the superclass close throws; later calls do nothing.
 *
 * @throws IOException if the superclass close fails
 */
@Override
public void close()
        throws IOException {
    final boolean firstClose = _closed.compareAndSet(false, true);
    if (!firstClose) {
        // Already closed by an earlier call
        return;
    }
    try {
        super.close();
    } finally {
        maybeCloseS3Client("instance");
    }
}
/**
 * Registers one more holder of the S3 client by incrementing the reference count.
 *
 * @param reason description of the holder, included in the debug log message
 */
private void addS3ClientReference(String reason) {
    _stashReaderRefCount.incrementAndGet();
    final String message = "Incremented S3 client reference count for " + reason;
    LOG.debug(message);
}
/**
 * Releases one reference to the S3 client and shuts the client down when no references remain.
 * Reference counting is needed because Hadoop may close the file system while a stream returned by
 * {@link #open(org.apache.hadoop.fs.Path, int)} is still open.
 *
 * @param reason description of the holder being released, included in the debug log message
 */
private void maybeCloseS3Client(String reason) {
    LOG.debug("Decremented S3 client reference count for " + reason);
    final int remainingReferences = _stashReaderRefCount.decrementAndGet();
    if (remainingReferences != 0) {
        // Someone else still holds the client
        return;
    }
    LOG.debug("Closing S3 client for StashFileSystem at " + _uri);
    _s3.shutdown();
}
/**
 * Tells whether a file name denotes a split.
 *
 * @param fileName file name (last path segment) to test
 * @return {@code true} if the name matches {@code SPLIT_PATTERN} or is the empty-split file name
 */
private boolean isSplitFile(String fileName) {
    if (SPLIT_PATTERN.matcher(fileName).matches()) {
        return true;
    }
    return isEmptySplit(fileName);
}
/**
 * Produces the file name under which a split is exposed: the split prefix, the split's string form,
 * a dot, and the extension of the split's underlying file.
 *
 * @param stashSplit split to name
 * @return the generated file name
 */
private String toSplitFile(StashSplit stashSplit) {
    // Each split needs a unique, recognizable name.  The underlying file's extension is carried over
    // so that Hadoop will honor the underlying compression scheme.
    final String underlyingFile = stashSplit.getFile();
    final String originalExtension = Files.getFileExtension(underlyingFile);
    return String.format("%s%s.%s", SPLIT_PREFIX, stashSplit, originalExtension);
}
/**
 * Parses a split file name back into the split it represents.
 *
 * @param fileName file name to parse
 * @return the split decoded from the {@code split} group of {@code SPLIT_PATTERN}
 * @throws IllegalArgumentException if the name does not match {@code SPLIT_PATTERN}
 */
private StashSplit fromSplitFile(String fileName) {
    final Matcher splitMatcher = SPLIT_PATTERN.matcher(fileName);
    if (splitMatcher.matches()) {
        return StashSplit.fromString(splitMatcher.group("split"));
    }
    throw new IllegalArgumentException("File name does not represent a split file: " + fileName);
}
/**
 * Lists the children of the root (tables) or of a table (splits).
 * <p>
 * For a table with no splits, including a table that is not in stash, a single empty-split entry is
 * returned instead of an empty array.
 *
 * @param path the root path or a table path
 * @return file statuses of the tables under the root, or of the splits under the table
 * @throws IOException if {@code path} refers to a split
 */
@Override
public FileStatus[] listStatus(Path path)
        throws IOException {
    // Only valid if the path is root or a table
    if (getSplitName(_rootPath, path) != null) {
        throw new IOException("Cannot list a split");
    }

    // Type arguments are required here: with raw types the element assignments below do not compile
    ImmutableList.Builder<FileStatus> resultsBuilder = ImmutableList.builder();

    String table = getTableName(_rootPath, path);

    if (table == null) {
        // Return the list of tables
        Iterator<StashTable> tableIterator = _stashReader.listTables();
        while (tableIterator.hasNext()) {
            StashTable stashTable = tableIterator.next();
            resultsBuilder.add(getTableFileStatus(_rootPath, stashTable.getTableName()));
        }
    } else {
        // Return the list of splits
        try {
            List<StashSplit> splits = _stashReader.getSplits(table);
            for (StashSplit split : splits) {
                resultsBuilder.add(getSplitFileStatus(_rootPath, table, toSplitFile(split), split.getSize(), BLOCK_SIZE));
            }
        } catch (TableNotStashedException ignored) {
            // Ok, table is not in stash; therefore there are no splits
        }
    }

    FileStatus[] results = FluentIterable.from(resultsBuilder.build()).toArray(FileStatus.class);

    if (results.length == 0 && table != null) {
        // Do not return an empty list of splits, return a single empty split
        results = new FileStatus[] { getSplitFileStatus(_rootPath, table, getEmptySplitFileName(), 1, BLOCK_SIZE) };
    }

    return results;
}
/**
 * Returns the status of the root, a table, or a split.
 * <p>
 * Tables are always reported as existing, and so is the empty split. For any other split the size
 * is taken from the split decoded from the file name.
 *
 * @param path path to describe
 * @return the file status for {@code path}
 * @throws IOException declared by the overridden method
 * @throws IllegalArgumentException if the path names a non-empty split whose file name is not a valid split name
 */
@Override
public FileStatus getFileStatus(Path path)
        throws IOException {
    if (path.equals(_rootPath)) {
        return getRootFileStatus(_rootPath);
    }

    final String tableName = getTableName(_rootPath, path);
    final String splitName = getSplitName(_rootPath, path);

    if (splitName == null) {
        // A table.  S3 has no concept of empty directories, so every table is assumed to exist.
        return getTableFileStatus(_rootPath, tableName);
    }

    // From here on the path is a split
    if (!isEmptySplit(path)) {
        StashSplit decodedSplit = fromSplitFile(splitName);
        return getSplitFileStatus(_rootPath, tableName, splitName, decodedSplit.getSize(), BLOCK_SIZE);
    }

    // The empty split is always reported as present
    return getSplitFileStatus(_rootPath, tableName, splitName, 1, BLOCK_SIZE);
}
/**
 * Returns one {@code SplitPath} per file directly under {@code path}, or a single empty split when
 * there are no files.
 *
 * @param config    Hadoop configuration (not read by this method)
 * @param path      table path whose files are to be turned into input splits
 * @param splitSize requested split size (not read by this method)
 * @return a non-empty list of split paths with their lengths
 * @throws IOException if listing the files fails
 */
@Override
public List<SplitPath> getInputSplits(Configuration config, Path path, int splitSize)
        throws IOException {
    // Type arguments are required: a raw RemoteIterator yields Object, which cannot be assigned
    // to LocatedFileStatus without a cast.
    ImmutableList.Builder<SplitPath> splits = ImmutableList.builder();

    RemoteIterator<LocatedFileStatus> files = listFiles(path, false);
    if (!files.hasNext()) {
        // No splits.  Don't return nothing, return a single empty split
        String table = getTableName(_rootPath, path);
        return ImmutableList.of(new SplitPath(getSplitPath(_rootPath, table, getEmptySplitFileName()), 1));
    }

    while (files.hasNext()) {
        LocatedFileStatus file = files.next();
        splits.add(new SplitPath(file.getPath(), file.getLen()));
    }

    return splits.build();
}
@Override
public BaseRecordReader getBaseRecordReader(Configuration config, Path path, int splitSize)
throws IOException {
String split = getSplitName(_rootPath, path);
if (isEmptySplit(split)) {
return getEmptySplitRecordReader();
}
final String reason = "record reader for split " + path;
final StashSplit stashSplit = fromSplitFile(split);
// Increment the s3 client reference count so it stays open at least until the returned reader is closed.
addS3ClientReference(reason);
return new BaseRecordReader(splitSize) {
private StashRowIterator _iterator;
@Override
protected Iterator
© 2015 - 2025 Weber Informatics LLC | Privacy Policy