// com.datatorrent.lib.io.fs.AbstractFileSplitter Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.datatorrent.lib.io.fs;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import javax.annotation.Nullable;
import javax.validation.constraints.Min;
import javax.validation.constraints.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import com.google.common.base.Preconditions;
import com.datatorrent.api.AutoMetric;
import com.datatorrent.api.Context;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.common.util.BaseOperator;
import com.datatorrent.lib.io.block.BlockMetadata;
/**
* An abstract File Splitter.
*
* @since 3.2.0
*/
@org.apache.hadoop.classification.InterfaceStability.Evolving
public abstract class AbstractFileSplitter extends BaseOperator
{
// Block size used for splitting. May be null until setup(), which then assigns getDefaultBlockSize().
protected Long blockSize;
// Counter that supplies the low 32 bits of each block id in populateBlockIds(); advanced once per block id.
private int sequenceNo;
/**
* This is a threshold on the no. of blocks emitted per window. A lot of blocks emitted
* per window can overwhelm the downstream operators. This setting helps to control that.
*/
@Min(1)
protected int blocksThreshold;
// Block counter for the current window: reset in beginWindow() and advanced in emitBlockMetadata().
protected transient long blockCount;
// Iterator over the blocks of the file being split; set in processFileInfo() and cleared once exhausted.
// NOTE(review): not transient, presumably so pending blocks survive a checkpoint - confirm.
protected BlockMetadataIterator blockMetadataIterator;
// Operator id taken from the context in setup(); forms the high 32 bits of block ids.
protected transient int operatorId;
// Operator context captured in setup().
protected transient Context.OperatorContext context;
// Activation window id after setup(), then the id passed to the most recent beginWindow().
protected transient long currentWindowId;
// Number of files whose metadata was emitted in the current window; reset in beginWindow().
@AutoMetric
protected int filesProcessed;
/**
 * Output port on which {@link #processFileInfo(FileInfo)} emits the metadata of each file
 * or directory it handles.
 */
public final transient DefaultOutputPort<FileMetadata> filesMetadataOutput = new DefaultOutputPort<>();
/**
 * Output port on which {@link #emitBlockMetadata()} emits the metadata of each block.
 */
public final transient DefaultOutputPort<BlockMetadata.FileBlockMetadata> blocksMetadataOutput =
    new DefaultOutputPort<>();
/**
 * Creates a splitter whose blocks threshold starts at {@link Integer#MAX_VALUE}.
 */
public AbstractFileSplitter()
{
  this.blocksThreshold = Integer.MAX_VALUE;
}
/**
 * Validates the configured block size, records the operator id and context, initializes the
 * current window id from the activation window id, and falls back to
 * {@link #getDefaultBlockSize()} when no block size was configured.
 *
 * @param context operator context
 */
@Override
public void setup(Context.OperatorContext context)
{
  // a null block size is allowed here; it is replaced by the default below
  final boolean blockSizeAcceptable = blockSize == null || blockSize > 0;
  Preconditions.checkArgument(blockSizeAcceptable, "invalid block size");

  this.operatorId = context.getId();
  this.context = context;
  this.currentWindowId = context.getValue(Context.OperatorContext.ACTIVATION_WINDOW_ID);

  if (blockSize == null) {
    blockSize = getDefaultBlockSize();
  }
}
/**
 * Starts a new window: remembers its id and resets the per-window file and block counters.
 *
 * @param windowId id of the window that is beginning
 */
@Override
public void beginWindow(long windowId)
{
  this.currentWindowId = windowId;
  this.blockCount = 0;
  this.filesProcessed = 0;
}
/**
 * Emits blocks left over from a previously started file first, then keeps pulling files from
 * {@link #getFileInfo()} and processing them until the blocks threshold is reached, no more
 * files are available, or {@link #processFileInfo(FileInfo)} reports that it had to stop.
 */
protected void process()
{
  final boolean hasPendingBlocks = blockMetadataIterator != null;
  if (hasPendingBlocks && blockCount < blocksThreshold) {
    emitBlockMetadata();
  }

  while (blockCount < blocksThreshold) {
    final FileInfo nextFile = getFileInfo();
    if (nextFile == null) {
      // nothing more to split right now
      return;
    }
    if (!processFileInfo(nextFile)) {
      return;
    }
  }
}
/**
* Supplies the next file to split. {@link #process()} stops pulling files for the current
* call as soon as this returns null.
*
* @return {@link FileInfo} of the next file, or null when there is none
*/
protected abstract FileInfo getFileInfo();
/**
 * Builds and emits the metadata of one file, counts it as processed and, unless it is a
 * directory, starts emitting the metadata of its blocks.
 *
 * @param fileInfo file info
 * @return true if the file was handled completely (it is a directory, or all of its blocks were
 *         emitted); false if the blocks threshold was reached while blocks were still pending
 * @throws RuntimeException wrapping an {@link IOException} raised while creating the metadata
 */
protected boolean processFileInfo(FileInfo fileInfo)
{
  try {
    final FileMetadata metadata = buildFileMetadata(fileInfo);
    filesMetadataOutput.emit(metadata);
    filesProcessed++;

    if (metadata.isDirectory()) {
      // directories have no blocks to emit
      return true;
    }

    blockMetadataIterator = new BlockMetadataIterator(this, metadata, blockSize);
    // false means the block threshold was reached before the file was exhausted
    return emitBlockMetadata();
  } catch (IOException e) {
    throw new RuntimeException("creating metadata", e);
  }
}
/**
 * Emits the remaining blocks of the current file on {@code blocksMetadataOutput} until the
 * iterator is exhausted or the blocks threshold is reached.
 * <p>
 * The threshold is tested before the counter is advanced, so {@code blockCount} is incremented
 * only for blocks that are actually emitted and is not pushed past {@code blocksThreshold} by
 * a failed check.
 *
 * @return true if all the blocks were emitted; false otherwise
 */
protected boolean emitBlockMetadata()
{
  while (blockMetadataIterator.hasNext()) {
    if (blockCount >= blocksThreshold) {
      // threshold reached: keep the iterator so the remaining blocks can be emitted later
      return false;
    }
    this.blocksMetadataOutput.emit(blockMetadataIterator.next());
    blockCount++;
  }
  // file exhausted; nothing is pending any more
  blockMetadataIterator = null;
  return true;
}
/**
 * Builds the metadata of a single block of a file.
 *
 * @param pos                 offset of the block
 * @param lengthOfFileInBlock value stored as the block's length; the iterator in this class
 *                            passes the file position at which the block ends
 * @param blockNumber         1-based block number, used to look up the block id
 * @param fileMetadata        file metadata
 * @param isLast              last block of the file
 * @return block file metadata
 */
protected BlockMetadata.FileBlockMetadata buildBlockMetadata(long pos, long lengthOfFileInBlock, int blockNumber,
    FileMetadata fileMetadata, boolean isLast)
{
  final BlockMetadata.FileBlockMetadata block = createBlockMetadata(fileMetadata);
  final int idIndex = blockNumber - 1;

  block.setBlockId(fileMetadata.getBlockIds()[idIndex]);
  block.setOffset(pos);
  block.setLength(lengthOfFileInBlock);
  block.setLastBlock(isLast);

  // the first block has no predecessor, which is signalled with -1
  final long previousBlockId;
  if (blockNumber == 1) {
    previousBlockId = -1;
  } else {
    previousBlockId = fileMetadata.getBlockIds()[idIndex - 1];
  }
  block.setPreviousBlockId(previousBlockId);
  return block;
}
/**
 * Instantiates the block metadata for a file from its file path. Can be overridden for
 * creating block metadata of a type that extends {@link BlockMetadata.FileBlockMetadata}.
 *
 * @param fileMetadata metadata of the file the block belongs to
 * @return new block metadata carrying the file path
 */
protected BlockMetadata.FileBlockMetadata createBlockMetadata(FileMetadata fileMetadata)
{
  final String filePath = fileMetadata.getFilePath();
  return new BlockMetadata.FileBlockMetadata(filePath);
}
/**
 * Creates file-metadata and fills in the file name, directory flag, length and relative path.
 * For regular files it also populates the no. of blocks and the block ids.
 *
 * @param fileInfo file information
 * @return file-metadata
 * @throws IOException if the file status cannot be obtained
 */
protected FileMetadata buildFileMetadata(FileInfo fileInfo) throws IOException
{
  LOG.debug("file {}", fileInfo.getFilePath());
  final FileMetadata metadata = createFileMetadata(fileInfo);
  LOG.debug("fileMetadata {}", metadata);

  final Path filePath = new Path(fileInfo.getFilePath());
  metadata.setFileName(filePath.getName());

  final FileStatus status = getFileStatus(filePath);
  final boolean directory = status.isDirectory();
  final long fileLength = status.getLen();
  metadata.setDirectory(directory);
  metadata.setFileLength(fileLength);

  final String relativePath;
  if (fileInfo.getDirectoryPath() != null) {
    relativePath = getRelativePathWithFolderName(fileInfo);
  } else {
    // Direct filename is given as input.
    relativePath = status.getPath().getName();
  }
  metadata.setRelativePath(relativePath);

  if (directory) {
    return metadata;
  }

  // whole blocks plus one more for a trailing partial block
  final long wholeBlocks = fileLength / blockSize;
  final int partialBlock = (fileLength % blockSize) == 0 ? 0 : 1;
  int blockTotal = (int)(wholeBlocks + partialBlock);
  if (metadata.getDataOffset() >= fileLength) {
    // the data offset lies at or beyond the end of the file: nothing to read
    blockTotal = 0;
  }
  metadata.setNumberOfBlocks(blockTotal);
  populateBlockIds(metadata);
  return metadata;
}
/**
 * As folder name was given to input for copy, prefix folder name to the sub items to copy.
 * <p>
 * The folder name and the relative file path are joined with {@link Path#SEPARATOR} instead
 * of the platform-specific {@code File.separator}, so the separator is the same on every host
 * OS and matches the one used by Hadoop {@link Path}.
 *
 * @param fileInfo file information with a non-null directory path
 * @return name of the input directory followed by the file's path relative to it
 */
private String getRelativePathWithFolderName(FileInfo fileInfo)
{
  String parentDir = new Path(fileInfo.getDirectoryPath()).getName();
  return parentDir + Path.SEPARATOR + fileInfo.getRelativeFilePath();
}
/**
 * Instantiates the file metadata from the file's full path. This can be over-ridden to create
 * file metadata of a type that extends {@link FileMetadata}.
 *
 * @param fileInfo file information
 * @return file-metadata
 */
protected FileMetadata createFileMetadata(FileInfo fileInfo)
{
  final String fullPath = fileInfo.getFilePath();
  return new FileMetadata(fullPath);
}
/**
 * Assigns one id to every block of the file and stores the ids on the metadata.
 * Block ids are 32 bits of operatorId | 32 bits of sequence number.
 *
 * @param fileMetadata metadata whose number of blocks is already set
 */
protected void populateBlockIds(FileMetadata fileMetadata)
{
  final int count = fileMetadata.getNumberOfBlocks();
  final long operatorBits = ((long)operatorId) << 32;
  final long[] ids = new long[count];

  for (int idx = 0; idx < count; idx++) {
    // mask so that a negative sequence number cannot spill into the operator id bits
    final long sequenceBits = sequenceNo & 0xFFFFFFFFL;
    sequenceNo++;
    ids[idx] = operatorBits | sequenceBits;
  }
  fileMetadata.setBlockIds(ids);
}
/**
* Get default block size which is used when the user hasn't specified block size.
* {@link #setup(Context.OperatorContext)} calls this only when the configured block size is null.
*
* @return default block size.
*/
protected abstract long getDefaultBlockSize();
/**
* Get status of a file. The returned status supplies the directory flag, the length and the
* path used when the file metadata is built.
*
* @param path path of a file
* @return file status
* @throws IOException if the status cannot be obtained
*/
protected abstract FileStatus getFileStatus(Path path) throws IOException;
/**
* Sets the block size used to split files. A null value makes setup() fall back to
* getDefaultBlockSize(); setup() rejects a value that is not positive.
*
* @param blockSize block size, or null to use the default
*/
public void setBlockSize(Long blockSize)
{
this.blockSize = blockSize;
}
/**
* Returns the block size used to split files.
*
* @return block size; may be null if none was configured and setup() has not assigned the default yet
*/
public Long getBlockSize()
{
return blockSize;
}
/**
* Sets the threshold on the no. of blocks emitted per window.
*
* @param threshold blocks threshold; the field is annotated with a minimum of 1
*/
public void setBlocksThreshold(int threshold)
{
this.blocksThreshold = threshold;
}
/**
* Returns the threshold on the no. of blocks emitted per window.
*
* @return blocks threshold
*/
public int getBlocksThreshold()
{
return blocksThreshold;
}
/**
 * An {@link Iterator} for Block-Metadatas of a file.
 * <p>
 * Iteration starts at the file's data offset and walks the file in steps aligned to multiples
 * of the block size; each block's metadata is built by the owning splitter's
 * {@link AbstractFileSplitter#buildBlockMetadata(long, long, int, FileMetadata, boolean)}.
 */
protected static class BlockMetadataIterator implements Iterator<BlockMetadata.FileBlockMetadata>
{
  private final FileMetadata fileMetadata;
  private final long blockSize;
  // position in the file at which the next block starts
  private long pos;
  // 1-based number of the block returned most recently; 0 before the first call to next()
  private int blockNumber;
  private final AbstractFileSplitter splitter;

  protected BlockMetadataIterator()
  {
    //for kryo
    fileMetadata = null;
    blockSize = -1;
    splitter = null;
  }

  /**
   * @param splitter     splitter that builds the block metadata
   * @param fileMetadata metadata of the file to iterate over
   * @param blockSize    block size used to cut the file
   */
  protected BlockMetadataIterator(AbstractFileSplitter splitter, FileMetadata fileMetadata, long blockSize)
  {
    this.splitter = splitter;
    this.fileMetadata = fileMetadata;
    this.blockSize = blockSize;
    this.pos = fileMetadata.getDataOffset();
    this.blockNumber = 0;
  }

  /**
   * @return true while the current position is before the end of the file
   */
  @Override
  public boolean hasNext()
  {
    return pos < fileMetadata.getFileLength();
  }

  /**
   * Builds the metadata of the next block and advances the position to the end of that block.
   *
   * @return metadata of the next block
   * @throws java.util.NoSuchElementException if the file has no more blocks
   */
  @Override
  public BlockMetadata.FileBlockMetadata next()
  {
    if (!hasNext()) {
      throw new java.util.NoSuchElementException("no more blocks in " + fileMetadata.getFilePath());
    }

    // advance to the first block boundary that lies beyond the current position;
    // this skips whole blocks that precede the data offset
    long blockEnd;
    do {
      blockEnd = blockSize * ++blockNumber;
    } while (blockEnd <= pos);

    boolean isLast = blockEnd >= fileMetadata.getFileLength();
    // the last block ends at the end of the file rather than at a block boundary
    long lengthOfFileInBlock = isLast ? fileMetadata.getFileLength() : blockEnd;
    BlockMetadata.FileBlockMetadata fileBlock = splitter.buildBlockMetadata(pos, lengthOfFileInBlock, blockNumber,
        fileMetadata, isLast);
    pos = lengthOfFileInBlock;
    return fileBlock;
  }

  @Override
  public void remove()
  {
    throw new UnsupportedOperationException("remove not supported");
  }
}
/**
* Represents the file metadata - file path, name, no. of blocks, etc.
* A plain mutable holder: every property has a getter and a setter and no validation is
* performed in this class.
*/
public static class FileMetadata
{
@NotNull
private String filePath;
private String fileName;
private int numberOfBlocks;
private long dataOffset;
private long fileLength;
// initialized to the construction time by the constructors; can be overwritten via the setter
private long discoverTime;
private long[] blockIds;
private boolean isDirectory;
private String relativePath;
/**
* No-argument constructor; leaves the file path null and stamps the discover time.
*/
@SuppressWarnings("unused")
protected FileMetadata()
{
//for kryo
filePath = null;
discoverTime = System.currentTimeMillis();
}
/**
* Constructs file metadata
*
* @param filePath file path
*/
public FileMetadata(@NotNull String filePath)
{
this.filePath = filePath;
discoverTime = System.currentTimeMillis();
}
/**
* Copy constructor. All fields, including the discover time, are taken from the given
* metadata. The block id array is shared with the source, not cloned.
*
* @param fileMetadata metadata to copy from
*/
protected FileMetadata(FileMetadata fileMetadata)
{
this();
filePath = fileMetadata.filePath;
fileName = fileMetadata.fileName;
numberOfBlocks = fileMetadata.numberOfBlocks;
dataOffset = fileMetadata.dataOffset;
fileLength = fileMetadata.fileLength;
discoverTime = fileMetadata.discoverTime;
blockIds = fileMetadata.blockIds;
isDirectory = fileMetadata.isDirectory;
relativePath = fileMetadata.relativePath;
}
/**
* Returns the total number of blocks.
*/
public int getNumberOfBlocks()
{
return numberOfBlocks;
}
/**
* Sets the total number of blocks.
*/
public void setNumberOfBlocks(int numberOfBlocks)
{
this.numberOfBlocks = numberOfBlocks;
}
/**
* Returns the file name.
*/
public String getFileName()
{
return fileName;
}
/**
* Sets the file name.
*/
public void setFileName(String fileName)
{
this.fileName = fileName;
}
/**
* Sets the file path.
*/
public void setFilePath(String filePath)
{
this.filePath = filePath;
}
/**
* Returns the file path.
*/
public String getFilePath()
{
return filePath;
}
/**
* Returns the data offset.
*/
public long getDataOffset()
{
return dataOffset;
}
/**
* Sets the data offset.
*/
public void setDataOffset(long offset)
{
this.dataOffset = offset;
}
/**
* Returns the file length.
*/
public long getFileLength()
{
return fileLength;
}
/**
* Sets the file length.
*/
public void setFileLength(long fileLength)
{
this.fileLength = fileLength;
}
/**
* Returns the file discover time.
*/
public long getDiscoverTime()
{
return discoverTime;
}
/**
* Sets the discover time.
*/
public void setDiscoverTime(long discoverTime)
{
this.discoverTime = discoverTime;
}
/**
* Returns the block ids associated with the file.
* The internal array itself is returned, not a copy.
*/
public long[] getBlockIds()
{
return blockIds;
}
/**
* Sets the blocks ids of the file.
* The given array is stored as is, not copied.
*/
public void setBlockIds(long[] blockIds)
{
this.blockIds = blockIds;
}
/**
* Sets whether the file metadata is a directory.
*/
public void setDirectory(boolean isDirectory)
{
this.isDirectory = isDirectory;
}
/**
* @return true if it is a directory; false otherwise.
*/
public boolean isDirectory()
{
return isDirectory;
}
/**
* Gets relative file path
* @return relativePath
*/
public String getRelativePath()
{
return relativePath;
}
/**
* Sets relative file path
* @param relativePath
*/
public void setRelativePath(String relativePath)
{
this.relativePath = relativePath;
}
/**
* Returns a short description containing the file name, number of blocks, directory flag and
* relative path; the remaining fields are not included.
*/
@Override
public String toString()
{
return "FileMetadata [fileName=" + fileName + ", numberOfBlocks=" + numberOfBlocks + ", isDirectory=" + isDirectory + ", relativePath=" + relativePath + "]";
}
}
/**
 * A class that encapsulates file path: an optional directory plus a path relative to it.
 * When no directory is given, the relative path is used as the full path.
 */
public static class FileInfo
{
  protected final String directoryPath;
  protected final String relativeFilePath;

  /**
   * Creates an instance with neither a directory path nor a relative file path.
   */
  protected FileInfo()
  {
    this.directoryPath = null;
    this.relativeFilePath = null;
  }

  /**
   * @param directoryPath    directory that contains the file; may be null
   * @param relativeFilePath path of the file relative to the directory
   */
  public FileInfo(@Nullable String directoryPath, @NotNull String relativeFilePath)
  {
    this.relativeFilePath = relativeFilePath;
    this.directoryPath = directoryPath;
  }

  /**
   * @return directory path
   */
  public String getDirectoryPath()
  {
    return this.directoryPath;
  }

  /**
   * @return path relative to directory
   */
  public String getRelativeFilePath()
  {
    return this.relativeFilePath;
  }

  /**
   * @return full path of the file
   */
  public String getFilePath()
  {
    // without a directory the relative path already is the full path
    return directoryPath != null
        ? new Path(directoryPath, relativeFilePath).toUri().getPath()
        : relativeFilePath;
  }
}
// Class logger; used for the debug output in buildFileMetadata().
private static final Logger LOG = LoggerFactory.getLogger(AbstractFileSplitter.class);
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy