/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.datatorrent.lib.io.fs;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.validation.constraints.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator;
import org.apache.apex.malhar.lib.wal.WindowDataManager;
import org.apache.commons.lang.mutable.MutableLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.datatorrent.api.Context.CountersAggregator;
import com.datatorrent.api.Context.OperatorContext;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.api.DefaultPartition;
import com.datatorrent.api.InputOperator;
import com.datatorrent.api.Operator;
import com.datatorrent.api.Partitioner;
import com.datatorrent.api.StatsListener;
import com.datatorrent.lib.counters.BasicCounters;
import com.datatorrent.lib.util.KryoCloneUtils;
/**
* This is the base implementation of a directory input operator, which scans a directory for files.
* Files are read and split into tuples, which are then emitted.
* Subclasses implement the methods required to read entries from the input stream and emit them to the output port.
*
*
* The directory scanning logic is pluggable to support custom directory layouts and naming schemes. The default
* implementation scans a single directory.
*
*
* The operator is fault tolerant: previously read files and the current offset are tracked as part of the checkpoint
* state. After a failure the operator skips files that were already processed and fast-forwards to the saved offset
* of the current file.
*
*
* The operator supports partitioning, and the number of partitions can be changed dynamically through the
* {@link #partitionCount} property. The directory scanner is responsible for accepting only the files that belong
* to its partition.
*
*
* Failed files are retried by placing them on a failed list and reprocessing them after the pending
* files are processed. Retrying is disabled when maxRetryCount is set to zero.
*
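* A minimal usage sketch inside an application's {@code populateDAG()} (the operator name, directory, partition
* count, and the downstream port {@code parser.input} are illustrative):
*
* <pre>
* {@code
* LineByLineFileInputOperator reader =
*     dag.addOperator("reader", new LineByLineFileInputOperator());
* reader.setDirectory("/tmp/input");  // directory to scan for files
* reader.setPartitionCount(2);        // desired number of partitions
* dag.addStream("lines", reader.output, parser.input);
* }
* </pre>
*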
* @displayName FS Directory Scan Input
* @category Input
* @tags fs, file, input operator
*
* @param <T> The type of the object that this input operator reads and emits.
* @since 1.0.2
*/
public abstract class AbstractFileInputOperator<T>
implements InputOperator, Partitioner<AbstractFileInputOperator<T>>, StatsListener, Operator.CheckpointListener
{
private static final Logger LOG = LoggerFactory.getLogger(AbstractFileInputOperator.class);
@NotNull
protected String directory;
@NotNull
protected DirectoryScanner scanner = new DirectoryScanner();
protected int scanIntervalMillis = 5000;
protected int offset;
protected String currentFile;
protected Set<String> processedFiles = new HashSet<String>();
protected int emitBatchSize = 1000;
protected int currentPartitions = 1;
protected int partitionCount = 1;
private int retryCount = 0;
private int maxRetryCount = 5;
protected transient int skipCount = 0;
private transient OperatorContext context;
private final BasicCounters<MutableLong> fileCounters = new BasicCounters<MutableLong>(MutableLong.class);
protected MutableLong globalNumberOfFailures = new MutableLong();
protected MutableLong localNumberOfFailures = new MutableLong();
protected MutableLong globalNumberOfRetries = new MutableLong();
protected MutableLong localNumberOfRetries = new MutableLong();
protected transient MutableLong globalProcessedFileCount = new MutableLong();
protected transient MutableLong localProcessedFileCount = new MutableLong();
protected transient MutableLong pendingFileCount = new MutableLong();
@NotNull
private WindowDataManager windowDataManager = new WindowDataManager.NoopWindowDataManager();
protected transient long currentWindowId;
protected final transient LinkedList<RecoveryEntry> currentWindowRecoveryState = Lists.newLinkedList();
protected int operatorId; //needed in partitioning
/**
* Represents a file whose processing failed. When a read fails in the middle of a file, the file is
* added to the failed list along with the last read offset.
* Files from the failed list are processed after all pending files are processed, but
* before checking for new files.
* A failed file is retried up to maxRetryCount times; after that the file is
* ignored.
*/
protected static class FailedFile
{
String path;
int offset;
int retryCount;
long lastFailedTime;
/* For kryo serialization */
@SuppressWarnings("unused")
protected FailedFile() {}
protected FailedFile(String path, int offset)
{
this.path = path;
this.offset = offset;
this.retryCount = 0;
}
protected FailedFile(String path, int offset, int retryCount)
{
this.path = path;
this.offset = offset;
this.retryCount = retryCount;
}
@Override
public String toString()
{
return "FailedFile[" +
"path='" + path + '\'' +
", offset=" + offset +
", retryCount=" + retryCount +
", lastFailedTime=" + lastFailedTime +
']';
}
}
/**
* Enums for aggregated counters about file processing.
*
* Contains the enums representing number of files processed, number of
* pending files, number of file errors, and number of retries.
*
* @since 1.0.4
*/
public static enum AggregatedFileCounters
{
/**
* The number of files processed by the logical operator up to this
* point in time.
*/
PROCESSED_FILES,
/**
* The number of files waiting to be processed by the logical operator.
*/
PENDING_FILES,
/**
* The number of IO errors encountered by the logical operator.
*/
NUMBER_OF_ERRORS,
/**
* The number of times the logical operator tried to resume reading a file
* on which it encountered an error.
*/
NUMBER_OF_RETRIES
}
/**
* The enums used to track statistics about the
* AbstractFSDirectoryInputOperator.
*/
protected static enum FileCounters
{
/**
* The number of files that were in the processed list up to the last
* repartition of the operator.
*/
GLOBAL_PROCESSED_FILES,
/**
* The number of files added to the processed list by the physical operator
* since the last repartition.
*/
LOCAL_PROCESSED_FILES,
/**
* The number of io errors encountered up to the last repartition of the
* operator.
*/
GLOBAL_NUMBER_OF_FAILURES,
/**
* The number of failures encountered by the physical operator since the
* last repartition.
*/
LOCAL_NUMBER_OF_FAILURES,
/**
* The number of retries encountered by the physical operator up to the last
* repartition.
*/
GLOBAL_NUMBER_OF_RETRIES,
/**
* The number of retries encountered by the physical operator since the last
* repartition.
*/
LOCAL_NUMBER_OF_RETRIES,
/**
* The number of files pending on the physical operator.
*/
PENDING_FILES
}
/**
* A counter aggregator for AbstractFSDirectoryInputOperator.
*
* In order for this CountersAggregator to be used on your operator, you must
* set it within your application like this.
*
*
* <pre>
* {@code
* dag.getOperatorMeta("fsinputoperator").getAttributes().put(OperatorContext.COUNTERS_AGGREGATOR,
*     new AbstractFSDirectoryInputOperator.FileCountersAggregator());
* }
* </pre>
*
* The value of the aggregated counters can be retrieved by issuing a GET
* request to the host running your gateway, for example:
*
* <pre>
* {@code
* http://<your host>:9090/ws/v2/applications/<your app id>/logicalPlan/operators/<operatorname>/aggregation
* }
* </pre>
*
*
* @since 1.0.4
*/
public static final class FileCountersAggregator implements CountersAggregator, Serializable
{
private static final long serialVersionUID = 201409041428L;
MutableLong totalLocalProcessedFiles = new MutableLong();
MutableLong pendingFiles = new MutableLong();
MutableLong totalLocalNumberOfFailures = new MutableLong();
MutableLong totalLocalNumberOfRetries = new MutableLong();
@Override
@SuppressWarnings("unchecked")
public Object aggregate(Collection<?> countersList)
{
if (countersList.isEmpty()) {
return null;
}
BasicCounters<MutableLong> tempFileCounters = (BasicCounters<MutableLong>)countersList.iterator().next();
MutableLong globalProcessedFiles = tempFileCounters.getCounter(FileCounters.GLOBAL_PROCESSED_FILES);
MutableLong globalNumberOfFailures = tempFileCounters.getCounter(FileCounters.GLOBAL_NUMBER_OF_FAILURES);
MutableLong globalNumberOfRetries = tempFileCounters.getCounter(FileCounters.GLOBAL_NUMBER_OF_RETRIES);
totalLocalProcessedFiles.setValue(0);
pendingFiles.setValue(0);
totalLocalNumberOfFailures.setValue(0);
totalLocalNumberOfRetries.setValue(0);
for (Object fileCounters : countersList) {
BasicCounters<MutableLong> basicFileCounters = (BasicCounters<MutableLong>)fileCounters;
totalLocalProcessedFiles.add(basicFileCounters.getCounter(FileCounters.LOCAL_PROCESSED_FILES));
pendingFiles.add(basicFileCounters.getCounter(FileCounters.PENDING_FILES));
totalLocalNumberOfFailures.add(basicFileCounters.getCounter(FileCounters.LOCAL_NUMBER_OF_FAILURES));
totalLocalNumberOfRetries.add(basicFileCounters.getCounter(FileCounters.LOCAL_NUMBER_OF_RETRIES));
}
globalProcessedFiles.add(totalLocalProcessedFiles);
globalProcessedFiles.subtract(pendingFiles);
globalNumberOfFailures.add(totalLocalNumberOfFailures);
globalNumberOfRetries.add(totalLocalNumberOfRetries);
BasicCounters<MutableLong> aggregatedCounters = new BasicCounters<MutableLong>(MutableLong.class);
aggregatedCounters.setCounter(AggregatedFileCounters.PROCESSED_FILES, globalProcessedFiles);
aggregatedCounters.setCounter(AggregatedFileCounters.PENDING_FILES, pendingFiles);
aggregatedCounters.setCounter(AggregatedFileCounters.NUMBER_OF_ERRORS, totalLocalNumberOfFailures);
aggregatedCounters.setCounter(AggregatedFileCounters.NUMBER_OF_RETRIES, totalLocalNumberOfRetries);
return aggregatedCounters;
}
}
protected long lastRepartition = 0;
/* List of unfinished files */
protected Queue<FailedFile> unfinishedFiles = new LinkedList<FailedFile>();
/* List of failed files */
protected Queue<FailedFile> failedFiles = new LinkedList<FailedFile>();
protected transient FileSystem fs;
protected transient Configuration configuration;
protected transient long lastScanMillis;
protected transient Path filePath;
protected transient InputStream inputStream;
protected Set<String> pendingFiles = new LinkedHashSet<String>();
public String getDirectory()
{
return directory;
}
public void setDirectory(String directory)
{
this.directory = directory;
}
public DirectoryScanner getScanner()
{
return scanner;
}
public void setScanner(DirectoryScanner scanner)
{
this.scanner = scanner;
}
/**
* Returns the interval, in milliseconds, at which the directory is scanned for new files.
* @return The scan interval in milliseconds.
*/
public int getScanIntervalMillis()
{
return scanIntervalMillis;
}
/**
* Sets the interval, in milliseconds, at which the directory is scanned for new files.
* @param scanIntervalMillis The scan interval in milliseconds.
*/
public void setScanIntervalMillis(int scanIntervalMillis)
{
this.scanIntervalMillis = scanIntervalMillis;
}
/**
* Returns the number of tuples emitted in a batch.
* @return The number of tuples emitted in a batch.
*/
public int getEmitBatchSize()
{
return emitBatchSize;
}
/**
* Sets the number of tuples to emit in a batch.
* @param emitBatchSize The number of tuples to emit in a batch.
*/
public void setEmitBatchSize(int emitBatchSize)
{
this.emitBatchSize = emitBatchSize;
}
/**
* Sets the idempotent storage manager on the operator.
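* The default is a {@link WindowDataManager.NoopWindowDataManager}, which does not record per-window state.
* A minimal sketch of enabling idempotent replay, assuming {@code FSWindowDataManager} (from the same package
* as {@link WindowDataManager}) is available on the classpath:
*
* <pre>
* {@code
* // assumption: org.apache.apex.malhar.lib.wal.FSWindowDataManager is on the classpath
* operator.setWindowDataManager(new FSWindowDataManager());
* }
* </pre>
*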
* @param windowDataManager a {@link WindowDataManager}
*/
public void setWindowDataManager(WindowDataManager windowDataManager)
{
this.windowDataManager = windowDataManager;
}
/**
* Returns the idempotent storage manager which is being used by the operator.
*
* @return the idempotent storage manager.
*/
public WindowDataManager getWindowDataManager()
{
return windowDataManager;
}
/**
* Returns the desired number of partitions.
* @return the desired number of partitions.
*/
public int getPartitionCount()
{
return partitionCount;
}
/**
* Sets the desired number of partitions.
* @param requiredPartitions The desired number of partitions.
*/
public void setPartitionCount(int requiredPartitions)
{
this.partitionCount = requiredPartitions;
}
/**
* Returns the current number of partitions for the operator.
* @return The current number of partitions for the operator.
*/
public int getCurrentPartitions()
{
return currentPartitions;
}
@Override
public void setup(OperatorContext context)
{
operatorId = context.getId();
globalProcessedFileCount.setValue(processedFiles.size());
LOG.debug("Setup processed file count: {}", globalProcessedFileCount);
this.context = context;
try {
filePath = new Path(directory);
configuration = new Configuration();
fs = getFSInstance();
} catch (IOException ex) {
failureHandling(ex);
}
fileCounters.setCounter(FileCounters.GLOBAL_PROCESSED_FILES, globalProcessedFileCount);
fileCounters.setCounter(FileCounters.LOCAL_PROCESSED_FILES, localProcessedFileCount);
fileCounters.setCounter(FileCounters.GLOBAL_NUMBER_OF_FAILURES, globalNumberOfFailures);
fileCounters.setCounter(FileCounters.LOCAL_NUMBER_OF_FAILURES, localNumberOfFailures);
fileCounters.setCounter(FileCounters.GLOBAL_NUMBER_OF_RETRIES, globalNumberOfRetries);
fileCounters.setCounter(FileCounters.LOCAL_NUMBER_OF_RETRIES, localNumberOfRetries);
fileCounters.setCounter(FileCounters.PENDING_FILES, pendingFileCount);
windowDataManager.setup(context);
if (context.getValue(OperatorContext.ACTIVATION_WINDOW_ID) < windowDataManager.getLargestRecoveryWindow()) {
//reset current file and offset in case of replay
currentFile = null;
offset = 0;
}
}
/**
* Override this method to change the FileSystem instance that is used by the operator.
*
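* For example, a subclass could force use of the local file system; a sketch relying on the standard Hadoop
* {@code FileSystem.newInstanceLocal(Configuration)} API:
*
* <pre>
* {@code
* protected FileSystem getFSInstance() throws IOException
* {
*   // sketch: always use the local file system, ignoring the scheme of the configured directory
*   return FileSystem.newInstanceLocal(configuration);
* }
* }
* </pre>
*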
* @return A FileSystem object.
* @throws IOException
*/
protected FileSystem getFSInstance() throws IOException
{
return FileSystem.newInstance(filePath.toUri(), configuration);
}
@Override
public void teardown()
{
IOException savedException = null;
boolean fileFailed = false;
try {
if (inputStream != null) {
inputStream.close();
}
} catch (IOException ex) {
savedException = ex;
fileFailed = true;
}
boolean fsFailed = false;
try {
fs.close();
} catch (IOException ex) {
savedException = ex;
fsFailed = true;
}
if (savedException != null) {
String errorMessage = "";
if (fileFailed) {
errorMessage += "Failed to close " + currentFile + ". ";
}
if (fsFailed) {
errorMessage += "Failed to close filesystem.";
}
throw new RuntimeException(errorMessage, savedException);
}
windowDataManager.teardown();
}
@Override
public void beginWindow(long windowId)
{
currentWindowId = windowId;
if (windowId <= windowDataManager.getLargestRecoveryWindow()) {
replay(windowId);
}
}
@Override
public void endWindow()
{
if (currentWindowId > windowDataManager.getLargestRecoveryWindow()) {
try {
windowDataManager.save(currentWindowRecoveryState, operatorId, currentWindowId);
} catch (IOException e) {
throw new RuntimeException("saving recovery", e);
}
}
currentWindowRecoveryState.clear();
if (context != null) {
pendingFileCount.setValue(pendingFiles.size() + failedFiles.size() + unfinishedFiles.size());
if (currentFile != null) {
pendingFileCount.increment();
}
context.setCounters(fileCounters);
}
}
protected void replay(long windowId)
{
//This operator can partition itself dynamically. When that happens a file can be re-hashed
//to a different partition than the previous one. In order to handle this, the partition loads
//all the recovery data for a window and then processes only those files which would be hashed
//to it in the current run.
try {
Map<Integer, Object> recoveryDataPerOperator = windowDataManager.load(windowId);
for (Object recovery : recoveryDataPerOperator.values()) {
@SuppressWarnings("unchecked")
LinkedList<RecoveryEntry> recoveryData = (LinkedList<RecoveryEntry>)recovery;
for (RecoveryEntry recoveryEntry : recoveryData) {
if (scanner.acceptFile(recoveryEntry.file)) {
//The operator may have continued processing the same file across multiple windows.
//In that case the recovery state of each subsequent window has an entry for the same file with a different offset.
//If so, we continue reading from the previously opened stream.
if (currentFile == null || !(currentFile.equals(recoveryEntry.file) && offset == recoveryEntry.startOffset)) {
if (inputStream != null) {
closeFile(inputStream);
}
processedFiles.add(recoveryEntry.file);
//removing the file from failed and unfinished queues and pending set
Iterator<FailedFile> failedFileIterator = failedFiles.iterator();
while (failedFileIterator.hasNext()) {
FailedFile ff = failedFileIterator.next();
if (ff.path.equals(recoveryEntry.file) && ff.offset == recoveryEntry.startOffset) {
failedFileIterator.remove();
break;
}
}
Iterator<FailedFile> unfinishedFileIterator = unfinishedFiles.iterator();
while (unfinishedFileIterator.hasNext()) {
FailedFile ff = unfinishedFileIterator.next();
if (ff.path.equals(recoveryEntry.file) && ff.offset == recoveryEntry.startOffset) {
unfinishedFileIterator.remove();
break;
}
}
if (pendingFiles.contains(recoveryEntry.file)) {
pendingFiles.remove(recoveryEntry.file);
}
inputStream = retryFailedFile(new FailedFile(recoveryEntry.file, recoveryEntry.startOffset));
while (offset < recoveryEntry.endOffset) {
T line = readEntity();
offset++;
emit(line);
}
} else {
while (offset < recoveryEntry.endOffset) {
T line = readEntity();
offset++;
emit(line);
}
}
}
}
}
} catch (IOException e) {
throw new RuntimeException("replay", e);
}
}
@Override
public void emitTuples()
{
if (currentWindowId <= windowDataManager.getLargestRecoveryWindow()) {
return;
}
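// Choose the next stream to read from, in priority order: resume the current file at its saved
// offset, then unfinished files, then pending files, then failed files; only when all of these
// are empty is the directory scanned for new files.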
if (inputStream == null) {
try {
if (currentFile != null && offset > 0) {
//openFile() resets offset to 0, so save and restore it here.
int tmpOffset = offset;
if (fs.exists(new Path(currentFile))) {
this.inputStream = openFile(new Path(currentFile));
offset = tmpOffset;
skipCount = tmpOffset;
} else {
currentFile = null;
offset = 0;
skipCount = 0;
}
} else if (!unfinishedFiles.isEmpty()) {
retryFailedFile(unfinishedFiles.poll());
} else if (!pendingFiles.isEmpty()) {
String newPathString = pendingFiles.iterator().next();
pendingFiles.remove(newPathString);
if (fs.exists(new Path(newPathString))) {
this.inputStream = openFile(new Path(newPathString));
}
} else if (!failedFiles.isEmpty()) {
retryFailedFile(failedFiles.poll());
} else {
scanDirectory();
}
} catch (IOException ex) {
failureHandling(ex);
}
}
if (inputStream != null) {
int startOffset = offset;
String file = currentFile; //current file is reset to null when closed.
try {
int counterForTuple = 0;
while (counterForTuple++ < emitBatchSize) {
T line = readEntity();
if (line == null) {
LOG.info("done reading file ({} entries).", offset);
closeFile(inputStream);
break;
}
// If skipCount is non zero, then failed file recovery is going on, skipCount is
// used to prevent already emitted records from being emitted again during recovery.
// When failed file is open, skipCount is set to the last read offset for that file.
//
if (skipCount == 0) {
offset++;
emit(line);
} else {
skipCount--;
}
}
} catch (IOException e) {
failureHandling(e);
}
//Record a recovery entry only if something was emitted from the file.
if (offset > startOffset) {
currentWindowRecoveryState.add(new RecoveryEntry(file, startOffset, offset));
}
}
}
/**
* Scans the directory for new files.
*/
protected void scanDirectory()
{
if (System.currentTimeMillis() - scanIntervalMillis >= lastScanMillis) {
Set<Path> newPaths = scanner.scan(fs, filePath, processedFiles);
for (Path newPath : newPaths) {
String newPathString = newPath.toString();
pendingFiles.add(newPathString);
processedFiles.add(newPathString);
localProcessedFileCount.increment();
}
lastScanMillis = System.currentTimeMillis();
}
}
/**
* Helper method for handling IOExceptions.
* @param e The caught IOException.
*/
private void failureHandling(Exception e)
{
localNumberOfFailures.increment();
if (maxRetryCount <= 0) {
throw new RuntimeException(e);
}
LOG.error("FS reader error", e);
addToFailedList();
}
protected void addToFailedList()
{
FailedFile ff = new FailedFile(currentFile, offset, retryCount);
try {
// try to close file
if (this.inputStream != null) {
this.inputStream.close();
}
} catch (IOException e) {
localNumberOfFailures.increment();
LOG.error("Could not close input stream on: " + currentFile);
}
ff.retryCount++;
ff.lastFailedTime = System.currentTimeMillis();
ff.offset = this.offset;
// Clear current file state.
this.currentFile = null;
this.inputStream = null;
if (ff.retryCount > maxRetryCount) {
return;
}
localNumberOfRetries.increment();
LOG.info("adding to failed list path {} offset {} retry {}", ff.path, ff.offset, ff.retryCount);
failedFiles.add(ff);
}
protected InputStream retryFailedFile(FailedFile ff) throws IOException
{
LOG.info("retrying failed file {} offset {} retry {}", ff.path, ff.offset, ff.retryCount);
String path = ff.path;
if (!fs.exists(new Path(path))) {
return null;
}
this.inputStream = openFile(new Path(path));
this.offset = ff.offset;
this.retryCount = ff.retryCount;
this.skipCount = ff.offset;
return this.inputStream;
}
protected InputStream openFile(Path path) throws IOException
{
currentFile = path.toString();
offset = 0;
retryCount = 0;
skipCount = 0;
LOG.info("opening file {}", path);
InputStream input = fs.open(path);
return input;
}
protected void closeFile(InputStream is) throws IOException
{
LOG.info("closing file {} offset {}", currentFile, offset);
if (is != null) {
is.close();
}
currentFile = null;
inputStream = null;
}
@Override
public Collection<Partition<AbstractFileInputOperator<T>>> definePartitions(Collection<Partition<AbstractFileInputOperator<T>>> partitions, PartitioningContext context)
{
lastRepartition = System.currentTimeMillis();
int totalCount = getNewPartitionCount(partitions, context);
LOG.debug("Computed new partitions: {}", totalCount);
if (totalCount == partitions.size()) {
return partitions;
}
AbstractFileInputOperator<T> tempOperator = partitions.iterator().next().getPartitionedInstance();
MutableLong tempGlobalNumberOfRetries = tempOperator.globalNumberOfRetries;
MutableLong tempGlobalNumberOfFailures = tempOperator.globalNumberOfFailures;
/*
* Build collective state from all instances of the operator.
*/
Set<String> totalProcessedFiles = Sets.newHashSet();
Set<FailedFile> currentFiles = Sets.newHashSet();
List<DirectoryScanner> oldscanners = Lists.newLinkedList();
List<FailedFile> totalFailedFiles = Lists.newLinkedList();
List<String> totalPendingFiles = Lists.newLinkedList();
Set<Integer> deletedOperators = Sets.newHashSet();
for (Partition<AbstractFileInputOperator<T>> partition : partitions) {
AbstractFileInputOperator<T> oper = partition.getPartitionedInstance();
totalProcessedFiles.addAll(oper.processedFiles);
totalFailedFiles.addAll(oper.failedFiles);
totalPendingFiles.addAll(oper.pendingFiles);
currentFiles.addAll(oper.unfinishedFiles);
tempGlobalNumberOfRetries.add(oper.localNumberOfRetries);
tempGlobalNumberOfFailures.add(oper.localNumberOfFailures);
if (oper.currentFile != null) {
currentFiles.add(new FailedFile(oper.currentFile, oper.offset));
}
oldscanners.add(oper.getScanner());
deletedOperators.add(oper.operatorId);
}
/*
* Create partitions of scanners, scanner's partition method will do state
* transfer for DirectoryScanner objects.
*/
List<DirectoryScanner> scanners = scanner.partition(totalCount, oldscanners);
Collection<Partition<AbstractFileInputOperator<T>>> newPartitions = Lists.newArrayListWithExpectedSize(totalCount);
Collection<WindowDataManager> newManagers = Lists.newArrayListWithExpectedSize(totalCount);
KryoCloneUtils<AbstractFileInputOperator<T>> cloneUtils = KryoCloneUtils.createCloneUtils(this);
for (int i = 0; i < scanners.size(); i++) {
@SuppressWarnings("unchecked")
AbstractFileInputOperator<T> oper = cloneUtils.getClone();
DirectoryScanner scn = scanners.get(i);
oper.setScanner(scn);
// Do state transfer for processed files.
oper.processedFiles.addAll(totalProcessedFiles);
oper.globalNumberOfFailures = tempGlobalNumberOfFailures;
oper.localNumberOfFailures.setValue(0);
oper.globalNumberOfRetries = tempGlobalNumberOfRetries;
oper.localNumberOfRetries.setValue(0);
/* redistribute unfinished files properly */
oper.unfinishedFiles.clear();
oper.currentFile = null;
oper.offset = 0;
Iterator<FailedFile> unfinishedIter = currentFiles.iterator();
while (unfinishedIter.hasNext()) {
FailedFile unfinishedFile = unfinishedIter.next();
if (scn.acceptFile(unfinishedFile.path)) {
oper.unfinishedFiles.add(unfinishedFile);
unfinishedIter.remove();
}
}
/* transfer failed files */
oper.failedFiles.clear();
Iterator<FailedFile> iter = totalFailedFiles.iterator();
while (iter.hasNext()) {
FailedFile ff = iter.next();
if (scn.acceptFile(ff.path)) {
oper.failedFiles.add(ff);
iter.remove();
}
}
/* redistribute pending files properly */
oper.pendingFiles.clear();
Iterator<String> pendingFilesIterator = totalPendingFiles.iterator();
while (pendingFilesIterator.hasNext()) {
String pathString = pendingFilesIterator.next();
if (scn.acceptFile(pathString)) {
oper.pendingFiles.add(pathString);
pendingFilesIterator.remove();
}
}
newPartitions.add(new DefaultPartition<AbstractFileInputOperator<T>>(oper));
newManagers.add(oper.windowDataManager);
}
windowDataManager.partitioned(newManagers, deletedOperators);
LOG.info("definePartitions called returning {} partitions", newPartitions.size());
return newPartitions;
}
protected int getNewPartitionCount(Collection<Partition<AbstractFileInputOperator<T>>> partitions, PartitioningContext context)
{
return DefaultPartition.getRequiredPartitionCount(context, this.partitionCount);
}
@Override
public void partitioned(Map<Integer, Partition<AbstractFileInputOperator<T>>> partitions)
{
currentPartitions = partitions.size();
}
@Override
public void checkpointed(long windowId)
{
}
@Override
public void committed(long windowId)
{
try {
windowDataManager.deleteUpTo(operatorId, windowId);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Read the next item from the stream. Depending on the type of stream, this could be a byte array, line or object.
* Upon return of null, the stream will be considered fully consumed.
*
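* A line-oriented implementation might look like the sketch below; it mirrors {@link FileLineInputOperator},
* where {@code br} is a {@code BufferedReader} wrapped around the stream opened in {@code openFile(Path)}:
*
* <pre>
* {@code
* protected String readEntity() throws IOException
* {
*   // readLine() returns null at end of file, which signals that the stream is consumed
*   return br.readLine();
* }
* }
* </pre>
*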
* @return Depending on the type of stream an object is returned. When null is returned the stream is consumed.
* @throws IOException
*/
protected abstract T readEntity() throws IOException;
/**
* Emit the tuple on the port
*
* @param tuple
*/
protected abstract void emit(T tuple);
/**
* Repartitioning is required when the current number of partitions is not equal to the
* required number of partitions.
* @param batchedOperatorStats the stats to use when repartitioning.
* @return Returns the stats listener response.
*/
@Override
public Response processStats(BatchedOperatorStats batchedOperatorStats)
{
Response res = new Response();
res.repartitionRequired = false;
if (currentPartitions != partitionCount) {
LOG.info("processStats: trying repartition of input operator current {} required {}", currentPartitions, partitionCount);
res.repartitionRequired = true;
}
return res;
}
/**
* Returns the maximum number of times the operator will attempt to process
* a file on which it encounters an error.
* @return The maximum number of times the operator will attempt to process a
* file on which it encounters an error.
*/
public int getMaxRetryCount()
{
return maxRetryCount;
}
/**
* Sets the maximum number of times the operator will attempt to process
* a file on which it encounters an error.
* @param maxRetryCount The maximum number of times the operator will attempt
* to process a file on which it encounters an error.
*/
public void setMaxRetryCount(int maxRetryCount)
{
this.maxRetryCount = maxRetryCount;
}
/**
* The class that is used to scan for new files in the directory for the
* AbstractFSDirectoryInputOperator.
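*
* A typical configuration restricts the scan to files matching a regular expression; a small sketch
* (the pattern and operator variable are illustrative):
*
* <pre>
* {@code
* DirectoryScanner scanner = new DirectoryScanner();
* scanner.setFilePatternRegexp(".*\\.log$");  // only pick up files ending in .log
* operator.setScanner(scanner);
* }
* </pre>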
*/
public static class DirectoryScanner implements Serializable
{
private static final long serialVersionUID = 4535844463258899929L;
private String filePatternRegexp;
private transient Pattern regex = null;
private int partitionIndex;
private int partitionCount;
protected final transient HashSet<String> ignoredFiles = new HashSet<String>();
public String getFilePatternRegexp()
{
return filePatternRegexp;
}
public void setFilePatternRegexp(String filePatternRegexp)
{
this.filePatternRegexp = filePatternRegexp;
this.regex = null;
}
public int getPartitionCount()
{
return partitionCount;
}
public int getPartitionIndex()
{
return partitionIndex;
}
protected Pattern getRegex()
{
if (this.regex == null && this.filePatternRegexp != null) {
this.regex = Pattern.compile(this.filePatternRegexp);
}
return this.regex;
}
public LinkedHashSet<Path> scan(FileSystem fs, Path filePath, Set<String> consumedFiles)
{
LinkedHashSet<Path> pathSet = Sets.newLinkedHashSet();
try {
LOG.debug("Scanning {} with pattern {}", filePath, this.filePatternRegexp);
FileStatus[] files = fs.listStatus(filePath);
for (FileStatus status : files) {
Path path = status.getPath();
String filePathStr = path.toString();
if (consumedFiles.contains(filePathStr)) {
continue;
}
if (ignoredFiles.contains(filePathStr)) {
continue;
}
if (acceptFile(filePathStr)) {
LOG.debug("Found {}", filePathStr);
pathSet.add(path);
} else {
// don't look at it again
ignoredFiles.add(filePathStr);
}
}
} catch (FileNotFoundException e) {
LOG.warn("Failed to list directory {}", filePath, e);
} catch (IOException e) {
throw new RuntimeException(e);
}
return pathSet;
}
protected boolean acceptFile(String filePathStr)
{
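// Assign each file to exactly one partition: hash the file path and take the non-negative
// modulus with the partition count; only the matching partition index accepts the file.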
if (partitionCount > 1) {
int i = filePathStr.hashCode();
int mod = i % partitionCount;
if (mod < 0) {
mod += partitionCount;
}
LOG.debug("partition {} {} {} {}", partitionIndex, filePathStr, i, mod);
if (mod != partitionIndex) {
return false;
}
}
Pattern regex = this.getRegex();
if (regex != null) {
Matcher matcher = regex.matcher(filePathStr);
if (!matcher.matches()) {
return false;
}
}
return true;
}
public List<DirectoryScanner> partition(int count)
{
ArrayList<DirectoryScanner> partitions = Lists.newArrayListWithExpectedSize(count);
for (int i = 0; i < count; i++) {
partitions.add(this.createPartition(i, count));
}
return partitions;
}
public List<DirectoryScanner> partition(int count, @SuppressWarnings("unused") Collection<DirectoryScanner> scanners)
{
return partition(count);
}
protected DirectoryScanner createPartition(int partitionIndex, int partitionCount)
{
DirectoryScanner that = new DirectoryScanner();
that.filePatternRegexp = this.filePatternRegexp;
that.regex = this.regex;
that.partitionIndex = partitionIndex;
that.partitionCount = partitionCount;
return that;
}
@Override
public String toString()
{
return "DirectoryScanner [filePatternRegexp=" + filePatternRegexp + " partitionIndex=" +
partitionIndex + " partitionCount=" + partitionCount + "]";
}
}
protected static class RecoveryEntry
{
final String file;
final int startOffset;
final int endOffset;
@SuppressWarnings("unused")
private RecoveryEntry()
{
file = null;
startOffset = -1;
endOffset = -1;
}
RecoveryEntry(String file, int startOffset, int endOffset)
{
this.file = Preconditions.checkNotNull(file, "file");
this.startOffset = startOffset;
this.endOffset = endOffset;
}
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
if (!(o instanceof RecoveryEntry)) {
return false;
}
RecoveryEntry that = (RecoveryEntry)o;
if (endOffset != that.endOffset) {
return false;
}
if (startOffset != that.startOffset) {
return false;
}
return file.equals(that.file);
}
@Override
public int hashCode()
{
int result = file.hashCode();
result = 31 * result + startOffset;
result = 31 * result + endOffset;
return result;
}
}
/**
* This class is deprecated, use {@link LineByLineFileInputOperator}
*
* This is an implementation of the {@link AbstractFileInputOperator} that outputs the lines in a file.
* Each line is emitted as a separate String tuple.
*
*
* The directory to scan and read files from is specified using the {@link #directory} property.
*
* @deprecated
* @displayName File Line Input
* @category Input
* @tags fs, file, line, lines, input operator
*
*/
public static class FileLineInputOperator extends AbstractFileInputOperator<String>
{
public final transient DefaultOutputPort<String> output = new DefaultOutputPort<String>();
protected transient BufferedReader br;
@Override
protected InputStream openFile(Path path) throws IOException
{
InputStream is = super.openFile(path);
br = new BufferedReader(new InputStreamReader(is));
return is;
}
@Override
protected void closeFile(InputStream is) throws IOException
{
super.closeFile(is);
br.close();
br = null;
}
@Override
protected String readEntity() throws IOException
{
return br.readLine();
}
@Override
protected void emit(String tuple)
{
output.emit(tuple);
}
}
}