// com.datatorrent.lib.io.fs.FileSplitterInput Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.datatorrent.lib.io.fs;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import javax.validation.Valid;
import javax.validation.constraints.Min;
import javax.validation.constraints.NotNull;
import javax.validation.constraints.Size;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.apex.malhar.lib.wal.WindowDataManager;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.datatorrent.api.Component;
import com.datatorrent.api.Context;
import com.datatorrent.api.InputOperator;
import com.datatorrent.api.Operator;
import com.datatorrent.api.annotation.OperatorAnnotation;
import com.datatorrent.api.annotation.Stateless;
import com.datatorrent.netlet.util.DTThrowable;
/**
* Input operator that scans a directory for files and splits a file into blocks.
* The operator emits block metadata and file metadata.
*
* The file system/directory space should be different for different partitions of file splitter.
* The scanning of the input directories is performed by {@link TimeBasedDirectoryScanner} on a separate thread.
*
* @displayName File Splitter
* @category Input
* @tags file
* @since 2.0.0
*/
@OperatorAnnotation(checkpointableWithinAppWindow = false)
public class FileSplitterInput extends AbstractFileSplitter implements InputOperator, Operator.CheckpointListener
{
@NotNull
private WindowDataManager windowDataManager;
@NotNull
protected final transient LinkedList currentWindowRecoveryState;
@Valid
@NotNull
private TimeBasedDirectoryScanner scanner;
@NotNull
private Map> referenceTimes;
private transient long sleepMillis;
/**
 * Creates a splitter with a default {@link TimeBasedDirectoryScanner}, no recorded reference times,
 * a no-op window data manager and an empty per-window recovery buffer.
 */
public FileSplitterInput()
{
  super();
  this.scanner = new TimeBasedDirectoryScanner();
  this.referenceTimes = Maps.newHashMap();
  this.windowDataManager = new WindowDataManager.NoopWindowDataManager();
  this.currentWindowRecoveryState = Lists.newLinkedList();
}
/**
 * Sets up the scanner, the window data manager and the parent splitter. Scanning is started here only
 * when the window data manager reports no recovery window, or when the activation window lies beyond the
 * largest recovery window; otherwise it is started by {@link #replay(long)}.
 *
 * @param context operator context
 */
@Override
public void setup(Context.OperatorContext context)
{
  sleepMillis = context.getValue(Context.OperatorContext.SPIN_MILLIS);
  scanner.setup(context);
  windowDataManager.setup(context);
  super.setup(context);

  final long lastRecoveryWindow = windowDataManager.getLargestRecoveryWindow();
  final boolean startNow = lastRecoveryWindow == Stateless.WINDOW_ID
      || context.getValue(Context.OperatorContext.ACTIVATION_WINDOW_ID) > lastRecoveryWindow;
  if (startNow) {
    scanner.startScanning(Collections.unmodifiableMap(referenceTimes));
  }
}
/**
 * Begins the window and, if the window is not past the largest recovery window, replays it.
 *
 * @param windowId id of the window that is starting
 */
@Override
public void beginWindow(long windowId)
{
  super.beginWindow(windowId);
  final boolean withinRecovery = windowId <= windowDataManager.getLargestRecoveryWindow();
  if (withinRecovery) {
    replay(windowId);
  }
}
/**
 * Re-emits the file and block metadata that was saved for {@code windowId}. Once the largest recovery
 * window has been replayed, the scanner is started with the reference times rebuilt so far.
 *
 * @param windowId id of the window to replay
 * @throws RuntimeException wrapping the {@link IOException} if the saved state cannot be loaded or processed
 */
protected void replay(long windowId)
{
  try {
    @SuppressWarnings("unchecked")
    LinkedList<ScannedFileInfo> recoveredData =
        (LinkedList<ScannedFileInfo>)windowDataManager.load(operatorId, windowId);
    if (recoveredData == null) {
      //This could happen when there are multiple physical instances and one of them is ahead in processing windows.
      return;
    }
    // finish emitting blocks of a file that was still in progress before moving to the recovered files
    if (blockMetadataIterator != null) {
      emitBlockMetadata();
    }
    for (ScannedFileInfo info : recoveredData) {
      updateReferenceTimes(info);
      FileMetadata fileMetadata = buildFileMetadata(info);
      filesMetadataOutput.emit(fileMetadata);
      blockMetadataIterator = new BlockMetadataIterator(this, fileMetadata, blockSize);
      if (!emitBlockMetadata()) {
        break;
      }
    }
  } catch (IOException e) {
    throw new RuntimeException("replay", e);
  }
  if (windowId == windowDataManager.getLargestRecoveryWindow()) {
    scanner.startScanning(Collections.unmodifiableMap(referenceTimes));
  }
}
/**
 * Processes discovered files outside of recovery windows. A failure recorded by the scanner thread is
 * rethrown here; when there is neither a pending block iterator nor a discovered file, the call sleeps
 * for {@code sleepMillis} before processing.
 *
 * @throws RuntimeException if the thread is interrupted while waiting for work
 */
@Override
public void emitTuples()
{
  if (currentWindowId <= windowDataManager.getLargestRecoveryWindow()) {
    return;
  }
  Throwable throwable = scanner.atomicThrowable.get();
  if (throwable != null) {
    DTThrowable.rethrow(throwable);
  }
  if (blockMetadataIterator == null && scanner.discoveredFiles.isEmpty()) {
    try {
      Thread.sleep(sleepMillis);
    } catch (InterruptedException e) {
      // keep the interrupt visible to the caller; catching the exception clears the flag
      Thread.currentThread().interrupt();
      throw new RuntimeException("waiting for work", e);
    }
  }
  process();
}
/**
 * Takes the next discovered file from the scanner.
 *
 * @return whatever {@link TimeBasedDirectoryScanner#pollFile()} returns
 */
@Override
protected FileInfo getFileInfo()
{
  final FileInfo nextFile = scanner.pollFile();
  return nextFile;
}
/**
 * Records the file in the current window's recovery state and in the reference times, then delegates
 * to the parent implementation.
 *
 * @param fileInfo file to process; cast to {@link ScannedFileInfo}
 * @return result of the parent implementation
 */
@Override
protected boolean processFileInfo(FileInfo fileInfo)
{
  final ScannedFileInfo scanned = (ScannedFileInfo)fileInfo;
  currentWindowRecoveryState.add(scanned);
  updateReferenceTimes(scanned);
  return super.processFileInfo(fileInfo);
}
/**
 * Records the modification time of {@code fileInfo} under its directory path in the reference times,
 * creating the per-directory map on first use.
 *
 * @param fileInfo file whose modification time is recorded
 */
protected void updateReferenceTimes(ScannedFileInfo fileInfo)
{
  Map<String, Long> referenceTimePerInputDir = referenceTimes.get(fileInfo.getDirectoryPath());
  if (referenceTimePerInputDir == null) {
    referenceTimePerInputDir = Maps.newHashMap();
  }
  // the entry is added before the per-directory map is (re)registered, as in the original ordering
  referenceTimePerInputDir.put(fileInfo.getFilePath(), fileInfo.modifiedTime);
  referenceTimes.put(fileInfo.getDirectoryPath(), referenceTimePerInputDir);
}
/**
 * Saves the files handed out in this window when the window is past the largest recovery window, then
 * clears the per-window recovery state.
 *
 * @throws RuntimeException wrapping the {@link IOException} if saving fails
 */
@Override
public void endWindow()
{
  final boolean pastRecovery = currentWindowId > windowDataManager.getLargestRecoveryWindow();
  if (pastRecovery) {
    try {
      windowDataManager.save(currentWindowRecoveryState, operatorId, currentWindowId);
    } catch (IOException saveFailure) {
      throw new RuntimeException("saving recovery", saveFailure);
    }
  }
  currentWindowRecoveryState.clear();
}
/**
 * Asks the scanner's file system for the default block size of the first configured input path.
 *
 * @return default block size reported by the file system
 */
@Override
protected long getDefaultBlockSize()
{
  final Path firstInput = new Path(scanner.files.iterator().next());
  return scanner.fs.getDefaultBlockSize(firstInput);
}
/**
 * Looks up the status of {@code path} on the scanner's file system.
 *
 * @param path path to look up
 * @return status of the path
 * @throws IOException if the file system lookup fails
 */
@Override
protected FileStatus getFileStatus(Path path) throws IOException
{
  final FileSystem scannerFs = scanner.fs;
  return scannerFs.getFileStatus(path);
}
/**
 * Checkpoint notification; this operator takes no action.
 *
 * @param l id of the checkpointed window
 */
@Override
public void checkpointed(long l)
{
  // intentionally empty
}
/**
 * Asks the window data manager to delete the state saved for windows up to {@code l}.
 *
 * @param l id of the committed window
 * @throws RuntimeException wrapping the {@link IOException} if the deletion fails
 */
@Override
public void committed(long l)
{
  try {
    windowDataManager.deleteUpTo(operatorId, l);
  } catch (IOException e) {
    // carry a context message like the other wrapped I/O failures in this class
    throw new RuntimeException("deleting recovery up to window " + l, e);
  }
}
/**
 * Tears down the scanner and then the window data manager, which was set up in
 * {@link #setup(Context.OperatorContext)}. The window data manager is torn down even if the scanner
 * teardown throws.
 */
@Override
public void teardown()
{
  try {
    scanner.teardown();
  } finally {
    windowDataManager.teardown();
  }
}
/**
 * Replaces the window data manager used to save and replay per-window state.
 *
 * @param windowDataManager window data manager to use
 */
public void setWindowDataManager(WindowDataManager windowDataManager)
{
  this.windowDataManager = windowDataManager;
}
/**
 * @return the window data manager currently in use
 */
public WindowDataManager getWindowDataManager()
{
  return windowDataManager;
}
/**
 * Replaces the directory scanner.
 *
 * @param scanner scanner to use
 */
public void setScanner(TimeBasedDirectoryScanner scanner)
{
  this.scanner = scanner;
}
/**
 * @return the directory scanner currently in use
 */
public TimeBasedDirectoryScanner getScanner()
{
  return scanner;
}
public static class TimeBasedDirectoryScanner implements Runnable, Component
{
private static long DEF_SCAN_INTERVAL_MILLIS = 5000;
private static String FILE_BEING_COPIED = "_COPYING_";
private boolean recursive;
private transient volatile boolean trigger;
@NotNull
@Size(min = 1)
private final Set files;
@Min(0)
private long scanIntervalMillis;
private String filePatternRegularExp;
private String ignoreFilePatternRegularExp;
protected transient long lastScanMillis;
protected transient FileSystem fs;
protected final transient LinkedBlockingDeque discoveredFiles;
protected final transient ExecutorService scanService;
protected final transient AtomicReference atomicThrowable;
private transient volatile boolean running;
protected final transient HashSet ignoredFiles;
protected transient Pattern regex;
private transient Pattern ignoreRegex;
protected transient long sleepMillis;
protected transient Map> referenceTimes;
private transient ScannedFileInfo lastScannedInfo;
private transient int numDiscoveredPerIteration;
/**
 * Creates a recursive scanner with the default scan interval, no input paths, a single-threaded scan
 * service and empty discovered/ignored collections.
 */
public TimeBasedDirectoryScanner()
{
  this.recursive = true;
  this.scanIntervalMillis = DEF_SCAN_INTERVAL_MILLIS;
  this.files = Sets.newLinkedHashSet();
  this.scanService = Executors.newSingleThreadExecutor();
  this.discoveredFiles = new LinkedBlockingDeque<>();
  this.atomicThrowable = new AtomicReference<>();
  this.ignoredFiles = Sets.newHashSet();
}
/**
 * Reads the spin interval, compiles the configured accept/ignore patterns and opens the file system.
 *
 * @param context operator context
 * @throws RuntimeException wrapping the {@link IOException} if the file system cannot be opened
 */
@Override
public void setup(Context.OperatorContext context)
{
  sleepMillis = context.getValue(Context.OperatorContext.SPIN_MILLIS);

  final String acceptExpression = filePatternRegularExp;
  if (acceptExpression != null) {
    regex = Pattern.compile(acceptExpression);
  }
  final String ignoreExpression = this.ignoreFilePatternRegularExp;
  if (ignoreExpression != null) {
    ignoreRegex = Pattern.compile(ignoreExpression);
  }

  try {
    fs = getFSInstance();
  } catch (IOException openFailure) {
    throw new RuntimeException("opening fs", openFailure);
  }
}
protected void startScanning(Map> referenceTimes)
{
this.referenceTimes = Preconditions.checkNotNull(referenceTimes);
scanService.submit(this);
}
/**
 * Requests the scan loop to stop by clearing the {@code running} flag, which the loop in
 * {@link #run()} checks on each pass.
 */
protected void stopScanning()
{
  this.running = false;
}
/**
 * Stops the scan loop, shuts the scan service down and closes the file system if one was opened.
 *
 * @throws RuntimeException wrapping the {@link IOException} if closing the file system fails
 */
@Override
public void teardown()
{
  stopScanning();
  scanService.shutdownNow();
  if (fs == null) {
    // setup never got as far as opening the file system; nothing to close
    return;
  }
  try {
    fs.close();
  } catch (IOException e) {
    throw new RuntimeException("closing fs", e);
  }
}
/**
 * Opens a new file system instance for the URI of the first configured input path.
 *
 * @return new file system instance
 * @throws IOException if the file system cannot be created
 */
protected FileSystem getFSInstance() throws IOException
{
  final Path firstInput = new Path(files.iterator().next());
  final URI fsUri = firstInput.toUri();
  return FileSystem.newInstance(fsUri, new Configuration());
}
/**
 * Scan loop. While running, a scan iteration over all input paths is started when a scan was triggered
 * or the scan interval has elapsed, and the last file queued by the previous iteration has been recorded
 * in the reference times; otherwise the loop sleeps for {@code sleepMillis}.
 *
 * Any failure is logged, stored in {@code atomicThrowable} and rethrown. The exception is an
 * interruption that arrives after {@link #stopScanning()} cleared the running flag (which is what
 * {@link #teardown()} does before {@code shutdownNow()}): that is treated as a normal stop.
 */
@Override
public void run()
{
  running = true;
  try {
    while (running) {
      if ((trigger || (System.currentTimeMillis() - scanIntervalMillis >= lastScanMillis)) && isIterationCompleted()) {
        trigger = false;
        lastScannedInfo = null;
        numDiscoveredPerIteration = 0;
        for (String afile : files) {
          // NOTE(review): the lookup key is a local absolute path while reference times are keyed by
          // ScannedFileInfo#getDirectoryPath(); confirm these agree for fully qualified (URI) inputs.
          String filePath = new File(afile).getAbsolutePath();
          LOG.debug("Scan started for input {}", filePath);
          Map<String, Long> lastModifiedTimesForInputDir = referenceTimes.get(filePath);
          scan(new Path(afile), null, lastModifiedTimesForInputDir);
        }
        scanIterationComplete();
      } else {
        Thread.sleep(sleepMillis);
      }
    }
  } catch (Throwable throwable) {
    if (throwable instanceof InterruptedException && !running) {
      // interrupted because a stop was requested: not a failure
      Thread.currentThread().interrupt();
      LOG.debug("scan interrupted after stop request");
      return;
    }
    LOG.error("service", throwable);
    running = false;
    atomicThrowable.set(throwable);
    DTThrowable.rethrow(throwable);
  }
}
/**
 * Checks whether the files queued by the last scan iteration have been processed by the operator thread,
 * judged by whether the last queued file appears in the reference times.
 *
 * @return true if nothing was queued since the last iteration started, or the last queued file has a
 *         recorded reference time; false otherwise
 */
private boolean isIterationCompleted()
{
  if (lastScannedInfo == null) { // no file queued since the iteration counters were last reset
    return true;
  }
  Map<String, Long> processedInDirectory = referenceTimes.get(lastScannedInfo.getDirectoryPath());
  return processedInDirectory != null && processedInDirectory.get(lastScannedInfo.getFilePath()) != null;
}
/**
 * Called once a scan iteration is complete: logs the previous completion time together with the
 * number of files discovered in this iteration, then records the current time as the last scan time.
 */
protected void scanIterationComplete()
{
  final long previousScanMillis = lastScanMillis;
  LOG.debug("scan complete {} {}", previousScanMillis, numDiscoveredPerIteration);
  lastScanMillis = System.currentTimeMillis();
}
/**
 * Scans {@code filePath}, comparing against the reference times recorded under the path component of
 * its URI.
 *
 * @param filePath path to scan
 * @param rootPath root path passed through to the scan
 */
protected void scan(@NotNull Path filePath, Path rootPath)
{
  Map<String, Long> lastModifiedTimesForInputDir = referenceTimes.get(filePath.toUri().getPath());
  scan(filePath, rootPath, lastModifiedTimesForInputDir);
}
/**
 * Lists {@code filePath} and queues its children that are new or modified, descending into
 * sub-directories when {@link #isRecursive()} is true.
 *
 * @param filePath path to list
 * @param rootPath input path the scan started from; null while {@code filePath} is itself the input path
 * @param lastModifiedTimesForInputDir path -> modification time of entries already processed; may be null
 * @throws RuntimeException wrapping any {@link IOException} other than {@link FileNotFoundException}
 */
private void scan(Path filePath, Path rootPath, Map lastModifiedTimesForInputDir)
{
try {
FileStatus parentStatus = fs.getFileStatus(filePath);
String parentPathStr = filePath.toUri().getPath();
LOG.debug("scan {}", parentPathStr);
FileStatus[] childStatuses = fs.listStatus(filePath);
// An input path without children is queued itself, unless it already has a recorded time.
if (childStatuses.length == 0 && rootPath == null && (lastModifiedTimesForInputDir == null || lastModifiedTimesForInputDir.get(parentPathStr) == null)) { // empty input directory copy as is
ScannedFileInfo info = new ScannedFileInfo(null, filePath.toString(), parentStatus.getModificationTime());
processDiscoveredFile(info);
}
for (FileStatus childStatus : childStatuses) {
Path childPath = childStatus.getPath();
String childPathStr = childPath.toUri().getPath();
if (childStatus.isDirectory() && isRecursive()) {
// Directories bypass acceptFile(); the first level below the input path fixes the root for the recursion.
addToDiscoveredFiles(rootPath, parentStatus, childStatus, lastModifiedTimesForInputDir);
scan(childPath, rootPath == null ? parentStatus.getPath() : rootPath, lastModifiedTimesForInputDir);
} else if (acceptFile(childPathStr)) {
addToDiscoveredFiles(rootPath, parentStatus, childStatus, lastModifiedTimesForInputDir);
} else {
// don't look at it again
ignoredFiles.add(childPathStr);
}
}
} catch (FileNotFoundException fnf) {
// A path that does not exist is only logged; the scan continues with the caller's next entry.
LOG.warn("Failed to list directory {}", filePath, fnf);
} catch (IOException e) {
throw new RuntimeException("listing files", e);
}
}
/**
 * Queues {@code childStatus} as a discovered file unless it is unmodified, is a directory that was
 * already recorded, or is in the ignored set.
 *
 * @param rootPath input path the scan started from; may be null
 * @param parentStatus status of the directory being listed
 * @param childStatus status of the candidate file or directory
 * @param lastModifiedTimesForInputDir path -> modification time of entries already processed; may be null
 * @throws IOException declared by {@link #skipFile(Path, Long, Long)}
 */
private void addToDiscoveredFiles(Path rootPath, FileStatus parentStatus, FileStatus childStatus,
    Map<String, Long> lastModifiedTimesForInputDir) throws IOException
{
  Path childPath = childStatus.getPath();
  String childPathStr = childPath.toUri().getPath();
  // Directory by now is scanned forcibly. Now check for whether file/directory needs to be added to discoveredFiles.
  Long oldModificationTime = null;
  if (lastModifiedTimesForInputDir != null) {
    oldModificationTime = lastModifiedTimesForInputDir.get(childPathStr);
  }
  if (skipFile(childPath, childStatus.getModificationTime(), oldModificationTime) || // Skip dir or file if no timestamp modification
      (childStatus.isDirectory() && (oldModificationTime != null))) { // If timestamp modified but if its a directory and already present in map, then skip.
    return;
  }
  if (ignoredFiles.contains(childPathStr)) {
    return;
  }
  ScannedFileInfo info = createScannedFileInfo(parentStatus.getPath(), parentStatus, childPath, childStatus,
      rootPath);
  // parameterized so the message is only built when debug logging is enabled
  LOG.debug("Processing file: {}", info.getFilePath());
  processDiscoveredFile(info);
}
/**
 * Counts the file for the current iteration, remembers it as the last scanned file and queues it in
 * {@code discoveredFiles}.
 *
 * @param info discovered file
 */
protected void processDiscoveredFile(ScannedFileInfo info)
{
  numDiscoveredPerIteration += 1;
  this.lastScannedInfo = info;
  this.discoveredFiles.add(info);
}
/**
 * Builds the {@link ScannedFileInfo} for a child entry.
 * With a root path, the directory is the root and the file path is the child relative to the root.
 * Without a root path, the directory is the parent and the file path is the child name when the parent
 * is a directory; otherwise there is no directory and the file path is the child's full path.
 *
 * @param parentPath path of the parent
 * @param parentStatus status of the parent
 * @param childPath path of the child
 * @param childStatus status of the child; supplies the modification time
 * @param rootPath input path the scan started from; may be null
 * @return file info for the child
 */
protected ScannedFileInfo createScannedFileInfo(Path parentPath, FileStatus parentStatus, Path childPath,
    FileStatus childStatus, Path rootPath)
{
  if (rootPath != null) {
    final URI relativeChildURI = rootPath.toUri().relativize(childPath.toUri());
    return new ScannedFileInfo(rootPath.toUri().getPath(), relativeChildURI.getPath(),
        childStatus.getModificationTime());
  }
  if (parentStatus.isDirectory()) {
    return new ScannedFileInfo(parentPath.toUri().getPath(), childPath.getName(),
        childStatus.getModificationTime());
  }
  return new ScannedFileInfo(null, childPath.toUri().getPath(), childStatus.getModificationTime());
}
/**
 * Decides from modification times whether a file/directory can be skipped.
 *
 * @param path file path (unused)
 * @param modificationTime current modification time
 * @param lastModificationTime previously recorded modification time; null if none was recorded
 * @return true when a recorded time exists and the current time is not later than it; false otherwise
 * @throws IOException never thrown by this implementation; declared in the signature
 */
protected static boolean skipFile(@SuppressWarnings("unused") @NotNull Path path, @NotNull Long modificationTime,
    Long lastModificationTime) throws IOException
{
  if (lastModificationTime == null) {
    return false;
  }
  return modificationTime <= lastModificationTime;
}
/**
 * Decides whether a path is accepted for splitting. A path is rejected when the file system scheme is
 * "hdfs" and the path ends with the being-copied suffix, when an accept pattern is set and the whole
 * path does not match it, or when an ignore pattern is set and the whole path matches it.
 *
 * @param filePathStr file path
 * @return true if the path is accepted; false otherwise
 */
protected boolean acceptFile(String filePathStr)
{
  final boolean stillBeingCopied =
      fs.getScheme().equalsIgnoreCase("hdfs") && filePathStr.endsWith(FILE_BEING_COPIED);
  if (stillBeingCopied) {
    return false;
  }
  if (regex != null && !regex.matcher(filePathStr).matches()) {
    return false;
  }
  return ignoreRegex == null || !ignoreRegex.matcher(filePathStr).matches();
}
/**
 * Removes and returns the head of the discovered-files queue without waiting.
 *
 * @return next discovered file, or null if the queue is empty
 */
public FileInfo pollFile()
{
  final FileInfo head = discoveredFiles.poll();
  return head;
}
/**
 * @return number of files queued since the current scan iteration started
 */
protected int getNumDiscoveredPerIteration()
{
  return this.numDiscoveredPerIteration;
}
/**
 * Returns the regular expression that paths must match to be split.
 *
 * @return regular expression; may be null
 */
public String getFilePatternRegularExp()
{
  return this.filePatternRegularExp;
}
/**
 * Sets the java regular expression that a path must match in full to be split. The expression is
 * compiled in {@link #setup(Context.OperatorContext)}.
 *
 * @param filePatternRegexp regular expression
 */
public void setFilePatternRegularExp(String filePatternRegexp)
{
  filePatternRegularExp = filePatternRegexp;
}
/**
 * Returns the regular expression for ignored files.
 *
 * @return regular expression; may be null
 */
public String getIgnoreFilePatternRegularExp()
{
  return this.ignoreFilePatternRegularExp;
}
/**
 * Sets the regular expression for paths that should be ignored. The expression is compiled in
 * {@link #setup(Context.OperatorContext)}.
 *
 * @param ignoreFilePatternRegex regular expression for files that will be ignored.
 */
public void setIgnoreFilePatternRegularExp(String ignoreFilePatternRegex)
{
  ignoreFilePatternRegularExp = ignoreFilePatternRegex;
}
/**
 * Adds the directories of a comma separated list to the paths to scan; empty entries are dropped.
 * If the path is not fully qualified the default file system is used. A fully qualified path can be
 * provided to scan directories in other filesystems.
 *
 * @param files comma separated list of paths
 */
public void setFiles(String files)
{
  final Iterable<String> inputs = Splitter.on(",").omitEmptyStrings().split(files);
  Iterables.addAll(this.files, inputs);
}
/**
 * Returns the paths to be scanned, joined with commas.
 *
 * @return comma separated list of paths
 */
public String getFiles()
{
  final Joiner commaJoiner = Joiner.on(",");
  return commaJoiner.join(this.files);
}
/**
 * Sets whether the scan descends into sub-directories.
 *
 * @param recursive true if recursive; false otherwise.
 */
public void setRecursive(boolean recursive)
{
  this.recursive = recursive;
}
/**
 * Tells whether the scan descends into sub-directories.
 *
 * @return true if recursive; false otherwise.
 */
public boolean isRecursive()
{
  return recursive;
}
/**
 * Sets the trigger; when true, the scan loop starts a scan without waiting for the scan interval.
 *
 * @param trigger true to request a scan
 */
public void setTrigger(boolean trigger)
{
  this.trigger = trigger;
}
/**
 * Returns the trigger which will initiate scan.
 *
 * @return current value of the trigger
 */
public boolean isTrigger()
{
  return trigger;
}
/**
 * Returns the interval, in milliseconds, between scans for new files.
 *
 * @return The scan interval in milliseconds.
 */
public long getScanIntervalMillis()
{
  return this.scanIntervalMillis;
}
/**
 * Sets the interval, in milliseconds, between scans for new files.
 *
 * @param scanIntervalMillis The scan interval in milliseconds.
 */
public void setScanIntervalMillis(long scanIntervalMillis)
{
  this.scanIntervalMillis = scanIntervalMillis;
}
}
/**
 * File info created for files discovered by the scanner; carries the modification time seen at discovery.
 */
public static class ScannedFileInfo extends AbstractFileSplitter.FileInfo
{
  protected final long modifiedTime;

  /**
   * No-argument constructor; the modification time is set to -1.
   */
  protected ScannedFileInfo()
  {
    super();
    this.modifiedTime = -1L;
  }

  /**
   * @param directoryPath directory the file belongs to; may be null
   * @param relativeFilePath file path, relative to the directory when one is given
   * @param modifiedTime modification time of the file
   */
  public ScannedFileInfo(@Nullable String directoryPath, @NotNull String relativeFilePath, long modifiedTime)
  {
    super(directoryPath, relativeFilePath);
    this.modifiedTime = modifiedTime;
  }

  /**
   * @return modification time given at construction
   */
  public long getModifiedTime()
  {
    return this.modifiedTime;
  }
}
// Logger used by the operator and by the nested TimeBasedDirectoryScanner.
private static final Logger LOG = LoggerFactory.getLogger(FileSplitterInput.class);
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy