/*
* Copyright (c) 2008-2018, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.jet.impl.connector;
import com.hazelcast.jet.JetException;
import com.hazelcast.jet.core.AbstractProcessor;
import com.hazelcast.jet.core.ProcessorMetaSupplier;
import com.hazelcast.jet.core.ProcessorSupplier;
import com.hazelcast.jet.function.DistributedBiFunction;
import com.hazelcast.jet.impl.util.ReflectionUtils;
import com.hazelcast.logging.ILogger;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.file.DirectoryStream;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.nio.file.WatchEvent;
import java.nio.file.WatchKey;
import java.nio.file.WatchService;
import java.util.ArrayDeque;
import java.util.HashMap;
import java.util.Map;
import java.util.Queue;
import java.util.stream.IntStream;
import static com.hazelcast.jet.impl.util.ExceptionUtil.sneakyThrow;
import static com.hazelcast.jet.impl.util.LoggingUtil.logFine;
import static com.hazelcast.jet.impl.util.LoggingUtil.logFinest;
import static java.nio.file.StandardWatchEventKinds.ENTRY_CREATE;
import static java.nio.file.StandardWatchEventKinds.ENTRY_DELETE;
import static java.nio.file.StandardWatchEventKinds.ENTRY_MODIFY;
import static java.nio.file.StandardWatchEventKinds.OVERFLOW;
import static java.util.concurrent.TimeUnit.SECONDS;
import static java.util.stream.Collectors.toList;
/**
* Private API. Access via {@link
* com.hazelcast.jet.core.processor.SourceProcessors#streamFilesP}.
*
* Since the work of this vertex is file IO-intensive, its {@link
* com.hazelcast.jet.core.Vertex#localParallelism(int) local parallelism}
* should be set according to the performance characteristics of the
* underlying storage system. Modern high-end devices peak with 4-8 reading
* threads, so if running a single Jet job with a single file-reading
* vertex, the optimal value would be in the range of 4-8. Note that any
* one file is only read by one thread, so extra parallelism won't improve
* performance if there aren't enough files to read.
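 * <p>
 * A minimal usage sketch via the Pipeline API, assuming the {@code Sources.fileWatcher}
 * convenience factory is available in this Jet version; the directory path is illustrative:
 * <pre>{@code
 * // watch a directory and log every line appended to files in it
 * Pipeline p = Pipeline.create();
 * p.drawFrom(Sources.fileWatcher("/var/log/app"))  // hypothetical directory
 *  .drainTo(Sinks.logger());
 * }</pre>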
*/
public class StreamFilesP<R> extends AbstractProcessor {
/**
* The amount of data read from one file at once must be limited
* in order to prevent a possible {@link java.nio.file.StandardWatchEventKinds#OVERFLOW
* OVERFLOW} if too many Watcher events accumulate in the queue. This
* constant specifies the number of lines to read at once, before going
* back to polling the event queue.
*/
private static final int LINES_IN_ONE_BATCH = 64;
private static final String SENSITIVITY_MODIFIER_CLASS_NAME = "com.sun.nio.file.SensitivityWatchEventModifier";
private static final WatchEvent.Kind[] WATCH_EVENT_KINDS = {ENTRY_CREATE, ENTRY_MODIFY, ENTRY_DELETE};
private static final WatchEvent.Modifier[] WATCH_EVENT_MODIFIERS = getHighSensitivityModifiers();
/**
* Map from file to offset. Initially we store (-fileSize): if the offset is negative when we
* receive the first watcher event, we skip up to the next newline to avoid partial reading
* of the first line.
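     * For example, a pre-existing 100-byte file is initially recorded as
     * {@code FileOffset(-100, "")}.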
*/
// exposed for testing
    final Map<Path, FileOffset> fileOffsets = new HashMap<>();
private final Path watchedDirectory;
private final Charset charset;
private final PathMatcher glob;
private final int parallelism;
private final int id;
    private final DistributedBiFunction<String, String, R> mapOutputFn;
    private final Queue<Path> eventQueue = new ArrayDeque<>();
private WatchService watcher;
private StringBuilder lineBuilder = new StringBuilder();
private R pendingLine;
private Path currentFile;
private String currentFileName;
private FileInputStream currentInputStream;
private Reader currentReader;
StreamFilesP(@Nonnull String watchedDirectory, @Nonnull Charset charset, @Nonnull String glob,
                 int parallelism, int id, @Nonnull DistributedBiFunction<String, String, R> mapOutputFn
) {
this.watchedDirectory = Paths.get(watchedDirectory);
this.charset = charset;
this.glob = FileSystems.getDefault().getPathMatcher("glob:" + glob);
this.parallelism = parallelism;
this.id = id;
this.mapOutputFn = mapOutputFn;
setCooperative(false);
}
@Override
protected void init(@Nonnull Context context) throws Exception {
        try (DirectoryStream<Path> directoryStream = Files.newDirectoryStream(watchedDirectory)) {
for (Path file : directoryStream) {
if (Files.isRegularFile(file)) {
// Negative offset means "initial offset", needed to skip the first line
fileOffsets.put(file, new FileOffset(-Files.size(file), ""));
}
}
}
watcher = FileSystems.getDefault().newWatchService();
watchedDirectory.register(watcher, WATCH_EVENT_KINDS, WATCH_EVENT_MODIFIERS);
getLogger().info("Started to watch directory: " + watchedDirectory);
}
@Override
public void close(@Nullable Throwable error) {
try {
closeCurrentFile();
getLogger().fine("Closing StreamFilesP");
watcher.close();
} catch (IOException e) {
getLogger().severe("Failed to close StreamFilesP", e);
} finally {
watcher = null;
}
}
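    // complete() drives the processor: it drains watcher events into eventQueue, picks the
    // next file to read when none is open, and keeps returning false until the watcher
    // reports that it should be closed (directory gone or watch key invalidated).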
@Override
public boolean complete() {
if (!drainWatcherEvents()) {
return true;
}
if (currentFile == null) {
currentFile = eventQueue.poll();
currentFileName = currentFile != null ? String.valueOf(currentFile.getFileName()) : null;
}
if (currentFile != null) {
processFile();
}
return false;
}
/**
     * @return {@code false} if the watcher should be closed, {@code true} otherwise
*/
private boolean drainWatcherEvents() {
final ILogger logger = getLogger();
// poll with blocking only when there is no other work to do
final WatchKey key;
try {
key = (currentFile == null && eventQueue.isEmpty())
? watcher.poll(1, SECONDS)
: watcher.poll();
} catch (InterruptedException e) {
return false;
}
if (key == null) {
if (!Files.exists(watchedDirectory)) {
logger.info("Directory " + watchedDirectory + " does not exist, stopped watching");
return false;
}
return true;
}
        for (WatchEvent<?> event : key.pollEvents()) {
            final WatchEvent.Kind<?> kind = event.kind();
            final Path fileName = ((WatchEvent<Path>) event).context();
final Path filePath = watchedDirectory.resolve(fileName);
if (kind == ENTRY_CREATE || kind == ENTRY_MODIFY) {
if (glob.matches(fileName) && belongsToThisProcessor(fileName) && !Files.isDirectory(filePath)) {
logFine(logger, "Will open file to read new content: %s", filePath);
eventQueue.add(filePath);
}
} else if (kind == ENTRY_DELETE) {
logFinest(logger, "File was deleted: %s", filePath);
fileOffsets.remove(filePath);
} else if (kind == OVERFLOW) {
logger.warning("Detected OVERFLOW in " + watchedDirectory);
} else {
throw new JetException("Unknown kind of WatchEvent: " + kind);
}
}
if (!key.reset()) {
logger.info("Watch key is invalid. Stopping watcher.");
return false;
}
return true;
}
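    // Files are partitioned among the parallel processors by hashing the path; any one
    // file is therefore read by exactly one processor.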
private boolean belongsToThisProcessor(Path path) {
return ((path.hashCode() & Integer.MAX_VALUE) % parallelism) == id;
}
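    // Reads up to LINES_IN_ONE_BATCH lines from the currently open file. On EOF it records
    // the channel position and any partially read line in fileOffsets and closes the file;
    // if the outbox rejects an item, the line is kept in pendingLine and retried later.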
private void processFile() {
try {
if (!ensureFileOpen()) {
return;
}
for (int i = 0; i < LINES_IN_ONE_BATCH; i++) {
if (pendingLine == null) {
String line = readCompleteLine(currentReader);
pendingLine = line != null ? mapOutputFn.apply(currentFileName, line) : null;
}
if (pendingLine == null) {
fileOffsets.put(currentFile,
new FileOffset(currentInputStream.getChannel().position(), lineBuilder.toString()));
lineBuilder.setLength(0);
closeCurrentFile();
break;
}
if (tryEmit(pendingLine)) {
pendingLine = null;
} else {
break;
}
}
} catch (IOException e) {
throw sneakyThrow(e);
}
}
private boolean ensureFileOpen() throws IOException {
if (currentReader != null) {
return true;
}
FileOffset offset = fileOffsets.getOrDefault(currentFile, FileOffset.ZERO);
logFine(getLogger(), "Processing file %s, previous offset: %s", currentFile, offset);
try {
FileInputStream fis = new FileInputStream(currentFile.toFile());
fis.getChannel().position(offset.positiveOffset());
BufferedReader r = new BufferedReader(new InputStreamReader(fis, charset));
if (offset.offset < 0 && !findEndOfLine(r)) {
closeCurrentFile();
return false;
}
currentReader = r;
currentInputStream = fis;
lineBuilder.append(offset.pendingLine);
return true;
} catch (FileNotFoundException ignored) {
// This could be caused by ENTRY_MODIFY emitted on file deletion
// just before ENTRY_DELETE
closeCurrentFile();
return false;
}
}
/**
* Reads the file until the end of line is found.
*
* @return whether it was found
*/
private boolean findEndOfLine(Reader in) throws IOException {
while (true) {
int ch = in.read();
if (ch < 0) {
// we've hit EOF before finding the end of current line
return false;
}
if (ch == '\n' || ch == '\r') {
maybeSkipLF(in, ch);
return true;
}
}
}
/**
* Reads a line from the input only if it is terminated by CR or LF or
* CRLF. If it detects EOF before the newline character, returns
* {@code null}.
*
* @return The line (possibly zero-length) or null on EOF.
*/
// package-visible for testing
String readCompleteLine(Reader reader) throws IOException {
int ch;
while ((ch = reader.read()) >= 0) {
if (ch == '\r' || ch == '\n') {
maybeSkipLF(reader, ch);
try {
return lineBuilder.toString();
} finally {
lineBuilder.setLength(0);
}
} else {
lineBuilder.append((char) ch);
}
}
// EOF
return null;
}
private static void maybeSkipLF(Reader reader, int ch) throws IOException {
// look ahead for possible '\n' after '\r' (windows end-line style)
if (ch == '\r') {
reader.mark(1);
int ch2 = reader.read();
if (ch2 != '\n') {
reader.reset();
}
}
}
private void closeCurrentFile() {
if (currentReader != null) {
try {
currentReader.close();
} catch (IOException e) {
throw sneakyThrow(e);
}
}
currentFile = null;
currentFileName = null;
currentReader = null;
currentInputStream = null;
}
/**
* Private API. Use {@link
* com.hazelcast.jet.core.processor.SourceProcessors#streamFilesP} instead.
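     * <p>
     * For illustration only, a low-level DAG wiring sketch (directory, glob and
     * parallelism values are hypothetical):
     * <pre>{@code
     * DAG dag = new DAG();
     * dag.newVertex("stream-files",
     *         metaSupplier("/var/log/app", "UTF-8", "*.log", (file, line) -> file + ": " + line))
     *    .localParallelism(4); // see the class javadoc for guidance on this value
     * }</pre>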
*/
@Nonnull
    public static <R> ProcessorMetaSupplier metaSupplier(
@Nonnull String watchedDirectory,
@Nonnull String charset,
@Nonnull String glob,
            @Nonnull DistributedBiFunction<String, String, R> mapOutputFn
) {
return ProcessorMetaSupplier.of((ProcessorSupplier)
count -> IntStream.range(0, count)
                          .mapToObj(i -> new StreamFilesP<>(watchedDirectory, Charset.forName(charset), glob, count, i,
mapOutputFn))
.collect(toList()),
2);
}
private static WatchEvent.Modifier[] getHighSensitivityModifiers() {
        // Modifiers for the file watch service to achieve the highest possible sensitivity.
        // Background: Java SE 7 defines no standard modifiers for a watch service, but some JDKs use
        // internal modifiers to increase sensitivity. This method returns the modifiers to use for the
        // highest possible sensitivity. It's JVM-specific, so this is just a best effort.
        // It is likely most useful on platforms without a native watch service (or where Java does not
        // use it), e.g. macOS.
Object modifier = ReflectionUtils.readStaticFieldOrNull(SENSITIVITY_MODIFIER_CLASS_NAME, "HIGH");
if (modifier instanceof WatchEvent.Modifier) {
return new WatchEvent.Modifier[]{(WatchEvent.Modifier) modifier};
}
//bad luck, we did not find the modifier
return new WatchEvent.Modifier[0];
}
private static final class FileOffset {
private static final FileOffset ZERO = new FileOffset(0, "");
private final long offset;
private final String pendingLine;
private FileOffset(long offset, @Nonnull String pendingLine) {
this.offset = offset;
this.pendingLine = pendingLine;
}
/**
* Negative offset means we're reading the file for the first time.
* We recover the actual offset by negating, then we subtract one
* so that we don't skip the first line if we started right after a newline.
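         * For example, a pre-existing 100-byte file starts with offset -100; this method
         * then returns 99, and {@code ensureFileOpen} scans from byte 99 for the next
         * newline before emitting anything.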
*/
private long positiveOffset() {
return offset >= 0 ? offset : -offset - 1;
}
@Override
public String toString() {
return "FileOffset{offset=" + offset + ", pendingLine='" + pendingLine + '\'' + '}';
}
}
}