
// org.apache.flume.source.taildir.TaildirMatcher Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.flume.source.taildir;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.flume.annotations.InterfaceAudience;
import org.apache.flume.annotations.InterfaceStability;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.nio.file.DirectoryIteratorException;
import java.nio.file.DirectoryStream;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.TimeUnit;
/**
 * Identifies and caches the files matched by a single file pattern for the {@code TAILDIR}
 * source.
 *
 * <p>Since file patterns only apply to the file names and not the parent directories, this
 * implementation checks the parent directory for modification (added or removed files
 * update the modification time of the parent dir). If no modification happened to the parent
 * dir, the underlying files could only have been written to, so there is no need to rerun the
 * pattern matching on file names.
 *
 * <p>This implementation provides lazy caching or no caching. Instances of this class keep the
 * result file list from the last successful execution of {@linkplain #getMatchingFiles()}
 * and may serve that content without hitting the file system, as a performance optimization.
 *
 * <p><b>IMPORTANT:</b> It is assumed that the hosting system provides at least second
 * granularity for both {@code System.currentTimeMillis()} and {@code File.lastModified()}, and
 * that the system clock is used for file system timestamps. If that is not the case then
 * configure the matcher as uncached. This class is solely for package-internal usage. Member
 * functions are not thread safe.
 *
 * @see TaildirSource
 * @see ReliableTaildirEventReader
 * @see TaildirSourceConfigurationConstants
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class TaildirMatcher {
  private static final Logger logger = LoggerFactory.getLogger(TaildirMatcher.class);
  private static final FileSystem FS = FileSystems.getDefault();

  // flag from configuration to switch off caching completely
  private final boolean cachePatternMatching;
  // id from configuration
  private final String fileGroup;
  // plain string of the desired files from configuration
  private final String filePattern;

  // directory monitored for changes
  private final File parentDir;
  // cached instance for filtering files based on filePattern
  private final DirectoryStream.Filter<Path> fileFilter;

  // last modification time (milliseconds) of the parent directory as reported by
  // File.lastModified() during the last recalculation.
  // The initial value is only used by the first check, which replaces it instantly
  // (system time is positive).
  private long lastSeenParentDirMTime = -1;
  // system time in milliseconds of the last recalculation, truncated to whole seconds.
  // The initial value is only used by the first check, which replaces it instantly
  // (system time is positive).
  private long lastCheckedTime = -1;
  // cached content: files which matched the pattern within the parent directory
  private List<File> lastMatchedFiles = Lists.newArrayList();

  /**
   * Package accessible constructor. From the configuration context it represents a single
   * {@code filegroup} and encapsulates the corresponding {@code filePattern}.
   *
   * <p>{@code filePattern} consists of two parts: the first part has to be a valid path to an
   * existing parent directory, the second part has to be a valid regex
   * ({@link java.util.regex.Pattern}) that is matched against the names of the entries within
   * the parent directory. A valid example for {@code filePattern} is {@code /dir0/dir1/.*}
   * given {@code /dir0/dir1} is an existing directory structure readable by the running user.
   *
   * <p>An instance of this class is created for each {@code fileGroup}.
   *
   * @param fileGroup arbitrary name of the group given by the config
   * @param filePattern parent directory plus regex pattern. No wildcards are allowed in the
   *                    directory name
   * @param cachePatternMatching default true, recommended in every setup especially with huge
   *                             parent directories. Don't set when the local system clock is
   *                             not used for stamping mtime (eg: remote filesystems)
   * @throws IllegalStateException if {@code filePattern} has no parent directory component or
   *                               the parent directory does not exist
   * @see TaildirSourceConfigurationConstants
   */
  TaildirMatcher(String fileGroup, String filePattern, boolean cachePatternMatching) {
    // store whatever came from configuration
    this.fileGroup = fileGroup;
    this.filePattern = filePattern;
    this.cachePatternMatching = cachePatternMatching;

    // calculate final members: everything up to the last path separator is the directory,
    // the last path element is the regex applied to file names
    File f = new File(filePattern);
    this.parentDir = f.getParentFile();

    String regex = f.getName();
    final PathMatcher matcher = FS.getPathMatcher("regex:" + regex);
    this.fileFilter = new DirectoryStream.Filter<Path>() {
      @Override
      public boolean accept(Path entry) throws IOException {
        // match on the bare file name only and never return directories
        return matcher.matches(entry.getFileName()) && !Files.isDirectory(entry);
      }
    };

    // sanity checks: getParentFile() returns null for a pattern without a directory part,
    // which would otherwise surface as a bare NullPointerException below
    Preconditions.checkState(parentDir != null,
        "File pattern has no parent directory: " + filePattern);
    Preconditions.checkState(parentDir.exists(),
        "Directory does not exist: " + parentDir.getAbsolutePath());
  }

  /**
   * Lists those files within the parentDir that match the regex pattern passed in during object
   * instantiation. Designed for frequent periodic invocation
   * {@link org.apache.flume.source.PollableSourceRunner}.
   *
   * <p>Based on the modification of the parentDir this function may trigger cache recalculation
   * by calling {@linkplain #getMatchingFilesNoCache()} or
   * return the value stored in {@linkplain #lastMatchedFiles}.
   * Parentdir is allowed to be a symbolic link.
   *
   * <p>Files returned by this call are weakly consistent (see {@link DirectoryStream}).
   * It does not freeze the directory while iterating,
   * so it may (or may not) reflect updates to the directory that occur during the call,
   * in which case the next call will return those files (as mtime is increasing it won't hit
   * the cache but trigger recalculation).
   * It is guaranteed that an invocation reflects every change which was observable at the time
   * of invocation.
   *
   * <p>Matching file list recalculation is triggered when caching was turned off or
   * if mtime is greater than the previously seen mtime
   * (including the case when the cache hasn't been calculated before).
   * Additionally if a constantly updated directory was configured as parentDir
   * then multiple changes to the parentDir may happen
   * within the same second, so in such a case (assuming at least second granularity of the
   * reported mtime) it is impossible to tell whether a change of the dir happened before the
   * check or after (unless the check happened after that second).
   * Having said that, the implementation also stores the system time of the previous
   * recalculation, and that recalculation has to have happened strictly after the current mtime
   * to avoid a further cache refresh
   * (because then it is guaranteed that the previous invocation resulted in valid cache
   * content).
   * If the system clock hasn't passed the second of the current mtime then the logic expects
   * more changes as well
   * (since it cannot be sure that there won't be any further changes still in that second
   * and it would like to avoid data loss in the first place),
   * hence it recalculates matching files. Once the system clock has finally
   * passed the actual mtime, a subsequent invocation guarantees that it picked up every
   * change from the passed second, so
   * any further invocations can be served from the cache associated with that second
   * (given mtime is not updated again).
   *
   * @return List of files matching the pattern sorted by last modification time. No recursion.
   *     No directories. If nothing matches then returns an empty list. If an I/O issue occurred
   *     then returns the list collected to the point when the exception was thrown.
   *
   * @see #getMatchingFilesNoCache()
   */
  List<File> getMatchingFiles() {
    // truncate "now" to whole seconds so it is comparable with second-granularity mtimes;
    // it is sampled before the mtime so the comparison below errs towards recalculation
    long now = TimeUnit.SECONDS.toMillis(
        TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()));
    long currentParentDirMTime = parentDir.lastModified();

    // calculate matched files if
    // - we don't want to use cache (recalculate every time) OR
    // - directory was clearly updated after the last check OR
    // - last mtime change wasn't already checked for sure
    //   (system clock hasn't passed that second yet)
    if (!cachePatternMatching ||
        lastSeenParentDirMTime < currentParentDirMTime ||
        !(currentParentDirMTime < lastCheckedTime)) {
      lastMatchedFiles = sortByLastModifiedTime(getMatchingFilesNoCache());
      lastSeenParentDirMTime = currentParentDirMTime;
      lastCheckedTime = now;
    }

    return lastMatchedFiles;
  }

  /**
   * Provides the actual files within the parentDir which
   * are matching the regex pattern. Each invocation uses {@link DirectoryStream}
   * to identify matching files.
   *
   * <p>Files returned by this call are weakly consistent (see {@link DirectoryStream}).
   * It does not freeze the directory while iterating, so it may (or may not) reflect updates
   * to the directory that occur during the call, in which case the next call will return those
   * files.
   *
   * <p>Both {@link IOException} (thrown when opening or closing the stream) and
   * {@link DirectoryIteratorException} (the unchecked wrapper thrown by the stream's iterator
   * when an I/O error occurs during iteration or inside the filter) are logged and swallowed.
   *
   * @return List of files matching the pattern, unsorted. No recursion. No directories.
   *     If nothing matches then returns an empty list. If an I/O issue occurred then returns
   *     the list collected to the point when the exception was thrown.
   *
   * @see DirectoryStream
   * @see DirectoryStream.Filter
   */
  private List<File> getMatchingFilesNoCache() {
    List<File> result = Lists.newArrayList();
    try (DirectoryStream<Path> stream = Files.newDirectoryStream(parentDir.toPath(), fileFilter)) {
      for (Path entry : stream) {
        result.add(entry.toFile());
      }
    } catch (IOException | DirectoryIteratorException e) {
      // best effort: keep whatever was collected before the failure
      logger.error("I/O exception occurred while listing parent directory. " +
                   "Files already matched will be returned. " + parentDir.toPath(), e);
    }
    return result;
  }

  /**
   * Utility function to sort matched files based on last modification time.
   * Sorting itself uses only a snapshot of the last modification times captured before the
   * sorting, to keep the number of stat system calls to the required minimum (and to keep the
   * comparator consistent even if files are modified while sorting).
   *
   * @param files list of files in any order; sorted in place
   * @return the same list instance, sorted by ascending last modification time
   */
  private static List<File> sortByLastModifiedTime(List<File> files) {
    final HashMap<File, Long> lastModificationTimes = new HashMap<File, Long>(files.size());
    for (File f : files) {
      lastModificationTimes.put(f, f.lastModified());
    }
    Collections.sort(files, new Comparator<File>() {
      @Override
      public int compare(File o1, File o2) {
        return lastModificationTimes.get(o1).compareTo(lastModificationTimes.get(o2));
      }
    });
    return files;
  }

  @Override
  public String toString() {
    return "{" +
        "filegroup='" + fileGroup + '\'' +
        ", filePattern='" + filePattern + '\'' +
        ", cached=" + cachePatternMatching +
        '}';
  }

  /**
   * Two matchers are considered equal when they belong to the same file group; the pattern and
   * the caching flag do not take part in the comparison.
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }

    TaildirMatcher that = (TaildirMatcher) o;

    return fileGroup.equals(that.fileGroup);
  }

  /** Hash code derived from the file group only, consistent with {@link #equals(Object)}. */
  @Override
  public int hashCode() {
    return fileGroup.hashCode();
  }

  /**
   * @return the name of the file group this matcher was created for
   */
  public String getFileGroup() {
    return fileGroup;
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy