Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to David Pilato (the "Author") under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Author licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package fr.pilato.elasticsearch.crawler.fs.util;
import fr.pilato.elasticsearch.crawler.fs.FsCrawler;
import fr.pilato.elasticsearch.crawler.fs.ScanStatistic;
import fr.pilato.elasticsearch.crawler.fs.meta.MetaParser;
import org.apache.commons.io.FileUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.nio.file.CopyOption;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.FileVisitOption;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributeView;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileOwnerAttributeView;
import java.nio.file.attribute.PosixFileAttributeView;
import java.nio.file.attribute.PosixFileAttributes;
import java.time.Instant;
import java.util.EnumSet;
import java.util.List;
public class FsCrawlerUtil extends MetaParser {
public static final String INDEX_TYPE_DOC = "doc";
public static final String INDEX_TYPE_FOLDER = "folder";
static public final class Doc {
public static final String CONTENT = "content";
public static final String ATTACHMENT = "attachment";
public static final String META = "meta";
static public final class Meta {
public static final String AUTHOR = "author";
public static final String TITLE = "title";
public static final String DATE = "date";
public static final String KEYWORDS = "keywords";
}
public static final String FILE = "file";
static public final class File {
public static final String CONTENT_TYPE = "content_type";
public static final String LAST_MODIFIED = "last_modified";
public static final String INDEXING_DATE = "indexing_date";
public static final String FILESIZE = "filesize";
public static final String FILENAME = "filename";
public static final String URL = "url";
public static final String INDEXED_CHARS = "indexed_chars";
public static final String CHECKSUM = "checksum";
}
public static final String PATH = "path";
static public final class Path {
public static final String ENCODED = "encoded";
public static final String ROOT = "root";
public static final String VIRTUAL = "virtual";
public static final String REAL = "real";
}
public static final String ATTRIBUTES = "attributes";
static public final class Attributes {
public static final String OWNER = "owner";
public static final String GROUP = "group";
}
}
static public final class Dir {
public static final String NAME = "name";
public static final String ENCODED = "encoded";
public static final String ROOT = "root";
public static final String VIRTUAL = "virtual";
public static final String REAL = "real";
}
private static final Logger logger = LogManager.getLogger(FsCrawlerUtil.class);
/**
* Reads a mapping from config/_default/version/type.json file
*
* @param config Root dir where we can find the configuration (default to ~/.fscrawler)
* @param version Elasticsearch major version number (only major digit is kept so for 2.3.4 it will be 2)
* @param type The expected type (will be expanded to type.json)
* @return the mapping
* @throws URISyntaxException
* @throws IOException
*/
public static String readDefaultMapping(Path config, String version, String type) throws URISyntaxException, IOException {
Path defaultConfigDir = config.resolve("_default");
try {
return readMapping(defaultConfigDir, version, type);
} catch (NoSuchFileException e) {
throw new IllegalArgumentException("Mapping file " + type + ".json does not exist for elasticsearch version " + version +
" in [" + defaultConfigDir + "] dir");
}
}
/**
* Reads a mapping from dir/version/type.json file
*
* @param dir Directory containing mapping files per major version
* @param version Elasticsearch major version number (only major digit is kept so for 2.3.4 it will be 2)
* @param type The expected type (will be expanded to type.json)
* @return the mapping
* @throws URISyntaxException
* @throws IOException
*/
public static String readMapping(Path dir, String version, String type) throws URISyntaxException, IOException {
Path file = dir.resolve(version).resolve(type + ".json");
return new String(Files.readAllBytes(file), "UTF-8");
}
/**
* Reads a mapping from dir/version/type.json file.
* If not found, read from ~/.fscrawler/_default/version/type.json
*
* @param dir Directory which might contain mapping files per major version (job dir)
* @param config Root dir where we can find the configuration (default to ~/.fscrawler)
* @param version Elasticsearch major version number (only major digit is kept so for 2.3.4 it will be 2)
* @param type The expected type (will be expanded to type.json)
* @return the mapping
* @throws URISyntaxException
* @throws IOException
*/
public static String readMapping(Path dir, Path config, String version, String type) throws URISyntaxException, IOException {
try {
return readMapping(dir, version, type);
} catch (NoSuchFileException e) {
// We fall back to default mappings in config dir
return readDefaultMapping(config, version, type);
}
}
/**
* We check if we can index the file or if we should ignore it
*
* @param filename The filename to scan
* @param includes include rules, may be empty not null
* @param excludes exclude rules, may be empty not null
*/
public static boolean isIndexable(String filename, List includes, List excludes) {
logger.debug("filename = [{}], includes = [{}], excludes = [{}]", filename, includes, excludes);
// Ignore temporary files
if (filename.contains("~")) {
logger.trace("filename contains ~");
return false;
}
// No rules ? Fine, we index everything
if ((includes == null || includes.isEmpty()) && (excludes == null || excludes.isEmpty())) {
logger.trace("no rules");
return true;
}
// Exclude rules : we know that whatever includes rules are, we should exclude matching files
if (excludes != null) {
for (String exclude : excludes) {
String regex = exclude.replace("?", ".?").replace("*", ".*?");
logger.trace("regex is [{}]", regex);
if (filename.matches(regex)) {
logger.trace("does match exclude regex");
return false;
}
}
}
// Include rules : we should add document if it match include rules
if (includes == null || includes.isEmpty()) {
logger.trace("no include rules");
return true;
}
for (String include : includes) {
String regex = include.replace("?", ".?").replace("*", ".*?");
logger.trace("regex is [{}]", regex);
if (filename.matches(regex)) {
logger.trace("does match include regex");
return true;
}
}
logger.trace("does not match any include pattern");
return false;
}
public static String computeVirtualPathName(ScanStatistic stats,
String realPath) {
if (realPath == null)
return null;
if (realPath.length() < stats.getRootPath().length())
return "/";
return realPath.substring(stats.getRootPath().length())
.replace("\\", "/");
}
public static Instant getCreationTime(File file) {
Instant time;
try {
Path path = Paths.get(file.getAbsolutePath());
BasicFileAttributes fileattr = Files
.getFileAttributeView(path, BasicFileAttributeView.class)
.readAttributes();
time = Instant.ofEpochMilli(fileattr.creationTime().toMillis());
} catch (Exception e) {
time = null;
}
return time;
}
/**
* Determines the 'owner' of the file.
*/
public static String getOwnerName(final File file) {
try {
final Path path = Paths.get(file.getAbsolutePath());
final FileOwnerAttributeView ownerAttributeView = Files.getFileAttributeView(path, FileOwnerAttributeView.class);
return ownerAttributeView != null ? ownerAttributeView.getOwner().getName() : null;
}
catch(Exception e) {
logger.warn("Failed to determine 'owner' of {}: {}", file, e.getMessage());
return null;
}
}
/**
* Determines the 'group' of the file. Please note that 'group' is not
* available of Windows OS.
*/
public static String getGroupName(final File file) {
try {
final Path path = Paths.get(file.getAbsolutePath());
final PosixFileAttributes posixFileAttributes = Files.getFileAttributeView(path, PosixFileAttributeView.class).readAttributes();
return posixFileAttributes != null ? posixFileAttributes.group().getName() : null;
} catch (Exception e) {
logger.warn("Failed to determine 'group' of {}: {}", file, e.getMessage());
return null;
}
}
/**
* Extracts the major digits from a version string
* @param version A version like x.y.z-WHATEVER
* @return x
*/
public static String extractMajorVersionNumber(String version) {
String[] digits = version.split("\\.");
return digits[0];
}
private static final String CLASSPATH_RESOURCES_ROOT = "/fr/pilato/elasticsearch/crawler/fs/_default/";
public static final String[] MAPPING_RESOURCES = {
"1/doc.json", "1/folder.json",
"2/doc.json", "2/folder.json",
"5/doc.json", "5/folder.json"
};
/**
* Copy default resources files which are available as project resources under
* fr.pilato.elasticsearch.crawler.fs._default package to a given configuration path
* under a _default sub directory.
* @param configPath The config path which is by default .fscrawler
* @throws IOException If copying does not work
*/
public static void copyDefaultResources(Path configPath) throws IOException, URISyntaxException {
Path targetResourceDir = configPath.resolve("_default");
for (String filename : MAPPING_RESOURCES) {
logger.debug("Copying [{}]...", filename);
Path target = targetResourceDir.resolve(filename);
copyResourceFile(CLASSPATH_RESOURCES_ROOT + filename, target);
}
}
/**
* Move a Legacy resource to another destination if needed
* @param legacy Legacy file
* @param destinationFile New file name (directory must exist)
* @throws IOException In case moving file did not work
*/
public static void moveLegacyResource(Path legacy, Path destinationFile) throws IOException {
if (Files.exists(legacy)) {
logger.debug("Found a legacy file at [{}]", legacy);
Files.move(legacy, destinationFile);
logger.info("Moved Legacy file from [{}] to [{}]", legacy, destinationFile);
}
}
/**
* Copy a single resource file from the classpath or from a JAR.
* @param target The target
* @throws IOException If copying does not work
*/
public static void copyResourceFile(String source, Path target) throws IOException, URISyntaxException {
InputStream resource = FsCrawler.class.getResourceAsStream(source);
FileUtils.copyInputStreamToFile(resource, target.toFile());
}
/**
* Copy files from a source to a target
* under a _default sub directory.
* @param source The source dir
* @param target The target dir
* @param options Potential options
* @throws IOException If copying does not work
*/
public static void copyDirs(Path source, Path target, CopyOption... options) throws IOException {
if (Files.notExists(target)) {
Files.createDirectory(target);
}
logger.debug(" --> Copying resources from [{}]", source);
if (Files.notExists(source)) {
throw new RuntimeException(source + " doesn't seem to exist.");
}
Files.walkFileTree(source, EnumSet.of(FileVisitOption.FOLLOW_LINKS), Integer.MAX_VALUE,
new InternalFileVisitor(source, target, options));
logger.debug(" --> Resources ready in [{}]", target);
}
private static class InternalFileVisitor extends SimpleFileVisitor {
private final Path fromPath;
private final Path toPath;
private final CopyOption[] copyOption;
public InternalFileVisitor(Path fromPath, Path toPath, CopyOption... copyOption) {
this.fromPath = fromPath;
this.toPath = toPath;
this.copyOption = copyOption;
}
@Override
public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
Path targetPath = toPath.resolve(fromPath.relativize(dir));
if(!Files.exists(targetPath)){
Files.createDirectory(targetPath);
}
return FileVisitResult.CONTINUE;
}
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
try {
Files.copy(file, toPath.resolve(fromPath.relativize(file)), copyOption);
} catch (FileAlreadyExistsException ignored) {
// The file already exists we just ignore it
}
return FileVisitResult.CONTINUE;
}
}
}