All Downloads are FREE. Search and download functionalities are using the official Maven repository.
Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
io.cdap.plugin.batch.source.FTPBatchSource Maven / Gradle / Ivy
/*
* Copyright © 2016-2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin.batch.source;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Macro;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.plugin.PluginConfig;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.batch.BatchSource;
import io.cdap.cdap.etl.api.batch.BatchSourceContext;
import io.cdap.plugin.format.FileFormat;
import io.cdap.plugin.format.plugin.AbstractFileSource;
import io.cdap.plugin.format.plugin.FileSourceProperties;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.lang.reflect.Type;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
/**
* {@link BatchSource} that reads from an FTP or SFTP server.
*/
@Plugin(type = "batchsource")
@Name("FTP")
@Description("Batch source for an FTP or SFTP source. Prefix of the path ('ftp://...' or 'sftp://...') determines " +
"the source server type, either FTP or SFTP.")
public class FTPBatchSource extends AbstractFileSource {
private static final String NAME_FILE_SYSTEM_PROPERTIES = "fileSystemProperties";
private static final String FS_SFTP_IMPL = "fs.sftp.impl";
private static final String SFTP_FS_CLASS = "org.apache.hadoop.fs.sftp.SFTPFileSystem";
private static final String FTP_PROTOCOL = "ftp";
private static final String SFTP_PROTOCOL = "sftp";
private static final int DEFAULT_FTP_PORT = 21;
private static final int DEFAULT_SFTP_PORT = 22;
public static final Schema SCHEMA = Schema.recordOf("text",
Schema.Field.of("offset", Schema.of(Schema.Type.LONG)),
Schema.Field.of("body", Schema.of(Schema.Type.STRING)));
private final FTPBatchSourceConfig config;
public FTPBatchSource(FTPBatchSourceConfig config) {
super(config);
this.config = config;
}
@Override
protected Map getFileSystemProperties(BatchSourceContext context) {
Map properties = new HashMap<>(config.getFileSystemProperties());
if (!properties.containsKey(FS_SFTP_IMPL)) {
properties.put(FS_SFTP_IMPL, SFTP_FS_CLASS);
}
// Limit the number of splits to 1 since FTPInputStream does not support seek;
properties.put(FileInputFormat.SPLIT_MINSIZE, Long.toString(Long.MAX_VALUE));
return properties;
}
/**
* Config class that contains all the properties needed for FTP Batch Source.
*/
@SuppressWarnings("unused")
public static class FTPBatchSourceConfig extends PluginConfig implements FileSourceProperties {
private static final Gson GSON = new Gson();
private static final Type MAP_STRING_STRING_TYPE = new TypeToken>() {
}.getType();
@Macro
@Description("Name be used to uniquely identify this source for lineage, annotating metadata, etc.")
private String referenceName;
@Macro
@Description("Path to file(s) to be read. Path is expected to be of the form " +
"'prefix://username:password@hostname:port/path'.")
private String path;
@Macro
@Nullable
@Description("Any additional properties to use when reading from the filesystem. "
+ "This is an advanced feature that requires knowledge of the properties supported by the underlying filesystem.")
private String fileSystemProperties;
@Nullable
@Description("Whether to allow an input that does not exist. When false, the source will fail the run if the input "
+ "does not exist. When true, the run will not fail and the source will not generate any output. "
+ "The default value is false.")
private Boolean ignoreNonExistingFolders;
@Macro
@Nullable
@Description("Regular expression that file names must match in order to be read. "
+ "If no value is given, no file filtering will be done. "
+ "See https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html for more information about "
+ "the regular expression syntax.")
private String fileRegex;
@Override
public void validate() {
getFileSystemProperties();
}
public void validate(FailureCollector collector) {
try {
getFileSystemProperties();
} catch (IllegalArgumentException e) {
collector.addFailure("File system properties must be a valid json.", null)
.withConfigProperty(NAME_FILE_SYSTEM_PROPERTIES).withStacktrace(e.getStackTrace());
}
}
@Override
public String getReferenceName() {
return referenceName;
}
@Override
public String getPath() {
if (authContainsSpecialCharacters()) {
Path urlInfo;
String extractedPassword = extractPasswordFromUrl();
String encodedPassword = URLEncoder.encode(extractedPassword);
String validatePath = path.replace(extractedPassword, encodedPassword);
try {
urlInfo = new Path(validatePath);
} catch (Exception e) {
throw new IllegalArgumentException(String.format("Unable to parse url: %s %s", e.getMessage(), e));
}
String host = urlInfo.toUri().getAuthority().substring(urlInfo.toUri().getAuthority().lastIndexOf("@") + 1);
String user = urlInfo.toUri().getAuthority().split(":")[0];
String protocol = urlInfo.toUri().getScheme();
int port = urlInfo.toUri().getPort();
if (port == -1 && protocol.equals(FTP_PROTOCOL)) {
port = DEFAULT_FTP_PORT;
}
if (port == -1 && protocol.equals(SFTP_PROTOCOL)) {
port = DEFAULT_SFTP_PORT;
}
String cleanHost = host.replace(":" + port, "");
return urlInfo.toUri().getScheme() + "://" + cleanHost + urlInfo.toUri().getPath();
}
return path;
}
@Override
public String getFormatName() {
return FileFormat.TEXT.name().toLowerCase();
}
@Nullable
@Override
public Pattern getFilePattern() {
return null;
}
@Override
public long getMaxSplitSize() {
return Long.MAX_VALUE;
}
@Override
public boolean shouldAllowEmptyInput() {
return false;
}
@Override
public boolean shouldReadRecursively() {
return false;
}
@Nullable
@Override
public String getPathField() {
return null;
}
@Override
public boolean useFilenameAsPath() {
return false;
}
@Override
public boolean skipHeader() {
return false;
}
@Nullable
@Override
public Schema getSchema() {
return SCHEMA;
}
public String extractPasswordFromUrl() {
int getLastIndexOfAtSign = path.lastIndexOf("@");
String authentication = path.substring(0, getLastIndexOfAtSign);
return authentication.substring(authentication.lastIndexOf(":") + 1);
}
public boolean authContainsSpecialCharacters() {
Pattern regularPasswordWithoutSpecialCharacters = Pattern.compile("[^A-Za-z0-9]");
Matcher regularPassword = regularPasswordWithoutSpecialCharacters.matcher(extractPasswordFromUrl());
return !regularPassword.matches();
}
Map getFileSystemProperties() {
HashMap fileSystemPropertiesMap = new HashMap<>();
if (fileSystemProperties != null) {
fileSystemPropertiesMap.putAll(GSON.fromJson(fileSystemProperties, MAP_STRING_STRING_TYPE));
}
try {
if (authContainsSpecialCharacters()) {
Path urlInfo;
String extractedPassword = extractPasswordFromUrl();
String encodedPassword = URLEncoder.encode(extractedPassword);
String validatePath = path.replace(extractedPassword, encodedPassword);
try {
urlInfo = new Path(validatePath);
} catch (Exception e) {
throw new IllegalArgumentException(String.format("Unable to parse url: %s %s", e.getMessage(), e));
}
// After encoding the url, the format should look like:
// ftp://kimi:42%4067%[email protected] :21/kimi-look-here.txt
int port = urlInfo.toUri().getPort();
String host = urlInfo.toUri().getAuthority().substring(urlInfo.toUri().getAuthority().lastIndexOf("@") + 1);
String user = urlInfo.toUri().getAuthority().split(":")[0];
if (urlInfo.toUri().getScheme().equals(FTP_PROTOCOL)) {
port = (port == -1) ? DEFAULT_FTP_PORT : port;
String cleanHostFTP = host.replace(":" + port, "");
fileSystemPropertiesMap.put("fs.ftp.host", cleanHostFTP);
fileSystemPropertiesMap.put(String.format("fs.ftp.user.%s", cleanHostFTP), user);
fileSystemPropertiesMap.put(String.format("fs.ftp.password.%s", cleanHostFTP), extractedPassword);
fileSystemPropertiesMap.put("fs.ftp.host.port", String.valueOf(port));
} else {
port = (port == -1) ? DEFAULT_SFTP_PORT : port;
String cleanHostSFTP = host.replace(":" + port, "");
fileSystemPropertiesMap.put(String.format("fs.sftp.user.%s", cleanHostSFTP), user);
fileSystemPropertiesMap.put(String.format("fs.sftp.password.%s.%s", cleanHostSFTP, user),
extractedPassword);
fileSystemPropertiesMap.put("fs.sftp.host.port", String.valueOf(port));
}
}
} catch (Exception e) {
throw new IllegalArgumentException(String.format("Unable to parse filesystem properties: %s %s", e.getMessage(),
e));
}
return fileSystemPropertiesMap;
}
}
}