org.apache.crunch.Source Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch;
import java.io.IOException;
import org.apache.crunch.types.Converter;
import org.apache.crunch.types.PType;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapreduce.Job;
/**
* A {@code Source} represents an input data set that is an input to one or more
* MapReduce jobs.
*
*/
public interface Source {
/**
* Adds the given key-value pair to the {@code Configuration} instance that is used to read
* this {@code Source }. Allows for multiple inputs to re-use the same config keys with
* different values when necessary.
*/
Source inputConf(String key, String value);
/**
* Adds the {@code Configuration} of the given filesystem such that the source can read from it when the {@code
* Pipeline} itself does not have that configuration.
*
* Changing the filesystem after it is set is not supported and will result in {@link
* IllegalStateException}
*
* @param fileSystem the filesystem
* @return this Source
* @throws IllegalStateException if the filesystem has already been set
* @throws IllegalArgumentException if the source is pointing to a fully qualified Path in a different FileSystem
*/
Source fileSystem(FileSystem fileSystem);
/**
* Returns the {@code FileSystem} for this source or null if no explicit filesystem {@link #fileSystem(FileSystem)
* has been set}.
*/
FileSystem getFileSystem();
/**
* Returns the {@code PType} for this source.
*/
PType getType();
/**
* Returns the {@code Converter} used for mapping the inputs from this instance
* into {@code PCollection} or {@code PTable} values.
*/
Converter, ?, ?, ?> getConverter();
/**
* Configure the given job to use this source as an input.
*
* @param job
* The job to configure
* @param inputId
* For a multi-input job, an identifier for this input to the job
* @throws IOException
*/
void configureSource(Job job, int inputId) throws IOException;
/**
* Returns the number of bytes in this {@code Source}.
*/
long getSize(Configuration configuration);
/**
* Returns the time (in milliseconds) that this {@code Source} was most recently
* modified (e.g., because an input file was edited or new files were added to
* a directory.)
*/
long getLastModifiedAt(Configuration configuration);
}