// es.accenture.flink.Sources.KuduInputFormat Maven / Gradle / Ivy
// Show all versions of flink-kudu-connector Show documentation
package es.accenture.flink.Sources;
import es.accenture.flink.Utils.RowSerializable;
import org.apache.flink.api.common.io.InputFormat;
import org.apache.flink.api.common.io.LocatableInputSplitAssigner;
import org.apache.flink.api.common.io.statistics.BaseStatistics;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.io.InputSplitAssigner;
import org.apache.kudu.client.*;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link InputFormat} implementation that reads a Kudu table and emits its rows as
 * {@link RowSerializable} records, using {@link KuduInputSplit} as the split type.
 */
public class KuduInputFormat implements InputFormat<RowSerializable, KuduInputSplit> {
/** Address of the Kudu master, as given to the constructor. */
private String KUDU_MASTER;
/** Name of the Kudu table to read, as given to the constructor. */
private String TABLE_NAME;
/** Handle on the Kudu table; {@code open} refuses to run while it is {@code null}. */
private transient KuduTable table = null;
/** Scanner for the split currently being read; reset to {@code null} by {@code close}. */
private transient KuduScanner scanner = null;
/** Kudu client used to build scan tokens and scanners. */
private transient KuduClient client = null;
/** Batch of rows most recently fetched from {@link #scanner}. */
private transient RowResultIterator results = null;
/** Rows exposed through {@code getRows()}. */
private List<RowSerializable> rows = null;
/** Scan tokens of the table; rebuilt by {@code createInputSplits} and by {@code open}. */
private List<KuduScanToken> tokens = null;
/** Flag reported by {@code reachedEnd()}. */
private boolean endReached = false;
/** Number of rows returned for the current split. */
private int scannedRows = 0;
private static final Logger LOG = Logger.getLogger(KuduInputFormat.class);
/** Names of the columns requested from Kudu when scan tokens are built. */
private List<String> projectColumns;
/**
 * Builds an input format bound to a single Kudu table.
 *
 * @param tableName name of the Kudu table to read from
 * @param IP address of the Kudu master server
 */
public KuduInputFormat(String tableName, String IP){
    LOG.info("1. CONSTRUCTOR");
    this.TABLE_NAME = tableName;
    this.KUDU_MASTER = IP;
}
/**
 * Gives access to the scanner currently held by this format.
 *
 * @return the value of the {@code scanner} field; {@code null} when no split is open
 */
private KuduScanner getScanner(){
    return scanner;
}
/**
 * Tells which table this format reads. Each instance is bound to exactly one table name,
 * the one passed to the constructor.
 *
 * @return the name of the table
 */
public String getTableName(){
    return this.TABLE_NAME;
}
/**
 * Returns the {@code rows} field as is (no defensive copy).
 *
 * @return the list of {@link RowSerializable} rows held by this format; {@code null} until
 *         something assigns the field
 */
public List<RowSerializable> getRows(){
    return this.rows;
}
/**
 * Copies the data of a Kudu {@link RowResult} into a {@link RowSerializable}.
 *
 * NOTE(review): this copy of the file is truncated inside the method body (see the notes
 * below), so the conversion loop and the return statement are not visible here.
 *
 * @param rowResult The result row from Kudu that needs to be converted
 * @return The {@link RowSerializable} built from the row, per the declared return type
 */
private RowSerializable RowResultToRowSerializable(RowResult rowResult) throws IllegalAccessException {
// Constructed with the number of columns in the row's projection.
RowSerializable row = new RowSerializable(rowResult.getColumnProjection().getColumnCount());
// NOTE(review): source text is missing from the middle of the next line onwards. It looks as
// if everything between a '<' and a later '>' was dropped when the file was extracted, which
// removed the rest of this method and the beginning of at least one other method.
// TODO: restore the missing code from the original project before relying on this section.
for (int i=0; i();
// The remaining lines are the tail of a different method: they append the name of every
// column in the table schema to projectColumns and then return the table.
for (int i = 0; i < table.getSchema().getColumnCount(); i++) {
projectColumns.add(this.table.getSchema().getColumnByIndex(i).getName());
}
return table;
}
/**
 * Opens a scanner for the given split and fetches its first batch of rows.
 *
 * <p>The scan tokens are rebuilt from the table on every call, and the token at index
 * {@code split.getSplitNumber()} is turned into the scanner. This only selects the right
 * tablet if the tokens come back in the same order as when the splits were created.
 *
 * @param split the split to read
 * @throws IOException if the table has not been opened, if the split number does not match
 *         any scan token, or if the scanner cannot be created or read
 */
@Override
public void open(KuduInputSplit split) throws IOException {
    LOG.info("SPLIT "+split.getSplitNumber()+" PASANDO POR 5. OPEN");
    if (table == null) {
        throw new IOException("The Kudu table has not been opened!");
    }
    LOG.info("Opening split...");
    KuduScanToken.KuduScanTokenBuilder builder = client.newScanTokenBuilder(this.table)
            .setProjectedColumnNames(this.projectColumns);
    // Keep a typed local so element access does not depend on how the field is declared.
    List<KuduScanToken> scanTokens = builder.build();
    this.tokens = scanTokens;
    endReached = false;
    scannedRows = 0;

    int splitNumber = split.getSplitNumber();
    LOG.info("SPLIT NUMBER "+splitNumber);
    if (splitNumber < 0 || splitNumber >= scanTokens.size()) {
        throw new IOException("Split number " + splitNumber + " does not match any of the "
                + scanTokens.size() + " scan tokens of table " + TABLE_NAME);
    }
    try {
        scanner = scanTokens.get(splitNumber).intoScanner(client);
    } catch (Exception e) {
        // Fail the split instead of continuing with a null or stale scanner.
        throw new IOException("Could not create a scanner for split " + splitNumber
                + " of table " + TABLE_NAME, e);
    }
    results = scanner.nextRows();
}
/**
 * Reports whether the current split has been fully consumed.
 *
 * @return the current value of the {@code endReached} flag: {@code true} once the end has
 *         been reached, {@code false} otherwise
 */
@Override
public boolean reachedEnd() throws IOException {
    return this.endReached;
}
/**
 * Returns the next row of the current split, converted to a {@link RowSerializable}.
 *
 * <p>When the current batch is exhausted the next batch is fetched from the scanner, so all
 * batches of the split are read. Once the scanner reports no more rows, the end-of-input flag
 * is set and {@code null} is returned; an empty split therefore ends without an error.
 *
 * @param reuse the last record returned by the iterator; must not be {@code null}. It is not
 *        written to: a new row object is returned for every record.
 * @return the next record, or {@code null} once the split is exhausted
 * @throws IOException if no scanner is open, {@code reuse} is {@code null}, fetching a batch
 *         fails, or a row cannot be converted
 */
@Override
public RowSerializable nextRecord(RowSerializable reuse) throws IOException {
    if (scanner == null) {
        throw new IOException("No table scanner provided!");
    }
    if (reuse == null){
        throw new IOException("No row reuse provided");
    }
    // Move forward until a batch with rows is available; individual batches may be empty.
    while (results == null || !results.hasNext()) {
        if (!scanner.hasMoreRows()) {
            endReached = true;
            return null;
        }
        results = scanner.nextRows();
    }
    RowResult res = results.next();
    try {
        RowSerializable resRow = RowResultToRowSerializable(res);
        scannedRows++;
        return resRow;
    } catch (IllegalAccessException e) {
        // A conversion failure is a real error; it must not be mistaken for end of input.
        throw new IOException("Could not convert row " + (scannedRows + 1) + " of table "
                + TABLE_NAME, e);
    }
}
/**
 * Marks the end of the life-cycle of an input split by closing the Kudu scanner, if one is
 * open. The scanner reference is cleared even when closing fails.
 *
 * @throws IOException if closing the scanner fails
 */
@Override
public void close() throws IOException {
    // The logger is used with plain string concatenation, so build the message directly
    // instead of leaving a literal "{}" placeholder in the output.
    LOG.info("Closing split (scanned " + scannedRows + " rows)");
    try {
        if (scanner != null) {
            scanner.close();
        }
    } finally {
        scanner = null;
    }
}
/**
 * Creates the splits of the Kudu table that can be processed in parallel: one split per scan
 * token, numbered in token order.
 *
 * <p>Each split carries the "host:port" of every replica of the token's tablet, the table
 * name, and the start and end partition keys of the tablet.
 *
 * @param minNumSplits the minimum desired number of splits; not used, because the number of
 *        splits is dictated by the number of scan tokens
 * @return the splits of this input that can be processed in parallel
 */
@Override
public KuduInputSplit[] createInputSplits(final int minNumSplits) {
    LOG.info("3. CREATE SPLITS");
    KuduScanToken.KuduScanTokenBuilder builder = client.newScanTokenBuilder(this.table)
            .setProjectedColumnNames(this.projectColumns);
    // Keep a typed local so iteration does not depend on how the field is declared.
    List<KuduScanToken> scanTokens = builder.build();
    this.tokens = scanTokens;

    // Size by the real number of splits; minNumSplits is only a hint and may not be usable
    // as a capacity.
    List<KuduInputSplit> splits = new ArrayList<>(scanTokens.size());
    for (KuduScanToken token : scanTokens) {
        byte[] startKey = token.getTablet().getPartition().getPartitionKeyStart();
        byte[] endKey = token.getTablet().getPartition().getPartitionKeyEnd();

        List<LocatedTablet.Replica> replicas = token.getTablet().getReplicas();
        List<String> locations = new ArrayList<>(replicas.size());
        for (LocatedTablet.Replica replica : replicas) {
            locations.add(replica.getRpcHost().concat(":").concat(replica.getRpcPort().toString()));
        }

        // The split number is the token's position in the list.
        int numSplit = splits.size();
        splits.add(new KuduInputSplit(numSplit, locations.toArray(new String[0]),
                TABLE_NAME, startKey, endKey));
    }
    LOG.info("Created: " + splits.size() + " splits");
    return splits.toArray(new KuduInputSplit[0]);
}
/**
 * Extension hook that decides whether a key range should contribute an input split.
 *
 * <p>This default implementation includes every range. A subclass may override it to leave
 * out whole ranges based on their start and end keys. The earlier documentation of this
 * method notes that {@code endKey} may be empty for the last range.
 *
 * @param startKey start key of the range
 * @param endKey end key of the range
 * @return {@code true} if the range is to be included as part of the input (the default)
 */
protected boolean includeRegionInSplit(final byte[] startKey, final byte[] endKey) {
    return true;
}
/**
 * Supplies the assigner that hands out the given splits.
 *
 * @param inputSplits the splits produced for this input
 * @return a {@link LocatableInputSplitAssigner} built over {@code inputSplits}
 */
@Override
public InputSplitAssigner getInputSplitAssigner(KuduInputSplit[] inputSplits) {
    LOG.info("4. ASSIGNER");
    final InputSplitAssigner assigner = new LocatableInputSplitAssigner(inputSplits);
    return assigner;
}
/**
 * Statistics are not provided by this format.
 *
 * @param cachedStatistics previously cached statistics; ignored
 * @return always {@code null}
 */
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStatistics) {
    return null;
}
}