org.elasticsearch.hadoop.mr.EsInputFormat Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch-spark-20_2.12 Show documentation
Elasticsearch Spark (for Spark 2.X)
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.hadoop.mr;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.Progressable;
import org.elasticsearch.hadoop.cfg.HadoopSettings;
import org.elasticsearch.hadoop.cfg.HadoopSettingsManager;
import org.elasticsearch.hadoop.cfg.Settings;
import org.elasticsearch.hadoop.mr.compat.CompatHandler;
import org.elasticsearch.hadoop.mr.security.HadoopUserProvider;
import org.elasticsearch.hadoop.rest.InitializationUtils;
import org.elasticsearch.hadoop.rest.PartitionDefinition;
import org.elasticsearch.hadoop.rest.RestRepository;
import org.elasticsearch.hadoop.rest.RestService;
import org.elasticsearch.hadoop.rest.RestService.PartitionReader;
import org.elasticsearch.hadoop.rest.ScrollQuery;
import org.elasticsearch.hadoop.rest.SearchRequestBuilder;
import org.elasticsearch.hadoop.rest.stats.Stats;
import org.elasticsearch.hadoop.serialization.ScrollReader;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
/**
* ElasticSearch {@link InputFormat} for streaming data (typically based on a query) from ElasticSearch.
* Returns the document ID as key and its content as value.
*
* This class implements both the "old" (org.apache.hadoop.mapred) and the "new" (org.apache.hadoop.mapreduce) API.
*/
public class EsInputFormat extends InputFormat implements org.apache.hadoop.mapred.InputFormat{
private static Log log = LogFactory.getLog(EsInputFormat.class);
protected static class EsInputSplit extends InputSplit implements org.apache.hadoop.mapred.InputSplit {

    // the ES partition (shard/slice) this split reads; null until assigned or deserialized
    private PartitionDefinition partition;

    // no-arg constructor required by Hadoop's Writable-based split deserialization
    public EsInputSplit() {}

    public EsInputSplit(PartitionDefinition partition) {
        this.partition = partition;
    }

    @Override
    public long getLength() {
        // TODO: can this be computed easily?
        // a positive constant keeps schedulers happy without querying ES for the shard size
        // (was `1l` - lowercase long suffix reads as `11`; use `1L`)
        return 1L;
    }

    @Override
    public String[] getLocations() {
        // host names of the nodes holding the partition, used for locality-aware scheduling
        return partition.getHostNames();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        partition.write(out);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        partition = new PartitionDefinition(in);
    }

    public PartitionDefinition getPartition() {
        return partition;
    }

    @Override
    public String toString() {
        return "EsInputSplit{" +
                (partition == null ? "NULL" : partition.toString()) +
                "}";
    }
}
protected static abstract class EsInputRecordReader<K, V> extends RecordReader<K, V> implements org.apache.hadoop.mapred.RecordReader<K, V> {

    // number of documents consumed so far; also serves as the reader position (getPos)
    private int read = 0;
    private EsInputSplit esSplit;
    private ScrollReader scrollReader;

    private RestRepository client;
    private SearchRequestBuilder queryBuilder;
    private ScrollQuery scrollQuery;

    // reuse objects (old API); the new API creates fresh instances per nextKeyValue()
    private K currentKey;
    private V currentValue;

    // total hits reported by the scroll; 0 until the first next() call
    private long size = 0;

    private HeartBeat beat;
    private Progressable progressable;

    // default constructor used by the NEW api
    public EsInputRecordReader() {
    }

    // constructor used by the old API
    public EsInputRecordReader(org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
        reporter.setStatus(split.toString());
        init((EsInputSplit) split, job, reporter);
    }

    // new API init call
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
        org.elasticsearch.hadoop.mr.compat.TaskAttemptContext compatContext = CompatHandler.taskAttemptContext(context);
        compatContext.setStatus(split.toString());
        init((EsInputSplit) split, compatContext.getConfiguration(), compatContext);
    }

    /**
     * Shared initialization for both APIs: loads per-partition settings, creates the
     * scroll reader / REST client and, when a progressable is available, arms the
     * heart-beat that keeps the task alive during long-running scrolls.
     */
    void init(EsInputSplit esSplit, Configuration cfg, Progressable progressable) {
        // get a copy to override the host/port
        Settings settings = HadoopSettingsManager.loadFrom(cfg).copy().load(esSplit.getPartition().getSerializedSettings());

        if (log.isTraceEnabled()) {
            log.trace(String.format("Init shard reader from cfg %s", HadoopCfgUtils.asProperties(cfg)));
            log.trace(String.format("Init shard reader w/ settings %s", settings));
        }

        this.esSplit = esSplit;

        // initialize mapping/ scroll reader
        InitializationUtils.setValueReaderIfNotSet(settings, WritableValueReader.class, log);
        InitializationUtils.setUserProviderIfNotSet(settings, HadoopUserProvider.class, log);
        PartitionDefinition part = esSplit.getPartition();
        PartitionReader partitionReader = RestService.createReader(settings, part, log);

        this.scrollReader = partitionReader.scrollReader;
        this.client = partitionReader.client;
        this.queryBuilder = partitionReader.queryBuilder;

        this.progressable = progressable;

        // in Hadoop-like envs (Spark) the progressable might be null and thus the heart-beat is not needed
        if (progressable != null) {
            beat = new HeartBeat(progressable, cfg, settings.getHeartBeatLead(), log);
        }

        if (log.isDebugEnabled()) {
            log.debug(String.format("Initializing RecordReader for [%s]", esSplit));
        }
    }

    @Override
    public boolean nextKeyValue() throws IOException {
        // new API call routed to old API
        // under the new API always create new objects since consumers can (and sometimes will) modify them
        currentKey = createKey();
        currentValue = createValue();

        return next(currentKey, currentValue);
    }

    @Override
    public K getCurrentKey() throws IOException {
        return currentKey;
    }

    @Override
    public V getCurrentValue() {
        return currentValue;
    }

    @Override
    public float getProgress() {
        // size == 0 until the scroll is opened; avoid division by zero
        return size == 0 ? 0 : ((float) getPos()) / size;
    }

    @Override
    public void close() throws IOException {
        try {
            if (log.isDebugEnabled()) {
                log.debug(String.format("Closing RecordReader for [%s]", esSplit));
            }

            if (beat != null) {
                beat.stop();
            }

            if (scrollQuery != null) {
                scrollQuery.close();
            }

            if (client != null) {
                client.close();
            }
        } finally {
            // aggregate and report stats even if closing failed, then drop the references
            Stats stats = new Stats();
            if (client != null) {
                stats.aggregate(client.stats());
                client = null;
            }
            if (scrollQuery != null) {
                stats.aggregate(scrollQuery.stats());
                scrollQuery = null;
            }
            ReportingUtils.report(progressable, stats);
        }
    }

    @Override
    public boolean next(K key, V value) throws IOException {
        // lazily open the scroll on first use so the heart-beat covers the initial query as well
        if (scrollQuery == null) {
            if (beat != null) {
                beat.start();
            }

            scrollQuery = queryBuilder.build(client, scrollReader);
            size = scrollQuery.getSize();

            if (log.isTraceEnabled()) {
                log.trace(String.format("Received scroll [%s], size [%d] for query [%s]", scrollQuery, size, queryBuilder));
            }
        }

        boolean hasNext = scrollQuery.hasNext();

        if (!hasNext) {
            return false;
        }

        Object[] next = scrollQuery.next();
        // NB: the left assignment is not needed since method override
        // the writable content however for consistency, they are below
        currentKey = setCurrentKey(key, next[0]);
        currentValue = setCurrentValue(value, next[1]);

        // keep on counting
        read++;
        return true;
    }

    @Override
    public abstract K createKey();

    @Override
    public abstract V createValue();

    /**
     * Sets the current key.
     *
     * @param hadoopKey hadoop key
     * @param object the actual value to read
     * @return returns the key to be used; needed in scenario where the key is immutable
     */
    protected abstract K setCurrentKey(K hadoopKey, Object object);

    /**
     * Sets the current value.
     *
     * @param hadoopValue hadoop value
     * @param object the actual value to read
     * @return returns the value to be used; needed in scenario where the passed value is immutable
     */
    protected abstract V setCurrentValue(V hadoopValue, Object object);

    @Override
    public long getPos() {
        return read;
    }
}

protected static abstract class AbstractWritableEsInputRecordReader<V> extends EsInputRecordReader<Text, V> {

    public AbstractWritableEsInputRecordReader() {
        super();
    }

    public AbstractWritableEsInputRecordReader(org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
        super(split, job, reporter);
    }

    @Override
    public Text createKey() {
        return new Text();
    }

    @Override
    protected Text setCurrentKey(Text hadoopKey, Object object) {
        if (hadoopKey != null) {
            // Text is mutable - overwrite the reused instance with the document id
            hadoopKey.set(object.toString());
        }
        return hadoopKey;
    }
}
protected static class WritableEsInputRecordReader extends AbstractWritableEsInputRecordReader