/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.hadoop.mr;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.Progressable;
import org.elasticsearch.hadoop.cfg.HadoopSettings;
import org.elasticsearch.hadoop.cfg.HadoopSettingsManager;
import org.elasticsearch.hadoop.cfg.Settings;
import org.elasticsearch.hadoop.mr.compat.CompatHandler;
import org.elasticsearch.hadoop.rest.InitializationUtils;
import org.elasticsearch.hadoop.rest.PartitionDefinition;
import org.elasticsearch.hadoop.rest.RestRepository;
import org.elasticsearch.hadoop.rest.RestService;
import org.elasticsearch.hadoop.rest.RestService.PartitionReader;
import org.elasticsearch.hadoop.rest.ScrollQuery;
import org.elasticsearch.hadoop.rest.SearchRequestBuilder;
import org.elasticsearch.hadoop.rest.stats.Stats;
import org.elasticsearch.hadoop.serialization.ScrollReader;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
/**
 * ElasticSearch {@link InputFormat} for streaming data (typically based on a query) from ElasticSearch.
 * Returns the document ID as key and its content as value.
 *
 * This class implements both the "old" (org.apache.hadoop.mapred) and the "new" (org.apache.hadoop.mapreduce) API.
 */
public class EsInputFormat<K, V> extends InputFormat<K, V> implements org.apache.hadoop.mapred.InputFormat<K, V> {
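    /*
     * Usage sketch (illustrative only, not part of this class): wiring the format into a plain
     * MapReduce job through the old (mapred) API. The resource name and query below are
     * placeholders.
     *
     *   JobConf conf = new JobConf();
     *   conf.set("es.resource", "my-index/my-type");   // index (and type) to read from
     *   conf.set("es.query", "?q=*");                  // optional query; defaults to match_all
     *   conf.setInputFormat(EsInputFormat.class);
     *   // each record arrives as (Text documentId, MapWritable documentSource)
     */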
    private static Log log = LogFactory.getLog(EsInputFormat.class);
    protected static class EsInputSplit extends InputSplit implements org.apache.hadoop.mapred.InputSplit {
        private PartitionDefinition partition;
        public EsInputSplit() {}
        public EsInputSplit(PartitionDefinition partition) {
            this.partition = partition;
        }
        @Override
        public long getLength() {
            // TODO: can this be computed easily?
            return 1L;
        }
        @Override
        public String[] getLocations() {
            return partition.getHostNames();
        }
        @Override
        public void write(DataOutput out) throws IOException {
            partition.write(out);
        }
        @Override
        public void readFields(DataInput in) throws IOException {
            partition = new PartitionDefinition(in);
        }
        public PartitionDefinition getPartition() {
            return partition;
        }
    }
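    /*
     * Illustrative only: Hadoop ships splits between the job client and the tasks through the
     * Writable contract implemented above, roughly equivalent to this round trip (buffer
     * classes are from org.apache.hadoop.io):
     *
     *   DataOutputBuffer out = new DataOutputBuffer();
     *   split.write(out);                               // serializes the PartitionDefinition
     *   DataInputBuffer in = new DataInputBuffer();
     *   in.reset(out.getData(), out.getLength());
     *   EsInputSplit copy = new EsInputSplit();
     *   copy.readFields(in);                            // rebuilds the PartitionDefinition
     */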
    protected static abstract class EsInputRecordReader<K, V> extends RecordReader<K, V> implements org.apache.hadoop.mapred.RecordReader<K, V> {
        private int read = 0;
        private EsInputSplit esSplit;
        private ScrollReader scrollReader;
        private RestRepository client;
        private SearchRequestBuilder queryBuilder;
        private ScrollQuery scrollQuery;
        // reuse objects
        private K currentKey;
        private V currentValue;
        private long size = 0;
        private HeartBeat beat;
        private Progressable progressable;
        // default constructor used by the NEW api
        public EsInputRecordReader() {
        }
        // constructor used by the old API
        public EsInputRecordReader(org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
            reporter.setStatus(split.toString());
            init((EsInputSplit) split, job, reporter);
        }
        // new API init call
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
            org.elasticsearch.hadoop.mr.compat.TaskAttemptContext compatContext = CompatHandler.taskAttemptContext(context);
            compatContext.setStatus(split.toString());
            init((EsInputSplit) split, compatContext.getConfiguration(), compatContext);
        }
        void init(EsInputSplit esSplit, Configuration cfg, Progressable progressable) {
            // get a copy to override the host/port
            Settings settings = HadoopSettingsManager.loadFrom(cfg).copy().load(esSplit.getPartition().getSerializedSettings());
            if (log.isTraceEnabled()) {
                log.trace(String.format("Init shard reader from cfg %s", HadoopCfgUtils.asProperties(cfg)));
                log.trace(String.format("Init shard reader w/ settings %s", settings));
            }
            this.esSplit = esSplit;
            // initialize mapping/ scroll reader
            InitializationUtils.setValueReaderIfNotSet(settings, WritableValueReader.class, log);
            PartitionDefinition part = esSplit.getPartition();
            PartitionReader partitionReader = RestService.createReader(settings, part, log);
            this.scrollReader = partitionReader.scrollReader;
            this.client = partitionReader.client;
            this.queryBuilder = partitionReader.queryBuilder;
            this.progressable = progressable;
            // in Hadoop-like environments (such as Spark) the progressable might be null, in which case the heart-beat is not needed
            if (progressable != null) {
                beat = new HeartBeat(progressable, cfg, settings.getHeartBeatLead(), log);
            }
            if (log.isDebugEnabled()) {
                log.debug(String.format("Initializing RecordReader for [%s]", esSplit));
            }
        }
        @Override
        public boolean nextKeyValue() throws IOException {
            // new API call routed to old API
            // under the new API always create new objects since consumers can (and sometimes will) modify them
            currentKey = createKey();
            currentValue = createValue();
            return next(currentKey, currentValue);
        }
        @Override
        public K getCurrentKey() throws IOException {
            return currentKey;
        }
        @Override
        public V getCurrentValue() {
            return currentValue;
        }
        @Override
        public float getProgress() {
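            // illustration: with size == 20 total hits and getPos() == 5 documents read so far, this reports 0.25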
            return size == 0 ? 0 : ((float) getPos()) / size;
        }
        @Override
        public void close() throws IOException {
            try {
                if (log.isDebugEnabled()) {
                    log.debug(String.format("Closing RecordReader for [%s]", esSplit));
                }
                if (beat != null) {
                    beat.stop();
                }
                if (scrollQuery != null) {
                    scrollQuery.close();
                }
                if (client != null) {
                    client.close();
                }
            } finally {
                Stats stats = new Stats();
                if (client != null) {
                    stats.aggregate(client.stats());
                    client = null;
                }
                if (scrollQuery != null) {
                    stats.aggregate(scrollQuery.stats());
                    scrollQuery = null;
                }
                ReportingUtils.report(progressable, stats);
            }
        }
        @Override
        public boolean next(K key, V value) throws IOException {
            if (scrollQuery == null) {
                if (beat != null) {
                    beat.start();
                }
                scrollQuery = queryBuilder.build(client, scrollReader);
                size = scrollQuery.getSize();
                if (log.isTraceEnabled()) {
                    log.trace(String.format("Received scroll [%s],  size [%d] for query [%s]", scrollQuery, size, queryBuilder));
                }
            }
            boolean hasNext = scrollQuery.hasNext();
            if (!hasNext) {
                return false;
            }
            Object[] next = scrollQuery.next();
            // NB: the assignments are not strictly needed since setCurrentKey/setCurrentValue
            // update the writable content in place; they are kept for consistency
            currentKey = setCurrentKey(key, next[0]);
            currentValue = setCurrentValue(value, next[1]);
            // keep on counting
            read++;
            return true;
        }
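        /*
         * Consumption sketch for the old (mapred) API (illustrative only; the reader would be
         * obtained from getRecordReader and the casts depend on the concrete subclass in use):
         *
         *   Text key = (Text) reader.createKey();
         *   MapWritable value = (MapWritable) reader.createValue();
         *   while (reader.next(key, value)) {
         *       // key   -> document id
         *       // value -> document source, as field name -> Writable value
         *   }
         *   reader.close();
         */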
        @Override
        public abstract K createKey();
        @Override
        public abstract V createValue();
        /**
         * Sets the current key.
         *
         * @param hadoopKey hadoop key
         * @param object the actual value to read
         * @return returns the key to be used; needed in scenarios where the key is immutable (as in Pig)
         */
        protected abstract K setCurrentKey(K hadoopKey, Object object);
        /**
         * Sets the current value.
         *
         * @param hadoopValue hadoop value
         * @param object the actual value to read
         * @return returns the value to be used; needed in scenarios where the passed value is immutable (as in Pig)
         */
        protected abstract V setCurrentValue(V hadoopValue, Object object);
        @Override
        public long getPos() {
            return read;
        }
    }
    protected static abstract class AbstractWritableEsInputRecordReader<V> extends EsInputRecordReader<Text, V> {
        public AbstractWritableEsInputRecordReader() {
            super();
        }
        public AbstractWritableEsInputRecordReader(org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
            super(split, job, reporter);
        }
        @Override
        public Text createKey() {
            return new Text();
        }
        @Override
        protected Text setCurrentKey(Text hadoopKey, Object object) {
            if (hadoopKey != null) {
                hadoopKey.set(object.toString());
            }
            return hadoopKey;
        }
    }
    protected static class WritableEsInputRecordReader extends AbstractWritableEsInputRecordReader<Map<Writable, Writable>> {