
org.elasticsearch.hadoop.mr.EsInputFormat

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.hadoop.mr;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.Progressable;
import org.elasticsearch.hadoop.EsHadoopIllegalArgumentException;
import org.elasticsearch.hadoop.cfg.ConfigurationOptions;
import org.elasticsearch.hadoop.cfg.FieldPresenceValidation;
import org.elasticsearch.hadoop.cfg.Settings;
import org.elasticsearch.hadoop.cfg.SettingsManager;
import org.elasticsearch.hadoop.mr.compat.CompatHandler;
import org.elasticsearch.hadoop.rest.InitializationUtils;
import org.elasticsearch.hadoop.rest.QueryBuilder;
import org.elasticsearch.hadoop.rest.RestRepository;
import org.elasticsearch.hadoop.rest.ScrollQuery;
import org.elasticsearch.hadoop.rest.stats.Stats;
import org.elasticsearch.hadoop.serialization.ScrollReader;
import org.elasticsearch.hadoop.serialization.builder.ValueReader;
import org.elasticsearch.hadoop.serialization.dto.Node;
import org.elasticsearch.hadoop.serialization.dto.Shard;
import org.elasticsearch.hadoop.serialization.dto.mapping.Field;
import org.elasticsearch.hadoop.serialization.dto.mapping.MappingUtils;
import org.elasticsearch.hadoop.util.IOUtils;
import org.elasticsearch.hadoop.util.ObjectUtils;
import org.elasticsearch.hadoop.util.StringUtils;
import org.elasticsearch.hadoop.util.Version;

/**
 * ElasticSearch {@link InputFormat} for streaming data (typically based on a query) from ElasticSearch.
 * Returns the document ID as key and its content as value.
 *
 * This class implements both the "old" (org.apache.hadoop.mapred) and the "new" (org.apache.hadoop.mapreduce) API.
 */
public class EsInputFormat<K, V> extends InputFormat<K, V> implements org.apache.hadoop.mapred.InputFormat<K, V> {

    private static Log log = LogFactory.getLog(EsInputFormat.class);

    protected static class ShardInputSplit extends InputSplit implements org.apache.hadoop.mapred.InputSplit {

        private String nodeIp;
        private int httpPort;
        private String nodeId;
        private String nodeName;
        private String shardId;
        private String mapping;
        private String settings;

        public ShardInputSplit() {}

        public ShardInputSplit(String nodeIp, int httpPort, String nodeId, String nodeName, Integer shard, String mapping, String settings) {
            this.nodeIp = nodeIp;
            this.httpPort = httpPort;
            this.nodeId = nodeId;
            this.nodeName = nodeName;
            this.shardId = shard.toString();
            this.mapping = mapping;
            this.settings = settings;
        }

        @Override
        public long getLength() {
            // TODO: can this be computed easily?
            return 1l;
        }

        @Override
        public String[] getLocations() {
            // TODO: check whether the host name needs to be used instead
            return new String[] { nodeIp };
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(nodeIp);
            out.writeInt(httpPort);
            out.writeUTF(nodeId);
            out.writeUTF(nodeName);
            out.writeUTF(shardId);
            // avoid using writeUTF since the mapping can be longer than 65K
            byte[] utf = StringUtils.toUTF(mapping);
            out.writeInt(utf.length);
            out.write(utf);
            utf = StringUtils.toUTF(settings);
            out.writeInt(utf.length);
            out.write(utf);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            nodeIp = in.readUTF();
            httpPort = in.readInt();
            nodeId = in.readUTF();
            nodeName = in.readUTF();
            shardId = in.readUTF();
            int length = in.readInt();
            byte[] utf = new byte[length];
            in.readFully(utf);
            mapping = StringUtils.asUTFString(utf);
            length = in.readInt();
            utf = new byte[length];
            in.readFully(utf);
            settings = StringUtils.asUTFString(utf);
        }

        @Override
        public String toString() {
            StringBuilder builder = new StringBuilder();
            builder.append("ShardInputSplit [node=[").append(nodeId).append("/").append(nodeName)
                    .append("|").append(nodeIp).append(":").append(httpPort)
                    .append("],shard=").append(shardId).append("]");
            return builder.toString();
        }
    }

    protected static abstract class ShardRecordReader<K, V> extends RecordReader<K, V> implements org.apache.hadoop.mapred.RecordReader<K, V> {

        private int read = 0;
        private ShardInputSplit esSplit;
        private ScrollReader scrollReader;

        private RestRepository client;
        private QueryBuilder queryBuilder;
        private ScrollQuery scrollQuery;

        // reuse objects
        private K currentKey;
        private V currentValue;

        private long size = 0;

        private HeartBeat beat;
        private Progressable progressable;

        // default constructor used by the NEW api
        public ShardRecordReader() {
        }

        // constructor used by the old API
        public ShardRecordReader(org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
            reporter.setStatus(split.toString());
            init((ShardInputSplit) split, job, reporter);
        }

        // new API init call
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
            org.elasticsearch.hadoop.mr.compat.TaskAttemptContext compatContext = CompatHandler.taskAttemptContext(context);
            compatContext.setStatus(split.toString());
            init((ShardInputSplit) split, compatContext.getConfiguration(), compatContext);
        }

        void init(ShardInputSplit esSplit, Configuration cfg, Progressable progressable) {
            // get a copy to override the host/port
            Settings settings = SettingsManager.loadFrom(cfg).copy().load(esSplit.settings);

            if (log.isTraceEnabled()) {
                log.trace(String.format("Init shard reader from cfg %s", HadoopCfgUtils.asProperties(cfg)));
                log.trace(String.format("Init shard reader w/ settings %s", esSplit.settings));
            }

            // override the global settings to communicate directly with the target node
            settings.setHosts(esSplit.nodeIp).setPort(esSplit.httpPort);

            this.esSplit = esSplit;

            // initialize mapping/ scroll reader
            InitializationUtils.setValueReaderIfNotSet(settings, WritableValueReader.class, log);
            ValueReader reader = ObjectUtils.instantiate(settings.getSerializerValueReaderClassName(), settings);

            String mappingData = esSplit.mapping;

            Field mapping = null;

            if (StringUtils.hasText(mappingData)) {
                mapping = IOUtils.deserializeFromBase64(mappingData);
            }
            else {
                log.warn(String.format("No mapping found for [%s] - either no index exists or the split configuration has been corrupted", esSplit));
            }

            scrollReader = new ScrollReader(reader, mapping);

            // heart-beat
            beat = new HeartBeat(progressable, cfg, settings.getHeartBeatLead(), log);

            // initialize REST client
            client = new RestRepository(settings);

            queryBuilder = QueryBuilder.query(settings)
                    .shard(esSplit.shardId)
                    .onlyNode(esSplit.nodeId);

            queryBuilder.fields(settings.getScrollFields());

            this.progressable = progressable;

            if (log.isDebugEnabled()) {
                log.debug(String.format("Initializing RecordReader for [%s]", esSplit));
            }
        }

        @Override
        public boolean nextKeyValue() throws IOException {
            // new API call routed to old API
            if (currentKey == null) {
                currentKey = createKey();
            }
            if (currentValue == null) {
                currentValue = createValue();
            }

            // FIXME: does the new API mandate a new instance each time (?)
            return next(currentKey, currentValue);
        }

        @Override
        public K getCurrentKey() throws IOException {
            return currentKey;
        }

        @Override
        public V getCurrentValue() {
            return currentValue;
        }

        @Override
        public float getProgress() {
            return size == 0 ? 0 : ((float) getPos()) / size;
        }

        @Override
        public void close() throws IOException {
            try {
                if (log.isDebugEnabled()) {
                    log.debug(String.format("Closing RecordReader for [%s]", esSplit));
                }

                if (beat != null) {
                    beat.stop();
                }

                if (scrollQuery != null) {
                    scrollQuery.close();
                }

                if (client != null) {
                    client.close();
                }
            } finally {
                Stats stats = new Stats();
                if (client != null) {
                    stats.aggregate(client.stats());
                    client = null;
                }
                if (scrollQuery != null) {
                    stats.aggregate(scrollQuery.stats());
                    scrollQuery = null;
                }
                ReportingUtils.report(progressable, stats);
            }
        }

        @Override
        public boolean next(K key, V value) throws IOException {
            if (scrollQuery == null) {
                beat.start();

                scrollQuery = queryBuilder.build(client, scrollReader);
                size = scrollQuery.getSize();

                if (log.isTraceEnabled()) {
                    log.trace(String.format("Received scroll [%s], size [%d] for query [%s]", scrollQuery, size, queryBuilder));
                }
            }

            boolean hasNext = scrollQuery.hasNext();

            if (!hasNext) {
                return false;
            }

            Object[] next = scrollQuery.next();
            currentKey = setCurrentKey(currentKey, key, next[0]);
            currentValue = setCurrentValue(currentValue, value, next[1]);

            // keep on counting
            read++;
            return true;
        }

        @Override
        public abstract K createKey();

        @Override
        public abstract V createValue();

        protected abstract K setCurrentKey(K oldApiKey, K newApiKey, Object object);

        protected abstract V setCurrentValue(V oldApiValue, V newApiKey, Object object);

        @Override
        public long getPos() {
            return read;
        }
    }

    protected static class WritableShardRecordReader extends ShardRecordReader<Text, Map<Writable, Writable>> {

        private boolean useLinkedMapWritable = true;

        public WritableShardRecordReader() {
            super();
        }

        public WritableShardRecordReader(org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
            super(split, job, reporter);
        }

        @Override
        void init(ShardInputSplit esSplit, Configuration cfg, Progressable progressable) {
            useLinkedMapWritable = (!MapWritable.class.getName().equals(HadoopCfgUtils.getMapValueClass(cfg)));
            super.init(esSplit, cfg, progressable);
        }

        @Override
        public Text createKey() {
            return new Text();
        }

        @Override
        public Map<Writable, Writable> createValue() {
            return (useLinkedMapWritable ? new LinkedMapWritable() : new MapWritable());
        }

        @Override
        protected Text setCurrentKey(Text oldApiKey, Text newApiKey, Object object) {
            String val = object.toString();
            if (oldApiKey == null) {
                oldApiKey = new Text();
                oldApiKey.set(val);
            }

            // new API might not be used
            if (newApiKey != null) {
                newApiKey.set(val);
            }
            return oldApiKey;
        }

        @SuppressWarnings("unchecked")
        @Override
        protected Map<Writable, Writable> setCurrentValue(Map<Writable, Writable> oldApiValue, Map<Writable, Writable> newApiKey, Object object) {
            Map<Writable, Writable> val = (Map<Writable, Writable>) object;
            if (newApiKey != null) {
                newApiKey.clear();
                newApiKey.putAll(val);
            }
            return val;
        }
    }

    //
    // new API - just delegates to the Old API
    //
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        org.elasticsearch.hadoop.mr.compat.JobContext compatJobContext = CompatHandler.jobContext(context);
        JobConf conf = HadoopCfgUtils.asJobConf(compatJobContext.getConfiguration());
        // NOTE: this method expects a ShardInputSplit to be returned (which implements both the old and the new API).
        return Arrays.asList((InputSplit[]) getSplits(conf, conf.getNumMapTasks()));
    }

    @SuppressWarnings("unchecked")
    @Override
    public ShardRecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) {
        return (ShardRecordReader<K, V>) new WritableShardRecordReader();
    }

    //
    // Old API - if this method is replaced, make sure to return a new/old-API compatible InputSplit
    //

    // Note: data written to the JobConf will be silently discarded
    @Override
    public org.apache.hadoop.mapred.InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        Settings settings = SettingsManager.loadFrom(job);
        InitializationUtils.discoverNodesIfNeeded(settings, log);
        InitializationUtils.discoverEsVersion(settings, log);

        String savedSettings = settings.save();

        RestRepository client = new RestRepository(settings);
        boolean indexExists = client.indexExists(true);
        Map<Shard, Node> targetShards = null;

        if (!indexExists) {
            if (settings.getIndexReadMissingAsEmpty()) {
                log.info(String.format("Index [%s] missing - treating it as empty", settings.getResourceRead()));
                targetShards = Collections.emptyMap();
            }
            else {
                client.close();
                throw new EsHadoopIllegalArgumentException(
                        String.format("Index [%s] missing and settings [%s] is set to false", settings.getResourceRead(), ConfigurationOptions.ES_FIELD_READ_EMPTY_AS_NULL));
            }
        }
        else {
            targetShards = client.getReadTargetShards();
            if (log.isTraceEnabled()) {
                log.trace("Creating splits for shards " + targetShards);
            }
        }

        Version.logVersion();
        log.info(String.format("Reading from [%s]", settings.getResourceRead()));

        String savedMapping = null;
        if (!targetShards.isEmpty()) {
            Field mapping = client.getMapping();
            log.info(String.format("Discovered mapping {%s} for [%s]", mapping, settings.getResourceRead()));
            // validate if possible
            FieldPresenceValidation validation = settings.getFieldExistanceValidation();
            if (validation.isRequired()) {
                MappingUtils.validateMapping(settings.getScrollFields(), mapping, validation, log);
            }

            //TODO: implement this more efficiently
            savedMapping = IOUtils.serializeToBase64(mapping);
        }

        client.close();

        ShardInputSplit[] splits = new ShardInputSplit[targetShards.size()];

        int index = 0;
        for (Entry<Shard, Node> entry : targetShards.entrySet()) {
            Shard shard = entry.getKey();
            Node node = entry.getValue();
            splits[index++] =
                    new ShardInputSplit(node.getIpAddress(), node.getHttpPort(), node.getId(), node.getName(), shard.getName(), savedMapping, savedSettings);
        }

        log.info(String.format("Created [%d] shard-splits", splits.length));
        return splits;
    }

    @SuppressWarnings("unchecked")
    @Override
    public ShardRecordReader<K, V> getRecordReader(org.apache.hadoop.mapred.InputSplit split, JobConf job, Reporter reporter) {
        return (ShardRecordReader<K, V>) new WritableShardRecordReader(split, job, reporter);
    }
}
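For context, a minimal old-API (org.apache.hadoop.mapred) job that reads through EsInputFormat could look like the sketch below. The node address, the index/type in es.resource, the query, and the output path are placeholder values; the sketch assumes the standard es.nodes/es.resource/es.query configuration keys and relies on Hadoop's default identity mapper, so it is illustrative rather than a definitive driver.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.elasticsearch.hadoop.mr.EsInputFormat;

public class EsInputFormatSketch {

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        conf.setJobName("es-input-format-sketch");

        // placeholder cluster address, index/type and query
        conf.set("es.nodes", "localhost:9200");
        conf.set("es.resource", "my-index/my-type");
        conf.set("es.query", "?q=*");

        // EsInputFormat emits the document id as a Text key and the
        // document body as a MapWritable/LinkedMapWritable value
        conf.setInputFormat(EsInputFormat.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(MapWritable.class);

        // with the default identity mapper the documents are written out as text
        FileOutputFormat.setOutputPath(conf, new Path(args[0]));

        JobClient.runJob(conf);
    }
}

Because EsInputFormat also extends the new-API org.apache.hadoop.mapreduce.InputFormat, the same class can be wired into a new-API job via Job.setInputFormatClass(EsInputFormat.class).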