
org.elasticsearch.hadoop.mr.EsInputFormat

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.hadoop.mr;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.Progressable;
import org.elasticsearch.hadoop.cfg.HadoopSettings;
import org.elasticsearch.hadoop.cfg.HadoopSettingsManager;
import org.elasticsearch.hadoop.cfg.Settings;
import org.elasticsearch.hadoop.mr.compat.CompatHandler;
import org.elasticsearch.hadoop.mr.security.HadoopUserProvider;
import org.elasticsearch.hadoop.rest.InitializationUtils;
import org.elasticsearch.hadoop.rest.PartitionDefinition;
import org.elasticsearch.hadoop.rest.RestRepository;
import org.elasticsearch.hadoop.rest.RestService;
import org.elasticsearch.hadoop.rest.RestService.PartitionReader;
import org.elasticsearch.hadoop.rest.ScrollQuery;
import org.elasticsearch.hadoop.rest.SearchRequestBuilder;
import org.elasticsearch.hadoop.rest.stats.Stats;
import org.elasticsearch.hadoop.serialization.ScrollReader;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;

/**
 * ElasticSearch {@link InputFormat} for streaming data (typically based on a query) from ElasticSearch.
 * Returns the document ID as key and its content as value.
 *
 * This class implements both the "old" (org.apache.hadoop.mapred) and the "new" (org.apache.hadoop.mapreduce) API.
 */
public class EsInputFormat<K, V> extends InputFormat<K, V> implements org.apache.hadoop.mapred.InputFormat<K, V> {

    private static Log log = LogFactory.getLog(EsInputFormat.class);

    protected static class EsInputSplit extends InputSplit implements org.apache.hadoop.mapred.InputSplit {
        private PartitionDefinition partition;

        public EsInputSplit() {}

        public EsInputSplit(PartitionDefinition partition) {
            this.partition = partition;
        }

        @Override
        public long getLength() {
            // TODO: can this be computed easily?
            return 1l;
        }

        @Override
        public String[] getLocations() {
            return partition.getHostNames();
        }

        @Override
        public void write(DataOutput out) throws IOException {
            partition.write(out);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            partition = new PartitionDefinition(in);
        }

        public PartitionDefinition getPartition() {
            return partition;
        }

        @Override
        public String toString() {
            return "EsInputSplit{" + (partition == null ? "NULL" : partition.toString()) + "}";
        }
    }

    protected static abstract class EsInputRecordReader<K, V> extends RecordReader<K, V> implements org.apache.hadoop.mapred.RecordReader<K, V> {

        private int read = 0;
        private EsInputSplit esSplit;
        private ScrollReader scrollReader;

        private RestRepository client;
        private SearchRequestBuilder queryBuilder;
        private ScrollQuery scrollQuery;

        // reuse objects
        private K currentKey;
        private V currentValue;

        private long size = 0;

        private HeartBeat beat;
        private Progressable progressable;

        // default constructor used by the NEW api
        public EsInputRecordReader() {
        }

        // constructor used by the old API
        public EsInputRecordReader(org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
            reporter.setStatus(split.toString());
            init((EsInputSplit) split, job, reporter);
        }

        // new API init call
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
            org.elasticsearch.hadoop.mr.compat.TaskAttemptContext compatContext = CompatHandler.taskAttemptContext(context);
            compatContext.setStatus(split.toString());
            init((EsInputSplit) split, compatContext.getConfiguration(), compatContext);
        }

        void init(EsInputSplit esSplit, Configuration cfg, Progressable progressable) {
            // get a copy to override the host/port
            Settings settings = HadoopSettingsManager.loadFrom(cfg).copy().load(esSplit.getPartition().getSerializedSettings());

            if (log.isTraceEnabled()) {
                log.trace(String.format("Init shard reader from cfg %s", HadoopCfgUtils.asProperties(cfg)));
                log.trace(String.format("Init shard reader w/ settings %s", settings));
            }

            this.esSplit = esSplit;

            // initialize mapping/ scroll reader
            InitializationUtils.setValueReaderIfNotSet(settings, WritableValueReader.class, log);
            InitializationUtils.setUserProviderIfNotSet(settings, HadoopUserProvider.class, log);

            PartitionDefinition part = esSplit.getPartition();
            PartitionReader partitionReader = RestService.createReader(settings, part, log);

            this.scrollReader = partitionReader.scrollReader;
            this.client = partitionReader.client;
            this.queryBuilder = partitionReader.queryBuilder;

            this.progressable = progressable;

            // in Hadoop-like envs (Spark) the progressable might be null and thus the heart-beat is not needed
            if (progressable != null) {
                beat = new HeartBeat(progressable, cfg, settings.getHeartBeatLead(), log);
            }

            if (log.isDebugEnabled()) {
                log.debug(String.format("Initializing RecordReader for [%s]", esSplit));
            }
        }

        @Override
        public boolean nextKeyValue() throws IOException {
            // new API call routed to old API
            // under the new API always create new objects since consumers can (and sometimes will) modify them
            currentKey = createKey();
            currentValue = createValue();

            return next(currentKey, currentValue);
        }

        @Override
        public K getCurrentKey() throws IOException {
            return currentKey;
        }

        @Override
        public V getCurrentValue() {
            return currentValue;
        }

        @Override
        public float getProgress() {
            return size == 0 ? 0 : ((float) getPos()) / size;
        }

        @Override
        public void close() throws IOException {
            try {
                if (log.isDebugEnabled()) {
                    log.debug(String.format("Closing RecordReader for [%s]", esSplit));
                }

                if (beat != null) {
                    beat.stop();
                }

                if (scrollQuery != null) {
                    scrollQuery.close();
                }

                if (client != null) {
                    client.close();
                }
            } finally {
                Stats stats = new Stats();
                if (client != null) {
                    stats.aggregate(client.stats());
                    client = null;
                }
                if (scrollQuery != null) {
                    stats.aggregate(scrollQuery.stats());
                    scrollQuery = null;
                }
                ReportingUtils.report(progressable, stats);
            }
        }

        @Override
        public boolean next(K key, V value) throws IOException {
            if (scrollQuery == null) {
                if (beat != null) {
                    beat.start();
                }

                scrollQuery = queryBuilder.build(client, scrollReader);
                size = scrollQuery.getSize();

                if (log.isTraceEnabled()) {
                    log.trace(String.format("Received scroll [%s], size [%d] for query [%s]", scrollQuery, size, queryBuilder));
                }
            }

            boolean hasNext = scrollQuery.hasNext();

            if (!hasNext) {
                return false;
            }

            Object[] next = scrollQuery.next();

            // NB: the left-hand assignments are not strictly needed since the methods override
            // the writable content; however, they are kept below for consistency
            currentKey = setCurrentKey(key, next[0]);
            currentValue = setCurrentValue(value, next[1]);

            // keep on counting
            read++;
            return true;
        }

        @Override
        public abstract K createKey();

        @Override
        public abstract V createValue();

        /**
         * Sets the current key.
         *
         * @param hadoopKey hadoop key
         * @param object the actual value to read
         * @return returns the key to be used; needed in scenarios where the key is immutable (like Pig)
         */
        protected abstract K setCurrentKey(K hadoopKey, Object object);

        /**
         * Sets the current value.
         *
         * @param hadoopValue hadoop value
         * @param object the actual value to read
         * @return returns the value to be used; needed in scenarios where the passed value is immutable (like Pig)
         */
        protected abstract V setCurrentValue(V hadoopValue, Object object);

        @Override
        public long getPos() {
            return read;
        }
    }

    protected static abstract class AbstractWritableEsInputRecordReader<V> extends EsInputRecordReader<Text, V> {

        public AbstractWritableEsInputRecordReader() {
            super();
        }

        public AbstractWritableEsInputRecordReader(org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
            super(split, job, reporter);
        }

        @Override
        public Text createKey() {
            return new Text();
        }

        @Override
        protected Text setCurrentKey(Text hadoopKey, Object object) {
            if (hadoopKey != null) {
                hadoopKey.set(object.toString());
            }
            return hadoopKey;
        }
    }

    protected static class WritableEsInputRecordReader extends AbstractWritableEsInputRecordReader<Map<Writable, Writable>> {

        private boolean useLinkedMapWritable = true;

        public WritableEsInputRecordReader() {
            super();
        }

        public WritableEsInputRecordReader(org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
            super(split, job, reporter);
        }

        @Override
        void init(EsInputSplit esSplit, Configuration cfg, Progressable progressable) {
            useLinkedMapWritable = (!MapWritable.class.getName().equals(HadoopCfgUtils.getMapValueClass(cfg)));
            super.init(esSplit, cfg, progressable);
        }

        @Override
        public Map<Writable, Writable> createValue() {
            return (useLinkedMapWritable ? new LinkedMapWritable() : new MapWritable());
        }

        @SuppressWarnings("unchecked")
        @Override
        protected Map<Writable, Writable> setCurrentValue(Map<Writable, Writable> hadoopValue, Object object) {
            if (hadoopValue != null) {
                hadoopValue.clear();
                Map<Writable, Writable> val = (Map<Writable, Writable>) object;
                hadoopValue.putAll(val);
            }
            return hadoopValue;
        }
    }

    protected static class JsonWritableEsInputRecordReader extends AbstractWritableEsInputRecordReader<Text> {

        public JsonWritableEsInputRecordReader() {
            super();
        }

        public JsonWritableEsInputRecordReader(org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
            super(split, job, reporter);
        }

        @Override
        public Text createValue() {
            return new Text();
        }

        @Override
        protected Text setCurrentValue(Text hadoopValue, Object object) {
            if (hadoopValue != null) {
                hadoopValue.set(object.toString());
            }
            return hadoopValue;
        }
    }

    //
    // new API - just delegates to the Old API
    //
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        JobConf conf = HadoopCfgUtils.asJobConf(CompatHandler.jobContext(context).getConfiguration());
        // NOTE: this method expects a ShardInputSplit to be returned (which implements both the old and the new API).
        return Arrays.asList((InputSplit[]) getSplits(conf, conf.getNumMapTasks()));
    }

    @SuppressWarnings("unchecked")
    @Override
    public EsInputRecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) {
        return (EsInputRecordReader<K, V>) (isOutputAsJson(CompatHandler.taskAttemptContext(context).getConfiguration())
                ? new JsonWritableEsInputRecordReader()
                : new WritableEsInputRecordReader());
    }

    //
    // Old API - if this method is replaced, make sure to return a new/old-API compatible InputSplit
    //

    // Note: data written to the JobConf will be silently discarded
    @Override
    public org.apache.hadoop.mapred.InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        Settings settings = HadoopSettingsManager.loadFrom(job);
        Collection<PartitionDefinition> partitions = RestService.findPartitions(settings, log);
        EsInputSplit[] splits = new EsInputSplit[partitions.size()];

        int index = 0;
        for (PartitionDefinition part : partitions) {
            splits[index++] = new EsInputSplit(part);
        }
        log.info(String.format("Created [%d] splits", splits.length));
        return splits;
    }

    @SuppressWarnings("unchecked")
    @Override
    public EsInputRecordReader<K, V> getRecordReader(org.apache.hadoop.mapred.InputSplit split, JobConf job, Reporter reporter) {
        return (EsInputRecordReader<K, V>) (isOutputAsJson(job)
                ? new JsonWritableEsInputRecordReader(split, job, reporter)
                : new WritableEsInputRecordReader(split, job, reporter));
    }

    protected boolean isOutputAsJson(Configuration cfg) {
        return new HadoopSettings(cfg).getOutputAsJson();
    }
}
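
/*
 * Usage sketch (not part of the original source): a minimal map-only driver showing how this
 * InputFormat might be wired into a job through the "new" (org.apache.hadoop.mapreduce) API.
 * The node address, resource name, query, job name, and output path below are illustrative
 * assumptions; keys arrive as Text document ids and values as MapWritable/LinkedMapWritable
 * documents, as produced by WritableEsInputRecordReader above.
 */
class EsInputFormatUsageSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("es.nodes", "localhost:9200");      // assumed Elasticsearch node
        conf.set("es.resource", "my-index/my-type"); // assumed index/type to read from
        conf.set("es.query", "?q=*");                // assumed match-all query

        org.apache.hadoop.mapreduce.Job job = org.apache.hadoop.mapreduce.Job.getInstance(conf, "es-input-sketch");
        job.setJarByClass(EsInputFormatUsageSketch.class);
        job.setInputFormatClass(EsInputFormat.class);

        // identity mapper, no reducers: each document id/content pair is written straight to the output
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(MapWritable.class);
        org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(
                job, new org.apache.hadoop.fs.Path("/tmp/es-output")); // assumed output path

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}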




