org.apache.pig.pen.LocalMapReduceSimulator Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.pen;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceOper;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapBase;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapOnly;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduceCounter;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.plans.MROperPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLoad;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPackage;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.util.PlanHelper;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.builtin.ReadScalars;
import org.apache.pig.impl.io.FileLocalizer;
import org.apache.pig.impl.io.NullableTuple;
import org.apache.pig.impl.io.PigNullableWritable;
import org.apache.pig.impl.plan.DepthFirstWalker;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.ConfigurationValidator;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.Pair;
import org.apache.pig.newplan.logical.relational.LOLoad;
import org.apache.pig.newplan.logical.relational.LogicalRelationalOperator;
import org.apache.pig.pen.util.LineageTracer;
/**
* Main class that launches pig for Map Reduce
*
*/
public class LocalMapReduceSimulator {
private MapReduceLauncher launcher = new MapReduceLauncher();
private Map phyToMRMap = new HashMap();;
@SuppressWarnings("unchecked")
public void launchPig(PhysicalPlan php, Map baseData,
LineageTracer lineage,
IllustratorAttacher attacher,
ExampleGenerator eg,
PigContext pc) throws PigException, IOException, InterruptedException {
phyToMRMap.clear();
MROperPlan mrp = launcher.compile(php, pc);
ConfigurationValidator.validatePigProperties(pc.getProperties());
Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
JobControlCompiler jcc = new JobControlCompiler(pc, conf);
JobControl jc;
int numMRJobsCompl = 0;
DataBag input;
List> intermediateData = new ArrayList>();
Map jobToMroMap = jcc.getJobMroMap();
HashMap output = new HashMap();
Configuration jobConf;
// jc is null only when mrp.size == 0
boolean needFileInput;
final ArrayList emptyInpTargets = new ArrayList();
pc.getProperties().setProperty("pig.illustrating", "true");
String jtIdentifier = "" + System.currentTimeMillis();
int jobId = 0;
while(mrp.size() != 0) {
jc = jcc.compile(mrp, "Illustrator");
if(jc == null) {
throw new ExecException("Native execution is not supported");
}
List jobs = jc.getWaitingJobs();
for (Job job : jobs) {
jobId++;
jobConf = job.getJobConf();
FileLocalizer.setInitialized(false);
ArrayList> inpTargets =
(ArrayList>)
ObjectSerializer.deserialize(jobConf.get("pig.inpTargets"));
intermediateData.clear();
MapReduceOper mro = jobToMroMap.get(job);
PigSplit split = null;
List stores = null;
PhysicalOperator pack = null;
// revisit as there are new physical operators from MR compilation
if (!mro.mapPlan.isEmpty())
attacher.revisit(mro.mapPlan);
if (!mro.reducePlan.isEmpty()) {
attacher.revisit(mro.reducePlan);
pack = mro.reducePlan.getRoots().get(0);
}
List lds = PlanHelper.getPhysicalOperators(mro.mapPlan, POLoad.class);
if (!mro.mapPlan.isEmpty()) {
stores = PlanHelper.getPhysicalOperators(mro.mapPlan, POStore.class);
}
if (!mro.reducePlan.isEmpty()) {
if (stores == null)
stores = PlanHelper.getPhysicalOperators(mro.reducePlan, POStore.class);
else
stores.addAll(PlanHelper.getPhysicalOperators(mro.reducePlan, POStore.class));
}
for (POStore store : stores) {
output.put(store.getSFile().getFileName(), attacher.getDataMap().get(store));
}
OutputAttacher oa = new OutputAttacher(mro.mapPlan, output);
oa.visit();
if (!mro.reducePlan.isEmpty()) {
oa = new OutputAttacher(mro.reducePlan, output);
oa.visit();
}
int index = 0;
for (POLoad ld : lds) {
input = output.get(ld.getLFile().getFileName());
if (input == null && baseData != null) {
for (LogicalRelationalOperator lo : baseData.keySet()) {
if (((LOLoad) lo).getSchemaFile().equals(ld.getLFile().getFileName()))
{
input = baseData.get(lo);
break;
}
}
}
if (input != null)
mro.mapPlan.remove(ld);
}
int mapTaskId = 0;
for (POLoad ld : lds) {
// check newly generated data first
input = output.get(ld.getLFile().getFileName());
if (input == null && baseData != null) {
if (input == null && baseData != null) {
for (LogicalRelationalOperator lo : baseData.keySet()) {
if (((LOLoad) lo).getSchemaFile().equals(ld.getLFile().getFileName()))
{
input = baseData.get(lo);
break;
}
}
}
}
needFileInput = (input == null);
split = new PigSplit(null, index, needFileInput ? emptyInpTargets : inpTargets.get(index), 0);
++index;
Mapper map;
if (mro.reducePlan.isEmpty()) {
// map-only
map = new PigMapOnly.Map();
Mapper.Context context = ((PigMapOnly.Map) map)
.getIllustratorContext(jobConf, input, intermediateData, split);
if(mro.isCounterOperation()) {
if(mro.isRowNumber()) {
map = new PigMapReduceCounter.PigMapCounter();
}
context = ((PigMapReduceCounter.PigMapCounter) map).getIllustratorContext(jobConf, input, intermediateData, split);
}
((PigMapBase) map).setMapPlan(mro.mapPlan);
context.getConfiguration().set(MRConfiguration.TASK_ID, new TaskID(jtIdentifier, jobId, true, mapTaskId++).toString());
map.run(context);
} else {
if ("true".equals(jobConf.get("pig.usercomparator")))
map = new PigMapReduce.MapWithComparator();
else if (!"".equals(jobConf.get("pig.keyDistFile", "")))
map = new PigMapReduce.MapWithPartitionIndex();
else
map = new PigMapReduce.Map();
Mapper.Context context = ((PigMapBase) map)
.getIllustratorContext(jobConf, input, intermediateData, split);
((PigMapBase) map).setMapPlan(mro.mapPlan);
context.getConfiguration().set(MRConfiguration.TASK_ID, new TaskID(jtIdentifier, jobId, true, mapTaskId++).toString());
map.run(context);
}
}
if (!mro.reducePlan.isEmpty())
{
if (pack instanceof POPackage)
mro.reducePlan.remove(pack);
// reducer run
PigMapReduce.Reduce reduce;
if ("true".equals(jobConf.get("pig.usercomparator")))
reduce = new PigMapReduce.ReduceWithComparator();
else
reduce = new PigMapReduce.Reduce();
Reducer.Context
context = reduce.getIllustratorContext(job, intermediateData, (POPackage) pack);
if(mro.isCounterOperation()) {
reduce = new PigMapReduceCounter.PigReduceCounter();
context = ((PigMapReduceCounter.PigReduceCounter)reduce).getIllustratorContext(job, intermediateData, (POPackage) pack);
}
((PigMapReduce.Reduce) reduce).setReducePlan(mro.reducePlan);
context.getConfiguration().set(MRConfiguration.TASK_ID, new TaskID(jtIdentifier, jobId, false, 0).toString());
reduce.run(context);
}
for (PhysicalOperator key : mro.phyToMRMap.keySet())
for (PhysicalOperator value : mro.phyToMRMap.get(key))
phyToMRMap.put(key, value);
}
int removedMROp = jcc.updateMROpPlan(new LinkedList());
numMRJobsCompl += removedMROp;
}
jcc.reset();
}
private class OutputAttacher extends PhyPlanVisitor {
private Map outputBuffer;
OutputAttacher(PhysicalPlan plan, Map output) {
super(plan, new DepthFirstWalker(
plan));
this.outputBuffer = output;
}
@Override
public void visitUserFunc(POUserFunc userFunc) throws VisitorException {
if (userFunc.getFunc() != null && userFunc.getFunc() instanceof ReadScalars) {
((ReadScalars) userFunc.getFunc()).setOutputBuffer(outputBuffer);
}
}
}
public Map getPhyToMRMap() {
return phyToMRMap;
}
}