org.apache.pig.AlgebraicEvalFunc Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig;
import java.io.IOException;
import org.apache.pig.Algebraic;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigContext;
import org.apache.pig.tools.counters.PigCounterHelper;
/**
* This class is used to provide a free implementation of the Accumulator interface
* and EvalFunc class in the case of an Algebraic function. Instead of having to provide
* redundant implementations for Accumulator and EvalFunc, implementing the
* getInitial, getIntermed, and getFinal methods (which implies implementing the static classes
* they reference) will give you an implementation of each of those for free.
* One key thing to note is that if a subclass of AlgebraicEvalFunc wishes to use any constructor
* arguments, it MUST call super(args).
*
* IMPORTANT: the implementation of the Accumulator interface that this class provides is good,
* but it is simulated. For maximum efficiency, it is important to manually implement the accumulator
* interface. See {@link Accumulator} for more information on how to do so.
*/
public abstract class AlgebraicEvalFunc extends AccumulatorEvalFunc implements Algebraic {
private EvalFunc initEvalFunc;
private EvalFunc intermedEvalFunc;
private EvalFunc finalEvalFunc;
private static final BagFactory mBagFactory = BagFactory.getInstance();
private static final TupleFactory mTupleFactory = TupleFactory.getInstance();
private DataBag intermediateDB;
private DataBag wrapDB;
private DataBag accumDB;
private Tuple argTuple;
private Tuple accumTup;
/**
* This represents the number of elements an intermediate bag must have
* in order for the intermediate EvalFunc to be used on it.
*/
private int bagCombineThreshold = 1000;
/**
* This represents the factor by which the result of applying the intermediate
* EvalFunc must shrink the data in order for combining at this stage to be
* deemed "worth it."
*/
private int combineFactor = 2;
private PigCounterHelper pigCounterHelper = new PigCounterHelper();
private boolean combine;
private String[] constructorArgs;
/**
* It is key that if a subclass has a constructor, that it calls super(args...) or else
* this class will not instantiate properly.
*/
public AlgebraicEvalFunc(String... constructorArgs) { this.constructorArgs = constructorArgs; }
/**
* This must be implement as per a normal Algebraic interface. See {@link Algebraic} for
* more information.
*/
@Override
public abstract String getFinal();
/**
* This must be implement as per a normal Algebraic interface. See {@link Algebraic} for
* more information.
*/
@Override
public abstract String getInitial();
/**
* This must be implement as per a normal Algebraic interface. See {@link Algebraic} for
* more information.
*/
@Override
public abstract String getIntermed();
private boolean init = false;
/**
* This helper function instantiates an EvalFunc given its String class name.
*/
private EvalFunc> makeEvalFunc(String base) {
StringBuffer sb = new StringBuffer();
sb.append(base).append("(");
boolean first = true;
for (String s : constructorArgs) {
if (!first) sb.append(",");
else first = false;
sb.append("'").append(s).append("'");
}
sb.append(")");
return (EvalFunc>)PigContext.instantiateFuncFromSpec(sb.toString());
}
/**
* This is the free accumulate implementation based on the static classes provided
* by the Algebraic static classes. This implemention works by leveraging the
* initial, intermediate, and final classes provided by the algebraic interface.
* The exec function of the Initial EvalFunc will be called on every Tuple of the input
* and the output will be collected in an intermediate state. Periodically, this intermediate
* state will have the Intermediate EvalFunc called on it 1 or more times. The Final EvalFunc
* is not called until getValue() is called.
*/
@SuppressWarnings("unchecked")
@Override
public void accumulate(Tuple input) throws IOException {
if (!init) {
intermediateDB = mBagFactory.newDefaultBag();
wrapDB = mBagFactory.newDefaultBag();
accumDB = mBagFactory.newDefaultBag();
argTuple = mTupleFactory.newTuple(1);
argTuple.set(0, wrapDB);
accumTup = mTupleFactory.newTuple(1);
accumTup.set(0,accumDB);
initEvalFunc = (EvalFunc)makeEvalFunc(getInitial());
intermedEvalFunc = (EvalFunc)makeEvalFunc(getIntermed());
finalEvalFunc = (EvalFunc)makeEvalFunc(getFinal());
combine = true;
init = true;
}
accumDB.clear();
for (Tuple t : (DataBag)input.get(0)) {
wrapDB.clear();
wrapDB.add(t);
accumDB.add(initEvalFunc.exec(argTuple));
}
intermediateDB.add(intermedEvalFunc.exec(accumTup));
if (combine && intermediateDB.size() > bagCombineThreshold) {
long initialSizeEstimate = intermediateDB.getMemorySize();
DataBag newIntermediateDB = mBagFactory.newDefaultBag();
Tuple t = mTupleFactory.newTuple(1);
t.set(0, intermediateDB);
newIntermediateDB.add(intermedEvalFunc.exec(t));
intermediateDB = newIntermediateDB;
long newSizeEstimate = intermediateDB.getMemorySize();
pigCounterHelper.incrCounter("AlgebraicEvalFunc", "InitialSizeEst", initialSizeEstimate);
pigCounterHelper.incrCounter("AlgebraicEvalFunc", "PostCombineSizeEst", newSizeEstimate);
pigCounterHelper.incrCounter("AlgebraicEvalFunc", "CombineApply", 1L);
if (combineFactor * newSizeEstimate > initialSizeEstimate) {
combine = false;
pigCounterHelper.incrCounter("AlgebraicEvalFunc", "CombineShutoff", 1L);
}
}
}
/**
* Per the Accumulator interface, this clears all of the variables used in the implementation.
*/
@Override
public void cleanup() {
intermediateDB = null;
wrapDB = null;
accumDB = null;
argTuple = null;
accumTup = null;
initEvalFunc = null;
intermedEvalFunc = null;
finalEvalFunc = null;
init = false;
}
/**
* This function returns the ultimate result. It is when getValue() is called that
* the Final EvalFunc's exec function is called on the accumulated data.
*/
@Override
public T getValue() {
try {
return finalEvalFunc.exec(mTupleFactory.newTuple(intermediateDB));
} catch (IOException e) {
throw new RuntimeException("Error in AlgebraicEvalFunc evaluating final method");
}
}
}