All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.pig.AlgebraicEvalFunc Maven / Gradle / Ivy

There is a newer version: 0.17.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig;

import java.io.IOException;

import org.apache.pig.Algebraic;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigContext;
import org.apache.pig.tools.counters.PigCounterHelper;

/**
 * This class is used to provide a free implementation of the Accumulator interface
 * and EvalFunc class in the case of an Algebraic function. Instead of having to provide
 * redundant implementations for Accumulator and EvalFunc, implementing the
 * getInitial, getIntermed, and getFinal methods (which implies implementing the static classes
 * they reference) will give you an implementation of each of those for free. 

* One key thing to note is that if a subclass of AlgebraicEvalFunc wishes to use any constructor * arguments, it MUST call super(args). *

* IMPORTANT: the implementation of the Accumulator interface that this class provides is good, * but it is simulated. For maximum efficiency, it is important to manually implement the accumulator * interface. See {@link Accumulator} for more information on how to do so. */ public abstract class AlgebraicEvalFunc extends AccumulatorEvalFunc implements Algebraic { private EvalFunc initEvalFunc; private EvalFunc intermedEvalFunc; private EvalFunc finalEvalFunc; private static final BagFactory mBagFactory = BagFactory.getInstance(); private static final TupleFactory mTupleFactory = TupleFactory.getInstance(); private DataBag intermediateDB; private DataBag wrapDB; private DataBag accumDB; private Tuple argTuple; private Tuple accumTup; /** * This represents the number of elements an intermediate bag must have * in order for the intermediate EvalFunc to be used on it. */ private int bagCombineThreshold = 1000; /** * This represents the factor by which the result of applying the intermediate * EvalFunc must shrink the data in order for combining at this stage to be * deemed "worth it." */ private int combineFactor = 2; private PigCounterHelper pigCounterHelper = new PigCounterHelper(); private boolean combine; private String[] constructorArgs; /** * It is key that if a subclass has a constructor, that it calls super(args...) or else * this class will not instantiate properly. */ public AlgebraicEvalFunc(String... constructorArgs) { this.constructorArgs = constructorArgs; } /** * This must be implement as per a normal Algebraic interface. See {@link Algebraic} for * more information. */ @Override public abstract String getFinal(); /** * This must be implement as per a normal Algebraic interface. See {@link Algebraic} for * more information. */ @Override public abstract String getInitial(); /** * This must be implement as per a normal Algebraic interface. See {@link Algebraic} for * more information. */ @Override public abstract String getIntermed(); private boolean init = false; /** * This helper function instantiates an EvalFunc given its String class name. */ private EvalFunc makeEvalFunc(String base) { StringBuffer sb = new StringBuffer(); sb.append(base).append("("); boolean first = true; for (String s : constructorArgs) { if (!first) sb.append(","); else first = false; sb.append("'").append(s).append("'"); } sb.append(")"); return (EvalFunc)PigContext.instantiateFuncFromSpec(sb.toString()); } /** * This is the free accumulate implementation based on the static classes provided * by the Algebraic static classes. This implemention works by leveraging the * initial, intermediate, and final classes provided by the algebraic interface. * The exec function of the Initial EvalFunc will be called on every Tuple of the input * and the output will be collected in an intermediate state. Periodically, this intermediate * state will have the Intermediate EvalFunc called on it 1 or more times. The Final EvalFunc * is not called until getValue() is called. */ @SuppressWarnings("unchecked") @Override public void accumulate(Tuple input) throws IOException { if (!init) { intermediateDB = mBagFactory.newDefaultBag(); wrapDB = mBagFactory.newDefaultBag(); accumDB = mBagFactory.newDefaultBag(); argTuple = mTupleFactory.newTuple(1); argTuple.set(0, wrapDB); accumTup = mTupleFactory.newTuple(1); accumTup.set(0,accumDB); initEvalFunc = (EvalFunc)makeEvalFunc(getInitial()); intermedEvalFunc = (EvalFunc)makeEvalFunc(getIntermed()); finalEvalFunc = (EvalFunc)makeEvalFunc(getFinal()); combine = true; init = true; } accumDB.clear(); for (Tuple t : (DataBag)input.get(0)) { wrapDB.clear(); wrapDB.add(t); accumDB.add(initEvalFunc.exec(argTuple)); } intermediateDB.add(intermedEvalFunc.exec(accumTup)); if (combine && intermediateDB.size() > bagCombineThreshold) { long initialSizeEstimate = intermediateDB.getMemorySize(); DataBag newIntermediateDB = mBagFactory.newDefaultBag(); Tuple t = mTupleFactory.newTuple(1); t.set(0, intermediateDB); newIntermediateDB.add(intermedEvalFunc.exec(t)); intermediateDB = newIntermediateDB; long newSizeEstimate = intermediateDB.getMemorySize(); pigCounterHelper.incrCounter("AlgebraicEvalFunc", "InitialSizeEst", initialSizeEstimate); pigCounterHelper.incrCounter("AlgebraicEvalFunc", "PostCombineSizeEst", newSizeEstimate); pigCounterHelper.incrCounter("AlgebraicEvalFunc", "CombineApply", 1L); if (combineFactor * newSizeEstimate > initialSizeEstimate) { combine = false; pigCounterHelper.incrCounter("AlgebraicEvalFunc", "CombineShutoff", 1L); } } } /** * Per the Accumulator interface, this clears all of the variables used in the implementation. */ @Override public void cleanup() { intermediateDB = null; wrapDB = null; accumDB = null; argTuple = null; accumTup = null; initEvalFunc = null; intermedEvalFunc = null; finalEvalFunc = null; init = false; } /** * This function returns the ultimate result. It is when getValue() is called that * the Final EvalFunc's exec function is called on the accumulated data. */ @Override public T getValue() { try { return finalEvalFunc.exec(mTupleFactory.newTuple(intermediateDB)); } catch (IOException e) { throw new RuntimeException("Error in AlgebraicEvalFunc evaluating final method"); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy