All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.pig.pen.LineageTrimmingVisitor Maven / Gradle / Ivy

There is a newer version: 0.17.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pig.pen;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.logicalLayer.LOCogroup;
import org.apache.pig.impl.logicalLayer.LOCross;
import org.apache.pig.impl.logicalLayer.LODistinct;
import org.apache.pig.impl.logicalLayer.LOFilter;
import org.apache.pig.impl.logicalLayer.LOForEach;
import org.apache.pig.impl.logicalLayer.LOLimit;
import org.apache.pig.impl.logicalLayer.LOLoad;
import org.apache.pig.impl.logicalLayer.LOSort;
import org.apache.pig.impl.logicalLayer.LOSplit;
import org.apache.pig.impl.logicalLayer.LOUnion;
import org.apache.pig.impl.logicalLayer.LOVisitor;
import org.apache.pig.impl.logicalLayer.LogicalOperator;
import org.apache.pig.impl.logicalLayer.LogicalPlan;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.IdentityHashSet;
import org.apache.pig.pen.util.LineageTracer;
import org.apache.pig.pen.util.MetricEvaluation;
import org.apache.pig.pen.util.PreOrderDepthFirstWalker;

public class LineageTrimmingVisitor extends LOVisitor {

    LogicalPlan plan = null;
    Map baseData = new HashMap();
    Map LogToPhyMap = null;
    PhysicalPlan physPlan = null;
    double completeness = 100.0;
    Log log = LogFactory.getLog(getClass());

    Map, Integer>> AffinityGroups = new HashMap, Integer>>();
    Map Lineage = new HashMap();

    boolean continueTrimming;
    PigContext pc;

    public LineageTrimmingVisitor(LogicalPlan plan,
            Map baseData,
            Map LogToPhyMap,
            PhysicalPlan physPlan, PigContext pc) {
        super(plan, new PreOrderDepthFirstWalker(
                plan));
        // this.baseData.putAll(baseData);
        this.baseData = baseData;
        this.plan = plan;
        this.LogToPhyMap = LogToPhyMap;
        this.pc = pc;
        this.physPlan = physPlan;
        init();
    }

    public void init() {

        DerivedDataVisitor visitor = new DerivedDataVisitor(plan, pc, baseData,
                LogToPhyMap, physPlan);
        try {
            visitor.visit();
        } catch (VisitorException e) {
            log.error(e.getMessage());
        }

        LineageTracer lineage = visitor.lineage;
        Lineage.put(plan.getLeaves().get(0), lineage);
        Map>> OpToEqClasses = visitor.OpToEqClasses;
        Collection> EqClasses = visitor.EqClasses;
        Map, Integer> affinityGroup = new HashMap, Integer>();
        for (IdentityHashSet set : EqClasses) {
            affinityGroup.put(set, 1);
        }
        AffinityGroups.put(plan.getLeaves().get(0), affinityGroup);
        completeness = MetricEvaluation.getCompleteness(null,
                visitor.derivedData, OpToEqClasses, true);
        LogToPhyMap = visitor.LogToPhyMap;
        continueTrimming = true;

    }

    @Override
    protected void visit(LOCogroup cg) throws VisitorException {
        if (continueTrimming) {
            Map, Integer> affinityGroups = null;

            continueTrimming = checkCompleteness(cg);
            
            DerivedDataVisitor visitor = null;
            LineageTracer lineage = null;
            // create affinity groups
            if (cg.getInputs().size() == 1) {
                affinityGroups = new HashMap, Integer>();
                LogicalOperator childOp = cg.getInputs().get(0);
                visitor = new DerivedDataVisitor(childOp, null, baseData,
                        LogToPhyMap, physPlan);
                try {
                    visitor.visit();
                } catch (VisitorException e) {
                    log.error(e.getMessage());
                }

                lineage = visitor.lineage;

                DataBag bag = visitor.evaluateIsolatedOperator(cg);
                for (Iterator it = bag.iterator(); it.hasNext();) {
                    DataBag field;
                    try {
                        field = (DataBag) it.next().get(1);
                    } catch (ExecException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                        log.error(e.getMessage());
                        throw new VisitorException(
                                "Error trimming operator COGROUP operator "
                                        + cg.getAlias()
                                        + "in example generator");
                    }
                    IdentityHashSet set = new IdentityHashSet();
                    affinityGroups.put(set, 2);
                    for (Iterator it1 = field.iterator(); it1.hasNext();) {
                        set.add(it1.next());
                    }
                }

                // add the equivalence classes obtained from derived data
                // creation
                for (IdentityHashSet set : visitor.EqClasses) {
                    affinityGroups.put(set, 1);
                }
                AffinityGroups.put(cg.getInputs().get(0), affinityGroups);
                Lineage.put(cg.getInputs().get(0), lineage);

            } else {
                List inputs = new LinkedList();
                visitor = new DerivedDataVisitor(cg, null, baseData,
                        LogToPhyMap, physPlan);
                affinityGroups = new HashMap, Integer>();
                for (int i = 0; i < cg.getInputs().size(); i++) {
                    // affinityGroups = new HashMap,
                    // Integer>();
                    LogicalOperator childOp = cg.getInputs().get(i);
                    // visitor = new DerivedDataVisitor(cg.getInputs().get(i),
                    // null, baseData, LogToPhyMap, physPlan);
                    visitor.setOperatorToEvaluate(childOp);
                    try {
                        visitor.visit();
                    } catch (VisitorException e) {
                        log.error(e.getMessage());
                    }
                    // Lineage.put(childOp, visitor.lineage);
                    inputs.add(visitor.derivedData.get(childOp));

                    for (IdentityHashSet set : visitor.EqClasses)
                        affinityGroups.put(set, 1);

                    // AffinityGroups.put(cg.getInputs().get(i),
                    // affinityGroups);
                }
                for (LogicalOperator input : cg.getInputs()) {
                    Lineage.put(input, visitor.lineage);
                    AffinityGroups.put(input, affinityGroups);
                }

                visitor = new DerivedDataVisitor(cg, null, baseData,
                        LogToPhyMap, physPlan);
                DataBag output = visitor.evaluateIsolatedOperator(cg, inputs);

                for (int i = 1; i <= cg.getInputs().size(); i++) {
                    affinityGroups = new HashMap, Integer>();
                    for (Iterator it = output.iterator(); it.hasNext();) {
                        DataBag bag = null;
                        try {
                            bag = (DataBag) it.next().get(i);
                        } catch (ExecException e) {
                            // TODO Auto-generated catch block
                            log.error(e.getMessage());
                        }
                        IdentityHashSet set = new IdentityHashSet();
                        affinityGroups.put(set, 1);
                        for (Iterator it1 = bag.iterator(); it1
                                .hasNext();) {
                            set.add(it1.next());
                        }
                    }
                    AffinityGroups.get(cg.getInputs().get(i - 1)).putAll(
                            affinityGroups);

                }
                AffinityGroups = AffinityGroups;
            }
        }
    }

    @Override
    protected void visit(LOCross cs) throws VisitorException {
        if(continueTrimming)
            processOperator(cs);

    }

    @Override
    protected void visit(LODistinct dt) throws VisitorException {
        if(continueTrimming)
            processOperator(dt);

    }

    @Override
    protected void visit(LOFilter filter) throws VisitorException {
        if (continueTrimming)
            processOperator(filter);
    }

    @Override
    protected void visit(LOForEach forEach) throws VisitorException {
        if (continueTrimming)
            processOperator(forEach);
    }

    @Override
    protected void visit(LOLimit limOp) throws VisitorException {
        if(continueTrimming)
            processOperator(limOp);

    }

    @Override
    protected void visit(LOLoad load) throws VisitorException {
        if (continueTrimming)
            processOperator(load);
    }

    @Override
    protected void visit(LOSort s) throws VisitorException {
        if(continueTrimming)
            processOperator(s);

    }

    @Override
    protected void visit(LOSplit split) throws VisitorException {
        if(continueTrimming)
            processOperator(split);

    }

    @Override
    protected void visit(LOUnion u) throws VisitorException {
        if(continueTrimming)
            processOperator(u);

    }

    private Map PruneBaseDataConstrainedCoverage(
            Map baseData, DataBag rootOutput,
            LineageTracer lineage,
            Map, Integer> equivalenceClasses) {

        IdentityHashMap> membershipMap = lineage
                .getMembershipMap();
        IdentityHashMap lineageGroupWeights = lineage
                .getWeightedCounts(2f, 1);

        // compute a mapping from lineage group to the set of equivalence
        // classes covered by it
        // IdentityHashMap> lineageGroupToEquivClasses = new
        // IdentityHashMap>();
        IdentityHashMap>> lineageGroupToEquivClasses = new IdentityHashMap>>();
        int equivClassId = 0;
        for (IdentityHashSet equivClass : equivalenceClasses.keySet()) {
            for (Tuple t : equivClass) {
                Tuple lineageGroup = lineage.getRepresentative(t);
                // Set entry =
                // lineageGroupToEquivClasses.get(lineageGroup);
                Set> entry = lineageGroupToEquivClasses
                        .get(lineageGroup);
                if (entry == null) {
                    // entry = new HashSet();
                    entry = new HashSet>();
                    lineageGroupToEquivClasses.put(lineageGroup, entry);
                }
                // entry.add(equivClassId);
                entry.add(equivClass);
            }

            equivClassId++;
        }

        // select lineage groups such that we cover all equivalence classes
        IdentityHashSet selectedLineageGroups = new IdentityHashSet();
        while (!lineageGroupToEquivClasses.isEmpty()) {
            // greedily find the lineage group with the best "score", where
            // score = # equiv classes covered / group weight
            double bestScore = -1;
            Tuple bestLineageGroup = null;
            Set> bestEquivClassesCovered = null;
            for (Tuple lineageGroup : lineageGroupToEquivClasses.keySet()) {
                double weight = lineageGroupWeights.get(lineageGroup);

                Set> equivClassesCovered = lineageGroupToEquivClasses
                        .get(lineageGroup);
                int numEquivClassesCovered = equivClassesCovered.size();
                double score = ((double) numEquivClassesCovered) / weight;

                if (score > bestScore) {

                    if (selectedLineageGroups.contains(lineageGroup)) {
                        bestLineageGroup = lineageGroup;
                        bestEquivClassesCovered = equivClassesCovered;
                        continue;
                    }

                    bestScore = score;
                    bestLineageGroup = lineageGroup;
                    bestEquivClassesCovered = equivClassesCovered;
                }
            }
            // add the best-scoring lineage group to the set of ones we plan to
            // retain
            selectedLineageGroups.add(bestLineageGroup);

            // make copy of bestEquivClassesCovered (or else the code that
            // follows won't work correctly, because removing from the reference
            // set)
            Set> toCopy = bestEquivClassesCovered;
            bestEquivClassesCovered = new HashSet>();
            bestEquivClassesCovered.addAll(toCopy);

            // remove the classes we've now covered
            Collection toRemove = new LinkedList();
            for (Tuple lineageGroup : lineageGroupToEquivClasses.keySet()) {

                Set> equivClasses = lineageGroupToEquivClasses
                        .get(lineageGroup);

                for (Iterator> it = equivClasses
                        .iterator(); it.hasNext();) {
                    IdentityHashSet equivClass = it.next();
                    if (bestEquivClassesCovered.contains(equivClass)) {
                        if ((equivalenceClasses.get(equivClass) - 1) <= 0) {
                            // equivClasses.remove(equivClass);
                            it.remove();
                        }
                    }
                }
                if (equivClasses.size() == 0)
                    toRemove.add(lineageGroup);

            }
            for (Tuple removeMe : toRemove)
                lineageGroupToEquivClasses.remove(removeMe);

            for (IdentityHashSet equivClass : bestEquivClassesCovered) {
                equivalenceClasses.put(equivClass, equivalenceClasses
                        .get(equivClass) - 1);
            }
        }

        // revise baseData to only contain the tuples that are part of
        // selectedLineageGroups
        IdentityHashSet tuplesToRetain = new IdentityHashSet();
        for (Tuple lineageGroup : selectedLineageGroups) {
            Collection members = membershipMap.get(lineageGroup);
            for (Tuple t : members)
                tuplesToRetain.add(t);
        }
        Map newBaseData = new HashMap();
        for (LOLoad loadOp : baseData.keySet()) {
            DataBag data = baseData.get(loadOp);
            // DataBag newData = new DataBag();
            DataBag newData = BagFactory.getInstance().newDefaultBag();
            for (Iterator it = data.iterator(); it.hasNext();) {
                Tuple t = it.next();
                if (tuplesToRetain.contains(t))
                    newData.add(t);
            }
            newBaseData.put(loadOp, newData);
        }

        return newBaseData;
    }

    private void processOperator(LogicalOperator op) {
        if (op instanceof LOLoad) return;
        
        continueTrimming = checkCompleteness(op);

        if (continueTrimming == false)
            return;

        LogicalOperator childOp = plan.getPredecessors(op).get(0);

        DerivedDataVisitor visitor = new DerivedDataVisitor(childOp, null,
                baseData, LogToPhyMap, physPlan);
        try {
            visitor.visit();
        } catch (VisitorException e) {
            log.error(e.getMessage());
        }

        DataBag bag = visitor.derivedData.get(childOp);
        Map, Integer> affinityGroups = new HashMap, Integer>();

        for (Iterator it = bag.iterator(); it.hasNext();) {
            IdentityHashSet set = new IdentityHashSet();
            affinityGroups.put(set, 1);
            set.add(it.next());
        }

        for (IdentityHashSet set : visitor.EqClasses) {
            // newEquivalenceClasses.put(set, 1);
            affinityGroups.put(set, 1);
        }

        AffinityGroups.put(childOp, affinityGroups);
        Lineage.put(childOp, visitor.lineage);

    }

    private boolean checkCompleteness(LogicalOperator op) {
        LineageTracer lineage = Lineage.get(op);
        Lineage.remove(op);

        Map, Integer> affinityGroups = AffinityGroups
                .get(op);
        AffinityGroups.remove(op);

        Map newBaseData = PruneBaseDataConstrainedCoverage(
                baseData, null, lineage, affinityGroups);

        // obtain the derived data
        DerivedDataVisitor visitor = new DerivedDataVisitor(plan, null,
                newBaseData, LogToPhyMap, physPlan);
        try {
            visitor.visit();
        } catch (VisitorException e) {
            log.error(e.getMessage());
        }

        double newCompleteness = MetricEvaluation.getCompleteness(null,
                visitor.derivedData, visitor.OpToEqClasses, true);

        if (newCompleteness >= completeness) {
            completeness = newCompleteness;
            baseData.putAll(newBaseData);
        } else {
            continueTrimming = false;
        }

        return continueTrimming;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy