All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.pig.newplan.logical.relational.LogicalPlan Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pig.newplan.logical.relational;

import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

import org.apache.pig.PigConstants;
import org.apache.pig.PigException;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.data.SchemaTupleBackend;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.PigImplConstants;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.plan.CompilationMessageCollector;
import org.apache.pig.impl.plan.CompilationMessageCollector.MessageType;
import org.apache.pig.impl.util.HashOutputStream;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.newplan.BaseOperatorPlan;
import org.apache.pig.newplan.Operator;
import org.apache.pig.newplan.OperatorPlan;
import org.apache.pig.newplan.logical.DotLOPrinter;
import org.apache.pig.newplan.logical.optimizer.LogicalPlanOptimizer;
import org.apache.pig.newplan.logical.optimizer.LogicalPlanPrinter;
import org.apache.pig.newplan.logical.optimizer.SchemaResetter;
import org.apache.pig.newplan.logical.optimizer.UidResetter;
import org.apache.pig.newplan.logical.visitor.CastLineageSetter;
import org.apache.pig.newplan.logical.visitor.ColumnAliasConversionVisitor;
import org.apache.pig.newplan.logical.visitor.DanglingNestedNodeRemover;
import org.apache.pig.newplan.logical.visitor.DuplicateForEachColumnRewriteVisitor;
import org.apache.pig.newplan.logical.visitor.ForEachUserSchemaVisitor;
import org.apache.pig.newplan.logical.visitor.ImplicitSplitInsertVisitor;
import org.apache.pig.newplan.logical.visitor.InputOutputFileValidatorVisitor;
import org.apache.pig.newplan.logical.visitor.ScalarVariableValidator;
import org.apache.pig.newplan.logical.visitor.ScalarVisitor;
import org.apache.pig.newplan.logical.visitor.SchemaAliasVisitor;
import org.apache.pig.newplan.logical.visitor.SortInfoSetter;
import org.apache.pig.newplan.logical.visitor.StoreAliasSetter;
import org.apache.pig.newplan.logical.visitor.TypeCheckingRelVisitor;
import org.apache.pig.newplan.logical.visitor.UnionOnSchemaSetter;
import org.apache.pig.pen.POOptimizeDisabler;
import org.apache.pig.validator.BlackAndWhitelistValidator;

import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

/**
 * LogicalPlan is the logical view of relational operations Pig will execute
 * for a given script.  Note that it contains only relational operations.
 * All expressions will be contained in LogicalExpressionPlans inside
 * each relational operator.
 */
public class LogicalPlan extends BaseOperatorPlan {

    public LogicalPlan(LogicalPlan other) {
        // shallow copy constructor
        super(other);
    }

    public LogicalPlan() {
        super();
    }

    /**
     * Equality is checked by calling equals on every leaf in the plan.  This
     * assumes that plans are always connected graphs.  It is somewhat
     * inefficient since every leaf will test equality all the way to
     * every root.  But it is only intended for use in testing, so that
     * should be ok.  Checking predecessors (as opposed to successors) was
     * chosen because splits (which have multiple successors) do not depend
     * on order of outputs for correctness, whereas joins (with multiple
     * predecessors) do.  That is, reversing the outputs of split in the
     * graph has no correctness implications, whereas reversing the inputs
     * of join can.  This method of doing equals will detect predecessors
     * in different orders but not successors in different orders.
     * It will return false if either plan has non deterministic EvalFunc.
     */
    @Override
    public boolean isEqual(OperatorPlan other) throws FrontendException {
        if (other == null || !(other instanceof LogicalPlan)) {
            return false;
        }

        return super.isEqual(other);
    }

    @Override
    public void explain(PrintStream ps, String format, boolean verbose)
    throws FrontendException {
        if (format.equals("xml")) {
            ps.println("XML Not Supported");
            return;
        }

        ps.println("#-----------------------------------------------");
        ps.println("# New Logical Plan:");
        ps.println("#-----------------------------------------------");

        if (this.size() == 0) {
            ps.println("Logical plan is empty.");
        } else if (format.equals("dot")) {
            DotLOPrinter lpp = new DotLOPrinter(this, ps);
            lpp.dump();
        } else {
            LogicalPlanPrinter npp = new LogicalPlanPrinter(this, ps);
            npp.visit();
        }
    }

    public Operator findByAlias(String alias) {
    	Iterator it = getOperators();
    	List ops = new ArrayList();
    	while( it.hasNext() ) {
    	    LogicalRelationalOperator op = (LogicalRelationalOperator) it.next();
    	    if(op.getAlias() == null)
    	        continue;
    	    if(op.getAlias().equals( alias ) )  {
    	        ops.add( op );
    	    }
    	}

    	if( ops.isEmpty() ) {
            return null;
    	} else {
    		return ops.get( ops.size() - 1 ); // Last one
    	}
    }

    /**
     * Returns the signature of the LogicalPlan. The signature is a unique identifier for a given
     * plan generated by a Pig script. The same script run multiple times with the same version of
     * Pig is guaranteed to produce the same signature, even if the input or output locations differ.
     *
     * @return a unique identifier for the logical plan
     * @throws FrontendException if signature can't be computed
     */
    public String getSignature() throws FrontendException {

        // Use a streaming hash function. We use a murmur_32 function with a constant seed, 0.
        HashFunction hf = Hashing.murmur3_32(0);
        HashOutputStream hos = new HashOutputStream(hf);
        PrintStream ps = new PrintStream(hos);

        LogicalPlanPrinter printer = new LogicalPlanPrinter(this, ps);
        printer.visit();

        return Integer.toString(hos.getHashCode().asInt());
    }

    public void validate(PigContext pigContext, String scope, boolean skipInputOutputValidation)
            throws FrontendException {

        new DanglingNestedNodeRemover(this).visit();
        new ColumnAliasConversionVisitor(this).visit();
        new SchemaAliasVisitor(this).visit();
        new ScalarVisitor(this, pigContext, scope).visit();
        new ForEachUserSchemaVisitor(this).visit();

        // ImplicitSplitInsertVisitor has to be called before
        // DuplicateForEachColumnRewriteVisitor.  Detail at pig-1766
        new ImplicitSplitInsertVisitor(this).visit();

        // DuplicateForEachColumnRewriteVisitor should be before
        // TypeCheckingRelVisitor which does resetSchema/getSchema
        // heavily
        new DuplicateForEachColumnRewriteVisitor(this).visit();

        CompilationMessageCollector collector = new CompilationMessageCollector() ;

        new TypeCheckingRelVisitor( this, collector).visit();


        new UnionOnSchemaSetter(this).visit();
        new CastLineageSetter(this, collector).visit();
        new ScalarVariableValidator(this).visit();
        new StoreAliasSetter(this).visit();

        // compute whether output data is sorted or not
        new SortInfoSetter(this).visit();

        boolean aggregateWarning = "true".equalsIgnoreCase(pigContext.getProperties().getProperty("aggregate.warning"));

        if(aggregateWarning) {
            CompilationMessageCollector.logMessages(collector, MessageType.Warning, aggregateWarning, log);
        } else {
            for(Enum type: MessageType.values()) {
                CompilationMessageCollector.logAllMessages(collector, log);
            }
        }

        if (!(skipInputOutputValidation || pigContext.inExplain || pigContext.inDumpSchema)) {
            // Validate input/output file
            new InputOutputFileValidatorVisitor(this, pigContext).visit();
        }

        BlackAndWhitelistValidator validator = new BlackAndWhitelistValidator(pigContext, this);
        validator.validate();

        // Now make sure the plan is consistent
        UidResetter uidResetter = new UidResetter(this);
        uidResetter.visit();

        SchemaResetter schemaResetter = new SchemaResetter(this,
                true /* skip duplicate uid check*/);
        schemaResetter.visit();
    }
    
    public void optimize(PigContext pigContext) throws FrontendException {
        if (pigContext.inIllustrator) {
            // disable all PO-specific optimizations
            POOptimizeDisabler pod = new POOptimizeDisabler(this);
            pod.visit();
        }

        HashSet disabledOptimizerRules;
        try {
            disabledOptimizerRules = (HashSet) ObjectSerializer
                    .deserialize(pigContext.getProperties().getProperty(
                            PigImplConstants.PIG_OPTIMIZER_RULES_KEY));
        } catch (IOException ioe) {
            int errCode = 2110;
            String msg = "Unable to deserialize optimizer rules.";
            throw new FrontendException(msg, errCode, PigException.BUG, ioe);
        }
        if (disabledOptimizerRules == null) {
            disabledOptimizerRules = new HashSet();
        }

        String pigOptimizerRulesDisabled = pigContext.getProperties()
                .getProperty(PigConstants.PIG_OPTIMIZER_RULES_DISABLED_KEY);
        if (pigOptimizerRulesDisabled != null) {
            disabledOptimizerRules.addAll(Lists.newArrayList((Splitter.on(",")
                    .split(pigOptimizerRulesDisabled))));
        }

        if (pigContext.inIllustrator) {
            disabledOptimizerRules.add("MergeForEach");
            disabledOptimizerRules.add("PartitionFilterOptimizer");
            disabledOptimizerRules.add("LimitOptimizer");
            disabledOptimizerRules.add("NestedLimitOptimizer");
            disabledOptimizerRules.add("SplitFilter");
            disabledOptimizerRules.add("PushUpFilter");
            disabledOptimizerRules.add("MergeFilter");
            disabledOptimizerRules.add("PushDownForEachFlatten");
            disabledOptimizerRules.add("ColumnMapKeyPrune");
            disabledOptimizerRules.add("AddForEach");
            disabledOptimizerRules.add("GroupByConstParallelSetter");
        }

        try {
            SchemaTupleBackend.initialize(ConfigurationUtil.toConfiguration(pigContext.getProperties(), true),
                    pigContext);
        } catch (IOException e) {
            throw new FrontendException(e);
        }
        // run optimizer
        LogicalPlanOptimizer optimizer = new LogicalPlanOptimizer(this, 100,
                disabledOptimizerRules, pigContext);
        optimizer.optimize();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy