// org.apache.pig.impl.util.LineageTracer (Apache Pig; artifact listing for Maven / Gradle / Ivy)
// Note: a newer version of this artifact is available: 0.17.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.util;

import java.util.*;

import org.apache.pig.data.Tuple;


public class LineageTracer {
    
    // Use textbook Union-Find data structure, with counts associated with items
    
    // note: we test for equality by comparing tuple references, not by calling the "equals()" method
    //       the "IdentityHashMap" data structure is based on reference equality
    IdentityHashMap parents = new IdentityHashMap();
    IdentityHashMap counts = new IdentityHashMap();   // has one entry per unique tuple being tracked
    IdentityHashMap ranks = new IdentityHashMap();
    
    // insert a new tuple (if a tuple is inserted multiple times, it gets a count > 1)
    public void insert(Tuple t) {
        if (parents.containsKey(t)) {
            counts.put(t, counts.get(t)+1);
        } else {
            parents.put(t, t);
            counts.put(t, 1);
            ranks.put(t, 0);
        }
    }
    
    // union two tuple sets
    public void union(Tuple t1, Tuple t2) {
        link(getRepresentative(t1), getRepresentative(t2));
    }
    
    // find the set representative of a given tuple
    public Tuple getRepresentative(Tuple t) {
        Tuple tParent = parents.get(t);
        if (tParent != t) {
            tParent = getRepresentative(tParent);
            parents.put(t, tParent);
        }
        return tParent;
    }
    
    private void link(Tuple t1, Tuple t2) {
        int t1Rank = ranks.get(t1);
        int t2Rank = ranks.get(t2);
        if (t1Rank > t2Rank) {
            parents.put(t2, t1);
        } else {
            parents.put(t1, t2);
            if (t1Rank == t2Rank) ranks.put(t2, t2Rank + 1);
        }
    }
    
    // get the cardinality of each tuple set (identified by a representative tuple)
    public IdentityHashMap getCounts() {
        return getWeightedCounts(new IdentityHashSet(), 1);
    }
    
    // get the cardinality of each tuple set, weighted in a special way
    // weighting works like this: if a tuple set contains one or more tuples from the "specialTuples" set, we multiply its value by "multiplier"
    public IdentityHashMap getWeightedCounts(IdentityHashSet specialTuples, int multiplier) {
        IdentityHashMap repCounts = new IdentityHashMap();
        IdentityHashSet specialSets = new IdentityHashSet();
        
        for (IdentityHashMap.Entry e : counts.entrySet()) {
            Tuple t = e.getKey();

            int newCount = counts.get(t);
            Tuple rep = getRepresentative(t);
            int oldCount = (repCounts.containsKey(rep))? repCounts.get(rep) : 0;
            repCounts.put(rep, oldCount + newCount);
            if (specialTuples.contains(t)) specialSets.add(rep);
        }
        
        for (IdentityHashMap.Entry e : repCounts.entrySet()) {
            if (specialSets.contains(e.getKey())) e.setValue(e.getValue() * multiplier);
        }

        return repCounts;
    }
    
    // get all members of the set containing t
    public Collection getMembers(Tuple t) {
        Tuple representative = getRepresentative(t);
        
        Collection members = new LinkedList();
        for (IdentityHashMap.Entry e : counts.entrySet()) {
            Tuple t1 = e.getKey();
            if (getRepresentative(t1) == representative) members.add(t1);
        }
        return members;
    }
    
    // get a mapping from set representatives to members
    public IdentityHashMap> getMembershipMap() {
        IdentityHashMap> map = new IdentityHashMap>();
        for (IdentityHashMap.Entry e : counts.entrySet()) {
            Tuple t = e.getKey();

            Tuple representative = getRepresentative(t);
            Collection members = map.get(representative);
            if (members == null) {
                members = new LinkedList();
                map.put(representative, members);
            }
            members.add(t);
        }
        return map;
    }
}