org.apache.pig.impl.util.LineageTracer Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.impl.util;
import java.util.*;
import org.apache.pig.data.Tuple;
public class LineageTracer {
// Use textbook Union-Find data structure, with counts associated with items
// note: we test for equality by comparing tuple references, not by calling the "equals()" method
// the "IdentityHashMap" data structure is based on reference equality
IdentityHashMap parents = new IdentityHashMap();
IdentityHashMap counts = new IdentityHashMap(); // has one entry per unique tuple being tracked
IdentityHashMap ranks = new IdentityHashMap();
// insert a new tuple (if a tuple is inserted multiple times, it gets a count > 1)
public void insert(Tuple t) {
if (parents.containsKey(t)) {
counts.put(t, counts.get(t)+1);
} else {
parents.put(t, t);
counts.put(t, 1);
ranks.put(t, 0);
}
}
// union two tuple sets
public void union(Tuple t1, Tuple t2) {
link(getRepresentative(t1), getRepresentative(t2));
}
// find the set representative of a given tuple
public Tuple getRepresentative(Tuple t) {
Tuple tParent = parents.get(t);
if (tParent != t) {
tParent = getRepresentative(tParent);
parents.put(t, tParent);
}
return tParent;
}
private void link(Tuple t1, Tuple t2) {
int t1Rank = ranks.get(t1);
int t2Rank = ranks.get(t2);
if (t1Rank > t2Rank) {
parents.put(t2, t1);
} else {
parents.put(t1, t2);
if (t1Rank == t2Rank) ranks.put(t2, t2Rank + 1);
}
}
// get the cardinality of each tuple set (identified by a representative tuple)
public IdentityHashMap getCounts() {
return getWeightedCounts(new IdentityHashSet(), 1);
}
// get the cardinality of each tuple set, weighted in a special way
// weighting works like this: if a tuple set contains one or more tuples from the "specialTuples" set, we multiply its value by "multiplier"
public IdentityHashMap getWeightedCounts(IdentityHashSet specialTuples, int multiplier) {
IdentityHashMap repCounts = new IdentityHashMap();
IdentityHashSet specialSets = new IdentityHashSet();
for (IdentityHashMap.Entry e : counts.entrySet()) {
Tuple t = e.getKey();
int newCount = counts.get(t);
Tuple rep = getRepresentative(t);
int oldCount = (repCounts.containsKey(rep))? repCounts.get(rep) : 0;
repCounts.put(rep, oldCount + newCount);
if (specialTuples.contains(t)) specialSets.add(rep);
}
for (IdentityHashMap.Entry e : repCounts.entrySet()) {
if (specialSets.contains(e.getKey())) e.setValue(e.getValue() * multiplier);
}
return repCounts;
}
// get all members of the set containing t
public Collection getMembers(Tuple t) {
Tuple representative = getRepresentative(t);
Collection members = new LinkedList();
for (IdentityHashMap.Entry e : counts.entrySet()) {
Tuple t1 = e.getKey();
if (getRepresentative(t1) == representative) members.add(t1);
}
return members;
}
// get a mapping from set representatives to members
public IdentityHashMap> getMembershipMap() {
IdentityHashMap> map = new IdentityHashMap>();
for (IdentityHashMap.Entry e : counts.entrySet()) {
Tuple t = e.getKey();
Tuple representative = getRepresentative(t);
Collection members = map.get(representative);
if (members == null) {
members = new LinkedList();
map.put(representative, members);
}
members.add(t);
}
return map;
}
}