org.apache.pig.impl.builtin.GFCross Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.impl.builtin;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.Random;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigImplConstants;
import org.apache.pig.impl.util.UDFContext;
public class GFCross extends EvalFunc {
private static final Log LOG = LogFactory.getLog(GFCross.class);
private int numInputs, myNumber, numGroupsPerInput, numGroupsGoingTo;
private BagFactory mBagFactory = BagFactory.getInstance();
private TupleFactory mTupleFactory = TupleFactory.getInstance();
private int parallelism = 0;
private Random r;
private String crossKey;
static private final int DEFAULT_PARALLELISM = 96;
/**
 * Creates a cross function bound to the given key.
 *
 * @param key stored as this instance's cross key; {@code exec} appends it
 *            to {@code PigImplConstants.PIG_CROSS_PARALLELISM} (separated
 *            by a dot) to look up the parallelism hint in the job conf
 */
public GFCross(String key) {
    this.crossKey = key;
}
/**
 * Returns the cross key held by this instance.
 *
 * @return the current value of the {@code crossKey} field, which the
 *         constructor populates from its argument
 */
public String getCrossKey() {
    return this.crossKey;
}
@Override
public DataBag exec(Tuple input) throws IOException {
if (parallelism == 0) {
parallelism = DEFAULT_PARALLELISM;
Configuration cfg = UDFContext.getUDFContext().getJobConf();
if (cfg != null) {
String s = cfg.get(PigImplConstants.PIG_CROSS_PARALLELISM + "." + crossKey);
if (s == null) {
throw new IOException("Unable to get parallelism hint from job conf");
}
parallelism = Integer.valueOf(s);
if (parallelism < 0) {
throw new IOException(PigImplConstants.PIG_CROSS_PARALLELISM + "." + crossKey + " was " + parallelism);
}
long taskIdHashCode = cfg.get(MRConfiguration.TASK_ID).hashCode();
long seed = ((long)taskIdHashCode << 32) | (taskIdHashCode & 0xffffffffL);
r = new Random(seed);
} else {
// Don't see a case where cfg can be null.
// But there is an existing testcase TestGFCross.testDefault
// Using constant generated from task_14738102975522_0001_r_000000 hashcode
r = new Random(-4235927512599300514L);
}
numInputs = (Integer)input.get(0);
myNumber = (Integer)input.get(1);
numGroupsPerInput = (int) Math.ceil(Math.pow(parallelism, 1.0/numInputs));
numGroupsGoingTo = (int) Math.pow(numGroupsPerInput,numInputs - 1);
LOG.info(MessageFormat.format("Parallelism = {0}, numInputs = {1}, myNumber = {2},"
+ " numGroupsPerInput = {3}, numGroupsGoingTo = {4}",
parallelism, numInputs, myNumber,
numGroupsPerInput, numGroupsGoingTo));
}
DataBag output = mBagFactory.newDefaultBag();
try{
int[] digits = new int[numInputs];
digits[myNumber] = r.nextInt(numGroupsPerInput);
for (int i=0; i