// org.apache.pig.builtin.COR Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.builtin;
import java.io.IOException;
import java.util.Iterator;
import java.util.Vector;
import org.apache.pig.Algebraic;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.DefaultBagFactory;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
* Computes the correlation between sets of data. The returned value
* is a bag containing one tuple for each combination of input
* schemas; each tuple holds the two schema names and the correlation
* between those two schemas.
*
* A = load 'input.xml' using PigStorage(':');
* B = group A all;
* D = foreach B generate group,COR(A.$0,A.$1,A.$2);
*/
public class COR extends EvalFunc implements Algebraic {
// Names of the schemas; initialized when the user uses define.
protected VectorschemaName = new Vector();
// Flag indicating whether define was called.
private boolean flag = false;
public COR(){}
public COR(String... schemaName){
for(int i=0;i {
@Override
public Tuple exec(Tuple input) throws IOException {
if (input == null || input.size() == 0)
return null;
Tuple output = TupleFactory.getInstance().newTuple(input.size()*(input.size()-1));
try {
int k = -1;
for(int i=0;i {
/**
 * Hands the first field of {@code input}, cast to a {@link DataBag}, to
 * {@code combine} and returns whatever that call produces.
 *
 * NOTE(review): judging by the error message this is the intermediate
 * stage of the algebraic evaluation, so the bag presumably holds partial
 * results from an earlier stage -- confirm against the enclosing class.
 *
 * @param input tuple whose field 0 is expected to be a {@link DataBag}
 * @return the result of {@code combine}, or {@code null} when
 *         {@code input} is {@code null} or has size 0
 * @throws IOException wrapping any exception raised while reading
 *         field 0, casting it, or combining
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input != null && input.size() != 0) {
        try {
            // Field access and cast stay inside the try so that their
            // failures are wrapped in the same IOException as combine's.
            DataBag firstField = (DataBag) input.get(0);
            return combine(firstField);
        } catch (Exception e) {
            throw new IOException("Caught exception in COR.Intermed", e);
        }
    }
    // No tuple, or a tuple with no fields: there is nothing to combine.
    return null;
}
}
public static class Final extends EvalFunc {
protected VectorschemaName = new Vector();
boolean flag = false;
public Final(){}
public Final(String... schemaName){
for(int i=0;i it = values.iterator(); it.hasNext();) {
Tuple t = it.next();
tuple.append(t);
}
}catch(Exception e){}
try{
int size = ((Tuple)tuple.get(0)).size();
for(int i=0;i iterator_x = first.iterator();
Iterator iterator_y = second.iterator();
try{
while(iterator_x.hasNext()){
double x = (Double)iterator_x.next().get(0);
double y = (Double)iterator_y.next().get(0);
sum_x_y+=x*y;
sum_x+=x;
sum_y+=y;
sum_x_square+=x*x;
sum_y_square+=y*y;
}
}catch(Exception e){
throw new IOException("Caught exception processing input", e);
}
Tuple result = TupleFactory.getInstance().newTuple(5);
try{
result.set(0, sum_x_y);
result.set(1, sum_x);
result.set(2, sum_y);
result.set(3, sum_x_square);
result.set(4, sum_y_square);
}catch(Exception e){
throw new IOException("Caught exception processing result", e);
}
return result;
}
/**
 * Describes the output as a schema with a single field of type
 * {@code DataType.BAG}.
 *
 * The field's alias comes from {@code getSchemaName}, which is given the
 * lower-cased runtime class name and the input schema.
 * NOTE(review): {@code getSchemaName} is not defined in this file;
 * presumably it is inherited from EvalFunc -- confirm.
 *
 * @param input schema of the function's input
 * @return a schema with exactly one bag-typed field
 */
@Override
public Schema outputSchema(Schema input) {
    // The runtime class is used, so a subclass reports its own name.
    final String loweredClassName = this.getClass().getName().toLowerCase();
    final String fieldAlias = getSchemaName(loweredClassName, input);
    final Schema.FieldSchema bagField = new Schema.FieldSchema(fieldAlias, DataType.BAG);
    return new Schema(bagField);
}
}