All downloads are free. Search and download functionality uses the official Maven repository.

datafu.pig.sessions.Sessionize Maven / Gradle / Ivy

Go to download

A collection of user-defined functions for working with large-scale data in Hadoop and Pig.

There is a newer version: 1.2.0
Show newest version
/*
 * Copyright 2010 LinkedIn, Inc
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package datafu.pig.sessions;

import java.io.IOException;
import java.util.UUID;

import org.apache.pig.Accumulator;
import org.apache.pig.AccumulatorEvalFunc;
import org.apache.pig.EvalFunc;
import org.apache.pig.builtin.Nondeterministic;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.joda.time.DateTime;
import org.joda.time.Period;

/**
 * Sessionizes an input stream, appending a session ID to each tuple.
 *
 * 

* This UDF takes a constructor argument which is the session timeout (an idle * period of this amount indicates that a new session has started) and assumes * the first element of the input bag is an ISO8601 timestamp. The input bag * must be sorted by this timestamp. It returns the input bag with a new field, * session_id, that is a GUID indicating the session of the request. *

* *

* Example: *

 * {@code
 * 
 * %declare TIME_WINDOW  30m
 * 
 * define Sessionize datafu.pig.sessions.Sessionize('$TIME_WINDOW');
 *
 * views = LOAD 'views.tsv' AS (visit_date:chararray, member_id:int, url:chararray);
 *
 * -- sessionize the visit stream
 * views = GROUP views BY member_id;
 * sessions = FOREACH views {
 *   visits = ORDER views BY visit_date;
 *   GENERATE FLATTEN(Sessionize(VISITS)) AS (visit_date,member_id,url,session_id); 
 * }
 *
 * -- count the number of sessions hitting the url
 * rollup = GROUP sessions BY url;
 * result = FOREACH rollup GENERATE group AS url, COUNT(SESSIONS) AS session_cnt;
 * }
 * 
*

*/ @Nondeterministic public class Sessionize extends AccumulatorEvalFunc { private final long millis; private DataBag outputBag; private DateTime last_date; private String id; public Sessionize(String timeSpec) { Period p = new Period("PT" + timeSpec.toUpperCase()); this.millis = p.toStandardSeconds().getSeconds() * 1000; cleanup(); } @Override public void accumulate(Tuple input) throws IOException { for (Tuple t : (DataBag) input.get(0)) { Object timeObj = t.get(0); DateTime date; if (timeObj instanceof String) { date = new DateTime((String)timeObj); } else if (timeObj instanceof Long) { date = new DateTime((Long)timeObj); } else { throw new RuntimeException("Time must either be a String or Long"); } if (this.last_date == null) this.last_date = date; else if (date.isAfter(this.last_date.plus(this.millis))) this.id = UUID.randomUUID().toString(); else if (date.isBefore(last_date)) throw new IOException(String.format("input time series is not sorted (%s < %s)", date, last_date)); Tuple t_new = TupleFactory.getInstance().newTuple(t.getAll()); t_new.append(this.id); outputBag.add(t_new); this.last_date = date; } } @Override public DataBag getValue() { return outputBag; } @Override public void cleanup() { this.last_date = null; this.outputBag = BagFactory.getInstance().newDefaultBag(); this.id = UUID.randomUUID().toString(); } @Override public Schema outputSchema(Schema input) { try { Schema.FieldSchema inputFieldSchema = input.getField(0); if (inputFieldSchema.type != DataType.BAG) { throw new RuntimeException("Expected a BAG as input"); } Schema inputBagSchema = inputFieldSchema.schema; if (inputBagSchema.getField(0).type != DataType.TUPLE) { throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s", DataType.findTypeName(inputBagSchema.getField(0).type))); } Schema inputTupleSchema = inputBagSchema.getField(0).schema; if (inputTupleSchema.getField(0).type != DataType.CHARARRAY && inputTupleSchema.getField(0).type != 
DataType.LONG) { throw new RuntimeException(String.format("Expected first element of tuple to be a CHARARRAY or LONG, but instead found %s", DataType.findTypeName(inputTupleSchema.getField(0).type))); } Schema outputTupleSchema = inputTupleSchema.clone(); outputTupleSchema.add(new Schema.FieldSchema("session_id", DataType.CHARARRAY)); return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass() .getName() .toLowerCase(), input), outputTupleSchema, DataType.BAG)); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } catch (FrontendException e) { throw new RuntimeException(e); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy