// com.datatorrent.lib.join.AbstractJoinOperator (Maven / Gradle / Ivy)
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.datatorrent.lib.join;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.Locale;
import javax.validation.constraints.NotNull;
import org.apache.hadoop.classification.InterfaceStability;
import com.datatorrent.api.AutoMetric;
import com.datatorrent.api.Context;
import com.datatorrent.api.DefaultInputPort;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.api.Operator;
import com.datatorrent.api.annotation.InputPortFieldAnnotation;
import com.datatorrent.common.util.BaseOperator;
/**
*
* This is the base implementation of join operator. Operator receives tuples from two streams,
* applies the join operation based on constraint and emit the joined value.
* Subclasses should provide implementation to createOutputTuple,copyValue, getKeyValue, getTime methods.
*
* Properties:
* expiryTime: Expiry time for stored tuples
* includeFieldStr: List of comma separated fields to be added to the output tuple.
* Ex: Field1,Field2;Field3,Field4
* keyFields: List of comma separated key field for both the streams. Ex: Field1,Field2
* timeFields: List of comma separated time field for both the streams. Ex: Field1,Field2
* bucketSpanInMillis: Span of each bucket in milliseconds.
* strategy: Type of join operation. Default type is inner join
*
*
* Example:
* Left input port receives customer details and right input port receives Order details.
* Schema for the Customer be in the form of
* Schema for the Order be in the form of
* Now, Join the tuples of Customer and Order streams where Customer.ID = Order.CID and the constraint is
* matched tuples must have timestamp within 5 minutes.
* Here, key Fields = ID, CID and Time Fields = RTime, OTime, expiryTime = 5 minutes
*
*
* @displayName Abstract Join Operator
* @tags join
*
* @since 3.4.0
*/
@InterfaceStability.Unstable
public abstract class AbstractJoinOperator extends BaseOperator implements Operator.CheckpointListener
{
@AutoMetric
private long tuplesJoinedPerSec;
private double windowTimeSec;
protected int tuplesCount;
public final transient DefaultOutputPort> outputPort = new DefaultOutputPort<>();
// Strategy of Join operation, by default the option is inner join
protected JoinStrategy strategy = JoinStrategy.INNER_JOIN;
// This represents whether the processing tuple is from left port or not
protected boolean isLeft;
@InputPortFieldAnnotation
public transient DefaultInputPort input1 = new DefaultInputPort()
{
@Override
public void process(T tuple)
{
isLeft = true;
processTuple(tuple);
}
};
@InputPortFieldAnnotation
public transient DefaultInputPort input2 = new DefaultInputPort()
{
@Override
public void process(T tuple)
{
isLeft = false;
processTuple(tuple);
}
};
// Stores for each of the input port
@NotNull
protected StoreContext leftStore;
@NotNull
protected StoreContext rightStore;
private String includeFieldStr;
private String keyFieldStr;
private String timeFieldStr;
@Override
public void setup(Context.OperatorContext context)
{
// Checks whether the strategy is outer join and set it to store
boolean isOuter = strategy.equals(JoinStrategy.LEFT_OUTER_JOIN) || strategy.equals(JoinStrategy.OUTER_JOIN);
leftStore.getStore().isOuterJoin(isOuter);
isOuter = strategy.equals(JoinStrategy.RIGHT_OUTER_JOIN) || strategy.equals(JoinStrategy.OUTER_JOIN);
rightStore.getStore().isOuterJoin(isOuter);
// Setup the stores
leftStore.getStore().setup(context);
rightStore.getStore().setup(context);
populateFields();
windowTimeSec = (context.getValue(Context.OperatorContext.APPLICATION_WINDOW_COUNT) *
context.getValue(Context.DAGContext.STREAMING_WINDOW_SIZE_MILLIS) * 1.0) / 1000.0;
}
/**
* Create the event with the given tuple. If it successfully inserted it into the store
* then it does the join operation
*
* @param tuple Tuple to process
*/
protected void processTuple(T tuple)
{
JoinStore store = isLeft ? leftStore.getStore() : rightStore.getStore();
TimeEvent t = createEvent(tuple);
if (store.put(t)) {
join(t, isLeft);
}
}
private void populateFields()
{
populateIncludeFields();
populateKeyFields();
if (timeFieldStr != null) {
populateTimeFields();
}
}
/**
* Populate the fields from the includeFiledStr
*/
private void populateIncludeFields()
{
String[] portFields = includeFieldStr.split(";");
assert (portFields.length == 2);
leftStore.setIncludeFields(portFields[0].split(","));
rightStore.setIncludeFields(portFields[1].split(","));
}
/**
* Get the tuples from another store based on join constraint and key
*
* @param tuple input
* @param isLeft whether the given tuple is from first port or not
*/
private void join(TimeEvent tuple, boolean isLeft)
{
// Get the valid tuples from the store based on key
// If the tuple is null means the join type is outer and return unmatched tuples from store.
ArrayList value;
JoinStore store = isLeft ? rightStore.getStore() : leftStore.getStore();
if (tuple != null) {
value = (ArrayList)store.getValidTuples(tuple);
} else {
value = (ArrayList)store.getUnMatchedTuples();
}
// Join the input tuple with the joined tuples
if (value != null) {
List result = new ArrayList<>();
for (TimeEvent joinedValue : value) {
T output = createOutputTuple();
Object tupleValue = null;
if (tuple != null) {
tupleValue = tuple.getValue();
}
copyValue(output, tupleValue, isLeft);
copyValue(output, joinedValue.getValue(), !isLeft);
result.add(output);
joinedValue.setMatch(true);
}
if (tuple != null) {
tuple.setMatch(true);
}
if (result.size() != 0) {
outputPort.emit(result);
tuplesCount += result.size();
}
}
}
// Emit the unmatched tuples, if the strategy is outer join
@Override
public void endWindow()
{
if (strategy.equals(JoinStrategy.LEFT_OUTER_JOIN) || strategy.equals(JoinStrategy.OUTER_JOIN)) {
join(null, false);
}
if (strategy.equals(JoinStrategy.RIGHT_OUTER_JOIN) || strategy.equals(JoinStrategy.OUTER_JOIN)) {
join(null, true);
}
leftStore.getStore().endWindow();
rightStore.getStore().endWindow();
tuplesJoinedPerSec = (long)(tuplesCount / windowTimeSec);
}
@Override
public void beginWindow(long windowId)
{
super.beginWindow(windowId);
tuplesJoinedPerSec = 0;
tuplesCount = 0;
}
@Override
public void checkpointed(long windowId)
{
leftStore.getStore().checkpointed(windowId);
rightStore.getStore().checkpointed(windowId);
}
@Override
public void committed(long windowId)
{
leftStore.getStore().committed(windowId);
rightStore.getStore().committed(windowId);
}
/**
* Convert the given tuple to event
*
* @param tuple Given tuple to convert into event
* @return event
*/
protected TimeEvent createEvent(Object tuple)
{
String key = leftStore.getKeys();
String timeField = leftStore.getTimeFields();
if (!isLeft) {
key = rightStore.getKeys();
timeField = rightStore.getTimeFields();
}
if (timeField != null) {
return new TimeEventImpl(getKeyValue(key, tuple), (Long)getTime(timeField, tuple), tuple);
} else {
return new TimeEventImpl(getKeyValue(key, tuple), Calendar.getInstance().getTimeInMillis(), tuple);
}
}
private void populateKeyFields()
{
leftStore.setKeys(keyFieldStr.split(",")[0]);
rightStore.setKeys(keyFieldStr.split(",")[1]);
}
public JoinStrategy getStrategy()
{
return strategy;
}
public void setStrategy(JoinStrategy strategy)
{
this.strategy = strategy;
}
public void setLeftStore(@NotNull JoinStore lStore)
{
leftStore = new StoreContext(lStore);
}
public void setRightStore(@NotNull JoinStore rStore)
{
rightStore = new StoreContext(rStore);
}
public void setKeyFields(String keyFieldStr)
{
this.keyFieldStr = keyFieldStr;
}
public void setTimeFieldStr(String timeFieldStr)
{
this.timeFieldStr = timeFieldStr;
}
public void setIncludeFields(String includeFieldStr)
{
this.includeFieldStr = includeFieldStr;
}
public StoreContext getLeftStore()
{
return leftStore;
}
public StoreContext getRightStore()
{
return rightStore;
}
public String getIncludeFieldStr()
{
return includeFieldStr;
}
public String getKeyFieldStr()
{
return keyFieldStr;
}
public String getTimeFieldStr()
{
return timeFieldStr;
}
/**
* Specify the comma separated time fields for both steams
*/
private void populateTimeFields()
{
leftStore.setTimeFields(timeFieldStr.split(",")[0]);
rightStore.setTimeFields(timeFieldStr.split(",")[1]);
}
public void setStrategy(String policy)
{
this.strategy = JoinStrategy.valueOf(policy.toUpperCase());
}
/**
* Create the output object
*
* @return output tuple
*/
protected abstract T createOutputTuple();
/**
* Get the values from extractTuple and set these values to the output
*
* @param output otuput tuple
* @param extractTuple Extract the values from this tuple
* @param isLeft Whether the extracted tuple belongs to left stream or not
*/
protected abstract void copyValue(T output, Object extractTuple, boolean isLeft);
/**
* Get the value of the key field from the given tuple
*
* @param keyField Value of the field to extract from given tuple
* @param tuple Given tuple
* @return the value of field from given tuple
*/
protected abstract Object getKeyValue(String keyField, Object tuple);
/**
* Get the value of the time field from the given tuple
*
* @param field Time field
* @param tuple given tuple
* @return the value of time field from given tuple
*/
protected abstract Object getTime(String field, Object tuple);
public static enum JoinStrategy
{
INNER_JOIN,
LEFT_OUTER_JOIN,
RIGHT_OUTER_JOIN,
OUTER_JOIN
}
public static class StoreContext
{
private transient String timeFields;
private transient String[] includeFields;
private transient String keys;
private JoinStore store;
public StoreContext(JoinStore store)
{
this.store = store;
}
public String getTimeFields()
{
return timeFields;
}
public void setTimeFields(String timeFields)
{
this.timeFields = timeFields;
}
public String[] getIncludeFields()
{
return includeFields;
}
public void setIncludeFields(String[] includeFields)
{
this.includeFields = includeFields;
}
public String getKeys()
{
return keys;
}
public void setKeys(String keys)
{
this.keys = keys;
}
public JoinStore getStore()
{
return store;
}
public void setStore(JoinStore store)
{
this.store = store;
}
}
}