All downloads are free. Search and download functionality uses the official Maven repository.

com.datatorrent.lib.join.SemiJoinOperator Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.datatorrent.lib.join;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.classification.InterfaceStability.Evolving;

import com.datatorrent.api.Context.OperatorContext;
import com.datatorrent.api.DefaultInputPort;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.api.Operator;
import com.datatorrent.api.annotation.OperatorAnnotation;
import com.datatorrent.lib.streamquery.condition.Condition;
import com.datatorrent.lib.streamquery.index.Index;

/**
 * An implementation of Operator that reads table row data from two table data input ports. 
*

* Operator semi-joins row on given condition and selected names, emits * semi-joined result at output port. * * Note: A semi-join is not a left-join or left-outer-join. In semi-join only the joined rows * from the left table are returned. However, in a left-outer-join, all the rows from left table * will be returned (also padding with nulls for columns from the right table when not joined). * * For more information see {@link http://docs.oracle.com/cd/B28359_01/server.111/b28286/statements_10002.htm#i2166436} * *
* StateFull : Yes, Operator aggregates input over application window.
* Partitions : No, will yield wrong result(s).
*
* Ports :
* inport1 : Input port for table 1, expects HashMap<String, Object>
* inport2 : Input port for table 2, expects HashMap<String, Object>
* outport : Output semi-joined row port, emits HashMap<String, ArrayList<Object>>
*
* Properties : * joinCondition : Join condition for table rows.
* table1Columns : Columns to be selected from table1.
* table2Columns : Columns to be selected from table2.
*
* * @displayName Semi join * @category Join Manipulators * @tags sql, semi join operator * @since 0.3.3 */ @OperatorAnnotation(partitionable = false) @Evolving public class SemiJoinOperator implements Operator { /** * Join Condition. */ private Condition joinCondition; /** * Table1 select columns. * Note: only left table (Table1) will be output in an semi-join */ private ArrayList table1Columns = new ArrayList<>(); /** * Collect data rows from input port 1. */ private List> table1; /** * Collect data from input port 2. */ private List> table2; /** * Input port 1 that takes a map of <string,object>. */ public final transient DefaultInputPort> inport1 = new DefaultInputPort>() { @Override public void process(Map tuple) { table1.add(tuple); for (int j = 0; j < table2.size(); j++) { if ((joinCondition != null) && (joinCondition.isValidJoin(tuple, table2.get(j)))) { joinRows(tuple); // row has been join, and can be removed now table1.remove(tuple); } } } }; /** * Input port 2 that takes a map of <string,object>. */ public final transient DefaultInputPort> inport2 = new DefaultInputPort>() { @Override public void process(Map tuple) { table2.add(tuple); for (int j = 0; j < table1.size(); j++) { if ((joinCondition != null) && (joinCondition.isValidJoin(table1.get(j), tuple))) { joinRows(table1.get(j)); table1.remove(table1.get(j)); } } } }; /** * Output port that emits a map of <string,object>. */ public final transient DefaultOutputPort> outport = new DefaultOutputPort<>(); @Override public void setup(OperatorContext arg0) { table1 = new ArrayList<>(); table2 = new ArrayList<>(); } @Override public void teardown() { } @Override public void beginWindow(long arg0) { } @Override public void endWindow() { table1.clear(); table2.clear(); } /** * @return the joinCondition */ public Condition getJoinCondition() { return joinCondition; } /** * Pick the supported condition. Currently only equal join is supported. 
* * @param joinCondition - join condition */ public void setJoinCondition(Condition joinCondition) { this.joinCondition = joinCondition; } /** * Select table1 column name. */ public void selectTable1Column(Index column) { table1Columns.add(column); } /** * Join row from table1 (only left table is used in semi-join). */ protected void joinRows(Map row) { // joined row Map join = new HashMap<>(); // filter table1 columns if (row != null) { for (Index index : table1Columns) { index.filter(row, join); } } // emit row outport.emit(join); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy