All downloads are free. Search and download functionality uses the official Maven repository.

tech.tablesaw.joining.DataFrameJoiner Maven / Gradle / Ivy

package tech.tablesaw.joining;

import com.google.common.collect.Streams;
import com.google.common.primitives.Ints;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import tech.tablesaw.api.*;
import tech.tablesaw.columns.Column;
import tech.tablesaw.columns.booleans.BooleanColumnType;
import tech.tablesaw.columns.dates.DateColumnType;
import tech.tablesaw.columns.datetimes.DateTimeColumnType;
import tech.tablesaw.columns.instant.InstantColumnType;
import tech.tablesaw.columns.numbers.*;
import tech.tablesaw.columns.strings.StringColumnType;
import tech.tablesaw.columns.strings.TextColumnType;
import tech.tablesaw.columns.times.TimeColumnType;
import tech.tablesaw.index.*;
import tech.tablesaw.selection.Selection;

/** Implements joins between two or more Tables */
public class DataFrameJoiner {

  /**
   * The types of joins that are supported. The type is passed to {@code joinInternal}, which uses
   * it to decide what happens to rows that have no match in the other table.
   */
  private enum JoinType {
    /** Only the cross product of matching rows is added to the result. */
    INNER,
    /** Table1 rows without a match are kept; the table2 columns are filled with missing values. */
    LEFT_OUTER,
    /** Table2 rows without a match are kept; the table1 columns are filled with missing values. */
    RIGHT_OUTER,
    /** Rows without a match from either table are kept, padded with missing values. */
    FULL_OUTER
  }

  /** Prefix of the alias used to rename table2 columns whose names duplicate table1 names. */
  private static final String TABLE_ALIAS = "T";

  /** The table this joiner was created for; the left-hand side of every join. */
  private final Table table;

  /** Names of the join columns in {@link #table}. */
  private final String[] joinColumnNames;

  /** Positions of the join columns, resolved from {@link #table} in the constructor. */
  private final List<Integer> joinColumnIndexes;

  /** Counter appended to {@link #TABLE_ALIAS} to build the alias of each renamed joined table. */
  private final AtomicInteger joinTableId = new AtomicInteger(2);

  /**
   * Creates a joiner whose left-hand table is {@code table}.
   *
   * <p>The array of names is copied, so later changes to the caller's array cannot make the stored
   * names disagree with the column indexes that are resolved here.
   *
   * @param table The table to join on.
   * @param joinColumnNames The names of the columns in {@code table} to join on.
   */
  public DataFrameJoiner(Table table, String... joinColumnNames) {
    this.table = table;
    this.joinColumnNames = joinColumnNames.clone();
    this.joinColumnIndexes = getJoinIndexes(table, this.joinColumnNames);
  }

  /**
   * Finds the index of the columns corresponding to the columnNames. E.G. The column named "ID" is
   * located at index 5 in table.
   *
   * @param table the table that contains the columns.
   * @param columnNames the column names to find indexes of.
   * @return a new, mutable list of column indexes within the table, in the order of {@code
   *     columnNames}.
   */
  private List<Integer> getJoinIndexes(Table table, String[] columnNames) {
    // Collect into an ArrayList explicitly: rightOuter(boolean, Table...) clears and refills the
    // stored index list, and Collectors.toList() makes no promise that its result is mutable.
    return Arrays.stream(columnNames)
        .map(table::columnIndex)
        .collect(Collectors.toCollection(ArrayList::new));
  }

  /**
   * Inner-joins the joiner's table with each of the given tables in turn, assuming every table has
   * columns named like the joiner's join columns. Duplicate column names are not allowed.
   *
   * @param tables The tables to join with
   * @return The resulting table
   */
  public Table inner(Table... tables) {
    final boolean allowDuplicateColumnNames = false;
    return inner(allowDuplicateColumnNames, tables);
  }

  /**
   * Inner-joins the joiner's table with each of the given tables in turn; the result of one join is
   * the left-hand table of the next. Every table is joined on the joiner's join column names.
   *
   * @param allowDuplicateColumnNames if {@code false} the join will fail if any columns other than
   *     the join column have the same name; if {@code true} the join will succeed and duplicate
   *     columns are renamed
   * @param tables The tables to join with
   * @return The resulting table
   */
  public Table inner(boolean allowDuplicateColumnNames, Table... tables) {
    Table result = table;
    for (int t = 0; t < tables.length; t++) {
      result =
          joinInternal(
              result,
              tables[t],
              JoinType.INNER,
              allowDuplicateColumnNames,
              false,
              joinColumnNames);
    }
    return result;
  }

  /**
   * Inner-joins the joiner's table to table2 on a single table2 column. Duplicate column names are
   * not allowed.
   *
   * @param table2 The table to join with
   * @param col2Name The name of the column in table2 to join on. Its index type must match the
   *     index type of the joiner's join column.
   * @return The resulting table
   */
  public Table inner(Table table2, String col2Name) {
    final boolean allowDuplicateColumnNames = false;
    return inner(table2, allowDuplicateColumnNames, col2Name);
  }

  /**
   * Inner-joins the joiner's table to table2 on the given table2 columns. Duplicate column names
   * are not allowed.
   *
   * @param table2 The table to join with
   * @param col2Names The names of the columns in table2 to join on, matched in order against the
   *     joiner's join columns. Their index types must match.
   * @return The resulting table
   */
  public Table inner(Table table2, String[] col2Names) {
    final boolean allowDuplicateColumnNames = false;
    return inner(table2, allowDuplicateColumnNames, col2Names);
  }

  /**
   * Inner-joins the joiner's table to table2 on a single table2 column.
   *
   * @param table2 The table to join with
   * @param col2Name The name of the column in table2 to join on. Its index type must match the
   *     index type of the joiner's join column.
   * @param allowDuplicateColumnNames if {@code false} the join will fail if any columns other than
   *     the join column have the same name; if {@code true} the join will succeed and duplicate
   *     columns are renamed
   * @return The resulting table
   */
  public Table inner(Table table2, String col2Name, boolean allowDuplicateColumnNames) {
    Table joined = inner(table2, allowDuplicateColumnNames, col2Name);
    return joined;
  }

  /**
   * Inner-joins the joiner's table to table2 on the given table2 columns. Only the join key columns
   * of the joiner's table are kept in the result.
   *
   * @param table2 The table to join with
   * @param allowDuplicateColumnNames if {@code false} the join will fail if any columns other than
   *     the join column have the same name; if {@code true} the join will succeed and duplicate
   *     columns are renamed
   * @param col2Names The names of the columns in table2 to join on, matched in order against the
   *     joiner's join columns. Their index types must match.
   * @return The resulting table
   */
  public Table inner(Table table2, boolean allowDuplicateColumnNames, String... col2Names) {
    final boolean keepAllJoinKeyColumns = false;
    return joinInternal(
        table,
        table2,
        JoinType.INNER,
        allowDuplicateColumnNames,
        keepAllJoinKeyColumns,
        col2Names);
  }

  /**
   * Inner-joins the joiner's table to table2 on the given table2 columns, optionally keeping the
   * join key columns of both tables.
   *
   * @param table2 The table to join with
   * @param allowDuplicateColumnNames if {@code false} the join will fail if any columns other than
   *     the join column have the same name; if {@code true} the join will succeed and duplicate
   *     columns are renamed
   * @param keepAllJoinKeyColumns if {@code false} only the join key columns of the joiner's table
   *     are kept; if {@code true} the join key columns of both tables are returned
   * @param col2Names The names of the columns in table2 to join on, matched in order against the
   *     joiner's join columns. Their index types must match.
   * @return The resulting table
   */
  public Table inner(
      Table table2,
      boolean allowDuplicateColumnNames,
      boolean keepAllJoinKeyColumns,
      String... col2Names) {
    Table joined =
        joinInternal(
            this.table,
            table2,
            JoinType.INNER,
            allowDuplicateColumnNames,
            keepAllJoinKeyColumns,
            col2Names);
    return joined;
  }

  /**
   * Joins two tables.
   *
   * <p>The join columns of table1 are located through the joiner's stored join column indexes; the
   * join columns of table2 are looked up by name. A reverse index is built for every join column
   * and rows are matched by intersecting the per-column selections.
   *
   * @param table1 the table on the left side of the join.
   * @param table2 the table on the right side of the join.
   * @param joinType the type of join.
   * @param allowDuplicates if {@code false} the join will fail if any columns other than the join
   *     column have the same name; if {@code true} the join will succeed and duplicate columns are
   *     renamed
   * @param keepAllJoinKeyColumns if {@code false} the redundant join key columns of one table are
   *     removed from the result; if {@code true} the join key columns of both tables are returned
   * @param table2JoinColumnNames The names of the columns in table2 to join on.
   * @return the joined table
   * @throws IllegalArgumentException if the two tables are joined on a different number of columns
   *     or on columns with different index types
   */
  private Table joinInternal(
      Table table1,
      Table table2,
      JoinType joinType,
      boolean allowDuplicates,
      boolean keepAllJoinKeyColumns,
      String... table2JoinColumnNames) {

    List<Integer> table2JoinColumnIndexes = getJoinIndexes(table2, table2JoinColumnNames);
    List<Index> table1Indexes = buildIndexesForJoinColumns(joinColumnIndexes, table1);
    List<Index> table2Indexes = buildIndexesForJoinColumns(table2JoinColumnIndexes, table2);

    Table result = Table.create(table1.name());
    // A set of column indexes in the result table that can be ignored. They are duplicate join
    // keys.
    Set<Integer> resultIgnoreColIndexes =
        emptyTableFromColumns(
            result,
            table1,
            table2,
            joinType,
            allowDuplicates,
            table2JoinColumnIndexes,
            keepAllJoinKeyColumns);

    validateIndexes(table1Indexes, table2Indexes);
    if (table1.rowCount() == 0 && (joinType == JoinType.LEFT_OUTER || joinType == JoinType.INNER)) {
      // Handle special case of empty table here so it doesn't fall through to the behavior
      // that adds rows for full outer and right outer joins
      return withoutIgnoredColumns(result, resultIgnoreColIndexes, keepAllJoinKeyColumns);
    }

    Selection table1DoneRows = Selection.with();
    Selection table2DoneRows = Selection.with();
    // use table 2 for row iteration, which can significantly increase performance
    if (table1.rowCount() > table2.rowCount() && joinType == JoinType.INNER) {
      for (Row row : table2) {
        int ri = row.getRowNumber();
        if (table2DoneRows.contains(ri)) {
          // Already processed a selection of table2 that contained this row.
          continue;
        }
        // Both selections are keyed by the values of the current table2 row.
        Selection table1Rows =
            createMultiColSelection(
                table2, ri, table1Indexes, table1.rowCount(), table2JoinColumnIndexes);
        Selection table2Rows =
            createMultiColSelection(
                table2, ri, table2Indexes, table2.rowCount(), table2JoinColumnIndexes);
        crossProduct(
            result,
            table1,
            table2,
            table1Rows,
            table2Rows,
            resultIgnoreColIndexes,
            keepAllJoinKeyColumns);

        table2DoneRows = table2DoneRows.or(table2Rows);
        if (table2DoneRows.size() == table2.rowCount()) {
          // Processed all the rows in table2, exit early.
          return withoutIgnoredColumns(result, resultIgnoreColIndexes, keepAllJoinKeyColumns);
        }
      }
    } else {
      for (Row row : table1) {
        int ri = row.getRowNumber();
        if (table1DoneRows.contains(ri)) {
          // Already processed a selection of table1 that contained this row.
          continue;
        }
        // Both selections are keyed by the values of the current table1 row.
        Selection table1Rows =
            createMultiColSelection(
                table1, ri, table1Indexes, table1.rowCount(), joinColumnIndexes);
        Selection table2Rows =
            createMultiColSelection(
                table1, ri, table2Indexes, table2.rowCount(), joinColumnIndexes);
        if ((joinType == JoinType.LEFT_OUTER || joinType == JoinType.FULL_OUTER)
            && table2Rows.isEmpty()) {
          withMissingLeftJoin(
              result, table1, table1Rows, resultIgnoreColIndexes, keepAllJoinKeyColumns);
        } else {
          crossProduct(
              result,
              table1,
              table2,
              table1Rows,
              table2Rows,
              resultIgnoreColIndexes,
              keepAllJoinKeyColumns);
        }
        table1DoneRows = table1DoneRows.or(table1Rows);
        if (joinType == JoinType.FULL_OUTER || joinType == JoinType.RIGHT_OUTER) {
          // Remember the matched table2 rows; the unmatched ones are added after the loop.
          table2DoneRows = table2DoneRows.or(table2Rows);
        } else if (table1DoneRows.size() == table1.rowCount()) {
          // Processed all the rows in table1, exit early.
          return withoutIgnoredColumns(result, resultIgnoreColIndexes, keepAllJoinKeyColumns);
        }
      }
    }

    // Add all rows from table2 that were not handled already.
    Selection table2Rows = table2DoneRows.flip(0, table2.rowCount());
    withMissingRight(
        result,
        table1.columnCount(),
        table2,
        table2Rows,
        joinType,
        table2JoinColumnIndexes,
        resultIgnoreColIndexes,
        keepAllJoinKeyColumns);
    return withoutIgnoredColumns(result, resultIgnoreColIndexes, keepAllJoinKeyColumns);
  }

  /** Removes the redundant join key columns from the result unless all key columns are kept. */
  private Table withoutIgnoredColumns(
      Table result, Set<Integer> ignoreColIndexes, boolean keepAllJoinKeyColumns) {
    if (!keepAllJoinKeyColumns) {
      result.removeColumns(Ints.toArray(ignoreColIndexes));
    }
    return result;
  }

  /**
   * Checks that the two tables are joined on the same number of columns and that the indexes at
   * each position are of the same class.
   *
   * @param table1Indexes the reverse indexes of the join columns in table1.
   * @param table2Indexes the reverse indexes of the join columns in table2.
   * @throws IllegalArgumentException if the lists differ in size or the indexes at some position
   *     are of different classes
   */
  private void validateIndexes(List<Index> table1Indexes, List<Index> table2Indexes) {
    if (table1Indexes.size() != table2Indexes.size()) {
      throw new IllegalArgumentException(
          "Cannot join using a different number of indices on each table: "
              + table1Indexes
              + " and "
              + table2Indexes);
    }
    for (int i = 0; i < table1Indexes.size(); i++) {
      if (!table1Indexes.get(i).getClass().equals(table2Indexes.get(i).getClass())) {
        throw new IllegalArgumentException(
            "Cannot join using different index types: " + table1Indexes + " and " + table2Indexes);
      }
    }
  }

  /**
   * Build a reverse index for every join column in the table.
   *
   * @param joinColumnIndexes the positions of the join columns in {@code table}.
   * @param table the table to index.
   * @return one index per join column, in the order of {@code joinColumnIndexes}.
   */
  private List<Index> buildIndexesForJoinColumns(List<Integer> joinColumnIndexes, Table table) {
    return joinColumnIndexes.stream()
        .map(columnIndex -> indexFor(table, columnIndex))
        .collect(Collectors.toList());
  }

  /**
   * Creates the reverse index matching the type of the column at the given position.
   *
   * @param table the table holding the column.
   * @param colIndex the position of the column in the table.
   * @return a new index over the column.
   * @throws IllegalArgumentException if the column type is not one of the supported types
   */
  private Index indexFor(Table table, int colIndex) {
    ColumnType columnType = table.column(colIndex).type();
    if (columnType instanceof DateColumnType) {
      return new IntIndex(table.dateColumn(colIndex));
    }
    if (columnType instanceof DateTimeColumnType) {
      return new LongIndex(table.dateTimeColumn(colIndex));
    }
    if (columnType instanceof InstantColumnType) {
      return new LongIndex(table.instantColumn(colIndex));
    }
    if (columnType instanceof TimeColumnType) {
      return new IntIndex(table.timeColumn(colIndex));
    }
    if (columnType instanceof StringColumnType) {
      return new StringIndex(table.stringColumn(colIndex));
    }
    if (columnType instanceof TextColumnType) {
      return new StringIndex(table.textColumn(colIndex));
    }
    if (columnType instanceof IntColumnType) {
      return new IntIndex(table.intColumn(colIndex));
    }
    if (columnType instanceof LongColumnType) {
      return new LongIndex(table.longColumn(colIndex));
    }
    if (columnType instanceof ShortColumnType) {
      return new ShortIndex(table.shortColumn(colIndex));
    }
    if (columnType instanceof BooleanColumnType) {
      return new ByteIndex(table.booleanColumn(colIndex));
    }
    if (columnType instanceof DoubleColumnType) {
      return new DoubleIndex(table.doubleColumn(colIndex));
    }
    if (columnType instanceof FloatColumnType) {
      return new FloatIndex(table.floatColumn(colIndex));
    }
    throw new IllegalArgumentException(
        "Joining attempted on unsupported column type " + columnType);
  }

  /**
   * Given a reverse index find a selection of rows that have the same value as the supplied column
   * does in the given row index.
   *
   * @param valueColumn the column that supplies the value to look up.
   * @param rowIndex the row of {@code valueColumn} whose value is looked up.
   * @param rawIndex the reverse index to search; it must be the index class that {@code indexFor}
   *     creates for the type of {@code valueColumn}, otherwise the cast fails.
   * @return the rows of the indexed column that hold the looked-up value.
   * @throws IllegalArgumentException if the type of {@code valueColumn} is not supported
   */
  private Selection selectionForColumn(Column<?> valueColumn, int rowIndex, Index rawIndex) {

    ColumnType type = valueColumn.type();
    if (type instanceof DateColumnType) {
      IntIndex index = (IntIndex) rawIndex;
      int value = ((DateColumn) valueColumn).getIntInternal(rowIndex);
      return index.get(value);
    } else if (type instanceof TimeColumnType) {
      IntIndex index = (IntIndex) rawIndex;
      int value = ((TimeColumn) valueColumn).getIntInternal(rowIndex);
      return index.get(value);
    } else if (type instanceof DateTimeColumnType) {
      LongIndex index = (LongIndex) rawIndex;
      long value = ((DateTimeColumn) valueColumn).getLongInternal(rowIndex);
      return index.get(value);
    } else if (type instanceof InstantColumnType) {
      LongIndex index = (LongIndex) rawIndex;
      long value = ((InstantColumn) valueColumn).getLongInternal(rowIndex);
      return index.get(value);
    } else if (type instanceof StringColumnType) {
      StringIndex index = (StringIndex) rawIndex;
      String value = ((StringColumn) valueColumn).get(rowIndex);
      return index.get(value);
    } else if (type instanceof TextColumnType) {
      StringIndex index = (StringIndex) rawIndex;
      String value = ((TextColumn) valueColumn).get(rowIndex);
      return index.get(value);
    } else if (type instanceof IntColumnType) {
      IntIndex index = (IntIndex) rawIndex;
      int value = ((IntColumn) valueColumn).getInt(rowIndex);
      return index.get(value);
    } else if (type instanceof LongColumnType) {
      LongIndex index = (LongIndex) rawIndex;
      long value = ((LongColumn) valueColumn).getLong(rowIndex);
      return index.get(value);
    } else if (type instanceof ShortColumnType) {
      ShortIndex index = (ShortIndex) rawIndex;
      short value = ((ShortColumn) valueColumn).getShort(rowIndex);
      return index.get(value);
    } else if (type instanceof BooleanColumnType) {
      ByteIndex index = (ByteIndex) rawIndex;
      byte value = ((BooleanColumn) valueColumn).getByte(rowIndex);
      return index.get(value);
    } else if (type instanceof DoubleColumnType) {
      DoubleIndex index = (DoubleIndex) rawIndex;
      double value = ((DoubleColumn) valueColumn).getDouble(rowIndex);
      return index.get(value);
    } else if (type instanceof FloatColumnType) {
      FloatIndex index = (FloatIndex) rawIndex;
      float value = ((FloatColumn) valueColumn).getFloat(rowIndex);
      return index.get(value);
    } else {
      throw new IllegalArgumentException(
          "Joining is supported on numeric, string, and date-like columns. Column "
              + valueColumn.name()
              + " is of type "
              + valueColumn.type());
    }
  }
  /**
   * Creates the selection of rows that match the given row on every join column, by intersecting
   * the per-column selections.
   *
   * @param table the table that supplies the join key values.
   * @param ri row number of the row in {@code table} whose key values are looked up.
   * @param indexes a reverse index for every join column of the table being searched, in the same
   *     order as {@code joinColumnIndexes}.
   * @param selectionSize the row count of the table being searched; the selection starts out
   *     covering rows {@code 0} to {@code selectionSize}.
   * @param joinColumnIndexes the positions of the join key columns in {@code table}.
   * @return selection created
   */
  private Selection createMultiColSelection(
      Table table,
      int ri,
      List<Index> indexes,
      int selectionSize,
      List<Integer> joinColumnIndexes) {
    Selection multiColSelection = Selection.withRange(0, selectionSize);
    for (int i = 0; i < joinColumnIndexes.size(); i++) {
      Column<?> col = table.column(joinColumnIndexes.get(i));
      Selection oneColSelection = selectionForColumn(col, ri, indexes.get(i));
      // and the selections.
      multiColSelection = multiColSelection.and(oneColSelection);
    }
    return multiColSelection;
  }

  /** Builds the replacement name of a duplicate column: the table alias, a dot, the old name. */
  private String newName(String table2Alias, String columnName) {
    String qualifiedName = table2Alias + "." + columnName;
    return qualifiedName;
  }

  /**
   * Full-outer-joins the joiner's table with each of the given tables in turn, assuming every
   * table has columns named like the joiner's join columns. Duplicate column names are not
   * allowed.
   *
   * @param tables The tables to join with
   * @return The resulting table
   */
  public Table fullOuter(Table... tables) {
    final boolean allowDuplicateColumnNames = false;
    return fullOuter(allowDuplicateColumnNames, tables);
  }

  /**
   * Full-outer-joins the joiner's table with each of the given tables in turn; the result of one
   * join is the left-hand table of the next. Every table is joined on the joiner's join column
   * names.
   *
   * @param allowDuplicateColumnNames if {@code false} the join will fail if any columns other than
   *     the join column have the same name; if {@code true} the join will succeed and duplicate
   *     columns are renamed
   * @param tables The tables to join with
   * @return The resulting table
   */
  public Table fullOuter(boolean allowDuplicateColumnNames, Table... tables) {
    Table result = table;
    for (int t = 0; t < tables.length; t++) {
      Table next = tables[t];
      result =
          joinInternal(
              result, next, JoinType.FULL_OUTER, allowDuplicateColumnNames, false, joinColumnNames);
    }
    return result;
  }

  /**
   * Full-outer-joins the joiner's table to table2 on the given table2 columns, optionally keeping
   * the join key columns of both tables.
   *
   * @param table2 The table to join with
   * @param allowDuplicateColumnNames if {@code false} the join will fail if any columns other than
   *     the join column have the same name; if {@code true} the join will succeed and duplicate
   *     columns are renamed
   * @param keepAllJoinKeyColumns if {@code false} only the join key columns of the joiner's table
   *     are kept; if {@code true} the join key columns of both tables are returned
   * @param col2Names The names of the columns in table2 to join on, matched in order against the
   *     joiner's join columns. Their index types must match.
   * @return The resulting table
   */
  public Table fullOuter(
      Table table2,
      boolean allowDuplicateColumnNames,
      boolean keepAllJoinKeyColumns,
      String... col2Names) {
    final JoinType joinType = JoinType.FULL_OUTER;
    return joinInternal(
        table, table2, joinType, allowDuplicateColumnNames, keepAllJoinKeyColumns, col2Names);
  }

  /**
   * Full-outer-joins the joiner's table to table2 on a single table2 column. Duplicate column
   * names are not allowed and only the join key columns of the joiner's table are kept.
   *
   * @param table2 The table to join with
   * @param col2Name The name of the column in table2 to join on. Its index type must match the
   *     index type of the joiner's join column.
   * @return The resulting table
   */
  public Table fullOuter(Table table2, String col2Name) {
    final boolean allowDuplicateColumnNames = false;
    final boolean keepAllJoinKeyColumns = false;
    return joinInternal(
        table,
        table2,
        JoinType.FULL_OUTER,
        allowDuplicateColumnNames,
        keepAllJoinKeyColumns,
        col2Name);
  }

  /**
   * Left-outer-joins the joiner's table with each of the given tables in turn, assuming every
   * table has columns named like the joiner's join columns. Duplicate column names are not
   * allowed.
   *
   * @param tables The tables to join with
   * @return The resulting table
   */
  public Table leftOuter(Table... tables) {
    final boolean allowDuplicateColumnNames = false;
    return leftOuter(allowDuplicateColumnNames, tables);
  }

  /**
   * Left-outer-joins the joiner's table with each of the given tables in turn; the result of one
   * join is the left-hand table of the next. Every table is joined on the joiner's join column
   * names.
   *
   * @param allowDuplicateColumnNames if {@code false} the join will fail if any columns other than
   *     the join column have the same name; if {@code true} the join will succeed and duplicate
   *     columns are renamed
   * @param tables The tables to join with
   * @return The resulting table
   */
  public Table leftOuter(boolean allowDuplicateColumnNames, Table... tables) {
    Table result = table;
    for (int t = 0; t < tables.length; t++) {
      Table next = tables[t];
      result =
          joinInternal(
              result, next, JoinType.LEFT_OUTER, allowDuplicateColumnNames, false, joinColumnNames);
    }
    return result;
  }

  /**
   * Left-outer-joins the joiner's table to table2 on the given table2 columns. Duplicate column
   * names are not allowed.
   *
   * @param table2 The table to join with
   * @param col2Names The names of the columns in table2 to join on, matched in order against the
   *     joiner's join columns. Their index types must match.
   * @return The resulting table
   */
  public Table leftOuter(Table table2, String[] col2Names) {
    final boolean allowDuplicateColumnNames = false;
    return leftOuter(table2, allowDuplicateColumnNames, col2Names);
  }

  /**
   * Left-outer-joins the joiner's table to table2 on a single table2 column. Duplicate column
   * names are not allowed.
   *
   * @param table2 The table to join with
   * @param col2Name The name of the column in table2 to join on. Its index type must match the
   *     index type of the joiner's join column.
   * @return The resulting table
   */
  public Table leftOuter(Table table2, String col2Name) {
    final boolean allowDuplicateColumnNames = false;
    return leftOuter(table2, allowDuplicateColumnNames, col2Name);
  }

  /**
   * Left-outer-joins the joiner's table to table2 on the given table2 columns. Only the join key
   * columns of the joiner's table are kept in the result.
   *
   * @param table2 The table to join with
   * @param allowDuplicateColumnNames if {@code false} the join will fail if any columns other than
   *     the join column have the same name; if {@code true} the join will succeed and duplicate
   *     columns are renamed
   * @param col2Names The names of the columns in table2 to join on, matched in order against the
   *     joiner's join columns. Their index types must match.
   * @return The resulting table
   */
  public Table leftOuter(Table table2, boolean allowDuplicateColumnNames, String... col2Names) {
    final boolean keepAllJoinKeyColumns = false;
    return joinInternal(
        table,
        table2,
        JoinType.LEFT_OUTER,
        allowDuplicateColumnNames,
        keepAllJoinKeyColumns,
        col2Names);
  }

  /**
   * Left-outer-joins the joiner's table to table2 on the given table2 columns, optionally keeping
   * the join key columns of both tables.
   *
   * @param table2 The table to join with
   * @param allowDuplicateColumnNames if {@code false} the join will fail if any columns other than
   *     the join column have the same name; if {@code true} the join will succeed and duplicate
   *     columns are renamed
   * @param keepAllJoinKeyColumns if {@code false} only the join key columns of the joiner's table
   *     are kept; if {@code true} the join key columns of both tables are returned
   * @param col2Names The names of the columns in table2 to join on, matched in order against the
   *     joiner's join columns. Their index types must match.
   * @return The resulting table
   */
  public Table leftOuter(
      Table table2,
      boolean allowDuplicateColumnNames,
      boolean keepAllJoinKeyColumns,
      String... col2Names) {
    final JoinType joinType = JoinType.LEFT_OUTER;
    return joinInternal(
        table, table2, joinType, allowDuplicateColumnNames, keepAllJoinKeyColumns, col2Names);
  }

  /**
   * Right-outer-joins the joiner's table with each of the given tables in turn, assuming every
   * table has columns named like the joiner's join columns. Duplicate column names are not
   * allowed.
   *
   * @param tables The tables to join with
   * @return The resulting table
   */
  public Table rightOuter(Table... tables) {
    final boolean allowDuplicateColumnNames = false;
    return rightOuter(allowDuplicateColumnNames, tables);
  }

  /**
   * Right-outer-joins the joiner's table with each of the given tables in turn; the result of one
   * join is the left-hand table of the next. Every table is joined on the joiner's join column
   * names.
   *
   * <p>While the chain of joins runs, the joiner's stored join column indexes are re-resolved
   * against each intermediate result. They are restored before this method returns or throws, so
   * the joiner can be reused for further joins on its original table.
   *
   * @param allowDuplicateColumnNames if {@code false} the join will fail if any columns other than
   *     the join column have the same name; if {@code true} the join will succeed and duplicate
   *     columns are renamed
   * @param tables The tables to join with
   * @return The resulting table
   */
  public Table rightOuter(boolean allowDuplicateColumnNames, Table... tables) {
    // Snapshot the indexes that belong to the joiner's own table; the loop overwrites them.
    List<Integer> originalJoinColumnIndexes = new ArrayList<>(joinColumnIndexes);
    try {
      Table joined = table;
      for (Table table2 : tables) {
        joined =
            joinInternal(
                joined,
                table2,
                JoinType.RIGHT_OUTER,
                allowDuplicateColumnNames,
                false,
                joinColumnNames);
        // The join columns may sit at different positions in the joined table, so look them up
        // again before that table becomes the left-hand side of the next join.
        joinColumnIndexes.clear();
        joinColumnIndexes.addAll(getJoinIndexes(joined, joinColumnNames));
      }
      return joined;
    } finally {
      joinColumnIndexes.clear();
      joinColumnIndexes.addAll(originalJoinColumnIndexes);
    }
  }

  /**
   * Right-outer-joins the joiner's table to table2 on a single table2 column. Duplicate column
   * names are not allowed.
   *
   * @param table2 The table to join with
   * @param col2Name The name of the column in table2 to join on. Its index type must match the
   *     index type of the joiner's join column.
   * @return The resulting table
   */
  public Table rightOuter(Table table2, String col2Name) {
    final boolean allowDuplicateColumnNames = false;
    return rightOuter(table2, allowDuplicateColumnNames, col2Name);
  }

  /**
   * Right-outer-joins the joiner's table to table2 on the given table2 columns. Duplicate column
   * names are not allowed.
   *
   * @param table2 The table to join with
   * @param col2Names The names of the columns in table2 to join on, matched in order against the
   *     joiner's join columns. Their index types must match.
   * @return The resulting table
   */
  public Table rightOuter(Table table2, String[] col2Names) {
    final boolean allowDuplicateColumnNames = false;
    return rightOuter(table2, allowDuplicateColumnNames, col2Names);
  }

  /**
   * Right-outer-joins the joiner's table to table2 on the given table2 columns. Only the join key
   * columns of table2 are kept in the result.
   *
   * @param table2 The table to join with
   * @param allowDuplicateColumnNames if {@code false} the join will fail if any columns other than
   *     the join column have the same name; if {@code true} the join will succeed and duplicate
   *     columns are renamed
   * @param col2Names The names of the columns in table2 to join on, matched in order against the
   *     joiner's join columns. Their index types must match.
   * @return The resulting table
   */
  public Table rightOuter(Table table2, boolean allowDuplicateColumnNames, String... col2Names) {
    final boolean keepAllJoinKeyColumns = false;
    return joinInternal(
        table,
        table2,
        JoinType.RIGHT_OUTER,
        allowDuplicateColumnNames,
        keepAllJoinKeyColumns,
        col2Names);
  }

  /**
   * Right-outer-joins the joiner's table to table2 on the given table2 columns, optionally keeping
   * the join key columns of both tables.
   *
   * @param table2 The table to join with
   * @param allowDuplicateColumnNames if {@code false} the join will fail if any columns other than
   *     the join column have the same name; if {@code true} the join will succeed and duplicate
   *     columns are renamed
   * @param keepAllJoinKeyColumns if {@code false} only the join key columns of table2 are kept; if
   *     {@code true} the join key columns of both tables are returned
   * @param col2Names The names of the columns in table2 to join on, matched in order against the
   *     joiner's join columns. Their index types must match.
   * @return The resulting table
   */
  public Table rightOuter(
      Table table2,
      boolean allowDuplicateColumnNames,
      boolean keepAllJoinKeyColumns,
      String... col2Names) {
    final JoinType joinType = JoinType.RIGHT_OUTER;
    return joinInternal(
        table, table2, joinType, allowDuplicateColumnNames, keepAllJoinKeyColumns, col2Names);
  }

  /**
   * Adds empty columns to the destination table with the same type as columns in table1 and table2.
   *
   * 

For inner, left and full outer join types the join columns in table2 are not needed and will * be marked as placeholders. The indexes of those columns will be returned. The downstream logic * is easier if we wait to remove the redundant columns until the last step. * * @param destination the table to fill up with columns. Will be mutated in place. * @param table1 the table on left side of the join. * @param table2 the table on the right side of the join. * @param joinType the type of join. * @param allowDuplicates whether to allow duplicates. If yes rename columns in table2 that have * the same name as columns in table1 with the exception of join columns in table2 when * performing a right join. * @param table2JoinColumnIndexes the index locations of the table2 join columns. * @return A */ private Set emptyTableFromColumns( Table destination, Table table1, Table table2, JoinType joinType, boolean allowDuplicates, List table2JoinColumnIndexes, boolean keepTable2JoinKeyColumns) { Column[] cols = Streams.concat(table1.columns().stream(), table2.columns().stream()) .map(Column::emptyCopy) .toArray(Column[]::new); // For inner join, left join and full outer join mark the join columns in table2 as // placeholders. // For right join mark the join columns in table1 as placeholders. // Keep track of which join columns are placeholders so they can be ignored. 
Set ignoreColumns = new HashSet<>(); for (int c = 0; c < cols.length; c++) { if (joinType == JoinType.RIGHT_OUTER) { if (c < table1.columnCount() && joinColumnIndexes.contains(c)) { if (!keepTable2JoinKeyColumns) { cols[c].setName("Placeholder_" + ignoreColumns.size()); } ignoreColumns.add(c); } } else { int table2Index = c - table1.columnCount(); if (c >= table1.columnCount() && table2JoinColumnIndexes.contains(table2Index)) { if (!keepTable2JoinKeyColumns) { cols[c].setName("Placeholder_" + ignoreColumns.size()); } ignoreColumns.add(c); } } } // Rename duplicate columns in second table if (allowDuplicates) { Set table1ColNames = Arrays.stream(cols) .map(Column::name) .map(String::toLowerCase) .limit(table1.columnCount()) .collect(Collectors.toSet()); String table2Alias = TABLE_ALIAS + joinTableId.getAndIncrement(); for (int c = table1.columnCount(); c < cols.length; c++) { String columnName = cols[c].name(); if (table1ColNames.contains(columnName.toLowerCase())) { cols[c].setName(newName(table2Alias, columnName)); } } } destination.addColumns(cols); return ignoreColumns; } /** * Creates cross product for the selection of two tables. * * @param destination the destination table. * @param table1 the table on left of join. * @param table2 the table on right of join. * @param table1Rows the selection of rows in table1. * @param table2Rows the selection of rows in table2. * @param ignoreColumns a set of column indexes in the result to ignore. They are redundant join * columns. 
*/ @SuppressWarnings({"rawtypes", "unchecked"}) private void crossProduct( Table destination, Table table1, Table table2, Selection table1Rows, Selection table2Rows, Set ignoreColumns, boolean keepTable2JoinKeyColumns) { for (int c = 0; c < table1.columnCount() + table2.columnCount(); c++) { if (!keepTable2JoinKeyColumns && ignoreColumns.contains(c)) { continue; } int table2Index = c - table1.columnCount(); for (int r1 : table1Rows) { for (int r2 : table2Rows) { if (c < table1.columnCount()) { Column t1Col = table1.column(c); destination.column(c).append(t1Col, r1); } else { Column t2Col = table2.column(table2Index); destination.column(c).append(t2Col, r2); } } } } } /** * Adds rows to destination for each row in table1 with the columns from table2 added as missing * values. */ @SuppressWarnings({"rawtypes", "unchecked"}) private void withMissingLeftJoin( Table destination, Table table1, Selection table1Rows, Set ignoreColumns, boolean keepTable2JoinKeyColumns) { for (int c = 0; c < destination.columnCount(); c++) { if (!keepTable2JoinKeyColumns && ignoreColumns.contains(c)) { continue; } if (c < table1.columnCount()) { Column t1Col = table1.column(c); for (int index : table1Rows) { destination.column(c).append(t1Col, index); } } else { for (int r1 = 0; r1 < table1Rows.size(); r1++) { destination.column(c).appendMissing(); } } } } /** * Adds rows to destination for each row in table2 with the columns from table1 added as missing * values. */ @SuppressWarnings({"rawtypes", "unchecked"}) private void withMissingRight( Table destination, int table1ColCount, Table table2, Selection table2Rows, JoinType joinType, List col2Indexes, Set ignoreColumns, boolean keepTable2JoinKeyColumns) { // Add index data from table2 into join column positions in table one. 
if (joinType == JoinType.FULL_OUTER) { for (int i = 0; i < col2Indexes.size(); i++) { Column t2Col = table2.column(col2Indexes.get(i)); for (int index : table2Rows) { destination.column(joinColumnIndexes.get(i)).append(t2Col, index); } } } for (int c = 0; c < destination.columnCount(); c++) { if (!keepTable2JoinKeyColumns) { if (ignoreColumns.contains(c) || joinColumnIndexes.contains(c)) { continue; } } if (c < table1ColCount) { for (int r1 = 0; r1 < table2Rows.size(); r1++) { destination.column(c).appendMissing(); } } else { Column t2Col = table2.column(c - table1ColCount); for (int index : table2Rows) { destination.column(c).append(t2Col, index); } } } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy