All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.github.lwhite1.tablesaw.api.CategoryColumn Maven / Gradle / Ivy

There is a newer version: 0.7.7.3
Show newest version
package com.github.lwhite1.tablesaw.api;

import com.github.lwhite1.tablesaw.columns.AbstractColumn;
import com.github.lwhite1.tablesaw.columns.CategoryColumnUtils;
import com.github.lwhite1.tablesaw.columns.Column;
import com.github.lwhite1.tablesaw.filtering.StringBiPredicate;
import com.github.lwhite1.tablesaw.filtering.StringPredicate;
import com.github.lwhite1.tablesaw.filtering.text.CategoryFilters;
import com.github.lwhite1.tablesaw.io.TypeUtils;
import com.github.lwhite1.tablesaw.store.ColumnMetadata;
import com.github.lwhite1.tablesaw.util.BitmapBackedSelection;
import com.github.lwhite1.tablesaw.util.DictionaryMap;
import com.github.lwhite1.tablesaw.util.Selection;
import com.google.common.base.CharMatcher;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import it.unimi.dsi.fastutil.ints.Int2IntMap;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntComparator;
import it.unimi.dsi.fastutil.ints.IntListIterator;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * A column in a base table that contains categorical (string) values.
 * <p>
 * Every distinct string is stored once in a {@link DictionaryMap} and is assigned an integer key; the column
 * itself stores one key per row in an {@link IntArrayList}.
 * <p>
 * Keys are handed out by incrementing a plain {@code int} counter without any synchronization (see the TODO in
 * {@link #set(int, String)}).
 */
public class CategoryColumn extends AbstractColumn
    implements CategoryFilters, CategoryColumnUtils, Iterable<String> {

  // number of bytes used to encode one cell (a single int key)
  private static final int BYTE_SIZE = 4;

  public static final String MISSING_VALUE = (String) ColumnType.CATEGORY.getMissingValue();

  // initial capacity of the key list when the caller does not supply one
  private static final int DEFAULT_ARRAY_SIZE = 128;

  // the key that will be assigned to the next string that is not yet in the lookup table
  private int id = 0;

  // holds a key for each row in the table. the key can be used to lookup the backing string value
  private IntArrayList values;

  // a bidirectional map of keys to backing string values.
  private DictionaryMap lookupTable = new DictionaryMap();

  /**
   * Returns a new, empty column with the given name and the default initial capacity
   */
  public static CategoryColumn create(String name) {
    return create(name, DEFAULT_ARRAY_SIZE);
  }

  /**
   * Returns a new, empty column with the given name and an initial capacity of {@code size} rows
   */
  public static CategoryColumn create(String name, int size) {
    return new CategoryColumn(name, size);
  }

  /**
   * Returns a new column with the given name, containing one row for each string in {@code categories},
   * in list order
   */
  public static CategoryColumn create(String name, List<String> categories) {
    CategoryColumn column = new CategoryColumn(name, categories.size());
    for (String category : categories) {
      column.add(category);
    }
    return column;
  }

  private CategoryColumn(String name) {
    super(name);
    values = new IntArrayList(DEFAULT_ARRAY_SIZE);
  }

  public CategoryColumn(ColumnMetadata metadata) {
    super(metadata);
    values = new IntArrayList(DEFAULT_ARRAY_SIZE);
  }

  public CategoryColumn(String name, int size) {
    super(name);
    values = new IntArrayList(size);
  }

  @Override
  public ColumnType type() {
    return ColumnType.CATEGORY;
  }

  /**
   * Returns the string stored at the given row; identical to {@link #get(int)}
   */
  @Override
  public String getString(int row) {
    return get(row);
  }

  /**
   * Returns a column with the same name and comment as this one, but no rows and an empty dictionary
   */
  @Override
  public CategoryColumn emptyCopy() {
    CategoryColumn copy = new CategoryColumn(name());
    copy.setComment(comment());
    return copy;
  }

  /**
   * Returns a column with the same name and comment as this one, but no rows, an empty dictionary and an
   * initial capacity of {@code rowSize}
   */
  @Override
  public CategoryColumn emptyCopy(int rowSize) {
    CategoryColumn copy = new CategoryColumn(name(), rowSize);
    copy.setComment(comment());
    return copy;
  }

  /**
   * Sorts the rows of this column in place, in ascending order of their string values
   */
  @Override
  public void sortAscending() {
    // elements() exposes the whole backing array; restrict the sort to the populated prefix so that
    // unused capacity slots are never mixed into the data
    IntArrays.parallelQuickSort(values.elements(), 0, values.size(), dictionarySortComparator);
  }

  // orders two dictionary keys by the natural order of the strings they stand for
  private IntComparator dictionarySortComparator = new IntComparator() {
    @Override
    public int compare(int i, int i1) {
      return lookupTable.get(i).compareTo(lookupTable.get(i1));
    }

    @Override
    public int compare(Integer o1, Integer o2) {
      return compare((int) o1, (int) o2);
    }
  };

  // orders two dictionary keys by the reverse natural order of the strings they stand for
  private IntComparator reverseDictionarySortComparator = new IntComparator() {
    @Override
    public int compare(int i, int i1) {
      // swap the operands instead of negating the result
      return lookupTable.get(i1).compareTo(lookupTable.get(i));
    }

    @Override
    public int compare(Integer o1, Integer o2) {
      return compare((int) o1, (int) o2);
    }
  };

  /**
   * Sorts the rows of this column in place, in descending order of their string values
   */
  @Override
  public void sortDescending() {
    // see sortAscending() for why the range is given explicitly
    IntArrays.parallelQuickSort(values.elements(), 0, values.size(), reverseDictionarySortComparator);
  }

  /**
   * Returns the number of elements (a.k.a. rows or cells) in the column
   */
  @Override
  public int size() {
    return values.size();
  }

  /**
   * Returns the value at rowIndex in this column. The index is zero-based.
   *
   * @throws IndexOutOfBoundsException if the given rowIndex is not in the column
   */
  public String get(int rowIndex) {
    int key = values.getInt(rowIndex);
    return lookupTable.get(key);
  }

  /**
   * Returns a new list built from the dictionary's category array. NOTE(review): this is derived from the
   * dictionary rather than from the rows, so it presumably holds each distinct category once - confirm
   * against DictionaryMap.categoryArray()
   */
  public List<String> toList() {
    return Lists.newArrayList(dictionaryMap().categoryArray());
  }

  @Override
  public Table summary() {
    return countByCategory();
  }

  /**
   * Returns a two-column table ("Category", "Count") holding, for every key that occurs in this column, the
   * corresponding string and the number of rows that contain it
   */
  public Table countByCategory() {
    Table t = new Table("Column: " + name());
    CategoryColumn categories = CategoryColumn.create("Category");
    IntColumn counts = IntColumn.create("Count");

    // tally the rows per dictionary key
    Int2IntMap valueToCount = new Int2IntOpenHashMap();
    for (int next : values) {
      if (valueToCount.containsKey(next)) {
        valueToCount.put(next, valueToCount.get(next) + 1);
      } else {
        valueToCount.put(next, 1);
      }
    }

    // translate each key back to its string for the output table
    for (Map.Entry<Integer, Integer> entry : valueToCount.entrySet()) {
      int key = entry.getKey();
      int count = entry.getValue();
      categories.add(lookupTable.get(key));
      counts.add(count);
    }
    t.addColumn(categories);
    t.addColumn(counts);
    return t;
  }

  /**
   * Removes all rows and all dictionary entries from this column
   */
  @Override
  public void clear() {
    values.clear();
    lookupTable.clear();
  }

  /**
   * Replaces the value at rowIndex with the given string, adding the string to the dictionary if it is not
   * already present
   */
  public void set(int rowIndex, String stringValue) {
    int valueId;
    if (lookupTable.contains(stringValue)) {
      valueId = lookupTable.get(stringValue);
    } else {
      // TODO(lwhite): synchronize id() or column-level saveTable lock so we can increment id safely without
      // atomic integer objects
      valueId = id++;
      lookupTable.put(valueId, stringValue);
    }
    values.set(rowIndex, valueId);
  }

  /**
   * Returns the number of entries in the dictionary
   */
  @Override
  public int countUnique() {
    return lookupTable.size();
  }

  /**
   * Returns the largest ("top") n values in the column
   *
   * @param n The maximum number of records to return. The actual number will be smaller if n is greater than the
   *          number of observations in the column
   * @return A list, possibly empty, of the largest observations
   */
  public List<String> top(int n) {
    List<String> top = new ArrayList<>();
    CategoryColumn copy = this.copy();
    copy.sortDescending();
    // never read past the end of the column
    int limit = Math.min(n, copy.size());
    for (int i = 0; i < limit; i++) {
      top.add(copy.get(i));
    }
    return top;
  }

  /**
   * Returns the smallest ("bottom") n values in the column
   *
   * @param n The maximum number of records to return. The actual number will be smaller if n is greater than the
   *          number of observations in the column
   * @return A list, possibly empty, of the smallest n observations
   */
  public List<String> bottom(int n) {
    List<String> bottom = new ArrayList<>();
    CategoryColumn copy = this.copy();
    copy.sortAscending();
    // never read past the end of the column
    int limit = Math.min(n, copy.size());
    for (int i = 0; i < limit; i++) {
      bottom.add(copy.get(i));
    }
    return bottom;
  }

  /**
   * Appends a row holding the given string, adding the string to the dictionary if the dictionary lookup
   * reports it as absent (a negative key)
   */
  public void add(String stringValue) {
    int valueId = lookupTable.get(stringValue);
    if (valueId < 0) {
      valueId = id++;
      lookupTable.put(valueId, stringValue);
    }
    values.add(valueId);
  }

  /**
   * Appends, for every key in {@code list}, the string that {@code map} associates with that key
   */
  public void initializeWith(IntArrayList list, DictionaryMap map) {
    for (int key : list) {
      add(map.get(key));
    }
  }

  /**
   * Returns true if this column contains a cell with the given string, and false otherwise
   */
  public boolean contains(String aString) {
    return values.indexOf(dictionaryMap().get(aString)) >= 0;
  }

  /**
   * Returns all the values associated with the given indexes
   */
  public IntArrayList getValues(IntArrayList indexes) {
    IntArrayList newList = new IntArrayList(indexes.size());
    for (int i : indexes) {
      newList.add(values.getInt(i));
    }
    return newList;
  }

  /**
   * Add all the strings in the list to this column
   */
  public void addAll(List<String> stringValues) {
    for (String stringValue : stringValues) {
      add(stringValue);
    }
  }

  /**
   * Compares two rows (by row index) using the natural order of their string values
   */
  public final IntComparator rowComparator = new IntComparator() {

    @Override
    public int compare(int i, int i1) {
      String f1 = get(i);
      String f2 = get(i1);
      return f1.compareTo(f2);
    }

    @Override
    public int compare(Integer i, Integer i1) {
      return compare((int) i, (int) i1);
    }
  };

  /**
   * Returns {@link #MISSING_VALUE} if the given string is null, empty, or one of
   * {@code TypeUtils.MISSING_INDICATORS}; otherwise returns the string unchanged
   */
  public static String convert(String stringValue) {
    if (Strings.isNullOrEmpty(stringValue) || TypeUtils.MISSING_INDICATORS.contains(stringValue)) {
      return MISSING_VALUE;
    }
    return stringValue;
  }

  /**
   * Appends the given cell text after passing it through {@link #convert(String)}
   *
   * @throws RuntimeException if a NullPointerException occurs while adding; the message names this column and
   *                          the offending value, and the original exception is kept as the cause
   */
  public void addCell(String object) {
    try {
      add(convert(object));
    } catch (NullPointerException e) {
      throw new RuntimeException(name() + ": "
          + String.valueOf(object) + ": "
          + e.getMessage(), e);
    }
  }

  @Override
  public IntComparator rowComparator() {
    return rowComparator;
  }

  @Override
  public boolean isEmpty() {
    return values.isEmpty();
  }

  /**
   * Returns a selection of the rows whose value equals the given string; the selection is empty if the string
   * is not in the dictionary
   */
  public Selection isEqualTo(String string) {
    Selection results = new BitmapBackedSelection();
    int key = lookupTable.get(string);
    if (key >= 0) {
      int i = 0;
      for (int next : values) {
        if (key == next) {
          results.add(i);
        }
        i++;
      }
    }
    return results;
  }

  /**
   * Returns a selection of the rows whose value differs from the given string. If the string is not in the
   * dictionary, no row can equal it, so every row is selected
   */
  public Selection isNotEqualTo(String string) {
    Selection results = new BitmapBackedSelection();
    // an absent string yields a negative key (see add()), which differs from every stored key
    int key = lookupTable.get(string);
    int i = 0;
    for (int next : values) {
      if (key != next) {
        results.add(i);
      }
      i++;
    }
    return results;
  }

  /**
   * Returns a list of boolean columns suitable for use as dummy variables in, for example, regression analysis,
   * where a column of categorical data must be encoded as a list of columns, such that each column represents
   * a single category and indicates whether it is present (1) or not present (0)
   */
  public List<BooleanColumn> getDummies() {
    List<BooleanColumn> results = new ArrayList<>();

    // create one (initially empty) boolean column per dictionary entry, named after the category
    for (Int2ObjectMap.Entry<String> entry : lookupTable.keyToValueMap().int2ObjectEntrySet()) {
      BooleanColumn column = BooleanColumn.create(entry.getValue());
      results.add(column);
    }

    // iterate over the values, updating the dummy variable columns as appropriate
    for (int next : values) {
      String category = lookupTable.get(next);
      for (BooleanColumn column : results) {
        //TODO(lwhite): update the correct row more efficiently, by using set rather than add & only updating true
        column.add(category.equals(column.name()));
      }
    }
    return results;
  }

  /**
   * Returns the dictionary key stored at the given row
   */
  public int getInt(int rowNumber) {
    return values.getInt(rowNumber);
  }

  /**
   * Returns a new column, named after this one with the suffix " Unique values", with one row for each
   * category in the dictionary
   */
  public CategoryColumn unique() {
    List<String> strings = new ArrayList<>(lookupTable.categories());
    return CategoryColumn.create(name() + " Unique values", strings);
  }

  /**
   * Returns the integers that back this column
   */
  public IntArrayList data() {
    return values;
  }

  /**
   * Returns a new int column, named after this one with the suffix ": codes", holding the dictionary key of
   * every row
   */
  public IntColumn toIntColumn() {
    IntColumn intColumn = IntColumn.create(this.name() + ": codes", size());
    IntArrayList data = data();
    for (int i = 0; i < size(); i++) {
      intColumn.add(data.getInt(i));
    }
    return intColumn;
  }

  /**
   * Returns the dictionary that maps keys to strings for this column (the live instance, not a copy)
   */
  public DictionaryMap dictionaryMap() {
    return lookupTable;
  }

  @Override
  public String toString() {
    return "Category column: " + name();
  }

  /**
   * Returns an array holding the row indexes 0, 1, ..., size() - 1
   */
  public int[] indexes() {
    int[] rowIndexes = new int[size()];
    for (int i = 0; i < size(); i++) {
      rowIndexes[i] = i;
    }
    return rowIndexes;
  }

  /**
   * Returns a new column, named after this one with the suffix "[repl]", in which every match of each of the
   * given regular expressions has been replaced with {@code replacement}. The expressions are applied to each
   * value in array order
   */
  public CategoryColumn replaceAll(String[] regexArray, String replacement) {

    CategoryColumn newColumn = CategoryColumn.create(name() + "[repl]", this.size());

    for (int r = 0; r < size(); r++) {
      String value = get(r);
      for (String regex : regexArray) {
        value = value.replaceAll(regex, replacement);
      }
      newColumn.add(value);
    }
    return newColumn;
  }

  /**
   * Splits every value on the given separator and returns a new column, named after this one with the suffix
   * "[sorted]", holding the trimmed, non-empty tokens sorted lexicographically and joined with a single space
   */
  public CategoryColumn tokenizeAndSort(String separator) {
    return sortTokens(Splitter.on(separator).trimResults().omitEmptyStrings());
  }

  /**
   * Splits on Whitespace and returns the lexicographically sorted result
   */
  public CategoryColumn tokenizeAndSort() {
    return sortTokens(Splitter.on(CharMatcher.WHITESPACE).trimResults().omitEmptyStrings());
  }

  // Builds the "[sorted]" column: each value is split with the given splitter, its tokens sorted and re-joined
  private CategoryColumn sortTokens(Splitter splitter) {
    CategoryColumn newColumn = CategoryColumn.create(name() + "[sorted]", this.size());

    for (int r = 0; r < size(); r++) {
      // splitToList is copied into a fresh ArrayList so that the tokens can be sorted in place
      List<String> tokens = new ArrayList<>(splitter.splitToList(get(r)));
      Collections.sort(tokens);
      newColumn.add(String.join(" ", tokens));
    }
    return newColumn;
  }

  /**
   * Splits every value on whitespace and returns a new column, named after this one with the suffix
   * "[without duplicates]", holding the distinct tokens joined with a single space. The tokens pass through a
   * {@link HashSet}, so their order in the result is the set's iteration order
   */
  public CategoryColumn tokenizeAndRemoveDuplicates() {
    CategoryColumn newColumn = CategoryColumn.create(name() + "[without duplicates]", this.size());
    Splitter splitter = Splitter.on(CharMatcher.WHITESPACE).trimResults().omitEmptyStrings();

    for (int r = 0; r < size(); r++) {
      List<String> tokens = new ArrayList<>(splitter.splitToList(get(r)));
      newColumn.add(String.join(" ", new HashSet<>(tokens)));
    }
    return newColumn;
  }

  /**
   * Returns the column title followed by the string value of every row, one per line
   */
  public String print() {
    StringBuilder builder = new StringBuilder();
    builder.append(title());
    for (int key : values) {
      // the list holds dictionary keys, not row indexes, so resolve them through the lookup table
      builder.append(lookupTable.get(key));
      builder.append('\n');
    }
    return builder.toString();
  }

  @Override
  public Selection isMissing() {
    return select(isMissing);
  }

  @Override
  public Selection isNotMissing() {
    return select(isNotMissing);
  }


  /**
   * Returns a selection of the rows whose string value satisfies the given predicate
   */
  public Selection select(StringPredicate predicate) {
    Selection selection = new BitmapBackedSelection();
    for (int idx = 0; idx < size(); idx++) {
      // get() takes a row index and resolves the key itself
      if (predicate.test(get(idx))) {
        selection.add(idx);
      }
    }
    return selection;
  }

  /**
   * Returns a selection of the rows for which the given predicate holds when applied to the row's string value
   * and {@code value}
   */
  public Selection select(StringBiPredicate predicate, String value) {
    Selection selection = new BitmapBackedSelection();
    for (int idx = 0; idx < size(); idx++) {
      // get() takes a row index and resolves the key itself
      if (predicate.test(get(idx), value)) {
        selection.add(idx);
      }
    }
    return selection;
  }

  /**
   * Returns a copy of this column with the same name, comment, rows and dictionary contents
   */
  public CategoryColumn copy() {
    CategoryColumn newCol = CategoryColumn.create(name(), size());
    newCol.lookupTable = new DictionaryMap(lookupTable);
    // carry the key counter over, otherwise the first new string added to the copy would reuse key 0
    newCol.id = id;
    newCol.values.addAll(values);
    newCol.setComment(comment());
    return newCol;
  }

  /**
   * Appends every row of the given column to this one
   *
   * @throws IllegalArgumentException if the given column is not of the same type as this column
   */
  @Override
  public void append(Column column) {
    Preconditions.checkArgument(column.type() == this.type());
    CategoryColumn other = (CategoryColumn) column;
    for (int i = 0; i < other.size(); i++) {
      // go through the string value: the two columns do not share a dictionary
      add(other.get(i));
    }
  }

  /**
   * Returns the count of missing values in this column
   */
  @Override
  public int countMissing() {
    int count = 0;
    for (int i = 0; i < size(); i++) {
      if (MISSING_VALUE.equals(get(i))) {
        count++;
      }
    }
    return count;
  }

  /**
   * Returns an iterator over the string value of every row, in row order
   */
  @Override
  public Iterator<String> iterator() {
    return new Iterator<String>() {

      private final IntListIterator valuesIt = values.iterator();

      @Override
      public boolean hasNext() {
        return valuesIt.hasNext();
      }

      @Override
      public String next() {
        return lookupTable.get(valuesIt.nextInt());
      }
    };
  }

  /**
   * Returns a new column (an empty copy of this one) containing only the values that satisfy the predicate
   */
  public CategoryColumn selectIf(StringPredicate predicate) {
    CategoryColumn column = emptyCopy();
    for (String next : this) {
      if (predicate.test(next)) {
        column.add(next);
      }
    }
    return column;
  }

  /**
   * Returns the set of categories held by the dictionary
   */
  public Set<String> asSet() {
    return lookupTable.categories();
  }

  /**
   * Returns the integer encoded value of each cell in this column. It can be used to lookup the mapped string in
   * the lookupTable
   */
  public IntArrayList values() {
    return values;
  }

  @Override
  public int byteSize() {
    return BYTE_SIZE;
  }

  /**
   * Returns the contents of the cell at rowNumber as a byte[]
   */
  @Override
  public byte[] asBytes(int rowNumber) {
    return ByteBuffer.allocate(BYTE_SIZE).putInt(getInt(rowNumber)).array();
  }

  /**
   * Returns a selection of the rows whose value is one of the given strings. Strings that are not in the
   * dictionary are ignored
   */
  public Selection isIn(String ... strings) {
    // collect the keys of the strings that are actually present in the dictionary
    IntArrayList keys = new IntArrayList();
    for (String string : strings) {
      int key = lookupTable.get(string);
      if (key >= 0) {
        keys.add(key);
      }
    }

    int i = 0;
    Selection results = new BitmapBackedSelection();
    for (int next : values) {
      if (keys.contains(next)) {
        results.add(i);
      }
      i++;
    }
    return results;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy