// Source: tech.tablesaw.columns.strings.ShortDictionaryMap (artifact listing: Maven / Gradle / Ivy)
package tech.tablesaw.columns.strings;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import it.unimi.dsi.fastutil.objects.Object2ShortOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import it.unimi.dsi.fastutil.shorts.Short2IntMap;
import it.unimi.dsi.fastutil.shorts.Short2IntOpenHashMap;
import it.unimi.dsi.fastutil.shorts.Short2ObjectMap;
import it.unimi.dsi.fastutil.shorts.Short2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.shorts.ShortArrayList;
import it.unimi.dsi.fastutil.shorts.ShortArrays;
import it.unimi.dsi.fastutil.shorts.ShortComparator;
import it.unimi.dsi.fastutil.shorts.ShortListIterator;
import it.unimi.dsi.fastutil.shorts.ShortOpenHashSet;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import tech.tablesaw.api.BooleanColumn;
import tech.tablesaw.api.IntColumn;
import tech.tablesaw.api.StringColumn;
import tech.tablesaw.api.Table;
import tech.tablesaw.selection.BitmapBackedSelection;
import tech.tablesaw.selection.Selection;
/**
 * A dictionary that maps short keys to String values and back again, and records, for every row
 * of a column, the key of the string stored in that row.
 */
public class ShortDictionaryMap implements DictionaryMap {
// The maximum number of unique values (categories) reported in the error message raised when the
// key space is exhausted. If a column has more unique values than fit here, use a TextColumn.
private static final int MAX_UNIQUE = Short.MAX_VALUE - Short.MIN_VALUE;

// The key reserved for the missing-value indicator.
private static final short MISSING_VALUE = Short.MAX_VALUE;

// Sentinel meaning "no key": used as the not-found result of valueToKey lookups and as the
// starting point of the key counter, so it is never handed out as a real key.
private static final short DEFAULT_RETURN_VALUE = Short.MIN_VALUE;

// Orders keys by the reverse natural order of the strings they stand for.
private final ShortComparator reverseDictionarySortComparator =
    (i, i1) ->
        Comparator.<String>reverseOrder()
            .compare(getValueForShortKey(i), getValueForShortKey(i1));

// Orders keys by the natural order of the strings they stand for.
private final ShortComparator dictionarySortComparator =
    (i, i1) -> getValueForShortKey(i).compareTo(getValueForShortKey(i1));

// Holds a key for each element in the column. The key can be used to look up the backing string
// value.
private ShortArrayList values = new ShortArrayList();

// The most recently allocated key; new keys are obtained by incrementing it.
private AtomicInteger nextIndex = new AtomicInteger(DEFAULT_RETURN_VALUE);

// We maintain 3 maps: one from keys to strings, one from strings to keys, and one from key to
// the number of rows holding that key.
private Short2ObjectMap<String> keyToValue = new Short2ObjectOpenHashMap<>();
private Object2ShortOpenHashMap<String> valueToKey = new Object2ShortOpenHashMap<>();
private Short2IntOpenHashMap keyToCount = new Short2IntOpenHashMap();
/** {@inheritDoc} */
@Override
public int getKeyAtIndex(int rowNumber) {
  // The stored short key is widened to int on return.
  final short key = values.getShort(rowNumber);
  return key;
}
/** Returns a new DictionaryMap that is a deep copy of the original */
ShortDictionaryMap(ByteDictionaryMap original) throws NoKeysAvailableException {
valueToKey.defaultReturnValue(DEFAULT_RETURN_VALUE);
keyToCount.defaultReturnValue(0);
for (int i = 0; i < original.size(); i++) {
String value = original.getValueForIndex(i);
append(value);
}
}
/**
 * Creates a map that adopts (does not copy) the collections held by {@code builder}.
 *
 * <p>The adopted lookup maps are configured with the not-found defaults the rest of this class
 * relies on, so that a builder-supplied map that was left at its library default cannot cause an
 * unknown string to be mistaken for an existing key.
 */
private ShortDictionaryMap(ShortDictionaryBuilder builder) {
  this.nextIndex = builder.nextIndex;
  this.keyToValue = builder.keyToValue;
  this.valueToKey = builder.valueToKey;
  this.keyToCount = builder.keyToCount;
  this.values = builder.values;
  // Same defaults as the copying constructor sets on its own maps.
  this.valueToKey.defaultReturnValue(DEFAULT_RETURN_VALUE);
  this.keyToCount.defaultReturnValue(0);
}
/** Records the pairing of {@code key} and {@code value} in both lookup directions. */
private void put(short key, String value) {
  this.keyToValue.put(key, value);
  this.valueToKey.put(value, key);
}
/** Looks up the key registered for {@code value} in the string-to-key map. */
private short getKeyForValue(String value) {
  final short key = valueToKey.getShort(value);
  return key;
}
/**
 * Returns how many elements (rows, cells) the column holds, i.e. the length of the key list.
 *
 * @return size as int
 */
@Override
public int size() {
  final int rowCount = values.size();
  return rowCount;
}
/** Returns the string stored at {@code rowIndex}, resolved through that row's key. */
@Override
public String getValueForIndex(int rowIndex) {
  return getValueForKey(values.getShort(rowIndex));
}
public ObjectSet> getKeyValueEntries() {
return keyToValue.short2ObjectEntrySet();
}
/** Returns the entry set of the key-to-count map. */
public Short2IntMap.FastEntrySet getKeyCountEntries() {
  final Short2IntMap.FastEntrySet entries = keyToCount.short2IntEntrySet();
  return entries;
}
/** Returns the list of per-row keys itself (the backing list, not a copy). */
public ShortArrayList values() {
  return this.values;
}
/**
 * Two maps are equal when they are of the same class and their row keys, their three lookup
 * maps and their current key counter value are all equal.
 */
@Override
public boolean equals(Object o) {
  if (o == this) {
    return true;
  }
  if (o == null) {
    return false;
  }
  if (o.getClass() != getClass()) {
    return false;
  }
  final ShortDictionaryMap other = (ShortDictionaryMap) o;
  return Objects.equal(values, other.values)
      && Objects.equal(keyToValue, other.keyToValue)
      && Objects.equal(valueToKey, other.valueToKey)
      && Objects.equal(keyToCount, other.keyToCount)
      && nextIndex.get() == other.nextIndex.get();
}
/**
 * Computes a hash from exactly the state that {@link #equals(Object)} compares: the row keys,
 * the three lookup maps and the current value of the key counter.
 *
 * <p>The comparators and the counter object itself are deliberately left out: they are hashed
 * by identity and differ between instances, so including them would give equal maps different
 * hash codes.
 */
@Override
public int hashCode() {
  return Objects.hashCode(values, nextIndex.get(), keyToValue, valueToKey, keyToCount);
}
/** Returns the key stored at {@code rowIndex}, widened to int. */
@Override
public int getKeyForIndex(int rowIndex) {
  final short key = values.getShort(rowIndex);
  return key;
}
/** Returns the key set of the string-to-key map, i.e. the distinct strings in the dictionary. */
private Set<String> categories() {
  return valueToKey.keySet();
}
/** Returns the key-to-string map itself. */
private Short2ObjectMap<String> keyToValueMap() {
  return keyToValue;
}
/**
 * Replaces the row keys with a copy sorted so that the strings they stand for are in ascending
 * {@code compareTo} order. The lookup maps are not changed.
 */
@Override
public void sortAscending() {
  final short[] sortedKeys = values.toShortArray();
  ShortArrays.parallelQuickSort(sortedKeys, dictionarySortComparator);
  values = new ShortArrayList(sortedKeys);
}
/** Returns the string registered for {@code key}, after narrowing the key to a short. */
@Override
public String getValueForKey(int key) {
  final short shortKey = (short) key;
  return keyToValue.get(shortKey);
}
/** Returns the string registered for the given short key. */
private String getValueForShortKey(short key) {
  return this.keyToValue.get(key);
}
/**
 * Replaces the row keys with a copy sorted by the reverse-order comparator over the strings
 * they stand for. The lookup maps are not changed.
 */
@Override
public void sortDescending() {
  final short[] sortedKeys = values.toShortArray();
  ShortArrays.parallelQuickSort(sortedKeys, reverseDictionarySortComparator);
  values = new ShortArrayList(sortedKeys);
}
/** Returns the count recorded in the key-to-count map for the key of {@code value}. */
public int countOccurrences(String value) {
  final short key = getKeyForValue(value);
  return keyToCount.get(key);
}
/** Returns a new {@link HashSet} holding a copy of the distinct strings in the dictionary. */
public Set<String> asSet() {
  return new HashSet<>(categories());
}
/**
 * Returns the result of searching the row keys for the key of {@code value}, using the key
 * list's {@code indexOf}.
 */
public int firstIndexOf(String value) {
  final short key = getKeyForValue(value);
  return values.indexOf(key);
}
/** Returns a new array with the string of every row, in row order. */
@Override
public String[] asObjectArray() {
  final String[] strings = new String[size()];
  int row = 0;
  while (row < strings.length) {
    strings[row] = getValueForIndex(row);
    row++;
  }
  return strings;
}
/** Returns the number of entries in the key-to-string map. */
@Override
public int countUnique() {
  final int distinct = keyToValueMap().size();
  return distinct;
}
/**
 * Selects the rows whose string is one of {@code strings}.
 *
 * @param strings the strings to look for; those without a key in the dictionary are skipped
 * @return a selection holding the index of every matching row
 */
@Override
public Selection selectIsIn(String... strings) {
  // Translate the strings to keys once, so rows are matched by key rather than by string.
  final ShortOpenHashSet wanted = new ShortOpenHashSet(strings.length);
  for (String candidate : strings) {
    final short candidateKey = getKeyForValue(candidate);
    if (candidateKey == DEFAULT_RETURN_VALUE) {
      continue;
    }
    wanted.add(candidateKey);
  }
  final Selection matches = new BitmapBackedSelection();
  int row = 0;
  while (row < values.size()) {
    if (wanted.contains(values.getShort(row))) {
      matches.add(row);
    }
    row++;
  }
  return matches;
}
/**
 * Selects the rows whose string is one of {@code strings}.
 *
 * @param strings the strings to look for; those without a key in the dictionary are skipped
 * @return a selection holding the index of every matching row
 */
@Override
public Selection selectIsIn(Collection<String> strings) {
  // Translate the strings to keys once, so rows are matched by key rather than by string.
  ShortOpenHashSet keys = new ShortOpenHashSet(strings.size());
  for (String string : strings) {
    short key = getKeyForValue(string);
    if (key != DEFAULT_RETURN_VALUE) {
      keys.add(key);
    }
  }
  Selection results = new BitmapBackedSelection();
  for (int i = 0; i < values.size(); i++) {
    if (keys.contains(values.getShort(i))) {
      results.add(i);
    }
  }
  return results;
}
/**
 * Adds one row holding {@code value} to the end of the column.
 *
 * <p>A {@code null} or missing-indicator value is stored under the reserved missing key. Any
 * other value reuses its existing key, or is given a newly allocated one if it has none.
 *
 * @param value the string to append; may be {@code null}
 * @throws NoKeysAvailableException if a new key is needed and none can be allocated
 */
@Override
public void append(String value) throws NoKeysAvailableException {
  short key;
  if (value != null && !StringColumnType.missingValueIndicator().equals(value)) {
    key = getKeyForValue(value);
    if (key == DEFAULT_RETURN_VALUE) {
      // First occurrence of this string: allocate a key and register the pairing.
      key = getValueId();
      put(key, value);
    }
  } else {
    key = MISSING_VALUE;
    put(key, StringColumnType.missingValueIndicator());
  }
  values.add(key);
  keyToCount.addTo(key, 1);
}
/**
 * Allocates the next key by incrementing the key counter.
 *
 * @return the newly allocated key
 * @throws NoKeysAvailableException if the incremented counter has reached {@code
 *     Short.MAX_VALUE}
 */
private short getValueId() throws NoKeysAvailableException {
  final int candidate = nextIndex.incrementAndGet();
  if (candidate < Short.MAX_VALUE) {
    return (short) candidate;
  }
  throw new NoKeysAvailableException(
      String.format(
          "String column can only contain %d unique values. Column has more.", MAX_UNIQUE));
}
/**
 * Adds to {@code results} the index of every row whose key equals {@code key}. Does nothing
 * when {@code key} is the "no key" sentinel.
 */
private void addValuesToSelection(Selection results, short key) {
  if (key == DEFAULT_RETURN_VALUE) {
    return;
  }
  final int rowCount = values.size();
  for (int row = 0; row < rowCount; row++) {
    if (values.getShort(row) == key) {
      results.add(row);
    }
  }
}
/**
 * Replaces the value stored at {@code rowIndex}.
 *
 * <p>A {@code null} or missing-indicator value is stored under the reserved missing key, exactly
 * as {@link #append(String)} does, so that {@link #isMissing(int)} and {@link #countMissing()}
 * see it. Any other value reuses its existing key or is given a newly allocated one. If the
 * replaced key is no longer used by any row, it is dropped from all three maps.
 *
 * @param rowIndex the row to overwrite
 * @param stringValue the new value; may be {@code null}
 * @throws NoKeysAvailableException if a new key is needed and none can be allocated
 * @throws IndexOutOfBoundsException if {@code rowIndex} is not an existing row
 */
@Override
public void set(int rowIndex, String stringValue) throws NoKeysAvailableException {
  // Fail before touching the dictionary so a bad index cannot leave an unused entry behind.
  Preconditions.checkElementIndex(rowIndex, values.size());
  final String missing = StringColumnType.missingValueIndicator();
  short valueId;
  if (stringValue == null || missing.equals(stringValue)) {
    valueId = MISSING_VALUE;
    put(valueId, missing);
  } else {
    valueId = getKeyForValue(stringValue);
    if (valueId == DEFAULT_RETURN_VALUE) {
      valueId = getValueId();
      put(valueId, stringValue);
    }
  }
  short oldKey = values.set(rowIndex, valueId);
  // Count the new key first: if the row already held this key, its count passes through
  // count + 1 and is never seen as 1 below, so the entry is not dropped by mistake.
  keyToCount.addTo(valueId, 1);
  // addTo returns the count before the decrement; 1 means this row was the last user.
  if (keyToCount.addTo(oldKey, -1) == 1) {
    String obsoleteValue = keyToValue.remove(oldKey);
    valueToKey.removeShort(obsoleteValue);
    keyToCount.remove(oldKey);
  }
}
/** Removes all rows and dictionary entries and restarts key allocation from the beginning. */
@Override
public void clear() {
  values.clear();
  keyToValue.clear();
  valueToKey.clear();
  keyToCount.clear();
  // A fresh counter object is installed rather than resetting the existing one.
  nextIndex = new AtomicInteger(DEFAULT_RETURN_VALUE);
}
/**
 * Builds a two-column table listing every key in the key-to-count map: the string for the key
 * in a "Category" column and its count in a "Count" column.
 *
 * @param columnName the name used in the title of the returned table
 * @return the new table
 */
@Override
public Table countByCategory(String columnName) {
  Table t = Table.create("Column: " + columnName);
  StringColumn categories = StringColumn.create("Category");
  IntColumn counts = IntColumn.create("Count");
  // Read the counts straight from the keyToCount map, using the primitive accessors to avoid
  // boxing each key and count.
  for (Short2IntMap.Entry entry : keyToCount.short2IntEntrySet()) {
    categories.append(getValueForKey(entry.getShortKey()));
    counts.append(entry.getIntValue());
  }
  t.addColumns(categories);
  t.addColumns(counts);
  return t;
}
/** Returns a selection holding the index of every row whose key is the key of {@code string}. */
@Override
public Selection isEqualTo(String string) {
  final Selection matches = new BitmapBackedSelection();
  addValuesToSelection(matches, getKeyForValue(string));
  return matches;
}
/**
 * Returns a list of boolean columns suitable for use as dummy variables in, for example,
 * regression analysis, where a column of categorical data must be encoded as a list of columns
 * such that each column represents a single category and indicates, for every row, whether that
 * category is present (true) or not present (false).
 *
 * @return a list of {@link BooleanColumn}, one per dictionary entry, each named after its string
 */
@Override
public List<BooleanColumn> getDummies() {
  List<BooleanColumn> results = new ArrayList<>();
  // create the necessary columns, one per dictionary entry
  for (Short2ObjectMap.Entry<String> entry : keyToValueMap().short2ObjectEntrySet()) {
    results.add(BooleanColumn.create(entry.getValue()));
  }
  // iterate over the rows, appending one cell to every dummy column per row
  final int rowCount = values.size();
  for (int row = 0; row < rowCount; row++) {
    String category = getValueForKey(values.getShort(row));
    for (BooleanColumn column : results) {
      // TODO(lwhite): update the correct row more efficiently, by using set rather than add &
      // only updating true
      column.append(category.equals(column.name()));
    }
  }
  return results;
}
/** Returns the key of the cell at rowNumber, written into a new two-byte array */
@Override
public byte[] asBytes(int rowNumber) {
  final ByteBuffer buffer = ByteBuffer.allocate(byteSize());
  final short key = (short) getKeyForIndex(rowNumber);
  buffer.putShort(key);
  return buffer.array();
}
/** Returns the number of bytes used to encode one key (a short). */
private int byteSize() {
  final int bytesPerKey = 2;
  return bytesPerKey;
}
/** Returns the count recorded for the reserved missing-value key */
@Override
public int countMissing() {
  final int missingCount = keyToCount.get(MISSING_VALUE);
  return missingCount;
}
/** Returns an iterator over the strings of the column, in row order. */
@Override
public Iterator<String> iterator() {
  return new Iterator<String>() {

    // Walks the per-row keys; each key is resolved to its string on demand.
    private final ShortListIterator valuesIt = values.iterator();

    @Override
    public boolean hasNext() {
      return valuesIt.hasNext();
    }

    @Override
    public String next() {
      return getValueForKey(valuesIt.nextShort());
    }
  };
}
/** Appends one row holding the missing-value indicator. */
@Override
public void appendMissing() {
  final String missing = StringColumnType.missingValueIndicator();
  try {
    append(missing);
  } catch (NoKeysAvailableException unexpected) {
    // Not expected: the missing value is stored under its reserved key, so no new key has to be
    // allocated for it.
    throw new IllegalStateException(unexpected);
  }
}
/** Returns true if the key stored at {@code rowNumber} is the reserved missing-value key. */
@Override
public boolean isMissing(int rowNumber) {
  final int key = getKeyForIndex(rowNumber);
  return key == MISSING_VALUE;
}
/**
 * Returns a new {@link IntDictionaryMap} constructed from this map.
 *
 * @throws IllegalStateException if the int-keyed map reports that it has no keys available
 */
@Override
public DictionaryMap promoteYourself() {
  try {
    return new IntDictionaryMap(this);
  } catch (NoKeysAvailableException unexpected) {
    // this should never happen
    throw new IllegalStateException(unexpected);
  }
}
/** Returns the current value of the key counter, leaving the counter unchanged. */
@Override
public int nextKeyWithoutIncrementing() {
  final int current = nextIndex.get();
  return current;
}
/**
 * Collects the pieces of state of a {@link ShortDictionaryMap} so that one can be assembled from
 * existing collections. All five pieces must be set before {@link #build()} is called.
 */
public static class ShortDictionaryBuilder {

  // The key counter of the map being built
  private AtomicInteger nextIndex;

  // The list of keys that represents the contents of string column in user order
  private ShortArrayList values;

  // we maintain 3 maps, one from strings to keys, one from keys to strings, and one from key to
  // count of values
  private Short2ObjectMap<String> keyToValue;

  // the inverse of the above keyToValue map
  private Object2ShortOpenHashMap<String> valueToKey;

  // the map with counts
  private Short2IntOpenHashMap keyToCount;

  /** Sets the starting value of the key counter. */
  public ShortDictionaryBuilder setNextIndex(int value) {
    nextIndex = new AtomicInteger(value);
    return this;
  }

  /** Sets the key-to-string map; the given map is used as is, not copied. */
  public ShortDictionaryBuilder setKeyToValue(Short2ObjectMap<String> keyToValue) {
    this.keyToValue = keyToValue;
    return this;
  }

  /** Sets the string-to-key map; the given map is used as is, not copied. */
  public ShortDictionaryBuilder setValueToKey(Object2ShortOpenHashMap<String> valueToKey) {
    this.valueToKey = valueToKey;
    return this;
  }

  /** Sets the key-to-count map; the given map is used as is, not copied. */
  public ShortDictionaryBuilder setKeyToCount(Short2IntOpenHashMap keyToCount) {
    this.keyToCount = keyToCount;
    return this;
  }

  /** Sets the per-row keys by wrapping {@code data} in a new key list. */
  public ShortDictionaryBuilder setValues(short[] data) {
    this.values = new ShortArrayList(data);
    return this;
  }

  /**
   * Creates the map from the collected state.
   *
   * @throws NullPointerException if any of the five pieces has not been set
   */
  public ShortDictionaryMap build() {
    Preconditions.checkNotNull(nextIndex, "nextIndex has not been set");
    Preconditions.checkNotNull(keyToCount, "keyToCount has not been set");
    Preconditions.checkNotNull(keyToValue, "keyToValue has not been set");
    Preconditions.checkNotNull(valueToKey, "valueToKey has not been set");
    Preconditions.checkNotNull(values, "values has not been set");
    return new ShortDictionaryMap(this);
  }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy