All Downloads are FREE. Search and download functionalities are using the official Maven repository.

tech.tablesaw.api.StringColumn Maven / Gradle / Ivy

There is a newer version: 0.9.5
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package tech.tablesaw.api;

import static com.google.common.base.Preconditions.checkArgument;
import static tech.tablesaw.api.ColumnType.*;

import com.google.common.base.Preconditions;
import it.unimi.dsi.fastutil.ints.IntComparator;
import java.util.*;
import java.util.stream.Stream;
import javax.annotation.Nullable;
import tech.tablesaw.columns.AbstractColumn;
import tech.tablesaw.columns.AbstractColumnParser;
import tech.tablesaw.columns.Column;
import tech.tablesaw.columns.strings.*;
import tech.tablesaw.selection.BitmapBackedSelection;
import tech.tablesaw.selection.Selection;

/**
 * A column that contains String values. They are assumed to be 'categorical' rather than free-form
 * text, so are stored in an encoding that takes advantage of the expected repetition of string
 * values.
 *
 * 

Because the MISSING_VALUE for this column type is an empty string, there is little or no need * for special handling of missing values in this class's methods. */ public class StringColumn extends AbstractColumn implements CategoricalColumn, StringFilters, StringMapFunctions, StringReduceUtils { private DictionaryMap data; private StringColumnFormatter printFormatter = new StringColumnFormatter(); private final IntComparator rowComparator = (i, i1) -> { String f1 = get(i); String f2 = get(i1); return f1.compareTo(f2); }; public static boolean valueIsMissing(String string) { return StringColumnType.valueIsMissing(string); } /** {@inheritDoc} */ @Override public StringColumn appendMissing() { data.appendMissing(); return this; } /** {@inheritDoc} */ @Override public int valueHash(int rowNumber) { return get(rowNumber).hashCode(); } /** {@inheritDoc} */ @Override public boolean equals(int rowNumber1, int rowNumber2) { return getDictionary().getKeyAtIndex(rowNumber1) == getDictionary().getKeyAtIndex(rowNumber2); } public static StringColumn create(String name) { return new StringColumn(name); } public static StringColumn create(String name, String... strings) { return new StringColumn(name, strings); } /* public static StringColumn create(String name, StringData stringData) { return new StringColumn(name, stringData); } */ public static StringColumn create(String name, Collection strings) { return new StringColumn(name, strings); } public static StringColumn createInternal(String name, DictionaryMap map) { return new StringColumn(name, map); } public static StringColumn create(String name, int size) { // TODO Pick map implementation based on array size StringColumn column = new StringColumn(name); for (int i = 0; i < size; i++) { column.appendMissing(); } return column; } public static StringColumn create(String name, Stream stream) { StringColumn column = create(name); stream.forEach(column::append); return column; } private StringColumn(String name, Collection strings) { super(StringColumnType.instance(), name, StringColumnType.DEFAULT_PARSER); // TODO Pick map implementation based on array size data = new ByteDictionaryMap(); for (String s : strings) { append(s); } } private StringColumn(String name, DictionaryMap map) { super(StringColumnType.instance(), name, StringColumnType.DEFAULT_PARSER); data = map; } private StringColumn(String name) { super(StringColumnType.instance(), name, StringColumnType.DEFAULT_PARSER); data = new ByteDictionaryMap(); } private StringColumn(String name, String[] strings) { super(StringColumnType.instance(), name, StringColumnType.DEFAULT_PARSER); // TODO Pick map implementation based on array size data = new ByteDictionaryMap(); for (String string : strings) { append(string); } } /** * Sets an {@link StringColumnFormatter} which will be used to format the display of data from * this column when it is printed (using, for example, Table:print()) and optionally when written * to a text file like a CSV. */ public void setPrintFormatter(StringColumnFormatter formatter) { Preconditions.checkNotNull(formatter); this.printFormatter = formatter; } /** Returns the current {@link StringColumnFormatter}. */ public StringColumnFormatter getPrintFormatter() { return printFormatter; } /** {@inheritDoc} */ @Override public boolean isMissing(int rowNumber) { return data.isMissing(rowNumber); } /** {@inheritDoc} */ @Override public StringColumn emptyCopy() { StringColumn empty = create(name()); empty.setPrintFormatter(getPrintFormatter()); return empty; } /** {@inheritDoc} */ @Override public StringColumn emptyCopy(int rowSize) { return create(name(), rowSize); } /** {@inheritDoc} */ @Override public void sortAscending() { data.sortAscending(); } /** {@inheritDoc} */ @Override public void sortDescending() { data.sortDescending(); } /** * Returns the number of elements (a.k.a. rows or cells) in the column * * @return size as int */ @Override public int size() { return data.size(); } /** * Returns the value at rowIndex in this column. The index is zero-based. * * @param rowIndex index of the row * @return value as String * @throws IndexOutOfBoundsException if the given rowIndex is not in the column */ @Override public String get(int rowIndex) { return data.get(rowIndex); } /** * Returns a List<String> representation of all the values in this column * *

NOTE: Unless you really need a string consider using the column itself for large datasets as * it uses much less memory * * @return values as a list of String. */ @Override public List asList() { List strings = new ArrayList<>(); for (String category : this) { strings.add(category); } return strings; } /** {@inheritDoc} */ @Override public Table summary() { Table summary = Table.create(this.name()); StringColumn measure = StringColumn.create("Measure"); StringColumn value = StringColumn.create("Value"); summary.addColumns(measure); summary.addColumns(value); measure.append("Count"); value.append(String.valueOf(size())); measure.append("Unique"); value.append(String.valueOf(countUnique())); Table countByCategory = countByCategory().sortDescendingOn("Count"); measure.append("Top"); value.append(countByCategory.stringColumn("Category").getString(0)); measure.append("Top Freq."); value.appendObj(countByCategory.intColumn("Count").getString(0)); return summary; } /** {@inheritDoc} */ @Override public Table countByCategory() { return data.countByCategory(name()); } /** {@inheritDoc} */ @Override public void clear() { data.clear(); } /** {@inheritDoc} */ @Override public StringColumn lead(int n) { StringColumn column = lag(-n); column.setName(name() + " lead(" + n + ")"); return column; } /** {@inheritDoc} */ @Override public StringColumn lag(int n) { StringColumn copy = emptyCopy(); copy.setName(name() + " lag(" + n + ")"); if (n >= 0) { for (int m = 0; m < n; m++) { copy.appendMissing(); } for (int i = 0; i < size(); i++) { if (i + n >= size()) { break; } copy.append(get(i)); } } else { for (int i = -n; i < size(); i++) { copy.append(get(i)); } for (int m = 0; m > n; m--) { copy.appendMissing(); } } return copy; } /** * Conditionally update this column, replacing current values with newValue for all rows where the * current value matches the selection criteria * *

Examples: myCatColumn.set(myCatColumn.isEqualTo("Cat"), "Dog"); // no more cats * myCatColumn.set(myCatColumn.valueIsMissing(), "Fox"); // no more missing values */ @Override public StringColumn set(Selection rowSelection, String newValue) { for (int row : rowSelection) { set(row, newValue); } return this; } /** {@inheritDoc} */ @Override public StringColumn set(int rowIndex, String stringValue) { if (stringValue == null) { return setMissing(rowIndex); } try { data.set(rowIndex, stringValue); } catch (NoKeysAvailableException ex) { data = data.promoteYourself(); try { data.set(rowIndex, stringValue); } catch (NoKeysAvailableException e) { // this can't happen throw new IllegalStateException(e); } } return this; } /** {@inheritDoc} */ @Override public int countUnique() { return data.countUnique(); } /** * Returns true if this column contains a cell with the given string, and false otherwise * * @param aString the value to look for * @return true if contains, false otherwise */ @Override public boolean contains(String aString) { return firstIndexOf(aString) >= 0; } /** {@inheritDoc} */ @Override public StringColumn setMissing(int i) { return set(i, StringColumnType.missingValueIndicator()); } /** * Add all the strings in the list to this column * * @param stringValues a list of values */ public StringColumn addAll(List stringValues) { for (String stringValue : stringValues) { append(stringValue); } return this; } /** {@inheritDoc} */ @Override public StringColumn appendCell(String object) { return appendCell(object, parser()); } /** {@inheritDoc} */ @Override public StringColumn appendCell(String object, AbstractColumnParser parser) { return appendObj(parser.parse(object)); } /** {@inheritDoc} */ @Override public IntComparator rowComparator() { return rowComparator; } @Override public Selection isMissing() { return data.isMissing(); } @Override public Selection isNotMissing() { return data.isNotMissing(); } /** {@inheritDoc} */ @Override public boolean isEmpty() { return data.isEmpty(); } /** {@inheritDoc} */ @Override public Selection isEqualTo(String string) { return data.isEqualTo(string); } /** {@inheritDoc} */ @Override public Selection isNotEqualTo(String string) { return data.isNotEqualTo(string); } /** * Returns a list of boolean columns suitable for use as dummy variables in, for example, * regression analysis, select a column of categorical data must be encoded as a list of columns, * such that each column represents a single category and indicates whether it is present (1) or * not present (0) * * @return a list of {@link BooleanColumn} */ public List getDummies() { return data.getDummies(); } /** * Returns a new Column containing all the unique values in this column * * @return a column with unique values. */ @Override public StringColumn unique() { List strings = new ArrayList<>(data.asSet()); return new StringColumn(name(), strings); } public DoubleColumn asDoubleColumn() { return DoubleColumn.create(this.name(), asDoubleArray()); } /** {@inheritDoc} */ @Override public StringColumn where(Selection selection) { return subset(selection.toArray()); } /** {@inheritDoc} */ @Override public StringColumn copy() { StringColumn newCol = create(name(), size()); int r = 0; for (String string : this) { newCol.set(r, string); r++; } newCol.setPrintFormatter(getPrintFormatter()); return newCol; } /** {@inheritDoc} */ @Override public StringColumn append(Column column) { checkArgument( column.type().equals(STRING), "Column '%s' has type %s, but column '%s' has type %s.", name(), type(), column.name(), column.type()); final int size = column.size(); for (int i = 0; i < size; i++) { append(column.getString(i)); } return this; } /** Returns the count of missing values in this column */ @Override public int countMissing() { return data.countMissing(); } /** {@inheritDoc} */ @Override public StringColumn removeMissing() { StringColumn noMissing = emptyCopy(); for (String v : this) { if (!StringColumnType.valueIsMissing(v)) { noMissing.append(v); } } return noMissing; } /** {@inheritDoc} */ @Override public Iterator iterator() { return data.iterator(); } public Set asSet() { return data.asSet(); } /** Returns the contents of the cell at rowNumber as a byte[] */ @Override public byte[] asBytes(int rowNumber) { return data.asBytes(rowNumber); } public double getDouble(int i) { return (double) data.uniqueValuesAt(data.firstIndexOf(data.getValueForIndex(i))) - 1; } public double[] asDoubleArray() { return Arrays.stream(data.asIntArray()).asDoubleStream().toArray(); } /** Added for naming consistency with all other columns */ @Override public StringColumn append(String value) { try { data.append(value); } catch (NoKeysAvailableException ex) { data = data.promoteYourself(); try { data.append(value); } catch (NoKeysAvailableException e) { // this can't happen throw new IllegalStateException(e); } } return this; } /** {@inheritDoc} */ @Override public StringColumn appendObj(Object obj) { if (obj == null) { return appendMissing(); } if (!(obj instanceof String)) { throw new IllegalArgumentException( "Cannot append " + obj.getClass().getName() + " to StringColumn"); } return append((String) obj); } /** {@inheritDoc} */ @Override public Selection isIn(String... strings) { return data.isIn(strings); } /** {@inheritDoc} */ @Override public Selection isIn(Collection strings) { return data.isIn(strings); } /** {@inheritDoc} */ @Override public Selection isNotIn(String... strings) { Selection results = new BitmapBackedSelection(); results.addRange(0, size()); results.andNot(isIn(strings)); return results; } /** {@inheritDoc} */ @Override public Selection isNotIn(Collection strings) { Selection results = new BitmapBackedSelection(); results.addRange(0, size()); results.andNot(isIn(strings)); return results; } public int firstIndexOf(String value) { return data.firstIndexOf(value); } public int countOccurrences(String value) { return data.countOccurrences(value); } /** {@inheritDoc} */ @Override public String[] asObjectArray() { return data.asObjectArray(); } /** {@inheritDoc} */ @Override public StringColumn asStringColumn() { return copy(); } /** For tablesaw internal use Note: This method returns null if the stringDataType is TEXTUAL */ public @Nullable DictionaryMap getDictionary() { return data; } /** {@inheritDoc} */ @Override public String getString(int row) { return printFormatter.format(get(row)); } /** {@inheritDoc} */ @Override public String getUnformattedString(int row) { return String.valueOf(get(row)); } /** * Returns the largest ("top") n values in the column * * @param n The maximum number of records to return. The actual number will be smaller if n is * greater than the number of observations in the column * @return A list, possibly empty, of the largest observations */ public List top(int n) { List top = new ArrayList<>(); Column copy = this.copy(); copy.sortDescending(); for (int i = 0; i < n; i++) { top.add(copy.get(i)); } return top; } /** * Returns the smallest ("bottom") n values in the column * * @param n The maximum number of records to return. The actual number will be smaller if n is * greater than the number of observations in the column * @return A list, possibly empty, of the smallest n observations */ public List bottom(int n) { List bottom = new ArrayList<>(); Column copy = this.copy(); copy.sortAscending(); for (int i = 0; i < n; i++) { bottom.add(copy.get(i)); } return bottom; } /** {@inheritDoc} */ @Override public Column append(Column column, int row) { return append(column.getUnformattedString(row)); } /** {@inheritDoc} */ @Override public Column set(int row, Column column, int sourceRow) { return set(row, column.getUnformattedString(sourceRow)); } /** {@inheritDoc} */ @Override public int byteSize() { return type().byteSize(); } /** {@inheritDoc} */ @Override public int compare(String o1, String o2) { return o1.compareTo(o2); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy