All downloads are free. Search and download functionality uses the official Maven repository.

tech.tablesaw.columns.strings.TextualStringData Maven / Gradle / Ivy

The newest version!
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package tech.tablesaw.columns.strings;

import static tech.tablesaw.columns.AbstractColumn.DEFAULT_ARRAY_SIZE;

import com.google.common.collect.Sets;
import it.unimi.dsi.fastutil.ints.IntComparator;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Stream;
import javax.annotation.Nullable;
import tech.tablesaw.api.BooleanColumn;
import tech.tablesaw.api.StringColumn;
import tech.tablesaw.api.Table;
import tech.tablesaw.columns.Column;
import tech.tablesaw.selection.BitmapBackedSelection;
import tech.tablesaw.selection.Selection;

/**
 * A column that contains String values. They are assumed to be free-form text. For categorical
 * data, use stringColumn
 *
 * 

This is the default column type for SQL longvarchar and longnvarchar types * *

Because the MISSING_VALUE for this column type is an empty string, there is little or no need * for special handling of missing values in this class's methods. */ public class TextualStringData implements StringData { // holds each element in the column. protected List values; private final IntComparator rowComparator = (i, i1) -> { String f1 = get(i); String f2 = get(i1); return f1.compareTo(f2); }; private final Comparator descendingStringComparator = Comparator.reverseOrder(); public int valueHash(int rowNumber) { return get(rowNumber).hashCode(); } /** {@inheritDoc} */ public boolean equals(int rowNumber1, int rowNumber2) { return get(rowNumber1).equals(get(rowNumber2)); } private TextualStringData(Collection strings) { values = new ArrayList<>(strings.size()); for (String string : strings) { append(string); } } private TextualStringData() { values = new ArrayList<>(DEFAULT_ARRAY_SIZE); } private TextualStringData(String[] strings) { values = new ArrayList<>(strings.length); for (String string : strings) { append(string); } } public static boolean valueIsMissing(String string) { return StringColumnType.valueIsMissing(string); } public TextualStringData appendMissing() { append(StringColumnType.missingValueIndicator()); return this; } public static TextualStringData create() { return new TextualStringData(); } public static TextualStringData create(String... 
strings) { return new TextualStringData(strings); } public static TextualStringData create(Collection strings) { return new TextualStringData(strings); } public static TextualStringData create(int size) { ArrayList strings = new ArrayList<>(size); for (int i = 0; i < size; i++) { strings.add(StringColumnType.missingValueIndicator()); } return new TextualStringData(strings); } public static TextualStringData create(Stream stream) { TextualStringData column = create(); stream.forEach(column::append); return column; } /** {@inheritDoc} */ public boolean isMissing(int rowNumber) { return get(rowNumber).equals(StringColumnType.missingValueIndicator()); } /** {@inheritDoc} */ public TextualStringData emptyCopy() { return create(); } /** {@inheritDoc} */ public TextualStringData emptyCopy(int rowSize) { return create(rowSize); } /** {@inheritDoc} */ public void sortAscending() { values.sort(String::compareTo); } /** {@inheritDoc} */ public void sortDescending() { values.sort(descendingStringComparator); } /** * Returns the number of elements (a.k.a. rows or cells) in the column * * @return size as int */ public int size() { return values.size(); } /** * Returns the value at rowIndex in this column. The index is zero-based. * * @param rowIndex index of the row * @return value as String * @throws IndexOutOfBoundsException if the given rowIndex is not in the column */ public String get(int rowIndex) { return values.get(rowIndex); } /** * Returns a List<String> representation of all the values in this column * *

NOTE: Unless you really need a string consider using the column itself for large datasets as * it uses much less memory * * @return values as a list of String. */ public List asList() { return new ArrayList<>(values); } @Override public Table countByCategory(String columnName) { throw new UnsupportedOperationException(); // TODO: fix me // return asCategoricalStringData().countByCategory(columnName); } /** {@inheritDoc} */ public Table summary() { // Table table = Table.create("Column: " + name()); Table table = Table.create(); StringColumn measure = StringColumn.create("Measure"); StringColumn value = StringColumn.create("Value"); table.addColumns(measure); table.addColumns(value); measure.append("Count"); value.append(String.valueOf(size())); measure.append("Missing"); value.append(String.valueOf(countMissing())); return table; } /** {@inheritDoc} */ public void clear() { values.clear(); } /** {@inheritDoc} */ public TextualStringData lead(int n) { return lag(-n); } /** {@inheritDoc} */ public TextualStringData lag(int n) { TextualStringData copy = emptyCopy(); if (n >= 0) { for (int m = 0; m < n; m++) { copy.appendMissing(); } for (int i = 0; i < size(); i++) { if (i + n >= size()) { break; } copy.append(get(i)); } } else { for (int i = -n; i < size(); i++) { copy.append(get(i)); } for (int m = 0; m > n; m--) { copy.appendMissing(); } } return copy; } /** * Conditionally update this column, replacing current values with newValue for all rows where the * current value matches the selection criteria * *

Examples: myCatColumn.set(myCatColumn.isEqualTo("Cat"), "Dog"); // no more cats * myCatColumn.set(myCatColumn.valueIsMissing(), "Fox"); // no more missing values */ public TextualStringData set(Selection rowSelection, String newValue) { for (int row : rowSelection) { set(row, newValue); } return this; } /** {@inheritDoc} */ public TextualStringData set(int rowIndex, String stringValue) { if (stringValue == null) { return setMissing(rowIndex); } values.set(rowIndex, stringValue); return this; } /** {@inheritDoc} */ public int countUnique() { return asSet().size(); } /** * Returns true if this column contains a cell with the given string, and false otherwise * * @param aString the value to look for * @return true if contains, false otherwise */ public boolean contains(String aString) { return values.contains(aString); } /** {@inheritDoc} */ public TextualStringData setMissing(int i) { return set(i, StringColumnType.missingValueIndicator()); } /** * Add all the strings in the list to this column * * @param stringValues a list of values */ public TextualStringData addAll(List stringValues) { for (String stringValue : stringValues) { append(stringValue); } return this; } /** {@inheritDoc} */ public IntComparator rowComparator() { return rowComparator; } /** {@inheritDoc} */ public boolean isEmpty() { return values.isEmpty(); } /** * Returns a new Column containing all the unique values in this column * * @return a column with unique values. 
*/ public TextualStringData unique() { List strings = new ArrayList<>(asSet()); return TextualStringData.create(strings); } /** {@inheritDoc} */ public TextualStringData where(Selection selection) { return (TextualStringData) subset(selection.toArray()); } // TODO (lwhite): This could avoid the append and do a list copy /** {@inheritDoc} */ public TextualStringData copy() { TextualStringData newCol = create(size()); int r = 0; for (String string : this) { newCol.set(r, string); r++; } return newCol; } /** {@inheritDoc} */ public void append(Column column) { final int size = column.size(); for (int i = 0; i < size; i++) { append(column.getString(i)); } } /** Returns the count of missing values in this column */ public int countMissing() { int count = 0; for (int i = 0; i < size(); i++) { if (StringColumnType.missingValueIndicator().equals(get(i))) { count++; } } return count; } /** {@inheritDoc} */ public TextualStringData removeMissing() { TextualStringData noMissing = emptyCopy(); for (String v : this) { if (!StringColumnType.valueIsMissing(v)) { noMissing.append(v); } } return noMissing; } /** {@inheritDoc} */ public Iterator iterator() { return values.iterator(); } /** {@inheritDoc} */ public Set asSet() { return new HashSet<>(values); } /** Returns the contents of the cell at rowNumber as a byte[] */ public byte[] asBytes(int rowNumber) { String value = get(rowNumber); return value.getBytes(); } /** Added for naming consistency with all other columns */ public TextualStringData append(String value) { values.add(value); return this; } /** {@inheritDoc} */ public TextualStringData appendObj(Object obj) { if (obj == null) { return appendMissing(); } if (!(obj instanceof String)) { throw new IllegalArgumentException( "Cannot append " + obj.getClass().getName() + " to TextColumn"); } return append((String) obj); } /** {@inheritDoc} */ public Selection isIn(String... 
strings) { Set stringSet = Sets.newHashSet(strings); Selection results = new BitmapBackedSelection(); for (int i = 0; i < size(); i++) { if (stringSet.contains(values.get(i))) { results.add(i); } } return results; } /** {@inheritDoc} */ public Selection isIn(Collection strings) { Set stringSet = Sets.newHashSet(strings); Selection results = new BitmapBackedSelection(); for (int i = 0; i < size(); i++) { if (stringSet.contains(values.get(i))) { results.add(i); } } return results; } /** {@inheritDoc} */ public Selection isNotIn(String... strings) { Selection results = new BitmapBackedSelection(); results.addRange(0, size()); results.andNot(isIn(strings)); return results; } /** {@inheritDoc} */ public Selection isNotIn(Collection strings) { Selection results = new BitmapBackedSelection(); results.addRange(0, size()); results.andNot(isIn(strings)); return results; } public int firstIndexOf(String value) { return values.indexOf(value); } /** {@inheritDoc} */ public String[] asObjectArray() { final String[] output = new String[size()]; for (int i = 0; i < size(); i++) { output[i] = get(i); } return output; } /** * Returns a double that can stand in for the string at index i in some ML applications * *

TODO: Evaluate use of hashCode() here for uniqueness * * @param i The index in this column */ public double getDouble(int i) { return values.get(i).hashCode(); } public double[] asDoubleArray() { double[] result = new double[this.size()]; for (int i = 0; i < size(); i++) { result[i] = getDouble(i); } return result; } public int countOccurrences(String value) { return isEqualTo(value).size(); } /** * {@inheritDoc} Unsupported Operation This can't be used on a text column as the number of * BooleanColumns would likely be excessive */ public List getDummies() { throw new UnsupportedOperationException( "StringColumns containing arbitary, non-categorical strings do not support the getDummies() method for performance reasons"); } /** Returns null, as this Column is not backed by a dictionaryMap */ public @Nullable DictionaryMap getDictionary() { return null; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy