// tech.tablesaw.io.ReadOptions (Maven / Gradle / Ivy)
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package tech.tablesaw.io;
import static tech.tablesaw.api.ColumnType.BOOLEAN;
import static tech.tablesaw.api.ColumnType.DOUBLE;
import static tech.tablesaw.api.ColumnType.FLOAT;
import static tech.tablesaw.api.ColumnType.INTEGER;
import static tech.tablesaw.api.ColumnType.LOCAL_DATE;
import static tech.tablesaw.api.ColumnType.LOCAL_DATE_TIME;
import static tech.tablesaw.api.ColumnType.LOCAL_TIME;
import static tech.tablesaw.api.ColumnType.LONG;
import static tech.tablesaw.api.ColumnType.SHORT;
import static tech.tablesaw.api.ColumnType.STRING;
import static tech.tablesaw.api.ColumnType.TEXT;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import tech.tablesaw.api.ColumnType;
import tech.tablesaw.io.ReadOptions.ColumnTypeReadOptions;
public class ReadOptions {
public static final boolean DEFAULT_IGNORE_ZERO_DECIMAL = true;
public static final boolean DEFAULT_SKIP_ROWS_WITH_INVALID_COLUMN_COUNT = false;
private static final List DEFAULT_TYPES =
Lists.newArrayList(
LOCAL_DATE_TIME, LOCAL_TIME, LOCAL_DATE, BOOLEAN, INTEGER, LONG, DOUBLE, STRING, TEXT);
/**
* An extended list of types that are used if minimizeColumnSizes is true. By including extra
* types like Short the resulting table size is reduced at the cost of some additional complexity
* for the programmer if, for example, they will subsequently modify the data in a way that
* exceeds the range of the type.
*/
protected static final List EXTENDED_TYPES =
Lists.newArrayList(
LOCAL_DATE_TIME,
LOCAL_TIME,
LOCAL_DATE,
BOOLEAN,
SHORT,
INTEGER,
LONG,
FLOAT,
DOUBLE,
STRING,
TEXT);
protected final Source source;
protected final String tableName;
protected final List columnTypesToDetect;
protected final boolean sample;
protected final String dateFormat;
protected final String dateTimeFormat;
protected final String timeFormat;
protected final Locale locale;
protected final String[] missingValueIndicators;
protected final boolean minimizeColumnSizes;
protected final int maxCharsPerColumn;
protected final boolean ignoreZeroDecimal;
protected final boolean allowDuplicateColumnNames;
protected final boolean skipRowsWithInvalidColumnCount;
protected final DateTimeFormatter dateFormatter;
protected final DateTimeFormatter dateTimeFormatter;
protected final DateTimeFormatter timeFormatter;
protected final ColumnTypeReadOptions columnTypeReadOptions;
protected final boolean header;
protected ReadOptions(ReadOptions.Builder builder) {
source = builder.source;
tableName = builder.tableName;
columnTypesToDetect = builder.columnTypesToDetect;
sample = builder.sample;
dateFormat = builder.dateFormat;
timeFormat = builder.timeFormat;
dateTimeFormat = builder.dateTimeFormat;
missingValueIndicators = builder.missingValueIndicators;
minimizeColumnSizes = builder.minimizeColumnSizes;
header = builder.header;
maxCharsPerColumn = builder.maxCharsPerColumn;
ignoreZeroDecimal = builder.ignoreZeroDecimal;
skipRowsWithInvalidColumnCount = builder.skipRowsWithInvalidColumnCount;
dateFormatter = builder.dateFormatter;
timeFormatter = builder.timeFormatter;
dateTimeFormatter = builder.dateTimeFormatter;
allowDuplicateColumnNames = builder.allowDuplicateColumnNames;
locale = builder.locale;
if (builder.columnTypes != null)
columnTypeReadOptions = new ByIdxColumnTypeReadOptions(builder.columnTypes);
else if (!builder.columnTypeMap.isEmpty())
columnTypeReadOptions = new ByNameMapColumnTypeReadOptions(builder.columnTypeMap);
else if (builder.completeColumnTypeFunction != null)
columnTypeReadOptions =
new CompleteFunctionColumnTypeReadOptions(builder.completeColumnTypeFunction);
else if (builder.columnTypeFunction != null)
columnTypeReadOptions = new PartialFunctionColumnTypeReadOptions(builder.columnTypeFunction);
else columnTypeReadOptions = ColumnTypeReadOptions.EMPTY;
}
public Source source() {
return source;
}
public String tableName() {
return tableName;
}
public boolean allowDuplicateColumnNames() {
return allowDuplicateColumnNames;
}
public List columnTypesToDetect() {
return columnTypesToDetect;
}
public boolean sample() {
return sample;
}
public boolean minimizeColumnSizes() {
return minimizeColumnSizes;
}
public String[] missingValueIndicators() {
return missingValueIndicators;
}
public Locale locale() {
return locale;
}
public boolean header() {
return header;
}
public boolean ignoreZeroDecimal() {
return ignoreZeroDecimal;
}
public boolean skipRowsWithInvalidColumnCount() {
return skipRowsWithInvalidColumnCount;
}
public DateTimeFormatter dateTimeFormatter() {
if (dateTimeFormatter != null) {
return dateTimeFormatter;
}
if (Strings.isNullOrEmpty(dateTimeFormat)) {
return null;
}
return DateTimeFormatter.ofPattern(dateTimeFormat, locale);
}
public DateTimeFormatter timeFormatter() {
if (timeFormatter != null) {
return timeFormatter;
}
if (Strings.isNullOrEmpty(timeFormat)) {
return null;
}
return DateTimeFormatter.ofPattern(timeFormat, locale);
}
public DateTimeFormatter dateFormatter() {
if (dateFormatter != null) {
return dateFormatter;
}
if (Strings.isNullOrEmpty(dateFormat)) {
return null;
}
return DateTimeFormatter.ofPattern(dateFormat, locale);
}
public ColumnTypeReadOptions columnTypeReadOptions() {
return columnTypeReadOptions;
}
protected static class Builder {
protected final Source source;
protected String tableName = "";
protected List columnTypesToDetect = DEFAULT_TYPES;
protected boolean sample = true;
protected String dateFormat;
protected DateTimeFormatter dateFormatter;
protected String timeFormat;
protected DateTimeFormatter timeFormatter;
protected String dateTimeFormat;
protected DateTimeFormatter dateTimeFormatter;
protected Locale locale = Locale.getDefault();
protected String[] missingValueIndicators = new String[0];
protected boolean minimizeColumnSizes = false;
protected boolean header = true;
protected int maxCharsPerColumn = 4096;
protected boolean ignoreZeroDecimal = DEFAULT_IGNORE_ZERO_DECIMAL;
protected boolean skipRowsWithInvalidColumnCount = DEFAULT_SKIP_ROWS_WITH_INVALID_COLUMN_COUNT;
private boolean allowDuplicateColumnNames = false;
protected ColumnType[] columnTypes;
protected Map columnTypeMap = new HashMap<>();
protected Function> columnTypeFunction;
protected Function completeColumnTypeFunction;
protected Builder() {
source = null;
}
protected Builder(Source source) {
this.source = source;
}
protected Builder(File file) {
this.source = new Source(file);
this.tableName = file.getName();
}
protected Builder(URL url) throws IOException {
this.source = new Source(url.openStream());
this.tableName = url.toString();
}
protected Builder(InputStream stream) {
this.source = new Source(stream);
}
protected Builder(InputStreamReader reader) {
this.source = new Source(reader);
}
protected Builder(Reader reader) {
this.source = new Source(reader);
}
public Builder tableName(String tableName) {
this.tableName = tableName;
return this;
}
public Builder header(boolean hasHeader) {
this.header = hasHeader;
return this;
}
/** Deprecated. Use dateFormat(DateTimeFormatter dateFormat) instead */
@Deprecated
public Builder dateFormat(String dateFormat) {
this.dateFormat = dateFormat;
return this;
}
public Builder dateFormat(DateTimeFormatter dateFormat) {
this.dateFormatter = dateFormat;
return this;
}
public Builder allowDuplicateColumnNames(Boolean allow) {
this.allowDuplicateColumnNames = allow;
return this;
}
/** Deprecated. Use timeFormat(DateTimeFormatter dateFormat) instead */
@Deprecated
public Builder timeFormat(String timeFormat) {
this.timeFormat = timeFormat;
return this;
}
public Builder timeFormat(DateTimeFormatter dateFormat) {
this.timeFormatter = dateFormat;
return this;
}
/** Deprecated. Use dateTimeFormat(DateTimeFormatter dateFormat) instead */
@Deprecated
public Builder dateTimeFormat(String dateTimeFormat) {
this.dateTimeFormat = dateTimeFormat;
return this;
}
public Builder dateTimeFormat(DateTimeFormatter dateFormat) {
this.dateTimeFormatter = dateFormat;
return this;
}
public Builder missingValueIndicator(String... missingValueIndicators) {
this.missingValueIndicators = missingValueIndicators;
return this;
}
public Builder maxCharsPerColumn(int maxCharsPerColumn) {
this.maxCharsPerColumn = maxCharsPerColumn;
return this;
}
/** Ignore zero value decimals in data values. Defaults to {@code true}. */
public Builder ignoreZeroDecimal(boolean ignoreZeroDecimal) {
this.ignoreZeroDecimal = ignoreZeroDecimal;
return this;
}
/** Skip the rows with invalid column count in data values. Defaluts to {@code false}. */
public Builder skipRowsWithInvalidColumnCount(boolean skipRowsWithInvalidColumnCount) {
this.skipRowsWithInvalidColumnCount = skipRowsWithInvalidColumnCount;
return this;
}
public Builder sample(boolean sample) {
this.sample = sample;
return this;
}
public Builder locale(Locale locale) {
this.locale = locale;
return this;
}
/** @see ColumnTypeDetector */
public Builder columnTypesToDetect(List columnTypesToDetect) {
// Types need to be in certain order as more general types like string come last
// Otherwise everything will be parsed as a string
List orderedTypes = new ArrayList<>();
for (ColumnType t : EXTENDED_TYPES) {
if (columnTypesToDetect.contains(t)) {
orderedTypes.add(t);
}
}
this.columnTypesToDetect = orderedTypes;
return this;
}
/**
* Allow the {@link ColumnTypeDetector} to choose shorter column types such as float instead of
* double when the data will fit in a smaller type
*/
public Builder minimizeColumnSizes() {
this.columnTypesToDetect = EXTENDED_TYPES;
return this;
}
/**
* Provide column types for all columns skipping autodetect column type logic. The array must
* contain a ColumnType for each column in the table. An error will be thrown if they don't
* match up
*/
public Builder columnTypes(ColumnType[] columnTypes) {
if (columnTypeOptionsAlreadySet()) {
throw new IllegalStateException("columnTypes already set");
}
this.columnTypes = columnTypes;
return this;
}
/**
* Provide a function that determines ColumnType for all column names. To provide only for some
* use {@link #columnTypesPartial(Function)}
*
* This method is generally more efficient because it skips column type detection
*/
public Builder columnTypes(Function columnTypeFunction) {
if (columnTypeOptionsAlreadySet()) {
throw new IllegalStateException("columnTypes already set");
}
this.completeColumnTypeFunction = columnTypeFunction;
return this;
}
/**
* Provide a function that determines ColumnType for some column names. To provide for all
* column names use {@link #columnTypes(Function)} that generally is more efficient because it
* skips column type detection
*/
public Builder columnTypesPartial(Function> columnTypeFunction) {
if (columnTypeOptionsAlreadySet()) {
throw new IllegalStateException("columnTypes already set");
}
this.columnTypeFunction = columnTypeFunction;
return this;
}
/**
* Provide a map that determines ColumnType for given column names. Types for not present column
* names will be autodetected. To provide type for all column names use {@link
* #columnTypes(Function)} that generally is more efficient because it skips column type
* detection
*/
public Builder columnTypesPartial(Map columnTypeByName) {
if (columnTypeOptionsAlreadySet()) {
throw new IllegalStateException("columnTypes already set");
}
if (columnTypeByName != null) {
this.columnTypeMap = columnTypeByName;
}
return this;
}
private boolean columnTypeOptionsAlreadySet() {
return columnTypes != null
|| columnTypeFunction != null
|| completeColumnTypeFunction != null
|| !columnTypeMap.isEmpty();
}
public ReadOptions build() {
return new ReadOptions(this);
}
}
/**
* Allows user to set column types. It can work in three ways:
*
*
* - If no information is provided column types are autodetected
*
- A complete list of columns can be provided using {@link
* ReadOptions.Builder#columnTypes(ColumnType[])} or {@link
* ReadOptions.Builder#columnTypes(Function)}. This skips column type detection.
*
- Provide values for some column names using {@link
* ReadOptions.Builder#columnTypesPartial(Map)} or {@link
* ReadOptions.Builder#columnTypesPartial(Function)} (String, ColumnType)}. In this case
* provided columnTypes are used and the others are autodetected
*
*/
public interface ColumnTypeReadOptions {
Optional columnType(int columnNumber, String columnName);
default boolean hasColumnTypeForAllColumnsIfHavingColumnNames() {
return false;
}
default boolean hasColumnTypeForAllColumns() {
return false;
}
default ColumnType[] columnTypes() {
return null;
}
ColumnTypeReadOptions EMPTY = (columnNumber, columnName) -> Optional.empty();
static ColumnTypeReadOptions of(ColumnType[] allColumnTypes) {
return new ByIdxColumnTypeReadOptions(allColumnTypes);
}
}
}
/** {@link ColumnTypeReadOptions} that looks column types up by column index in an array. */
class ByIdxColumnTypeReadOptions implements ColumnTypeReadOptions {

  final ColumnType[] columnTypesByIdx;

  /**
   * @param columnTypesByIdx the column types, indexed by column number; the array is stored as-is,
   *     not copied
   */
  public ByIdxColumnTypeReadOptions(ColumnType[] columnTypesByIdx) {
    this.columnTypesByIdx = columnTypesByIdx;
  }

  /**
   * Returns the type stored at {@code columnNumber}; the column name is not used.
   *
   * @throws ArrayIndexOutOfBoundsException if {@code columnNumber} is outside the array
   * @throws NullPointerException if the array entry at {@code columnNumber} is null
   */
  @Override
  public Optional<ColumnType> columnType(int columnNumber, String columnName) {
    return Optional.of(columnTypesByIdx[columnNumber]);
  }

  /** Returns the backing array itself. */
  @Override
  public ColumnType[] columnTypes() {
    return columnTypesByIdx;
  }

  @Override
  public boolean hasColumnTypeForAllColumnsIfHavingColumnNames() {
    return true;
  }

  @Override
  public boolean hasColumnTypeForAllColumns() {
    return true;
  }
}
/** {@link ColumnTypeReadOptions} that looks column types up by column name in a map. */
class ByNameMapColumnTypeReadOptions implements ColumnTypeReadOptions {

  private final Map<String, ColumnType> columnTypesByNameMap;

  /** @param columnTypesByNameMap the column types keyed by column name; stored as-is, not copied */
  public ByNameMapColumnTypeReadOptions(Map<String, ColumnType> columnTypesByNameMap) {
    this.columnTypesByNameMap = columnTypesByNameMap;
  }

  /**
   * Returns the type mapped to {@code columnName}, or an empty Optional if the map has no value
   * for that name; the column number is not used.
   */
  @Override
  public Optional<ColumnType> columnType(int columnNumber, String columnName) {
    return Optional.ofNullable(columnTypesByNameMap.get(columnName));
  }
}
/**
 * {@link ColumnTypeReadOptions} backed by a function that is expected to return a type for every
 * column name.
 */
class CompleteFunctionColumnTypeReadOptions implements ColumnTypeReadOptions {

  private final Function<String, ColumnType> function;

  /** @param function maps a column name to its column type */
  public CompleteFunctionColumnTypeReadOptions(Function<String, ColumnType> function) {
    this.function = function;
  }

  /**
   * Returns the type the function yields for {@code columnName}; the column number is not used.
   *
   * @throws NullPointerException if the function returns null for the name
   */
  @Override
  public Optional<ColumnType> columnType(int columnNumber, String columnName) {
    return Optional.of(function.apply(columnName));
  }

  @Override
  public boolean hasColumnTypeForAllColumnsIfHavingColumnNames() {
    return true;
  }
}
class PartialFunctionColumnTypeReadOptions implements ColumnTypeReadOptions {
final Function> function;
public PartialFunctionColumnTypeReadOptions(Function> function) {
this.function = function;
}
@Override
public Optional columnType(int columnNumber, String columnName) {
return function.apply(columnName);
}
}