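// tech.tablesaw.io.saw.SawReader Maven / Gradle / Ivy
// NOTE(review): the line above looks like page residue captured with the source - confirm and drop.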
package tech.tablesaw.io.saw;
import static tech.tablesaw.io.saw.SawUtils.BOOLEAN;
import static tech.tablesaw.io.saw.SawUtils.DOUBLE;
import static tech.tablesaw.io.saw.SawUtils.FLOAT;
import static tech.tablesaw.io.saw.SawUtils.INSTANT;
import static tech.tablesaw.io.saw.SawUtils.INTEGER;
import static tech.tablesaw.io.saw.SawUtils.LOCAL_DATE;
import static tech.tablesaw.io.saw.SawUtils.LOCAL_DATE_TIME;
import static tech.tablesaw.io.saw.SawUtils.LOCAL_TIME;
import static tech.tablesaw.io.saw.SawUtils.LONG;
import static tech.tablesaw.io.saw.SawUtils.SHORT;
import static tech.tablesaw.io.saw.SawUtils.STRING;
import static tech.tablesaw.io.saw.SawUtils.TEXT;
import com.google.common.annotations.Beta;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import it.unimi.dsi.fastutil.bytes.Byte2IntOpenHashMap;
import it.unimi.dsi.fastutil.bytes.Byte2ObjectMap;
import it.unimi.dsi.fastutil.bytes.Byte2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ShortOpenHashMap;
import it.unimi.dsi.fastutil.shorts.Short2IntOpenHashMap;
import it.unimi.dsi.fastutil.shorts.Short2ObjectMap;
import it.unimi.dsi.fastutil.shorts.Short2ObjectOpenHashMap;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.stream.Collectors;
import org.iq80.snappy.SnappyFramedInputStream;
import tech.tablesaw.api.BooleanColumn;
import tech.tablesaw.api.DateColumn;
import tech.tablesaw.api.DateTimeColumn;
import tech.tablesaw.api.DoubleColumn;
import tech.tablesaw.api.FloatColumn;
import tech.tablesaw.api.InstantColumn;
import tech.tablesaw.api.IntColumn;
import tech.tablesaw.api.LongColumn;
import tech.tablesaw.api.ShortColumn;
import tech.tablesaw.api.StringColumn;
import tech.tablesaw.api.Table;
import tech.tablesaw.api.TextColumn;
import tech.tablesaw.api.TimeColumn;
import tech.tablesaw.columns.Column;
import tech.tablesaw.columns.strings.ByteDictionaryMap;
import tech.tablesaw.columns.strings.IntDictionaryMap;
import tech.tablesaw.columns.strings.ShortDictionaryMap;
@Beta
public class SawReader {
private final Path sawPath;
private final SawMetadata sawMetadata;
private ReadOptions readOptions = ReadOptions.defaultOptions();
public SawReader(Path sawPath) {
this.sawPath = sawPath;
this.sawMetadata = SawMetadata.readMetadata(sawPath);
}
public SawReader(Path sawPath, ReadOptions options) {
this.sawPath = sawPath;
this.readOptions = options;
this.sawMetadata = SawMetadata.readMetadata(sawPath);
}
public SawReader(File sawPathFile) {
this.sawPath = sawPathFile.toPath();
this.sawMetadata = SawMetadata.readMetadata(sawPath);
}
public SawReader(File sawPathFile, ReadOptions options) {
this.sawPath = sawPathFile.toPath();
this.readOptions = options;
this.sawMetadata = SawMetadata.readMetadata(sawPath);
}
public SawReader(String sawPathName) {
this.sawPath = setPath(sawPathName);
this.sawMetadata = SawMetadata.readMetadata(sawPath);
}
public SawReader(String sawPathName, ReadOptions options) {
this.sawPath = setPath(sawPathName);
this.readOptions = options;
this.sawMetadata = SawMetadata.readMetadata(sawPath);
}
private Path setPath(String parentFolderName) {
Preconditions.checkArgument(
parentFolderName != null, "The folder name for the saw output cannot be null");
Preconditions.checkArgument(
!parentFolderName.isEmpty(), "The folder name for the saw output cannot be empty");
return Paths.get(parentFolderName);
}
public String shape() {
return sawMetadata.shape();
}
public int columnCount() {
return sawMetadata.columnCount();
}
public int rowCount() {
return sawMetadata.getRowCount();
}
public List columnNames() {
return sawMetadata.columnNames();
}
public Table structure() {
return sawMetadata.structure();
}
public Table read() {
final ExecutorService executor = Executors.newFixedThreadPool(readOptions.getThreadPoolSize());
// The column names to filter for, if we don't want the whole table
final Set selectedColumns = new HashSet<>(readOptions.getSelectedColumns());
final List columnMetadata = getMetadata(selectedColumns);
final Table table = Table.create(sawMetadata.getTableName());
// Note: We do some extra work with the hash map to ensure that the columns are returned
// to the table in original order
List>> callables = new ArrayList<>();
Map> columns = new ConcurrentHashMap<>();
try {
for (ColumnMetadata column : columnMetadata) {
callables.add(
() -> {
Path columnPath = sawPath.resolve(column.getId());
return readColumn(columnPath.toString(), sawMetadata, column);
});
}
List>> futures = executor.invokeAll(callables);
for (Future> future : futures) {
Column> column = future.get();
columns.put(column.name(), column);
}
for (ColumnMetadata metadata : columnMetadata) {
table.internalAddWithoutValidation(columns.get(metadata.getName()));
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new IllegalStateException(e);
} catch (ExecutionException e) {
throw new IllegalStateException(e);
} finally {
executor.shutdown();
}
return table;
}
private List getMetadata(Set selectedColumns) {
if (selectedColumns.isEmpty()) {
return ImmutableList.copyOf(sawMetadata.getColumnMetadataList());
}
return ImmutableList.copyOf(
sawMetadata.getColumnMetadataList().stream()
.filter(x -> selectedColumns.contains(x.getName()))
.collect(Collectors.toList()));
}
private Column> readColumn(
String fileName, SawMetadata sawMetadata, ColumnMetadata columnMetadata) throws IOException {
final String typeString = columnMetadata.getType();
final int rowcount = sawMetadata.getRowCount();
switch (typeString) {
case FLOAT:
return readFloatColumn(fileName, columnMetadata, rowcount);
case DOUBLE:
return readDoubleColumn(fileName, columnMetadata, rowcount);
case INTEGER:
return readIntColumn(fileName, columnMetadata, rowcount);
case BOOLEAN:
return readBooleanColumn(fileName, columnMetadata, rowcount);
case LOCAL_DATE:
return readLocalDateColumn(fileName, columnMetadata, rowcount);
case LOCAL_TIME:
return readLocalTimeColumn(fileName, columnMetadata, rowcount);
case LOCAL_DATE_TIME:
return readLocalDateTimeColumn(fileName, columnMetadata, rowcount);
case INSTANT:
return readInstantColumn(fileName, columnMetadata, rowcount);
case STRING:
return readStringColumn(fileName, columnMetadata, rowcount);
case TEXT:
return readTextColumn(fileName, columnMetadata, rowcount);
case SHORT:
return readShortColumn(fileName, columnMetadata, rowcount);
case LONG:
return readLongColumn(fileName, columnMetadata, rowcount);
default:
throw new IllegalStateException("Unhandled column type writing columns: " + typeString);
}
}
/**
* Returns a data input stream for reading from a file with the given name
*
* @throws IOException if anything goes wrong
*/
private DataInputStream inputStream(String fileName) throws IOException {
FileInputStream fis = new FileInputStream(fileName);
if (sawMetadata.getCompressionType().equals(CompressionType.NONE)) {
return new DataInputStream(fis);
} else {
SnappyFramedInputStream sis = new SnappyFramedInputStream(fis, true);
return new DataInputStream(sis);
}
}
private FloatColumn readFloatColumn(String fileName, ColumnMetadata metadata, int rowcount)
throws IOException {
float[] data = new float[rowcount];
try (DataInputStream dis = inputStream(fileName)) {
for (int i = 0; i < rowcount; i++) {
data[i] = dis.readFloat();
}
}
return FloatColumn.create(metadata.getName(), data);
}
private DoubleColumn readDoubleColumn(String fileName, ColumnMetadata metadata, int rowcount)
throws IOException {
double[] data = new double[rowcount];
try (DataInputStream dis = inputStream(fileName)) {
for (int i = 0; i < rowcount; i++) {
data[i] = dis.readDouble();
}
}
return DoubleColumn.create(metadata.getName(), data);
}
private IntColumn readIntColumn(String fileName, ColumnMetadata metadata, int rowcount)
throws IOException {
return IntColumn.create(metadata.getName(), readIntValues(fileName, rowcount));
}
private ShortColumn readShortColumn(String fileName, ColumnMetadata metadata, int rowcount)
throws IOException {
short[] data = new short[rowcount];
try (DataInputStream dis = inputStream(fileName)) {
for (int i = 0; i < rowcount; i++) {
data[i] = dis.readShort();
}
}
return ShortColumn.create(metadata.getName(), data);
}
private LongColumn readLongColumn(String fileName, ColumnMetadata metadata, int rowcount)
throws IOException {
return LongColumn.create(metadata.getName(), readLongValues(fileName, rowcount));
}
private DateColumn readLocalDateColumn(String fileName, ColumnMetadata metadata, int rowcount)
throws IOException {
return DateColumn.createInternal(metadata.getName(), readIntValues(fileName, rowcount));
}
private int[] readIntValues(String fileName, int rowcount) throws IOException {
int[] data = new int[rowcount];
try (DataInputStream dis = inputStream(fileName)) {
for (int i = 0; i < rowcount; i++) {
data[i] = dis.readInt();
}
}
return data;
}
private DateTimeColumn readLocalDateTimeColumn(
String fileName, ColumnMetadata metadata, int rowcount) throws IOException {
long[] data = readLongValues(fileName, rowcount);
return DateTimeColumn.createInternal(metadata.getName(), data);
}
private long[] readLongValues(String fileName, int rowcount) throws IOException {
long[] data = new long[rowcount];
try (DataInputStream dis = inputStream(fileName)) {
for (int i = 0; i < rowcount; i++) {
data[i] = dis.readLong();
}
}
return data;
}
private InstantColumn readInstantColumn(String fileName, ColumnMetadata metadata, int rowcount)
throws IOException {
return InstantColumn.createInternal(metadata.getName(), readLongValues(fileName, rowcount));
}
private TimeColumn readLocalTimeColumn(String fileName, ColumnMetadata metadata, int rowcount)
throws IOException {
return TimeColumn.createInternal(metadata.getName(), readIntValues(fileName, rowcount));
}
/**
* Reads the encoded StringColumn from the given file and stuffs it into a new StringColumn,
* saving time by updating the dictionary directly and just writing ints to the column's data
*/
private StringColumn readStringColumn(
String fileName, ColumnMetadata columnMetadata, int rowcount) throws IOException {
try (DataInputStream dis = inputStream(fileName)) {
if (columnMetadata.getStringColumnKeySize().equals(Byte.class.getSimpleName())) {
return StringColumn.createInternal(
columnMetadata.getName(), getByteMap(dis, columnMetadata, rowcount));
}
if (columnMetadata.getStringColumnKeySize().equals(Integer.class.getSimpleName())) {
return StringColumn.createInternal(
columnMetadata.getName(), getIntMap(dis, columnMetadata, rowcount));
}
return StringColumn.createInternal(
columnMetadata.getName(), getShortMap(dis, columnMetadata, rowcount));
}
}
private ByteDictionaryMap getByteMap(DataInputStream dis, ColumnMetadata metaData, int rowcount)
throws IOException {
int cardinality = metaData.getCardinality();
byte[] data = new byte[rowcount];
byte[] keys = new byte[cardinality];
byte[] countKeys = new byte[cardinality];
String[] values = new String[cardinality];
int[] counts = new int[cardinality];
// process the data
// first we read the keys and values for the maps
for (int k = 0; k < cardinality; k++) {
keys[k] = dis.readByte();
}
for (int k = 0; k < cardinality; k++) {
values[k] = dis.readUTF();
}
for (int k = 0; k < cardinality; k++) {
countKeys[k] = dis.readByte();
}
for (int k = 0; k < cardinality; k++) {
counts[k] = dis.readInt();
}
// get the column entries
for (int i = 0; i < rowcount; i++) {
data[i] = dis.readByte();
}
Object2ByteOpenHashMap valueToKey = new Object2ByteOpenHashMap<>(values, keys);
Byte2ObjectMap keyToValue = new Byte2ObjectOpenHashMap<>(keys, values);
Byte2IntOpenHashMap keyToCount = new Byte2IntOpenHashMap(countKeys, counts);
return new ByteDictionaryMap.ByteDictionaryBuilder()
.setValues(data)
.setValueToKey(valueToKey)
.setKeyToValue(keyToValue)
.setKeyToCount(keyToCount)
.setNextIndex(metaData.getNextStringKey())
.build();
}
private ShortDictionaryMap getShortMap(DataInputStream dis, ColumnMetadata metaData, int rowcount)
throws IOException {
int cardinality = metaData.getCardinality();
short[] data = new short[rowcount];
short[] keys = new short[cardinality];
short[] countKeys = new short[cardinality];
String[] values = new String[cardinality];
int[] counts = new int[cardinality];
// process the data
// first we read the keys and values for the maps
for (int k = 0; k < cardinality; k++) {
keys[k] = dis.readShort();
}
for (int k = 0; k < cardinality; k++) {
values[k] = dis.readUTF();
}
for (int k = 0; k < cardinality; k++) {
countKeys[k] = dis.readShort();
}
for (int k = 0; k < cardinality; k++) {
counts[k] = dis.readInt();
}
// get the column entries
for (int i = 0; i < rowcount; i++) {
data[i] = dis.readShort();
}
Object2ShortOpenHashMap valueToKey = new Object2ShortOpenHashMap<>(values, keys);
Short2ObjectMap keyToValue = new Short2ObjectOpenHashMap<>(keys, values);
Short2IntOpenHashMap keyToCount = new Short2IntOpenHashMap(countKeys, counts);
return new ShortDictionaryMap.ShortDictionaryBuilder()
.setValues(data)
.setValueToKey(valueToKey)
.setKeyToValue(keyToValue)
.setKeyToCount(keyToCount)
.setNextIndex(metaData.getNextStringKey())
.build();
}
private IntDictionaryMap getIntMap(DataInputStream dis, ColumnMetadata metaData, int rowcount)
throws IOException {
int cardinality = metaData.getCardinality();
int[] data = new int[rowcount];
int[] keys = new int[cardinality];
int[] countKeys = new int[cardinality];
String[] values = new String[cardinality];
int[] counts = new int[cardinality];
// process the data
// first we read the keys and values for the maps
for (int k = 0; k < cardinality; k++) {
keys[k] = dis.readInt();
}
for (int k = 0; k < cardinality; k++) {
values[k] = dis.readUTF();
}
for (int k = 0; k < cardinality; k++) {
countKeys[k] = dis.readInt();
}
for (int k = 0; k < cardinality; k++) {
counts[k] = dis.readInt();
}
// get the column entries
for (int i = 0; i < rowcount; i++) {
data[i] = dis.readInt();
}
Object2IntOpenHashMap valueToKey = new Object2IntOpenHashMap<>(values, keys);
Int2ObjectMap keyToValue = new Int2ObjectOpenHashMap<>(keys, values);
Int2IntOpenHashMap keyToCount = new Int2IntOpenHashMap(countKeys, counts);
return new IntDictionaryMap.IntDictionaryBuilder()
.setValues(data)
.setValueToKey(valueToKey)
.setKeyToValue(keyToValue)
.setKeyToCount(keyToCount)
.setNextIndex(metaData.getNextStringKey())
.build();
}
/** Reads the TextColumn data from the given file and stuffs it into a new TextColumn */
private TextColumn readTextColumn(String fileName, ColumnMetadata columnMetadata, int rowcount)
throws IOException {
TextColumn textColumn = TextColumn.create(columnMetadata.getName(), rowcount);
try (DataInputStream dis = inputStream(fileName)) {
for (int j = 0; j < rowcount; j++) {
textColumn.set(j, dis.readUTF());
}
}
return textColumn;
}
private BooleanColumn readBooleanColumn(String fileName, ColumnMetadata metadata, int rowcount)
throws IOException {
BooleanColumn column = BooleanColumn.create(metadata.getName());
try (DataInputStream dis = inputStream(fileName)) {
for (int i = 0; i < rowcount; i++) {
column.append(dis.readByte());
}
}
return column;
}
}
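// © 2015 - 2025 Weber Informatics LLC | Privacy Policy
// NOTE(review): the line above looks like a web page footer captured with the source - confirm and drop.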