All Downloads are FREE. Search and download functionalities are using the official Maven repository.

tech.tablesaw.store.StorageManager Maven / Gradle / Ivy

There is a newer version: 0.43.1
Show newest version
package tech.tablesaw.store;

import com.google.common.annotations.VisibleForTesting;

import tech.tablesaw.api.BooleanColumn;
import tech.tablesaw.api.CategoryColumn;
import tech.tablesaw.api.DateColumn;
import tech.tablesaw.api.DateTimeColumn;
import tech.tablesaw.api.DoubleColumn;
import tech.tablesaw.api.FloatColumn;
import tech.tablesaw.api.IntColumn;
import tech.tablesaw.api.LongColumn;
import tech.tablesaw.api.ShortColumn;
import tech.tablesaw.api.Table;
import tech.tablesaw.api.TimeColumn;
import tech.tablesaw.columns.Column;
import tech.tablesaw.table.Relation;

import org.iq80.snappy.SnappyFramedInputStream;
import org.iq80.snappy.SnappyFramedOutputStream;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Pattern;

/**
 * A controller for reading and writing data in Tablesaw's own compressed, column-oriented file format
 */
public class StorageManager {

    private static final int FLUSH_AFTER_ITERATIONS = 10_000;

    private static final String FILE_EXTENSION = "saw";
    private static final Pattern WHITE_SPACE_PATTERN = Pattern.compile("\\s+");
    private static final Pattern SEPARATOR_PATTERN = Pattern.compile(Pattern.quote(separator()));

    private static final int READER_POOL_SIZE = 4;

    static String separator() {
        FileSystem fileSystem = FileSystems.getDefault();
        return fileSystem.getSeparator();
    }

    /**
     * Reads a tablesaw table into memory
     *
     * @param path The location of the table. It is interpreted as relative to the working directory if not fully
     *             specified. The path will typically end in ".saw", as in "mytables/nasdaq-2015.saw"
     * @throws IOException if the file cannot be read
     */
    public static tech.tablesaw.api.Table readTable(String path) throws IOException {

        ExecutorService executorService = Executors.newFixedThreadPool(READER_POOL_SIZE);
        CompletionService readerCompletionService = new ExecutorCompletionService<>(executorService);

        TableMetadata tableMetadata = readTableMetadata(path + separator() + "Metadata.json");
        List columnMetadata = tableMetadata.getColumnMetadataList();
        Table table = Table.create(tableMetadata);

        // NB: We do some extra work with the hash map to ensure that the columns are added to the table in original
        // order
        // TODO(lwhite): Not using CPU efficiently. Need to prevent waiting for other threads until all columns are read
        // TODO - continued : Problem seems to be mostly with category columns rebuilding the encoding dictionary
        ConcurrentLinkedQueue columnList = new ConcurrentLinkedQueue<>();
        Map columns = new HashMap<>();
        try {
            for (ColumnMetadata column : columnMetadata) {
                readerCompletionService.submit(() -> {
                    columnList.add(readColumn(path + separator() + column.getId(), column));
                    return null;
                });
            }
            for (int i = 0; i < columnMetadata.size(); i++) {
                Future future = readerCompletionService.take();
                future.get();
            }
            for (Column c : columnList) {
                columns.put(c.id(), c);
            }

            for (ColumnMetadata metadata : columnMetadata) {
                String id = metadata.getId();
                table.addColumn(columns.get(id));
            }

        } catch (InterruptedException | ExecutionException e) {
            throw new RuntimeException(e);
        }
        executorService.shutdown();
        return table;
    }

    private static Column readColumn(String fileName, ColumnMetadata columnMetadata)
            throws IOException {

        switch (columnMetadata.getType()) {
            case FLOAT:
                return readFloatColumn(fileName, columnMetadata);
            case INTEGER:
                return readIntColumn(fileName, columnMetadata);
            case BOOLEAN:
                return readBooleanColumn(fileName, columnMetadata);
            case LOCAL_DATE:
                return readLocalDateColumn(fileName, columnMetadata);
            case LOCAL_TIME:
                return readLocalTimeColumn(fileName, columnMetadata);
            case LOCAL_DATE_TIME:
                return readLocalDateTimeColumn(fileName, columnMetadata);
            case CATEGORY:
                return readCategoryColumn(fileName, columnMetadata);
            case SHORT_INT:
                return readShortColumn(fileName, columnMetadata);
            case LONG_INT:
                return readLongColumn(fileName, columnMetadata);
            default:
                throw new RuntimeException("Unhandled column type writing columns");
        }
    }

    private static FloatColumn readFloatColumn(String fileName, ColumnMetadata metadata) throws IOException {
        FloatColumn floats = new FloatColumn(metadata);
        try (FileInputStream fis = new FileInputStream(fileName);
             SnappyFramedInputStream sis = new SnappyFramedInputStream(fis, true);
             DataInputStream dis = new DataInputStream(sis)) {
            boolean EOF = false;
            while (!EOF) {
                try {
                    float cell = dis.readFloat();
                    floats.append(cell);
                } catch (EOFException e) {
                    EOF = true;
                }
            }
        }
        return floats;
    }

    private static IntColumn readIntColumn(String fileName, ColumnMetadata metadata) throws IOException {
        IntColumn ints = new IntColumn(metadata);
        try (FileInputStream fis = new FileInputStream(fileName);
             SnappyFramedInputStream sis = new SnappyFramedInputStream(fis, true);
             DataInputStream dis = new DataInputStream(sis)) {
            boolean EOF = false;
            while (!EOF) {
                try {
                    ints.append(dis.readInt());
                } catch (EOFException e) {
                    EOF = true;
                }
            }
        }
        return ints;
    }

    private static ShortColumn readShortColumn(String fileName, ColumnMetadata metadata) throws IOException {
        ShortColumn ints = new ShortColumn(metadata);
        try (FileInputStream fis = new FileInputStream(fileName);
             SnappyFramedInputStream sis = new SnappyFramedInputStream(fis, true);
             DataInputStream dis = new DataInputStream(sis)) {
            boolean EOF = false;
            while (!EOF) {
                try {
                    ints.append(dis.readShort());
                } catch (EOFException e) {
                    EOF = true;
                }
            }
        }
        return ints;
    }

    private static LongColumn readLongColumn(String fileName, ColumnMetadata metadata) throws IOException {
        LongColumn ints = new LongColumn(metadata);
        try (FileInputStream fis = new FileInputStream(fileName);
             SnappyFramedInputStream sis = new SnappyFramedInputStream(fis, true);
             DataInputStream dis = new DataInputStream(sis)) {
            boolean EOF = false;
            while (!EOF) {
                try {
                    ints.append(dis.readLong());
                } catch (EOFException e) {
                    EOF = true;
                }
            }
        }
        return ints;
    }

    private static DateColumn readLocalDateColumn(String fileName, ColumnMetadata metadata) throws IOException {
        DateColumn dates = new DateColumn(metadata);
        try (FileInputStream fis = new FileInputStream(fileName);
             SnappyFramedInputStream sis = new SnappyFramedInputStream(fis, true);
             DataInputStream dis = new DataInputStream(sis)) {
            boolean EOF = false;
            while (!EOF) {
                try {
                    int cell = dis.readInt();
                    dates.append(cell);
                } catch (EOFException e) {
                    EOF = true;
                }
            }
        }
        return dates;
    }

    private static DateTimeColumn readLocalDateTimeColumn(String fileName, ColumnMetadata metadata) throws
            IOException {
        DateTimeColumn dates = new DateTimeColumn(metadata);
        try (FileInputStream fis = new FileInputStream(fileName);
             SnappyFramedInputStream sis = new SnappyFramedInputStream(fis, true);
             DataInputStream dis = new DataInputStream(sis)) {
            boolean EOF = false;
            while (!EOF) {
                try {
                    long cell = dis.readLong();
                    dates.append(cell);
                } catch (EOFException e) {
                    EOF = true;
                }
            }
        }
        return dates;
    }

    private static TimeColumn readLocalTimeColumn(String fileName, ColumnMetadata metadata) throws IOException {
        TimeColumn times = new TimeColumn(metadata);
        try (FileInputStream fis = new FileInputStream(fileName);
             SnappyFramedInputStream sis = new SnappyFramedInputStream(fis, true);
             DataInputStream dis = new DataInputStream(sis)) {
            boolean EOF = false;
            while (!EOF) {
                try {
                    int cell = dis.readInt();
                    times.append(cell);
                } catch (EOFException e) {
                    EOF = true;
                }
            }
        }
        return times;
    }

    public static CategoryColumn readCategoryColumn(String fileName, ColumnMetadata metadata) throws IOException {
        CategoryColumn stringColumn = new CategoryColumn(metadata);
        try (FileInputStream fis = new FileInputStream(fileName);
             SnappyFramedInputStream sis = new SnappyFramedInputStream(fis, true);
             DataInputStream dis = new DataInputStream(sis)) {

            int stringCount = dis.readInt();

            int j = 0;
            while (j < stringCount) {
                stringColumn.dictionaryMap().put(j, dis.readUTF());
                j++;
            }

            int size = metadata.getSize();
            for (int i = 0; i < size; i++) {
                stringColumn.data().add(dis.readInt());
            }
        }
        return stringColumn;
    }

    private static BooleanColumn readBooleanColumn(String fileName, ColumnMetadata metadata) throws IOException {
        BooleanColumn bools = new BooleanColumn(metadata);
        try (FileInputStream fis = new FileInputStream(fileName);
             SnappyFramedInputStream sis = new SnappyFramedInputStream(fis, true);
             DataInputStream dis = new DataInputStream(sis)) {
            boolean EOF = false;
            while (!EOF) {
                try {
                    boolean cell = dis.readBoolean();
                    bools.append(cell);
                } catch (EOFException e) {
                    EOF = true;
                }
            }
        }
        return bools;
    }

    /**
     * Saves the data from the given table in the location specified by folderName. Within that folder each table has
     * its own sub-folder, whose name is based on the name of the table.
     * 

* NOTE: If you store a table with the same name in the same folder. The data in that folder will be over-written. *

* The storage format is the tablesaw compressed column-oriented format, which consists of a set of file in a * folder. * The name of the folder is based on the name of the table. * * @param folderName The location of the table (for example: "mytables") * @param table The table to be saved * @return The path and name of the table * @throws IOException IOException if the file can not be read */ public static String saveTable(String folderName, Relation table) throws IOException { ExecutorService executorService = Executors.newFixedThreadPool(10); CompletionService writerCompletionService = new ExecutorCompletionService<>(executorService); String name = table.name(); name = WHITE_SPACE_PATTERN.matcher(name).replaceAll(""); // remove whitespace from the table name name = SEPARATOR_PATTERN.matcher(name).replaceAll("_"); // remove path separators from the table name String storageFolder = folderName + separator() + name + '.' + FILE_EXTENSION; Path path = Paths.get(storageFolder); if (!Files.exists(path)) { try { Files.createDirectories(path); } catch (IOException e) { e.printStackTrace(); } } writeTableMetadata(path.toString() + separator() + "Metadata.json", table); try { for (Column column : table.columns()) { writerCompletionService.submit(() -> { Path columnPath = path.resolve(column.id()); writeColumn(columnPath.toString(), column); return null; }); } for (int i = 0; i < table.columnCount(); i++) { Future future = writerCompletionService.take(); future.get(); } } catch (InterruptedException | ExecutionException e) { throw new RuntimeException(e); } executorService.shutdown(); return storageFolder; } private static void writeColumn(String fileName, Column column) { try { switch (column.type()) { case FLOAT: writeColumn(fileName, (FloatColumn) column); break; case DOUBLE: writeColumn(fileName, (DoubleColumn) column); break; case INTEGER: writeColumn(fileName, (IntColumn) column); break; case BOOLEAN: writeColumn(fileName, (BooleanColumn) column); break; case LOCAL_DATE: writeColumn(fileName, (DateColumn) column); break; case LOCAL_TIME: writeColumn(fileName, (TimeColumn) column); break; case LOCAL_DATE_TIME: writeColumn(fileName, (DateTimeColumn) column); break; case CATEGORY: writeColumn(fileName, (CategoryColumn) column); break; case SHORT_INT: writeColumn(fileName, (ShortColumn) column); break; case LONG_INT: writeColumn(fileName, (LongColumn) column); break; default: throw new RuntimeException("Unhandled column type writing columns"); } } catch (IOException ex) { throw new RuntimeException("IOException writing to file"); } } @VisibleForTesting public static void writeColumn(String fileName, FloatColumn column) throws IOException { try (FileOutputStream fos = new FileOutputStream(fileName); SnappyFramedOutputStream sos = new SnappyFramedOutputStream(fos); DataOutputStream dos = new DataOutputStream(sos)) { int i = 0; for (float d : column) { dos.writeFloat(d); if (i % FLUSH_AFTER_ITERATIONS == 0) { dos.flush(); } i++; } dos.flush(); } } public static void writeColumn(String fileName, DoubleColumn column) throws IOException { try (FileOutputStream fos = new FileOutputStream(fileName); SnappyFramedOutputStream sos = new SnappyFramedOutputStream(fos); DataOutputStream dos = new DataOutputStream(sos)) { int i = 0; for (double d : column) { dos.writeDouble(d); if (i % FLUSH_AFTER_ITERATIONS == 0) { dos.flush(); } i++; } dos.flush(); } } /** * Writes out the values of the category column encoded as ints to minimize the time required for subsequent reads *

* The files are written Strings first, then the ints that encode them so they can be read in the opposite order * * @throws IOException IOException if the file can not be read */ public static void writeColumn(String fileName, CategoryColumn column) throws IOException { int categoryCount = column.dictionaryMap().size(); try (FileOutputStream fos = new FileOutputStream(fileName); SnappyFramedOutputStream sos = new SnappyFramedOutputStream(fos); DataOutputStream dos = new DataOutputStream(sos)) { dos.writeInt(categoryCount); // write the strings SortedSet keys = new TreeSet<>(column.dictionaryMap().keyToValueMap().keySet()); for (int key : keys) { dos.writeUTF(column.dictionaryMap().get(key)); } dos.flush(); // write the integer values that represent the strings int i = 0; for (int d : column.data()) { dos.writeInt(d); if (i % FLUSH_AFTER_ITERATIONS == 0) { dos.flush(); } i++; } } } //TODO(lwhite): saveTable the column using integer compression public static void writeColumn(String fileName, IntColumn column) throws IOException { try (FileOutputStream fos = new FileOutputStream(fileName); SnappyFramedOutputStream sos = new SnappyFramedOutputStream(fos); DataOutputStream dos = new DataOutputStream(sos)) { int i = 0; for (int d : column.data()) { dos.writeInt(d); if (i % FLUSH_AFTER_ITERATIONS == 0) { dos.flush(); } i++; } dos.flush(); } } public static void writeColumn(String fileName, ShortColumn column) throws IOException { try (FileOutputStream fos = new FileOutputStream(fileName); SnappyFramedOutputStream sos = new SnappyFramedOutputStream(fos); DataOutputStream dos = new DataOutputStream(sos)) { int i = 0; for (short d : column) { dos.writeShort(d); if (i % FLUSH_AFTER_ITERATIONS == 0) { dos.flush(); } i++; } dos.flush(); } } public static void writeColumn(String fileName, LongColumn column) throws IOException { try (FileOutputStream fos = new FileOutputStream(fileName); SnappyFramedOutputStream sos = new SnappyFramedOutputStream(fos); DataOutputStream dos = new DataOutputStream(sos)) { int i = 0; for (long d : column) { dos.writeLong(d); if (i % FLUSH_AFTER_ITERATIONS == 0) { dos.flush(); } i++; } dos.flush(); } } //TODO(lwhite): saveTable the column using integer compression public static void writeColumn(String fileName, DateColumn column) throws IOException { try (FileOutputStream fos = new FileOutputStream(fileName); SnappyFramedOutputStream sos = new SnappyFramedOutputStream(fos); DataOutputStream dos = new DataOutputStream(sos)) { int i = 0; for (int d : column.data()) { dos.writeInt(d); if (i % FLUSH_AFTER_ITERATIONS == 0) { dos.flush(); } i++; } dos.flush(); } } public static void writeColumn(String fileName, DateTimeColumn column) throws IOException { try (FileOutputStream fos = new FileOutputStream(fileName); SnappyFramedOutputStream sos = new SnappyFramedOutputStream(fos); DataOutputStream dos = new DataOutputStream(sos)) { int i = 0; for (long d : column.data()) { dos.writeLong(d); if (i % FLUSH_AFTER_ITERATIONS == 0) { dos.flush(); } i++; } dos.flush(); } } //TODO(lwhite): saveTable the column using integer compression public static void writeColumn(String fileName, TimeColumn column) throws IOException { try (FileOutputStream fos = new FileOutputStream(fileName); SnappyFramedOutputStream sos = new SnappyFramedOutputStream(fos); DataOutputStream dos = new DataOutputStream(sos)) { int i = 0; for (int d : column.data()) { dos.writeInt(d); if (i % FLUSH_AFTER_ITERATIONS == 0) { dos.flush(); } i++; } dos.flush(); } } //TODO(lwhite): saveTable the column using compressed bitmap public static void writeColumn(String fileName, BooleanColumn column) throws IOException { try (FileOutputStream fos = new FileOutputStream(fileName); SnappyFramedOutputStream sos = new SnappyFramedOutputStream(fos); DataOutputStream dos = new DataOutputStream(sos)) { for (int i = 0; i < column.size(); i++) { boolean value = column.get(i); dos.writeBoolean(value); if (i % FLUSH_AFTER_ITERATIONS == 0) { dos.flush(); } } dos.flush(); } } /** * Writes out a json-formatted representation of the given {@code table}'s metadata to the given {@code file} * * @param fileName Expected to be fully specified * @throws IOException if the file can not be read */ private static void writeTableMetadata(String fileName, Relation table) throws IOException { File myFile = Paths.get(fileName).toFile(); myFile.createNewFile(); try (FileOutputStream fOut = new FileOutputStream(myFile); OutputStreamWriter myOutWriter = new OutputStreamWriter(fOut)) { myOutWriter.append(new TableMetadata(table).toJson()); } } /** * Reads in a json-formatted file and creates a TableMetadata instance from it. Files are expected to be in * the format provided by TableMetadata} * * @param fileName Expected to be fully specified * @throws IOException if the file can not be read */ private static TableMetadata readTableMetadata(String fileName) throws IOException { byte[] encoded = Files.readAllBytes(Paths.get(fileName)); return TableMetadata.fromJson(new String(encoded, StandardCharsets.UTF_8)); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy