All Downloads are FREE. Search and download functionalities are using the official Maven repository.

tech.tablesaw.io.saw.SawWriter Maven / Gradle / Ivy

There is a newer version: 0.43.3
Show newest version
package tech.tablesaw.io.saw;

import static tech.tablesaw.io.saw.SawMetadata.METADATA_FILE_NAME;
import static tech.tablesaw.io.saw.SawUtils.BOOLEAN;
import static tech.tablesaw.io.saw.SawUtils.DOUBLE;
import static tech.tablesaw.io.saw.SawUtils.FLOAT;
import static tech.tablesaw.io.saw.SawUtils.INSTANT;
import static tech.tablesaw.io.saw.SawUtils.INTEGER;
import static tech.tablesaw.io.saw.SawUtils.LOCAL_DATE;
import static tech.tablesaw.io.saw.SawUtils.LOCAL_DATE_TIME;
import static tech.tablesaw.io.saw.SawUtils.LOCAL_TIME;
import static tech.tablesaw.io.saw.SawUtils.LONG;
import static tech.tablesaw.io.saw.SawUtils.SHORT;
import static tech.tablesaw.io.saw.SawUtils.STRING;
import static tech.tablesaw.io.saw.SawUtils.TEXT;

import com.google.common.annotations.Beta;
import com.google.common.base.Preconditions;
import it.unimi.dsi.fastutil.bytes.Byte2IntMap;
import it.unimi.dsi.fastutil.bytes.Byte2ObjectMap;
import it.unimi.dsi.fastutil.bytes.ByteIterator;
import it.unimi.dsi.fastutil.floats.FloatIterator;
import it.unimi.dsi.fastutil.ints.Int2IntMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.longs.LongIterator;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import it.unimi.dsi.fastutil.shorts.Short2IntMap;
import it.unimi.dsi.fastutil.shorts.Short2ObjectMap;
import it.unimi.dsi.fastutil.shorts.ShortIterator;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UncheckedIOException;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.stream.Stream;
import org.iq80.snappy.SnappyFramedOutputStream;
import tech.tablesaw.api.BooleanColumn;
import tech.tablesaw.api.DateColumn;
import tech.tablesaw.api.DateTimeColumn;
import tech.tablesaw.api.DoubleColumn;
import tech.tablesaw.api.FloatColumn;
import tech.tablesaw.api.InstantColumn;
import tech.tablesaw.api.IntColumn;
import tech.tablesaw.api.LongColumn;
import tech.tablesaw.api.ShortColumn;
import tech.tablesaw.api.StringColumn;
import tech.tablesaw.api.Table;
import tech.tablesaw.api.TextColumn;
import tech.tablesaw.api.TimeColumn;
import tech.tablesaw.columns.Column;
import tech.tablesaw.columns.strings.ByteDictionaryMap;
import tech.tablesaw.columns.strings.DictionaryMap;
import tech.tablesaw.columns.strings.IntDictionaryMap;
import tech.tablesaw.columns.strings.ShortDictionaryMap;

@Beta
public class SawWriter {

  // We flush the output stream repeatedly to ensure it doesn't grow without bounds for big files
  private static final int FLUSH_AFTER_ITERATIONS = 20_000;

  private final SawMetadata sawMetadata;
  private final Table table;
  private final WriteOptions writeOptions;
  private final Path path;

  public SawWriter(Path path, Table table, WriteOptions options) {
    this.path = path;
    this.sawMetadata = new SawMetadata(table, options);
    this.table = table;
    this.writeOptions = options;
  }

  public SawWriter(String path, Table table, WriteOptions options) {
    this.path = setPath(path);
    this.sawMetadata = new SawMetadata(table, options);
    this.table = table;
    this.writeOptions = options;
  }

  public SawWriter(File file, Table table, WriteOptions options) {
    this.path = file.toPath();
    this.sawMetadata = new SawMetadata(table, options);
    this.table = table;
    this.writeOptions = options;
  }

  public SawWriter(Path path, Table table) {
    this.path = path;
    this.table = table;
    this.writeOptions = WriteOptions.defaultOptions();
    this.sawMetadata = new SawMetadata(table, writeOptions);
  }

  public SawWriter(File file, Table table) {
    this.path = file.toPath();
    this.table = table;
    this.writeOptions = WriteOptions.defaultOptions();
    this.sawMetadata = new SawMetadata(table, writeOptions);
  }

  public SawWriter(String path, Table table) {
    this.path = setPath(path);
    this.table = table;
    this.writeOptions = WriteOptions.defaultOptions();
    this.sawMetadata = new SawMetadata(table, writeOptions);
  }

  private Path setPath(String parentFolderName) {
    Preconditions.checkArgument(
        parentFolderName != null, "The folder name for the saw output cannot be null");
    Preconditions.checkArgument(
        !parentFolderName.isEmpty(), "The folder name for the saw output cannot be empty");
    return Paths.get(parentFolderName);
  }

  public String write() {
    try {
      return saveTable();
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }

  /**
   * Saves the data from the given table in the location specified by parentFolderName. Within that
   * folder each table has its own sub-folder, whose name is based on the name of the table.
   *
   * 

NOTE: If you store a table with the same name in the same folder. The data in that folder * will be over-written. * *

The storage format is the tablesaw compressed column-oriented format, which consists of a * set of file in a folder. The name of the folder is based on the name of the table. * * @return The path and name of the table */ private String saveTable() throws IOException { ExecutorService executorService = Executors.newFixedThreadPool(writeOptions.getThreadPoolSize()); CompletionService writerCompletionService = new ExecutorCompletionService<>(executorService); createFolder(path); // creates the folder containing the files String sawFolderName = SawUtils.makeName(table.name()); Path filePath = path.resolve(sawFolderName); if (Files.exists(filePath)) { try (Stream stream = Files.walk(filePath)) { stream .map(Path::toFile) .sorted((o1, o2) -> Comparator.reverseOrder().compare(o1, o2)) .forEach(File::delete); } } Files.createDirectories(filePath); writeTableMetadata(filePath, sawMetadata); try { List> columns = table.columns(); for (int i = 0; i < columns.size(); i++) { Column column = columns.get(i); String pathString = sawMetadata.getColumnMetadataList().get(i).getId(); writerCompletionService.submit( () -> { Path columnPath = filePath.resolve(pathString); writeColumn(columnPath.toString(), column); return null; }); } for (int i = 0; i < table.columnCount(); i++) { Future future = writerCompletionService.take(); future.get(); } } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw new IllegalStateException(e); } catch (ExecutionException e) { throw new IllegalStateException(e); } finally { executorService.shutdown(); } return filePath.toAbsolutePath().toString(); } private void createFolder(Path folderPath) { if (!Files.exists(folderPath)) { try { Files.createDirectories(folderPath); } catch (IOException e) { throw new UncheckedIOException(e); } } } private void writeColumn(String fileName, Column column) { try { final String typeName = column.type().name(); switch (typeName) { case FLOAT: writeColumn(fileName, (FloatColumn) column); break; case DOUBLE: writeColumn(fileName, (DoubleColumn) column); break; case INTEGER: writeColumn(fileName, (IntColumn) column); break; case BOOLEAN: writeColumn(fileName, (BooleanColumn) column); break; case LOCAL_DATE: writeColumn(fileName, (DateColumn) column); break; case LOCAL_TIME: writeColumn(fileName, (TimeColumn) column); break; case LOCAL_DATE_TIME: writeColumn(fileName, (DateTimeColumn) column); break; case STRING: writeColumn(fileName, (StringColumn) column); break; case TEXT: writeColumn(fileName, (TextColumn) column); break; case INSTANT: writeColumn(fileName, (InstantColumn) column); break; case SHORT: writeColumn(fileName, (ShortColumn) column); break; case LONG: writeColumn(fileName, (LongColumn) column); break; default: throw new IllegalArgumentException("Unhandled column type writing columns"); } } catch (IOException e) { throw new UncheckedIOException("IOException writing to file", e); } } private void writeColumn(String fileName, FloatColumn column) throws IOException { try (DataOutputStream dos = columnOutputStream(fileName)) { int i = 0; FloatIterator iterator = (FloatIterator) column.iterator(); while (iterator.hasNext()) { dos.writeFloat(iterator.nextFloat()); i++; if (i == FLUSH_AFTER_ITERATIONS) { dos.flush(); i = 0; } } dos.flush(); } } private void writeColumn(String fileName, DoubleColumn column) throws IOException { try (DataOutputStream dos = columnOutputStream(fileName)) { int i = 0; for (double d : column) { dos.writeDouble(d); i++; if (i == FLUSH_AFTER_ITERATIONS) { dos.flush(); i = 0; } } dos.flush(); } } /** * Writes out the values of the String column encoded as ints to minimize the time required for * subsequent reads * *

The files are written Strings first, then the ints that encode them so they can be read in * the opposite order */ private void writeColumn(String fileName, StringColumn column) throws IOException { try (DataOutputStream dos = columnOutputStream(fileName)) { // write the strings DictionaryMap lookupTable = column.getDictionary(); if (lookupTable.getClass().equals(ByteDictionaryMap.class)) { writeToStream((ByteDictionaryMap) lookupTable, dos); } else if (lookupTable.getClass().equals(ShortDictionaryMap.class)) { writeToStream((ShortDictionaryMap) lookupTable, dos); } else { writeToStream((IntDictionaryMap) lookupTable, dos); } } } /** * Writes the contents of the dictionaryMap to a stream in saw file format * * @param dos The stream to write on */ private void writeToStream(ByteDictionaryMap dictionary, DataOutputStream dos) { try { // write the maps // we write each one keys first, then values // the idea is that there are relatively few unique values so it's cheap to write them more // than once ObjectSet> entries = dictionary.getKeyValueEntries(); for (Byte2ObjectMap.Entry entry : entries) { dos.writeByte(entry.getByteKey()); } for (Byte2ObjectMap.Entry entry : entries) { dos.writeUTF(entry.getValue()); } ObjectSet counts = dictionary.getKeyCountEntries(); for (Byte2IntMap.Entry count : counts) { dos.writeByte(count.getByteKey()); } for (Byte2IntMap.Entry count : counts) { dos.writeInt(count.getIntValue()); } // write the values in column order, including repeats for (byte d : dictionary.values()) { dos.writeByte(d); } dos.flush(); } catch (IOException exception) { throw new UncheckedIOException(exception); } } /** * Writes the contents of the dictionaryMap to a stream in saw file format * * @param dos The stream to write on */ private void writeToStream(ShortDictionaryMap dictionary, DataOutputStream dos) { try { // write the maps // we write each one keys first, then values // the idea is that there are relatively few unique values so it's cheap to write them more // than once ObjectSet> entries = dictionary.getKeyValueEntries(); for (Short2ObjectMap.Entry entry : entries) { dos.writeShort(entry.getShortKey()); } for (Short2ObjectMap.Entry entry : entries) { dos.writeUTF(entry.getValue()); } ObjectSet counts = dictionary.getKeyCountEntries(); for (Short2IntMap.Entry count : counts) { dos.writeShort(count.getShortKey()); } for (Short2IntMap.Entry count : counts) { dos.writeInt(count.getIntValue()); } // write the values in column order, including repeats for (short d : dictionary.values()) { dos.writeShort(d); } dos.flush(); } catch (IOException exception) { throw new UncheckedIOException(exception); } } /** * Writes the contents of the dictionaryMap to a stream in saw file format * * @param dos The stream to write on */ private void writeToStream(IntDictionaryMap dictionary, DataOutputStream dos) { try { // write the maps // we write each one keys first, then values // the idea is that there are relatively few unique values so it's cheap to write them more // than once ObjectSet> entries = dictionary.getKeyValueEntries(); for (Int2ObjectMap.Entry entry : entries) { dos.writeInt(entry.getIntKey()); } for (Int2ObjectMap.Entry entry : entries) { dos.writeUTF(entry.getValue()); } ObjectSet counts = dictionary.getKeyCountEntries(); for (Int2IntMap.Entry count : counts) { dos.writeInt(count.getIntKey()); } for (Int2IntMap.Entry count : counts) { dos.writeInt(count.getIntValue()); } // write the values in column order, including repeats for (int d : dictionary.values()) { dos.writeInt(d); } dos.flush(); } catch (IOException exception) { throw new UncheckedIOException(exception); } } DataOutputStream columnOutputStream(String fileName) throws IOException { FileOutputStream fos = new FileOutputStream(fileName); if (sawMetadata.getCompressionType().equals(CompressionType.NONE)) { return new DataOutputStream(fos); } else { SnappyFramedOutputStream sos = new SnappyFramedOutputStream(fos); return new DataOutputStream(sos); } } /** Writes out the values of the TextColumn */ private void writeColumn(String fileName, TextColumn column) throws IOException { try (DataOutputStream dos = columnOutputStream(fileName)) { int i = 0; for (String s : column) { dos.writeUTF(s); i++; if (i == FLUSH_AFTER_ITERATIONS) { dos.flush(); i = 0; } } dos.flush(); } } private void writeColumn(String fileName, IntColumn column) throws IOException { try (DataOutputStream dos = columnOutputStream(fileName)) { writeIntStream(dos, column.intIterator()); dos.flush(); } } // TODO(lwhite): save the column using integer compression? private void writeIntStream(DataOutputStream dos, IntIterator iterator) throws IOException { int i = 0; while (iterator.hasNext()) { dos.writeInt(iterator.nextInt()); i++; if (i == FLUSH_AFTER_ITERATIONS) { dos.flush(); i = 0; } } } private void writeLongStream(DataOutputStream dos, LongIterator iterator) throws IOException { int i = 0; while (iterator.hasNext()) { dos.writeLong(iterator.nextLong()); i++; if (i == FLUSH_AFTER_ITERATIONS) { dos.flush(); i = 0; } } } private void writeColumn(String fileName, ShortColumn column) throws IOException { try (DataOutputStream dos = columnOutputStream(fileName)) { int i = 0; ShortIterator iterator = (ShortIterator) column.iterator(); while (iterator.hasNext()) { dos.writeShort(iterator.nextShort()); i++; if (i == FLUSH_AFTER_ITERATIONS) { dos.flush(); i = 0; } } dos.flush(); } } private void writeColumn(String fileName, LongColumn column) throws IOException { try (DataOutputStream dos = columnOutputStream(fileName)) { writeLongStream(dos, column.longIterator()); dos.flush(); } } private void writeColumn(String fileName, DateColumn column) throws IOException { try (DataOutputStream dos = columnOutputStream(fileName)) { writeIntStream(dos, column.intIterator()); dos.flush(); } } private void writeColumn(String fileName, DateTimeColumn column) throws IOException { try (DataOutputStream dos = columnOutputStream(fileName)) { writeLongStream(dos, column.longIterator()); dos.flush(); } } private void writeColumn(String fileName, InstantColumn column) throws IOException { try (DataOutputStream dos = columnOutputStream(fileName)) { writeLongStream(dos, column.longIterator()); } } private void writeColumn(String fileName, TimeColumn column) throws IOException { try (DataOutputStream dos = columnOutputStream(fileName)) { writeIntStream(dos, column.intIterator()); dos.flush(); } } // TODO(lwhite): save the column using compressed bitmap? private void writeColumn(String fileName, BooleanColumn column) throws IOException { try (DataOutputStream dos = columnOutputStream(fileName)) { int i = 0; ByteIterator iterator = column.byteIterator(); while (iterator.hasNext()) { dos.writeByte(iterator.nextByte()); i++; if (i == FLUSH_AFTER_ITERATIONS) { dos.flush(); i = 0; } } dos.flush(); } } /** * Writes out a json-formatted representation of the given {@code table}'s metadata to the given * {@code file} * * @param filePath The full file path including file name * @throws IOException if the file can not be read */ private void writeTableMetadata(Path filePath, SawMetadata metadata) throws IOException { Path metaDataPath = filePath.resolve(METADATA_FILE_NAME); try { Files.createFile(metaDataPath); } catch (FileAlreadyExistsException e) { /*overwrite existing file*/ } try (FileOutputStream fOut = new FileOutputStream(metaDataPath.toFile()); OutputStreamWriter myOutWriter = new OutputStreamWriter(fOut)) { String output = metadata.toJson(); myOutWriter.append(output); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy