package org.broadinstitute.hellbender.utils.tsv;
import com.opencsv.CSVWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import org.broadinstitute.hellbender.utils.Utils;
import java.io.*;
/**
* Class to write tab separated value files.
*
* The columns (and their names) are passed as a constructor parameter along with the output {@link Path path}
* or {@link Writer writer}.
*
*
* Extending classes must indicate how to transcribe a row record of type {@link R} into the corresponding
* data-line in the output by overriding {@link #composeLine(R,DataLine)}.
*
*
* Example:
*
* public class Person {
* public final String name;
* public final int age;
* public final double netWorth;
* }
*
* public class PeopleTableWriter extends TableWriter<Person> {
*
* public PeopleTableWriter(final Path path) throws IOException {
* super(path, new TableColumnCollection("name","age","net.worth"));
* }
*
* @Override
* protected void composeLine(final Person person, final DataLine dataLine) {
* dataLine.setAll(person.name, "" + person.age, "" + person.netWorth);
* }
* }
*
*
*
* You must use the {@link DataLine} instance passed and no other.
*
*
* Instead of passing all the values as converted strings in column order, you may opt to use the {@link DataLine#set}
* method family to set values one by one using the column index or the column name, like so:
*
*
* Example (using the column index):
*
* @Override
* protected void composeLine(final Person person, final DataLine dataLine) {
* dataLine
* .set(0,person.name)
* .set(1,person.age)
* .set(2,person.netWorth);
* }
*
*
*
* Example (using column names):
*
* @Override
* protected void composeLine(final Person person, final DataLine dataLine) {
* dataLine
* .set("name",person.name)
* .set("age",person.age)
* .set("net.worth",person.netWorth);
* }
*
* Notice that you don't need to explicitly convert either the age or the net worth into a
* string, thanks to the various {@link DataLine#set set} overloads.
*
*
* Alternatively, if you know the column order, which should quite often be the case, you can avoid
* indexing altogether by using {@link DataLine#append append} operations instead:
*
* @Override
* protected void composeLine(final Person person, final DataLine dataLine) {
* dataLine
* .append(person.name)
* .append(person.age)
* .append(person.netWorth);
* }
*
*
*
* At any time the implementation can query the correspondence between column names and positions within the data-line
* by querying the {@link TableColumnCollection} object directly, which can be obtained via the data-line's {@link DataLine#columns() columns()} method.
*
*
* Example (using column names):
*
* @Override
* protected void composeLine(final Person person, final DataLine dataLine) {
* dataLine
* .set("name",person.name)
* .set("age",person.age);
*
* if (dataLine.columns().contains("net.worth"))
* dataLine.set("net.worth",person.netWorth);
* }
*
*
*
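* For completeness, here is a sketch of how a finished writer might be used; the subclass name, output path
* and record collection are just illustrative placeholders, not part of this class:
*
* try (final PeopleTableWriter writer = new PeopleTableWriter(outputPath)) {
* writer.writeMetadata("source", "census");
* writer.writeComment("people with a known net worth");
* writer.writeAllRecords(people);
* }
*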
* @param <R> the row record type.
* @author Valentin Ruano-Rubio <[email protected]>
*/
public abstract class TableWriter<R> implements Closeable {
private long lineNumber;
/**
* CSV writer used to do the actual writing.
*/
private final CSVWriter writer;
/**
* The table column names.
*/
private final TableColumnCollection columns;
/**
* Whether the header column name line has been written or not.
*/
private boolean headerWritten = false;
// Tag prepended to metadata comment lines (currently empty, so metadata lines read as #key=value).
public static final String METADATA_TAG = "";
/**
* Creates a new table writer given the file and column names.
*
* @param path the destination path.
* @param tableColumns the table column names.
* @throws IllegalArgumentException if either {@code path} or {@code tableColumns} is {@code null}.
* @throws IOException if one was raised when opening the destination file for writing.
*/
public TableWriter(final Path path, final TableColumnCollection tableColumns) throws IOException {
this(
new OutputStreamWriter(
Files.newOutputStream(Utils.nonNull(path, "The path cannot be null."))),
tableColumns);
}
/**
* Creates a new table writer given the destination writer and column names.
*
* @param writer the destination writer.
* @param columns the table column names.
* @throws IllegalArgumentException if either {@code writer} or {@code columns} is {@code null}.
* @throws IOException if one was raised when preparing the destination writer for writing.
*/
public TableWriter(final Writer writer, final TableColumnCollection columns) throws IOException {
this.columns = Utils.nonNull(columns, "The columns cannot be null.");
this.writer = new CSVWriter(Utils.nonNull(writer, "the input writer cannot be null"),
TableUtils.COLUMN_SEPARATOR, TableUtils.QUOTE_CHARACTER, TableUtils.ESCAPE_CHARACTER);
}
/**
* Writes a comment into the output.
*
* This can be invoked at any time; comment lines can be present anywhere in the file.
*
*
* Comments written before any record will be output before the header line.
*
*
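* For example, assuming '#' is the {@link TableUtils#COMMENT_PREFIX comment prefix}, {@code writeComment("generated by the pipeline")}
* would emit the line {@code #generated by the pipeline} (the comment text is just an illustrative value).
*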
* @param comment the comment to write out.
* @throws IllegalArgumentException if {@code comment} is {@code null}.
* @throws IOException if any was raised by this operation.
*/
public final void writeComment(final String comment) throws IOException {
Utils.nonNull(comment, "The comment cannot be null.");
writer.writeNext(new String[]{TableUtils.COMMENT_PREFIX + comment}, false);
lineNumber++;
}
// Writes a comment line of the form #key=value; must be invoked before the header line is written.
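// For instance, writeMetadata("version", "1.0") would emit the comment line "#version=1.0" (key and value are illustrative).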
public final void writeMetadata(final String key, final String value) throws IOException {
Utils.nonNull(key);
Utils.nonNull(value);
Utils.validateArg(!headerWritten, "Metadata must precede the header.");
writeComment(METADATA_TAG + key + "=" + value);
}
/**
* Writes a new record.
*
* @param record the record to write.
* @throws IOException if it was raised when writing the record.
* @throws ClassCastException if {@code record} is not of the correct type
* for this writer.
* @throws IllegalArgumentException if {@code record} is {@code null} or it is not a valid record
* as per the implementation of this writer (see {@link #composeLine}).
*/
public void writeRecord(final R record) throws IOException {
Utils.nonNull(record, "The record cannot be null.");
writeHeaderIfApplies();
final DataLine dataLine = new DataLine(lineNumber + 1, columns,IllegalArgumentException::new);
composeLine(record,dataLine);
writer.writeNext(dataLine.unpack(), false);
lineNumber++;
}
/**
* Writes all the records in an {@link Iterable}.
*
* Records are written in the order they appear in the input {@link Iterable}.
*
*
* @param records to write.
* @throws IOException if any raised when writing any of the records.
* @throws ClassCastException if any of the records is not of the correct type
* for this writer.
* @throws IllegalArgumentException if {@code records} is {@code null} or it contains
* some values that would cause such an exception when {@link #writeRecord} is called on
* that value. Previous records in the iterable would already have been written by then.
*/
public final void writeAllRecords(final Iterable<R> records) throws IOException {
Utils.nonNull(records, "the record iterable cannot be null");
for (final R record : records) {
writeRecord(record);
}
}
@Override
public final void close() throws IOException {
writeHeaderIfApplies();
writer.close();
}
/**
* Pushes in-memory buffered content to the output stream.
* @throws IOException if any was raised when flushing into the destination writer.
*/
public final void flush() throws IOException {
writeHeaderIfApplies();
writer.flush();
}
/**
* Writes the header if it has not been written already.
*
* The header is written automatically before the first record is written or when the writer is closed
* and no record was written.
*
*
* Comments written using {@link #writeComment} before any record will precede the header
* unless you invoke this method first.
*
*
* Once the header line has been written, invoking this method does not have any effect.
*
*
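* For instance, on a writer with columns "name", "age" and "net.worth", calling {@link #writeComment writeComment("a comment")}
* before the first record would produce output roughly like this (assuming '#' is the comment prefix and tab separation
* between columns; the record values are illustrative):
*
* #a comment
* name  age  net.worth
* John  45   1234.5
*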
* @throws IOException if any raised when writing into the destination writer.
*/
public void writeHeaderIfApplies() throws IOException {
if (!headerWritten) {
writer.writeNext(columns.names().toArray(new String[columns.columnCount()]), false);
lineNumber++;
}
headerWritten = true;
}
/**
* Composes the data-line to write into the output to represent a given record.
*
* Note that the first element cannot contain the {@link TableUtils#COMMENT_PREFIX comment prefix}.
* If that is a genuine valid value for the first column, you should consider re-ordering the columns or
* changing the encoding of the first column to avoid this issue.
*
*
* Both inputs, {@code record} and {@code dataLine}, are guaranteed not to be {@code null}.
*
*
* @param record the record to write into the data-line.
* @param dataLine the destination data-line object.
* @throws ClassCastException if {@code record} is not of the correct type
* for this writer.
* @throws IllegalArgumentException if there is some conversion issue that does
* not allow the writer to generate a valid string array to encode the record.
*/
protected abstract void composeLine(final R record, final DataLine dataLine);
}