/*
 * org.broadinstitute.hellbender.utils.tsv.TableReader Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of gatk Show documentation
 * Development on GATK 4
 * The newest version!
 */
package org.broadinstitute.hellbender.utils.tsv;

import com.opencsv.CSVReader;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;

import java.io.*;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Reads the contents of a tab separated value formatted text input into
 * records of an arbitrary type {@code R}.
 *
 * <h3>Format description</h3>
 * <p>
 * Tab separated values may contain any number of comment lines (started with {@link TableUtils#COMMENT_PREFIX}),
 * a column name containing line (aka. the header line) and any number of data lines, one per record.
 * </p>
 * <p>
 * While comment lines can contain any sequence of characters, the header and data lines are divided in
 * columns using exactly one {@link TableUtils#COLUMN_SEPARATOR} character.
 * Blank lines are treated as having a single column with the empty string as the only value (or column name).
 * </p>
 * <p>
 * The header line is the first non-comment line, whereas any other non-comment line after that is
 * considered a data line. Comment lines can appear anywhere in the file and their
 * presence is ignored by the reader.
 * </p>
 * <p>
 * The header line values, the column names, must all be different (otherwise a formatting exception will be
 * thrown), and all data lines have to have as many values as the header line.
 * </p>
 * <p>
 * Values can be quoted using {@link TableUtils#QUOTE_CHARACTER}; this becomes necessary when the value contains
 * any special formatting characters like a new-line, the quote character itself, the column separator character or
 * the escape character {@link TableUtils#ESCAPE_CHARACTER}.
 * Within quotes, special characters must be escaped using the {@link TableUtils#ESCAPE_CHARACTER}.
 * </p>
 *
 * <h3>Implementing your own reader</h3>
 * <p>
 * Implementations control how instances of {@code R} are instantiated by implementing
 * {@link #createRecord(DataLine) createRecord}. This method is passed a non-null {@link DataLine}, containing
 * no null values, with exactly as many elements as columns were passed to
 * {@link #processColumns(TableColumnCollection)} earlier on in the execution.
 * </p>
 * <p>
 * The i-th element in the input data line represents the value for the i-th column for that record.
 * </p>
 * <p>
 * The exact list of column names is always accessible through {@link #columns()}.
 * </p>
 * <p>
 * Implementations can also override {@link #processColumns} (that by default does nothing) in order to
 * get prepared to receive data lines following that format, or simply to verify
 * that the sequence of column names is the expected one, throwing an exception if not.
 * </p>
 * <p>
 * For the latter, extending classes must use {@link #formatException(String)} to create that exception,
 * including an explanation message describing the format violation.
 * </p>
 * <p>
 * Notice that the header (and any comment line preceding it) is processed from within the constructor,
 * so {@link #processColumns} and {@link #processCommentLine(String, long)} overrides may run before the
 * extending class' own field initializers and constructor body have executed.
 * </p>
 * <p>
 * Example:
 * </p>
 * <pre>
 *         public class Person {
 *             public final String name;
 *             public final int age;
 *             public final double netWorth;
 *         }
 *
 *         public class PeopleTableReader extends TableReader&lt;Person&gt; {
 *             // ...
 *
 *             // If you don't trust the columns that you are given,
 *             // you can check them here (see also {@link TableUtils#checkMandatoryColumns}):
 *             &#64;Override
 *             public void processColumns(final TableColumnCollection columns) {
 *                 if (!columns.containsExactly("name","age","net.worth"))
 *                     throw formatException("invalid column names");
 *             }
 *
 *             &#64;Override
 *             protected Person createRecord(final DataLine dataLine) {
 *                  return new Person(
 *                      dataLine.get("name"),
 *                      dataLine.getInt("age"),
 *                      dataLine.getDouble("net.worth")
 *                  );
 *             }
 *         }
 * </pre>
 *
 * @param <R> the record type for the reader.
 * @author Valentin Ruano-Rubio
 */
public abstract class TableReader<R> implements Closeable, Iterable<R> {

    /**
     * Name of the input source.
     * It can be {@code null} indicating that no name was provided at construction.
     */
    private final String source;

    /**
     * Input text reader.
     * <p>
     * Keeps track of the last line number read for error reporting purposes.
     * </p>
     */
    private final LineNumberReader reader;

    /**
     * Holds a reference to the column names. It is {@code null} until the header line has been processed.
     */
    private TableColumnCollection columns;

    /**
     * Key-value pairs collected from metadata comment lines (those starting with
     * {@link TableWriter#METADATA_TAG}) found so far in the input.
     */
    private final Map<String, String> metadata = new HashMap<>();

    /**
     * Holds a reference to the csvReader object use to read and parse the input
     * into {@link String} arrays.
     */
    private final CSVReader csvReader;

    /**
     * Indicates whether the reader has successfully fetched the next record.
     * If {@code true} the content of {@link #nextRecord} represent the next record to be returned
     * by {@link #readRecord} ({@code null} if we reached the end of the table), otherwise {@link #nextRecord}
     * reference is invalid and the next record must be fetched using {@link #fetchNextRecord()}.
     */
    private boolean nextRecordFetched = false;

    /**
     * Holds a reference to the next record.
     * <p>
     * This is {@code null} when we reached the end of the source.
     * </p>
     */
    private R nextRecord;

    /**
     * Creates a new table reader given the input file path.
     * <p>
     * This operation will read the first lines of the input file until the
     * column name header line is found.
     * </p>
     * <p>
     * The source's name used in error reporting is the path's string representation as returned by
     * {@link Path#toString}.
     * </p>
     *
     * @param path the input file path.
     * @throws IllegalArgumentException if {@code path} is {@code null}.
     * @throws IOException              if any is raised when accessing the file.
     */
    public TableReader(final Path path) throws IOException {
        this(
                Utils.nonNull(path, "the input file cannot be null").toString(),
                IOUtils.makeReaderMaybeGzipped(path));
    }

    /**
     * Creates a new anonymous (unnamed source) table reader given an input {@link Reader}.
     * <p>
     * This operation will read the first lines of the input until the
     * column name header line is found.
     * </p>
     *
     * @param sourceReader the source text reader.
     * @throws IllegalArgumentException if {@code sourceReader} is {@code null}.
     * @throws IOException if any is raised when reading from {@code sourceReader}.
     */
    public TableReader(final Reader sourceReader) throws IOException {
        this(null, sourceReader);
    }

    /**
     * Creates a new table reader given an input {@link Reader} and an optional source name.
     * <p>
     * This operation will read the first lines of the input until the
     * column name header line is found.
     * </p>
     *
     * @param sourceName   name of the source to use in error messages. It can be {@code null}, indicating that is anonymous.
     * @param sourceReader reader to the text to process.
     * @throws IllegalArgumentException if {@code sourceReader} is {@code null}.
     * @throws IOException              if is raised when reading from the source.
     */
    protected TableReader(final String sourceName, final Reader sourceReader) throws IOException {
        Utils.nonNull(sourceReader, "the reader cannot be null");

        this.source = sourceName;
        // Reuse the caller's LineNumberReader if there is one so that line numbers stay consistent with it.
        this.reader = sourceReader instanceof LineNumberReader ? (LineNumberReader) sourceReader : new LineNumberReader(sourceReader);
        this.csvReader = new CSVReader(this.reader, TableUtils.COLUMN_SEPARATOR, TableUtils.QUOTE_CHARACTER, TableUtils.ESCAPE_CHARACTER);
        findAndProcessHeaderLine();
        this.nextRecordFetched = false;
    }

    /**
     * Process the first lines of the input source until the header line.
     *
     * @throws IOException            if an {@link IOException} occurred when reading from the source.
     * @throws UserException.BadInput if there is formatting error in the input.
     */
    protected void findAndProcessHeaderLine() throws IOException {
        final String[] headerLine = skipCommentLines();
        if (headerLine == null) {
            throw formatException("premature end of table: header line not found");
        }
        TableColumnCollection.checkNames(headerLine, UserException.BadInput::new);
        columns = new TableColumnCollection(headerLine);
        processColumns(columns);
    }

    /**
     * Checks whether a line is a comment line or not.
     *
     * @param line input line already split into line-values.
     * @return {@code true} if {@code line} seems to be a comment line.
     */
    protected boolean isCommentLine(final String[] line) {
        return line.length > 0 && line[0].startsWith(TableUtils.COMMENT_PREFIX);
    }

    /**
     * Composes the exception to be thrown due to a formatting error, including the source name (if any)
     * and the current line number.
     * <p>
     * The input {@code message} can be omitted by providing a {@code null} value.
     * </p>
     *
     * @param message custom error message.
     * @return never {@code null}.
     */
    protected final UserException.BadInput formatException(final String message) {
        return new UserException.BadInput(formatExceptionMessageWithLocationInfo(message));
    }

    /**
     * Composes the exception to be thrown due to a formatting error when there is no meaningful
     * location in the source to report.
     * <p>
     * The input {@code message} can be omitted by providing a {@code null} value.
     * </p>
     *
     * @param message custom error message.
     * @return never {@code null}.
     */
    protected final UserException.BadInput formatExceptionWithoutLocation(final String message) {
        return new UserException.BadInput(formatExceptionMessageWithoutLocationInfo(message));
    }

    /**
     * Composes the error exception message string with location information.
     * <p>
     * The input {@code message} can be omitted by providing a {@code null} value.
     * </p>
     *
     * @param message custom error message.
     * @return never {@code null}.
     */
    private String formatExceptionMessageWithLocationInfo(final String message) {
        // The custom message is appended after formatting, never made part of the format string:
        // otherwise a '%' in the message would make String.format throw instead of producing the error text.
        final String location = source == null
                ? String.format("format error at line %d", reader.getLineNumber())
                : String.format("format error in '%s' at line %d", source, reader.getLineNumber());
        return message == null ? location : location + ": " + message;
    }

    /**
     * Composes the error exception message string without location information.
     * <p>
     * The input {@code message} can be omitted by providing a {@code null} value.
     * </p>
     *
     * @param message custom error message.
     * @return never {@code null}.
     */
    private String formatExceptionMessageWithoutLocationInfo(final String message) {
        return message == null ? "format error" : "format error: " + message;
    }

    /**
     * Process the header line's column names.
     * <p>
     * Implementations must use {@link #formatException(String)} to create the exception to throw in case
     * there is any formatting issue.
     * </p>
     *
     * @param tableColumns columns found in the input. It is guaranteed not to be
     *                     {@code null} and not to contain any {@code null} values.
     * @throws UserException.BadInput if there is a formatting issue.
     */
    protected void processColumns(@SuppressWarnings("unused") final TableColumnCollection tableColumns) {
        // nothing by default.
    }

    /**
     * Returns the column collection for this reader.
     *
     * @throws IllegalStateException if this method is invoked before the table columns can be determined
     *  (e.g. when processing a comment line before the header extending {@link #processCommentLine(String, long)}).
     * @return never {@code null}.
     */
    public TableColumnCollection columns() {
        Utils.validate(columns != null, "columns are null");
        return columns;
    }

    /**
     * Returns the next record from the source.
     *
     * @return {@code null} if there is no more record in the input.
     * @throws IOException if a {@link IOException} was thrown when reading from the input.
     */
    public final R readRecord() throws IOException {
        final R result = peekNextRecord();
        if (result != null) {
            // Mark the record as consumed; the end-of-input state (null) is sticky.
            nextRecordFetched = false;
        }
        return result;
    }

    /**
     * Reads the record from a string rather than from the input reader.
     *
     * @param line the text of the single line to parse.
     * @return {@code null} for comment or header lines, a non-null record otherwise.
     * @throws UserException.BadInput if the number of values in {@code line} does not match the number of columns.
     * @throws GATKException if the underlying parser raised an {@link IOException} processing {@code line}.
     */
    public final R readRecord(final String line) {
        final String[] fields;
        try {
            fields = csvReader.getParser().parseLine(line);
        } catch (final IOException ex) {
            throw new GATKException("the single line input is in fact a multi-line entry", ex);
        }
        if (isCommentLine(fields) || isHeaderLine(fields)) {
            return null;
        } else if (fields.length != columns.columnCount()) {
            throw formatExceptionWithoutLocation("invalid number of columns");
        } else {
            return createRecord(new DataLine(fields, columns, this::formatExceptionWithoutLocation));
        }
    }

    /**
     * Makes sure the next record has been fetched and returns it without consuming it.
     * <p>
     * The fetched flag is only raised once the fetch completes normally, so that a failed fetch
     * (I/O or format error) never leaves a stale, already returned record marked as pending.
     * </p>
     *
     * @return {@code null} if there is no more record in the input.
     * @throws IOException if a {@link IOException} was thrown when reading from the input.
     */
    private R peekNextRecord() throws IOException {
        if (!nextRecordFetched) {
            nextRecord = fetchNextRecord();
            nextRecordFetched = true;
        }
        return nextRecord;
    }

    /**
     * Same as {@link #peekNextRecord()} but wrapping any {@link IOException} into an
     * {@link UncheckedIOException}, as required by the {@link Iterator} interface.
     *
     * @return {@code null} if there is no more record in the input.
     */
    private R peekNextRecordUnchecked() {
        try {
            return peekNextRecord();
        } catch (final IOException ex) {
            throw new UncheckedIOException(ex);
        }
    }

    /**
     * Fetch the next record from the source.
     * <p>
     * Comment lines are handed to {@link #processCommentLine(String, long)}, header repetitions are skipped
     * and so are data lines for which {@link #createRecord} returns {@code null}.
     * </p>
     *
     * @return {@code null} if there is no more record in the input.
     * @throws IOException if a {@link IOException} was thrown when reading from the input.
     */
    private R fetchNextRecord() throws IOException {
        String[] line;
        while ((line = csvReader.readNext()) != null) {
            if (isCommentLine(line)) {
                processCommentLine(line, reader.getLineNumber());
                continue;
            }
            if (isHeaderLine(line)) {
                continue;
            }
            if (line.length != columns.columnCount()) {
                throw formatException(String.format("mismatch between number of values in line (%d) and number of columns (%d)", line.length, columns.columnCount()));
            }
            final R result = createRecord(new DataLine(reader.getLineNumber(), line, columns, this::formatException));
            if (result != null) {
                return result;
            }
        }
        return null;
    }

    /**
     * Rebuilds the text of a comment line from its parsed values (dropping the comment prefix) and
     * passes it on to {@link #processCommentLine(String, long)}.
     *
     * @param line the comment line already split into values.
     * @param lineNumber the source line number that contains the comment line.
     */
    private void processCommentLine(final String[] line, final long lineNumber) {
        final StringBuilder builder = new StringBuilder();
        builder.append(line[0].substring(TableUtils.COMMENT_PREFIX.length()));
        for (int i = 1; i < line.length; i++) {
            builder.append(TableUtils.COLUMN_SEPARATOR_STRING).append(line[i]);
        }
        processCommentLine(builder.toString(), lineNumber);
    }

    /**
     * Called with the content of the comment line every time one is found in the input.
     * <p>
     *     The comment prefix string ({@link TableUtils#COMMENT_PREFIX}) is not included in the
     *     input string.
     * </p>
     * <p>
     *     Notice that comments might be present before the header line, in which case
     *     {@link #columns()} cannot be used yet.
     * </p>
     * <p>
     *     By default, comments that start with {@link TableWriter#METADATA_TAG} are interpreted as
     *     {@code key=value} pairs and stored in the metadata map; the key ends at the first {@code '='} so
     *     values may contain that character or be empty. Any other comment is ignored.
     * </p>
     *
     * @param commentText the input comment string.
     * @param lineNumber the source line number that contains the comment line.
     * @throws UserException.BadInput if a metadata comment does not contain the {@code '='} separator.
     */
    protected void processCommentLine(final String commentText, final long lineNumber) {
        if (commentText.startsWith(TableWriter.METADATA_TAG)) {
            final String keyAndValue = commentText.substring(TableWriter.METADATA_TAG.length());
            final int separatorIndex = keyAndValue.indexOf('=');
            if (separatorIndex < 0) {
                throw formatException("metadata comment line without a '=' key-value separator: " + commentText);
            }
            metadata.put(keyAndValue.substring(0, separatorIndex), keyAndValue.substring(separatorIndex + 1));
        }
        // do nothing with non-metadata lines by default
    }

    /**
     * Determines whether a line is a repetition of the header.
     * <p>
     *     By default, input lines that match the header exactly are ignored. By overriding this method
     *     extending classes may change what is interpreted as a repetition of the header (e.g. just treat such
     *     lines as regular data lines).
     * </p>
     *
     * @param line the input line.
     * @return {@code true} if the input line is a header line and it should be ignored.
     */
    protected boolean isHeaderLine(final String[] line) {
        return columns.matchesExactly(line);
    }

    /**
     * Skips comment lines from the input.
     * <p>
     * It returns the contents of the first non comment line found. As a side effect, every comment line
     * skipped is passed to {@link #processCommentLine(String, long)}, which by default collects metadata
     * key-value pairs from comment lines carrying the {@link TableWriter#METADATA_TAG} tag.
     * </p>
     *
     * @return {@code null} if we reached the end of the source, the next non-comment line content otherwise.
     * @throws IOException if it was raised when reading for the source.
     */
    private String[] skipCommentLines() throws IOException {
        String[] line;
        while ((line = csvReader.readNext()) != null && isCommentLine(line)) {
            processCommentLine(line, reader.getLineNumber());
        }
        return line;
    }

    /**
     * Transforms a data-line column values into a record.
     * <p>
     * Implementation should use {@link #formatException(String)} to indicate a formatting error
     * that makes impossible to create a instance of {@code R} given the input line values {@code dataLine}.
     * </p>
     * <p>
     * When reading from the input source (as opposed to {@link #readRecord(String)}), a {@code null}
     * return makes the reader skip that data line and move on to the next one.
     * </p>
     *
     * @param dataLine values corresponding to the column names that was passed earlier to {@link #processColumns(TableColumnCollection)}.
     *                 it is guaranteed to not be {@code null}, contain no {@code null} value and have the same columns
     *                 as this readers' (accessible through {@link #columns()}).
     * @return the record for the data line.
     */
    protected abstract R createRecord(final DataLine dataLine);

    /**
     * Closes the underlying CSV reader.
     *
     * @throws IOException if raised when closing the underlying reader.
     */
    @Override
    public void close() throws IOException {
        csvReader.close();
    }

    /**
     * Returns an iterator on the remaining records in
     * the input.
     * <p>
     * The iterator shares its state with this reader: records returned by it are consumed as if
     * {@link #readRecord()} was called directly. Any {@link IOException} is propagated wrapped in
     * an {@link UncheckedIOException}.
     * </p>
     *
     * @return never {@code null}.
     */
    @Override
    public Iterator<R> iterator() {
        return new Iterator<R>() {

            @Override
            public boolean hasNext() {
                return peekNextRecordUnchecked() != null;
            }

            @Override
            public R next() {
                final R result = peekNextRecordUnchecked();
                if (result == null) {
                    throw new NoSuchElementException("there is no more record in the input");
                }
                nextRecordFetched = false;
                return result;
            }
        };
    }

    /**
     * Returns an ordered, non-null element spliterator of unknown size over the remaining records.
     *
     * @return never {@code null}.
     */
    @Override
    public Spliterator<R> spliterator() {
        return Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED | Spliterator.NONNULL);
    }

    /**
     * Returns a stream on the remaining records in the source.
     * <p>
     * Notice that the returned stream will consume records as if you were calling {@link #readRecord} directly.
     * </p>
     * <p>
     * Any {@link IOException} raised when using the stream will be propagated up wrapped in a {@link UncheckedIOException}.
     * </p>
     * <p>
     * Any format exception will still be indicated with a {@link UserException.BadInput}.
     * </p>
     *
     * @return never {@code null}.
     */
    public Stream<R> stream() {
        return Utils.stream(this);
    }


    /**
     * Read the remaining records into a list.
     * <p>
     *     Notice that this operation does not close the reader.
     * </p>
     *
     * @return never {@code null}, but potentially empty.
     */
    public List<R> toList() {
        return stream().collect(Collectors.toList());
    }

    /**
     * Returns the metadata key-value pairs collected from the comment lines read so far.
     * <p>
     *     The returned map is the reader's own live map, so it may gain entries as more of
     *     the input is read.
     * </p>
     *
     * @return never {@code null}, but potentially empty.
     */
    public Map<String, String> getMetadata() {
        return metadata;
    }

    /**
     * Returns the reader source name.
     *
     * @return null if the source name cannot be determined.
     */
    public String getSource() {
        return source;
    }
}