All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.utils.tsv.TableReader Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.utils.tsv;

import com.opencsv.CSVReader;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;

import java.io.*;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Reads the contents of a tab separated value formatted text input into
 * records of an arbitrary type {@link R}.
 * 

Format description

*

* Tab separated values may contain any number of comment lines (started with {@link TableUtils#COMMENT_PREFIX}), * a column name containing line (aka. the header line) and any number of data lines one per record. *

*

While comment lines can contain any sequence of characters, the header and data lines are divided in * columns using exactly one {@link TableUtils#COLUMN_SEPARATOR} character.

*

Blank lines are treated is having a single column with the empty string as the only value (or column name)

*

* The header line is the first non-comment line, whereas any other non-comment line after that is * considered a data line. Comment lines can appear anywhere in the file and their * present is ignored by the reader. *

*

* The header line values, the column names, must all be different (otherwise a formatting exception will be thrown), and * all data lines have to have as many values as the header line. *

*

Values can be quoted using {@link TableUtils#QUOTE_CHARACTER} becoming necessary when the value contain * any special formatting characters like a new-line, the quote character itself, the column separator character or * the escape character {@link TableUtils#ESCAPE_CHARACTER}.

*

Within quotes, especial characters must be escaped using the {@link TableUtils#ESCAPE_CHARACTER}

*

Implementing your own reader

*

* Implementations control how instances of {@link R} are instantiated by extending * {@link #createRecord(DataLine) createRecord}. This method is passed an non-null nor null containing * {@link DataLine} with exactly as many elements are columns where passed to the * {@link #processColumns(TableColumnCollection)} earlier on in the execution. *

*

* The i-th element in the input array represent the value for the i-th column for that record. *

*

* The exact list (array) of column names are always accessible through {@link #columns}. *

*

* Implementations can also override {@link #processColumns} (that by default does nothing) in order to * get prepared to received data lines following the that format or simply to verify that * that the sequence of column names is expected, throwing an exception if not. *

*

* For the later, extending classes must use {@link #formatException(String)} to create that exception including an explanation * message describing the format violation. *

*

* Example: *

 *         public class Person {
 *             public final String name;
 *             public final int age;
 *             public final double netWorth;
 *         }
 *
 *         public class PeopleTableReader extends TableReader<Person> {
 *             // ...
 *
 *             // If you don't trust the columns that you are given,
 *             // you can check them here (see also {@link TableUtils#checkMandatoryColumns}):
 *             @Override
 *             public void processColumns(final TableColumns columns) {
 *                 if (!columns.containsExactly("name","age","net.worth"))
 *                     throw formatException("invalid column names")
 *             }
 *
 *             @Override
 *             protected Person createRecord(final DataLine dataLine) {
 *                  return new Person(
 *                      dataLine.get("name"),
 *                      dataLine.getInt("age"),
 *                      dataLine.getDouble("net.worth")
 *                  );
 *             }
 *         }
 *     
*

* * @param the record type for the reader. * @author Valentin Ruano-Rubio <[email protected]> */ public abstract class TableReader implements Closeable, Iterable { /** * Name of the input source. *

It can be {@code null} indicating that no name was provided at construction

. */ private final String source; /** * Input text reader. *

* Keeps track of the last line number read for error reporting purposes. *

*/ private final LineNumberReader reader; /** * Holds a reference to the column names. */ private TableColumnCollection columns; private Map metadata = new HashMap<>(); /** * Holds a reference to the csvReader object use to read and parse the input * into {@link String} arrays. */ private CSVReader csvReader; /** * Indicates whether the reader has tried to fetch the next record. *

If {@code true} the content of {@link #nextRecord} represent the next record to be returned * by {@link #readRecord} ({@code null} if we reached the end of the table), otherwise {@link #nextRecord} reference * is invalid and the next record must be fetched using {@link #fetchNextRecord()}.

*/ private boolean nextRecordFetched = false; /** * Holds a reference to the next record. *

* This is {@code null} when we reached the end of the source. *

*/ private R nextRecord; /** * Creates a new table reader given the input file path. *

* This operation will read the first lines of the input file until the * column name header line is found. *

*

* The source's name used in error reporting is the file's path as returned by * {@link File#getPath}. *

* * @param path the input file path. * @throws IllegalArgumentException if {@code name} is {@code null}. * @throws IOException if any is raised when accessing the file. */ public TableReader(final Path path) throws IOException { this( Utils.nonNull(path, "the input file cannot be null").toString(), IOUtils.makeReaderMaybeGzipped(path)); } /** * Creates a new table reader given an input {@link Reader}. *

* This operation will read the first lines of the input file until the * column name header line is found. *

* * @param sourceReader the source text reader. * @throws IOException if any is raised when reading from {@code sourceReader}. */ public TableReader(final Reader sourceReader) throws IOException { this(null, sourceReader); } /** * Creates a new table reader given an input {@link Reader}. *

* It assigns an arbitrary *

* * @param sourceName name of the source to use in error messages. It can be {@code null}, indicating that is anonymous. * @param sourceReader reader to the text to process. * @throws IllegalArgumentException if {@code sourceReader} is {@code null}. * @throws IOException if is raised when reading from the source. */ protected TableReader(final String sourceName, final Reader sourceReader) throws IOException { Utils.nonNull(sourceReader, "the reader cannot be null"); this.source = sourceName; this.reader = sourceReader instanceof LineNumberReader ? (LineNumberReader) sourceReader : new LineNumberReader(sourceReader); this.csvReader = new CSVReader(this.reader, TableUtils.COLUMN_SEPARATOR, TableUtils.QUOTE_CHARACTER, TableUtils.ESCAPE_CHARACTER); findAndProcessHeaderLine(); this.nextRecordFetched = false; } /** * Process the first lines of the input source until the header line. * * @throws IOException if an {@link IOException} occurred when reading from the source. * @throws UserException.BadInput if there is formatting error in the input. */ protected void findAndProcessHeaderLine() throws IOException { final String[] line = skipCommentLines(); if (line == null) { throw formatException("premature end of table: header line not found"); } else { TableColumnCollection.checkNames(line, UserException.BadInput::new); columns = new TableColumnCollection(line); processColumns(columns); } } /** * Checks whether a line is a comment line or not. * * @param line input line already split into line-values. * @return {@code true} if {@code line} seems to be a comment line. */ protected boolean isCommentLine(final String[] line) { return line.length > 0 && line[0].startsWith(TableUtils.COMMENT_PREFIX); } /** * Composes the exception to be thrown due to a formatting error. *

* The input {@code message} can be omitted by providing a {@code null} value. *

* * @param message custom error message. * @return never {@code null}. */ protected final UserException.BadInput formatException(final String message) { return new UserException.BadInput(formatExceptionMessageWithLocationInfo(message)); } /** * Composes the exception to be thrown due to a formatting error. *

* The input {@code message} can be omitted by providing a {@code null} value. *

* * @param message custom error message. * @return never {@code null}. */ protected final UserException.BadInput formatExceptionWithoutLocation(final String message) { return new UserException.BadInput(formatExceptionMessageWithoutLocationInfo(message)); } /** * Composes the error exception message string. *

* The input {@code message} can be omitted by providing a {@code null} value. *

* * @param message custom error message. * @return never {@code null}. */ private String formatExceptionMessageWithLocationInfo(final String message) { final String explanation = message == null ? "" : ": " + message; if (source == null) { return String.format("format error at line %d" + explanation, reader.getLineNumber()); } else { return String.format("format error in '%s' at line %d" + explanation, source, reader.getLineNumber()); } } /** * Composes the error exception message string. *

* The input {@code message} can be omitted by providing a {@code null} value. *

* * @param message custom error message. * @return never {@code null}. */ private String formatExceptionMessageWithoutLocationInfo(final String message) { return "format error: " + message; } /** * Process the header line's column names. *

* Implementations must use {@link #formatException(String)} to create the exception to throw in case * there is any formatting issue. *

* * @param tableColumns columns found in the input. It is guarantee not to be * a {@code null} and not to contain any {@code null} values. * @throws UserException.BadInput if there is a formatting issue. */ protected void processColumns(@SuppressWarnings("unused") final TableColumnCollection tableColumns) { // nothing by default. } /** * Returns the column collection for this reader. * @throws IllegalStateException if this methods is invoked before the table column can be determined * (e.g. when processing a comment line before the header extending {@link #processCommentLine(String, long)}). * @return never {@code null}. */ public TableColumnCollection columns() { Utils.validate(columns != null, "columns are null"); return columns; } /** * Returns the next record form the source. * * @return {@code null} if there is no more record in the input. * @throws IOException if a {@link IOException} was thrown when reading from the input. */ public final R readRecord() throws IOException { if (nextRecordFetched == false) { nextRecord = fetchNextRecord(); } if (nextRecord != null) { nextRecordFetched = false; return nextRecord; } else { return null; } } /** * Reads the record from a string rather than from the input reader. * * @return {@code null} for comment or header lines, a non-null record otherwise. */ public final R readRecord(final String line) { try { final String[] fields = csvReader.getParser().parseLine(line); if (isCommentLine(fields) || isHeaderLine(fields)) { return null; } else if (fields.length != columns.columnCount()) { throw formatExceptionWithoutLocation("invalid number of columns"); } else { return createRecord(new DataLine(fields, columns, this::formatExceptionWithoutLocation)); } } catch (final IOException ex) { throw new GATKException("the single line input is in fact a multi-line entry"); } } /** * Fetch the next record from the source. * * @return {@code null} if there is no more record in the input. * @throws IOException if a {@link IOException} was thrown when reading from the input. */ private R fetchNextRecord() throws IOException { nextRecordFetched = true; String[] line; while ((line = csvReader.readNext()) != null) { if (isCommentLine(line)) { processCommentLine(line, reader.getLineNumber()); } else if (!isHeaderLine(line)) { if (line.length != columns.columnCount()) { throw formatException(String.format("mismatch between number of values in line (%d) and number of columns (%d)", line.length, columns.columnCount())); } else { final R result = createRecord(new DataLine(reader.getLineNumber(), line, columns, this::formatException)); if (result != null) { return result; } } } } return null; } private void processCommentLine(final String[] line, final long lineNumber) { final StringBuilder builder = new StringBuilder(); builder.append(line[0].substring(TableUtils.COMMENT_PREFIX.length())); for (int i = 1; i < line.length; i++) builder.append(TableUtils.COLUMN_SEPARATOR_STRING).append(line[i]); processCommentLine(builder.toString(), lineNumber); } /** * Called with the content of the comment line every time one is found in the input. *

* The comment prefix string ({@link TableUtils#COMMENT_PREFIX}) is not included in the * input string. *

*

* Notice that since comments might be present before the header line, so the result of invoking {@link #columns()} is undefined. *

* @param commentText the input comment string. * @param lineNumber the source line number that contains the comment line. */ protected void processCommentLine(final String commentText, final long lineNumber) { if (commentText.startsWith(TableWriter.METADATA_TAG)) { final String[] keyAndValue = commentText.substring(TableWriter.METADATA_TAG.length()).split("="); metadata.put(keyAndValue[0], keyAndValue[1]); } // do nothing with non-metadata lines by default } /** * Determines whether a line is a repetition of the header. * *

* By default, input lines that match the header exactly are ignored. By overriding this method * extending classes may change what is interpretated as a repetition of the header (e.g. just treat such * lines as regular data line) *

* @param line the input line. * @return {@code true} if the input line is a header line and it should be ignored. */ protected boolean isHeaderLine(final String[] line) { return columns.matchesExactly(line); } /** * Skip comment lines from the output. *

* It returns the contents of the first non comment line found. As a side effect, it builds a metadata map * of key-value pairs from comment lines with the tag from TableWriter *

* * @return {@code null} if we reached the end of the source, the next non-comment line content otherwise. * @throws IOException if it was raised when reading for the source. */ private String[] skipCommentLines() throws IOException { String[] line; while ((line = csvReader.readNext()) != null) { if (isCommentLine(line)) { processCommentLine(line, reader.getLineNumber()); } else { break; } } return line; } /** * Transforms a data-line column values into a record. *

* Implementation should use {@link #formatException(String)} to indicate a formatting error * that makes impossible to create a instance of {@link R} given the input line values {@code dataLine}. *

* * @param dataLine values corresponding to the column names that was passed earlier to {@link #processColumns(TableColumnCollection)}}. * it is guaranteed to not be {@code null}, contain no {@code null} value and have the same columns * as this readers' (accessible through {@link #columns}). * @return never {@code null}. */ protected abstract R createRecord(final DataLine dataLine); @Override public void close() throws IOException { csvReader.close(); } /** * Returns an iterator on the remaining records in * the input. * * @return never {@code null}. */ @Override public Iterator iterator() { return new Iterator() { @Override public boolean hasNext() { if (!nextRecordFetched) { try { nextRecord = fetchNextRecord(); } catch (final IOException ex) { throw new UncheckedIOException(ex); } } return nextRecord != null; } @Override public R next() { if (!nextRecordFetched) { try { nextRecord = fetchNextRecord(); } catch (final IOException ex) { throw new UncheckedIOException(ex); } } if (nextRecord == null) { throw new NoSuchElementException("there is no more record in the input"); } else { nextRecordFetched = false; return nextRecord; } } }; } @Override public Spliterator spliterator() { return Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED | Spliterator.NONNULL); } /** * Returns an stream on the reaming records in the source. *

* Notice that the returned stream will consume records as if you were calling {@link #readRecord} directly. *

*

* Any {@link IOException} raised when using the stream will be propagated up wrapped in a {@link UncheckedIOException}. *

*

* Any format exception will still be indicated with a {@link UserException.BadInput}. *

* * @return never {@code null}. */ public Stream stream() { return Utils.stream(this); } /** * Read the remaining records into a list. *

* Notice that this operation does not close the reader. *

* * @return never {@code null}, but potentially empty. */ public List toList() { return stream().collect(Collectors.toList()); } public Map getMetadata() { return metadata; } /** * Returns the reader source name. * * @return null if the source name cannot be determined. */ public String getSource() { return source; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy