All Downloads are FREE. Search and download functionalities are using the official Maven repository.

htsjdk.tribble.bed.BEDCodec Maven / Gradle / Ivy

There is a newer version: 4.1.3
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2013 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.tribble.bed;

import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.annotation.Strand;
import htsjdk.tribble.index.tabix.TabixFormat;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.util.ParsingUtils;

import java.util.regex.Pattern;

/**
 * Codec for parsing BED file, as described by UCSC
 * See https://genome.ucsc.edu/FAQ/FAQformat.html#format1
 *
 * @author jrobinso
 *         Date: Dec 20, 2009
 */
public class BEDCodec extends AsciiFeatureCodec {

    /** Default extension for BED files. */
    public static final String BED_EXTENSION = ".bed";

    private static final Pattern SPLIT_PATTERN = Pattern.compile("\\t|( +)");
    private final int startOffsetValue;

    /**
     * Calls {@link #BEDCodec(StartOffset)} with an argument
     * of {@code StartOffset.ONE}
     */
    public BEDCodec() {
        this(StartOffset.ONE);
    }


    /**
     * BED format is 0-based, but Tribble is 1-based.
     * Set desired start position at either ZERO or ONE
     */
    public BEDCodec(final StartOffset startOffset) {
        super(BEDFeature.class);
        this.startOffsetValue = startOffset.value();
    }


    public BEDFeature decodeLoc(String line) {
        return decode(line);
    }

    @Override
    public BEDFeature decode(String line) {

        if (line.trim().isEmpty()) {
            return null;
        }
        // discard header lines in case the caller hasn't called readHeader
        if (isBEDHeaderLine(line)) {
            this.readHeaderLine(line);
            return null;
        }

        String[] tokens = SPLIT_PATTERN.split(line, -1);
        return decode(tokens);
    }

    /**
     * The BED codec doesn't retain the actual header, but we need to parse through
     * it and advance to the beginning of the first feature. This is especially true
     * if we're indexing, since we want to underlying stream offset to be established
     * correctly, but is also the case for when we're simply iterating to satisfy a
     * feature query (otherwise the feature reader can terminate prematurely if the
     * header is large).
     * @param lineIterator
     * @return Always null. The BEDCodec currently doesn't model or preserve the BED header.
     */
    @Override
    public Object readActualHeader(final LineIterator lineIterator) {
        while (lineIterator.hasNext()) {
            // Only peek, since we don't want to actually consume a line of input unless its a header line.
            // This prevents us from advancing past the first feature.
            final String nextLine = lineIterator.peek();
            if (isBEDHeaderLine(nextLine)) {
                // advance the iterator and consume the line (which is a no-op)
                this.readHeaderLine(lineIterator.next());
            } else {
                return null; // break out when we've seen the end of the header
            }
        }

        return null;
    }

    // Return true if the candidateLine looks like a BED header line.
    private boolean isBEDHeaderLine(final String candidateLine) {
        return candidateLine.startsWith("#") || candidateLine.startsWith("track") || candidateLine.startsWith("browser");
    }

    public BEDFeature decode(String[] tokens) {
        int tokenCount = tokens.length;

        // The first 3 columns are non optional for BED.  We will relax this
        // and only require 2.

        if (tokenCount < 2) {
            return null;
        }

        String chr = tokens[0];

        // The BED format uses a first-base-is-zero convention,  Tribble features use 1 => add 1.
        int start = Integer.parseInt(tokens[1]) + startOffsetValue;

        int end = start;
        if (tokenCount > 2) {
            end = Integer.parseInt(tokens[2]);
        }

        FullBEDFeature feature = new FullBEDFeature(chr, start, end);

        // The rest of the columns are optional.  Stop parsing upon encountering
        // a non-expected value

        // Name
        if (tokenCount > 3) {
            String name = tokens[3].replaceAll("\"", "");
            feature.setName(name);
        }

        // Score
        if (tokenCount > 4) {
            try {
                float score = Float.parseFloat(tokens[4]);
                feature.setScore(score);
            } catch (NumberFormatException numberFormatException) {

                // Unexpected, but does not invalidate the previous values.
                // Stop parsing the line here but keep the feature
                // Don't log, would just slow parsing down.
                return feature;
            }
        }

        // Strand
        if (tokenCount > 5) {
            String strandString = tokens[5].trim();
            char strand = (strandString.isEmpty())
                    ? ' ' : strandString.charAt(0);

            if (strand == '-') {
                feature.setStrand(Strand.NEGATIVE);
            } else if (strand == '+') {
                feature.setStrand(Strand.POSITIVE);
            } else {
                feature.setStrand(Strand.NONE);
            }
        }

        //Color
        if (tokenCount > 8) {
            String colorString = tokens[8];
            feature.setColor(ParsingUtils.parseColor(colorString));
        }

        // Coding information is optional
        if (tokenCount > 11) {
            createExons(start, tokens, feature, feature.getStrand());
        }

        return feature;
    }

    protected boolean readHeaderLine(String line) {
        //We don't parse BED header
        return false;
    }

    private void createExons(int start, String[] tokens, FullBEDFeature gene,
                             Strand strand) throws NumberFormatException {

        int cdStart = Integer.parseInt(tokens[6]) + startOffsetValue;
        int cdEnd = Integer.parseInt(tokens[7]);

        int exonCount = Integer.parseInt(tokens[9]);
        String[] exonSizes = new String[exonCount];
        String[] startsBuffer = new String[exonCount];
        ParsingUtils.split(tokens[10], exonSizes, ',');
        ParsingUtils.split(tokens[11], startsBuffer, ',');

        int exonNumber = (strand == Strand.NEGATIVE ? exonCount : 1);

        if (startsBuffer.length == exonSizes.length) {
            for (int i = 0; i < startsBuffer.length; i++) {
                int exonStart = start + Integer.parseInt(startsBuffer[i]);
                int exonEnd = exonStart + Integer.parseInt(exonSizes[i]) - 1;
                gene.addExon(exonStart, exonEnd, cdStart, cdEnd, exonNumber);

                if (strand == Strand.NEGATIVE) {
                    exonNumber--;
                } else {
                    exonNumber++;
                }
            }
        }
    }

    @Override
    public boolean canDecode(final String path) {
        final String toDecode;
        if (IOUtil.hasBlockCompressedExtension(path)) {
            toDecode = path.substring(0, path.lastIndexOf("."));
        } else {
            toDecode = path;
        }
        return toDecode.toLowerCase().endsWith(BED_EXTENSION);
    }

    public int getStartOffset() {
        return this.startOffsetValue;
    }

    /**
     * Indicate whether co-ordinates or 0-based or 1-based.
     * 

* Tribble uses 1-based, BED files use 0. * e.g.: * start_position = bedline_start_position - startIndex.value() */ public enum StartOffset { ZERO(0), ONE(1); private int start; private StartOffset(int start) { this.start = start; } public int value() { return this.start; } } @Override public TabixFormat getTabixFormat() { return TabixFormat.BED; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy