// Source listing of org.biojava.bio.program.sax.FastaSearchParser
// (BioJava blast module), as shown on a Maven / Gradle / Ivy artifact
// repository page. That page notes that a newer version of the
// artifact exists: 1.9.7.
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.program.sax;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;

import org.biojava.bio.BioException;
import org.biojava.bio.search.SearchContentHandler;
import org.biojava.utils.ParserException;

/**
 * <code>FastaSearchParser</code> objects provide Fasta search
 * parsing functionality for the '-m 10' output format (see the Fasta
 * documentation). Data are passed to a
 * <code>SearchContentHandler</code> which coordinates its
 * interpretation and creation of objects representing the result.
 *
 * <p>If the search output contains no hits only the header data will
 * be sent to the handler before the dataset ends. The handler is
 * responsible for dealing with this state.</p>
 *
 * <p>The parser state (<code>searchStatus</code>,
 * <code>searchParsed</code>) is held in instance fields and is
 * deliberately <em>not</em> reset by <code>parseSearch</code>, so
 * that successive calls can walk through a stream containing several
 * searches.</p>
 *
 * <p>This class was originally used outside of the BioJava SAX
 * framework, but is now only used to provide functionality for
 * <code>FastaSearchSAXParser</code>.</p>
 *
 * @author Keith James
 * @author Greg Cox
 * @since 1.1
 */
class FastaSearchParser
{
    // Parser states; searchStatus always holds one of these
    private static final int    NODATA = 0;
    private static final int  INHEADER = 1;
    private static final int     INHIT = 2;
    private static final int   INQUERY = 3;
    private static final int INSUBJECT = 4;
    private static final int   INALIGN = 5;

    // Message used whenever a header or hit line cannot be classified
    private static final String UNKNOWN_LINE =
        "Fasta parser failed to recognise line type";

    // Valid line identifiers for result annotation
    private static final Set resultAnnoTokens =
        fillSet(new String [] { "mp_name",   "mp_ver",
                                "mp_argv",   "mp_extrap",
                                "mp_stats",  "mp_KS",
                                "pg_name",   "pg_ver",
                                "pg_optcut", "pg_cgap" },
                new HashSet());

    // Valid line identifiers for search parameters
    private static final Set resultSearchParmTokens =
        fillSet(new String [] { "pg_matrix", "pg_ktup",
                                "pg_gap-pen" },
                new HashSet());

    // Valid line identifiers for hit annotation
    private static final Set hitAnnoTokens =
        fillSet(new String [] { "fa_frame",   "fa_initn",
                                "fa_init1",   "fa_opt",
                                "fa_bits",    "sw_score",
                                "sw_ident",   "sw_gident",
                                "sw_overlap", "fa_ident",
                                "fa_gident",  "fa_overlap",
                                "fa_score" },
                new HashSet());

    // Valid line identifiers for hit data
    private static final Set hitDataTokens =
        fillSet(new String [] { "fa_expect", "fa_z-score" },
                new HashSet());

    // Sequence annotation keys whose value is everything after the
    // last ':' on the line. Each is reported with a leading '_'
    private static final String [] coordKeys =
        new String [] { "al_start", "al_stop", "al_display_start",
                        "sq_len",   "sq_offset" };

    // State carried over between calls to parseSearch
    private int     searchStatus = NODATA;
    private boolean searchParsed = false;

    private SearchContentHandler handler;
    private String               line;
    private int                  lineNumber;

    // Accumulators for the current sub-hit; emptied at each new hit
    StringBuffer   querySeqTokens = new StringBuffer(1024);
    StringBuffer subjectSeqTokens = new StringBuffer(1024);
    StringBuffer      matchTokens = new StringBuffer(1024);

    /**
     * The <code>parseSearch</code> method performs the core parsing
     * operations. It reads lines until either the stream is
     * exhausted, in which case the handler receives
     * <code>setMoreSearches(false)</code>, or the opening line of a
     * further search is seen after one search has been completed, in
     * which case the handler receives
     * <code>setMoreSearches(true)</code> and the method returns with
     * the header of that further search already started.
     *
     * @param reader a <code>BufferedReader</code> to read from.
     * @param handler a <code>SearchContentHandler</code> to notify
     * of events.
     *
     * @exception IOException if the BufferedReader fails.
     * @exception BioException if the parser (via the registered
     * SearchContentHandler) fails to resolve a query sequence and
     * target database.
     * @exception ParserException if the parser fails to parse a
     * line.
     */
    public void parseSearch(BufferedReader       reader,
                            SearchContentHandler handler)
        throws IOException, BioException, ParserException
    {
        lineNumber = 0;
        this.handler = handler;

        while ((line = reader.readLine()) != null)
        {
            lineNumber++;

            // This token indicates the end of the formatted search
            // data. It is checked before the state dispatch because
            // some outputs don't have any alignment consensus
            // tokens, so it can arrive in states other than INALIGN
            if (line.startsWith(">>><<<"))
            {
                if (searchStatus == INHEADER)
                {
                    // Still in the header, so the search had no hits
                    handler.endHeader();
                }
                else
                {
                    // NOTE(review): this closes a sub-hit in every
                    // other state, including NODATA and INHIT where
                    // none has been opened - confirm that handlers
                    // tolerate this
                    finishSubHit();
                }

                handler.endSearch();

                searchParsed = true;
                searchStatus = NODATA;

                // Keep reading to look for the start of another
                // search so that the handler can be told whether to
                // expect more
                continue;
            }

            switch (searchStatus)
            {
                case NODATA:
                    // '>>>' marks the line describing the query
                    // sequence file and database searched. It is
                    // followed by header lines containing data about
                    // the search. Anything else is skipped
                    if (line.startsWith(">>>"))
                    {
                        searchStatus = INHEADER;

                        handler.setQueryID(parseQueryID(line));
                        handler.setDatabaseID(parseDatabaseID(line));

                        handler.startSearch();
                        handler.startHeader();

                        // If we already saw an end of search token
                        // then this is the start of another dataset:
                        // report that the stream is not empty and
                        // return. Otherwise this is the search we
                        // are about to parse, so carry on reading
                        if (searchParsed)
                        {
                            searchParsed = false;
                            handler.setMoreSearches(true);
                            return;
                        }
                    }
                    break;

                case INHEADER:
                    // '>>' marks the line describing a hit. It is
                    // followed by header lines containing data
                    // about the hit
                    if (line.startsWith(">>"))
                    {
                        searchStatus = INHIT;

                        handler.endHeader();
                        beginHit(line);
                    }
                    else if (! parseHeaderLine(line, resultAnnoTokens)
                             && ! parseHeaderLine(line, resultSearchParmTokens))
                    {
                        throw unrecognisedLine(line);
                    }
                    break;

                case INHIT:
                    // '>' marks the line describing the query
                    // sequence
                    if (line.startsWith(">"))
                    {
                        searchStatus = INQUERY;

                        handler.endHit();
                        handler.startSubHit();
                    }
                    else if (! parseHitLine(line, hitAnnoTokens)
                             && ! parseHitLine(line, hitDataTokens))
                    {
                        throw unrecognisedLine(line);
                    }
                    break;

                case INQUERY:
                    // '>' marks the line describing the subject
                    // sequence
                    if (line.startsWith(">"))
                    {
                        searchStatus = INSUBJECT;
                    }
                    else
                    {
                        parseQuerySequence(line);
                    }
                    break;

                case INSUBJECT:
                    // '; al_cons:' marks the start of lines
                    // containing the consensus symbols from the
                    // Fasta alignment. '>>' means the next hit
                    // started without any consensus lines
                    if (line.startsWith("; al_cons:"))
                    {
                        searchStatus = INALIGN;
                    }
                    else if (line.startsWith(">>"))
                    {
                        searchStatus = INHIT;

                        finishSubHit();
                        beginHit(line);
                    }
                    else
                    {
                        parseSubjectSequence(line);
                    }
                    break;

                case INALIGN:
                    // The end of search token cannot reach this
                    // point as it is consumed at the top of the loop
                    if (line.startsWith(">>"))
                    {
                        searchStatus = INHIT;

                        finishSubHit();
                        beginHit(line);
                    }
                    else
                    {
                        matchTokens.append(line);
                    }
                    break;

                default:
                    break;
            } // end switch
        } // end while

        // The stream is exhausted, so there is nothing further
        handler.setMoreSearches(false);
    }

    /**
     * Passes the accumulated query, subject and match tokens to the
     * handler as sub-hit properties and then ends the sub-hit. The
     * buffers themselves are left untouched.
     */
    private void finishSubHit()
    {
        handler.addSubHitProperty("querySeqTokens",   querySeqTokens.toString());
        handler.addSubHitProperty("subjectSeqTokens", subjectSeqTokens.toString());
        handler.addSubHitProperty("matchTokens",      matchTokens.toString());

        handler.endSubHit();
    }

    /**
     * Starts a new hit on the handler, empties the token buffers and
     * reports the hit's "id" and "desc" properties taken from the
     * hit line.
     *
     * @param line a <code>String</code> hit line (starting '>>').
     *
     * @exception ParserException if the line carries no ID.
     */
    private void beginHit(String line)
        throws ParserException
    {
        handler.startHit();

        querySeqTokens.setLength(0);
        subjectSeqTokens.setLength(0);
        matchTokens.setLength(0);

        handler.addHitProperty("id",   parseID(line));
        handler.addHitProperty("desc", parseDesc(line));
    }

    /**
     * Builds the exception used when a header or hit line cannot be
     * classified; it records the current line number and the line.
     *
     * @param line the offending <code>String</code>.
     *
     * @return a <code>ParserException</code> ready to be thrown.
     */
    private ParserException unrecognisedLine(String line)
    {
        return new ParserException(UNKNOWN_LINE,
                                   null,
                                   lineNumber,
                                   line);
    }

    /**
     * The <code>fillSet</code> method populates a <code>Set</code>
     * with the elements of an array.
     *
     * @param tokenArray a <code>String []</code> array.
     * @param set a <code>Set</code> to fill.
     *
     * @return the <code>Set</code> passed in.
     */
    private static Set fillSet(String [] tokenArray, Set set)
    {
        for (int i = 0; i < tokenArray.length; i++)
        {
            set.add(tokenArray[i]);
        }

        return set;
    }

    /**
     * The <code>parseID</code> method parses sequence IDs from
     * lines starting with '>' and '>>'. The ID is the text between
     * the marker and the first space of the trimmed line (or the end
     * of the line when there is no space).
     *
     * @param line a <code>String</code> to be parsed.
     *
     * @return a <code>String</code> containing the ID.
     *
     * @exception ParserException if the trimmed line holds nothing
     * beyond the marker.
     */
    private String parseID(String line)
        throws ParserException
    {
        String trimmed = line.trim();

        // Hit header lines start with '>>', SubHit header lines
        // with '>'
        int markerLength = trimmed.startsWith(">>") ? 2 : 1;

        // '<=' rather than '==' so that an empty line is reported
        // as a parse failure instead of overrunning the substring
        if (trimmed.length() <= markerLength)
        {
            throw new ParserException("Fasta parser encountered a sequence with no Id",
                                      null,
                                      lineNumber,
                                      line);
        }

        int firstSpace = trimmed.indexOf(' ');

        if (firstSpace == -1)
        {
            return trimmed.substring(markerLength);
        }

        return trimmed.substring(markerLength, firstSpace);
    }

    /**
     * The <code>parseDesc</code> method parses the sequence
     * description from subject header lines: everything after the
     * first space of the trimmed line.
     *
     * @param line a <code>String</code> to be parsed.
     *
     * @return a <code>String</code> containing the description, or
     * "No description" when the line contains no space.
     */
    private String parseDesc(String line)
    {
        String trimmed = line.trim();
        int firstSpace = trimmed.indexOf(' ');

        if (firstSpace == -1)
        {
            return "No description";
        }

        return trimmed.substring(firstSpace + 1);
    }

    /**
     * The <code>parseQueryID</code> method parses the query ID from
     * a line starting with '>>>': the trimmed text between that
     * marker and the first comma.
     *
     * @param line a <code>String</code> to be parsed.
     *
     * @return a <code>String</code> containing the query ID.
     *
     * @exception ParserException if the line contains no comma.
     */
    private String parseQueryID(String line)
        throws ParserException
    {
        int firstComma = line.indexOf(',');

        if (firstComma == -1)
        {
            throw new ParserException("Fasta parser failed to parse a query ID",
                                      null,
                                      lineNumber,
                                      line);
        }

        // Skip the three characters of the '>>>' marker
        return line.substring(3, firstComma).trim();
    }

    /**
     * The <code>parseDatabaseID</code> method parses a database
     * filename from the relevant output line. The value returned is
     * the last-but-one whitespace-delimited token.
     *
     * @param line a <code>String</code> to be parsed.
     *
     * @return a <code>String</code>.
     *
     * @exception ParserException if the line has fewer than two
     * tokens.
     */
    private String parseDatabaseID(String line)
        throws ParserException
    {
        StringTokenizer st = new StringTokenizer(line);
        String id = null;

        // Consume every token except the last, keeping the most
        // recent one.
        // NOTE(review): presumably the line ends
        // '... <database> library' - confirm against Fasta output
        for (int remaining = st.countTokens(); remaining > 1; remaining--)
        {
            id = st.nextToken();
        }

        if (id == null)
        {
            throw new ParserException("Fasta parser failed to parse a database ID",
                                      null,
                                      lineNumber,
                                      line);
        }

        return id;
    }

    /**
     * Tries to interpret a line as a search header annotation and,
     * on success, passes it to the handler as a search property.
     *
     * @param line a <code>String</code> to be parsed.
     * @param tokenSet the <code>Set</code> of acceptable keys.
     *
     * @return true if the line's key was in the set.
     *
     * @exception ParserException declared for the benefit of
     * callers; not currently raised here.
     */
    private boolean parseHeaderLine(String line,
                                    Set    tokenSet)
        throws ParserException
    {
        String [] data = parseLine(line, tokenSet);

        if (data.length == 0)
        {
            return false;
        }

        handler.addSearchProperty(data[0], data[1]);
        return true;
    }

    /**
     * Tries to interpret a line as a hit annotation and, on success,
     * passes it to the handler as a hit property.
     *
     * @param line a <code>String</code> to be parsed.
     * @param tokenSet the <code>Set</code> of acceptable keys.
     *
     * @return true if the line's key was in the set.
     *
     * @exception ParserException declared for the benefit of
     * callers; not currently raised here.
     */
    private boolean parseHitLine(String line,
                                 Set    tokenSet)
        throws ParserException
    {
        String [] data = parseLine(line, tokenSet);

        if (data.length == 0)
        {
            return false;
        }

        handler.addHitProperty(data[0], data[1]);
        return true;
    }

    /**
     * Splits a '; key: value' line into its key and value. The key
     * is the trimmed text between the first ';' and the first ':',
     * the value is the trimmed remainder of the line.
     *
     * @param line a <code>String</code> to be parsed.
     * @param tokenSet the <code>Set</code> of acceptable keys.
     *
     * @return a two element array { key, value } if the key is in
     * the set, otherwise an empty array. An empty array is also
     * returned for a line with no ':' after the ';', so that callers
     * report it as an unrecognised line.
     *
     * @exception ParserException declared for the benefit of
     * callers; not currently raised here.
     */
    private String [] parseLine(String line,
                                Set    tokenSet)
        throws ParserException
    {
        int idTokenStart = line.indexOf(';');
        int   idTokenEnd = line.indexOf(':');

        // Covers a missing ':' (-1) as well as a ':' which precedes
        // the ';'. Either would make the substring call below throw
        // StringIndexOutOfBoundsException
        if (idTokenEnd <= idTokenStart)
        {
            return new String [0];
        }

        String idToken = line.substring(idTokenStart + 1, idTokenEnd).trim();

        if (! tokenSet.contains(idToken))
        {
            return new String [0];
        }

        String idValue = line.substring(idTokenEnd + 1).trim();

        return new String [] { idToken, idValue };
    }

    /**
     * Handles one line of the query sequence section: annotation
     * lines become sub-hit properties prefixed "query", any other
     * line is appended to the query sequence tokens.
     *
     * @param line a <code>String</code> to be parsed.
     */
    private void parseQuerySequence(String line)
    {
        String [] data = parseSequence(line);

        if (data.length == 0)
        {
            // We have a line of sequence tokens
            querySeqTokens.append(line);
        }
        else
        {
            // We have a key/value pair
            handler.addSubHitProperty("query" + data[0], data[1]);
        }
    }

    /**
     * Handles one line of the subject sequence section: annotation
     * lines become sub-hit properties prefixed "subject", any other
     * line is appended to the subject sequence tokens.
     *
     * @param line a <code>String</code> to be parsed.
     */
    private void parseSubjectSequence(String line)
    {
        String [] data = parseSequence(line);

        if (data.length == 0)
        {
            // We have a line of sequence tokens
            subjectSeqTokens.append(line);
        }
        else
        {
            // We have a key/value pair
            handler.addSubHitProperty("subject" + data[0], data[1]);
        }
    }

    /**
     * Recognises the sequence annotation lines of interest.
     *
     * @param line a <code>String</code> to be parsed.
     *
     * @return a two element array { key, value } for a recognised
     * annotation line, otherwise an empty array. Lines starting
     * with ';' which are not recognised also give an empty array
     * and so are treated by callers as sequence tokens.
     */
    private String [] parseSequence(String line)
    {
        if (! line.startsWith(";"))
        {
            return new String [0];
        }

        // The sequence type given by the report; the whole line
        // must match exactly
        if (line.equals("; sq_type: p"))
        {
            return new String [] { "_sq_type", "protein" };
        }

        if (line.equals("; sq_type: D"))
        {
            return new String [] { "_sq_type", "dna" };
        }

        // The coordinates and offset of the alignment
        for (int i = 0; i < coordKeys.length; i++)
        {
            if (line.startsWith("; " + coordKeys[i] + ":"))
            {
                return new String [] { "_" + coordKeys[i], parseCoord(line) };
            }
        }

        return new String [0];
    }

    /**
     * The <code>parseCoord</code> method extracts the coordinate
     * from a Fasta output line: the trimmed text following the last
     * ':'. No numeric validation is performed.
     *
     * @param line a <code>String</code> to parse.
     *
     * @return a <code>String</code> coordinate.
     */
    private String parseCoord(String line)
    {
        int sepIndex = line.lastIndexOf(':');
        return line.substring(sepIndex + 1).trim();
    }
}