lphy.base.parser.NexusParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lphy-base Show documentation
The standard library of LPhy, which contains the required generative distributions and basic functions.
The newest version!
package lphy.base.parser;


import jebl.evolution.io.ImportException;
import jebl.evolution.io.ImportHelper;
import jebl.evolution.io.NexusImporter;
import jebl.evolution.sequences.BasicSequence;
import jebl.evolution.sequences.Sequence;
import jebl.evolution.sequences.SequenceType;
import jebl.evolution.sequences.State;
import jebl.evolution.taxa.Taxon;
import jebl.util.Attributable;
import lphy.base.evolution.Taxa;
import lphy.base.evolution.alignment.Alignment;
import lphy.base.evolution.alignment.CharSetBlock;
import lphy.base.evolution.alignment.ContinuousCharacterData;
import lphy.base.evolution.alignment.MetaDataAlignment;
import lphy.base.evolution.datatype.Continuous;
import lphy.base.evolution.datatype.DataType;
import lphy.base.spi.SequenceTypeBaseImpl;
import lphy.core.logger.LoggerUtils;

import java.awt.*;
import java.io.EOFException;
import java.io.IOException;
import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.temporal.ChronoUnit;
import java.util.List;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**TODO: why line 895 parseValue(String value) uses Color.decode(colourValue)
 * Merged from {@link jebl.evolution.io.NexusImporter}.
 * Try to use Lphy objects as many as possible.
 *
 * @author Walter Xie
 */
public class NexusParser {

    protected final ImportHelper helper;

    protected NexusBlock nextBlock = null;
    protected String nextBlockName = null;

    protected int taxonCount = 0, siteCount = 0;
    protected SequenceType sequenceType = null;
    protected String gapCharacters = "-";
    protected String matchCharacters = ".";
    protected String missingCharacters = "?";
    protected boolean isInterleaved = false;

    //TODO mv to another class like AbstractNexusData
    protected ContinuousCharacterData continuousCharacterData;

    /**
     * @param fileName the nexus file name,
     *                 which must end with "nex", "nexus", or "nxs"
     */
    public NexusParser(String fileName) {

        Reader reader = getReader(fileName);

        helper = new ImportHelper(reader);
        helper.setExpectedInputLength(0);
        // ! defines a comment to be written out to a log file
        // & defines a meta comment
        helper.setCommentDelimiters('[', ']', '\0', '!', '&');

    }

//    protected final int READ_AHEAD_LIMIT = 50000;
    protected Reader getReader(String fileName) {
        Reader reader = null;
        try {
            if (!(fileName.endsWith("nex") || fileName.endsWith("nexus") || fileName.endsWith("nxs")))
                throw new IOException("Invalid Nexus file name !  fileName = " + fileName);

            final Path nexFile = Paths.get(fileName);

            if (!nexFile.toFile().exists() || nexFile.toFile().isDirectory())
                throw new IOException("Cannot find Nexus file ! " + nexFile +
                        ", user.dir = " + System.getProperty("user.dir"));

            reader = Files.newBufferedReader(nexFile); // StandardCharsets.UTF_8
//            reader.mark(READ_AHEAD_LIMIT); // to reset reader back to READ_AHEAD_LIMIT
        } catch (IOException e) {
            LoggerUtils.logStackTrace(e);
        }
        return reader;
    }

    //****** main ******//

    public static void main(final String[] args) {
        try {
            String fileName = args[0];
            System.out.println("Loading " + fileName);
            final NexusParser importer = new NexusParser(fileName);

            MetaDataAlignment nexusAlignment;
            if (fileName.endsWith("Dengue4.nex")) {
                nexusAlignment = importer.importNexus("forward");

                System.out.println(nexusAlignment.toJSON());

            } else if (fileName.endsWith("primate.nex")) {
                nexusAlignment = importer.importNexus(null);

                Alignment coding = nexusAlignment.charset("coding");
                System.out.println("coding : " + coding.toJSON());
                Alignment noncoding = nexusAlignment.charset("noncoding");
                System.out.println("noncoding : " + noncoding.toJSON());

            } else if (fileName.endsWith("haemulidae_trophic_traits.nex")) {

                importer.importNexus(null);
                System.out.println(importer.continuousCharacterData.toJSON());


            } else { // for testing or dev

            }

        } catch (Exception e) {
            e.printStackTrace();
        }
    } // main


    //****** import ******//

    /**
     * The full pipeline to parse the nexus file.
     * The charset will be handled separately.
     * @param ageDirectionStr  either forward or backward.
     *                         It can be null, if null but the nexus file
     *                         has TIPCALIBRATION block, then assume forward.
     * @return LPHY {@link MetaDataAlignment}.
     */
    public MetaDataAlignment importNexus(String ageDirectionStr) throws IOException, ImportException {
        boolean done = false;

        // create MetaDataAlignment either from readCharactersBlock or readDataBlock
        MetaDataAlignment nexusData = null;
        List taxonList = null;

        while (!done) {
            try {

                NexusBlock block = findNextBlock();

                if (block == NexusBlock.TAXA) {

                    taxonList = readTaxaBlock();

                } else if (block == NexusBlock.CHARACTERS) {

                    if (taxonList == null)
                        throw new NexusImporter.MissingBlockException("TAXA block is missing");

                    nexusData = readCharactersBlock(taxonList);

                } else if (block == NexusBlock.DATA) {

                    // A data block doesn't need a taxon block before it
                    // but if one exists then it will use it.
                    // this reads continuous data into lphy ContinuousCharacterData, not alignments
                    // the rest data type will add alignments
                    nexusData = readDataBlock(taxonList);

                } else if (block == NexusBlock.ASSUMPTIONS) {

                    readAssumptionsBlock(nexusData); // only CHARSET

                } else if (block == NexusBlock.CALIBRATION) {

                    readCalibrationBlock(nexusData, ageDirectionStr); // only TIPCALIBRATION

                } else {
                    //TODO new block
                }

            } catch (EOFException ex) {
                done = true;
            }
        }

        if (DataType.isSame(sequenceType, Continuous.getInstance())) {
            if (continuousCharacterData == null) // TODO
                throw new NexusImporter.MissingBlockException("Fail to load continuous data in MATRIX");
        } else if (nexusData == null)
            throw new NexusImporter.MissingBlockException("DATA or CHARACTERS block is missing");
        else if (nexusData instanceof MetaDataAlignment) // TODO
            LoggerUtils.log.info("Load " + nexusData.toString());

        return nexusData;
    }

    //****** 'DATA' or 'CHARACTERS' block, 'MATRIX' will have data ******//

    protected MetaDataAlignment readCharactersBlock(List taxonList) throws ImportException, IOException {

        siteCount = 0;
        sequenceType = null;

        readDataBlockHeader("MATRIX", NexusBlock.CHARACTERS);

        List sequences = readSequenceData(taxonList);
        MetaDataAlignment nexusData = createNexusAlignment(sequences);

        findEndBlock();

        return nexusData;
    }

    protected MetaDataAlignment readDataBlock(List taxonList) throws ImportException, IOException {

        taxonCount = 0;
        siteCount = 0;
        sequenceType = null;

        readDataBlockHeader("MATRIX", NexusBlock.DATA);

        MetaDataAlignment nexusData = null;
        if ( DataType.isSame(sequenceType, Continuous.getInstance()) ) {

            LoggerUtils.log.info("Loading continuous character data ... ");
            continuousCharacterData = readContinuousCharacterData();
//TODO            nexusData = createNexusData(continuousCharacterData);
        } else {
            List sequences = readSequenceData(taxonList);
            nexusData = createNexusAlignment(sequences);
        }

        findEndBlock();

        return nexusData;
    }

    // use jebl State to convert char into int
    // create lphy MetaDataAlignment from jebl Sequence
    // convert jebl Taxon into lphy Taxon
    private MetaDataAlignment createNexusAlignment(List sequences) {
        if (sequenceType == null)
            throw new IllegalArgumentException("Fail to find data type before parsing sequences !");
        if (siteCount < 1)
            throw new IllegalArgumentException("NCHAR < 1 ! " + siteCount);

        final int seqSize = sequences.size();
        lphy.base.evolution.Taxon[] taxons = new lphy.base.evolution.Taxon[seqSize];
        // init Taxon[]
        for (int t = 0; t < seqSize; t++) {
            Sequence sequence = sequences.get(t);
            Taxon jeblTaxon = sequence.getTaxon();
            if (jeblTaxon == null)
                throw new IllegalArgumentException("Cannot find taxon in sequence ! " + t);
            // TODO getAttributeMap()
            taxons[t] = new lphy.base.evolution.Taxon(jeblTaxon.getName());
        }

        MetaDataAlignment nexusData = new MetaDataAlignment(Taxa.createTaxa(taxons), siteCount, sequenceType);
        // fill in sequences for single partition
        for (int t = 0; t < seqSize; t++) {
            Sequence sequence = sequences.get(t);
            for (int s = 0; s < sequence.getLength(); s++) {
                //*** convert char into int ***//
                State state = sequence.getState(s);
                int stateNum = state.getIndex();
                // the taxon index in List should be same to Taxon[] taxonArray in Alignment
                nexusData.setState(t, s, stateNum);
            }
        }

        return nexusData;
    }

    // Extract data type from 'DATATYPE' block, and convert into {@link SequenceType}.
    private void readDataBlockHeader(String tokenToLookFor, NexusBlock block) throws ImportException, IOException {

        boolean foundDimensions = false, foundTitle = false, foundFormat = false;
        String token;

        do {
            token = helper.readToken(); //TODO read comments after MATRIX, but readToken() skips comments

            if (token.equalsIgnoreCase("TITLE")) {
                if (foundTitle) {
                    throw new ImportException.DuplicateFieldException("TITLE");
                }

                foundTitle = true;
            } else if (token.equalsIgnoreCase("DIMENSIONS")) {

                if (foundDimensions) {
                    throw new ImportException.DuplicateFieldException("DIMENSIONS");
                }

                boolean nchar = (block == NexusBlock.TAXA);
                boolean ntax = (block == NexusBlock.CHARACTERS);

                do {
                    String token2 = helper.readToken("=;");

                    if (helper.getLastDelimiter() != '=') {
                        throw new ImportException.BadFormatException("Unknown subcommand, '" + token2 + "', or missing '=' in DIMENSIONS command");
                    }

                    if (token2.equalsIgnoreCase("NTAX")) {

                        if (block == NexusBlock.CHARACTERS) {
                            throw new ImportException.BadFormatException("NTAX subcommand in CHARACTERS block");
                        }

                        taxonCount = helper.readInteger(";");
                        ntax = true;

                    } else if (token2.equalsIgnoreCase("NCHAR")) {

                        if (block == NexusBlock.TAXA) {
                            throw new ImportException.BadFormatException("NCHAR subcommand in TAXA block");
                        }

                        siteCount = helper.readInteger(";");
                        nchar = true;

                    } else {
                        throw new ImportException.BadFormatException("Unknown subcommand, '" + token2 + "', in DIMENSIONS command");
                    }

                } while (helper.getLastDelimiter() != ';');

                if (!ntax) {
                    throw new ImportException.BadFormatException("NTAX subcommand missing from DIMENSIONS command");
                }
                if (!nchar) {
                    throw new ImportException.BadFormatException("NCHAR subcommand missing from DIMENSIONS command");
                }
                foundDimensions = true;

            } else if (token.equalsIgnoreCase("FORMAT")) {

                if (foundFormat) {
                    throw new ImportException.DuplicateFieldException("FORMAT");
                }

                sequenceType = null;

                do {
                    String token2 = helper.readToken("=;");

                    if (token2.equalsIgnoreCase("GAP")) {

                        if (helper.getLastDelimiter() != '=') {
                            throw new ImportException.BadFormatException("Expecting '=' after GAP subcommand in FORMAT command");
                        }

                        gapCharacters = helper.readToken(";");

                    } else if (token2.equalsIgnoreCase("MISSING")) {

                        if (helper.getLastDelimiter() != '=') {
                            throw new ImportException.BadFormatException("Expecting '=' after MISSING subcommand in FORMAT command");
                        }

                        missingCharacters = helper.readToken(";");

                    } else if (token2.equalsIgnoreCase("MATCHCHAR")) {

                        if (helper.getLastDelimiter() != '=') {
                            throw new ImportException.BadFormatException("Expecting '=' after MATCHCHAR subcommand in FORMAT command");
                        }

                        matchCharacters = helper.readToken(";");

                    } else if (token2.equalsIgnoreCase("DATATYPE")) {

                        if (helper.getLastDelimiter() != '=') {
                            throw new ImportException.BadFormatException("Expecting '=' after DATATYPE subcommand in FORMAT command");
                        }

                        String token3 = helper.readToken(";");

                        try {
                            // add new data type to this method
                            sequenceType = SequenceTypeBaseImpl.getDataType(token3);
                            if (sequenceType == null)
                                throw new RuntimeException("Cannot find the sequence type ! " + token3 +
                                        " does not exist in " + SequenceTypeBaseImpl.getDataTypeList());

                        } catch (UnsupportedOperationException e) {
                            throw new ImportException.UnparsableDataException(e.getMessage());
                        }

                    } else if (token2.equalsIgnoreCase("INTERLEAVE")) {
                        isInterleaved = true;
                    }

                } while (helper.getLastDelimiter() != ';');

                foundFormat = true;
            }
        } while (!token.equalsIgnoreCase(tokenToLookFor));

        if (!foundDimensions) {
            throw new ImportException.MissingFieldException("DIMENSIONS");
        }
        if (block != NexusBlock.TAXA && sequenceType == null) {
            throw new ImportException.MissingFieldException("DATATYPE. Only Nucleotide or Protein sequences are supported.");
        }

    }

    //****** Sequences ******//

    protected List readSequenceData(List taxonList) throws ImportException, IOException {
        boolean sequencherStyle = false;
        String firstSequence = null;
        List sequences = new ArrayList<>();

        if (isInterleaved) {
            List sequencesData = new ArrayList<>(taxonCount);
            List taxons = new ArrayList<>();
            List taxList = (taxonList != null) ? taxonList : taxons;

            int[] charsRead = new int[taxonCount];
            for (int i = 0; i < taxonCount; i++) {
                sequencesData.add(new StringBuilder());
                charsRead[i] = 0;
            }
            //throw new ImportException.UnparsableDataException("At present, interleaved data is not parsable");
            boolean firstLoop = true;

            int readCount = 0;
            while (readCount < siteCount * taxonCount) {

                for (int i = 0; i < taxonCount; i++) {

                    String token = helper.readToken();

                    int sequenceIndex;
                    Taxon taxon = Taxon.getTaxon(token);
                    if (firstLoop) {
                        if (taxonList != null) {
                            sequenceIndex = taxonList.indexOf(taxon);
                        } else {
                            sequenceIndex = taxons.size();
                            taxons.add(taxon);
                        }
                    } else {
                        sequenceIndex = taxList.indexOf(taxon);
                    }

                    if (sequenceIndex < 0) {
                        // taxon not found in taxon list...
                        // ...perhaps it is a numerical taxon reference?
                        throw new ImportException.UnknownTaxonException("Unexpected taxon:" + token
                                + " (expecting " + taxList.get(i).getName() + ")");
                    }

                    StringBuffer buffer = new StringBuffer();

                    helper.readSequenceLine(buffer, sequenceType, ";", gapCharacters, missingCharacters,
                            matchCharacters, firstSequence);

                    String seqString = buffer.toString();

                    // We now check if this file is in Sequencher* style NEXUS, this style has the taxon and site counts
                    // before the sequence data.
                    try {
                        if (firstLoop && Integer.parseInt(taxon.toString()) == taxonCount &&
                                Integer.parseInt(seqString) == siteCount) {
                            i--;
                            taxons.remove(taxon);
                            sequencherStyle = true;
                            continue;
                        }
                    } catch (NumberFormatException e) {
                        // Do nothing, this just means that this is the NEXUS format we usually expect rather than sequencher
                    }

                    readCount += seqString.length();
                    charsRead[sequenceIndex] += seqString.length();

                    sequencesData.get(sequenceIndex).append(seqString);
                    if (i == 0) {
                        firstSequence = seqString;
                    }

                    if (helper.getLastDelimiter() == ';') {
                        if (i < taxonCount - 1) {
                            throw new ImportException.TooFewTaxaException();
                        }
                        for (int k = 0; k < taxonCount; k++) {
                            if (charsRead[k] != siteCount) {
                                throw new ImportException.ShortSequenceException(taxList.get(k).getName()
                                        + " has length " + charsRead[k] + ", expecting " + siteCount);
                            }
                        }
                    }
                }

                firstLoop = false;
            }

            // Sequencher style apparently doesnt use a ';' after the sequence data.
            if (!sequencherStyle && helper.getLastDelimiter() != ';') {
                throw new ImportException.BadFormatException("Expecting ';' after sequences data");
            }

            for (int k = 0; k < taxonCount; k++) {
                Sequence sequence = new BasicSequence(sequenceType, taxList.get(k), sequencesData.get(k));
                sequences.add(sequence);
            }

        } else {

            for (int i = 0; i < taxonCount; i++) {
                String token = helper.readToken();

                Taxon taxon = Taxon.getTaxon(token);

                if (taxonList != null && !taxonList.contains(taxon)) {
                    // taxon not found in taxon list...
                    // ...perhaps it is a numerical taxon reference?
                    StringBuilder message = new StringBuilder("Expected: ").append(token).append("\nActual taxa:\n");
                    for (Taxon taxon1 : taxonList) {
                        message.append(taxon1).append("\n");
                    }
                    throw new ImportException.UnknownTaxonException(message.toString());
                }

                StringBuilder buffer = new StringBuilder();
                helper.readSequence(buffer, sequenceType, ";", siteCount, gapCharacters,
                        missingCharacters, matchCharacters, firstSequence, true);
                String seqString = buffer.toString();

                if (seqString.length() != siteCount) {
                    throw new ImportException.ShortSequenceException(taxon.getName()
                            + " has length " + seqString.length() + ", expecting " + siteCount);
                }

                if (i == 0) {
                    firstSequence = seqString;
                }

                if (helper.getLastDelimiter() == ';' && i < taxonCount - 1) {
                    throw new ImportException.TooFewTaxaException();
                }

                Sequence sequence = new BasicSequence(sequenceType, taxon, seqString);
                sequences.add(sequence);
            }

            if (helper.getLastDelimiter() != ';') {
                throw new ImportException.BadFormatException("Expecting ';' after sequences data");
            }

        }

        return sequences;
    }

    //****** CALIBRATION Block : TIPCALIBRATION ******//

    protected void readCalibrationBlock(MetaDataAlignment nexusData, String ageDirectionStr) throws ImportException, IOException {

        String token;
        do {
            token = helper.readToken(";");
            if (token.equalsIgnoreCase("OPTIONS")) {
                String token2 = helper.readToken("=");
                if (token2.equalsIgnoreCase("SCALE")) {
                    String scale = helper.readToken(";");
                    if (scale.toLowerCase().endsWith("s"))
                        scale = scale.substring(0, scale.length() - 1);

                    ChronoUnit chronoUnit;
                    switch (scale) {
                        case "year":
                            chronoUnit = ChronoUnit.YEARS;
                            break;
//                        case "month":
//                            chronoUnit = ChronoUnit.MONTHS; break;
//                        case "day":
//                            chronoUnit = ChronoUnit.DAYS; break;
                        default:
                            throw new UnsupportedOperationException("Unsupported scale = " + scale);
                    }

                    nexusData.setChronoUnit(chronoUnit);
                }

            } else if (token.equalsIgnoreCase("TIPCALIBRATION")) {

                if (nexusData.getChronoUnit() == null) // TODO is it necessary?
                    throw new ImportException("Cannot find SCALE unit, e.g. year");

                // 94 = 1994:D4ElSal94, // 86 = 1986:D4PRico86,
                Map ageMap = new LinkedHashMap<>();
                do {
                    String date = null;
                    String taxonNm = null;
                    int lastDelimiter;
                    do {
                        String token2 = helper.readToken(":=,;");

                        if (helper.getLastDelimiter() != '=') { // ignore date's labels, e.g. 94 =
                            if (helper.getLastDelimiter() == ':')
                                date = token2;
                            else
                                taxonNm = token2;
                        }

                        lastDelimiter = helper.getLastDelimiter();
                        if (date != null && taxonNm != null) {
                            // put inside loop for same date, 1984:D4Mexico84 D4Philip84 D4Thai84,
                            ageMap.put(taxonNm, date);
                        } else if (lastDelimiter == ',' || lastDelimiter == ';') throw new ImportException();

                    } while (lastDelimiter != ',' && lastDelimiter != ';');
                    // next date mapping
                } while (helper.getLastDelimiter() != ';');

                if (ageMap.size() < 1)
                    throw new ImportException("Cannot parse TIPCALIBRATION !");
                if (ageMap.size() != taxonCount)
                    System.err.println("Warning: " + ageMap.size() +
                            " tips have dates, but taxon count = " + taxonCount);

                // store into AbstractNexusData
                nexusData.assignAges(ageMap, ageDirectionStr);

            } // end if else

        } while (isNotEnd(token));

        //validation ?
    }

    //****** ASSUMPTIONS Block : charset ******//

    /**
     * begin assumptions;
     * charset coding = 2-457 660-896;
     * charset noncoding = 1 458-659 897-898;
     * end;
     */
    protected void readAssumptionsBlock(MetaDataAlignment nexusData) throws ImportException, IOException {

        Map> charsetMap = new TreeMap<>();
        String token;
        do {
            token = helper.readToken(";");

            if (token.equalsIgnoreCase("CHARSET")) {
                String charset = helper.readToken("=");
                List charSetBlocks = new ArrayList<>();
                do {
                    String oneBlock = helper.readToken(";");
                    try {
                        CharSetBlock charSetBlock = CharSetBlock.Utils.parseCharSet(oneBlock);
                        charSetBlocks.add(charSetBlock);
                    } catch (IllegalArgumentException e) {
                        throw new ImportException("Charset " + charset + " : " + e.getMessage());
                    }
                } while (helper.getLastDelimiter() != ';');

                charsetMap.put(charset, charSetBlocks);
            }
        } while (isNotEnd(token));
        //validation ?

        // store into AbstractNexusData, and then handle in SimpleAlignment.Utils.getCharSetAlignment
        nexusData.setCharsetMap(charsetMap);
    }

    //******TODO ContinuousCharacterData ******//

    // rows are taxa, cols are traits.
    // Double[][] taxa should have same order of Taxon[].
    // TODO return MetaDataAlignment
    private ContinuousCharacterData readContinuousCharacterData() throws ImportException, IOException {
        assert taxonCount > 0 && siteCount > 0;
        Double[][] continuousData = new Double[taxonCount][siteCount];
        lphy.base.evolution.Taxon[] taxa = new lphy.base.evolution.Taxon[taxonCount];

        if (isInterleaved) {

            throw new UnsupportedOperationException("in dev");

        } else {

            for (int i = 0; i < taxonCount; i++) {
                // 1st col is taxon name
                String token = helper.readToken();
                taxa[i] = new lphy.base.evolution.Taxon(token);

                // from 2nd col is traits, must be double
                for (int j = 0; j < siteCount; j++) {
                    token = helper.readToken();

                    try {
                        continuousData[i][j] = Double.parseDouble(token);
                    } catch (NumberFormatException ex) {
                        // not enough columns
                        if (j < siteCount - 1)
                            throw new ImportException.ShortSequenceException(taxa[i].getName()
                                    + " has " + j + " traits, expecting " + siteCount);
                        else
                            throw new ImportException.BadFormatException("Double value is expected " +
                                    "for continuous data at taxon " + i + " trait " + j);
                    }

                } // end j loop

                // not enough
                if (helper.getLastDelimiter() == ';' && i < taxonCount - 1)
                    throw new ImportException.TooFewTaxaException(Integer.toString(i+1));
            } // end i loop

        }
        String token = helper.readToken(";");
        if (helper.getLastDelimiter() != ';') {
            throw new ImportException.BadFormatException("Expecting ';' after continuous data\n" +
                    helper.getLastDelimiter());
        }

        return new ContinuousCharacterData(new Taxa.Simple(taxa), continuousData);
    }



    //****** NexusBlock ******//

    /* Not javadoc
     * TODO make it extendable
     * interface NexusBlockImp{ public NexusBlock findNextBlock(); }
     * enum NexusBlock implements NexusBlockImp{ TAXA, ..., DATA; }
     * // or T extends Enum
     * class NexusImporterDefault {
     * protected T enumNexusBlock;
     * protected NexusImporterDefault(T block){
     * this.block = block;
     * } }
     * class NexusParser extends NexusImporterDefault{
     * public NexusParser(NexusBlock block){
     * super(block);
     * } }
     * protected not private ...
     */
    public enum NexusBlock {
        UNKNOWN,
        TAXA,
        CHARACTERS,
        DATA,
        ASSUMPTIONS, // new
        CALIBRATION, // new
        UNALIGNED,
        DISTANCES,
        TREES
    }

    public NexusBlock findNextBlock() throws IOException {
        findToken("BEGIN", true);
        nextBlockName = helper.readToken(";").toUpperCase();
        return findBlockName(nextBlockName);
    }

    protected NexusBlock findBlockName(String blockName) {
        try {
            nextBlock = NexusBlock.valueOf(blockName);
        } catch (IllegalArgumentException e) {
            // handle unknown blocks. java 1.5 throws an exception in valueOf
            nextBlock = null;
        }

        if (nextBlock == null) {
            nextBlock = NexusBlock.UNKNOWN;
        }

        return nextBlock;
    }

    /**
     * Read ahead to the end of the current block.
     */
    public void findEndBlock() throws IOException
    {
        try {
            String token;

            do {
                token = helper.readToken(";");
            } while ( !token.equalsIgnoreCase("END") && !token.equalsIgnoreCase("ENDBLOCK") );
        } catch (EOFException e) {
            // Doesn't matter if the End is missing
        }

        nextBlock = NexusBlock.UNKNOWN;
    }


    protected boolean isNotEnd(String token) {
        return !token.equalsIgnoreCase("END") && !token.equalsIgnoreCase("ENDBLOCK");
    }

    //****** parser ******//

    private void findToken(String query, boolean ignoreCase) throws IOException {
        String token;
        boolean found = false;

        do {
            token = helper.readToken();

            if ((ignoreCase && token.equalsIgnoreCase(query)) || token.equals(query)) {
                found = true;
            }
        } while (!found);
    }

    static void parseAndClearMetaComments(Attributable item, ImportHelper importHelper) throws ImportException.BadFormatException {
        for (String meta : importHelper.getMetaComments()) {
            // A meta-comment which should be in the form:
            // \[&label[=value][,label[=value]>[,/..]]\]
            parseMetaCommentPairs(meta, item);

        }
        importHelper.clearLastMetaComment();
    }

    static void parseMetaCommentPairs(String meta, Attributable item) throws ImportException.BadFormatException {
        // This regex should match key=value pairs, separated by commas
        // This can match the following types of meta comment pairs:
        // value=number, value="string", value={item1, item2, item3}
        // (label must be quoted if it contains spaces (i.e. "my label"=label)

//        Pattern pattern = Pattern.compile("(\"[^\"]*\"+|[^,=\\s]+)\\s*(=\\s*(\\{[^=}]*\\}|\"[^\"]*\"+|[^,]+))?");
        Pattern pattern = Pattern.compile("(\"[^\"]*\"+|[^,=\\s]+)\\s*(=\\s*(\\{(\\{[^\\}]+\\},?)+\\}|\\{[^\\}]+\\}|\"[^\"]*\"+|[^,]+))?");
        Matcher matcher = pattern.matcher(meta);

        while (matcher.find()) {
            String label = matcher.group(1);
            if (label.charAt(0) == '\"') {
                label = label.substring(1, label.length() - 1);
            }
            if (label == null || label.trim().length() == 0) {
                throw new ImportException.BadFormatException("Badly formatted attribute: '" + matcher.group() + "'");
            }
            final String value = matcher.group(2);
            if (value != null && value.trim().length() > 0) {
                // there is a specified value so try to parse it
                item.setAttribute(label, parseValue(value.substring(1)));
            } else {
                item.setAttribute(label, Boolean.TRUE);
            }
        }
    }

    static Object parseValue(String value) {

        value = value.trim();

        if (value.startsWith("{")) {
            value = value.substring(1, value.length() - 1);

            String[] elements;

            if (value.startsWith("{")) {
                // the value is a list of a list so recursively parse the elements
                // and return an array

                // need to match },{ but leave the brackets in place
                value = value.replaceAll("\\},\\{", "}@,@{");
                elements = value.split("@,@");

            } else {
                // the value is a list so recursively parse the elements
                // and return an array
                elements = value.split(",");
            }
            Object[] values = new Object[elements.length];
            for (int i = 0; i < elements.length; i++) {
                values[i] = parseValue(elements[i]);
            }
            return values;
        }

        if (value.startsWith("#")) {
            // I am not sure whether this is a good idea but
            // I am going to assume that a # denotes an RGB colour
            String colourValue = value.substring(1);
            if (colourValue.startsWith("-")) {
                // old style decimal numbers
                try {
                    return Color.decode(colourValue);
                } catch (NumberFormatException nfe1) {
                    // not a colour
                }
            } else {
                return Color.decode("0x" + colourValue);
            }
        }

        // A string qouted by the nexus exporter and such
        if (value.startsWith("\"") && value.endsWith("\"")) {
            return value.subSequence(1, value.length() - 1);
        }

        if (value.equalsIgnoreCase("TRUE") || value.equalsIgnoreCase("FALSE")) {
            return Boolean.valueOf(value);
        }

        // Attempt to format the value as an integer
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException nfe1) {
            // not an integer
        }

        // Attempt to format the value as a double
        try {
            return Double.parseDouble(value);
        } catch (NumberFormatException nfe2) {
            // not a double
        }

        // return the trimmed string
        return value;
    }


    private List readTaxaBlock() throws ImportException, IOException {
        taxonCount = 0;

        readDataBlockHeader("TAXLABELS", NexusBlock.TAXA);

        if (taxonCount == 0) {
            throw new ImportException.MissingFieldException("NTAXA");
        }

        List taxa = new ArrayList<>();

        do {
            String name = helper.readToken(";");
            if (name.equals("")) {
                throw new ImportException.UnknownTaxonException("Expected nonempty taxon name, got empty string");
            }
            Taxon taxon = Taxon.getTaxon(name);
            taxa.add(taxon);

            parseAndClearMetaComments(taxon, helper);
        } while (helper.getLastDelimiter() != ';');

        if (taxa.size() != taxonCount) {
            throw new ImportException.BadFormatException("Number of taxa doesn't match NTAXA field");
        }

        findEndBlock();

        return taxa;
    }


}