All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.dna.snp.ReadSequenceAlignmentUtils Maven / Gradle / Ivy

// ReadSequenceAlignmentUtils.java
//
// (c) 1999-2001 PAL Development Core Team
//
// This package may be distributed under the
// terms of the Lesser GNU General Public License (LGPL)
package net.maizegenetics.dna.snp;

import net.maizegenetics.util.FormattedInput;
import net.maizegenetics.util.InputSource;
import net.maizegenetics.taxa.TaxaList;

import java.io.IOException;
import java.io.PushbackReader;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTable;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTableBuilder;
import net.maizegenetics.taxa.TaxaListBuilder;

/**
 * reads aligned sequence data from plain text files.

* * recognizes PHYLIP 3.4 INTERLEAVED, PHYLIP SEQUENTIAL, CLUSTAL and derived * formats.

* * Other features: - the dot as "copy character" is recognized, - all base * characters are capitalized, - automatic data type estimation - determination * of corresponding base frequencies. * * @version $Id: ReadSequenceAlignmentUtils.java,v 1.1 2009/07/23 19:39:41 * tcasstevens Exp $ * * @author Korbinian Strimmer * @author Alexei Drummond */ public class ReadSequenceAlignmentUtils { private ReadSequenceAlignmentUtils() { // to prevent instantiation of this utility class. } /** * read from stream, while letting the user set the maximum label (taxa) * label length as non-standard phylip formats now exist */ public static GenotypeTable readBasicAlignments(PushbackReader input, int maxLabelLength) throws IOException { GenotypeTable saa = null; saa = readFile(input, maxLabelLength); return saa; } /** * read from file, while letting the user set the maximum label (taxa) label * length as non-standard phylip formats now exist */ public static GenotypeTable readBasicAlignments(String file, int maxLabelLength) throws IOException { PushbackReader input = InputSource.openFile(file); GenotypeTable saa = readBasicAlignments(input, maxLabelLength); input.close(); return saa; } private static GenotypeTable readFile(PushbackReader in, int maxLabelLength) throws IOException { return readPHYLIP(in, maxLabelLength); } // Read alignment (in PHYLIP 3.4 INTERLEAVED or PHYLIP SEQUENTIAL format) private static GenotypeTable readPHYLIP(PushbackReader in, int maxLabelLength) { FormattedInput fi = FormattedInput.getInstance(); TaxaList idGroup; int numSeqs = 0, numSites = 0, lineLength = 0; char[][] data = null; int c, pos = 0, seq = 0; try { // Parse PHYLIP header line numSeqs = fi.readInt(in); numSites = fi.readInt(in); String[] identifiers = new String[numSeqs]; data = new char[numSeqs][numSites]; // Determine whether sequences are in INTERLEAVED // or in sequential format String header = fi.readLine(in, false); boolean interleaved = true; if (header.length() > 0) { if (header.charAt(0) == 'S') { interleaved = false; } } if (interleaved) // PHYLIP INTERLEAVED { // Reading data while (pos < numSites) { // Go to next block c = fi.readNextChar(in); in.unread(c); for (seq = 0; seq < numSeqs; seq++) { lineLength = readSeqLineP(in, seq, pos, numSites, data, identifiers, fi, maxLabelLength, lineLength); } pos += lineLength; } } else // PHYLIP SEQUENTIAL { //System.out.println("PHYLIP SEQUENTIAL"); for (seq = 0; seq < numSeqs; seq++) { // Go to next block c = fi.readNextChar(in); in.unread(c); // Read label identifiers[seq] = fi.readLabel(in, maxLabelLength).toUpperCase(); // Read sequences for (pos = 0; pos < numSites; pos++) { data[seq][pos] = (char) fi.readNextChar(in); if (data[0][pos] == '.') { if (seq == 0) { throw new IllegalArgumentException( "Copy character (.) in first sequence not allowed (pos. " + (pos + 1) + ")"); } else { data[seq][pos] = data[0][pos]; } } } } } idGroup = new TaxaListBuilder().addAll(identifiers).build(); } catch (IOException e) { throw new IllegalArgumentException("IO error after pos. " + (pos + 1) + ", seq. " + (seq + 1)); } String[] s = new String[numSeqs]; for (int i = 0; i < numSeqs; i++) { s[i] = (new String(data[i])).toUpperCase(); } // SimpleAlignment saa = SimpleAlignment.getInstance(idGroup, s, new Nucleotides()); String[] sites = new String[numSites]; int[] positions = new int[numSites]; for (int i = 0; i < numSites; i++) { positions[i] = i; sites[i] = Integer.toString(i); } GenotypeCallTable genotype = GenotypeCallTableBuilder.getUnphasedNucleotideGenotypeBuilder(numSeqs, numSites) .setBases(s) .build(); return GenotypeTableBuilder.getInstance(genotype, PositionListBuilder.getInstance(numSites), idGroup); } private static int readSeqLineP(PushbackReader in, int s, int pos, int maxPos, char[][] data, String[] identifiers, FormattedInput fi, int maxLabelLength, int lineLength) throws IOException { if (pos == 0) { identifiers[s] = fi.readLabel(in, maxLabelLength).toUpperCase(); } if (s == 0) { String thisLine = fi.readLine(in, false); if (thisLine.length() > maxPos - pos) { lineLength = maxPos - pos; } else { lineLength = thisLine.length(); } for (int i = 0; i < lineLength; i++) { data[0][pos + i] = thisLine.charAt(i); if (data[0][pos + i] == '.') { throw new IllegalArgumentException("Copy character (.) in first sequence not allowed (pos. " + (i + pos + 1) + ")"); } } } else { for (int i = 0; i < lineLength; i++) { data[s][pos + i] = (char) fi.readNextChar(in); if (data[s][pos + i] == '.') { data[s][pos + i] = data[0][pos + i]; } } fi.nextLine(in); } return lineLength; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy