All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.dna.map.GenomeSequenceBuilder Maven / Gradle / Ivy

Go to download

TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.

The newest version!
/**
 * 
 */
package net.maizegenetics.dna.map;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.LongAdder;
import java.util.function.Function;
import java.util.stream.Collectors;

import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;

import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.util.GeneralAnnotation;
import net.maizegenetics.util.GeneralAnnotationStorage;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;


/**
 * Builder for a chromosome genome sequence that hides the ReferenceGenomeSequence implementation.
 * The sequence is read from a fasta file and stored in a byte array, 2 alleles packed per byte.
 * 
 * @author Lynn Johnson
 *
 */
public class GenomeSequenceBuilder {
    private static final Logger myLogger = LogManager.getLogger(GenomeSequenceBuilder.class);

    /**
     * Builds GenomeSequence from a fasta file.
     * @param fastaFileName full path to fasta file
     * @return GenomeSequence object
     */
    public static GenomeSequence instance(String fastaFileName) {
        Function charConversion= (c) -> c;
        return instance(fastaFileName,charConversion);
    }

    /**
     * Builds GenomeSequence from a fasta file.  The char conversion provide a mechanism to convert upper and lower case
     * or convert one case to N.  This is useful if a case if used to define a certain class of bases
     * @param fastaFileName full path to fasta file
     * @param charConversion lambda Function to convert characters
     * @return GenomeSequence object
     */
    public static GenomeSequence instance(String fastaFileName, Function charConversion) {
        Map chromPositionMap = readReferenceGenomeChr(fastaFileName, charConversion);
        return new HalfByteGenomeSequence(chromPositionMap);
    }

    /**
     * Builds GenomeSequence from a String with one Chromosome.
     * @param chromosome chromosome object
     * @param sequence in upper case character
     * @return GenomeSequence object
     */
    public static GenomeSequence instance(Chromosome chromosome, String sequence) {
        Function charConversion= (c) -> c;
        Map chromPositionMap = new HashMap<>();
        Chromosome currChr=Chromosome.instance(chromosome.getName(),sequence.length(),chromosome.getAnnotation());
        chromPositionMap.put(currChr,halfByteCompression(sequence.getBytes(),charConversion));
        return new HalfByteGenomeSequence(chromPositionMap);
    }

    protected static  Map readReferenceGenomeChr(String fastaFileName, Function charConversion) {
        // Read specified file, return entire sequence for requested chromosome
        String base="ACGTNacgtn";
        String conv=base.chars().mapToObj(ci -> charConversion.apply((char)ci).toString()).collect(Collectors.joining());
        System.out.println("Genome FASTA character conversion: "+base+" to "+conv);
        Map chromPositionMap = new HashMap();
        Chromosome currChr = null;	
        ByteArrayOutputStream currSeq = new ByteArrayOutputStream();
        String line = null;
        try {
            boolean found = false;
            BufferedReader br = Utils.getBufferedReader(fastaFileName); // this takes care of .gz

            while ((line = br.readLine()) != null && !found) {
                line = line.trim();

                if (line.startsWith(">")) {
                    if (currChr != null) {
                        // end processing current chromosome sequence
                        currChr=Chromosome.instance(currChr.getName(),currSeq.size(),currChr.getAnnotation());
                        chromPositionMap.put(currChr, halfByteCompression(currSeq.toByteArray(),charConversion));
                    }
                    currChr = parseChromosome(line); 
                    currSeq = new ByteArrayOutputStream();
                } else {
                    currSeq.write(line.getBytes());
                }
            }
            // reached end of file - write last bytes
            if (currSeq.size() > 0) {
                currChr=Chromosome.instance(currChr.getName(),currSeq.size(),currChr.getAnnotation());
                chromPositionMap.put(currChr, halfByteCompression(currSeq.toByteArray(),charConversion));
            }
            br.close();
        } catch (IOException ioe) {
            System.out.println("ReferenceGenomeSequence: caught buffered read exception");
        }
        return chromPositionMap;
    }

    private static byte[] halfByteCompression(byte[] unpkSequence, Function charConversion){
        // Take byte array, turn bytes into NucleotideAlignmentConstant
        // allele values, store as half bytes
        int nBytes = (unpkSequence.length+1)/2;
        byte[] packedSequence = new byte[nBytes];
        for (int i = 0; i < unpkSequence.length; i++) {
            byte halfByte = NucleotideAlignmentConstants.getNucleotideAlleleByte(charConversion.apply((char)unpkSequence[i]));
            if(i%2==0) halfByte<<=4;
            packedSequence[i/2]|=halfByte;
        }
        return packedSequence;
    }

    private static  Chromosome parseChromosome (String chromString) {
        String chrS = chromString.replace(">","");
        chrS = chrS.toUpperCase();
        chrS = chrS.replace("CHROMOSOME", ""); 
        chrS = chrS.replace("CHR", ""); // keep chromosome string, minus any leading "chr" or "chromosome"
        GeneralAnnotation myAnnotations = null;

        String currChrDesc = null;
        int spaceIndex = chrS.indexOf(" ");
        if (spaceIndex > 0) {			
            currChrDesc = chrS.substring(chrS.indexOf(" ") + 1);
            myAnnotations = GeneralAnnotationStorage.getBuilder().addAnnotation("Description", currChrDesc).build();
            chrS = chrS.substring(0,chrS.indexOf(" "));
        } 
        return Chromosome.instance(chrS, -1, myAnnotations);
    }

}

/**
 * ReferenceGenomeSequence class.  This class is used to read chromosome sequences
 * from fasta files.  Data is stored as half-bytes packed into a byte array.
 * This byte array comprises the "value" for a hash map whose key is a
 * Chromosome object.
 *
 * The class also contains methods to obtain a full or partial genome sequence for a
 * specified stored chromosome.
 *
 * @author Lynn Johnson
 *
 */
class HalfByteGenomeSequence implements GenomeSequence{
    private Map chromPositionMap;
    private Map chromLengthLookup=new HashMap<>();
    private RangeMap wholeGenomeIndexMap= TreeRangeMap.create();
    private final long genomeSize;


    protected HalfByteGenomeSequence(MapchromPositionMap) {
        this.chromPositionMap = chromPositionMap;
        chromPositionMap.entrySet().stream()
        .forEach(e -> chromLengthLookup.put(e.getKey(),e.getKey().getLength()));
        LongAdder genomeIndex=new LongAdder();
        chromosomes().stream().sorted()
        .forEach(chrom -> {
            int length=chromLengthLookup.get(chrom);
            wholeGenomeIndexMap.put(Range.closed(genomeIndex.longValue(),
                    genomeIndex.longValue()+length-1),chrom);
            genomeIndex.add(length);}
                );
        genomeSize=genomeIndex.longValue();
    }
    @Override
    public Set chromosomes() {
        return chromPositionMap.keySet();
    }

    @Override
    public byte[] chromosomeSequence(Chromosome chrom) {
        return chromosomeSequence(chrom,1,chromLengthLookup.get(chrom));
    }

    @Override
    // Code assumes 1-based coordinates have been passed.  It will catch and return
    // null if the startSite is 0.  Otherwise, the user is on their own to ensure
    // input is 1-based.
    public byte[] chromosomeSequence(Chromosome chrom, int startSite, int lastSite) {
        startSite--;  //shift over to zero base
        lastSite--;   //shift over to zero base
        if (startSite < 0) throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: starting parameter is less than 1 for 1-based method");; // method needs 1-based coordinates.
        byte[] packedBytes = chromPositionMap.get(chrom);
        if (packedBytes == null) throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: chromosome not found"); // chromosome not found
        if (startSite > packedBytes.length*2 || lastSite > packedBytes.length*2 ) {
            System.out.println("GenomeSequenceBuilder, chrom: " + chrom.getName() + ", startSite: " 
              + startSite + ", lastSite: " + lastSite + ", chromSize: " + chromosomeSize(chrom)); 
            throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: requested sequence is out of range"); // requested sequence is out of range
        }
        byte[] fullBytes = new byte[lastSite - startSite + 1];
        for (int i = startSite; i <= lastSite; i++) {
            fullBytes[i - startSite] = (byte) ((i % 2 == 0) ? ((packedBytes[i / 2] & 0xF0) >> 4) : (packedBytes[i / 2] & 0x0F));
        }
        return fullBytes;
    }

    @Override
    public byte genotype(Chromosome chrom, int position) {
        position--; // shift to 0-based
        byte[] packedBytes = chromPositionMap.get(chrom);
        if (packedBytes == null) throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: chromosome not found"); 
        if (position > packedBytes.length*2 ) {
            System.out.println("GenomeSequenceBuilder, chrom: " + chrom.getName() + ", position: " 
              + position + ", chromSize: " + chromosomeSize(chrom)); 
            throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: requested position is out of range");            
        }
        byte alleleAtPosition = (byte) ((position % 2 == 0) ? ((packedBytes[position / 2] & 0xF0) >> 4) : (packedBytes[position / 2] & 0x0F));
        return alleleAtPosition;
    }
    
    @Override
    public byte genotype(Chromosome chrom, Position positionObject) {
        int position = positionObject.getPosition();
        position--; // shift to 0-based
        byte[] packedBytes = chromPositionMap.get(chrom);
        if (packedBytes == null) throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: chromosome not found"); 
        if (position > packedBytes.length*2 ) {
            System.out.println("GenomeSequenceBuilder, chrom: " + chrom.getName() + ", position: " 
              + position + ", chromSize: " + chromosomeSize(chrom)); 
            throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: requested position is out of range");            
        }
        byte valueAtPosition = (byte) ((position % 2 == 0) ? ((packedBytes[position / 2] & 0xF0) >> 4) : (packedBytes[position / 2] & 0x0F));
        return valueAtPosition;
    }
    
    @Override
    public String genotypeAsString(Chromosome chrom, int position) {
        // get the bytes, convert to string, return the string
        byte valueAtPosition = genotype(chrom,position);
        return NucleotideAlignmentConstants.getHaplotypeNucleotide(valueAtPosition);
    }
    
    @Override
    public String genotypeAsString(Chromosome chrom, Position positionObject) {
        // get the bytes, convert to string, return the string
        // position is flipped to 0 based in call to genotype(chrom,position)
        int position = positionObject.getPosition();
        byte valueAtPosition = genotype(chrom,position);
        return NucleotideAlignmentConstants.getHaplotypeNucleotide(valueAtPosition);
    }
    
    @Override
    public String genotypeAsString(Chromosome chrom, int startSite, int endSite) {
        // This method takes physical positions.  chromosomeSequence flips the start/end to 0 based. 
        byte[] chromBytes = chromosomeSequence( chrom,  startSite,  endSite);
        StringBuilder builder = new StringBuilder();
        for (int idx = 0; idx < chromBytes.length; idx++) {
            builder.append(NucleotideAlignmentConstants.getHaplotypeNucleotide(chromBytes[idx]));
        }
        return builder.toString();
    }
    
    @Override
    public byte[] genomeSequence(long startSite, long lastSite) {
        if(lastSite-startSite>Integer.MAX_VALUE) throw
            new IllegalArgumentException("Less than "+Integer.MAX_VALUE+" sites must be requested at a time");
        byte[] fullBytes=new byte[(int)(lastSite-startSite+1)];
        long currentSiteToGet=startSite;
        while(currentSiteToGet,Chromosome> rangeChromEntry=wholeGenomeIndexMap.getEntry(currentSiteToGet);
            int chrStart=(int)(currentSiteToGet-rangeChromEntry.getKey().lowerEndpoint());
            int chrLast=(int)Math.min(rangeChromEntry.getKey().upperEndpoint()-rangeChromEntry.getKey().lowerEndpoint(),lastSite-rangeChromEntry.getKey().lowerEndpoint());
            byte[] chromoSeq=chromosomeSequence(rangeChromEntry.getValue(), chrStart+1,chrLast+1);  //+1 for 0 based genome, 1 based chromosomes
            System.arraycopy(chromoSeq,0,fullBytes,(int)(currentSiteToGet-startSite),chromoSeq.length);
            currentSiteToGet+=chromoSeq.length;
        }
        return fullBytes;
    }

    @Override
    public int chromosomeSize(Chromosome chromosome) {
        return chromLengthLookup.get(chromosome);
    }

    @Override
    public long genomeSize() {
        return genomeSize;
    }

    @Override
    public int numberOfChromosomes() {
        return chromPositionMap.size();
    }
    
    @Override
    public Map> fullRefCoordinateToChromCoordinate(ArrayList coordinates) {
        // Returns 0-based value from Chromosome array (values are stored as 0-based) 
        Map> mappedCoordinates = new ConcurrentHashMap>();
        coordinates.stream().parallel().forEach(coordinate -> {
            Map.Entry,Chromosome> rangeChromEntry=wholeGenomeIndexMap.getEntry(coordinate);
            Chromosome curChrom = rangeChromEntry.getValue();
            long chromCoordinate = coordinate - rangeChromEntry.getKey().lowerEndpoint();
            Tuple chromWithCoordinate = new Tuple<>(curChrom, (int)chromCoordinate);
            mappedCoordinates.put(coordinate, chromWithCoordinate);
        });
        return mappedCoordinates;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy