net.maizegenetics.dna.map.GenomeSequenceBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tassel Show documentation
Show all versions of tassel Show documentation
TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage
disequilibrium.
The newest version!
/**
*
*/
package net.maizegenetics.dna.map;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.LongAdder;
import java.util.function.Function;
import java.util.stream.Collectors;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.util.GeneralAnnotation;
import net.maizegenetics.util.GeneralAnnotationStorage;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
* Builder for a chromosome genome sequence that hides the ReferenceGenomeSequence implementation.
* The sequence is read from a fasta file and stored in a byte array, 2 alleles packed per byte.
*
* @author Lynn Johnson
*
*/
public class GenomeSequenceBuilder {
private static final Logger myLogger = LogManager.getLogger(GenomeSequenceBuilder.class);
/**
* Builds GenomeSequence from a fasta file.
* @param fastaFileName full path to fasta file
* @return GenomeSequence object
*/
public static GenomeSequence instance(String fastaFileName) {
Function charConversion= (c) -> c;
return instance(fastaFileName,charConversion);
}
/**
* Builds GenomeSequence from a fasta file. The char conversion provide a mechanism to convert upper and lower case
* or convert one case to N. This is useful if a case if used to define a certain class of bases
* @param fastaFileName full path to fasta file
* @param charConversion lambda Function to convert characters
* @return GenomeSequence object
*/
public static GenomeSequence instance(String fastaFileName, Function charConversion) {
Map chromPositionMap = readReferenceGenomeChr(fastaFileName, charConversion);
return new HalfByteGenomeSequence(chromPositionMap);
}
/**
* Builds GenomeSequence from a String with one Chromosome.
* @param chromosome chromosome object
* @param sequence in upper case character
* @return GenomeSequence object
*/
public static GenomeSequence instance(Chromosome chromosome, String sequence) {
Function charConversion= (c) -> c;
Map chromPositionMap = new HashMap<>();
Chromosome currChr=Chromosome.instance(chromosome.getName(),sequence.length(),chromosome.getAnnotation());
chromPositionMap.put(currChr,halfByteCompression(sequence.getBytes(),charConversion));
return new HalfByteGenomeSequence(chromPositionMap);
}
protected static Map readReferenceGenomeChr(String fastaFileName, Function charConversion) {
// Read specified file, return entire sequence for requested chromosome
String base="ACGTNacgtn";
String conv=base.chars().mapToObj(ci -> charConversion.apply((char)ci).toString()).collect(Collectors.joining());
System.out.println("Genome FASTA character conversion: "+base+" to "+conv);
Map chromPositionMap = new HashMap();
Chromosome currChr = null;
ByteArrayOutputStream currSeq = new ByteArrayOutputStream();
String line = null;
try {
boolean found = false;
BufferedReader br = Utils.getBufferedReader(fastaFileName); // this takes care of .gz
while ((line = br.readLine()) != null && !found) {
line = line.trim();
if (line.startsWith(">")) {
if (currChr != null) {
// end processing current chromosome sequence
currChr=Chromosome.instance(currChr.getName(),currSeq.size(),currChr.getAnnotation());
chromPositionMap.put(currChr, halfByteCompression(currSeq.toByteArray(),charConversion));
}
currChr = parseChromosome(line);
currSeq = new ByteArrayOutputStream();
} else {
currSeq.write(line.getBytes());
}
}
// reached end of file - write last bytes
if (currSeq.size() > 0) {
currChr=Chromosome.instance(currChr.getName(),currSeq.size(),currChr.getAnnotation());
chromPositionMap.put(currChr, halfByteCompression(currSeq.toByteArray(),charConversion));
}
br.close();
} catch (IOException ioe) {
System.out.println("ReferenceGenomeSequence: caught buffered read exception");
}
return chromPositionMap;
}
private static byte[] halfByteCompression(byte[] unpkSequence, Function charConversion){
// Take byte array, turn bytes into NucleotideAlignmentConstant
// allele values, store as half bytes
int nBytes = (unpkSequence.length+1)/2;
byte[] packedSequence = new byte[nBytes];
for (int i = 0; i < unpkSequence.length; i++) {
byte halfByte = NucleotideAlignmentConstants.getNucleotideAlleleByte(charConversion.apply((char)unpkSequence[i]));
if(i%2==0) halfByte<<=4;
packedSequence[i/2]|=halfByte;
}
return packedSequence;
}
private static Chromosome parseChromosome (String chromString) {
String chrS = chromString.replace(">","");
chrS = chrS.toUpperCase();
chrS = chrS.replace("CHROMOSOME", "");
chrS = chrS.replace("CHR", ""); // keep chromosome string, minus any leading "chr" or "chromosome"
GeneralAnnotation myAnnotations = null;
String currChrDesc = null;
int spaceIndex = chrS.indexOf(" ");
if (spaceIndex > 0) {
currChrDesc = chrS.substring(chrS.indexOf(" ") + 1);
myAnnotations = GeneralAnnotationStorage.getBuilder().addAnnotation("Description", currChrDesc).build();
chrS = chrS.substring(0,chrS.indexOf(" "));
}
return Chromosome.instance(chrS, -1, myAnnotations);
}
}
/**
* ReferenceGenomeSequence class. This class is used to read chromosome sequences
* from fasta files. Data is stored as half-bytes packed into a byte array.
* This byte array comprises the "value" for a hash map whose key is a
* Chromosome object.
*
* The class also contains methods to obtain a full or partial genome sequence for a
* specified stored chromosome.
*
* @author Lynn Johnson
*
*/
class HalfByteGenomeSequence implements GenomeSequence{
private Map chromPositionMap;
private Map chromLengthLookup=new HashMap<>();
private RangeMap wholeGenomeIndexMap= TreeRangeMap.create();
private final long genomeSize;
protected HalfByteGenomeSequence(MapchromPositionMap) {
this.chromPositionMap = chromPositionMap;
chromPositionMap.entrySet().stream()
.forEach(e -> chromLengthLookup.put(e.getKey(),e.getKey().getLength()));
LongAdder genomeIndex=new LongAdder();
chromosomes().stream().sorted()
.forEach(chrom -> {
int length=chromLengthLookup.get(chrom);
wholeGenomeIndexMap.put(Range.closed(genomeIndex.longValue(),
genomeIndex.longValue()+length-1),chrom);
genomeIndex.add(length);}
);
genomeSize=genomeIndex.longValue();
}
@Override
public Set chromosomes() {
return chromPositionMap.keySet();
}
@Override
public byte[] chromosomeSequence(Chromosome chrom) {
return chromosomeSequence(chrom,1,chromLengthLookup.get(chrom));
}
@Override
// Code assumes 1-based coordinates have been passed. It will catch and return
// null if the startSite is 0. Otherwise, the user is on their own to ensure
// input is 1-based.
public byte[] chromosomeSequence(Chromosome chrom, int startSite, int lastSite) {
startSite--; //shift over to zero base
lastSite--; //shift over to zero base
if (startSite < 0) throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: starting parameter is less than 1 for 1-based method");; // method needs 1-based coordinates.
byte[] packedBytes = chromPositionMap.get(chrom);
if (packedBytes == null) throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: chromosome not found"); // chromosome not found
if (startSite > packedBytes.length*2 || lastSite > packedBytes.length*2 ) {
System.out.println("GenomeSequenceBuilder, chrom: " + chrom.getName() + ", startSite: "
+ startSite + ", lastSite: " + lastSite + ", chromSize: " + chromosomeSize(chrom));
throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: requested sequence is out of range"); // requested sequence is out of range
}
byte[] fullBytes = new byte[lastSite - startSite + 1];
for (int i = startSite; i <= lastSite; i++) {
fullBytes[i - startSite] = (byte) ((i % 2 == 0) ? ((packedBytes[i / 2] & 0xF0) >> 4) : (packedBytes[i / 2] & 0x0F));
}
return fullBytes;
}
@Override
public byte genotype(Chromosome chrom, int position) {
position--; // shift to 0-based
byte[] packedBytes = chromPositionMap.get(chrom);
if (packedBytes == null) throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: chromosome not found");
if (position > packedBytes.length*2 ) {
System.out.println("GenomeSequenceBuilder, chrom: " + chrom.getName() + ", position: "
+ position + ", chromSize: " + chromosomeSize(chrom));
throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: requested position is out of range");
}
byte alleleAtPosition = (byte) ((position % 2 == 0) ? ((packedBytes[position / 2] & 0xF0) >> 4) : (packedBytes[position / 2] & 0x0F));
return alleleAtPosition;
}
@Override
public byte genotype(Chromosome chrom, Position positionObject) {
int position = positionObject.getPosition();
position--; // shift to 0-based
byte[] packedBytes = chromPositionMap.get(chrom);
if (packedBytes == null) throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: chromosome not found");
if (position > packedBytes.length*2 ) {
System.out.println("GenomeSequenceBuilder, chrom: " + chrom.getName() + ", position: "
+ position + ", chromSize: " + chromosomeSize(chrom));
throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: requested position is out of range");
}
byte valueAtPosition = (byte) ((position % 2 == 0) ? ((packedBytes[position / 2] & 0xF0) >> 4) : (packedBytes[position / 2] & 0x0F));
return valueAtPosition;
}
@Override
public String genotypeAsString(Chromosome chrom, int position) {
// get the bytes, convert to string, return the string
byte valueAtPosition = genotype(chrom,position);
return NucleotideAlignmentConstants.getHaplotypeNucleotide(valueAtPosition);
}
@Override
public String genotypeAsString(Chromosome chrom, Position positionObject) {
// get the bytes, convert to string, return the string
// position is flipped to 0 based in call to genotype(chrom,position)
int position = positionObject.getPosition();
byte valueAtPosition = genotype(chrom,position);
return NucleotideAlignmentConstants.getHaplotypeNucleotide(valueAtPosition);
}
@Override
public String genotypeAsString(Chromosome chrom, int startSite, int endSite) {
// This method takes physical positions. chromosomeSequence flips the start/end to 0 based.
byte[] chromBytes = chromosomeSequence( chrom, startSite, endSite);
StringBuilder builder = new StringBuilder();
for (int idx = 0; idx < chromBytes.length; idx++) {
builder.append(NucleotideAlignmentConstants.getHaplotypeNucleotide(chromBytes[idx]));
}
return builder.toString();
}
@Override
public byte[] genomeSequence(long startSite, long lastSite) {
if(lastSite-startSite>Integer.MAX_VALUE) throw
new IllegalArgumentException("Less than "+Integer.MAX_VALUE+" sites must be requested at a time");
byte[] fullBytes=new byte[(int)(lastSite-startSite+1)];
long currentSiteToGet=startSite;
while(currentSiteToGet,Chromosome> rangeChromEntry=wholeGenomeIndexMap.getEntry(currentSiteToGet);
int chrStart=(int)(currentSiteToGet-rangeChromEntry.getKey().lowerEndpoint());
int chrLast=(int)Math.min(rangeChromEntry.getKey().upperEndpoint()-rangeChromEntry.getKey().lowerEndpoint(),lastSite-rangeChromEntry.getKey().lowerEndpoint());
byte[] chromoSeq=chromosomeSequence(rangeChromEntry.getValue(), chrStart+1,chrLast+1); //+1 for 0 based genome, 1 based chromosomes
System.arraycopy(chromoSeq,0,fullBytes,(int)(currentSiteToGet-startSite),chromoSeq.length);
currentSiteToGet+=chromoSeq.length;
}
return fullBytes;
}
@Override
public int chromosomeSize(Chromosome chromosome) {
return chromLengthLookup.get(chromosome);
}
@Override
public long genomeSize() {
return genomeSize;
}
@Override
public int numberOfChromosomes() {
return chromPositionMap.size();
}
@Override
public Map> fullRefCoordinateToChromCoordinate(ArrayList coordinates) {
// Returns 0-based value from Chromosome array (values are stored as 0-based)
Map> mappedCoordinates = new ConcurrentHashMap>();
coordinates.stream().parallel().forEach(coordinate -> {
Map.Entry,Chromosome> rangeChromEntry=wholeGenomeIndexMap.getEntry(coordinate);
Chromosome curChrom = rangeChromEntry.getValue();
long chromCoordinate = coordinate - rangeChromEntry.getKey().lowerEndpoint();
Tuple chromWithCoordinate = new Tuple<>(curChrom, (int)chromCoordinate);
mappedCoordinates.put(coordinate, chromWithCoordinate);
});
return mappedCoordinates;
}
}