All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.nbio.genome.GeneFeatureHelper Maven / Gradle / Ivy

/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.genome;

import org.biojava.nbio.genome.parsers.gff.*;
import org.biojava.nbio.core.sequence.*;
import org.biojava.nbio.core.sequence.io.FastaReaderHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashMap;

/**
 *
 * @author Scooter Willis 
 */
public class GeneFeatureHelper {

	private static final Logger logger = LoggerFactory.getLogger(GeneFeatureHelper.class);

	static public LinkedHashMap loadFastaAddGeneFeaturesFromUpperCaseExonFastaFile(File fastaSequenceFile, File uppercaseFastaFile, boolean throwExceptionGeneNotFound) throws Exception {
		LinkedHashMap chromosomeSequenceList = new LinkedHashMap();
		LinkedHashMap dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
		for (String accession : dnaSequenceList.keySet()) {
			DNASequence contigSequence = dnaSequenceList.get(accession);
			ChromosomeSequence chromsomeSequence = new ChromosomeSequence(contigSequence.getSequenceAsString());
			chromsomeSequence.setAccession(contigSequence.getAccession());
			chromosomeSequenceList.put(accession, chromsomeSequence);
		}


		LinkedHashMap geneSequenceList = FastaReaderHelper.readFastaDNASequence(uppercaseFastaFile);
		for (DNASequence dnaSequence : geneSequenceList.values()) {
			String geneSequence = dnaSequence.getSequenceAsString();
			String lcGeneSequence = geneSequence.toLowerCase();
			String reverseGeneSequence = dnaSequence.getReverse().getSequenceAsString();
			String lcReverseGeneSequence = reverseGeneSequence.toLowerCase();
			Integer bioStart = null;
			Integer bioEnd = null;
			Strand strand = Strand.POSITIVE;
			boolean geneFound = false;
			String accession = "";
			DNASequence contigDNASequence = null;
			for (String id : dnaSequenceList.keySet()) {
				accession = id;
				contigDNASequence = dnaSequenceList.get(id);
				String contigSequence = contigDNASequence.getSequenceAsString().toLowerCase();
				bioStart = contigSequence.indexOf(lcGeneSequence);
				if (bioStart != -1) {
					bioStart = bioStart + 1;
					bioEnd = bioStart + geneSequence.length() - 1;
					geneFound = true;
					break;
				} else {
					bioStart = contigSequence.indexOf(lcReverseGeneSequence);
					if (bioStart != -1) {
						bioStart = bioStart + 1;
						bioEnd = bioStart - geneSequence.length() - 1;
						strand = Strand.NEGATIVE;
						geneFound = true;
						break;
					}
				}
			}

			if (geneFound) {
				logger.info("Gene {} found at {} {} {} {}",
						dnaSequence.getAccession().toString(), contigDNASequence.getAccession().toString(), bioStart, bioEnd, strand);
				ChromosomeSequence chromosomeSequence = chromosomeSequenceList.get(accession);

				ArrayList exonBoundries = new ArrayList();

				//look for transitions from lowercase to upper case
				for (int i = 0; i < geneSequence.length(); i++) {
					if (i == 0 && Character.isUpperCase(geneSequence.charAt(i))) {
						exonBoundries.add(i);
					} else if (i == geneSequence.length() - 1) {
						exonBoundries.add(i);
					} else if (Character.isUpperCase(geneSequence.charAt(i)) && Character.isLowerCase(geneSequence.charAt(i - 1))) {
						exonBoundries.add(i);
					} else if (Character.isUpperCase(geneSequence.charAt(i)) && Character.isLowerCase(geneSequence.charAt(i + 1))) {
						exonBoundries.add(i);
					}
				}
				if (strand == Strand.NEGATIVE) {
					Collections.reverse(exonBoundries);
				}


				String geneaccession = dnaSequence.getAccession().getID();
				String note = geneaccession;
				String[] values = geneaccession.split(" ");
				geneaccession = values[0];



				GeneSequence geneSeq = chromosomeSequence.addGene(new AccessionID(geneaccession), bioStart, bioEnd, strand);
				geneSeq.addNote(note);
				geneSeq.setSource(uppercaseFastaFile.getName());
				//String transcriptName = geneaccession + "-transcript";
				//TranscriptSequence transcriptSequence = geneSeq.addTranscript(new AccessionID(transcriptName), bioStart, bioEnd);

				int runningFrameLength = 0;
				for (int i = 0; i < exonBoundries.size() - 1; i = i + 2) {
					int cdsBioStart = exonBoundries.get(i) + bioStart;
					int cdsBioEnd = exonBoundries.get(i + 1) + bioStart;
					runningFrameLength = runningFrameLength + Math.abs(cdsBioEnd - cdsBioStart) + 1;
					//String cdsName = transcriptName + "-cds-" + cdsBioStart + "-" + cdsBioEnd;

					//AccessionID cdsAccessionID = new AccessionID(cdsName);
					//ExonSequence exonSequence = geneSeq.addExon(cdsAccessionID, cdsBioStart, cdsBioEnd);
					int remainder = runningFrameLength % 3;
					//int frame = 0;
					if (remainder == 1) {
						//frame = 2; // borrow 2 from next CDS region
					} else if (remainder == 2) {
						//frame = 1;
					}
					//CDSSequence cdsSequence = transcriptSequence.addCDS(cdsAccessionID, cdsBioStart, cdsBioEnd, frame);
				}


			} else {
				if (throwExceptionGeneNotFound) {
					throw new Exception(dnaSequence.getAccession().toString() + " not found");
				}
				logger.info("Gene not found {}", dnaSequence.getAccession().toString());
			}

		}
		return chromosomeSequenceList;
	}

	/**
	 * Output a gff3 feature file that will give the length of each scaffold/chromosome in the fasta file.
	 * Used for gbrowse so it knows length.
	 * @param fastaSequenceFile
	 * @param gffFile
	 * @throws Exception
	 */
	static public void outputFastaSequenceLengthGFF3(File fastaSequenceFile, File gffFile) throws Exception {
		LinkedHashMap dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
		String fileName = fastaSequenceFile.getName();
		FileWriter fw = new FileWriter(gffFile);
		String newLine = System.getProperty("line.separator");
		fw.write("##gff-version 3" + newLine);
		for (DNASequence dnaSequence : dnaSequenceList.values()) {
			String gff3line = dnaSequence.getAccession().getID() + "\t" + fileName + "\t" + "contig" + "\t" + "1" + "\t" + dnaSequence.getBioEnd() + "\t.\t.\t.\tName=" + dnaSequence.getAccession().getID() + newLine;
			fw.write(gff3line);
		}
		fw.close();
	}

	/**
	 * Loads Fasta file and GFF2 feature file generated from the geneid prediction algorithm
	 *
	 * @param fastaSequenceFile
	 * @param gffFile
	 * @return
	 * @throws Exception
	 */
	static public LinkedHashMap loadFastaAddGeneFeaturesFromGeneIDGFF2(File fastaSequenceFile, File gffFile) throws Exception {
		LinkedHashMap dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
		LinkedHashMap chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList);
		FeatureList listGenes = GeneIDGFF2Reader.read(gffFile.getAbsolutePath());
		addGeneIDGFF2GeneFeatures(chromosomeSequenceList, listGenes);
		return chromosomeSequenceList;
	}

	/**
	 * Load GFF2 feature file generated from the geneid prediction algorithm and map features onto the chromosome sequences
	 *
	 * @param chromosomeSequenceList
	 * @param listGenes
	 * @throws Exception
	 */
	static public void addGeneIDGFF2GeneFeatures(LinkedHashMap chromosomeSequenceList, FeatureList listGenes) throws Exception {
		Collection geneIds = listGenes.attributeValues("gene_id");
		for (String geneid : geneIds) {
			FeatureList gene = listGenes.selectByAttribute("gene_id", geneid);
			FeatureI geneFeature = gene.get(0);
			ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname());
			geneid = geneid.replaceAll("_", ".G");
			AccessionID geneAccessionID = new AccessionID(geneid);
			GeneSequence geneSequence = null;
			Collection transcriptids = gene.attributeValues("gene_id");
			for (String transcriptid : transcriptids) {
				// get all the individual features (exons, CDS regions, etc.) of this gene
				FeatureList transcriptFeature = listGenes.selectByAttribute("gene_id", transcriptid);
				transcriptid = transcriptid.replaceAll("_", ".G");




				//      String seqName = feature.seqname();
				//FeatureI startCodon = null;
				//FeatureI stopCodon = null;
				Integer startCodonBegin = null;
				Integer stopCodonEnd = null;
				//String startCodonName = "";
				//String stopCodonName = "";


				// now select only the coding regions of this gene
				FeatureList firstFeatures = transcriptFeature.selectByType("First");
				FeatureList terminalFeatures = transcriptFeature.selectByType("Terminal");
				FeatureList internalFeatures = transcriptFeature.selectByType("Internal");
				FeatureList singleFeatures = transcriptFeature.selectByType("Single");
				FeatureList cdsFeatures = new FeatureList();
				cdsFeatures.add(firstFeatures);
				cdsFeatures.add(terminalFeatures);
				cdsFeatures.add(internalFeatures);
				cdsFeatures.add(singleFeatures);
				// sort them
				cdsFeatures = cdsFeatures.sortByStart();
				Strand strand = Strand.POSITIVE;
				FeatureI feature = cdsFeatures.get(0);
				if (feature.location().isNegative()) {
					strand = Strand.NEGATIVE;
				}
				if (startCodonBegin == null) {
					FeatureI firstFeature = cdsFeatures.get(0);
					if (strand == Strand.NEGATIVE) {
						startCodonBegin = firstFeature.location().bioEnd();
					} else {
						startCodonBegin = firstFeature.location().bioStart();
					}
				}

				if (stopCodonEnd == null) {

					FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
					if (strand == Strand.NEGATIVE) {
						stopCodonEnd = lastFeature.location().bioStart();
					} else {
						stopCodonEnd = lastFeature.location().bioEnd();
					}
				}
				//for gtf ordering can be strand based so first is last and last is first
				if (startCodonBegin > stopCodonEnd) {
					int temp = startCodonBegin;
					startCodonBegin = stopCodonEnd;
					stopCodonEnd = temp;
				}

				AccessionID transcriptAccessionID = new AccessionID(transcriptid);
				if (geneSequence == null) {
					geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand);
					geneSequence.setSource(((Feature) feature).source());
				} else {
					//if multiple transcripts for one gene make sure the gene is defined as the min and max start/end

					if (startCodonBegin < geneSequence.getBioBegin()) {
						geneSequence.setBioBegin(startCodonBegin);
					}
					if (stopCodonEnd > geneSequence.getBioBegin()) {
						geneSequence.setBioEnd(stopCodonEnd);
					}

				}
				TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
				/*
				if (startCodon != null) {
					if (startCodonName == null || startCodonName.length() == 0) {
						startCodonName = transcriptid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd();
					}
					transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd());
				}
				if (stopCodon != null) {
					if (stopCodonName == null || stopCodonName.length() == 0) {
						stopCodonName = transcriptid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd();
					}
					transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd());
				}
				*/

				for (FeatureI cdsFeature : cdsFeatures) {
					Feature cds = (Feature) cdsFeature;
					String cdsName = cds.getAttribute("transcript_name");
					if (cdsName == null || cdsName.length() == 0) {
						cdsName = transcriptid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd();
					}
					AccessionID cdsAccessionID = new AccessionID(cdsName);
					//ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd());
					CDSSequence cdsSequence = transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame());
					cdsSequence.setSequenceScore(cds.score());
				}
			}
		}

	}

	static public LinkedHashMap getChromosomeSequenceFromDNASequence(LinkedHashMap dnaSequenceList) {
		LinkedHashMap chromosomeSequenceList = new LinkedHashMap();
		for (String key : dnaSequenceList.keySet()) {
			DNASequence dnaSequence = dnaSequenceList.get(key);
			ChromosomeSequence chromosomeSequence = new ChromosomeSequence(dnaSequence.getProxySequenceReader()); //we want the underlying sequence but don't need storage
			chromosomeSequence.setAccession(dnaSequence.getAccession());
			chromosomeSequenceList.put(key, chromosomeSequence);
		}
		return chromosomeSequenceList;
	}

	/**
	 * Lots of variations in the ontology or descriptors that can be used in GFF3 which requires writing a custom parser to handle a GFF3 generated or used
	 * by a specific application. Probably could be abstracted out but for now easier to handle with custom code to deal with gff3 elements that are not
	 * included but can be extracted from other data elements.
	 * @param fastaSequenceFile
	 * @param gffFile
	 * @param lazyloadsequences If set to true then the fasta file will be parsed for accession id but sequences will be read from disk when needed to save memory
	 * @return
	 * @throws Exception
	 */
	static public LinkedHashMap loadFastaAddGeneFeaturesFromGmodGFF3(File fastaSequenceFile, File gffFile,boolean lazyloadsequences) throws Exception {
		LinkedHashMap dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile,lazyloadsequences);
		LinkedHashMap chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList);
		FeatureList listGenes = GFF3Reader.read(gffFile.getAbsolutePath());
		addGmodGFF3GeneFeatures(chromosomeSequenceList, listGenes);
		return chromosomeSequenceList;
	}

	/**
	 * Load GFF3 file using mRNA as the gene feature as not all GFF3 files are complete
	 * @param chromosomeSequenceList
	 * @param listGenes
	 * @throws Exception
	 */
	static public void addGmodGFF3GeneFeatures(LinkedHashMap chromosomeSequenceList, FeatureList listGenes) throws Exception {


		// key off mRNA as being a known feature that may or may not have a parent gene


		FeatureList mRNAFeatures = listGenes.selectByType("mRNA");
		LinkedHashMap featureIDHashMap = FeatureHelper.buildFeatureAtrributeIndex("ID", listGenes);
		LinkedHashMap featureParentHashMap = FeatureHelper.buildFeatureAtrributeIndex("Parent", listGenes);

		for (FeatureI f : mRNAFeatures) {
			String geneID;
			String geneNote = null;
			String geneSource = null;
			String sequenceName = null;
			ChromosomeSequence seq = null;
			GeneSequence geneSequence = null;

			Feature mRNAFeature = (Feature) f;
			String mRNAID = mRNAFeature.getAttribute("ID");
			String mRNAsource = mRNAFeature.source();
			String mRNANote = mRNAFeature.getAttribute("Note");
			String mRNAParent = mRNAFeature.getAttribute("Parent");
			if (mRNAParent != null && mRNAParent.length() > 0) {
			   // FeatureList geneFeatureList = listGenes.selectByAttribute("ID", mRNAParent);
				FeatureList geneFeatureList = featureIDHashMap.get(mRNAParent);
				Feature geneFeature = (Feature) geneFeatureList.get(0);
				geneID = geneFeature.getAttribute("ID");
				geneNote = geneFeature.getAttribute("Note");
				geneSource = geneFeature.source();
				sequenceName = geneFeature.seqname();

				//
			} else {
				//deal with cases where no parent gene is given
				geneID = mRNAID;
				geneSource = mRNAsource;
				sequenceName = mRNAFeature.seqname();
			}

			seq = chromosomeSequenceList.get(sequenceName);

			AccessionID geneAccessionID = new AccessionID(geneID);

		  //  FeatureList mRNAChildren = listGenes.selectByAttribute("Parent", mRNAID);
			FeatureList mRNAChildren = featureParentHashMap.get(mRNAID);
			FeatureList cdsFeatures = mRNAChildren.selectByType("CDS");
			FeatureI feature = cdsFeatures.get(0);
			Strand strand = Strand.POSITIVE;

			if (feature.location().isNegative()) {
				strand = Strand.NEGATIVE;
			}
			cdsFeatures = cdsFeatures.sortByStart();







			//String seqName = feature.seqname();
			FeatureI startCodon = null;
			FeatureI stopCodon = null;
			Integer startCodonBegin = null;
			Integer stopCodonEnd = null;
			String startCodonName = "";
			String stopCodonName = "";
			FeatureList startCodonList = mRNAChildren.selectByType("five_prime_UTR");
			if (startCodonList != null && startCodonList.size() > 0) {
				startCodon = startCodonList.get(0);
				if (strand == Strand.NEGATIVE) {
					startCodonBegin = startCodon.location().bioEnd();
				} else {
					startCodonBegin = startCodon.location().bioStart();
				}
				startCodonName = startCodon.getAttribute("ID");
			}

			FeatureList stopCodonList = mRNAChildren.selectByType("three_prime_UTR");

			if (stopCodonList != null && stopCodonList.size() > 0) {
				stopCodon = stopCodonList.get(0);
				if (strand == Strand.NEGATIVE) {
					stopCodonEnd = stopCodon.location().bioStart();
				} else {
					stopCodonEnd = stopCodon.location().bioEnd();
				}
				stopCodonName = stopCodon.getAttribute("ID");

			}




			if (startCodonBegin == null) {
				if (strand == Strand.NEGATIVE) {
					FeatureI firstFeature = cdsFeatures.get(0);

					startCodonBegin = firstFeature.location().bioEnd();
				} else {
					FeatureI firstFeature = cdsFeatures.get(0);

					startCodonBegin = firstFeature.location().bioStart();
				}
			}

			if (stopCodonEnd == null) {
				if (strand == Strand.NEGATIVE) {
					FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
					stopCodonEnd = lastFeature.location().bioStart();
				} else {
					FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
					stopCodonEnd = lastFeature.location().bioEnd();
				}
			}
			//for gtf ordering can be strand based so first is last and last is first
			if (startCodonBegin > stopCodonEnd) {
				int temp = startCodonBegin;
				startCodonBegin = stopCodonEnd;
				stopCodonEnd = temp;
			}



			AccessionID transcriptAccessionID = new AccessionID(mRNAID);
			geneSequence = seq.getGene(geneID);
			if (geneSequence == null) {
				geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand);
				geneSequence.setSource(geneSource);
				if (geneNote != null && geneNote.length() > 0) {
					geneSequence.addNote(geneNote);
				}
			} else {

				if (startCodonBegin < geneSequence.getBioBegin()) {
					geneSequence.setBioBegin(startCodonBegin);
				}
				if (stopCodonEnd > geneSequence.getBioBegin()) {
					geneSequence.setBioEnd(stopCodonEnd);
				}

			}
			TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
			transcriptSequence.setSource(mRNAsource);
			if (mRNANote != null && mRNANote.length() > 0) {
				transcriptSequence.addNote(mRNANote);

			}
			if (startCodon != null) {
				if (startCodonName == null || startCodonName.length() == 0) {
					startCodonName = mRNAID + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd();
				}
				transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd());
			}
			if (stopCodon != null) {
				if (stopCodonName == null || stopCodonName.length() == 0) {
					stopCodonName = mRNAID + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd();
				}
				transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd());
			}

			for (FeatureI cdsFeature : cdsFeatures) {
				Feature cds = (Feature) cdsFeature;
				String cdsNote = cdsFeature.getAttribute("Note");
				String cdsSource = cds.source();
				String cdsName = cds.getAttribute("ID");
				if (cdsName == null || cdsName.length() == 0) {
					cdsName = mRNAID + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd();
				}
				AccessionID cdsAccessionID = new AccessionID(cdsName);
				ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd());
				exonSequence.setSource(cdsSource);
				if (cdsNote != null && cdsNote.length() > 0) {
					exonSequence.addNote(cdsNote);
				}
				transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame());
			}
			geneSequence.addIntronsUsingExons();

		}

	}

	static public LinkedHashMap loadFastaAddGeneFeaturesFromGlimmerGFF3(File fastaSequenceFile, File gffFile) throws Exception {
		LinkedHashMap dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
		LinkedHashMap chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList);
		FeatureList listGenes = GFF3Reader.read(gffFile.getAbsolutePath());
		addGlimmerGFF3GeneFeatures(chromosomeSequenceList, listGenes);
		return chromosomeSequenceList;
	}

	static public void addGlimmerGFF3GeneFeatures(LinkedHashMap chromosomeSequenceList, FeatureList listGenes) throws Exception {
		FeatureList mRNAFeatures = listGenes.selectByType("mRNA");
		for (FeatureI f : mRNAFeatures) {
			Feature mRNAFeature = (Feature) f;
			String geneid = mRNAFeature.getAttribute("ID");
			String source = mRNAFeature.source();

			FeatureList gene = listGenes.selectByAttribute("Parent", geneid);
			FeatureI geneFeature = gene.get(0);
			ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname());
			AccessionID geneAccessionID = new AccessionID(geneid);
			GeneSequence geneSequence = null;

			FeatureList cdsFeatures = gene.selectByType("CDS");
			FeatureI feature = cdsFeatures.get(0);
			Strand strand = Strand.POSITIVE;

			if (feature.location().isNegative()) {
				strand = Strand.NEGATIVE;
			}
			cdsFeatures = cdsFeatures.sortByStart();







			//String seqName = feature.seqname();
			FeatureI startCodon = null;
			FeatureI stopCodon = null;
			Integer startCodonBegin = null;
			Integer stopCodonEnd = null;
			String startCodonName = "";
			String stopCodonName = "";
			FeatureList startCodonList = gene.selectByAttribute("Note", "initial-exon");
			if (startCodonList != null && startCodonList.size() > 0) {
				startCodon = startCodonList.get(0);
				if (strand == Strand.NEGATIVE) {
					startCodonBegin = startCodon.location().bioEnd();
				} else {
					startCodonBegin = startCodon.location().bioStart();
				}
				startCodonName = startCodon.getAttribute("ID");
			}

			FeatureList stopCodonList = gene.selectByAttribute("Note", "final-exon");

			if (stopCodonList != null && stopCodonList.size() > 0) {
				stopCodon = stopCodonList.get(0);
				if (strand == Strand.NEGATIVE) {
					stopCodonEnd = stopCodon.location().bioStart();
				} else {
					stopCodonEnd = stopCodon.location().bioEnd();
				}
				stopCodonName = stopCodon.getAttribute("ID");

			}




			if (startCodonBegin == null) {
				if (strand == Strand.NEGATIVE) {
					FeatureI firstFeature = cdsFeatures.get(0);

					startCodonBegin = firstFeature.location().bioEnd();
				} else {
					FeatureI firstFeature = cdsFeatures.get(0);

					startCodonBegin = firstFeature.location().bioStart();
				}
			}

			if (stopCodonEnd == null) {
				if (strand == Strand.NEGATIVE) {
					FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
					stopCodonEnd = lastFeature.location().bioStart();
				} else {
					FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
					stopCodonEnd = lastFeature.location().bioEnd();
				}
			}
			//for gtf ordering can be strand based so first is last and last is first
			if (startCodonBegin > stopCodonEnd) {
				int temp = startCodonBegin;
				startCodonBegin = stopCodonEnd;
				stopCodonEnd = temp;
			}



			AccessionID transcriptAccessionID = new AccessionID(geneid);
			if (geneSequence == null) {
				geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand);
				geneSequence.setSource(source);
			}
			/*
			else {

				if (startCodonBegin < geneSequence.getBioBegin()) {
					geneSequence.setBioBegin(startCodonBegin);
				}
				if (stopCodonEnd > geneSequence.getBioBegin()) {
					geneSequence.setBioEnd(stopCodonEnd);
				}
			}
			*/
			TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
			if (startCodon != null) {
				if (startCodonName == null || startCodonName.length() == 0) {
					startCodonName = geneid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd();
				}
				transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd());
			}
			if (stopCodon != null) {
				if (stopCodonName == null || stopCodonName.length() == 0) {
					stopCodonName = geneid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd();
				}
				transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd());
			}

			for (FeatureI cdsFeature : cdsFeatures) {
				Feature cds = (Feature) cdsFeature;
				String cdsName = cds.getAttribute("ID");
				if (cdsName == null || cdsName.length() == 0) {
					cdsName = geneid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd();
				}
				AccessionID cdsAccessionID = new AccessionID(cdsName);
				//ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd());
				transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame());
			}

		}

	}

	static public LinkedHashMap loadFastaAddGeneFeaturesFromGeneMarkGTF(File fastaSequenceFile, File gffFile) throws Exception {
		LinkedHashMap dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
		LinkedHashMap chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList);
		FeatureList listGenes = GeneMarkGTFReader.read(gffFile.getAbsolutePath());
		addGeneMarkGTFGeneFeatures(chromosomeSequenceList, listGenes);
		return chromosomeSequenceList;
	}

	static public void addGeneMarkGTFGeneFeatures(LinkedHashMap chromosomeSequenceList, FeatureList listGenes) throws Exception {
		Collection geneIds = listGenes.attributeValues("gene_id");
		for (String geneid : geneIds) {
			//       if(geneid.equals("45_g")){
			//           int dummy =1;
			//       }
			FeatureList gene = listGenes.selectByAttribute("gene_id", geneid);
			FeatureI geneFeature = gene.get(0);
			ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname());
			AccessionID geneAccessionID = new AccessionID(geneid);
			GeneSequence geneSequence = null;
			Collection transcriptids = gene.attributeValues("transcript_id");
			for (String transcriptid : transcriptids) {
				// get all the individual features (exons, CDS regions, etc.) of this gene


				FeatureList transcriptFeature = listGenes.selectByAttribute("transcript_id", transcriptid);
				// now select only the coding regions of this gene
				FeatureList cdsFeatures = transcriptFeature.selectByType("CDS");
				// sort them
				cdsFeatures = cdsFeatures.sortByStart();

				FeatureI feature = cdsFeatures.get(0);
				Strand strand = Strand.POSITIVE;

				if (feature.location().isNegative()) {
					strand = Strand.NEGATIVE;
				}

				//String seqName = feature.seqname();
				FeatureI startCodon = null;
				FeatureI stopCodon = null;
				Integer startCodonBegin = null;
				Integer stopCodonEnd = null;
				String startCodonName = "";
				String stopCodonName = "";
				FeatureList startCodonList = transcriptFeature.selectByType("start_codon");
				if (startCodonList != null && startCodonList.size() > 0) {
					startCodon = startCodonList.get(0);
					if (strand == Strand.POSITIVE) {
						startCodonBegin = startCodon.location().bioStart();
					} else {
						startCodonBegin = startCodon.location().bioEnd();
					}
					startCodonName = startCodon.getAttribute("transcript_name");
				}

				FeatureList stopCodonList = transcriptFeature.selectByType("stop_codon");

				if (stopCodonList != null && stopCodonList.size() > 0) {
					stopCodon = stopCodonList.get(0);
					if (strand == Strand.POSITIVE) {
						stopCodonEnd = stopCodon.location().bioEnd();
					} else {
						stopCodonEnd = stopCodon.location().bioStart();
					}

					stopCodonName = stopCodon.getAttribute("transcript_name");

				}




				if (startCodonBegin == null) {
					if (strand == Strand.NEGATIVE) {
						FeatureI firstFeature = cdsFeatures.get(0);

						startCodonBegin = firstFeature.location().bioEnd();
					} else {
						FeatureI firstFeature = cdsFeatures.get(0);

						startCodonBegin = firstFeature.location().bioStart();
					}
				}

				if (stopCodonEnd == null) {
					if (strand == Strand.NEGATIVE) {
						FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
						stopCodonEnd = lastFeature.location().bioStart();
					} else {
						FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
						stopCodonEnd = lastFeature.location().bioEnd();
					}
				}
				//for gtf ordering can be strand based so first is last and last is first
				if (startCodonBegin > stopCodonEnd) {
					int temp = startCodonBegin;
					startCodonBegin = stopCodonEnd;
					stopCodonEnd = temp;
				}

				AccessionID transcriptAccessionID = new AccessionID(transcriptid);
				if (geneSequence == null) {
					geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand);
					geneSequence.setSource(((Feature) feature).source());
				} else {
					//if multiple transcripts for one gene make sure the gene is defined as the min and max start/end

					if (startCodonBegin < geneSequence.getBioBegin()) {
						geneSequence.setBioBegin(startCodonBegin);
					}
					if (stopCodonEnd > geneSequence.getBioBegin()) {
						geneSequence.setBioEnd(stopCodonEnd);
					}

				}
				TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
				if (startCodon != null) {
					if (startCodonName == null || startCodonName.length() == 0) {
						startCodonName = transcriptid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd();
					}
					transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd());
				}
				if (stopCodon != null) {
					if (stopCodonName == null || stopCodonName.length() == 0) {
						stopCodonName = transcriptid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd();
					}
					transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd());
				}

				for (FeatureI cdsFeature : cdsFeatures) {
					Feature cds = (Feature) cdsFeature;
					// for genemark it appears frame of 2 =1 and frame of 1 = 2
					// doesn't matter when you string cds regions together as one block
					// but does make a difference when you try to make a protein sequence for each CDS region where
					// you give up or borrow based on the frame value
					// compared with gff like files and docs for geneid and glimmer where geneid and glimmer both do it the same
					// way that appears to match the gff3 docs.
					int frame = cds.frame();
					if (frame == 1) {
						frame = 2;
					} else if (frame == 2) {
						frame = 1;
					} else {
						frame = 0;
					}
					String cdsName = cds.getAttribute("transcript_name");
					if (cdsName == null || cdsName.length() == 0) {
						cdsName = transcriptid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd();
					}
					AccessionID cdsAccessionID = new AccessionID(cdsName);
					//ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd());
					transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), frame);
				}
			}
		}

	}

	static public LinkedHashMap getProteinSequences(Collection chromosomeSequences) throws Exception {
		LinkedHashMap proteinSequenceHashMap = new LinkedHashMap();
		for (ChromosomeSequence dnaSequence : chromosomeSequences) {
			for (GeneSequence geneSequence : dnaSequence.getGeneSequences().values()) {
				for (TranscriptSequence transcriptSequence : geneSequence.getTranscripts().values()) {
					//TODO remove?
//                    DNASequence dnaCodingSequence = transcriptSequence.getDNACodingSequence();
//                    logger.info("CDS={}", dnaCodingSequence.getSequenceAsString());

					try {
						ProteinSequence proteinSequence = transcriptSequence.getProteinSequence();

//                        logger.info("{} {}", proteinSequence.getAccession().getID(), proteinSequence);
						if (proteinSequenceHashMap.containsKey(proteinSequence.getAccession().getID())) {
							throw new Exception("Duplicate protein sequence id=" + proteinSequence.getAccession().getID() + " found at Gene id=" + geneSequence.getAccession().getID());
						} else {
							proteinSequenceHashMap.put(proteinSequence.getAccession().getID(), proteinSequence);
						}
					} catch (Exception e) {
						logger.error("Exception: ", e);
					}

				}

			}
		}
		return proteinSequenceHashMap;
	}

	static public LinkedHashMap getGeneSequences(Collection chromosomeSequences) throws Exception {
		LinkedHashMap geneSequenceHashMap = new LinkedHashMap();
		for (ChromosomeSequence chromosomeSequence : chromosomeSequences) {
			for (GeneSequence geneSequence : chromosomeSequence.getGeneSequences().values()) {
				geneSequenceHashMap.put(geneSequence.getAccession().getID(), geneSequence);
			}
		}

		return geneSequenceHashMap;
	}

	public static void main(String args[]) throws Exception {
/*        if (false) {
			LinkedHashMap chromosomeSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGeneMarkGTF(new File("Scaffolds.fna"), new File("genemark_hmm.gtf"));
			LinkedHashMap proteinSequenceList = GeneFeatureHelper.getProteinSequences(chromosomeSequenceList.values());
		}

		if (false) {
			LinkedHashMap chromosomeSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGlimmerGFF3(new File("Scaffolds.fna"), new File("glimmerhmm.gff"));
			LinkedHashMap proteinSequenceList = GeneFeatureHelper.getProteinSequences(chromosomeSequenceList.values());
			//  for (ProteinSequence proteinSequence : proteinSequenceList.values()) {
			//      logger.info(proteinSequence.getAccession().getID() + " " + proteinSequence);
			//  }
			FastaWriterHelper.writeProteinSequence(new File("predicted_glimmer.faa"), proteinSequenceList.values());

		}
		if (false) {
			GeneFeatureHelper.outputFastaSequenceLengthGFF3(new File("Scaffolds.fna"), new File("scaffolds.gff3"));
		}

 */

		try {

			if (true) {
				//File fastaSequenceFile = new File("Scaffolds.fna");
				//File gff3File = new File("geneid-v6.gff3");
				//LinkedHashMap chromosomeSequenceHashMap = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGmodGFF3(fastaSequenceFile, gff3File,true);
			}

		} catch (Exception e) {
			logger.error("Exception: ", e);
		}

	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy