All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.snpEffect.factory.SnpEffPredictorFactory Maven / Gradle / Ivy

The newest version!
package org.snpeff.snpEffect.factory;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.snpeff.fileIterator.FastaFileIterator;
import org.snpeff.interval.Cds;
import org.snpeff.interval.Chromosome;
import org.snpeff.interval.CircularCorrection;
import org.snpeff.interval.Exon;
import org.snpeff.interval.FrameType;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Genome;
import org.snpeff.interval.GffType;
import org.snpeff.interval.IntervalComparatorByEnd;
import org.snpeff.interval.IntervalComparatorByStart;
import org.snpeff.interval.Marker;
import org.snpeff.interval.Transcript;
import org.snpeff.snpEffect.Config;
import org.snpeff.snpEffect.SnpEffectPredictor;
import org.snpeff.util.Gpr;
import org.snpeff.util.GprSeq;

/**
 * This class creates a SnpEffectPredictor from a file (or a set of files) and a configuration
 *
 * @author pcingola
 */
public abstract class SnpEffPredictorFactory {

	// Show a mark every
	public static final int MARK = 100;
	public static int MIN_TOTAL_FRAME_COUNT = 10;

	// Debug mode?
	boolean circularCorrectLargeGap = false;
	boolean createRandSequences = false; // If sequences are not read frmo a file, create random sequences
	boolean debug = false;
	boolean frameCorrection;
	boolean readSequences = true; // Do not read sequences from GFF file (this is only used for debugging)
	boolean storeSequences = false; // Store full gene sequences (in separate 'sequence.chr*.bin' files)
	boolean verbose = false;
	int lineNum;
	int inOffset; // This amount is subtracted to all position coordinates
	int totalSeqsAdded = 0, totalSeqsIgnored = 0; // Number of sequences added and ignored
	String fileName;
	String fastaFile; // Only used for debugging or testing
	String line;
	Config config;
	Genome genome;
	SnpEffectPredictor snpEffectPredictor;
	FrameType frameType;
	Set chromoNamesReference; // Chromosome names used in reference sequence file (e.g. FASTA)
	Map exonsByChromo;
	Map markersById;
	Map genesById;
	Map transcriptsById;
	Random random = new Random(20140410); // Note: we want consistent results in our test cases, so we always initialize the random generator in the same way

	public SnpEffPredictorFactory(Config config, int inOffset) {
		this.config = config;
		this.inOffset = inOffset;

		genome = config.getGenome();
		snpEffectPredictor = new SnpEffectPredictor(config.getGenome());
		exonsByChromo = new HashMap<>();
		markersById = new HashMap<>();
		genesById = new HashMap<>();
		transcriptsById = new HashMap<>();
		chromoNamesReference = new HashSet<>();

		frameCorrection = false;
		frameType = FrameType.UNKNOWN;
	}

	protected void add(Cds cds) {
		Transcript tr = (Transcript) cds.getParent();
		tr.add(cds);
		addMarker(cds, false);
	}

	protected void add(Chromosome chromo) {
		genome.add(chromo);
	}

	/**
	 * Add an exon
	 * 
	 * @param exon
	 * @return exon added.
	 * Note: If the exon exists with the same ID, return old exon.
	 *       If exon exists with same ID and same coordiates, add a new exon with different ID.
	 */
	protected Exon add(Exon exon) {
		Transcript tr = (Transcript) exon.getParent();

		// Make sure the same exon was not added before
		Exon oldex = tr.get(exon.getId());
		if (oldex != null) {
			if (oldex.includes(exon)) return oldex; // Redundant, just ignore it.

			// Create a new exon with same info and different 'id'
			exon = new Exon(tr, exon.getStart(), exon.getEnd(), exon.isStrandMinus(), exon.getId() + "_" + tr.subIntervals().size(), exon.getRank());
		}

		// Add exon
		tr.add(exon);
		addMarker(exon, false);
		return exon;
	}

	/**
	 * Add a Gene
	 */
	protected void add(Gene gene) {
		if (debug) Gpr.debug("\tAdding gene\tID: '" + gene.getId() + "'\tname: '" + gene.getGeneName() + "'\t" + gene.toStr());
		snpEffectPredictor.add(gene);

		if (genesById.containsKey(gene.getId())) throw new RuntimeException("Gene  '" + gene.getId() + "' already exists");
		genesById.put(gene.getId(), gene);
	}

	/**
	 * Add a generic Marker
	 */
	protected void add(Marker marker) {
		if (debug) Gpr.debug("\tAdding " + marker.getClass().getSimpleName() + ":\tID: '" + marker.getId() + "'\t" + marker.toStr());
		addMarker(marker, false);
	}

	/**
	 * Add a transcript
	 */
	protected void add(Transcript tr) {
		Gene gene = (Gene) tr.getParent();
		if (debug) Gpr.debug("\tAdding transcript :\tID: '" + tr.getId() + "' to gene '" + gene.getId() + "'\t" + tr.toStr());
		gene.add(tr);

		if (transcriptsById.containsKey(tr.getId())) throw new RuntimeException("Transcript  '" + tr.getId() + "' already exists");
		transcriptsById.put(tr.getId(), tr);
	}

	/**
	 * Add a marker to the collection
	 */
	protected void addMarker(Marker marker, boolean unique) {
		String key = marker.getId();
		if (unique && markersById.containsKey(key)) throw new RuntimeException("Marker '" + key + "' already exists");
		markersById.put(key, marker);
	}

	/**
	 * Add genomic reference sequences
	 */
	protected void addSequences(String chr, String chrSeq) {
		// Update chromosome length
		int chrLen = chrSeq.length();
		Chromosome chromo = getOrCreateChromosome(chr);
		chromo.setLength(chrLen);
		chromo.detectCircular();

		// Add sequences for each gene
		int seqsAdded = 0, seqsIgnored = 0;

		if (storeSequences) {
			if (verbose) System.out.print("\t\tAdding genomic sequences to genes: ");
			int count = genome.getGenomicSequences().addGeneSequences(chr, chrSeq);
			if (verbose) System.out.println("\tDone (" + count + " sequences added).");
		}

		if (verbose) System.out.print("\t\tAdding genomic sequences to exons: ");

		// Find and add sequences for all exons in this chromosome
		for (Gene gene : genome.getGenes()) {
			// Different chromosome? Skip
			if (!gene.getChromosomeName().equalsIgnoreCase(chr)) continue;

			for (Transcript tr : gene) {
				// Circular chromosomes coordinates are corrected in this step
				CircularCorrection cc = new CircularCorrection(tr, chrLen);
				cc.setDebug(debug);
				cc.setCorrectLargeGap(circularCorrectLargeGap);
				cc.correct();

				for (Exon exon : tr) {
					int ssStart = exon.getStart();
					int ssEnd = exon.getEnd() + 1; // String.substring does not include the last character in the interval (so we have to add 1)

					String seq = null;
					if ((ssStart >= 0) && (ssEnd <= chrLen)) {
						// Regular coordinates
						try {
							seq = chrSeq.substring(ssStart, ssEnd);
						} catch (Throwable t) {
							t.printStackTrace();
							throw new RuntimeException("Error trying to add sequence to exon:\n\tChromosome sequence length: " + chrSeq.length() + "\n\tExon: " + exon);
						}
					} else {
						// Sanity check
						if (!chromo.isCircular()) throw new RuntimeException("Coordinated out of bounds on a non-circular chromosome. This should never happen!Error trying to add sequence to exon:\n\tExon: " + exon);

						if ((ssStart < 0) && (ssEnd > 0)) {
							// Negative start coordinates? This is probably a circular genome
							// Convert to 2 intervals:
							//     i) Interval before zero: This gets mapped to the end of the chromosome
							//     ii) Interval after zero: This are "normal" coordinates
							// Then we concatenate both sequences
							ssStart += chrLen;
							seq = chrSeq.substring(ssStart, chrLen) + chrSeq.substring(0, ssEnd);
						} else if ((ssStart < 0) && (ssEnd < 0)) {
							// Negative start coordinates? This is probably a circular genome
							// Convert to 2 intervals:
							//     i) Interval before zero: This gets mapped to the end of the chromosome
							//     ii) Interval after zero: This are "normal" coordinates
							// Then we concatenate both sequences
							ssStart += chrLen;
							ssEnd += chrLen;
							seq = chrSeq.substring(ssStart, ssEnd);
						}
					}

					// Set sequence
					if (seq != null) {
						// Sanity check
						if (seq.length() != exon.size()) warning("Exon sequence length does not match exon.size()\n" + exon);

						// Reverse strand? => reverse complement of the sequence
						if (exon.isStrandMinus()) seq = GprSeq.reverseWc(seq);
						seq = seq.toUpperCase();
						exon.setSequence(seq);
						seqsAdded++;
					}
				}
			}
		}

		if (verbose) System.out.println("\tDone (" + seqsAdded + " sequences added, " + seqsIgnored + " ignored).");
		totalSeqsAdded += seqsAdded;
		totalSeqsIgnored += seqsIgnored;
	}

	/**
	 * Adjust chromosome length using gene information
	 * This is used when the sequence is not available (which makes sense on test-cases and debugging only)
	 */
	protected void adjustChromosomes() {
		if (verbose) System.out.print("\n\tAdjusting chromosomes lengths: ");

		// Chromosome length should be longer than any gene's end coordinate
		HashMap lenByChr = new HashMap<>();
		for (Gene gene : config.getGenome().getGenes()) {
			String chrName = gene.getChromosomeName();
			Integer len = lenByChr.get(chrName);
			int max = Math.max(gene.getEnd(), (len != null ? len : 0));
			lenByChr.put(chrName, max);
		}

		// Set length
		int adjusted = 0;
		for (String chrName : lenByChr.keySet()) {
			Chromosome chr = config.getGenome().getChromosome(chrName);
			int newEnd = lenByChr.get(chrName);
			if (chr.getEnd() < newEnd) {
				if (chr.size() <= 1) { // If start = end = 0, then size() is 1
					chr.setEnd(lenByChr.get(chrName));
					mark(adjusted++);
				} else if (verbose) System.out.println("\t\tChromosome '" + chr.getId() + "' has length of " + chr.size() + ", but genes end at " + lenByChr.get(chrName) + ". Assuming circular genome, not adjusting");
			}
		}
	}

	/**
	 * Adjust genes: recalculate start, end, strand, etc.
	 */
	void adjustGenes() {
		int i = 1;
		if (verbose) System.out.print("\n\tAdjusting genes: ");
		for (Gene gene : genome.getGenes())
			if (gene.adjust()) mark(i++);

	}

	/**
	 * Adjust transcripts: recalculate start, end, strand, etc.
	 */
	protected void adjustTranscripts() {
		int i = 1;
		if (verbose) System.out.print("\n\tAdjusting transcripts: ");
		for (Gene gene : genome.getGenes())
			for (Transcript tr : gene)
				if (tr.adjust()) mark(i++);

	}

	/**
	 * Perform some actions before reading sequences
	 */
	protected void beforeExonSequences() {
		// Sometimes we have to guess exon info from CDS info (not the best
		// case scenario, but there are a lot of crappy genome annotations
		// around)
		exonsFromCds();

		// Some annotation formats split exons in two parts (e.g. stop-codon
		// not part of exon in GTF).
		deleteRedundant();

		// Some annotations introduce zero size introns
		collapseZeroLenIntrons();
	}

	/**
	 * Only coding transcripts have CDS: Make sure that transcripts having CDS are protein coding
	 *
	 * It might not be always "precise" though:
	 *
	 * 		$ grep CDS genes.gtf | cut -f 2 | ~/snpEff/scripts/uniqCount.pl
	 * 		113	IG_C_gene
	 * 		64	IG_D_gene
	 * 		24	IG_J_gene
	 * 		366	IG_V_gene
	 * 		21	TR_C_gene
	 * 		3	TR_D_gene
	 * 		82	TR_J_gene
	 * 		296	TR_V_gene
	 * 		461	non_stop_decay
	 * 		63322	nonsense_mediated_decay
	 * 		905	polymorphic_pseudogene
	 * 		34	processed_transcript
	 * 		1340112	protein_coding
	 */
	protected void codingFromCds() {
		int i = 0;
		if (verbose) System.out.print("\n\tMarking as 'coding' from CDS information: ");
		for (Gene gene : genome.getGenes())
			for (Transcript tr : gene) {
				if (tr.getCds() != null && !tr.getCds().isEmpty()) {
					// If transcript doesn't have protein coding flag set (and doesn't have biotype information), use CDS as a proxy for 'protein coding'
					if (!tr.isProteinCoding() && (tr.getBioType() == null)) {
						tr.setProteinCoding(true);
						i++;
						if (debug) System.err.println("\t\tMarking as protein coding transcript " + tr.getId());
					}
				}
			}
		if (verbose) System.out.print("\n\tDone: " + i + " transcripts marked");
	}

	//	/**
	//	 * Get (or create) a chromosome and set it's length
	//	 */
	//	void chromoLen(String chromoName, int len) {
	//		Chromosome chromo = getOrCreateChromosome(chromoName);
	//		chromo.setLength(len);
	//	}

	/**
	 * Collapse exons having zero size introns between them
	 */
	protected void collapseZeroLenIntrons() {
		if (verbose) System.out.print("\n\tCollapsing zero length introns (if needed): ");

		int count = 0;
		for (Gene gene : genome.getGenes())
			for (Transcript tr : gene)
				if (tr.collapseZeroGap()) mark(count++);

		if (verbose) System.out.println("\n\t\tTotal collapsed transcripts: " + count);
	}

	/**
	 * Count number of exons by chromosome
	 */
	@SuppressWarnings("unused")
	void countExonsByChromo() {
		exonsByChromo = new HashMap<>();

		for (Gene gint : genome.getGenes()) {
			Chromosome chromo = gint.getChromosome();
			for (Transcript tint : gint) {
				for (Exon eint : tint) {
					// Get current count
					String chromoName = chromo.getId();
					Integer count = exonsByChromo.get(chromoName);

					// Increment
					if (count == null) count = 1;
					else count++;

					// Store
					exonsByChromo.put(chromoName, count);
				}
			}
		}
	}

	public abstract SnpEffectPredictor create();

	/**
	 * Create random sequences for exons
	 *
	 * Note: This is only used for test cases!
	 */
	protected void createRandSequences() {
		// Find all exons and add a 'random' sequence to each of them
		for (Gene g : genome.getGenes())
			for (Transcript tr : g)
				for (Exon ex : tr) {
					String sequence = GprSeq.randSequence(random, ex.size());
					ex.setSequence(sequence);
				}
	}

	/**
	 * Consolidate transcripts:
	 * If two exons are one right next to the other, join them
	 * E.g. exon1:1234-2345, exon2:2346-2400 => exon:1234-2400
	 * This happens mostly in GTF files, where the stop-codon is specified separated from the exon info.
	 */
	protected void deleteRedundant() {
		if (verbose) System.out.print("\n\tDeleting redundant exons (if needed): ");
		int count = 0;
		for (Gene gene : genome.getGenes())
			for (Transcript tr : gene)
				if (tr.deleteRedundant()) mark(count++);

		if (verbose) System.out.println("\n\t\tTotal transcripts with deleted exons: " + count);
	}

	/**
	 * Error: Throw a runtime exception (show some details)
	 */
	void error(String msg) {
		throw new RuntimeException("FATAL ERROR: " + msg + ". File '" + fileName + "' line " + lineNum + "\n\t'" + line + "'\n");
	}

	/**
	 * Error: Throw a runtime exception (show some details)
	 */
	void error(String msg, Throwable t) {
		throw new RuntimeException("FATAL ERROR: " + msg + ". File '" + fileName + "' line " + lineNum + "\n\t'" + line + "'\n", t);
	}

	/**
	 * Create exons from CDS info
	 */
	protected void exonsFromCds() {
		if (verbose) System.out.print("\n\tCreate exons from CDS (if needed): ");

		int count = 0;
		for (Gene gene : genome.getGenes()) {
			for (Transcript tr : gene) {
				// CDS length
				int lenCds = 0;
				for (Cds cds : tr.getCds())
					lenCds += cds.size();

				// Exon length
				int lenExons = 0;
				for (Exon ex : tr)
					lenExons += ex.size();

				// Cds length larger than exons? => something is missing
				if (lenCds > lenExons) {
					exonsFromCds(tr);
					count++;
				}
			}
		}
		if (verbose) System.out.println("\n\tExons created for " + count + " transcripts.");
	}

	/**
	 * Create exons from CDS info
	 * WARNING: We might end up with redundant exons if some exons existed before this process
	 *
	 * @param tr : Transcript with CDS info, but no exons
	 */
	protected void exonsFromCds(Transcript tr) {
		List cdss = tr.getCds();

		// First: Check and adjust strand info
		boolean trStrandMinus = tr.isStrandMinus();
		int cdsStrandSum = 0;
		for (Cds cds : cdss)
			cdsStrandSum += cds.isStrandMinus() ? -1 : 1;
		boolean cdsStrandMinus = cdsStrandSum < 0;
		if (cdsStrandMinus != trStrandMinus) {
			if (verbose) System.out.print(cdsStrandMinus ? '-' : '+');
			tr.setStrandMinus(cdsStrandMinus);
		}

		// Sort CDS by strand
		if (tr.isStrandPlus()) Collections.sort(cdss, new IntervalComparatorByStart()); // Sort by start position
		else Collections.sort(cdss, new IntervalComparatorByEnd(true)); // Sort by end position (reversed)

		// Add cds as exons
		// WARNING: We might end up with redundant exons if some exons existed before this process
		int rank = 1;
		for (Cds cds : cdss) {
			// Create exon and add it to transcript
			String id = GffType.EXON + "_" + cds.getChromosomeName() + "_" + cds.getStart() + "_" + cds.getEnd();
			if (tr.get(id) == null) { // Don't add an exon twice
				Exon exon = new Exon(tr, cds.getStart(), cds.getEnd(), trStrandMinus, id, rank);
				tr.add(exon);
			}

			rank++;
			if (verbose) System.out.print('.');
		}
	}

	protected Gene findGene(String id) {
		Gene gene = genesById.get(id);
		if (gene != null) return gene;
		return genesById.get(GffType.GENE + "_" + id); // Alternative gene ID
	}

	protected Gene findGene(String geneId, String id) {
		Gene gene = findGene(geneId);
		if (gene != null) return gene;
		return genesById.get(GffType.GENE + "_" + id); // Alternative gene ID
	}

	protected Marker findMarker(String id) {
		return markersById.get(id);
	}

	protected Transcript findTranscript(String id) {
		Transcript tr = transcriptsById.get(id);
		if (tr != null) return tr;
		return transcriptsById.get(GffType.TRANSCRIPT + "_" + id); // Alternative transcript ID
	}

	protected Transcript findTranscript(String trId, String id) {
		Transcript tr = findTranscript(trId);
		if (tr != null) return tr;
		return transcriptsById.get(GffType.TRANSCRIPT + "_" + id);
	}

	/**
	 * Finish up procedure to ensure consistency
	 */
	void finishUp() {
		// Adjust
		adjustTranscripts(); // Adjust transcripts: recalculate start, end, strand, etc.
		adjustGenes(); // Adjust genes: recalculate start, end, strand, etc.
		adjustChromosomes(); // Adjust chromosome sizes

		// Adjust exons: Most file formats don't have exon rank information.
		rankExons();

		// If some UTRs are missing: calculate UTR information from CDS whenever possible
		if (verbose) System.out.print("\n\tCreate UTRs from CDS (if needed): ");
		utrFromCds();

		// Correct according to frame information
		if (frameCorrection) frameCorrection();

		// Remove empty chromosomes
		removeEmptyChromos();

		// Mark as coding if there is a CDS
		codingFromCds();

		// Check that exons have sequences
		if (readSequences) { // Note: In some test cases we ignore sequences
			boolean error = !config.getGenome().isMostExonsHaveSequence();
			if (error) error("Most Exons do not have sequences!\n" + showChromoNamesDifferences() + "\n\n");
		}

		// Done
		if (verbose) System.out.println("");
	}

	/**
	 * Correct exon's coordinates, according to frame information
	 */
	void frameCorrection() {
		if (verbose) System.out.print("\n\tCorrecting exons based on frame information.\n\t");

		//---
		// Sanity check. Are all frames zero?
		//---
		int countByFrame[] = new int[3];
		for (Gene gene : genome.getGenes())
			for (Transcript tr : gene) {
				for (Exon ex : tr) {
					int frame = ex.getFrame();
					if (frame >= 0 && frame <= 2) countByFrame[frame]++; // Any value other than {0, 1, 2} is ignored (-1 means missing, other values are invalid)
				}

				for (Cds cds : tr.getCds()) {
					int frame = cds.getFrame();
					if (frame >= 0 && frame <= 2) countByFrame[frame]++; // Any value other than {0, 1, 2} is ignored (-1 means missing, other values are invalid)
				}
			}

		int countByFrameTotal = countByFrame[0] + countByFrame[1] + countByFrame[2];
		int countByFrameNonZero = countByFrame[1] + countByFrame[2];
		if ((countByFrameTotal >= MIN_TOTAL_FRAME_COUNT) && (countByFrameNonZero <= 0)) System.err.println("WARNING: All frames are zero! This seems rather odd, please check that 'frame' information in your 'genes' file is accurate.");

		//---
		// Perform exon frame adjustment
		//---
		int i = 1;
		for (Gene gene : genome.getGenes())
			for (Transcript tr : gene) {
				boolean corrected = tr.frameCorrection();

				if (corrected) {
					if (debug) System.err.println("\tTranscript " + tr.getId() + " corrected using frame (exons: " + tr.numChilds() + ").");
					else if (verbose) Gpr.showMark(i++, 1);

				}
			}

		if (verbose) System.out.print("");

	}

	/**
	 * Get a chromosome. If it doesn't exist, create it
	 */
	protected Chromosome getOrCreateChromosome(String chromoName) {
		Chromosome chromo = genome.getChromosome(chromoName);

		// Not found? => Create a new one
		if (chromo == null) {
			chromo = new Chromosome(genome, 0, 0, chromoName);
			genome.add(chromo);
		}

		return chromo;
	}

	public Map getProteinByTrId() {
		return null;
	}

	/**
	 * Does this chromosome have any exons?
	 */
	boolean hasExons(String chromoName) {
		Integer count = exonsByChromo.get(chromoName);
		return (count != null) && (count > 0);
	}

	/**
	 * Show a mark onthe screen (to show progress)
	 */
	void mark(int count) {
		if (verbose) Gpr.showMark(count, MARK, "\t\t");
	}

	/**
	 * Parse a string as a 'position'.
	 * Note: It subtracts 'inOffset' so that all coordinates are zero-based
	 */
	protected int parsePosition(String posStr) {
		return Gpr.parseIntSafe(posStr) - inOffset;
	}

	/**
	 * Rank exons
	 */
	void rankExons() {
		int i = 1;
		if (verbose) System.out.print("\n\tRanking exons: ");
		for (Gene gene : genome.getGenes())
			for (Transcript tr : gene)
				if (tr.rankExons()) mark(i++);
	}

	/**
	 * Read exon sequences from a FASTA file
	 */
	protected void readExonSequences() {
		List files = config.getFileListGenomeFasta();

		// Force a specific file?
		if (fastaFile != null) {
			files.clear();
			files.add(fastaFile);
		}

		// Try all files in the list until one is available
		for (String file : files) {

			if (Gpr.canRead(file)) {
				if (verbose) System.out.println("\tReading FASTA file: '" + file + "'");

				// Read fasta sequence
				FastaFileIterator ffi = new FastaFileIterator(file);
				for (String seq : ffi) {
					String chromo = ffi.getName();
					chromoNamesReference.add(chromo);
					if (verbose) System.out.println("\t\tReading sequence '" + chromo + "', length: " + seq.length());
					addSequences(chromo, seq); // Add all sequences
				}
				return;
			} else if (verbose) System.out.println("\tFASTA file: '" + file + "' not found.");
		}

		throw new RuntimeException("Cannot find reference sequence.");
	}

	/**
	 * Remove empty chromosomes
	 */
	void removeEmptyChromos() {
		if (verbose) System.out.println("\n\tRemove empty chromosomes: ");
		ArrayList chrToDelete = new ArrayList<>();
		for (Chromosome chr : config.getGenome())
			if (chr.size() <= 1) chrToDelete.add(chr);

		for (Chromosome chr : chrToDelete) {
			if (verbose) System.out.println("\t\tRemoving empty chromosome: '" + chr.getId() + "'");
			config.getGenome().remove(chr);
		}

		// Show remaining chromosomes
		if (verbose) {
			if (chrToDelete.size() > 0) {
				System.out.print("\t\tChromosome left: ");
				for (Chromosome chr : config.getGenome())
					System.out.print(chr.getId() + " ");
				System.out.println("");
			}
		}
	}

	protected void replaceTranscript(Transcript trOld, Transcript trNew) {
		transcriptsById.remove(trOld.getId());
		transcriptsById.put(trNew.getId(), trNew);
	}

	public void setCircularCorrectLargeGap(boolean circularCorrectLargeGap) {
		this.circularCorrectLargeGap = circularCorrectLargeGap;
	}

	public void setCreateRandSequences(boolean createRandSequences) {
		this.createRandSequences = createRandSequences;
	}

	public void setDebug(boolean debug) {
		this.debug = debug;
	}

	public void setFastaFile(String fastaFile) {
		this.fastaFile = fastaFile;
	}

	public void setFileName(String fileName) {
		this.fileName = fileName;
	}

	public void setRandom(Random random) {
		this.random = random;
	}

	/**
	 * Read sequences?
	 * Note: This is only used for debugging and testing
	 */
	public void setReadSequences(boolean readSequences) {
		this.readSequences = readSequences;
	}

	public void setStoreSequences(boolean storeSequences) {
		this.storeSequences = storeSequences;
	}

	public void setVerbose(boolean verbose) {
		this.verbose = verbose;
	}

	/**
	 * Shw differences in chromosome names
	 */
	protected String showChromoNamesDifferences() {
		if (chromoNamesReference.isEmpty()) return "";

		// Get all chromosome names
		Set chrs = new HashSet<>();
		for (Gene g : config.getGenome().getGenes())
			chrs.add(g.getChromosomeName());

		//---
		// Show chromosomes not present in reference sequence file
		//---
		int counMissinfRef = 0;
		StringBuilder sbMissingRef = new StringBuilder();
		ArrayList chrsSorted = new ArrayList<>();
		chrsSorted.addAll(chrs);
		Collections.sort(chrsSorted);
		for (String chr : chrsSorted) {
			if (!chromoNamesReference.contains(chr)) {
				counMissinfRef++;
				if (sbMissingRef.length() > 0) sbMissingRef.append(", ");
				sbMissingRef.append("'" + chr + "'");
			}
		}

		//---
		// Show chromosomes not present in genes file
		//---
		int counMissinfGenes = 0;
		StringBuilder sbMissingGenes = new StringBuilder();
		ArrayList chrsRefSorted = new ArrayList<>();
		chrsRefSorted.addAll(chromoNamesReference);
		Collections.sort(chrsRefSorted);
		for (String chr : chrsRefSorted) {
			if (!chrs.contains(chr)) {
				counMissinfGenes++;
				if (sbMissingGenes.length() > 0) sbMissingRef.append(", ");
				sbMissingGenes.append("'" + chr + "'");
			}
		}

		// Show differences
		String msg = "";
		if (counMissinfRef > 0 && counMissinfGenes > 0) {
			msg = "There might be differences in the chromosome names used in the genes file " //
					+ "('" + fileName + "')" //
					+ "\nand the chromosme names used in the 'reference sequence' file" //
					+ (fastaFile != null ? " ('" + fastaFile + "')" : "") + "." //
					+ "\nPlease check that chromosome names in both files match.\n";
		}
		return msg //
				+ (sbMissingRef.length() > 0 ? "\tChromosome names missing in 'reference sequence' file:\t" + sbMissingRef.toString() : "") //
				+ (sbMissingGenes.length() > 0 ? "\n\tChromosome names missing in 'genes' file             :\t" + sbMissingGenes.toString() : "")//
		;
	}

	String unquote(String qstr) {
		return qstr.replaceAll("\"", "");
	}

	/**
	 * Create missing UTRs from CDS information
	 */
	void utrFromCds() {
		int i = 1;
		for (Gene gene : genome.getGenes())
			for (Transcript tr : gene)
				if (tr.utrFromCds(debug)) mark(i++);
	}

	/**
	 * Warning: Show a warning message (show some details)
	 * @param msg
	 */
	void warning(String msg) {
		if (verbose) System.err.println("WARNING: " + msg + ". File '" + fileName + "' line " + lineNum + "\t'" + line + "'");
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy