All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.vcf.FileIndexChrPos Maven / Gradle / Ivy

The newest version!
package org.snpeff.vcf;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Set;

import org.snpeff.interval.Chromosome;
import org.snpeff.util.Gpr;

/**
 * Index a file that has "chr \t pos" as the beginning of a line (e.g. VCF)
 *
 * WARNING: It is assumed that the file is ordered by position (chromosome order does not matter)
 *
 * @author pcingola
 */
public class FileIndexChrPos {

	/**
	 * A part of a file
	 *
	 */
	public class FileRegion {
		long start, end;
		String lineStart, lineEnd;

		@Override
		public String toString() {
			return start + "\t" + lineStart //
					+ "\n"//
					+ end + "\t" + lineEnd //"
					;
		}
	}

	/**
	 * A line and the position on the file where it begins
	 */
	public class LineAndPos {
		public String line; // Line
		public long position; // Position in file where line starts

		@Override
		public String toString() {
			String str = "";
			if (line != null) {
				if (line.length() > 50) str = line.substring(0, 49) + "...";
				else str = line;
			}
			return position + "\t" + str;
		}
	}

	public static final int POS_OFFSET = 1; // VCF files are one-based
	private static final int BUFF_SIZE = 17; // 1024 * 1024;

	boolean verbose = false;
	boolean debug = false;
	String fileName;
	long size = 0;
	RandomAccessFile file;
	HashMap fileRegions = new HashMap(); // Store file regions by chromosome

	public FileIndexChrPos(String fileName) {
		this.fileName = fileName;
	}

	/**
	 * Get chromosome info
	 */
	String chromo(String line) {
		if (line.startsWith("#")) return null;
		return line.split("\\t")[0];
	}

	/**
	 * Close file
	 */
	public void close() {
		try {
			if (file != null) file.close();
		} catch (IOException e) {
			System.err.println("I/O problem while closing file '" + fileName + "'");
			throw new RuntimeException(e);
		}
		file = null;
	}

	/**
	 * Dump a region of the file to STDOUT
	 *
	 * @param posStart : Start file coordinate
	 * @param posEnd   : End file coordinate
	 * @param toString : Return a sting with file contents?
	 *
	 * @return If toString is 'true', return a string with file's content between those coordinates (this is used only for test cases and debugging)
	 */
	String dump(long start, long end, boolean toString) {
		if (verbose) System.err.println("\tDumping file '" + fileName + "' interval [ " + start + " , " + end + " ]");

		StringBuilder sb = new StringBuilder();

		try {
			byte buff[] = new byte[BUFF_SIZE];
			file.seek(start);
			for (long curr = start; curr <= end;) {
				long len = Math.min(BUFF_SIZE, end - curr + 1); // Maximum length to read
				int read = file.read(buff, 0, (int) len); // Read file

				if (read <= 0) break; // Error or nothing read, abort

				String out = new String(buff, 0, read);

				// Show or append to string
				if (toString) sb.append(out);
				else System.out.print(out);

				curr += read;
			}
		} catch (Exception e) {
			throw new RuntimeException("Error reading file '" + fileName + "' from position " + start + " to " + end);
		}

		return sb.toString();
	}

	/**
	 * Dump all lines in the interval chr:posStart-posEnd
	 *
	 * @param chr      : Chromosome
	 * @param posStart : Start coordinate in chromosome (zero-based)
	 * @param posEnd   : End coordinate in chromosome (zero-based)
	 * @param toString : Return a sting with file contents?
	 *
	 * @return If toString is 'true', return a string with file's content between those coordinates (this is used only for test cases and debugging)
	 */
	public String dump(String chr, int posStart, int posEnd, boolean toString) {
		long fileStart = find(chr, posStart, true);
		long fileEnd = find(chr, posEnd, false);

		return dump(fileStart, fileEnd - 1, toString);
	}

	/**
	 * Find the position in the file for the first character of the first line whose genomic position is less or equal than 'chrPos'
	 * @param chrPos : Chromosome coordinate (zero-based)
	 * @param start : File start coordinate (zero-based)
	 * @param lineStart : Line at 'start' coordinate
	 * @param end : File end coordinate (zero-based)
	 * @param lineEnd : Line at 'end' coordinate
	 * @return position in file between [start, end] where chrPos can be found
	 */
	long find(int chrPos, long start, String lineStart, long end, String lineEnd, boolean lessEq) {
		//---
		// Check break conditions
		//---
		int posStart = pos(lineStart);
		int posEnd = pos(lineEnd);
		if (debug) Gpr.debug("Find:\t" + chrPos + "\t[" + posStart + ", " + posEnd + "]\tFile: [" + start + " , " + end + "]\tsize: " + (end - start) //
				+ "\n\t\t\t\t" + s(lineStart) //
				+ "\n\t\t\t\t" + s(lineEnd) //
				+ "\n");

		if (chrPos == posStart) return found(start, lineStart, lessEq); // Is it lineStart?
		if (posEnd == chrPos) return found(end, lineEnd, lessEq); // Is it lineEnd?
		if (chrPos < posStart) return start; // Before start?
		if (posEnd < chrPos) return end + lineEnd.length() + 1; // After end?
		if (start + 1 >= end) { // Only one byte of difference between start an end? (i.e. we are at a line boundary)
			if (chrPos <= posStart) return found(start, lineStart, lessEq);
			if (chrPos < posEnd) return found(end, lineEnd, true);
			return found(end, lineEnd, false);
		}

		if (posStart >= posEnd) throw new RuntimeException("This should never happen! Is the file sorted by position?"); // Sanity check

		//---
		// Recurse
		//---
		long mid = (start + end) / 2;
		LineAndPos lpmid = getLine(mid);
		String lineMid = lpmid.line;
		long posMid = pos(lineMid);

		if (chrPos <= posMid) return find(chrPos, start, lineStart, mid, lineMid, lessEq);
		else return find(chrPos, mid, lineMid, end, lineEnd, lessEq);
	}

	/**
	 * Find the position in the file for the first character of the first line equal or less than a specific chr:pos
	 */
	public long find(String chr, int pos, boolean lessEq) {
		chr = Chromosome.simpleName(chr);
		FileRegion fr = fileRegions.get(chr);
		if (fr == null) throw new RuntimeException("No such chromosome: '" + chr + "'");

		// Find position in file
		long posFound = find(pos, fr.start, fr.lineStart, fr.end, fr.lineEnd, lessEq);

		// Get line information
		LineAndPos linePos = getLine(posFound);

		// Get beginning of line position
		if (linePos != null) return linePos.position;
		return -1;
	}

	/**
	 * Calculate coordinate of this line or next line
	 */
	long found(long filePos, String fileLine, boolean lessEq) {
		if (!lessEq) {
			long pos = filePos + fileLine.length() + 1; // Next line
			return (pos < size() ? pos : size());
		}

		// Beginning of 'filePos' line
		return filePos;
	}

	/**
	 * Get a byte from a file
	 */
	public byte get(long bytePosition) {
		try {
			// Change position if needed
			if (file.getFilePointer() != bytePosition) file.seek(bytePosition);
			return (byte) file.read();
		} catch (IOException e) {
			throw new RuntimeException("Error readin file '" + fileName + "' at position " + bytePosition, e);
		}
	}

	/**
	 * Read 'len' bytes after 'bytePosition' or until a '\n' is reached.
	 * If len is negative, read 'abs(len)' bytes before bytePosition or until '\n' is reached
	 *
	 * @param bytePosition
	 * @param len
	 * @return An array of 'len' bytes. null if either end of file (len > 0) or beginning of file (len < 0)
	 */
	public byte[] get(long bytePosition, int len) {
		try {
			int size = Math.abs(len);

			// Change position if needed
			long pos = bytePosition;
			if (len < 0) {
				if (bytePosition <= 0) return null;
				pos -= size;

				// Before beginning of file?
				if (pos < 0) {
					pos = 0;
					size = (int) bytePosition;
				}
				pos = Math.max(pos, 0);
			}
			if (file.getFilePointer() != pos) file.seek(pos);

			byte buff[] = new byte[size];
			int read = file.read(buff);

			// Nothing to read?
			if (read <= 0) return null;

			// Buffer was too long? Return an array of byte with exactly the number of bytes
			if (read < buff.length) {
				byte newBuff[] = new byte[read];
				System.arraycopy(buff, 0, newBuff, 0, read);
				buff = newBuff;
			}

			// Only return bytes until 'new line'
			if (len > 0) {
				// Find new line
				int newLine = -1;
				for (int i = 0; i < read; i++)
					if (buff[i] == '\n') {
						newLine = i;
						break;
					}

				// Copy only the part until a newLine
				if (newLine >= 0) {
					byte newBuff[] = new byte[newLine + 1];
					System.arraycopy(buff, 0, newBuff, 0, newLine + 1);
					buff = newBuff;
				}
			} else if (len < 0) {
				// Find new line backwards
				int newLine = -1;
				for (int i = read - 1; i >= 0; i--)
					if (buff[i] == '\n') {
						newLine = i;
						break;
					}

				// Copy only the part until a newLine
				if (newLine >= 0) {
					byte newBuff[] = new byte[read - newLine];
					for (int i = newLine, j = 0; i < read; i++, j++)
						newBuff[j] = buff[i];
					buff = newBuff;
				}
			}

			return buff;
		} catch (IOException e) {
			throw new RuntimeException("Error readin file '" + fileName + "' at position " + bytePosition, e);
		}
	}

	/**
	 * Available chromosomes
	 * @return
	 */
	public Set getChromos() {
		return fileRegions.keySet();
	}

	/**
	 * Get position where 'chr' ends
	 * @param chr
	 * @return -1 if 'chr' is not in the index
	 */
	public long getEnd(String chr) {
		chr = Chromosome.simpleName(chr);
		FileRegion fr = fileRegions.get(chr);
		if (fr == null) return -1;
		return fr.end;
	}

	/**
	 * Get file region for a given chrosmome
	 */
	FileRegion getFileRegion(String chr) {
		chr = Chromosome.simpleName(chr);
		FileRegion fr = fileRegions.get(chr);
		if (fr == null) {
			fr = new FileRegion();
			fileRegions.put(chr, fr);
		}
		return fr;
	}

	/**
	 * Get the line where 'pos' hits
	 * @return A string with the line that 'pos' hits, null if it's out of boundaries
	 */
	public LineAndPos getLine(long pos) {
		int BUFF_SIZE = 10240;
		long size = size();
		if ((pos >= size) || (pos < 0)) return null;

		LineAndPos linePos = new LineAndPos();
		StringBuffer sb = new StringBuffer();

		// Get bytes after 'pos'
		long position;
		for (position = pos; position < size;) {
			byte b[] = get(position, BUFF_SIZE);
			if (b == null) break; // End of file
			sb.append(new String(b));
			if (b[b.length - 1] == '\n') break; // Found new line?
			position += b.length;
		}

		// Get bytes before 'pos'
		for (position = pos; position >= 0;) {
			byte b[] = get(position, -BUFF_SIZE);
			if (b == null) break; // Beginning of file
			sb.insert(0, new String(b));
			position -= b.length;
			if (b[0] == '\n') break; // Found new line?
		}

		// Remove leading and trailing '\n'?
		int lineStart = 0, lineEnd = sb.length();
		if (sb.charAt(0) == '\n') lineStart = 1;
		if (sb.charAt(sb.length() - 1) == '\n') lineEnd = sb.length() - 1;

		linePos.line = sb.toString().substring(lineStart, lineEnd);
		linePos.position = position + lineStart;

		return linePos;
	}

	/**
	 * A slow method for getLine
	 */
	public LineAndPos getLineSlow(long pos) {
		long size = size();
		if ((pos >= size) || (pos < 0)) return null;

		LineAndPos linePos = new LineAndPos();
		StringBuffer sb = new StringBuffer();

		// Get bytes before 'pos'
		long position;
		for (position = pos - 1; position >= 0; position--) {
			byte b = get(position);
			if (b == '\n') break;
			sb.insert(0, (char) b);
		}
		linePos.position = position + 1;

		// Get bytes after 'pos'
		for (position = pos; position < size; position++) {
			byte b = get(position);
			if (b == '\n') break;
			sb.append((char) b);
		}
		linePos.line = sb.toString();

		// if (debug) Gpr.debug("Line & Position: " + linePos);
		return linePos;
	}

	/**
	 * Get position where 'chr' starts
	 * @param chr
	 * @return -1 if 'chr' is not in the index
	 */
	public long getStart(String chr) {
		chr = Chromosome.simpleName(chr);
		FileRegion fr = fileRegions.get(chr);
		if (fr == null) return -1;
		return fr.start;
	}

	/**
	 * Index chromosomes in the whole file
	 */
	public void index() {
		if (file == null) throw new RuntimeException("File error (forgot to open the file?).");

		// Last line (minus '\n' character, minus one)
		long end = size() - 1;
		if (end < 0) return; // Empty database file

		String lineEnd = getLine(end).line;
		String chrEnd = chromo(lineEnd);

		// Add fileRegion.end for last chromsome in the file
		FileRegion fr = getFileRegion(chrEnd);
		fr.end = end;
		fr.lineEnd = lineEnd;
		if (verbose) System.err.println("\tindex:\t" + chrEnd + "\t" + end);

		// Find first non-comment line
		long start = 0;
		String lineStart = "";
		for (start = 0; start < size; start += lineStart.length() + 1) {
			lineStart = getLine(start).line;
			if (chromo(lineStart) != null) break;
		}

		String chrStart = chromo(lineStart);

		// Add fileRegion.start for first chromsome in the file
		fr = getFileRegion(chrStart);
		fr.start = start;
		fr.lineStart = lineStart;
		if (verbose) System.err.println("\tindex:\t" + chrStart + "\t" + start);

		// Index the rest of the file
		indexChromos(start, lineStart, end, lineEnd);
	}

	/**
	 * Index chromosomes in a region of a file
	 * @param start
	 * @param lineStart
	 * @param end
	 * @param lineEnd
	 */
	void indexChromos(long start, String lineStart, long end, String lineEnd) {
		if (debug) Gpr.debug("Index:"//
				+ "\n\t" + start + "(" + (((double) start) / size()) + ") :\t" + s(lineStart) //
				+ "\n\t" + end + "(" + (((double) end) / size()) + ") :\t" + s(lineEnd));

		if (start > end) throw new RuntimeException("This should never happen! Start: " + start + "\tEnd: " + end);

		// Parse chromosome start and end
		String chrStart = chromo(lineStart);
		if (chrStart == null) throw new RuntimeException("Cannot extract chromosome data from line:"//
				+ "\n\tPosition : " + start + " (byte position in file) "//
				+ "\n\tLine     : " + lineStart//
		);

		String chrEnd = chromo(lineEnd);
		if (chrEnd == null) throw new RuntimeException("Cannot extract chromosome data from line:"//
				+ "\n\tPosition : " + end + " (byte position in file) "//
				+ "\n\tLine     : " + lineEnd//
		);

		if (chrStart.equals(chrEnd)) {
			if (debug) Gpr.debug("Chromo:\tlineStart: " + chrStart + "\tlineEnd: " + chrEnd + "\t==> Back!");
			return;
		}
		if (debug) Gpr.debug("Chromo:\tlineStart: " + chrStart + "\tlineEnd: " + chrEnd);

		if ((start + lineStart.length() + 1) >= end) {
			if (verbose) System.err.println("\t\t" + chrStart + " / " + chrEnd + "\t" + start + " / " + end);

			// Add index where chromosome starts
			getFileRegion(chrEnd).start = getLine(end).position;
			getFileRegion(chrEnd).lineStart = lineEnd;

			// Add index where chromosome ends
			getFileRegion(chrStart).end = getLine(start).position;
			getFileRegion(chrStart).lineEnd = lineStart;
			return;
		}

		long mid = (start + end) / 2;
		String lineMid = getLine(mid).line;
		if (debug) Gpr.debug("Mid: " + mid + "\t" + s(lineMid));

		if (debug) Gpr.debug("First half recustion:");
		indexChromos(start, lineStart, mid, lineMid);

		if (debug) Gpr.debug("Second half recustion:");
		indexChromos(mid, lineMid, end, lineEnd);
	}

	void init(FileChannel channel) throws IOException {
	}

	/**
	 * Open file and initiate mappings
	 */
	public void open() {
		try {
			File f = new File(fileName);
			size = f.length();
			file = new RandomAccessFile(f, "r");
		} catch (FileNotFoundException e) {
			System.err.println("File not found '" + fileName + "'");
			throw new RuntimeException(e);
		}
	}

	/**
	 * The position argument of a line (second column in tab-separated format). Negative if not found
	 * @return The position argument of a line. Negative if not found
	 */
	public int pos(String line) {
		if (line.startsWith("#")) return -1; // In VCF, positions are one-based, so zero denotes an error
		return Gpr.parseIntSafe(line.split("\\t")[1]) - POS_OFFSET;
	}

	String s(String s) {
		if (s == null) return "null";
		return s.length() <= 50 ? s : s.substring(0, 50) + "...";
	}

	public void setDebug(boolean debug) {
		this.debug = debug;
	}

	public void setVerbose(boolean verbose) {
		this.verbose = verbose;
	}

	/**
	 * File size
	 */
	public long size() {
		return size;
	}

	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();

		ArrayList keys = new ArrayList<>();
		keys.addAll(fileRegions.keySet());
		Collections.sort(keys);

		for (String key : keys)
			sb.append(key + ":\n" + Gpr.prependEachLine("\t\t", fileRegions.get(key).toString()) + "\n");

		return sb.toString();
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy