All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.nbio.genome.parsers.twobit.TwoBitParser Maven / Gradle / Ivy

There is a newer version: 7.1.3
Show newest version
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.genome.parsers.twobit;


import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.util.HashMap;

/**
 * downloaded from http://storage.bioinf.fbb.msu.ru/~roman/TwoBitParser.java
 *
 * Class is a parser of UCSC Genome Browser file format .2bit used to store
 * nucleotide sequence information. This class extends InputStream and can
 * be used as it after choosing one of names of containing sequences. This
 * parser can be used to do some work like UCSC tool named twoBitToFa. For
 * it just run this class with input file path as single parameter and set
 * stdout stream into output file. If you have any problems or ideas don't
 * hesitate to contact me through email: rsutormin[at]gmail.com.
 * @author Roman Sutormin
 */
public class TwoBitParser extends InputStream {

	private static final Logger logger = LoggerFactory.getLogger(TwoBitParser.class);

	public int DEFAULT_BUFFER_SIZE = 10000;
	//
	private RandomAccessFile raf;
	private File f;
	private boolean reverse = false;
	private String[] seq_names;
	private HashMap seq2pos = new HashMap();
	private String cur_seq_name;
	private long[][] cur_nn_blocks;
	private long[][] cur_mask_blocks;
	private long cur_seq_pos;
	private long cur_dna_size;
	private int cur_nn_block_num;
	private int cur_mask_block_num;
	private int[] cur_bits;
	private byte[] buffer;
	private long buffer_size;
	private long buffer_pos;
	private long start_file_pos;
	private long file_pos;
	//
	private static final char[] bit_chars = {
			'T','C','A','G'
	};

	public TwoBitParser(File f) throws Exception {
		this.f = f;
		raf = new RandomAccessFile(f,"r");
		long sign = readFourBytes();
		if(sign==0x1A412743) {
			logger.debug("2bit: Normal number architecture");
		}
		else if(sign==0x4327411A) {
			reverse = true;
			logger.debug("2bit: Reverse number architecture");
		}
		else throw new Exception("Wrong start signature in 2BIT format");
		readFourBytes();
		int seq_qnt = (int)readFourBytes();
		readFourBytes();
		seq_names = new String[seq_qnt];
		for(int i=0;i0)?0:-1;
		cur_mask_block_num = (cur_mask_blocks.length>0)?0:-1;
		cur_bits = new int[4];
		file_pos = start_file_pos;
		buffer_size = 0;
		buffer_pos = -1;
	}
	/**
	 * @return number (starting from 0) of next readable nucleotide in sequence stream.
	 */
	public long getCurrentSequencePosition() {
		if(cur_seq_name==null) throw new RuntimeException("Sequence is not set");
		return cur_seq_pos;
	}
	public void setCurrentSequencePosition(long pos) throws IOException {
		if(cur_seq_name==null) throw new RuntimeException("Sequence is not set");
		if(pos>cur_dna_size) throw new RuntimeException(
				"Postion is too high (more than "+cur_dna_size+")");
		if(cur_seq_pos>pos) {
			reset();
		}
		skip(pos-cur_seq_pos);
	}
	private void loadBits() throws IOException {
		if((buffer==null)||(buffer_pos<0)||(file_pos=buffer_pos+buffer_size)) {
			if((buffer==null)||(buffer.length!=DEFAULT_BUFFER_SIZE)) {
				buffer = new byte[DEFAULT_BUFFER_SIZE];
			}
			buffer_pos = file_pos;
			buffer_size = raf.read(buffer);
		}
		int cur_byte = buffer[(int)(file_pos-buffer_pos)]& 0xff;
		for(int i=0;i<4;i++) {
			cur_bits[3-i] = cur_byte%4;
			cur_byte /= 4;
		}
	}
	/**
	 * Method reads 1 nucleotide from sequence stream. You should set current sequence
	 * before use it.
	 */
	@Override
	public int read() throws IOException {
		if(cur_seq_name==null) throw new IOException("Sequence is not set");
		if(cur_seq_pos==cur_dna_size) {
			logger.debug("End of sequence (file position:{})", raf.getFilePointer());
			return -1;
		}
		int bit_num = (int)cur_seq_pos%4;
		if(bit_num==0) {
			loadBits();
		}
		else if(bit_num==3) {
			file_pos++;
		}
		char ret = 'N';
		if((cur_nn_block_num>=0)&&
				(cur_nn_blocks[cur_nn_block_num][0]<=cur_seq_pos)) {
			if(cur_bits[bit_num]!=0) {
				throw new IOException("Wrong data in NN-block ("+cur_bits[bit_num]+") "+
						"at position "+cur_seq_pos);
			}
			if(cur_nn_blocks[cur_nn_block_num][0]+cur_nn_blocks[cur_nn_block_num][1]==cur_seq_pos+1) {
				cur_nn_block_num++;
				if(cur_nn_block_num>=cur_nn_blocks.length) {
					cur_nn_block_num = -1;
				}
			}
			ret = 'N';
		}
		else {
			ret = bit_chars[cur_bits[bit_num]];
		}
		if((cur_mask_block_num>=0)&&
				(cur_mask_blocks[cur_mask_block_num][0]<=cur_seq_pos)) {
			ret = Character.toLowerCase(ret);
			if(cur_mask_blocks[cur_mask_block_num][0]+cur_mask_blocks[cur_mask_block_num][1]==cur_seq_pos+1) {
				cur_mask_block_num++;
				if(cur_mask_block_num>=cur_mask_blocks.length) {
					cur_mask_block_num = -1;
				}
			}
		}
		cur_seq_pos++;
		return ret;
	}
	/**
	 * Method skips n nucleotides in sequence stream. You should set current sequence
	 * before use it.
	 */
	@Override
	public synchronized long skip(long n) throws IOException {
		if(cur_seq_name==null) throw new IOException("Sequence is not set");
		if(n<4) {
			int ret = 0;
			while((ret=0)) ret++;
			return ret;
		}
		if(n>cur_dna_size-cur_seq_pos) {
			n = cur_dna_size-cur_seq_pos;
		}
		cur_seq_pos += n;
		file_pos = start_file_pos+(cur_seq_pos/4);
		raf.seek(file_pos);
		if((cur_seq_pos%4)!=0) {
			loadBits();
		}
		while((cur_nn_block_num>=0)&&
				(cur_nn_blocks[cur_nn_block_num][0]+cur_nn_blocks[cur_nn_block_num][1]<=cur_seq_pos)) {
			cur_nn_block_num++;
			if(cur_nn_block_num>=cur_nn_blocks.length) cur_nn_block_num = -1;
		}
		while((cur_mask_block_num>=0)&&
				(cur_mask_blocks[cur_mask_block_num][0]+cur_mask_blocks[cur_mask_block_num][1]<=cur_seq_pos)) {
			cur_mask_block_num++;
			if(cur_mask_block_num>=cur_mask_blocks.length) cur_mask_block_num = -1;
		}
		return n;
	}
	/**
	 * Method closes current sequence and it's necessary to invoke it before setting
	 * new current sequence.
	 */
	@Override
	public void close() throws IOException {
		cur_seq_name = null;
		cur_nn_blocks = null;
		cur_mask_blocks = null;
		cur_seq_pos = -1;
		cur_dna_size = -1;
		cur_nn_block_num = -1;
		cur_mask_block_num = -1;
		cur_bits = null;
		buffer_size = 0;
		buffer_pos = -1;
		file_pos = -1;
		start_file_pos = -1;
	}
	@Override
	public int available() throws IOException {
		if(cur_seq_name==null) throw new IOException("Sequence is not set");
		return (int)(cur_dna_size-cur_seq_pos);
	}
	/**
	 * Method closes random access file descriptor. You can't use any reading methods
	 * after it.
	 * @throws Exception
	 */
	public void closeParser() throws Exception {
		raf.close();
	}
	public File getFile() {
		return f;
	}
	public String loadFragment(long seq_pos,int len) throws IOException {
		if(cur_seq_name==null) throw new IOException("Sequence is not set");
		setCurrentSequencePosition(seq_pos);
		char[] ret = new char[len];
		int i = 0;
		for(;i{} pos={}, len={}", cur_seq_name, cur_seq_pos, len);
		char[] line = new char[60];
		boolean end = false;
		long qnt_all = 0;
		while(!end) {
			int qnt = 0;
			for(;(qnt0) {
				logger.info(new String(line,0,qnt));
			}
			if(qnt_all>=len) end = true;
		}
	}
	public static void main(String[] args) throws Exception {
		if(args.length==0) {
			logger.info("Usage:   [ [ []]]");
			logger.info("Resulting fasta data will be written in stdout.");
			return;
		}
		TwoBitParser p = new TwoBitParser(new File(args[0]));
		if(args.length==1) {
			String[] names = p.getSequenceNames();
			for(int i=0;i2) {
				long start = Long.parseLong(args[2]);
				p.skip(start);
			}
			if(args.length>3) {
				long len = Long.parseLong(args[3]);
				p.printFastaSequence(len);
			}
			else {
				p.printFastaSequence();
			}
			p.close();
		}
		p.closeParser();
	}
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy