All Downloads are FREE. Search and download functionalities are using the official Maven repository.

umcg.genetica.io.gtf.GtfReader Maven / Gradle / Ivy

There is a newer version: 1.0.7
Show newest version
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package umcg.genetica.io.gtf;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import umcg.genetica.collections.intervaltree.PerChrIntervalTree;

/**
 * Line based reader for GTF formatted annotation data.
 *
 * Each call to {@link #nextElement()} consumes lines from the underlying
 * reader until one data line has been parsed into a {@link GffElement}.
 * Lines starting with '#' and blank lines are skipped.
 *
 * @author Patrick Deelen
 */
public class GtfReader implements GffReaderInterface{

	private static final Pattern TAB_PATTERN = Pattern.compile("\\t");
	private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
	private static final Pattern SEMICOLON_PATTERN = Pattern.compile(";");
	private static final Logger LOGGER = Logger.getLogger(GtfReader.class);
	
	/** Marker used in the score and frame columns when no value is available. */
	private static final String MISSING_VALUE = ".";
	
	private final BufferedReader gtfFileBufferedReader;

	/**
	 * Create a reader for a GTF file on disk.
	 *
	 * @param gftFile the file to read
	 * @throws FileNotFoundException if the file cannot be opened for reading
	 */
	public GtfReader(File gftFile) throws FileNotFoundException{
		this(new FileReader(gftFile));
	}
	
	/**
	 * Create a reader on top of an existing character stream. The supplied
	 * reader is wrapped in a {@link BufferedReader} and is closed by
	 * {@link #close()}.
	 *
	 * @param gftReader source of the GTF lines, must not be null
	 * @throws IllegalArgumentException if gftReader is null
	 */
	public GtfReader(Reader gftReader) {
		
		if(gftReader == null){
			throw new IllegalArgumentException("Supplied gft reader = null");
		}
		
		gtfFileBufferedReader = new BufferedReader(gftReader);
		
	}
	
	/**
	 * Read and parse the next data line.
	 *
	 * A score of "." is returned as {@link Float#NaN} and a frame of "." is
	 * returned as -1.
	 *
	 * @return the parsed element, or null when the end of the input is reached
	 * @throws IOException if reading fails, if the line does not have 9 tab
	 * separated columns, if the strand column is empty or if the attributes
	 * column is malformed
	 * @throws NumberFormatException if start, end, score or frame is not a
	 * valid number
	 */
	@Override
	public GffElement nextElement() throws IOException{
		
		String line;
		do {
			
			line = gtfFileBufferedReader.readLine();
		
			if(line == null){
				return null;
			}
			
			// Blank lines must be tested before charAt(0), which would
			// otherwise throw on an empty line.
		} while (line.trim().isEmpty() || line.charAt(0) == '#');
		
		String[] lineElements = TAB_PATTERN.split(line);
		
		if(lineElements.length != 9){
			throw new IOException("Line does not contain 9 tab sepparated columns: " + line);
		}
		
		String seqname = lineElements[0];
		String source = lineElements[1];
		String feature = lineElements[2]; 
		int start = Integer.parseInt(lineElements[3]);
		int end = Integer.parseInt(lineElements[4]);
		// Compare the whole field: a score such as ".5" is a real value and
		// must not be mistaken for the missing value marker.
		float score = MISSING_VALUE.equals(lineElements[5]) ? Float.NaN : Float.parseFloat(lineElements[5]);
		if(lineElements[6].isEmpty()){
			throw new IOException("Line does not contain a strand: " + line);
		}
		char strand = lineElements[6].charAt(0);
		int frame = MISSING_VALUE.equals(lineElements[7]) ? -1 : Integer.parseInt(lineElements[7]);
		
		LinkedHashMap<String, String> attributes = parseAttributes(lineElements[8], line);
		
		return new GffElement(seqname, source, feature, start, end, score, strand, frame, attributes);
		
	}
	
	/**
	 * Parse the attributes column into name / value pairs, in file order.
	 *
	 * Attributes are separated by ';'. Within an attribute the name is the
	 * text before the first space and the value is everything after it, so
	 * values may contain spaces. One pair of surrounding double quotes is
	 * removed from the value. Values that themselves contain ';' are not
	 * supported because the column is split on every ';'.
	 *
	 * @param attributesInfo content of the attributes column
	 * @param line the complete line, only used in error messages
	 * @return the attributes in the order in which they appear
	 * @throws IOException if an attribute has no value
	 */
	private static LinkedHashMap<String, String> parseAttributes(String attributesInfo, String line) throws IOException{
		
		LinkedHashMap<String, String> attributes = new LinkedHashMap<String, String>();
		
		for(String attributeInfo : SEMICOLON_PATTERN.split(attributesInfo)){
			
			attributeInfo = attributeInfo.trim();
			
			if(attributeInfo.isEmpty()){
				// Whitespace after the final ';' or two consecutive ';'
				continue;
			}
			
			// Limit of 2: only the first space separates name from value
			String[] attributeInfoElements = SPACE_PATTERN.split(attributeInfo, 2);
			if(attributeInfoElements.length != 2){
				throw new IOException("Line attributes format error part: " + attributeInfo + " of line: " + line);
			}
			
			String attributeName = attributeInfoElements[0];
			// Trim to tolerate more than one space between name and value
			String attributeValue = attributeInfoElements[1].trim();
			
			// Length check prevents a lone '"' from being treated as both
			// the opening and the closing quote.
			if(attributeValue.length() >= 2 && attributeValue.charAt(0) == '\"' && attributeValue.charAt(attributeValue.length()-1) == '\"'){
				attributeValue = attributeValue.substring(1, attributeValue.length()-1);
			}
			
			attributes.put(attributeName, attributeValue);
			
		}
		
		return attributes;
		
	}
	
	/**
	 * Close the underlying reader.
	 *
	 * @throws IOException if closing the underlying reader fails
	 */
	@Override
	public void close() throws IOException{
		gtfFileBufferedReader.close();
	}

	/**
	 * Create an iterator backed by this reader.
	 *
	 * @return a new {@link GffElementIterator} constructed with this reader
	 */
	@Override
	public Iterator iterator() {
		return new GffElementIterator(this);
	}
	
	/**
	 * Build an interval tree from the elements of this reader by delegating
	 * to PerChrIntervalTree.createFromChrGroupedIterable.
	 *
	 * NOTE(review): the name of the delegate suggests the input must be
	 * grouped per chromosome - confirm against PerChrIntervalTree.
	 *
	 * @return the interval tree created from this reader
	 * @throws Exception if the delegate fails
	 */
	public PerChrIntervalTree createIntervalTree() throws Exception{
		
		return PerChrIntervalTree.createFromChrGroupedIterable(this, GffElement.class);
				
	}
	
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy