// org.biojava.nbio.ronn.Jronn Maven / Gradle / Ivy
/* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.ronn;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.data.sequence.FastaSequence;
import org.biojava.nbio.data.sequence.SequenceUtil;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Collectors;
/**
* This class provides a public API to the RONN functions.
* It is built on top of the command line client. Because of this, a few things
* could be improved and extended pending the refactoring of the command line client.
*
* Input sequence limitations: the input sequence must not contain any ambiguous characters
* and must have a minimum length of 19 amino acids.
*
* @author Peter Troshin
* @version 1.0
* @since 3.0.2
*
*/
public class Jronn implements Serializable {
/** Serialization version identifier of this class. */
private static final long serialVersionUID = 8104272449130849946L;

/** Loader instance whose models are loaded once, when this class is initialized. */
private static final ModelLoader loader = new ModelLoader();

// Load the models during class initialization. A failure to parse or read
// them is rethrown unchecked, with the original exception kept as the cause.
static {
    try {
        loader.loadModels();
    } catch (NumberFormatException | IOException cause) {
        throw new RuntimeException("Fails to load models!" + cause.getMessage(), cause);
    }
}
/**
* Holder for a range: the starting and ending positions on the sequence
* that delimit a disordered region, plus an associated score.
* All fields are final, so instances are immutable.
* @author pvtroshin
*/
public static class Range {
/**
* Range starting position counts from 1 (the first position on the sequence is 1)
*/
public final int from;
/**
* The range ending position includes the last residue.
*/
public final int to;
/**
* Score attached to this range. Which score value is stored is decided by
* the code that constructs the range.
*/
public final float score;
/**
* Creates a range.
*
* @param from starting position; asserted to be non-negative
* @param to ending position
* @param score score to attach to the range
*/
public Range(int from, int to, float score) {
assert from>=0;
// NOTE(review): the source text appears to be truncated at this point.
// `from0` is not a declared name, and `probability` / `scores` used below are
// not parameters of this constructor. The remainder of the Range class and
// the signature of the method that the following statements belong to seem
// to be missing. Restore them from the upstream source before compiling.
assert from0;
// The threshold must lie strictly between 0 and 1.
assert probability>0 && probability<1;
// 1-based position of the score currently being examined.
int count=0;
// Number of consecutive scores seen so far that are above the threshold.
int regionLen=0;
List ranges = new ArrayList();
for(float score: scores) {
count++;
// Round to 2 decimal places before comparing with the threshold
score = (float) (Math.round(score*100.0)/100.0);
if(score>probability) {
regionLen++;
} else {
if(regionLen>0) {
// A run of above-threshold scores has just ended on the previous position:
// it spans positions count-regionLen .. count-1.
// NOTE(review): the score stored with the range is the rounded score of the
// current (terminating, at-or-below-threshold) position - confirm this is intended.
ranges.add(new Range(count-regionLen, count-1,score));
}
regionLen=0;
}
}
// Handle a run that extends to the very end of the scores.
// NOTE(review): this requires regionLen>1, so a trailing run of length 1 is not
// reported, whereas the loop above reports runs of length 1 (regionLen>0).
// The score stored here is the unrounded last score. Confirm both are intended.
if(regionLen>1) {
ranges.add(new Range(count-regionLen+1, count,scores[scores.length-1]));
}
return ranges.toArray(new Range[ranges.size()]);
}
/**
 * Calculates the probability of disorder scores for each residue in the sequence for
 * many sequences in the input.
 * <p>
 * Every element of the list is passed to {@code predictSerial} and the results are
 * gathered with {@link Collectors#toMap(java.util.function.Function, java.util.function.Function)}.
 * That collector gives no guarantee about the ordering of the returned map and
 * rejects duplicate keys.
 *
 * @param sequences the list of the FastaSequence objects
 * @return the Map with key: FastaSequence, value: probability of disorder for each residue
 * @throws IllegalStateException if the list contains two sequences that are equal to each other
 * @see #getDisorder(FastaSequence)
 */
public static Map getDisorderScores(List sequences) {
    return sequences.stream()
            .collect(Collectors.toMap(sequence -> sequence, sequence -> predictSerial(sequence)));
}
/**
 * Calculates the disordered regions of the sequence for many sequences in the input.
 * <p>
 * Every element of the list is passed to the single-sequence {@code getDisorder} and the
 * results are gathered with
 * {@link Collectors#toMap(java.util.function.Function, java.util.function.Function)}.
 * That collector gives no guarantee about the ordering of the returned map and
 * rejects duplicate keys.
 *
 * @param sequences the list of the FastaSequence objects
 * @return the Map with key: FastaSequence, value: the single-sequence {@code getDisorder}
 *         result for that sequence
 * @throws IllegalStateException if the list contains two sequences that are equal to each other
 * @see #getDisorder(FastaSequence)
 */
public static Map getDisorder(List sequences) {
    return sequences.stream()
            .collect(Collectors.toMap(sequence -> sequence, sequence -> getDisorder(sequence)));
}
/**
 * Calculates the disordered regions of the protein sequences stored in a FASTA file.
 * <p>
 * The file is read completely first; the input stream is closed before the
 * prediction starts, and also when reading fails.
 *
 * @param fastaFile input file name containing the sequence in FASTA
 * @return the Map with key: FastaSequence, value: the list of disordered regions for each sequence
 * @throws FileNotFoundException if the input file cannot be found
 * @throws IOException if the system cannot access or read from the input file
 * @see #getDisorder(FastaSequence)
 * @see Jronn.Range
 */
public static Map getDisorder(String fastaFile) throws FileNotFoundException, IOException {
    final List sequences;
    // try-with-resources guarantees the stream is released even if parsing throws.
    try (FileInputStream input = new FileInputStream(fastaFile)) {
        sequences = SequenceUtil.readFasta(input);
    }
    return getDisorder(sequences);
}
/**
* TODO
*
* High performance method for calculating disorder. Use multiple threads to achieve the speedup.
*
* @param fastaFile fully qualified path to the input FASTA file
* @param outputFile file name of the file for the results
* @param threadNumber the number of threads to use, default
* @param layout controls the format of the result file
* @throws FileNotFoundException if the input file is not found
* @throws IOException if the input or the output files cannot be accessed
* @see ORonn.ResultLayout
public static void calculateDisorder(String fastaFile, String outputFile, int threadNumber, ResultLayout layout) throws FileNotFoundException, IOException {
final List sequences = SequenceUtil.readFasta(new FileInputStream(fastaFile));
InputParameters in = new InputParameters();
in.setFilePrm(fastaFile, InputParameters.inputKey);
in.setFilePrm(outputFile, InputParameters.outputKey);
//in.setThreadNum(Integer.toString(threadNumber));
ORonn.predictParallel(sequences, in, loader);
}
*/
}