
// Source listing of org.snpeff.serializer.MarkerSerializer, from the SnpEff artifact
// (variant annotation and effect prediction package), newest version.
package org.snpeff.serializer;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPOutputStream;
import org.snpeff.SnpEff;
import org.snpeff.fileIterator.LineFileIterator;
import org.snpeff.interval.Cds;
import org.snpeff.interval.Chromosome;
import org.snpeff.interval.Exon;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Genome;
import org.snpeff.interval.Marker;
import org.snpeff.interval.MarkerParentId;
import org.snpeff.interval.MarkerSeq;
import org.snpeff.interval.Markers;
import org.snpeff.interval.Motif;
import org.snpeff.interval.NextProt;
import org.snpeff.interval.RareAminoAcid;
import org.snpeff.interval.Regulation;
import org.snpeff.interval.SpliceSiteAcceptor;
import org.snpeff.interval.SpliceSiteBranch;
import org.snpeff.interval.SpliceSiteBranchU12;
import org.snpeff.interval.SpliceSiteDonor;
import org.snpeff.interval.Transcript;
import org.snpeff.interval.Utr3prime;
import org.snpeff.interval.Utr5prime;
import org.snpeff.snpEffect.EffectType;
import org.snpeff.util.Gpr;
/**
 * Serialize markers to (and from) a gzip-compressed, tab-separated text file.
 *
 * Note: Marker's children are serialized first (e.g. a transcript gets all
 * its exons serialized first).
 *
 * Note: Since Marker is a tree-like structure, we first load all the markers and then
 * assign parents. Markers are assigned a fake parent object (MarkerParentId)
 * which is later replaced by the 'real' parent.
 *
 * Note: All 'IDs' used have no meaning outside this serialization process.
 *
 * @author pcingola
 */
public class MarkerSerializer {

    PrintStream outFile; // Output stream used while saving
    int lineNum; // Number of marker lines written by the current/last save
    String line; // Line currently being parsed by 'load'
    int parsedField; // Index of the next entry of 'fields' returned by getNextField()
    String[] fields; // Tab-separated fields of the line currently being parsed
    int currId = 0; // Last numeric ID handed out by getNextId()
    Genome genome; // Genome object re-used when a GENOME line is loaded (may be null)
    Map<Integer, TxtSerializable> byId; // Loaded objects, indexed by serialization ID
    Map<Marker, Integer> byMarker; // Serialization ID of every marker already saved
    Set<Marker> doNotSave; // Markers explicitly excluded from saving (lazily created)

    public MarkerSerializer() {
        this(null);
    }

    /**
     * @param genome Genome instance to populate when a GENOME line is loaded;
     *               if null, a new Genome is created instead
     */
    public MarkerSerializer(Genome genome) {
        this.genome = genome;
        byId = new HashMap<>();
        byMarker = new HashMap<>();
    }

    /**
     * Exclude a marker from serialization: it is never written and its ID is reported as -1.
     */
    public void doNotSave(Marker m) {
        if (doNotSave == null) doNotSave = new HashSet<>();
        doNotSave.add(m);
    }

    /**
     * Get a loaded object by its serialization ID.
     *
     * @return The object, or null if no object was loaded using that ID
     */
    protected TxtSerializable getById(int id) {
        return byId.get(id);
    }

    /**
     * Get the serialization ID assigned to a marker.
     *
     * @return The marker's ID, or -1 if the marker was flagged using doNotSave(Marker)
     * @throws RuntimeException if the marker has not been assigned an ID
     */
    public int getIdByMarker(Marker m) {
        Integer id = byMarker.get(m);
        if (isDoNotSave(m)) return -1;
        if (id == null) { //
            throw new RuntimeException("Marker has no numeric ID. \n" //
                    + "\tClass    : " + m.getClass().getSimpleName() + "\n" //
                    + "\tMarker ID: '" + m.getId() + "'\n" //
                    + "\t" + m);
        }
        return id;
    }

    /**
     * Same as getById(int), with the result cast to Marker.
     */
    protected Marker getMarkerById(int id) {
        return (Marker) getById(id);
    }

    /**
     * Consume the next field of the line being parsed.
     *
     * @return The field, or an empty string if all fields have already been consumed
     */
    public String getNextField() {
        if (fields.length <= parsedField) return "";
        return fields[parsedField++];
    }

    /** Consume the next field and convert it using Gpr.parseBoolSafe. */
    public boolean getNextFieldBoolean() {
        return Gpr.parseBoolSafe(getNextField());
    }

    /** Consume the next field and convert it using Gpr.parseIntSafe. */
    public int getNextFieldInt() {
        return Gpr.parseIntSafe(getNextField());
    }

    /**
     * Consume the next field as a serialization ID and look up the corresponding marker.
     *
     * @return The marker, or null if no marker was loaded using that ID
     */
    public Marker getNextFieldMarker() {
        return getMarkerById(getNextFieldInt());
    }

    /**
     * Consume the next field as a comma-separated list of serialization IDs.
     *
     * @return The corresponding markers (empty if the field is empty)
     * @throws RuntimeException if any of the IDs has not been loaded
     */
    public Markers getNextFieldMarkers() {
        Markers markers = new Markers();
        String fieldIdsStr = getNextField();
        if (fieldIdsStr.isEmpty()) return markers;

        for (String idStr : fieldIdsStr.split(",")) {
            int id = Gpr.parseIntSafe(idStr);
            Marker m = getMarkerById(id);
            if (m == null) throw new RuntimeException("Marker '" + id + "' not found. This should never happen!");
            markers.add(m);
        }
        return markers;
    }

    /** Create a new serialization ID (IDs start at 1). */
    protected int getNextId() {
        return ++currId;
    }

    boolean isDoNotSave(Marker m) {
        return doNotSave != null && doNotSave.contains(m);
    }

    /**
     * Load data from file
     *
     * The first line is the header (software name and version), every other line
     * describes one marker.
     *
     * @param fileName Name of the (gzipped) file to read
     * @return All markers loaded, with their parents already resolved
     * @throws RuntimeException if the header is not compatible with this program
     *                          version or if a line cannot be parsed
     */
    public Markers load(String fileName) {
        LineFileIterator lfi = new LineFileIterator(fileName, true); // File is gzipped
        int linesRead = 0;
        for (String l : lfi) {
            line = l;
            if (linesRead == 0) checkHeader(fileName, l);
            else parseMarkerLine(fileName, linesRead + 1);
            linesRead++;
        }
        return resolveParents();
    }

    /** Check that the header line (software name and version) matches this program. */
    private void checkHeader(String fileName, String headerLine) {
        String[] headerFields = headerLine.split("\t");
        if (headerFields.length <= 1) return; // No version information: nothing to check

        String soft = headerFields[0];
        String versionNumber = headerFields[1];
        if (!soft.equals(SnpEff.SOFTWARE_NAME)) throw new RuntimeException("Database file '" + fileName + "' is not compatible with this program version. Try installing the appropriate database.");
        if (!versionNumber.equals(SnpEff.VERSION_MAJOR)) throw new RuntimeException("Database file '" + fileName + "' is not compatible with this program version:"//
                + "\n\tDatabase version : '" + versionNumber + "'"//
                + "\n\tProgram version  : '" + SnpEff.VERSION_MAJOR + "'" //
                + "\nTry installing the appropriate database." //
        );
    }

    /** Parse the current line (type, ID, marker specific fields) and index the new marker by ID. */
    private void parseMarkerLine(String fileName, int lineNumber) {
        parsedField = 0;
        fields = line.split("\t", -1);
        if (fields.length < 2) throw new RuntimeException("Error parsing line " + lineNumber + " from file '" + fileName + "': Expecting at least 2 tab-separated fields (type and ID), but found " + fields.length + "\n\t" + line);

        // Parse field type
        EffectType type;
        try {
            type = EffectType.valueOf(fields[0]);
        } catch (IllegalArgumentException e) {
            throw new RuntimeException("Error parsing line " + lineNumber + " from file '" + fileName + "': Unknown marker type '" + fields[0] + "'\n\t" + line, e);
        }

        // Parse serialization id
        int id = Gpr.parseIntSafe(fields[1]);

        Marker m = createMarker(type);
        try {
            // The marker reads its own fields using the getNextField*() methods
            m.serializeParse(this);
        } catch (Exception e) {
            throw new RuntimeException("Error parsing line " + lineNumber + " from file '" + fileName + "'\n\t" + line + "\n\tField [" + parsedField + "] : '" + (parsedField < fields.length ? fields[parsedField] : "-") + "'", e);
        }

        // Add to hash
        byId.put(id, m);
    }

    /** Create an empty marker of the class matching 'type' (fields are filled in later). */
    private Marker createMarker(EffectType type) {
        switch (type) {
        case GENOME:
            return genome != null ? genome : new Genome();
        case CHROMOSOME:
            return new Chromosome();
        case SEQUENCE:
            return new MarkerSeq();
        case GENE:
            return new Gene();
        case TRANSCRIPT:
            return new Transcript();
        case CDS:
            return new Cds();
        case EXON:
            return new Exon();
        case UTR_3_PRIME:
            return new Utr3prime();
        case UTR_5_PRIME:
            return new Utr5prime();
        case RARE_AMINO_ACID:
            return new RareAminoAcid();
        case SPLICE_SITE_ACCEPTOR:
            return new SpliceSiteAcceptor();
        case SPLICE_SITE_BRANCH:
            return new SpliceSiteBranch();
        case SPLICE_SITE_BRANCH_U12:
            return new SpliceSiteBranchU12();
        case SPLICE_SITE_DONOR:
            return new SpliceSiteDonor();
        case NEXT_PROT:
            return new NextProt();
        case MOTIF:
            return new Motif();
        case REGULATION:
            return new Regulation();
        default:
            throw new RuntimeException("Unimplemented for type '" + type + "'");
        }
    }

    /** Replace every placeholder parent (MarkerParentId) by the marker loaded with that ID. */
    private Markers resolveParents() {
        Markers markers = new Markers();
        for (TxtSerializable tm : byId.values()) {
            if (!(tm instanceof Marker)) continue;
            Marker m = (Marker) tm;

            // Do we need to replace parent?
            if (m.getParent() instanceof MarkerParentId) {
                int parentId = ((MarkerParentId) m.getParent()).getParentId();
                // The parent is set to null when no marker was loaded using 'parentId'
                m.setParent(getMarkerById(parentId));
            }

            markers.add(m);
        }
        return markers;
    }

    /**
     * Save all markers
     *
     * @return Comma-separated list of the markers' serialization IDs
     */
    public String save(Iterable<? extends Marker> markersCollection) {
        StringBuilder idStr = new StringBuilder();
        for (Marker m : markersCollection) {
            int id = save(m);
            if (idStr.length() > 0) idStr.append(",");
            idStr.append(id);
        }
        return idStr.toString();
    }

    /**
     * Save a marker
     *
     * @return The marker's serialization ID, or -1 if the marker is null or was
     *         flagged using doNotSave(Marker)
     */
    public int save(Marker m) {
        if (m == null) return -1;
        if (shouldSkip(m)) return getIdByMarker(m); // Already done

        // Register the ID before serializing, so the marker is known while its line is being built
        int id = getNextId();
        if (byMarker.put(m, id) != null) throw new RuntimeException("Marker already had a numeric ID. Marker : " + m.toStr());

        // Print line
        String serialized = m.serializeSave(this);
        outFile.print(serialized + "\n");
        lineNum++;
        return id;
    }

    /**
     * Save data to file
     *
     * @param fileName Name of the (gzipped) file to write
     * @param markers  Markers to save
     * @throws RuntimeException if the file cannot be created or written
     */
    public void save(String fileName, Markers markers) {
        lineNum = 0;
        currId = 0;
        try (FileOutputStream fileOut = new FileOutputStream(fileName); //
                GZIPOutputStream gzipOut = new GZIPOutputStream(fileOut); //
                PrintStream out = new PrintStream(gzipOut)) {
            outFile = out;

            // Write header first
            out.print(SnpEff.SOFTWARE_NAME + "\t" + SnpEff.VERSION_MAJOR + "\n");

            // Serialize all markers
            for (Marker m : markers)
                save(m);

            // PrintStream never throws IOException: close now (this finishes the gzip stream)
            // and check explicitly whether any write failed
            out.close();
            if (out.checkError()) throw new RuntimeException("Error writing to file '" + fileName + "'");
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** A marker is skipped if it was already saved or if it was flagged using doNotSave(Marker). */
    boolean shouldSkip(Marker m) {
        return byMarker.containsKey(m) || isDoNotSave(m);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy