org.opencb.cellbase.build.transform.GwasParser Maven / Gradle / Ivy
The newest version!
package org.opencb.cellbase.build.transform;
import org.apache.commons.lang.math.NumberUtils;
import org.broad.tribble.readers.TabixReader;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.cellbase.core.common.clinical.gwas.GwasStudy;
import org.opencb.cellbase.core.common.clinical.gwas.GwasTest;
import org.opencb.cellbase.core.common.clinical.gwas.GwasTrait;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.opencb.cellbase.core.common.clinical.gwas.Gwas;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Map;
/**
* @author Luis Miguel Cruz
* @version 1.2.3
* @since October 08, 2014
*/
public class GwasParser extends CellBaseParser {
private static final int REF = 0;
private static final int ALT = 1;
private final Path gwasFile;
private final Path dbSnpTabixFilePath;
private int invalidStartRecords;
private int invalidChromosome;
private int gwasLinesNotFoundInDbsnp;
public GwasParser(Path gwasFile, Path dbSnpTabixFilePath, CellBaseSerializer serializer) {
super(serializer);
this.gwasFile = gwasFile;
this.dbSnpTabixFilePath = dbSnpTabixFilePath;
this.invalidStartRecords = 0;
this.invalidChromosome = 0;
this.gwasLinesNotFoundInDbsnp = 0;
}
public void parse() {
if (Files.exists(gwasFile) && Files.exists(dbSnpTabixFilePath)) {
try {
logger.info("Opening gwas file " + gwasFile + " ...");
BufferedReader inputReader = new BufferedReader(new FileReader(gwasFile.toFile()));
logger.info("Ignoring gwas file header line ...");
inputReader.readLine();
Map variantMap = new HashMap<>();
logger.info("Opening dbSNP tabix file " + dbSnpTabixFilePath + " ...");
TabixReader dbsnpTabixReader = new TabixReader(dbSnpTabixFilePath.toString());
long processedGwasLines = 0;
logger.info("Parsing gwas file ...");
for (String line; (line = inputReader.readLine())!= null;) {
if (!line.isEmpty()) {
processedGwasLines++;
Gwas gwasRecord = buildGwasObject(line.split("\t"), dbsnpTabixReader);
if (gwasRecord != null) {
addGwasRecordToVariantMap(variantMap, gwasRecord);
}
}
}
dbsnpTabixReader.close();
logger.info("Serializing parsed variants ...");
for (Gwas gwasOutputRecord : variantMap.values()) {
serializer.serialize(gwasOutputRecord);
}
logger.info("Done");
this.disconnect();
this.printSummary(processedGwasLines, variantMap);
} catch (IOException e) {
logger.error("Unable to parse " + gwasFile + " using dbSNP file " + dbSnpTabixFilePath + ": " + e.getMessage());
}
}
}
private Gwas buildGwasObject(String[] values, TabixReader dbsnpTabixReader) {
Gwas gwas = null;
Integer start = parseStart(values);
if (start != null) {
Integer end = start;
String chromosome = parseChromosome(values[11]);
if (chromosome != null) {
String snpId = values[21].trim();
String[] refAndAlt = getRefAndAltFromDbsnp(chromosome, start, snpId, dbsnpTabixReader);
if (refAndAlt != null) {
gwas = new Gwas(chromosome, start, end, refAndAlt[REF], refAndAlt[ALT], values[10], values[13], values[14],
values[15], values[16], values[17], values[18], values[19], values[20], snpId, values[22], values[23],
values[24], values[25], parseFloat(values[26]), values[33]);
addGwasStudy(values, gwas);
} else {
gwasLinesNotFoundInDbsnp++;
}
} else {
invalidChromosome++;
}
} else {
invalidStartRecords++;
}
return gwas;
}
private Integer parseStart(String[] values) {
Integer start = null;
if (NumberUtils.isDigits(values[12])) {
start = Integer.parseInt(values[12]);
}
return start;
}
private String parseChromosome(String chromosome) {
String transformedChromosome = null;
if(!chromosome.isEmpty()){
switch(chromosome) {
case "23":
transformedChromosome = "X";
break;
case "24":
transformedChromosome = "Y";
break;
case "25":
transformedChromosome = "MT";
break;
default:
transformedChromosome = chromosome;
}
}
return transformedChromosome;
}
private Float parseFloat(String value) {
Float riskAlleleFrequency = null;
if (NumberUtils.isNumber(value)) {
riskAlleleFrequency = Float.parseFloat(value);
}
return riskAlleleFrequency;
}
private String[] getRefAndAltFromDbsnp(String chromosome, Integer start, String snpId, TabixReader dbsnpTabixReader) {
String[] refAndAlt = null;
TabixReader.Iterator dbsnpIterator = dbsnpTabixReader.query(chromosome + ":" + start + "-" + start);
try {
String dbSnpRecord = dbsnpIterator.next();
boolean found = false;
while (dbSnpRecord != null && !found) {
String[] dbsnpFields = dbSnpRecord.split("\t");
if (snpId.equalsIgnoreCase(dbsnpFields[2])) {
refAndAlt = new String[2];
refAndAlt[REF] = dbsnpFields[3];
refAndAlt[ALT] = dbsnpFields[4];
found = true;
}
dbSnpRecord = dbsnpIterator.next();
}
} catch (IOException e) {
logger.warn("Error reading position '" + chromosome + ":" + start + "' in dbSNP: " + e.getMessage());
}
return refAndAlt;
}
private void addGwasStudy(String[] values, Gwas gwas) {
// Add the study values
GwasStudy study = new GwasStudy(values[1], values[2], values[3], values[4], values[5], values[6], values[8], values[9], values[32]);
addGwasTraitToStudy(values, study);
gwas.addStudy(study);
}
private void addGwasTraitToStudy(String[] values, GwasStudy study) {
// Add the trait values
GwasTrait trait = new GwasTrait(values[7], values[0]);
addGwasTestToTrait(values, trait);
study.addTrait(trait);
}
private void addGwasTestToTrait(String[] values, GwasTrait trait) {
// Add the test values
Float pValue = parseFloat(values[27]);
Float pValueMlog = parseFloat(values[28]);
GwasTest test = new GwasTest(pValue, pValueMlog, values[29], values[30], values[31]);
trait.addTest(test);
}
private void printSummary(long processedGwasLines, Map variantMap) {
NumberFormat formatter = NumberFormat.getInstance();
logger.info("");
logger.info("Summary");
logger.info("=======");
logger.info("Processed " + formatter.format(processedGwasLines) + " gwas lines");
logger.info("Serialized " + formatter.format(variantMap.size()) + " variants");
logger.info(formatter.format(gwasLinesNotFoundInDbsnp) + " gwas lines ignored because variant not found in dbsnp");
if (invalidStartRecords != 0) {
logger.info(formatter.format(invalidStartRecords) + " gwas lines ignored because have no valid 'start' value");
}
if (invalidChromosome != 0) {
logger.info(formatter.format(invalidChromosome) + " gwas lines ignored because have no valid chromosome");
}
}
private void addGwasRecordToVariantMap(Map variantMap, Gwas gwasRecord) {
String[] alternates = gwasRecord.getAlternate().split(",");
for (int i=0; i < alternates.length; i++) {
String alternate = alternates[i];
Variant variantKey =
new Variant(gwasRecord.getChromosome(), gwasRecord.getStart(), gwasRecord.getEnd(), gwasRecord.getReference(), alternate);
if (variantMap.containsKey(variantKey)) {
updateGwasEntry(variantMap, gwasRecord, variantKey);
} else {
// if a gwas record has several alternatives, it has to be cloned to avoid side effects (set gwasRecord
// alternative would update the previous instance of gwas record saved in the 'variantMap')
gwasRecord = cloneGwasRecordIfNecessary(gwasRecord, i);
gwasRecord.setAlternate(alternate);
variantMap.put(variantKey, gwasRecord);
}
}
}
private Gwas cloneGwasRecordIfNecessary(Gwas gwasRecord, int i) {
if (i > 0) {
gwasRecord = new Gwas(gwasRecord);
}
return gwasRecord;
}
private void updateGwasEntry(Map variantMap, Gwas gwasVO, Variant gwasKey) {
Gwas gwas = variantMap.get(gwasKey);
gwas.addStudies(gwasVO.getStudies());
variantMap.put(gwasKey, gwas);
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy