
// umcg.genetica.io.gwascatalog.GWASCatalog (Maven / Gradle / Ivy listing)
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package umcg.genetica.io.gwascatalog;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import umcg.genetica.io.text.TextFile;
/**
 * In-memory index of a tab-separated GWAS catalog file.
 *
 * <p>{@link #read(String)} parses the file and builds lookup tables that link
 * publications, traits and SNPs to each other. The query methods then select
 * traits by a case-insensitive substring of their name and return the SNPs,
 * loci or genes attached to them.
 *
 * <p>This class is not synchronized.
 *
 * @author harmjan
 */
public class GWASCatalog {

    private HashSet<GWASLocus> loci = new HashSet<GWASLocus>();
    private HashSet<GWASSNP> snps = new HashSet<GWASSNP>();
    private HashSet<GWASTrait> traits = new HashSet<GWASTrait>();
    /** Lazily built array view of {@link #traits}; reset to null whenever traits may have changed. */
    private GWASTrait[] traitArray = null;
    private HashMap<String, GWASPublication> publicationToObj = new HashMap<String, GWASPublication>();
    private HashMap<String, GWASSNP> snpToObj = new HashMap<String, GWASSNP>();
    private HashMap<String, GWASLocus> locusToObj = new HashMap<String, GWASLocus>();
    private HashMap<String, GWASTrait> traitToObj = new HashMap<String, GWASTrait>();
    private HashMap<String, GWASTrait> cleanTraitToObj = new HashMap<String, GWASTrait>();

    /** Creates an empty catalog; call {@link #read(String)} to populate it. */
    public GWASCatalog() {
    }

    /**
     * Creates a catalog and loads the given file.
     *
     * @param gwasCatalogLoc location of the tab-separated catalog file
     * @throws IOException if the file cannot be read or lacks a required column
     */
    public GWASCatalog(String gwasCatalogLoc) throws IOException {
        this.read(gwasCatalogLoc);
    }

    /**
     * Creates a catalog and loads the given file.
     *
     * <p>NOTE(review): {@code pvaluethreshold} is accepted but not applied; every
     * association in the file is loaded regardless of its p-value. The intended
     * filtering semantics should be confirmed before implementing it.
     *
     * @param gwasCatalogLoc location of the tab-separated catalog file
     * @param pvaluethreshold currently unused
     * @throws IOException if the file cannot be read or lacks a required column
     */
    public GWASCatalog(String gwasCatalogLoc, double pvaluethreshold) throws IOException {
        this.read(gwasCatalogLoc);
    }

    /**
     * Parses a catalog file and adds its publications, traits and SNPs to this
     * catalog. Columns are located by their header name, so their order in the
     * file does not matter. Rows that are too short to contain all required
     * columns are skipped. A summary line is printed to standard output.
     *
     * @param calatogloc location of the tab-separated catalog file
     * @throws IOException if the file is empty, a required column is missing
     *         from the header, or reading fails
     */
    public void read(String calatogloc) throws IOException {
        // The cached trait array would be stale once new traits are added.
        traitArray = null;

        TextFile tf = new TextFile(calatogloc, TextFile.R);
        try {
            String[] headerelems = tf.readLineElemsReturnReference(TextFile.tab);
            if (headerelems == null) {
                throw new IOException("GWAS catalog file is empty: " + calatogloc);
            }

            // Map header name -> column index; a repeated header keeps its last position.
            HashMap<String, Integer> headerIndex = new HashMap<String, Integer>();
            for (int col = 0; col < headerelems.length; col++) {
                headerIndex.put(headerelems[col], col);
            }

            int pubMedidCol = requireColumn(headerIndex, "PUBMEDID", calatogloc);
            int firstAuthorCol = requireColumn(headerIndex, "First Author", calatogloc);
            int publishDateCol = requireColumn(headerIndex, "Date", calatogloc);
            int journalCol = requireColumn(headerIndex, "Journal", calatogloc);
            int studyCol = requireColumn(headerIndex, "Study", calatogloc);
            int diseaseCol = requireColumn(headerIndex, "Disease/Trait", calatogloc);
            int topSNPCol = requireColumn(headerIndex, "Strongest SNP-Risk Allele", calatogloc);
            int snpCol = requireColumn(headerIndex, "SNPs", calatogloc);
            int pvalCol = requireColumn(headerIndex, "p-Value", calatogloc);
            int chrCol = requireColumn(headerIndex, "Chr_id", calatogloc);
            int chrPosCol = requireColumn(headerIndex, "Chr_pos", calatogloc);
            int reportedGeneCol = requireColumn(headerIndex, "Reported Gene(s)", calatogloc);
            int mappedGeneCol = requireColumn(headerIndex, "Mapped_gene", calatogloc);

            // A row is only usable when it reaches the right-most column we read
            // (and is never accepted with 11 or fewer fields).
            int[] requiredCols = {pubMedidCol, firstAuthorCol, publishDateCol, journalCol, studyCol,
                diseaseCol, topSNPCol, snpCol, pvalCol, chrCol, chrPosCol, reportedGeneCol, mappedGeneCol};
            int lastRequiredCol = 11;
            for (int c : requiredCols) {
                lastRequiredCol = Math.max(lastRequiredCol, c);
            }

            int numtraits = 0;
            int numsnps = 0;
            int numpubs = 0;

            for (String[] elems = tf.readLineElemsReturnReference(TextFile.tab);
                    elems != null;
                    elems = tf.readLineElemsReturnReference(TextFile.tab)) {
                if (elems.length <= lastRequiredCol) {
                    continue;
                }

                String pubname = elems[pubMedidCol] + "; " + elems[firstAuthorCol] + "; " + elems[publishDateCol]
                        + "; " + elems[journalCol] + "; " + elems[studyCol];
                String trait = elems[diseaseCol].trim();
                String cleanedTrait = trait.replaceAll(" ", "_").replaceAll("[^a-zA-Z0-9\\-_]+", "");
                String otherSNPs = elems[snpCol].trim();
                String[] topSNPElems = elems[topSNPCol].split("-");

                HashSet<String> mappedGenes = parseGenes(elems[mappedGeneCol]);
                HashSet<String> reportedGenes = parseGenes(elems[reportedGeneCol]);

                // Unparseable chromosome/position values are kept as -1.
                byte chr = -1;
                int chrPos = -1;
                try {
                    chr = Byte.parseByte(elems[chrCol]);
                    chrPos = Integer.parseInt(elems[chrPosCol]);
                } catch (NumberFormatException ignored) {
                    // non-numeric chromosome or position: leave the defaults in place
                }

                // The risk allele follows the first '-' of the top-SNP field; "?" means unknown.
                String riskallele = null;
                if (topSNPElems.length > 1 && !topSNPElems[1].equals("?")) {
                    riskallele = topSNPElems[1];
                }

                // The catalog reports one p-value per row; it is shared by the top SNP
                // and all other SNPs listed on that row.
                Double pval = null;
                try {
                    pval = Double.parseDouble(elems[pvalCol]);
                } catch (NumberFormatException ignored) {
                    // the p-value is sometimes unreported
                }

                GWASPublication pub = publicationToObj.get(pubname);
                if (pub == null) {
                    pub = new GWASPublication();
                    pub.id = numpubs++;
                    pub.name = pubname;
                    publicationToObj.put(pubname, pub);
                }

                GWASTrait gwasTraitObj = traitToObj.get(trait);
                if (gwasTraitObj == null) {
                    gwasTraitObj = new GWASTrait();
                    gwasTraitObj.name = trait;
                    gwasTraitObj.cleanName = cleanedTrait;
                    gwasTraitObj.id = numtraits++;
                    gwasTraitObj.setMappedGenes(mappedGenes);
                    gwasTraitObj.setReportedGenes(reportedGenes);
                    traitToObj.put(trait, gwasTraitObj);
                    cleanTraitToObj.put(cleanedTrait, gwasTraitObj);
                    traits.add(gwasTraitObj);
                }

                // split("-") yields an empty array for a field consisting only of dashes.
                String topSNP = topSNPElems.length > 0 ? topSNPElems[0].trim() : "";
                GWASSNP gwasTopSNPObj = snpToObj.get(topSNP);
                if (gwasTopSNPObj == null) {
                    gwasTopSNPObj = registerSNP(topSNP, numsnps++, chr, chrPos);
                }
                gwasTopSNPObj.getAssociatedTraits().add(gwasTraitObj);
                gwasTraitObj.addTopSNP(gwasTopSNPObj);
                recordBestPValue(gwasTopSNPObj, gwasTraitObj, pval, riskallele);

                // All other SNPs reported on this row.
                for (String rawName : otherSNPs.split(",")) {
                    String snpname = rawName.trim();
                    GWASSNP gwasSNPObj = snpToObj.get(snpname);
                    if (gwasSNPObj == null) {
                        gwasSNPObj = registerSNP(snpname, numsnps++, chr, chrPos);
                    }
                    gwasSNPObj.getAssociatedTraits().add(gwasTraitObj);
                    recordBestPValue(gwasSNPObj, gwasTraitObj, pval, riskallele);

                    gwasTraitObj.snps.add(gwasSNPObj);
                    pub.snps.add(gwasSNPObj);
                    pub.setPValueAssociatedWithTrait(gwasSNPObj, gwasTraitObj, pval);
                    gwasSNPObj.getPublishedIn().add(pub);
                }

                gwasTraitObj.appendMappedGenes(mappedGenes);
                gwasTraitObj.appendReportedGenes(reportedGenes);
                gwasTraitObj.publishedIn.add(pub);
                pub.traits.add(gwasTraitObj);
            }

            System.out.println(numpubs + " pubs, " + numsnps + " snps, " + numtraits + " traits");
        } finally {
            // Release the file even when parsing fails part-way.
            tf.close();
        }
    }

    /** Returns the index of the named header column, or fails with a message naming the column. */
    private static int requireColumn(HashMap<String, Integer> headerIndex, String name, String file)
            throws IOException {
        Integer idx = headerIndex.get(name);
        if (idx == null) {
            throw new IOException("Required column '" + name
                    + "' not found in header of GWAS catalog file: " + file);
        }
        return idx;
    }

    /**
     * Splits a gene field into gene names. "NR", "Intergenic" and " - " yield an
     * empty set; otherwise the field is split on " - " if present, else on ";",
     * else taken as a single name.
     */
    private static HashSet<String> parseGenes(String field) {
        HashSet<String> genes = new HashSet<String>();
        if (field.equals("NR") || field.equals("Intergenic") || field.equals(" - ")) {
            return genes;
        }
        if (field.contains(" - ")) {
            genes.addAll(Arrays.asList(field.split(" - ")));
        } else if (field.contains(";")) {
            genes.addAll(Arrays.asList(field.split(";")));
        } else {
            genes.add(field);
        }
        return genes;
    }

    /** Creates a SNP with the given name, id and position and adds it to the SNP indexes. */
    private GWASSNP registerSNP(String name, int id, byte chr, int chrPos) {
        GWASSNP snp = new GWASSNP();
        snp.setName(name);
        snp.setId(id);
        snp.setChr(chr);
        snp.setPosition(chrPos);
        snpToObj.put(name, snp);
        snps.add(snp);
        return snp;
    }

    /** Stores the p-value and risk allele for (snp, trait) when no smaller p-value is recorded yet. */
    private static void recordBestPValue(GWASSNP snp, GWASTrait trait, Double pval, String riskallele) {
        if (pval == null) {
            return;
        }
        Double previousP = snp.getPValueAssociatedWithTrait(trait);
        if (previousP == null || previousP > pval) {
            snp.setPValueAssociatedWithTrait(trait, pval);
            snp.getRiskAllele().put(trait, riskallele);
        }
    }

    /**
     * Collects the traits whose name contains {@code key}, ignoring case.
     *
     * @param key substring to look for
     * @param verbose when true, prints the search key and every matching trait name
     */
    private ArrayList<GWASTrait> traitsContaining(String key, boolean verbose) {
        if (verbose) {
            System.out.println("Looking for " + key + " snps");
        }
        // Locale.ROOT keeps the comparison independent of the default locale.
        String needle = key.toLowerCase(Locale.ROOT);
        ArrayList<GWASTrait> matches = new ArrayList<GWASTrait>();
        for (GWASTrait t : traits) {
            if (t.getName().toLowerCase(Locale.ROOT).contains(needle)) {
                if (verbose) {
                    System.out.println("Found trait: " + t.getName());
                }
                matches.add(t);
            }
        }
        return matches;
    }

    /**
     * @return all traits as an array; the array is cached and shared between
     *         calls until the trait set changes through {@link #read(String)}
     *         or {@link #setTraits(HashSet)}
     */
    public GWASTrait[] getTraits() {
        if (traitArray == null) {
            traitArray = traits.toArray(new GWASTrait[traits.size()]);
        }
        return traitArray;
    }

    /**
     * @return the loci
     */
    public HashSet<GWASLocus> getLoci() {
        return loci;
    }

    /**
     * @param loci the loci to set
     */
    public void setLoci(HashSet<GWASLocus> loci) {
        this.loci = loci;
    }

    /**
     * @return the snps
     */
    public HashSet<GWASSNP> getSnps() {
        return snps;
    }

    /**
     * @param snps the snps to set
     */
    public void setSnps(HashSet<GWASSNP> snps) {
        this.snps = snps;
    }

    /**
     * @param traits the traits to set
     */
    public void setTraits(HashSet<GWASTrait> traits) {
        this.traits = traits;
        // Drop the cached array so getTraits() reflects the new set.
        this.traitArray = null;
    }

    /**
     * @return the publicationToObj
     */
    public HashMap<String, GWASPublication> getPublicationToObj() {
        return publicationToObj;
    }

    /**
     * @param publicationToObj the publicationToObj to set
     */
    public void setPublicationToObj(HashMap<String, GWASPublication> publicationToObj) {
        this.publicationToObj = publicationToObj;
    }

    /**
     * @return the snpToObj
     */
    public HashMap<String, GWASSNP> getSnpToObj() {
        return snpToObj;
    }

    /**
     * @param snpToObj the snpToObj to set
     */
    public void setSnpToObj(HashMap<String, GWASSNP> snpToObj) {
        this.snpToObj = snpToObj;
    }

    /**
     * @return the locusToObj
     */
    public HashMap<String, GWASLocus> getLocusToObj() {
        return locusToObj;
    }

    /**
     * @param locusToObj the locusToObj to set
     */
    public void setLocusToObj(HashMap<String, GWASLocus> locusToObj) {
        this.locusToObj = locusToObj;
    }

    /**
     * @return the traitToObj
     */
    public HashMap<String, GWASTrait> getTraitToObj() {
        return traitToObj;
    }

    /**
     * @param traitToObj the traitToObj to set
     */
    public void setTraitToObj(HashMap<String, GWASTrait> traitToObj) {
        this.traitToObj = traitToObj;
    }

    /**
     * @return a new array holding every SNP in the catalog
     */
    public GWASSNP[] getSnpsArray() {
        return snps.toArray(new GWASSNP[snps.size()]);
    }

    /**
     * Returns the distinct SNPs of every trait whose name contains {@code key}
     * (case-insensitive). The search and its matches are printed to standard output.
     *
     * @param key substring of the trait name
     * @return the matching SNPs, possibly empty
     */
    public GWASSNP[] getSNPsForTraitContainingKey(String key) {
        HashSet<GWASSNP> s = new HashSet<GWASSNP>();
        for (GWASTrait t : traitsContaining(key, true)) {
            s.addAll(Arrays.asList(t.getSNPs()));
        }
        return s.toArray(new GWASSNP[s.size()]);
    }

    /**
     * Returns every trait whose name contains {@code key} (case-insensitive).
     *
     * @param key substring of the trait name
     * @return the matching traits, possibly empty
     */
    public GWASTrait[] getTraitsForCertainKey(String key) {
        ArrayList<GWASTrait> selected = traitsContaining(key, false);
        return selected.toArray(new GWASTrait[selected.size()]);
    }

    /**
     * Returns the distinct loci of every trait whose name contains {@code key}
     * (case-insensitive). The search and its matches are printed to standard output.
     *
     * @param key substring of the trait name
     * @return the matching loci, possibly empty
     */
    public GWASLocus[] getLociForCertainKey(String key) {
        HashSet<GWASLocus> s = new HashSet<GWASLocus>();
        for (GWASTrait t : traitsContaining(key, true)) {
            s.addAll(t.loci);
        }
        return s.toArray(new GWASLocus[s.size()]);
    }

    /**
     * Returns the reported genes of every trait whose name contains {@code key}
     * (case-insensitive). The search and its matches are printed to standard output.
     *
     * @param key substring of the trait name
     * @return the reported gene names, possibly empty
     */
    public HashSet<String> getReportedGenesForCertainKey(String key) {
        HashSet<String> s = new HashSet<String>();
        for (GWASTrait t : traitsContaining(key, true)) {
            s.addAll(t.getReportedGenes());
        }
        return s;
    }

    /**
     * Returns the mapped genes of every trait whose name contains {@code key}
     * (case-insensitive). The search and its matches are printed to standard output.
     *
     * @param key substring of the trait name
     * @return the mapped gene names, possibly empty
     */
    public HashSet<String> getMappedGenesForCertainKey(String key) {
        HashSet<String> s = new HashSet<String>();
        for (GWASTrait t : traitsContaining(key, true)) {
            s.addAll(t.getMappedGenes());
        }
        return s;
    }

    /**
     * Returns the cleaned names of all traits associated with the SNP whose
     * name equals {@code key}, ignoring case.
     *
     * @param key SNP name
     * @return the cleaned trait names, empty when no SNP matches
     */
    public HashSet<String> getTraitsForCertainSnps(String key) {
        HashSet<String> m = new HashSet<String>();
        for (GWASSNP s : snps) {
            // equalsIgnoreCase already ignores case, so the key needs no lower-casing.
            if (s.getName().equalsIgnoreCase(key)) {
                for (GWASTrait tmp : s.getAssociatedTraits()) {
                    m.add(tmp.cleanName);
                }
            }
        }
        return m;
    }
}
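/*
Column headers of the GWAS catalog file format, one per line (presumably in
file order). GWASCatalog.read() looks up a subset of these by exact name.
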
Date Added to Catalog
PUBMEDID
First Author
Date
Journal
Link
Study
Disease/Trait
Initial Sample Size
Replication Sample Size
Region
Chr_id
Chr_pos
Reported Gene(s)
Mapped_gene
Upstream_gene_id
Downstream_gene_id
Snp_gene_ids
Upstream_gene_distance
Downstream_gene_distance
Strongest SNP-Risk Allele
SNPs
Merged
Snp_id_current
Context
Intergenic
Risk Allele Frequency
p-Value
Pvalue_mlog
p-Value (text)
OR or beta
95% CI (text)
Platform [SNPs passing QC]
CNV
*/
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy