All downloads are free. Search and download functionality uses the official Maven repository.

umcg.genetica.io.gwascatalog.GWASCatalog Maven / Gradle / Ivy

There is a newer version: 1.0.7
Show newest version
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package umcg.genetica.io.gwascatalog;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import umcg.genetica.io.text.TextFile;

/**
 * In-memory index of a GWAS catalog text export.
 *
 * <p>{@link #read(String)} parses the file row by row and cross-links the
 * publications, traits and SNPs it finds; the remaining methods are lookups
 * on top of those indexes. The collections handed out by the getters are the
 * live internal collections, not copies.
 *
 * @author harmjan
 */
public class GWASCatalog {

    // Header names of the columns that read() relies on.
    private static final String COL_PUBMED_ID = "PUBMEDID";
    private static final String COL_FIRST_AUTHOR = "First Author";
    private static final String COL_PUBLISH_DATE = "Date";
    private static final String COL_JOURNAL = "Journal";
    private static final String COL_STUDY = "Study";
    private static final String COL_TRAIT = "Disease/Trait";
    private static final String COL_TOP_SNP = "Strongest SNP-Risk Allele";
    private static final String COL_SNPS = "SNPs";
    private static final String COL_PVALUE = "p-Value";
    private static final String COL_CHR = "Chr_id";
    private static final String COL_CHR_POS = "Chr_pos";
    private static final String COL_REPORTED_GENE = "Reported Gene(s)";
    private static final String COL_MAPPED_GENE = "Mapped_gene";

    // Rows with fewer fields than this have always been skipped as truncated.
    private static final int MIN_FIELDS_PER_ROW = 12;

    // NOTE(review): the element type of loci and the key type of locusToObj are
    // inferred from the sibling collections (nothing in this class fills them) - confirm.
    private HashSet<GWASLocus> loci = new HashSet<GWASLocus>();
    private HashSet<GWASSNP> snps = new HashSet<GWASSNP>();
    private HashSet<GWASTrait> traits = new HashSet<GWASTrait>();
    // Lazily built snapshot of traits; reset to null whenever traits changes.
    private GWASTrait[] traitArray = null;
    private HashMap<String, GWASPublication> publicationToObj = new HashMap<String, GWASPublication>();
    private HashMap<String, GWASSNP> snpToObj = new HashMap<String, GWASSNP>();
    private HashMap<String, GWASLocus> locusToObj = new HashMap<String, GWASLocus>();
    private HashMap<String, GWASTrait> traitToObj = new HashMap<String, GWASTrait>();
    private HashMap<String, GWASTrait> cleanTraitToObj = new HashMap<String, GWASTrait>();

    /**
     * Creates an empty catalog; fill it with {@link #read(String)}.
     */
    public GWASCatalog() {
    }

    /**
     * Creates a catalog and immediately loads the given file.
     *
     * @param gwasCatalogLoc location of the catalog file
     * @throws IOException if the file cannot be read or lacks a required column
     */
    public GWASCatalog(String gwasCatalogLoc) throws IOException {
        this.read(gwasCatalogLoc);
    }

    /**
     * Creates a catalog and immediately loads the given file.
     *
     * <p>NOTE(review): {@code pvaluethreshold} has never been applied; every
     * row is loaded regardless of its p-value. The parameter is kept for
     * source compatibility and the behavior is left unchanged on purpose,
     * because silently starting to filter would alter results for existing
     * callers.
     *
     * @param gwasCatalogLoc location of the catalog file
     * @param pvaluethreshold currently ignored
     * @throws IOException if the file cannot be read or lacks a required column
     */
    public GWASCatalog(String gwasCatalogLoc, double pvaluethreshold) throws IOException {
        this(gwasCatalogLoc);
    }

    /**
     * Parses a catalog file and merges its content into this catalog.
     *
     * <p>The first line is treated as the header and is used to locate the
     * columns by name. Every following line that has enough fields contributes
     * one publication/trait/SNP association. For each SNP-trait pair only the
     * smallest p-value seen (and the risk allele reported on that row) is kept.
     *
     * @param calatogloc location of the catalog file
     * @throws IOException if the file cannot be read, is empty, or lacks one of
     *         the required columns
     */
    public void read(String calatogloc) throws IOException {
        TextFile tf = new TextFile(calatogloc, TextFile.R);
        try {
            String[] header = tf.readLineElemsReturnReference(TextFile.tab);
            if (header == null) {
                throw new IOException("GWAS catalog file has no header line: " + calatogloc);
            }

            int pubMedidCol = requireColumn(header, COL_PUBMED_ID, calatogloc);
            int firstAuthorCol = requireColumn(header, COL_FIRST_AUTHOR, calatogloc);
            int publishDateCol = requireColumn(header, COL_PUBLISH_DATE, calatogloc);
            int journalCol = requireColumn(header, COL_JOURNAL, calatogloc);
            int studyCol = requireColumn(header, COL_STUDY, calatogloc);
            int diseaseCol = requireColumn(header, COL_TRAIT, calatogloc);
            int topSNPCol = requireColumn(header, COL_TOP_SNP, calatogloc);
            int snpCol = requireColumn(header, COL_SNPS, calatogloc);
            int pvalCol = requireColumn(header, COL_PVALUE, calatogloc);
            int chrCol = requireColumn(header, COL_CHR, calatogloc);
            int chrPosCol = requireColumn(header, COL_CHR_POS, calatogloc);
            int reportedGeneCol = requireColumn(header, COL_REPORTED_GENE, calatogloc);
            int mappedGeneCol = requireColumn(header, COL_MAPPED_GENE, calatogloc);

            // A row must reach the right-most column we index into; shorter rows are
            // skipped just like the rows below the historical 12-field minimum.
            int minFields = Math.max(MIN_FIELDS_PER_ROW, 1 + largest(
                    pubMedidCol, firstAuthorCol, publishDateCol, journalCol, studyCol,
                    diseaseCol, topSNPCol, snpCol, pvalCol, chrCol, chrPosCol,
                    reportedGeneCol, mappedGeneCol));

            String[] elems = tf.readLineElemsReturnReference(TextFile.tab);
            while (elems != null) {
                if (elems.length >= minFields) {
                    String pubname = elems[pubMedidCol] + "; " + elems[firstAuthorCol] + "; "
                            + elems[publishDateCol] + "; " + elems[journalCol] + "; " + elems[studyCol];

                    String trait = elems[diseaseCol].trim();
                    String cleanedTrait = trait.replaceAll(" ", "_").replaceAll("[^a-zA-Z0-9\\-_]+", "");

                    HashSet<String> mappedGenes = parseGenes(elems[mappedGeneCol]);
                    HashSet<String> reportedGenes = parseGenes(elems[reportedGeneCol]);

                    // Both values are parsed in one attempt: if the chromosome is not a
                    // number (e.g. a sex chromosome label) the position stays -1 as well.
                    byte chr = -1;
                    int chrPos = -1;
                    try {
                        chr = Byte.parseByte(elems[chrCol]);
                        chrPos = Integer.parseInt(elems[chrPosCol]);
                    } catch (NumberFormatException ignored) {
                        // Unmapped SNPs are kept with chr/position -1.
                    }

                    // "<snp>-<allele>"; String.split drops trailing empty strings, so a
                    // bare "-" yields an empty array and must not be indexed blindly.
                    String[] topSNPElems = elems[topSNPCol].split("-");
                    String topSNP = topSNPElems.length > 0 ? topSNPElems[0].trim() : "";
                    String riskallele = null;
                    if (topSNPElems.length > 1 && !topSNPElems[1].equals("?")) {
                        riskallele = topSNPElems[1];
                    }

                    // The p-value is the same for every SNP on the row, so parse it once.
                    Double pval = null;
                    try {
                        pval = Double.parseDouble(elems[pvalCol]);
                    } catch (NumberFormatException ignored) {
                        // Sometimes the p-value is unreported; the association is still recorded.
                    }

                    GWASPublication pub = publicationToObj.get(pubname);
                    if (pub == null) {
                        pub = new GWASPublication();
                        // Map size doubles as the next free id, which keeps ids unique
                        // when read() is called more than once on the same catalog.
                        pub.id = publicationToObj.size();
                        pub.name = pubname;
                        publicationToObj.put(pubname, pub);
                    }

                    GWASTrait gwasTraitObj = traitToObj.get(trait);
                    if (gwasTraitObj == null) {
                        gwasTraitObj = new GWASTrait();
                        gwasTraitObj.name = trait;
                        gwasTraitObj.cleanName = cleanedTrait;
                        gwasTraitObj.id = traitToObj.size();
                        gwasTraitObj.setMappedGenes(mappedGenes);
                        gwasTraitObj.setReportedGenes(reportedGenes);
                        traitToObj.put(trait, gwasTraitObj);
                        cleanTraitToObj.put(cleanedTrait, gwasTraitObj);
                        traits.add(gwasTraitObj);
                        traitArray = null; // cached snapshot is now out of date
                    }

                    if (!topSNP.isEmpty()) {
                        GWASSNP gwasTopSNPObj = getOrCreateSnp(topSNP, chr, chrPos);
                        gwasTopSNPObj.getAssociatedTraits().add(gwasTraitObj);
                        gwasTraitObj.addTopSNP(gwasTopSNPObj);
                        keepStrongestAssociation(gwasTopSNPObj, gwasTraitObj, pval, riskallele);
                    }

                    // The GWAS Catalog often only publishes a single p-value for a couple of SNPs.
                    // We'll assume that all the reported SNPs have an LD ~ 1.0, so they share the
                    // row's p-value (and the risk allele reported for the top SNP).
                    for (String rawName : elems[snpCol].split(",")) {
                        String snpname = rawName.trim();
                        if (snpname.isEmpty()) {
                            continue; // blank entry: do not create a SNP without a name
                        }

                        GWASSNP gwasSNPObj = getOrCreateSnp(snpname, chr, chrPos);
                        gwasSNPObj.getAssociatedTraits().add(gwasTraitObj);
                        keepStrongestAssociation(gwasSNPObj, gwasTraitObj, pval, riskallele);

                        gwasTraitObj.snps.add(gwasSNPObj);
                        pub.snps.add(gwasSNPObj);
                        pub.setPValueAssociatedWithTrait(gwasSNPObj, gwasTraitObj, pval);
                        gwasSNPObj.getPublishedIn().add(pub);
                    }

                    gwasTraitObj.appendMappedGenes(mappedGenes);
                    gwasTraitObj.appendReportedGenes(reportedGenes);
                    gwasTraitObj.publishedIn.add(pub);
                    pub.traits.add(gwasTraitObj);
                }
                elems = tf.readLineElemsReturnReference(TextFile.tab);
            }

            System.out.println(publicationToObj.size() + " pubs, " + snpToObj.size() + " snps, " + traitToObj.size() + " traits");
        } finally {
            // Close the file even when parsing fails half-way.
            tf.close();
        }
    }

    /**
     * Returns the index of the last header cell equal to {@code name}, or -1.
     * (Last match wins, mirroring a scan that overwrites on every hit.)
     */
    private static int findColumn(String[] header, String name) {
        for (int i = header.length - 1; i >= 0; i--) {
            if (name.equals(header[i])) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Like {@link #findColumn(String[], String)} but fails with a readable
     * message instead of letting a -1 index reach the row parser.
     */
    private static int requireColumn(String[] header, String name, String file) throws IOException {
        int idx = findColumn(header, name);
        if (idx < 0) {
            throw new IOException("GWAS catalog file " + file + " lacks required column '" + name + "'");
        }
        return idx;
    }

    /** Returns the largest of the given values. */
    private static int largest(int... values) {
        int max = Integer.MIN_VALUE;
        for (int v : values) {
            if (v > max) {
                max = v;
            }
        }
        return max;
    }

    /**
     * Splits a gene field into gene names. The placeholders "NR", "Intergenic"
     * and " - " yield an empty set; otherwise the field is split on " - " if it
     * contains that separator, else on ";", else taken as a single name.
     */
    private static HashSet<String> parseGenes(String field) {
        HashSet<String> genes = new HashSet<String>();
        if (field.equals("NR") || field.equals("Intergenic") || field.equals(" - ")) {
            return genes;
        }
        if (field.contains(" - ")) {
            genes.addAll(Arrays.asList(field.split(" - ")));
        } else if (field.contains(";")) {
            genes.addAll(Arrays.asList(field.split(";")));
        } else {
            genes.add(field);
        }
        return genes;
    }

    /**
     * Looks up the SNP with the given name, registering a new one (with the
     * given chromosome and position) when it has not been seen before.
     */
    private GWASSNP getOrCreateSnp(String name, byte chr, int chrPos) {
        GWASSNP snp = snpToObj.get(name);
        if (snp == null) {
            snp = new GWASSNP();
            snp.setName(name);
            snp.setId(snpToObj.size());
            snp.setChr(chr);
            snp.setPosition(chrPos);
            snpToObj.put(name, snp);
            snps.add(snp);
        }
        return snp;
    }

    /**
     * Stores {@code pval} and {@code riskAllele} for the SNP-trait pair unless
     * a smaller p-value is already on record. Does nothing when {@code pval}
     * is {@code null}.
     */
    private static void keepStrongestAssociation(GWASSNP snp, GWASTrait trait, Double pval, String riskAllele) {
        if (pval == null) {
            return;
        }
        Double previousP = snp.getPValueAssociatedWithTrait(trait);
        if (previousP == null || previousP > pval) {
            snp.setPValueAssociatedWithTrait(trait, pval);
            snp.getRiskAllele().put(trait, riskAllele);
        }
    }

    /**
     * Collects the traits whose name contains {@code key}, ignoring case.
     * Lower-casing uses {@link Locale#ROOT} so matching does not depend on the
     * JVM's default locale.
     *
     * @param key substring to look for
     * @param announce whether to print the search and every hit to standard out
     */
    private ArrayList<GWASTrait> traitsWithNameContaining(String key, boolean announce) {
        if (announce) {
            System.out.println("Looking for " + key + " snps");
        }
        String needle = key.toLowerCase(Locale.ROOT);
        ArrayList<GWASTrait> hits = new ArrayList<GWASTrait>();
        for (GWASTrait t : traits) {
            if (t.getName().toLowerCase(Locale.ROOT).contains(needle)) {
                if (announce) {
                    System.out.println("Found trait: " + t.getName());
                }
                hits.add(t);
            }
        }
        return hits;
    }

    /**
     * @return all traits as an array; the array is cached and reused until the
     *         set of traits changes
     */
    public GWASTrait[] getTraits() {
        if (traitArray == null) {
            traitArray = traits.toArray(new GWASTrait[traits.size()]);
        }
        return traitArray;
    }

    /**
     * @return the loci
     */
    public HashSet<GWASLocus> getLoci() {
        return loci;
    }

    /**
     * @param loci the loci to set
     */
    public void setLoci(HashSet<GWASLocus> loci) {
        this.loci = loci;
    }

    /**
     * @return the snps
     */
    public HashSet<GWASSNP> getSnps() {
        return snps;
    }

    /**
     * @param snps the snps to set
     */
    public void setSnps(HashSet<GWASSNP> snps) {
        this.snps = snps;
    }

    /**
     * @param traits the traits to set
     */
    public void setTraits(HashSet<GWASTrait> traits) {
        this.traits = traits;
        this.traitArray = null; // drop the snapshot of the previous set
    }

    /**
     * @return the publicationToObj
     */
    public HashMap<String, GWASPublication> getPublicationToObj() {
        return publicationToObj;
    }

    /**
     * @param publicationToObj the publicationToObj to set
     */
    public void setPublicationToObj(HashMap<String, GWASPublication> publicationToObj) {
        this.publicationToObj = publicationToObj;
    }

    /**
     * @return the snpToObj
     */
    public HashMap<String, GWASSNP> getSnpToObj() {
        return snpToObj;
    }

    /**
     * @param snpToObj the snpToObj to set
     */
    public void setSnpToObj(HashMap<String, GWASSNP> snpToObj) {
        this.snpToObj = snpToObj;
    }

    /**
     * @return the locusToObj
     */
    public HashMap<String, GWASLocus> getLocusToObj() {
        return locusToObj;
    }

    /**
     * @param locusToObj the locusToObj to set
     */
    public void setLocusToObj(HashMap<String, GWASLocus> locusToObj) {
        this.locusToObj = locusToObj;
    }

    /**
     * @return the traitToObj
     */
    public HashMap<String, GWASTrait> getTraitToObj() {
        return traitToObj;
    }

    /**
     * @param traitToObj the traitToObj to set
     */
    public void setTraitToObj(HashMap<String, GWASTrait> traitToObj) {
        this.traitToObj = traitToObj;
    }

    /**
     * @return all SNPs as a freshly allocated array
     */
    public GWASSNP[] getSnpsArray() {
        return snps.toArray(new GWASSNP[snps.size()]);
    }

    /**
     * Returns the distinct SNPs of every trait whose name contains {@code key}
     * (case-insensitive). Progress is printed to standard out.
     */
    public GWASSNP[] getSNPsForTraitContainingKey(String key) {
        HashSet<GWASSNP> s = new HashSet<GWASSNP>();
        for (GWASTrait t : traitsWithNameContaining(key, true)) {
            s.addAll(Arrays.asList(t.getSNPs()));
        }
        return s.toArray(new GWASSNP[s.size()]);
    }

    /**
     * Returns every trait whose name contains {@code key} (case-insensitive).
     */
    public GWASTrait[] getTraitsForCertainKey(String key) {
        ArrayList<GWASTrait> selected = traitsWithNameContaining(key, false);
        return selected.toArray(new GWASTrait[selected.size()]);
    }

    /**
     * Returns the distinct loci of every trait whose name contains {@code key}
     * (case-insensitive). Progress is printed to standard out.
     */
    public GWASLocus[] getLociForCertainKey(String key) {
        HashSet<GWASLocus> s = new HashSet<GWASLocus>();
        for (GWASTrait t : traitsWithNameContaining(key, true)) {
            s.addAll(t.loci);
        }
        return s.toArray(new GWASLocus[s.size()]);
    }

    /**
     * Returns the union of the reported genes of every trait whose name
     * contains {@code key} (case-insensitive). Progress is printed to standard out.
     */
    public HashSet<String> getReportedGenesForCertainKey(String key) {
        HashSet<String> s = new HashSet<String>();
        for (GWASTrait t : traitsWithNameContaining(key, true)) {
            s.addAll(t.getReportedGenes());
        }
        return s;
    }

    /**
     * Returns the union of the mapped genes of every trait whose name
     * contains {@code key} (case-insensitive). Progress is printed to standard out.
     */
    public HashSet<String> getMappedGenesForCertainKey(String key) {
        HashSet<String> s = new HashSet<String>();
        for (GWASTrait t : traitsWithNameContaining(key, true)) {
            s.addAll(t.getMappedGenes());
        }
        return s;
    }

    /**
     * Returns the clean names of all traits associated with the SNP(s) whose
     * name equals {@code key}, ignoring case.
     */
    public HashSet<String> getTraitsForCertainSnps(String key) {
        HashSet<String> m = new HashSet<String>();
        for (GWASSNP s : snps) {
            // equalsIgnoreCase already handles case, no need to lower-case the key first.
            if (s.getName().equalsIgnoreCase(key)) {
                for (GWASTrait tmp : s.getAssociatedTraits()) {
                    m.add(tmp.cleanName);
                }
            }
        }
        return m;
    }
}


/*
Date Added to Catalog
PUBMEDID
First Author
Date
Journal
Link
Study
Disease/Trait
Initial Sample Size
Replication Sample Size
Region
Chr_id
Chr_pos
Reported Gene(s)
Mapped_gene
Upstream_gene_id
Downstream_gene_id
Snp_gene_ids
Upstream_gene_distance
Downstream_gene_distance
Strongest SNP-Risk Allele
SNPs
Merged
Snp_id_current
Context
Intergenic
Risk Allele Frequency
p-Value
Pvalue_mlog
p-Value (text)
OR or beta
95% CI (text)
Platform [SNPs passing QC]
CNV
 */




© 2015 - 2025 Weber Informatics LLC | Privacy Policy