All downloads are free. The search and download features use the official Maven repository.

org.monarchinitiative.phenol.io.assoc.HpoAssociationParser Maven / Gradle / Ivy

There is a newer version: 2.1.1
Show newest version
package org.monarchinitiative.phenol.io.assoc;

import com.google.common.collect.*;
import org.monarchinitiative.phenol.base.PhenolException;
import org.monarchinitiative.phenol.formats.Gene;
import org.monarchinitiative.phenol.formats.hpo.*;
import org.monarchinitiative.phenol.ontology.data.Ontology;
import org.monarchinitiative.phenol.ontology.data.TermId;
import org.monarchinitiative.phenol.io.obo.hpo.HpoDiseaseAnnotationParser;

import java.io.*;
import java.util.*;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

/**
 * <p>This class parses the file {@code mim2gene_medgen}, available from
 * ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/mim2gene_medgen,
 * as well as the file {@code Homo_sapiens_gene_info.gz}, available from
 * ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz.
 * {@code mim2gene_medgen} contains the MIM number of diseases and the EntrezGene number of genes
 * associated with the disease; the relevant lines of the file are marked with "phenotype".
 * The {@code Homo_sapiens_gene_info.gz} file contains the Entrez gene number of genes as well as
 * their gene symbol.</p>
 *
 * <p>The goal of this class is to provide the following associations:</p>
 * <ul>
 *   <li>geneIdToSymbol - Key: EntrezGeneId [{@link TermId}], Value: EntrezGeneSymbol-String</li>
 *   <li>associationMap - Key: DiseaseId [{@link TermId}], Value: GeneToAssociation [{@link GeneToAssociation}]</li>
 *   <li>diseaseToGeneMap - Key: DiseaseId [{@link TermId}], Value: GeneId [{@link TermId}]</li>
 *   <li>geneToDiseaseMap - Key: GeneId [{@link TermId}], Value: Disease [{@link TermId}]</li>
 *   <li>diseaseToAssociations - Key: DiseaseId [{@link TermId}], Value: List of genes [{@link DiseaseToGeneAssociation}]</li>
 *   <li>termToDisease - Key: phenotypeId {@link TermId}, Value: Disease [{@link HpoDisease}] FROM {@link HpoDiseaseAnnotationParser}</li>
 *   <li>phenotypeToGeneList - List of {@link HpoGeneAnnotation} generated from linking termToDisease, diseaseToGene</li>
 *   <li>associationList - List of all {@link DiseaseToGeneAssociation}</li>
 * </ul>
 *
 * <p>Typical use: construct, call {@link #parse()}, then (optionally) {@link #setTermToGene(Multimap)}.</p>
 *
 * @author Michael Gargano
 * @author Peter Robinson
 */
public class HpoAssociationParser {

  private static final String ENTREZ_GENE_PREFIX = "NCBIGene";
  private static final String OMIM_PREFIX = "OMIM";
  /** Values of the tenth column of the gene_info file (index 9) whose rows are not imported. */
  private static final Set<String> EXCLUDED_GENE_TYPES = ImmutableSet.of("unknown", "tRNA", "rRNA", "pseudo");

  private final Ontology hpoOntology;
  private final String homoSapiensGeneInfoPath;
  private final String mim2geneMedgenPath;
  /** Orphanet gene file; {@code null} means that Orphanet data is not parsed. */
  private final File orphaToGenePath;

  /**
   * Key--an EntrezGene id; value--the corresponding symbol, for all genes read from the gene_info file.
   * Only populated while {@link #parse()} is running; reset to {@code null} afterwards.
   */
  private BiMap<TermId, String> allGeneIdToSymbolMap;
  /** Key--an EntrezGene id; value--the corresponding symbol; restricted to genes seen in mim2gene_medgen. */
  private ImmutableMap<TermId, String> geneIdToSymbolMap;
  /**
   * Key: a disease CURIE (e.g., OMIM:600100); value--corresponding GeneToAssociation object. For instance,
   * MICROVASCULAR COMPLICATIONS OF DIABETES, SUSCEPTIBILITY TO, 1; is associated to the gene VEGF as POLYGENIC,
   * and MARFAN SYNDROME is associated to the gene FBN1 as MENDELIAN.
   */
  private ImmutableMultimap<TermId, GeneToAssociation> associationMap;
  /** Key: a disease Id; Value: a geneId */
  private ImmutableMultimap<TermId, TermId> diseaseToGeneMap;
  /** Key: a gene Id; Value: a diseaseId */
  private ImmutableMultimap<TermId, TermId> geneToDiseaseMap;
  /** Key: a disease Id; Value: disease obj, all gene associations. */
  private ImmutableMap<TermId, DiseaseToGeneAssociation> diseaseToAssociationsMap;
  /** Key: a phenotype Id; Value: disease obj, HpoDisease. Supplied via {@link #setTermToDisease(Map)}. */
  private Map<TermId, HpoDisease> termToDisease;
  /** List of phenotype-to-gene annotations built by {@link #setTermToGene(Multimap)}. */
  private ImmutableList<HpoGeneAnnotation> phenotypeToGeneList;
  /** List of all associations */
  private List<DiseaseToGeneAssociation> associationList;

  /**
   * @param geneInfoPath path to the gzipped {@code Homo_sapiens_gene_info.gz} file
   * @param mim2geneMedgenPath path to the {@code mim2gene_medgen} file
   * @param orphaToGenePath Orphanet gene file, or {@code null} to skip Orphanet data
   * @param hpoOntology ontology used to look up phenotype term names
   */
  public HpoAssociationParser(String geneInfoPath, String mim2geneMedgenPath, File orphaToGenePath,
                              Ontology hpoOntology) {
    this.hpoOntology = hpoOntology;
    this.homoSapiensGeneInfoPath = geneInfoPath;
    this.mim2geneMedgenPath = mim2geneMedgenPath;
    this.orphaToGenePath = orphaToGenePath;
  }

  /**
   * Same as the String-based constructor, but the two NCBI files are given as {@link File} objects
   * whose absolute paths are used.
   */
  public HpoAssociationParser(File geneInfoPath, File mim2geneMedgenPath, File orphaToGenePath,
                              Ontology hpoOntology) {
    this.hpoOntology = hpoOntology;
    this.homoSapiensGeneInfoPath = geneInfoPath.getAbsolutePath();
    this.mim2geneMedgenPath = mim2geneMedgenPath.getAbsolutePath();
    this.orphaToGenePath = orphaToGenePath;
  }

  /** Parse everything except the Orphanet data. */
  public HpoAssociationParser(String geneInfoPath, String mim2geneMedgenPath, Ontology hpoOntology) {
    this.hpoOntology = hpoOntology;
    this.homoSapiensGeneInfoPath = geneInfoPath;
    this.mim2geneMedgenPath = mim2geneMedgenPath;
    this.orphaToGenePath = null;
  }

  public Map<TermId, DiseaseToGeneAssociation> getDiseaseToAssociationsMap() {
    return this.diseaseToAssociationsMap;
  }

  public Map<TermId, String> getGeneIdToSymbolMap() {
    return this.geneIdToSymbolMap;
  }

  public Multimap<TermId, TermId> getDiseaseToGeneIdMap() {
    return this.diseaseToGeneMap;
  }

  public Multimap<TermId, TermId> getGeneToDiseaseIdMap() {
    return this.geneToDiseaseMap;
  }

  public List<HpoGeneAnnotation> getPhenotypeToGene() {
    return this.phenotypeToGeneList;
  }

  public Map<TermId, HpoDisease> getTermToDisease() {
    return this.termToDisease;
  }

  public Multimap<TermId, GeneToAssociation> getDiseasetoGeneAssociation() {
    return this.associationMap;
  }

  /**
   * Builds the list of {@link HpoGeneAnnotation}s, each of which represents a relationship
   * from a gene to an HPO term. A gene is reported at most once per phenotype term.
   *
   * <p>Genes whose id is not an integer and phenotype terms that are absent from the ontology's
   * term map are skipped.</p>
   *
   * @param phenotypeToDisease multimap from phenotype ids to disease ids
   * @throws PhenolException if {@link #parse()} has not been called or produced no associations
   */
  public void setTermToGene(Multimap<TermId, TermId> phenotypeToDisease) throws PhenolException {
    // The null check covers the case in which parse() was never called at all.
    if (this.diseaseToGeneMap == null || this.diseaseToGeneMap.isEmpty()) {
      throw new PhenolException("Error: Associations not parsed. Please call parse then set the term to gene mapping.");
    }

    ImmutableList.Builder<HpoGeneAnnotation> annotations = new ImmutableList.Builder<>();

    for (TermId phenotype : phenotypeToDisease.keySet()) {
      if (hpoOntology.getTermMap().get(phenotype) == null) {
        continue; // no term name available, so no annotation can be built for this phenotype
      }
      String hpoTermName = hpoOntology.getTermMap().get(phenotype).getName();
      Set<Integer> seenEntrezIds = new HashSet<>();

      for (TermId disease : phenotypeToDisease.get(phenotype)) {
        for (TermId gene : this.diseaseToGeneMap.get(disease)) {
          int entrezId;
          try {
            entrezId = Integer.parseInt(gene.getId());
          } catch (NumberFormatException ignored) {
            continue; // HpoGeneAnnotation needs an integer Entrez id
          }
          if (!seenEntrezIds.add(entrezId)) {
            continue; // this gene was already reported for this phenotype
          }
          String entrezGeneSymbol = this.geneIdToSymbolMap.get(gene);
          if (entrezGeneSymbol == null) {
            entrezGeneSymbol = "-";
          }
          annotations.add(new HpoGeneAnnotation(entrezId, entrezGeneSymbol, hpoTermName, phenotype));
        }
      }
    }

    this.phenotypeToGeneList = annotations.build();
  }

  public void setTermToDisease(Map<TermId, HpoDisease> termToDisease) {
    this.termToDisease = termToDisease;
  }

  /**
   * Derives the lookup maps from {@link #associationList}:
   * disease id to the full association object, gene id to disease ids, and disease id to gene ids.
   */
  private void setAssociationMaps() {
    Multimap<TermId, TermId> geneToDisease = ArrayListMultimap.create();
    ImmutableMap.Builder<TermId, DiseaseToGeneAssociation> diseaseToAssociation = new ImmutableMap.Builder<>();

    for (DiseaseToGeneAssociation g2p : associationList) {
      TermId diseaseId = g2p.getDiseaseId();
      diseaseToAssociation.put(diseaseId, g2p);
      for (Gene g : g2p.getGeneList()) {
        TermId geneId = g.getId();
        if (!geneToDisease.containsEntry(geneId, diseaseId)) {
          geneToDisease.put(geneId, diseaseId);
        }
      }
    }

    // Build the immutable multimap once and derive the reverse direction from it.
    this.geneToDiseaseMap = ImmutableMultimap.copyOf(geneToDisease);
    this.diseaseToGeneMap = this.geneToDiseaseMap.inverse();
    this.diseaseToAssociationsMap = diseaseToAssociation.build();
  }

  /**
   * Parses the gene_info and mim2gene_medgen files (and the Orphanet file, if one was supplied)
   * and populates the association list and the maps exposed by the getters.
   *
   * <p>An {@link IOException} raised while reading is caught and its stack trace printed; the
   * association list and the maps derived from it are then empty.</p>
   */
  public void parse() {
    ImmutableList.Builder<DiseaseToGeneAssociation> builder = new ImmutableList.Builder<>();

    try {
      parseGeneInfo();
      parseDiseaseToGene();
      for (TermId diseaseId : associationMap.keySet()) {
        Collection<GeneToAssociation> g2aList = associationMap.get(diseaseId);
        builder.add(new DiseaseToGeneAssociation(diseaseId, ImmutableList.copyOf(g2aList)));
      }
    } catch (IOException e) {
      // NOTE(review): kept as in the original so that callers do not start seeing exceptions;
      // consider surfacing the failure to the caller instead.
      e.printStackTrace();
    }

    this.associationList = builder.build();
    this.setAssociationMaps();
  }

  /**
   * Creates a multimap from the mim2gene_medgen file. We need a multimap because some
   * OMIM phenotype id's (the key of the multimap) are associated with more than one
   * gene (EntrezGene id). If an Orphanet file was supplied, its disease-to-gene-symbol
   * entries are added with {@link AssociationType#UNKNOWN}.
   * This method must be called AFTER {@link #parseGeneInfo()}.
   *
   * @throws IOException if the mim2gene_medgen file cannot be read
   */
  private void parseDiseaseToGene() throws IOException {
    Multimap<TermId, GeneToAssociation> diseaseToGeneAssociations = ArrayListMultimap.create();
    Map<TermId, String> associatedGeneSymbols = new HashMap<>();

    try (BufferedReader br = new BufferedReader(new FileReader(mim2geneMedgenPath))) {
      String line;
      while ((line = br.readLine()) != null) {
        if (line.startsWith("#")) {
          continue;
        }
        String[] fields = line.split("\t");
        // Lines with fewer than three columns cannot carry the "phenotype" marker.
        if (fields.length < 3 || !fields[2].equals("phenotype")) {
          continue;
        }
        String entrezGeneNumber = fields[1];
        if ("-".equals(entrezGeneNumber)) {
          continue; // no gene given for this phenotype entry
        }

        TermId omimCurie = TermId.of(OMIM_PREFIX, fields[0]);
        TermId geneId = TermId.of(ENTREZ_GENE_PREFIX, entrezGeneNumber);
        String symbol = this.allGeneIdToSymbolMap.get(geneId);
        if (symbol == null) {
          symbol = "-";
        } else {
          associatedGeneSymbols.putIfAbsent(geneId, symbol);
        }
        Gene gene = new Gene(geneId, symbol);

        // String.split drops trailing empty fields, so the sixth column may be missing.
        boolean susceptibility = fields.length > 5 && fields[5].contains("susceptibility");
        AssociationType type = susceptibility ? AssociationType.POLYGENIC : AssociationType.MENDELIAN;
        GeneToAssociation g2a = new GeneToAssociation(gene, type);
        if (!diseaseToGeneAssociations.containsEntry(omimCurie, g2a)) {
          diseaseToGeneAssociations.put(omimCurie, g2a);
        }
      }
    }

    if (this.orphaToGenePath != null) {
      Map<String, TermId> geneSymbolToId = this.allGeneIdToSymbolMap.inverse();
      try {
        OrphaGeneToDiseaseParser parser = new OrphaGeneToDiseaseParser(this.orphaToGenePath);
        Multimap<TermId, String> orphaToGene = parser.getOrphaDiseaseToGeneSymbolMap();
        for (Map.Entry<TermId, String> entry : orphaToGene.entries()) {
          TermId orpha = entry.getKey();
          String geneSymbol = entry.getValue();
          TermId geneId = geneSymbolToId.get(geneSymbol);
          if (geneId == null) {
            continue; // symbol not present in the gene_info data
          }
          GeneToAssociation g2a = new GeneToAssociation(new Gene(geneId, geneSymbol), AssociationType.UNKNOWN);
          if (!diseaseToGeneAssociations.containsEntry(orpha, g2a)) {
            diseaseToGeneAssociations.put(orpha, g2a);
          }
        }
      } catch (PhenolException e) {
        System.err.println(e.toString());
      }
    }

    this.associationMap = ImmutableMultimap.copyOf(diseaseToGeneAssociations);
    this.geneIdToSymbolMap = ImmutableMap.copyOf(associatedGeneSymbols);
    // The complete gene table is no longer needed once the associations are built.
    this.allGeneIdToSymbolMap = null;
  }

  /**
   * Reads the gzipped gene_info file and fills {@link #allGeneIdToSymbolMap} with
   * EntrezGene id to symbol entries for taxon 9606, leaving out rows whose gene type
   * is one of {@link #EXCLUDED_GENE_TYPES}. Rows with fewer than ten columns are skipped.
   *
   * @throws IOException if the gene_info file cannot be opened or read
   */
  private void parseGeneInfo() throws IOException {
    ImmutableBiMap.Builder<TermId, String> builder = new ImmutableBiMap.Builder<>();

    // Each stream is its own resource so that all of them are closed even if a later constructor fails.
    try (InputStream fileStream = new FileInputStream(homoSapiensGeneInfoPath);
         InputStream gzipStream = new GZIPInputStream(fileStream);
         BufferedReader br = new BufferedReader(new InputStreamReader(gzipStream))) {
      String line;
      while ((line = br.readLine()) != null) {
        String[] a = line.split("\t");
        String taxon = a[0];
        if (!taxon.equals("9606")) {
          continue; // i.e., we want only Homo sapiens sapiens and not Neanderthal etc.
        }
        if (a.length < 10 || EXCLUDED_GENE_TYPES.contains(a[9])) {
          continue;
        }
        String geneId = a[1];
        String symbol = a[2];
        builder.put(TermId.of(ENTREZ_GENE_PREFIX, geneId), symbol);
      }
    }

    this.allGeneIdToSymbolMap = builder.build();
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy