
org.monarchinitiative.phenol.annotations.obo.mpo.MpAnnotationParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phenol-annotations Show documentation
Show all versions of phenol-annotations Show documentation
phenol-annotation contains the annotation functionality for ontologies
package org.monarchinitiative.phenol.annotations.obo.mpo;
import org.monarchinitiative.phenol.base.PhenolException;
import org.monarchinitiative.phenol.base.PhenolRuntimeException;
import org.monarchinitiative.phenol.annotations.formats.mpo.*;
import org.monarchinitiative.phenol.ontology.data.TermId;
import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
/**
* Class to parse the following files from the MGI site.
* 1. MGI_GenePheno.rpt (for now, omit MGI_PhenoGenoMP.rpt).
* 2. MGI_Pheno_Sex.rpt
* MGI_Pheno_Sex.rpt has a subset of the same models as MGI_GenePheno but additionally adds information
* on whether the phenotypes were sex-specific. This class allows users to parse either each individual
* model ({@link MpSimpleModel}) or to combine the models according to gene ({@link MpGeneModel}).
* @author Hannah Blau
* @author Peter Robinson
*/
public class MpAnnotationParser {
/**
* Path of MGI_GenePheno.rpt file.
*/
private final String genePhenoPath;
/**
* Path of MGI_GenePheno.rpt file.
*/
private final String phenoSexPath;
/**
* Used to store sexSpecific-specific phenotype annotations.
*/
private Map> geno2ssannotMap = Map.of();
/**
* Key: A term id representing the genotype accession id of an MPO model
* Value: the corresponding MpSimpleModel object
*/
private Map genotypeAccessionToMpSimpleModelMap;
/**
* Show verbose debugging information.
*/
private boolean verbose = true;
private final List parseErrors;
/**
 * Creates a parser that reads only the gene-phenotype file and parses it immediately.
 *
 * @param path Path of MGI_GenePheno.rpt file.
 */
private MpAnnotationParser(String path) {
  this.parseErrors = new ArrayList<>();
  this.genePhenoPath = path;
  // No MGI_Pheno_Sex.rpt in this mode: do not use sex-specific phenotypes.
  this.phenoSexPath = null;
  this.parse();
}
/**
 * Input single-gene phenotype data and include sex-specific phenotypes using
 * appropriate modifiers.
 *
 * @param genePhenoPath Path to MGI_GenePheno.rpt
 * @param phenoSexPath Path to MGI_Pheno_Sex.rpt
 * @throws PhenolRuntimeException if one of the two files cannot be parsed
 */
private MpAnnotationParser(String genePhenoPath, String phenoSexPath) {
  this.genePhenoPath = genePhenoPath;
  this.phenoSexPath = phenoSexPath;
  parseErrors = new ArrayList<>();
  try {
    // Must run before parse() so that the sex-specific annotations are available
    // when the models are assembled.
    parsePhenoSexData();
  } catch (IOException | PhenolException e) {
    // Only parsePhenoSexData() throws these checked exceptions, so the failing file
    // is MGI_Pheno_Sex.rpt (parse() reports its own I/O problems itself).
    throw new PhenolRuntimeException("Could not parse MGI_Pheno_Sex.rpt: " + e.getMessage());
  }
  parse();
}
/**
 * Parse MGI_GenePheno.rpt and return one model per genotype accession id.
 * Sex-specific phenotype information is not used.
 *
 * @param genePhenoPath Path to MGI_GenePheno.rpt
 * @return map from genotype accession id to the corresponding {@link MpSimpleModel}
 * @throws PhenolRuntimeException if the file cannot be read or no model could be built from it
 */
public static Map<TermId, MpSimpleModel> loadIndividualModels(String genePhenoPath) {
  MpAnnotationParser annotParser = new MpAnnotationParser(genePhenoPath);
  Map<TermId, MpSimpleModel> models = annotParser.genotypeAccessionToMpSimpleModelMap;
  if (models.isEmpty()) {
    // Nothing usable was parsed: print the collected per-line errors before giving up.
    for (String e : annotParser.parseErrors) {
      System.err.println(e);
    }
    throw new PhenolRuntimeException("Could not parse " + genePhenoPath);
  }
  return models;
}
/**
 * Parse MGI_GenePheno.rpt together with MGI_Pheno_Sex.rpt and return one model per
 * genotype accession id, using the sex-specific annotation where one is available.
 *
 * @param genePhenoPath Path to MGI_GenePheno.rpt
 * @param phenoSexPath Path to MGI_Pheno_Sex.rpt
 * @return map from genotype accession id to the corresponding {@link MpSimpleModel}
 * @throws PhenolRuntimeException if a file cannot be parsed or no model could be built
 */
public static Map<TermId, MpSimpleModel>
loadIndividualModelsWithSexSpecificPhenotypes(String genePhenoPath, String phenoSexPath) {
  MpAnnotationParser annotParser = new MpAnnotationParser(genePhenoPath, phenoSexPath);
  Map<TermId, MpSimpleModel> models = annotParser.genotypeAccessionToMpSimpleModelMap;
  if (models.isEmpty()) {
    // Nothing usable was parsed: print the collected per-line errors before giving up.
    for (String e : annotParser.parseErrors) {
      System.err.println(e);
    }
    throw new PhenolRuntimeException("Could not parse " + genePhenoPath);
  }
  return models;
}
/**
 * Parse MGI_GenePheno.rpt and combine the individual models according to gene.
 *
 * @param genePhenoPath Path to MGI_GenePheno.rpt
 * @return unmodifiable map from marker (gene) id to the {@link MpGeneModel} built from all
 *         simple models that share that marker id
 * @throws PhenolRuntimeException if the file cannot be parsed
 */
public static Map<TermId, MpGeneModel> loadMpGeneModels(String genePhenoPath) {
  Map<TermId, MpSimpleModel> simpleModelMap = loadIndividualModels(genePhenoPath);
  Map<TermId, List<MpSimpleModel>> gene2simpleMap = new HashMap<>();
  for (MpSimpleModel simplemod : simpleModelMap.values()) {
    gene2simpleMap.computeIfAbsent(simplemod.getMarkerId(), key -> new ArrayList<>())
      .add(simplemod);
  }
  // when we get here, gene2simpleMap has key-a gene ID, value-collection of
  // all simple models whose marker id is that gene
  Map<TermId, MpGeneModel> builder = new HashMap<>();
  for (Map.Entry<TermId, List<MpSimpleModel>> entry : gene2simpleMap.entrySet()) {
    TermId geneId = entry.getKey();
    builder.put(geneId, new MpGeneModel(geneId, entry.getValue()));
  }
  return Map.copyOf(builder);
}
/**
 * Parse MGI_GenePheno.rpt together with MGI_Pheno_Sex.rpt and combine the individual
 * models according to gene.
 *
 * @param genePhenoPath Path to MGI_GenePheno.rpt
 * @param phenoSexPath Path to MGI_Pheno_Sex.rpt
 * @return unmodifiable map from marker (gene) id to the {@link MpGeneModel} built from all
 *         simple models that share that marker id
 * @throws PhenolRuntimeException if a file cannot be parsed
 */
public static Map<TermId, MpGeneModel>
loadMpGeneModelsWithSexSpecificPhenotypes(String genePhenoPath, String phenoSexPath) {
  Map<TermId, MpSimpleModel> simpleModelMap =
    loadIndividualModelsWithSexSpecificPhenotypes(genePhenoPath, phenoSexPath);
  Map<TermId, List<MpSimpleModel>> gene2simpleMap = new HashMap<>();
  for (MpSimpleModel simplemod : simpleModelMap.values()) {
    gene2simpleMap.computeIfAbsent(simplemod.getMarkerId(), key -> new ArrayList<>())
      .add(simplemod);
  }
  // when we get here, gene2simpleMap has key-a gene ID, value-collection of
  // all simple models whose marker id is that gene
  Map<TermId, MpGeneModel> builder = new HashMap<>();
  for (Map.Entry<TermId, List<MpSimpleModel>> entry : gene2simpleMap.entrySet()) {
    TermId geneId = entry.getKey();
    builder.put(geneId, new MpGeneModel(geneId, entry.getValue()));
  }
  return Map.copyOf(builder);
}
/*
public Map getGenotypeAccessionToMpSimpleModelMap() {
return genotypeAccessionToMpSimpleModelMap;
}*/
/**
 * Parse data from MGI_Pheno_Sex.rpt into {@code geno2ssannotMap}.
 * Note that there may be multiple lines that support the same assertion that differ only in PMID;
 * such lines are merged into one annotation with {@code MpAnnotation.merge}.
 * Lines with fewer than seven tab-separated fields are skipped (and reported on stderr
 * when {@code verbose} is set).
 *
 * @throws IOException if MGI_Pheno_Sex.rpt cannot be read
 * @throws PhenolException if the file is empty, its header is malformed, or a data line
 *                         is rejected while being converted into an annotation
 */
private void parsePhenoSexData() throws IOException, PhenolException {
  final int expectedNumberOfSexSpecificFields = 7;
  Map<TermId, Map<TermId, MpAnnotation>> collected = new HashMap<>();
  this.geno2ssannotMap = collected;
  // try-with-resources: the reader is closed on every exit path, including exceptions.
  try (BufferedReader br = new BufferedReader(new FileReader(this.phenoSexPath))) {
    String line = br.readLine(); // the header
    // readLine() returns null for an empty file; treat that as a malformed header
    // instead of failing with a NullPointerException.
    if (line == null || !line.startsWith("Genotype ID")) {
      throw new PhenolException("Malformed header of MGI_Pheno_Sex.rpt: " + line);
    }
    while ((line = br.readLine()) != null) {
      String[] fields = line.split("\t");
      if (fields.length < expectedNumberOfSexSpecificFields) {
        if (verbose) {
          System.err.println("[Phenol-ERROR] Unexpected number of fields in MGI_Pheno_Sex.rpt(" + fields.length + ") in line " + line);
        }
        continue;
      }
      SexSpecificAnnotationLine ssaline = new SexSpecificAnnotationLine(fields);
      MpAnnotation annot = ssaline.toMpAnnotation();
      Map<TermId, MpAnnotation> annotset =
        collected.computeIfAbsent(ssaline.genotypeID, key -> new HashMap<>());
      if (annotset.containsKey(annot.getTermId())) {
        // there is a previous annotation for this MP term --
        // the current annotation is from a separate PMID
        MpAnnotation previousannot = annotset.get(annot.getTermId());
        MpAnnotation mergedannot = MpAnnotation.merge(previousannot, annot);
        annotset.put(mergedannot.getTermId(), mergedannot);
      } else {
        annotset.put(annot.getTermId(), annot);
      }
    }
  }
}
/**
 * Parse the data in MGI_GenePheno.rpt. Interpolate the sex-specific data if available.
 * <p>
 * Lines with fewer than eight tab-separated fields are skipped (and reported on stderr when
 * {@code verbose} is set). Lines that cannot be converted into an annotation are recorded in
 * {@code parseErrors}. The result is stored in {@code genotypeAccessionToMpSimpleModelMap}
 * as an unmodifiable map with one {@link MpSimpleModel} per genotype accession id.
 *
 * @throws PhenolRuntimeException if MGI_GenePheno.rpt cannot be read
 */
private void parse() {
  /* Expected number of fields of the MGI_GenePheno.rpt file (note -- there
   appears to be a stray tab between the penultimate and last column) */
  final int expectedNumberOfFields = 8;
  Map<TermId, Set<AnnotationLine>> annotationCollector = new HashMap<>();
  try (BufferedReader br = new BufferedReader(new FileReader(this.genePhenoPath))) {
    String line;
    while ((line = br.readLine()) != null) {
      String[] fields = line.split("\t");
      if (fields.length < expectedNumberOfFields) {
        if (verbose) {
          System.err.println("[Phenol-ERROR] Unexpected number of fields in MGI_GenePheno.rpt (" + fields.length + ") in line " + line);
        }
        continue;
      }
      try {
        AnnotationLine annot = new AnnotationLine(fields);
        annotationCollector
          .computeIfAbsent(annot.getGenotypeAccessionId(), key -> new HashSet<>())
          .add(annot);
      } catch (PhenolException e) {
        // A single bad line is recorded and skipped; it does not abort the parse.
        this.parseErrors.add(String.format("[PARSE ERROR] %s (%s)", e.getMessage(), line));
      }
    }
  } catch (IOException ioe) {
    throw new PhenolRuntimeException("Could not parse MGI_GenePheno.rpt: " + ioe.getMessage());
  }
  // When we get here, we have parsed all of the MGI_GenePheno.rpt file.
  // Annotation lines are grouped according to genotype accession id in annotationCollector;
  // our goal in the following is to parse everything into corresponding MpSimpleModel objects
  Map<TermId, MpSimpleModel> builder = new HashMap<>();
  for (Map.Entry<TermId, Set<AnnotationLine>> entry : annotationCollector.entrySet()) {
    TermId genoId = entry.getKey();
    List<MpAnnotation> annotbuilder = new ArrayList<>();
    MpStrain background = null;
    MpAllelicComposition allelicComp = null;
    TermId alleleId = null;
    String alleleSymbol = null;
    TermId markerId = null;
    // get the sex-specific annotations for this genotypeId, if any
    Map<TermId, MpAnnotation> sexSpecific = Map.of(); // default, empty map
    if (this.geno2ssannotMap.containsKey(genoId)) {
      // Re-key the annotations by their own term id before taking an unmodifiable copy.
      Map<TermId, MpAnnotation> imapbuilder = new HashMap<>();
      for (MpAnnotation mpann : this.geno2ssannotMap.get(genoId).values()) {
        imapbuilder.put(mpann.getTermId(), mpann);
      }
      try {
        sexSpecific = Map.copyOf(imapbuilder);
      } catch (Exception e) {
        // Best effort: on failure the genotype simply gets no sex-specific annotations.
        System.err.println("Error building map of sexSpecific-specific annotations for " + genoId.getValue() + ": " + e.getMessage());
      }
    }
    for (AnnotationLine aline : entry.getValue()) {
      MpAnnotation annot = aline.toMpAnnotation();
      TermId mpoId = aline.getMpId();
      // The model-level attributes are taken from whichever line is visited last.
      // TODO we could check that these are identical for any given genotype id
      background = aline.geneticBackground;
      allelicComp = aline.getAllelicComp();
      alleleId = aline.getAlleleId();
      alleleSymbol = aline.getAlleleSymbol();
      markerId = aline.getMarkerAccessionId();
      // check if we have a sex-specific annotation matching the current annotation:
      // the following adds the sex-specific annotation for mpoId if there is one,
      // otherwise it adds the default annot
      annotbuilder.add(sexSpecific.getOrDefault(mpoId, annot));
      // Note that we do not check for sex-specific annotations that are not present in the "main" file
      // in practice, these are only sex-specific normal -- i.e., a phenotype was ruled out in one sex
      // even though "somebody" thought the phenotype might be present.
      // this type of normality (absence of a phenotype) is not useful for downstream analysis at the
      // present time and so we skip it to avoid unnecessarily complicating the implementation.
    }
    MpSimpleModel mod = new MpSimpleModel(genoId, background, allelicComp, alleleId, alleleSymbol, markerId, List.copyOf(annotbuilder));
    builder.put(genoId, mod);
  }
  genotypeAccessionToMpSimpleModelMap = Map.copyOf(builder);
}
/*public List getParseErrors() {
return this.parseErrors;
}*/
/**
 * A convenience class that allows us to collect all of the annotations that belong
 * to a given model (genotype accession id). One instance represents one line of the file.
 * The MGI_GenePheno.rpt file has the following structure:
 *
 * - 0. Allelic Composition
 * - 1. Allele Symbol(s)
 * - 2. Allele ID(s)
 * - 3. Genetic Background
 * - 4. Mammalian Phenotype ID
 * - 5. PubMed ID (pipe-delimited)
 * - 6. MGI Marker Accession ID (pipe-delimited)
 * - 7. MGI Genotype Accession ID (pipe-delimited)
 *
 */
private static class AnnotationLine {
  /**
   * [0] Index of Allelic Composition field
   */
  private static final int ALLELIC_COMPOSITION_IDX = 0;
  /**
   * [1] Index of Allele Symbol(s) field.
   */
  private static final int ALLELE_SYMBOL_IDX = 1;
  /**
   * [2] Index of Allele ID(s) field.
   */
  private static final int ALLELE_ID_IDX = 2;
  /**
   * [3] Index of Genetic Background field
   */
  private static final int GENETIC_BACKGROUND_IDX = 3;
  /**
   * [4] Index of Mammalian Phenotype ID
   */
  private static final int MPO_IDX = 4;
  /**
   * [5] Index of PubMed ID (pipe-delimited)
   */
  private static final int PUBMED_IDX = 5;
  /**
   * [6] Index of MGI Marker Accession ID (pipe-delimited). For example, for a model with
   * a mutation of the RB1 gene, this would be the id for that gene (MGI:97874).
   */
  private static final int MGI_MARKER_IDX = 6;
  /**
   * [7] Index of MGI Genotype Accession ID (pipe-delimited).
   */
  private static final int GENOTYPE_ACCESSION_IDX = 7;
  /**
   * Matches the literal pipe character that separates PubMed ids; compiled once.
   */
  private static final Pattern PIPE = Pattern.compile(Pattern.quote("|"));
  private final MpAllelicComposition allelicComp;
  private final String alleleSymbol;
  private final TermId alleleId;
  private final MpStrain geneticBackground;
  private final TermId mpId;
  private final Set<String> pmidSet;
  private final TermId markerAccessionId;
  private final TermId genotypeAccessionId;

  /**
   * @param annotations the tab-separated fields of one line; the caller must make sure
   *                    that at least eight fields are present
   * @throws PhenolException if one of the id fields cannot be converted into a {@link TermId}
   */
  AnnotationLine(String[] annotations) throws PhenolException {
    this.allelicComp = MpAllelicComposition.fromString(annotations[ALLELIC_COMPOSITION_IDX]);
    this.alleleSymbol = annotations[ALLELE_SYMBOL_IDX];
    this.alleleId = parseOrThrowException(annotations[ALLELE_ID_IDX]);
    this.geneticBackground = MpStrain.fromString(annotations[GENETIC_BACKGROUND_IDX]);
    this.mpId = parseOrThrowException(annotations[MPO_IDX]);
    String[] pubMedIds = PIPE.split(annotations[PUBMED_IDX]);
    // Set.of(array) throws IllegalArgumentException when an id is listed twice, which would
    // abort the whole parse; Set.copyOf silently collapses duplicates instead.
    this.pmidSet = Set.copyOf(Arrays.asList(pubMedIds));
    this.markerAccessionId = parseOrThrowException(annotations[MGI_MARKER_IDX]);
    this.genotypeAccessionId = parseOrThrowException(annotations[GENOTYPE_ACCESSION_IDX]);
  }

  /**
   * Convert a string into a {@link TermId}, turning the unchecked exception thrown for a
   * malformed id into a checked one so that the caller can skip the offending line.
   *
   * @param termId textual representation of the term id
   * @return the parsed {@link TermId}
   * @throws PhenolException if {@code TermId.of} rejects the string
   */
  private static TermId parseOrThrowException(String termId) throws PhenolException {
    try {
      return TermId.of(termId);
    } catch (PhenolRuntimeException e) {
      throw new PhenolException(e.getMessage());
    }
  }

  public MpAllelicComposition getAllelicComp() {
    return allelicComp;
  }

  public String getAlleleSymbol() {
    return alleleSymbol;
  }

  public TermId getAlleleId() {
    return alleleId;
  }

  public MpStrain getGeneticBackground() {
    return geneticBackground;
  }

  public TermId getMpId() {
    return mpId;
  }

  public Set<String> getPmidSet() {
    return pmidSet;
  }

  public TermId getMarkerAccessionId() {
    return markerAccessionId;
  }

  public TermId getGenotypeAccessionId() {
    return genotypeAccessionId;
  }

  /**
   * @return The {@link MpAnnotation} object corresponding to this {@link AnnotationLine}.
   */
  public MpAnnotation toMpAnnotation() {
    return new MpAnnotation.Builder(this.mpId, this.pmidSet).build();
  }
}
/**
 * A convenience class that represents one line of the MGI_Pheno_Sex.rpt file.
 * The fields are read by position:
 *
 * - 0. Genotype ID
 * - 1. Sex
 * - 2. Mammalian Phenotype ID
 * - 3. Allelic Composition
 * - 4. Strain
 * - 5. Sex-specific normal flag ("Y" means normal)
 * - 6. PubMed ID (pipe-delimited)
 */
private static class SexSpecificAnnotationLine {
  /**
   * Matches the literal pipe character that separates PubMed ids; compiled once.
   */
  private static final Pattern PIPE = Pattern.compile(Pattern.quote("|"));
  private final TermId genotypeID;
  private final MpSex sex;
  private final TermId mpId;
  private final MpAllelicComposition allelicComposition;
  private final MpStrain strain;
  private final boolean sexSpecificNormal;
  private final Set<String> pmidList;

  /**
   * @param A the tab-separated fields of one line; the caller must make sure that at
   *          least seven fields are present
   * @throws PhenolException if an id field cannot be converted into a {@link TermId}
   */
  SexSpecificAnnotationLine(String[] A) throws PhenolException {
    this.genotypeID = parseTermId(A[0]);
    this.sex = MpSex.fromString(A[1]);
    this.mpId = parseTermId(A[2]);
    this.allelicComposition = MpAllelicComposition.fromString(A[3]);
    this.strain = MpStrain.fromString(A[4]);
    this.sexSpecificNormal = A[5].equals("Y");
    // Set.of(array) throws IllegalArgumentException when an id is listed twice;
    // Set.copyOf silently collapses duplicates instead.
    this.pmidList = Set.copyOf(Arrays.asList(PIPE.split(A[6])));
  }

  /**
   * Convert a string into a {@link TermId}, reporting a malformed id through the checked
   * exception this constructor declares (as the sibling AnnotationLine class does).
   *
   * @param termId textual representation of the term id
   * @return the parsed {@link TermId}
   * @throws PhenolException if {@code TermId.of} rejects the string
   */
  private static TermId parseTermId(String termId) throws PhenolException {
    try {
      return TermId.of(termId);
    } catch (PhenolRuntimeException e) {
      throw new PhenolException(e.getMessage());
    }
  }

  /**
   * @return the {@link MpAnnotation} for this line, flagged either as sex-specific normal
   *         or as sex-specific for {@code sex}, depending on the normal flag of the line
   */
  public MpAnnotation toMpAnnotation() {
    MpAnnotation.Builder builder = new MpAnnotation.Builder(this.mpId, this.pmidList);
    if (sexSpecificNormal) {
      return builder.sexSpecificNormal(this.sex).build();
    }
    return builder.sexSpecific(this.sex).build();
  }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy