
org.monarchinitiative.phenol.annotations.hpo.HpoAnnotationModel Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phenol-annotations Show documentation
Show all versions of phenol-annotations Show documentation
phenol-annotation contains the annotation functionality for ontologies
package org.monarchinitiative.phenol.annotations.hpo;
import org.monarchinitiative.phenol.annotations.formats.hpo.HpoFrequency;
import org.monarchinitiative.phenol.base.PhenolRuntimeException;
import org.monarchinitiative.phenol.ontology.data.TermId;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * This class represents one disease-entity annotation, usually consisting of multiple annotation lines, and using
 * the new format introduced in 2018. Colloquially, these files have been called "small files". This class
 * is meant to be used for parsing the files, and does not perform any kind of analysis. The main use case
 * is to hold the data from one HPO Annotation file, such as {@code OMIM-100200.tab}, which in turn will be
 * used to create the aggregated file called {@code phenotype.hpoa} (the "big-file").
 *
 * @author Peter Robinson
 * Created by peter on 1/20/2018.
 * @deprecated to be removed in v3.0.0.
 */
@Deprecated(forRemoval = true)
public class HpoAnnotationModel {

  /**
   * These are the databases currently represented in our data resource.
   */
  private enum Database {
    OMIM, DECIPHER, UNKNOWN
  }

  private static final String EMPTY_STRING = "";

  /**
   * Number of observations assumed for a frequency that is not given as n/m
   * (empty, percentage, or HPO frequency term).
   */
  private static final int DEFAULT_NUMBER_OF_OBSERVATIONS = 10;

  /**
   * To be used for matching n/m frequencies.
   */
  private static final Pattern N_OF_M_PATTERN = Pattern.compile("^(\\d+)/(\\d+?)");
  /**
   * To be used for matching percentage frequencies such as {@code 45%} or {@code 12.5%}.
   */
  private static final Pattern PERCENTAGE_PATTERN = Pattern.compile("^(\\d*\\.?\\d+)%");
  /**
   * To be used for matching frequencies given as an HPO term id.
   */
  private static final Pattern HPO_TERM_PATTERN = Pattern.compile("^HP:\\d{7}$");

  private static final String EVIDENCE_PCS = "PCS";
  private static final String EVIDENCE_TAS = "TAS";
  private static final String EVIDENCE_IEA = "IEA";

  /**
   * The base name of the HPO Annotation file.
   */
  private final String basename;
  /**
   * List of {@link HpoAnnotationEntry} objects representing the original lines of the small file.
   * Always holds an unmodifiable list; it is replaced wholesale by
   * {@link #addInheritanceEntryCollection(Collection)}.
   */
  private List<HpoAnnotationEntry> entryList;
  /**
   * What is the source of the current HpoAnnotationModel? Derived from the base name.
   */
  private final Database database;

  /**
   * The constructor creates an immutable copy of the original list of {@link HpoAnnotationEntry} objects
   * provided by the parser. The source database is derived from the name: a name containing {@code OMIM}
   * is an OMIM model, otherwise a name containing {@code DECIPHER} is a DECIPHER model.
   *
   * @param name    Name of the "small file"
   * @param entries List of {@link HpoAnnotationEntry} objects -- one per line of the small file.
   */
  public HpoAnnotationModel(String name, List<HpoAnnotationEntry> entries) {
    this(name, databaseOf(name), List.copyOf(entries));
  }

  /**
   * Private constructor, used internally (e.g. by {@link #getMergedModel()}) when the database is already
   * known and the entry list is already an unmodifiable copy.
   *
   * @param base    base name of small file
   * @param db      database (OMIM, DECIPHER)
   * @param entries unmodifiable list of (merged) entries; stored as is, without copying.
   */
  private HpoAnnotationModel(String base, Database db, List<HpoAnnotationEntry> entries) {
    this.basename = base;
    this.database = db;
    this.entryList = entries;
  }

  /**
   * @param basename base name of the small file
   * @return the database the base name refers to, or {@link Database#UNKNOWN}
   */
  private static Database databaseOf(String basename) {
    if (basename.contains("OMIM")) {
      return Database.OMIM;
    } else if (basename.contains("DECIPHER")) {
      return Database.DECIPHER;
    }
    return Database.UNKNOWN;
  }

  /**
   * @return The base name of the HPO Annotation file.
   */
  public String getBasename() {
    return basename;
  }

  /**
   * Create a new model that contains the entries of this model followed by the given entries.
   * This model is not changed.
   *
   * @param inherit entries to append
   * @return a new model with the same base name and the combined entries
   */
  public HpoAnnotationModel mergeWithInheritanceAnnotations(Collection<HpoAnnotationEntry> inherit) {
    List<HpoAnnotationEntry> combined = new ArrayList<>(this.entryList);
    combined.addAll(inherit);
    return new HpoAnnotationModel(this.basename, this.database, List.copyOf(combined));
  }

  /**
   * @return true if the base name of this model contains {@code OMIM}
   */
  public boolean isOMIM() {
    return this.database == Database.OMIM;
  }

  /**
   * @return true if the base name of this model contains {@code DECIPHER} (and not {@code OMIM})
   */
  public boolean isDECIPHER() {
    return this.database == Database.DECIPHER;
  }

  /**
   * @return the {@link HpoAnnotationEntry} objects -- one per line of the small file (unmodifiable list).
   */
  public List<HpoAnnotationEntry> getEntryList() {
    return entryList;
  }

  /**
   * @return the number of entries held by this model
   */
  public int getNumberOfAnnotations() {
    return entryList.size();
  }

  /**
   * @param value a possibly null string
   * @return the value itself, or the empty string if the value is null
   */
  private static String nullToEmpty(String value) {
    return value == null ? EMPTY_STRING : value;
  }

  /**
   * Add the value to the target collection unless it is null or empty.
   */
  private static void addIfPresent(Collection<String> target, String value) {
    if (value != null && !value.isEmpty()) {
      target.add(value);
    }
  }

  /**
   * @param entrylist non-empty list of entries for the same HPO term
   * @return true if the negation field differs between the entries (null and empty are treated as equal)
   */
  private boolean divergentNegation(List<HpoAnnotationEntry> entrylist) {
    String reference = nullToEmpty(entrylist.get(0).getNegation());
    for (int i = 1; i < entrylist.size(); ++i) {
      // normalize both sides; otherwise two null values would wrongly count as divergent
      if (!reference.equals(nullToEmpty(entrylist.get(i).getNegation()))) {
        return true;
      }
    }
    return false; // if we get here we can still merge. Items are not divergent
  }

  /**
   * @param entrylist non-empty list of entries for the same HPO term
   * @return true if the sex field differs between the entries (null and empty are treated as equal)
   */
  private boolean divergentSex(List<HpoAnnotationEntry> entrylist) {
    String reference = nullToEmpty(entrylist.get(0).getSex());
    for (int i = 1; i < entrylist.size(); ++i) {
      if (!reference.equals(nullToEmpty(entrylist.get(i).getSex()))) {
        return true;
      }
    }
    return false; // if we get here we can still merge. Items are not divergent
  }

  /**
   * @param entrylist non-empty list of entries for the same HPO term
   * @return true if the age-of-onset id differs between the entries (null and empty are treated as equal)
   */
  private boolean divergentOnset(List<HpoAnnotationEntry> entrylist) {
    String reference = nullToEmpty(entrylist.get(0).getAgeOfOnsetId());
    for (int i = 1; i < entrylist.size(); ++i) {
      if (!reference.equals(nullToEmpty(entrylist.get(i).getAgeOfOnsetId()))) {
        return true;
      }
    }
    return false; // if we get here we can still merge. Items are not divergent
  }

  /**
   * We want to merge entries with different n-of-m frequencies. For instance, if
   * we have 2/3 and 5/7 then we would merge this to 7/10. If one of the entries
   * is not n-of-m, then we will transform it as if the percentage or ontology term
   * represents 10 observations. If the field is empty (or null), then we will assume it is
   * 100%, i.e., 10/10
   *
   * @param entrylist List of entries whose frequency fields are to be merged
   * @return merged frequency string of the form n/m
   * @throws PhenolRuntimeException if a frequency is neither empty, n/m, a percentage, nor an HPO term id
   */
  private String mergeFrequencies(final List<HpoAnnotationEntry> entrylist) {
    int sumOfNumerators = 0;
    int sumOfDenominators = 0;
    for (HpoAnnotationEntry entry : entrylist) {
      // a null frequency is handled like an empty one instead of failing in Pattern.matcher
      String frequency = nullToEmpty(entry.getFrequencyModifier());
      if (frequency.isEmpty()) {
        // 1) No frequency entry available. Assume 100%, i.e., 10/10
        sumOfNumerators += DEFAULT_NUMBER_OF_OBSERVATIONS;
        sumOfDenominators += DEFAULT_NUMBER_OF_OBSERVATIONS;
        continue;
      }
      Matcher nOfM = N_OF_M_PATTERN.matcher(frequency);
      if (nOfM.matches()) {
        // 2) The frequency string is of the form n/m.
        // If we match the regex, both groups consist of digits only.
        sumOfNumerators += Integer.parseInt(nOfM.group(1));
        sumOfDenominators += Integer.parseInt(nOfM.group(2));
        continue;
      }
      Matcher percentage = PERCENTAGE_PATTERN.matcher(frequency);
      if (percentage.matches()) {
        // 3) A percentage: scale to a count out of 10 observations (e.g. 45% becomes 5/10 after rounding)
        double percent = Double.parseDouble(percentage.group(1));
        sumOfNumerators += (int) Math.round(percent / 10.0);
        sumOfDenominators += DEFAULT_NUMBER_OF_OBSERVATIONS;
        continue;
      }
      if (HPO_TERM_PATTERN.matcher(frequency).matches()) {
        // 4) An HPO frequency term: use the mean of the term, scaled to a count out of 10 observations
        HpoFrequency hpoFrequency = HpoFrequency.fromTermId(TermId.of(frequency));
        double proportion = hpoFrequency.mean();
        sumOfNumerators += (int) Math.round(proportion * 10.0);
        sumOfDenominators += DEFAULT_NUMBER_OF_OBSERVATIONS;
        continue;
      }
      // should never happen but if it does we want to know right away
      throw new PhenolRuntimeException("Could not parse frequency entry: \"" + frequency + "\"");
    }
    // plain concatenation rather than String.format, whose %d output depends on the default locale
    return sumOfNumerators + "/" + sumOfDenominators;
  }

  /**
   * @param entrylist entries to merge
   * @return the non-empty modifiers of the entries joined by ";" (empty string if there are none)
   */
  private String mergeModifiers(final List<HpoAnnotationEntry> entrylist) {
    List<String> modifiers = new ArrayList<>();
    for (HpoAnnotationEntry entry : entrylist) {
      addIfPresent(modifiers, entry.getModifier());
    }
    return String.join(";", modifiers);
  }

  /**
   * @param entrylist entries to merge
   * @return the non-empty descriptions of the entries joined by ";" (empty string if there are none)
   */
  private String mergeDescriptions(final List<HpoAnnotationEntry> entrylist) {
    List<String> descriptions = new ArrayList<>();
    for (HpoAnnotationEntry entry : entrylist) {
      // keep the NON-empty descriptions (the check used to be inverted and kept only empty ones)
      addIfPresent(descriptions, entry.getDescription());
    }
    return String.join(";", descriptions);
  }

  /**
   * @param entrylist entries to merge
   * @return the distinct non-empty publications of the entries joined by ";", in order of first occurrence
   */
  private String mergePublications(final List<HpoAnnotationEntry> entrylist) {
    // LinkedHashSet: de-duplicates while keeping a reproducible order
    Set<String> publications = new LinkedHashSet<>();
    for (HpoAnnotationEntry entry : entrylist) {
      addIfPresent(publications, entry.getPublication());
    }
    return String.join(";", publications);
  }

  /**
   * @param entrylist entries to merge
   * @return "PCS" if any entry has that evidence code, otherwise "TAS" if any entry has it, otherwise "IEA"
   */
  private String getHighestEvidenceCode(final List<HpoAnnotationEntry> entrylist) {
    String best = EVIDENCE_IEA; // default
    for (HpoAnnotationEntry entry : entrylist) {
      String code = entry.getEvidenceCode();
      // constant-first equals: a null evidence code simply does not match
      if (EVIDENCE_PCS.equals(code)) {
        return EVIDENCE_PCS; // nothing ranks higher, stop searching
      } else if (EVIDENCE_TAS.equals(code)) {
        best = EVIDENCE_TAS; // better than IEA
      }
    }
    return best;
  }

  /**
   * @param entrylist entries to merge
   * @return the distinct non-empty biocuration fields of the entries joined by ";", in order of first occurrence
   */
  private String mergeBiocuration(final List<HpoAnnotationEntry> entrylist) {
    Set<String> biocuration = new LinkedHashSet<>();
    for (HpoAnnotationEntry entry : entrylist) {
      addIfPresent(biocuration, entry.getBiocuration());
    }
    return String.join(";", biocuration);
  }

  /**
   * If this method is called, then we have checked that Sex, Negation, AgeOfOnset are the same.
   * Merge everything else, concatenating biocuration and PMID and modifier and description.
   * Disease, phenotype, onset, sex and negation are taken from the first entry.
   *
   * @param entrylist non-empty list of annotation lines to the same HPO term that we will merge
   * @return a merged entry
   */
  private HpoAnnotationEntry mergeEntries(List<HpoAnnotationEntry> entrylist) {
    HpoAnnotationEntry first = entrylist.get(0);
    return new HpoAnnotationEntry(first.getDiseaseID(),
      first.getDiseaseName(),
      first.getPhenotypeId(),
      first.getPhenotypeLabel(),
      first.getAgeOfOnsetId(),
      first.getAgeOfOnsetLabel(),
      mergeFrequencies(entrylist),
      first.getSex(),
      first.getNegation(),
      mergeModifiers(entrylist),
      mergeDescriptions(entrylist),
      mergePublications(entrylist),
      getHighestEvidenceCode(entrylist),
      mergeBiocuration(entrylist));
  }

  /**
   * @param entrylist list of at least two entries for the same HPO term
   * @return true if negation, sex and age of onset agree for all entries, so that they can be merged
   */
  private boolean isMergeable(List<HpoAnnotationEntry> entrylist) {
    return !divergentNegation(entrylist)
      && !divergentSex(entrylist)
      && !divergentOnset(entrylist);
  }

  /**
   * Create a model in which entries that annotate the same HPO term are merged into a single entry,
   * provided that they agree on negation, sex and age of onset. Entries that cannot be merged are
   * kept as they are. Terms appear in the order in which they first occur in this model.
   *
   * @return a new model with the same base name and the merged entries
   */
  public HpoAnnotationModel getMergedModel() {
    // LinkedHashMap keeps the order of first occurrence, so the result is reproducible
    Map<TermId, List<HpoAnnotationEntry>> entriesByTerm = new LinkedHashMap<>();
    for (HpoAnnotationEntry entry : this.entryList) {
      entriesByTerm.computeIfAbsent(entry.getPhenotypeId(), id -> new ArrayList<>()).add(entry);
    }
    List<HpoAnnotationEntry> result = new ArrayList<>();
    for (List<HpoAnnotationEntry> group : entriesByTerm.values()) {
      if (group.size() > 1 && isMergeable(group)) {
        result.add(mergeEntries(group));
      } else {
        // single entry for this term, or entries that cannot be merged: add each separately
        result.addAll(group);
      }
    }
    return new HpoAnnotationModel(this.basename, this.database, List.copyOf(result));
  }

  /**
   * By construction, the disease ID field of each of the entries in this object must be the same.
   * Therefore, we return the first one. Also by construction, there should be at least one entry
   * in {@link #entryList}.
   *
   * @return The diseaseID of this model
   * @throws NoSuchElementException if this model has no entries
   */
  public TermId getDiseaseId() {
    if (entryList.isEmpty()) {
      throw new NoSuchElementException(
        "Cannot determine disease id: model \"" + basename + "\" has no entries");
    }
    return TermId.of(entryList.get(0).getDiseaseID());
  }

  /**
   * @return the first non-null disease name among the entries, or "n/a" if there is none
   */
  public String getDiseaseName() {
    return entryList
      .stream()
      .map(HpoAnnotationEntry::getDiseaseName)
      .filter(Objects::nonNull) // Optional cannot hold null; skip entries without a name
      .findFirst()
      .orElse("n/a");
  }

  /**
   * Append the given entries to the entries of this model. The internal list is replaced by a new
   * unmodifiable list; lists previously returned by {@link #getEntryList()} are not affected.
   *
   * @param entries entries to append
   */
  public void addInheritanceEntryCollection(Collection<HpoAnnotationEntry> entries) {
    List<HpoAnnotationEntry> combined = new ArrayList<>(this.entryList);
    combined.addAll(entries);
    this.entryList = List.copyOf(combined);
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy