org.monarchinitiative.phenol.io.obo.hpo.HpoAnnotationLine Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phenol-io Show documentation
Show all versions of phenol-io Show documentation
phenol-io contains the generic I/O functionality for ontologies
package org.monarchinitiative.phenol.io.obo.hpo;
import com.google.common.collect.ImmutableList;
import org.monarchinitiative.phenol.base.PhenolException;
import org.monarchinitiative.phenol.formats.hpo.*;
import org.monarchinitiative.phenol.ontology.data.Ontology;
import org.monarchinitiative.phenol.ontology.data.TermId;
import java.util.List;
/**
* This class represents one line of the V2 (post 2018) HPO annotation "big file"
* (phenotype.hpoa). It is intended to be used as part of the processing of the big file. A
* convenience class that will allow us to collect the annotation lines for each disease that we
* want to parse; from these data, we will construct the {@link HpoDisease}. Note that
* this class performs only the first part of the parse. The class {@link
* HpoDiseaseAnnotationParser} will complete the input of the lines once all of the lines for some
* disease have been input into individual {@link HpoAnnotationLine} objects. The reason for this is
* that in some cases, we may have multiple lines for some annotation and we will want to combine
* them into one annotation for some computational disease models. The class checks whether the
* number of fields is correct, and this can be got with {@link #hasValidNumberOfFields}
*
* @author Peter Robinson
*/
class HpoAnnotationLine {
  // Zero-based column indices of the tab-separated fields of one annotation line.
  private static final int DATABASE_ID_IDX = 0;
  private static final int DB_NAME_IDX = 1;
  private static final int QUALIFIER_IDX = 2;
  private static final int PHENOTYPE_ID_IDX = 3;
  private static final int DB_REFERENCE_IDX = 4;
  private static final int EVIDENCE_IDX = 5;
  private static final int ONSET_ID_IDX = 6;
  private static final int FREQUENCY_IDX = 7;
  private static final int SEX_IDX = 8;
  private static final int MODIFIER_IDX = 9;
  private static final int ASPECT_IDX = 10;
  private static final int BIOCURATION_IDX = 11;
  /** Frequency string reported by {@link #toHpoAnnotation} when the frequency column is empty. */
  private static final String DEFAULT_FREQUENCY_STRING = "n/a";
  private static final String EMPTY_STRING = "";

  /**
   * Names of the header fields, in column order. Every valid file should have a header line with
   * exactly these fields; the number of entries is also the number of columns required of every
   * annotation line.
   */
  private static final String[] HEADER_FIELDS = {
    "DatabaseID",
    "DiseaseName",
    "Qualifier",
    "HPO_ID",
    "Reference",
    "Evidence",
    "Onset",
    "Frequency",
    "Sex",
    "Modifier",
    "Aspect",
    "Biocuration"
  };

  /** Column 1. The disease id, for instance "OMIM:300200". */
  private final String databaseId;
  /** Column 2. The disease name, e.g., Marfan syndrome. */
  private final String diseaseName;
  /** Column 3. True if the qualifier column is "NOT" (case-insensitive), i.e., a negated annotation. */
  private final boolean negated;
  /** Column 4. The phenotype term id. */
  private final TermId phenotypeId;
  /** Column 5. Publication(s), semicolon separated. */
  private final String publication;
  /** Column 6. Evidence about the assertion. */
  private final String evidence;
  /** Column 7. The onset; null unless the column starts with "HP:". */
  private final TermId onsetId;
  /** Column 8. The frequency as written in the file (may be the empty string). */
  private final String frequency;
  /** Column 9. Sex (male, female). */
  private final String sex;
  /** Column 10. Modifier terms (0..n), semicolon separated. */
  private final String modifierList;
  /** Column 11. Aspect. */
  private final String aspect;
  /** Column 12. The biocurator/date, e.g., HPO:skoehler[2018-02-17]. */
  private final String biocuration;

  /**
   * Parses one tab-separated annotation line into its columns.
   *
   * @param line one data line of the annotation file
   * @throws PhenolException if the line does not have exactly as many fields as the header
   */
  private HpoAnnotationLine(String line) throws PhenolException {
    String[] fields = splitFields(line);
    if (fields.length != HEADER_FIELDS.length) {
      throw new PhenolException(
          "[phenol:ERROR] Annotation line with " + fields.length + " fields: \"" + line + "\"");
    }
    this.databaseId = fields[DATABASE_ID_IDX];
    this.diseaseName = fields[DB_NAME_IDX];
    this.negated = fields[QUALIFIER_IDX].equalsIgnoreCase("NOT");
    this.phenotypeId = TermId.of(fields[PHENOTYPE_ID_IDX]);
    this.publication = fields[DB_REFERENCE_IDX];
    this.evidence = fields[EVIDENCE_IDX];
    // Only values that look like HPO ids are turned into an onset term; anything else means "no onset".
    String onset = fields[ONSET_ID_IDX];
    this.onsetId = onset.startsWith("HP:") ? TermId.of(onset) : null;
    this.frequency = fields[FREQUENCY_IDX];
    this.sex = fields[SEX_IDX];
    this.modifierList = fields[MODIFIER_IDX];
    this.aspect = fields[ASPECT_IDX];
    this.biocuration = fields[BIOCURATION_IDX];
  }

  /**
   * Splits a line on tab characters without losing empty trailing columns.
   *
   * <p>A plain {@code String.split("\t")} discards trailing empty strings, so a line whose last
   * column(s) are empty would be reported as having too few fields. Here the trailing empty
   * columns are kept; only surplus empty columns beyond the expected column count (e.g. from a
   * stray trailing tab) are dropped, so that every line accepted by a plain split is still
   * accepted.
   *
   * @param line the raw line
   * @return the columns of the line
   */
  private static String[] splitFields(String line) {
    String[] fields = line.split("\t", -1);
    int count = fields.length;
    while (count > HEADER_FIELDS.length && fields[count - 1].isEmpty()) {
      count--;
    }
    if (count == fields.length) {
      return fields;
    }
    String[] trimmed = new String[count];
    System.arraycopy(fields, 0, trimmed, 0, count);
    return trimmed;
  }

  /**
   * Factory method: parse one annotation line.
   *
   * @param line one data line of the annotation file
   * @return the parsed line
   * @throws PhenolException if the line cannot be parsed; the message contains the message of the
   *     underlying exception and the offending line
   */
  static HpoAnnotationLine constructFromString(String line) throws PhenolException {
    try {
      return new HpoAnnotationLine(line);
    } catch (PhenolException e) {
      throw new PhenolException(String.format("Exception [%s] parsing line: %s", e.getMessage(), line));
    }
  }

  /**
   * Checks that a header line consists of exactly the expected fields in the expected order.
   *
   * @param line The header line of the annotation file
   * @return true iff the fields of the line exactly match {@link #HEADER_FIELDS}; this method
   *     never returns false, a mismatch is reported by an exception
   * @throws PhenolException if the number of fields or the name of any field does not match
   */
  static boolean isValidHeaderLine(String line) throws PhenolException {
    String[] fields = splitFields(line);
    if (fields.length != HEADER_FIELDS.length) {
      String msg = String.format("Expected %d fields in header line but got %d", HEADER_FIELDS.length, fields.length);
      throw new PhenolException(msg);
    }
    for (int i = 0; i < HEADER_FIELDS.length; i++) {
      if (!fields[i].equals(HEADER_FIELDS[i])) {
        String msg = String.format("Expected header field %d to be %s but got %s", i, HEADER_FIELDS[i], fields[i]);
        throw new PhenolException(msg);
      }
    }
    return true;
  }

  /** @return the disease id exactly as written in the first column, e.g., "OMIM:300200". */
  final String getDiseaseId() {
    return databaseId;
  }

  /**
   * This function creates a TermId from the CURIE-type identifier for the disease, e.g., OMIM:600100.
   *
   * @return The TermId representing this disease.
   * @throws PhenolException if the disease id does not start with OMIM, ORPHA, or DECIPHER
   */
  TermId getDiseaseTermId() throws PhenolException {
    if (databaseId.startsWith("OMIM")
        || databaseId.startsWith("ORPHA")
        || databaseId.startsWith("DECIPHER")) {
      return TermId.of(databaseId);
    }
    // we should never get here
    throw new PhenolException("Invalid disease id (not OMIM/ORPHA/DECIPHER): " + databaseId);
  }

  TermId getPhenotypeId() {
    return phenotypeId;
  }

  /** @return the onset term id, or null if the onset column did not start with "HP:". */
  TermId getOnsetId() {
    return onsetId;
  }

  String getFrequency() {
    return frequency;
  }

  String getSex() {
    return sex;
  }

  private String getModifierList() {
    return modifierList;
  }

  /**
   * @return immutable list of the semicolon-separated entries of the publication column, each
   *     trimmed of surrounding whitespace; empty if the column is empty
   */
  List<String> getPublication() {
    ImmutableList.Builder<String> builder = new ImmutableList.Builder<>();
    if (publication == null || publication.isEmpty()) {
      return builder.build();
    }
    for (String citation : publication.split(";")) {
      builder.add(citation.trim());
    }
    return builder.build();
  }

  String getEvidence() {
    return evidence;
  }

  String getBiocuration() {
    return biocuration;
  }

  String getDbObjectName() {
    return diseaseName;
  }

  String getAspect() {
    return aspect;
  }

  /** @return true if this annotation is negated. */
  boolean isNOT() {
    return negated;
  }

  /**
   * @return always true: the constructor throws when the number of fields is wrong, so every
   *     instance that exists was built from a line with the correct number of fields
   */
  boolean hasValidNumberOfFields() {
    return true;
  }

  /**
   * This method can be used to create a full HpoAnnotation object. Note that it is not
   * intended to be used to parse annotation lines that represent models of inheritance or
   * negative annotations. For this reason, the method is package-private and should only be
   * called by {@link org.monarchinitiative.phenol.io.assoc.HpoAssociationParser}.
   *
   * @param line An HpoAnnotationLine object representing one phenotype annotation
   * @param ontology reference to HPO Ontology object
   * @return corresponding HpoAnnotation object
   */
  static HpoAnnotation toHpoAnnotation(HpoAnnotationLine line, Ontology ontology) {
    TermId phenoId = line.getPhenotypeId();
    double frequency = getFrequency(line.getFrequency(), ontology);
    // The numeric frequency falls back to the default; the string records that nothing was given.
    String frequencyString = line.frequency.isEmpty() ? DEFAULT_FREQUENCY_STRING : line.frequency;
    HpoOnset onset = getOnset(line.onsetId);
    return new HpoAnnotation(
        phenoId,
        frequency,
        frequencyString,
        onset,
        getModifiers(line.getModifierList()),
        line.getPublication());
  }

  /**
   * @param lst List of semicolon separated HPO term ids from the modifier subontology
   * @return Immutable List of {@link TermId} objects (empty if the argument is null or empty)
   */
  private static List<TermId> getModifiers(String lst) {
    ImmutableList.Builder<TermId> builder = new ImmutableList.Builder<>();
    if (lst == null || lst.isEmpty()) {
      return builder.build(); // return empty list
    }
    for (String modifier : lst.split(";")) {
      builder.add(TermId.of(modifier.trim()));
    }
    return builder.build();
  }

  /**
   * Extract the numeric frequency that corresponds to the frequency column of an annotation
   * line. Note that we are expecting there to be one of three kinds of frequency information (an
   * HPO term, n/m or x%). If we find nothing or there is some parsing error (including a
   * non-numeric percentage or ratio, or a ratio with a zero denominator), return the default
   * frequency, i.e. the mean of {@link HpoFrequency#ALWAYS_PRESENT}.
   *
   * @param freq The representation of the frequency, if any, in the annotation file
   * @param ontology reference to the HPO ontology, used to resolve frequency term ids
   * @return the frequency as a fraction, or the default frequency if it cannot be determined
   */
  private static double getFrequency(String freq, Ontology ontology) {
    final double defaultFrequency = HpoFrequency.ALWAYS_PRESENT.mean();
    if (freq == null || freq.isEmpty()) {
      return defaultFrequency;
    }
    try {
      int percentIdx = freq.indexOf('%');
      if (percentIdx > 0) {
        return 0.01 * Double.parseDouble(freq.substring(0, percentIdx).trim());
      }
      int slashIdx = freq.indexOf('/');
      if (slashIdx > 0 && freq.length() > (slashIdx + 1)) {
        int n = Integer.parseInt(freq.substring(0, slashIdx).trim());
        int m = Integer.parseInt(freq.substring(slashIdx + 1).trim());
        if (m == 0) {
          // n/0 would silently produce Infinity or NaN
          System.err.println("[phenol:WARNING] Frequency with zero denominator: \"" + freq + "\"");
          return defaultFrequency;
        }
        return (double) n / (double) m;
      }
    } catch (NumberFormatException e) {
      // documented contract: a malformed frequency yields the default rather than an exception
      System.err.println("[phenol:WARNING] Could not parse frequency: \"" + freq + "\"");
      return defaultFrequency;
    }
    try {
      TermId tid = string2TermId(freq, ontology);
      if (tid != null) {
        return HpoFrequency.fromTermId(tid).mean();
      }
    } catch (Exception e) {
      // best effort: report the problem and fall through to the default
      e.printStackTrace();
    }
    // if we get here we could not parse the Frequency, return the default
    return defaultFrequency;
  }

  /**
   * Go from HP:0000123 to the corresponding TermId
   *
   * @param hp String version of an HPO term id
   * @param ontology reference to the HPO ontology
   * @return the id of the term that the ontology's term map holds for {@code hp}, or null if
   *     {@code hp} does not start with "HP:" or is not a key of the term map
   */
  private static TermId string2TermId(String hp, Ontology ontology) {
    if (!hp.startsWith("HP:")) {
      return null;
    }
    TermId tid = TermId.of(hp);
    if (!ontology.getTermMap().containsKey(tid)) {
      return null;
    }
    // replace alt_id with current id if necessary
    return ontology.getTermMap().get(tid).getId();
  }

  /**
   * This function transforms a {@link TermId} into an {@link HpoOnset} object. If the argument is
   * null, it means that no annotation for the onset was provided in the annotation line, and
   * then this function returns {@link HpoOnset#UNKNOWN}.
   *
   * @param ons The {@link TermId} of an HPO Onset
   * @return The {@link HpoOnset} object corresponding to the {@link TermId} in the argument
   */
  private static HpoOnset getOnset(TermId ons) {
    return ons == null ? HpoOnset.UNKNOWN : HpoOnset.fromTermId(ons);
  }

  /**
   * @return the fields joined by tabs. Note that the order used here (onset, frequency, sex,
   *     qualifier, aspect, modifier, publication, evidence, biocuration after the phenotype id)
   *     is not the column order of the input file.
   */
  @Override
  public String toString() {
    String[] fields = {
      this.databaseId,
      this.diseaseName,
      this.phenotypeId.getValue(),
      this.onsetId == null ? EMPTY_STRING : this.onsetId.getValue(),
      this.frequency,
      this.sex,
      this.negated ? "NOT" : "",
      this.aspect,
      this.modifierList,
      this.publication,
      this.evidence,
      this.biocuration
    };
    return String.join("\t", fields);
  }
}