All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.monarchinitiative.phenol.annotations.hpo.OrphanetXML2HpoDiseaseModelParser Maven / Gradle / Ivy

There is a newer version: 2.1.2
Show newest version
package org.monarchinitiative.phenol.annotations.hpo;

import org.monarchinitiative.phenol.base.PhenolRuntimeException;
import org.monarchinitiative.phenol.annotations.constants.hpo.HpoFrequencyTermIds;
import org.monarchinitiative.phenol.ontology.data.Ontology;
import org.monarchinitiative.phenol.ontology.data.TermId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileInputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

/**
 * This class is an XML parser for the Orphanet file with HPO-based disease annotations
 * ({@code en_product4_HPO.xml} (see http://www.orphadata.org/). Note that the section of the
 * XML that denotes an individual HPO annotation with frequency is like this
 * 
 *     
 *        
 *          HP:0010535
 *          Sleep apnea
 *         
 *         
 *            453313
 *             Occasional (29-5%)
 *        
 *       
 *    
 * 
* That is, one line of the annotation is contained with an {@code HPODisorderAssociation} node. * Each disease begins like this *
 *      
 *       437
 *       Hypophosphatemic rickets
 *       
 *         
 *             (...)
 * 
* That is, we extract the Orphanumber and the name and then there is a list of annotations. * * @author Peter Robinson * @deprecated to be removed in v3.0.0. */ @Deprecated(forRemoval = true) public class OrphanetXML2HpoDiseaseModelParser { private final static Logger logger = LoggerFactory.getLogger(OrphanetXML2HpoDiseaseModelParser.class); /** * Path to {@code en_product4_HPO.xml} file. */ private final String orphanetXmlPath; /** * Reference to the HPO Ontology. */ private final Ontology ontology; /** * A String of the form ORPHA:orphadata[2019-01-05] that we will use as the biocuration entry. */ private final String orphanetBiocurationString; /** * A map of diseases parsed from Orphanet. */ private final Map orphanetDiseaseMap = new HashMap<>(); /** * If true, replace obsolete term ids without throwing Exception. */ private final boolean replaceObsoleteTermId; private static final String DISORDER = "Disorder"; private static final String ORPHA_NUMBER = "OrphaNumber"; private static final String ORPHA_CODE = "OrphaCode"; private static final String NAME = "Name"; private static final String HPO_DISORDER_ASSOCIATION = "HPODisorderAssociation"; private static final String DIAGNOSTIC_CRITERIA = "DiagnosticCriteria"; private static final String DISORDER_TYPE = "DisorderType"; private static final String DISORDER_GROUP = "DisorderGroup"; private static final String EXPERT_LINK = "ExpertLink"; private static final String HPO_DISORDER_ASSOCIATION_LIST = "HPODisorderAssociationList"; private static final String HPO_DISORDER_SET_STATUS = "HPODisorderSetStatus"; private static final String HPO_DISORDER_SET_STATUS_LIST = "HPODisorderSetStatusList"; private static final String HPO_FREQUENCY = "HPOFrequency"; private static final String HPO = "HPO"; private static final String HPO_ID = "HPOId"; private static final String HPO_TERM = "HPOTerm"; private static final String JDBOR = "JDBOR"; private static final String AVAILABILITY = "Availability"; private static final String 
FULL_NAME = "FullName"; private static final String SHORT_IDENTIFIER = "ShortIdentifier"; private static final String LEGAL_CODE = "LegalCode"; private static final String LICENSE = "Licence"; private static final String SOURCE = "Source"; private static final String VALIDATION_DATE = "ValidationDate"; private static final String VALIDATION_STATUS = "ValidationStatus"; private static final String ONLINE = "Online"; /** * These are the local names of the Orphanet product4.xml file. */ private final Set allowableXmlNodeNames = Stream.of(AVAILABILITY, DIAGNOSTIC_CRITERIA, DISORDER, DISORDER_GROUP, DISORDER_TYPE, EXPERT_LINK, FULL_NAME, HPO_DISORDER_ASSOCIATION, HPO_DISORDER_ASSOCIATION_LIST, HPO_DISORDER_SET_STATUS, HPO_DISORDER_SET_STATUS_LIST, HPO_FREQUENCY, HPO_ID, HPO, HPO_TERM, JDBOR, LEGAL_CODE, LICENSE, NAME, ONLINE, ORPHA_CODE, ORPHA_NUMBER, SOURCE, SHORT_IDENTIFIER, VALIDATION_DATE, VALIDATION_STATUS) .collect(Collectors.toCollection(HashSet::new)); public OrphanetXML2HpoDiseaseModelParser(String xmlpath, Ontology onto, boolean tolerant) { orphanetXmlPath = xmlpath; this.ontology = onto; this.replaceObsoleteTermId = tolerant; String todaysDate = getTodaysDate(); orphanetBiocurationString = String.format("ORPHA:orphadata[%s]", todaysDate); try { parse(); } catch (XMLStreamException | IOException e) { e.printStackTrace(); } } public Map getOrphanetDiseaseMap() { return this.orphanetDiseaseMap; } /** * Transform the Orphanet codes into HPO Frequency TermId's. *

* The frequency ids are *

*
  • Excluded (0%): Orphanet id 28440
  • *
  • Frequent (79-30%): Orphanet id: 28419
  • *
  • Obligate (100%): Orphanet id: 28405
  • *
  • Occasional (29-5%): Orphanet id: 28426
  • *
  • Very frequent (99-80%): Orphanet id 28412
  • *
  • Very rare : Orphanet id 28433
  • *
* * @param fstring An Orphanet id (attribute in XML file) corresponding to a frequency category * @return corresponding HPO Frequency TermId */ private TermId string2frequency(String fstring) throws PhenolRuntimeException { switch (fstring) { case "28405": return HpoFrequencyTermIds.OBLIGATE; case "28412": return HpoFrequencyTermIds.VERY_FREQUENT; case "28419": return HpoFrequencyTermIds.FREQUENT; case "28426": return HpoFrequencyTermIds.OCCASIONAL; case "28433": return HpoFrequencyTermIds.VERY_RARE; case "28440": return HpoFrequencyTermIds.EXCLUDED; } // the following should never happen, actually! throw new PhenolRuntimeException("[ERROR] Could not find TermId for Orphanet frequency {}. " + "This indicates a serious and unexpected error, please report to the developers" + fstring); } /** * This method performs the XML parse of the Orphanet file * * @throws XMLStreamException If there is an XML stream issue * @throws IOException If the file cannot be opened */ @SuppressWarnings("ConstantConditions") private void parse() throws XMLStreamException, IOException { XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); XMLEventReader xmlEventReader = xmlInputFactory.createXMLEventReader(new FileInputStream(orphanetXmlPath)); boolean inFrequency = false; boolean inDiagnosticCriterion = false; boolean inDisorderType = false; boolean inDisorderGroup = false; String currentHpoId = null; String currentHpoTermLabel = null; TermId currentFrequencyTermId = null; String currentOrphanumber = null; String currentDiseaseName = null; List currentAnnotationEntryList = new ArrayList<>(); while (xmlEventReader.hasNext()) { XMLEvent xmlEvent = xmlEventReader.nextEvent(); if (xmlEvent.isStartElement()) { StartElement startElement = xmlEvent.asStartElement(); String localName = startElement.getName().getLocalPart(); if (!allowableXmlNodeNames.contains(localName)) { throw new PhenolRuntimeException("Unexpected XML Node in Orphanet product_4 XML: " + localName); } switch 
(localName) { case DISORDER_TYPE: inDisorderType = true; break; case ORPHA_CODE: if (inFrequency || inDiagnosticCriterion) { continue; } // Orphanumbers are used for the Disorder but also for the Frequency nodes xmlEvent = xmlEventReader.nextEvent(); currentOrphanumber = xmlEvent.asCharacters().getData(); break; case NAME: if (inFrequency || inDiagnosticCriterion || inDisorderGroup || inDisorderType) { continue; } // skip, we have no need to parse the name of the frequency element // since we get the class from the attribute "id" xmlEvent = xmlEventReader.nextEvent(); currentDiseaseName = xmlEvent.asCharacters().getData(); break; case DISORDER_GROUP: inDisorderGroup = true; break; case HPO_ID: xmlEvent = xmlEventReader.nextEvent(); currentHpoId = xmlEvent.asCharacters().getData(); break; case HPO_TERM: xmlEvent = xmlEventReader.nextEvent(); currentHpoTermLabel = xmlEvent.asCharacters().getData(); break; case HPO_FREQUENCY: // if we are here, then we can grab the frequency from the id attribute. Attribute idAttr = startElement.getAttributeByName(new QName("id")); if (idAttr != null) { currentFrequencyTermId = string2frequency(idAttr.getValue()); } inFrequency = true; break; case DIAGNOSTIC_CRITERIA: inDiagnosticCriterion = true; break; case HPO: // no-op, no need to get the id attribute from this node break; case JDBOR: // no-op, no need to do anything for the very top level node break; default: // no-op, no need to do anything for many node types! 
break; } } else if (xmlEvent.isEndElement()) { EndElement endElement = xmlEvent.asEndElement(); String localPart = endElement.getName().getLocalPart(); switch (localPart) { case HPO_FREQUENCY: inFrequency = false; break; case DIAGNOSTIC_CRITERIA: inDiagnosticCriterion = false; break; case HPO_DISORDER_ASSOCIATION: try { HpoAnnotationEntry entry = HpoAnnotationEntry.fromOrphaData( String.format("ORPHA:%s", currentOrphanumber), currentDiseaseName, currentHpoId, currentHpoTermLabel, currentFrequencyTermId, ontology, orphanetBiocurationString, replaceObsoleteTermId); currentHpoId = null; currentHpoTermLabel = null; currentFrequencyTermId = null;// reset currentAnnotationEntryList.add(entry); } catch (HpoAnnotationModelException e) { logger.warn(String.format("Parse error for %s [ORPHA:%s] HPOid: %s (%s)", currentDiseaseName != null ? currentDiseaseName : "n/a", currentOrphanumber != null ? currentOrphanumber : "n/a", currentHpoId != null ? currentHpoId : "n/a", e.getMessage()) ); } break; case DISORDER_GROUP: inDisorderGroup = false; break; case DISORDER_TYPE: inDisorderType = false; break; case DISORDER: TermId orphaDiseaseId = TermId.of(String.format("ORPHA:%s", currentOrphanumber)); HpoAnnotationModel model = new HpoAnnotationModel(String.format("ORPHA:%s", currentOrphanumber), currentAnnotationEntryList); orphanetDiseaseMap.put(orphaDiseaseId, model); inDisorderType = false; currentOrphanumber = null; currentDiseaseName = null; currentAnnotationEntryList.clear(); } } } } /** * We are using this to supply a date created value for the Orphanet annotations. * After some research, no better way of getting the current date was found. * * @return A String such as 2018-02-22 */ private String getTodaysDate() { Date date = new Date(); return new SimpleDateFormat("yyyy-MM-dd").format(date); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy