All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.monarchinitiative.phenol.annotations.hpo.HpoAnnotationFileParser Maven / Gradle / Ivy

There is a newer version: 2.1.2
Show newest version
package org.monarchinitiative.phenol.annotations.hpo;

import org.monarchinitiative.phenol.base.PhenolException;
import org.monarchinitiative.phenol.ontology.data.Ontology;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;

/**
 * Parse of a single HPO Annotation File into a {@link HpoAnnotationModel} object. The HPO project uses a single
 * tab-separated file with 14 fields (see {@link #expectedFields}) to store information about individual
 * diseases. Colloquially, we have called these files "small-files" to distinguish them from the
 * {@code phenotype.hpoa} file that is created by combining the information from all ca. 7000 small files
 * (and which colloquially we have called the "big-file").
 *
 * @author Peter Robinson
 * @deprecated to be removed in v3.0.0.
 * Created by peter on 2/05/2018.
 */
@Deprecated(forRemoval = true)
public class HpoAnnotationFileParser {
  private final static Logger logger = LoggerFactory.getLogger(OrphanetXML2HpoDiseaseModelParser.class);
  /**
   * A reference to the HPO Ontology object.
   */
  private final Ontology ontology;
  /**
   * Path to a file such as "OMIM-600123.tab" containing data about the phenotypes of a disease.
   */
  private final File hpoAnnotationFile;
  /**
   * The column names of the small file.
   */
  private static final String[] expectedFields = {
    "#diseaseID",
    "diseaseName",
    "phenotypeID",
    "phenotypeName",
    "onsetID",
    "onsetName",
    "frequency",
    "sex",
    "negation",
    "modifier",
    "description",
    "publication",
    "evidence",
    "biocuration"};
  /**
   * Number of tab-separated fields in a valid small file.
   */
  private static final int NUMBER_OF_FIELDS = expectedFields.length;
  /**
   * A list of all erroneous Small File lines encountered during parsing
   */
  private List parseErrors;

  public HpoAnnotationFileParser(File file, Ontology ontology) {
    this.hpoAnnotationFile = file;
    this.ontology = ontology;
  }
  /**
   * Set up parser for an individual HPO Annotation file ("small file") with verbosity false
   *
   * @param path     Path to the HPO annotation file
   * @param ontology reference to HPO Ontology object
   */
  public HpoAnnotationFileParser(String path, Ontology ontology) {
    this(new File(path), ontology);
  }


  /**
   * Parse a single HPO Annotation file. If {@code faultTolerant} is set to true, then we will parse as
   * much as we can of an annotation file and return the {@link HpoAnnotationModel} object, even if one or more
   * parse errors occured. Otherwise, an {@link HpoAnnotationModelException} will be thrown
   *
   * @param faultTolerant If true, report errors to STDERR but do not throw an exception
   * @return A {@link HpoAnnotationModel} object corresponding to the data in the HPO Annotation file
   * @throws HpoAnnotationModelException if faultTolerant is false, parse errors are not thrown, rather only IO exceptions are thrown
   */
  public HpoAnnotationModel parse(boolean faultTolerant) throws HpoAnnotationModelException {
    String basename = hpoAnnotationFile.getName();
    List entryList = new ArrayList<>();
    this.parseErrors = new ArrayList<>();
    try {
      BufferedReader br = new BufferedReader(new FileReader(hpoAnnotationFile));
      String line = br.readLine();
      qcHeaderLine(line);
      while ((line = br.readLine()) != null) {
        try {
          HpoAnnotationEntry entry = HpoAnnotationEntry.fromLine(line, ontology);
          entryList.add(entry);
        } catch (ObsoleteTermIdRuntimeException obsE) {
          // try to rescue obsolete termid!
          Optional entryOpt = HpoAnnotationEntry.fromLineReplaceObsoletePhenotypeData(line, ontology);
          entryOpt.ifPresent(entryList::add);
        } catch (PhenolException e) {
          parseErrors.add(String.format("%s:%s", hpoAnnotationFile, e.getMessage()));
        }
      }
      br.close();
      if (parseErrors.size() > 0) {
        String errstr = String.join("\n", parseErrors);
        if (faultTolerant) {
          logger.warn(String.format("Errors encountered while parsing HPO Annotation file at %s.\n%s",
            hpoAnnotationFile, errstr));
        } else {
          throw new HpoAnnotationModelException(String.format("Errors encountered while parsing HPO Annotation file at %s.\n%s",
            hpoAnnotationFile, errstr));
        }
      }
      return new HpoAnnotationModel(basename, entryList);
    } catch (IOException e) {
      throw new HpoAnnotationModelException(String.format("Error parsing %s: %s", hpoAnnotationFile, e.getMessage()));
    }
  }

  /**
   * Parse a single HPO Annotation file with the default setting of no fault-tolerance, i.e. if even a single parse
   * error is encountered, throw an {@link HpoAnnotationModelException}.
   *
   * @throws HpoAnnotationModelException if any parse error of IO problem is encountered.
   */
  public HpoAnnotationModel parse() throws HpoAnnotationModelException {
    return parse(false);
  }

  /**
   * Can be used with fault-tolerant parsing to determine if parse errors were encountered.
   *
   * @return true if one or more parse errors occured
   */
  public boolean hasErrors() {
    return parseErrors.size() > 0;
  }

  /**
   * @return A slit of strings describing all parse errors (can be empty but not null)
   */
  public List getParseErrors() {
    return parseErrors;
  }


  /**
   * This method checks that the nead has the expected number and order of lines.
   * If it doesn't, then a serious error has occured somewhere and it is better to
   * die and figure out what is wrong than to attempt error correction
   *
   * @param line a header line of a V2 small file
   * @throws HpoAnnotationModelException if the number of fields in the head is not equal to {@link #NUMBER_OF_FIELDS} or if a column header is incorrect
   */
  private void qcHeaderLine(String line) throws HpoAnnotationModelException {
    String[] fields = line.split("\t");
    if (fields.length != NUMBER_OF_FIELDS) {
      String msg = String.format("Malformed header line\n" + line +
        "\nExpecting %d fields but got %d", NUMBER_OF_FIELDS, fields.length);
      throw new HpoAnnotationModelException(msg);
    }
    for (int i = 0; i < fields.length; i++) {
      if (!fields[i].equals(expectedFields[i])) {
        throw new HpoAnnotationModelException(String.format("Malformed field %d. Expected %s but got %s",
          i, expectedFields[i], fields[i]));
      }
    }
    // if we get here, all is good
  }


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy