org.opencb.cellbase.lib.builders.PharmGKBBuilder Maven / Gradle / Ivy
The newest version!
/*
* Copyright 2015-2020 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.cellbase.lib.builders;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.models.core.Xref;
import org.opencb.biodata.models.pharma.*;
import org.opencb.biodata.models.pharma.guideline.BasicObject;
import org.opencb.cellbase.core.serializer.CellBaseFileSerializer;
import org.opencb.commons.utils.FileUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
import static org.opencb.cellbase.lib.EtlCommons.*;
public class PharmGKBBuilder extends CellBaseBuilder {
private final Path inputDir;
private final Path pharmGKBDir;
private static final String CHEMICALS_BASENAME = "chemicals";
private static final String CHEMICALS_TSV_FILENAME = "chemicals.tsv";
private static final String VARIANTS_BASENAME = "variants";
private static final String VARIANTS_TSV_FILENAME = "variants.tsv";
private static final String GENES_BASENAME = "genes";
private static final String GENES_TSV_FILENAME = "genes.tsv";
private static final String CLINICAL_ANNOTATIONS_BASENAME = "clinicalAnnotations";
private static final String CLINICAL_ANNOTATIONS_TSV_FILENAME = "clinical_annotations.tsv";
private static final String CLINICAL_ANN_ALLELES_TSV_FILENAME = "clinical_ann_alleles.tsv";
private static final String CLINICAL_ANN_EVIDENCE_TSV_FILENAME = "clinical_ann_evidence.tsv";
private static final String VARIANT_ANNOTATIONS_BASENAME = "variantAnnotations";
private static final String VARIANT_ANNOTATIONS_TSV_FILENAME = "var_drug_ann.tsv";
private static final String PHENOTYPE_ANNOTATIONS_TSV_FILENAME = "var_pheno_ann.tsv";
private static final String FUNCTIONAL_ANNOTATIONS_TSV_FILENAME = "var_fa_ann.tsv";
private static final String STUDY_PARAMETERS_TSV_FILENAME = "study_parameters.tsv";
private static final String GUIDELINE_ANNOTATIONS_BASENAME = "guidelineAnnotations";
private static final String DRUG_LABELS_BASENAME = "drugLabels";
private static final String DRUG_LABELS_TSV_FILENAME = "drugLabels.tsv";
private static final String RELATIONSHIPS_BASENAME = "relationships";
private static final String RELATIONSHIPS_TSV_FILENAME = "relationships.tsv";
private static final String GUIDELINE_ANNOTATION_EVIDENCE_TYPE = "Guideline Annotation";
private static final String DRUG_LABEL_ANNOTATION_EVIDENCE_TYPE = "Label Annotation";
private static final String VARIANT_ANNOTATION_EVIDENCE_TYPE = "Variant Drug Annotation";
private static final String FUNCTIONAL_ANNOTATION_EVIDENCE_TYPE = "Variant Functional Assay Annotation";
private static final String PHENOTYPE_ANNOTATION_EVIDENCE_TYPE = "Variant Phenotype Annotation";
private static final String LOCATION_KEY = "location";
private static final String CHROMOSOME_KEY = "chrom";
private static final String POSITION_KEY = "pos";
private static final String GENE_ENTITY = "Gene";
private static final String CHEMICAL_ENTITY = "Chemical";
private static final String PHARMGKB_ID_KEY = "PHARMGKB_ID";
private static final String PHARMGKB_ASSOCIATION_TYPE_KEY = "PHARMGKB_ASSOCIATION_TYPE";
private static final String PHARMGKB_LEVEL_OVERRIDE_KEY = "PHARMGKB_LEVEL_OVERRIDE";
private static final String PHARMGKB_LEVEL_MODIFIERS_KEY = "PHARMGKB_LEVEL_MODIFIERS";
private static final String PHARMGKB_LAST_UPDATE_DATE_KEY = "PHARMGKB_LAST_UPDATE_DATE";
private static final String PHARMGKB_IS_VIP_KEY = "PHARMGKB_IS_VIP";
public PharmGKBBuilder(Path inputDir, CellBaseFileSerializer serializer) {
super(serializer);
this.inputDir = inputDir;
this.pharmGKBDir = inputDir.resolve(PHARMGKB_DATA);
}
@Override
public void parse() throws Exception {
// Check input folder
FileUtils.checkDirectory(inputDir);
// PharmGKB
FileUtils.checkDirectory(pharmGKBDir);
logger.info("Parsing {} files and building the data models...", PHARMGKB_NAME);
// Parse chemical file
Map chemicalsMap = parseChemicalFile();
// Parse clinical annotation files
parseClinicalAnnotationFiles(chemicalsMap);
// Parse gene file
parseGeneFile(chemicalsMap);
logger.info("Parsing {} files finished.", PHARMGKB_NAME);
// Generation the pharmacogenomics JSON file
logger.info("Writing {} JSON file to {} ...", PHARMACOGENOMICS_DATA, serializer.getOutdir());
int counter = 0;
for (Map.Entry entry : chemicalsMap.entrySet()) {
((CellBaseFileSerializer) serializer).serialize(entry.getValue(), PHARMACOGENOMICS_DATA);
if (++counter % 1000 == 0) {
logger.info("\t\t {} chemicals/drugs written.", counter);
}
}
serializer.close();
logger.info("Writing {} JSON file done!", PHARMACOGENOMICS_DATA);
}
private Map parseChemicalFile() throws IOException {
Path chemicalsFile = pharmGKBDir.resolve(CHEMICALS_BASENAME).resolve(CHEMICALS_TSV_FILENAME);
Map chemicalsMap = new HashMap<>();
try (BufferedReader br = FileUtils.newBufferedReader(chemicalsFile)) {
// Skip first line, i.e. the header line
String line = br.readLine();
while ((line = br.readLine()) != null) {
String[] fields = line.split("\t", -1);
// 0 1 2 3 4 5 6 7 8
// PharmGKB Accession ID Name Generic Names Trade Names Brand Mixtures Type Cross-references SMILES InChI
// 9 10 11 12 13 14
// Dosing Guideline External Vocabulary Clinical Annotation Count Variant Annotation Count Pathway Count VIP Count
// 15 16 17 18
// Dosing Guideline Sources Top Clinical Annotation Level Top FDA Label Testing Level Top Any Drug Label Testing Level
// 19 20 21 22 23
// Label Has Dosing Info Has Rx Annotation RxNorm Identifiers ATC Identifiers PubChem Compound Identifiers
PharmaChemical pharmaChemical = new PharmaChemical()
.setId(fields[0])
.setSource(PHARMGKB_NAME)
.setName(fields[1])
.setSmiles(fields[7])
.setInChI(fields[8]);
// Generic Names
if (StringUtils.isNotEmpty(fields[2])) {
pharmaChemical.setGenericNames(stringFieldToList(fields[2]));
}
// Trade Names
if (StringUtils.isNotEmpty(fields[3])) {
pharmaChemical.setTradeNames(stringFieldToList(fields[3]));
}
// Brand Mixtures
if (StringUtils.isNotEmpty(fields[4])) {
pharmaChemical.setTradeMixtures(stringFieldToList(fields[4]));
}
// Types
if (StringUtils.isNotEmpty(fields[5])) {
pharmaChemical.setTypes(Arrays.stream(fields[5].split(",")).map(String::trim).collect(Collectors.toList()));
}
// We need to keep the name not the ID to map by drug name in the clinical annotation method
chemicalsMap.put(pharmaChemical.getName(), pharmaChemical);
}
}
logger.info("Number of Chemical items read {}", chemicalsMap.size());
return chemicalsMap;
}
/**
* This method parses clinical_annotations.tsv, then it parses alleles and evidences to add them to the first one.
* @param chemicalsMap
* @throws IOException
*/
private void parseClinicalAnnotationFiles(Map chemicalsMap) throws IOException {
Map variantAnnotationMap = new HashMap<>();
Map> drugToVariantAnnotationIdMap = new HashMap<>();
Map> variantMap = parseVariantFile();
// clinical_annotations.tsv
try (BufferedReader br = FileUtils.newBufferedReader(pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME)
.resolve(CLINICAL_ANNOTATIONS_TSV_FILENAME))) {
// Skip first line, i.e. the header line
String line = br.readLine();
while ((line = br.readLine()) != null) {
String[] fields = line.split("\t", -1);
// Sanity check
if (StringUtils.isEmpty(fields[0])) {
logger.warn("Clinical annotation ID is missing in clinical annotations line: {}", line);
continue;
}
// 0 1 2 3 4 5 6
// Clinical Annotation ID Variant/Haplotypes Gene Level of Evidence Level Override Level Modifiers Score
// 7 8 9 10 11 12 13
// Phenotype Category PMID Count Evidence Count Drug(s) Phenotype(s) Latest History Date (YYYY-MM-DD) URL
// 14
// Specialty Population
PharmaVariantAnnotation pharmaVariantAnnotation = new PharmaVariantAnnotation()
.setConfidence(fields[3])
.setScore(fields[6])
.setUrl(fields[13])
.setPopulation(fields[14]);
// Variant or haplotypes
if (StringUtils.isNotEmpty(fields[1])) {
if (isHaplotype(fields[1])) {
// Haplotype
pharmaVariantAnnotation.setHaplotypes(getHaplotypeList(fields[1]));
} else {
// Variant
pharmaVariantAnnotation.setVariantId(fields[1]);
}
}
// Genes
if (StringUtils.isNotEmpty(fields[2])) {
pharmaVariantAnnotation.setGeneNames(Arrays.asList(fields[2].split(";")));
}
if (StringUtils.isNotEmpty(fields[7])) {
pharmaVariantAnnotation.setPhenotypeTypes(Arrays.asList(fields[7].split(";")));
}
if (StringUtils.isNotEmpty(fields[11])) {
pharmaVariantAnnotation.setPhenotypes(Arrays.asList(fields[11].split(";")));
}
Map attributes = new HashMap<>();
attributes.put(PHARMGKB_ID_KEY, fields[0]);
attributes.put(PHARMGKB_LEVEL_OVERRIDE_KEY, fields[4]);
attributes.put(PHARMGKB_LEVEL_MODIFIERS_KEY, fields[5]);
attributes.put(PHARMGKB_LAST_UPDATE_DATE_KEY, fields[12]);
pharmaVariantAnnotation.setAttributes(attributes);
// Add some fields from the variant map
if (variantMap.containsKey(pharmaVariantAnnotation.getVariantId())) {
pharmaVariantAnnotation.setLocation((String) variantMap.get(pharmaVariantAnnotation.getVariantId()).get(LOCATION_KEY));
pharmaVariantAnnotation
.setChromosome((String) variantMap.get(pharmaVariantAnnotation.getVariantId()).get(CHROMOSOME_KEY));
pharmaVariantAnnotation.setPosition((int) variantMap.get(pharmaVariantAnnotation.getVariantId()).get(POSITION_KEY));
} else {
logger.warn("Variant {} from clinical annotation not found in the variant map, so chromosome and position are not set",
pharmaVariantAnnotation.getVariantId());
}
// Add the annotation to the annotationMap by annotation ID
variantAnnotationMap.put(fields[0], pharmaVariantAnnotation);
// Process the drug names to update the drugToClinicalAnnotationId map
// This will be used at the end of the method to update the chemical map
if (StringUtils.isNotEmpty(fields[10])) {
// Drugs are separated by semicolon
String[] drugs = fields[10].split(";");
for (String drug : drugs) {
if (!drugToVariantAnnotationIdMap.containsKey(drug)) {
// Add the drug to the map
drugToVariantAnnotationIdMap.put(drug, new ArrayList<>());
}
// Add the clinical annotation ID to that drug
drugToVariantAnnotationIdMap.get(drug).add(fields[0]);
}
}
}
}
// Update the clinical annotation map by parsing the clinical annotation evidences
parseClinicalAnnotationEvidenceFile(variantAnnotationMap);
// Update the clinical annotation map by parsing the clinical annotation alleles
parseClinicalAnnotationAlleleFile(variantAnnotationMap);
// Update chemicals map by adding the clinical annotation
for (Map.Entry> entry : drugToVariantAnnotationIdMap.entrySet()) {
if (chemicalsMap.containsKey(entry.getKey())) {
for (String variantAnnotationId : entry.getValue()) {
chemicalsMap.get(entry.getKey()).getVariants().add(variantAnnotationMap.get(variantAnnotationId));
}
} else {
logger.warn("Drug '{}' not found in the chemicals map", entry.getKey());
}
}
}
private Map> parseVariantFile() throws IOException {
Map> variantMap = new HashMap<>();
// Parse the variant file (i.e., variants.tsv)
Path varPath = pharmGKBDir.resolve(VARIANTS_BASENAME).resolve(VARIANTS_TSV_FILENAME);
try (BufferedReader br = FileUtils.newBufferedReader(varPath)) {
// Skip first line, i.e. the header line
String line = br.readLine();
while ((line = br.readLine()) != null) {
String[] fields = line.split("\t", -1);
String variantName = fields[1];
// Sanity check
if (StringUtils.isEmpty(variantName)) {
logger.warn("Variant name is missing in variant line: {}", line);
continue;
}
if (variantMap.containsKey(variantName)) {
logger.warn("Variant name is duplicated in variant line: {}", line);
continue;
}
// 0 1 2 3 4 5 6
// Variant ID Variant Name Gene IDs Gene Symbols Location Variant Annotation count Clinical Annotation count
// 7 8 9 10
// Level 1/2 Clinical Annotation count Guideline Annotation count Label Annotation count Synonyms
String location = fields[4];
if (StringUtils.isEmpty(location)) {
logger.warn("Location is missing for Variant name {}", variantName);
continue;
}
if (!location.startsWith("NC_")) {
logger.warn("Unknown location {}, it has to be a RefSeq ID", location);
continue;
}
Map attrMap = new HashMap<>();
String[] splits = location.split("[_\\.:]");
try {
int chrom = Integer.parseInt(splits[1]);
if (chrom >= 1 && chrom <= 22) {
attrMap.put(CHROMOSOME_KEY, String.valueOf(chrom));
} else if (chrom == 23) {
attrMap.put(CHROMOSOME_KEY, "X");
} else if (chrom == 24) {
attrMap.put(CHROMOSOME_KEY, "Y");
} else if (chrom == 12920) {
attrMap.put(CHROMOSOME_KEY, "MT");
} else {
logger.warn("Unknown chromosome {}", chrom);
continue;
}
} catch (NumberFormatException e) {
logger.warn("Error computing chromosome from location {}: {}", location, e.getMessage());
continue;
}
try {
int position = Integer.parseInt(splits[3]);
attrMap.put(POSITION_KEY, position);
} catch (NumberFormatException e) {
logger.warn("Error computing chromosome position from location {}: {}", location, e.getMessage());
continue;
}
attrMap.put(LOCATION_KEY, attrMap.get(CHROMOSOME_KEY) + ":" + attrMap.get(POSITION_KEY));
// Add it to the variant map
variantMap.put(variantName, attrMap);
}
}
logger.info("Number of variants = {}", variantMap.size());
return variantMap;
}
private void parseClinicalAnnotationEvidenceFile(Map variantAnnotationMap) throws IOException {
// For CellBase, variant annotation correponds to the PharmGKB clinical annotation
// Processing clinical annotation evidences implies to process the variant annotation, guideline annotations,
// drug label annotations, phenotype annotations and functional analysis annotations
Map variantAssociationMap = new HashMap<>();
Map guidelineAnnotationsMap = parseGuidelineAnnotationFiles();
Map drugLabelAnnotationsMap = parseDrugLabelAnnotationFile();
// Parse study parameters and update the variant, phenotype and functional annotations with the parsed study parameters
parseVariantAnnotationFile(variantAssociationMap);
parsePhenotypeAnnotationFile(variantAssociationMap);
parseFunctionalAnnotationFile(variantAssociationMap);
parseStudyParameterFile(variantAssociationMap);
// Parse the clinical annotation alleles file (i.e., clinical_ann_alleles.tsv)
Path evidencesPath = pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANN_EVIDENCE_TSV_FILENAME);
try (BufferedReader br = FileUtils.newBufferedReader(evidencesPath)) {
// Skip first line, i.e. the header line
String line = br.readLine();
while ((line = br.readLine()) != null) {
String[] fields = line.split("\t", -1);
String clinicalAnnotationId = fields[0];
// Sanity check
if (StringUtils.isEmpty(clinicalAnnotationId)) {
logger.warn("Clinical annotation ID is missing in clinical annotation evidence line: {}", line);
continue;
}
// 0 1 2 3 4 5 6
// Clinical Annotation ID Evidence ID Evidence Type Evidence URL PMID Summary Score
String evidenceId = fields[1];
String evidenceType = fields[2];
PharmaClinicalEvidence evidence = new PharmaClinicalEvidence()
.setType(evidenceType)
.setUrl(fields[3])
.setPubmed(fields[4])
.setSummary(fields[5])
.setScore(fields[6]);
switch (evidenceType) {
case VARIANT_ANNOTATION_EVIDENCE_TYPE:
case PHENOTYPE_ANNOTATION_EVIDENCE_TYPE:
case FUNCTIONAL_ANNOTATION_EVIDENCE_TYPE: {
if (variantAssociationMap.containsKey(evidenceId)) {
evidence.getVariantAssociations().add(variantAssociationMap.get(evidenceId));
} else {
logger.warn("Evidence ID '{}' of type '{}' not found in the variant association map", evidenceId, evidenceType);
}
break;
}
case GUIDELINE_ANNOTATION_EVIDENCE_TYPE: {
if (guidelineAnnotationsMap.containsKey(evidenceId)) {
evidence.getGuidelineAnnotations().add(guidelineAnnotationsMap.get(evidenceId));
} else {
logger.warn("Evidence ID '{}' of type '{}' not found in the variant annotations map",
evidenceId, evidenceType);
}
break;
}
case DRUG_LABEL_ANNOTATION_EVIDENCE_TYPE: {
if (drugLabelAnnotationsMap.containsKey(evidenceId)) {
evidence.getDrugLabelAnnotations().add(drugLabelAnnotationsMap.get(evidenceId));
} else {
logger.warn("Evidence ID '{}' of type '{}' not found in the drug label annotations map",
evidenceId, evidenceType);
}
break;
}
default: {
logger.warn("Unknown evidence type '{}': this evidence is skipped. Valid evidence types are: {}",
evidenceType,
StringUtils.join(
Arrays.asList(VARIANT_ANNOTATION_EVIDENCE_TYPE, GUIDELINE_ANNOTATION_EVIDENCE_TYPE,
DRUG_LABEL_ANNOTATION_EVIDENCE_TYPE, FUNCTIONAL_ANNOTATION_EVIDENCE_TYPE,
PHENOTYPE_ANNOTATION_EVIDENCE_TYPE), ","));
break;
}
}
Map attributes = new HashMap<>();
attributes.put(PHARMGKB_ID_KEY, fields[0]);
evidence.setAttributes(attributes);
// Add evidence to clinical annotation
if (variantAnnotationMap.containsKey(clinicalAnnotationId)) {
variantAnnotationMap.get(clinicalAnnotationId).getEvidences().add(evidence);
} else {
logger.warn("Clinical annotation ID {} from clinical annotation evidence not found in clinical annotations",
clinicalAnnotationId);
}
}
}
}
private void parseClinicalAnnotationAlleleFile(Map variantAnnotationMap) throws IOException {
// Parse the clinical annotation alleles file (i.e., clinical_ann_alleles.tsv)
Path allelesPath = pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANN_ALLELES_TSV_FILENAME);
try (BufferedReader br = FileUtils.newBufferedReader(allelesPath)) {
// Skip first line, i.e. the header line
String line = br.readLine();
while ((line = br.readLine()) != null) {
String[] fields = line.split("\t", -1);
// For CellBase, variant annotation is equivalent to PharmGKB clinical annotation
String variantAnnotationId = fields[0];
// Sanity check
if (StringUtils.isEmpty(variantAnnotationId)) {
logger.warn("Clinical annotation ID is missing in clinical annotation alleles line: {}", line);
continue;
}
// 0 1 2 3
// Clinical Annotation ID Genotype/Allele Annotation Text Allele Function
PharmaClinicalAllele clinicalAllele = new PharmaClinicalAllele()
.setAllele(fields[1])
.setAnnotation(fields[2])
.setDescription(fields[3]);
Map attributes = new HashMap<>();
attributes.put(PHARMGKB_ID_KEY, variantAnnotationId);
clinicalAllele.setAttributes(attributes);
// Add allele to clinical annotation
if (variantAnnotationMap.containsKey(variantAnnotationId)) {
variantAnnotationMap.get(variantAnnotationId).getAlleles().add(clinicalAllele);
} else {
logger.warn("Clinical annotation ID {} from clinical annotation alleles file not found in the clinical annotations map",
variantAnnotationId);
}
}
}
}
private void parseVariantAnnotationFile(Map variantAssociationMap) throws IOException {
// For CellBase, variant association corresponds to PharmGKB variant annotation
// Parse the variant annotation file (i.e., var_drug_ann.tsv)
Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(VARIANT_ANNOTATIONS_TSV_FILENAME);
int counter = 0;
try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) {
// Skip first line, i.e. the header line
String line = br.readLine();
while ((line = br.readLine()) != null) {
String[] fields = line.split("\t", -1);
String variantAnnotationId = fields[0];
// Sanity check
if (StringUtils.isEmpty(variantAnnotationId)) {
logger.warn("Variant annotation ID is missing in variant annotations line: {}", line);
continue;
}
// 0 1 2 3 4 5
// Variant Annotation ID Variant/Haplotypes Gene Drug(s) PMID Phenotype Category
// 6 7 8 9 10
// Significance Notes Sentence Alleles Specialty Population
PharmaVariantAssociation variantAssociation = new PharmaVariantAssociation()
.setGeneName(fields[2])
.setPubmed(fields[4])
.setPhenotypeType(fields[5])
.setSignificance(fields[6])
.setDiscussion(fields[7])
.setDescription(fields[8])
.setAlleles(fields[9])
.setPopulation(fields[10]);
// Variant or haplotypes
if (StringUtils.isNotEmpty(fields[1])) {
if (isHaplotype(fields[1])) {
// Haplotype
variantAssociation.setHaplotypes(getHaplotypeList(fields[1]));
} else {
// Variant
variantAssociation.setVariantId(fields[1]);
}
}
Map attributes = new HashMap<>();
attributes.put(PHARMGKB_ID_KEY, fields[0]);
attributes.put(PHARMGKB_ASSOCIATION_TYPE_KEY, VARIANT_ANNOTATION_EVIDENCE_TYPE);
variantAssociation.setAttributes(attributes);
if (StringUtils.isNotEmpty(fields[3])) {
variantAssociation.setDrugs(stringFieldToList(fields[3]));
}
// Add the annotation to the variantAnnotationMap by variant and gene
variantAssociationMap.put(variantAnnotationId, variantAssociation);
counter++;
}
}
logger.info("Number of variant annotations = {}", counter);
}
private Map parseGuidelineAnnotationFiles() throws IOException {
Map guidelineAnnotationMap = new HashMap<>();
ObjectMapper mapper = new ObjectMapper();
ObjectReader objectReader = mapper.readerFor(PharmaGuidelineAnnotation.class);
// Parse the guideline annotations JSON files
Path guidelinesPath = pharmGKBDir.resolve(GUIDELINE_ANNOTATIONS_BASENAME);
FileUtils.checkDirectory(guidelinesPath);
for (File file : Objects.requireNonNull(guidelinesPath.toFile().listFiles())) {
if (file.getName().endsWith("json")) {
PharmaGuidelineAnnotation guidelineAnnotation = objectReader.readValue(file);
if (guidelineAnnotation.getGuideline() != null
&& StringUtils.isEmpty(guidelineAnnotation.getGuideline().getId())) {
logger.warn("Guideline ID is missing for guideline filename: {}", file.getName());
continue;
}
// Add the guideline annotation to the map by guideline ID (= Evidence ID)
guidelineAnnotationMap.put(guidelineAnnotation.getGuideline().getId(), guidelineAnnotation);
}
}
logger.info("Number of guideline annotations = {}", guidelineAnnotationMap.size());
return guidelineAnnotationMap;
}
private Map parseDrugLabelAnnotationFile() throws IOException {
Map drugLabelAnnotationMap = new HashMap<>();
// Parse the drug labels annotations file (i.e., drugLabels.tsv)
Path drugLabelPath = pharmGKBDir.resolve(DRUG_LABELS_BASENAME).resolve(DRUG_LABELS_TSV_FILENAME);
try (BufferedReader br = FileUtils.newBufferedReader(drugLabelPath)) {
// Skip first line, i.e. the header line
String line = br.readLine();
while ((line = br.readLine()) != null) {
String[] fields = line.split("\t", -1);
String drugLabelId = fields[0];
// Sanity check
if (StringUtils.isEmpty(drugLabelId)) {
logger.warn("PharmGKB ID is missing in drug label line: {}", line);
continue;
}
// 0 1 2 3 4 5 6 7
// PharmGKB ID Name Source Biomarker Flag Testing Level Has Prescribing Info Has Dosing Info Has Alternate Drug
// 8 9 10 11 12 13
// Cancer Genome Prescribing Chemicals Genes Variants/Haplotypes Latest History Date (YYYY-MM-DD)
PharmaDrugLabelAnnotation labelAnnotation = new PharmaDrugLabelAnnotation()
.setName(fields[1])
.setSource(fields[2])
.setBiomarkerFlag(fields[3])
.setTestingLevel(fields[4])
.setPrescribingInfo(fields[5])
.setDosingInfo(fields[6])
.setAlternateDrug(fields[7])
.setCancerGenome(fields[8]);
Map attributes = new HashMap<>();
attributes.put(PHARMGKB_ID_KEY, drugLabelId);
labelAnnotation.setAttributes(attributes);
// Add the drug label annotation to the map by ParhmGKB (= Evidence ID)
drugLabelAnnotationMap.put(drugLabelId, labelAnnotation);
}
}
logger.info("Number of drug label annotations = {}", drugLabelAnnotationMap.size());
return drugLabelAnnotationMap;
}
private void parsePhenotypeAnnotationFile(Map variantAssociationMap) throws IOException {
// Parse the variant annotation file (i.e., var_pheno_ann.tsv)
Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(PHENOTYPE_ANNOTATIONS_TSV_FILENAME);
int counter = 0;
try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) {
// Skip first line, i.e. the header line
String line = br.readLine();
while ((line = br.readLine()) != null) {
String[] fields = line.split("\t", -1);
String variantAnnotationId = fields[0];
// Sanity check
if (StringUtils.isEmpty(variantAnnotationId)) {
logger.warn("Variant annotation ID is missing in phenotype annotations line: {}", line);
continue;
}
// 0 1 2 3 4 5 6 7 8
// Variant Annotation ID Variant/Haplotypes Gene Drug(s) PMID Phenotype Category Significance Notes Sentence
// 9 10 .....
// Alleles Specialty Population .....
PharmaVariantAssociation variantAssociation = new PharmaVariantAssociation()
.setGeneName(fields[2])
.setPubmed(fields[4])
.setPhenotypeType(fields[5])
.setSignificance(fields[6])
.setDiscussion(fields[7])
.setDescription(fields[8])
.setAlleles(fields[9])
.setPopulation(fields[10]);
// Variant or haplotypes
if (StringUtils.isNotEmpty(fields[1])) {
if (isHaplotype(fields[1])) {
// Haplotype
variantAssociation.setHaplotypes(getHaplotypeList(fields[1]));
} else {
// Variant
variantAssociation.setVariantId(fields[1]);
}
}
Map attributes = new HashMap<>();
attributes.put(PHARMGKB_ID_KEY, variantAnnotationId);
attributes.put(PHARMGKB_ASSOCIATION_TYPE_KEY, PHENOTYPE_ANNOTATION_EVIDENCE_TYPE);
variantAssociation.setAttributes(attributes);
if (StringUtils.isNotEmpty(fields[3])) {
variantAssociation.setDrugs(stringFieldToList(fields[3]));
}
// Add the annotation to the variantAnnotationMap by variant and gene
variantAssociationMap.put(variantAnnotationId, variantAssociation);
counter++;
}
}
logger.info("Number of phenotype annotations = {}", counter);
}
private void parseFunctionalAnnotationFile(Map variantAssociationMap) throws IOException {
// Parse the variant annotation file (i.e., var_fa_ann.tsv)
Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(FUNCTIONAL_ANNOTATIONS_TSV_FILENAME);
int counter = 0;
try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) {
// Skip first line, i.e. the header line
String line = br.readLine();
while ((line = br.readLine()) != null) {
String[] fields = line.split("\t", -1);
String variantAnnotationId = fields[0];
// Sanity check
if (StringUtils.isEmpty(variantAnnotationId)) {
logger.warn("Variant annotation ID is missing in variant annotations line: {}", line);
continue;
}
// 0 1 2 3 4 5
// Variant Annotation ID Variant/Haplotypes Gene Drug(s) PMID Phenotype Category
// 6 7 8 9 10 11 .....
// Significance Notes Sentence Alleles Specialty Population Assay type .....
PharmaVariantAssociation variantAssociation = new PharmaVariantAssociation()
.setGeneName(fields[2])
.setPubmed(fields[4])
.setPhenotypeType(fields[5])
.setSignificance(fields[6])
.setDiscussion(fields[7])
.setDescription(fields[8])
.setAlleles(fields[9])
.setPopulation(fields[10])
.setAssayType(fields[11]);
// Variant or haplotypes
if (StringUtils.isNotEmpty(fields[1])) {
if (isHaplotype(fields[1])) {
// Haplotype
variantAssociation.setHaplotypes(getHaplotypeList(fields[1]));
} else {
// Variant
variantAssociation.setVariantId(fields[1]);
}
}
Map attributes = new HashMap<>();
attributes.put(PHARMGKB_ID_KEY, variantAnnotationId);
attributes.put(PHARMGKB_ASSOCIATION_TYPE_KEY, FUNCTIONAL_ANNOTATION_EVIDENCE_TYPE);
variantAssociation.setAttributes(attributes);
if (StringUtils.isNotEmpty(fields[3])) {
variantAssociation.setDrugs(stringFieldToList(fields[3]));
}
// Add the annotation to the variantAnnotationMap by variant and gene
variantAssociationMap.put(variantAnnotationId, variantAssociation);
counter++;
}
}
logger.info("Number of variant annotations = {}", counter);
}
private void parseStudyParameterFile(Map variantAssociationMap) throws IOException {
Map> studyParametersMap = new HashMap<>();
// Parse the study parameters file (i.e., study_parameters.tsv)
Path studyParamsPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(STUDY_PARAMETERS_TSV_FILENAME);
try (BufferedReader br = FileUtils.newBufferedReader(studyParamsPath)) {
// Skip first line, i.e. the header line
String line = br.readLine();
while ((line = br.readLine()) != null) {
String[] fields = line.split("\t", -1);
String variantAnnotationId = fields[1];
// Sanity check
if (StringUtils.isEmpty(variantAnnotationId)) {
logger.warn("Variant annotation ID is missing in study parameters line: {}", line);
continue;
}
// 0 1 2 3 4 5
// Study Parameters ID Variant Annotation ID Study Type Study Cases Study Controls Characteristics
// 6 7 8 9
// Characteristics Type Frequency In Cases Allele Of Frequency In Cases Frequency In Controls
// 10 11 12 13 14 15
// Allele Of Frequency In Controls P Value Ratio Stat Type Ratio Stat Confidence Interval Start Confidence Interval Stop
// 16
// Biogeographical Groups
PharmaStudyParameters studyParams = new PharmaStudyParameters()
.setStudyType(fields[2])
.setStudyCases(fields[3])
.setStudyControls(fields[4])
.setCharacteristics(fields[5])
.setCharacteristicsType(fields[6])
.setFrequencyInCases(fields[7])
.setAlleleOfFrequencyInCases(fields[8])
.setFrequencyInControls(fields[9])
.setAlleleOfFrequencyInControls(fields[10])
.setpValue(fields[11])
.setRatioStatType(fields[12])
.setRatioStat(fields[13])
.setConfidenceIntervalStart(fields[14])
.setConfidenceIntervalStop(fields[15])
.setBiogeographicalGroups(fields[16]);
Map attributes = new HashMap<>();
attributes.put(PHARMGKB_ID_KEY, variantAnnotationId);
studyParams.setAttributes(attributes);
// Add the study parameters map
if (!studyParametersMap.containsKey(variantAnnotationId)) {
studyParametersMap.put(variantAnnotationId, new ArrayList<>());
}
studyParametersMap.get(variantAnnotationId).add(studyParams);
}
}
logger.info("Number of study parameters lines = {}", studyParametersMap.size());
for (Map.Entry> entry : studyParametersMap.entrySet()) {
if (variantAssociationMap.containsKey(entry.getKey())) {
variantAssociationMap.get(entry.getKey()).setStudyParameters(entry.getValue());
} else {
logger.warn("Study parameters with variant annotation ID {} not found in variant association map", entry.getKey());
}
}
}
private void parseGeneFile(Map chemicalsMap) throws IOException {
// To relate genes with chemicals we will take the relationships from:
// 1. From guidelines (from the members 'relatedGenes' and 'relatedChemicals')
// 2. From the file relationships.tsv (from the relationship Gene - Chemical)
// Create the PharmGKB gene ID map by chemical name
Map> pgkbGeneIdMapByChemicalName = new HashMap<>();
// Create and populate guideline annotations map by PharmGKB gene ID
List guidelineAnnotations = new ArrayList<>(parseGuidelineAnnotationFiles().values());
Map> guidelineAnnotationMapByPgkbGeneId = new HashMap<>();
for (PharmaGuidelineAnnotation guidelineAnnotation : guidelineAnnotations) {
if (guidelineAnnotation.getGuideline() != null
&& CollectionUtils.isNotEmpty(guidelineAnnotation.getGuideline().getRelatedGenes())) {
for (BasicObject relatedGene : guidelineAnnotation.getGuideline().getRelatedGenes()) {
if (StringUtils.isNotEmpty(relatedGene.getId())) {
String pgkbGeneId = relatedGene.getId();
if (StringUtils.isNotEmpty(pgkbGeneId)) {
// Populate the guideline annotation map by PharmGKB gene ID
if (!guidelineAnnotationMapByPgkbGeneId.containsKey(pgkbGeneId)) {
guidelineAnnotationMapByPgkbGeneId.put(pgkbGeneId, new ArrayList<>());
}
guidelineAnnotationMapByPgkbGeneId.get(pgkbGeneId).add(guidelineAnnotation);
// Populate the PharmGKB gene ID map by chemical names
if (CollectionUtils.isNotEmpty(guidelineAnnotation.getGuideline().getRelatedChemicals())) {
for (BasicObject relatedChemical : guidelineAnnotation.getGuideline().getRelatedChemicals()) {
String chemicalName = relatedChemical.getName();
if (StringUtils.isNotEmpty(chemicalName)) {
if (!pgkbGeneIdMapByChemicalName.containsKey(chemicalName)) {
pgkbGeneIdMapByChemicalName.put(chemicalName, new HashSet<>());
}
pgkbGeneIdMapByChemicalName.get(chemicalName).add(pgkbGeneId);
}
}
}
}
}
}
}
}
// Parse the genes file (i.e., genes.tsv)
Map geneAnnotationMapByPgkbGeneId = new HashMap<>();
Path genesPath = pharmGKBDir.resolve(GENES_BASENAME).resolve(GENES_TSV_FILENAME);
try (BufferedReader br = FileUtils.newBufferedReader(genesPath)) {
// Skip first line, i.e. the header line
String line = br.readLine();
while ((line = br.readLine()) != null) {
String[] fields = line.split("\t", -1);
String pgkbGeneId = fields[0];
// Sanity check
if (StringUtils.isEmpty(pgkbGeneId)) {
logger.warn("PharmGKB accession ID is missing in genes file line: {}", line);
continue;
}
// 0 1 2 3 4 5 6 7 8
// PharmGKB Accession Id NCBI Gene ID HGNC ID Ensembl Id Name Symbol Alternate Names Alternate Symbols Is VIP
// 9 10 11 12 13
// Has Variant Annotation Cross-references Has CPIC Dosing Guideline Chromosome Chromosomal Start - GRCh37
// 14 15 16
// Chromosomal Stop - GRCh37 Chromosomal Start - GRCh38 Chromosomal Stop - GRCh38
PharmaGeneAnnotation geneAnnotation = new PharmaGeneAnnotation()
.setId(pgkbGeneId)
.setName(fields[4]);
List xrefs = new ArrayList<>();
if (StringUtils.isNotEmpty(fields[1])) {
xrefs.add(new Xref(fields[1], "NCBI", "NCBI gene ID"));
}
if (StringUtils.isNotEmpty(fields[2])) {
xrefs.add(new Xref(fields[2], "HGNC", "HGNC gene ID"));
}
if (StringUtils.isNotEmpty(fields[3])) {
xrefs.add(new Xref(fields[3], "Ensembl", "Ensembl gene ID"));
}
if (StringUtils.isNotEmpty(fields[5])) {
xrefs.add(new Xref(fields[5], "HGNC", "HGNC gene symbol"));
}
if (CollectionUtils.isNotEmpty(xrefs)) {
geneAnnotation.setXrefs(xrefs);
}
if (StringUtils.isNotEmpty(fields[9])) {
geneAnnotation.setHasVariantAnnotation(fields[9].toLowerCase(Locale.ROOT).equals("yes"));
}
// Set guidelines by getting them from the guideline annotations map
if (guidelineAnnotationMapByPgkbGeneId.containsKey(pgkbGeneId)) {
geneAnnotation.setGuidelineAnnotations(guidelineAnnotationMapByPgkbGeneId.get(pgkbGeneId));
}
Map attributes = new HashMap<>();
attributes.put(PHARMGKB_IS_VIP_KEY, fields[8]);
geneAnnotation.setAttributes(attributes);
// Add to the map
if (geneAnnotationMapByPgkbGeneId.containsKey(pgkbGeneId)) {
logger.warn("PharmGKB gene ID {} is duplicated in the PharmGKB file {}", pgkbGeneId, GENES_TSV_FILENAME);
} else {
geneAnnotationMapByPgkbGeneId.put(pgkbGeneId, geneAnnotation);
}
}
}
// Parse the chemical-gene relationships and update the PharmGKB gene ID map byh chemical name
// In addtion, updata the gene annotation map with additional fields (e.g., evidences, pubmeds...)
parseChemicalGeneRelationships(pgkbGeneIdMapByChemicalName, geneAnnotationMapByPgkbGeneId);
// Finally, update the chemical map with the gene annotation
for (Map.Entry entry : chemicalsMap.entrySet()) {
String chemicalName = entry.getKey();
if (pgkbGeneIdMapByChemicalName.containsKey(chemicalName)) {
for (String pgkbGeneId : pgkbGeneIdMapByChemicalName.get(chemicalName)) {
if (geneAnnotationMapByPgkbGeneId.containsKey(pgkbGeneId)) {
entry.getValue().getGenes().add(geneAnnotationMapByPgkbGeneId.get(pgkbGeneId));
}
}
}
}
logger.info("Number of parsed genes = {}", geneAnnotationMapByPgkbGeneId.size());
}
private void parseChemicalGeneRelationships(Map> pgkbGeneIdMapByChemicalName,
Map geneAnnotationMapByPgkbGeneId) throws IOException {
int counter = 0;
// Parse the genes file (i.e., relationships.tsv)
Path relationshipsPath = pharmGKBDir.resolve(RELATIONSHIPS_BASENAME).resolve(RELATIONSHIPS_TSV_FILENAME);
try (BufferedReader br = FileUtils.newBufferedReader(relationshipsPath)) {
// Skip first line, i.e. the header line
String line = br.readLine();
while ((line = br.readLine()) != null) {
String[] fields = line.split("\t", -1);
// 0 1 2 3 4 5 6 7 8 0 10
// Entity1_id Entity1_name Entity1_type Entity2_id Entity2_name Entity2_type Evidence Association PK PD PMIDs
String pgkbGeneId = fields[0];
String entity1Type = fields[2];
String chemicalName = fields[4];
String entity2Type = fields[5];
if (StringUtils.isNotEmpty(pgkbGeneId) && StringUtils.isNotEmpty(entity1Type) && StringUtils.isNotEmpty(chemicalName)
&& StringUtils.isNotEmpty(entity2Type) && entity1Type.equals(GENE_ENTITY) && entity2Type.equals(CHEMICAL_ENTITY)) {
if (!pgkbGeneIdMapByChemicalName.containsKey(chemicalName)) {
pgkbGeneIdMapByChemicalName.put(chemicalName, new HashSet<>());
}
pgkbGeneIdMapByChemicalName.get(chemicalName).add(pgkbGeneId);
// Update gene annotation map
if (geneAnnotationMapByPgkbGeneId.containsKey(pgkbGeneId)) {
if (StringUtils.isNotEmpty(fields[6])) {
geneAnnotationMapByPgkbGeneId.get(pgkbGeneId).setEvidences(Arrays.asList(fields[6].split(",", -1)));
}
if (StringUtils.isNotEmpty(fields[7])) {
geneAnnotationMapByPgkbGeneId.get(pgkbGeneId).setConfidence(fields[7]);
}
if (StringUtils.isNotEmpty(fields[10])) {
geneAnnotationMapByPgkbGeneId.get(pgkbGeneId).setPubmed(Arrays.asList(fields[10].split(";", -1)));
}
} else {
logger.warn("PhamGKB gene ID {} found in the file {} but not in the file {}", pgkbGeneId,
RELATIONSHIPS_TSV_FILENAME, GENES_TSV_FILENAME);
}
counter++;
}
}
}
logger.info("Number of parsed {}-{} relationships = {}", GENE_ENTITY, CHEMICAL_ENTITY, counter);
}
private List stringFieldToList(String field) {
if (field.startsWith("\"")) {
return Arrays
.stream(field.replace("\"\"\"", "\"").replace("\"\"", "\"").replace("\", \"", "\",\"").split("\",\""))
.map(s -> s.replace("\"", "").trim())
.collect(Collectors.toList());
} else {
if (field.contains(", ")) {
return Arrays
.stream(field.replace(", ", ",").split(","))
.map(String::trim)
.collect(Collectors.toList());
} else {
return Collections.singletonList(field);
}
}
}
private boolean isHaplotype(String value) {
return (!value.startsWith("rs") && value.contains("*"));
}
private List getHaplotypeList(String value) {
return Arrays.stream(value.split(",")).map(s -> s.trim()).collect(Collectors.toList());
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy