All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencb.cellbase.lib.builders.RefSeqGeneBuilderIndexer Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2015-2020 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.cellbase.lib.builders;

import org.apache.commons.lang.StringUtils;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.opencb.biodata.formats.io.FileFormatException;
import org.opencb.biodata.models.core.MirnaTarget;
import org.opencb.biodata.models.core.TargetGene;
import org.opencb.biodata.models.variant.avro.GeneDrugInteraction;
import org.opencb.biodata.models.variant.avro.GeneTraitAssociation;
import org.opencb.commons.utils.FileUtils;
import org.rocksdb.RocksDBException;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

public class RefSeqGeneBuilderIndexer extends GeneBuilderIndexer{

    public RefSeqGeneBuilderIndexer(Path refSeqDirectoryPath) {
        super(refSeqDirectoryPath);
    }

    public void index(Path maneFile, Path lrgFile, Path proteinFastaFile, Path cDnaFastaFile, Path geneDrugFile, Path hpoFilePath,
                      Path disgenetFile, Path miRTarBaseFile, Path cancerGeneGensus, Path cancerHotspot, Path tso500File,
                      Path eglhHaemOncFile) throws IOException, RocksDBException, FileFormatException {
        indexManeMapping(maneFile, "refseq");
        indexLrgMapping(lrgFile, "refseq");
        indexProteinSequences(proteinFastaFile);
        indexCdnaSequences(cDnaFastaFile);
        indexDrugs(geneDrugFile);
        indexDiseases(hpoFilePath, disgenetFile);
        indexMiRTarBase(miRTarBaseFile);
        indexCancerGeneCensus(cancerGeneGensus);
        indexCancerHotspot(cancerHotspot);
        indexTSO500(tso500File);
        indexEGLHHaemOnc(eglhHaemOncFile);
    }

    private void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException {
        if (geneDrugFile != null && Files.exists(geneDrugFile) && Files.size(geneDrugFile) > 0) {
            logger.info("Loading gene-drug interaction data from '{}'", geneDrugFile);
            BufferedReader br = FileUtils.newBufferedReader(geneDrugFile);

            // Skip header
            br.readLine();

            int lineCounter = 1;
            String line;
            String currentGene = "";
            List drugs = new ArrayList<>();
            while ((line = br.readLine()) != null) {
                String[] parts = line.split("\t");
                String geneName = parts[0];
                if (currentGene.equals("")) {
                    currentGene = geneName;
                } else if (!currentGene.equals(geneName)) {
                    rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs);
                    drugs = new ArrayList<>();
                    currentGene = geneName;
                }

                String source = null;
                if (parts.length >= 4) {
                    source = parts[3];
                }

                String interactionType = null;
                if (parts.length >= 5) {
                    interactionType = parts[4];
                }

                String drugName = null;
                if (parts.length >= 8) {
                    // if drug name column is empty, use drug claim name instead
                    drugName = StringUtils.isEmpty(parts[7]) ? parts[6] : parts[7];
                }
                if (StringUtils.isEmpty(drugName)) {
                    // no drug name
                    continue;
                }

                String chemblId = null;
                if (parts.length >= 9) {
                    chemblId = parts[8];
                }

                List publications = new ArrayList<>();
                if (parts.length >= 10 && parts[9] != null) {
                    publications = Arrays.asList(parts[9].split(","));
                }

                GeneDrugInteraction drug = new GeneDrugInteraction(
                        geneName, drugName, source, null, null, interactionType, chemblId, publications);
                drugs.add(drug);
                lineCounter++;
            }
            br.close();
            // update last gene
            rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs);
        } else {
            logger.warn("Gene drug file " + geneDrugFile + " not found");
            logger.warn("Ignoring " + geneDrugFile);
        }
    }

    public List getDrugs(String id) throws RocksDBException, IOException {
        String key = id + DRUGS_SUFFIX;
        return rocksDbManager.getDrugs(rocksdb, key);
    }

    private void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException {
        Map> geneDiseaseAssociationMap = new HashMap<>(50000);

        String line;
        if (hpoFilePath != null && hpoFilePath.toFile().exists() && Files.size(hpoFilePath) > 0) {
            BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath);
            // skip first header line
            bufferedReader.readLine();
            while ((line = bufferedReader.readLine()) != null) {
                String[] fields = line.split("\t");
                String omimId = fields[6];
                String geneSymbol = fields[3];
                String hpoId = fields[0];
                String diseaseName = fields[1];
                GeneTraitAssociation disease =
                        new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), "hpo");
                addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease);
            }
            bufferedReader.close();
        }

        if (disgenetFilePath != null && disgenetFilePath.toFile().exists() && Files.size(disgenetFilePath) > 0) {
            BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath);
            // skip first header line
            bufferedReader.readLine();
            while ((line = bufferedReader.readLine()) != null) {
                String[] fields = line.split("\t");
                String diseaseId = fields[4];
                String diseaseName = fields[5];
                String score = fields[9];
                String numberOfPubmeds = fields[13].trim();
                String numberOfSNPs = fields[14];
                String source = fields[15];
                GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score),
                        Integer.parseInt(numberOfPubmeds), Arrays.asList(numberOfSNPs), Arrays.asList(source), "disgenet");
                addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease);
            }
            bufferedReader.close();
        }

        for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) {
            rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue());
        }
    }

    public List getDiseases(String id) throws RocksDBException, IOException {
        String key = id + DISEASE_SUFFIX;
        return rocksDbManager.getDiseases(rocksdb, key);
    }

    private void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException {
        if (miRTarBaseFile != null && Files.exists(miRTarBaseFile) && Files.size(miRTarBaseFile) > 0) {
            logger.info("Loading mirna targets from '{}'", miRTarBaseFile);
            FileInputStream file = new FileInputStream(miRTarBaseFile.toFile());
            Workbook workbook = new XSSFWorkbook(file);
            Sheet sheet = workbook.getSheetAt(0);
            Iterator iterator = sheet.iterator();
            String currentMiRTarBaseId = null;
            String currentMiRNA = null;
            String currentGene = null;
            List targetGenes = new ArrayList();
            Map> geneToMirna = new HashMap();
            while (iterator.hasNext()) {

                Row currentRow = iterator.next();
                Iterator cellIterator = currentRow.iterator();

                Cell cell = cellIterator.next();
                String miRTarBaseId = cell.getStringCellValue();

                // skip header
                if (miRTarBaseId.startsWith("miRTarBase")) {
                    continue;
                }

                if (currentMiRTarBaseId == null) {
                    currentMiRTarBaseId = miRTarBaseId;
                }

                cell = cellIterator.next();
                String miRNA = cell.getStringCellValue();
                if (currentMiRNA == null) {
                    currentMiRNA = miRNA;
                }

                // species
                cellIterator.next();

                cell = cellIterator.next();
                String geneName = cell.getStringCellValue();
                if (currentGene == null) {
                    currentGene = geneName;
                }

                // entrez
                cellIterator.next();
                // species
                cellIterator.next();

                if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) {
                    // new entry, store current one
                    MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA,
                            targetGenes);
                    addValueToMapElement(geneToMirna, currentGene, miRnaTarget);
                    targetGenes = new ArrayList();
                    currentGene = geneName;
                    currentMiRTarBaseId = miRTarBaseId;
                    currentMiRNA = miRNA;
                }

                // experiment
                cell = cellIterator.next();
                String experiment = cell.getStringCellValue();

                // support type
                cell = cellIterator.next();
                String supportType = cell.getStringCellValue();

                // pubmeds
                cell = cellIterator.next();
                String pubmed = null;
                // seems to vary, so check both
                if (cell.getCellType().equals(CellType.NUMERIC)) {
                    pubmed = String.valueOf(cell.getNumericCellValue());
                } else {
                    pubmed = cell.getStringCellValue();
                }

                targetGenes.add(new TargetGene(experiment, supportType, pubmed));
            }

            // parse last entry
            MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA,
                    targetGenes);
            addValueToMapElement(geneToMirna, currentGene, miRnaTarget);

            for (Map.Entry> entry : geneToMirna.entrySet()) {
                rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue());
            }
        } else {
            logger.error("mirtarbase file not found");
        }
    }

    public List getMirnaTargets(String geneName) throws RocksDBException, IOException {
        String key = geneName + MIRTARBASE_SUFFIX;
        return rocksDbManager.getMirnaTargets(rocksdb, key);
    }

    private static  void addValueToMapElement(Map> map, String key, T value) {
        if (map.containsKey(key)) {
            map.get(key).add(value);
        } else {
            List valueList = new ArrayList<>();
            valueList.add(value);
            map.put(key, valueList);
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy