All downloads are free. The search and download features use the official Maven repository.

de.julielab.genemapper.resources.SnomedCtGeneProductDictionaryCreator Maven / Gradle / Ivy

Go to download

This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.

The newest version!
package de.julielab.genemapper.resources;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import de.julielab.java.utilities.FileUtilities;

import java.io.*;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

/**
 * Builds a tab-separated dictionary from a SNOMED CT release directory.
 * <p>
 * Every active description term is written on its own line followed by a label:
 * {@code GeneOrGeneProduct} if the term's concept is one of the configured root concepts or
 * reaches one of them through active Is-A relations, {@code NonGeneOrProduct} otherwise.
 * <p>
 * NOTE(review): rows are selected purely on their {@code active} column. If the input is a
 * release that contains several historical versions of the same component, superseded rows
 * that were active at some point would be included as well - confirm against the release
 * type actually used.
 */
public class SnomedCtGeneProductDictionaryCreator {

    /** Concept IDs that are themselves labelled as gene or gene product, as are all their descendants. */
    private static final Set<String> GENE_GENE_PRODUCT_CONCEPT_IDS = Set.of("67271001", "88878007");

    /** Relationship type ID identifying an Is-A relation. */
    private static final String IS_A_TYPE_ID = "116680003";

    /** Both input files are indexed up to column 7, so a usable row needs at least 8 columns. */
    private static final int MIN_COLUMNS = 8;

    /**
     * Command line entry point.
     *
     * @param args {@code args[0]}: SNOMED CT release directory containing a {@code Terminology}
     *             sub directory; {@code args[1]}: output dictionary file
     * @throws IOException              if an input cannot be found or reading/writing fails
     * @throws IllegalArgumentException if fewer than two arguments are given
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 2) {
            throw new IllegalArgumentException("Usage: " + SnomedCtGeneProductDictionaryCreator.class.getSimpleName()
                    + " <SNOMED CT release directory> <output dictionary file>");
        }
        File snomedCtFullPath = new File(args[0]);
        File outputPath = new File(args[1]);
        if (!snomedCtFullPath.exists()) {
            throw new FileNotFoundException(snomedCtFullPath.getAbsolutePath());
        }
        SnomedCtGeneProductDictionaryCreator creator = new SnomedCtGeneProductDictionaryCreator();
        creator.create(snomedCtFullPath, outputPath);
    }

    /**
     * Reads the relationship and description files below {@code <snomedCtFullPath>/Terminology}
     * and writes the labelled term dictionary.
     *
     * @param snomedCtFullPath SNOMED CT release directory
     * @param outputPath       file the dictionary is written to
     * @throws IOException if the Terminology directory or one of the required files is missing,
     *                     or if reading or writing fails
     */
    public void create(File snomedCtFullPath, File outputPath) throws IOException {
        File terminologyDirectory = new File(snomedCtFullPath.getAbsolutePath(), "Terminology");
        if (!terminologyDirectory.exists()) {
            throw new FileNotFoundException(terminologyDirectory.getAbsolutePath());
        }

        Multimap<String, String> isaRelations = readRelations(terminologyDirectory);
        Multimap<String, String> conceptId2terms = readConceptTerms(terminologyDirectory);

        writeDictionary(outputPath, isaRelations, conceptId2terms);
    }

    /**
     * Returns the first file in {@code terminologyDirectory} whose name contains {@code nameFragment}.
     *
     * @param terminologyDirectory directory to search
     * @param nameFragment         substring the file name must contain
     * @param fileDescription      human readable name of the file, used in the error message
     * @throws FileNotFoundException if the directory cannot be listed or no file matches
     */
    private File findTerminologyFile(File terminologyDirectory, String nameFragment, String fileDescription) throws FileNotFoundException {
        File[] candidates = terminologyDirectory.listFiles((dir, name) -> name.contains(nameFragment));
        // listFiles returns null when the path is not a directory or cannot be read.
        if (candidates == null || candidates.length == 0) {
            throw new FileNotFoundException("The " + fileDescription + " was not found in the Terminology directory at " + terminologyDirectory.getAbsolutePath());
        }
        return candidates[0];
    }

    /**
     * Reads the active Is-A relations from the {@code _Relationship_} file.
     *
     * @return multimap from column 4 to column 5 of each relation row; the ancestor lookup
     *         treats the key as the child concept and the values as its parents
     * @throws IOException if the file is missing or cannot be read
     */
    private Multimap<String, String> readRelations(File terminologyDirectory) throws IOException {
        File relationshipFile = findTerminologyFile(terminologyDirectory, "_Relationship_", "Relationship reference set");
        Multimap<String, String> isaRelations = HashMultimap.create();
        try (BufferedReader br = FileUtilities.getReaderFromFile(relationshipFile)) {
            br.lines()
                    // filter comments
                    .filter(line -> !line.startsWith("#"))
                    // filter header row
                    .filter(line -> !line.startsWith("id"))
                    .map(line -> line.split("\\t"))
                    // skip blank or truncated rows instead of failing on the column access below
                    .filter(split -> split.length >= MIN_COLUMNS)
                    // is active
                    .filter(split -> split[2].equals("1"))
                    // is an Is-A relation
                    .filter(split -> split[7].equals(IS_A_TYPE_ID))
                    .forEach(split -> isaRelations.put(split[4], split[5]));
        }
        return isaRelations;
    }

    /**
     * Reads the active description terms from the {@code _Description_} file.
     *
     * @return multimap from concept ID (column 4) to the terms (column 7) of that concept
     * @throws IOException if the file is missing or cannot be read
     */
    private Multimap<String, String> readConceptTerms(File terminologyDirectory) throws IOException {
        File descriptionFile = findTerminologyFile(terminologyDirectory, "_Description_", "Description file");
        Multimap<String, String> conceptId2terms = HashMultimap.create();
        try (BufferedReader br = FileUtilities.getReaderFromFile(descriptionFile)) {
            br.lines()
                    // filter comments
                    .filter(line -> !line.startsWith("#"))
                    // filter header row
                    .filter(line -> !line.startsWith("id"))
                    .map(line -> line.split("\\t"))
                    // skip blank or truncated rows instead of failing on the column access below
                    .filter(split -> split.length >= MIN_COLUMNS)
                    // is active
                    .filter(split -> split[2].equals("1"))
                    .forEach(split -> conceptId2terms.put(split[4], split[7]));
        }
        return conceptId2terms;
    }

    /**
     * Writes one {@code term<TAB>label} line per term of every concept.
     *
     * @param outputPath      file to write to
     * @param isaRelations    child to parent Is-A relations used to determine the label
     * @param conceptId2terms terms to write, grouped by concept ID
     * @throws IOException if writing fails
     */
    private void writeDictionary(File outputPath, Multimap<String, String> isaRelations, Multimap<String, String> conceptId2terms) throws IOException {
        try (BufferedWriter bw = FileUtilities.getWriterToFile(outputPath)) {
            for (String conceptId : conceptId2terms.keySet()) {
                Collection<String> terms = conceptId2terms.get(conceptId);
                // A fresh visited set per concept: each lookup is an independent traversal.
                boolean isGeneOrGeneProduct = isGeneOrGeneProduct(conceptId, isaRelations, new HashSet<>());
                String label = isGeneOrGeneProduct ? "GeneOrGeneProduct" : "NonGeneOrProduct";
                for (String term : terms) {
                    bw.write(term + "\t" + label);
                    bw.newLine();
                }
            }
        }
    }

    /**
     * Determines whether {@code conceptId} is one of the root concepts or has one of them as
     * a direct or transitive Is-A parent.
     *
     * @param conceptId    concept to classify
     * @param isaRelations child to parent Is-A relations
     * @param blacklist    concepts already visited during this traversal; guards against
     *                     revisiting shared ancestors and against cycles
     * @return {@code true} as soon as a root concept is reached, {@code false} if none is reachable
     */
    private boolean isGeneOrGeneProduct(String conceptId, Multimap<String, String> isaRelations, Set<String> blacklist) {
        if (GENE_GENE_PRODUCT_CONCEPT_IDS.contains(conceptId)) {
            return true;
        }
        blacklist.add(conceptId);
        for (String parentId : isaRelations.get(conceptId)) {
            if (GENE_GENE_PRODUCT_CONCEPT_IDS.contains(parentId)) {
                return true;
            }
            // Stop at the first hit; exploring the remaining ancestors cannot change the answer.
            if (blacklist.add(parentId) && isGeneOrGeneProduct(parentId, isaRelations, blacklist)) {
                return true;
            }
        }
        return false;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy