de.julielab.genemapper.resources.SnomedCtGeneProductDictionaryCreator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe
Gene Mapper.
The newest version!
package de.julielab.genemapper.resources;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import de.julielab.java.utilities.FileUtilities;
import java.io.*;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
/**
 * Creates a two-column, tab-separated dictionary from the {@code Terminology} directory of a
 * SNOMED CT release: every active description term is written together with the label
 * {@code GeneOrGeneProduct} or {@code NonGeneOrProduct}.
 * <p>
 * A concept is labelled {@code GeneOrGeneProduct} when it is one of
 * {@link #GENE_GENE_PRODUCT_CONCEPT_IDS} or can reach one of them by following active is-a
 * relationships upwards.
 * </p>
 * <p>
 * NOTE(review): rows are selected only by their own {@code active} flag. If the input is a release
 * file that lists several versions of the same component, a row that was active at some point is
 * used even when a later row deactivates it - confirm which release type is meant to be passed in.
 * </p>
 */
public class SnomedCtGeneProductDictionaryCreator {
    /** Concepts that, together with everything below them in the is-a hierarchy, get the gene/gene product label. */
    private static final Set<String> GENE_GENE_PRODUCT_CONCEPT_IDS = Set.of("67271001", "88878007");
    /** Relationship type identifier that marks an is-a relation. */
    private static final String IS_A_TYPE_ID = "116680003";
    /** Value of the active column for rows that are in use. */
    private static final String ACTIVE = "1";

    // Zero-based column positions shared by the Relationship and the Description file.
    private static final int ACTIVE_COLUMN = 2;
    // Relationship file columns.
    private static final int REL_SOURCE_COLUMN = 4;
    private static final int REL_DESTINATION_COLUMN = 5;
    private static final int REL_TYPE_COLUMN = 7;
    // Description file columns.
    private static final int DESC_CONCEPT_COLUMN = 4;
    private static final int DESC_TERM_COLUMN = 7;

    /**
     * Command line entry point.
     *
     * @param args {@code args[0]}: SNOMED CT release directory that contains the {@code Terminology}
     *             sub directory; {@code args[1]}: dictionary file to write
     * @throws IOException              if the release directory or one of the required files is missing
     *                                  or reading/writing fails
     * @throws IllegalArgumentException if fewer than two arguments are given
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 2)
            throw new IllegalArgumentException("Usage: " + SnomedCtGeneProductDictionaryCreator.class.getSimpleName()
                    + " <SNOMED CT release directory> <output dictionary file>");
        File snomedCtFullPath = new File(args[0]);
        File outputPath = new File(args[1]);
        if (!snomedCtFullPath.exists())
            throw new FileNotFoundException(snomedCtFullPath.getAbsolutePath());
        SnomedCtGeneProductDictionaryCreator creator = new SnomedCtGeneProductDictionaryCreator();
        creator.create(snomedCtFullPath, outputPath);
    }

    /**
     * Reads the relationship and description files below {@code <snomedCtFullPath>/Terminology} and
     * writes the labelled term dictionary.
     *
     * @param snomedCtFullPath SNOMED CT release directory
     * @param outputPath       dictionary file to write
     * @throws IOException if the {@code Terminology} directory or one of its required files is
     *                     missing, or if reading/writing fails
     */
    public void create(File snomedCtFullPath, File outputPath) throws IOException {
        File terminologyDirectory = new File(snomedCtFullPath.getAbsolutePath(), "Terminology");
        if (!terminologyDirectory.exists())
            throw new FileNotFoundException(terminologyDirectory.getAbsolutePath());
        Multimap<String, String> isaRelations = readRelations(terminologyDirectory);
        Multimap<String, String> conceptId2terms = readConceptTerms(terminologyDirectory);
        writeDictionary(outputPath, isaRelations, conceptId2terms);
    }

    /**
     * Collects the active is-a relations of the first file whose name contains {@code _Relationship_}.
     *
     * @return map from a concept ID (column 4) to the IDs of its direct parents (column 5)
     */
    private Multimap<String, String> readRelations(File terminologyDirectory) throws IOException {
        File relationshipFile = findTerminologyFile(terminologyDirectory, "_Relationship_", "Relationship");
        Multimap<String, String> isaRelations = HashMultimap.create();
        try (BufferedReader br = FileUtilities.getReaderFromFile(relationshipFile)) {
            br.lines()
                    .filter(SnomedCtGeneProductDictionaryCreator::isDataLine)
                    .map(line -> line.split("\\t"))
                    // blank or truncated rows cannot be interpreted; skip them instead of failing on the index access
                    .filter(split -> split.length > REL_TYPE_COLUMN)
                    .filter(split -> ACTIVE.equals(split[ACTIVE_COLUMN]))
                    .filter(split -> IS_A_TYPE_ID.equals(split[REL_TYPE_COLUMN]))
                    .forEach(split -> isaRelations.put(split[REL_SOURCE_COLUMN], split[REL_DESTINATION_COLUMN]));
        }
        return isaRelations;
    }

    /**
     * Collects the active terms of the first file whose name contains {@code _Description_}.
     *
     * @return map from a concept ID (column 4) to its terms (column 7)
     */
    private Multimap<String, String> readConceptTerms(File terminologyDirectory) throws IOException {
        File descriptionFile = findTerminologyFile(terminologyDirectory, "_Description_", "Description");
        Multimap<String, String> conceptId2terms = HashMultimap.create();
        try (BufferedReader br = FileUtilities.getReaderFromFile(descriptionFile)) {
            br.lines()
                    .filter(SnomedCtGeneProductDictionaryCreator::isDataLine)
                    .map(line -> line.split("\\t"))
                    // blank or truncated rows cannot be interpreted; skip them instead of failing on the index access
                    .filter(split -> split.length > DESC_TERM_COLUMN)
                    .filter(split -> ACTIVE.equals(split[ACTIVE_COLUMN]))
                    .forEach(split -> conceptId2terms.put(split[DESC_CONCEPT_COLUMN], split[DESC_TERM_COLUMN]));
        }
        return conceptId2terms;
    }

    /**
     * Writes one line {@code term<TAB>label} for every term of every concept.
     */
    private void writeDictionary(File outputPath, Multimap<String, String> isaRelations, Multimap<String, String> conceptId2terms) throws IOException {
        try (BufferedWriter bw = FileUtilities.getWriterToFile(outputPath)) {
            for (String conceptId : conceptId2terms.keySet()) {
                // a fresh visited set per concept: it only guards the traversal that starts at this concept
                boolean isGeneOrGeneProduct = isGeneOrGeneProduct(conceptId, isaRelations, new HashSet<>());
                String label = isGeneOrGeneProduct ? "GeneOrGeneProduct" : "NonGeneOrProduct";
                for (String term : conceptId2terms.get(conceptId)) {
                    bw.write(term + "\t" + label);
                    bw.newLine();
                }
            }
        }
    }

    /**
     * Tells whether {@code conceptId} is one of {@link #GENE_GENE_PRODUCT_CONCEPT_IDS} or has one of
     * them among its direct or indirect parents.
     *
     * @param conceptId    concept to test
     * @param isaRelations map from a concept ID to the IDs of its direct parents
     * @param blacklist    concepts already explored during this traversal; modified by this method
     * @return {@code true} if a gene/gene product root is reachable from {@code conceptId}
     */
    private boolean isGeneOrGeneProduct(String conceptId, Multimap<String, String> isaRelations, Set<String> blacklist) {
        if (GENE_GENE_PRODUCT_CONCEPT_IDS.contains(conceptId))
            return true;
        // Already explored (or currently being explored, in case of a cycle): nothing new can be found
        // through this node. A previous hit would have ended the whole traversal, so false is correct.
        if (!blacklist.add(conceptId))
            return false;
        for (String parentId : isaRelations.get(conceptId)) {
            if (isGeneOrGeneProduct(parentId, isaRelations, blacklist))
                return true;
        }
        return false;
    }

    /**
     * Returns the first file in {@code terminologyDirectory} whose name contains {@code nameMarker}.
     * The order in which the directory is listed is not specified, so with several matching files
     * the choice is arbitrary.
     *
     * @param fileKind human-readable kind of the file, used in the error message
     * @throws FileNotFoundException if the directory cannot be listed or contains no matching file
     */
    private static File findTerminologyFile(File terminologyDirectory, String nameMarker, String fileKind) throws FileNotFoundException {
        File[] candidates = terminologyDirectory.listFiles((dir, name) -> name.contains(nameMarker));
        // listFiles returns null when the path is not a directory or an I/O error occurs
        if (candidates == null || candidates.length == 0)
            throw new FileNotFoundException("The " + fileKind + " file was not found in the Terminology directory at " + terminologyDirectory.getAbsolutePath());
        return candidates[0];
    }

    /** Tells whether the line is a data row, i.e. neither a {@code #} comment nor the header row starting with {@code id}. */
    private static boolean isDataLine(String line) {
        return !line.startsWith("#") && !line.startsWith("id");
    }
}