de.julielab.genemapper.resources.Synonym2IdsDictionaryGenerator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe
Gene Mapper.
The newest version!
package de.julielab.genemapper.resources;
import de.julielab.java.utilities.FileUtilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Creates a tab-separated two-column dictionary that maps each gene name (synonym) to all of its possible gene IDs.
 * <p>
 * The expected input is a tab-separated three-column file where the columns are gene name, gene ID and gene name
 * priority. The lines must be sorted by name because the entries of one name are collected from consecutive lines.
 * The priority is a coding scheme for the source of the gene name with respect to its ID: -1 denotes an official
 * symbol, for example. The priorities are specified in the _makeGeneDictionary.sh script.
 * <p>
 * Each output line has the form {@code name<TAB>geneId:taxId:priority|geneId:taxId:priority|...} where the items of
 * a name are ordered by ascending priority.
 */
public class Synonym2IdsDictionaryGenerator {
    private static final Logger log = LoggerFactory.getLogger(Synonym2IdsDictionaryGenerator.class);

    /**
     * Command line entry point.
     *
     * @param args three arguments in this order: the path to the sorted name/ID/priority file, the path to the file
     *             that provides the taxonomy ID for each gene ID, and the path of the dictionary file to write
     * @throws IllegalArgumentException if fewer than three arguments are given
     * @throws Exception                if reading the inputs or writing the output fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            throw new IllegalArgumentException("Usage: " + Synonym2IdsDictionaryGenerator.class.getSimpleName()
                    + " <name-id-priority file> <gene info file> <output file>");
        }
        String synsWithIdDictPath = args[0];
        String geneInfo = args[1];
        String outputPath = args[2];
        Map<String, String> id2taxMap = readEgTaxMap(Path.of(geneInfo));
        writeDictionary(synsWithIdDictPath, id2taxMap, outputPath);
    }

    /**
     * Reads the mapping from gene ID to taxonomy ID. The taxonomy ID is taken from the first and the gene ID from
     * the second tab-separated column of each line; any further columns are ignored.
     *
     * @param geneInfo the tab-separated file to read
     * @return a map from gene ID to taxonomy ID
     * @throws IOException                    if the file cannot be read
     * @throws IllegalStateException          if the same gene ID occurs on more than one line
     * @throws ArrayIndexOutOfBoundsException if a line has fewer than two columns
     */
    private static Map<String, String> readEgTaxMap(Path geneInfo) throws IOException {
        log.info("Reading file gene ID to taxonomy ID map from {}", geneInfo);
        try (BufferedReader br = FileUtilities.getReaderFromFile(geneInfo.toFile())) {
            // Split each line only once; the limit of 3 avoids splitting the unused remainder of the line.
            return br.lines()
                    .map(l -> l.split("\\t", 3))
                    .collect(Collectors.toMap(cols -> cols[1].intern(), cols -> cols[0].intern()));
        }
    }

    /**
     * Streams through the name-sorted input file, groups consecutive lines that share the same name and writes one
     * dictionary line per name.
     * <p>
     * Lines with fewer than three columns and lines whose gene ID has no known taxonomy ID are logged and skipped.
     *
     * @param synsWithIdDictPath path to the tab-separated name/ID/priority file, sorted by name
     * @param id2taxMap          map from gene ID to taxonomy ID
     * @param outputPath         path of the dictionary file to write
     * @throws IOException if reading or writing fails
     */
    private static void writeDictionary(String synsWithIdDictPath, Map<String, String> id2taxMap, String outputPath) throws IOException {
        String lastSyn = null;
        try (BufferedReader br = FileUtilities.getReaderFromFile(new File(synsWithIdDictPath));
             BufferedWriter bw = FileUtilities.getWriterToFile(new File(outputPath))) {
            // each list item is an array that lists the geneId, taxId and priority of its input line, in that order
            List<String[]> synIdAccumulator = new ArrayList<>();
            String line;
            while ((line = br.readLine()) != null) {
                String[] split = line.split("\\t");
                if (split.length < 3) {
                    // e.g. blank lines or lines with an empty trailing column; without this check they would abort the whole run
                    log.warn("Expected three tab-separated columns but found {} in line '{}'. Skipping this entry.", split.length, line);
                    continue;
                }
                String synonym = split[0];
                String id = split[1];
                String priority = split[2];
                String taxId = id2taxMap.get(id);
                if (taxId == null) {
                    log.warn("Could not retrieve the taxonomy ID for gene ID {} for synonym '{}'. Skipping this entry.", id, synonym);
                    continue;
                }
                // The input is sorted by name, so a name change means that the previous name is complete.
                if (lastSyn != null && !lastSyn.equals(synonym)) {
                    writeAccumulator(bw, synIdAccumulator, lastSyn);
                    synIdAccumulator.clear();
                }
                synIdAccumulator.add(new String[]{id, taxId, priority});
                lastSyn = synonym;
            }
            // for the last synonym
            writeAccumulator(bw, synIdAccumulator, lastSyn);
        }
    }

    /**
     * Writes the dictionary line for a single name. Nothing is written if the accumulator is empty.
     *
     * @param bw               the writer of the dictionary file
     * @param synIdAccumulator the {geneId, taxId, priority} arrays collected for the name; sorted in place by priority
     * @param synonym          the gene name the accumulated items belong to
     * @throws IOException           if writing fails
     * @throws NumberFormatException if a priority is not an integer
     */
    private static void writeAccumulator(BufferedWriter bw, List<String[]> synIdAccumulator, String synonym) throws IOException {
        if (!synIdAccumulator.isEmpty()) {
            // sort the ID items by priority
            synIdAccumulator.sort(Comparator.comparingInt(a -> Integer.parseInt(a[2])));
            // segregate the information about an ID by colons and the resulting items by pipes
            String output = synonym + "\t" + synIdAccumulator.stream().map(a -> String.join(":", a)).collect(Collectors.joining("|"));
            bw.write(output);
            bw.newLine();
        }
    }
}