All downloads are free. Search and download functionality uses the official Maven repository.

de.julielab.genemapper.resources.Synonym2IdsDictionaryGenerator Maven / Gradle / Ivy

Go to download

This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.

The newest version!
package de.julielab.genemapper.resources;

import de.julielab.java.utilities.FileUtilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * 

* Create a tab-separated two-column dictionary of gene names and their possible IDs. *

*

The expected input is a tab-separated three-column file where the columns are gene name, gene Id and gene name priority. * The lines must be sorted by name. The priority * is a coding scheme for the source of the gene name with respect to its id: -1 denoted an official symbol, for example. * The priorities are specified in the _makeGeneDictionary.sh script.

*/ public class Synonym2IdsDictionaryGenerator { private final static Logger log = LoggerFactory.getLogger(Synonym2IdsDictionaryGenerator.class); public static void main(String[] args) throws Exception { String synsWithIdDictPath = args[0]; String geneInfo = args[1]; String outputPath = args[2]; Map id2taxMap = readEgTaxMap(Path.of(geneInfo)); writeDictionary(synsWithIdDictPath, id2taxMap, outputPath); } private static Map readEgTaxMap(Path geneInfo) throws IOException { Map id2tax; log.info("Reading file gene ID to taxonomy ID map from {}", geneInfo); try (BufferedReader br = FileUtilities.getReaderFromFile(geneInfo.toFile())) { id2tax = br.lines().collect( Collectors.toMap(l -> l.split("\\t", 3)[1].intern(), l -> l.split("\\t", 3)[0].intern())); } return id2tax; } private static void writeDictionary(String synsWithIdDictPath, Map id2taxMapPath, String outputPath) throws IOException { String lastSyn = null; try (BufferedReader br = FileUtilities.getReaderFromFile(new File(synsWithIdDictPath)); BufferedWriter bw = FileUtilities.getWriterToFile(new File(outputPath))) { String line; // each list item has an array that lists the geneId, taxId and priority of its input line, in that order List synIdAccumulator = new ArrayList<>(); while ((line = br.readLine()) != null) { String[] split = line.split("\\t"); String synonym = split[0]; String id = split[1]; String priority = split[2]; String taxId = id2taxMapPath.get(id); if (taxId == null) { log.warn("Could not retrieve the taxonomy ID for gene ID {} for synonym '{}'. 
Skipping this entry.", id, synonym); continue; } if (lastSyn != null && !lastSyn.equals(synonym)) { writeAccumulator(bw, synIdAccumulator, lastSyn); synIdAccumulator.clear(); } synIdAccumulator.add(new String[]{id, taxId, priority}); lastSyn = synonym; } // for the last synonym writeAccumulator(bw, synIdAccumulator, lastSyn); } } private static void writeAccumulator(BufferedWriter bw, List synIdAccumulator, String synonym) throws IOException { if (!synIdAccumulator.isEmpty()) { // sort the ID items by priority synIdAccumulator.sort(Comparator.comparingInt(a -> Integer.parseInt(a[2]))); // segregate the information about an ID by colons and the resulting items by pipes String output = synonym + "\t" + synIdAccumulator.stream().map(a -> String.join(":", a)).collect(Collectors.joining("|")); bw.write(output); bw.newLine(); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy