de.julielab.genemapper.resources.Synonym2IdsDictionaryGenerator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.
The newest version!
package de.julielab.genemapper.resources;

import de.julielab.java.utilities.FileUtilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * Creates a tab-separated two-column dictionary of gene names and their possible IDs.
 * <p>
 * The expected input is a tab-separated three-column file where the columns are gene name, gene ID and gene name
 * priority. The lines must be sorted by name because all entries belonging to one name are collected from
 * consecutive lines. The priority is a coding scheme for the source of the gene name with respect to its ID:
 * -1 denotes an official symbol, for example. The priorities are specified in the _makeGeneDictionary.sh script.
 * <p>
 * Each output line consists of the gene name, a tab and the pipe-separated list of {@code geneId:taxId:priority}
 * items of that name, sorted ascending by priority.
 */
public class Synonym2IdsDictionaryGenerator {
    private static final Logger log = LoggerFactory.getLogger(Synonym2IdsDictionaryGenerator.class);

    /**
     * Command line entry point.
     *
     * @param args exactly three values are read: the path to the name/ID/priority input file, the path to the
     *             tab-separated file whose first column is the taxonomy ID and whose second column is the gene ID,
     *             and the path of the dictionary file to write
     * @throws IllegalArgumentException if fewer than three arguments are given
     * @throws Exception                if reading or writing one of the files fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            throw new IllegalArgumentException("Usage: " + Synonym2IdsDictionaryGenerator.class.getSimpleName()
                    + " <synonym-id-priority file> <gene info file> <output file>");
        }
        String synsWithIdDictPath = args[0];
        String geneInfo = args[1];
        String outputPath = args[2];
        Map<String, String> id2taxMap = readEgTaxMap(Path.of(geneInfo));
        writeDictionary(synsWithIdDictPath, id2taxMap, outputPath);
    }

    /**
     * Reads the mapping from gene ID (second column) to taxonomy ID (first column) from a tab-separated file.
     * Lines with fewer than two columns, e.g. blank lines, are skipped.
     *
     * @param geneInfo the file to read the mapping from
     * @return a map from gene ID to taxonomy ID
     * @throws IOException           if the file cannot be read
     * @throws IllegalStateException if the same gene ID occurs on more than one line
     */
    private static Map<String, String> readEgTaxMap(Path geneInfo) throws IOException {
        log.info("Reading file gene ID to taxonomy ID map from {}", geneInfo);
        try (BufferedReader br = FileUtilities.getReaderFromFile(geneInfo.toFile())) {
            return br.lines()
                    // split only once per line; the limit keeps the remainder of the line unsplit
                    .map(l -> l.split("\\t", 3))
                    .filter(fields -> fields.length >= 2)
                    .collect(Collectors.toMap(fields -> fields[1].intern(), fields -> fields[0].intern()));
        }
    }

    /**
     * Reads the name-sorted input file and writes one dictionary line per gene name.
     * Lines that do not have three columns and lines whose gene ID has no taxonomy ID in {@code id2taxMap}
     * are skipped with a warning.
     *
     * @param synsWithIdDictPath path to the tab-separated file with the columns gene name, gene ID and priority
     * @param id2taxMap          map from gene ID to taxonomy ID
     * @param outputPath         path of the dictionary file to write
     * @throws IOException if reading the input or writing the output fails
     */
    private static void writeDictionary(String synsWithIdDictPath, Map<String, String> id2taxMap, String outputPath) throws IOException {
        String lastSyn = null;

        try (BufferedReader br = FileUtilities.getReaderFromFile(new File(synsWithIdDictPath)); BufferedWriter bw = FileUtilities.getWriterToFile(new File(outputPath))) {
            String line;
            // each list item has an array that lists the geneId, taxId and priority of its input line, in that order
            List<String[]> synIdAccumulator = new ArrayList<>();
            while ((line = br.readLine()) != null) {
                String[] split = line.split("\\t");
                if (split.length < 3) {
                    log.warn("Expected three tab-separated columns but found {} in line '{}'. Skipping this entry.", split.length, line);
                    continue;
                }
                String synonym = split[0];
                String id = split[1];
                String priority = split[2];
                String taxId = id2taxMap.get(id);
                if (taxId == null) {
                    log.warn("Could not retrieve the taxonomy ID for gene ID {} for synonym '{}'. Skipping this entry.", id, synonym);
                    continue;
                }
                // the input is sorted by name, so a name change means the previous name is complete
                if (lastSyn != null && !lastSyn.equals(synonym)) {
                    writeAccumulator(bw, synIdAccumulator, lastSyn);
                    synIdAccumulator.clear();
                }
                synIdAccumulator.add(new String[]{id, taxId, priority});
                lastSyn = synonym;
            }
            // for the last synonym
            writeAccumulator(bw, synIdAccumulator, lastSyn);
        }
    }

    /**
     * Writes the dictionary line for one gene name. Nothing is written if the accumulator is empty.
     *
     * @param bw               the writer for the dictionary file
     * @param synIdAccumulator the {geneId, taxId, priority} arrays collected for the name; sorted in place
     * @param synonym          the gene name the accumulated items belong to
     * @throws IOException           if writing fails
     * @throws NumberFormatException if a priority is not an integer
     */
    private static void writeAccumulator(BufferedWriter bw, List<String[]> synIdAccumulator, String synonym) throws IOException {
        if (!synIdAccumulator.isEmpty()) {
            // sort the ID items by priority
            synIdAccumulator.sort(Comparator.comparingInt(a -> Integer.parseInt(a[2])));
            // segregate the information about an ID by colons and the resulting items by pipes
            String output = synonym + "\t" + synIdAccumulator.stream().map(a -> String.join(":", a)).collect(Collectors.joining("|"));
            bw.write(output);
            bw.newLine();
        }
    }
}