All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.fhir.term.graph.loader.impl.UMLSTermGraphLoader Maven / Gradle / Ivy

There is a newer version: 4.11.1
Show newest version
/*
 * (C) Copyright IBM Corp. 2020, 2021
 *
 * SPDX-License-Identifier: Apache-2.0
 */

package com.ibm.fhir.term.graph.loader.impl;

import static com.ibm.fhir.term.graph.loader.util.FHIRTermGraphLoaderUtil.toLabel;
import static com.ibm.fhir.term.graph.loader.util.FHIRTermGraphLoaderUtil.toMap;
import static com.ibm.fhir.term.util.CodeSystemSupport.normalize;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.MissingOptionException;
import org.apache.commons.cli.Options;
import org.apache.commons.configuration2.Configuration;
import org.apache.tinkerpop.gremlin.structure.Edge;
import org.apache.tinkerpop.gremlin.structure.Vertex;
import org.janusgraph.core.schema.JanusGraphManagement;

import com.ibm.fhir.term.graph.loader.FHIRTermGraphLoader;
import com.ibm.fhir.term.graph.loader.util.COSObject;
import com.ibm.fhir.term.graph.loader.util.ConfigLoader;

/**
 * Loads UMLS concepts, designations and relationships into a JanusGraph.
 *
 * <p>The loader reads three UMLS Rich Release Format files from the bucket named by the
 * {@value #COS_BUCKET_NAME} environment variable, in this order:
 * <ol>
 * <li>{@code MRSAB.RRF} - source versions, used when code system vertices are created</li>
 * <li>{@code MRCONSO.RRF} - concepts and their designations</li>
 * <li>{@code MRREL.RRF} - relationships between the previously loaded concepts</li>
 * </ol>
 *
 * <p>NOTE(review): concept vertices are cached by code alone (not by source + code), so the same
 * code value appearing under two different sources resolves to a single {@code Concept} vertex that
 * is attached only to the code system that created it first - confirm this is intended.
 */
public class UMLSTermGraphLoader extends AbstractTermGraphLoader {
    private static final Logger LOG = Logger.getLogger(UMLSTermGraphLoader.class.getName());

    private static final String UMLS_CONCEPT_NAMES_AND_SOURCES_FILE = "MRCONSO.RRF";
    private static final String UMLS_SOURCE_INFORMATION_FILE = "MRSAB.RRF";
    private static final String UMLS_RELATED_CONCEPTS_FILE = "MRREL.RRF";
    private static final String UMLS_DELIMITER = "\\|";

    /** Name of the environment variable that holds the bucket the UMLS files are read from. */
    public static final String COS_BUCKET_NAME = "TERM_COS_BUCKET_NAME";

    // Classpath resource that maps a UMLS source abbreviation to the preferred code system url
    private static final String CODE_SYSTEM_MAP_RESOURCE = "conf/umlsCodesystemMap.properties";

    // Number of fields expected on a valid row of each file
    private static final int MRCONSO_FIELD_COUNT = 18;
    private static final int MRREL_FIELD_COUNT = 16;
    private static final int MRSAB_FIELD_COUNT = 25;

    // Number of processed rows between transaction commits
    private static final int COMMIT_INTERVAL = 10000;

    // Map to track AUI to SCUI relationships, since MRREL uses AUI, but granularity of concepts used in MRCONSO is at SCUI/CODE level
    private final Map<String, String> auiToCodeMap = new ConcurrentHashMap<>(1000000);

    // Map of code system name to preferred label, configured in properties file
    private final Properties codeSystemMap = new Properties();

    // Map of abbreviated source name to the corresponding code system vertex
    private final Map<String, Vertex> codeSystemVertices = new ConcurrentHashMap<>();

    // Map of abbreviated source name to the current version of that source
    private final Map<String, String> sabToVersion = new HashMap<>();

    // Map of concept code to corresponding vertex
    private final Map<String, Vertex> vertexMap = new HashMap<>(250000);

    private final String cosBucketName;

    /**
     * Initialize a UMLSTermGraphLoader
     *
     * @param options
     *     the loader options, forwarded to the superclass constructor
     */
    public UMLSTermGraphLoader(Map<String, String> options) {
        super(options);
        cosBucketName = System.getenv(COS_BUCKET_NAME);
    }

    /**
     * Initialize a UMLSTermGraphLoader
     *
     * @param options
     *     the loader options, forwarded to the superclass constructor
     * @param configuration
     *     the configuration, forwarded to the superclass constructor
     */
    public UMLSTermGraphLoader(Map<String, String> options, Configuration configuration) {
        super(options, configuration);
        cosBucketName = System.getenv(COS_BUCKET_NAME);
    }

    /**
     * Loads UMLS data into JanusGraph: source attributes first, then concepts, then relations.
     *
     * @throws RuntimeException
     *     wrapping any exception raised by one of the load steps
     */
    @Override
    public void load() {
        try {
            loadSourceAttributes();
            loadConcepts();
            loadRelations();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Split one RRF row into its fields.
     *
     * <p>The negative limit matters: RRF rows are terminated by the delimiter and their last
     * column is frequently empty, and the single-argument {@link String#split(String)} discards
     * trailing empty strings, which makes such rows look too short.
     *
     * @param line
     *     a single row of an RRF file
     * @return
     *     every field of the row, including empty trailing ones
     */
    private static String[] splitFields(String line) {
        return line.split(UMLS_DELIMITER, -1);
    }

    /**
     * Create a code system vertex for the provided abbreviated source name and commit it.
     *
     * <p>The url comes from the code system map when the source is configured there, otherwise
     * the abbreviated source name itself is used. The version is whatever was recorded for the
     * source while reading MRSAB (NOTE(review): this is null for a source without a current
     * version row - confirm the graph accepts that).
     *
     * @param sab
     *     the abbreviated source name
     * @return
     *     the newly created vertex
     */
    private Vertex createCodeSystemVertex(String sab) {
        String version = sabToVersion.get(sab);
        String url = (String) codeSystemMap.getOrDefault(sab, sab);
        Vertex csv = g.addV("CodeSystem").property("url", url).property("version", version).next();
        g.tx().commit();
        return csv;
    }

    /**
     * Loads all UMLS concept data from MRCONSO.RRF and records the number of loaded rows on each
     * code system vertex as its "count" property.
     *
     * @throws FileNotFoundException
     * @throws IOException
     */
    private void loadConcepts() throws FileNotFoundException, IOException {
        // MRCONSO.RRF
        // CUI, LAT, TS, LUI, STT, SUI, ISPREF, AUI, SAUI, SCUI, SDUI, SAB, TTY, CODE, STR, SRL, SUPPRESS, CVF
        // https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.concept_names_and_sources_file_mr/
        //
        LOG.info("Loading concepts.....");

        Map<String, AtomicInteger> sabCounterMap = new HashMap<>();
        // Running total of loaded rows; equals the sum of all per-source counters
        AtomicInteger total = new AtomicInteger(0);

        try (BufferedReader reader = new BufferedReader(COSObject.getItem(cosBucketName, UMLS_CONCEPT_NAMES_AND_SOURCES_FILE))) {
            reader.lines().forEach(line -> loadConcept(line, sabCounterMap, total));

            for (Entry<String, AtomicInteger> entry : sabCounterMap.entrySet()) {
                Vertex codeSystemVertex = codeSystemVertices.get(entry.getKey());
                g.V(codeSystemVertex).property("count", entry.getValue().get()).next();
            }
            // commit any uncommitted work
            g.tx().commit();
        }

        LOG.info("Done loading concepts.....");
    }

    /**
     * Load a single MRCONSO row: make sure its code system and concept vertices exist, then attach
     * a designation vertex for the row's string. Rows that are too short or whose SUPPRESS field
     * is "O" are skipped.
     *
     * @param line
     *     one row of MRCONSO.RRF
     * @param sabCounterMap
     *     per-source count of loaded rows, updated by this method
     * @param total
     *     count of loaded rows across all sources, updated by this method and used to decide when
     *     to commit
     */
    private void loadConcept(String line, Map<String, AtomicInteger> sabCounterMap, AtomicInteger total) {
        String[] tokens = splitFields(line);
        if (tokens.length < MRCONSO_FIELD_COUNT) {
            return;
        }
        String lat = tokens[1];
        String aui = tokens[7];
        String scui = tokens[9];
        String sab = tokens[11];
        String tty = tokens[12];
        String str = tokens[14];
        String suppress = tokens[16];

        if ("O".equals(suppress)) {
            return;
        }

        // Fall back to CODE when the source does not provide an SCUI
        String code = scui.trim().isEmpty() ? tokens[13] : scui;

        auiToCodeMap.put(aui, code);

        Vertex codeSystemVertex = codeSystemVertices.computeIfAbsent(sab, this::createCodeSystemVertex);
        sabCounterMap.computeIfAbsent(sab, s -> new AtomicInteger(0)).incrementAndGet();

        Vertex v = vertexMap.get(code);
        if (v == null) {
            v = g.addV("Concept").property("code", code).property("codeLowerCase", normalize(code)).next();
            vertexMap.put(code, v);
            g.V(codeSystemVertex).addE("concept").to(v).next();
        }

        if ("PT".equals(tty)) { // Preferred entries provide preferred name and language
            v.property("display", str);
            v.property("language", lat);
        }

        // add new designation
        Vertex w = g.addV("Designation").property("language", lat).property("value", str).next();
        g.V(v).addE("designation").to(w).next();

        if ((total.incrementAndGet() % COMMIT_INTERVAL) == 0) {
            String counters = sabCounterMap.entrySet().stream().map(e -> e.getKey() + ":" + e.getValue()).collect(Collectors.joining(","));
            LOG.info("counter: " + counters);
            g.tx().commit();
        }
    }

    /**
     * Loads all UMLS relationship data from MRREL.RRF. Must run after the concepts are loaded,
     * because relations are resolved through the AUI and vertex maps filled while reading MRCONSO.
     *
     * @throws FileNotFoundException
     * @throws IOException
     */
    private void loadRelations() throws FileNotFoundException, IOException {
        // MRREL
        // CUI1, AUI1, STYPE1, REL, CUI2, AUI2, STYPE2, RELA, RUI, SRUI, SAB, SL, RG, DIR, SUPPRESS, CVF
        // https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.related_concepts_file_mrrel_rrf/
        //
        LOG.info("Loading relations.....");

        AtomicInteger counter = new AtomicInteger(0);

        try (BufferedReader reader = new BufferedReader(COSObject.getItem(cosBucketName, UMLS_RELATED_CONCEPTS_FILE))) {
            reader.lines().forEach(line -> loadRelation(line, counter));

            // commit any uncommitted work
            g.tx().commit();
            LOG.info("Done loading relations.....");
        }
    }

    /**
     * Load a single MRREL row as an edge from the second concept to the first one. The edge label
     * is derived from RELA when present, otherwise REL is used; the label is created in the graph
     * schema on first use. Rows are skipped when they are too short, when DIR is "N", when
     * SUPPRESS is "O", or when either AUI was not seen while loading concepts.
     *
     * @param line
     *     one row of MRREL.RRF
     * @param counter
     *     count of rows whose two AUIs were both resolved, updated by this method and used to
     *     decide when to commit
     */
    private void loadRelation(String line, AtomicInteger counter) {
        String[] tokens = splitFields(line);
        if (tokens.length < MRREL_FIELD_COUNT) {
            return;
        }
        String aui1 = tokens[1];
        String rel = tokens[3];
        String aui2 = tokens[5];
        String rela = tokens[7];
        String rg = tokens[12]; // relationship group
        String dir = tokens[13];
        String suppress = tokens[14];

        // Don't load relations that are not in source order or suppressed
        if ("N".equals(dir) || "O".equals(suppress)) {
            return;
        }

        String code1 = auiToCodeMap.get(aui1);
        String code2 = auiToCodeMap.get(aui2);
        if (code1 == null || code2 == null) {
            return;
        }

        Vertex v1 = vertexMap.get(code1);
        Vertex v2 = vertexMap.get(code2);
        if (v1 != null && v2 != null) {
            String label = rela.isEmpty() ? null : toLabel(rela);
            if (label == null || label.isEmpty()) {
                label = rel;
            }
            if (labelFilter.accept(label)) {
                if (janusGraph.getEdgeLabel(label) == null) {
                    LOG.info("Adding label: " + label);
                    JanusGraphManagement management = janusGraph.openManagement();
                    management.makeEdgeLabel(label).make();
                    management.commit();
                }

                Edge e = g.V(v2).addE(label).to(v1).next();

                if (!rg.isEmpty()) {
                    g.E(e).property("group", rg).next();
                }
            }
        }

        if ((counter.get() % COMMIT_INTERVAL) == 0) {
            LOG.info("counter: " + counter.get());
            g.tx().commit();
        }
        counter.getAndIncrement();
    }

    /**
     * Loads the code system name map from the classpath and the current version of every source
     * from MRSAB.RRF.
     *
     * @throws FileNotFoundException
     *     if the code system map resource is not on the classpath
     * @throws IOException
     *     if either input cannot be read
     */
    private void loadSourceAttributes() throws IOException {
        InputStream codeSystemStream = getClass().getClassLoader().getResourceAsStream(CODE_SYSTEM_MAP_RESOURCE);
        if (codeSystemStream == null) {
            // Fail with the resource name instead of a NullPointerException from the reader
            throw new FileNotFoundException("Resource not found on classpath: " + CODE_SYSTEM_MAP_RESOURCE);
        }
        try (Reader codeSystemReader = new InputStreamReader(codeSystemStream, StandardCharsets.UTF_8)) {
            // Load UMLS name to preferred CodeSystem name map
            codeSystemMap.load(codeSystemReader);
        }

        try (BufferedReader reader = new BufferedReader(COSObject.getItem(cosBucketName, UMLS_SOURCE_INFORMATION_FILE))) {
            // Load latest version for code systems in UMLS
            reader.lines().forEach(this::loadSourceVersion);
        }
    }

    /**
     * Record the version of a source from a single MRSAB row when the row is flagged as the
     * current version. Rows that are too short are skipped.
     *
     * @param line
     *     one row of MRSAB.RRF
     */
    private void loadSourceVersion(String line) {
        String[] tokens = splitFields(line);
        if (tokens.length < MRSAB_FIELD_COUNT) {
            return;
        }
        String rsab = tokens[3];
        String sver = tokens[6];
        String curver = tokens[21];

        if (!"Y".equals(curver)) {
            return;
        }
        if ("SNOMEDCT_US".equals(rsab)) {
            // special case version for SNOMED
            sver = "http://snomed.info/sct/731000124108/version/" + sver.replace("_", "");
        }
        sabToVersion.put(rsab, sver);
    }

    /**
     * Load UMLS data using properties provided in arguments
     *
     * @param args
     *     the command line arguments; the "config" option names the configuration to load
     */
    public static void main(String[] args) {
        UMLSTermGraphLoader loader = null;
        Options options = null;
        try {
            long start = System.currentTimeMillis();

            options = FHIRTermGraphLoader.Type.UMLS.options();

            CommandLineParser parser = new DefaultParser();
            CommandLine commandLine = parser.parse(options, args);
            Map<String, String> commandLineMap = toMap(commandLine);
            String propFileName = commandLineMap.get("config");

            Configuration configuration = ConfigLoader.load(propFileName);
            loader = new UMLSTermGraphLoader(commandLineMap, configuration);
            loader.load();

            long end = System.currentTimeMillis();
            LOG.info("Loading time (milliseconds): " + (end - start));
        } catch (MissingOptionException e) {
            LOG.log(Level.SEVERE, "MissingOptionException: ", e);
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("UMLSTermGraphLoader", options);
        } catch (Exception e) {
            // Pass the throwable so the stack trace and cause chain are not lost
            LOG.log(Level.SEVERE, "An error occurred: " + e.getMessage(), e);
        } finally {
            if (loader != null) {
                loader.close();
            }
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy