de.julielab.genemapper.resources.SpecialistLexiconNameExpansion

This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.

package de.julielab.genemapper.resources;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import com.sun.istack.NotNull;
import de.julielab.geneexpbase.AhoCorasickLongestMatchCallback;
import de.julielab.geneexpbase.scoring.JaccardScorer;
import de.julielab.geneexpbase.scoring.JaroWinklerScorer;
import de.julielab.geneexpbase.scoring.Scorer;
import de.julielab.java.utilities.FileUtilities;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.LexiconEntry;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.Range;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

/**
 * This class makes a best effort to connect all given family and gene group names into unified records.
 * It also extends the names with entries from the SPECIALIST Lexicon.
 *
 * @see <a href="https://lexsrv3.nlm.nih.gov/LexSysGroup/Projects/lexicon/current/">SPECIALIST Lexicon</a>
 */
public class SpecialistLexiconNameExpansion {
    private final static Logger log = LoggerFactory.getLogger(SpecialistLexiconNameExpansion.class);
    private static final Scorer jaroWinkler = new JaroWinklerScorer();
    private static final Scorer jaccard = new JaccardScorer();
    private static Set<String> inputStopwords;
    private final AtomicInteger geneGroupIDcounter = new AtomicInteger();
    private final Matcher numfinder = Pattern.compile("[0-9]+").matcher("");

    public SpecialistLexiconNameExpansion() {
        inputStopwords = new HashSet<>();
        inputStopwords.add("family");
        inputStopwords.add("superfamily");
        inputStopwords.add("subfamily");
        inputStopwords.add("group");
        inputStopwords.add("factor");
        inputStopwords.add("receptor");
    }

    public static void main(String[] args) throws IOException, XMLStreamException {
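        // A minimal, hypothetical invocation (file names are illustrative only):
        //   java de.julielab.genemapper.resources.SpecialistLexiconNameExpansion LEXICON.xml familyDictionary.txt groupNames.tsv
        // args[0] is the SPECIALIST Lexicon XML file, args[1] the dictionary output file, all further arguments are input name files.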
        if (args.length < 3) {
            System.err.println("Usage: " + SpecialistLexiconNameExpansion.class.getCanonicalName() + "    []* ");
        } else {
            File[] inputFiles = IntStream.range(2, args.length).mapToObj(i -> args[i]).map(File::new).toArray(File[]::new);
            SpecialistLexiconNameExpansion dictCreator = new SpecialistLexiconNameExpansion();
            dictCreator.createDict(new File(args[0]), inputFiles, new File(args[1]));
        }

    }

    private String normalize(String input) {
        return input.toLowerCase().replaceAll("\\p{P}", " ");
    }


    public void createDict(File specialistXmlLexicon, File[] inputNamesFiles, File dictionaryDestination) throws IOException, XMLStreamException {
        log.info("Reading SPECIALIST Lexicon from {}, name lists from {} and writing dictionary to {}.", specialistXmlLexicon, inputNamesFiles, dictionaryDestination);
        Multimap<String, String> adaptedname2id = HashMultimap.create();
        // euis are the IDs of entries in the SPECIALIST lexicon (Entry Unique Identifier): https://lhncbc.nlm.nih.gov/LSG/Projects/lexicon/current/docs/designDoc/UDF/lexRecord/syntax/eui.html, https://www.ncbi.nlm.nih.gov/books/NBK9680/
        Multimap<String, String> eui2abbreui = HashMultimap.create();
        Multimap<String, String> eui2acroeui = HashMultimap.create();
        Map<String, SpecialistEntry> eui2entry = new HashMap<>();
        Map<String, SpecialistEntry> acronymEntries = new HashMap<>();
        Map<String, SpecialistEntry> abbreviationEntries = new HashMap<>();
        Multimap<String, String> id2inputName = HashMultimap.create();
        Map<String, String> adaptedName2originalName = new HashMap<>();
        Multimap<String, String> egsmlong2acro = HashMultimap.create();

        // Index the gene group names with the ID from their source
        // Also, do some name normalization and index the original names by their adapted variant
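        // The expected input line format, illustrated with a made-up entry (three tab-separated columns 'name', 'id', 'priority'):
        //   AMP-activated protein kinase<TAB>SOME_SOURCE:1234<TAB>2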
        Multimap<String, String> normalized2OriginalInputNames = HashMultimap.create();
        for (File inputNamesFile : inputNamesFiles) {
            FileUtilities.getReaderFromFile(inputNamesFile).lines().filter(Predicate.not(String::isBlank)).filter(Predicate.not(l -> l.startsWith("#"))).forEach(l -> {
                String[] split = l.split("\t");
                if (split.length < 3)
                    throw new IllegalArgumentException("Unsupported format in file " + inputNamesFile.getName() + ". Expected three columns with 'name', 'id' and 'priority'. Got: " + l);
                String originalName = split[0];
                String name = adaptName(originalName);
                adaptedName2originalName.put(name, originalName);
                if (!inputStopwords.contains(name)) {
                    String id = split[1];
                    adaptedname2id.put(name, id);
                    String normalizedName = normalize(name);
                    if (!normalized2OriginalInputNames.containsKey(normalizedName) && !normalized2OriginalInputNames.get(normalizedName).contains(l)) {
                        normalized2OriginalInputNames.put(normalizedName, name);
                    }
                    id2inputName.put(id, name);
                }
            });
        }
        log.info("Got {} input names to check for connections amongst each other and with the SPECIALIST Lexicon", adaptedName2originalName.size());

        int i = 0;
        int numNames;
        Map<String, Multimap<EntryType, SpecialistEntry>> names2entries;
        do {
            numNames = normalized2OriginalInputNames.size();
            names2entries = findEntriesForInputNames(specialistXmlLexicon, normalized2OriginalInputNames, eui2entry, eui2abbreui, eui2acroeui, acronymEntries, abbreviationEntries);
            extendNamesWithLexiconItems(adaptedname2id, normalized2OriginalInputNames, names2entries);
            ++i;
        } while (normalized2OriginalInputNames.size() != numNames);
        log.info("After {} iterations of name expansion, a final number of {} names was identified.", i, numNames);

        // Iterate over the externally given records by unique ID.
        // For each ID, get the names associated with it and look for matches in the LEXICON.
        // Create GeneGroups out of the external IDs, enriched by the LEXICON entries.
        // When multiple external resources are used, there is currently no attempt to merge the different sources.
        // Thus, when two sources would include AMPK, there would be two output records for it.
        List<GeneGroup> geneGroups = new ArrayList<>();
        Multimap<String, GeneGroup> lexiconEui2genegroup = HashMultimap.create();
        for (String externalId : id2inputName.keySet()) {
            boolean lexiconEntryFoundForId = false;
            GeneGroup genegroup4externalId = null;
            // for each name for this ID, look for matching LEXICON entries
            for (String externalInputName : id2inputName.get(externalId)) {
                Multimap<EntryType, SpecialistEntry> lexiconNames = names2entries.get(externalInputName);
                if (lexiconNames != null) {
                    lexiconEntryFoundForId = true;
                    Set<String> alreadySeenLexiconEntries = new HashSet<>();
                    for (SpecialistEntry lexiconEntry : lexiconNames.values()) {
                        // omit LEXICON entries with very diverse abbreviations and acronyms (e.g. 'APC' has diverse long forms); we have no facility
                        // for disambiguation
                        if (!lexiconEntry.hasAmbiguousAbbreviationsAndAcronyms() && alreadySeenLexiconEntries.add(lexiconEntry.getEui())) {
                            // collect the non-ambiguous LEXICON entries connected by acronym or abbreviation relations for the current external entity name
                            // Since we exclude ambiguous entries, the hope is that all the found entries will actually be related to the input concept.
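                            // For illustration: starting from a hypothetical entry for 'AMP-activated protein kinase',
                            // its acronym link may lead to an entry for 'AMPK', whose own long-form links lead back,
                            // so both entries end up in the same network.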
                            Set<SpecialistEntry> networkNodes = new HashSet<>();
                            networkNodes.add(lexiconEntry);
                            int networkSize;
                            do {
                                networkSize = networkNodes.size();
                                // follow the abbreviation and acronym links to other LEXICON entries
                                lexiconEntry.getAbbreviationLongformEuis().stream().map(eui2entry::get).filter(Objects::nonNull).filter(Predicate.not(SpecialistEntry::hasAmbiguousAbbreviationsAndAcronyms)).forEach(networkNodes::add);
                                lexiconEntry.getAcronymLongformEuis().stream().map(eui2entry::get).filter(Objects::nonNull).filter(Predicate.not(SpecialistEntry::hasAmbiguousAbbreviationsAndAcronyms)).forEach(networkNodes::add);
                            } while (networkSize < networkNodes.size());
                            // We would like to have a short base name which can be used as an identifier in applications.
                            // Sort the base names by length. The first will be used as the GeneGroup base, the
                            // others will be variants.
                            List<String> basesByLength = networkNodes.stream().map(SpecialistEntry::getBase).sorted(Comparator.comparingInt(String::length)).collect(Collectors.toList());
                            GeneGroup gg;
                            if (genegroup4externalId != null) {
                                gg = genegroup4externalId;
                            } else {
                                gg = new GeneGroup("GENO:" + geneGroupIDcounter.incrementAndGet());
                                geneGroups.add(gg);
                            }
                            gg.addExternalId(externalId);
                            gg.addInputName(externalInputName);
                            gg.addBase(basesByLength.get(0));
                            IntStream.range(1, basesByLength.size()).mapToObj(basesByLength::get).forEach(gg::addSpellingVariant);
                            networkNodes.stream().map(SpecialistEntry::getAbbreviations).flatMap(Collection::stream).forEach(gg::addAbbreviation);
                            networkNodes.stream().map(SpecialistEntry::getAcronyms).flatMap(Collection::stream).forEach(gg::addAcronym);
                            networkNodes.stream().map(SpecialistEntry::getInflectionVariants).flatMap(Collection::stream).forEach(gg::addInflectionVariant);
                            networkNodes.stream().map(SpecialistEntry::getSpellingVariants).flatMap(Collection::stream).forEach(gg::addSpellingVariant);
                            networkNodes.stream().map(SpecialistEntry::getEui).forEach(gg::addConnectedLexiconEntry);
                            networkNodes.forEach(n -> lexiconEui2genegroup.put(n.getEui(), gg));
                            genegroup4externalId = gg;
                        }
                    }
                }
            }
            if (!lexiconEntryFoundForId) {
                // Un-enriched GeneGroup because we could not find a LEXICON entry for the names of this external ID.
                GeneGroup gg = new GeneGroup("GENO:" + geneGroupIDcounter.incrementAndGet());
                gg.addBase(externalId.substring(externalId.indexOf(':') + 1));
                gg.addExternalId(externalId);
                id2inputName.get(externalId).forEach(gg::addInputName);
                geneGroups.add(gg);
            }
        }
        // add potentially missed input names to the according gene groups
        for (GeneGroup gg : geneGroups) {
            for (String externalId : gg.getExternalIds()) {
                for (String name : id2inputName.get(externalId))
                    gg.addInputName(name);
            }
        }

        // That's it. Write the output files and be done.

        log.info("Writing {} connected gene groups to {}", geneGroups.size(), dictionaryDestination);
        writeGeneGroups(dictionaryDestination, geneGroups, adaptedname2id, names2entries);

        log.info("Writing a dictionary of all gene group names to familyrecords.dict");
        try (BufferedWriter bw = FileUtilities.getWriterToFile(new File("familyrecords.dict"))) {
            for (GeneGroup gg : geneGroups) {
                Set<String> allnames = gg.getAllNames().collect(Collectors.toSet());
                String id = gg.getId();
                boolean firstBaseEncountered = false;
                for (String name : allnames) {
                    String prio = "2";
                    // make the first encountered base the "symbol" and all others a "synonym"
                    if (gg.bases.contains(name) && !firstBaseEncountered) {
                        prio = "-1";
                        firstBaseEncountered = true;
                    }
                    bw.write(name + "\t" + id + "\t" + prio);
                    bw.newLine();
                }
            }
        }
        log.info("Done.");
    }

    /**
     * Removes the 'family' suffix word for better matching with the dictionary, e.g. turning "X family" into "X".
     *
     * @param originalName The name as given in the input resources.
     * @return The name without a trailing ' family'.
     */
    @NotNull
    public String adaptName(String originalName) {
        return originalName.replaceAll(" family$", "");
    }

    public boolean numberCompatible(String t1, String t2) {
        numfinder.reset(t1);
        Set<String> number1 = new HashSet<>();
        while (numfinder.find())
            number1.add(numfinder.group());
        numfinder.reset(t2);
        Set<String> number2 = new HashSet<>();
        while (numfinder.find())
            number2.add(numfinder.group());
        return (number1.isEmpty() && number2.isEmpty()) || number1.equals(number2);
    }

    public void writeGeneGroups(File dictionaryDestination, List<GeneGroup> geneGroups, Multimap<String, String> name2id, Map<String, Multimap<EntryType, SpecialistEntry>> names2entries) throws IOException {
        try (BufferedWriter bw = FileUtilities.getWriterToFile(dictionaryDestination)) {
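            // Each group is written as a small multi-line record; an illustrative, made-up example:
            //   GENO:17
            //   bases:<TAB>AMPK
            //   acronyms:<TAB>AMPK
            //   externalids:<TAB>SOME_SOURCE:1234
            // (plus the remaining fields written below, followed by an empty line)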
            bw.write("# Created " + new Date());
            bw.newLine();
            for (GeneGroup group : geneGroups) {
                bw.write(group.getId());
                bw.newLine();
                bw.write("bases:\t" + group.getBases().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("inflections:\t" + group.getInflectionVariants().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("spellings:\t" + group.getSpellingVariants().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("acronyms:\t" + group.getAcronyms().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("abbreviations:\t" + group.getAbbreviations().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("externalids:\t" + group.getExternalIds().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("inputnames:\t" + group.getInputNames().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("lexiconEuis:\t" + group.getConnectedLexiconEntries().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.newLine();
            }
        }
    }

    public void extendNamesWithLexiconItems(Multimap<String, String> name2id, Multimap<String, String> normalized2OriginalInputNames, Map<String, Multimap<EntryType, SpecialistEntry>> names2entries) {
        for (String inputName : names2entries.keySet()) {
            Multimap<EntryType, SpecialistEntry> specialistEntries = names2entries.get(inputName);
            Collection<String> ids = name2id.get(inputName);
            for (EntryType type : specialistEntries.keySet()) {
                Collection<SpecialistEntry> entries4type = specialistEntries.get(type);
                switch (type) {
                    case ABBREVIATION:
                        for (SpecialistEntry e : entries4type) {
                            for (String id : ids) {
                                name2id.put(e.getBase(), id);
                                if (e.uniformAbbreviationAndAcronymSuffix()) {
                                    e.getInflectionVariants().forEach(var -> name2id.put(var, id));
                                    e.getSpellingVariants().forEach(var -> name2id.put(var, id));
                                    e.getAbbreviations().forEach(abbr -> name2id.put(abbr, id));
                                }
                            }
                        }
                        break;
                    case ACRONYM:
                        for (SpecialistEntry e : entries4type) {
                            for (String id : ids) {
                                name2id.put(e.getBase(), id);
                                if (e.uniformAbbreviationAndAcronymSuffix()) {
                                    e.getInflectionVariants().forEach(var -> name2id.put(var, id));
                                    e.getSpellingVariants().forEach(var -> name2id.put(var, id));
                                    e.getAcronyms().forEach(acr -> name2id.put(acr, id));
                                }
                            }
                        }
                        break;
                    case SPELLING:
                    case INFLECTION:
                        for (SpecialistEntry e : entries4type) {
                            for (String id : ids) {
                                name2id.put(e.getBase(), id);
                                e.getInflectionVariants().forEach(var -> name2id.put(var, id));
                                e.getSpellingVariants().forEach(var -> name2id.put(var, id));
                            }
                        }
                        break;
                    case BASE:
                        for (SpecialistEntry e : entries4type) {
                            if (!e.isAcronymEntry()) {
                                for (String id : ids) {
                                    name2id.put(e.getBase(), id);
                                    e.getInflectionVariants().forEach(var -> name2id.put(var, id));
                                    e.getSpellingVariants().forEach(var -> name2id.put(var, id));
                                }
                            } else if (e.uniformAbbreviationSuffix()) {
                                for (String id : ids) {
                                    e.getInflectionVariants().forEach(var -> name2id.put(var, id));
                                    e.getSpellingVariants().forEach(var -> name2id.put(var, id));
                                    e.getAbbreviations().forEach(abbr -> name2id.put(abbr, id));
                                }
                            } else if (e.uniformAcronymSuffix()) {
                                for (String id : ids) {
                                    e.getInflectionVariants().forEach(var -> name2id.put(var, id));
                                    e.getSpellingVariants().forEach(var -> name2id.put(var, id));
                                    e.getAcronyms().forEach(acr -> name2id.put(acr, id));
                                }
                            }
                        }
                        break;
                    default:
                        throw new IllegalArgumentException("Unknown Specialist Entry type " + type);
                }
            }
        }
        for (String newname : name2id.keySet()) {
            String normalizedNewname = normalize(newname);
            normalized2OriginalInputNames.put(normalizedNewname, newname);
        }
    }

    public Map<String, Multimap<EntryType, SpecialistEntry>> findEntriesForInputNames(File specialistXmlLexicon, Multimap<String, String> normalized2OriginalInputNames, Map<String, SpecialistEntry> eui2entry, Multimap<String, String> eui2abbreui, Multimap<String, String> eui2acroeui, Map<String, SpecialistEntry> acronymEntries, Map<String, SpecialistEntry> abbreviationEntries) throws IOException, XMLStreamException {
        Map<String, Multimap<EntryType, SpecialistEntry>> names2entries = new HashMap<>();
        try (BufferedReader br = FileUtilities.getReaderFromFile(specialistXmlLexicon)) {

            XMLStreamReader reader = XMLInputFactory.newFactory().createXMLStreamReader(br);
            SpecialistEntry currentEntry = null;
            Multimap<String, EntryType> exactInputs4CurrentEntry = HashMultimap.create();
            Multimap<String, EntryType> normalizedInputs4CurrentEntry = HashMultimap.create();
            while (reader.hasNext()) {
                int eventType = reader.next();
                EntryType entryType = null;
                if (eventType == XMLStreamConstants.END_ELEMENT) {
                    String elementName = reader.getName().getLocalPart();
                    if (elementName.equals("lexRecord")) {
                        storeLexiconRecord(eui2entry, eui2abbreui, eui2acroeui, acronymEntries, abbreviationEntries, names2entries, currentEntry, exactInputs4CurrentEntry, normalizedInputs4CurrentEntry);
                        exactInputs4CurrentEntry.clear();
                        normalizedInputs4CurrentEntry.clear();
                    }
                } else if (eventType == XMLStreamConstants.START_ELEMENT) {
                    String elementName = reader.getName().getLocalPart();
                    String elementText = null;
                    EntryType refType = null;
                    switch (elementName) {
                        case "lexRecord": {
//                            storeLexiconRecord(eui2abbreui, eui2acroeui, acronymEntries, abbreviationEntries, names2entries, currentEntry, exactInputs4CurrentEntry, normalizedInputs4CurrentEntry);
                            currentEntry = new SpecialistEntry();
//                            exactInputs4CurrentEntry.clear();
//                            normalizedInputs4CurrentEntry.clear();
                            break;
                        }
                        case "acronyms":
                            entryType = EntryType.ACRONYM;
                            refType = EntryType.ACRONYM_LONGFORM_EUID;
                        case "abbreviations":
                            if (entryType == null)
                                entryType = EntryType.ABBREVIATION;
                            if (refType == null)
                                refType = EntryType.ABBREVIATION_LONGFORM_EUID;
                            String originalElementText = reader.getElementText();
                            // for acronyms and abbreviations, the term ID for which the acronym/abbreviations stands
                            // is separated by a pipe character
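                            // e.g. an element text of the (hypothetical) form "AMPK|E0123456": the part before the pipe is the
                            // short form, the part after it the EUI of the corresponding long-form entry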
                            String[] split = originalElementText.split("\\|");
                            elementText = split[0];
                            String longformEntryEui = split.length > 1 ? split[1] : "";
                            currentEntry.add(elementText, entryType);
                            currentEntry.add(longformEntryEui, refType);
                            String normalizedElementText = normalize(elementText);
                            if (normalized2OriginalInputNames.containsKey(normalizedElementText)) {
                                Collection<String> originalInputNames = normalized2OriginalInputNames.get(normalizedElementText);
                                for (String originalInputName : originalInputNames)
                                    if (originalInputName.equals(elementText))
                                        exactInputs4CurrentEntry.put(originalInputName, entryType);
                                    else
                                        normalizedInputs4CurrentEntry.put(originalInputName, entryType);
                            }
                            break;
                        case "base":
                            entryType = EntryType.BASE;
                        case "spellingVars":
                            if (entryType == null)
                                entryType = EntryType.SPELLING;
                        case "inflVars":
                            if (entryType == null)
                                entryType = EntryType.INFLECTION;
                            if (elementText == null)
                                elementText = reader.getElementText();
                            currentEntry.add(elementText, entryType);
                            normalizedElementText = normalize(elementText);
                            if (normalized2OriginalInputNames.containsKey(normalizedElementText)) {
                                Collection<String> originalInputNames = normalized2OriginalInputNames.get(normalizedElementText);
                                for (String originalInputName : originalInputNames)
                                    if (originalInputName.equals(elementText))
                                        exactInputs4CurrentEntry.put(originalInputName, entryType);
                                    else
                                        normalizedInputs4CurrentEntry.put(originalInputName, entryType);
                            }
                            break;
                        case "cat":
                            currentEntry.setCategory(reader.getElementText());
                            break;
                        case "eui":
                            currentEntry.setEui(reader.getElementText());
                            break;
                    }
                }
            }
        }
        return names2entries;
    }

    private void storeLexiconRecord(Map<String, SpecialistEntry> eui2entry, Multimap<String, String> eui2abbreui, Multimap<String, String> eui2acroeui, Map<String, SpecialistEntry> acronymEntries, Map<String, SpecialistEntry> abbreviationEntries, Map<String, Multimap<EntryType, SpecialistEntry>> names2entries, SpecialistEntry currentEntry, Multimap<String, EntryType> exactInputs4CurrentEntry, Multimap<String, EntryType> normalizedInputs4CurrentEntry) {
        if (currentEntry != null && currentEntry.getCategory().equals("noun")) {
            eui2entry.put(currentEntry.getEui(), currentEntry);
            for (Map.Entry<String, EntryType> p : exactInputs4CurrentEntry.entries()) {
                String name = p.getKey();
                Multimap<EntryType, SpecialistEntry> type2entries = names2entries.compute(name, (k, v) -> v != null ? v : HashMultimap.create());
                type2entries.put(p.getValue(), currentEntry);
            }
            for (Map.Entry<String, EntryType> p : normalizedInputs4CurrentEntry.entries()) {
                String name = p.getKey();
                if (!names2entries.containsKey(name)) {
                    Multimap<EntryType, SpecialistEntry> type2entries = names2entries.compute(name, (k, v) -> v != null ? v : HashMultimap.create());
                    type2entries.put(p.getValue(), currentEntry);
                }
            }
            if (!exactInputs4CurrentEntry.isEmpty() || !normalizedInputs4CurrentEntry.isEmpty()) {
                for (String longformEui : currentEntry.getAbbreviationLongformEuis())
                    eui2abbreui.put(longformEui, currentEntry.getEui());
                for (String longformEui : currentEntry.getAcronymLongformEuis())
                    eui2acroeui.put(longformEui, currentEntry.getEui());
                if (currentEntry.isAcronymEntry())
                    acronymEntries.put(currentEntry.getEui(), currentEntry);
                if (currentEntry.isAbbreviationEntry())
                    abbreviationEntries.put(currentEntry.getEui(), currentEntry);
            }
        }
    }

    /**
     * Tries to find a span of text in matchedText that matches the given acronym shortform.
     * Upon success, the found long form is returned.
     * <p>
     * The algorithm tries to find consecutive tokens in matchedText that either begin with or contain
     * the characters in acronym in order.
     *
     * @param acronym     A sequence of characters that represents an acronym.
     * @param matchedText Some text that might contain the long form of the acronym.
     * @return The long form of the given acronym, if it could be found, null otherwise.
     */
    @Nullable
    private String findLongform(CharSequence acronym, String matchedText) {
        String longform = null;
        String[] abbreviationAcronymTokens = matchedText.split("[\\s\\p{P}]+");
        int shortformpos = 0;
        int foundAcronymPositions = 0;
        List<Integer> acronymTokenIndices = new ArrayList<>();
        for (int i = 0; i < abbreviationAcronymTokens.length && shortformpos < acronym.length(); ++i) {
            String token = abbreviationAcronymTokens[i];
            for (int j = 0; j < token.length() && shortformpos < acronym.length(); ++j) {
                char tchar = Character.toLowerCase(token.charAt(j));
                char schar = Character.toLowerCase(acronym.charAt(shortformpos));
                // the first character of a token must be part of the acronym, else we don't accept it
                if (j == 0 && tchar != schar) {
                    shortformpos = 0;
                    foundAcronymPositions = 0;
                    acronymTokenIndices.clear();
                    continue;
                }
                if (j == 0)
                    acronymTokenIndices.add(i);
                if (tchar == schar) {
                    ++foundAcronymPositions;
                    ++shortformpos;
                }
            }
        }
        if (foundAcronymPositions == acronym.length()) {
            longform = acronymTokenIndices.stream().map(i -> abbreviationAcronymTokens[i]).collect(Collectors.joining(" "));
        }
        return longform;
    }

    private enum EntryType {ABBREVIATION, ACRONYM, SPELLING, INFLECTION, BASE, ACRONYM_LONGFORM_EUID, ABBREVIATION_LONGFORM_EUID}

    private class SpecialistEntry {
        private String base;
        private List<String> spellingVariants = Collections.emptyList();
        private List<String> inflectionVariants = Collections.emptyList();
        private List<String> abbreviations = Collections.emptyList();
        private List<String> acronyms = Collections.emptyList();
        private List<String> abbreviationLongformEuis = Collections.emptyList();
        private List<String> acronymLongformEuis = Collections.emptyList();
        private String eui;
        private String category;

        public String getEui() { return eui; }

        public void setEui(String eui) { this.eui = eui; }

        public boolean isAcronymEntry() { return !acronyms.isEmpty(); }

        public void add(String variant, EntryType type) {
            switch (type) {
                case SPELLING:
                    addSpellingVariant(variant);
                    break;
                case INFLECTION:
                    addInflectionVariant(variant);
                    break;
                case ABBREVIATION:
                    addAbbreviation(variant);
                    break;
                case ACRONYM:
                    addAcronym(variant);
                    break;
                case ABBREVIATION_LONGFORM_EUID:
                    addAbbreviationLongformEui(variant);
                    break;
                case ACRONYM_LONGFORM_EUID:
                    addAcronymLongformEui(variant);
                    break;
                case BASE:
                    base = variant;
                    break;
            }
        }

        public String getBase() { return base; }

        public boolean uniformAcronymSuffix() {
            if (acronyms.isEmpty())
                return false;
            if (acronyms.size() == 1)
                return true;
            String[] split = acronyms.get(0).split("\\s+");
            String suffix = split[split.length - 1];
            for (String acronym : acronyms) {
                if (!acronym.endsWith(suffix))
                    return false;
            }
            return true;
        }

        /**
         * Tries to find out whether the abbreviations and acronyms of the LEXICON entry denote the same or different
         * concepts via string comparison.
         * <p>
         * All pairwise combinations of the abbreviations and acronyms - after resolution of possible acronyms within
         * the strings themselves - are compared for token overlap. If at least one combination has less than half
         * overlap, the entry is deemed ambiguous.
         *
         * @return Whether the minimum pairwise token overlap is less than 0.5.
         */
        public boolean hasAmbiguousAbbreviationsAndAcronyms() {
            List<String> abbreviationsAndAcronyms = Stream.concat(abbreviations.stream(), acronyms.stream()).collect(Collectors.toList());
            // At first, we do acronym resolution. This makes some combinations of strings more similar and we can use a higher threshold.
            Pattern shortformP = Pattern.compile("[A-Z][A-Z]+");
            Set<String> shortforms = abbreviationsAndAcronyms.stream().map(shortformP::matcher).filter(Matcher::find).map(Matcher::group).collect(Collectors.toSet());
            Map<String, String> short2long = new HashMap<>();
            for (String shortform : shortforms) {
                for (String abbreviationAcronym : abbreviationsAndAcronyms) {
                    String longform = findLongform(shortform, abbreviationAcronym);
                    if (longform != null && !shortform.equals(longform))
                        short2long.put(shortform, longform);
                }
            }
            // We use the AC to find short forms in the strings and replace them by long forms.
            AhoCorasickOptimized ac = new AhoCorasickOptimized(shortforms);
            double minscore = 1;
            AhoCorasickLongestMatchCallback callback = new AhoCorasickLongestMatchCallback();
            for (String s : abbreviationsAndAcronyms) {
                // make the expanded version of s
                ac.match(s, callback);
                for (Range<Integer> shortformRange : callback.getLongestMatches().keySet()) {
                    s = s.substring(0, shortformRange.getMinimum()) + short2long.get(callback.getLongestMatches().get(shortformRange)) + s.substring(shortformRange.getMaximum() + 1);
                }
                callback.clear();
                for (String s2 : abbreviationsAndAcronyms) {
                    // make the expanded version of s2
                    ac.match(s2, callback);
                    for (Range<Integer> shortformRange : callback.getLongestMatches().keySet()) {
                        s2 = s2.substring(0, shortformRange.getMinimum()) + short2long.get(callback.getLongestMatches().get(shortformRange)) + s2.substring(shortformRange.getMaximum() + 1);
                    }
                    callback.clear();
                    // now compare s and s2
                    double score = jaccard.getScore(s, s2);
                    if (score < minscore)
                        minscore = score;
                }
            }
            return minscore < .5;
        }

        public boolean uniformAbbreviationAndAcronymSuffix() {
            List<String> abbreviationsAndAcronyms = Stream.concat(abbreviations.stream(), acronyms.stream()).collect(Collectors.toList());
            double minscore = 1;
            for (String s : abbreviationsAndAcronyms) {
                for (String s2 : abbreviationsAndAcronyms) {
                    double score = jaroWinkler.getScore(s, s2);
                    if (score < minscore)
                        minscore = score;
                }
            }
            return minscore > .7;
        }

        public boolean uniformAbbreviationSuffix() {
            if (abbreviations.isEmpty())
                return false;
            if (abbreviations.size() == 1)
                return true;
            String[] split = abbreviations.get(0).split("\\s+");
            String suffix = split[split.length - 1];
            for (String abbreviation : abbreviations) {
                if (!abbreviation.endsWith(suffix))
                    return false;
            }
            return true;
        }

        public List<String> getSpellingVariants() { return spellingVariants; }

        public List<String> getInflectionVariants() { return inflectionVariants; }

        public List<String> getAbbreviations() { return abbreviations; }

        public List<String> getAcronyms() { return acronyms; }

        public void addSpellingVariant(String variant) {
            if (spellingVariants.isEmpty()) spellingVariants = new ArrayList<>();
            spellingVariants.add(variant);
        }

        public void addInflectionVariant(String variant) {
            if (inflectionVariants.isEmpty()) inflectionVariants = new ArrayList<>();
            inflectionVariants.add(variant);
        }

        public void addAbbreviation(String variant) {
            if (abbreviations.isEmpty()) abbreviations = new ArrayList<>();
            abbreviations.add(variant);
        }

        public void addAcronym(String variant) {
            if (acronyms.isEmpty()) acronyms = new ArrayList<>();
            acronyms.add(variant);
        }

        public List<String> getAbbreviationLongformEuis() { return abbreviationLongformEuis; }

        public List<String> getAcronymLongformEuis() { return acronymLongformEuis; }

        public void addAbbreviationLongformEui(String variant) {
            if (abbreviationLongformEuis.isEmpty()) abbreviationLongformEuis = new ArrayList<>();
            abbreviationLongformEuis.add(variant);
        }

        public void addAcronymLongformEui(String variant) {
            if (acronymLongformEuis.isEmpty()) acronymLongformEuis = new ArrayList<>();
            acronymLongformEuis.add(variant);
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            SpecialistEntry that = (SpecialistEntry) o;
            return eui.equals(that.eui);
        }

        @Override
        public int hashCode() { return Objects.hash(eui); }

        public String getCategory() { return category; }

        public void setCategory(String category) { this.category = category; }

        public boolean isAbbreviationEntry() { return !abbreviations.isEmpty(); }
    }

    private class GeneGroup {
        private final String id;
        private Set<String> bases = Collections.emptySet();
        private Set<String> acronyms = Collections.emptySet();
        private Set<String> abbreviations = Collections.emptySet();
        private Set<String> abbreviationLongforms = Collections.emptySet();
        private Set<String> acronymLongforms = Collections.emptySet();
        private Set<String> spellingVariants = Collections.emptySet();
        private Set<String> inflectionVariants = Collections.emptySet();
        private Set<String> externalIds = Collections.emptySet();
        private Set<String> inputNames = Collections.emptySet();
        private Set<String> externalIdNamespaces = Collections.emptySet();
        private Set<String> connectedLexiconEntries = Collections.emptySet();

        public GeneGroup(String id) {
            this.id = id;
        }

        private Stream<String> getAllNames() {
            return Stream.concat(bases.stream(), Stream.concat(spellingVariants.stream(), Stream.concat(inflectionVariants.stream(), Stream.concat(abbreviations.stream(), acronyms.stream()))));
        }

        public Set<String> getBases() { return bases; }

        public Set<String> getAcronyms() { return acronyms; }

        public Set<String> getAbbreviations() { return abbreviations; }

        public Set<String> getSpellingVariants() { return spellingVariants; }

        public Set<String> getInputNames() { return inputNames; }

        public Set<String> getInflectionVariants() { return inflectionVariants; }

        public Set<String> getExternalIds() { return externalIds; }

        public void addConnectedLexiconEntry(String entry) {
            if (connectedLexiconEntries.isEmpty()) connectedLexiconEntries = new HashSet<>();
            connectedLexiconEntries.add(entry);
        }

        public void addExternalIdNamespace(String namespace) {
            if (externalIdNamespaces.isEmpty()) externalIdNamespaces = new HashSet<>();
            externalIdNamespaces.add(namespace);
        }

        public void addInputName(String variant) {
            if (inputNames.isEmpty()) inputNames = new HashSet<>();
            inputNames.add(variant);
        }

        public void addAbbreviation(String variant) {
            if (abbreviations.isEmpty()) abbreviations = new HashSet<>();
            abbreviations.add(variant);
        }

        public void addAcronym(String variant) {
            if (acronyms.isEmpty()) acronyms = new HashSet<>();
            acronyms.add(variant);
        }

        public void addAbbreviationLongform(String variant) {
            if (abbreviationLongforms.isEmpty()) abbreviationLongforms = new HashSet<>();
            abbreviationLongforms.add(variant);
        }

        public void addAcronymLongform(String variant) {
            if (acronymLongforms.isEmpty()) acronymLongforms = new HashSet<>();
            acronymLongforms.add(variant);
        }

        public void addSpellingVariant(String variant) {
            if (spellingVariants.isEmpty()) spellingVariants = new HashSet<>();
            spellingVariants.add(variant);
        }

        public void addInflectionVariant(String variant) {
            if (inflectionVariants.isEmpty()) inflectionVariants = new HashSet<>();
            inflectionVariants.add(variant);
        }

        public void addExternalId(String externalId) {
            if (externalIds.isEmpty()) externalIds = new HashSet<>();
            externalIds.add(externalId);
            addExternalIdNamespace(externalId.substring(0, externalId.indexOf(':')));
        }

        public void addBase(String base) {
            if (bases.isEmpty()) bases = new HashSet<>();
            bases.add(base);
        }

        public void addSpellingVariants(Collection<String> spellingVariants) {
            if (this.spellingVariants.isEmpty()) this.spellingVariants = new HashSet<>();
            this.spellingVariants.addAll(spellingVariants);
        }

        public void addInflectionVariants(Collection<String> inflectionVariants) {
            if (this.inflectionVariants.isEmpty()) this.inflectionVariants = new HashSet<>();
            this.inflectionVariants.addAll(inflectionVariants);
        }

        public String getId() { return id; }

        public boolean isCompatibleTo(GeneGroup otherGroup) {
            Set<String> thisNames = Stream.concat(bases.stream(), Stream.concat(spellingVariants.stream(), inflectionVariants.stream())).collect(Collectors.toSet());
            Set<String> otherNames = Stream.concat(otherGroup.getBases().stream(), Stream.concat(otherGroup.getSpellingVariants().stream(), otherGroup.getInflectionVariants().stream())).collect(Collectors.toSet());
            double maxScore = 0;
            for (String thisname : thisNames) {
                for (String othername : otherNames) {
                    double score = jaroWinkler.getScore(thisname, othername);
                    if (score > maxScore)
                        maxScore = score;
                }
            }
            return maxScore > .9;
        }

        public void merge(GeneGroup gg) {
            // use addBase so we never add to the immutable empty default set
            gg.getBases().forEach(this::addBase);
            addSpellingVariants(gg.getSpellingVariants());
            addInflectionVariants(gg.getInflectionVariants());
            gg.getAcronyms().forEach(this::addAcronym);
            gg.getAbbreviations().forEach(this::addAbbreviation);
        }

        public Set<String> getExternalIdNamespaces() { return externalIdNamespaces; }

        public Set<String> getConnectedLexiconEntries() { return connectedLexiconEntries; }
    }
}



