de.julielab.genemapper.resources.SpecialistLexiconNameExpansion

This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.

package de.julielab.genemapper.resources;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import com.sun.istack.NotNull;
import de.julielab.geneexpbase.AhoCorasickLongestMatchCallback;
import de.julielab.geneexpbase.scoring.JaccardScorer;
import de.julielab.geneexpbase.scoring.JaroWinklerScorer;
import de.julielab.geneexpbase.scoring.Scorer;
import de.julielab.java.utilities.FileUtilities;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.LexiconEntry;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.Range;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

/**
 * This class makes a best effort to connect all given family and gene group names into unified records.
 * It also extends the names with entries from the SPECIALIST Lexicon.
 *
 * @see <a href="https://lexsrv3.nlm.nih.gov/LexSysGroup/Projects/lexicon/current/">SPECIALIST Lexicon</a>
 */
public class SpecialistLexiconNameExpansion {
    private final static Logger log = LoggerFactory.getLogger(SpecialistLexiconNameExpansion.class);
    private static final Scorer jaroWinkler = new JaroWinklerScorer();
    private static final Scorer jaccard = new JaccardScorer();
    private static Set<String> inputStopwords;
    private final AtomicInteger geneGroupIDcounter = new AtomicInteger();
    private final Matcher numfinder = Pattern.compile("[0-9]+").matcher("");

    public SpecialistLexiconNameExpansion() {
        inputStopwords = new HashSet<>();
        inputStopwords.add("family");
        inputStopwords.add("superfamily");
        inputStopwords.add("subfamily");
        inputStopwords.add("group");
        inputStopwords.add("factor");
        inputStopwords.add("receptor");
    }

    public static void main(String[] args) throws IOException, XMLStreamException {
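        // A minimal, hypothetical invocation (file names are illustrative only):
        //   java de.julielab.genemapper.resources.SpecialistLexiconNameExpansion LEXICON.xml familyDictionary.txt groupNames.tsv
        // args[0] is the SPECIALIST Lexicon XML file, args[1] the dictionary output file, all further arguments are input name files.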
        if (args.length < 3) {
            System.err.println("Usage: " + SpecialistLexiconNameExpansion.class.getCanonicalName() + "    []* ");
        } else {
            File[] inputFiles = IntStream.range(2, args.length).mapToObj(i -> args[i]).map(File::new).toArray(File[]::new);
            SpecialistLexiconNameExpansion dictCreator = new SpecialistLexiconNameExpansion();
            dictCreator.createDict(new File(args[0]), inputFiles, new File(args[1]));
        }

    }

    private String normalize(String input) {
        return input.toLowerCase().replaceAll("\\p{P}", " ");
    }


    public void createDict(File specialistXmlLexicon, File[] inputNamesFiles, File dictionaryDestination) throws IOException, XMLStreamException {
        log.info("Reading SPECIALIST Lexicon from {}, name lists from {} and writing dictionary to {}.", specialistXmlLexicon, inputNamesFiles, dictionaryDestination);
        Multimap<String, String> adaptedname2id = HashMultimap.create();
        // euis are the IDs of entries in the SPECIALIST lexicon (Entry Unique Identifier): https://lhncbc.nlm.nih.gov/LSG/Projects/lexicon/current/docs/designDoc/UDF/lexRecord/syntax/eui.html, https://www.ncbi.nlm.nih.gov/books/NBK9680/
        Multimap<String, String> eui2abbreui = HashMultimap.create();
        Multimap<String, String> eui2acroeui = HashMultimap.create();
        Map<String, SpecialistEntry> eui2entry = new HashMap<>();
        Map<String, SpecialistEntry> acronymEntries = new HashMap<>();
        Map<String, SpecialistEntry> abbreviationEntries = new HashMap<>();
        Multimap<String, String> id2inputName = HashMultimap.create();
        Map<String, String> adaptedName2originalName = new HashMap<>();
        Multimap<String, String> egsmlong2acro = HashMultimap.create();

        // Index the gene group names with the ID from their source
        // Also, do some name normalization and index the original names by their adapted variant
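        // The expected input line format, illustrated with a made-up entry (three tab-separated columns 'name', 'id', 'priority'):
        //   AMP-activated protein kinase<TAB>SOME_SOURCE:1234<TAB>2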
        Multimap<String, String> normalized2OriginalInputNames = HashMultimap.create();
        for (File inputNamesFile : inputNamesFiles) {
            FileUtilities.getReaderFromFile(inputNamesFile).lines().filter(Predicate.not(String::isBlank)).filter(Predicate.not(l -> l.startsWith("#"))).forEach(l -> {
                String[] split = l.split("\t");
                if (split.length < 3)
                    throw new IllegalArgumentException("Unsupported format in file " + inputNamesFile.getName() + ". Expected three columns with 'name', 'id' and 'priority'. Got: " + l);
                String originalName = split[0];
                String name = adaptName(originalName);
                adaptedName2originalName.put(name, originalName);
                if (!inputStopwords.contains(name)) {
                    String id = split[1];
                    adaptedname2id.put(name, id);
                    String normalizedName = normalize(name);
                    if (!normalized2OriginalInputNames.containsKey(normalizedName) && !normalized2OriginalInputNames.get(normalizedName).contains(l)) {
                        normalized2OriginalInputNames.put(normalizedName, name);
                    }
                    id2inputName.put(id, name);
                }
            });
        }
        log.info("Got {} input names to check for connections amongst each other and with the SPECIALIST Lexicon", adaptedName2originalName.size());

        int i = 0;
        int numNames;
        Map<String, Multimap<EntryType, SpecialistEntry>> names2entries;
        do {
            numNames = normalized2OriginalInputNames.size();
            names2entries = findEntriesForInputNames(specialistXmlLexicon, normalized2OriginalInputNames, eui2entry, eui2abbreui, eui2acroeui, acronymEntries, abbreviationEntries);
            extendNamesWithLexiconItems(adaptedname2id, normalized2OriginalInputNames, names2entries);
            ++i;
        } while (normalized2OriginalInputNames.size() != numNames);
        log.info("After {} iterations of name expansion, a final number of {} names was identified.", i, numNames);

        // Iterate over the externally given records by unique ID.
        // For each ID, get the names associated with it and look for matches in the LEXICON.
        // Create GeneGroups out of the external IDs, enriched by the LEXICON entries.
        // When multiple external resources are used, there is currently no attempt to merge the different sources.
        // Thus, when two sources would include AMPK, there would be two output records for it.
        List<GeneGroup> geneGroups = new ArrayList<>();
        Multimap<String, GeneGroup> lexiconEui2genegroup = HashMultimap.create();
        for (String externalId : id2inputName.keySet()) {
            boolean lexiconEntryFoundForId = false;
            GeneGroup genegroup4externalId = null;
            // for each name for this ID, look for matching LEXICON entries
            for (String externalInputName : id2inputName.get(externalId)) {
                Multimap<EntryType, SpecialistEntry> lexiconNames = names2entries.get(externalInputName);
                if (lexiconNames != null) {
                    lexiconEntryFoundForId = true;
                    Set<String> alreadySeenLexiconEntries = new HashSet<>();
                    for (SpecialistEntry lexiconEntry : lexiconNames.values()) {
                        // omit LEXICON entries with very diverse abbreviations and acronyms (e.g. 'APC' has diverse long forms); we have no facility
                        // for disambiguation
                        if (!lexiconEntry.hasAmbiguousAbbreviationsAndAcronyms() && alreadySeenLexiconEntries.add(lexiconEntry.getEui())) {
                            // collect the non-ambiguous LEXICON entries connected by acronym or abbreviation relations for the current external entity name
                            // Since we exclude ambiguous entries, the hope is that all the found entries will actually be related to the input concept.
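                            // For illustration: starting from a hypothetical entry for 'AMP-activated protein kinase',
                            // its acronym link may lead to an entry for 'AMPK', whose own long-form links lead back,
                            // so both entries end up in the same network.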
                            Set<SpecialistEntry> networkNodes = new HashSet<>();
                            networkNodes.add(lexiconEntry);
                            int networkSize;
                            do {
                                networkSize = networkNodes.size();
                                // follow the abbreviation and acronym links to other LEXICON entries
                                lexiconEntry.getAbbreviationLongformEuis().stream().map(eui2entry::get).filter(Objects::nonNull).filter(Predicate.not(SpecialistEntry::hasAmbiguousAbbreviationsAndAcronyms)).forEach(networkNodes::add);
                                lexiconEntry.getAcronymLongformEuis().stream().map(eui2entry::get).filter(Objects::nonNull).filter(Predicate.not(SpecialistEntry::hasAmbiguousAbbreviationsAndAcronyms)).forEach(networkNodes::add);
                            } while (networkSize < networkNodes.size());
                            // We would like to have a short base name which can be used as an identifier in applications.
                            // Sort the base names by length. The first will be used as the GeneGroup base, the
                            // others will be variants.
                            List<String> basesByLength = networkNodes.stream().map(SpecialistEntry::getBase).sorted(Comparator.comparingInt(String::length)).collect(Collectors.toList());
                            GeneGroup gg;
                            if (genegroup4externalId != null) {
                                gg = genegroup4externalId;
                            } else {
                                gg = new GeneGroup("GENO:" + geneGroupIDcounter.incrementAndGet());
                                geneGroups.add(gg);
                            }
                            gg.addExternalId(externalId);
                            gg.addInputName(externalInputName);
                            gg.addBase(basesByLength.get(0));
                            IntStream.range(1, basesByLength.size()).mapToObj(basesByLength::get).forEach(gg::addSpellingVariant);
                            networkNodes.stream().map(SpecialistEntry::getAbbreviations).flatMap(Collection::stream).forEach(gg::addAbbreviation);
                            networkNodes.stream().map(SpecialistEntry::getAcronyms).flatMap(Collection::stream).forEach(gg::addAcronym);
                            networkNodes.stream().map(SpecialistEntry::getInflectionVariants).flatMap(Collection::stream).forEach(gg::addInflectionVariant);
                            networkNodes.stream().map(SpecialistEntry::getSpellingVariants).flatMap(Collection::stream).forEach(gg::addSpellingVariant);
                            networkNodes.stream().map(SpecialistEntry::getEui).forEach(gg::addConnectedLexiconEntry);
                            networkNodes.forEach(n -> lexiconEui2genegroup.put(n.getEui(), gg));
                            genegroup4externalId = gg;
                        }
                    }
                }
            }
            if (!lexiconEntryFoundForId) {
                // Un-enriched GeneGroup because we could not find a LEXICON entry for the names of this external ID.
                GeneGroup gg = new GeneGroup("GENO:" + geneGroupIDcounter.incrementAndGet());
                gg.addBase(externalId.substring(externalId.indexOf(':') + 1));
                gg.addExternalId(externalId);
                id2inputName.get(externalId).forEach(gg::addInputName);
                geneGroups.add(gg);
            }
        }
        // add potentially missed input names to the according gene groups
        for (GeneGroup gg : geneGroups) {
            for (String externalId : gg.getExternalIds()) {
                for (String name : id2inputName.get(externalId))
                    gg.addInputName(name);
            }
        }

        // That's it. Write the output files and be done.

        log.info("Writing {} connected gene groups to {}", geneGroups.size(), dictionaryDestination);
        writeGeneGroups(dictionaryDestination, geneGroups, adaptedname2id, names2entries);

        log.info("Writing a dictionary of all gene group names to familyrecords.dict");
        try (BufferedWriter bw = FileUtilities.getWriterToFile(new File("familyrecords.dict"))) {
            for (GeneGroup gg : geneGroups) {
                Set<String> allnames = gg.getAllNames().collect(Collectors.toSet());
                String id = gg.getId();
                boolean firstBaseEncountered = false;
                for (String name : allnames) {
                    String prio = "2";
                    // make the first encountered base the "symbol" and all others a "synonym"
                    if (gg.bases.contains(name) && !firstBaseEncountered) {
                        prio = "-1";
                        firstBaseEncountered = true;
                    }
                    bw.write(name + "\t" + id + "\t" + prio);
                    bw.newLine();
                }
            }
        }
        log.info("Done.");
    }

    /**
     * Removes the 'family' suffix word for better matching with the dictionary, e.g. turning "X family" into "X".
     *
     * @param originalName The name as given in the input resources.
     * @return The name without a trailing ' family'.
     */
    @NotNull
    public String adaptName(String originalName) {
        return originalName.replaceAll(" family$", "");
    }

    public boolean numberCompatible(String t1, String t2) {
        numfinder.reset(t1);
        Set<String> number1 = new HashSet<>();
        while (numfinder.find())
            number1.add(numfinder.group());
        numfinder.reset(t2);
        Set<String> number2 = new HashSet<>();
        while (numfinder.find())
            number2.add(numfinder.group());
        return (number1.isEmpty() && number2.isEmpty()) || number1.equals(number2);
    }

    public void writeGeneGroups(File dictionaryDestination, List<GeneGroup> geneGroups, Multimap<String, String> name2id, Map<String, Multimap<EntryType, SpecialistEntry>> names2entries) throws IOException {
        try (BufferedWriter bw = FileUtilities.getWriterToFile(dictionaryDestination)) {
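            // Each group is written as a small multi-line record; an illustrative, made-up example:
            //   GENO:17
            //   bases:<TAB>AMPK
            //   acronyms:<TAB>AMPK
            //   externalids:<TAB>SOME_SOURCE:1234
            // (plus the remaining fields written below, followed by an empty line)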
            bw.write("# Created " + new Date());
            bw.newLine();
            for (GeneGroup group : geneGroups) {
                bw.write(group.getId());
                bw.newLine();
                bw.write("bases:\t" + group.getBases().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("inflections:\t" + group.getInflectionVariants().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("spellings:\t" + group.getSpellingVariants().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("acronyms:\t" + group.getAcronyms().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("abbreviations:\t" + group.getAbbreviations().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("externalids:\t" + group.getExternalIds().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("inputnames:\t" + group.getInputNames().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.write("lexiconEuis:\t" + group.getConnectedLexiconEntries().stream().sorted().collect(Collectors.joining(", ")));
                bw.newLine();
                bw.newLine();
            }
        }
    }

    public void extendNamesWithLexiconItems(Multimap<String, String> name2id, Multimap<String, String> normalized2OriginalInputNames, Map<String, Multimap<EntryType, SpecialistEntry>> names2entries) {
        for (String inputName : names2entries.keySet()) {
            Multimap<EntryType, SpecialistEntry> specialistEntries = names2entries.get(inputName);
            Collection<String> ids = name2id.get(inputName);
            for (EntryType type : specialistEntries.keySet()) {
                Collection<SpecialistEntry> entries4type = specialistEntries.get(type);
                switch (type) {
                    case ABBREVIATION:
                        for (SpecialistEntry e : entries4type) {
                            for (String id : ids) {
                                name2id.put(e.getBase(), id);
                                if (e.uniformAbbreviationAndAcronymSuffix()) {
                                    e.getInflectionVariants().forEach(var -> name2id.put(var, id));
                                    e.getSpellingVariants().forEach(var -> name2id.put(var, id));
                                    e.getAbbreviations().forEach(abbr -> name2id.put(abbr, id));
                                }
                            }
                        }
                        break;
                    case ACRONYM:
                        for (SpecialistEntry e : entries4type) {
                            for (String id : ids) {
                                name2id.put(e.getBase(), id);
                                if (e.uniformAbbreviationAndAcronymSuffix()) {
                                    e.getInflectionVariants().forEach(var -> name2id.put(var, id));
                                    e.getSpellingVariants().forEach(var -> name2id.put(var, id));
                                    e.getAcronyms().forEach(acr -> name2id.put(acr, id));
                                }
                            }
                        }
                        break;
                    case SPELLING:
                    case INFLECTION:
                        for (SpecialistEntry e : entries4type) {
                            for (String id : ids) {
                                name2id.put(e.getBase(), id);
                                e.getInflectionVariants().forEach(var -> name2id.put(var, id));
                                e.getSpellingVariants().forEach(var -> name2id.put(var, id));
                            }
                        }
                        break;
                    case BASE:
                        for (SpecialistEntry e : entries4type) {
                            if (!e.isAcronymEntry()) {
                                for (String id : ids) {
                                    name2id.put(e.getBase(), id);
                                    e.getInflectionVariants().forEach(var -> name2id.put(var, id));
                                    e.getSpellingVariants().forEach(var -> name2id.put(var, id));
                                }
                            } else if (e.uniformAbbreviationSuffix()) {
                                for (String id : ids) {
                                    e.getInflectionVariants().forEach(var -> name2id.put(var, id));
                                    e.getSpellingVariants().forEach(var -> name2id.put(var, id));
                                    e.getAbbreviations().forEach(abbr -> name2id.put(abbr, id));
                                }
                            } else if (e.uniformAcronymSuffix()) {
                                for (String id : ids) {
                                    e.getInflectionVariants().forEach(var -> name2id.put(var, id));
                                    e.getSpellingVariants().forEach(var -> name2id.put(var, id));
                                    e.getAcronyms().forEach(acr -> name2id.put(acr, id));
                                }
                            }
                        }
                        break;
                    default:
                        throw new IllegalArgumentException("Unknown Specialist Entry type " + type);
                }
            }
        }
        for (String newname : name2id.keySet()) {
            String normalizedNewname = normalize(newname);
            normalized2OriginalInputNames.put(normalizedNewname, newname);
        }
    }

    public Map<String, Multimap<EntryType, SpecialistEntry>> findEntriesForInputNames(File specialistXmlLexicon, Multimap<String, String> normalized2OriginalInputNames, Map<String, SpecialistEntry> eui2entry, Multimap<String, String> eui2abbreui, Multimap<String, String> eui2acroeui, Map<String, SpecialistEntry> acronymEntries, Map<String, SpecialistEntry> abbreviationEntries) throws IOException, XMLStreamException {
        Map<String, Multimap<EntryType, SpecialistEntry>> names2entries = new HashMap<>();
        try (BufferedReader br = FileUtilities.getReaderFromFile(specialistXmlLexicon)) {

            XMLStreamReader reader = XMLInputFactory.newFactory().createXMLStreamReader(br);
            SpecialistEntry currentEntry = null;
            Multimap<String, EntryType> exactInputs4CurrentEntry = HashMultimap.create();
            Multimap<String, EntryType> normalizedInputs4CurrentEntry = HashMultimap.create();
            while (reader.hasNext()) {
                int eventType = reader.next();
                EntryType entryType = null;
                if (eventType == XMLStreamConstants.END_ELEMENT) {
                    String elementName = reader.getName().getLocalPart();
                    if (elementName.equals("lexRecord")) {
                        storeLexiconRecord(eui2entry, eui2abbreui, eui2acroeui, acronymEntries, abbreviationEntries, names2entries, currentEntry, exactInputs4CurrentEntry, normalizedInputs4CurrentEntry);
                        exactInputs4CurrentEntry.clear();
                        normalizedInputs4CurrentEntry.clear();
                    }
                } else if (eventType == XMLStreamConstants.START_ELEMENT) {
                    String elementName = reader.getName().getLocalPart();
                    String elementText = null;
                    EntryType refType = null;
                    switch (elementName) {
                        case "lexRecord": {
//                            storeLexiconRecord(eui2abbreui, eui2acroeui, acronymEntries, abbreviationEntries, names2entries, currentEntry, exactInputs4CurrentEntry, normalizedInputs4CurrentEntry);
                            currentEntry = new SpecialistEntry();
//                            exactInputs4CurrentEntry.clear();
//                            normalizedInputs4CurrentEntry.clear();
                            break;
                        }
                        case "acronyms":
                            entryType = EntryType.ACRONYM;
                            refType = EntryType.ACRONYM_LONGFORM_EUID;
                        case "abbreviations":
                            if (entryType == null)
                                entryType = EntryType.ABBREVIATION;
                            if (refType == null)
                                refType = EntryType.ABBREVIATION_LONGFORM_EUID;
                            String originalElementText = reader.getElementText();
                            // for acronyms and abbreviations, the term ID for which the acronym/abbreviations stands
                            // is separated by a pipe character
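                            // e.g. an element text of the (hypothetical) form "AMPK|E0123456": the part before the pipe is the
                            // short form, the part after it the EUI of the corresponding long-form entry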
                            String[] split = originalElementText.split("\\|");
                            elementText = split[0];
                            String longformEntryEui = split.length > 1 ? split[1] : "";
                            currentEntry.add(elementText, entryType);
                            currentEntry.add(longformEntryEui, refType);
                            String normalizedElementText = normalize(elementText);
                            if (normalized2OriginalInputNames.containsKey(normalizedElementText)) {
                                Collection<String> originalInputNames = normalized2OriginalInputNames.get(normalizedElementText);
                                for (String originalInputName : originalInputNames)
                                    if (originalInputName.equals(elementText))
                                        exactInputs4CurrentEntry.put(originalInputName, entryType);
                                    else
                                        normalizedInputs4CurrentEntry.put(originalInputName, entryType);
                            }
                            break;
                        case "base":
                            entryType = EntryType.BASE;
                        case "spellingVars":
                            if (entryType == null)
                                entryType = EntryType.SPELLING;
                        case "inflVars":
                            if (entryType == null)
                                entryType = EntryType.INFLECTION;
                            if (elementText == null)
                                elementText = reader.getElementText();
                            currentEntry.add(elementText, entryType);
                            normalizedElementText = normalize(elementText);
                            if (normalized2OriginalInputNames.containsKey(normalizedElementText)) {
                                Collection<String> originalInputNames = normalized2OriginalInputNames.get(normalizedElementText);
                                for (String originalInputName : originalInputNames)
                                    if (originalInputName.equals(elementText))
                                        exactInputs4CurrentEntry.put(originalInputName, entryType);
                                    else
                                        normalizedInputs4CurrentEntry.put(originalInputName, entryType);
                            }
                            break;
                        case "cat":
                            currentEntry.setCategory(reader.getElementText());
                            break;
                        case "eui":
                            currentEntry.setEui(reader.getElementText());
                            break;
                    }
                }
            }
        }
        return names2entries;
    }

    private void storeLexiconRecord(Map<String, SpecialistEntry> eui2entry, Multimap<String, String> eui2abbreui, Multimap<String, String> eui2acroeui, Map<String, SpecialistEntry> acronymEntries, Map<String, SpecialistEntry> abbreviationEntries, Map<String, Multimap<EntryType, SpecialistEntry>> names2entries, SpecialistEntry currentEntry, Multimap<String, EntryType> exactInputs4CurrentEntry, Multimap<String, EntryType> normalizedInputs4CurrentEntry) {
        if (currentEntry != null && currentEntry.getCategory().equals("noun")) {
            eui2entry.put(currentEntry.getEui(), currentEntry);
            for (Map.Entry<String, EntryType> p : exactInputs4CurrentEntry.entries()) {
                String name = p.getKey();
                Multimap<EntryType, SpecialistEntry> type2entries = names2entries.compute(name, (k, v) -> v != null ? v : HashMultimap.create());
                type2entries.put(p.getValue(), currentEntry);
            }
            for (Map.Entry<String, EntryType> p : normalizedInputs4CurrentEntry.entries()) {
                String name = p.getKey();
                if (!names2entries.containsKey(name)) {
                    Multimap<EntryType, SpecialistEntry> type2entries = names2entries.compute(name, (k, v) -> v != null ? v : HashMultimap.create());
                    type2entries.put(p.getValue(), currentEntry);
                }
            }
            if (!exactInputs4CurrentEntry.isEmpty() || !normalizedInputs4CurrentEntry.isEmpty()) {
                for (String longformEui : currentEntry.getAbbreviationLongformEuis())
                    eui2abbreui.put(longformEui, currentEntry.getEui());
                for (String longformEui : currentEntry.getAcronymLongformEuis())
                    eui2acroeui.put(longformEui, currentEntry.getEui());
                if (currentEntry.isAcronymEntry())
                    acronymEntries.put(currentEntry.getEui(), currentEntry);
                if (currentEntry.isAbbreviationEntry())
                    abbreviationEntries.put(currentEntry.getEui(), currentEntry);
            }
        }
    }

    /**
     * Tries to find a span of text in matchedText that matches the given acronym shortform.
     * Upon success, the found long form is returned.
     * <p>
     * The algorithm tries to find consecutive tokens in matchedText that either begin with or contain
     * the characters in acronym in order.
     *
     * @param acronym     A sequence of characters that represents an acronym.
     * @param matchedText Some text that might contain the long form of the acronym.
     * @return The long form of the given acronym, if it could be found, null otherwise.
     */
    @Nullable
    private String findLongform(CharSequence acronym, String matchedText) {
        String longform = null;
        String[] abbreviationAcronymTokens = matchedText.split("[\\s\\p{P}]+");
        int shortformpos = 0;
        int foundAcronymPositions = 0;
        List<Integer> acronymTokenIndices = new ArrayList<>();
        for (int i = 0; i < abbreviationAcronymTokens.length && shortformpos < acronym.length(); ++i) {
            String token = abbreviationAcronymTokens[i];
            for (int j = 0; j < token.length() && shortformpos < acronym.length(); ++j) {
                char tchar = Character.toLowerCase(token.charAt(j));
                char schar = Character.toLowerCase(acronym.charAt(shortformpos));
                // the first character of a token must be part of the acronym, else we don't accept it
                if (j == 0 && tchar != schar) {
                    shortformpos = 0;
                    foundAcronymPositions = 0;
                    acronymTokenIndices.clear();
                    continue;
                }
                if (j == 0)
                    acronymTokenIndices.add(i);
                if (tchar == schar) {
                    ++foundAcronymPositions;
                    ++shortformpos;
                }
            }
        }
        if (foundAcronymPositions == acronym.length()) {
            longform = acronymTokenIndices.stream().map(i -> abbreviationAcronymTokens[i]).collect(Collectors.joining(" "));
        }
        return longform;
    }

    private enum EntryType {ABBREVIATION, ACRONYM, SPELLING, INFLECTION, BASE, ACRONYM_LONGFORM_EUID, ABBREVIATION_LONGFORM_EUID}

    private class SpecialistEntry {
        private String base;
        private List<String> spellingVariants = Collections.emptyList();
        private List<String> inflectionVariants = Collections.emptyList();
        private List<String> abbreviations = Collections.emptyList();
        private List<String> acronyms = Collections.emptyList();
        private List<String> abbreviationLongformEuis = Collections.emptyList();
        private List<String> acronymLongformEuis = Collections.emptyList();
        private String eui;
        private String category;

        public String getEui() { return eui; }

        public void setEui(String eui) { this.eui = eui; }

        public boolean isAcronymEntry() { return !acronyms.isEmpty(); }

        public void add(String variant, EntryType type) {
            switch (type) {
                case SPELLING:
                    addSpellingVariant(variant);
                    break;
                case INFLECTION:
                    addInflectionVariant(variant);
                    break;
                case ABBREVIATION:
                    addAbbreviation(variant);
                    break;
                case ACRONYM:
                    addAcronym(variant);
                    break;
                case ABBREVIATION_LONGFORM_EUID:
                    addAbbreviationLongformEui(variant);
                    break;
                case ACRONYM_LONGFORM_EUID:
                    addAcronymLongformEui(variant);
                    break;
                case BASE:
                    base = variant;
                    break;
            }
        }

        public String getBase() { return base; }

        public boolean uniformAcronymSuffix() {
            if (acronyms.isEmpty())
                return false;
            if (acronyms.size() == 1)
                return true;
            String[] split = acronyms.get(0).split("\\s+");
            String suffix = split[split.length - 1];
            for (String acronym : acronyms) {
                if (!acronym.endsWith(suffix))
                    return false;
            }
            return true;
        }

        /**
         * Tries to find out whether the abbreviations and acronyms of the LEXICON entry denote the same or different
         * concepts via string comparison.
         * <p>
         * All pairwise combinations of the abbreviations and acronyms - after resolution of possible acronyms within
         * the strings themselves - are compared for token overlap. If at least one combination has less than half
         * overlap, the entry is deemed ambiguous.
         *
         * @return Whether the minimum pairwise token overlap is less than 0.5.
         */
        public boolean hasAmbiguousAbbreviationsAndAcronyms() {
            List<String> abbreviationsAndAcronyms = Stream.concat(abbreviations.stream(), acronyms.stream()).collect(Collectors.toList());
            // At first, we do acronym resolution. This makes some combinations of strings more similar and we can use a higher threshold.
            Pattern shortformP = Pattern.compile("[A-Z][A-Z]+");
            Set<String> shortforms = abbreviationsAndAcronyms.stream().map(shortformP::matcher).filter(Matcher::find).map(Matcher::group).collect(Collectors.toSet());
            Map<String, String> short2long = new HashMap<>();
            for (String shortform : shortforms) {
                for (String abbreviationAcronym : abbreviationsAndAcronyms) {
                    String longform = findLongform(shortform, abbreviationAcronym);
                    if (longform != null && !shortform.equals(longform))
                        short2long.put(shortform, longform);
                }
            }
            // We use the AC to find short forms in the strings and replace them by long forms.
            AhoCorasickOptimized ac = new AhoCorasickOptimized(shortforms);
            double minscore = 1;
            AhoCorasickLongestMatchCallback callback = new AhoCorasickLongestMatchCallback();
            for (String s : abbreviationsAndAcronyms) {
                // make the expanded version of s
                ac.match(s, callback);
                for (Range<Integer> shortformRange : callback.getLongestMatches().keySet()) {
                    s = s.substring(0, shortformRange.getMinimum()) + short2long.get(callback.getLongestMatches().get(shortformRange)) + s.substring(shortformRange.getMaximum() + 1);
                }
                callback.clear();
                for (String s2 : abbreviationsAndAcronyms) {
                    // make the expanded version of s2
                    ac.match(s2, callback);
                    for (Range<Integer> shortformRange : callback.getLongestMatches().keySet()) {
                        s2 = s2.substring(0, shortformRange.getMinimum()) + short2long.get(callback.getLongestMatches().get(shortformRange)) + s2.substring(shortformRange.getMaximum() + 1);
                    }
                    callback.clear();
                    // now compare s and s2
                    double score = jaccard.getScore(s, s2);
                    if (score < minscore)
                        minscore = score;
                }
            }
            return minscore < .5;
        }

        public boolean uniformAbbreviationAndAcronymSuffix() {
            List<String> abbreviationsAndAcronyms = Stream.concat(abbreviations.stream(), acronyms.stream()).collect(Collectors.toList());
            double minscore = 1;
            for (String s : abbreviationsAndAcronyms) {
                for (String s2 : abbreviationsAndAcronyms) {
                    double score = jaroWinkler.getScore(s, s2);
                    if (score < minscore)
                        minscore = score;
                }
            }
            return minscore > .7;
        }

        public boolean uniformAbbreviationSuffix() {
            if (abbreviations.isEmpty())
                return false;
            if (abbreviations.size() == 1)
                return true;
            String[] split = abbreviations.get(0).split("\\s+");
            String suffix = split[split.length - 1];
            for (String abbreviation : abbreviations) {
                if (!abbreviation.endsWith(suffix))
                    return false;
            }
            return true;
        }

        public List<String> getSpellingVariants() { return spellingVariants; }

        public List<String> getInflectionVariants() { return inflectionVariants; }

        public List<String> getAbbreviations() { return abbreviations; }

        public List<String> getAcronyms() { return acronyms; }

        public void addSpellingVariant(String variant) {
            if (spellingVariants.isEmpty()) spellingVariants = new ArrayList<>();
            spellingVariants.add(variant);
        }

        public void addInflectionVariant(String variant) {
            if (inflectionVariants.isEmpty()) inflectionVariants = new ArrayList<>();
            inflectionVariants.add(variant);
        }

        public void addAbbreviation(String variant) {
            if (abbreviations.isEmpty()) abbreviations = new ArrayList<>();
            abbreviations.add(variant);
        }

        public void addAcronym(String variant) {
            if (acronyms.isEmpty()) acronyms = new ArrayList<>();
            acronyms.add(variant);
        }

        public List<String> getAbbreviationLongformEuis() { return abbreviationLongformEuis; }

        public List<String> getAcronymLongformEuis() { return acronymLongformEuis; }

        public void addAbbreviationLongformEui(String variant) {
            if (abbreviationLongformEuis.isEmpty()) abbreviationLongformEuis = new ArrayList<>();
            abbreviationLongformEuis.add(variant);
        }

        public void addAcronymLongformEui(String variant) {
            if (acronymLongformEuis.isEmpty()) acronymLongformEuis = new ArrayList<>();
            acronymLongformEuis.add(variant);
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            SpecialistEntry that = (SpecialistEntry) o;
            return eui.equals(that.eui);
        }

        @Override
        public int hashCode() { return Objects.hash(eui); }

        public String getCategory() { return category; }

        public void setCategory(String category) { this.category = category; }

        public boolean isAbbreviationEntry() { return !abbreviations.isEmpty(); }
    }

    private class GeneGroup {
        private final String id;
        private Set<String> bases = Collections.emptySet();
        private Set<String> acronyms = Collections.emptySet();
        private Set<String> abbreviations = Collections.emptySet();
        private Set<String> abbreviationLongforms = Collections.emptySet();
        private Set<String> acronymLongforms = Collections.emptySet();
        private Set<String> spellingVariants = Collections.emptySet();
        private Set<String> inflectionVariants = Collections.emptySet();
        private Set<String> externalIds = Collections.emptySet();
        private Set<String> inputNames = Collections.emptySet();
        private Set<String> externalIdNamespaces = Collections.emptySet();
        private Set<String> connectedLexiconEntries = Collections.emptySet();

        public GeneGroup(String id) {
            this.id = id;
        }

        private Stream<String> getAllNames() {
            return Stream.concat(bases.stream(), Stream.concat(spellingVariants.stream(), Stream.concat(inflectionVariants.stream(), Stream.concat(abbreviations.stream(), acronyms.stream()))));
        }

        public Set<String> getBases() { return bases; }

        public Set<String> getAcronyms() { return acronyms; }

        public Set<String> getAbbreviations() { return abbreviations; }

        public Set<String> getSpellingVariants() { return spellingVariants; }

        public Set<String> getInputNames() { return inputNames; }

        public Set<String> getInflectionVariants() { return inflectionVariants; }

        public Set<String> getExternalIds() { return externalIds; }

        public void addConnectedLexiconEntry(String entry) {
            if (connectedLexiconEntries.isEmpty()) connectedLexiconEntries = new HashSet<>();
            connectedLexiconEntries.add(entry);
        }

        public void addExternalIdNamespace(String namespace) {
            if (externalIdNamespaces.isEmpty()) externalIdNamespaces = new HashSet<>();
            externalIdNamespaces.add(namespace);
        }

        public void addInputName(String variant) {
            if (inputNames.isEmpty()) inputNames = new HashSet<>();
            inputNames.add(variant);
        }

        public void addAbbreviation(String variant) {
            if (abbreviations.isEmpty()) abbreviations = new HashSet<>();
            abbreviations.add(variant);
        }

        public void addAcronym(String variant) {
            if (acronyms.isEmpty()) acronyms = new HashSet<>();
            acronyms.add(variant);
        }

        public void addAbbreviationLongform(String variant) {
            if (abbreviationLongforms.isEmpty()) abbreviationLongforms = new HashSet<>();
            abbreviationLongforms.add(variant);
        }

        public void addAcronymLongform(String variant) {
            if (acronymLongforms.isEmpty()) acronymLongforms = new HashSet<>();
            acronymLongforms.add(variant);
        }

        public void addSpellingVariant(String variant) {
            if (spellingVariants.isEmpty()) spellingVariants = new HashSet<>();
            spellingVariants.add(variant);
        }

        public void addInflectionVariant(String variant) {
            if (inflectionVariants.isEmpty()) inflectionVariants = new HashSet<>();
            inflectionVariants.add(variant);
        }

        public void addExternalId(String externalId) {
            if (externalIds.isEmpty()) externalIds = new HashSet<>();
            externalIds.add(externalId);
            addExternalIdNamespace(externalId.substring(0, externalId.indexOf(':')));
        }

        public void addBase(String base) {
            if (bases.isEmpty()) bases = new HashSet<>();
            bases.add(base);
        }

        public void addSpellingVariants(Collection<String> spellingVariants) {
            if (this.spellingVariants.isEmpty()) this.spellingVariants = new HashSet<>();
            this.spellingVariants.addAll(spellingVariants);
        }

        public void addInflectionVariants(Collection<String> inflectionVariants) {
            if (this.inflectionVariants.isEmpty()) this.inflectionVariants = new HashSet<>();
            this.inflectionVariants.addAll(inflectionVariants);
        }

        public String getId() { return id; }

        public boolean isCompatibleTo(GeneGroup otherGroup) {
            Set<String> thisNames = Stream.concat(bases.stream(), Stream.concat(spellingVariants.stream(), inflectionVariants.stream())).collect(Collectors.toSet());
            Set<String> otherNames = Stream.concat(otherGroup.getBases().stream(), Stream.concat(otherGroup.getSpellingVariants().stream(), otherGroup.getInflectionVariants().stream())).collect(Collectors.toSet());
            double maxScore = 0;
            for (String thisname : thisNames) {
                for (String othername : otherNames) {
                    double score = jaroWinkler.getScore(thisname, othername);
                    if (score > maxScore)
                        maxScore = score;
                }
            }
            return maxScore > .9;
        }

        public void merge(GeneGroup gg) {
            // use addBase so we never add to the immutable empty default set
            gg.getBases().forEach(this::addBase);
            addSpellingVariants(gg.getSpellingVariants());
            addInflectionVariants(gg.getInflectionVariants());
            gg.getAcronyms().forEach(this::addAcronym);
            gg.getAbbreviations().forEach(this::addAbbreviation);
        }

        public Set<String> getExternalIdNamespaces() { return externalIdNamespaces; }

        public Set<String> getConnectedLexiconEntries() { return connectedLexiconEntries; }
    }
}



