Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
de.julielab.genemapper.resources.SpecialistLexiconNameExpansion Maven / Gradle / Ivy
Go to download
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe
Gene Mapper.
package de.julielab.genemapper.resources;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import com.sun.istack.NotNull;
import de.julielab.geneexpbase.AhoCorasickLongestMatchCallback;
import de.julielab.geneexpbase.scoring.JaccardScorer;
import de.julielab.geneexpbase.scoring.JaroWinklerScorer;
import de.julielab.geneexpbase.scoring.Scorer;
import de.julielab.java.utilities.FileUtilities;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.LexiconEntry;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.Range;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
 * This class makes best efforts to connect all given family and gene group names into connected records.
 * It also extends the names with the SPECIALIST Lexicon.
 *
 * @see <a href="https://lexsrv3.nlm.nih.gov/LexSysGroup/Projects/lexicon/current/">SPECIALIST Lexicon</a>
 */
public class SpecialistLexiconNameExpansion {
private final static Logger log = LoggerFactory.getLogger(SpecialistLexiconNameExpansion.class);
// Scorers for fuzzy string comparison; used by SpecialistEntry and GeneGroup below.
private static final Scorer jaroWinkler = new JaroWinklerScorer();
private static final Scorer jaccard = new JaccardScorer();
// Names too generic to form a gene group on their own; populated in the constructor.
// NOTE(review): static but (re)assigned per instance in the constructor — the contents are
// constant so this is harmless, but a static initializer would be cleaner.
private static Set inputStopwords;
// Counter for generating the "GENO:<n>" IDs of the created gene groups.
private final AtomicInteger geneGroupIDcounter = new AtomicInteger();
// Reusable matcher for extracting digit runs; Matcher is not thread-safe —
// NOTE(review): confirm this class is only used single-threaded.
private final Matcher numfinder = Pattern.compile("[0-9]+").matcher("");
/**
 * Initializes the stopword list of names that are too generic to identify a gene
 * family/group on their own; such names are skipped when reading the input name lists.
 */
public SpecialistLexiconNameExpansion() {
    // NOTE(review): the field is static but re-assigned on every construction; since the
    // contents are constant, an immutable set avoids both accidental mutation and the
    // needless re-population of a fresh HashSet per instance.
    inputStopwords = Set.of(
            "family",
            "superfamily",
            "subfamily",
            "group",
            "factor",
            "receptor");
}
/**
 * Command line entry point.
 *
 * @param args args[0]: the SPECIALIST Lexicon XML file; args[1]: the dictionary output file;
 *             args[2..]: one or more input name list files.
 * @throws IOException        If reading or writing fails.
 * @throws XMLStreamException If the lexicon XML cannot be parsed.
 */
public static void main(String[] args) throws IOException, XMLStreamException {
    if (args.length < 3) {
        // The usage line had lost its argument placeholders (extraction garbling);
        // reconstructed from the actual argument handling below.
        System.err.println("Usage: " + SpecialistLexiconNameExpansion.class.getCanonicalName() + " <specialistLexiconXml> <dictionaryOutputFile> <inputNamesFile> [<inputNamesFile>]*");
    } else {
        // everything from the third argument on is an input name list file
        File[] inputFiles = IntStream.range(2, args.length).mapToObj(i -> args[i]).map(File::new).toArray(File[]::new);
        SpecialistLexiconNameExpansion dictCreator = new SpecialistLexiconNameExpansion();
        dictCreator.createDict(new File(args[0]), inputFiles, new File(args[1]));
    }
}
/**
 * Maps surface variants of the same name onto a common normalized form by lower-casing the
 * input and blanking out all punctuation characters.
 *
 * @param input The name to normalize.
 * @return The lower-cased input with each punctuation character replaced by a space.
 */
private String normalize(String input) {
    String lowerCased = input.toLowerCase();
    return lowerCased.replaceAll("\\p{P}", " ");
}
public void createDict(File specialistXmlLexicon, File[] inputNamesFiles, File dictionaryDestination) throws IOException, XMLStreamException {
log.info("Reading SPECIALIST Lexicon from {}, name lists from {} and writing dictionary to {}.", specialistXmlLexicon, inputNamesFiles, dictionaryDestination);
Multimap adaptedname2id = HashMultimap.create();
// euis are the IDs of entries in the SPECIALIST lexicon (Entry Unique Identifier): https://lhncbc.nlm.nih.gov/LSG/Projects/lexicon/current/docs/designDoc/UDF/lexRecord/syntax/eui.html, https://www.ncbi.nlm.nih.gov/books/NBK9680/
Multimap eui2abbreui = HashMultimap.create();
Multimap eui2acroeui = HashMultimap.create();
Map eui2entry = new HashMap<>();
Map acronymEntries = new HashMap<>();
Map abbreviationEntries = new HashMap<>();
Multimap id2inputName = HashMultimap.create();
Map adaptedName2originalName = new HashMap<>();
Multimap egsmlong2acro = HashMultimap.create();
// Index the gene group names with the ID from their source
// Also, do some name normalization and index the original names by their adapted variant
Multimap normalized2OriginalInputNames = HashMultimap.create();
for (File inputNamesFile : inputNamesFiles) {
FileUtilities.getReaderFromFile(inputNamesFile).lines().filter(Predicate.not(String::isBlank)).filter(Predicate.not(l -> l.startsWith("#"))).forEach(l -> {
String[] split = l.split("\t");
if (split.length < 3)
throw new IllegalArgumentException("Unsupported format in file " + inputNamesFile.getName() + ". Expected three columns with 'name', 'id' and 'priority'. Got: " + l);
String originalName = split[0];
String name = adaptName(originalName);
adaptedName2originalName.put(name, originalName);
if (!inputStopwords.contains(name)) {
String id = split[1];
adaptedname2id.put(name, id);
String normalizedName = normalize(name);
if (!normalized2OriginalInputNames.containsKey(normalizedName) && !normalized2OriginalInputNames.get(normalizedName).contains(l)) {
normalized2OriginalInputNames.put(normalizedName, name);
}
id2inputName.put(id, name);
}
});
}
log.info("Got {} input names to check for connections amongst each other and with the SPECIALIST Lexicon", adaptedName2originalName.size());
int i = 0;
int numNames;
Map> names2entries;
do {
numNames = normalized2OriginalInputNames.size();
names2entries = findEntriesForInputNames(specialistXmlLexicon, normalized2OriginalInputNames, eui2entry, eui2abbreui, eui2acroeui, acronymEntries, abbreviationEntries);
extendNamesWithLexiconItems(adaptedname2id, normalized2OriginalInputNames, names2entries);
++i;
} while (normalized2OriginalInputNames.size() != numNames);
log.info("After {} iterations of name expansion, a final number of {} names was identified.", i, numNames);
// Iterate over the externally given records by unique ID.
// For each ID, get the names associated with it and look for matches in the LEXICON.
// Create GeneGroups out of the external IDs, enriched by the LEXICON entries.
// When multiple external resources are used, there is currently no attempt to merge the different sources.
// Thus, when two sources would include AMPK, there would be two output records for it.
List geneGroups = new ArrayList<>();
Multimap lexiconEui2genegroup = HashMultimap.create();
for (String externalId : id2inputName.keySet()) {
boolean lexiconEntryFoundForId = false;
GeneGroup genegroup4externalId = null;
// for each name for this ID, look for matching LEXICON entries
for (String externalInputName : id2inputName.get(externalId)) {
Multimap lexiconNames = names2entries.get(externalInputName);
if (lexiconNames != null) {
lexiconEntryFoundForId = true;
Set alreadySeenLexiconEntries = new HashSet<>();
for (SpecialistEntry lexiconEntry : lexiconNames.values()) {
// omit LEXICON entries with very diverse abbreviations and acronyms (e.g. 'APC' has diverse long forms); we have no facility
// for disambiguation
if (!lexiconEntry.hasAmbiguousAbbreviationsAndAcronyms() && alreadySeenLexiconEntries.add(lexiconEntry.getEui())) {
// collect the non-ambiguous LEXICON entries connected by acronym or abbreviation relations for the current external entity name
// Since we exclude ambiguous entries, the hope is that all the found entries will actually be related to the input concept.
Set networkNodes = new HashSet<>();
networkNodes.add(lexiconEntry);
int networkSize;
do {
networkSize = networkNodes.size();
// follow the abbreviation and acronym links to other LEXICON entries
lexiconEntry.getAbbreviationLongformEuis().stream().map(eui2entry::get).filter(Objects::nonNull).filter(Predicate.not(SpecialistEntry::hasAmbiguousAbbreviationsAndAcronyms)).forEach(networkNodes::add);
lexiconEntry.getAcronymLongformEuis().stream().map(eui2entry::get).filter(Objects::nonNull).filter(Predicate.not(SpecialistEntry::hasAmbiguousAbbreviationsAndAcronyms)).forEach(networkNodes::add);
} while (networkSize < networkNodes.size());
// We would like to have a short base name which can be used as an identifier in applications.
// Sort the base names by length. The first will be used as the GeneGroup base, the
// others will be variants.
List basesByLength = networkNodes.stream().map(SpecialistEntry::getBase).sorted(Comparator.comparingInt(String::length)).collect(Collectors.toList());
GeneGroup gg;
if (genegroup4externalId != null) {
gg = genegroup4externalId;
} else {
gg = new GeneGroup("GENO:" + geneGroupIDcounter.incrementAndGet());
geneGroups.add(gg);
}
gg.addExternalId(externalId);
gg.addInputName(externalInputName);
gg.addBase(basesByLength.get(0));
IntStream.range(1, basesByLength.size()).mapToObj(basesByLength::get).forEach(gg::addSpellingVariant);
networkNodes.stream().map(SpecialistEntry::getAbbreviations).flatMap(Collection::stream).forEach(gg::addAbbreviation);
networkNodes.stream().map(SpecialistEntry::getAcronyms).flatMap(Collection::stream).forEach(gg::addAcronym);
networkNodes.stream().map(SpecialistEntry::getInflectionVariants).flatMap(Collection::stream).forEach(gg::addInflectionVariant);
networkNodes.stream().map(SpecialistEntry::getSpellingVariants).flatMap(Collection::stream).forEach(gg::addSpellingVariant);
networkNodes.stream().map(SpecialistEntry::getEui).forEach(gg::addConnectedLexiconEntry);
networkNodes.forEach(n -> lexiconEui2genegroup.put(n.getEui(), gg));
genegroup4externalId = gg;
}
}
}
}
if (!lexiconEntryFoundForId) {
// Un-enriched GeneGroup because we could not find a LEXICON entry for the names of this external ID.
GeneGroup gg = new GeneGroup("GENO:" + geneGroupIDcounter.incrementAndGet());
gg.addBase(externalId.substring(externalId.indexOf(':') + 1));
gg.addExternalId(externalId);
id2inputName.get(externalId).forEach(gg::addInputName);
geneGroups.add(gg);
}
}
// add potentially missed input names to the according gene groups
for (GeneGroup gg : geneGroups) {
for (String externalId : gg.getExternalIds()) {
for (String name : id2inputName.get(externalId))
gg.addInputName(name);
}
}
// That's it. Write the output files and be done.
log.info("Writing {} connected gene groups to {}", geneGroups.size(), dictionaryDestination);
writeGeneGroups(dictionaryDestination, geneGroups, adaptedname2id, names2entries);
log.info("Writing a dictionary of all gene group names to familyrecords.dict");
try (BufferedWriter bw = FileUtilities.getWriterToFile(new File("familyrecords.dict"))) {
for (GeneGroup gg : geneGroups) {
Set allnames = gg.getAllNames().collect(Collectors.toSet());
String id = gg.getId();
boolean firstBaseEncountered = false;
for (String name : allnames) {
String prio = "2";
// make the first encountered base the "symbol" and all others a "synonym"
if (gg.bases.contains(name) && !firstBaseEncountered) {
prio = "-1";
firstBaseEncountered = true;
}
bw.write(name + "\t" + id + "\t" + prio);
bw.newLine();
}
}
}
log.info("Done.");
}
/**
 * Strips a trailing 'family' suffix word so that the name matches dictionary entries better.
 *
 * @param originalName The name as read from the input file.
 * @return The name without a trailing " family" suffix.
 */
@NotNull
public String adaptName(String originalName) {
    String adapted = originalName.replaceAll(" family$", "");
    return adapted;
}
/**
 * Tests whether two strings agree on the numbers they contain. Two strings are considered
 * number compatible if neither contains any digits or if the sets of digit runs found in
 * both strings are equal.
 *
 * @param t1 The first string.
 * @param t2 The second string.
 * @return True if both strings contain the same set of numbers or no numbers at all.
 */
public boolean numberCompatible(String t1, String t2) {
    numfinder.reset(t1);
    Set<String> number1 = new HashSet<>();
    while (numfinder.find())
        number1.add(numfinder.group());
    numfinder.reset(t2);
    Set<String> number2 = new HashSet<>();
    while (numfinder.find())
        // bug fix: the numbers of t2 were previously added to number1, leaving number2
        // always empty and the comparison below meaningless
        number2.add(numfinder.group());
    return (number1.isEmpty() && number2.isEmpty()) || number1.equals(number2);
}
public void writeGeneGroups(File dictionaryDestination, List geneGroups, Multimap name2id, Map> names2entries) throws IOException {
try (BufferedWriter bw = FileUtilities.getWriterToFile(dictionaryDestination)) {
bw.write("# Created " + new Date());
bw.newLine();
for (GeneGroup group : geneGroups) {
bw.write(group.getId());
bw.newLine();
bw.write("bases:\t" + group.getBases().stream().sorted().collect(Collectors.joining(", ")));
bw.newLine();
bw.write("inflections:\t" + group.getInflectionVariants().stream().sorted().collect(Collectors.joining(", ")));
bw.newLine();
bw.write("spellings:\t" + group.getSpellingVariants().stream().sorted().collect(Collectors.joining(", ")));
bw.newLine();
bw.write("acronyms:\t" + group.getAcronyms().stream().sorted().collect(Collectors.joining(", ")));
bw.newLine();
bw.write("abbreviations:\t" + group.getAbbreviations().stream().sorted().collect(Collectors.joining(", ")));
bw.newLine();
bw.write("externalids:\t" + group.getExternalIds().stream().sorted().collect(Collectors.joining(", ")));
bw.newLine();
bw.write("inputnames:\t" + group.getInputNames().stream().sorted().collect(Collectors.joining(", ")));
bw.newLine();
bw.write("lexiconEuis:\t" + group.getConnectedLexiconEntries().stream().sorted().collect(Collectors.joining(", ")));
bw.newLine();
bw.newLine();
}
}
}
public void extendNamesWithLexiconItems(Multimap name2id, Multimap normalized2OriginalInputNames, Map> names2entries) {
for (String inputName : names2entries.keySet()) {
Multimap specialistEntries = names2entries.get(inputName);
Collection ids = name2id.get(inputName);
for (EntryType type : specialistEntries.keySet()) {
Collection entries4type = specialistEntries.get(type);
switch (type) {
case ABBREVIATION:
for (SpecialistEntry e : entries4type) {
for (String id : ids) {
name2id.put(e.getBase(), id);
if (e.uniformAbbreviationAndAcronymSuffix()) {
e.getInflectionVariants().forEach(var -> name2id.put(var, id));
e.getSpellingVariants().forEach(var -> name2id.put(var, id));
e.getAbbreviations().forEach(abbr -> name2id.put(abbr, id));
}
}
}
break;
case ACRONYM:
for (SpecialistEntry e : entries4type) {
for (String id : ids) {
name2id.put(id, e.getBase());
if (e.uniformAbbreviationAndAcronymSuffix()) {
e.getInflectionVariants().forEach(var -> name2id.put(var, id));
e.getSpellingVariants().forEach(var -> name2id.put(var, id));
e.getAcronyms().forEach(acr -> name2id.put(acr, id));
}
}
}
break;
case SPELLING:
case INFLECTION:
for (SpecialistEntry e : entries4type) {
for (String id : ids) {
name2id.put(e.getBase(), id);
e.getInflectionVariants().forEach(var -> name2id.put(var, id));
e.getSpellingVariants().forEach(var -> name2id.put(var, id));
}
}
break;
case BASE:
for (SpecialistEntry e : entries4type) {
if (!e.isAcronymEntry()) {
for (String id : ids) {
name2id.put(e.getBase(), id);
e.getInflectionVariants().forEach(var -> name2id.put(var, id));
e.getSpellingVariants().forEach(var -> name2id.put(var, id));
}
} else if (e.uniformAbbreviationSuffix()) {
for (String id : ids) {
e.getInflectionVariants().forEach(var -> name2id.put(var, id));
e.getSpellingVariants().forEach(var -> name2id.put(var, id));
e.getAbbreviations().forEach(abbr -> name2id.put(abbr, id));
}
} else if (e.uniformAcronymSuffix()) {
for (String id : ids) {
e.getInflectionVariants().forEach(var -> name2id.put(var, id));
e.getSpellingVariants().forEach(var -> name2id.put(var, id));
e.getAcronyms().forEach(acr -> name2id.put(acr, id));
}
}
}
break;
default:
throw new IllegalArgumentException("Unknown Specialist Entry type " + type);
}
}
}
for (String newname : name2id.keySet()) {
String normalizedNewname = normalize(newname);
normalized2OriginalInputNames.put(normalizedNewname, newname);
}
}
public Map> findEntriesForInputNames(File specialistXmlLexicon, Multimap normalized2OriginalInputNames, Map eui2entry, Multimap eui2abbreui, Multimap eui2acroeui, Map acronymEntries, Map abbreviationEntries) throws IOException, XMLStreamException {
Map> names2entries = new HashMap<>();
try (BufferedReader br = FileUtilities.getReaderFromFile(specialistXmlLexicon)) {
XMLStreamReader reader = XMLInputFactory.newFactory().createXMLStreamReader(br);
SpecialistEntry currentEntry = null;
Multimap exactInputs4CurrentEntry = HashMultimap.create();
Multimap normalizedInputs4CurrentEntry = HashMultimap.create();
while (reader.hasNext()) {
int eventType = reader.next();
EntryType entryType = null;
if (eventType == XMLStreamConstants.END_ELEMENT) {
String elementName = reader.getName().getLocalPart();
if (elementName.equals("lexRecord")) {
storeLexiconRecord(eui2entry, eui2abbreui, eui2acroeui, acronymEntries, abbreviationEntries, names2entries, currentEntry, exactInputs4CurrentEntry, normalizedInputs4CurrentEntry);
exactInputs4CurrentEntry.clear();
normalizedInputs4CurrentEntry.clear();
}
} else if (eventType == XMLStreamConstants.START_ELEMENT) {
String elementName = reader.getName().getLocalPart();
String elementText = null;
EntryType refType = null;
switch (elementName) {
case "lexRecord": {
// storeLexiconRecord(eui2abbreui, eui2acroeui, acronymEntries, abbreviationEntries, names2entries, currentEntry, exactInputs4CurrentEntry, normalizedInputs4CurrentEntry);
currentEntry = new SpecialistEntry();
// exactInputs4CurrentEntry.clear();
// normalizedInputs4CurrentEntry.clear();
break;
}
case "acronyms":
entryType = EntryType.ACRONYM;
refType = EntryType.ACRONYM_LONGFORM_EUID;
case "abbreviations":
if (entryType == null)
entryType = EntryType.ABBREVIATION;
if (refType == null)
refType = EntryType.ABBREVIATION_LONGFORM_EUID;
String originalElementText = reader.getElementText();
// for acronyms and abbreviations, the term ID for which the acronym/abbreviations stands
// is separated by a pipe character
String[] split = originalElementText.split("\\|");
elementText = split[0];
String longformEntryEui = split.length > 1 ? split[1] : "";
currentEntry.add(elementText, entryType);
currentEntry.add(longformEntryEui, refType);
String normalizedElementText = normalize(elementText);
if (normalized2OriginalInputNames.containsKey(normalizedElementText)) {
Collection originalInputNames = normalized2OriginalInputNames.get(normalizedElementText);
for (String originalInputName : originalInputNames)
if (originalInputName.equals(elementText))
exactInputs4CurrentEntry.put(originalInputName, entryType);
else
normalizedInputs4CurrentEntry.put(originalInputName, entryType);
}
break;
case "base":
entryType = EntryType.BASE;
case "spellingVars":
if (entryType == null)
entryType = EntryType.SPELLING;
case "inflVars":
if (entryType == null)
entryType = EntryType.INFLECTION;
if (elementText == null)
elementText = reader.getElementText();
currentEntry.add(elementText, entryType);
normalizedElementText = normalize(elementText);
if (normalized2OriginalInputNames.containsKey(normalizedElementText)) {
Collection originalInputNames = normalized2OriginalInputNames.get(normalizedElementText);
for (String originalInputName : originalInputNames)
if (originalInputName.equals(elementText))
exactInputs4CurrentEntry.put(originalInputName, entryType);
else
normalizedInputs4CurrentEntry.put(originalInputName, entryType);
}
break;
case "cat":
currentEntry.setCategory(reader.getElementText());
break;
case "eui":
currentEntry.setEui(reader.getElementText());
break;
}
}
}
}
return names2entries;
}
private void storeLexiconRecord(Map eui2entry, Multimap eui2abbreui, Multimap eui2acroeui, Map acronymEntries, Map abbreviationEntries, Map> names2entries, SpecialistEntry currentEntry, Multimap exactInputs4CurrentEntry, Multimap normalizedInputs4CurrentEntry) {
if (currentEntry != null && currentEntry.getCategory().equals("noun")) {
eui2entry.put(currentEntry.getEui(), currentEntry);
for (Map.Entry p : exactInputs4CurrentEntry.entries()) {
String name = p.getKey();
Multimap type2entries = names2entries.compute(name, (k, v) -> v != null ? v : HashMultimap.create());
type2entries.put(p.getValue(), currentEntry);
}
for (Map.Entry p : normalizedInputs4CurrentEntry.entries()) {
String name = p.getKey();
if (!names2entries.containsKey(name)) {
Multimap type2entries = names2entries.compute(name, (k, v) -> v != null ? v : HashMultimap.create());
type2entries.put(p.getValue(), currentEntry);
}
}
if (!exactInputs4CurrentEntry.isEmpty() || !normalizedInputs4CurrentEntry.isEmpty()) {
for (String longformEui : currentEntry.getAbbreviationLongformEuis())
eui2abbreui.put(longformEui, currentEntry.getEui());
for (String longformEui : currentEntry.getAcronymLongformEuis())
eui2acroeui.put(longformEui, currentEntry.getEui());
if (currentEntry.isAcronymEntry())
acronymEntries.put(currentEntry.getEui(), currentEntry);
if (currentEntry.isAbbreviationEntry())
abbreviationEntries.put(currentEntry.getEui(), currentEntry);
}
}
}
/**
 * Tries to find a span of text in matchedText that matches the given acronym short form.
 * Upon success, the found long form is returned.
 *
 * The algorithm tries to find consecutive tokens in matchedText that either begin with or
 * contain the characters of the acronym in order; the first character of each contributing
 * token must itself be part of the acronym.
 *
 * @param acronym     A sequence of characters that represent an acronym.
 * @param matchedText Some text that might contain the long form of the acronym.
 * @return The long form of the given acronym, if it could be found, null otherwise.
 */
@Nullable
private String findLongform(CharSequence acronym, String matchedText) {
    String longform = null;
    // tokenize on whitespace and punctuation
    String[] abbreviationAcronymTokens = matchedText.split("[\\s\\p{P}]+");
    int shortformpos = 0;
    int foundAcronymPositions = 0;
    // indices of the tokens that contributed at least their first character to the match
    List<Integer> acronymTokenIndices = new ArrayList<>();
    for (int i = 0; i < abbreviationAcronymTokens.length && shortformpos < acronym.length(); ++i) {
        String token = abbreviationAcronymTokens[i];
        for (int j = 0; j < token.length() && shortformpos < acronym.length(); ++j) {
            char tchar = Character.toLowerCase(token.charAt(j));
            char schar = Character.toLowerCase(acronym.charAt(shortformpos));
            // the first character of a token must be part of the acronym, else we don't accept it.
            // NOTE(review): this 'continue' resumes with the next character of the SAME token
            // after resetting the match state instead of skipping the token entirely — confirm
            // whether a rejected token should be allowed to restart a match mid-token.
            if (j == 0 && tchar != schar) {
                shortformpos = 0;
                foundAcronymPositions = 0;
                acronymTokenIndices.clear();
                continue;
            }
            if (j == 0)
                acronymTokenIndices.add(i);
            if (tchar == schar) {
                ++foundAcronymPositions;
                ++shortformpos;
            }
        }
    }
    // success only if every character of the acronym was consumed
    if (foundAcronymPositions == acronym.length()) {
        longform = acronymTokenIndices.stream().map(i -> abbreviationAcronymTokens[i]).collect(Collectors.joining(" "));
    }
    return longform;
}
// The kind of name variant a lexicon XML element contributes, plus two pseudo-types for the
// EUI references from acronyms/abbreviations to their long-form entries.
private enum EntryType {ABBREVIATION, ACRONYM, SPELLING, INFLECTION, BASE, ACRONYM_LONGFORM_EUID, ABBREVIATION_LONGFORM_EUID}
/**
 * One record ("lexRecord") of the SPECIALIST Lexicon, restricted to the parts needed here:
 * the base form, spelling and inflection variants, abbreviations and acronyms, the EUIs of
 * the long-form entries referenced by the abbreviations/acronyms, the record's own EUI and
 * its syntactic category. All variant lists are lazily initialized to save memory for the
 * many entries without variants. Identity is defined by the EUI alone.
 * (Not static: {@link #hasAmbiguousAbbreviationsAndAcronyms()} calls the outer class's
 * findLongform.)
 */
private class SpecialistEntry {
    private String base;
    private List<String> spellingVariants = Collections.emptyList();
    private List<String> inflectionVariants = Collections.emptyList();
    private List<String> abbreviations = Collections.emptyList();
    private List<String> acronyms = Collections.emptyList();
    private List<String> abbreviationLongformEuis = Collections.emptyList();
    private List<String> acronymLongformEuis = Collections.emptyList();
    private String eui;
    private String category;

    public String getEui() {
        return eui;
    }

    public void setEui(String eui) {
        this.eui = eui;
    }

    /** @return True if this entry has at least one acronym. */
    public boolean isAcronymEntry() {
        return !acronyms.isEmpty();
    }

    /**
     * Adds the given variant string to the list selected by {@code type}; for
     * {@link EntryType#BASE} the base form is set instead.
     */
    public void add(String variant, EntryType type) {
        switch (type) {
            case SPELLING:
                addSpellingVariant(variant);
                break;
            case INFLECTION:
                addInflectionVariant(variant);
                break;
            case ABBREVIATION:
                addAbbreviation(variant);
                break;
            case ACRONYM:
                addAcronym(variant);
                break;
            case ABBREVIATION_LONGFORM_EUID:
                addAbbreviationLongformEui(variant);
                break;
            case ACRONYM_LONGFORM_EUID:
                addAcronymLongformEui(variant);
                break;
            case BASE:
                base = variant;
                break;
        }
    }

    public String getBase() {
        return base;
    }

    /**
     * @return True if there is exactly one acronym or all acronyms end with the last
     * whitespace-separated token of the first acronym; false if there are no acronyms.
     */
    public boolean uniformAcronymSuffix() {
        if (acronyms.isEmpty())
            return false;
        if (acronyms.size() == 1)
            return true;
        String[] split = acronyms.get(0).split("\\s+");
        String suffix = split[split.length - 1];
        for (String acronym : acronyms) {
            if (!acronym.endsWith(suffix))
                return false;
        }
        return true;
    }

    /**
     * Tries to find out whether the abbreviations and acronyms of the LEXICON entry denote
     * the same or different concepts via string comparison. All pairwise combinations of the
     * abbreviations and acronyms - after resolution of possible acronyms within the strings
     * themselves - are compared for token overlap.
     *
     * @return True if at least one pair of (acronym-expanded) strings has a Jaccard token
     * similarity below 0.5, i.e. the entry's short forms look like they stand for different
     * concepts.
     */
    public boolean hasAmbiguousAbbreviationsAndAcronyms() {
        List<String> abbreviationsAndAcronyms = Stream.concat(abbreviations.stream(), acronyms.stream()).collect(Collectors.toList());
        // At first, we do acronym resolution. This makes some combinations of strings more similar and we can use a higher threshold.
        Pattern shortformP = Pattern.compile("[A-Z][A-Z]+");
        Set<String> shortforms = abbreviationsAndAcronyms.stream().map(shortformP::matcher).filter(Matcher::find).map(Matcher::group).collect(Collectors.toSet());
        Map<String, String> short2long = new HashMap<>();
        for (String shortform : shortforms) {
            for (String abbreviationAcronym : abbreviationsAndAcronyms) {
                String longform = findLongform(shortform, abbreviationAcronym);
                if (longform != null && !shortform.equals(longform))
                    short2long.put(shortform, longform);
            }
        }
        // We use the AC automaton to find short forms in the strings and replace them by long forms.
        AhoCorasickOptimized ac = new AhoCorasickOptimized(shortforms);
        AhoCorasickLongestMatchCallback callback = new AhoCorasickLongestMatchCallback();
        double minscore = 1;
        for (String s : abbreviationsAndAcronyms) {
            String expanded = expandShortforms(s, ac, callback, short2long);
            for (String s2 : abbreviationsAndAcronyms) {
                String expanded2 = expandShortforms(s2, ac, callback, short2long);
                double score = jaccard.getScore(expanded, expanded2);
                if (score < minscore)
                    minscore = score;
            }
        }
        return minscore < .5;
    }

    /**
     * Replaces every resolved short form found in {@code s} by its long form.
     * Bug fixes over the previous inline version: replacements are applied from right to
     * left so earlier replacements cannot shift the offsets of later matches (the match
     * ranges refer to the original string), and short forms without a resolved long form
     * are left untouched (previously the literal string "null" was inserted).
     */
    private String expandShortforms(String s, AhoCorasickOptimized ac, AhoCorasickLongestMatchCallback callback, Map<String, String> short2long) {
        ac.match(s, callback);
        List<Range<Integer>> ranges = new ArrayList<>(callback.getLongestMatches().keySet());
        ranges.sort(Comparator.comparing((Range<Integer> r) -> r.getMinimum()).reversed());
        for (Range<Integer> shortformRange : ranges) {
            String longform = short2long.get(callback.getLongestMatches().get(shortformRange));
            if (longform != null)
                s = s.substring(0, shortformRange.getMinimum()) + longform + s.substring(shortformRange.getMaximum() + 1);
        }
        callback.clear();
        return s;
    }

    /**
     * NOTE(review): despite its name this check does not inspect suffixes; it requires all
     * pairwise Jaro-Winkler similarities of the abbreviations and acronyms to exceed 0.7.
     *
     * @return True if every abbreviation/acronym pair has a Jaro-Winkler score above 0.7.
     */
    public boolean uniformAbbreviationAndAcronymSuffix() {
        List<String> abbreviationsAndAcronyms = Stream.concat(abbreviations.stream(), acronyms.stream()).collect(Collectors.toList());
        double minscore = 1;
        for (String s : abbreviationsAndAcronyms) {
            for (String s2 : abbreviationsAndAcronyms) {
                double score = jaroWinkler.getScore(s, s2);
                if (score < minscore)
                    minscore = score;
            }
        }
        return minscore > .7;
    }

    /** Analogous to {@link #uniformAcronymSuffix()}, but for the abbreviations. */
    public boolean uniformAbbreviationSuffix() {
        if (abbreviations.isEmpty())
            return false;
        if (abbreviations.size() == 1)
            return true;
        String[] split = abbreviations.get(0).split("\\s+");
        String suffix = split[split.length - 1];
        for (String abbreviation : abbreviations) {
            if (!abbreviation.endsWith(suffix))
                return false;
        }
        return true;
    }

    public List<String> getSpellingVariants() {
        return spellingVariants;
    }

    public List<String> getInflectionVariants() {
        return inflectionVariants;
    }

    public List<String> getAbbreviations() {
        return abbreviations;
    }

    public List<String> getAcronyms() {
        return acronyms;
    }

    // The addX methods replace the immutable empty default list with a real ArrayList on
    // first use (lazy initialization).
    public void addSpellingVariant(String variant) {
        if (spellingVariants.isEmpty())
            spellingVariants = new ArrayList<>();
        spellingVariants.add(variant);
    }

    public void addInflectionVariant(String variant) {
        if (inflectionVariants.isEmpty())
            inflectionVariants = new ArrayList<>();
        inflectionVariants.add(variant);
    }

    public void addAbbreviation(String variant) {
        if (abbreviations.isEmpty())
            abbreviations = new ArrayList<>();
        abbreviations.add(variant);
    }

    public void addAcronym(String variant) {
        if (acronyms.isEmpty())
            acronyms = new ArrayList<>();
        acronyms.add(variant);
    }

    public List<String> getAbbreviationLongformEuis() {
        return abbreviationLongformEuis;
    }

    public List<String> getAcronymLongformEuis() {
        return acronymLongformEuis;
    }

    public void addAbbreviationLongformEui(String variant) {
        if (abbreviationLongformEuis.isEmpty())
            abbreviationLongformEuis = new ArrayList<>();
        abbreviationLongformEuis.add(variant);
    }

    public void addAcronymLongformEui(String variant) {
        if (acronymLongformEuis.isEmpty())
            acronymLongformEuis = new ArrayList<>();
        acronymLongformEuis.add(variant);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        SpecialistEntry that = (SpecialistEntry) o;
        // Objects.equals: null-safe, the EUI may not have been set yet
        return Objects.equals(eui, that.eui);
    }

    @Override
    public int hashCode() {
        return Objects.hash(eui);
    }

    public String getCategory() {
        return category;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    /** @return True if this entry has at least one abbreviation. */
    public boolean isAbbreviationEntry() {
        return !abbreviations.isEmpty();
    }
}
private class GeneGroup {
private final String id;
private Set bases = Collections.emptySet();
private Set acronyms = Collections.emptySet();
private Set abbreviations = Collections.emptySet();
private Set abbreviationLongforms = Collections.emptySet();
private Set acronymLongforms = Collections.emptySet();
private Set spellingVariants = Collections.emptySet();
private Set inflectionVariants = Collections.emptySet();
private Set externalIds = Collections.emptySet();
private Set inputNames = Collections.emptySet();
private Set externalIdNamespaces = Collections.emptySet();
private Set connectedLexiconEntries = Collections.emptySet();
public GeneGroup(String id) {
this.id = id;
}
private Stream getAllNames() {
return Stream.concat(bases.stream(), Stream.concat(spellingVariants.stream(), Stream.concat(inflectionVariants.stream(), Stream.concat(abbreviations.stream(), acronyms.stream()))));
}
public Set getBases() {
return bases;
}
public Set getAcronyms() {
return acronyms;
}
public Set getAbbreviations() {
return abbreviations;
}
public Set getSpellingVariants() {
return spellingVariants;
}
public Set getInputNames() {
return inputNames;
}
public Set getInflectionVariants() {
return inflectionVariants;
}
public Set getExternalIds() {
return externalIds;
}
public void addConnectedLexiconEntry(String entry) {
if (connectedLexiconEntries.isEmpty())
connectedLexiconEntries = new HashSet<>();
connectedLexiconEntries.add(entry);
}
public void addExternalIdNamespace(String namespace) {
if (externalIdNamespaces.isEmpty())
externalIdNamespaces = new HashSet<>();
externalIdNamespaces.add(namespace);
}
public void addInputName(String variant) {
if (inputNames.isEmpty())
inputNames = new HashSet<>();
inputNames.add(variant);
}
public void addAbbreviation(String variant) {
if (abbreviations.isEmpty())
abbreviations = new HashSet<>();
abbreviations.add(variant);
}
public void addAcronym(String variant) {
if (acronyms.isEmpty())
acronyms = new HashSet<>();
acronyms.add(variant);
}
public void addAbbreviationLongform(String variant) {
if (abbreviationLongforms.isEmpty())
abbreviationLongforms = new HashSet<>();
abbreviationLongforms.add(variant);
}
public void addAcronymLongform(String variant) {
if (acronymLongforms.isEmpty())
acronymLongforms = new HashSet<>();
acronymLongforms.add(variant);
}
public void addSpellingVariant(String variant) {
if (spellingVariants.isEmpty())
spellingVariants = new HashSet<>();
spellingVariants.add(variant);
}
public void addInflectionVariant(String variant) {
if (inflectionVariants.isEmpty())
inflectionVariants = new HashSet<>();
inflectionVariants.add(variant);
}
public void addExternalId(String externalId) {
if (externalIds.isEmpty())
externalIds = new HashSet<>();
externalIds.add(externalId);
addExternalIdNamespace(externalId.substring(0, externalId.indexOf(':')));
}
public void addBase(String base) {
if (bases.isEmpty())
bases = new HashSet<>();
bases.add(base);
}
public void addSpellingVariants(Collection spellingVariants) {
if (this.spellingVariants.isEmpty())
this.spellingVariants = new HashSet<>();
this.spellingVariants.addAll(spellingVariants);
}
public void addInflectionVariants(Collection inflectionVariants) {
if (this.inflectionVariants.isEmpty())
this.inflectionVariants = new HashSet<>();
this.inflectionVariants.addAll(inflectionVariants);
}
public String getId() {
return id;
}
public boolean isCompatibleTo(GeneGroup otherGroup) {
Set thisNames = Stream.concat(bases.stream(), Stream.concat(spellingVariants.stream(), inflectionVariants.stream())).collect(Collectors.toSet());
Set otherNames = Stream.concat(otherGroup.getBases().stream(), Stream.concat(otherGroup.getSpellingVariants().stream(), otherGroup.getInflectionVariants().stream())).collect(Collectors.toSet());
double maxScore = 0;
for (String thisname : thisNames) {
for (String othername : otherNames) {
double score = jaroWinkler.getScore(thisname, othername);
if (score > maxScore)
maxScore = score;
}
}
boolean iscompatible = maxScore > .9;
return iscompatible;
}
public void merge(GeneGroup gg) {
bases.addAll(gg.getBases());
addSpellingVariants(gg.getSpellingVariants());
addInflectionVariants(gg.getInflectionVariants());
gg.getAcronyms().forEach(this::addAcronym);
gg.getAbbreviations().forEach(this::addAbbreviation);
}
public Set getExternalIdNamespaces() {
return externalIdNamespaces;
}
public Set getConnectedLexiconEntries() {
return connectedLexiconEntries;
}
}
}