// Source listing of de.julielab.geneexpbase.resources.SpecialistLexiconGreekDictCreator
// (artifact available for Maven / Gradle / Ivy; see the project's download page).
package de.julielab.geneexpbase.resources;

import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import de.julielab.geneexpbase.AhoCorasickLongestMatchCallback;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.lang3.Range;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;

/**
 * This class uses the SPECIALIST Lexicon in its XML format to extract words
 * that contain greek character names (alpha, beta) by coincidence.
 * The resulting dictionary is used by the {@link TermNormalizer}
 * to distinguish words that actually contain a greek symbol from words that just have some substring looking like one.
 *
 * @see https://lexsrv3.nlm.nih.gov/LexSysGroup/Projects/lexicon/current/
 */
public class SpecialistLexiconGreekDictCreator {
    private final static Logger log = LoggerFactory.getLogger(SpecialistLexiconGreekDictCreator.class);

    public static void main(String[] args) throws IOException, XMLStreamException {
        if (args.length != 2) {
            System.err.println("Usage: " + SpecialistLexiconGreekDictCreator.class.getCanonicalName() + "  ");
        } else {
            SpecialistLexiconGreekDictCreator dictCreator = new SpecialistLexiconGreekDictCreator();
            dictCreator.createDict(new File(args[0]), new File(args[1]));
        }

    }

    public void createDict(File specialistXmlLexicon, File dictionaryDestination) throws IOException, XMLStreamException {
        log.info("Reading SPECIALIST Lexicon from {} and writing dictionary to {}.", specialistXmlLexicon, dictionaryDestination);
        TermNormalizer termNormalizer = new TermNormalizer();
        final AhoCorasickLongestMatchCallback callback = new AhoCorasickLongestMatchCallback();
        Set alreadyWritten = new HashSet<>();
        try (BufferedReader br = FileUtilities.getReaderFromFile(specialistXmlLexicon); BufferedWriter bw = FileUtilities.getWriterToFile(dictionaryDestination)) {
            bw.write("# Created " + new Date());
            bw.newLine();
            XMLStreamReader reader = XMLInputFactory.newFactory().createXMLStreamReader(br);
            Set toWrite = new HashSet<>();
            boolean recordHasNonEmbeddedGreek = false;
            while (reader.hasNext()) {
                int eventType = reader.next();
                if (eventType == XMLStreamConstants.START_ELEMENT) {
                    if (recordHasNonEmbeddedGreek)
                        continue;
                    String elementName = reader.getName().getLocalPart();
                    String elementText = null;
                    switch (elementName) {
                        case "acronyms":
                        case "abbreviations":
                            elementText = reader.getElementText();
                            // for acronyms and abbreviations, the term ID for which the acronym/abbreviations stands
                            // is separated by a pipe character
                            elementText = elementText.split("\\|")[0];
                        case "base":
                        case "spellingVars":
                        case "inflVars":
                            if (elementText == null)
                                elementText = reader.getElementText();
                            if (checkTextForGreek(elementText, termNormalizer.getGreekHighLowKinaseAC(), callback) && !alreadyWritten.contains(elementText) && !elementText.matches(".*\\s.*")) {
                                toWrite.add(elementText);
                                // bw.write(elementText);
                                //bw.newLine();
                                //alreadyWritten.add(elementText);
                            } else if (!callback.getLongestMatches().isEmpty()) {
                                recordHasNonEmbeddedGreek = true;
                                toWrite.clear();
                            }

                    }
                } else if (eventType == XMLStreamConstants.END_ELEMENT) {
                    String elementName = reader.getName().getLocalPart();
                    if (elementName.equalsIgnoreCase("lexRecord")) {
                        if (!recordHasNonEmbeddedGreek) {
                            for (String entry : toWrite) {
                                bw.write(entry);
                                bw.newLine();
                            }
                        }
                        toWrite.clear();
                        recordHasNonEmbeddedGreek = false;
                    }
                }
            }
        }
        log.info("Done.");
    }

    private boolean checkTextForGreek(String elementText, AhoCorasickOptimized greekAC, AhoCorasickLongestMatchCallback callback) {
        callback.clear();
        // match on lower cased variant because the greek symbol list is completely lowercased, too
        greekAC.match(elementText.toLowerCase(), callback);
        TreeMap, String> longestMatches = callback.getLongestMatches();
        int numEmbeddedGreek = 0;
        for (Map.Entry, String> e : longestMatches.entrySet()) {
            Range range = e.getKey();
            // Now check if this match is embedded mid-word (e.g. "synthetase") or separated by punctuation or whitespace
            boolean seperatedLeft = true;
            boolean seperatedRight = true;

            if (range.getMinimum() > 0) {
                char leftChar = elementText.charAt(range.getMinimum() - 1);
                if (!(Character.isWhitespace(leftChar) || elementText.substring(range.getMinimum() - 1, range.getMinimum()).matches("\\p{P}|[0-9]")))
                    seperatedLeft = false;
            }

            if (range.getMaximum() < elementText.length() - 1) {
                char rightChar = elementText.charAt(range.getMaximum() + 1);
                if (!(Character.isWhitespace(rightChar) || elementText.substring(range.getMaximum() + 1, range.getMaximum() + 2).matches("\\p{P}|[0-9]")))
                    seperatedRight = false;
            }

            if (!seperatedLeft || !seperatedRight)
                ++numEmbeddedGreek;
        }

        return numEmbeddedGreek > 0;
    }
}