All downloads are free. Search and download functionality uses the official Maven repository.

de.julielab.geneexpbase.resources.SpecialistLexiconGreekDictCreator Maven / Gradle / Ivy

package de.julielab.geneexpbase.resources;

import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import de.julielab.geneexpbase.AhoCorasickLongestMatchCallback;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.lang3.Range;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;

/**
 * 

This class uses the SPECIALIST Lexicon in its XML format to extract words * that contain greek character names (alpha, beta) by coincidence.

*

The resulting dictionary is used by the {@link TermNormalizer} * to distinguish words that actually contain a greek symbol from words that just have some substring looking like one.

* * @see https://lexsrv3.nlm.nih.gov/LexSysGroup/Projects/lexicon/current/ */ public class SpecialistLexiconGreekDictCreator { private final static Logger log = LoggerFactory.getLogger(SpecialistLexiconGreekDictCreator.class); public static void main(String[] args) throws IOException, XMLStreamException { if (args.length != 2) { System.err.println("Usage: " + SpecialistLexiconGreekDictCreator.class.getCanonicalName() + " "); } else { SpecialistLexiconGreekDictCreator dictCreator = new SpecialistLexiconGreekDictCreator(); dictCreator.createDict(new File(args[0]), new File(args[1])); } } public void createDict(File specialistXmlLexicon, File dictionaryDestination) throws IOException, XMLStreamException { log.info("Reading SPECIALIST Lexicon from {} and writing dictionary to {}.", specialistXmlLexicon, dictionaryDestination); TermNormalizer termNormalizer = new TermNormalizer(); final AhoCorasickLongestMatchCallback callback = new AhoCorasickLongestMatchCallback(); Set alreadyWritten = new HashSet<>(); try (BufferedReader br = FileUtilities.getReaderFromFile(specialistXmlLexicon); BufferedWriter bw = FileUtilities.getWriterToFile(dictionaryDestination)) { bw.write("# Created " + new Date()); bw.newLine(); XMLStreamReader reader = XMLInputFactory.newFactory().createXMLStreamReader(br); Set toWrite = new HashSet<>(); boolean recordHasNonEmbeddedGreek = false; while (reader.hasNext()) { int eventType = reader.next(); if (eventType == XMLStreamConstants.START_ELEMENT) { if (recordHasNonEmbeddedGreek) continue; String elementName = reader.getName().getLocalPart(); String elementText = null; switch (elementName) { case "acronyms": case "abbreviations": elementText = reader.getElementText(); // for acronyms and abbreviations, the term ID for which the acronym/abbreviations stands // is separated by a pipe character elementText = elementText.split("\\|")[0]; case "base": case "spellingVars": case "inflVars": if (elementText == null) elementText = reader.getElementText(); if 
(checkTextForGreek(elementText, termNormalizer.getGreekHighLowKinaseAC(), callback) && !alreadyWritten.contains(elementText) && !elementText.matches(".*\\s.*")) { toWrite.add(elementText); // bw.write(elementText); //bw.newLine(); //alreadyWritten.add(elementText); } else if (!callback.getLongestMatches().isEmpty()) { recordHasNonEmbeddedGreek = true; toWrite.clear(); } } } else if (eventType == XMLStreamConstants.END_ELEMENT) { String elementName = reader.getName().getLocalPart(); if (elementName.equalsIgnoreCase("lexRecord")) { if (!recordHasNonEmbeddedGreek) { for (String entry : toWrite) { bw.write(entry); bw.newLine(); } } toWrite.clear(); recordHasNonEmbeddedGreek = false; } } } } log.info("Done."); } private boolean checkTextForGreek(String elementText, AhoCorasickOptimized greekAC, AhoCorasickLongestMatchCallback callback) { callback.clear(); // match on lower cased variant because the greek symbol list is completely lowercased, too greekAC.match(elementText.toLowerCase(), callback); TreeMap, String> longestMatches = callback.getLongestMatches(); int numEmbeddedGreek = 0; for (Map.Entry, String> e : longestMatches.entrySet()) { Range range = e.getKey(); // Now check if this match is embedded mid-word (e.g. "synthetase") or separated by punctuation or whitespace boolean seperatedLeft = true; boolean seperatedRight = true; if (range.getMinimum() > 0) { char leftChar = elementText.charAt(range.getMinimum() - 1); if (!(Character.isWhitespace(leftChar) || elementText.substring(range.getMinimum() - 1, range.getMinimum()).matches("\\p{P}|[0-9]"))) seperatedLeft = false; } if (range.getMaximum() < elementText.length() - 1) { char rightChar = elementText.charAt(range.getMaximum() + 1); if (!(Character.isWhitespace(rightChar) || elementText.substring(range.getMaximum() + 1, range.getMaximum() + 2).matches("\\p{P}|[0-9]"))) seperatedRight = false; } if (!seperatedLeft || !seperatedRight) ++numEmbeddedGreek; } return numEmbeddedGreek > 0; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy