
de.julielab.geneexpbase.resources.SpecialistLexiconGreekDictCreator Maven / Gradle / Ivy
package de.julielab.geneexpbase.resources;
import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import de.julielab.geneexpbase.AhoCorasickLongestMatchCallback;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.lang3.Range;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.*;
/**
* This class uses the SPECIALIST Lexicon in its XML format to extract words
* that contain Greek letter names (e.g. alpha, beta) by coincidence.
* The resulting dictionary is used by the {@link TermNormalizer}
* to distinguish words that actually contain a Greek symbol from words that merely contain a substring that looks like one.
*
* @see <a href="https://lexsrv3.nlm.nih.gov/LexSysGroup/Projects/lexicon/current/">SPECIALIST Lexicon</a>
*/
public class SpecialistLexiconGreekDictCreator {
private final static Logger log = LoggerFactory.getLogger(SpecialistLexiconGreekDictCreator.class);
public static void main(String[] args) throws IOException, XMLStreamException {
if (args.length != 2) {
System.err.println("Usage: " + SpecialistLexiconGreekDictCreator.class.getCanonicalName() + "
© 2015 - 2025 Weber Informatics LLC | Privacy Policy