All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.fbk.dkm.pikes.resources.WordNet Maven / Gradle / Ivy

Go to download

A collection of Java classes for accessing and querying a number of NLP resources.

The newest version!
package eu.fbk.dkm.pikes.resources;

import com.google.common.base.Charsets;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.*;
import com.google.common.io.Resources;
import eu.fbk.rdfpro.util.Environment;
import net.didion.jwnl.JWNL;
import net.didion.jwnl.JWNLException;
import net.didion.jwnl.data.*;
import net.didion.jwnl.dictionary.Dictionary;

import javax.annotation.Nullable;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.*;

public final class WordNet {

    public static final String POS_NOUN = "n";

    public static final String POS_VERB = "v";

    public static final String POS_ADJECTIVE = "a";

    public static final String POS_ADVERB = "r";

    private static final Map BBN_TO_SYNSET;

    private static final Map> SYNSET_TO_BBN; // built from bbnToSynset

    private static final Map BBN_TO_SST;

    private static final Map SYNSET_TO_SST; // contains partial mapping overriding
    // lexicographer file

    private static String dictionaryPath = Objects.firstNonNull(
            Environment.getProperty("wordnet.home"), "wordnet");

    private static Dictionary dictionary;

    static {
        // TODO: need better mapping
        final Map bbnToSynset = Maps.newLinkedHashMap();
        bbnToSynset.put("person", "00007846-n");
        bbnToSynset.put("organization", "08008335-n"); // was 07950920
        bbnToSynset.put("gpe", "00027167-n");
        bbnToSynset.put("location", "00027167-n");
        bbnToSynset.put("event", "00029378-n");
        bbnToSynset.put("product", "04007894-n");
        bbnToSynset.put("fac", "03315023-n");
        bbnToSynset.put("work_of_art", "02743547-n");
        bbnToSynset.put("law", "06532330-n");
        bbnToSynset.put("language", "06282651-n");
        bbnToSynset.put("quantity", "00033615-n");
        bbnToSynset.put("date", "15113229-n");
        bbnToSynset.put("time", "15113229-n");
        bbnToSynset.put("percent", "13817526-n");
        bbnToSynset.put("money", "13384557-n");
        bbnToSynset.put("ordinal", "14429985-n");
        bbnToSynset.put("cardinal", "13582013-n");
        BBN_TO_SYNSET = ImmutableMap.copyOf(bbnToSynset);

        final Map> synsetToBbn = Maps.newHashMap();
        for (final Map.Entry entry : bbnToSynset.entrySet()) {
            final String bbn = entry.getKey();
            final String synset = entry.getValue();
            final List list = synsetToBbn.get(synset);
            if (list == null) {
                synsetToBbn.put(synset, ImmutableList.of(bbn));
            } else {
                synsetToBbn.put(
                        synset,
                        Ordering.natural().immutableSortedCopy(
                                Iterables.concat(list, ImmutableList.of(bbn))));
            }
        }
        SYNSET_TO_BBN = ImmutableMap.copyOf(synsetToBbn);

        final Map bbnToSst = Maps.newLinkedHashMap();
        bbnToSst.put("person", "B-noun.person");
        bbnToSst.put("organization", "B-noun.group");
        bbnToSst.put("gpe", "B-noun.location");
        bbnToSst.put("location", "B-noun.location");
        bbnToSst.put("event", "B-noun.event");
        bbnToSst.put("product", "B-noun.artifact");
        bbnToSst.put("fac", "B-noun.artifact");
        bbnToSst.put("work_of_art", "B-noun.artifact");
        bbnToSst.put("law", "B-noun.communication");
        bbnToSst.put("language", "B-noun.communication");
        bbnToSst.put("quantity", "B-noun.quantity");
        bbnToSst.put("date", "B-noun.time");
        bbnToSst.put("time", "B-noun.time");
        bbnToSst.put("percent", "B-noun.relation");
        bbnToSst.put("money", "B-noun.possession");
        bbnToSst.put("ordinal", "B-noun.state");
        bbnToSst.put("cardinal", "B-noun.quantity");
        BBN_TO_SST = ImmutableMap.copyOf(bbnToSst);

        final Map synsetToSst = Maps.newHashMap();
        synsetToSst.put("00007846-n", "B-noun.person");
        synsetToSst.put("00027167-n", "B-noun.location");
        synsetToSst.put("00033615-n", "B-noun.quantity");
        SYNSET_TO_SST = ImmutableMap.copyOf(synsetToSst);
    }

    private static Dictionary getDictionary() {
        synchronized (WordNet.class) {
            if (dictionary == null) {
                JWNL.shutdown(); // in case it was previously initialized
                try {
                    final String properties = Resources.toString(
                            WordNet.class.getResource("jwnl.xml"), Charsets.UTF_8).replace(
                            "DICTIONARY_PATH_PLACEHOLDER", dictionaryPath);
                    final InputStream stream = new ByteArrayInputStream(
                            properties.getBytes(Charsets.UTF_8));
                    JWNL.initialize(stream);
                    dictionary = Dictionary.getInstance();
                } catch (final Throwable ex) {
                    JWNL.shutdown();
                    throw new Error("Cannot initialize JWNL using dictionary path '"
                            + dictionaryPath + "'", ex);
                }
            }
            return dictionary;
        }
    }

    private static void releaseDictionary() {
        synchronized (WordNet.class) {
            dictionary = null;
            JWNL.shutdown(); // safe to call it multiple times
        }
    }

    private static Synset getSynset(final String id) {
        final POS pos = POS.getPOSForKey(getPOS(id));
        final long offset = getOffset(id);
        try {
            synchronized (WordNet.class) {
                return getDictionary().getSynsetAt(pos, offset);
            }
        } catch (final JWNLException ex) {
            throw new Error(ex);
        }
    }

    // synsetID has the form offset-x, where x is n for nouns, a for adjectives, v for verbs, r
    // for adverbs

    public static void init() {
        getDictionary();
    }

    public static List getSynsetsForLemma(String lemma, String pos) {
        try {
            synchronized (WordNet.class) {
                IndexWord indexWord = getDictionary().lookupIndexWord(POS.getPOSForKey(pos), lemma);
                if (indexWord == null) {
                    return new ArrayList<>();
                }
                Synset[] synsets = indexWord.getSenses();
                ArrayList ret = new ArrayList<>();
                for (int i = 0; i < synsets.length; i++) {
                    Synset synset = synsets[i];
                    ret.add(getSynsetID(synset.getOffset(), synset.getPOS().getKey()));
                }

                return ret;
            }
        } catch (final JWNLException ex) {
            throw new Error(ex);
        }
    }

    public static String getPath() {
        synchronized (WordNet.class) {
            return dictionaryPath;
        }
    }

    public static void setPath(final String dictionaryPath) {
        Preconditions.checkNotNull(dictionaryPath);
        synchronized (WordNet.class) {
            if (!WordNet.dictionaryPath.equals(dictionaryPath)) {
                releaseDictionary();
                WordNet.dictionaryPath = dictionaryPath;
            }
        }
    }

    // MANIPULATION OF SYNSET IDS

    public static String getSynsetID(final long offset, final String pos) {
        return String.format("%08d-%s", offset, pos);
    }

    /**
     * Return the synset ID starting from a readable format:
     * 
    *
  • lemma
  • *
  • "-" (dash)
  • *
  • synset number
  • *
  • POS
  • *
*

* For example: look-3v * * @param readableSynsetID an absolute URL giving the base location of the image */ @Nullable public static String getSynsetID(@Nullable final String readableSynsetID) { if (readableSynsetID == null) { return null; } try { final int length = readableSynsetID.length(); final int offset = readableSynsetID.lastIndexOf('-'); final String lemma = readableSynsetID.substring(0, offset); final int index = Integer.parseInt(readableSynsetID.substring(offset + 1, length - 1)) - 1; final POS pos = POS.getPOSForKey(readableSynsetID.substring(length - 1, length)); final IndexWord word; synchronized (WordNet.class) { word = getDictionary().getIndexWord(pos, lemma); } final Synset synset = word.getSenses()[index]; return getSynsetID(synset.getOffset(), pos.getKey()); } catch (final JWNLException ex) { throw new Error(ex); } catch (final Throwable ex) { throw new IllegalArgumentException("Illegal (readable) synset ID " + readableSynsetID, ex); } } @Nullable public static String getReadableSynsetID(@Nullable final String synsetID) { if (synsetID == null) { return null; } final Synset synset = getSynset(synsetID); if (synset == null) { throw new IllegalArgumentException("Illegal synset ID " + synsetID); } final String lemma = synset.getWords()[0].getLemma(); final POS pos = POS.getPOSForKey(getPOS(synsetID)); try { final IndexWord word; synchronized (WordNet.class) { word = getDictionary().lookupIndexWord(pos, lemma); } final Synset[] senses = word.getSenses(); for (int i = 0; i < senses.length; ++i) { if (senses[i].equals(synset)) { return lemma + "-" + (i + 1) + pos.getKey(); } } throw new Error("Could not determine sense index for lemma " + lemma + " and synset " + synsetID); } catch (final JWNLException ex) { throw new Error(ex); } // return synset.getSenseKey(lemma); // TODO } public static String getPOS(final String synsetID) { Preconditions.checkNotNull(synsetID); final int index = synsetID.lastIndexOf('-'); if (index == synsetID.length() - 1 || synsetID.isEmpty()) { throw new IllegalArgumentException("Cannot extract POS from '" + synsetID + "' - invalid string"); } return "" + Character.toLowerCase(index < 0 ? synsetID.charAt(0) : synsetID .charAt(index + 1)); } public static long getOffset(String synsetID) { Preconditions.checkNotNull(synsetID); try { final int index = synsetID.lastIndexOf('-'); if (index > 0) { synsetID = synsetID.substring(0, index); } return Long.parseLong(synsetID); } catch (final Throwable ex) { throw new IllegalArgumentException("Cannot extract offset from '" + synsetID + "'", ex); } } public static Set getLemmas(final String synsetID) { final Set lemmas = Sets.newLinkedHashSet(); final Synset synset = getSynset(synsetID); if (synset != null) { for (final Word word : synset.getWords()) { lemmas.add(word.getLemma()); } } return lemmas; } public static Set getGenericSet(final String synsetID, final PointerType... pointerTypes) { final Set ret = Sets.newHashSet(); final Synset synset = getSynset(synsetID); if (synset != null) { for (final PointerType pointerType : pointerTypes) { for (final Pointer pointer : synset.getPointers(pointerType)) { try { final Synset target = pointer.getTargetSynset(); ret.add(getSynsetID(target.getOffset(), target.getPOS().getKey())); } catch (final Throwable ex) { throw new RuntimeException(ex); } } } } return ret; } public static Set getGenericSet(final String synsetID, final boolean recursive, final PointerType... pointerTypes) { if (!recursive) { return getGenericSet(synsetID, pointerTypes); } final Set result = Sets.newHashSet(); final List queue = Lists.newArrayList(synsetID); while (!queue.isEmpty()) { final String id = queue.remove(0); if (result.add(id)) { queue.addAll(getGenericSet(id, pointerTypes)); } } return result; } public static Set getHypernyms(final String synsetID) { return getGenericSet(synsetID, PointerType.HYPERNYM); } public static Set getHyponyms(final String synsetID) { return getGenericSet(synsetID, PointerType.HYPONYM); } public static Set getHypernyms(final String synsetID, final boolean recursive) { return getGenericSet(synsetID, recursive, PointerType.HYPERNYM, PointerType.INSTANCE_HYPERNYM); } public static Set getHyponims(final String synsetID, final boolean recursive) { return getGenericSet(synsetID, recursive, PointerType.HYPONYM, PointerType.INSTANCES_HYPONYM); } // returns only noun synsets @Nullable public static String mapBBNToSynset(@Nullable final String bbn) { return bbn == null ? null : BBN_TO_SYNSET.get(bbn.trim().toLowerCase()); } // works only for noun synset @Nullable public static String mapSynsetToBBN(@Nullable final String synsetID) { final List ids = Lists.newLinkedList(); ids.add(synsetID); while (!ids.isEmpty()) { final String id = ids.remove(0); final List bbns = SYNSET_TO_BBN.get(id); if (bbns != null && !bbns.isEmpty()) { return bbns.get(0); // return only first BBN in case of ambiguity } try { final Synset source = getSynset(id); final List hypernymIDs = Lists.newArrayList(); for (final PointerType type : new PointerType[] { PointerType.HYPERNYM, PointerType.INSTANCE_HYPERNYM }) { for (final Pointer pointer : source.getPointers(type)) { final Synset target = pointer.getTargetSynset(); hypernymIDs.add(getSynsetID(target.getOffset(), target.getPOS().getKey())); } } Collections.sort(hypernymIDs); // necessary in order to get deterministic results ids.addAll(hypernymIDs); } catch (final JWNLException ex) { throw new Error("Unexpected exception (!)", ex); } } return null; } @Nullable public static String mapSynsetToSST(@Nullable final String synsetID) { if (synsetID != null) { final String sst = SYNSET_TO_SST.get(synsetID); if (sst != null) { return sst; } return "B-" + getSynset(synsetID).getLexFileName(); } return null; } // returns always B-noun.XXX sst @Nullable public static String mapBBNToSST(@Nullable final String bbn) { if (bbn != null) { return BBN_TO_SST.get(bbn.trim().toLowerCase()); } return null; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy