package org.apache.lucene.wordnet;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

/**
 * Loads the WordNet prolog file wn_s.pl
 * into a thread-safe main-memory hash map that can be used for fast
 * high-frequency lookups of synonyms for any given (lowercase) word string.
 * <p>
 * There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).
 * There does not necessarily hold: A -> B, B -> C then A -> C.
 * <p>
 * Loading typically takes some 1.5 secs, so should be done only once per
 * (server) program execution, using a singleton pattern. Once loaded, a
 * synonym lookup via {@link #getSynonyms(String)} takes constant time O(1).
 * A loaded default synonym map consumes about 10 MB main memory.
 * An instance is immutable, hence thread-safe.
 * <p>
 * This implementation borrows some ideas from the Lucene Syns2Index demo that
 * Dave Spencer originally contributed to Lucene. Dave's approach
 * involved a persistent Lucene index which is suitable for occasional
 * lookups or very large synonym tables, but considered unsuitable for
 * high-frequency lookups of medium size synonym tables.
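 * <p>
 * A rough sketch of the singleton-style loading recommended above (the holder class
 * name is illustrative only and not part of this API; the file path is simply the one
 * used in the example further below):
 * <pre>
 * // Load wn_s.pl exactly once per JVM and hand out the immutable, thread-safe map.
 * public final class SynonymMapHolder {
 *   private static SynonymMap instance;
 *   private SynonymMapHolder() {}
 *   public static synchronized SynonymMap get() throws java.io.IOException {
 *     if (instance == null) {
 *       instance = new SynonymMap(new java.io.FileInputStream("samples/fulltext/wn_s.pl"));
 *     }
 *     return instance;
 *   }
 * }
 * </pre>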

 * <p>
 * Example Usage:
 * <pre>
 * String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
 * SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
 * for (int i = 0; i < words.length; i++) {
 *     String[] synonyms = map.getSynonyms(words[i]);
 *     System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
 * }
 * 
 * Example output:
 * hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
 * woods:[forest, wood]
 * forest:[afforest, timber, timberland, wood, woodland, woods]
 * wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
 * xxxx:[]
 * 
 * </pre>
 *
 * @see "WordNet prologdb man page"
 * @see "Dave's synonym demo site"
 */
public class SynonymMap {

  /** the index data; Map<String,String[]> */
  private final HashMap<String,String[]> table;
  
  private static final String[] EMPTY = new String[0];
  
  private static final boolean DEBUG = false;

  /**
   * Constructs an instance, loading WordNet synonym data from the given input
   * stream. Finally closes the stream. The words in the stream must be in
   * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
   * 
   * @param input
   *          the stream to read from (null indicates an empty synonym map)
   * @throws IOException
   *           if an error occurred while reading the stream.
   */
  public SynonymMap(InputStream input) throws IOException {
    this.table = input == null ? new HashMap<String,String[]>(0) : read(toByteArray(input));
  }
  
  /**
   * Returns the synonym set for the given word, sorted ascending.
   * 
   * @param word
   *          the word to lookup (must be in lowercase).
   * @return the synonyms; a set of zero or more words, sorted ascending, each
   *         word containing lowercase characters that satisfy
   *         <code>Character.isLetter()</code>.
   */
  public String[] getSynonyms(String word) {
    String[] synonyms = table.get(word);
    if (synonyms == null) return EMPTY;
    String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
    System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
    return copy;
  }
  
  /**
   * Returns a String representation of the index data for debugging purposes.
   * 
   * @return a String representation
   */
  @Override
  public String toString() {
    StringBuilder buf = new StringBuilder();
    Iterator<String> iter = new TreeMap<String,String[]>(table).keySet().iterator();
    int count = 0;
    int f0 = 0;
    int f1 = 0;
    int f2 = 0;
    int f3 = 0;
    
    while (iter.hasNext()) {
      String word = iter.next();
      buf.append(word + ":");
      String[] synonyms = getSynonyms(word);
      buf.append(Arrays.asList(synonyms));
      buf.append("\n");
      
      count += synonyms.length;
      if (synonyms.length == 0) f0++;
      if (synonyms.length == 1) f1++;
      if (synonyms.length == 2) f2++;
      if (synonyms.length == 3) f3++;
    }
    
    buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0
        + ", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
    return buf.toString();
  }
  
  /**
   * Analyzes/transforms the given word on input stream loading. This default
   * implementation simply lowercases the word. Override this method with a
   * custom stemming algorithm or similar, if desired.
   * 
   * @param word
   *          the word to analyze
   * @return the same word, or a different word (or null to indicate that the
   *         word should be ignored)
   */
  protected String analyze(String word) {
    return word.toLowerCase();
  }

  protected boolean isValid(String str) {
    for (int i = str.length(); --i >= 0;) {
      if (!Character.isLetter(str.charAt(i))) return false;
    }
    return true;
  }

  private HashMap<String,String[]> read(byte[] data) {
    int WORDS = (int) (76401 / 0.7); // presizing
    int GROUPS = (int) (88022 / 0.7); // presizing
    HashMap<String,ArrayList<Integer>> word2Groups = new HashMap<String,ArrayList<Integer>>(WORDS); // Map<String,ArrayList<Integer>>
    HashMap<Integer,ArrayList<String>> group2Words = new HashMap<Integer,ArrayList<String>>(GROUPS); // Map<Integer,ArrayList<String>>
    HashMap<String,String> internedWords = new HashMap<String,String>(WORDS); // Map<String,String>

    Charset charset = Charset.forName("UTF-8");
    int lastNum = -1;
    Integer lastGroup = null;
    int len = data.length;
    int i = 0;
    
    // Each synonym entry in wn_s.pl is a prolog fact of the form
    // s(synset_id,w_num,'word',ss_type,w_sense_num,tag_count).
    // Only the leading synset number (the group) and the quoted word are extracted below.
    while (i < len) { // until EOF
      /* Part A: Parse a line */
      
      // scan to beginning of group
      while (i < len && data[i] != '(') i++;
      if (i >= len) break; // EOF
      i++;
      
      // parse group
      int num = 0;
      while (i < len && data[i] != ',') {
        num = 10 * num + (data[i] - 48);
        i++;
      }
      i++;
//      if (DEBUG) System.err.println("num=" + num);
      
      // scan to beginning of word
      while (i < len && data[i] != '\'') i++;
      i++;
      
      // scan to end of word
      int start = i;
      do {
        while (i < len && data[i] != '\'') i++;
        i++;
      } while (i < len && data[i] != ','); // word must end with "',"
      
      if (i >= len) break; // EOF
      String word = charset.decode(ByteBuffer.wrap(data, start, i - start - 1)).toString();
//      String word = new String(data, 0, start, i-start-1); // ASCII
      
      /*
       * Part B: ignore phrases (with spaces and hyphens) and
       * non-alphabetic words, and let user customize word (e.g. do some
       * stemming)
       */
      if (!isValid(word)) continue; // ignore
      word = analyze(word);
      if (word == null || word.length() == 0) continue; // ignore
      
      /* Part C: Add (group,word) to tables */
      
      // ensure compact string representation, minimizing memory overhead
      String w = internedWords.get(word);
      if (w == null) {
        word = new String(word); // ensure compact string
        internedWords.put(word, word);
      } else {
        word = w;
      }
      
      Integer group = lastGroup;
      if (num != lastNum) {
        group = Integer.valueOf(num);
        lastGroup = group;
        lastNum = num;
      }
      
      // add word --> group
      ArrayList<Integer> groups = word2Groups.get(word);
      if (groups == null) {
        groups = new ArrayList<Integer>(1);
        word2Groups.put(word, groups);
      }
      groups.add(group);

      // add group --> word
      ArrayList<String> words = group2Words.get(group);
      if (words == null) {
        words = new ArrayList<String>(1);
        group2Words.put(group, words);
      }
      words.add(word);
    }
    
    /* Part D: compute index data structure */
    HashMap<String,String[]> word2Syns = createIndex(word2Groups, group2Words);
    
    /* Part E: minimize memory consumption by a factor 3 (or so) */
//    if (true) return word2Syns;
    word2Groups = null; // help gc
    //TODO: word2Groups.clear(); would be more appropriate ?
    group2Words = null; // help gc
    //TODO: group2Words.clear(); would be more appropriate ?
    
    return optimize(word2Syns, internedWords);
  }
  
  private HashMap<String,String[]> createIndex(Map<String,ArrayList<Integer>> word2Groups,
      Map<Integer,ArrayList<String>> group2Words) {
    HashMap<String,String[]> word2Syns = new HashMap<String,String[]>();
    
    for (final Map.Entry<String,ArrayList<Integer>> entry : word2Groups.entrySet()) { // for each word
      ArrayList<Integer> group = entry.getValue();
      String word = entry.getKey();
      
//      HashSet synonyms = new HashSet();
      TreeSet<String> synonyms = new TreeSet<String>();
      for (int i = group.size(); --i >= 0;) { // for each groupID of word
        ArrayList<String> words = group2Words.get(group.get(i));
        for (int j = words.size(); --j >= 0;) { // add all words
          String synonym = words.get(j); // note that w and word are interned
          if (synonym != word) { // a word is implicitly its own synonym
            synonyms.add(synonym);
          }
        }
      }
      
      int size = synonyms.size();
      if (size > 0) {
        String[] syns = new String[size];
        if (size == 1)
          syns[0] = synonyms.first();
        else
          synonyms.toArray(syns);
//        if (syns.length > 1) Arrays.sort(syns);
//        if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
        word2Syns.put(word, syns);
      }
    }
    
    return word2Syns;
  }
  
  private HashMap<String,String[]> optimize(HashMap<String,String[]> word2Syns,
      HashMap<String,String> internedWords) {
    if (DEBUG) {
      System.err.println("before gc");
      for (int i = 0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }
    
    // collect entries
    int len = 0;
    int size = word2Syns.size();
    String[][] allSynonyms = new String[size][];
    String[] words = new String[size];
    Iterator<Map.Entry<String,String[]>> iter = word2Syns.entrySet().iterator();
    for (int j = 0; j < size; j++) {
      Map.Entry<String,String[]> entry = iter.next();
      allSynonyms[j] = entry.getValue();
      words[j] = entry.getKey();
      len += words[j].length();
    }
    
    // assemble large string containing all words
    StringBuilder buf = new StringBuilder(len);
    for (int j = 0; j < size; j++) buf.append(words[j]);
    String allWords = new String(buf.toString()); // ensure compact string across JDK versions
    buf = null;
    
    // intern words at app level via memory-overlaid substrings
    for (int p = 0, j = 0; j < size; j++) {
      String word = words[j];
      internedWords.put(word, allWords.substring(p, p + word.length()));
      p += word.length();
    }
    
    // replace words with interned words
    for (int j = 0; j < size; j++) {
      String[] syns = allSynonyms[j];
      for (int k = syns.length; --k >= 0;) {
        syns[k] = internedWords.get(syns[k]);
      }
      word2Syns.remove(words[j]);
      word2Syns.put(internedWords.get(words[j]), syns);
    }
    
    if (DEBUG) {
      words = null;
      allSynonyms = null;
      internedWords = null;
      allWords = null;
      System.err.println("before gc");
      for (int i = 0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }
    return word2Syns;
  }
  
  // The following utility method is copied from the Apache style Nux library - see http://dsd.lbl.gov/nux
  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];
      
      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }
      
      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      input.close();
    }
  }
}
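
// The subclass below is NOT part of the original Lucene sources; it is a minimal sketch of the
// override hook documented on analyze(String): plugging in a (deliberately naive, purely
// illustrative) stemmer so that plural forms collapse onto one key at load time. Callers must
// apply the same transformation to a word before passing it to getSynonyms(String).
class NaiveStemmingSynonymMap extends SynonymMap {
  
  NaiveStemmingSynonymMap(InputStream input) throws IOException {
    super(input);
  }
  
  @Override
  protected String analyze(String word) {
    word = word.toLowerCase();
    // crude plural handling: "woods" -> "wood", but leave words like "grass" alone
    if (word.length() > 3 && word.endsWith("s") && !word.endsWith("ss")) {
      word = word.substring(0, word.length() - 1);
    }
    return word;
  }
}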




