All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.wordnet.Syns2Index Maven / Gradle / Ivy

package org.apache.lucene.wordnet;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

/**
 * Convert the prolog file wn_s.pl from the WordNet prolog download
 * into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}).
 *
 * This has been tested with WordNet 2.0.
 *
 * The index has fields named "word" ({@link #F_WORD})
 * and "syn" ({@link #F_SYN}).
 * 

* The source word (such as 'big') can be looked up in the * "word" field, and if present there will be fields named "syn" * for every synonym. What's tricky here is that there could be multiple * fields with the same name, in the general case for words that have multiple synonyms. * That's not a problem with Lucene, you just use {@link org.apache.lucene.document.Document#getValues} *

*

* While the WordNet file distinguishes groups of synonyms with * related meanings we don't do that here. *

* * This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB. * * @see WordNet home page * @see prologdb man page * @see sample site that uses it */ public class Syns2Index { /** * */ private static final PrintStream o = System.out; /** * */ private static final PrintStream err = System.err; /** * */ public static final String F_SYN = "syn"; /** * */ public static final String F_WORD = "word"; /** * */ private static final Analyzer ana = new StandardAnalyzer(); /** * Takes arg of prolog file name and index directory. */ public static void main(String[] args) throws Throwable { // get command line arguments String prologFilename = null; // name of file "wn_s.pl" String indexDir = null; if (args.length == 2) { prologFilename = args[0]; indexDir = args[1]; } else { usage(); System.exit(1); } // ensure that the prolog file is readable if (! (new File(prologFilename)).canRead()) { err.println("Error: cannot read Prolog file: " + prologFilename); System.exit(1); } // exit if the target index directory already exists if ((new File(indexDir)).isDirectory()) { err.println("Error: index directory already exists: " + indexDir); err.println("Please specify a name of a non-existent directory"); System.exit(1); } o.println("Opening Prolog file " + prologFilename); final FileInputStream fis = new FileInputStream(prologFilename); final BufferedReader br = new BufferedReader(new InputStreamReader(fis)); String line; // maps a word to all the "groups" it's in final Map word2Nums = new TreeMap(); // maps a group to all the words in it final Map num2Words = new TreeMap(); // number of rejected words int ndecent = 0; // status output int mod = 1; int row = 1; // parse prolog file o.println( "[1/2] Parsing " + prologFilename); while ((line = br.readLine()) != null) { // occasional progress if ((++row) % mod == 0) // periodically print out line we read in { mod *= 2; o.println("\t" + row + " " + line + " " + word2Nums.size() + " " + 
num2Words.size() + " ndecent=" + ndecent); } // syntax check if (! line.startsWith("s(")) { err.println("OUCH: " + line); System.exit(1); } // parse line line = line.substring(2); int comma = line.indexOf(','); String num = line.substring(0, comma); int q1 = line.indexOf('\''); line = line.substring(q1 + 1); int q2 = line.indexOf('\''); String word = line.substring(0, q2).toLowerCase(); // make sure is a normal word if (! isDecent(word)) { ndecent++; continue; // don't store words w/ spaces } // 1/2: word2Nums map // append to entry or add new one List lis =(List) word2Nums.get(word); if (lis == null) { lis = new LinkedList(); lis.add(num); word2Nums.put(word, lis); } else lis.add(num); // 2/2: num2Words map lis = (List) num2Words.get(num); if (lis == null) { lis = new LinkedList(); lis.add(word); num2Words.put(num, lis); } else lis.add(word); } // close the streams fis.close(); br.close(); // create the index o.println( "[2/2] Building index to store synonyms, " + " map sizes are " + word2Nums.size() + " and " + num2Words.size()); index(indexDir, word2Nums, num2Words); } /** * Checks to see if a word contains only alphabetic characters by * checking it one character at a time. * * @param s string to check * @return true if the string is decent */ private static boolean isDecent(String s) { int len = s.length(); for (int i = 0; i < len; i++) { if (!Character.isLetter(s.charAt(i))) { return false; } } return true; } /** * Forms a Lucene index based on the 2 maps. * * @param indexDir the direcotry where the index should be created * @param word2Nums * @param num2Words */ private static void index(String indexDir, Map word2Nums, Map num2Words) throws Throwable { int row = 0; int mod = 1; // override the specific index if it already exists IndexWriter writer = new IndexWriter(indexDir, ana, true, IndexWriter.MaxFieldLength.LIMITED); writer.setUseCompoundFile(true); // why? 
Iterator i1 = word2Nums.keySet().iterator(); while (i1.hasNext()) // for each word { String g = (String) i1.next(); Document doc = new Document(); int n = index(word2Nums, num2Words, g, doc); if (n > 0) { doc.add( new Field( F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED)); if ((++row % mod) == 0) { o.println("\trow=" + row + "/" + word2Nums.size() + " doc= " + doc); mod *= 2; } writer.addDocument(doc); } // else degenerate } o.println( "Optimizing.."); writer.optimize(); writer.close(); } /** * Given the 2 maps fills a document for 1 word. */ private static int index(Map word2Nums, Map num2Words, String g, Document doc) throws Throwable { List keys = (List) word2Nums.get(g); // get list of key#'s Iterator i2 = keys.iterator(); Set already = new TreeSet(); // keep them sorted // pass 1: fill up 'already' with all words while (i2.hasNext()) // for each key# { already.addAll((List) num2Words.get(i2.next())); // get list of words } int num = 0; already.remove(g); // of course a word is it's own syn Iterator it = already.iterator(); while (it.hasNext()) { String cur = (String) it.next(); // don't store things like 'pit bull' -> 'american pit bull' if (!isDecent(cur)) { continue; } num++; doc.add( new Field( F_SYN, cur, Field.Store.YES, Field.Index.NO)); } return num; } /** * */ private static void usage() { o.println("\n\n" + "java org.apache.lucene.wordnet.Syns2Index \n\n"); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy