edu.usc.ir.geo.gazetteer.GeoNameResolver Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-geo-gazetteer Show documentation
An implementation of a geonames.org-based Gazetteer using Apache Lucene.
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.usc.ir.geo.gazetteer;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.logging.Level;
import java.util.logging.Logger;

import edu.usc.ir.geo.gazetteer.domain.Location;
import edu.usc.ir.geo.gazetteer.service.Launcher;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.google.gson.Gson;

public class GeoNameResolver implements Closeable {
	/** Name of the command-line option that switches the search output to JSON. */
	private static final String JSON_OPT = "json";
	/**
	 * Names of the fields that make up one gazetteer entry in the Lucene index.
	 */
	public static final String FIELD_NAME_ID = "ID";
	public static final String FIELD_NAME_NAME = "name";
	public static final String FIELD_NAME_LONGITUDE = "longitude";
	public static final String FIELD_NAME_LATITUDE = "latitude";
	public static final String FIELD_NAME_ALTERNATE_NAMES = "alternatenames";
	public static final String FIELD_NAME_FEATURE_CODE = "featureCode";
	public static final String FIELD_NAME_COUNTRY_CODE = "countryCode";
	public static final String FIELD_NAME_ADMIN1_CODE = "admin1Code";
	public static final String FIELD_NAME_ADMIN2_CODE = "admin2Code";
	public static final String FIELD_NAME_POPULATION = "population";
	/**
	 * Weight multipliers used when ranking the candidates found for a name.
	 */
	// Multiplied by a candidate's reverse position in the pre-sorted hit list.
	private static final int WEIGHT_SORT_ORDER = 20;
	// Multiplied by the number of alternate names a candidate has.
	private static final int WEIGHT_SIZE_ALT_NAME = 50;
	// Awarded when the searched name occurs as a space-delimited word in the candidate's name.
	private static final int WEIGHT_NAME_MATCH = 20000;
	// Awarded when the searched name occurs only as a substring of the candidate's name.
	private static final int WEIGHT_NAME_PART_MATCH = 15000;

	private static final Logger LOG = Logger.getLogger(GeoNameResolver.class
			.getName());
	// Stored in place of a latitude/longitude value that cannot be parsed.
	private static final Double OUT_OF_BOUNDS = 999999.0;
	// Shared analyzer, used both when writing the index and when parsing queries.
	private static Analyzer analyzer = new StandardAnalyzer();
	// Writer used while building the index.
	private static IndexWriter indexWriter;
	// Most recently opened index directory.
	private static Directory indexDir;
	// Default number of candidates kept per location name; three times as many are fetched.
	private static int hitsPerPage = 8;

	// Reader opened by the path-taking constructor and released by close(); null otherwise.
	private IndexReader indexReader;

	/**
	 * Creates a resolver that has no index reader attached. The instance
	 * field holding the reader stays {@code null}, so only the operations
	 * that receive an index path as an argument have an index to work with.
	 */
	public GeoNameResolver(){
	}

	/**
	 * Opens the Lucene index stored at the given location and keeps the
	 * resulting reader for later searches. Release it with {@link #close()}.
	 *
	 * @param indexPath directory that holds the Lucene index
	 * @throws IOException if the index cannot be opened
	 */
	public GeoNameResolver(String indexPath) throws IOException {
		indexReader = createIndexReader(indexPath);
	}

	/**
	 *
	 * @param locationNames List of location na,es
	 * @param count Number of results per location
	 * @return resolved Geo Names
	 * @throws IOException
	 */
	public HashMap> searchGeoName(List locationNames,
													   int count) throws IOException {
		return resolveEntities(locationNames, count, this.indexReader);
	}

	/**
	 * Search corresponding GeoName for each location entity
	 * @param count
	 * 			  Number of results for one locations
	 * @param querystr
	 *            it's the NER actually
	 *
	 * @return HashMap each name has a list of resolved entities
	 * @throws IOException
	 * @throws RuntimeException
	 */

	public HashMap> searchGeoName(String indexerPath,
													   List locationNameEntities,
													   int count) throws IOException {

		if (locationNameEntities.size() == 0
				|| locationNameEntities.get(0).length() == 0)
			return new HashMap>();
		IndexReader reader = createIndexReader(indexerPath);
		HashMap> resolvedEntities =
				resolveEntities(locationNameEntities, count, reader);
		reader.close();
		return resolvedEntities;

	}

	/**
	 * Opens a reader on the Lucene index stored in the given directory.
	 *
	 * @param indexerPath path to the Lucene index directory
	 * @return a reader on the index; the caller is responsible for closing it
	 * @throws IOException if the directory holds no index or cannot be read
	 */
	private IndexReader createIndexReader(String indexerPath) throws IOException {
		// A local handle avoids sharing state between concurrent callers.
		Directory directory = FSDirectory.open(new File(indexerPath).toPath());

		if (!DirectoryReader.indexExists(directory)) {
			directory.close();
			LOG.log(Level.SEVERE,
					"No Lucene Index Dierctory Found, Invoke indexBuild() First !");
			// Report the problem to the caller rather than terminating the JVM.
			throw new IOException("No Lucene index found at: " + indexerPath);
		}

		return DirectoryReader.open(directory);
	}

	private HashMap> resolveEntities(List locationNames,
														  int count, IndexReader reader) throws IOException {
		if (locationNames.size() >= 200)
			hitsPerPage = 5; // avoid heavy computation
		IndexSearcher searcher = new IndexSearcher(reader);
		Query q = null;

		HashMap> allCandidates = new HashMap>();

		for (String name : locationNames) {

			if (!allCandidates.containsKey(name)) {
				try {
					//query is wrapped in additional quotes (") to avoid query tokenization on space
					q = new MultiFieldQueryParser(new String[] { FIELD_NAME_NAME,
							FIELD_NAME_ALTERNATE_NAMES }, analyzer).parse(String.format("\"%s\"", name) );

					//sort descending on population
					SortField populationSort = new SortedNumericSortField(FIELD_NAME_POPULATION, SortField.Type.LONG, true);

					Sort sort = new Sort(populationSort);
					//Fetch 3 times desired values, these will be sorted on code and only desired number will be kept
					ScoreDoc[] hits = searcher.search(q, hitsPerPage * 3 , sort).scoreDocs;

					List topHits = new ArrayList();

					for (int i = 0; i < hits.length; ++i) {
						Location tmpLocObj = new Location();

						int docId = hits[i].doc;
						Document d;
						try {
							d = searcher.doc(docId);
							tmpLocObj.setName(d.get(FIELD_NAME_NAME));
							tmpLocObj.setLongitude(d.get(FIELD_NAME_LONGITUDE));
							tmpLocObj.setLatitude(d.get(FIELD_NAME_LATITUDE));
							//If alternate names are empty put name as actual name
							//This covers missing data and equals weight for later computation
							if (d.get(FIELD_NAME_ALTERNATE_NAMES).isEmpty()){
								tmpLocObj.setAlternateNames(d.get(FIELD_NAME_NAME));
							}else{
								tmpLocObj.setAlternateNames(d.get(FIELD_NAME_ALTERNATE_NAMES));
							}
							tmpLocObj.setCountryCode(d.get(FIELD_NAME_COUNTRY_CODE));
							tmpLocObj.setAdmin1Code(d.get(FIELD_NAME_ADMIN1_CODE));
							tmpLocObj.setAdmin2Code(d.get(FIELD_NAME_ADMIN2_CODE));
							tmpLocObj.setFeatureCode(d.get(FIELD_NAME_FEATURE_CODE));

						} catch (IOException e) {
							e.printStackTrace();
						}
						topHits.add(tmpLocObj);
					}
					//Picking hitsPerPage number of locations from feature code sorted list 
					allCandidates.put(name, pickTopSortedByCode(topHits,hitsPerPage));
				} catch (org.apache.lucene.queryparser.classic.ParseException e) {
					e.printStackTrace();
				}
			}
		}

		HashMap> resolvedEntities = new HashMap>();
		pickBestCandidates(resolvedEntities, allCandidates, count);
		return resolvedEntities;
	}
	
	/**
	 * Sorts inputLocations as per FeatureCodeComparator and returns at most topCount locations.
	 * The input list is sorted in place and the result is a view on its head.
	 *
	 * @param inputLocations List of locations to be sorted
	 * @param topCount Number of locations to be kept in curtailed list
	 * @return List of at most topCount locations sorted by
	 *         edu.usc.ir.geo.gazetteer.CustomLuceneGeoGazetteerComparator.FeatureCodeComparator;
	 *         a new empty list when the input is null or empty
	 */
	private List<Location> pickTopSortedByCode(List<Location> inputLocations, int topCount) {
		if (inputLocations == null || inputLocations.isEmpty()) {
			return new ArrayList<>();
		}

		Collections.sort(inputLocations, new CustomLuceneGeoGazetteerComparator.FeatureCodeComparator());
		// Keep every element when there are no more than topCount of them;
		// cutting at size() - 1 would silently drop the last candidate.
		return inputLocations.subList(0, Math.min(topCount, inputLocations.size()));
	}

	/**
	 * Select the best match for each location name extracted from a document,
	 * choosing from among a list of lists of candidate matches. Filter uses the
	 * following features: 1) edit distance between name and the resolved name,
	 * choose smallest one 2) content (haven't implemented)
	 *
	 * @param resolvedEntities
	 *            final result for the input stream
	 * @param allCandidates
	 *            each location name may hits several documents, this is the
	 *            collection for all hitted documents
	 * @param count
	 * 			  Number of results for one locations
	 * @throws IOException
	 * @throws RuntimeException
	 */

	private void pickBestCandidates(
			HashMap> resolvedEntities,
			HashMap> allCandidates, int count) {

		for (String extractedName : allCandidates.keySet()) {

			List cur = allCandidates.get(extractedName);
			if(cur.isEmpty())
				continue;//continue if no results found

			int maxWeight = Integer.MIN_VALUE ;
			//In case weight is equal for all return top element
			int bestIndex = 0;
			//Priority queue to return top elements
			PriorityQueue pq = new PriorityQueue<>(cur.size(), new Comparator() {
				@Override
				public int compare(Location o1, Location o2) {
					return Integer.compare(o2.getWeight(), o1.getWeight());
				}
			});

			for (int i = 0; i < cur.size(); ++i) {
				int weight = 0;
				// get cur's ith resolved entry's name
				String resolvedName = String.format(" %s ", cur.get(i).getName());
				if (resolvedName.contains(String.format(" %s ", extractedName))) {
					// Assign a weight as per configuration if extracted name is found as a exact word in name
					weight = WEIGHT_NAME_MATCH;
				} else if (resolvedName.contains(extractedName)) {
					// Assign a weight as per configuration if extracted name is found partly in name
					weight = WEIGHT_NAME_PART_MATCH;
				}
				// get all alternate names of cur's ith resolved entry's
				String[] altNames = cur.get(i).getAlternateNames().split(",");
				float altEditDist = 0;
				for(String altName : altNames){
					if(altName.contains(extractedName)){
						altEditDist+=StringUtils.getLevenshteinDistance(extractedName, altName);
					}
				}
				//lesser the edit distance more should be the weight
				weight += getCalibratedWeight(altNames.length, altEditDist);

				//Give preference to sorted results. 0th result should have more priority
				weight += (cur.size()-i) * WEIGHT_SORT_ORDER;

				cur.get(i).setWeight(weight);

				if (weight > maxWeight) {
					maxWeight = weight;
					bestIndex = i;
				}

				pq.add(cur.get(i)) ;
			}
			if (bestIndex == -1)
				continue;

			List resultList = new ArrayList<>();

			for(int i =0 ; i< count && !pq.isEmpty() ; i++){
				resultList.add(pq.poll());
			}

			resolvedEntities.put(extractedName, resultList);
		}
	}

	/**
	 * Returns a weight derived from the average edit distance over a set of
	 * alternate names:
	 * <pre>
	 * altNamesSize * WEIGHT_SIZE_ALT_NAME - (altEditDist / altNamesSize)
	 * </pre>
	 * {@code altNamesSize * WEIGHT_SIZE_ALT_NAME} gives more priority to results
	 * with more alternate names. {@code altEditDist / altNamesSize} is the
	 * average edit distance; the lesser the average, the higher the overall
	 * expression.
	 *
	 * @param altNamesSize - Count of altNames
	 * @param altEditDist - sum of individual edit distances
	 * @return the calibrated weight, or 0 when there are no alternate names
	 */
	public float getCalibratedWeight(int altNamesSize, float altEditDist) {
		if (altNamesSize <= 0) {
			// Without alternate names the average is undefined; dividing by
			// zero would produce NaN or -Infinity and corrupt the total weight.
			return 0f;
		}
		return altNamesSize * WEIGHT_SIZE_ALT_NAME - (altEditDist / altNamesSize);
	}

	/**
	 * Builds the gazetteer index line by line. Nothing is done when the
	 * target directory already contains an index. Lines that cannot be
	 * processed are logged and skipped.
	 *
	 * @param gazetteerPath
	 *            path of the gazetteer file
	 * @param indexerPath
	 *            path to the created Lucene index directory.
	 * @throws IOException if the gazetteer file or the index directory cannot
	 *             be read or written; changes made so far are discarded
	 */
	public void buildIndex(String gazetteerPath, String indexerPath)
			throws IOException {
		Logger logger = Logger.getLogger(this.getClass().getName());
		try (Directory directory = FSDirectory.open(new File(indexerPath).toPath())) {
			if (DirectoryReader.indexExists(directory)) {
				return;
			}

			IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer));
			boolean completed = false;
			logger.log(Level.WARNING, "Start Building Index for Gazatteer");
			try (BufferedReader filereader = new BufferedReader(
					new InputStreamReader(new FileInputStream(gazetteerPath),
							"UTF-8"))) {
				String line;
				int count = 0;
				while ((line = filereader.readLine()) != null) {
					try {
						count += 1;
						if (count % 100000 == 0) {
							logger.log(Level.INFO, "Indexed Row Count: " + count);
						}
						addDoc(writer, line);
					} catch (RuntimeException re) {
						// java.util.logging does not substitute "{}", so the line is
						// appended directly; the exception is passed for diagnosis.
						logger.log(Level.WARNING, "Skipping... Error on line: " + line, re);
					}
				}
				completed = true;
			} finally {
				if (completed) {
					writer.close();
				} else {
					// Discard a partial index so that a later run rebuilds it
					// instead of treating it as already present.
					writer.rollback();
				}
			}
			logger.log(Level.WARNING, "Building Finished");
		}
	}

	/**
	 * Indexes one tab-separated gazetteer line as a Lucene document.
	 *
	 * Latitude and longitude that cannot be parsed are stored as
	 * {@code OUT_OF_BOUNDS}; a population that cannot be parsed is stored as 0.
	 * A failure to write the document is logged and otherwise ignored.
	 *
	 * @param indexWriter
	 *            Lucene indexWriter the document is added to
	 * @param line
	 *            a line from the gazetteer file
	 * @throws NumberFormatException if the ID column is not an integer
	 * @throws ArrayIndexOutOfBoundsException if the line has too few columns
	 */
	private static void addDoc(IndexWriter indexWriter, final String line) {
		String[] tokens = line.split("\t");

		int id = Integer.parseInt(tokens[0]);
		String name = tokens[1];
		String alternatenames = tokens[3];
		double latitude = parseCoordinate(tokens[4]);
		double longitude = parseCoordinate(tokens[5]);

		int population;
		try {
			population = Integer.parseInt(tokens[14]);
		} catch (NumberFormatException e) {
			population = 0;// Treat as population does not exists
		}

		// Additional fields to rank more known locations higher
		// All available codes can be viewed on www.geonames.org
		String featureCode = tokens[7];// more granular category
		String countryCode = tokens[8];
		String admin1Code = tokens[10];// eg US State
		String admin2Code = tokens[11];// eg county

		Document doc = new Document();
		doc.add(new IntField(FIELD_NAME_ID, id, Field.Store.YES));
		doc.add(new TextField(FIELD_NAME_NAME, name, Field.Store.YES));
		doc.add(new DoubleField(FIELD_NAME_LONGITUDE, longitude, Field.Store.YES));
		doc.add(new DoubleField(FIELD_NAME_LATITUDE, latitude, Field.Store.YES));
		doc.add(new TextField(FIELD_NAME_ALTERNATE_NAMES, alternatenames, Field.Store.YES));
		doc.add(new TextField(FIELD_NAME_FEATURE_CODE, featureCode, Field.Store.YES));
		doc.add(new TextField(FIELD_NAME_COUNTRY_CODE, countryCode, Field.Store.YES));
		doc.add(new TextField(FIELD_NAME_ADMIN1_CODE, admin1Code, Field.Store.YES));
		doc.add(new TextField(FIELD_NAME_ADMIN2_CODE, admin2Code, Field.Store.YES));
		doc.add(new NumericDocValuesField(FIELD_NAME_POPULATION, population));//sort enabled field

		try {
			indexWriter.addDocument(doc);
		} catch (IOException e) {
			// Best effort: report the entry that was lost and carry on with the next line.
			LOG.log(Level.SEVERE, "Failed to index gazetteer entry with ID " + id, e);
		}
	}

	/**
	 * Parses a latitude or longitude column, falling back to OUT_OF_BOUNDS
	 * when the text is not a number.
	 */
	private static double parseCoordinate(String text) {
		try {
			return Double.parseDouble(text);
		} catch (NumberFormatException e) {
			return OUT_OF_BOUNDS;
		}
	}

	/**
	 * Releases the index reader held by this resolver, if there is one.
	 *
	 * @throws IOException if closing the reader fails
	 */
	@Override
	public void close() throws IOException {
		if (indexReader == null) {
			return;
		}
		indexReader.close();
	}
	/**
	 * Writes the result as formatted json to given PrintStream 
	 * @param resolvedEntities map of resolved entities
	 * @param out the print stream for writing output
	 */
	public static void writeResultJson(Map> resolvedEntities,
			   PrintStream out) {
		out.println(new Gson().toJson(resolvedEntities) );
	}
	
	/**
	 * Writes the result to given PrintStream
	 * @deprecated Use writeResultJson instead 
	 * @param resolvedEntities map of resolved entities
	 * @param out the print stream for writing output
	 */
	@Deprecated
	public static void writeResult(Map> resolvedEntities,
								   PrintStream out) {
		out.println("[");
		List keys = (List)(List) Arrays.asList(resolvedEntities.keySet().toArray());
		//TODO: use org.json.JSONArray and remove this custom formatting code
		for (int j=0; j < keys.size(); j++) {
			String n = keys.get(j);
			out.println("{\"" + n + "\" : [");
			List terms = resolvedEntities.get(n);
			for (int i = 0; i < terms.size(); i++) {
				Location res = terms.get(i);
				if (i < terms.size() - 1) {
					out.println(res + ",");
				} else {
					out.println(res);
				}
			}

			if (j < keys.size() -1){
				out.println("]},");
			}
			else{
				out.println("]}");
			}
		}
		out.println("]");
	}

	/**
	 * Command-line entry point. Supports building an index (--index together
	 * with --build), searching location names (--search, optionally with
	 * --count and -json) and launching the gazetteer service (-server).
	 *
	 * @param args command-line arguments
	 * @throws Exception if building or searching the index fails
	 */
	public static void main(String[] args) throws Exception {
		Option buildOpt = OptionBuilder.withArgName("gazetteer file").hasArg().withLongOpt("build")
				.withDescription("The Path to the Geonames allCountries.txt")
				.create('b');

		Option searchOpt = OptionBuilder.withArgName("set of location names").withLongOpt("search").hasArgs()
				.withDescription("Location names to search the Gazetteer for")
				.create('s');

		Option indexOpt = OptionBuilder
				.withArgName("directoryPath")
				.withLongOpt("index")
				.hasArgs()
				.withDescription(
						"The path to the Lucene index directory to either create or read")
				.create('i');

		Option helpOpt = OptionBuilder.withLongOpt("help")
				.withDescription("Print this message.").create('h');

		Option resultCountOpt = OptionBuilder.withArgName("number of results").withLongOpt("count").hasArgs()
				.withDescription("Number of best results to be returned for one location").withType(Integer.class)
				.create('c');

		Option serverOption = OptionBuilder.withArgName("Launch Server")
				.withLongOpt("server")
				.withDescription("Launches Geo Gazetteer Service")
				.create("server");

		Option jsonOption = OptionBuilder.withArgName("outputs json")
				.withLongOpt(JSON_OPT)
				.withDescription("Formats output in well defined json structure")
				.create(JSON_OPT);

		String indexPath = null;
		String gazetteerPath = null;
		Options options = new Options();
		options.addOption(buildOpt);
		options.addOption(searchOpt);
		options.addOption(indexOpt);
		options.addOption(helpOpt);
		options.addOption(resultCountOpt);
		options.addOption(serverOption);
		options.addOption(jsonOption);

		// create the parser
		CommandLineParser parser = new DefaultParser();
		GeoNameResolver resolver = new GeoNameResolver();

		try {
			// parse the command line arguments
			CommandLine line = parser.parse(options, args);

			if (line.hasOption("index")) {
				indexPath = line.getOptionValue("index");
			}

			if (line.hasOption("build")) {
				gazetteerPath = line.getOptionValue("build");
			}

			if (line.hasOption("help")) {
				HelpFormatter formatter = new HelpFormatter();
				formatter.printHelp("lucene-geo-gazetteer", options);
				System.exit(1);
			}

			// Remember whether a build ran, so that a build-only invocation is
			// not reported as an unrecognised sub command afterwards.
			boolean indexBuilt = false;
			if (indexPath != null && gazetteerPath != null) {
				LOG.info("Building Lucene index at path: [" + indexPath
						+ "] with geoNames.org file: [" + gazetteerPath + "]");
				resolver.buildIndex(gazetteerPath, indexPath);
				indexBuilt = true;
			}

			if (line.hasOption("search")) {
				if (indexPath == null) {
					// Same handling as for the server: searching needs an index.
					System.err.println("Index path is required");
					System.exit(-2);
				}

				List<String> geoTerms = new ArrayList<String>(Arrays.asList(line
						.getOptionValues("search")));
				String countStr = line.getOptionValue("count", "1");
				int count = 1;
				if (countStr.matches("\\d+"))
					count = Integer.parseInt(countStr);

				Map<String, List<Location>> resolved = resolver
						.searchGeoName(indexPath, geoTerms, count);
				if (line.hasOption(JSON_OPT)) {
					writeResultJson(resolved, System.out);
				} else {
					writeResult(resolved, System.out);
				}
			} else if (line.hasOption("server")) {
				if (indexPath == null) {
					System.err.println("Index path is required");
					System.exit(-2);
				}

				//TODO: get port from CLI args
				int port = 8765;
				Launcher.launchService(port, indexPath);
			} else if (!indexBuilt) {
				System.err.println("Sub command not recognised");
				System.exit(-1);
			}

		} catch (ParseException exp) {
			// oops, something went wrong
			System.err.println("Parsing failed.  Reason: " + exp.getMessage());
		}
	}

}