All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.usc.ir.geo.gazetteer.GeoNameResolver Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.usc.ir.geo.gazetteer;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.logging.Level;
import java.util.logging.Logger;

import edu.usc.ir.geo.gazetteer.domain.Location;
import edu.usc.ir.geo.gazetteer.service.Launcher;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.google.gson.Gson;

/**
 * Resolves location names to gazetteer entries stored in a Lucene index, and
 * builds that index from a tab-separated gazetteer file.
 *
 * The class also provides a command-line entry point ({@code main}). It
 * implements {@link Closeable} so that the index reader opened by the
 * path-taking constructor can be released via {@code close()}.
 */
public class GeoNameResolver implements Closeable {
	// Name of the command-line option (used for both the short and the long form) that selects JSON output
	private static final String JSON_OPT = "json";
	/**
	 * Below constants define name of field in lucene index
	 */
	public static final String FIELD_NAME_ID = "ID";
	public static final String FIELD_NAME_NAME = "name";
	public static final String FIELD_NAME_LONGITUDE = "longitude";
	public static final String FIELD_NAME_LATITUDE = "latitude";
	public static final String FIELD_NAME_ALTERNATE_NAMES = "alternatenames";
	public static final String FIELD_NAME_FEATURE_CODE = "featureCode";
	public static final String FIELD_NAME_COUNTRY_CODE = "countryCode";
	public static final String FIELD_NAME_ADMIN1_CODE = "admin1Code";
	public static final String FIELD_NAME_ADMIN2_CODE = "admin2Code";
	// Written as a doc-values field at index time and used as the sort key at search time
	public static final String FIELD_NAME_POPULATION = "population";
	/**
	 * Below constants define weight multipliers used for result relevance.
	 */
	// Multiplied by the candidate's remaining rank (list size minus position) so earlier candidates score higher
	private static final int WEIGHT_SORT_ORDER = 20;
	// Multiplied by the number of alternate names of a candidate
	private static final int WEIGHT_SIZE_ALT_NAME = 50;
	// Awarded when the searched name occurs as a whole space-delimited word in the candidate's name
	private static final int WEIGHT_NAME_MATCH = 20000;
	// Awarded when the searched name occurs only as a substring of the candidate's name
	private static final int WEIGHT_NAME_PART_MATCH = 15000;

	private static final Logger LOG = Logger.getLogger(GeoNameResolver.class
			.getName());
	// Sentinel stored for a latitude/longitude that cannot be parsed while indexing
	private static final Double OUT_OF_BOUNDS = 999999.0;
	// Shared analyzer, used both when writing the index and when parsing search queries
	private static Analyzer analyzer = new StandardAnalyzer();
	// Writer used while building an index; assigned by buildIndex
	private static IndexWriter indexWriter;
	// Most recently opened index directory; assigned whenever an index is opened for reading or building
	private static Directory indexDir;
	// Default number of candidate locations kept per searched name
	private static int hitsPerPage = 8;

	// Reader bound to this instance by the path-taking constructor; null when the no-arg constructor was used
	private IndexReader indexReader;

	/**
	 * Creates a resolver that is not bound to any index; the instance's index
	 * reader is left unset. Intended for the operations that receive the index
	 * path explicitly.
	 */
	public GeoNameResolver() {
	}

	/**
	 * Creates a resolver bound to the Lucene index stored at the given path.
	 * The reader opened here is released by {@code close()}.
	 *
	 * @param indexPath file-system path of the Lucene index directory
	 * @throws IOException if the index reader cannot be opened
	 */
	public GeoNameResolver(String indexPath) throws IOException {
		indexReader = createIndexReader(indexPath);
	}

	/**
	 *
	 * @param locationNames List of location na,es
	 * @param count Number of results per location
	 * @return resolved Geo Names
	 * @throws IOException
	 */
	public HashMap> searchGeoName(List locationNames,
													   int count) throws IOException {
		return resolveEntities(locationNames, count, this.indexReader);
	}

	/**
	 * Search corresponding GeoName for each location entity
	 * @param count
	 * 			  Number of results for one locations
	 * @param querystr
	 *            it's the NER actually
	 *
	 * @return HashMap each name has a list of resolved entities
	 * @throws IOException
	 * @throws RuntimeException
	 */

	public HashMap> searchGeoName(String indexerPath,
													   List locationNameEntities,
													   int count) throws IOException {

		if (locationNameEntities.size() == 0
				|| locationNameEntities.get(0).length() == 0)
			return new HashMap>();
		IndexReader reader = createIndexReader(indexerPath);
		HashMap> resolvedEntities =
				resolveEntities(locationNameEntities, count, reader);
		reader.close();
		return resolvedEntities;

	}

	/**
	 * Opens a reader on the Lucene index stored in the given directory and
	 * remembers the opened directory in the shared {@code indexDir} field.
	 *
	 * If the directory does not contain an index, a SEVERE message is logged
	 * and the JVM is terminated with exit status 1.
	 *
	 * @param indexerPath path of the Lucene index directory
	 * @return a reader on the index
	 * @throws IOException if the directory or the index cannot be opened
	 */
	private IndexReader createIndexReader(String indexerPath) throws IOException {
		File indexfile = new File(indexerPath);
		indexDir = FSDirectory.open(indexfile.toPath());

		if (!DirectoryReader.indexExists(indexDir)) {
			LOG.log(Level.SEVERE,
					"No Lucene Index Directory Found, Invoke buildIndex() First !");
			// NOTE(review): exiting the JVM from library code also kills any host
			// application; kept because existing callers may rely on this exit
			// status - consider throwing an IOException instead.
			System.exit(1);
		}

		return DirectoryReader.open(indexDir);
	}

	private HashMap> resolveEntities(List locationNames,
														  int count, IndexReader reader) throws IOException {
		if (locationNames.size() >= 200)
			hitsPerPage = 5; // avoid heavy computation
		IndexSearcher searcher = new IndexSearcher(reader);
		Query q = null;

		HashMap> allCandidates = new HashMap>();

		for (String name : locationNames) {

			if (!allCandidates.containsKey(name)) {
				try {
					//query is wrapped in additional quotes (") to avoid query tokenization on space
					q = new MultiFieldQueryParser(new String[] { FIELD_NAME_NAME,
							FIELD_NAME_ALTERNATE_NAMES }, analyzer).parse(String.format("\"%s\"", name) );

					//sort descending on population
					SortField populationSort = new SortedNumericSortField(FIELD_NAME_POPULATION, SortField.Type.LONG, true);

					Sort sort = new Sort(populationSort);
					//Fetch 3 times desired values, these will be sorted on code and only desired number will be kept
					ScoreDoc[] hits = searcher.search(q, hitsPerPage * 3 , sort).scoreDocs;

					List topHits = new ArrayList();

					for (int i = 0; i < hits.length; ++i) {
						Location tmpLocObj = new Location();

						int docId = hits[i].doc;
						Document d;
						try {
							d = searcher.doc(docId);
							tmpLocObj.setName(d.get(FIELD_NAME_NAME));
							tmpLocObj.setLongitude(d.get(FIELD_NAME_LONGITUDE));
							tmpLocObj.setLatitude(d.get(FIELD_NAME_LATITUDE));
							//If alternate names are empty put name as actual name
							//This covers missing data and equals weight for later computation
							if (d.get(FIELD_NAME_ALTERNATE_NAMES).isEmpty()){
								tmpLocObj.setAlternateNames(d.get(FIELD_NAME_NAME));
							}else{
								tmpLocObj.setAlternateNames(d.get(FIELD_NAME_ALTERNATE_NAMES));
							}
							tmpLocObj.setCountryCode(d.get(FIELD_NAME_COUNTRY_CODE));
							tmpLocObj.setAdmin1Code(d.get(FIELD_NAME_ADMIN1_CODE));
							tmpLocObj.setAdmin2Code(d.get(FIELD_NAME_ADMIN2_CODE));
							tmpLocObj.setFeatureCode(d.get(FIELD_NAME_FEATURE_CODE));

						} catch (IOException e) {
							e.printStackTrace();
						}
						topHits.add(tmpLocObj);
					}
					//Picking hitsPerPage number of locations from feature code sorted list 
					allCandidates.put(name, pickTopSortedByCode(topHits,hitsPerPage));
				} catch (org.apache.lucene.queryparser.classic.ParseException e) {
					e.printStackTrace();
				}
			}
		}

		HashMap> resolvedEntities = new HashMap>();
		pickBestCandidates(resolvedEntities, allCandidates, count);
		return resolvedEntities;
	}
	
	/**
	 * Sorts inputLocations as per FeatureCodeComparator and returns at most topCount locations.
	 * The input list is sorted in place and the result is a view of its leading elements.
	 *
	 * @param inputLocations List of locations to be sorted; may be null or empty
	 * @param topCount Number of locations to be kept in curtailed list
	 * @return List of at most topCount locations sorted by
	 *         edu.usc.ir.geo.gazetteer.CustomLuceneGeoGazetteerComparator.FeatureCodeComparator;
	 *         an empty list when there is no input
	 */
	private List<Location> pickTopSortedByCode(List<Location> inputLocations, int topCount) {
		if (inputLocations == null || inputLocations.isEmpty()) {
			return new ArrayList<>();
		}

		Collections.sort(inputLocations, new CustomLuceneGeoGazetteerComparator.FeatureCodeComparator());
		// subList's upper bound is exclusive, so a short list must be bounded by its
		// full size (not size - 1) or its last element would be dropped
		return inputLocations.subList(0, Math.min(topCount, inputLocations.size()));
	}

	/**
	 * Select the best match for each location name extracted from a document,
	 * choosing from among a list of lists of candidate matches. Filter uses the
	 * following features: 1) edit distance between name and the resolved name,
	 * choose smallest one 2) content (haven't implemented)
	 *
	 * @param resolvedEntities
	 *            final result for the input stream
	 * @param allCandidates
	 *            each location name may hits several documents, this is the
	 *            collection for all hitted documents
	 * @param count
	 * 			  Number of results for one locations
	 * @throws IOException
	 * @throws RuntimeException
	 */

	private void pickBestCandidates(
			HashMap> resolvedEntities,
			HashMap> allCandidates, int count) {

		for (String extractedName : allCandidates.keySet()) {

			List cur = allCandidates.get(extractedName);
			if(cur.isEmpty())
				continue;//continue if no results found

			int maxWeight = Integer.MIN_VALUE ;
			//In case weight is equal for all return top element
			int bestIndex = 0;
			//Priority queue to return top elements
			PriorityQueue pq = new PriorityQueue<>(cur.size(), new Comparator() {
				@Override
				public int compare(Location o1, Location o2) {
					return Integer.compare(o2.getWeight(), o1.getWeight());
				}
			});

			for (int i = 0; i < cur.size(); ++i) {
				int weight = 0;
				// get cur's ith resolved entry's name
				String resolvedName = String.format(" %s ", cur.get(i).getName());
				if (resolvedName.contains(String.format(" %s ", extractedName))) {
					// Assign a weight as per configuration if extracted name is found as a exact word in name
					weight = WEIGHT_NAME_MATCH;
				} else if (resolvedName.contains(extractedName)) {
					// Assign a weight as per configuration if extracted name is found partly in name
					weight = WEIGHT_NAME_PART_MATCH;
				}
				// get all alternate names of cur's ith resolved entry's
				String[] altNames = cur.get(i).getAlternateNames().split(",");
				float altEditDist = 0;
				for(String altName : altNames){
					if(altName.contains(extractedName)){
						altEditDist+=StringUtils.getLevenshteinDistance(extractedName, altName);
					}
				}
				//lesser the edit distance more should be the weight
				weight += getCalibratedWeight(altNames.length, altEditDist);

				//Give preference to sorted results. 0th result should have more priority
				weight += (cur.size()-i) * WEIGHT_SORT_ORDER;

				cur.get(i).setWeight(weight);

				if (weight > maxWeight) {
					maxWeight = weight;
					bestIndex = i;
				}

				pq.add(cur.get(i)) ;
			}
			if (bestIndex == -1)
				continue;

			List resultList = new ArrayList<>();

			for(int i =0 ; i< count && !pq.isEmpty() ; i++){
				resultList.add(pq.poll());
			}

			resolvedEntities.put(extractedName, resultList);
		}
	}

	/**
	 * Returns a weight for average edit distance for set of alternate name

* altNamesSize * WEIGHT_SIZE_ALT_NAME - (altEditDist/altNamesSize) ;

* altNamesSize * WEIGHT_SIZE_ALT_NAME ensure more priority for results with more alternate names.
* altEditDist/altNamesSize is average edit distance.
* Lesser the average, higher the over all expression * @param altNamesSize - Count of altNames * @param altEditDist - sum of individual edit distances * @return */ public float getCalibratedWeight(int altNamesSize, float altEditDist) { return altNamesSize * WEIGHT_SIZE_ALT_NAME - (altEditDist/altNamesSize) ; } /** * Build the gazetteer index line by line * * @param gazetteerPath * path of the gazetteer file * @param indexerPath * path to the created Lucene index directory. * @throws IOException * @throws RuntimeException */ public void buildIndex(String gazetteerPath, String indexerPath) throws IOException { File indexfile = new File(indexerPath); indexDir = FSDirectory.open(indexfile.toPath()); if (!DirectoryReader.indexExists(indexDir)) { IndexWriterConfig config = new IndexWriterConfig(analyzer); indexWriter = new IndexWriter(indexDir, config); Logger logger = Logger.getLogger(this.getClass().getName()); logger.log(Level.WARNING, "Start Building Index for Gazatteer"); BufferedReader filereader = new BufferedReader( new InputStreamReader(new FileInputStream(gazetteerPath), "UTF-8")); String line; int count = 0; while ((line = filereader.readLine()) != null) { try { count += 1; if (count % 100000 == 0) { logger.log(Level.INFO, "Indexed Row Count: " + count); } addDoc(indexWriter, line); } catch (RuntimeException re) { logger.log(Level.WARNING, "Skipping... 
Error on line: {}", line); } } logger.log(Level.WARNING, "Building Finished"); filereader.close(); indexWriter.close(); } } /** * Index gazetteer's one line data by built-in Lucene Index functions * * @param indexWriter * Lucene indexWriter to be loaded * @param line * a line from the gazetteer file * @throws IOException * @throws NumberFormatException */ private static void addDoc(IndexWriter indexWriter, final String line) { String[] tokens = line.split("\t"); int ID = Integer.parseInt(tokens[0]); String name = tokens[1]; String alternatenames = tokens[3]; Double latitude = -999999.0; try { latitude = Double.parseDouble(tokens[4]); } catch (NumberFormatException e) { latitude = OUT_OF_BOUNDS; } Double longitude = -999999.0; try { longitude = Double.parseDouble(tokens[5]); } catch (NumberFormatException e) { longitude = OUT_OF_BOUNDS; } int population = 0; try { population = Integer.parseInt(tokens[14]); } catch (NumberFormatException e) { population = 0;// Treat as population does not exists } // Additional fields to rank more known locations higher // All available codes can be viewed on www.geonames.org String featureCode = tokens[7];// more granular category String countryCode = tokens[8]; String admin1Code = tokens[10];// eg US State String admin2Code = tokens[11];// eg county Document doc = new Document(); doc.add(new IntField(FIELD_NAME_ID, ID, Field.Store.YES)); doc.add(new TextField(FIELD_NAME_NAME, name, Field.Store.YES)); doc.add(new DoubleField(FIELD_NAME_LONGITUDE, longitude, Field.Store.YES)); doc.add(new DoubleField(FIELD_NAME_LATITUDE, latitude, Field.Store.YES)); doc.add(new TextField(FIELD_NAME_ALTERNATE_NAMES, alternatenames, Field.Store.YES)); doc.add(new TextField(FIELD_NAME_FEATURE_CODE, featureCode, Field.Store.YES)); doc.add(new TextField(FIELD_NAME_COUNTRY_CODE, countryCode, Field.Store.YES)); doc.add(new TextField(FIELD_NAME_ADMIN1_CODE, admin1Code, Field.Store.YES)); doc.add(new TextField(FIELD_NAME_ADMIN2_CODE, admin2Code, 
Field.Store.YES)); doc.add(new NumericDocValuesField(FIELD_NAME_POPULATION, population));//sort enabled field try { indexWriter.addDocument(doc); } catch (IOException e) { e.printStackTrace(); } } @Override public void close() throws IOException { if (indexReader != null) { this.indexReader.close(); } } /** * Writes the result as formatted json to given PrintStream * @param resolvedEntities map of resolved entities * @param out the print stream for writing output */ public static void writeResultJson(Map> resolvedEntities, PrintStream out) { out.println(new Gson().toJson(resolvedEntities) ); } /** * Writes the result to given PrintStream * @deprecated Use writeResultJson instead * @param resolvedEntities map of resolved entities * @param out the print stream for writing output */ @Deprecated public static void writeResult(Map> resolvedEntities, PrintStream out) { out.println("["); List keys = (List)(List) Arrays.asList(resolvedEntities.keySet().toArray()); //TODO: use org.json.JSONArray and remove this custom formatting code for (int j=0; j < keys.size(); j++) { String n = keys.get(j); out.println("{\"" + n + "\" : ["); List terms = resolvedEntities.get(n); for (int i = 0; i < terms.size(); i++) { Location res = terms.get(i); if (i < terms.size() - 1) { out.println(res + ","); } else { out.println(res); } } if (j < keys.size() -1){ out.println("]},"); } else{ out.println("]}"); } } out.println("]"); } public static void main(String[] args) throws Exception { Option buildOpt = OptionBuilder.withArgName("gazetteer file").hasArg().withLongOpt("build") .withDescription("The Path to the Geonames allCountries.txt") .create('b'); Option searchOpt = OptionBuilder.withArgName("set of location names").withLongOpt("search").hasArgs() .withDescription("Location names to search the Gazetteer for") .create('s'); Option indexOpt = OptionBuilder .withArgName("directoryPath") .withLongOpt("index") .hasArgs() .withDescription( "The path to the Lucene index directory to either create 
or read") .create('i'); Option helpOpt = OptionBuilder.withLongOpt("help") .withDescription("Print this message.").create('h'); Option resultCountOpt = OptionBuilder.withArgName("number of results").withLongOpt("count").hasArgs() .withDescription("Number of best results to be returned for one location").withType(Integer.class) .create('c'); Option serverOption = OptionBuilder.withArgName("Launch Server") .withLongOpt("server") .withDescription("Launches Geo Gazetteer Service") .create("server"); Option jsonOption = OptionBuilder.withArgName("outputs json") .withLongOpt(JSON_OPT) .withDescription("Formats output in well defined json structure") .create(JSON_OPT); String indexPath = null; String gazetteerPath = null; Options options = new Options(); options.addOption(buildOpt); options.addOption(searchOpt); options.addOption(indexOpt); options.addOption(helpOpt); options.addOption(resultCountOpt); options.addOption(serverOption); options.addOption(jsonOption); // create the parser CommandLineParser parser = new DefaultParser(); GeoNameResolver resolver = new GeoNameResolver(); try { // parse the command line arguments CommandLine line = parser.parse(options, args); if (line.hasOption("index")) { indexPath = line.getOptionValue("index"); } if (line.hasOption("build")) { gazetteerPath = line.getOptionValue("build"); } if (line.hasOption("help")) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("lucene-geo-gazetteer", options); System.exit(1); } if (indexPath != null && gazetteerPath != null) { LOG.info("Building Lucene index at path: [" + indexPath + "] with geoNames.org file: [" + gazetteerPath + "]"); resolver.buildIndex(gazetteerPath, indexPath); } if (line.hasOption("search")) { List geoTerms = new ArrayList(Arrays.asList(line .getOptionValues("search"))); String countStr = line.getOptionValue("count", "1"); int count = 1; if (countStr.matches("\\d+")) count = Integer.parseInt(countStr); Map> resolved = resolver .searchGeoName(indexPath, 
geoTerms, count); if(line.hasOption(JSON_OPT)){ writeResultJson(resolved, System.out); }else{ writeResult(resolved, System.out); } } else if (line.hasOption("server")){ if (indexPath == null) { System.err.println("Index path is required"); System.exit(-2); } //TODO: get port from CLI args int port = 8765; Launcher.launchService(port, indexPath); } else { System.err.println("Sub command not recognised"); System.exit(-1); } } catch (ParseException exp) { // oops, something went wrong System.err.println("Parsing failed. Reason: " + exp.getMessage()); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy