
prerna.wikidata.WikiDescriptionExtractor
package prerna.wikidata;

import java.util.List;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.wikidata.wdtk.wikibaseapi.WbSearchEntitiesResult;
import org.wikidata.wdtk.wikibaseapi.WikibaseDataFetcher;

import prerna.util.Utility;

public class WikiDescriptionExtractor {

	// description prefixes for results to skip (e.g. disambiguation pages, bulk-imported scientific articles)
	private static List<String> ignoreResultsList = new Vector<>();
	static {
		ignoreResultsList.add("Wikimedia disambiguation page");
		ignoreResultsList.add("scientific article published on");
	}

	public static final Logger LOGGER = LogManager.getLogger(WikiDescriptionExtractor.class);
	private Logger logger;

	public WikiDescriptionExtractor() {
		
	}
	
	/**
	 * Search Wikidata for the given term and collect the descriptions of the matching entities
	 * @param searchTerm
	 * @return
	 * @throws Exception
	 */
	public List<String> getDescriptions(String searchTerm) throws Exception {
		Logger logger = getLogger();
		List<String> descriptionList = new Vector<>();

		WikibaseDataFetcher wbdf = WikibaseDataFetcher.getWikidataDataFetcher();
		searchTerm = searchTerm.trim().replace("_", " ");
		List<WbSearchEntitiesResult> searchResults = wbdf.searchEntities(searchTerm, Long.valueOf(10));
		int numReturns = searchResults.size();
		if(numReturns == 0) {
			logger.info("Found no results searching for " + Utility.cleanLogString(searchTerm));
			return descriptionList;
		}
		logger.info("Querying wikidata returned " + numReturns + " results for " + Utility.cleanLogString(searchTerm));
		List<Callable<String>> descriptionExtractors = new Vector<>();
		for(int i = 0; i < searchResults.size(); i++) {
			WbSearchEntitiesResult res = searchResults.get(i);
			WikiDescriptionCallable callable = new WikiDescriptionCallable(wbdf, res);
			callable.setLogger(logger);
			descriptionExtractors.add(callable);
		}
		
		// fetch each candidate's description in parallel: one worker thread per search result
		ExecutorService executorService = Executors.newFixedThreadPool(searchResults.size());
		CompletionService<String> completionService = new ExecutorCompletionService<>(executorService);
		for (Callable<String> descriptionExtractor : descriptionExtractors) {
			completionService.submit(descriptionExtractor);
		}
		
		// Continue until all have completed
		try {
			while (numReturns > 0) {
				try {
					String foundDescription = completionService.take().get();
					if(foundDescription != null) {
						foundDescription = foundDescription.trim();
						if(!foundDescription.isEmpty()) {
							boolean ignoreDescription = false;
							for(String ignore : ignoreResultsList) {
								if(foundDescription.startsWith(ignore)) {
									ignoreDescription = true;
									break;
								}
							}
							if(!ignoreDescription) {
								descriptionList.add(foundDescription);
							}
						}
					}
					numReturns--;
				} catch (Exception e) {
					logger.error("StackTrace: ", e);
					throw e;
				}
			}
		} finally {
			// shut down only after every task has been consumed (or an error was thrown),
			// not after the first result, so the remaining callables are not interrupted
			executorService.shutdownNow();
		}
		
		return descriptionList;
	}
	
	public void setLogger(Logger logger) {
		this.logger = logger;
	}
	
	/**
	 * Get the correct logger
	 * @return the logger set via setLogger, or the class-level LOGGER if none has been set
	 */
	private Logger getLogger() {
		if(this.logger == null) {
			return LOGGER;
		}
		return this.logger;
	}
}
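
For reference, a minimal usage sketch (the demo class name and search term are illustrative, and it assumes this jar plus its Wikidata Toolkit and Log4j dependencies are on the classpath):

import java.util.List;

import org.apache.logging.log4j.LogManager;

import prerna.wikidata.WikiDescriptionExtractor;

public class WikiDescriptionDemo {

	public static void main(String[] args) throws Exception {
		WikiDescriptionExtractor extractor = new WikiDescriptionExtractor();
		// optional: route logging to a custom logger; otherwise the class-level LOGGER is used
		extractor.setLogger(LogManager.getLogger(WikiDescriptionDemo.class));
		// "Albert Einstein" is an arbitrary example term; underscores are also accepted
		List<String> descriptions = extractor.getDescriptions("Albert Einstein");
		for (String description : descriptions) {
			System.out.println(description);
		}
	}
}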



