// All downloads are free. The search and download features use the official Maven repository.
//
// eu.socialsensor.sfc.builder.SolrQueryBuilder (Maven / Gradle / Ivy)

package eu.socialsensor.sfc.builder;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.log4j.Logger;

import eu.socialsensor.framework.common.domain.Item;
import eu.socialsensor.framework.common.domain.Query;
import eu.socialsensor.framework.common.domain.dysco.CustomDysco;
import eu.socialsensor.framework.common.domain.dysco.Dysco;
import eu.socialsensor.framework.common.domain.dysco.Dysco.DyscoType;
import eu.socialsensor.framework.common.domain.dysco.Entity;
import eu.socialsensor.sfc.builder.solrQueryBuilder.CustomSolrQueryBuilder;
import eu.socialsensor.sfc.builder.solrQueryBuilder.KeywordsExtractor;
import eu.socialsensor.sfc.builder.solrQueryBuilder.QueryFormulator;
import eu.socialsensor.sfc.builder.solrQueryBuilder.TrendingSolrQueryBuilder;
import eu.socialsensor.sfc.builder.solrQueryBuilder.graph.GraphCreator;
/**
 * @brief Class responsible for the creation of a SolrQuery
 * on the basis of DySco content. SolrQuery can be then used for 
 * the retrieval of items/mediaitmes/webpages from Apache Solr database.
 * @author ailiakop
 * @email  [email protected]
 */
public class SolrQueryBuilder {
	
	public final Logger logger = Logger.getLogger(SolrQueryBuilder.class);

	//number of words in a solr query - the more words you include, 
	//the more complicated the query turns out to be
	private static final Integer NUMBER_OF_KEYWORDS_IN_QUERY = 4;
	
	public SolrQueryBuilder() {
		logger.info("SolrQueryBuilder instance created");
	}
	
	/**
	 * Returns one aggregated Solr query based on DySco content(trending/custom)
	 * @param dysco
	 * @return the solr query as String
	 */
	public String getSolrQueryString(Dysco dysco) {
	
		if(dysco.getDyscoType() == null) {
			logger.error("Dysco Type is not defined - Cannot extract solr query");
			return null;
		}
		
		if(dysco.getDyscoType().equals(DyscoType.CUSTOM)) {
			logger.info("Find solr query for custom dysco : " + dysco.getId());
			
			CustomDysco customDysco = (CustomDysco) dysco;
			CustomSolrQueryBuilder customBuilder = new CustomSolrQueryBuilder(customDysco);
			
			return customBuilder.createSolrQuery();
		}
		else {
			logger.info("Find solr query for trending dysco : "+dysco.getId());
			
			TrendingSolrQueryBuilder trendingBuilder = new TrendingSolrQueryBuilder(dysco);
			return trendingBuilder.createSolrQuery();
		}
	}
	
	/**
	 * Returns the Solr Queries derived from the DySco content(trending/custom)
	 * @param dysco
	 * @return List of Solr Queries
	 */
	public List getSolrQueries(Dysco dysco) {
		
		if(dysco.getDyscoType() == null) {
			logger.error("Dysco Type is not defined - Cannot extract solr query");
			return null;
		}
		
		if(dysco.getDyscoType().equals(DyscoType.CUSTOM)) {
			logger.info("Find solr query for custom dysco : " + dysco.getId());
			
			CustomDysco customDysco = (CustomDysco) dysco;
			CustomSolrQueryBuilder customBuilder = new CustomSolrQueryBuilder(customDysco);
			
			return customBuilder.createSolrQueries();
		}
		else {
			logger.info("Find solr query for trending dysco : " + dysco.getId());
			
			TrendingSolrQueryBuilder trendingBuilder = new TrendingSolrQueryBuilder(dysco);
			return trendingBuilder.createPrimalSolrQueries();
		}
	}
	
	/**
	 * Returns the Solr Queries for a DySco that resulted from a query expansion process. 
	 * The process utilizes items retrieved from primal Solr queries, derived from DySco content, 
	 * to construct a directed and weighted graph of keywords. The co-occurance of keywords adjacency 
	 * serves for adding weights to the graph. Finally the most weighted paths are selected to extract 
	 * additional solr queries, related to the topic at hand. 
	 * The Solr queries are the product of merging the primal Solr queries and the additional solr
	 * queries that were computed from the algorithm on the basis of their scores. 
	 * @param items
	 * @param queryNumberLimit
	 * @param dysco
	 * @return A list of Solr queries
	 */
	public List getExpandedSolrQueries(List items, Dysco dysco, Integer queryNumberLimit) {

		List formulatedSolrQueries = new ArrayList();
		
		List primalSolrQueries = dysco.getSolrQueries();
		for(Query pQuery : primalSolrQueries) {
			if(pQuery.getScore() == null)
				pQuery.setScore(10.0);
		}

		//Process keywords from items collection
		KeywordsExtractor keywordExtractor = new KeywordsExtractor(items);
		keywordExtractor.processItemsText();
		List topKeywords = keywordExtractor.getTopKeywords();
		Set contentToProcess = keywordExtractor.getTextContent();

		//Create the graph on the content Keywords Exctractor gave as output
		GraphCreator graphCreator = new GraphCreator(contentToProcess, topKeywords);
		graphCreator.setSubstituteWords(keywordExtractor.getWordsToReplace());
		graphCreator.createGraph();
		graphCreator.pruneLowConnectivityNodes();
		
		//if graph has no nodes return no queries
		if(graphCreator.getGraph().getNodes().size() == 0)
			return formulatedSolrQueries;
		
		//Track solr queries on the graph 
		QueryFormulator qFormulator = new QueryFormulator(graphCreator.getGraph(), keywordExtractor.getTopHashtags());
		qFormulator.generateKeywordQueries(NUMBER_OF_KEYWORDS_IN_QUERY);
		qFormulator.generateHashtagQueries();
	
		Map> scaledRankedKeywords = qFormulator.getRankedKeywordQueries();
		Map> scaledRankedHashtags = qFormulator.getRankedHashtagQueries();
		
		//Process formulated keywords and hashtags queries to boost repetitive queries and eliminate similar.
		//Keep the highly ranked queries.
		while(formulatedSolrQueries.size() < queryNumberLimit) {
			boolean keywordFound = false;
			boolean done = false;
			Double elementToRemove = 0.0; 
			
			if(scaledRankedKeywords.isEmpty() && scaledRankedHashtags.isEmpty())
				break;

			if(scaledRankedKeywords.isEmpty()) { 
				for(Double hashtagScore : scaledRankedHashtags.keySet()) {
					if(hashtagScore < 1.5) {
						done = true;
						break;
					}
					for(String solrQuery : scaledRankedHashtags.get(hashtagScore)) {
						formulatedSolrQueries.add(new Query(solrQuery, hashtagScore));
						if(formulatedSolrQueries.size() >= queryNumberLimit) {
							done = true;
							break;
						}
					}
					if(done)
						break;
				}
			}
			else if(scaledRankedHashtags.isEmpty()) {
				for(Double keywordScore : scaledRankedKeywords.keySet()) {
					if(keywordScore < 0.5) {
						done = true;
						break;
					}
						
					for(String solrQuery : scaledRankedKeywords.get(keywordScore)) {
						formulatedSolrQueries.add(new Query(solrQuery, keywordScore));
						if(formulatedSolrQueries.size() >= queryNumberLimit) {
							done = true;
							break;
						}
					}
					if(done)
						break;
				}
			}
			else {
				for(Double keyScore : scaledRankedKeywords.keySet()) {
					boolean hashtagFound = false;
					for(Double hashScore : scaledRankedHashtags.keySet()) {
						if(keyScore < 0.5 && hashScore < 0.5) {
							done = true;
							break;
						}
					
						if(keyScore > hashScore) {
							break;
						}
					
						for(String solrQuery : scaledRankedHashtags.get(hashScore)) {
							formulatedSolrQueries.add(new Query(solrQuery,hashScore));
							if(formulatedSolrQueries.size() >= queryNumberLimit) {
								done = true;
								break;
							}
						}
						elementToRemove = hashScore;
						hashtagFound = true;
						break;
					}
					
					if(done)
						break;
					
					if(!hashtagFound) {
						for(String solrQuery : scaledRankedKeywords.get(keyScore)) {
							formulatedSolrQueries.add(new Query(solrQuery,keyScore));
							if(formulatedSolrQueries.size() >= queryNumberLimit) {
								done = true;
								break;
							}
						}
						elementToRemove = keyScore;
						keywordFound = true;
					}
					else {
						scaledRankedHashtags.remove(elementToRemove);
					}
					break;
				}
				
				if(keywordFound) {
					scaledRankedKeywords.remove(elementToRemove);
				}
			}
			
			if(done)
				break;
		}
		
		//if no queries have been formulated return no queries
		if(formulatedSolrQueries.isEmpty())
			return formulatedSolrQueries;
		
		List processedQueries = new ArrayList();
	
		//detect dysco entities inside newly formulated queries 
		for(Query query : formulatedSolrQueries) {
			for(Entity entity : dysco.getEntities()) {
				String entityName = entity.getName();
				if(query.getName().contains(entityName) || query.getName().equals(entityName)) {
					String temp = query.getName().replace(entityName, "\"" + entityName + "\"");
					query.setName(temp);
				}
			}
			query.setIsFromExpansion(true);
			processedQueries.add(query);
		}
		
		//merge primal solr queries and additional formulated queries on the basis of their scores
		return mergeSolrQueries(primalSolrQueries, processedQueries, 3*queryNumberLimit);
	}

	/**
	 * Returns the resulted queries after merging two sets of queries on the basis of their scores. 
	 * In case there is an overlap between two queries the algorithm re-assigns queries scores to boost
	 * those that occur more often than others. 
	 * The number of resulted queries is limited by queryLimit. 
	 * @param primalQueries
	 * @param processedQueries
	 * @param queryLimit
	 * @return List of merged queries
	 */
	private List mergeSolrQueries(List primalQueries, List processedQueries, int queryLimit) {
		
		List finalSolrQueries = new ArrayList();
		
		Map primalSolrQueriesWeights = new HashMap();
		Map processedSolrQueriesWeights = new HashMap();
		Map> allRankedQueries = new TreeMap>(Collections.reverseOrder());
		
		for(Query q : primalQueries) {
			primalSolrQueriesWeights.put(q.getName(), q);
		}
		
		for(Query q : processedQueries) {
			processedSolrQueriesWeights.put(q.getName(), q);
		}
			
		for(Query primalQuery : primalQueries) {
			List entities = new ArrayList();
			
			String restPrimalQuery = primalQuery.getName();
	
			int start = 0, end = 0;
			while(start != -1 && end != -1) {
				start = restPrimalQuery.indexOf("\"");
		
				if(start == -1)
					break;
				
    			String temp = restPrimalQuery.substring(start+1);
    		
    			end = temp.indexOf("\"") + start + 1;	
    			if(end == -1)
					break;
    			
    			end += 1;
   
    			String entity = restPrimalQuery.substring(start, end);
    	
    			restPrimalQuery = restPrimalQuery.replace(entity, "").trim();
    			entities.add(entity);
			}
			
			for(Query processedQuery : processedQueries) {
	
				List otherEntities = new ArrayList();
				String restProcessedQuery = processedQuery.getName();
				
				start = 0;
				end = 0;
				while(start != -1 && end != -1) {
					start = restProcessedQuery.indexOf("\"");
				
					if(start == -1)
						break;
	    			
					String temp = restProcessedQuery.substring(start+1);
	    
	    			end = temp.indexOf("\"") + start + 1;
	    
	    			if(end == -1)
						break;
	    			
	    			end += 1;
	    			String entity = restProcessedQuery.substring(start, end);
	   
	    			restProcessedQuery = restProcessedQuery.replace(entity, "").trim();
	    			otherEntities.add(entity);
				}
				
				for(String ent : entities) {
					for(String oEnt : otherEntities) {
						if(ent.equals(oEnt)) {
							processedQuery.setScore(processedSolrQueriesWeights.get(processedQuery.getName()).getScore()+primalQuery.getScore());
							primalQuery.setScore(primalSolrQueriesWeights.get(primalQuery.getName()).getScore()+processedQuery.getScore());
							
							processedSolrQueriesWeights.put(processedQuery.getName(), processedQuery);
							primalSolrQueriesWeights.put(primalQuery.getName(), primalQuery);
						}
					}		
				}
				
				String[] words = restPrimalQuery.split("//s+");
				for(String word : words) {
					if(processedQuery.getName().contains(word) || processedQuery.getName().equals(word)) {
						processedQuery.setScore(processedSolrQueriesWeights.get(processedQuery.getName()).getScore()+primalQuery.getScore());
						processedSolrQueriesWeights.put(processedQuery.getName(), processedQuery);
					}
						
				}
				
				String[] otherWords = restProcessedQuery.split("//s+");
				for(String word : otherWords) {
					if(primalQuery.getName().contains(word) || primalQuery.getName().equals(word)) {
						primalQuery.setScore(primalSolrQueriesWeights.get(primalQuery.getName()).getScore()+processedQuery.getScore());
						primalSolrQueriesWeights.put(primalQuery.getName(), primalQuery);
					}		
				}
			}
		}
		
		for(Map.Entry entry : primalSolrQueriesWeights.entrySet()) {
			
			if(allRankedQueries.containsKey(entry.getValue().getScore())) {
				boolean exists = false;
				List alreadyIn = allRankedQueries.get(entry.getValue().getScore());
				
				for(Query inQuery : alreadyIn) {
					if(inQuery.getName().equals(entry.getKey())) {
						exists = true;
						break;
					}
				}
				
				if(!exists) {
					alreadyIn.add(entry.getValue());
					allRankedQueries.put(entry.getValue().getScore(), alreadyIn);
				}
			}
			else {
				List alreadyIn = new ArrayList();
				
				alreadyIn.add(entry.getValue());
				allRankedQueries.put(entry.getValue().getScore(), alreadyIn);
			}
		}
		
		for(Map.Entry entry : processedSolrQueriesWeights.entrySet()) {
		
			if(allRankedQueries.containsKey(entry.getValue().getScore())) {
				boolean exists = false;
				List alreadyIn = allRankedQueries.get(entry.getValue().getScore());
				
				for(Query inQuery : alreadyIn) {
					if(inQuery.getName().equals(entry.getKey())) {
						exists = true;
						break;
					}
				}
				
				if(!exists) {
					alreadyIn.add(entry.getValue());
					allRankedQueries.put(entry.getValue().getScore(), alreadyIn);
				}
			}
			else {
				List alreadyIn = new ArrayList();
				
				alreadyIn.add(entry.getValue());
				allRankedQueries.put(entry.getValue().getScore(), alreadyIn);
			}
		}
		
		Map> allScaledRankedQueries = scaleKeywordsToWeight(allRankedQueries);
		for(Map.Entry> entry : allScaledRankedQueries.entrySet()) {
			
			if(finalSolrQueries.size() == queryLimit)
				break;
			
			for(Query finalQuery : entry.getValue()) {
				if(finalSolrQueries.size() == queryLimit)
					break;
				
				finalSolrQueries.add(finalQuery);
			}
		}
		return finalSolrQueries;
	}
	
	/**
	 * Scale solr queries scores on a 0-30 scale to accommodate comparison purposes
	 * @param inputData
	 * @return Map of scaled scores to the queries they correspond to
	 */
	private Map> scaleKeywordsToWeight(Map> inputData) {
		Map> scaledData = new TreeMap>(Collections.reverseOrder());
		
		if(inputData.isEmpty())
			return scaledData;
		
		double max = 0.0;
		double min = 1000000.0;
		
		for(Double score : inputData.keySet()) {
			if(score > max) {
				max = score;
			}
			
			if(score < min) {
				min = score;
			}
		}

		for(Map.Entry> entry : inputData.entrySet()) {
			if(min == max) {
				scaledData.put(1.0, entry.getValue());
			}
			else {
				Double value = 30 * (entry.getKey() - min)/(max - min);
				scaledData.put(value, entry.getValue());
			}
		}
		return scaledData; 
	}
	
	
	/**
	 * @param args
	 */
	public static void main(String[] args) {
		
	}

}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy