All downloads are free. The search and download features use the official Maven repository.

eu.socialsensor.sfc.builder.solrQueryBuilder.TrendingSolrQueryBuilder Maven / Gradle / Ivy

package eu.socialsensor.sfc.builder.solrQueryBuilder;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.log4j.Logger;

import eu.socialsensor.framework.common.domain.Keyword;
import eu.socialsensor.framework.common.domain.Query;
import eu.socialsensor.framework.common.domain.Stopwords;
import eu.socialsensor.framework.common.domain.dysco.Dysco;
import eu.socialsensor.framework.common.domain.dysco.Entity;

/**
 * @brief The class that creates the solr query based on the 
 * the content of a trending DySco (keywords,entities,hashtags)
 * @author ailiakop
 * @email  [email protected]
 */
public class TrendingSolrQueryBuilder {
	
	public static final int NUMBER_OF_QUERIES = 10;
	public static final int MIN_KEYWORD_LENGTH = 3;
	
	public final Logger logger = Logger.getLogger(TrendingSolrQueryBuilder.class);
	
	private List entities = new ArrayList();
	private List keywords = new ArrayList();
	private List hashtags = new ArrayList();
	
	private List mikeywords = new ArrayList();
	private List mientities = new ArrayList();
	
	private Dysco dysco = null;
	
	Stopwords stopwords = new Stopwords();
	
	public TrendingSolrQueryBuilder(Dysco dysco) {
		this.dysco = dysco;
		
		addfilteredDyscoContent();
		eliminateRepeatedKeywords();
	}
	
	/**
	 * Formulates one solr query connected with AND's and OR's
	 * ready to be used directly for retrieval from solr.
	 * @return
	 */
	public String createSolrQuery() {
		
		String solrQuery = "";
		String query = "";
		
		if(mikeywords.isEmpty() && mientities.isEmpty() && hashtags.isEmpty())
			return solrQuery;
		
		boolean first = true;

		if(!mientities.isEmpty()) {
			for(Entity entity : mientities) {
				for(Keyword key : mikeywords) {
					if(first) {
						query += "(\""+entity.getName()+"\" AND "+ key.getName()+")";
						first = false;
					}
					else {
						query += " OR (\"" + entity.getName()+"\" AND "+ key.getName()+")";
					}
				}
				
				if(mikeywords.isEmpty()) {
					if(first) {
						query += "\""+entity.getName()+"\"";
						first = false;
					}	
					else {
						query += " OR \"" + entity.getName()+"\"";
					}
				}
			}
		}
		
		if(!hashtags.isEmpty()) {
			for(Keyword hashtag : hashtags) {
				for(Keyword key : mikeywords) {
					if(first) {
						query += "("+hashtag.getName()+" AND "+ key.getName()+")";
						first = false;
					}
					else {
						query += " OR (" + hashtag.getName()+" AND "+ key.getName()+")";
					}
				}
				
				if(mikeywords.isEmpty()) {
					if(first) {
						query += hashtag.getName();
						first = false;
					}	
					else {
						query += " OR " + hashtag.getName()+"";
					}
				}
			}
		}
	
		if(mientities.isEmpty() && hashtags.isEmpty()) {
			for(Keyword key : mikeywords) {
				if(first) {
					query += key.getName();
					first = false;
				}
				else {
					query += " OR " + key.getName()+"";
				}
			}
		}
		
		//Final formulation of solr query
		
		if(!query.equals("")) {
			solrQuery += "(title : "+query+") OR (description:"+query+") OR (tags:"+query+")";
		}
		
		return solrQuery;
	}
	
	/**
	 * Formulates primal solr queries out of DySco content (keywords/entities/hashtags).
	 * The formulated queries are the product of the combination of keywords and entities, whereas
	 * hashtags are used independently. Queries are ranked by their calculated scores, which are produced 
	 * by processing the frequency scores of keywords,entities and hashtags in the DySco. The formulated queries
	 * need to be aggregated to be used for solr retrieval.
	 * @return the list of queries
	 */
	public List createPrimalSolrQueries() {
		
		Map> rankedQueries = new TreeMap>(Collections.reverseOrder());
		
		//create queries from hashtags
		for(Keyword hashtag : hashtags) {
			
			Query query = new Query();
			query.setName(hashtag.getName());
			query.setScore(hashtag.getScore());
			query.setType(Query.Type.Keywords);
			
			double score = hashtag.getScore();
			List alreadyIn = rankedQueries.get(score);
			
			if(alreadyIn == null) {
				alreadyIn = new ArrayList();
				rankedQueries.put(score, alreadyIn);
			}
			alreadyIn.add(query);
			
			for(Entity entity : entities) {
				String qStr = hashtag.getName() + " " + entity.getName();
				
				query = new Query();
				query.setName(qStr);
				query.setScore(hashtag.getScore());
				query.setType(Query.Type.Keywords);
				
				score = hashtag.getScore();
				alreadyIn = rankedQueries.get(score);
				
				if(alreadyIn == null) {
					alreadyIn = new ArrayList();
					rankedQueries.put(score, alreadyIn);
				}
				alreadyIn.add(query);
			}
			
			for(Keyword keyword : keywords) {	
				String qStr = hashtag.getName() + " " + keyword.getName();
				
				query = new Query();
				query.setName(qStr);
				query.setScore(hashtag.getScore());
				query.setType(Query.Type.Keywords);
				
				score = hashtag.getScore();
				alreadyIn = rankedQueries.get(score);
				if(alreadyIn == null) {
					alreadyIn = new ArrayList();
					rankedQueries.put(score, alreadyIn);
				}
				alreadyIn.add(query);
			}
			
		}
		
		//create queries from entities - keywords combination
		for(Entity entity : entities) {
			for(Keyword keyword : keywords) {	
				Query query = new Query();
			
				String resQuery = getRightEntityKeywordCombination(entity.getName(), keyword.getName());
		
				query.setName(resQuery);
				double aggScore = entity.getCont() + keyword.getScore();
				query.setScore(aggScore);
				query.setType(Query.Type.Keywords);

				List alreadyIn = rankedQueries.get(aggScore);
				if(alreadyIn == null) {
					alreadyIn = new ArrayList();
					rankedQueries.put(aggScore, alreadyIn);
				}
				alreadyIn.add(query);
				
			}

			Query query = new Query();

			query.setName("\""+entity.getName()+"\"");
			
			query.setScore(entity.getCont());
			query.setType(Query.Type.Keywords);

			double entityScore = entity.getCont();
			
			List alreadyIn = rankedQueries.get(entityScore);
			if(alreadyIn == null) {
				alreadyIn = new ArrayList();
				rankedQueries.put(entityScore, alreadyIn);
			}
			
			alreadyIn.add(query);
		}
		
		for(Keyword keyword : keywords) {
			String name = keyword.getName();
			String[] parts = name.split("\\s+");
			
			double kScore = parts.length * keyword.getScore();
			
			if(parts.length >= TrendingSolrQueryBuilder.MIN_KEYWORD_LENGTH) {		
					
				Query query = new Query();
				query.setName(name);
				query.setScore(kScore);
				query.setType(Query.Type.Keywords);

				List alreadyIn = rankedQueries.get(kScore);
				if(alreadyIn == null) {
					alreadyIn = new ArrayList();
					rankedQueries.put(kScore, alreadyIn);
				}
				alreadyIn.add(query);
			}
		}
		
		List solrQueries = new ArrayList();
		for(Map.Entry> entry : rankedQueries.entrySet()) {
			for(Query q : entry.getValue()) {
				if(solrQueries.size() == TrendingSolrQueryBuilder.NUMBER_OF_QUERIES) {
					break;
				}
				solrQueries.add(q);
				
			}
		}
		return solrQueries;
	}
	
	/**
	 * Combines an entity string with a keywords string, detecting an overlap between the two
	 * if exists.
	 * @param ent
	 * @param keywords
	 * @return the combination of the entity and the keyword as string
	 */
	private String getRightEntityKeywordCombination(String entity, String keywords) {
		String combination = "";
		
		List splittedKeywords = new ArrayList();
		
		entity = entity.toLowerCase();
		keywords = keywords.toLowerCase();
		
		for(String key : keywords.split("\\s+"))
			splittedKeywords.add(key);
		
		String[] entityWords = entity.split("\\s+"); 
			
		List wordsFound = new ArrayList();
		for(String eWord : entityWords) {
			if(splittedKeywords.contains(eWord)) {
				wordsFound.add(eWord);
			}
		}
		
		if(wordsFound.size() == entityWords.length) {
			String resQuery = keywords.replace(entity, "");
			if(resQuery.length() == 0)
				combination = "\""+entity +"\"";
			else
				combination = "\""+entity +"\""+resQuery;
		}
		else if(wordsFound.isEmpty()) {
			combination = "\""+entity +"\" " + keywords;
		}
		else {
			int lastIndex = 0;
			for(int i=0;i keywords.length()) {
						keywords = "\""+entityWords[i]+"\" "+keywords;
					}
					else {
						String part1 = keywords.substring(0,lastIndex);
						String part2 = keywords.substring(lastIndex+1);
						part1 +="\""+entityWords[i]+"\" ";
						keywords = part1 + part2;
					}
				}
			}
			combination = keywords;
		}
		
		return combination;
	}
	
	/**
	 * Filters DySco's content from stopwords, urls, emails and
	 * other unnecessary features.
	 */
	private void addfilteredDyscoContent() {
		
		List filteredEntities = new ArrayList();
		
		//Filter entities
		if(dysco.getEntities() != null) {
			filteredEntities.addAll(dysco.getEntities());
			for(Entity entity : dysco.getEntities()) {
				
				int r_entity = -1;
				for(Entity f_entity : filteredEntities) {
					if(f_entity.getName().contains(entity.getName()) && !f_entity.getName().equals(entity.getName())) {
						r_entity = filteredEntities.indexOf(entity);
						break;
					}
					else if(entity.getName().contains(f_entity.getName()) && !f_entity.getName().equals(entity.getName())) {
						r_entity = filteredEntities.indexOf(f_entity);;
						break;
					}
				}
				
				if(r_entity != -1) {
					filteredEntities.remove(r_entity);
				}
				
				int index = filteredEntities.indexOf(entity);
				if(index != -1) {
					if(entity.getName().contains("#") || Stopwords.isStopword(entity.getName().toLowerCase())
							|| entity.getName().split(" ").length > 3) {
						filteredEntities.remove(entity);
						continue;
					}
					if(entity.getName().contains("http")) {
						String newEntity = entity.getName().substring(0,entity.getName().indexOf("http")-1);
						filteredEntities.get(index).setName(newEntity);
					}
					if(entity.getName().contains("@")) {
						String newEntity = entity.getName().replace("@", "");
						filteredEntities.get(index).setName(newEntity);
					}
					
					filteredEntities.get(index).setName(filteredEntities.get(index).getName().toLowerCase());
					filteredEntities.get(index).setName(filteredEntities.get(index).getName().replaceAll("'s", ""));
					filteredEntities.get(index).setName(filteredEntities.get(index).getName().replaceAll("[^A-Za-z0-9 ]", ""));
					filteredEntities.get(index).setName(filteredEntities.get(index).getName().replaceAll("\\s+", " "));
				}
			}
			entities.addAll(filteredEntities);
		}
			
		//Filter keywords
		if(dysco.getKeywords() != null) {
			Map keywordsToFilter = new HashMap();
			keywordsToFilter.putAll(dysco.getKeywords());
		
			for(String key : dysco.getKeywords().keySet()) {
			
				if(key.contains("@")||key.contains("#") || stopwords.is(key) || key.equals("http")
						|| key.split(" ").length > 3) {
					keywordsToFilter.remove(key);
					continue;
				}
				
				if(key.contains("http")) {
					String newKey = key.replaceAll("http","");
					keywordsToFilter.put(newKey, dysco.getKeywords().get(key));
					keywordsToFilter.remove(key);
				}
				
				String keyToFilter = key;
				keyToFilter = keyToFilter.toLowerCase();
				
				keyToFilter = keyToFilter.replaceAll("'s", "");
				keyToFilter = keyToFilter.replaceAll("[^A-Za-z0-9 ]", "");
				keyToFilter = keyToFilter.replaceAll("\\s+", " ");
			
				//Create the keyword to use
				Keyword keyword = new Keyword(keyToFilter,dysco.getKeywords().get(key).floatValue());
				keywords.add(keyword);
			}
		}
		
		if(dysco.getHashtags() != null) {
			for(String hashtag : dysco.getHashtags().keySet()) {
				//Create the keyword to use
				Keyword keyword = new Keyword(hashtag.replace("#", ""),dysco.getHashtags().get(hashtag).floatValue());
				hashtags.add(keyword);
			}
		}
			
	}
	
	/**
	 * Eliminates duplicate keywords that may exist both in hashtag or entity list 
	 * and the keywords list 
	 */
	private void eliminateRepeatedKeywords() {
		List keywordsToEliminate = new ArrayList();
		for(Keyword key : keywords) {
			for(Entity ent : entities) {
				if(ent.getName().equals(key.getName())) {
					keywordsToEliminate.add(key);
					ent.setCont(ent.getCont()+key.getScore());
				}
			}
			for(Keyword hash : hashtags) {
				if(hash.getName().equals(key.getName())) {
					keywordsToEliminate.add(key);
					hash.setScore(hash.getScore()+key.getScore());
				}
			}
		}
		
		for(Keyword key : keywordsToEliminate) {
			keywords.remove(key);
		}
		
	}
	
	public static void main(String[] args) {
		
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy