All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.socialsensor.sfc.builder.ranking.TrendsRanker Maven / Gradle / Ivy

package eu.socialsensor.sfc.builder.ranking;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;
import java.util.TreeMap;

import org.apache.log4j.Logger;

import eu.socialsensor.framework.client.search.solr.SolrDyscoHandler;
import eu.socialsensor.framework.client.search.solr.SolrItemHandler;
import eu.socialsensor.framework.common.domain.Item;
import eu.socialsensor.framework.common.domain.Query;
import eu.socialsensor.framework.common.domain.dysco.Dysco;
import eu.socialsensor.framework.common.domain.dysco.Entity;
import eu.socialsensor.sfc.builder.solrQueryBuilder.Calculator;

/**
 * Computes a ranking score for a DySco by comparing its content against an
 * up-to-date RSS item collection. The score reflects how newsworthy the
 * information carried by the DySco is.
 * @author ailiakop
 * @email [email protected]
 */
public class TrendsRanker {
	
	/** Logger of this ranker. Kept public and per-instance so existing accesses keep working. */
	public final Logger logger = Logger.getLogger(TrendsRanker.class);
	
	/** Number of milliseconds in one day; used to express the age of a DySco in days. */
	private static final long DAY_IN_MILLISECONDS = 86400000L;
	
	/** Handler used to query the item collection; assigned once in the constructor. */
	private SolrItemHandler solrItemHandler;
	
	/** Most recent DySco scores (at most 200); source of the min/max used for normalization. */
	private BoundedList<Double> dyscoScoresList = new BoundedList<Double>(200);
	/** Most recent ranker scores (at most 200); source of the min/max used for normalization. */
	private BoundedList<Double> rankerScoresList = new BoundedList<Double>(200);
	
	/**
	 * Creates a ranker that scores DyScos against the given Solr item collection.
	 * <p>
	 * Initialization is best effort: if obtaining the handler throws, the failure
	 * is logged and the instance is still created, but its handler stays unset.
	 *
	 * @param solrCollection identifier of the item collection, passed unchanged
	 *        to {@code SolrItemHandler.getInstance}
	 */
	public TrendsRanker(String solrCollection) {
		try {
			solrItemHandler = SolrItemHandler.getInstance(solrCollection);
			logger.info("SolrItemHandler initialized.. ");
		} catch (Exception e) {
			// Report through the logger (with stack trace) rather than on stderr.
			logger.error("Failed to initialize SolrItemHandler for " + solrCollection, e);
		}
	}
	
	/**
	 * Computes the content score of a DySco by running each of its Solr queries
	 * against the title and description fields of the item collection.
	 * <p>
	 * For every query, the average relevance of the returned items is multiplied
	 * by the query's own score and by {@code queryLength / 100}. The average of
	 * these per-query values is then damped by the age of the DySco in days:
	 * {@code sqrt(20 / (20 + e^days))}.
	 *
	 * @param dysco the DySco to score
	 * @return the computed score, or {@code 0.0} when the DySco has no Solr
	 *         queries or when the computation fails (the failure is logged)
	 */
	public Double getContentScore(Dysco dysco) {
		Double score = 0.0;
		
		try {
			List<Query> solrQueries = dysco.getSolrQueries();
			if(solrQueries == null) {
				return score;
			}
			
			List<Float> queriesScores = new ArrayList<Float>();
			for(Query sQuery : solrQueries) {
				String queryName = sQuery.getName();
				float queryLength = queryName.length();
			
				String query = "(title : ("+queryName+")) OR (description : ("+queryName+"))";
		
				Map<Item, Float> itemsByRelevance = solrItemHandler.findItemsWithScore(query);
		
				float avgScore = Calculator.computeAverageFloat(itemsByRelevance.values()) * sQuery.getScore().floatValue();
			
				// Longer queries weigh more.
				avgScore *= (queryLength/100);
	
				queriesScores.add(avgScore);
			}
		
			long dateTimeOfDysco = dysco.getCreationDate().getTime();
			long currentDateTime = System.currentTimeMillis();
		
			// Age of the DySco in (fractional) days.
			double timeDiff = (double) Math.abs(dateTimeOfDysco - currentDateTime)/DAY_IN_MILLISECONDS;
		
			double timeEval = Math.sqrt(20/(20 + (Math.exp(timeDiff))));
	
			score = Calculator.computeAverageFloat(queriesScores) * timeEval;
		}
		catch(Exception e) {
			// The original called logger.equals(...), which logged nothing.
			logger.error("Failed to compute content score", e);
		}
		return score;
	}
	
	/**
	 * Computes the content score of a DySco by running each of its Solr queries
	 * against the title and description fields of the items that belong to the
	 * given list.
	 * <p>
	 * Double quotes are stripped from each query name and its whitespace
	 * separated terms are joined with {@code AND}. For every query, the average
	 * relevance of the returned items is multiplied by the query's own score and
	 * by {@code queryLength / 100} (length of the unmodified query name). The
	 * average of these per-query values is then damped by the age of the DySco
	 * in days: {@code sqrt(20 / (20 + e^days))}.
	 *
	 * @param dysco the DySco to score
	 * @param listId value the {@code lists} field of matching items must have
	 * @return the computed score, or {@code 0.0} when the DySco has no Solr
	 *         queries or when the computation fails (the failure is logged)
	 */
	public Double getContentScore(Dysco dysco, String listId) {
		Double score = 0.0;
		
		List<Query> solrQueries = dysco.getSolrQueries();
		if(solrQueries == null) {
			return score;
		}
		
		List<Float> queriesScores = new ArrayList<Float>();
		try {
			for(Query sQuery : solrQueries) {
				String queryName = sQuery.getName();
				float queryLength = queryName.length();
		
				queryName = queryName.replaceAll("\"", " ");
				queryName = queryName.trim();
				queryName = queryName.replaceAll("\\s+", " AND ");
				
				// Group the title/description alternatives explicitly so that the
				// list restriction applies to the disjunction as a whole and not
				// only to the description clause next to it.
				String query = "((title : (" + queryName + ")) OR (description : (" + queryName + ")))";
				query += " AND (lists : " + listId + ")";
			
				Map<Item, Float> itemsByRelevance = solrItemHandler.findItemsWithScore(query);
				
				float avgScore = Calculator.computeAverageFloat(itemsByRelevance.values()) * sQuery.getScore().floatValue();
			
				// Longer queries weigh more.
				avgScore *= (queryLength/100);
				queriesScores.add(avgScore);
			}
		
			long dateTimeOfDysco = dysco.getCreationDate().getTime();
			long currentDateTime = System.currentTimeMillis();
		
			// Age of the DySco in (fractional) days.
			double timeDiff = (double) Math.abs(dateTimeOfDysco - currentDateTime)/DAY_IN_MILLISECONDS;
			double timeEval = Math.sqrt(20 / (20 + (Math.exp(timeDiff))));
	
			score = Calculator.computeAverageFloat(queriesScores) * timeEval;
		}
		catch(Exception e) {
			// Pass the exception itself so the stack trace is kept; getMessage()
			// alone is often null (e.g. for a NullPointerException).
			logger.error("Failed to compute content score for list " + listId, e);
		}
		return score;
	}
	
	/**
	 * Returns a ranked lists of DyScos. The DyScos are ranked on the basis
	 * of the calculated scores from comparing their content to the updated
	 * RSS collection
	 * @param dyscos
	 * @return
	 */
	public List rankDyscos(List dyscos) {
		List rankedDyscos = new LinkedList();
		Map> dyscosByValues = new TreeMap>(Collections.reverseOrder());		
		
		try {
			for(Dysco dysco : dyscos) {
				Double score = dysco.getRankerScore();
				
				if(score == null)
					continue;
				
				List alreadyIn = dyscosByValues.get(score);
				if(alreadyIn == null) {
					alreadyIn = new ArrayList();
					dyscosByValues.put(score, alreadyIn);
				}
				alreadyIn.add(dysco);
			}
		
			for(Map.Entry> entry : dyscosByValues.entrySet()) {
				for(Dysco dysco : entry.getValue()) {
					rankedDyscos.add(dysco);
				}
			}
		}
		catch(Exception e) {
			logger.error(e.getMessage());
		}
		
		return rankedDyscos;
	}
	
	/**
	 * Evaluates a set of newly created DyScos against the whole item collection.
	 * <p>
	 * Duplicate DyScos are collapsed onto the first DySco they duplicate; the
	 * content score of that DySco is multiplied by the number of DyScos it
	 * stands for. Collapsed DyScos and DyScos without a title get a ranker score
	 * and normalized scores of {@code -1}.
	 *
	 * @param dyscos the DyScos to evaluate; their scores are updated in place
	 * @return the DyScos ranked by descending ranker score
	 */
	public List<Dysco> evaluateDyscosByContent(List<Dysco> dyscos) {
		return scoreAndRank(dyscos, null, false);
	}
	
	
	/**
	 * Evaluates a set of newly created DyScos against the items of one list.
	 * <p>
	 * Duplicate DyScos are collapsed onto the first DySco they duplicate; the
	 * content score of that DySco is multiplied by the number of DyScos it
	 * stands for. Collapsed DyScos and DyScos without a title get a ranker score
	 * and normalized scores of {@code -1}.
	 *
	 * @param dyscos the DyScos to evaluate; their scores are updated in place
	 * @param listId the list whose items the DyScos are compared with
	 * @return the DyScos ranked by descending ranker score
	 */
	public List<Dysco> evaluateDyscosByContent(List<Dysco> dyscos, String listId) {
	
		logger.info("Evaluate " + dyscos.size() + " dyscos from list " + listId);
		
		return scoreAndRank(dyscos, listId, true);
	}
	
	/**
	 * Shared implementation of both public overloads: collapses duplicates,
	 * assigns ranker scores, normalizes them and ranks the DyScos.
	 *
	 * @param restrictToList whether the content score is computed against the
	 *        items of {@code listId} only or against the whole collection
	 */
	private List<Dysco> scoreAndRank(List<Dysco> dyscos, String listId, boolean restrictToList) {
		Map<String, Integer> dyscosCooccurrences = countCooccurrences(dyscos);
		
		for(Dysco dysco : dyscos) {
			Integer cooccurrences = dyscosCooccurrences.get(dysco.getId());
			if(cooccurrences == null) {
				// Duplicate of another DySco, or no title to compare: not ranked.
				dysco.setRankerScore(-1.0);
				continue;
			}
			
			Double contentScore = restrictToList ? getContentScore(dysco, listId) : getContentScore(dysco);
			Double rankerScore = contentScore * cooccurrences;
			dysco.setRankerScore(rankerScore);
			
			rankerScoresList.push(rankerScore);
		}
		
		normalizeScores(dyscos);
		
		return rankDyscos(dyscos);
	}
	
	/**
	 * Records the score of every DySco and detects duplicates.
	 * <p>
	 * A DySco duplicates an earlier one when their lower-cased titles are equal,
	 * or when the earlier title contains at least one of its entity names and at
	 * least two of its keywords. A missing DySco score is replaced by 0.
	 *
	 * @return for every retained (non-duplicate, titled) DySco id, the number of
	 *         DyScos it stands for, itself included
	 */
	private Map<String, Integer> countCooccurrences(List<Dysco> dyscos) {
		Map<String, String> dyscosTitles = new HashMap<String, String>();
		Map<String, Integer> dyscosCooccurrences = new HashMap<String, Integer>();
		
		for(Dysco dysco : dyscos) {
			
			Double dyscoScore = dysco.getScore();
			if(dyscoScore == null) {
				dyscoScore = 0.;
				dysco.setScore(dyscoScore);
			}
			dyscoScoresList.push(dyscoScore);
			
			String currentDyscoTitle = dysco.getTitle();
			if(currentDyscoTitle == null) {
				continue;
			}
			String lowerCaseTitle = currentDyscoTitle.toLowerCase();
			
			List<Entity> entities = dysco.getEntities();
			Map<String, ?> keywordsMap = dysco.getKeywords();
			Set<String> keywords = (keywordsMap == null) ? null : keywordsMap.keySet();
			
			String duplicateOf = null;
			for(Map.Entry<String, String> entry : dyscosTitles.entrySet()) {
				String dyscoTitle = entry.getValue();
				
				// Counted per compared title: hits on unrelated titles must not
				// add up to a match.
				int entitiesFound = 0;
				int keywordsFound = 0;
				
				if(entities != null) {
					for(Entity entity : entities) {
						String entityName = entity.getName();
						if(entityName != null && dyscoTitle.contains(entityName.toLowerCase())) {
							entitiesFound++;
						}
					}
				}
				
				if(keywords != null) {
					for(String keyword : keywords) {
						if(keyword != null && dyscoTitle.contains(keyword.toLowerCase())) {
							keywordsFound++;
						}
					}
				}
				
				if((entitiesFound >= 1 && keywordsFound >= 2) || lowerCaseTitle.equals(dyscoTitle)) {
					duplicateOf = entry.getKey();
					break;
				}
			}
			
			if(duplicateOf == null) {
				dyscosTitles.put(dysco.getId(), lowerCaseTitle);
				dyscosCooccurrences.put(dysco.getId(), 1);
			}
			else {
				Integer pScore = dyscosCooccurrences.get(duplicateOf);
				dyscosCooccurrences.put(duplicateOf, (pScore == null ? 0 : pScore) + 1);
			}
		}
		
		return dyscosCooccurrences;
	}
	
	/**
	 * Min-max normalizes the ranker score and the DySco score of every ranked
	 * DySco, using the extremes of the recently recorded scores. A zero range
	 * yields a normalized value of 0; unranked DyScos get -1 for both values.
	 */
	private void normalizeScores(List<Dysco> dyscos) {
		double minDyscoScore = dyscoScoresList.isEmpty() ? 0d : Collections.min(dyscoScoresList);
		double minRankerScore = rankerScoresList.isEmpty() ? 0d : Collections.min(rankerScoresList);
		double maxDyscoScore = dyscoScoresList.isEmpty() ? 0d : Collections.max(dyscoScoresList);
		double maxRankerScore = rankerScoresList.isEmpty() ? 0d : Collections.max(rankerScoresList);
		
		logger.info("Min Dysco Score: " + minDyscoScore);
		logger.info("Max Dysco Score: " + maxDyscoScore);
		logger.info("Min Ranker Score: " + minRankerScore);
		logger.info("Max Ranker Score: " + maxRankerScore);
		
		logger.info("Size of RankerScoresList: " + rankerScoresList.size());
		logger.info("Size of DyscoScoresList: " + dyscoScoresList.size());
		
		double rankerRange = maxRankerScore - minRankerScore;
		double dyscoRange = maxDyscoScore - minDyscoScore;
		
		for(Dysco dysco : dyscos) {
			double rankerScore = dysco.getRankerScore();
			if(rankerScore >= 0) {
				double normalizedRankerScore = 0d;
				if(rankerRange != 0) {
					normalizedRankerScore = (rankerScore - minRankerScore) / rankerRange;
				}
				dysco.setNormalizedRankerScore(normalizedRankerScore);
				
				double normalizedDyscoScore = 0d;
				if(dyscoRange != 0) {
					normalizedDyscoScore = (dysco.getScore() - minDyscoScore) / dyscoRange;
				}
				dysco.setNormalizedDyscoScore(normalizedDyscoScore);
				
				logger.info("Dysco: " + dysco.getId() + ",  normalizedRankerScore= " + normalizedRankerScore + 
						",  normalizedDyscoScore=" + normalizedDyscoScore);
			}
			else {
				dysco.setNormalizedRankerScore(-1.0);
				dysco.setNormalizedDyscoScore(-1.0);
				
				logger.info("Dysco: " + dysco.getId() + ",  normalizedRankerScore=-1,  normalizedDyscoScore=-1");
			}
		}
	}

	private static class BoundedList extends LinkedList {

	    /**
		 * 
		 */
		private static final long serialVersionUID = -8828336215826576541L;
		private final int bound;

	    public BoundedList(int bound) {
	        this.bound = bound;
	    }

	    public synchronized void push(T item) {
	    	try {
	    		super.push(item);
	    		if (super.size() > bound) {	        	
	        		super.removeLast();                
	        	}
	        }
	    	catch(Exception e) {
        		
        	}
	    }
	}

	/**
	 * Manual smoke test: fetches the DyScos created during the last 15 minutes
	 * and ranks them against list "1" of the news feed collection.
	 * The Solr URLs are placeholders and have to be replaced before running.
	 *
	 * @param args unused
	 * @throws Exception if the DyScos cannot be retrieved
	 */
	public static void main(String...args) throws Exception {
		
		SolrDyscoHandler solrDyscoHandler = SolrDyscoHandler.getInstance("http://xxx.xxx.xxx.xxx/solr/dyscos");
		TrendsRanker ranker = new TrendsRanker("http://xxx.xxx.xxx.xxx/solr/NewsFeed");
		
		Date date = new Date(System.currentTimeMillis() - 15*60000);
		
		// The trailing 'Z' is a literal, so the formatter itself must work in UTC.
		// Shifting the instant by the zone offset by hand is wrong around DST changes.
		SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
		dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
		String formattedDate = dateFormat.format(date);
		
		List<Dysco> dyscos = solrDyscoHandler.findDyscosInTimeframe(formattedDate).getResults();
		System.out.println(dyscos.size() + " dyscos");
		
		List<Dysco> rankedDyscos = ranker.evaluateDyscosByContent(dyscos, "1");
		System.out.println(rankedDyscos.size() + " ranked dyscos");
		
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy