
aima.core.nlp.ranking.HITS Maven / Gradle / Ivy


AIMA-Java Core Algorithms from the book Artificial Intelligence: A Modern Approach, 3rd Edition.

package aima.core.nlp.ranking;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Artificial Intelligence A Modern Approach (3rd Edition): page 871.
 *
 * function HITS(query) returns pages (with hub and authority numbers)
 *   pages ← EXPAND-PAGES(RELEVANT-PAGES(query))
 *   for each p in pages do
 *     p.AUTHORITY ← 1
 *     p.HUB ← 1
 *   repeat until convergence do
 *     for each p in pages do
 *       p.AUTHORITY ← Σi INLINKi(p).HUB
 *       p.HUB ← Σi OUTLINKi(p).AUTHORITY
 *     NORMALIZE(pages)
 *   return pages
 *
 * Figure 22.1 The HITS algorithm for computing hubs and authorities with
 * respect to a query. RELEVANT-PAGES fetches the pages that match the query,
 * and EXPAND-PAGES adds in every page that links to or is linked from one of
 * the relevant pages. NORMALIZE divides each page's score by the sum of the
 * squares of all pages' scores (separately for both the authority and hub
 * scores).
 *
 * @author Jonathon Belotti (thundergolfer)
 */
public class HITS {

	final int RANK_HISTORY_DEPTH;
	final double DELTA_TOLERANCE; // somewhat arbitrary
	Map<String, Page> pTable;

	// DETECT CONVERGENCE VARS
	double[] prevAuthVals;
	double[] prevHubVals;
	double prevAveHubDelta = 0;
	double prevAveAuthDelta = 0;
	////////////////////////////

	// TODO: Improve the convergence detection functionality
	public HITS(Map<String, Page> pTable, int rank_hist_depth, double delta_tolerance) {
		this.pTable = pTable;
		this.RANK_HISTORY_DEPTH = rank_hist_depth;
		this.DELTA_TOLERANCE = delta_tolerance;
	}

	public HITS(Map<String, Page> pTable) {
		this(pTable, 3, 0.05);
	}

	// function HITS(query) returns pages with hub and authority numbers
	public List<Page> hits(String query) {
		// pages <- EXPAND-PAGES(RELEVANT-PAGES(query))
		List<Page> pages = expandPages(relevantPages(query));
		// for each p in pages
		for (Page p : pages) {
			// p.AUTHORITY <- 1
			p.authority = 1;
			// p.HUB <- 1
			p.hub = 1;
		}
		// repeat until convergence do
		while (!convergence(pages)) {
			// for each p in pages do
			for (Page p : pages) {
				// p.AUTHORITY <- Σi INLINKi(p).HUB
				p.authority = SumInlinkHubScore(p);
				// p.HUB <- Σi OUTLINKi(p).AUTHORITY
				p.hub = SumOutlinkAuthorityScore(p);
			}
			// NORMALIZE(pages)
			normalize(pages);
		}
		return pages;
	}

	/**
	 * Fetches and returns all pages that match the query.
	 *
	 * @param query
	 * @return the pages whose content matches the query
	 */
	public List<Page> relevantPages(String query) {
		List<Page> relevantPages = new ArrayList<Page>();
		for (Page p : pTable.values()) {
			if (matches(query, p.getContent())) {
				relevantPages.add(p);
			}
		}
		return relevantPages;
	}

	/**
	 * Simple check if the query string is a substring of a block of text.
	 *
	 * @param query
	 * @param text
	 * @return true if the query occurs in the text
	 */
	public boolean matches(String query, String text) {
		return text.contains(query);
	}

	/**
	 * Adds pages that link to, or are linked from, one of the pages passed as
	 * argument.
	 *
	 * @param pages
	 * @return the expanded list of pages
	 */
	public List<Page> expandPages(List<Page> pages) {
		List<Page> expandedPages = new ArrayList<Page>();
		Set<String> inAndOutLinks = new HashSet<String>();
		// Go through all pages and build a set of String links
		for (int i = 0; i < pages.size(); i++) {
			Page currP = pages.get(i);
			if (!expandedPages.contains(currP)) {
				expandedPages.add(currP);
			}
			List<String> currInlinks = currP.getInlinks();
			for (int j = 0; j < currInlinks.size(); j++) {
				inAndOutLinks.add(currInlinks.get(j));
			}
			List<String> currOutlinks = currP.getOutlinks();
			for (int j = 0; j < currOutlinks.size(); j++) {
				inAndOutLinks.add(currOutlinks.get(j));
			}
		}
		// Go through the String links and add their respective pages to our
		// return list
		Iterator<String> it = inAndOutLinks.iterator();
		while (it.hasNext()) {
			String addr = it.next();
			Page p = pTable.get(addr);
			// a valid link may not have an associated page in our table
			if (p != null && !expandedPages.contains(p)) {
				expandedPages.add(p);
			}
		}
		return expandedPages;
	} // end expandPages()

	/**
	 * Divides each page's score by the sum of the squares of all pages' scores
	 * (separately for both the authority and hub scores).
	 *
	 * @param pages
	 * @return the pages, with normalised scores
	 */
	public List<Page> normalize(List<Page> pages) {
		double hubTotal = 0;
		double authTotal = 0;
		for (Page p : pages) {
			// Sum squared Hub scores over all pages
			hubTotal += Math.pow(p.hub, 2);
			// Sum squared Authority scores over all pages
			authTotal += Math.pow(p.authority, 2);
		}
		// Divide all hub and authority scores for all pages
		for (Page p : pages) {
			if (hubTotal > 0) {
				p.hub /= hubTotal;
			} else {
				p.hub = 0;
			}
			if (authTotal > 0) {
				p.authority /= authTotal;
			} else {
				p.authority = 0;
			}
		}
		return pages; // with normalised scores now
	} // end normalize()

	/**
	 * Calculate the Authority score of a page by summing the Hub scores of
	 * that page's inlinks.
	 *
	 * @param page
	 * @return the Authority score
	 */
	public double SumInlinkHubScore(Page page) {
		List<String> inLinks = page.getInlinks();
		double hubScore = 0;
		for (int i = 0; i < inLinks.size(); i++) {
			Page inLink = pTable.get(inLinks.get(i));
			if (inLink != null) {
				hubScore += inLink.hub;
			}
			// else: page is linked to by a Page not in our table
		}
		return hubScore;
	} // end SumInlinkHubScore()

	/**
	 * Calculate the Hub score of a page by summing the Authority scores of
	 * that page's outlinks.
	 *
	 * @param page
	 * @return the Hub score
	 */
	public double SumOutlinkAuthorityScore(Page page) {
		List<String> outLinks = page.getOutlinks();
		double authScore = 0;
		for (int i = 0; i < outLinks.size(); i++) {
			Page outLink = pTable.get(outLinks.get(i));
			if (outLink != null) {
				authScore += outLink.authority;
			}
		}
		return authScore;
	}

	/**
872 : "If we then normalize the scores and repeat k times the process * will converge" * * @return */ private boolean convergence(List pages) { double aveHubDelta = 100; double aveAuthDelta = 100; if (pages == null) { return true; } // get current values from pages double[] currHubVals = new double[pages.size()]; double[] currAuthVals = new double[pages.size()]; for (int i = 0; i < pages.size(); i++) { Page currPage = pages.get(i); currHubVals[i] = currPage.hub; currHubVals[i] = currPage.authority; } if (prevHubVals == null || prevAuthVals == null) { prevHubVals = currHubVals; prevAuthVals = currAuthVals; return false; } // compare to past values aveHubDelta = getAveDelta(currHubVals, prevHubVals); aveAuthDelta = getAveDelta(currAuthVals, prevAuthVals); if (aveHubDelta + aveAuthDelta < DELTA_TOLERANCE || (Math.abs(prevAveHubDelta - aveHubDelta) < 0.01 && Math.abs(prevAveAuthDelta - aveAuthDelta) < 0.01)) { return true; } else { prevHubVals = currHubVals; prevAuthVals = currAuthVals; prevAveHubDelta = aveHubDelta; prevAveAuthDelta = aveAuthDelta; return false; } } /** * Determine how much values in a list are changing. Useful for detecting * convergence of data values. * * @param r * @return */ public double getAveDelta(double[] curr, double[] prev) { double aveDelta = 0; assert (curr.length == prev.length); for (int j = 0; j < curr.length; j++) { aveDelta += Math.abs(curr[j] - prev[j]); } aveDelta /= curr.length; return aveDelta; } /** * Return from a set of Pages the Page with the greatest Hub value * * @param pageTable * @return */ public Page getMaxHub(List result) { Page maxHub = result.get(0); for (int i = 1; i < result.size(); i++) { Page currPage = result.get(i); if (currPage.hub > maxHub.hub) { maxHub = currPage; } } return maxHub; } /** * Return from a set of Pages the Page with the greatest Authority value * * @param pageTable * @return */ public Page getMaxAuthority(List result) { Page maxAuthority = result.get(0); for (int i = 1; i < result.size(); i++) { Page currPage = result.get(i); if (currPage.authority > maxAuthority.authority) { maxAuthority = currPage; } } return maxAuthority; } /** * Organize the list of pages according to their descending Hub scores. * * @param result */ public void sortHub(List result) { Collections.sort(result, new Comparator() { public int compare(Page p1, Page p2) { // Sorts by 'TimeStarted' property return p1.hub < p2.hub ? -1 : p1.hub > p2.hub ? 1 : secondaryOrderSort(p1, p2); } // If 'TimeStarted' property is equal sorts by 'TimeEnded' property public int secondaryOrderSort(Page p1, Page p2) { return p1.getLocation().compareToIgnoreCase(p2.getLocation()) < 1 ? -1 : p1.getLocation().compareToIgnoreCase(p2.getLocation()) > 1 ? 1 : 0; } }); } /** * Organize the list of pages according to their descending Authority Scores * * @param result */ public void sortAuthority(List result) { Collections.sort(result, new Comparator() { public int compare(Page p1, Page p2) { // Sorts by 'TimeStarted' property return p1.hub < p2.hub ? -1 : p1.hub > p2.hub ? 1 : secondaryOrderSort(p1, p2); } // If 'TimeStarted' property is equal sorts by 'TimeEnded' property public int secondaryOrderSort(Page p1, Page p2) { return p1.getLocation().compareToIgnoreCase(p2.getLocation()) < 1 ? -1 : p1.getLocation().compareToIgnoreCase(p2.getLocation()) > 1 ? 1 : 0; } }); } /** * Simple console display of HITS Algorithm results. 
	 *
	 * @param result
	 */
	public void report(List<Page> result) {
		// Print pages ranked by highest authority
		sortAuthority(result);
		System.out.println("AUTHORITY RANKINGS : ");
		for (int i = 0; i < result.size(); i++) {
			Page currP = result.get(i);
			System.out.printf(currP.getLocation() + ": " + "%.5f" + '\n', currP.authority);
		}
		System.out.println();
		// Print pages ranked by highest hub
		sortHub(result);
		System.out.println("HUB RANKINGS : ");
		for (int i = 0; i < result.size(); i++) {
			Page currP = result.get(i);
			System.out.printf(currP.getLocation() + ": " + "%.5f" + '\n', currP.hub);
		}
		System.out.println();
		// Print max authority
		System.out.println("Page with highest Authority score: " + getMaxAuthority(result).getLocation());
		// Print max hub
		System.out.println("Page with highest Hub score: " + getMaxHub(result).getLocation());
	}
}
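
A minimal usage sketch, wiring up a three-page link graph and running the ranker. The Page(String) constructor and the setContent/getInlinks/getOutlinks calls are assumptions about the companion Page class in this package (only getContent, getLocation, getInlinks, and getOutlinks are actually exercised by HITS above), and HitsDemo is a hypothetical class name; verify against the Page source before relying on it.

package aima.core.nlp.ranking;

import java.util.HashMap;
import java.util.Map;

public class HitsDemo {

	public static void main(String[] args) {
		// Three toy pages: A links to B and C, B links to C.
		// NOTE: Page(String), setContent(String), and mutable
		// getInlinks()/getOutlinks() lists are assumed here.
		Page a = new Page("A");
		a.setContent("machine learning survey");
		Page b = new Page("B");
		b.setContent("machine learning notes");
		Page c = new Page("C");
		c.setContent("machine learning paper");

		a.getOutlinks().add("B");
		a.getOutlinks().add("C");
		b.getInlinks().add("A");
		b.getOutlinks().add("C");
		c.getInlinks().add("A");
		c.getInlinks().add("B");

		Map<String, Page> pTable = new HashMap<String, Page>();
		pTable.put("A", a);
		pTable.put("B", b);
		pTable.put("C", c);

		// Rank every page matching the query and print the rankings.
		HITS hits = new HITS(pTable);
		hits.report(hits.hits("machine learning"));
	}
}

With this graph, C should come out with the highest authority score (both other pages point to it) and A with the highest hub score (it points to both other pages).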



