aima.core.nlp.ranking.HITS Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aima-core · Show documentation
AIMA-Java Core Algorithms from the book "Artificial Intelligence: A Modern Approach", 3rd Ed.
The newest version!
package aima.core.nlp.ranking;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* Artificial Intelligence A Modern Approach (3rd Edition): page 871.
*
*
*
* function HITS(query) returns pages (with hub and authority numbers)
* pages ← EXPAND-PAGES(RELEVANT-PAGES(query))
* for each p in pages do
* p.AUTHORITY ← 1
* p.HUB ← 1
* repeat until convergence do
* for each p in pages do
* p.AUTHORITY ← Σi INLINKi(p).HUB
* p.HUB ← Σi OUTLINKi(p).AUTHORITY
* NORMALIZE(pages)
* return pages
*
*
* Figure 22.1 The HITS algorithm for computing hubs and authorities with
* respect to a query. RELEVANT-PAGES fetches the pages that match the query,
* and EXPAND-PAGES add in every page that links to or is linked from one of the
 * relevant pages. NORMALIZE divides each page's score by the sum of the squares
 * of all pages' scores (separately for both the authority and hub scores).
*
*
* @author Jonathon Belotti (thundergolfer)
*
*/
public class HITS {

	/** Number of past ranking iterations retained (reserved for convergence checking). */
	final int RANK_HISTORY_DEPTH;
	/** Threshold under which the combined score movement counts as converged (somewhat arbitrary). */
	final double DELTA_TOLERANCE;
	/** Table of all known pages, keyed by page address/location. */
	Map<String, Page> pTable;

	// State carried between convergence() calls to detect a fixed point.
	double[] prevAuthVals;
	double[] prevHubVals;
	double prevAveHubDelta = 0;
	double prevAveAuthDelta = 0;

	// TODO: Improve the convergence detection functionality

	/**
	 * Constructs a HITS ranker over the given page table.
	 *
	 * @param pTable          table of all known pages, keyed by address
	 * @param rank_hist_depth how many past iterations to retain (currently unused by convergence())
	 * @param delta_tolerance combined hub+authority delta below which the scores count as converged
	 */
	public HITS(Map<String, Page> pTable, int rank_hist_depth, double delta_tolerance) {
		this.pTable = pTable;
		this.RANK_HISTORY_DEPTH = rank_hist_depth;
		this.DELTA_TOLERANCE = delta_tolerance;
	}

	/** Constructs a HITS ranker with default history depth (3) and tolerance (0.05). */
	public HITS(Map<String, Page> pTable) {
		this(pTable, 3, 0.05);
	}

	/**
	 * function HITS(query) returns pages with hub and authority numbers.
	 * Implements Fig. 22.1, AIMA 3rd ed., p. 871.
	 *
	 * @param query the search query
	 * @return the expanded set of pages relevant to the query, with their
	 *         hub and authority scores set
	 */
	public List<Page> hits(String query) {
		// pages <- EXPAND-PAGES(RELEVANT-PAGES(query))
		List<Page> pages = expandPages(relevantPages(query));
		// for each p in pages do p.AUTHORITY <- 1; p.HUB <- 1
		for (Page p : pages) {
			p.authority = 1;
			p.hub = 1;
		}
		// repeat until convergence do
		while (!convergence(pages)) {
			for (Page p : pages) {
				// p.AUTHORITY <- Σi INLINKi(p).HUB
				p.authority = SumInlinkHubScore(p);
				// p.HUB <- Σi OUTLINKi(p).AUTHORITY
				p.hub = SumOutlinkAuthorityScore(p);
			}
			// NORMALIZE(pages)
			normalize(pages);
		}
		return pages;
	}

	/**
	 * Fetches and returns all pages whose content matches the query.
	 *
	 * @param query the search query
	 * @return every page in the table whose content contains the query string
	 */
	public List<Page> relevantPages(String query) {
		List<Page> relevantPages = new ArrayList<Page>();
		for (Page p : pTable.values()) {
			if (matches(query, p.getContent())) {
				relevantPages.add(p);
			}
		}
		return relevantPages;
	}

	/**
	 * Simple check whether the query string is a substring of a block of text.
	 *
	 * @param query the query string
	 * @param text  the text to search in
	 * @return true if {@code text} contains {@code query}
	 */
	public boolean matches(String query, String text) {
		return text.contains(query);
	}

	/**
	 * Adds every page that links to, or is linked from, one of the pages
	 * passed as argument.
	 *
	 * @param pages the initially relevant pages
	 * @return the input pages plus all in/out neighbours found in the table,
	 *         without duplicates
	 */
	public List<Page> expandPages(List<Page> pages) {
		List<Page> expandedPages = new ArrayList<Page>();
		Set<String> inAndOutLinks = new HashSet<String>();
		// Go through all pages and build a set of linked addresses.
		// BUG FIX: the original indexed the inner link lists with the OUTER
		// loop index (get(i) instead of get(j)), collecting wrong elements
		// and risking IndexOutOfBoundsException.
		for (Page currP : pages) {
			if (!expandedPages.contains(currP)) {
				expandedPages.add(currP);
			}
			inAndOutLinks.addAll(currP.getInlinks());
			inAndOutLinks.addAll(currP.getOutlinks());
		}
		// Resolve each linked address to its page and add it to the result.
		for (String addr : inAndOutLinks) {
			Page p = pTable.get(addr);
			// a valid link may not have an associated page in our table
			if (p != null && !expandedPages.contains(p)) {
				expandedPages.add(p);
			}
		}
		return expandedPages;
	} // end expandPages()

	/**
	 * Divides each page's score by the sum of the squares of all pages'
	 * scores, separately for the authority and the hub scores.
	 * NOTE: this follows the book's wording literally (sum of squares, not a
	 * Euclidean norm); only the relative ordering of scores matters to HITS.
	 *
	 * @param pages the pages to normalize in place
	 * @return the same list, with normalized scores
	 */
	public List<Page> normalize(List<Page> pages) {
		double hubTotal = 0;
		double authTotal = 0;
		for (Page p : pages) {
			hubTotal += Math.pow(p.hub, 2);
			authTotal += Math.pow(p.authority, 2);
		}
		// Divide all hub and authority scores for all pages,
		// guarding against division by zero when all scores are 0.
		for (Page p : pages) {
			if (hubTotal > 0) {
				p.hub /= hubTotal;
			} else {
				p.hub = 0;
			}
			if (authTotal > 0) {
				p.authority /= authTotal;
			} else {
				p.authority = 0;
			}
		}
		return pages; // with normalised scores now
	} // end normalize()

	/**
	 * Calculates the Authority score of a page by summing the Hub scores of
	 * that page's inlinks.
	 *
	 * @param page the page whose authority score to compute
	 * @return the sum of the hub scores of all in-linking pages present in the table
	 */
	public double SumInlinkHubScore(Page page) {
		double hubScore = 0;
		for (String addr : page.getInlinks()) {
			Page inLink = pTable.get(addr);
			// a page may be linked to by a Page not in our table; skip those
			if (inLink != null) {
				hubScore += inLink.hub;
			}
		}
		return hubScore;
	} // end SumInlinkHubScore()

	/**
	 * Calculates the Hub score of a page by summing the Authority scores of
	 * that page's outlinks.
	 *
	 * @param page the page whose hub score to compute
	 * @return the sum of the authority scores of all out-linked pages present in the table
	 */
	public double SumOutlinkAuthorityScore(Page page) {
		double authScore = 0;
		for (String addr : page.getOutlinks()) {
			Page outLink = pTable.get(addr);
			// skip outlinks that have no associated page in our table
			if (outLink != null) {
				authScore += outLink.authority;
			}
		}
		return authScore;
	}

	/**
	 * pg. 872 : "If we then normalize the scores and repeat k times the
	 * process will converge". Returns true when the hub and authority scores
	 * have (approximately) stopped changing between successive iterations.
	 *
	 * @param pages the pages being ranked
	 * @return true if the scores have converged
	 */
	private boolean convergence(List<Page> pages) {
		if (pages == null) {
			return true;
		}
		// snapshot current values from pages
		double[] currHubVals = new double[pages.size()];
		double[] currAuthVals = new double[pages.size()];
		for (int i = 0; i < pages.size(); i++) {
			Page currPage = pages.get(i);
			currHubVals[i] = currPage.hub;
			// BUG FIX: the original assigned currPage.authority to
			// currHubVals[i] here, so authority values were never tracked
			// and the hub comparison was corrupted.
			currAuthVals[i] = currPage.authority;
		}
		// first call: nothing to compare against yet
		if (prevHubVals == null || prevAuthVals == null) {
			prevHubVals = currHubVals;
			prevAuthVals = currAuthVals;
			return false;
		}
		// compare to past values
		double aveHubDelta = getAveDelta(currHubVals, prevHubVals);
		double aveAuthDelta = getAveDelta(currAuthVals, prevAuthVals);
		// converged if total movement is tiny, or the movement itself has plateaued
		if (aveHubDelta + aveAuthDelta < DELTA_TOLERANCE
				|| (Math.abs(prevAveHubDelta - aveHubDelta) < 0.01
						&& Math.abs(prevAveAuthDelta - aveAuthDelta) < 0.01)) {
			return true;
		}
		prevHubVals = currHubVals;
		prevAuthVals = currAuthVals;
		prevAveHubDelta = aveHubDelta;
		prevAveAuthDelta = aveAuthDelta;
		return false;
	}

	/**
	 * Determines how much the values in a list are changing on average.
	 * Useful for detecting convergence of data values.
	 *
	 * @param curr the current values
	 * @param prev the previous values (must be the same length as curr)
	 * @return the mean absolute element-wise difference (0 for empty arrays)
	 */
	public double getAveDelta(double[] curr, double[] prev) {
		assert (curr.length == prev.length);
		if (curr.length == 0) {
			return 0; // avoid 0/0 -> NaN on empty input
		}
		double aveDelta = 0;
		for (int j = 0; j < curr.length; j++) {
			aveDelta += Math.abs(curr[j] - prev[j]);
		}
		aveDelta /= curr.length;
		return aveDelta;
	}

	/**
	 * Returns, from a list of Pages, the Page with the greatest Hub value.
	 *
	 * @param result a non-empty list of pages
	 * @return the page with the maximum hub score
	 */
	public Page getMaxHub(List<Page> result) {
		Page maxHub = result.get(0);
		for (int i = 1; i < result.size(); i++) {
			Page currPage = result.get(i);
			if (currPage.hub > maxHub.hub) {
				maxHub = currPage;
			}
		}
		return maxHub;
	}

	/**
	 * Returns, from a list of Pages, the Page with the greatest Authority value.
	 *
	 * @param result a non-empty list of pages
	 * @return the page with the maximum authority score
	 */
	public Page getMaxAuthority(List<Page> result) {
		Page maxAuthority = result.get(0);
		for (int i = 1; i < result.size(); i++) {
			Page currPage = result.get(i);
			if (currPage.authority > maxAuthority.authority) {
				maxAuthority = currPage;
			}
		}
		return maxAuthority;
	}

	/**
	 * Sorts the list of pages by descending Hub score, breaking ties by
	 * location (case-insensitive).
	 * BUG FIX: the original sorted ascending, contradicting its own javadoc
	 * and report(), and its tie-breaker returned -1 for equal locations,
	 * violating the Comparator contract.
	 *
	 * @param result the pages to sort in place
	 */
	public void sortHub(List<Page> result) {
		Collections.sort(result, new Comparator<Page>() {
			public int compare(Page p1, Page p2) {
				int cmp = Double.compare(p2.hub, p1.hub); // descending
				if (cmp != 0) {
					return cmp;
				}
				return p1.getLocation().compareToIgnoreCase(p2.getLocation());
			}
		});
	}

	/**
	 * Sorts the list of pages by descending Authority score, breaking ties by
	 * location (case-insensitive).
	 * BUG FIX: the original compared hub scores here (copy-paste error), and
	 * sorted ascending with a contract-violating tie-breaker.
	 *
	 * @param result the pages to sort in place
	 */
	public void sortAuthority(List<Page> result) {
		Collections.sort(result, new Comparator<Page>() {
			public int compare(Page p1, Page p2) {
				int cmp = Double.compare(p2.authority, p1.authority); // descending
				if (cmp != 0) {
					return cmp;
				}
				return p1.getLocation().compareToIgnoreCase(p2.getLocation());
			}
		});
	}

	/**
	 * Simple console display of HITS Algorithm results.
	 *
	 * @param result the ranked pages to report on
	 */
	public void report(List<Page> result) {
		// Print pages ranked by highest authority
		sortAuthority(result);
		System.out.println("AUTHORITY RANKINGS : ");
		for (Page currP : result) {
			System.out.printf(currP.getLocation() + ": " + "%.5f" + '\n', currP.authority);
		}
		System.out.println();
		// Print pages ranked by highest hub
		sortHub(result);
		System.out.println("HUB RANKINGS : ");
		for (Page currP : result) {
			System.out.printf(currP.getLocation() + ": " + "%.5f" + '\n', currP.hub);
		}
		System.out.println();
		// Print max authority
		System.out.println("Page with highest Authority score: " + getMaxAuthority(result).getLocation());
		// Print max hub
		// BUG FIX: the original called getMaxAuthority() here while claiming
		// to print the highest Hub score.
		System.out.println("Page with highest Hub score: " + getMaxHub(result).getLocation());
	}
}