All downloads are free. Search and download functionality uses the official Maven repository.

aima.core.nlp.ranking.WikiLinkFinder Maven / Gradle / Ivy

Go to download

AIMA-Java Core Algorithms from the book Artificial Intelligence: A Modern Approach, 3rd Ed.

The newest version!
package aima.core.nlp.ranking;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A {@link LinkFinder} that discovers links by scanning page content for
 * <code>href="/wiki/..."</code> attributes. All link locations handled by this
 * class are lower-cased so that comparisons are case insensitive.
 * 
 * @author Jonathon Belotti (thundergolfer)
 *
 */
public class WikiLinkFinder implements LinkFinder {

	// Matches href="/wiki/..." attributes; capture group 1 is the text between
	// the quotes. Compiled once and reused, since the expression never changes.
	private static final Pattern WIKI_HREF = Pattern.compile("href=\"(/wiki/.*?)\"");

	// TODO
	// Make more intelligent link search
	/**
	 * Collects every <code>/wiki/</code> link target found in the content of
	 * the given page.
	 * 
	 * @param page
	 *            the page whose content is searched
	 * @return the lower-cased link targets (Strings) in the order in which
	 *         they occur in the content; duplicates are kept, and the list is
	 *         empty when nothing matches
	 */
	public List getOutlinks(Page page) {

		String content = page.getContent();
		List<String> outLinks = new ArrayList<String>();
		// search content for all href="x" outlinks
		Matcher m = WIKI_HREF.matcher(content);
		while (m.find()) {
			// group(1) is the text between the quotes, i.e. the link target
			outLinks.add(m.group(1).toLowerCase());
		}

		return outLinks;
	}

	/**
	 * Finds the pages in <code>pageTable</code> that link to the target page.
	 * 
	 * @param target
	 *            the page for which inlinks are wanted
	 * @param pageTable
	 *            table whose values are the candidate {@link Page}s; the keys
	 *            are not inspected
	 * @return the lower-cased locations (Strings) of every page that has at
	 *         least one outlink equal to the target's location; each page
	 *         contributes at most one entry
	 */
	@Override
	public List getInlinks(Page target, Map pageTable) {

		// make comparison case insensitive
		String location = target.getLocation().toLowerCase();
		List<String> inlinks = new ArrayList<String>();

		// go through all pages and if they link back to target then add that
		// page's location to the target's inlinks
		for (Object value : pageTable.values()) {
			Page p = (Page) value;
			List<?> outlinks = p.getOutlinks();
			for (Object link : outlinks) {
				String lowered = ((String) link).toLowerCase();
				// Compare using both separator styles so that a link matches
				// whichever of '/' or '\' the target location is written with
				// (presumably locations can be file-system paths; verify
				// against Page).
				String pForward = lowered.replace('\\', '/');
				String pBackward = lowered.replace('/', '\\');
				if (pForward.equals(location) || pBackward.equals(location)) {
					inlinks.add(p.getLocation().toLowerCase());
					break; // one entry per linking page is enough
				}
			}
		}
		return inlinks;
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy