aima.core.nlp.ranking.PagesDataset Maven / Gradle / Ivy
AIMA-Java core algorithms from the book Artificial Intelligence: A Modern Approach, 3rd Ed.
package aima.core.nlp.ranking;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
/**
*
* @author Jonathon Belotti (thundergolfer)
*
*/
public class PagesDataset {
static String wikiPagesFolderPath = "src\\main\\resources\\aima\\core\\ranking\\data\\pages";
static String testFilesFolderPath = "src\\main\\resources\\aima\\core\\ranking\\data\\pages\\test_pages";
private static WikiLinkFinder wlf;
public static Map<String, Page> loadDefaultPages() {
return loadPages(wikiPagesFolderPath);
}
public static Map<String, Page> loadTestPages() {
return loadPages(testFilesFolderPath);
}
/**
* Access a folder of .txt files containing Wikipedia HTML source, and give
* back a hashtable of pages, with each page having its correct inlink
* list and outlink list.
*
* @param folderPath path to the folder of .txt page files
* @return a hashtable of Page objects, keyed by article location (of the
*         form /wiki/*article name*)
*/
public static Map<String, Page> loadPages(String folderPath) {
Map<String, Page> pageTable = new Hashtable<>();
File[] listOfFiles;
wlf = new WikiLinkFinder();
File folder = new File(folderPath);
if (folder.exists() && folder.isDirectory()) {
listOfFiles = folder.listFiles();
} else {
return null;
} // maybe should throw exception instead?
// Access each .txt file to create a new Page object for that file's article
for (File currFile : listOfFiles) {
if (currFile.isFile()) {
Page currPage = wikiPageFromFile(folder, currFile);
pageTable.put(currPage.getLocation(), currPage);
}
}
// Now that all pages are loaded and their outlinks have been determined,
// we can determine each page's inlinks and then return the loaded table.
return determineAllInlinks(pageTable);
} // end loadPages()
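/**
* Build a Page object from a single .txt file of Wikipedia HTML source.
*
* @param folder the folder containing the file
* @param f      the file holding the page's HTML source
* @return a Page with its location, content and outlinks populated
*/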
public static Page wikiPageFromFile(File folder, File f) {
Page p;
String pageLocation = getPageName(f); // will be like: /wiki/*article name*, lowercased
String content = loadFileText(folder, f); // get html source as string
p = new Page(pageLocation); // create the page object
p.setContent(content); // give the page its html source as a string
p.getOutlinks().addAll(wlf.getOutlinks(p)); // search HTML source for links
return p;
}
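/**
* Once every page's outlinks are known, fill in each page's inlink list
* using the WikiLinkFinder and the full page table.
*
* @param pageTable the table of loaded pages
* @return the same table, with each page's inlinks populated
*/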
public static Map<String, Page> determineAllInlinks(Map<String, Page> pageTable) {
Page currPage;
Set<String> keySet = pageTable.keySet();
Iterator<String> keySetIterator = keySet.iterator();
while (keySetIterator.hasNext()) {
currPage = pageTable.get(keySetIterator.next());
// add the inlinks to a currently empty List object
currPage.getInlinks().addAll(wlf.getInlinks(currPage, pageTable));
}
return pageTable;
}
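/**
* Derive a page's location from its file name, e.g. "TestPage.txt"
* becomes "/wiki/testpage".
*
* @param f the file holding the page's HTML source
* @return the page location, of the form /wiki/*article name* in lowercase
*/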
public static String getPageName(File f) {
String wikiPrefix = "/wiki/";
String filename = f.getName();
if (filename.indexOf(".") > 0)
filename = filename.substring(0, filename.lastIndexOf("."));
return wikiPrefix + filename.toLowerCase();
} // end getPageName()
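/**
* Read the full text of a file into a single String (line breaks are not
* preserved, since readLine() strips them).
*
* @param folder the folder containing the file
* @param file   the file to read
* @return the file's contents as one String, or "" if reading fails
*/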
public static String loadFileText(File folder, File file) {
StringBuilder pageContent = new StringBuilder();
// read the whole file into a single String
try (BufferedReader br = new BufferedReader(new FileReader(folder.getAbsolutePath() + File.separator + file.getName()))) {
String sCurrentLine;
while ((sCurrentLine = br.readLine()) != null) {
pageContent.append(sCurrentLine);
}
} catch (IOException e) {
e.printStackTrace();
}
return pageContent.toString();
} // end loadFileText()
// TODO:
// Be able to automatically retrieve an arbitrary number of
// Wikipedia pages and create a hashtable of Pages from them.
// TODO:
// Be able to automatically retrieve an arbitrary number of webpages
// that are in a network conducive to application of the HITS algorithm.
}
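As a rough usage sketch (not part of the library source; the class name PagesDatasetExample is hypothetical, and only the Page and PagesDataset APIs shown above are assumed), the dataset can be loaded and inspected like this. Note that the folder paths above are relative, Windows-style paths, so loadTestPages() may return null unless the working directory and resource layout match.

import java.util.Map;
import aima.core.nlp.ranking.Page;
import aima.core.nlp.ranking.PagesDataset;

public class PagesDatasetExample {
    public static void main(String[] args) {
        // Load the bundled test pages (or pass a custom folder to loadPages()).
        Map<String, Page> pages = PagesDataset.loadTestPages();
        if (pages == null) {
            System.err.println("Page folder not found; check the working directory.");
            return;
        }
        // Each Page now carries its HTML content, outlinks and inlinks.
        for (Page p : pages.values()) {
            System.out.println(p.getLocation()
                    + " | outlinks: " + p.getOutlinks().size()
                    + " | inlinks: " + p.getInlinks().size());
        }
    }
}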