// pageunit.linkchecker.LinkChecker Maven / Gradle / Ivy
package pageunit.linkchecker;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import pageunit.html.HTMLAnchor;
import pageunit.html.HTMLComponent;
import pageunit.html.HTMLForm;
import pageunit.html.HTMLIMG;
import pageunit.html.HTMLParseException;
import sun.net.URLCanonicalizer;
/** A simple HTML Link Checker.
* Should have Properties to set depth, URLs to check, etc.
* Responses not adequate; need to check at least for 404-type errors!
*
* XXX Move cache into a ThreadLocal, then un-synchronize all the methods.
* @author Ian Darwin, http://darwinsys.com/
*/
public class LinkChecker {
/**
 * Class-wide counter, initially zero.
 * NOTE(review): presumably the current nesting depth used to indent
 * printed output; its uses are outside this view, so confirm.
 */
static int indent = 0;
/**
 * Class-wide, raw-typed list shared by every caller; the reference is
 * final but the list itself is mutable.
 * NOTE(review): presumably holds links that have already been checked so
 * they are not visited twice; the element type and all uses are outside
 * this view, so confirm before adding a type parameter.
 * The XXX note in the class comment proposes moving this into a
 * ThreadLocal so the methods no longer need to be synchronized.
 */
final static List cache = new ArrayList();
/**
* Start checking, given a URL by name.
* Calls checkLink to check each link.
* @param rootURLString Where to start checking
* @throws IOException if the reading fails
*/
public synchronized static void checkStartingAt(String rootURLString) throws IOException {
URL rootURL = null;
if (rootURLString == null) {
System.out.println("checkOut(null) isn't very useful");
return;
}
try {
rootURL = new URL(rootURLString);
} catch (MalformedURLException e) {
// If not a valid URL, try again as a file.
rootURL = new File(rootURLString).toURL();
}
System.out.printf("LinkChecker.checkStartingAt(%s)%n", rootURL);
try {
List urlTags = new LinkExtractor().parse(new InputStreamReader((InputStream)rootURL.getContent()));
for (HTMLComponent tag : urlTags) {
// System.out.printf("TAG %s%n", tag);
String href = null;
if (tag instanceof HTMLAnchor) {
href = ((HTMLAnchor)tag).getURL();
}
if (tag instanceof HTMLForm) {
href = ((HTMLForm)tag).getAction();
}
if (tag instanceof HTMLIMG) {
href = ((HTMLIMG)tag).getSrc();
}
for (int j=0; j