All Downloads are FREE. Search and download functionality uses the official Maven repository.

pageunit.linkchecker.LinkChecker Maven / Gradle / Ivy

package pageunit.linkchecker;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

import pageunit.html.HTMLAnchor;
import pageunit.html.HTMLComponent;
import pageunit.html.HTMLForm;
import pageunit.html.HTMLIMG;
import pageunit.html.HTMLParseException;
import sun.net.URLCanonicalizer;

/** A simple HTML Link Checker. 
 * Should have Properties to set depth, URLs to check, etc.
 * Responses not adequate; need to check at least for 404-type errors!
 * 
* XXX Move cache into a ThreadLocal, then un-synchronize all the methods. * @author Ian Darwin, http://darwinsys.com/ */ public class LinkChecker { static int indent = 0; final static List cache = new ArrayList(); /** * Start checking, given a URL by name. * Calls checkLink to check each link. * @param rootURLString Where to start checking * @throws IOException if the reading fails */ public synchronized static void checkStartingAt(String rootURLString) throws IOException { URL rootURL = null; if (rootURLString == null) { System.out.println("checkOut(null) isn't very useful"); return; } try { rootURL = new URL(rootURLString); } catch (MalformedURLException e) { // If not a valid URL, try again as a file. rootURL = new File(rootURLString).toURL(); } System.out.printf("LinkChecker.checkStartingAt(%s)%n", rootURL); try { List urlTags = new LinkExtractor().parse(new InputStreamReader((InputStream)rootURL.getContent())); for (HTMLComponent tag : urlTags) { // System.out.printf("TAG %s%n", tag); String href = null; if (tag instanceof HTMLAnchor) { href = ((HTMLAnchor)tag).getURL(); } if (tag instanceof HTMLForm) { href = ((HTMLForm)tag).getAction(); } if (tag instanceof HTMLIMG) { href = ((HTMLIMG)tag).getSrc(); } for (int j=0; j




© 2015 - 2024 Weber Informatics LLC | Privacy Policy