pageunit.linkchecker.LinkChecker Maven / Gradle / Ivy

Go to download

package pageunit.linkchecker;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

import pageunit.html.HTMLAnchor;
import pageunit.html.HTMLComponent;
import pageunit.html.HTMLForm;
import pageunit.html.HTMLIMG;
import pageunit.html.HTMLParseException;
import sun.net.URLCanonicalizer;

/** A simple HTML Link Checker. 
 * Should have Properties to set depth, URLs to check, etc.
 * Responses are not adequate; need to check at least for 404-type errors!
 *
 * XXX Move cache into a ThreadLocal, then un-synchronize all the methods.
 * @author Ian Darwin, http://darwinsys.com/
 */
public class LinkChecker {

	// Mutable static counter, shared by every caller, starting at 0.
	// NOTE(review): presumably the nesting depth used to indent output; its use is not visible here, confirm.
	static int indent = 0;
	// Single shared list (raw type, backed by an ArrayList) that lives as long as the class.
	// The class Javadoc flags it for a future move into a ThreadLocal.
	// NOTE(review): presumably records links already checked; element type and use are not visible here, confirm.
	final static List cache = new ArrayList();
  
	/**
	 * Start checking, given a URL by name.
	 * Calls checkLink to check each link.
	 * @param rootURLString Where to start checking
	 * @throws IOException if the reading fails
	 */
	public synchronized static void checkStartingAt(String rootURLString) throws IOException {
		URL rootURL = null;
		
		if (rootURLString == null) {
			System.out.println("checkOut(null) isn't very useful");
			return;
		}

		try {
			rootURL = new URL(rootURLString);
		} catch (MalformedURLException e) {
			// If not a valid URL, try again as a file.
			rootURL = new File(rootURLString).toURL();
		}		
		System.out.printf("LinkChecker.checkStartingAt(%s)%n", rootURL);
		
		try {
			List urlTags = new LinkExtractor().parse(new InputStreamReader((InputStream)rootURL.getContent()));
			for (HTMLComponent tag : urlTags) {
				// System.out.printf("TAG %s%n", tag);
						
				String href = null;
				if (tag instanceof HTMLAnchor) {
					href = ((HTMLAnchor)tag).getURL();
				}
				if (tag instanceof HTMLForm) {
					href = ((HTMLForm)tag).getAction();
				}
				if (tag instanceof HTMLIMG) {
					href = ((HTMLIMG)tag).getSrc();
				}				
		
				for (int j=0; j

    

    

    
            
    
            

    
        
            
                Related Artifacts
                
                     mysql-connector-java mysql
 facebook-messenger com.github.codedrinker
 selenium-java org.seleniumhq.selenium
 instagram-java com.github.sola92
 gson com.google.code.gson
 poi org.apache.poi
 httpclient org.apache.httpcomponents
 json org.json
 facebook-java-api com.google.code.facebook-java-api
 poi-ooxml org.apache.poi
 jackson-databind com.fasterxml.jackson.core
 junit junit
 primefaces org.primefaces
 ojdbc7 com.github.noraui
 jfoenix com.jfoenix
 testng org.testng
 json-simple com.googlecode.json-simple
 selenium-server org.seleniumhq.selenium
 itextpdf com.itextpdf
 spring-core org.springframework
                
            
        
        
            
                Related Groups
                
                     org.springframework
 org.apache.poi
 org.hibernate
 org.springframework.boot
 com.fasterxml.jackson.core
 com.itextpdf
 org.seleniumhq.selenium
 mysql
 org.finos.legend.engine
 org.apache.httpcomponents
 org.apache.logging.log4j
 org.openjfx
 org.apache.commons
 org.json
 com.google.guava
 com.google.zxing
 net.sf.jasperreports
 javax.xml.bind
 ojdbc
 com.google.code.facebook-java-api