![JAR search and dependency download from the Maven repository](/logo.png)
nyla.solutions.global.patterns.creational.builder.mapped.WebCrawlerMappedKeyDirector Maven / Gradle / Ivy
package nyla.solutions.global.patterns.creational.builder.mapped;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import nyla.solutions.global.exception.SystemException;
import nyla.solutions.global.net.http.HTTP;
import nyla.solutions.global.patterns.cache.CacheFarm;
import nyla.solutions.global.util.Debugger;
import nyla.solutions.global.util.Text;
import nyla.solutions.global.xml.DOM4J;
import nyla.solutions.global.xml.DomJsoupHtmlStragety;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
* Use MappedKeyEngineer to construct output from HTML pages.
* Each HREF in the HTML will be processed.
*
* Set the quota and ignorePathRegExp to limit crawled pages.
* @author Gregory Green
*
*/
public class WebCrawlerMappedKeyDirector extends MappedKeyDirector
{
/**
 * Entry point of the director: crawls the page at the given URL (and, through
 * {@link #crawl(String, MappedKeyEngineer)}, the pages it links to), handing
 * each processed page to the engineer.
 * <p>
 * The entry stored under the document cache key is removed before the crawl
 * starts and again when it ends; the set of crawled paths is emptied when the
 * crawl ends, whether it completed normally or with an exception.
 *
 * @param urlPath the URL of the page where the crawl starts
 * @param engineer the engineer that receives the constructed output of each page
 */
public void constructDocument(String urlPath, MappedKeyEngineer engineer)
{
    // drop any document left in the cache before starting
    CacheFarm.getCache().remove(documentCacheKey);

    try
    {
        this.crawl(urlPath, engineer);
    }
    finally
    {
        // always release the cached document and forget the visited paths
        CacheFarm.getCache().remove(documentCacheKey);
        this.crawledPaths.clear();
    }
}// --------------------------------------------
/**
 * Crawls a single URL and, when its content type is HTML, recursively crawls
 * the target of every {@code <a href="...">} found in the page.
 * <p>
 * For each crawled URL the content is parsed into a DOM document, the document
 * is cached under the document cache key, a DOM4J wrapper of it is cached under
 * the XML cache key, and the engineer is given the map produced by
 * {@code constructMapToText}.
 * <p>
 * URLs that match {@code ignorePathRegExp} or that were already crawled are
 * skipped. Links that cannot be turned into a URL are logged and skipped.
 *
 * @param urlPath the absolute URL of the page to crawl
 * @param engineer the engineer that receives the constructed output of each page
 * @throws SystemException when the URL cannot be opened, parsed or processed
 */
protected void crawl(String urlPath, MappedKeyEngineer engineer)
{
    Debugger.println(this, "checked ="+urlPath);

    if(mustSkip(urlPath))
        return;

    // Remember the URL before following its links; without this, pages that
    // link to each other (or to themselves) would be crawled without end.
    crawledPaths.add(urlPath);

    Debugger.println(this, "processing ="+urlPath);

    try
    {
        URL url = new URL(urlPath);

        //open url
        URLConnection connection = url.openConnection();

        //test if html
        String contentType = connection.getContentType();

        org.w3c.dom.Document document = null;
        java.io.InputStream input = connection.getInputStream();
        try
        {
            document = new DomJsoupHtmlStragety().toDocument(input);
        }
        finally
        {
            // release the connection's stream; a close failure must not hide
            // an exception raised while parsing
            try
            {
                input.close();
            }
            catch(java.io.IOException closeError)
            {
                Debugger.printWarn(this,"unable to close stream for url="+urlPath+" "+closeError);
            }
        }

        //get document in cache, to potentially be used by the engineer
        CacheFarm.getCache().put(documentCacheKey, document);

        //cache XML
        CacheFarm.getCache().put(xmlCacheKey, new DOM4J(document));

        Map mapText = super.constructMapToText(urlPath);

        //save document information
        engineer.construct(urlPath, mapText);

        if(!HTTP.isHtmlContentType(contentType))
        {
            //add single index
            return; //do not check links
        }

        //find all hrefs
        NodeList nodes = document.getElementsByTagName("a");

        for (int i = 0; i < nodes.getLength(); i++)
        {
            NamedNodeMap attributes = nodes.item(i).getAttributes();

            if(attributes == null)
                continue; //skip

            Node href = attributes.getNamedItem("href");

            if(href == null)
                continue; //anchor without a link target, nothing to follow

            String path = href.getNodeValue();

            if(!this.mustSkip(path))
            {
                try
                {
                    //recursive build
                    crawl(HTTP.toURL(url,path).toString(),engineer);
                }
                catch(MalformedURLException e)
                {
                    Debugger.printWarn(this,"skipped parent url="+url+" link="+path+" "+e);
                }
            }
        }
    }
    catch (Exception e)
    {
        throw new SystemException("unable to process url="+urlPath+" error="+Debugger.stackTrace(e));
    }
}// --------------------------------------------
/**
 * Decides whether a path is excluded from crawling, logging the path when it is.
 *
 * @param urlPath the path or URL to test
 * @return true when the path matches the ignore expression or is already in
 *         the set of crawled paths, false otherwise
 */
private boolean mustSkip(String urlPath)
{
    // neither ignored nor already visited: the path may be crawled
    if(!Text.matches(urlPath, ignorePathRegExp) && !crawledPaths.contains(urlPath))
        return false;

    Debugger.println(this, "skipped ="+urlPath);
    return true;
}// --------------------------------------------
/**
 * Gets the key under which the DOM4J wrapper of the crawled document is cached.
 *
 * @return the current XML cache key
 */
public String getXmlCacheKey()
{
    return this.xmlCacheKey;
}
/**
 * Changes the key under which the DOM4J wrapper of the crawled document is cached.
 *
 * @param xmlCacheKey the XML cache key to use from now on
 */
public void setXmlCacheKey(String xmlCacheKey)
{
    // the parameter shadows the field, so the field must be qualified
    this.xmlCacheKey = xmlCacheKey;
}
// Cache key under which crawl stores the DOM4J wrapper of the parsed page; defaults to "xml".
private String xmlCacheKey = "xml";
// Cache key under which crawl stores the parsed DOM document; constructDocument removes
// this entry before and after a crawl. Defaults to "document".
private String documentCacheKey = "document";
private Set
© 2015 - 2025 Weber Informatics LLC | Privacy Policy