All downloads are free. Search and download functionality uses the official Maven repository.

com.github.panhongan.util.web.page.Crawler Maven / Gradle / Ivy

There is a newer version: 0.1.8
Show newest version
package com.github.panhongan.util.web.page;

import java.util.HashMap;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.alibaba.fastjson.JSONObject;
import com.github.panhongan.util.control.workflow.WorkflowNode;
import com.github.panhongan.util.web.url.HTTPConstants;

/**
 * Workflow node that fetches a web page with Jsoup.
 *
 * <p>A set of HTTP request headers is kept on the instance and applied to every
 * request issued by {@link #crawl(String, String)}. The header map is a plain
 * {@link HashMap}; no synchronization is performed by this class.
 */
public class Crawler extends WorkflowNode {
	
	private static final Logger logger = LoggerFactory.getLogger(Crawler.class);
	
	/** Connection timeout applied to every request, in milliseconds. */
	private static final int TIMEOUT_MILLIS = 5 * 1000;
	
	/** Request headers sent with every crawl; never contains null keys or values. */
	private final Map<String, String> headers = new HashMap<String, String>();
	
	/** Creates a crawler using the default {@code WorkflowNode} constructor. */
	public Crawler() {
		
	}
	
	/**
	 * Creates a crawler and passes {@code name} to the {@code WorkflowNode} constructor.
	 *
	 * @param name node name forwarded to the superclass
	 */
	public Crawler(String name) {
		super(name);
	}
	
	/**
	 * Adds (or overwrites) a request header. The call is ignored when either
	 * argument is {@code null}.
	 *
	 * @param key   header name
	 * @param value header value
	 */
	public void addHeader(String key, String value) {
		if (key != null && value != null) {
			headers.put(key, value);
		}
	}
	
	/**
	 * Adds every entry of {@code map} via {@link #addHeader(String, String)};
	 * entries with a {@code null} key or value are skipped.
	 *
	 * @param map headers to add; {@code null} is treated as empty
	 */
	public void addHeaders(Map<String, String> map) {
		if (map != null) {
			// entrySet avoids a second lookup per key
			for (Map.Entry<String, String> entry : map.entrySet()) {
				this.addHeader(entry.getKey(), entry.getValue());
			}
		}
	}
	
	/** Removes all request headers. */
	public void clearHeaders() {
		headers.clear();
	}
	
	/**
	 * Updates the value of a header that is already present. Unlike
	 * {@link #addHeader(String, String)}, this does nothing when {@code key}
	 * has not been added before, or when {@code value} is {@code null}.
	 *
	 * @param key   name of an existing header
	 * @param value new header value
	 */
	public void setHeader(String key, String value) {
		if (value != null && headers.containsKey(key)) {
			headers.put(key, value);
		}
	}
	
	/**
	 * Removes a header if present. A {@code null} key is ignored.
	 *
	 * @param key header name
	 */
	public void removeHeader(String key) {
		if (key != null) {
			headers.remove(key);
		}
	}
	
	/**
	 * Returns the live header map backing this crawler (not a copy), so
	 * changes made through the returned map affect subsequent requests.
	 *
	 * @return the internal header map
	 */
	public Map<String, String> getHeaders() {
		return headers;
	}
	
	/**
	 * Crawls the page described by {@code input}.
	 *
	 * @param input expected to be a {@link JSONObject} with string fields
	 *              {@code "url"} and {@code "method"}; any other type is ignored
	 * @return the fetched {@link Document}, or {@code null} when the input is
	 *         not a {@code JSONObject} or the fetch fails
	 */
	@Override
	public Object processData(Object input) {
		Document doc = null;
		
		if (input instanceof JSONObject) {
			try {
				JSONObject json = (JSONObject)input;
				String url = json.getString("url");
				String method = json.getString("method");
				doc = this.crawl(method, url);
			} catch (Exception e) {
				// pass the throwable so the stack trace is not lost
				logger.warn("failed to process crawl request: " + e.getMessage(), e);
			}
		}
		
		return doc;
	}
	
	/**
	 * Fetches {@code url} with the configured headers, ignoring the response
	 * content type and using a {@value #TIMEOUT_MILLIS} ms timeout.
	 *
	 * @param method request method, compared against
	 *               {@code HTTPConstants.Method.GET} / {@code POST}
	 * @param url    target URL
	 * @return the parsed document, or {@code null} when {@code url} is
	 *         {@code null}, the method is not GET/POST, or the request fails
	 */
	public Document crawl(String method, String url) {
		Document doc = null;
		
		if (url == null) {
			logger.warn("url is null");
			return doc;
		}
		
		try {
			Connection conn = Jsoup.connect(url);
			for (Map.Entry<String, String> header : headers.entrySet()) {
				conn.header(header.getKey(), header.getValue());
			}
			conn.ignoreContentType(true);
			conn.timeout(TIMEOUT_MILLIS);
			
			if (HTTPConstants.Method.GET.equals(method)) {
				doc = conn.get();
			} else if (HTTPConstants.Method.POST.equals(method)) {
				doc = conn.post();
			} else {
				// previously returned null silently, hiding misconfigured requests
				logger.warn("unsupported method: " + method + ", url: " + url);
			}
		} catch (Exception e) {
			// pass the throwable so the stack trace is not lost
			logger.warn("failed to crawl url: " + url + ", " + e.getMessage(), e);
		}
		
		return doc;
	}
	
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy