
com.antbrains.urlcrawler.crawler.Writer

package com.antbrains.urlcrawler.crawler;
 	
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.log4j.Logger;

import com.antbrains.urlcrawler.db.CrawlTask;
import com.antbrains.urlcrawler.db.HbaseTool; 

public class Writer extends Thread {
	protected static Logger logger = Logger.getLogger(Writer.class);
	BlockingQueue<CrawlTask> resQueue;
	Connection hbaseConn;
	String dbName;

	public Writer(String dbName, BlockingQueue<CrawlTask> resQueue, String zkQuorum, String zkPort) throws Exception {
		this.resQueue = resQueue;
		this.dbName = dbName;
		// connect to HBase via the given ZooKeeper quorum; the client port is optional
		Configuration myConf = HBaseConfiguration.create();
		myConf.set("hbase.zookeeper.quorum", zkQuorum);
		if (zkPort != null) {
			myConf.set("hbase.zookeeper.property.clientPort", zkPort);
		}
		hbaseConn = ConnectionFactory.createConnection(myConf);
	}

	private volatile boolean bStop;

	public void stopMe() {
		logger.info("receive stop signal");
		bStop = true;
	}

	int batchSize = 100;
	ArrayList<CrawlTask> cache = new ArrayList<>(batchSize);
	// flush at least once per updateInterval even when the batch is not full
	long updateInterval = 60_000L;
	long lastUpdate;

	@Override
	public void run() {
		lastUpdate = System.currentTimeMillis();
		while (!bStop) {
			try {
				CrawlTask task = resQueue.poll(3, TimeUnit.SECONDS);
				if (task == null) {
					if (System.currentTimeMillis() - lastUpdate > this.updateInterval) {
						this.flushCache();
					}
				} else {
					this.cache.add(task);
					if (cache.size() >= this.batchSize) {
						this.flushCache();
					}
				}
			} catch (InterruptedException e) {
				// ignore; shutdown is signalled via bStop, not by interruption
			}
		}
		this.flushCache();
		//DBUtils.closeAll(phoenixConn, null, null);
		try {
			hbaseConn.close();
		} catch (IOException e) {
			logger.error(e.getMessage(),e);
		}
		logger.info("stopped");
	}
	
	
	private void flushCache() {
		// write the batch to HBase, then move each URL out of the "crawling" table
		try {
			HbaseTool.updateWebPage(dbName, hbaseConn, cache);
			ArrayList<String> succ = new ArrayList<>();
			ArrayList<CrawlTask> fail = new ArrayList<>();
			ArrayList<String> all = new ArrayList<>();
			for (CrawlTask task : cache) {
				all.add(task.crawlUrl);
				if (task.status == CrawlTask.STATUS_FAILED) {
					fail.add(task);
				} else if (task.status == CrawlTask.STATUS_SUCC) {
					succ.add(task.crawlUrl);
				} else {
					logger.warn("unexpected task status: " + task);
				}
			}
			HbaseTool.addRows(dbName, HbaseTool.TB_URLDB_SUCC, hbaseConn, succ);
			HbaseTool.addFailed(dbName, hbaseConn, fail);
			HbaseTool.delRows(dbName, HbaseTool.TB_URLDB_CRAWLING, hbaseConn, all);
		} catch (Exception e) {
			logger.error(e.getMessage(), e);
		}
		// TODO: update MySQL status (not done in this version)

		cache.clear();
		lastUpdate = System.currentTimeMillis();
	}
}
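
Below is a minimal usage sketch, assuming the surrounding crawler hands finished CrawlTasks to this Writer through a shared queue. The class name WriterUsageSketch, the queue capacity, the database name and the ZooKeeper addresses are illustrative placeholders, not part of this artifact.

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import com.antbrains.urlcrawler.crawler.Writer;
import com.antbrains.urlcrawler.db.CrawlTask;

public class WriterUsageSketch {
	public static void main(String[] args) throws Exception {
		// Queue shared between crawler threads and the Writer; the capacity is an assumption.
		BlockingQueue<CrawlTask> resQueue = new LinkedBlockingQueue<>(10_000);

		// dbName, ZooKeeper quorum and client port are placeholder values.
		Writer writer = new Writer("mycrawl", resQueue, "zk1,zk2,zk3", "2181");
		writer.start();

		// Crawler threads would put finished CrawlTasks here; the Writer flushes
		// them to HBase in batches of 100, or at least once per minute.
		// resQueue.put(finishedTask);

		// On shutdown: signal the Writer, then wait for its final flush and for
		// the HBase connection to be closed.
		writer.stopMe();
		writer.join();
	}
}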



