
// com.antbrains.urlcrawler.crawler.Writer — Maven / Gradle / Ivy (artifact page header)
package com.antbrains.urlcrawler.crawler;
import java.io.IOException;
import java.sql.DriverManager;
import java.util.ArrayList;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.log4j.Logger;
import com.antbrains.urlcrawler.db.CrawlTask;
import com.antbrains.urlcrawler.db.HbaseTool;
public class Writer extends Thread {
    protected static Logger logger = Logger.getLogger(Writer.class);

    /** Queue of completed crawl tasks handed over by the crawler threads. */
    BlockingQueue<CrawlTask> resQueue;
    /** HBase connection owned by this writer; closed when the thread stops. */
    Connection hbaseConn;
    /** Logical database name (table-name prefix) passed to HbaseTool. */
    String dbName;

    /** Cooperative stop flag set by {@link #stopMe()}; volatile so run() sees it. */
    private volatile boolean bStop;

    /** Flush to HBase once this many tasks are buffered. */
    int batchSize = 100;
    /** Tasks buffered since the last flush. */
    ArrayList<CrawlTask> cache = new ArrayList<>(batchSize);
    /** Flush at least this often (ms), even when the batch is not full. */
    long updateInterval = 60_000L;
    /** Time of the last flush, in epoch millis. */
    long lastUpdate;

    /**
     * Creates a writer that drains {@code resQueue} and persists finished
     * crawl tasks into HBase in batches.
     *
     * @param dbName   logical database name (table-name prefix) to write into
     * @param resQueue queue of completed tasks produced by the crawlers
     * @param zkQuorum ZooKeeper quorum of the HBase cluster
     * @param zkPort   ZooKeeper client port, or {@code null} to use the default
     * @throws Exception if the HBase connection cannot be created
     */
    public Writer(String dbName, BlockingQueue<CrawlTask> resQueue, String zkQuorum, String zkPort) throws Exception {
        this.resQueue = resQueue;
        this.dbName = dbName;
        Configuration myConf = HBaseConfiguration.create();
        myConf.set("hbase.zookeeper.quorum", zkQuorum);
        if (zkPort != null) {
            myConf.set("hbase.zookeeper.property.clientPort", zkPort);
        }
        hbaseConn = ConnectionFactory.createConnection(myConf);
    }

    /** Asks the writer to stop; run() flushes the remaining buffer and exits. */
    public void stopMe() {
        logger.info("receive stop signal");
        bStop = true;
    }

    /**
     * Main loop: polls the result queue, buffers tasks, and flushes either
     * when the batch is full or when {@link #updateInterval} has elapsed.
     * On shutdown the remaining buffer is flushed and the HBase connection
     * is closed.
     */
    @Override
    public void run() {
        lastUpdate = System.currentTimeMillis();
        while (!bStop) {
            try {
                // Poll with a timeout so the stop flag is re-checked regularly.
                CrawlTask task = resQueue.poll(3, TimeUnit.SECONDS);
                if (task == null) {
                    // Idle: flush anyway if buffered tasks have waited too long.
                    if (System.currentTimeMillis() - lastUpdate > this.updateInterval) {
                        this.flushCache();
                    }
                } else {
                    this.cache.add(task);
                    if (cache.size() >= this.batchSize) {
                        this.flushCache();
                    }
                }
            } catch (InterruptedException e) {
                // Log instead of silently swallowing. We deliberately do NOT
                // re-interrupt: the loop must keep draining until stopMe() is
                // called, and a set interrupt flag would make poll() throw
                // again on every iteration.
                logger.warn("interrupted while polling; continuing until stopMe()", e);
            }
        }
        // Persist whatever is still buffered before shutting down.
        this.flushCache();
        try {
            hbaseConn.close();
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        }
        logger.info("stopped");
    }

    /**
     * Writes the buffered tasks to HBase: updates the web-page table, records
     * each task as succeeded or failed, and removes all of their URLs from the
     * "crawling" table. The buffer is cleared and the flush timestamp reset
     * even if the write fails (best-effort, matching the original behavior).
     */
    private void flushCache() {
        try {
            HbaseTool.updateWebPage(dbName, hbaseConn, cache);
            // succ/all hold URLs; fail holds the whole task (addFailed needs it).
            ArrayList<String> succ = new ArrayList<>();
            ArrayList<CrawlTask> fail = new ArrayList<>();
            ArrayList<String> all = new ArrayList<>();
            for (CrawlTask task : cache) {
                all.add(task.crawlUrl);
                if (task.status == CrawlTask.STATUS_FAILED) {
                    fail.add(task);
                } else if (task.status == CrawlTask.STATUS_SUCC) {
                    succ.add(task.crawlUrl);
                } else {
                    // Any other status means an upstream state-machine bug.
                    logger.warn("algo bug: " + task);
                }
            }
            HbaseTool.addRows(dbName, HbaseTool.TB_URLDB_SUCC, hbaseConn, succ);
            HbaseTool.addFailed(dbName, hbaseConn, fail);
            HbaseTool.delRows(dbName, HbaseTool.TB_URLDB_CRAWLING, hbaseConn, all);
        } catch (Exception e) {
            // Preserve the stack trace (the original logged only the message).
            logger.error(e.getMessage(), e);
        }
        cache.clear();
        lastUpdate = System.currentTimeMillis();
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy