// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
// You can buy this project and download/modify it as often as you want.
/**
*
* amass - web crawling made easy
* Copyright (c) 2011-2013, Sandeep Gupta
*
* http://www.sangupta.com/projects/amass
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.sangupta.amass;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import com.sangupta.amass.core.AfterCrawlHandler;
import com.sangupta.amass.core.BeforeCrawlHandler;
import com.sangupta.amass.core.CrawlHandler;
import com.sangupta.amass.core.CrawlingHandler;
import com.sangupta.amass.core.QueueMessageConverter;
import com.sangupta.amass.domain.AmassSignal;
import com.sangupta.amass.domain.CrawlableURL;
import com.sangupta.amass.impl.CrawlingQueue;
import com.sangupta.amass.impl.CrawlingWorker;
import com.sangupta.jerry.util.DateUtils;
/**
* Amass is a crawler that allows you to crawl a given number of URLs,
* with support for both pre-processing and post-processing. A URL may
* also be submitted more than once, which then increases its priority
* to be crawled.
*
* Amass is a high-throughput concurrent library for gathering data from
* the Internet. It also supports minimum time-thresholds before a URL
* is crawled again.
*
* @author sangupta
*
*/
public class Amass {
/**
* Counter shared by all {@link Amass} instances, starting at 1. Helps identify
* thread groups uniquely when multiple {@link Amass} instances are created.
*/
private static final AtomicInteger AMASS_INSTANCE_COUNT = new AtomicInteger(1);
/**
* The total number of worker threads of this {@link Amass} instance.
*/
private final int numThreads;
/**
* The handler that needs to be executed before each URL is crawled by
* this {@link Amass} instance.
*
* NOTE(review): some constructors pass {@code null} in this handler's
* position - presumably meaning "no pre-crawl step"; confirm against the
* full constructor.
*/
private final BeforeCrawlHandler beforeCrawlHandler;
/**
* The handler that needs to be executed after each URL is crawled by
* this {@link Amass} instance.
*/
private final AfterCrawlHandler afterCrawlHandler;
/**
* The handler that needs to be executed for crawling each URL.
*
* NOTE(review): some constructors pass {@code null} in this handler's
* position - presumably a default crawl is used then; confirm against the
* full constructor.
*/
private final CrawlHandler crawlHandler;
/**
* The job queue that is used by this {@link Amass} instance.
*/
private final CrawlingQueue crawlingQueue;
/**
* The thread-group that this {@link Amass} instance creates.
*/
private final ThreadGroup workerGroup;
/**
* The actual worker threads of this {@link Amass} instance.
*/
private final Thread[] workerThreads;
/**
* The actual worker objects that have been created for this {@link Amass} instance.
*/
private final CrawlingWorker[] workers;
/**
* The state signal for this {@link Amass} instance.
*/
private final AmassSignal amassSignal;
/**
* Indicates whether closure of this {@link Amass} instance has been requested.
* Once the instance is closed, no more crawling jobs can be submitted to this
* instance. Declared {@code volatile} so that a write made by one thread is
* visible to subsequent reads from other threads. Starts out {@code false}.
*/
private volatile boolean closed = false;
/**
* Create a new instance of {@link Amass} that uses the given number of
* threads and is configured with an after-crawl handler only.
*
* Delegates to the six-argument constructor, supplying {@code null} for
* every argument other than the thread count and the after-crawl handler.
*
* @param numThreads
* the number of threads to run in parallel
*
* @param afterCrawlHandler
* the handler to run after completing the crawl
*/
public Amass(int numThreads, AfterCrawlHandler afterCrawlHandler) {
    this(
        numThreads,
        null,
        null,
        null, // no before-crawl handler
        null,
        afterCrawlHandler
    );
}
/**
* Construct an {@link Amass} instance that crawls with the given number of
* threads, using the supplied handlers before and after each crawl.
*
* Delegates to the six-argument constructor, supplying {@code null} for
* the remaining arguments.
*
* @param numThreads
* the number of threads to run in parallel
*
* @param beforeCrawlHandler
* the handler to run before starting to crawl
*
* @param afterCrawlHandler
* the handler to run after completing the crawl
*/
public Amass(final int numThreads, final BeforeCrawlHandler beforeCrawlHandler, final AfterCrawlHandler afterCrawlHandler) {
    this(
        numThreads,
        null,
        null,
        beforeCrawlHandler,
        null,
        afterCrawlHandler
    );
}
/**
* Create a new instance of {@link Amass} that uses the given number of
* threads and a single {@link CrawlingHandler} for all handler roles.
*
* Delegates to the six-argument constructor, passing the same handler
* object as each of its last three arguments and {@code null} for the
* second and third.
*
* @param numThreads
* the number of threads to run in parallel
*
* @param crawlingHandler
* the handler supplied for all three handler arguments of the
* full constructor
*/
public Amass(int numThreads, CrawlingHandler crawlingHandler) {
    this(
        numThreads,
        null,
        null,
        crawlingHandler, // same position the before-crawl handler occupies
        crawlingHandler,
        crawlingHandler  // same position the after-crawl handler occupies
    );
}
public Amass(int numThreads, BlockingQueue