All downloads are free. The search and download features use the official Maven repository.

com.serphacker.serposcope.task.google.GoogleTask Maven / Gradle / Ivy

The newest version!
/* 
 * Serposcope - SEO rank checker https://serposcope.serphacker.com/
 * 
 * Copyright (c) 2016 SERP Hacker
 * @author Pierre Nogues 
 * @license https://opensource.org/licenses/MIT MIT License
 */
package com.serphacker.serposcope.task.google;

import com.google.inject.Inject;
import com.google.inject.assistedinject.Assisted;
import com.serphacker.serposcope.db.google.GoogleDB;
import com.serphacker.serposcope.di.CaptchaSolverFactory;
import com.serphacker.serposcope.di.ScrapClientFactory;
//import com.serphacker.serposcope.di.ScraperFactory;
import com.serphacker.serposcope.models.base.Proxy;
import com.serphacker.serposcope.models.base.Run;
import com.serphacker.serposcope.models.google.GoogleSettings;
import com.serphacker.serposcope.models.google.GoogleRank;
import com.serphacker.serposcope.models.google.GoogleSearch;
import com.serphacker.serposcope.models.google.GoogleSerp;
import com.serphacker.serposcope.models.google.GoogleSerpEntry;
import com.serphacker.serposcope.models.google.GoogleTarget;
import com.serphacker.serposcope.scraper.captcha.solver.CaptchaSolver;
import com.serphacker.serposcope.scraper.google.GoogleScrapResult;
import com.serphacker.serposcope.scraper.google.scraper.GoogleScraper;
import com.serphacker.serposcope.scraper.http.ScrapClient;
import com.serphacker.serposcope.scraper.http.proxy.DirectNoProxy;
import com.serphacker.serposcope.scraper.http.proxy.ProxyRotator;
import com.serphacker.serposcope.task.AbstractTask;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.serphacker.serposcope.scraper.http.proxy.ScrapProxy;
import java.util.stream.Collectors;
import com.serphacker.serposcope.di.GoogleScraperFactory;
import com.serphacker.serposcope.models.google.GoogleBest;
import com.serphacker.serposcope.models.google.GoogleTargetSummary;
import java.io.IOException;
import java.util.Collections;
import java.util.concurrent.ConcurrentHashMap;

public class GoogleTask extends AbstractTask {

    protected static final Logger LOG = LoggerFactory.getLogger(GoogleTask.class);
    
    GoogleScraperFactory googleScraperFactory;
    CaptchaSolverFactory captchaSolverFactory;
    ScrapClientFactory scrapClientFactory;
    
    GoogleDB googleDB;
    ProxyRotator rotator;

    Run previousRun;
    final Map previousRunsByDay = new ConcurrentHashMap<>();
    final Map> targetsByGroup = new ConcurrentHashMap<>();
    final Map summariesByTarget = new ConcurrentHashMap<>();
    
    LinkedBlockingQueue searches;
    GoogleSettings googleOptions;
    protected final AtomicInteger searchDone = new AtomicInteger();
    final AtomicInteger captchaCount = new AtomicInteger();
    
    Thread[] threads;
    volatile int totalSearch;
    volatile boolean interrupted;
    
    CaptchaSolver solver;
    String httpUserAgent;
    int httpTimeoutMS;
    boolean updateRun;
    boolean shuffle = true;
    
    @Inject
    public GoogleTask(
        GoogleScraperFactory googleScraperFactory,
        CaptchaSolverFactory captchaSolverFactory,
        ScrapClientFactory scrapClientFactory,
        GoogleDB googleDB,
        @Assisted Run run
    ){
        super(run);
        this.googleScraperFactory = googleScraperFactory;
        this.captchaSolverFactory = captchaSolverFactory;
        this.scrapClientFactory = scrapClientFactory;
        this.googleDB = googleDB;
        this.updateRun = run.getId() == 0 ? false : true;
        
        httpUserAgent = ScrapClient.DEFAULT_USER_AGENT;
        httpTimeoutMS = ScrapClient.DEFAULT_TIMEOUT_MS;
    }    
    

    @Override
    public Run.Status doRun() {
        solver = initializeCaptchaSolver();
        googleOptions = googleDB.options.get();

        initializeSearches();
        initializePreviousRuns();
        initializeTargets();
        
        
        int nThread = googleOptions.getMaxThreads();
        List proxies = baseDB.proxy.list().stream().map(Proxy::toScrapProxy).collect(Collectors.toList());
        
        if(proxies.isEmpty()){
            LOG.warn("no proxy configured, using direct connection");
            proxies.add(new DirectNoProxy());
        }
        
        if( proxies.size() < nThread ){
            LOG.info("less proxy ({}) than max thread ({}), setting thread number to {}", 
                new Object[]{proxies.size(), nThread, nThread});
            nThread = proxies.size();
        }
        
        rotator = new ProxyRotator(proxies);
        totalSearch = searches.size();
        
        startThreads(nThread);
        waitForThreads();
        
        finalizeSummaries();
        
        if(solver != null){
            try {solver.close();} catch (IOException ex) {}
        }
        
        LOG.warn("{} proxies failed during the task", proxies.size() - rotator.list().size());
        
        int remainingSearch = totalSearch - searchDone.get();
        if(remainingSearch > 0){
            run.setErrors(remainingSearch);
            LOG.warn("{} searches have not been checked", remainingSearch);
            return Run.Status.DONE_WITH_ERROR;
        }
        
        return Run.Status.DONE_SUCCESS;
    }
    
    protected void startThreads(int nThread){
        threads = new Thread[nThread];
        for (int iThread = 0; iThread < threads.length; iThread++) {
            threads[iThread] = new Thread(new GoogleTaskRunnable(this), "google-" + iThread);
            threads[iThread].start();
        }        
    }
    
    protected void waitForThreads(){
        while(true){
            try {
                for (Thread thread : threads) {
                    thread.join();
                }
                return;
            }catch(InterruptedException ex){
                interruptThreads();
            }
        }
    }
    
    protected void interruptThreads(){
        interrupted = true;
        for (Thread thread : threads) {
            thread.interrupt();
        }
    }
    
    protected boolean shouldStop(){
        if(searchDone.get() == totalSearch){
            return true;
        }
        
        if(interrupted){
            return true;
        }
        
        return false;
    }
    
    protected void incCaptchaCount(int captchas){
        run.setCaptchas(captchaCount.addAndGet(captchas));
        baseDB.run.updateCaptchas(run);
    }
    
    protected void onSearchDone(GoogleSearch search, GoogleScrapResult res){
        insertSearchResult(search, res);
        incSearchDone();
    }
    
    protected void incSearchDone(){
        run.setProgress((int) (((float)searchDone.incrementAndGet()/(float)totalSearch)*100f) );
        baseDB.run.updateProgress(run);
    }
    
    protected void insertSearchResult(GoogleSearch search, GoogleScrapResult res) {
        Map history = getHistory(search);

        GoogleSerp serp = new GoogleSerp(run.getId(), search.getId(), run.getStarted());
        for (String url : res.urls) {
            GoogleSerpEntry entry = new GoogleSerpEntry(url);
            entry.fillPreviousPosition(history);
            serp.addEntry(entry);
        }
        googleDB.serp.insert(serp);

        List groups = googleDB.search.listGroups(search);
        for (Integer group : groups) {
            List targets = targetsByGroup.get(group);
            if (targets == null) {
                continue;
            }
            for (GoogleTarget target : targets) {
                int best = googleDB.rank.getBest(group, target.getId(), search.getId()).getRank();
                int rank = GoogleRank.UNRANKED;
                String rankedUrl = null;
                for (int i = 0; i < res.urls.size(); i++) {
                    if (target.match(res.urls.get(i))) {
                        rankedUrl = res.urls.get(i);
                        rank = i + 1;
                        break;
                    }
                }
                
                int previousRank = GoogleRank.UNRANKED;
                if (previousRun != null) {
                    previousRank = googleDB.rank.get(previousRun.getId(), group, target.getId(), search.getId());
                }
                
                GoogleRank gRank = new GoogleRank(run.getId(), group, target.getId(), search.getId(), rank, previousRank, rankedUrl);
                googleDB.rank.insert(gRank);
                
                GoogleTargetSummary summary = summariesByTarget.get(target.getId());
                summary.addRankCandidat(gRank);
                
                if(rank != GoogleRank.UNRANKED && rank <= best){
                    googleDB.rank.insertBest(new GoogleBest(group, target.getId(), search.getId(), rank, run.getStarted(), rankedUrl));
                }
            }
        }
    }    
    
    protected void initializeSearches() {
        List searchList;
        if(updateRun){
            searchList = googleDB.search.listUnchecked(run.getId());
        } else {
            searchList = googleDB.search.list();
        }
        if(shuffle){
            Collections.shuffle(searchList);
        }
        searches = new LinkedBlockingQueue<>(searchList);
        LOG.info("{} searches to do", searches.size());
    }
    
    protected void initializeTargets() {
        Map previousScorePercent = new HashMap<>();
        
        if(previousRun != null){
            previousScorePercent = googleDB.targetSummary.getPreviousScore(previousRun.getId());
        } 
        
        List targets = googleDB.target.list();
        for (GoogleTarget target : targets) {
            targetsByGroup.putIfAbsent(target.getGroupId(), new ArrayList<>());
            targetsByGroup.get(target.getGroupId()).add(target);
            summariesByTarget.put(
                target.getId(), 
                new GoogleTargetSummary(target.getGroupId(), target.getId(), run.getId(), previousScorePercent.getOrDefault(target.getId(), 0))
            );
        }
        
        if(updateRun){
            List summaries = googleDB.targetSummary.list(run.getId());
            for (GoogleTargetSummary summary : summaries) {
                summariesByTarget.put(summary.getTargetId(), summary);
            }
        }
    }
    
    protected void initializePreviousRuns(){
        previousRun = baseDB.run.findPrevious(run.getId());
        if(previousRun == null){
            return;
        }
        
        short[] days = new short[]{1,7,30,90};
        
        for (short day : days) {
            List pastRuns = baseDB.run.findByDay(run.getModule(), run.getDay().minusDays(day));
            if(!pastRuns.isEmpty()){
                previousRunsByDay.put(day, pastRuns.get(0).getId());
            }
        }
    }
    
    protected Map getHistory(GoogleSearch search){
        Map history = new HashMap<>();
        
        for (Map.Entry entry : previousRunsByDay.entrySet()) {
            GoogleSerp serp = googleDB.serp.get(entry.getValue(), search.getId());
            if(serp != null){
                history.put(entry.getKey(), serp);
            }
        }
        return history;
    }    
    
    protected void finalizeSummaries(){
        Map searchCountByGroup = googleDB.search.countByGroup();
        for (GoogleTargetSummary summary : summariesByTarget.values()) {
            summary.computeScoreBP(searchCountByGroup.getOrDefault(summary.getGroupId(), 0));
        }
        googleDB.targetSummary.insert(summariesByTarget.values());
    }
    
    protected GoogleScraper genScraper(){
        return googleScraperFactory.get(
            scrapClientFactory.get(httpUserAgent, httpTimeoutMS),
            solver
        );
    }

    @Override
    protected void onCrash(Exception ex) {
        
    }
    
    protected final CaptchaSolver initializeCaptchaSolver(){
        solver = captchaSolverFactory.get(baseDB.config.getConfig());
        if(solver != null){
            if(!solver.init()){
                LOG.info("failed to init captcha solver {}", solver.getFriendlyName());
                return null;
            }
            return solver;
        } else {
            LOG.info("no captcha service configured");
            return null;
        }
        
    }
    
    int getSearchDone(){
        return searchDone != null ? searchDone.get() : 0;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy