All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.watchrabbit.crawler.executor.service.CrawlExecutorServiceImpl Maven / Gradle / Ivy

/*
 * Copyright 2015 Mariusz.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.watchrabbit.crawler.executor.service;

import com.watchrabbit.commons.clock.Stopwatch;
import com.watchrabbit.crawler.api.CrawlForm;
import com.watchrabbit.crawler.api.CrawlResult;
import com.watchrabbit.crawler.api.LinkDto;
import com.watchrabbit.crawler.driver.factory.RemoteWebDriverFactory;
import com.watchrabbit.crawler.driver.service.LoaderService;
import com.watchrabbit.crawler.executor.facade.AuthServiceFacade;
import com.watchrabbit.crawler.executor.facade.ManagerServiceFacade;
import com.watchrabbit.crawler.executor.listener.CrawlListener;
import com.watchrabbit.crawler.executor.strategy.KeywordGenerateStrategy;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import static java.util.stream.Collectors.toList;
import org.apache.commons.lang.StringUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

/**
 *
 * @author Mariusz
 */
@Service
public class CrawlExecutorServiceImpl implements CrawlExecutorService {
    
    private static final Logger LOGGER = LoggerFactory.getLogger(CrawlExecutorServiceImpl.class);
    
    @Autowired
    AuthServiceFacade authServiceFacade;
    
    @Autowired
    RemoteWebDriverFactory remoteWebDriverFactory;
    
    @Autowired
    ManagerServiceFacade managerServiceFacade;
    
    @Autowired
    LoaderService loaderService;
    
    @Autowired
    KeywordGenerateStrategy keywordGenerateStrategy;
    
    @Autowired(required = false)
    CrawlListener crawlListener = (pageId, driver) -> 0;
    
    @Override
    public void processPage(CrawlForm form) {
        Collection session = authServiceFacade.getSession(form.getDomain());
        RemoteWebDriver driver = remoteWebDriverFactory.produceDriver();
        try {
            Stopwatch stopwatch = Stopwatch.createStarted(() -> enableSession(driver, form, session));
            LOGGER.debug("Finished loading {} in {}", form.getUrl(), stopwatch.getExecutionTime(TimeUnit.MILLISECONDS));
            
            List links = collectLinks(driver).stream()
                    .map(link -> new LinkDto.Builder()
                            .withUrl(link)
                            .build()
                    ).collect(toList());
            if (form.isGateway()) {
                LOGGER.debug("Processing gateway {}", form.getUrl());
                List keywords = keywordGenerateStrategy.generateKeywords(form, driver);
                links.addAll(
                        keywords.stream()
                        .map(keyword -> new LinkDto.Builder()
                                .withKeyword(keyword)
                                .withUrl(form.getUrl())
                                .build()
                        ).collect(toList())
                );
            }
            double importanceFactor = crawlListener.accept(form.getId(), driver);
            managerServiceFacade.consumeResult(new CrawlResult.Builder()
                    .withDomain(form.getDomain())
                    .withMiliseconds(stopwatch.getExecutionTime(TimeUnit.MILLISECONDS))
                    .withUrl(form.getUrl())
                    .withLinks(links)
                    .withId(form.getId())
                    .withImportanceFactor(importanceFactor)
                    .build()
            );
        } catch (Exception ex) {
            LOGGER.error("Execption on processing page " + form.getUrl(), ex);
            managerServiceFacade.onError(form);
        } finally {
            remoteWebDriverFactory.returnWebDriver(driver);
        }
    }
    
    private void enableSession(RemoteWebDriver driver, CrawlForm form, Collection session) {
        driver.get(form.getUrl());
        loaderService.waitFor(driver);
        if (!session.isEmpty()) {
            driver.manage().deleteAllCookies();
            session.forEach(driver.manage()::addCookie);
            
            driver.get(form.getUrl());
            loaderService.waitFor(driver);
        }
        if (StringUtils.isNotEmpty(form.getKeyword())) {
            Optional searchFormOptional = findSearchInput(driver);
            searchFormOptional.ifPresent(searchForm -> {
                searchForm.input.sendKeys(form.getKeyword());
                loaderService.waitFor(driver);
                searchForm.submit.click();
                loaderService.waitFor(driver);
            });
            
        }
    }
    
    private List collectLinks(RemoteWebDriver driver) {
        return driver.findElements(By.xpath("//a")).stream()
                .filter(element -> element.isDisplayed())
                .map(link -> link.getAttribute("href"))
                .filter(link -> link != null)
                .filter(link -> link.startsWith("http"))
                .distinct()
                .collect(toList());
    }
    
    private Optional findSearchInput(RemoteWebDriver driver) {
        for (WebElement form : driver.findElements(By.xpath("//form"))) {
            LOGGER.debug("Looking to form with action {}", form.getAttribute("action"));
            List inputs = form.findElements(By.xpath(".//input")).stream()
                    .filter(input -> input.getAttribute("type").equals("text"))
                    .filter(input -> input.isDisplayed())
                    .collect(toList());
            List passwords = form.findElements(By.xpath(".//input")).stream()
                    .filter(input -> input.getAttribute("type").equals("password"))
                    .filter(input -> input.isDisplayed())
                    .collect(toList());
            if (inputs.size() == 1 && passwords.isEmpty()) {
                List submit = form.findElements(By.xpath(".//button[@type='submit']"));
                if (submit.isEmpty()) {
                    submit = form.findElements(By.xpath(".//input[@type='submit']"));
                }
                if (submit.size() == 1) {
                    return Optional.of(new SearchForm(inputs.get(0), submit.get(0)));
                }
            }
        }
        LOGGER.error("Cannot find form in gateway page");
        return Optional.empty();
    }
    
    private class SearchForm {
        
        WebElement input;
        
        WebElement submit;
        
        public SearchForm(WebElement input, WebElement submit) {
            this.input = input;
            this.submit = submit;
        }
        
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy