// com.watchrabbit.crawler.manager.service.ManagerServiceImpl Maven / Gradle / Ivy
/*
* Copyright 2015 Mariusz.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.watchrabbit.crawler.manager.service;
import com.watchrabbit.commons.clock.Clock;
import com.watchrabbit.commons.clock.SystemClock;
import com.watchrabbit.crawler.api.CrawlForm;
import com.watchrabbit.crawler.api.CrawlResult;
import com.watchrabbit.crawler.manager.facade.ExecutorServiceFacade;
import com.watchrabbit.crawler.manager.model.Address;
import com.watchrabbit.crawler.manager.policy.CleanupPolicy;
import com.watchrabbit.crawler.manager.policy.EtiquettePolicy;
import com.watchrabbit.crawler.manager.policy.ImportancePolicy;
import com.watchrabbit.crawler.manager.policy.RevisitPolicy;
import com.watchrabbit.crawler.manager.repository.AddressRepository;
import com.watchrabbit.crawler.manager.util.InternetAddress;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import static java.util.stream.Collectors.toList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
/**
 * {@link ManagerService} implementation that keeps track of known addresses in
 * an {@link AddressRepository}, selects the ones that should be crawled next,
 * hands them to the executor and reschedules them once a crawl result (or an
 * error) comes back.
 *
 * @author Mariusz
 */
@Service
public class ManagerServiceImpl implements ManagerService {

    private static final Logger LOGGER = LoggerFactory.getLogger(ManagerServiceImpl.class);

    // NOTE(review): not read by any method of this class; addPage uses new Date()
    // directly. Kept because the field is package-visible and may be set elsewhere.
    Clock clock = SystemClock.getInstance();

    /**
     * Timeout handed to {@link LeaseService#createLease} for every ordered
     * execution (presumably seconds - confirm against LeaseService).
     */
    @Value("${crawler.manager.urlProcessingTimeout:3600}")
    int urlProcessingTimeout;

    /** Number of queue entries inspected by {@link #findIdsForExecution(int)}. */
    @Value("${crawler.manager.queueTopResultsLimit:1000}")
    int queueTopResultsLimit;

    @Autowired
    ExecutorServiceFacade executorServiceFacade;

    @Autowired
    AddressRepository addressRepository;

    @Autowired
    ImportancePolicy importancePolicy;

    @Autowired
    RevisitPolicy revisitPolicy;

    @Autowired
    LeaseService leaseService;

    @Autowired
    EtiquettePolicy etiquettePolicy;

    @Autowired
    CleanupPolicy cleanupPolicy;

    /**
     * Returns addresses from the repository ordered by their next execution date.
     *
     * @param limit value forwarded to the repository query as the result limit
     * @return the addresses returned by the repository
     */
    @Override
    public List<Address> findDiscoveredAddresses(int limit) {
        return addressRepository.findOrderByNextExecutionDate(limit);
    }

    /**
     * Registers a new page whose next execution date is the current time.
     *
     * @param url address of the page; its domain name is derived from it
     * @param isGateway gateway flag stored on the address
     */
    @Override
    public void addPage(String url, boolean isGateway) {
        Address address = new Address.Builder()
                .withUrl(url)
                .withNextExecutionDate(new Date())
                .withGateway(isGateway)
                .withDomainName(InternetAddress.getDomainName(url))
                .build();
        addressRepository.save(address);
    }

    /**
     * Passes the address referenced by a failed crawl to the cleanup policy.
     *
     * @param form the crawl request that failed; only its id is used
     */
    @Override
    public void onCrawlError(CrawlForm form) {
        Address address = addressRepository.find(form.getId());
        cleanupPolicy.onError(address);
    }

    /**
     * Orders execution of every id in the list, in iteration order.
     *
     * @param ids address ids to execute
     */
    @Override
    public void orderExecution(List<String> ids) {
        ids.forEach(this::orderExecution);
    }

    /**
     * Sends a single address to the executor: notifies the etiquette policy
     * about the domain, takes a lease on the address, submits the crawl form
     * and finally moves the address to the end of the queue.
     *
     * @param id id of the address to execute
     */
    @Override
    public void orderExecution(String id) {
        Address address = addressRepository.find(id);
        LOGGER.info("Pushing {} to execution with keyword {}", address.getUrl(), address.getKeyword());
        etiquettePolicy.onDomainProcessing(address.getDomainName());
        // The lease is keyed by the address id: hasLease() in findIdsForExecution
        // and removeLease() in onCrawlResult both look it up by id, so a lease
        // registered under the URL would never be found or released.
        leaseService.createLease(address.getId(), urlProcessingTimeout);
        executorServiceFacade.processPage(new CrawlForm.Builder()
                .withDomain(address.getDomainName())
                .withUrl(address.getUrl())
                .withId(id)
                .withGateway(address.isGateway())
                .withKeyword(address.getKeyword())
                .build()
        );
        putOnQueueEnd(address);
    }

    /**
     * Handles a finished crawl: feeds the result to the importance policy,
     * asks the revisit policy for the next execution date based on the
     * resulting importance, releases the lease and saves the address.
     *
     * @param result crawl result; its id identifies the crawled address
     */
    @Override
    public void onCrawlResult(CrawlResult result) {
        importancePolicy.processCrawlResult(result);
        double importance = importancePolicy.getImportance(result.getId());
        Date nextExecutionDate = revisitPolicy.getNextExecutionDate(importance);
        Address address = addressRepository.find(result.getId());
        address.setNextExecutionDate(nextExecutionDate);
        leaseService.removeLease(result.getId());
        addressRepository.save(address);
    }

    /**
     * Picks ids of addresses that may be executed now. Looks at the first
     * {@code queueTopResultsLimit} addresses ordered by next execution date,
     * keeps only the first address of each domain, drops domains rejected by
     * the etiquette policy and addresses that currently hold a lease.
     *
     * @param limit maximum number of ids to return
     * @return at most {@code limit} address ids
     */
    @Override
    public List<String> findIdsForExecution(int limit) {
        return addressRepository.findOrderByNextExecutionDate(queueTopResultsLimit).stream()
                // Wrapping makes distinct() compare by domain name only; on this
                // ordered stream the first address of each domain is the one kept.
                .map(AddressWrapper::new)
                .distinct()
                .map(wrapper -> wrapper.address)
                .filter(address -> etiquettePolicy.canProcessDomain(address.getDomainName()))
                .filter(address -> !leaseService.hasLease(address.getId()))
                .map(Address::getId)
                .limit(limit)
                .collect(toList());
    }

    /**
     * Gives the address the next execution date of the address returned by
     * {@code findLastByNextExecutionDate()}, when there is one, and saves it.
     */
    private void putOnQueueEnd(Address address) {
        addressRepository.findLastByNextExecutionDate()
                .ifPresent(last -> address.setNextExecutionDate(last.getNextExecutionDate()));
        addressRepository.save(address);
    }

    /**
     * Wrapper whose equality and hash code are based solely on the domain name
     * of the wrapped address, so that {@code Stream.distinct()} can be used to
     * keep one address per domain.
     */
    private static final class AddressWrapper {

        final Address address;

        AddressWrapper(Address address) {
            this.address = address;
        }

        @Override
        public int hashCode() {
            int hash = 7;
            hash = 17 * hash + Objects.hashCode(address.getDomainName());
            return hash;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            AddressWrapper other = (AddressWrapper) obj;
            return Objects.equals(address.getDomainName(), other.address.getDomainName());
        }
    }
}