All downloads are free. Search and download functionality uses the official Maven repository.

com.watchrabbit.crawler.manager.policy.OPICImportancePolicy Maven / Gradle / Ivy

There is a newer version: 8.0.9
Show newest version
/*
 * Copyright 2015 Mariusz.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.watchrabbit.crawler.manager.policy;

import com.watchrabbit.crawler.api.CrawlResult;
import com.watchrabbit.crawler.api.LinkDto;
import com.watchrabbit.crawler.manager.model.Address;
import com.watchrabbit.crawler.manager.model.AddressOPIC;
import com.watchrabbit.crawler.manager.repository.AddressOPICRepository;
import com.watchrabbit.crawler.manager.repository.AddressRepository;
import com.watchrabbit.crawler.manager.util.InternetAddress;
import java.util.Date;
import java.util.List;
import static java.util.stream.Collectors.toList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;

/**
 * {@link ImportancePolicy} that tracks importance through per-address
 * {@link AddressOPIC} records, each of which accumulates "cash".
 * <p>
 * When a crawl result is processed, the cash currently held by the crawled
 * address is split equally between the links found by the crawl (after the
 * optional {@link LinkFilter} and the blacklist check). The crawled address's
 * own cash is then reset and topped up with the importance factor reported by
 * the crawl.
 * <p>
 * NOTE(review): the naming suggests the OPIC (On-line Page Importance
 * Computation) algorithm - confirm against the project documentation.
 *
 * @author Mariusz
 */
public class OPICImportancePolicy implements ImportancePolicy {

    private static final Logger LOGGER = LoggerFactory.getLogger(OPICImportancePolicy.class);

    /** Passed to {@link AddressOPIC#resetCash} after each processed crawl; defaults to 10. */
    @Value("${crawler.manager.opicHistoricalResults:10}")
    int opicHistoricalResults;

    @Autowired
    AddressRepository addressRepository;

    @Autowired
    RevisitPolicy revisitPolicy;

    @Autowired
    CleanupPolicy cleanupPolicy;

    @Autowired
    AddressOPICRepository addressOPICRepository;

    /** Optional extra filter for discovered links; {@code null} when no such bean is available. */
    @Autowired(required = false)
    LinkFilter linkFilter;

    /**
     * Distributes the cash of the crawled address among the links it produced
     * and then resets the crawled address's own record.
     * <p>
     * Links are first passed through {@link #linkFilter} (when present) and then
     * stripped of blacklisted entries. If no link survives, nothing is
     * distributed, but the crawled address is still reset and credited with the
     * crawl's importance factor.
     *
     * @param crawlResult result of a finished crawl; supplies the address id,
     *                    the discovered links and the importance factor
     */
    @Override
    public void processCrawlResult(CrawlResult crawlResult) {
        AddressOPIC addressOPIC = addressOPICRepository.find(crawlResult.getId());
        if (addressOPIC == null) {
            addressOPIC = new AddressOPIC.Builder()
                    .withId(crawlResult.getId())
                    .build();
        }
        // Read the cash before it is reset below: this is the amount to hand out.
        double cash = addressOPIC.getCash();

        List<LinkDto> links = crawlResult.getLinks();
        if (linkFilter != null) {
            links = linkFilter.filterLinks(links);
        }
        links = links.stream()
                .filter(link -> !cleanupPolicy.isOnBlacklist(link))
                .collect(toList());

        // Guard against dividing by zero when every link was filtered out.
        if (!links.isEmpty()) {
            double change = cash / links.size();
            links.forEach(link -> distribute(link, change));
        }

        addressOPIC.resetCash(opicHistoricalResults);
        addressOPIC.addCash(crawlResult.getImportanceFactor());
        addressOPICRepository.save(addressOPIC);
    }

    /**
     * Returns the importance stored for the given address id. When no record
     * exists yet, a fresh one is created and persisted, and its importance is
     * returned.
     *
     * @param id address identifier
     * @return importance reported by the (possibly newly created) record
     */
    @Override
    public double getImportance(String id) {
        AddressOPIC addressOPIC = addressOPICRepository.find(id);
        if (addressOPIC == null) {
            LOGGER.debug("Cannot find opic metrics for {}. Creating new", id);
            addressOPIC = new AddressOPIC.Builder()
                    .withId(id)
                    .build();
            addressOPICRepository.save(addressOPIC);
        }
        return addressOPIC.getImportance();
    }

    /**
     * Credits {@code change} cash to the address identified by the link's URL
     * and keyword, creating and saving the address (and its cash record) first
     * if it is not known yet.
     *
     * @param link   discovered link whose target receives the cash
     * @param change amount of cash to add to the target
     */
    private void distribute(LinkDto link, double change) {
        Address address = addressRepository.findByUrlAndKeyword(link.getUrl(), link.getKeyword());
        if (address == null) {
            LOGGER.debug("Creating new address {} with keyword {}", link.getUrl(), link.getKeyword());
            // The first visit of a new address is scheduled from the cash it is about to receive.
            Date nextExecutionDate = revisitPolicy.getNextExecutionDate(change);
            address = new Address.Builder()
                    .withNextExecutionDate(nextExecutionDate)
                    .withDomainName(InternetAddress.getDomainName(link.getUrl()))
                    .withUrl(link.getUrl())
                    .withKeyword(link.getKeyword())
                    .build();
            addressRepository.save(address);
        }
        AddressOPIC addressOPIC = addressOPICRepository.find(address.getId());
        if (addressOPIC == null) {
            addressOPIC = new AddressOPIC.Builder()
                    .withId(address.getId())
                    .build();
        }
        addressOPIC.addCash(change);
        addressOPICRepository.save(addressOPIC);
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy