All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.watchrabbit.crawler.manager.policy.DefaultCleanupPolicy Maven / Gradle / Ivy

/*
 * Copyright 2015 Mariusz.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.watchrabbit.crawler.manager.policy;

import com.watchrabbit.commons.clock.Clock;
import com.watchrabbit.commons.clock.SystemClock;
import com.watchrabbit.crawler.api.LinkDto;
import com.watchrabbit.crawler.manager.model.Address;
import com.watchrabbit.crawler.manager.repository.AddressRepository;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.springframework.beans.factory.annotation.Autowired;

/**
 *
 * @author Mariusz
 */
public class DefaultCleanupPolicy implements CleanupPolicy {

    private final Set blacklist = new HashSet<>();

    private final Map withWarning = new ConcurrentHashMap<>();

    Clock clock = SystemClock.getInstance();

    @Autowired
    AddressRepository addressRepository;

    @Override
    public void onError(Address address) {
        BlacklistElement blacklistElement = new BlacklistElement(address.getUrl(), address.getKeyword());
        if (withWarning.containsKey(blacklistElement)) {
            if (withWarning.get(blacklistElement).after(clock.getDate())) {
                withWarning.remove(blacklistElement);
                blacklist.add(blacklistElement);
            } else {
                withWarning.remove(blacklistElement);
            }
        } else {
            putOnQueueEnd(address);
            Calendar calendar = clock.getCalendar();
            calendar.add(Calendar.DAY_OF_MONTH, 1);
            withWarning.put(blacklistElement, calendar.getTime());
        }
    }

    @Override
    public boolean isOnBlacklist(LinkDto linkDto) {
        return blacklist.contains(new BlacklistElement(linkDto.getUrl(), linkDto.getKeyword()));
    }

    private void putOnQueueEnd(Address address) {
        addressRepository.findLastByNextExecutionDate()
                .ifPresent(last -> address.setNextExecutionDate(last.getNextExecutionDate()));
        addressRepository.save(address);
    }

    private class BlacklistElement {

        private final String url;

        private final String keyword;

        public BlacklistElement(String url, String keyword) {
            this.url = url;
            this.keyword = keyword;
        }

        @Override
        public int hashCode() {
            int hash = 5;
            hash = 37 * hash + Objects.hashCode(this.url);
            hash = 37 * hash + Objects.hashCode(this.keyword);
            return hash;
        }

        @Override
        public boolean equals(Object obj) {
            if (obj == null) {
                return false;
            }
            if (getClass() != obj.getClass()) {
                return false;
            }
            final BlacklistElement other = (BlacklistElement) obj;
            if (!Objects.equals(this.url, other.url)) {
                return false;
            }
            if (!Objects.equals(this.keyword, other.keyword)) {
                return false;
            }
            return true;
        }

    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy