All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.codelibs.fess.crawler.service.impl.EsUrlQueueService Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2012-2024 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.codelibs.fess.crawler.service.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.stream.Collectors;

import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import javax.annotation.Resource;

import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.crawler.Constants;
import org.codelibs.fess.crawler.entity.AccessResult;
import org.codelibs.fess.crawler.entity.EsUrlQueue;
import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.exception.EsAccessException;
import org.codelibs.fess.crawler.service.UrlQueueService;
import org.codelibs.fess.crawler.util.EsCrawlerConfig;
import org.opensearch.action.DocWriteRequest.OpType;
import org.opensearch.action.bulk.BulkRequestBuilder;
import org.opensearch.action.bulk.BulkResponse;
import org.opensearch.action.search.SearchResponse;
import org.opensearch.action.support.WriteRequest.RefreshPolicy;
import org.opensearch.action.update.UpdateRequestBuilder;
import org.opensearch.common.unit.TimeValue;
import org.opensearch.index.query.QueryBuilders;
import org.opensearch.search.SearchHit;
import org.opensearch.search.SearchHits;
import org.opensearch.search.sort.SortBuilders;
import org.opensearch.search.sort.SortOrder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class EsUrlQueueService extends AbstractCrawlerService implements UrlQueueService {
    private static final Logger logger = LoggerFactory.getLogger(EsUrlQueueService.class);

    @Resource
    protected EsDataService dataService;

    protected Map sessionCache = new ConcurrentHashMap<>();

    protected int pollingFetchSize = 1000;

    protected int maxCrawlingQueueSize = 100;

    public EsUrlQueueService(final EsCrawlerConfig crawlerConfig) {
        this.index = crawlerConfig.getQueueIndex();
        setNumberOfShards(crawlerConfig.getQueueShards());
        setNumberOfReplicas(crawlerConfig.getQueueReplicas());
    }

    public EsUrlQueueService(final String name, final String type) {
        this.index = name + "." + type;
    }

    @PostConstruct
    public void init() {
        fesenClient.addOnConnectListener(() -> createMapping("queue"));
    }

    @PreDestroy
    public void destroy() {
        sessionCache.entrySet().stream().map(e -> e.getValue().waitingQueue).forEach(q -> q.forEach(urlQueue -> {
            try {
                insert(urlQueue);
            } catch (final Exception e) {
                logger.warn("Failed to restore " + urlQueue, e);
            }
        }));
    }

    public void clearCache() {
        sessionCache.clear();
    }

    @Override
    public void updateSessionId(final String oldSessionId, final String newSessionId) {
        SearchResponse response = getClient().get(c -> c.prepareSearch(index).setScroll(new TimeValue(scrollTimeout))
                .setQuery(QueryBuilders.boolQuery().filter(QueryBuilders.termQuery(SESSION_ID, oldSessionId))).setSize(scrollSize)
                .execute());
        String scrollId = response.getScrollId();
        try {
            while (scrollId != null) {
                final SearchHits searchHits = response.getHits();
                if (searchHits.getHits().length == 0) {
                    break;
                }

                final BulkResponse bulkResponse = getClient().get(c -> {
                    final BulkRequestBuilder builder = c.prepareBulk();
                    for (final SearchHit searchHit : searchHits) {
                        final UpdateRequestBuilder updateRequest =
                                c.prepareUpdate().setIndex(index).setId(searchHit.getId()).setDoc(SESSION_ID, newSessionId);
                        builder.add(updateRequest);
                    }

                    return builder.execute();
                });
                if (bulkResponse.hasFailures()) {
                    throw new EsAccessException(bulkResponse.buildFailureMessage());
                }

                final String sid = scrollId;
                response = getClient().get(c -> c.prepareSearchScroll(sid).setScroll(new TimeValue(scrollTimeout)).execute());
                if (!scrollId.equals(response.getScrollId())) {
                    getClient().clearScroll(scrollId);
                }
                scrollId = response.getScrollId();
            }
        } finally {
            getClient().clearScroll(scrollId);
        }
    }

    @Override
    public void add(final String sessionId, final String url) {
        if (exists(sessionId, url)) {
            return;
        }
        final EsUrlQueue urlQueue = new EsUrlQueue();
        urlQueue.setSessionId(sessionId);
        urlQueue.setUrl(url);
        urlQueue.setCreateTime(System.currentTimeMillis());
        urlQueue.setLastModified(0L);
        urlQueue.setDepth(0);
        urlQueue.setMethod(Constants.GET_METHOD);
        insert(urlQueue);
    }

    @Override
    public void insert(final EsUrlQueue urlQueue) {
        try {
            super.insert(urlQueue, urlQueue.getId() == null ? OpType.CREATE : OpType.INDEX);
        } catch (final EsAccessException e) {
            final Throwable cause = e.getCause();
            if (cause != null && "VersionConflictEngineException".equals(cause.getClass().getSimpleName())) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Failed to insert {}", urlQueue, e);
                }
                return;
            }
            throw e;
        }
    }

    @Override
    public void delete(final String sessionId) {
        deleteBySessionId(sessionId);
    }

    @Override
    public void offerAll(final String sessionId, final List urlQueueList) {
        if (logger.isDebugEnabled()) {
            logger.debug("Offering URL: Session ID: {}, UrlQueue: {}", sessionId, urlQueueList);
        }
        final List> targetList = new ArrayList<>(urlQueueList.size());
        for (final UrlQueue urlQueue : urlQueueList) {
            if (!exists(sessionId, urlQueue.getUrl()) && !dataService.exists(sessionId, urlQueue.getUrl())) {
                urlQueue.setSessionId(sessionId);
                targetList.add(urlQueue);
            } else if (logger.isDebugEnabled()) {
                logger.debug("Existed URL: Session ID: {}, UrlQueue: {}", sessionId, urlQueue);
            }
        }
        if (!targetList.isEmpty()) {
            insertAll(targetList.stream()
                    .filter(urlQueue -> StringUtil.isNotBlank(urlQueue.getSessionId()) && StringUtil.isNotBlank(urlQueue.getUrl()))
                    .collect(Collectors.toList()), OpType.CREATE, true);
            if (logger.isDebugEnabled()) {
                logger.debug("Offered URL: Session ID: {}, UrlQueue: {}", sessionId, targetList);
            }
        }
    }

    @Override
    public EsUrlQueue poll(final String sessionId) {
        final QueueHolder queueHolder = getQueueHolder(sessionId);
        final Queue waitingQueue = queueHolder.waitingQueue;
        final Queue crawlingQueue = queueHolder.crawlingQueue;
        EsUrlQueue urlQueue = waitingQueue.poll();
        if (urlQueue != null) {
            if (crawlingQueue.size() > maxCrawlingQueueSize) {
                crawlingQueue.poll();
            }
            crawlingQueue.add(urlQueue);
            return urlQueue;
        }

        synchronized (queueHolder) {
            urlQueue = waitingQueue.poll();
            if (urlQueue == null) {
                final List urlQueueList = getList(EsUrlQueue.class, sessionId, null, 0, pollingFetchSize,
                        SortBuilders.fieldSort(CREATE_TIME).order(SortOrder.ASC));
                if (urlQueueList.isEmpty()) {
                    return null;
                }
                if (logger.isDebugEnabled()) {
                    logger.debug("Queued URL: {}", urlQueueList);
                }

                if (!urlQueueList.isEmpty()) {
                    try {
                        // delete from es
                        final BulkResponse response = getClient().get(c -> {
                            final BulkRequestBuilder bulkBuilder = c.prepareBulk();
                            for (final EsUrlQueue uq : urlQueueList) {
                                bulkBuilder.add(c.prepareDelete().setIndex(index).setId(uq.getId()));
                            }

                            return bulkBuilder.setRefreshPolicy(RefreshPolicy.IMMEDIATE).execute();
                        });
                        if (response.hasFailures()) {
                            logger.warn(response.buildFailureMessage());
                        }
                    } catch (final Exception e) {
                        throw new EsAccessException("Failed to delete " + urlQueueList, e);
                    }
                }

                waitingQueue.addAll(urlQueueList);

                urlQueue = waitingQueue.poll();
                if (urlQueue == null) {
                    return null;
                }
            }

        }

        if (crawlingQueue.size() > maxCrawlingQueueSize) {
            crawlingQueue.poll();
        }
        crawlingQueue.add(urlQueue);
        return urlQueue;
    }

    @Override
    public void saveSession(final String sessionId) {
        // TODO use cache
    }

    @Override
    public boolean visited(final EsUrlQueue urlQueue) {
        final String url = urlQueue.getUrl();
        if (StringUtil.isBlank(url)) {
            if (logger.isDebugEnabled()) {
                logger.debug("URL is a blank: {}", url);
            }
            return false;
        }

        final String sessionId = urlQueue.getSessionId();
        if (super.exists(sessionId, url)) {
            return true;
        }

        final AccessResult accessResult = dataService.getAccessResult(sessionId, url);
        return accessResult != null;
    }

    @Override
    protected boolean exists(final String sessionId, final String url) {
        final boolean ret = super.exists(sessionId, url);
        if (!ret) {
            final QueueHolder queueHolder = getQueueHolder(sessionId);
            final Queue waitingQueue = queueHolder.waitingQueue;
            final Queue crawlingQueue = queueHolder.crawlingQueue;

            for (final UrlQueue urlQueue : crawlingQueue) {
                if (sessionId.equals(urlQueue.getSessionId()) && url.equals(urlQueue.getUrl())) {
                    return true;
                }
            }
            for (final UrlQueue urlQueue : waitingQueue) {
                if (sessionId.equals(urlQueue.getSessionId()) && url.equals(urlQueue.getUrl())) {
                    return true;
                }
            }
        }
        return ret;
    }

    @Override
    public void generateUrlQueues(final String previousSessionId, final String sessionId) {
        dataService.iterate(previousSessionId, accessResult -> {
            final EsUrlQueue urlQueue = new EsUrlQueue();
            urlQueue.setSessionId(sessionId);
            urlQueue.setMethod(accessResult.getMethod());
            urlQueue.setUrl(accessResult.getUrl());
            urlQueue.setParentUrl(accessResult.getParentUrl());
            urlQueue.setDepth(0);
            urlQueue.setLastModified(accessResult.getLastModified());
            urlQueue.setCreateTime(System.currentTimeMillis());
            insert(urlQueue);
        });
    }

    protected QueueHolder getQueueHolder(final String sessionId) {
        QueueHolder queueHolder = sessionCache.get(sessionId);
        if (queueHolder == null) {
            queueHolder = new QueueHolder();
            final QueueHolder prevQueueHolder = sessionCache.putIfAbsent(sessionId, queueHolder);
            return prevQueueHolder == null ? queueHolder : prevQueueHolder;
        }
        return queueHolder;
    }

    protected static class QueueHolder {
        protected Queue waitingQueue = new ConcurrentLinkedQueue<>();

        protected Queue crawlingQueue = new ConcurrentLinkedQueue<>();
    }

    public void setPollingFetchSize(final int pollingFetchSize) {
        this.pollingFetchSize = pollingFetchSize;
    }

    public void setMaxCrawlingQueueSize(final int maxCrawlingQueueSize) {
        this.maxCrawlingQueueSize = maxCrawlingQueueSize;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy