/*
 * Copyright 2012-2024 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.codelibs.fess.ds.wikipedia;

import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.lang3.StringUtils;
import org.codelibs.fess.Constants;
import org.codelibs.fess.app.service.FailureUrlService;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException;
import org.codelibs.fess.ds.AbstractDataStore;
import org.codelibs.fess.ds.callback.IndexUpdateCallback;
import org.codelibs.fess.ds.wikipedia.exception.ParserStoppedException;
import org.codelibs.fess.ds.wikipedia.support.WikiXMLSAXParser;
import org.codelibs.fess.entity.DataStoreParams;
import org.codelibs.fess.es.config.exentity.DataConfig;
import org.codelibs.fess.exception.DataStoreCrawlingException;
import org.codelibs.fess.exception.DataStoreException;
import org.codelibs.fess.helper.CrawlerStatsHelper;
import org.codelibs.fess.helper.CrawlerStatsHelper.StatsAction;
import org.codelibs.fess.helper.CrawlerStatsHelper.StatsKeyObject;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

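/**
 * Data store implementation that crawls a Wikipedia pages-articles XML dump
 * (bzip2-compressed) and feeds each page into the Fess index update pipeline.
 *
 * <p>A minimal configuration sketch; the parameter names below are the ones
 * read in {@code storeData}, while the values are purely illustrative:</p>
 *
 * <pre>{@code
 * url=https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
 * limit=1000
 * max_digest_length=200
 * }</pre>
 */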
public class WikipediaDataStore extends AbstractDataStore {

    private static final Logger logger = LoggerFactory.getLogger(WikipediaDataStore.class);

    private static final String DEFAULT_WIKIPEDIA_URL = "http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2";

    @Override
    protected String getName() {
        return this.getClass().getSimpleName();
    }

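    /**
     * Reads the crawl settings from {@code paramMap} ({@code url},
     * {@code limit}, {@code total_entity_size_limit},
     * {@code max_digest_length}, plus the common read interval and script
     * type), then streams the dump through a SAX parser and converts each
     * page into an index document.
     */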
    @Override
    protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final DataStoreParams paramMap,
            final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
        final CrawlerStatsHelper crawlerStatsHelper = ComponentUtil.getCrawlerStatsHelper();

        final long readInterval = getReadInterval(paramMap);
        final URL wikipediaUrl = getWikipediaUrl(paramMap);
        final int limit = Integer.parseInt(paramMap.getAsString("limit", "0"));
        final int totalEntitySizeLimit = Integer.parseInt(paramMap.getAsString("total_entity_size_limit", "100000000"));
        final int maxDigestLength = Integer.parseInt(paramMap.getAsString("max_digest_length", "100"));
        final String scriptType = getScriptType(paramMap);
        logger.info("url: {}", wikipediaUrl);
        final AtomicInteger counter = new AtomicInteger();
        final WikiXMLSAXParser xmlParser = new WikiXMLSAXParser(wikipediaUrl);
        xmlParser.setTotalEntitySizeLimit(totalEntitySizeLimit);
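        // Invoked by the SAX parser for each <page> element in the dump.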
        xmlParser.setPageCallback(page -> {
            final StatsKeyObject statsKey = new StatsKeyObject(dataConfig.getId() + "#" + page.getId());
            paramMap.put(Constants.CRAWLER_STATS_KEY, statsKey);
            final Map<String, Object> dataMap = new HashMap<>(defaultDataMap);
            final Map<String, Object> resultMap = new LinkedHashMap<>();
            try {
                crawlerStatsHelper.begin(statsKey);
                resultMap.putAll(paramMap.asMap());

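                // Copy the raw page fields into the result map consumed by the field scripts.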
                final String title = stripTitle(page.getTitle());
                final String content = page.getText();
                resultMap.put("id", page.getId());
                resultMap.put("title", title);
                resultMap.put("content", content);
                resultMap.put("encodedTitle", URLEncoder.encode(title, Constants.UTF_8));
                resultMap.put("digest", StringUtils.abbreviate(content, maxDigestLength));
                resultMap.put("format", page.getFormat());
                resultMap.put("model", page.getModel());
                resultMap.put("timestamp", page.getTimestamp());

                crawlerStatsHelper.record(statsKey, StatsAction.PREPARED);

                if (logger.isDebugEnabled()) {
                    for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {
                        logger.debug("{}={}", entry.getKey(), entry.getValue());
                    }
                }

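                // Expose the in-progress document to the scripts via the crawling context.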
                final Map<String, Object> crawlingContext = new HashMap<>();
                crawlingContext.put("doc", dataMap);
                resultMap.put("crawlingContext", crawlingContext);
                for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                    final Object convertValue = convertValue(scriptType, entry.getValue(), resultMap);
                    if (convertValue != null) {
                        dataMap.put(entry.getKey(), convertValue);
                    }
                }

                crawlerStatsHelper.record(statsKey, StatsAction.EVALUATED);

                if (logger.isDebugEnabled()) {
                    for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
                        logger.debug("{}={}", entry.getKey(), entry.getValue());
                    }
                }

                if (dataMap.get("url") instanceof final String url) {
                    statsKey.setUrl(url);
                }

                callback.store(paramMap, dataMap);
                crawlerStatsHelper.record(statsKey, StatsAction.FINISHED);
            } catch (final CrawlingAccessException e) {
                logger.warn("Crawling Access Exception at : {}", dataMap, e);

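                // When multiple access exceptions are bundled, report only the last cause.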
                Throwable target = e;
                if (target instanceof final MultipleCrawlingAccessException ex) {
                    final Throwable[] causes = ex.getCauses();
                    if (causes.length > 0) {
                        target = causes[causes.length - 1];
                    }
                }

                String errorName;
                final Throwable cause = target.getCause();
                if (cause != null) {
                    errorName = cause.getClass().getCanonicalName();
                } else {
                    errorName = target.getClass().getCanonicalName();
                }

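                // An aborted crawl stops the whole parse instead of being recorded as a failure URL.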
                if (target instanceof final DataStoreCrawlingException dce && dce.aborted()) {
                    throw new ParserStoppedException(page.getId());
                }

                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(dataConfig, errorName, page.getId(), target);
                crawlerStatsHelper.record(statsKey, StatsAction.ACCESS_EXCEPTION);
            } catch (final Throwable t) {
                logger.warn("Crawling Exception at : {}", dataMap, t);
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), page.getId(), t);

                if (readInterval > 0) {
                    sleep(readInterval);
                }
                crawlerStatsHelper.record(statsKey, StatsAction.EXCEPTION);
            } finally {
                crawlerStatsHelper.done(statsKey);
            }

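            // Stop the SAX parser once the configured page limit is reached.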
            if (limit > 0 && counter.incrementAndGet() >= limit) {
                logger.info("Wikipedia crawler is stopped. ({} >= {})", counter.get(), limit);
                throw new ParserStoppedException(page.getId());
            }
        });
        try {
            xmlParser.parse();
        } catch (final ParserStoppedException e) {
            if (logger.isDebugEnabled()) {
                logger.debug("Wikipedia crawler is stopped at " + e.getMessage(), e);
            }
        }
    }

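    /**
     * Resolves the dump URL from the {@code url} parameter, falling back to
     * {@code DEFAULT_WIKIPEDIA_URL} when it is not set.
     */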
    private URL getWikipediaUrl(final DataStoreParams paramMap) {
        try {
            return new URL(paramMap.getAsString("url", DEFAULT_WIKIPEDIA_URL));
        } catch (final MalformedURLException e) {
            throw new DataStoreException("Could not parse Wikipedia URL.", e);
        }
    }

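    /**
     * Strips trailing spaces and newline characters from a page title.
     */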
    private String stripTitle(final String title) {
        final StringBuilder sb = new StringBuilder();
        sb.append(title);
        while (sb.length() > 0 && (sb.charAt(sb.length() - 1) == '\n' || (sb.charAt(sb.length() - 1) == ' '))) {
            sb.deleteCharAt(sb.length() - 1);
        }
        return sb.toString();
    }
}