/*
 * Copyright 2012-2024 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.codelibs.fess.ds.git;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.io.output.DeferredFileOutputStream;
import org.codelibs.core.io.CopyUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.misc.Pair;
import org.codelibs.core.stream.StreamUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.app.service.FailureUrlService;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
import org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.filter.UrlFilter;
import org.codelibs.fess.crawler.helper.MimeTypeHelper;
import org.codelibs.fess.ds.AbstractDataStore;
import org.codelibs.fess.ds.callback.IndexUpdateCallback;
import org.codelibs.fess.entity.DataStoreParams;
import org.codelibs.fess.es.config.exbhv.DataConfigBhv;
import org.codelibs.fess.es.config.exentity.DataConfig;
import org.codelibs.fess.exception.DataStoreCrawlingException;
import org.codelibs.fess.exception.DataStoreException;
import org.codelibs.fess.helper.CrawlerStatsHelper;
import org.codelibs.fess.helper.CrawlerStatsHelper.StatsAction;
import org.codelibs.fess.helper.CrawlerStatsHelper.StatsKeyObject;
import org.codelibs.fess.util.ComponentUtil;
import org.eclipse.jgit.api.Git;
import org.eclipse.jgit.api.errors.GitAPIException;
import org.eclipse.jgit.diff.DiffEntry;
import org.eclipse.jgit.diff.DiffFormatter;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.ObjectStream;
import org.eclipse.jgit.lib.Ref;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.revwalk.RevCommit;
import org.eclipse.jgit.storage.file.FileRepositoryBuilder;
import org.eclipse.jgit.transport.CredentialsProvider;
import org.eclipse.jgit.transport.FetchResult;
import org.eclipse.jgit.transport.RefSpec;
import org.eclipse.jgit.transport.UsernamePasswordCredentialsProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

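/**
 * Fess data store that crawls the contents of a Git repository.
 * <p>
 * The remote repository named by {@code uri} is fetched into a local
 * repository (a temporary one unless {@code repository_path} is set), the
 * range from {@code prev_commit_id} to {@code commit_id} is diffed, and each
 * added or modified file is extracted and indexed while deleted files are
 * removed from the index. When {@code prev_commit_id} is present, it is
 * advanced to the crawled commit afterwards, making later runs incremental.
 * <p>
 * An illustrative parameter set (the values below are hypothetical):
 * <pre>
 * uri=https://github.com/codelibs/fess.git
 * base_url=https://github.com/codelibs/fess/blob/master/
 * include_pattern=.*\.md
 * </pre>
 */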
public class GitDataStore extends AbstractDataStore {

    private static final Logger logger = LoggerFactory.getLogger(GitDataStore.class);

    protected static final String PASSWORD = "password";

    protected static final String USERNAME = "username";

    protected static final String COMMIT_ID = "commit_id";

    protected static final String REF_SPECS = "ref_specs";

    protected static final String DEFAULT_EXTRACTOR = "default_extractor";

    protected static final String CACHE_THRESHOLD = "cache_threshold";

    protected static final String EXTRACTORS = "extractors";

    protected static final String READ_INTERVAL = "read_interval";

    protected static final String TREE_WALK = "tree_walk";

    protected static final String REV_COMMIT = "rev_commit";

    protected static final String REPOSITORY = "repository";

    protected static final String URI = "uri";

    protected static final String BASE_URL = "base_url";

    protected static final String DIFF_ENTRY = "diff_entry";

    protected static final String GIT = "git";

    protected static final String CURRENT_COMMIT_ID = "current_commit_id";

    protected static final String PREV_COMMIT_ID = "prev_commit_id";

    protected static final String TEMP_REPOSITORY_PATH = "temp_repository_path";

    protected static final String REPOSITORY_PATH = "repository_path";

    protected static final String MAX_SIZE = "max_size";

    protected static final String INCLUDE_PATTERN = "include_pattern";

    protected static final String EXCLUDE_PATTERN = "exclude_pattern";

    protected static final String URL_FILTER = "url_filter";

    @Override
    protected String getName() {
        return this.getClass().getSimpleName();
    }

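    /**
     * Fetches {@code uri} into the local repository, scans the diff from
     * {@code prev_commit_id} (every file at {@code commit_id} when it is not
     * set) to {@code commit_id}, and dispatches each entry to
     * {@link #processFile} or {@link #deleteDocument} according to its change
     * type. The repository, and its temporary directory if one was created,
     * are cleaned up in the {@code finally} block.
     */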
    @Override
    protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final DataStoreParams paramMap,
            final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
        final String uri = paramMap.getAsString(URI);
        if (StringUtil.isBlank(uri)) {
            throw new DataStoreException("uri is required.");
        }
        final String refSpec = paramMap.getAsString(REF_SPECS, "+refs/heads/*:refs/heads/*");
        final String commitId = paramMap.getAsString(COMMIT_ID, org.eclipse.jgit.lib.Constants.HEAD);
        final String username = paramMap.getAsString(USERNAME);
        final String password = paramMap.getAsString(PASSWORD);
        final String prevCommit = paramMap.getAsString(PREV_COMMIT_ID);
        final boolean isUpdateCommitId = prevCommit != null;
        final String baseUrl = paramMap.getAsString(BASE_URL);
        CredentialsProvider credentialsProvider = null;
        if (username != null && password != null) {
            credentialsProvider = new UsernamePasswordCredentialsProvider(username, password);
        }

        final Map<String, Object> configMap = createConfigMap(paramMap);
        configMap.put(URI, uri);

        final UrlFilter urlFilter = getUrlFilter(paramMap);

        logger.info("Git: {}", uri);

        final Repository repository = (Repository) configMap.get(REPOSITORY);
        try (final Git git = new Git(repository)) {
            configMap.put(GIT, git);
            final FetchResult fetchResult = git.fetch().setForceUpdate(true).setRemote(uri).setRefSpecs(new RefSpec(refSpec))
                    .setInitialBranch(commitId).setCredentialsProvider(credentialsProvider).call();
            if (logger.isDebugEnabled()) {
                logger.debug("Fetch Result: {}", fetchResult.getMessages());
            }
            if (!hasCommitLogs(configMap)) {
                final Ref ref = git.checkout().setName(commitId).call();
                if (logger.isDebugEnabled()) {
                    logger.debug("Checked out {}", ref.getName());
                }
            }
            final ObjectId fromCommitId;
            if (StringUtil.isNotBlank(prevCommit)) {
                fromCommitId = repository.resolve(prevCommit);
            } else {
                fromCommitId = null;
            }
            final ObjectId toCommitId = repository.resolve(commitId);
            configMap.put(CURRENT_COMMIT_ID, toCommitId);
            try (DiffFormatter diffFormatter = new DiffFormatter(null)) {
                diffFormatter.setRepository(repository);
                logger.info("Rev: {} -> {}", fromCommitId, toCommitId);
                diffFormatter.scan(fromCommitId, toCommitId).forEach(entry -> {
                    final String path = entry.getNewPath();
                    if (urlFilter != null && !urlFilter.match(path)) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Not matched: {}", path);
                        }
                        return;
                    }
                    configMap.put(DIFF_ENTRY, entry);
                    switch (entry.getChangeType()) {
                    case ADD, MODIFY:
                        processFile(dataConfig, callback, paramMap, scriptMap, defaultDataMap, configMap);
                        break;
                    case DELETE:
                        if (StringUtil.isNotBlank(baseUrl)) {
                            deleteDocument(paramMap, configMap);
                        }
                        break;
                    case RENAME:
                        if (StringUtil.isNotBlank(baseUrl)) {
                            deleteDocument(paramMap, configMap);
                        }
                        processFile(dataConfig, callback, paramMap, scriptMap, defaultDataMap, configMap);
                        break;
                    default:
                        break;
                    }
                });
            }
            if (isUpdateCommitId) {
                updateDataConfig(dataConfig, toCommitId);
            }
        } catch (final Exception e) {
            throw new DataStoreException(e);
        } finally {
            try {
                repository.close();
            } finally {
                final File gitRepoPath = (File) configMap.get(TEMP_REPOSITORY_PATH);
                if (gitRepoPath != null) {
                    try (Stream<Path> walk = Files.walk(gitRepoPath.toPath())) {
                        walk.sorted(Comparator.reverseOrder()).map(Path::toFile).forEach(File::delete);
                    } catch (final IOException e) {
                        logger.warn("Failed to delete " + gitRepoPath.getAbsolutePath(), e);
                    }
                }
            }
        }
    }

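    /**
     * Deletes the indexed document whose URL corresponds to the old path of
     * the current {@link DiffEntry}, using {@code base_url} to build the URL.
     * Failures are logged and otherwise ignored.
     */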
    protected void deleteDocument(final DataStoreParams paramMap, final Map<String, Object> configMap) {
        final DiffEntry entry = (DiffEntry) configMap.get(DIFF_ENTRY);
        try {
            final String url = getUrl(paramMap, entry.getOldPath());
            ComponentUtil.getIndexingHelper().deleteDocumentByUrl(ComponentUtil.getSearchEngineClient(), url);
        } catch (final Exception e) {
            logger.warn("Failed to delete the document {}.", entry, e);
        }
    }

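    /**
     * Rewrites the {@code prev_commit_id} handler parameter of the data
     * config to the commit that was just crawled and persists the config, so
     * that the next run only covers commits made since then.
     */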
    protected void updateDataConfig(final DataConfig dataConfig, final ObjectId toCommitId) {
        final String paramStr = dataConfig.getHandlerParameterMap().entrySet().stream().map(e -> {
            if (PREV_COMMIT_ID.equals(e.getKey())) {
                return e.getKey() + "=" + toCommitId.name();
            }
            return e.getKey() + "=" + e.getValue();
        }).collect(Collectors.joining("\n"));
        dataConfig.setHandlerParameter(paramStr);
        if (logger.isDebugEnabled()) {
            logger.debug("Updating data config with {}.", paramStr);
        }
        ComponentUtil.getComponent(DataConfigBhv.class).update(dataConfig);
        logger.info("Updated DataConfig: {}", dataConfig.getId());
    }

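    /**
     * Returns the last segment of the given slash-separated path, or the path
     * itself if it contains no slash.
     */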
    protected String getFileName(final String path) {
        final int pos = path.lastIndexOf('/');
        if (pos == -1) {
            return path;
        }
        return path.substring(pos + 1);
    }

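    /**
     * Indexes a single added or modified file: loads the blob for the current
     * {@link DiffEntry}, rejects it if it exceeds {@code max_size}, buffers it
     * via a {@link DeferredFileOutputStream}, detects its MIME type, extracts
     * its text, evaluates the script map, and passes the resulting document to
     * the callback. Crawling failures are recorded through
     * {@link FailureUrlService} and the crawler stats helper rather than
     * aborting the whole crawl.
     */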
    protected void processFile(final DataConfig dataConfig, final IndexUpdateCallback callback, final DataStoreParams paramMap,
            final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final Map<String, Object> configMap) {
        final CrawlerStatsHelper crawlerStatsHelper = ComponentUtil.getCrawlerStatsHelper();
        final Map<String, Object> dataMap = new HashMap<>(defaultDataMap);
        final String uri = (String) configMap.get(URI);
        final DiffEntry diffEntry = (DiffEntry) configMap.get(DIFF_ENTRY);
        final String path = diffEntry.getNewPath();
        final StatsKeyObject statsKey = new StatsKeyObject(uri);
        paramMap.put(Constants.CRAWLER_STATS_KEY, statsKey);
        try {
            crawlerStatsHelper.begin(statsKey);
            final RevCommit revCommit = getRevCommit(configMap, path);

            final String name = getFileName(path);
            logger.info("Crawling Path: {}", path);

            final Map<String, Object> resultMap = new LinkedHashMap<>(paramMap.asMap());
            final Repository repository = (Repository) configMap.get(REPOSITORY);
            final ObjectLoader objectLoader = repository.open(diffEntry.getNewId().toObjectId());
            final long size = objectLoader.getSize();
            if (size > ((Long) configMap.get(MAX_SIZE)).longValue()) {
                throw new MaxLengthExceededException(
                        "The content length (" + size + " bytes) exceeds " + configMap.get(MAX_SIZE) + " bytes. The path is " + path);
            }
            resultMap.put("contentLength", size);
            DeferredFileOutputStream dfos = null;
            try (ObjectStream in = objectLoader.openStream();
                    DeferredFileOutputStream out =
                            new DeferredFileOutputStream((Integer) configMap.get(CACHE_THRESHOLD), "fess-ds-git-", ".out", null)) {
                dfos = out;
                CopyUtil.copy(in, out);
                out.flush();

                final String mimeType = getMimeType(name, out);
                resultMap.put("mimetype", mimeType);
                final Extractor extractor = getExtractor(mimeType, configMap);

                final Map<String, String> params = new HashMap<>();
                params.put(ExtractData.RESOURCE_NAME_KEY, name);
                try (InputStream is = getContentInputStream(out)) {
                    String content = extractor.getText(is, params).getContent();
                    if (content == null) {
                        content = StringUtil.EMPTY;
                    }
                    resultMap.put("content", content);
                } catch (final Exception e) {
                    if (!ComponentUtil.getFessConfig().isCrawlerIgnoreContentException()) {
                        throw e;
                    }
                    if (logger.isDebugEnabled()) {
                        logger.warn("Could not get text from {}.", uri, e);
                    } else {
                        logger.warn("Could not get text from {}. {}", uri, e.getMessage());
                    }
                }

                resultMap.put("url", getUrl(paramMap, path));
                resultMap.put("uri", uri);
                resultMap.put("path", path);
                resultMap.put("name", name);
                resultMap.put("crawlingConfig", dataConfig);
                resultMap.put("author", revCommit.getAuthorIdent());
                resultMap.put("committer", revCommit.getCommitterIdent());
                resultMap.put("timestamp", new Date(revCommit.getCommitTime() * 1000L));

                crawlerStatsHelper.record(statsKey, StatsAction.PREPARED);

                if (logger.isDebugEnabled()) {
                    logger.debug("resultMap: {}", resultMap);
                }

                final String scriptType = getScriptType(paramMap);
                for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                    final Object convertValue = convertValue(scriptType, entry.getValue(), resultMap);
                    if (convertValue != null) {
                        dataMap.put(entry.getKey(), convertValue);
                    }
                }

                crawlerStatsHelper.record(statsKey, StatsAction.EVALUATED);

                if (logger.isDebugEnabled()) {
                    logger.debug("dataMap: {}", dataMap);
                }

                if (dataMap.get("url") instanceof String statsUrl) {
                    statsKey.setUrl(statsUrl);
                }

                callback.store(paramMap, dataMap);
                crawlerStatsHelper.record(statsKey, StatsAction.FINISHED);
            } finally {
                if (dfos != null && !dfos.isInMemory()) {
                    final File file = dfos.getFile();
                    if (!file.delete()) {
                        logger.warn("Failed to delete {}.", file.getAbsolutePath());
                    }
                }
            }
        } catch (final CrawlingAccessException e) {
            logger.warn("Crawling Access Exception at: " + dataMap, e);

            Throwable target = e;
            if (target instanceof MultipleCrawlingAccessException) {
                final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                if (causes.length > 0) {
                    target = causes[causes.length - 1];
                }
            }

            String errorName;
            final Throwable cause = target.getCause();
            if (cause != null) {
                errorName = cause.getClass().getCanonicalName();
            } else {
                errorName = target.getClass().getCanonicalName();
            }

            String url;
            if (target instanceof DataStoreCrawlingException dce) {
                url = dce.getUrl();
                if (dce.aborted()) {
                    throw e;
                }
            } else {
                url = uri + ":" + path;
            }
            final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
            failureUrlService.store(dataConfig, errorName, url, target);
            crawlerStatsHelper.record(statsKey, StatsAction.ACCESS_EXCEPTION);
        } catch (final Throwable t) {
            logger.warn("Crawling Exception at: " + dataMap, t);
            final String url = uri + ":" + path;
            final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
            failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);

            final long readInterval = (Long) configMap.get(READ_INTERVAL);
            if (readInterval > 0) {
                sleep(readInterval);
            }
            crawlerStatsHelper.record(statsKey, StatsAction.EXCEPTION);
        } finally {
            crawlerStatsHelper.done(statsKey);
        }
    }

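    /**
     * Returns {@code true} if {@code git log} succeeds on the local
     * repository, i.e. the checked-out branch already has commits; used to
     * decide whether a checkout of {@code commit_id} is still needed after
     * fetching.
     */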
    protected boolean hasCommitLogs(final Map<String, Object> configMap) {
        final Git git = (Git) configMap.get(GIT);
        try {
            git.log().call();
            return true;
        } catch (final Exception e) {
            if (logger.isDebugEnabled()) {
                logger.debug("Could not find commit logs.", e);
            }
            return false;
        }
    }

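    /**
     * Returns the most recent commit that touched the given path, which
     * supplies the author, committer, and timestamp fields of the indexed
     * document.
     */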
    protected RevCommit getRevCommit(final Map<String, Object> configMap, final String path) throws GitAPIException {
        final Git git = (Git) configMap.get(GIT);
        final Iterator<RevCommit> revCommitIter = git.log().addPath(path).setMaxCount(1).call().iterator();
        if (!revCommitIter.hasNext()) {
            throw new DataStoreException("Failed to parse git log for " + path);
        }
        return revCommitIter.next();
    }

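    /**
     * Builds the document URL by prefixing the path with the {@code base_url}
     * parameter; returns an empty string when no base URL is configured.
     */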
    protected String getUrl(final DataStoreParams paramMap, final String path) {
        final String baseUrl = paramMap.getAsString(BASE_URL);
        if (StringUtil.isNotBlank(baseUrl)) {
            return baseUrl + path;
        }
        return StringUtil.EMPTY;
    }

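    /**
     * Builds the per-crawl configuration map from the data store parameters:
     * extractor mappings ({@code extractors}, entries of the form
     * {@code mime-type-regex:extractorName}), cache threshold, default
     * extractor, read interval, and maximum content size. It also opens the
     * local repository, creating a temporary one (recorded under
     * {@code temp_repository_path} for later cleanup) when
     * {@code repository_path} is not given.
     */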
    protected Map<String, Object> createConfigMap(final DataStoreParams paramMap) {
        final Map<String, Object> configMap = new HashMap<>();
        @SuppressWarnings("unchecked")
        final Pair<Pattern, String>[] extractors = StreamUtil.split(paramMap.getAsString(EXTRACTORS), ",").get(stream -> stream.map(s -> {
            final String[] values = s.split(":");
            if (values.length != 2) {
                return null;
            }
            return new Pair<>(Pattern.compile(values[0]), values[1]);
        }).filter(Objects::nonNull).toArray(n -> new Pair[n]));
        configMap.put(EXTRACTORS, extractors);
        configMap.put(BASE_URL, paramMap.getAsString(BASE_URL, StringUtil.EMPTY));
        configMap.put(CACHE_THRESHOLD, Integer.parseInt(paramMap.getAsString(CACHE_THRESHOLD, "1000000")));
        configMap.put(DEFAULT_EXTRACTOR, paramMap.getAsString(DEFAULT_EXTRACTOR, "tikaExtractor"));
        configMap.put(READ_INTERVAL, getReadInterval(paramMap));
        final String maxSize = paramMap.getAsString(MAX_SIZE);
        configMap.put(MAX_SIZE, StringUtil.isNotBlank(maxSize) ? Long.parseLong(maxSize) : 10000000L);

        final String repositoryPath = paramMap.getAsString(REPOSITORY_PATH);
        if (StringUtil.isBlank(repositoryPath)) {
            try {
                final File gitRepoPath = File.createTempFile("fess-ds-git-", "");
                if (!gitRepoPath.delete()) {
                    throw new DataStoreException("Could not delete temporary file " + gitRepoPath);
                }
                gitRepoPath.mkdirs();
                final Repository repository = FileRepositoryBuilder.create(new File(gitRepoPath, ".git"));
                repository.create();
                configMap.put(REPOSITORY, repository);
                configMap.put(TEMP_REPOSITORY_PATH, gitRepoPath);
            } catch (final IOException e) {
                throw new DataStoreException("Failed to create a repository.", e);
            }
        } else {
            try {
                final File repoFile = new File(repositoryPath);
                final boolean exists = repoFile.exists();
                if (!exists) {
                    repoFile.mkdirs();
                }
                final Repository repository = FileRepositoryBuilder.create(new File(repositoryPath, ".git"));
                if (!exists) {
                    repository.create();
                }
                configMap.put(REPOSITORY, repository);
            } catch (final IOException e) {
                throw new DataStoreException("Failed to load " + repositoryPath, e);
            }
        }
        return configMap;
    }

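    /**
     * Resolves the {@link Extractor} for a MIME type: first the configured
     * pattern-to-extractor mappings, then the extractor factory, and finally
     * the component named by {@code default_extractor} (tikaExtractor unless
     * overridden).
     */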
    protected Extractor getExtractor(final String mimeType, final Map<String, Object> configMap) {
        @SuppressWarnings("unchecked")
        final Pair<Pattern, String>[] extractors = (Pair<Pattern, String>[]) configMap.get(EXTRACTORS);
        for (final Pair<Pattern, String> pair : extractors) {
            if (pair.getFirst().matcher(mimeType).matches()) {
                if (logger.isDebugEnabled()) {
                    logger.debug("use {} for {}", pair.getSecond(), mimeType);
                }
                final Extractor extractor = ComponentUtil.getComponent(pair.getSecond());
                if (extractor != null) {
                    return extractor;
                }
            }
        }
        if (logger.isDebugEnabled()) {
            logger.debug("use an extractor from the factory for {}", mimeType);
        }
        Extractor extractor = ComponentUtil.getExtractorFactory().getExtractor(mimeType);
        if (extractor == null) {
            if (logger.isDebugEnabled()) {
                logger.debug("use the default extractor ({}) for {}", configMap.get(DEFAULT_EXTRACTOR), mimeType);
            }
            extractor = ComponentUtil.getComponent((String) configMap.get(DEFAULT_EXTRACTOR));
        }
        return extractor;
    }

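    /**
     * Detects the MIME type of the buffered content from its bytes and
     * filename via the {@link MimeTypeHelper}.
     */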
    protected String getMimeType(final String filename, final DeferredFileOutputStream out) throws IOException {
        final MimeTypeHelper mimeTypeHelper = ComponentUtil.getComponent(MimeTypeHelper.class);
        try (InputStream is = getContentInputStream(out)) {
            return mimeTypeHelper.getContentType(is, filename);
        }
    }

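    /**
     * Reopens the buffered content for reading, from memory or from the
     * backing temporary file depending on where the
     * {@link DeferredFileOutputStream} spilled.
     */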
    protected InputStream getContentInputStream(final DeferredFileOutputStream out) throws IOException {
        if (out.isInMemory()) {
            return new ByteArrayInputStream(out.getData());
        }
        return new FileInputStream(out.getFile());
    }

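    /**
     * Creates a {@link UrlFilter} from the {@code include_pattern} and
     * {@code exclude_pattern} parameters; paths that do not pass the filter
     * are skipped during the diff scan.
     */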
    protected UrlFilter getUrlFilter(final DataStoreParams paramMap) {
        final UrlFilter urlFilter = ComponentUtil.getComponent(UrlFilter.class);
        final String include = paramMap.getAsString(INCLUDE_PATTERN);
        if (StringUtil.isNotBlank(include)) {
            urlFilter.addInclude(include);
        }
        final String exclude = paramMap.getAsString(EXCLUDE_PATTERN);
        if (StringUtil.isNotBlank(exclude)) {
            urlFilter.addExclude(exclude);
        }
        urlFilter.init(paramMap.getAsString(Constants.CRAWLING_INFO_ID));
        if (logger.isDebugEnabled()) {
            logger.debug("urlFilter: {}", urlFilter);
        }
        return urlFilter;
    }
}