org.codelibs.fess.ds.slack.SlackDataStore Maven / Gradle / Ivy
/*
* Copyright 2012-2024 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.ds.slack;
import static java.util.Collections.EMPTY_LIST;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.codelibs.core.exception.InterruptedRuntimeException;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.curl.CurlResponse;
import org.codelibs.fess.Constants;
import org.codelibs.fess.app.service.FailureUrlService;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
import org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.filter.UrlFilter;
import org.codelibs.fess.ds.AbstractDataStore;
import org.codelibs.fess.ds.callback.IndexUpdateCallback;
import org.codelibs.fess.ds.slack.api.type.Attachment;
import org.codelibs.fess.ds.slack.api.type.Channel;
import org.codelibs.fess.ds.slack.api.type.File;
import org.codelibs.fess.ds.slack.api.type.Message;
import org.codelibs.fess.ds.slack.api.type.Team;
import org.codelibs.fess.ds.slack.api.type.User;
import org.codelibs.fess.entity.DataStoreParams;
import org.codelibs.fess.es.config.exentity.DataConfig;
import org.codelibs.fess.exception.DataStoreCrawlingException;
import org.codelibs.fess.helper.CrawlerStatsHelper;
import org.codelibs.fess.helper.CrawlerStatsHelper.StatsAction;
import org.codelibs.fess.helper.CrawlerStatsHelper.StatsKeyObject;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class SlackDataStore extends AbstractDataStore {
private static final Logger logger = LoggerFactory.getLogger(SlackDataStore.class);
protected static final long DEFAULT_MAX_FILESIZE = 10000000L; // 10m
// parameters
protected static final String IGNORE_ERROR = "ignore_error";
protected static final String SUPPORTED_MIMETYPES = "supported_mimetypes";
protected static final String INCLUDE_PATTERN = "include_pattern";
protected static final String EXCLUDE_PATTERN = "exclude_pattern";
protected static final String URL_FILTER = "url_filter";
protected static final String NUMBER_OF_THREADS = "number_of_threads";
protected static final String MAX_FILESIZE = "max_filesize";
protected static final String FILE_CRAWL = "file_crawl";
// scripts
protected static final String MESSAGE = "message";
protected static final String MESSAGE_TITLE = "title";
protected static final String MESSAGE_TEXT = "text";
protected static final String MESSAGE_TEAM = "team";
protected static final String MESSAGE_TIMESTAMP = "timestamp";
protected static final String MESSAGE_USER = "user";
protected static final String MESSAGE_CHANNEL = "channel";
protected static final String MESSAGE_PERMALINK = "permalink";
protected static final String MESSAGE_ATTACHMENTS = "attachments";
protected String extractorName = "tikaExtractor";
@Override
protected String getName() {
return this.getClass().getSimpleName();
}
public void setExtractorName(final String extractorName) {
this.extractorName = extractorName;
}
@Override
protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final DataStoreParams paramMap,
final Map scriptMap, final Map defaultDataMap) {
final Map configMap = new HashMap<>();
configMap.put(MAX_FILESIZE, getMaxFilesize(paramMap));
configMap.put(IGNORE_ERROR, isIgnoreError(paramMap));
configMap.put(SUPPORTED_MIMETYPES, getSupportedMimeTypes(paramMap));
configMap.put(FILE_CRAWL, isFileCrawl(paramMap));
configMap.put(URL_FILTER, getUrlFilter(paramMap));
if (logger.isDebugEnabled()) {
logger.debug("configMap: {}", configMap);
}
final ExecutorService executorService = newFixedThreadPool(Integer.parseInt(paramMap.getAsString(NUMBER_OF_THREADS, "1")));
try (final SlackClient client = new SlackClient(paramMap)) {
final Team team = client.getTeam();
final boolean fileCrawl = (Boolean) configMap.get(FILE_CRAWL);
client.getChannels(channel -> {
processChannelMessages(dataConfig, callback, configMap, paramMap, scriptMap, defaultDataMap, executorService, client, team,
channel);
if (fileCrawl) {
processChannelFiles(dataConfig, callback, configMap, paramMap, scriptMap, defaultDataMap, executorService, client, team,
channel);
}
});
if (logger.isDebugEnabled()) {
logger.debug("Shutting down thread executor.");
}
executorService.shutdown();
executorService.awaitTermination(60, TimeUnit.SECONDS);
} catch (final InterruptedException e) {
throw new InterruptedRuntimeException(e);
} finally {
executorService.shutdownNow();
}
}
protected long getMaxFilesize(final DataStoreParams paramMap) {
final String value = paramMap.getAsString(MAX_FILESIZE);
try {
return StringUtil.isNotBlank(value) ? Long.parseLong(value) : DEFAULT_MAX_FILESIZE;
} catch (final NumberFormatException e) {
return DEFAULT_MAX_FILESIZE;
}
}
protected boolean isIgnoreError(final DataStoreParams paramMap) {
return Constants.TRUE.equalsIgnoreCase(paramMap.getAsString(IGNORE_ERROR, Constants.TRUE));
}
private List getSupportedMimeTypes(final DataStoreParams paramMap) {
return Arrays.stream(StringUtil.split(paramMap.getAsString(SUPPORTED_MIMETYPES, ".*"), ",")).map(String::trim)
.collect(Collectors.toList());
}
protected boolean isFileCrawl(final DataStoreParams paramMap) {
return Constants.TRUE.equalsIgnoreCase(paramMap.getAsString(FILE_CRAWL, Constants.FALSE));
}
protected UrlFilter getUrlFilter(final DataStoreParams paramMap) {
final UrlFilter urlFilter = ComponentUtil.getComponent(UrlFilter.class);
final String include = paramMap.getAsString(INCLUDE_PATTERN);
if (StringUtil.isNotBlank(include)) {
urlFilter.addInclude(include);
}
final String exclude = paramMap.getAsString(EXCLUDE_PATTERN);
if (StringUtil.isNotBlank(exclude)) {
urlFilter.addExclude(exclude);
}
urlFilter.init(paramMap.getAsString(Constants.CRAWLING_INFO_ID));
if (logger.isDebugEnabled()) {
logger.debug("urlFilter: {}", urlFilter);
}
return urlFilter;
}
protected ExecutorService newFixedThreadPool(final int nThreads) {
if (logger.isDebugEnabled()) {
logger.debug("Executor Thread Pool: {}", nThreads);
}
return new ThreadPoolExecutor(nThreads, nThreads, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue(nThreads),
new ThreadPoolExecutor.CallerRunsPolicy());
}
protected void processChannelMessages(final DataConfig dataConfig, final IndexUpdateCallback callback,
final Map configMap, final DataStoreParams paramMap, final Map scriptMap,
final Map defaultDataMap, final ExecutorService executorService, final SlackClient client, final Team team,
final Channel channel) {
client.getChannelMessages(channel.getId(), message -> {
executorService.execute(() -> {
processMessage(dataConfig, callback, configMap, paramMap, scriptMap, defaultDataMap, client, team, channel, message);
if (message.getThreadTs() != null) {
processMessageReplies(dataConfig, callback, configMap, paramMap, scriptMap, defaultDataMap, client, team, channel,
message);
}
});
});
}
protected void processChannelFiles(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map configMap,
final DataStoreParams paramMap, final Map scriptMap, final Map defaultDataMap,
final ExecutorService executorService, final SlackClient client, final Team team, final Channel channel) {
client.getChannelFiles(channel.getId(), file -> {
executorService.execute(() -> {
processFile(dataConfig, callback, configMap, paramMap, scriptMap, defaultDataMap, client, team, channel, file);
});
});
}
protected void processMessageReplies(final DataConfig dataConfig, final IndexUpdateCallback callback,
final Map configMap, final DataStoreParams paramMap, final Map scriptMap,
final Map defaultDataMap, final SlackClient client, final Team team, final Channel channel,
final Message parentMessage) {
client.getMessageReplies(channel.getId(), parentMessage.getThreadTs(), message -> {
processMessage(dataConfig, callback, configMap, paramMap, scriptMap, defaultDataMap, client, team, channel, message);
});
}
protected void processMessage(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map configMap,
final DataStoreParams paramMap, final Map scriptMap, final Map defaultDataMap,
final SlackClient client, final Team team, final Channel channel, final Message message) {
final CrawlerStatsHelper crawlerStatsHelper = ComponentUtil.getCrawlerStatsHelper();
final Map dataMap = new HashMap<>(defaultDataMap);
final String url = getMessagePermalink(client, team, channel, message);
final StatsKeyObject statsKey = new StatsKeyObject(url);
paramMap.put(Constants.CRAWLER_STATS_KEY, statsKey);
try {
crawlerStatsHelper.begin(statsKey);
final UrlFilter urlFilter = (UrlFilter) configMap.get(URL_FILTER);
if (urlFilter != null && !urlFilter.match(url)) {
if (logger.isDebugEnabled()) {
logger.debug("Not matched: {}", url);
}
crawlerStatsHelper.discard(statsKey);
return;
}
logger.info("Crawling URL: {}", url);
final Map resultMap = new LinkedHashMap<>(paramMap.asMap());
final Map messageMap = new HashMap<>();
final String messageText = getMessageText(message);
final String username = getMessageUsername(client, message);
messageMap.put(MESSAGE_TITLE, StringUtil.EMPTY);
messageMap.put(MESSAGE_TEXT, messageText);
// messageMap.put(MESSAGE_TEAM, team.getName());
messageMap.put(MESSAGE_TIMESTAMP, getMessageTimestamp(message));
messageMap.put(MESSAGE_USER, username);
messageMap.put(MESSAGE_CHANNEL, channel.getName());
messageMap.put(MESSAGE_PERMALINK, url);
messageMap.put(MESSAGE_ATTACHMENTS, getMessageAttachmentsText(message));
resultMap.put(MESSAGE, messageMap);
crawlerStatsHelper.record(statsKey, StatsAction.PREPARED);
if (logger.isDebugEnabled()) {
logger.debug("messageMap: {}", messageMap);
}
final String scriptType = getScriptType(paramMap);
for (final Map.Entry entry : scriptMap.entrySet()) {
final Object convertValue = convertValue(scriptType, entry.getValue(), resultMap);
if (convertValue != null) {
dataMap.put(entry.getKey(), convertValue);
}
}
crawlerStatsHelper.record(statsKey, StatsAction.EVALUATED);
if (logger.isDebugEnabled()) {
logger.debug("dataMap: {}", dataMap);
}
if (dataMap.get("url") instanceof String statsUrl) {
statsKey.setUrl(statsUrl);
}
callback.store(paramMap, dataMap);
crawlerStatsHelper.record(statsKey, StatsAction.FINISHED);
} catch (final CrawlingAccessException e) {
logger.warn("Crawling Access Exception at : {}", dataMap, e);
Throwable target = e;
if (target instanceof MultipleCrawlingAccessException ex) {
final Throwable[] causes = ex.getCauses();
if (causes.length > 0) {
target = causes[causes.length - 1];
}
}
String errorName;
final Throwable cause = target.getCause();
if (cause != null) {
errorName = cause.getClass().getCanonicalName();
} else {
errorName = target.getClass().getCanonicalName();
}
final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
failureUrlService.store(dataConfig, errorName, url, target);
crawlerStatsHelper.record(statsKey, StatsAction.ACCESS_EXCEPTION);
} catch (final Throwable t) {
logger.warn("Crawling Access Exception at : {}", dataMap, t);
final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);
crawlerStatsHelper.record(statsKey, StatsAction.EXCEPTION);
} finally {
crawlerStatsHelper.done(statsKey);
}
}
protected void processFile(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map configMap,
final DataStoreParams paramMap, final Map scriptMap, final Map defaultDataMap,
final SlackClient client, final Team team, final Channel channel, final File file) {
final CrawlerStatsHelper crawlerStatsHelper = ComponentUtil.getCrawlerStatsHelper();
final Map dataMap = new HashMap<>(defaultDataMap);
final String url = file.getPermalink();
final StatsKeyObject statsKey = new StatsKeyObject(url);
paramMap.put(Constants.CRAWLER_STATS_KEY, statsKey);
try {
crawlerStatsHelper.begin(statsKey);
final String mimeType = file.getMimetype();
final UrlFilter urlFilter = (UrlFilter) configMap.get(URL_FILTER);
if (urlFilter != null && !urlFilter.match(url)) {
if (logger.isDebugEnabled()) {
logger.debug("Not matched: {}", url);
}
crawlerStatsHelper.discard(statsKey);
return;
}
logger.info("Crawling URL: {}", url);
final boolean ignoreError = (Boolean) configMap.get(IGNORE_ERROR);
final Map resultMap = new LinkedHashMap<>(paramMap.asMap());
final Map fileMap = new HashMap<>();
final long maxFilesize = (Long) configMap.get(MAX_FILESIZE);
if (file.getSize() > maxFilesize) {
throw new MaxLengthExceededException(
"The content length (" + file.getSize() + " byte) is over " + maxFilesize + " byte. The url is " + url);
}
if (configMap.getOrDefault(SUPPORTED_MIMETYPES, EMPTY_LIST) instanceof List> supportedMimetypes
&& supportedMimetypes.stream().map(o -> o.toString()).noneMatch(mimeType::matches)) {
if (logger.isDebugEnabled()) {
logger.debug("{} is not an indexing target.", mimeType);
}
crawlerStatsHelper.discard(statsKey);
return;
}
final String fileContent = getFileContent(client, file, ignoreError);
fileMap.put(MESSAGE_TITLE, file.getName() + " " + file.getTitle());
fileMap.put(MESSAGE_TEXT, file.getName() + "\n" + fileContent);
// fileMap.put(MESSAGE_TEAM, team.getName());
fileMap.put(MESSAGE_TIMESTAMP, getFileTimestamp(file));
fileMap.put(MESSAGE_USER, getFileUsername(client, file));
fileMap.put(MESSAGE_CHANNEL, channel.getName());
fileMap.put(MESSAGE_PERMALINK, file.getPermalink());
fileMap.put(MESSAGE_ATTACHMENTS, "");
resultMap.put(MESSAGE, fileMap);
crawlerStatsHelper.record(statsKey, StatsAction.PREPARED);
if (logger.isDebugEnabled()) {
logger.debug("fileMap: {}", fileMap);
}
final String scriptType = getScriptType(paramMap);
for (final Map.Entry entry : scriptMap.entrySet()) {
final Object convertValue = convertValue(scriptType, entry.getValue(), resultMap);
if (convertValue != null) {
dataMap.put(entry.getKey(), convertValue);
}
}
crawlerStatsHelper.record(statsKey, StatsAction.EVALUATED);
if (logger.isDebugEnabled()) {
logger.debug("dataMap: {}", dataMap);
}
if (dataMap.get("url") instanceof String statsUrl) {
statsKey.setUrl(statsUrl);
}
callback.store(paramMap, dataMap);
crawlerStatsHelper.record(statsKey, StatsAction.FINISHED);
} catch (final CrawlingAccessException e) {
logger.warn("Crawling Access Exception at : {}", dataMap, e);
Throwable target = e;
if (target instanceof MultipleCrawlingAccessException ex) {
final Throwable[] causes = ex.getCauses();
if (causes.length > 0) {
target = causes[causes.length - 1];
}
}
String errorName;
final Throwable cause = target.getCause();
if (cause != null) {
errorName = cause.getClass().getCanonicalName();
} else {
errorName = target.getClass().getCanonicalName();
}
final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
failureUrlService.store(dataConfig, errorName, url, target);
crawlerStatsHelper.record(statsKey, StatsAction.ACCESS_EXCEPTION);
} catch (final Throwable t) {
logger.warn("Crawling Access Exception at : {}", dataMap, t);
final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);
crawlerStatsHelper.record(statsKey, StatsAction.EXCEPTION);
} finally {
crawlerStatsHelper.done(statsKey);
}
}
protected String getMessageText(final Message message) {
final String text = message.getText();
return text != null ? text : "";
}
protected Date getMessageTimestamp(final Message message) {
return new Date(Math.round(Double.parseDouble(message.getTs()) * 1000));
}
protected Date getFileTimestamp(final File file) {
return new Date(file.getTimestamp() * 1000L);
}
public String getMessageUsername(final SlackClient client, final Message message) {
try {
if (message.getUser() != null) {
return getUsername(client, message.getUser());
}
if (message.getSubtype() != null) {
if ("bot_message".equals(message.getSubtype())) {
return client.getBot(message.getBotId()).getName();
}
if ("file_comment".equals(message.getSubtype())) {
final User user = client.getUser(message.getComment().getUser());
return !user.getProfile().getDisplayName().isEmpty() ? user.getProfile().getDisplayName()
: user.getProfile().getRealName();
}
}
} catch (final Exception e) {
if (logger.isDebugEnabled()) {
logger.debug("Failed to get a username from message.", e);
}
}
return StringUtil.EMPTY;
}
public String getFileUsername(final SlackClient client, final File file) {
try {
if (file.getUser() != null) {
return getUsername(client, file.getUser());
}
} catch (final Exception e) {
if (logger.isDebugEnabled()) {
logger.debug("Failed to get a username from message.", e);
}
}
return StringUtil.EMPTY;
}
protected String getUsername(final SlackClient client, final String userId) {
try {
final User user = client.getUser(userId);
if (user.getProfile().getDisplayName() != null) {
return user.getProfile().getDisplayName();
}
if (user.getRealName() != null) {
return user.getRealName();
}
if (user.getName() != null) {
return user.getName();
}
} catch (final ExecutionException e) {
logger.warn("Failed to get username from user.", e);
}
return userId;
}
protected String getMessageAttachmentsText(final Message message) {
final List attachments = message.getAttachments();
if (attachments == null) {
return "";
}
final List fallbacks = attachments.stream().map(Attachment::getFallback).collect(Collectors.toList());
return String.join("\n", fallbacks);
}
public String getMessagePermalink(final SlackClient client, final Team team, final Channel channel, final Message message) {
String permalink = message.getPermalink();
if (permalink == null) {
if (team == null) {
permalink = client.getPermalink(channel.getId(), message.getTs());
} else {
permalink =
"https://" + team.getDomain() + ".slack.com/archives/" + channel.getId() + "/p" + message.getTs().replace(".", "");
}
}
return permalink;
}
protected String getFileContent(final SlackClient client, final File file, final boolean ignoreError) {
if (file.getPermalink() != null) {
final String mimeType = file.getMimetype().trim();
final String fileUrl = file.getUrlPrivateDownload();
try (final CurlResponse response = client.getFileResponse(fileUrl)) {
if (response.getHttpStatusCode() != 200) {
throw new SlackDataStoreException(
"HTTP Status " + response.getHttpStatusCode() + " : failed to get the file from " + fileUrl);
}
try (final InputStream in = response.getContentAsStream()) {
return ComponentUtil.getExtractorFactory().builder(in, null).mimeType(mimeType).extractorName(extractorName).extract()
.getContent();
}
} catch (final Exception e) {
if (!ignoreError && !ComponentUtil.getFessConfig().isCrawlerIgnoreContentException()) {
throw new DataStoreCrawlingException(file.getPermalink(), "Failed to get contents: " + file.getName(), e);
}
if (logger.isDebugEnabled()) {
logger.warn("Failed to get contents: {}", file.getName(), e);
} else {
logger.warn("Failed to get contents: {}. {}", file.getName(), e.getMessage());
}
return StringUtil.EMPTY;
}
}
return StringUtil.EMPTY;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy