
org.codelibs.fess.suggest.index.SuggestIndexer Maven / Gradle / Ivy
/*
* Copyright 2012-2024 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.suggest.index;
import java.lang.management.ManagementFactory;
import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.lang.ThreadUtil;
import org.codelibs.fess.suggest.analysis.SuggestAnalyzer;
import org.codelibs.fess.suggest.concurrent.Deferred;
import org.codelibs.fess.suggest.constants.FieldNames;
import org.codelibs.fess.suggest.converter.ReadingConverter;
import org.codelibs.fess.suggest.entity.ElevateWord;
import org.codelibs.fess.suggest.entity.SuggestItem;
import org.codelibs.fess.suggest.exception.SuggestIndexException;
import org.codelibs.fess.suggest.index.contents.ContentsParser;
import org.codelibs.fess.suggest.index.contents.DefaultContentsParser;
import org.codelibs.fess.suggest.index.contents.document.DocumentReader;
import org.codelibs.fess.suggest.index.contents.querylog.QueryLog;
import org.codelibs.fess.suggest.index.contents.querylog.QueryLogReader;
import org.codelibs.fess.suggest.index.writer.SuggestIndexWriter;
import org.codelibs.fess.suggest.index.writer.SuggestWriter;
import org.codelibs.fess.suggest.index.writer.SuggestWriterResult;
import org.codelibs.fess.suggest.normalizer.Normalizer;
import org.codelibs.fess.suggest.settings.SuggestSettings;
import org.codelibs.fess.suggest.util.SuggestUtil;
import org.opensearch.OpenSearchStatusException;
import org.opensearch.action.search.SearchResponse;
import org.opensearch.client.Client;
import org.opensearch.index.query.Operator;
import org.opensearch.index.query.QueryBuilder;
import org.opensearch.index.query.QueryBuilders;
import org.opensearch.search.SearchHit;
public class SuggestIndexer {
/** Logger for this class. */
private static final Logger logger = LogManager.getLogger(SuggestIndexer.class);

/** Client handed to the suggest writer and used for the scroll searches in this class. */
protected final Client client;

/** Name of the suggest index that is written to and searched. */
protected String index;

/** Settings the field names, bad words, flags and timeouts are read from. */
protected SuggestSettings settings;

/** Value of the {@code SUPPORTED_FIELDS} array setting; passed to the contents parser. */
protected String[] supportedFields;

/** The {@code TAG_FIELD_NAME} setting split on commas. */
protected String[] tagFieldNames;

/** The {@code ROLE_FIELD_NAME} setting (empty string when unset). */
protected String roleFieldName;

/** The {@code LANG_FIELD_NAME} setting (empty string when unset). */
protected String langFieldName;

/** Bad words loaded from the settings; items flagged by {@code isBadWord} are not written. */
protected String[] badWords;

/** The {@code PARALLEL_PROCESSING} setting; when true, query logs are parsed on a parallel stream. */
protected boolean parallel;

/** Reading converter passed to the contents parser when parsing query logs. */
protected ReadingConverter readingConverter;

/** Second reading converter supplied at construction time; presumably used for document contents. */
protected ReadingConverter contentsReadingConverter;

/** Normalizer passed to the contents parser. */
protected Normalizer normalizer;

/** Analyzer supplied at construction time. */
protected SuggestAnalyzer analyzer;

/** Parser turning query logs into suggest items; a {@link DefaultContentsParser} by default. */
protected ContentsParser contentsParser;

/** Writer performing index and delete operations; a {@link SuggestIndexWriter} by default. */
protected SuggestWriter suggestWriter;

/** Executor that runs the asynchronous indexing task. */
protected ExecutorService threadPool;
public SuggestIndexer(final Client client, final String index, final ReadingConverter readingConverter,
final ReadingConverter contentsReadingConverter, final Normalizer normalizer, final SuggestAnalyzer analyzer,
final SuggestSettings settings, final ExecutorService threadPool) {
this.client = client;
this.index = index;
supportedFields = settings.array().get(SuggestSettings.DefaultKeys.SUPPORTED_FIELDS);
badWords = settings.badword().get(true);
tagFieldNames = settings.getAsString(SuggestSettings.DefaultKeys.TAG_FIELD_NAME, StringUtil.EMPTY).split(",");
roleFieldName = settings.getAsString(SuggestSettings.DefaultKeys.ROLE_FIELD_NAME, StringUtil.EMPTY);
langFieldName = settings.getAsString(SuggestSettings.DefaultKeys.LANG_FIELD_NAME, StringUtil.EMPTY);
parallel = settings.getAsBoolean(SuggestSettings.DefaultKeys.PARALLEL_PROCESSING, false);
this.readingConverter = readingConverter;
this.contentsReadingConverter = contentsReadingConverter;
this.normalizer = normalizer;
this.analyzer = analyzer;
this.settings = settings;
contentsParser = new DefaultContentsParser();
suggestWriter = new SuggestIndexWriter();
this.threadPool = threadPool;
}
// TODO return result
/**
 * Indexes a single suggest item by delegating to {@link #index(SuggestItem[])}.
 *
 * @param item the item to index
 * @return the response produced by the array overload
 */
public SuggestIndexResponse index(final SuggestItem item) {
    final SuggestItem[] single = { item };
    return index(single);
}
// TODO return result
/**
 * Writes the given items to the suggest index, skipping every item that
 * {@code isBadWord(badWords)} flags.
 *
 * @param items the items to index
 * @return a response carrying the writer's failures and the elapsed time in milliseconds
 * @throws SuggestIndexException if the writer throws any exception
 */
public SuggestIndexResponse index(final SuggestItem[] items) {
    // TODO parallel?
    final SuggestItem[] accepted =
            Arrays.stream(items).filter(candidate -> !candidate.isBadWord(badWords)).toArray(SuggestItem[]::new);
    final int total = items.length;
    try {
        final long startedAt = System.currentTimeMillis();
        final SuggestWriterResult written = suggestWriter.write(client, settings, index, accepted, true);
        // NOTE(review): both counts report the unfiltered input size, even when the bad-word
        // filter dropped items - confirm this is intended.
        return new SuggestIndexResponse(total, total, written.getFailures(), System.currentTimeMillis() - startedAt);
    } catch (final Exception e) {
        throw new SuggestIndexException("Failed to write items[" + total + "] to " + index, e);
    }
}
/**
 * Deletes the entry with the given id through the suggest writer.
 *
 * @param id id of the entry to delete
 * @return a response carrying the writer's failures and the elapsed time in milliseconds
 */
public SuggestDeleteResponse delete(final String id) {
    final long startedAt = System.currentTimeMillis();
    final SuggestWriterResult outcome = suggestWriter.delete(client, settings, index, id);
    return new SuggestDeleteResponse(outcome.getFailures(), System.currentTimeMillis() - startedAt);
}
/**
 * Deletes every entry matching a query-string expression. Terms are combined with
 * {@code AND} by default.
 *
 * @param queryString the query-string expression
 * @return the response of {@link #deleteByQuery(QueryBuilder)}
 */
public SuggestDeleteResponse deleteByQuery(final String queryString) {
    final QueryBuilder query = QueryBuilders.queryStringQuery(queryString).defaultOperator(Operator.AND);
    return deleteByQuery(query);
}
/**
 * Deletes every entry matching the given query through the suggest writer.
 *
 * @param queryBuilder query selecting the entries to delete
 * @return a response carrying the writer's failures and the elapsed time in milliseconds
 */
public SuggestDeleteResponse deleteByQuery(final QueryBuilder queryBuilder) {
    final long startedAt = System.currentTimeMillis();
    final SuggestWriterResult outcome = suggestWriter.deleteByQuery(client, settings, index, queryBuilder);
    return new SuggestDeleteResponse(outcome.getFailures(), System.currentTimeMillis() - startedAt);
}
/**
 * Deletes every entry of the index with a match-all query, then calls
 * {@code restoreElevateWord()}.
 *
 * @return the response of the match-all delete; the restore step does not contribute to it
 */
public SuggestDeleteResponse deleteAll() {
    final QueryBuilder everything = QueryBuilders.matchAllQuery();
    final SuggestDeleteResponse deleted = deleteByQuery(everything);
    // Presumably re-creates the configured elevate words that the wipe just removed.
    restoreElevateWord();
    return deleted;
}
/**
 * Removes document-derived data from the suggest index in two steps.
 *
 * <ol>
 * <li>Deletes entries with {@code DOC_FREQ >= 1} whose {@code KINDS} field matches neither
 * {@code QUERY} nor {@code USER}.</li>
 * <li>Scrolls over the remaining entries with {@code DOC_FREQ >= 1} (500 per page), sets their
 * document frequency to 0, drops the {@code DOCUMENT} kind and writes them back.</li>
 * </ol>
 *
 * @return a response with a {@code null} error list and the elapsed time in milliseconds
 * @throws SuggestIndexException if the delete step or any write-back reports a failure
 */
public SuggestDeleteResponse deleteDocumentWords() {
    final long start = System.currentTimeMillis();

    final SuggestDeleteResponse deleteResponse =
            deleteByQuery(QueryBuilders.boolQuery().must(QueryBuilders.rangeQuery(FieldNames.DOC_FREQ).gte(1))
                    .mustNot(QueryBuilders.matchPhraseQuery(FieldNames.KINDS, SuggestItem.Kind.QUERY.toString()))
                    .mustNot(QueryBuilders.matchPhraseQuery(FieldNames.KINDS, SuggestItem.Kind.USER.toString())));
    if (deleteResponse.hasError()) {
        throw new SuggestIndexException(deleteResponse.getErrors().get(0));
    }

    SearchResponse response = client.prepareSearch(index).setSize(500).setScroll(settings.getScrollTimeout())
            .setQuery(QueryBuilders.rangeQuery(FieldNames.DOC_FREQ).gte(1)).execute().actionGet(settings.getSearchTimeout());
    String scrollId = response.getScrollId();
    try {
        while (scrollId != null) {
            final SearchHit[] hits = response.getHits().getHits();
            if (hits.length == 0) {
                break;
            }

            // A fresh list per page: a single list shared across pages would re-write every
            // previously processed item on each iteration and grow without bound.
            final List<SuggestItem> updateItems = new ArrayList<>(hits.length);
            for (final SearchHit hit : hits) {
                final SuggestItem item = SuggestItem.parseSource(hit.getSourceAsMap());
                item.setDocFreq(0);
                item.setKinds(Stream.of(item.getKinds()).filter(kind -> kind != SuggestItem.Kind.DOCUMENT)
                        .toArray(count -> new SuggestItem.Kind[count]));
                updateItems.add(item);
            }

            final SuggestWriterResult result =
                    suggestWriter.write(client, settings, index, updateItems.toArray(new SuggestItem[updateItems.size()]), false);
            if (result.hasFailure()) {
                throw new SuggestIndexException(result.getFailures().get(0));
            }

            response = client.prepareSearchScroll(scrollId).execute().actionGet(settings.getSearchTimeout());
            // Release the previous scroll context when the server hands out a new id.
            if (!scrollId.equals(response.getScrollId())) {
                SuggestUtil.deleteScrollContext(client, scrollId);
            }
            scrollId = response.getScrollId();
        }
    } finally {
        SuggestUtil.deleteScrollContext(client, scrollId);
    }

    return new SuggestDeleteResponse(null, System.currentTimeMillis() - start);
}
/**
 * Removes query-derived data from the suggest index in two steps.
 *
 * <ol>
 * <li>Deletes entries with {@code QUERY_FREQ >= 1} whose {@code KINDS} field matches neither
 * {@code DOCUMENT} nor {@code USER}.</li>
 * <li>Scrolls over the remaining entries with {@code QUERY_FREQ >= 1} (500 per page), sets their
 * query frequency to 0, drops the {@code QUERY} kind and writes them back.</li>
 * </ol>
 *
 * @return a response with a {@code null} error list and the elapsed time in milliseconds
 * @throws SuggestIndexException if the delete step or any write-back reports a failure
 */
public SuggestDeleteResponse deleteQueryWords() {
    final long start = System.currentTimeMillis();

    final SuggestDeleteResponse deleteResponse =
            deleteByQuery(QueryBuilders.boolQuery().must(QueryBuilders.rangeQuery(FieldNames.QUERY_FREQ).gte(1))
                    .mustNot(QueryBuilders.matchPhraseQuery(FieldNames.KINDS, SuggestItem.Kind.DOCUMENT.toString()))
                    .mustNot(QueryBuilders.matchPhraseQuery(FieldNames.KINDS, SuggestItem.Kind.USER.toString())));
    if (deleteResponse.hasError()) {
        throw new SuggestIndexException(deleteResponse.getErrors().get(0));
    }

    SearchResponse response = client.prepareSearch(index).setSize(500).setScroll(settings.getScrollTimeout())
            .setQuery(QueryBuilders.rangeQuery(FieldNames.QUERY_FREQ).gte(1)).execute().actionGet(settings.getSearchTimeout());
    String scrollId = response.getScrollId();
    try {
        while (scrollId != null) {
            final SearchHit[] hits = response.getHits().getHits();
            if (hits.length == 0) {
                break;
            }

            // A fresh list per page: a single list shared across pages would re-write every
            // previously processed item on each iteration and grow without bound.
            final List<SuggestItem> updateItems = new ArrayList<>(hits.length);
            for (final SearchHit hit : hits) {
                final SuggestItem item = SuggestItem.parseSource(hit.getSourceAsMap());
                item.setQueryFreq(0);
                item.setKinds(Stream.of(item.getKinds()).filter(kind -> kind != SuggestItem.Kind.QUERY)
                        .toArray(count -> new SuggestItem.Kind[count]));
                updateItems.add(item);
            }

            final SuggestWriterResult result =
                    suggestWriter.write(client, settings, index, updateItems.toArray(new SuggestItem[updateItems.size()]), false);
            if (result.hasFailure()) {
                throw new SuggestIndexException(result.getFailures().get(0));
            }

            response = client.prepareSearchScroll(scrollId).execute().actionGet(settings.getSearchTimeout());
            // Release the previous scroll context when the server hands out a new id.
            if (!scrollId.equals(response.getScrollId())) {
                SuggestUtil.deleteScrollContext(client, scrollId);
            }
            scrollId = response.getScrollId();
        }
    } finally {
        SuggestUtil.deleteScrollContext(client, scrollId);
    }

    return new SuggestDeleteResponse(null, System.currentTimeMillis() - start);
}
/**
 * Indexes the suggest items derived from a single query log by delegating to
 * {@link #indexFromQueryLog(QueryLog[])}.
 *
 * @param queryLog the query log to index
 * @return the response produced by the array overload
 */
public SuggestIndexResponse indexFromQueryLog(final QueryLog queryLog) {
    final QueryLog[] single = { queryLog };
    return indexFromQueryLog(single);
}
/**
 * Parses the given query logs into suggest items and indexes them.
 *
 * <p>Parsing runs on a parallel stream when the {@code parallel} flag is set. Parse and index
 * timings are logged at info level.
 *
 * @param queryLogs the query logs to index
 * @return a response with the number of parsed suggest items, the number of input query logs,
 *         the errors reported by {@link #index(SuggestItem[])} and the elapsed time in milliseconds
 * @throws SuggestIndexException if parsing or indexing throws any exception
 */
public SuggestIndexResponse indexFromQueryLog(final QueryLog[] queryLogs) {
    if (logger.isInfoEnabled()) {
        logger.info("Index from querylog. num: {}", queryLogs.length);
    }
    try {
        final long start = System.currentTimeMillis();

        Stream<QueryLog> stream = Stream.of(queryLogs);
        if (parallel) {
            // parallel() is documented as possibly returning a different stream, so keep the result.
            stream = stream.parallel();
        }
        final SuggestItem[] array = stream
                .flatMap(queryLog -> contentsParser
                        .parseQueryLog(queryLog, supportedFields, tagFieldNames, roleFieldName, readingConverter, normalizer).stream())
                .toArray(n -> new SuggestItem[n]);
        final long parseTime = System.currentTimeMillis();

        final SuggestIndexResponse response = index(array);
        final long indexTime = System.currentTimeMillis();

        if (logger.isInfoEnabled()) {
            printProcessingInfo("queries", queryLogs.length, array, parseTime - start, indexTime - parseTime);
        }
        return new SuggestIndexResponse(array.length, queryLogs.length, response.getErrors(), System.currentTimeMillis() - start);
    } catch (final Exception e) {
        throw new SuggestIndexException("Failed to index from query_string.", e);
    }
}
// TODO replace queryLogReader with lambda reader
/**
 * Asynchronously indexes all query logs supplied by a reader, in batches.
 *
 * <p>The work runs on {@code threadPool}. Query logs are read until the reader returns
 * {@code null} or the worker thread is interrupted. A batch is indexed once it holds
 * {@code docPerReq} entries or the reader is exhausted, and the worker sleeps
 * {@code requestInterval} milliseconds after each batch. The reader is always closed when the
 * task ends.
 *
 * <p>NOTE(review): the return type appears to have lost its type argument in this copy of the
 * file; the promise is resolved with a {@link SuggestIndexResponse}.
 *
 * @param queryLogReader source of query logs; closed by this method
 * @param docPerReq maximum number of query logs per indexing request
 * @param requestInterval pause in milliseconds after each indexing request
 * @return a promise resolved with the accumulated response, or rejected with the first
 *         throwable raised by the task
 */
public Deferred.Promise indexFromQueryLog(final QueryLogReader queryLogReader, final int docPerReq,
        final long requestInterval) {
    final Deferred<SuggestIndexResponse> deferred = new Deferred<>();
    threadPool.execute(() -> {
        final long start = System.currentTimeMillis();
        int numberOfSuggestDocs = 0;
        int numberOfInputDocs = 0;
        final List<Throwable> errors = new ArrayList<>();
        final List<QueryLog> queryLogs = new ArrayList<>(docPerReq);
        boolean interruptedWhileSleeping = false;
        try {
            QueryLog queryLog = queryLogReader.read();
            while (queryLog != null) {
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                queryLogs.add(queryLog);
                queryLog = queryLogReader.read();
                // Flush when the reader is exhausted or the batch is full.
                if ((queryLog == null && !queryLogs.isEmpty()) || queryLogs.size() >= docPerReq) {
                    final SuggestIndexResponse res = indexFromQueryLog(queryLogs.toArray(new QueryLog[queryLogs.size()]));
                    errors.addAll(res.getErrors());
                    numberOfSuggestDocs += res.getNumberOfSuggestDocs();
                    numberOfInputDocs += res.getNumberOfInputDocs();
                    queryLogs.clear();
                    Thread.sleep(requestInterval);
                }
            }
            deferred.resolve(
                    new SuggestIndexResponse(numberOfSuggestDocs, numberOfInputDocs, errors, System.currentTimeMillis() - start));
        } catch (final InterruptedException e) {
            // Thread.sleep cleared the interrupt status; remember to restore it below.
            interruptedWhileSleeping = true;
            deferred.reject(e);
        } catch (final Throwable t) {
            deferred.reject(t);
        } finally {
            try {
                queryLogReader.close();
            } finally {
                // Restored only after close() so closing the reader is not disturbed, while the
                // pool thread's owner can still observe the interruption.
                if (interruptedWhileSleeping) {
                    Thread.currentThread().interrupt();
                }
            }
        }
    });
    return deferred.promise();
}
public SuggestIndexResponse indexFromDocument(final Map[] documents) {
final long start = System.currentTimeMillis();
try {
final Stream
© 2015 - 2025 Weber Informatics LLC | Privacy Policy