org.opensearch.index.termvectors.TermVectorsService Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearch Show documentation
Show all versions of opensearch Show documentation
OpenSearch subproject :server
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.index.termvectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.memory.MemoryIndex;
import org.opensearch.OpenSearchException;
import org.opensearch.action.termvectors.TermVectorsFilter;
import org.opensearch.action.termvectors.TermVectorsRequest;
import org.opensearch.action.termvectors.TermVectorsResponse;
import org.opensearch.common.Nullable;
import org.opensearch.common.document.DocumentField;
import org.opensearch.common.lucene.uid.VersionsAndSeqNoResolver.DocIdAndVersion;
import org.opensearch.common.xcontent.XContentHelper;
import org.opensearch.common.xcontent.support.XContentMapValues;
import org.opensearch.core.common.Strings;
import org.opensearch.core.common.bytes.BytesReference;
import org.opensearch.core.xcontent.MediaType;
import org.opensearch.index.engine.Engine;
import org.opensearch.index.get.GetResult;
import org.opensearch.index.mapper.DocumentMapperForType;
import org.opensearch.index.mapper.IdFieldMapper;
import org.opensearch.index.mapper.MappedFieldType;
import org.opensearch.index.mapper.MapperService;
import org.opensearch.index.mapper.ParseContext;
import org.opensearch.index.mapper.ParsedDocument;
import org.opensearch.index.mapper.SourceFieldMapper;
import org.opensearch.index.mapper.SourceToParse;
import org.opensearch.index.mapper.StringFieldType;
import org.opensearch.index.mapper.TextSearchInfo;
import org.opensearch.index.mapper.Uid;
import org.opensearch.index.shard.IndexShard;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import java.util.function.LongSupplier;
/**
* Service for Term Vectors
*
* @opensearch.internal
*/
public class TermVectorsService {
private TermVectorsService() {}
public static TermVectorsResponse getTermVectors(IndexShard indexShard, TermVectorsRequest request) {
return getTermVectors(indexShard, request, System::nanoTime);
}
static TermVectorsResponse getTermVectors(IndexShard indexShard, TermVectorsRequest request, LongSupplier nanoTimeSupplier) {
final long startTime = nanoTimeSupplier.getAsLong();
final TermVectorsResponse termVectorsResponse = new TermVectorsResponse(indexShard.shardId().getIndex().getName(), request.id());
final Term uidTerm = new Term(IdFieldMapper.NAME, Uid.encodeId(request.id()));
Fields termVectorsByField = null;
TermVectorsFilter termVectorsFilter = null;
/* handle potential wildcards in fields */
if (request.selectedFields() != null) {
handleFieldWildcards(indexShard, request);
}
try (
Engine.GetResult get = indexShard.get(
new Engine.Get(request.realtime(), false, request.id(), uidTerm).version(request.version())
.versionType(request.versionType())
);
Engine.Searcher searcher = indexShard.acquireSearcher("term_vector")
) {
Fields topLevelFields = fields(get.searcher() != null ? get.searcher().getIndexReader() : searcher.getIndexReader());
DocIdAndVersion docIdAndVersion = get.docIdAndVersion();
/* from an artificial document */
if (request.doc() != null) {
termVectorsByField = generateTermVectorsFromDoc(indexShard, request);
termVectorsResponse.setArtificial(true);
termVectorsResponse.setExists(true);
}
/* or from an existing document */
else if (docIdAndVersion != null) {
// fields with stored term vectors
TermVectors termVectors = docIdAndVersion.reader.termVectors();
termVectorsByField = termVectors.get(docIdAndVersion.docId);
Set selectedFields = request.selectedFields();
// generate tvs for fields where analyzer is overridden
if (selectedFields == null && request.perFieldAnalyzer() != null) {
selectedFields = getFieldsToGenerate(request.perFieldAnalyzer(), termVectorsByField);
}
// fields without term vectors
if (selectedFields != null) {
termVectorsByField = addGeneratedTermVectors(indexShard, get, termVectorsByField, request, selectedFields);
}
termVectorsResponse.setDocVersion(docIdAndVersion.version);
termVectorsResponse.setExists(true);
}
/* no term vectors generated or found */
else {
termVectorsResponse.setExists(false);
}
/* if there are term vectors, optional compute dfs and/or terms filtering */
if (termVectorsByField != null) {
if (request.filterSettings() != null) {
termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields());
termVectorsFilter.setSettings(request.filterSettings());
try {
termVectorsFilter.selectBestTerms();
} catch (IOException e) {
throw new OpenSearchException("failed to select best terms", e);
}
}
// write term vectors
termVectorsResponse.setFields(
termVectorsByField,
request.selectedFields(),
request.getFlags(),
topLevelFields,
termVectorsFilter
);
}
termVectorsResponse.setTookInMillis(TimeUnit.NANOSECONDS.toMillis(nanoTimeSupplier.getAsLong() - startTime));
} catch (Exception ex) {
throw new OpenSearchException("failed to execute term vector request", ex);
}
return termVectorsResponse;
}
public static Fields fields(IndexReader reader) {
return new Fields() {
@Override
public Iterator iterator() {
throw new UnsupportedOperationException();
}
@Override
public Terms terms(String field) throws IOException {
return MultiTerms.getTerms(reader, field);
}
@Override
public int size() {
throw new UnsupportedOperationException();
}
};
}
private static void handleFieldWildcards(IndexShard indexShard, TermVectorsRequest request) {
Set fieldNames = new HashSet<>();
for (String pattern : request.selectedFields()) {
fieldNames.addAll(indexShard.mapperService().simpleMatchToFullName(pattern));
}
request.selectedFields(fieldNames.toArray(Strings.EMPTY_ARRAY));
}
private static boolean isValidField(MappedFieldType fieldType) {
// must be a string
if (fieldType instanceof StringFieldType == false) {
return false;
}
// and must be indexed
if (fieldType.isSearchable() == false) {
return false;
}
return true;
}
private static Fields addGeneratedTermVectors(
IndexShard indexShard,
Engine.GetResult get,
Fields termVectorsByField,
TermVectorsRequest request,
Set selectedFields
) throws IOException {
/* only keep valid fields */
Set validFields = new HashSet<>();
for (String field : selectedFields) {
MappedFieldType fieldType = indexShard.mapperService().fieldType(field);
if (!isValidField(fieldType)) {
continue;
}
// already retrieved, only if the analyzer hasn't been overridden at the field
if (fieldType.getTextSearchInfo().termVectors() != TextSearchInfo.TermVector.NONE
&& (request.perFieldAnalyzer() == null || !request.perFieldAnalyzer().containsKey(field))) {
continue;
}
validFields.add(field);
}
if (validFields.isEmpty()) {
return termVectorsByField;
}
/* generate term vectors from fetched document fields */
String[] getFields = validFields.toArray(new String[validFields.size() + 1]);
getFields[getFields.length - 1] = SourceFieldMapper.NAME;
GetResult getResult = indexShard.getService().get(get, request.id(), getFields, null);
Fields generatedTermVectors = generateTermVectors(
indexShard,
getResult.sourceAsMap(),
getResult.getFields().values(),
request.offsets(),
request.perFieldAnalyzer(),
validFields
);
/* merge with existing Fields */
if (termVectorsByField == null) {
return generatedTermVectors;
} else {
return mergeFields(termVectorsByField, generatedTermVectors);
}
}
private static Analyzer getAnalyzerAtField(IndexShard indexShard, String field, @Nullable Map perFieldAnalyzer) {
MapperService mapperService = indexShard.mapperService();
Analyzer analyzer;
if (perFieldAnalyzer != null && perFieldAnalyzer.containsKey(field)) {
analyzer = mapperService.getIndexAnalyzers().get(perFieldAnalyzer.get(field));
} else {
MappedFieldType fieldType = mapperService.fieldType(field);
analyzer = fieldType.indexAnalyzer();
}
if (analyzer == null) {
analyzer = mapperService.getIndexAnalyzers().getDefaultIndexAnalyzer();
}
return analyzer;
}
private static Set getFieldsToGenerate(Map perAnalyzerField, Fields fieldsObject) {
Set selectedFields = new HashSet<>();
for (String fieldName : fieldsObject) {
if (perAnalyzerField.containsKey(fieldName)) {
selectedFields.add(fieldName);
}
}
return selectedFields;
}
private static Fields generateTermVectors(
IndexShard indexShard,
Map source,
Collection getFields,
boolean withOffsets,
@Nullable Map perFieldAnalyzer,
Set fields
) throws IOException {
Map> values = new HashMap<>();
for (DocumentField getField : getFields) {
String field = getField.getName();
if (fields.contains(field)) { // some fields are returned even when not asked for, eg. _timestamp
values.put(field, getField.getValues());
}
}
if (source != null) {
for (String field : fields) {
if (values.containsKey(field) == false) {
List