org.elasticsearch.index.termvectors.TermVectorsService Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch Show documentation
Show all versions of elasticsearch Show documentation
Elasticsearch subproject :distribution:archives:integ-test-zip
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.index.termvectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.memory.MemoryIndex;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.termvectors.TermVectorsFilter;
import org.elasticsearch.action.termvectors.TermVectorsRequest;
import org.elasticsearch.action.termvectors.TermVectorsResponse;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.document.DocumentField;
import org.elasticsearch.common.lucene.uid.VersionsAndSeqNoResolver.DocIdAndVersion;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.xcontent.XContentHelper;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.index.engine.Engine;
import org.elasticsearch.index.get.GetResult;
import org.elasticsearch.index.mapper.DocumentParser;
import org.elasticsearch.index.mapper.IdFieldMapper;
import org.elasticsearch.index.mapper.LuceneDocument;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.MappingLookup;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.index.mapper.SourceFieldMapper;
import org.elasticsearch.index.mapper.SourceToParse;
import org.elasticsearch.index.mapper.SourceValueFetcher;
import org.elasticsearch.index.mapper.StringFieldType;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.mapper.Uid;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.search.lookup.SourceLookup;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import java.util.function.LongSupplier;
public class TermVectorsService {
// Not instantiable: the methods visible in this class are all static helpers.
private TermVectorsService() {}
/**
 * Builds the term vectors response for {@code request} against {@code indexShard}.
 * <p>
 * Delegates to the three-argument overload, using {@link System#nanoTime()} as the
 * clock that measures how long the request took.
 *
 * @param indexShard the shard to read the document and its term vectors from
 * @param request    the term vectors request to execute
 * @return the populated response
 */
public static TermVectorsResponse getTermVectors(IndexShard indexShard, TermVectorsRequest request) {
    final LongSupplier systemClock = System::nanoTime;
    return getTermVectors(indexShard, request, systemClock);
}
/**
 * Executes a term vectors request on the given shard.
 * <p>
 * The term vectors come from one of two places:
 * <ul>
 *   <li>an artificial document supplied in the request ({@code request.doc() != null}), or</li>
 *   <li>an existing document looked up by id, in which case stored term vectors are read and,
 *       for selected fields or fields with an overridden analyzer, additional term vectors
 *       are generated and merged in.</li>
 * </ul>
 * If neither applies the response is marked as not existing. When term vectors are available
 * and the request carries filter settings, a {@link TermVectorsFilter} selects the best terms
 * before the fields are written to the response.
 *
 * @param indexShard       the shard to read from
 * @param request          the request; its selected fields are rewritten in place when they are non-null
 * @param nanoTimeSupplier clock used to compute the "took" time; injectable so callers can control it
 * @return the populated response
 * @throws ElasticsearchException wrapping any exception raised while the document is fetched or processed
 */
static TermVectorsResponse getTermVectors(IndexShard indexShard, TermVectorsRequest request, LongSupplier nanoTimeSupplier) {
    final long startTime = nanoTimeSupplier.getAsLong();
    final TermVectorsResponse termVectorsResponse = new TermVectorsResponse(
        indexShard.shardId().getIndex().getName(),
        request.type(),
        request.id()
    );
    final Term uidTerm = new Term(IdFieldMapper.NAME, Uid.encodeId(request.id()));
    Fields termVectorsByField = null;
    TermVectorsFilter termVectorsFilter = null;
    /* handle potential wildcards in fields */
    if (request.selectedFields() != null) {
        handleFieldWildcards(indexShard, request);
    }
    // Both the get result and the searcher are closed automatically, in reverse order of acquisition.
    try (
        Engine.GetResult get = indexShard.get(
            new Engine.Get(request.realtime(), false, request.type(), request.id(), uidTerm).version(request.version())
                .versionType(request.versionType())
        );
        Engine.Searcher searcher = indexShard.acquireSearcher("term_vector")
    ) {
        // Prefer the reader attached to the get result when there is one; otherwise use the freshly acquired searcher.
        Fields topLevelFields = fields(get.searcher() != null ? get.searcher().getIndexReader() : searcher.getIndexReader());
        DocIdAndVersion docIdAndVersion = get.docIdAndVersion();
        /* from an artificial document */
        if (request.doc() != null) {
            termVectorsByField = generateTermVectorsFromDoc(indexShard, request);
            termVectorsResponse.setArtificial(true);
            termVectorsResponse.setExists(true);
        }
        /* or from an existing document */
        else if (docIdAndVersion != null) {
            // fields with stored term vectors
            termVectorsByField = docIdAndVersion.reader.getTermVectors(docIdAndVersion.docId);
            Set<String> selectedFields = request.selectedFields();
            // generate tvs for fields where analyzer is overridden
            if (selectedFields == null && request.perFieldAnalyzer() != null) {
                selectedFields = getFieldsToGenerate(request.perFieldAnalyzer(), termVectorsByField);
            }
            // fields without term vectors
            if (selectedFields != null) {
                termVectorsByField = addGeneratedTermVectors(indexShard, get, termVectorsByField, request, selectedFields);
            }
            termVectorsResponse.setDocVersion(docIdAndVersion.version);
            termVectorsResponse.setExists(true);
        }
        /* no term vectors generated or found */
        else {
            termVectorsResponse.setExists(false);
        }
        /* if there are term vectors, optional compute dfs and/or terms filtering */
        if (termVectorsByField != null) {
            if (request.filterSettings() != null) {
                termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields());
                termVectorsFilter.setSettings(request.filterSettings());
                try {
                    termVectorsFilter.selectBestTerms();
                } catch (IOException e) {
                    // NOTE: this exception is itself caught and re-wrapped by the outer catch below.
                    throw new ElasticsearchException("failed to select best terms", e);
                }
            }
            // write term vectors
            termVectorsResponse.setFields(
                termVectorsByField,
                request.selectedFields(),
                request.getFlags(),
                topLevelFields,
                termVectorsFilter
            );
        }
        termVectorsResponse.setTookInMillis(TimeUnit.NANOSECONDS.toMillis(nanoTimeSupplier.getAsLong() - startTime));
    } catch (Exception ex) {
        throw new ElasticsearchException("failed to execute term vector request", ex);
    }
    return termVectorsResponse;
}
/**
 * Wraps {@code reader} in a lookup-only {@link Fields} view.
 * <p>
 * Only {@link Fields#terms(String)} is supported; it resolves the terms of a field across the
 * whole reader via {@link MultiTerms#getTerms(IndexReader, String)}. Iterating the field names
 * or asking for the number of fields throws {@link UnsupportedOperationException}.
 *
 * @param reader the index reader to resolve terms against
 * @return a {@link Fields} instance backed by {@code reader}
 */
public static Fields fields(IndexReader reader) {
    return new Fields() {
        @Override
        public Iterator<String> iterator() {
            // Enumerating field names is deliberately unsupported by this view.
            throw new UnsupportedOperationException();
        }

        @Override
        public Terms terms(String field) throws IOException {
            return MultiTerms.getTerms(reader, field);
        }

        @Override
        public int size() {
            // The number of fields is not tracked by this view.
            throw new UnsupportedOperationException();
        }
    };
}
/**
 * Expands the patterns in the request's selected fields into concrete field names and writes
 * the result back onto the request.
 * <p>
 * Each pattern is resolved against the shard's mapping lookup. A pattern that matches nothing
 * is kept verbatim only when it is not a wildcard pattern (so a plain but unmapped field name
 * survives), whereas an unmatched wildcard pattern is dropped.
 *
 * @param indexShard the shard whose mappings are used to expand patterns
 * @param request    the request to update; its selected fields must be non-null
 */
private static void handleFieldWildcards(IndexShard indexShard, TermVectorsRequest request) {
    // TODO rewrite this to use a field filter built from field patterns
    // Using lookups doesn't work for eg dynamic fields
    Set<String> fieldNames = new HashSet<>();
    for (String pattern : request.selectedFields()) {
        Set<String> expandedFields = indexShard.mapperService().mappingLookup().getMatchingFieldNames(pattern);
        if (expandedFields.isEmpty()) {
            // Nothing matched: keep literal names, discard wildcard patterns.
            if (Regex.isSimpleMatchPattern(pattern) == false) {
                fieldNames.add(pattern);
            }
        } else {
            fieldNames.addAll(expandedFields);
        }
    }
    request.selectedFields(fieldNames.toArray(Strings.EMPTY_ARRAY));
}
/**
 * Tells whether term vectors can be generated for a field of the given type.
 *
 * @param fieldType the mapped field type; {@code null} is treated as invalid
 * @return {@code true} only if the field is a string field type and is searchable (indexed)
 */
private static boolean isValidField(MappedFieldType fieldType) {
    // instanceof is false for null, so the searchable check is never reached with a null type
    final boolean isStringField = fieldType instanceof StringFieldType;
    return isStringField && fieldType.isSearchable();
}
/**
 * Generates term vectors for the selected fields that have none stored (or whose analyzer is
 * overridden in the request) and merges them with the term vectors already retrieved.
 *
 * @param indexShard         the shard holding the document
 * @param get                the engine get result for the document
 * @param termVectorsByField term vectors already read for the document; may be {@code null}
 * @param request            the term vectors request
 * @param selectedFields     candidate field names to generate term vectors for
 * @return {@code termVectorsByField} unchanged when no field qualifies; otherwise the generated
 *         term vectors, merged with {@code termVectorsByField} when that is non-null
 * @throws IOException if generating or merging the term vectors fails
 */
private static Fields addGeneratedTermVectors(
    IndexShard indexShard,
    Engine.GetResult get,
    Fields termVectorsByField,
    TermVectorsRequest request,
    Set<String> selectedFields
) throws IOException {
    /* only keep valid fields */
    Set<String> validFields = new HashSet<>();
    for (String field : selectedFields) {
        MappedFieldType fieldType = indexShard.mapperService().fieldType(field);
        if (isValidField(fieldType) == false) {
            continue;
        }
        // already retrieved, only if the analyzer hasn't been overridden at the field
        if (fieldType.getTextSearchInfo().termVectors() != TextSearchInfo.TermVector.NONE
            && (request.perFieldAnalyzer() == null || request.perFieldAnalyzer().containsKey(field) == false)) {
            continue;
        }
        validFields.add(field);
    }
    if (validFields.isEmpty()) {
        return termVectorsByField;
    }
    /* generate term vectors from fetched document fields */
    // The array is sized one larger than the set so the source field name fits in the last slot.
    String[] getFields = validFields.toArray(new String[validFields.size() + 1]);
    getFields[getFields.length - 1] = SourceFieldMapper.NAME;
    GetResult getResult = indexShard.getService().get(get, request.id(), request.type(), getFields, null);
    Fields generatedTermVectors = generateTermVectors(
        indexShard,
        getResult.sourceAsMap(),
        getResult.getFields().values(),
        request.offsets(),
        request.perFieldAnalyzer(),
        validFields
    );
    /* merge with existing Fields */
    if (termVectorsByField == null) {
        return generatedTermVectors;
    } else {
        return mergeFields(termVectorsByField, generatedTermVectors);
    }
}
/**
 * Resolves the analyzer to use for {@code field}.
 * <p>
 * When the request overrides the analyzer for this field, the override's analyzer name is looked
 * up among the index analyzers. Otherwise the field's configured index analyzer is used.
 *
 * @param indexShard       the shard whose mapper service provides the analyzers
 * @param field            the field name
 * @param perFieldAnalyzer optional map of field name to analyzer name overrides
 * @return the analyzer for the field
 * @throws IllegalArgumentException if there is no override and no analyzer is configured for the field
 */
private static Analyzer getAnalyzerAtField(IndexShard indexShard, String field, @Nullable Map<String, String> perFieldAnalyzer) {
    MapperService mapperService = indexShard.mapperService();
    if (perFieldAnalyzer != null && perFieldAnalyzer.containsKey(field)) {
        // NOTE(review): the result of this lookup is returned as-is; confirm how an unknown analyzer name is handled.
        return mapperService.getIndexAnalyzers().get(perFieldAnalyzer.get(field));
    } else {
        return mapperService.indexAnalyzer(
            field,
            f -> { throw new IllegalArgumentException("No analyzer configured for field " + f); }
        );
    }
}
/**
 * Collects the fields of {@code fieldsObject} whose analyzer is overridden in the request.
 *
 * @param perAnalyzerField map keyed by field name holding the per-field analyzer overrides
 * @param fieldsObject     the fields to scan; iterated by field name
 * @return a new mutable set with every field name of {@code fieldsObject} that is a key of
 *         {@code perAnalyzerField}; empty when none match
 */
private static Set<String> getFieldsToGenerate(Map<String, String> perAnalyzerField, Fields fieldsObject) {
    Set<String> selectedFields = new HashSet<>();
    for (String fieldName : fieldsObject) {
        if (perAnalyzerField.containsKey(fieldName)) {
            selectedFields.add(fieldName);
        }
    }
    return selectedFields;
}
private static Fields generateTermVectors(
IndexShard indexShard,
Map source,
Collection getFields,
boolean withOffsets,
@Nullable Map perFieldAnalyzer,
Set fields
) throws IOException {
Map> values = new HashMap<>();
for (DocumentField getField : getFields) {
String field = getField.getName();
if (fields.contains(field)) { // some fields are returned even when not asked for, eg. _timestamp
values.put(field, getField.getValues());
}
}
if (source != null) {
MappingLookup mappingLookup = indexShard.mapperService().mappingLookup();
SourceLookup sourceLookup = new SourceLookup();
sourceLookup.setSource(source);
for (String field : fields) {
if (values.containsKey(field) == false) {
SourceValueFetcher valueFetcher = SourceValueFetcher.toString(mappingLookup.sourcePaths(field));
List