/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.index.mapper.annotatedtext;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.opensearch.OpenSearchParseException;
import org.opensearch.index.analysis.AnalyzerScope;
import org.opensearch.index.analysis.IndexAnalyzers;
import org.opensearch.index.analysis.NamedAnalyzer;
import org.opensearch.index.mapper.FieldMapper;
import org.opensearch.index.mapper.MapperParsingException;
import org.opensearch.index.mapper.ParametrizedFieldMapper;
import org.opensearch.index.mapper.ParseContext;
import org.opensearch.index.mapper.TextFieldMapper;
import org.opensearch.index.mapper.TextParams;
import org.opensearch.index.mapper.TextSearchInfo;
import org.opensearch.index.similarity.SimilarityProvider;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/** A {@link FieldMapper} for full-text fields with annotation markup e.g.
*
* "New mayor is [John Smith](type=person&value=John%20Smith) "
*
* A special Analyzer wraps the default choice of analyzer in order
* to strip the text field of annotation markup and inject the related
* entity annotation tokens as supplementary tokens at the relevant points
* in the token stream.
* This code is largely a copy of TextFieldMapper which is less than ideal -
* my attempts to subclass TextFieldMapper failed but we can revisit this.
**/
public class AnnotatedTextFieldMapper extends ParametrizedFieldMapper {
// NOTE(review): presumably the mapping type name under which this mapper is registered — confirm at the registration site.
public static final String CONTENT_TYPE = "annotated_text";
// Sentinel used as the default of the "position_increment_gap" parameter; the parameter's validator
// explicitly exempts this value from its rejection of negative numbers.
// NOTE(review): the name suggests "fall back to the analyzer's own gap" — confirm where the value is consumed.
private static final int POSITION_INCREMENT_GAP_USE_ANALYZER = -1;
/**
 * Returns the {@link Builder} stored on an existing mapper instance.
 * <p>
 * Used by the parameter initializers in {@link Builder} to read a parameter's current value
 * from an already-built mapper.
 *
 * @param in the mapper to read from; it is cast to {@link AnnotatedTextFieldMapper} unconditionally
 * @return the value of that mapper's {@code builder} field
 * @throws ClassCastException if {@code in} is not an {@link AnnotatedTextFieldMapper}
 */
private static Builder builder(FieldMapper in) {
    final AnnotatedTextFieldMapper annotatedMapper = (AnnotatedTextFieldMapper) in;
    return annotatedMapper.builder;
}
public static class Builder extends ParametrizedFieldMapper.Builder {
// Mapping parameters of this builder. Each initializer lambda receives a mapper `m` and reads the
// corresponding parameter's current value from that mapper's Builder via builder(m).
// Store parameter, created by Parameter.storeParam with `false` as its second argument
// (NOTE(review): presumably the default value — confirm against Parameter.storeParam).
private final Parameter store = Parameter.storeParam(m -> builder(m).store.getValue(), false);
// Analyzer parameters. Declared final without an initializer, so it must be assigned during construction.
final TextParams.Analyzers analyzers;
// The following parameters are produced by the shared TextParams factory methods.
final Parameter similarity = TextParams.similarity(m -> builder(m).similarity.getValue());
final Parameter indexOptions = TextParams.indexOptions(m -> builder(m).indexOptions.getValue());
// Norms parameter; `true` is passed as the first argument (NOTE(review): presumably the default — confirm against TextParams.norms).
final Parameter norms = TextParams.norms(true, m -> builder(m).norms.getValue());
final Parameter termVectors = TextParams.termVectors(m -> builder(m).termVectors.getValue());
/**
 * The {@code position_increment_gap} mapping parameter.
 * <p>
 * Defaults to {@code POSITION_INCREMENT_GAP_USE_ANALYZER} ({@code -1}). The validator accepts
 * that sentinel and any value {@code >= 0}; every other negative value is rejected with a
 * {@link MapperParsingException}.
 * <p>
 * Declared as {@code Parameter<Integer>} rather than the raw type so that the validator lambda's
 * numeric comparison is type-checked.
 */
final Parameter<Integer> positionIncrementGap = Parameter.intParam(
    "position_increment_gap",
    false,
    m -> builder(m).positionIncrementGap.getValue(),
    POSITION_INCREMENT_GAP_USE_ANALYZER
).setValidator(v -> {
    // The sentinel is the only negative value allowed through; zero is accepted as well.
    if (v != POSITION_INCREMENT_GAP_USE_ANALYZER && v < 0) {
        // The message names the parameter exactly as it is registered above, so the user can find the setting.
        throw new MapperParsingException("[position_increment_gap] must be positive, got [" + v + "]");
    }
});
private final Parameter boost = Parameter.boostParam();
private final Parameter