All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opensearch.search.aggregations.bucket.terms.SignificantTextAggregationBuilder Maven / Gradle / Ivy

/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

package org.opensearch.search.aggregations.bucket.terms;

import org.opensearch.common.ParseField;
import org.opensearch.common.io.stream.StreamInput;
import org.opensearch.common.io.stream.StreamOutput;
import org.opensearch.common.xcontent.ObjectParser;
import org.opensearch.common.xcontent.XContentBuilder;
import org.opensearch.common.xcontent.XContentParser;
import org.opensearch.index.query.AbstractQueryBuilder;
import org.opensearch.index.query.QueryBuilder;
import org.opensearch.index.query.QueryShardContext;
import org.opensearch.search.aggregations.AbstractAggregationBuilder;
import org.opensearch.search.aggregations.AggregationBuilder;
import org.opensearch.search.aggregations.AggregationInitializationException;
import org.opensearch.search.aggregations.AggregatorFactories.Builder;
import org.opensearch.search.aggregations.AggregatorFactory;
import org.opensearch.search.aggregations.bucket.terms.TermsAggregator.BucketCountThresholds;
import org.opensearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;

public class SignificantTextAggregationBuilder extends AbstractAggregationBuilder {
    public static final String NAME = "significant_text";

    static final ParseField FIELD_NAME = new ParseField("field");
    static final ParseField SOURCE_FIELDS_NAME = new ParseField("source_fields");
    static final ParseField FILTER_DUPLICATE_TEXT_FIELD_NAME = new ParseField("filter_duplicate_text");

    static final TermsAggregator.BucketCountThresholds DEFAULT_BUCKET_COUNT_THRESHOLDS =
        SignificantTermsAggregationBuilder.DEFAULT_BUCKET_COUNT_THRESHOLDS;
    static final SignificanceHeuristic DEFAULT_SIGNIFICANCE_HEURISTIC = SignificantTermsAggregationBuilder.DEFAULT_SIGNIFICANCE_HEURISTIC;

    private String fieldName = null;
    private String[] sourceFieldNames = null;
    private boolean filterDuplicateText = false;
    private IncludeExclude includeExclude = null;
    private QueryBuilder filterBuilder = null;
    private TermsAggregator.BucketCountThresholds bucketCountThresholds = new BucketCountThresholds(DEFAULT_BUCKET_COUNT_THRESHOLDS);
    private SignificanceHeuristic significanceHeuristic = DEFAULT_SIGNIFICANCE_HEURISTIC;

    private static final ObjectParser PARSER = new ObjectParser<>(
        SignificantTextAggregationBuilder.NAME,
        SignificanceHeuristic.class,
        SignificantTextAggregationBuilder::significanceHeuristic,
        null
    );
    static {
        PARSER.declareInt(SignificantTextAggregationBuilder::shardSize, TermsAggregationBuilder.SHARD_SIZE_FIELD_NAME);

        PARSER.declareLong(SignificantTextAggregationBuilder::minDocCount, TermsAggregationBuilder.MIN_DOC_COUNT_FIELD_NAME);

        PARSER.declareLong(SignificantTextAggregationBuilder::shardMinDocCount, TermsAggregationBuilder.SHARD_MIN_DOC_COUNT_FIELD_NAME);

        PARSER.declareInt(SignificantTextAggregationBuilder::size, TermsAggregationBuilder.REQUIRED_SIZE_FIELD_NAME);

        PARSER.declareString(SignificantTextAggregationBuilder::fieldName, FIELD_NAME);

        PARSER.declareStringArray(SignificantTextAggregationBuilder::sourceFieldNames, SOURCE_FIELDS_NAME);

        PARSER.declareBoolean(SignificantTextAggregationBuilder::filterDuplicateText, FILTER_DUPLICATE_TEXT_FIELD_NAME);

        PARSER.declareObject(
            SignificantTextAggregationBuilder::backgroundFilter,
            (p, context) -> AbstractQueryBuilder.parseInnerQueryBuilder(p),
            SignificantTermsAggregationBuilder.BACKGROUND_FILTER
        );

        PARSER.declareField(
            (b, v) -> b.includeExclude(IncludeExclude.merge(v, b.includeExclude())),
            IncludeExclude::parseInclude,
            IncludeExclude.INCLUDE_FIELD,
            ObjectParser.ValueType.OBJECT_ARRAY_OR_STRING
        );

        PARSER.declareField(
            (b, v) -> b.includeExclude(IncludeExclude.merge(b.includeExclude(), v)),
            IncludeExclude::parseExclude,
            IncludeExclude.EXCLUDE_FIELD,
            ObjectParser.ValueType.STRING_ARRAY
        );
    }

    public static SignificantTextAggregationBuilder parse(String aggregationName, XContentParser parser) throws IOException {
        return PARSER.parse(parser, new SignificantTextAggregationBuilder(aggregationName, null), null);
    }

    protected SignificantTextAggregationBuilder(
        SignificantTextAggregationBuilder clone,
        Builder factoriesBuilder,
        Map metadata
    ) {
        super(clone, factoriesBuilder, metadata);
        this.bucketCountThresholds = new BucketCountThresholds(clone.bucketCountThresholds);
        this.fieldName = clone.fieldName;
        this.filterBuilder = clone.filterBuilder;
        this.filterDuplicateText = clone.filterDuplicateText;
        this.includeExclude = clone.includeExclude;
        this.significanceHeuristic = clone.significanceHeuristic;
        this.sourceFieldNames = clone.sourceFieldNames;
    }

    @Override
    protected AggregationBuilder shallowCopy(Builder factoriesBuilder, Map metadata) {
        return new SignificantTextAggregationBuilder(this, factoriesBuilder, metadata);
    }

    protected TermsAggregator.BucketCountThresholds getBucketCountThresholds() {
        return new TermsAggregator.BucketCountThresholds(bucketCountThresholds);
    }

    public TermsAggregator.BucketCountThresholds bucketCountThresholds() {
        return bucketCountThresholds;
    }

    @Override
    public SignificantTextAggregationBuilder subAggregations(Builder subFactories) {
        throw new AggregationInitializationException(
            "Aggregator [" + name + "] of type [" + getType() + "] cannot accept sub-aggregations"
        );
    }

    @Override
    public SignificantTextAggregationBuilder subAggregation(AggregationBuilder aggregation) {
        throw new AggregationInitializationException(
            "Aggregator [" + name + "] of type [" + getType() + "] cannot accept sub-aggregations"
        );
    }

    public SignificantTextAggregationBuilder bucketCountThresholds(TermsAggregator.BucketCountThresholds bucketCountThresholds) {
        if (bucketCountThresholds == null) {
            throw new IllegalArgumentException("[bucketCountThresholds] must not be null: [" + name + "]");
        }
        this.bucketCountThresholds = bucketCountThresholds;
        return this;
    }

    /**
     * Sets the size - indicating how many term buckets should be returned
     * (defaults to 10)
     */
    public SignificantTextAggregationBuilder size(int size) {
        if (size <= 0) {
            throw new IllegalArgumentException("[size] must be greater than 0. Found [" + size + "] in [" + name + "]");
        }
        bucketCountThresholds.setRequiredSize(size);
        return this;
    }

    /**
     * Sets the shard_size - indicating the number of term buckets each shard
     * will return to the coordinating node (the node that coordinates the
     * search execution). The higher the shard size is, the more accurate the
     * results are.
     */
    public SignificantTextAggregationBuilder shardSize(int shardSize) {
        if (shardSize <= 0) {
            throw new IllegalArgumentException("[shardSize] must be greater than  0. Found [" + shardSize + "] in [" + name + "]");
        }
        bucketCountThresholds.setShardSize(shardSize);
        return this;
    }

    /**
     * Sets the name of the text field that will be the subject of this
     * aggregation.
     */
    public SignificantTextAggregationBuilder fieldName(String fieldName) {
        this.fieldName = fieldName;
        return this;
    }

    /**
     * Selects the fields to load from _source JSON and analyze.
     * If none are specified, the indexed "fieldName" value is assumed
     * to also be the name of the JSON field holding the value
     */
    public SignificantTextAggregationBuilder sourceFieldNames(List names) {
        this.sourceFieldNames = names.toArray(new String[names.size()]);
        return this;
    }

    /**
     * Control if duplicate paragraphs of text should try be filtered from the
     * statistical text analysis. Can improve results but slows down analysis.
     * Default is false.
     */
    public SignificantTextAggregationBuilder filterDuplicateText(boolean filterDuplicateText) {
        this.filterDuplicateText = filterDuplicateText;
        return this;
    }

    /**
     * Set the minimum document count terms should have in order to appear in
     * the response.
     */
    public SignificantTextAggregationBuilder minDocCount(long minDocCount) {
        if (minDocCount < 0) {
            throw new IllegalArgumentException(
                "[minDocCount] must be greater than or equal to 0. Found [" + minDocCount + "] in [" + name + "]"
            );
        }
        bucketCountThresholds.setMinDocCount(minDocCount);
        return this;
    }

    /**
     * Set the minimum document count terms should have on the shard in order to
     * appear in the response.
     */
    public SignificantTextAggregationBuilder shardMinDocCount(long shardMinDocCount) {
        if (shardMinDocCount < 0) {
            throw new IllegalArgumentException(
                "[shardMinDocCount] must be greater than or equal to 0. Found [" + shardMinDocCount + "] in [" + name + "]"
            );
        }
        bucketCountThresholds.setShardMinDocCount(shardMinDocCount);
        return this;
    }

    public SignificantTextAggregationBuilder backgroundFilter(QueryBuilder backgroundFilter) {
        if (backgroundFilter == null) {
            throw new IllegalArgumentException("[backgroundFilter] must not be null: [" + name + "]");
        }
        this.filterBuilder = backgroundFilter;
        return this;
    }

    public QueryBuilder backgroundFilter() {
        return filterBuilder;
    }

    /**
     * Set terms to include and exclude from the aggregation results
     */
    public SignificantTextAggregationBuilder includeExclude(IncludeExclude includeExclude) {
        this.includeExclude = includeExclude;
        return this;
    }

    /**
     * Get terms to include and exclude from the aggregation results
     */
    public IncludeExclude includeExclude() {
        return includeExclude;
    }

    public SignificantTextAggregationBuilder significanceHeuristic(SignificanceHeuristic significanceHeuristic) {
        if (significanceHeuristic == null) {
            throw new IllegalArgumentException("[significanceHeuristic] must not be null: [" + name + "]");
        }
        this.significanceHeuristic = significanceHeuristic;
        return this;
    }

    public SignificanceHeuristic significanceHeuristic() {
        return significanceHeuristic;
    }

    /**
     * @param name
     *            the name of this aggregation
     * @param fieldName
     *            the name of the text field that will be the subject of this
     *            aggregation
     *
     */
    public SignificantTextAggregationBuilder(String name, String fieldName) {
        super(name);
        this.fieldName = fieldName;
    }

    /**
     * Read from a stream.
     */
    public SignificantTextAggregationBuilder(StreamInput in) throws IOException {
        super(in);
        fieldName = in.readString();
        filterDuplicateText = in.readBoolean();
        bucketCountThresholds = new BucketCountThresholds(in);
        filterBuilder = in.readOptionalNamedWriteable(QueryBuilder.class);
        includeExclude = in.readOptionalWriteable(IncludeExclude::new);
        significanceHeuristic = in.readNamedWriteable(SignificanceHeuristic.class);
        sourceFieldNames = in.readOptionalStringArray();
    }

    @Override
    protected void doWriteTo(StreamOutput out) throws IOException {
        out.writeString(fieldName);
        out.writeBoolean(filterDuplicateText);
        bucketCountThresholds.writeTo(out);
        out.writeOptionalNamedWriteable(filterBuilder);
        out.writeOptionalWriteable(includeExclude);
        out.writeNamedWriteable(significanceHeuristic);
        out.writeOptionalStringArray(sourceFieldNames);
    }

    @Override
    public BucketCardinality bucketCardinality() {
        return BucketCardinality.MANY;
    }

    @Override
    protected AggregatorFactory doBuild(QueryShardContext queryShardContext, AggregatorFactory parent, Builder subFactoriesBuilder)
        throws IOException {
        SignificanceHeuristic executionHeuristic = this.significanceHeuristic.rewrite(queryShardContext);

        return new SignificantTextAggregatorFactory(
            name,
            includeExclude,
            filterBuilder,
            bucketCountThresholds,
            executionHeuristic,
            queryShardContext,
            parent,
            subFactoriesBuilder,
            fieldName,
            sourceFieldNames,
            filterDuplicateText,
            metadata
        );
    }

    @Override
    protected XContentBuilder internalXContent(XContentBuilder builder, Params params) throws IOException {
        builder.startObject();
        bucketCountThresholds.toXContent(builder, params);
        if (fieldName != null) {
            builder.field(FIELD_NAME.getPreferredName(), fieldName);
        }
        if (sourceFieldNames != null) {
            builder.array(SOURCE_FIELDS_NAME.getPreferredName(), sourceFieldNames);
        }

        if (filterDuplicateText) {
            builder.field(FILTER_DUPLICATE_TEXT_FIELD_NAME.getPreferredName(), filterDuplicateText);
        }
        if (filterBuilder != null) {
            builder.field(SignificantTermsAggregationBuilder.BACKGROUND_FILTER.getPreferredName(), filterBuilder);
        }
        if (includeExclude != null) {
            includeExclude.toXContent(builder, params);
        }
        significanceHeuristic.toXContent(builder, params);

        builder.endObject();
        return builder;
    }

    @Override
    public int hashCode() {
        return Objects.hash(
            super.hashCode(),
            bucketCountThresholds,
            fieldName,
            filterDuplicateText,
            filterBuilder,
            includeExclude,
            significanceHeuristic,
            Arrays.hashCode(sourceFieldNames)
        );
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) return true;
        if (obj == null || getClass() != obj.getClass()) return false;
        if (super.equals(obj) == false) return false;
        SignificantTextAggregationBuilder other = (SignificantTextAggregationBuilder) obj;
        return Objects.equals(bucketCountThresholds, other.bucketCountThresholds)
            && Objects.equals(fieldName, other.fieldName)
            && Arrays.equals(sourceFieldNames, other.sourceFieldNames)
            && filterDuplicateText == other.filterDuplicateText
            && Objects.equals(filterBuilder, other.filterBuilder)
            && Objects.equals(includeExclude, other.includeExclude)
            && Objects.equals(significanceHeuristic, other.significanceHeuristic);
    }

    @Override
    public String getType() {
        return NAME;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy