/*
 * All Downloads are FREE. Search and download functionality uses the official Maven repository.
 *
 * com.yahoo.search.predicate.utils.TargetingQueryFileConverter Maven / Gradle / Ivy
 *
 * There is a newer version: 8.458.13
 * Show newest version
 */
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.search.predicate.utils;

import com.google.common.net.UrlEscapers;
import com.yahoo.search.predicate.PredicateQuery;
import com.yahoo.search.predicate.serialization.PredicateQuerySerializer;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;

import static java.util.stream.Collectors.joining;

/**
 * Converts a targeting query (the format provided by targeting team) into a file of Vespa queries formatted as URLs.
 *
 * The format is the following:
 * - Each line represents one bulk query (upto 64 subqueries)
 * - Each bulk query has a set of subqueries separated by ";"
 * - Each subquery is of the format: attrName\tattrValue\tsubqueryIndex\tisRangeTerm;
 * - Some attributes have no value.
 * - Value may contain ";"
 *
 * @author bjorncs
 */
public class TargetingQueryFileConverter {

    // Subqueries having more than this value are skipped.
    private static final int MAX_NUMBER_OF_TERMS = 100;

    private enum OutputFormat {JSON, YQL}

    private TargetingQueryFileConverter() {}

    public static void main(String[] args) throws IOException {
        int nQueries = 123042;
        int batchFactor = 64;
        Subqueries subqueries = parseRiseQueries(new File("test-data/rise-query2.txt"), nQueries);
        filterOutHugeSubqueries(subqueries);
        List queries = batchSubqueries(subqueries, batchFactor);
        writeSubqueriesToFile(
                queries,
                new File("test-data/targeting-queries-json-" + batchFactor + "b-" + nQueries + "n.txt"),
                OutputFormat.JSON);
        writeSubqueriesToFile(
                queries,
                new File("test-data/targeting-queries-yql-" + batchFactor + "b-" + nQueries + "n.txt"),
                OutputFormat.YQL);
    }


    private static void writeSubqueriesToFile(List queries, File output, OutputFormat outputFormat)
            throws IOException {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(output))) {
            if (outputFormat == OutputFormat.JSON) {
                writeJSONOutput(writer, queries);
            } else {
                writeYQLOutput(writer, queries);
            }

        }
    }

    private static void writeJSONOutput(BufferedWriter writer, List queries) throws IOException {
        PredicateQuerySerializer serializer = new PredicateQuerySerializer();
        for (Query query : queries) {
            PredicateQuery predicateQuery = toPredicateQuery(query);
            String json = serializer.toJSON(predicateQuery);
            writer.append(json).append('\n');
        }
    }

    private static PredicateQuery toPredicateQuery(Query query) {
        PredicateQuery predicateQuery = new PredicateQuery();
        for (Map.Entry> e : query.valuesForSubqueries.entrySet()) {
            e.getValue().forEach(f -> predicateQuery.addFeature(f.key, f.strValue, e.getKey()));
        }
        for (Map.Entry> e : query.rangesForSubqueries.entrySet()) {
            e.getValue().forEach(f -> predicateQuery.addRangeFeature(f.key, f.longValue, e.getKey()));
        }
        return predicateQuery;
    }

    private static void writeYQLOutput(BufferedWriter writer, List queries) throws IOException {
        for (Query query : queries) {
            writer.append(toYqlString(query)).append('\n');
        }
    }

    private static String toYqlString(Query query)  {
        StringBuilder yqlBuilder = new StringBuilder("select * from sources * where predicate(boolean, ");
        yqlBuilder
                .append(createYqlFormatSubqueryMapString(query.valuesForSubqueries, query.isSingleQuery))
                .append(", ")
                .append(createYqlFormatSubqueryMapString(query.rangesForSubqueries, query.isSingleQuery))
                .append(");");
        return "/search/?query&nocache&yql=" + UrlEscapers.urlFormParameterEscaper().escape(yqlBuilder.toString());
    }

    /*
     * The subqueryBatchFactor determines the batch factor for each query. A maximum of 64 queries can be batched
     * into a single query (as subqueries).
     *      0 => Do not batch and output plain queries (no subquery).
     *      1 => Do not batch, but output queries with single subquery.
     */
    private static List batchSubqueries(Subqueries subqueries, int subqueryBatchFactor) {
        Iterator iterator = subqueries.subqueries.iterator();
        List result = new ArrayList<>();
        while (iterator.hasNext()) {
            // Aggregate the subqueries that contains a given value.
            Map subqueriesForValue = new TreeMap<>();
            Map subqueriesForRange = new TreeMap<>();
            // Batch single to single subquery for batch factor 0.
            for (int i = 0; i < Math.max(1, subqueryBatchFactor) && iterator.hasNext(); ++i) {
                Integer subquery = iterator.next();
                registerSubqueryValues(i, subqueries.valuesForSubquery.get(subquery), subqueriesForValue);
                registerSubqueryValues(i, subqueries.rangesForSubquery.get(subquery), subqueriesForRange);
            }

            // Aggregate the values that are contained in a given set of subqueries.
            Query query = new Query(subqueryBatchFactor == 0);
            simplifyAndFillQueryValues(query.valuesForSubqueries, subqueriesForValue);
            simplifyAndFillQueryValues(query.rangesForSubqueries, subqueriesForRange);
            result.add(query);
        }
        return result;
    }

    private static void registerSubqueryValues(int subquery, Set values, Map subqueriesForValue) {
        if (values != null) {
            values.forEach(value -> subqueriesForValue.merge(value, 1L << subquery, (ids1, ids2) -> ids1 | ids2));
        }
    }

    private static void simplifyAndFillQueryValues(Map> queryValues, Map subqueriesForValue) {
        for (Map.Entry entry : subqueriesForValue.entrySet()) {
            Feature feature = entry.getKey();
            Long subqueryBitmap = entry.getValue();
            Set featureSet = queryValues.computeIfAbsent(subqueryBitmap, (k) -> new HashSet<>());
            featureSet.add(feature);
        }
    }

    private static String createYqlFormatSubqueryMapString(Map> subqueriesForString, boolean isSingleQuery) {
        return subqueriesForString.entrySet().stream()
                .map(e -> {
                    Stream features = e.getValue().stream().map(Feature::asYqlString);
                    if (isSingleQuery) {
                        return features.collect(joining(", "));
                    } else {
                        // Note: Cannot use method reference as both method toString(int) and method toString() match.
                        String values = features.collect(joining(", ", "{", "}"));
                        return String.format("\"0x%s\":%s", Long.toHexString(e.getKey()), values);
                    }
                })
                .collect(joining(", ", "{", "}"));
    }

    private static Subqueries parseRiseQueries(File riseQueryFile, int maxQueries) throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(riseQueryFile))) {
            Subqueries parsedSubqueries = new Subqueries();
            AtomicInteger counter = new AtomicInteger(1);
            reader.lines()
                    .limit(maxQueries)
                    .forEach(riseQuery -> parseRiseQuery(parsedSubqueries, riseQuery, counter.getAndIncrement()));
            return parsedSubqueries;
        }
    }

    private static void filterOutHugeSubqueries(Subqueries subqueries) {
        Iterator iterator = subqueries.subqueries.iterator();
        while (iterator.hasNext()) {
            Integer subquery = iterator.next();
            Set values = subqueries.valuesForSubquery.get(subquery);
            Set ranges = subqueries.rangesForSubquery.get(subquery);
            int sizeValues = values == null ? 0 : values.size();
            int sizeRanges = ranges == null ? 0 : ranges.size();
            if (sizeValues + sizeRanges > MAX_NUMBER_OF_TERMS) {
                iterator.remove();
                subqueries.valuesForSubquery.remove(subquery);
                subqueries.rangesForSubquery.remove(subquery);
            }
        }
    }

    private static void parseRiseQuery(Subqueries subqueries, String queryString, int queryId) {
        StringTokenizer subQueryTokenizer = new StringTokenizer(queryString, "\t", true);
        while (subQueryTokenizer.hasMoreTokens()) {
            String key = subQueryTokenizer.nextToken("\t");
            subQueryTokenizer.nextToken();  // Consume delimiter
            String value = subQueryTokenizer.nextToken();
            if (value.equals("\t")) {
                value = "";
            } else {
                subQueryTokenizer.nextToken();  // Consume delimiter
            }
            int subQueryIndex = Integer.parseInt(subQueryTokenizer.nextToken());
            subQueryTokenizer.nextToken();  // Consume delimiter
            boolean isRangeTerm = Boolean.parseBoolean(subQueryTokenizer.nextToken(";"));
            if (subQueryTokenizer.hasMoreTokens()) {
                subQueryTokenizer.nextToken();  // Consume delimiter
            }
            int subqueryId = subQueryIndex + 64 * queryId;
            if (isRangeTerm) {
                Set rangeFeatures = subqueries.rangesForSubquery.computeIfAbsent(
                        subqueryId, (id) -> new HashSet<>());
                rangeFeatures.add(new Feature(key, Long.parseLong(value)));
            } else {
                Set features = subqueries.valuesForSubquery.computeIfAbsent(subqueryId, (id) -> new HashSet<>());
                features.add(new Feature(key, value));
            }
            subqueries.subqueries.add(subqueryId);
        }
    }

    private static class Subqueries {
        public final TreeSet subqueries = new TreeSet<>();
        public final Map> valuesForSubquery = new HashMap<>();
        public final Map> rangesForSubquery = new HashMap<>();
    }

    private static class Query {
        public final boolean isSingleQuery;
        public final Map> valuesForSubqueries = new TreeMap<>();
        public final Map> rangesForSubqueries = new TreeMap<>();

        public Query(boolean isSingleQuery) {
            this.isSingleQuery = isSingleQuery;
        }
    }

    private static class Feature implements Comparable {
        public final String key;
        private final String strValue;
        private final long longValue;

        public Feature(String key, String value) {
            this.key = key;
            this.strValue = value;
            this.longValue = 0;
        }

        public Feature(String key, long value) {
            this.key = key;
            this.strValue = null;
            this.longValue = value;
        }

        public String asYqlString() {
            if (strValue != null) {
                return String.format("\"%s\":\"%s\"", key, strValue);
            } else {
                return String.format("\"%s\":%dl", key, longValue);
            }
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (!(o instanceof Feature feature)) return false;

            if (longValue != feature.longValue) return false;
            if (!key.equals(feature.key)) return false;
            return !(strValue != null ? !strValue.equals(feature.strValue) : feature.strValue != null);
        }

        @Override
        public int hashCode() {
            int result = key.hashCode();
            result = 31 * result + (strValue != null ? strValue.hashCode() : 0);
            result = 31 * result + (int) (longValue ^ (longValue >>> 32));
            return result;
        }

        @Override
        public int compareTo(Feature o) {
            return asYqlString().compareTo(o.asYqlString());
        }
    }

}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy