// com.yahoo.search.predicate.utils.TargetingQueryFileConverter (listing header from Maven / Gradle / Ivy artifact viewer)
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.search.predicate.utils;
import com.google.common.net.UrlEscapers;
import com.yahoo.search.predicate.PredicateQuery;
import com.yahoo.search.predicate.serialization.PredicateQuerySerializer;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
import static java.util.stream.Collectors.joining;
/**
* Converts a targeting query (the format provided by targeting team) into a file of Vespa queries formatted as URLs.
*
* The format is the following:
* - Each line represents one bulk query (upto 64 subqueries)
* - Each bulk query has a set of subqueries separated by ";"
* - Each subquery is of the format: attrName\tattrValue\tsubqueryIndex\tisRangeTerm;
* - Some attributes have no value.
* - Value may contain ";"
*
* @author bjorncs
*/
public class TargetingQueryFileConverter {
// Subqueries having more than this value are skipped.
private static final int MAX_NUMBER_OF_TERMS = 100;
private enum OutputFormat {JSON, YQL}
private TargetingQueryFileConverter() {}
public static void main(String[] args) throws IOException {
int nQueries = 123042;
int batchFactor = 64;
Subqueries subqueries = parseRiseQueries(new File("test-data/rise-query2.txt"), nQueries);
filterOutHugeSubqueries(subqueries);
List queries = batchSubqueries(subqueries, batchFactor);
writeSubqueriesToFile(
queries,
new File("test-data/targeting-queries-json-" + batchFactor + "b-" + nQueries + "n.txt"),
OutputFormat.JSON);
writeSubqueriesToFile(
queries,
new File("test-data/targeting-queries-yql-" + batchFactor + "b-" + nQueries + "n.txt"),
OutputFormat.YQL);
}
private static void writeSubqueriesToFile(List queries, File output, OutputFormat outputFormat)
throws IOException {
try (BufferedWriter writer = new BufferedWriter(new FileWriter(output))) {
if (outputFormat == OutputFormat.JSON) {
writeJSONOutput(writer, queries);
} else {
writeYQLOutput(writer, queries);
}
}
}
private static void writeJSONOutput(BufferedWriter writer, List queries) throws IOException {
PredicateQuerySerializer serializer = new PredicateQuerySerializer();
for (Query query : queries) {
PredicateQuery predicateQuery = toPredicateQuery(query);
String json = serializer.toJSON(predicateQuery);
writer.append(json).append('\n');
}
}
private static PredicateQuery toPredicateQuery(Query query) {
PredicateQuery predicateQuery = new PredicateQuery();
for (Map.Entry> e : query.valuesForSubqueries.entrySet()) {
e.getValue().forEach(f -> predicateQuery.addFeature(f.key, f.strValue, e.getKey()));
}
for (Map.Entry> e : query.rangesForSubqueries.entrySet()) {
e.getValue().forEach(f -> predicateQuery.addRangeFeature(f.key, f.longValue, e.getKey()));
}
return predicateQuery;
}
private static void writeYQLOutput(BufferedWriter writer, List queries) throws IOException {
for (Query query : queries) {
writer.append(toYqlString(query)).append('\n');
}
}
private static String toYqlString(Query query) {
StringBuilder yqlBuilder = new StringBuilder("select * from sources * where predicate(boolean, ");
yqlBuilder
.append(createYqlFormatSubqueryMapString(query.valuesForSubqueries, query.isSingleQuery))
.append(", ")
.append(createYqlFormatSubqueryMapString(query.rangesForSubqueries, query.isSingleQuery))
.append(");");
return "/search/?query&nocache&yql=" + UrlEscapers.urlFormParameterEscaper().escape(yqlBuilder.toString());
}
/*
* The subqueryBatchFactor determines the batch factor for each query. A maximum of 64 queries can be batched
* into a single query (as subqueries).
* 0 => Do not batch and output plain queries (no subquery).
* 1 => Do not batch, but output queries with single subquery.
*/
private static List batchSubqueries(Subqueries subqueries, int subqueryBatchFactor) {
Iterator iterator = subqueries.subqueries.iterator();
List result = new ArrayList<>();
while (iterator.hasNext()) {
// Aggregate the subqueries that contains a given value.
Map subqueriesForValue = new TreeMap<>();
Map subqueriesForRange = new TreeMap<>();
// Batch single to single subquery for batch factor 0.
for (int i = 0; i < Math.max(1, subqueryBatchFactor) && iterator.hasNext(); ++i) {
Integer subquery = iterator.next();
registerSubqueryValues(i, subqueries.valuesForSubquery.get(subquery), subqueriesForValue);
registerSubqueryValues(i, subqueries.rangesForSubquery.get(subquery), subqueriesForRange);
}
// Aggregate the values that are contained in a given set of subqueries.
Query query = new Query(subqueryBatchFactor == 0);
simplifyAndFillQueryValues(query.valuesForSubqueries, subqueriesForValue);
simplifyAndFillQueryValues(query.rangesForSubqueries, subqueriesForRange);
result.add(query);
}
return result;
}
private static void registerSubqueryValues(int subquery, Set values, Map subqueriesForValue) {
if (values != null) {
values.forEach(value -> subqueriesForValue.merge(value, 1L << subquery, (ids1, ids2) -> ids1 | ids2));
}
}
private static void simplifyAndFillQueryValues(Map> queryValues, Map subqueriesForValue) {
for (Map.Entry entry : subqueriesForValue.entrySet()) {
Feature feature = entry.getKey();
Long subqueryBitmap = entry.getValue();
Set featureSet = queryValues.computeIfAbsent(subqueryBitmap, (k) -> new HashSet<>());
featureSet.add(feature);
}
}
private static String createYqlFormatSubqueryMapString(Map> subqueriesForString, boolean isSingleQuery) {
return subqueriesForString.entrySet().stream()
.map(e -> {
Stream features = e.getValue().stream().map(Feature::asYqlString);
if (isSingleQuery) {
return features.collect(joining(", "));
} else {
// Note: Cannot use method reference as both method toString(int) and method toString() match.
String values = features.collect(joining(", ", "{", "}"));
return String.format("\"0x%s\":%s", Long.toHexString(e.getKey()), values);
}
})
.collect(joining(", ", "{", "}"));
}
private static Subqueries parseRiseQueries(File riseQueryFile, int maxQueries) throws IOException {
try (BufferedReader reader = new BufferedReader(new FileReader(riseQueryFile))) {
Subqueries parsedSubqueries = new Subqueries();
AtomicInteger counter = new AtomicInteger(1);
reader.lines()
.limit(maxQueries)
.forEach(riseQuery -> parseRiseQuery(parsedSubqueries, riseQuery, counter.getAndIncrement()));
return parsedSubqueries;
}
}
private static void filterOutHugeSubqueries(Subqueries subqueries) {
Iterator iterator = subqueries.subqueries.iterator();
while (iterator.hasNext()) {
Integer subquery = iterator.next();
Set values = subqueries.valuesForSubquery.get(subquery);
Set ranges = subqueries.rangesForSubquery.get(subquery);
int sizeValues = values == null ? 0 : values.size();
int sizeRanges = ranges == null ? 0 : ranges.size();
if (sizeValues + sizeRanges > MAX_NUMBER_OF_TERMS) {
iterator.remove();
subqueries.valuesForSubquery.remove(subquery);
subqueries.rangesForSubquery.remove(subquery);
}
}
}
private static void parseRiseQuery(Subqueries subqueries, String queryString, int queryId) {
StringTokenizer subQueryTokenizer = new StringTokenizer(queryString, "\t", true);
while (subQueryTokenizer.hasMoreTokens()) {
String key = subQueryTokenizer.nextToken("\t");
subQueryTokenizer.nextToken(); // Consume delimiter
String value = subQueryTokenizer.nextToken();
if (value.equals("\t")) {
value = "";
} else {
subQueryTokenizer.nextToken(); // Consume delimiter
}
int subQueryIndex = Integer.parseInt(subQueryTokenizer.nextToken());
subQueryTokenizer.nextToken(); // Consume delimiter
boolean isRangeTerm = Boolean.parseBoolean(subQueryTokenizer.nextToken(";"));
if (subQueryTokenizer.hasMoreTokens()) {
subQueryTokenizer.nextToken(); // Consume delimiter
}
int subqueryId = subQueryIndex + 64 * queryId;
if (isRangeTerm) {
Set rangeFeatures = subqueries.rangesForSubquery.computeIfAbsent(
subqueryId, (id) -> new HashSet<>());
rangeFeatures.add(new Feature(key, Long.parseLong(value)));
} else {
Set features = subqueries.valuesForSubquery.computeIfAbsent(subqueryId, (id) -> new HashSet<>());
features.add(new Feature(key, value));
}
subqueries.subqueries.add(subqueryId);
}
}
private static class Subqueries {
public final TreeSet subqueries = new TreeSet<>();
public final Map> valuesForSubquery = new HashMap<>();
public final Map> rangesForSubquery = new HashMap<>();
}
private static class Query {
public final boolean isSingleQuery;
public final Map> valuesForSubqueries = new TreeMap<>();
public final Map> rangesForSubqueries = new TreeMap<>();
public Query(boolean isSingleQuery) {
this.isSingleQuery = isSingleQuery;
}
}
private static class Feature implements Comparable {
public final String key;
private final String strValue;
private final long longValue;
public Feature(String key, String value) {
this.key = key;
this.strValue = value;
this.longValue = 0;
}
public Feature(String key, long value) {
this.key = key;
this.strValue = null;
this.longValue = value;
}
public String asYqlString() {
if (strValue != null) {
return String.format("\"%s\":\"%s\"", key, strValue);
} else {
return String.format("\"%s\":%dl", key, longValue);
}
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof Feature feature)) return false;
if (longValue != feature.longValue) return false;
if (!key.equals(feature.key)) return false;
return !(strValue != null ? !strValue.equals(feature.strValue) : feature.strValue != null);
}
@Override
public int hashCode() {
int result = key.hashCode();
result = 31 * result + (strValue != null ? strValue.hashCode() : 0);
result = 31 * result + (int) (longValue ^ (longValue >>> 32));
return result;
}
@Override
public int compareTo(Feature o) {
return asYqlString().compareTo(o.asYqlString());
}
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy (listing footer from artifact viewer)