All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.search.predicate.benchmarks.HitsVerificationBenchmark Maven / Gradle / Ivy

There is a newer version: 8.458.13
Show newest version
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.search.predicate.benchmarks;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.yahoo.search.predicate.Config;
import com.yahoo.search.predicate.Hit;
import com.yahoo.search.predicate.PredicateIndex;
import com.yahoo.search.predicate.PredicateIndexBuilder;
import com.yahoo.search.predicate.PredicateQuery;
import com.yahoo.search.predicate.serialization.PredicateQuerySerializer;
import com.yahoo.search.predicate.utils.VespaFeedParser;
import com.yahoo.search.predicate.utils.VespaQueryParser;
import io.airlift.airline.Arguments;
import io.airlift.airline.Command;
import io.airlift.airline.HelpOption;
import io.airlift.airline.Option;
import io.airlift.airline.SingleCommand;

import javax.inject.Inject;
import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;

import static com.yahoo.search.predicate.benchmarks.HitsVerificationBenchmark.BenchmarkArguments.*;
import static java.util.stream.Collectors.joining;

/**
 * A test that runs outputs the hits for each query into result file.
 *
 * @author bjorncs
 */
public class HitsVerificationBenchmark {

    public static void main(String[] rawArgs) throws IOException {
        Optional wrappedArgs = getArguments(rawArgs);
        if (wrappedArgs.isEmpty()) return;

        BenchmarkArguments args = wrappedArgs.get();
        Map output = new TreeMap<>();
        addArgsToOutput(output, args);

        Config config = new Config.Builder()
                .setArity(args.arity)
                .setUseConjunctionAlgorithm(args.algorithm == Algorithm.CONJUNCTION)
                .build();

        PredicateIndex index = getIndex(args, config, output);

        Stream queries = parseQueries(args.format, args.queryFile);
        int totalHits = runQueries(index, queries, args.outputFile);
        output.put("Total hits", totalHits);
        writeOutputToStandardOut(output);
    }

    static PredicateIndex getIndex(BenchmarkArguments args, Config config, Map output) throws IOException {
        if (args.feedFile != null) {
            PredicateIndexBuilder builder = new PredicateIndexBuilder(config);
            AtomicInteger idCounter = new AtomicInteger();
            VespaFeedParser.parseDocuments(
                    args.feedFile, Integer.MAX_VALUE, p -> builder.indexDocument(idCounter.incrementAndGet(), p));
            builder.getStats().putValues(output);
            return builder.build();
        } else {
            try (DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(args.indexFile)))) {
                long start = System.currentTimeMillis();
                PredicateIndex index = PredicateIndex.fromInputStream(in);
                output.put("Time deserialize index", System.currentTimeMillis() - start);
                return index;
            }
        }
    }

    private static int runQueries(
            PredicateIndex index, Stream queries, String outputFile) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile, false))) {
            AtomicInteger i = new AtomicInteger();
            PredicateIndex.Searcher searcher = index.searcher();
            return queries.map(searcher::search)
                    .peek(hits -> {if (i.get() % 500 == 0) {index.rebuildPostingListCache();}})
                    .mapToInt(hits -> writeHits(i.getAndIncrement(), hits, writer))
                    .sum();

        }
    }

    private static Stream parseQueries(Format format, String queryFile)
            throws IOException {
        PredicateQuerySerializer serializer = new PredicateQuerySerializer();
        return Files.lines(Paths.get(queryFile))
                .map(line ->
                        format == Format.JSON
                                ? serializer.fromJSON(line)
                                : VespaQueryParser.parseQueryFromQueryProperties(line));

    }

    private static int writeHits(int i, Stream hitStream, BufferedWriter writer) {
        try {
            List hits = hitStream.toList();
            writer.append(Integer.toString(i))
                    .append(": ")
                    .append(hits.stream()
                            .map(hit -> String.format("(%d, 0x%x)", hit.getDocId(), hit.getSubquery()))
                            .collect(joining(", ", "[", "]")))
                    .append("\n\n");
            return hits.size();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private static Optional getArguments(String[] rawArgs) {
        BenchmarkArguments args = SingleCommand.singleCommand(BenchmarkArguments.class).parse(rawArgs);
        if (args.helpOption.showHelpIfRequested()) {
            return Optional.empty();
        }
        if (args.feedFile == null && args.indexFile == null) {
            System.err.println("Provide either a feed file or index file.");
            return Optional.empty();
        }
        return Optional.of(args);

    }

    private static void addArgsToOutput(Map output, BenchmarkArguments args) {
        output.put("Arity", args.arity);
        output.put("Algorithm", args.algorithm);
        output.put("Query format", args.format);
        output.put("Feed file", args.feedFile);
        output.put("Query file", args.queryFile);
        output.put("Output file", args.outputFile);
        output.put("Index file", args.indexFile);
    }

    private static void writeOutputToStandardOut(Map output) {
        try {
            ObjectMapper objectMapper = new ObjectMapper();
            objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
            objectMapper.writeValue(System.out, output);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Command(name = "hits-verifier",
            description = "Java predicate search system test that outputs the returned hits for each query")
    public static class BenchmarkArguments {

        public enum Format{JSON, VESPA}
        public enum Algorithm{CONJUNCTION, INTERVALONLY}

        @Option(name = {"-a", "--arity"}, description = "Arity")
        public int arity = 2;

        @Option(name = {"-al", "--algorithm"}, description = "Algorithm (CONJUNCTION or INTERVALONLY)")
        public Algorithm algorithm = Algorithm.INTERVALONLY;

        @Option(name = {"-qf", "--query-format"}, description =
                "Query format. Valid formats are either 'vespa' (obsolete query property format) or 'json'.")
        public Format format = Format.VESPA;

        @Option(name = {"-ff", "--feed-file"}, description = "File path to feed file (Vespa Json feed)")
        public String feedFile;

        @Option(name = {"-if", "--index-file"}, description = "File path to index file (Serialized index)")
        public String indexFile;

        @Option(name = {"-quf", "--query-file"}, description = "File path to query file")
        public String queryFile;

        @Arguments(title = "Output file", description = "File path to output file")
        public String outputFile;

        @Inject
        public HelpOption helpOption;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy