org.tallison.quaerite.analysis.TopNTokens Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tallison.quaerite.analysis;


import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.tallison.quaerite.connectors.SearchClient;
import org.tallison.quaerite.connectors.SearchClientFactory;
import org.tallison.quaerite.core.stats.TokenDF;
import org.tallison.quaerite.core.stats.TokenDFTF;


public class TopNTokens {

    static Options OPTIONS = new Options();

    static {
        OPTIONS.addOption(
                Option.builder("s")
                        .hasArg()
                        .required()
                        .desc("search server").build()
        );
        OPTIONS.addOption(Option.builder("f")
                .longOpt("field")
                .hasArg(true)
                .required()
                .desc("field").build()
        );
        OPTIONS.addOption(Option.builder("n")
                .longOpt("topN")
                .hasArg(true)
                .required()
                .desc("'n' most frequent tokens to record").build()
        );
        OPTIONS.addOption(Option.builder("m")
                .longOpt("minDF")
                .hasArg(true)
                .required()
                .desc("minimum document frequency").build()
        );
        OPTIONS.addOption(Option.builder("o")
                .longOpt("output")
                .hasArg(true)
                .required()
                .desc("file to which to write results").build()
        );
    }

    public static void main(String[] args) throws Exception {
        CommandLine commandLine = null;

        try {
            commandLine = new DefaultParser().parse(OPTIONS, args);
        } catch (ParseException e) {
            HelpFormatter helpFormatter = new HelpFormatter();
            helpFormatter.printHelp(
                    "java -jar org.tallison.quaerite.analysis.CompareAnalyzers",
                    OPTIONS);
            return;
        }
        SearchClient client = SearchClientFactory.getClient(commandLine.getOptionValue("s"));
        String field = commandLine.getOptionValue("f");

        int n = Integer.parseInt(commandLine.getOptionValue("n"));

        String lower = "";
        int termSetSize = 100000;
        int minDF = Integer.parseInt(commandLine.getOptionValue("m"));
        long uniqueTerms = 0;
        long totalTermCount = 0;
        PriorityQueue queue = new PriorityQueue<>(n, new DFComparator());

        int scanned = 0;
        int maxScanned = -1;//100000;
        boolean hitMax = false;
        while (!hitMax) {
            List terms = client.getTerms(field, lower,
                    termSetSize, minDF, true);
            for (TokenDF tdf : terms) {
                add(tdf, queue, n);
                totalTermCount += ((TokenDFTF) tdf).getTf();
                uniqueTerms++;
                if (maxScanned > 0 && scanned++ > maxScanned) {
                    hitMax = true;
                    break;
                }
            }
            if (terms.size() == 0) {
                break;
            }

            lower = terms.get(terms.size() - 1).getToken();
        }

        //priority queue is sorted from head which is the "least common"
        //we have to flip it to get descending order.
        List tokens = new ArrayList<>();
        while (! queue.isEmpty()) {
            tokens.add(queue.remove());
        }
        if (commandLine.hasOption("o")) {
            writeResults(commandLine.getOptionValue("o"), uniqueTerms, totalTermCount, tokens);
        }
        System.out.println("unique terms: " + uniqueTerms);
        System.out.println("total tokens: " + totalTermCount);

        for (int i = tokens.size() - 1; i >= 0; i--) {
            TokenDF tdf = tokens.get(i);
            System.out.println(tdf.getToken().replaceAll("\t", " ")
                    + "\t" + tdf.getDf());
        }

    }

    private static void writeResults(String outputPath,
                                     long uniqueTerms, long totalTermCount,
                                     List tokens) throws IOException {
        try (BufferedWriter writer = Files.newBufferedWriter(
                Paths.get(outputPath), StandardCharsets.UTF_8)) {
            writer.write("unique terms: " + uniqueTerms);
            writer.newLine();
            writer.write("total tokens: " + totalTermCount);
            writer.newLine();
            for (int i = tokens.size() - 1; i >= 0; i--) {
                TokenDF tdf = tokens.get(i);
                writer.write(tdf.getToken().replaceAll("\t", " ")
                        + "\t" + tdf.getDf());
                writer.newLine();
            }
        }
    }

    private static void add(TokenDF tdf, PriorityQueue queue, int max) {

        if (queue.size() < max) {
            queue.add(tdf);
        } else if (queue.comparator().compare(tdf, queue.peek()) > 0) {

            queue.add(tdf);
            while (queue.size() > max) {
                TokenDF toRemove = queue.poll();
            }
        }
    }

    //sort by descending order of frequency and ascending order of term
    private static class DFComparator implements Comparator {

        @Override
        public int compare(TokenDF o1, TokenDF o2) {
            if (o1.getDf() == o2.getDf()) {
                return o2.getToken().compareTo(o1.getToken());
            } else {
                return (Long.compare(o1.getDf(), o2.getDf()));
            }
        }
    }
}