All downloads are free. Search and download functionality uses the official Maven repository.

org.carrot2.clustering.SharedInfrastructure Maven / Gradle / Ivy

There is a newer version: 4.6.0
Show newest version
/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2021, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * https://www.carrot2.org/carrot2.LICENSE
 */
package org.carrot2.clustering;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.carrot2.attrs.AttrString;

public class SharedInfrastructure {
  public static AttrString queryHintAttribute() {
    return AttrString.builder().label("Query hint").defaultValue(null);
  }

  private static class ClusterData {
    final Cluster cluster;
    final double score;
    final String label;
    final int recursiveDocumentCount;

    public ClusterData(Cluster cluster, double score, int recursiveDocumentCount) {
      this.cluster = cluster;
      this.label = String.join(", ", cluster.getLabels());
      this.score = score;
      this.recursiveDocumentCount = recursiveDocumentCount;
    }
  }

  public static  List> reorderByWeightedScoreAndSize(
      List> clusters, double scoreWeight) {
    Comparator> comparator =
        Comparator.>comparingDouble(data -> data.score)
            .reversed()
            .thenComparing(Comparator.nullsFirst(Comparator.comparing(data -> data.label)));

    return clusters.stream()
        .map(
            cluster -> {
              int docCount = recursiveDocumentCount(cluster);
              double score =
                  Math.pow(docCount, 1d - scoreWeight) * Math.pow(cluster.getScore(), scoreWeight);
              return new ClusterData(cluster, score, docCount);
            })
        .sorted(comparator)
        .map(data -> data.cluster)
        .collect(Collectors.toList());
  }

  public static  List> reorderByDescendingSizeAndLabel(
      ArrayList> clusters) {
    Comparator> comparator =
        Comparator.>comparingInt(data -> data.recursiveDocumentCount)
            .reversed()
            .thenComparing(Comparator.nullsFirst(Comparator.comparing(data -> data.label)));

    return clusters.stream()
        .map(
            cluster -> {
              int docCount = recursiveDocumentCount(cluster);
              return new ClusterData(cluster, 0, docCount);
            })
        .sorted(comparator)
        .map(data -> data.cluster)
        .collect(Collectors.toList());
  }

  public static int recursiveDocumentCount(Cluster cluster) {
    Set visited = Collections.newSetFromMap(new IdentityHashMap<>());
    ArrayDeque> queue = new ArrayDeque<>();
    queue.add(cluster);

    while (!queue.isEmpty()) {
      Cluster c = queue.removeLast();
      visited.addAll(c.getDocuments());
      queue.addAll(cluster.getClusters());
    }

    return visited.size();
  }
}