All downloads are free. The search and download functionality uses the official Maven repository.

org.carrot2.text.util.CharArrayComparators Maven / Gradle / Ivy

Go to download

Carrot2 search results clustering framework. Minimal functional subset (core algorithms and infrastructure, no document sources).

There is a newer version: 3.16.3
Show newest version

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.util;

import java.util.Comparator;

/**
 * A set of specific comparators for <code>char[]</code> arrays.
 * <p>
 * All comparators in this class accept <code>null</code> arguments and order a
 * <code>null</code> array after any non-<code>null</code> array.
 */
public class CharArrayComparators
{
    /**
     * A case-insensitive lexicographic comparator. Characters are lower-cased one
     * <code>char</code> at a time before being compared; if one array is a prefix of
     * the other (ignoring case), the shorter array is smaller.
     */
    public static final Comparator<char []> CASE_INSENSITIVE_CHAR_ARRAY_COMPARATOR = new Comparator<char []>()
    {
        @Override
        public final int compare(char [] a1, char [] a2)
        {
            // Nulls sort last.
            if (a1 == null)
            {
                if (a2 == null) return 0;

                return 1;
            }

            if (a2 == null)
            {
                return -1;
            }

            final int l1 = a1.length;
            final int l2 = a2.length;
            final int n = l1 < l2 ? l1 : l2;

            // Subtraction cannot overflow here: code points and array lengths
            // are all non-negative ints.
            for (int i = 0; i < n; i++)
            {
                // Use JDK 1.5+ full codepoint method.
                final int c = Character.toLowerCase((int) a1[i]);
                final int d = Character.toLowerCase((int) a2[i]);
                if (c != d)
                {
                    return c - d;
                }
            }

            // Common prefix is equal: the shorter array is smaller.
            return l1 - l2;
        }
    };

    /**
     * A case-sensitive lexicographic comparator. Characters are compared by their
     * numeric <code>char</code> value; if one array is a prefix of the other, the
     * shorter array is smaller.
     */
    public static final Comparator<char []> FAST_CHAR_ARRAY_COMPARATOR = new Comparator<char []>()
    {
        @Override
        public final int compare(char [] a1, char [] a2)
        {
            // Nulls sort last.
            if (a1 == null)
            {
                if (a2 == null) return 0;

                return 1;
            }

            if (a2 == null)
            {
                return -1;
            }

            final int l1 = a1.length;
            final int l2 = a2.length;
            final int n = l1 < l2 ? l1 : l2;

            // Subtraction cannot overflow here: chars widen to non-negative ints
            // and array lengths are non-negative.
            for (int i = 0; i < n; i++)
            {
                final char a1I = a1[i];
                final char a2I = a2[i];

                if (a1I != a2I)
                {
                    return a1I - a2I;
                }
            }

            // Common prefix is equal: the shorter array is smaller.
            return l1 - l2;
        }
    };

    /**
     * A comparator that groups different strings into different buckets
     * (case-insensitive) and strings within these buckets (case-sensitive). The
     * comparator applies the following rules in the following order:
     * <ol>
     * <li>A null string is greater than a non-null one</li>
     * <li>A longer string is greater</li>
     * <li>Strings are first compared in case-insensitive mode</li>
     * <li>Finally, strings are compared in case-sensitive mode</li>
     * </ol>
     * This comparator does not provide a lexicographic order, which makes it much
     * faster, but not suitable for general purpose sorting.
     */
    public static final Comparator<char []> NORMALIZING_CHAR_ARRAY_COMPARATOR = new Comparator<char []>()
    {
        @Override
        public final int compare(char [] a1, char [] a2)
        {
            // Nulls sort last.
            if (a1 == null)
            {
                if (a2 == null) return 0;

                return 1;
            }

            if (a2 == null)
            {
                return -1;
            }

            // Same array instance: nothing to compare.
            if (a1 == a2)
            {
                return 0;
            }

            final int l1 = a1.length;
            final int l2 = a2.length;

            /*
             * For those who might wonder: the condition below is perfectly ok here. It is
             * used to calculate word occurrence statistics, which is essentially a "count
             * unique strings by sorting" problem. Therefore, the semantic meaning of the
             * order produced by this comparator doesn't matter at all as long as it: a)
             * groups equal (case sensitive) strings together, b) groups equal (case
             * insensitive) strings into one block, c) null string is always greater than
             * a non-null string. See tests for this comparator for examples.
             *
             * In comparison-based sorting algorithms crucial is the speed of comparisons,
             * so declaring that e.g. shorter strings are always smaller (regardless of
             * contents) saves us calls to Character.toLowerCase(), which are very costly.
             * For CaseNormalizer it doesn't matter at all, and makes sorting way faster.
             */

            // Subtraction cannot overflow here: array lengths are non-negative.
            // Not crucial, but speeds things up.
            if (l1 != l2)
            {
                return l1 - l2;
            }

            // Compare whole strings in case insensitive mode first
            for (int i = 0; i < l1; i++)
            {
                // Use JDK 1.5+ full codepoint method.
                final int c = Character.toLowerCase((int) a1[i]);
                final int d = Character.toLowerCase((int) a2[i]);
                if (c != d)
                {
                    return c - d;
                }
            }

            // Only if strings are case-insensitive equal, go case sensitive
            for (int i = 0; i < l1; i++)
            {
                final char a1I = a1[i];
                final char a2I = a2[i];
                if (a1I != a2I)
                {
                    // Put lower case first (operands are deliberately reversed).
                    return a2I - a1I;
                }
            }

            return 0;
        }
    };

    /**
     * No instantiation.
     */
    private CharArrayComparators()
    {
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy