org.carrot2.text.util.CharArrayComparators Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of carrot2-mini Show documentation
Show all versions of carrot2-mini Show documentation
Carrot2 search results clustering framework. Minimal functional subset
(core algorithms and infrastructure, no document sources).
/*
* Carrot2 project.
*
* Copyright (C) 2002-2019, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.util;
import java.util.Comparator;
/**
* A set of specific comparators for char []
arrays.
*/
public class CharArrayComparators
{
/**
* A case-insensitive lexicographic comparator.
*/
public static final Comparator CASE_INSENSITIVE_CHAR_ARRAY_COMPARATOR = new Comparator()
{
public final int compare(char [] a1, char [] a2)
{
if (a1 == null)
{
if (a2 == null) return 0;
return 1;
}
if (a2 == null)
{
return -1;
}
final int l1 = a1.length;
final int l2 = a2.length;
final int n = l1 < l2 ? l1 : l2;
// Quiet assumption that the numbers here won't cause an overflow.
for (int i = 0; i < n; i++)
{
// Use JDK 1.5+ full codepoint method.
final int c = Character.toLowerCase((int) a1[i]);
final int d = Character.toLowerCase((int) a2[i]);
if (c != d)
{
return c - d;
}
}
return l1 - l2;
}
};
/**
* A case-sensitive lexicographic comparator.
*/
public static final Comparator FAST_CHAR_ARRAY_COMPARATOR = new Comparator()
{
public final int compare(char [] a1, char [] a2)
{
if (a1 == null)
{
if (a2 == null) return 0;
return 1;
}
if (a2 == null)
{
return -1;
}
final int l1 = a1.length;
final int l2 = a2.length;
final int n = l1 < l2 ? l1 : l2;
// Quiet assumption that the numbers here won't cause an overflow.
for (int i = 0; i < n; i++)
{
final char a1I = a1[i];
final char a2I = a2[i];
if (a1I != a2I)
{
return a1I - a2I;
}
}
return l1 - l2;
}
};
/**
* A comparator that groups different strings into different buckets
* (case-insensitive) and strings within these buckets (case-sensitive). The
* comparator that applies the following rules in the following order:
*
* - A
null
string is greater than a non-null
one
* - A longer string is greater
* - Strings are first compared in case-insensitive mode
* - Finally, strings are compared in case-sensitive mode
*
* This comparator does not provide a lexicographic order, which makes it much faster, but
* not suitable for general purpose sorting.
*/
public static final Comparator NORMALIZING_CHAR_ARRAY_COMPARATOR = new Comparator()
{
public final int compare(char [] a1, char [] a2)
{
if (a1 == null)
{
if (a2 == null) return 0;
return 1;
}
if (a2 == null)
{
return -1;
}
if (a1 == a2)
{
return 0;
}
final int l1 = a1.length;
final int l2 = a2.length;
/*
* For those who might wonder: the condition below is perfectly ok here. It is
* used to calculate word occurrence statistics, which is essentially a "count
* unique strings by sorting" problem. Therefore, the semantic meaning of the
* order produced by this comparator doesn't matter at all as long as it: a)
* groups equal (case sensitive) strings together, b) groups equal (case
* insensitive) strings into one block, c) null string is always greater than
* a non-null string. See tests for this comparator for examples.
*
* In comparison-based sorting algorithms crucial is the speed of comparisons,
* so declaring that e.g. shorter strings are always smaller (regardless of
* contents) saves us calls to Character.toLowerCase(), which are very costly.
* For CaseNormalizer it doesn't matter at all, and makes sorting way faster.
*/
// Quiet assumption that the numbers here won't cause an overflow.
// Not crucial, but speeds things up
if (l1 != l2)
{
return l1 - l2;
}
// Compare whole strings in case insensitive mode first
for (int i = 0; i < l1; i++)
{
// Use JDK 1.5+ full codepoint method.
final int c = Character.toLowerCase((int) a1[i]);
final int d = Character.toLowerCase((int) a2[i]);
if (c != d)
{
return c - d;
}
}
// Only if strings are case-insensitive equal, go case sensitive
for (int i = 0; i < l1; i++)
{
char a1I = a1[i];
char a2I = a2[i];
if (a1I != a2I)
{
// Put lower case first
return a2I - a1I;
}
}
return 0;
}
};
/**
* No instantiation.
*/
private CharArrayComparators()
{
}
}