All downloads are FREE. Search and download functionality uses the official Maven repository.

org.carrot2.text.preprocessing.SubstringComparator Maven / Gradle / Ivy

Go to download

Carrot2 search results clustering framework. Minimal functional subset (core algorithms and infrastructure, no document sources).

There is a newer version: 3.16.3
Show newest version

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import java.io.Serializable;
import java.util.Comparator;

/**
 * Compares {@link Substring}s in such a way as to put them into different buckets in an
 * inflection-insensitive way. The order of different variants of substrings within
 * buckets is arbitrary.
 * <p>
 * Substrings are ordered first by their length in tokens (<code>to - from</code>) and
 * then, position by position, by the stem index of the word found at each token.
 * Two substrings of the same length whose tokens map to identical stem indices compare
 * as equal, regardless of the actual word variants they contain.
 * <p>
 * The index arrays handed to the constructor are stored by reference, not copied.
 */
class SubstringComparator implements Comparator<Substring>, Serializable
{
    private static final long serialVersionUID = 1L;

    /** Maps a token position to the index of the word at that position. */
    private final int [] tokensWordIndex;

    /** Maps a word index to the index of that word's stem. */
    private final int [] wordsStemIndex;

    /**
     * Creates a Comparator for substrings relating to the given word and stem indices.
     *
     * @param tokensWordIndex word index for each token position
     * @param wordsStemIndex stem index for each word index
     */
    public SubstringComparator(int [] tokensWordIndex, int [] wordsStemIndex)
    {
        this.tokensWordIndex = tokensWordIndex;
        this.wordsStemIndex = wordsStemIndex;
    }

    /**
     * Orders two substrings by length and then by the stem indices of their tokens.
     *
     * @param s1 the first substring
     * @param s2 the second substring
     * @return a negative value, zero or a positive value if <code>s1</code> is,
     *         respectively, less than, equivalent to or greater than <code>s2</code>
     */
    @Override
    public int compare(Substring s1, Substring s2)
    {
        final int s1From = s1.from;
        final int s2From = s2.from;
        final int s1Length = s1.to - s1From;
        final int s2Length = s2.to - s2From;

        // Substrings of different lengths can never fall into the same bucket.
        if (s1Length != s2Length)
        {
            // Integer.compare instead of subtraction: no risk of int overflow.
            return Integer.compare(s1Length, s2Length);
        }

        // Equal lengths: the first position with differing stems decides the order.
        for (int i = 0; i < s1Length; i++)
        {
            final int stemIndex1 = wordsStemIndex[tokensWordIndex[s1From + i]];
            final int stemIndex2 = wordsStemIndex[tokensWordIndex[s2From + i]];
            if (stemIndex1 != stemIndex2)
            {
                return Integer.compare(stemIndex1, stemIndex2);
            }
        }

        // If the substrings are equal in the inflection-insensitive way, return 0 (equivalence class).
        return 0;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy