All downloads are free. Search and download functionality uses the official Maven repository.

org.carrot2.text.linguistic.DefaultLexicalData Maven / Gradle / Ivy

Go to download

Carrot2 search results clustering framework. Minimal functional subset (core algorithms and infrastructure, no document sources).

There is a newer version: 3.16.3
Show newest version

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2013, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.linguistic;

import java.util.ArrayList;
import java.util.regex.Pattern;

import org.carrot2.text.util.MutableCharArray;

import com.carrotsearch.hppc.ObjectOpenHashSet;

/**
 * {@link ILexicalData} implemented on top of a hash set (stopwords) and a single
 * regular expression pattern (stoplabels).
 * <p>
 * All stop label expressions are merged into one alternation when the instance is
 * created, so {@link #isStopLabel(CharSequence)} performs a single match per call.
 */
final class DefaultLexicalData implements ILexicalData
{
    /** Set consulted by {@link #isCommonWord(MutableCharArray)}; kept by reference, not copied. */
    private final ObjectOpenHashSet stopwords;

    /** Union of all stop label expressions, or <code>null</code> if there are none. */
    private final Pattern stoplabelPattern;

    /**
     * Creates lexical data backed by the given stopwords and stop label expressions.
     *
     * @param stopwords set looked up by {@link #isCommonWord(MutableCharArray)}; the
     *        reference is stored as-is
     * @param stoplabels expressions to combine into a single pattern; <code>null</code>
     *        or an empty list means no label is ever a stop label
     * @throws java.util.regex.PatternSyntaxException if the combined expression does not
     *         compile
     */
    public DefaultLexicalData(ObjectOpenHashSet stopwords, 
                              ArrayList stoplabels)
    {
        this.stopwords = stopwords;
        this.stoplabelPattern = union(stoplabels);
    }

    /**
     * Returns <code>true</code> if the stopword set contains <code>word</code>.
     */
    @Override
    public boolean isCommonWord(MutableCharArray word)
    {
        return stopwords.contains(word);
    }

    /**
     * Returns <code>true</code> if the <b>entire</b> <code>label</code> matches at least
     * one of the stop label expressions. Always <code>false</code> when no expressions
     * were provided.
     */
    @Override
    public boolean isStopLabel(CharSequence label)
    {
        if (this.stoplabelPattern == null)
        {
            return false;
        }

        return stoplabelPattern.matcher(label).matches();
    }

    /**
     * Combines a number of patterns into a single pattern with a union
     * of all of them. With automata-based pattern engines, this should
     * be faster and memory-friendly.
     * <p>
     * Each alternative is wrapped in a non-capturing group. The wrapper therefore adds
     * no capturing groups of its own and does not shift the group numbers used inside
     * the individual expressions.
     * <p>
     * NOTE(review): only the string form of each element is used. If the elements are
     * {@link Pattern} instances compiled with flags, those flags are not carried over
     * to the union; flags embedded in the expression itself (e.g. <code>(?i)</code>)
     * are. Capturing groups in an earlier alternative still offset numbered
     * back-references in later ones -- confirm whether any stop label relies on them.
     *
     * @param patterns expressions to combine; may be <code>null</code> or empty
     * @return the compiled union, or <code>null</code> if there is nothing to combine
     */
    private static Pattern union(ArrayList patterns)
    {
        // A missing list is treated like an empty one: no pattern, nothing matches.
        if (patterns == null || patterns.isEmpty())
        {
            return null;
        }

        final StringBuilder union = new StringBuilder();
        for (int i = 0; i < patterns.size(); i++)
        {
            if (i > 0)
            {
                union.append('|');
            }
            union.append("(?:").append(patterns.get(i).toString()).append(')');
        }
        return Pattern.compile(union.toString());
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy