org.carrot2.text.linguistic.DefaultLexicalData Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of carrot2-mini Show documentation
Carrot2 search results clustering framework. Minimal functional subset
(core algorithms and infrastructure, no document sources).
/*
* Carrot2 project.
*
* Copyright (C) 2002-2013, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.linguistic;
import java.util.ArrayList;
import java.util.regex.Pattern;
import org.carrot2.text.util.MutableCharArray;
import com.carrotsearch.hppc.ObjectOpenHashSet;
/**
 * Default {@link ILexicalData}: common words are looked up in a hash set and stop
 * labels are recognized by a single regular expression that is the union of all
 * stop label expressions.
 */
final class DefaultLexicalData implements ILexicalData
{
    // NOTE(review): the collection types below are raw. Their element types were
    // presumably declared with generics originally -- confirm against the callers
    // before parameterizing them.

    /** Set consulted by {@link #isCommonWord(MutableCharArray)}. */
    private final ObjectOpenHashSet stopwords;

    /**
     * Union of all stop label expressions, or <code>null</code> when the list
     * passed to the constructor was empty.
     */
    private final Pattern stoplabelPattern;

    /**
     * Creates lexical data from a stop word set and a list of stop label expressions.
     *
     * @param stopwords set queried for common words; the reference is stored as-is,
     *        no copy is made
     * @param stoplabels stop label expressions; the string form of every element
     *        becomes one alternative of the combined pattern
     */
    public DefaultLexicalData(ObjectOpenHashSet stopwords,
        ArrayList stoplabels)
    {
        this.stopwords = stopwords;
        this.stoplabelPattern = union(stoplabels);
    }

    /**
     * Checks the word against the stop word set.
     *
     * @param word the word to look up
     * @return <code>true</code> if the stop word set contains <code>word</code>
     */
    @Override
    public boolean isCommonWord(MutableCharArray word)
    {
        return stopwords.contains(word);
    }

    /**
     * Checks the label against the combined stop label pattern.
     *
     * @param label the label to test
     * @return <code>true</code> if the entire label matches the combined pattern;
     *         always <code>false</code> when no stop label expressions were given
     */
    @Override
    public boolean isStopLabel(CharSequence label)
    {
        // Short-circuits before touching the label when there is no pattern at all.
        return stoplabelPattern != null && stoplabelPattern.matcher(label).matches();
    }

    /**
     * Builds one pattern of the form <code>(p1)|(p2)|...</code> from the string
     * representations of the given elements, so that a single match attempt
     * covers all of them.
     *
     * @param patterns the expressions to combine
     * @return the compiled union, or <code>null</code> if <code>patterns</code> is empty
     */
    private static Pattern union(ArrayList patterns)
    {
        if (patterns.isEmpty())
        {
            return null;
        }

        final StringBuilder alternation = new StringBuilder();
        alternation.append("(");

        // Empty before the first element, then the group separator for the rest.
        String separator = "";
        for (Object pattern : patterns)
        {
            alternation.append(separator);
            alternation.append(pattern.toString());
            separator = ")|(";
        }

        alternation.append(")");
        return Pattern.compile(alternation.toString());
    }
}