org.carrot2.text.linguistic.DefaultLexicalData Maven / Gradle / Ivy
/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.linguistic;
import java.util.ArrayList;
import java.util.regex.Pattern;
import org.carrot2.text.util.MutableCharArray;
import com.carrotsearch.hppc.ObjectHashSet;
/**
* {@link ILexicalData} implemented on top of a hash set (stopwords) and a regular
* expression pattern (stoplabels).
*/
final class DefaultLexicalData implements ILexicalData
{
private final ObjectHashSet stopwords;
private final Pattern stoplabelPattern;
/*
*
*/
public DefaultLexicalData(ObjectHashSet stopwords,
ArrayList stoplabels)
{
this.stopwords = stopwords;
this.stoplabelPattern = union(stoplabels);
}
/*
*
*/
@Override
public boolean isCommonWord(MutableCharArray word)
{
return stopwords.contains(word);
}
/*
*
*/
@Override
public boolean isStopLabel(CharSequence label)
{
if (this.stoplabelPattern == null)
return false;
return stoplabelPattern.matcher(label).matches();
}
/**
* Combines a number of patterns into a single pattern with a union
* of all of them. With automata-based pattern engines, this should
* be faster and memory-friendly.
*/
private static Pattern union(ArrayList patterns)
{
final StringBuilder union = new StringBuilder();
if (patterns.size() > 0)
{
union.append("(");
for (int i = 0; i < patterns.size(); i++)
{
if (i > 0) union.append(")|(");
union.append(patterns.get(i).toString());
}
union.append(")");
return Pattern.compile(union.toString());
}
else
{
return null;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy