org.carrot2.text.preprocessing.PreprocessedDocumentScanner Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of carrot2-mini Show documentation
Carrot2 search results clustering framework. Minimal functional subset
(core algorithms and infrastructure, no document sources).
/*
* Carrot2 project.
*
* Copyright (C) 2002-2019, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.util.IntArrayPredicateIterator;
import com.carrotsearch.hppc.predicates.ShortPredicate;
/**
* Iterates over tokenized documents in {@link PreprocessingContext}.
*/
public class PreprocessedDocumentScanner
{
/** Predicate for splitting on document separator. */
public static final ShortPredicate ON_DOCUMENT_SEPARATOR =
equalTo(ITokenizer.TF_SEPARATOR_DOCUMENT);
/** Predicate for splitting on field separator. */
public static final ShortPredicate ON_FIELD_SEPARATOR =
equalTo(ITokenizer.TF_SEPARATOR_FIELD);
/** Predicate for splitting on sentence separator. */
public static final ShortPredicate ON_SENTENCE_SEPARATOR = new ShortPredicate()
{
public boolean apply(short tokenType)
{
return (tokenType & ITokenizer.TF_SEPARATOR_SENTENCE) != 0;
}
};
/**
* Return a new {@link ShortPredicate} returning true
* if the argument equals a given value.
*/
public static final ShortPredicate equalTo(final short t)
{
return new ShortPredicate() {
public boolean apply(short value)
{
return value == t;
}
};
}
/**
* Iterate over all documents, fields and sentences in {@link PreprocessingContext#allTokens}.
*/
public final void iterate(PreprocessingContext context)
{
/*
* Recursively iterate through documents, fields and sentences. This can be
* implemented a bit faster (without iterators), but I guess the overhead here is
* minimal anyway.
*/
final IntArrayPredicateIterator docIterator = new IntArrayPredicateIterator(
context.allTokens.type, 0, context.allTokens.type.length - 1,
ON_DOCUMENT_SEPARATOR);
while (docIterator.hasNext())
{
final int docStart = docIterator.next();
final int docLength = docIterator.getLength();
document(context, docStart, docLength);
}
}
/**
* Invoked for each document. Splits further into fields.
*/
protected void document(PreprocessingContext context, int start, int length)
{
final IntArrayPredicateIterator fieldIterator = new IntArrayPredicateIterator(
context.allTokens.type, start, length,
ON_FIELD_SEPARATOR);
while (fieldIterator.hasNext())
{
final int fieldStart = fieldIterator.next();
final int fieldLength = fieldIterator.getLength();
field(context, fieldStart, fieldLength);
}
}
/**
* Invoked for each document's field. Splits further into sentences.
*/
protected void field(PreprocessingContext context, int start, int length)
{
final IntArrayPredicateIterator sentenceIterator = new IntArrayPredicateIterator(
context.allTokens.type, start, length,
ON_SENTENCE_SEPARATOR);
while (sentenceIterator.hasNext())
{
final int sentenceStart = sentenceIterator.next();
final int sentenceLength = sentenceIterator.getLength();
sentence(context, sentenceStart, sentenceLength);
}
}
/**
* Invoked for each document's sentence.
*/
protected void sentence(PreprocessingContext context, int start, int length)
{
}
}