org.carrot2.source.SearchEngineBase Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of carrot2-core Show documentation
Show all versions of carrot2-core Show documentation
Carrot2 Text Clustering Library
/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.source;
import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.carrot2.core.*;
import org.carrot2.core.attribute.*;
import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.IntRange;
import org.carrot2.util.attribute.constraint.NotBlank;
/**
 * A base class facilitating implementation of {@link IDocumentSource}s wrapping external
 * search engines with remote/network-based interfaces. The base class defines the common
 * attribute fields used by more specific base classes and concrete implementations.
 *
 * @see SimpleSearchEngine
 * @see MultipageSearchEngine
 */
@Bindable(prefix = "SearchEngineBase", inherit = CommonAttributes.class)
public abstract class SearchEngineBase extends ProcessingComponentBase implements
    IDocumentSource
{
    /** {@link Group} name. */
    public static final String SERVICE = "Service";

    /** {@link Group} name. */
    protected static final String POSTPROCESSING = "Postprocessing";

    // Input attribute bound to AttributeNames.START; constrained to be >= 0, defaults to 0.
    @Processing
    @Input
    @Attribute(key = AttributeNames.START, inherit = true)
    @IntRange(min = 0)
    public int start = 0;

    // Input attribute bound to AttributeNames.RESULTS; constrained to be >= 1, defaults to 100.
    @Processing
    @Input
    @Attribute(key = AttributeNames.RESULTS, inherit = true)
    @IntRange(min = 1)
    public int results = 100;

    // Input attribute bound to AttributeNames.QUERY; required and must not be blank.
    @Processing
    @Input
    @Attribute(key = AttributeNames.QUERY, inherit = true)
    @Required
    @NotBlank
    public String query;

    // Output attribute bound to AttributeNames.RESULTS_TOTAL.
    @Processing
    @Output
    @Attribute(key = AttributeNames.RESULTS_TOTAL, inherit = true)
    public long resultsTotal;

    // Output attribute bound to AttributeNames.DOCUMENTS.
    @Processing
    @Output
    @Attribute(key = AttributeNames.DOCUMENTS, inherit = true)
    @Internal
    public Collection<Document> documents;

    /**
     * Indicates whether the search engine returned a compressed result stream.
     */
    @Processing
    @Output
    @Attribute
    @Label("Compression used")
    @Group(DefaultGroups.RESULT_INFO)
    public boolean compressed;

    /**
     * This component usage statistics.
     */
    public SearchEngineStats statistics = new SearchEngineStats();

    /**
     * Regexp pattern for matching query word highlighting, that is opening
     * (<code>&lt;b&gt;</code>) and closing (<code>&lt;/b&gt;</code>) bold tags.
     */
    private static final Pattern HIGHLIGHTS_PATTERN = Pattern.compile("</?b>");

    /**
     * Unescapes HTML entities and, optionally, removes query highlighting tags from a
     * given set of fields of all documents in the provided response. Fields that are
     * missing or blank are left untouched; every other listed field is overwritten with
     * its cleaned value.
     *
     * @param response the search engine response to clean
     * @param keepHighlights set to <code>true</code> to keep query terms highlights, set
     *        to <code>false</code> to strip them before unescaping
     * @param fields names of fields to clean
     */
    protected static void clean(SearchEngineResponse response, boolean keepHighlights,
        String... fields)
    {
        for (Document document : response.results)
        {
            for (String field : fields)
            {
                final String originalField = document.getField(field);
                if (StringUtils.isNotBlank(originalField))
                {
                    String cleanedField = originalField;
                    if (!keepHighlights)
                    {
                        // Strip highlight tags first, while they are still literal
                        // markup, so that escaped text unescaped below is not affected.
                        final Matcher matcher = HIGHLIGHTS_PATTERN.matcher(cleanedField);
                        cleanedField = matcher.replaceAll("");
                    }
                    cleanedField = StringEscapeUtils.unescapeHtml(cleanedField);
                    document.setField(field, cleanedField);
                }
            }
        }
    }

    /**
     * Called after a single search engine response has been fetched. The concrete
     * implementation may want to override this empty implementation to e.g., clean or
     * otherwise postprocess the returned results.
     *
     * @param response the response that has just been fetched
     */
    protected void afterFetch(SearchEngineResponse response)
    {
        // Intentionally empty: a hook for subclasses.
    }

    /**
     * URL-encodes a string into UTF-8. Delegates to
     * {@link org.carrot2.util.StringUtils#urlEncodeWrapException(String, String)}.
     *
     * @param string the string to encode
     * @return the URL-encoded form of <code>string</code>
     */
    protected static final String urlEncode(String string)
    {
        return org.carrot2.util.StringUtils.urlEncodeWrapException(string, "UTF-8");
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy