All downloads are free. The search and download functionality uses the official Maven repository.

org.carrot2.source.SearchEngineBase Maven / Gradle / Ivy

There is a newer version: 4.6.0
Show newest version

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.source;

import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.carrot2.core.*;
import org.carrot2.core.attribute.*;
import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.IntRange;
import org.carrot2.util.attribute.constraint.NotBlank;

/**
 * A base class facilitating implementation of {@link IDocumentSource}s wrapping external
 * search engines with remote or network-based interfaces. The base class defines the
 * common attribute fields used by more specific base classes and concrete
 * implementations, plus a few static helpers for post-processing fetched results.
 * 
 * @see SimpleSearchEngine
 * @see MultipageSearchEngine
 */
@Bindable(prefix = "SearchEngineBase", inherit = CommonAttributes.class)
public abstract class SearchEngineBase extends ProcessingComponentBase implements
    IDocumentSource
{
    /** {@link Group} name. */
    public static final String SERVICE = "Service";

    /** {@link Group} name. */
    protected static final String POSTPROCESSING = "Postprocessing";

    /**
     * Input attribute bound to {@link AttributeNames#START}. Constrained to be
     * non-negative and defaults to <code>0</code>. Presumably the index of the first
     * result to request from the search engine; the exact meaning is defined by the
     * inherited attribute declaration.
     */
    @Processing
    @Input
    @Attribute(key = AttributeNames.START, inherit = true)
    @IntRange(min = 0)
    public int start = 0;

    /**
     * Input attribute bound to {@link AttributeNames#RESULTS}. Constrained to be at
     * least <code>1</code> and defaults to <code>100</code>. Presumably the number of
     * results to request; the exact meaning is defined by the inherited attribute
     * declaration.
     */
    @Processing
    @Input
    @Attribute(key = AttributeNames.RESULTS, inherit = true)
    @IntRange(min = 1)
    public int results = 100;

    /**
     * Input attribute bound to {@link AttributeNames#QUERY}. Required and must not be
     * blank.
     */
    @Processing
    @Input
    @Attribute(key = AttributeNames.QUERY, inherit = true)
    @Required
    @NotBlank
    public String query;

    /**
     * Output attribute bound to {@link AttributeNames#RESULTS_TOTAL}. This class never
     * assigns it; subclasses are expected to set it during processing.
     */
    @Processing
    @Output
    @Attribute(key = AttributeNames.RESULTS_TOTAL, inherit = true)
    public long resultsTotal;

    /**
     * Output attribute bound to {@link AttributeNames#DOCUMENTS}. This class never
     * assigns it; subclasses are expected to set it during processing.
     */
    @Processing
    @Output
    @Attribute(key = AttributeNames.DOCUMENTS, inherit = true)
    @Internal
    public Collection<Document> documents;

    /**
     * Indicates whether the search engine returned a compressed result stream.
     */
    @Processing
    @Output
    @Attribute
    @Label("Compression used")
    @Group(DefaultGroups.RESULT_INFO)
    public boolean compressed;

    /**
     * This component usage statistics.
     */
    public SearchEngineStats statistics = new SearchEngineStats();

    /**
     * Regexp pattern for matching query word highlighting, i.e. opening and closing
     * {@code <b>} tags. An empty pattern here would turn highlight removal in
     * {@link #clean(SearchEngineResponse, boolean, String...)} into a no-op.
     */
    private static final Pattern HIGHLIGHTS_PATTERN = Pattern.compile("</?b>");

    /**
     * Unescape HTML entities and tags from a given set of fields of all
     * documents in the provided response. Fields that are <code>null</code>, empty or
     * whitespace-only are left untouched. Cleaned values are written back to the same
     * documents, so the response is modified in place.
     * 
     * @param response the search engine response to clean
     * @param keepHighlights set to true to keep query terms highlights; when false,
     *            highlight tags are removed before entities are unescaped
     * @param fields names of fields to clean
     */
    protected static void clean(SearchEngineResponse response, boolean keepHighlights,
        String... fields)
    {
        for (Document document : response.results)
        {
            for (String field : fields)
            {
                final String originalField = document.getField(field);
                if (StringUtils.isNotBlank(originalField))
                {
                    String cleanedField = originalField;
                    if (!keepHighlights)
                    {
                        // Strip highlight markup first, while the tags are still
                        // literal markup rather than unescaped entity text.
                        final Matcher matcher = HIGHLIGHTS_PATTERN.matcher(cleanedField);
                        cleanedField = matcher.replaceAll("");
                    }

                    cleanedField = StringEscapeUtils.unescapeHtml(cleanedField);

                    document.setField(field, cleanedField);
                }
            }
        }
    }

    /**
     * Called after a single search engine response has been fetched. The concrete
     * implementation may want to override this empty implementation to e.g., clean or
     * otherwise postprocess the returned results.
     * 
     * @param response the response that has just been fetched
     */
    protected void afterFetch(SearchEngineResponse response)
    {
    }

    /**
     * URL-encodes a string into UTF-8. Delegates to
     * <code>org.carrot2.util.StringUtils.urlEncodeWrapException</code>, passing
     * <code>"UTF-8"</code> as the encoding name.
     * 
     * @param string the string to encode
     * @return the URL-encoded form of <code>string</code>
     */
    protected static final String urlEncode(String string)
    {
        return org.carrot2.util.StringUtils.urlEncodeWrapException(string, "UTF-8");
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy