// org.carrot2.source.solr.SolrDocumentSource Maven / Gradle / Ivy


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.source.solr;

import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.source.SearchEngineResponse;
import org.carrot2.source.xml.RemoteXmlSimpleSearchEngineBase;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.Output;
import org.carrot2.util.attribute.constraint.ImplementingClasses;
import org.carrot2.util.resource.ClassLoaderResource;
import org.carrot2.util.resource.ClassResource;
import org.carrot2.util.resource.FileResource;
import org.carrot2.util.resource.IResource;

import com.google.common.base.Predicate;
import com.google.common.base.Strings;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

/**
 * Fetches documents from an instance of Solr.
 *
 * @see <a href="https://solr.apache.org/">Apache Solr</a>
 */
@Bindable(prefix = "SolrDocumentSource")
public class SolrDocumentSource extends RemoteXmlSimpleSearchEngineBase
{
    /** Name of the attribute group that collects the index field mapping attributes. */
    protected static final String FIELD_MAPPING = "Index field mapping";

    /**
     * Solr service URL base. The URL base can contain additional Solr parameters,
     * for example: http://localhost:8983/solr/select?fq=timestamp:[NOW-24HOUR TO NOW]
     */
    @Input
    @Processing
    @Attribute
    @Label("Service URL")
    @Level(AttributeLevel.ADVANCED)
    @Group(SERVICE)
    public String serviceUrlBase = "http://localhost:8983/solr/select";

    /**
     * Filter query appended to {@link #serviceUrlBase}.
     */
    @Input
    @Init
    @Processing
    @Attribute
    @Label("Filter query")
    @Level(AttributeLevel.MEDIUM)
    @Group(SERVICE)
    public String solrFilterQuery = "";

    /**
     * Title field name. Name of the Solr field that will provide document titles.
     */
    @Input
    @Processing
    @Attribute
    @Label("Title field name")
    @Level(AttributeLevel.MEDIUM)
    @Group(FIELD_MAPPING)
    public String solrTitleFieldName = "title";

    /**
     * Summary field name. Name of the Solr field that will provide document summary.
     */
    @Input
    @Processing
    @Attribute
    @Label("Summary field name")
    @Level(AttributeLevel.MEDIUM)
    @Group(FIELD_MAPPING)
    public String solrSummaryFieldName = "description";

    /**
     * URL field name. Name of the Solr field that will provide document URLs.
     */
    @Input
    @Processing
    @Attribute
    @Label("URL field name")
    @Level(AttributeLevel.MEDIUM)
    @Group(FIELD_MAPPING)
    public String solrUrlFieldName = "url";

    /**
     * Document identifier field name (specified in Solr schema). This field is necessary
     * to connect Solr-side clusters or highlighter output to documents.
     */
    @Input
    @Processing
    @Attribute
    @Label("ID field name")
    @Level(AttributeLevel.MEDIUM)
    @Group(FIELD_MAPPING)
    public String solrIdFieldName;

    /**
     * Provides a custom XSLT stylesheet for converting from Solr's output to
     * an XML format parsed by Carrot2. For performance reasons this attribute
     * can be provided at initialization time only (no processing-time overrides).
     */
    @Input
    @Init
    @Attribute
    @Label("Custom XSLT adapter from Solr to Carrot2 format")
    @Level(AttributeLevel.ADVANCED)
    @Group(FIELD_MAPPING)
    @ImplementingClasses(classes =
    {
        ClassLoaderResource.class,
        FileResource.class
    }, strict = false)
    public IResource solrXsltAdapter;

    /**
     * If clusters are present in the Solr output they will be read and exposed to components
     * further down the processing chain. Note that {@link #solrIdFieldName} is required to match
     * document references.
     */
    @Input
    @Init
    @Processing
    @Attribute
    @Label("Read Solr clusters if present")
    @Level(AttributeLevel.BASIC)
    @Group(FIELD_MAPPING)
    public boolean readClusters = false;

    /**
     * If highlighter fragments are present in the Solr output they will be used (and preferred) over full
     * field content. This may be used to decrease the memory required for clustering. In general if highlighter
     * is used the contents of full fields won't be emitted from Solr though (because it makes little sense).
     * <p>
     * Setting this option to <code>false</code> will disable using the highlighter output
     * entirely.
     */
    @Input
    @Init
    @Processing
    @Attribute
    @Label("Use highlighter output if present")
    @Level(AttributeLevel.BASIC)
    @Group(FIELD_MAPPING)
    public boolean useHighlighterOutput = true;

    /**
     * Copy Solr fields from the search result to Carrot2 {@link org.carrot2.core.Document} instances (as fields).
     */
    @Input
    @Init
    @Processing
    @Attribute
    @Label("Copy Solr document fields")
    @Level(AttributeLevel.ADVANCED)
    @Group(FIELD_MAPPING)
    public boolean copyFields = false;

    /**
     * If {@link #readClusters} is <code>true</code> and clusters are present in the input
     * XML, they will be deserialized and exposed to components further down the processing
     * chain.
     */
    @Processing
    @Input @Output
    @Internal
    @Attribute(key = AttributeNames.CLUSTERS)
    @Label("Clusters")
    @Level(AttributeLevel.BASIC)
    @Group(DefaultGroups.RESULT_INFO)
    public List<Cluster> clusters;

    /**
     * Publishes the clusters found in the processing result, if requested.
     * <p>
     * When {@link #readClusters} is <code>true</code> and the processing result carries
     * clusters, {@link #clusters} is set to a sanitized copy of them in which only
     * documents whose string identifier occurs among the fetched documents are retained.
     * In every other case {@link #clusters} is left untouched.
     *
     * @param response the search engine response; not used by this implementation
     * @param processingResult the result from which documents and clusters are read
     */
    @Override
    protected void afterFetch(SearchEngineResponse response,
        ProcessingResult processingResult)
    {
        if (!readClusters) {
            return;
        }

        List<Document> documents = processingResult.getDocuments();
        if (documents == null) {
            documents = Collections.emptyList();
        }

        final List<Cluster> fetchedClusters = processingResult.getClusters();
        if (fetchedClusters == null) {
            return;
        }

        // Identifiers of all documents that were actually fetched; cluster members
        // whose identifier is not in this set are dropped from the copy.
        final Set<String> ids = Sets.newHashSet();
        for (Document doc : documents) {
            ids.add(doc.getStringId());
        }

        final Predicate<Document> docFilter = new Predicate<Document>()
        {
            @Override
            public boolean apply(Document input)
            {
                return input != null && ids.contains(input.getStringId());
            }
        };
        this.clusters = sanityCheck(fetchedClusters, docFilter);
    }

    /**
     * Builds the request URL from {@link #serviceUrlBase}, the URL-encoded query, the
     * optional URL-encoded {@link #solrFilterQuery} and the paging parameters.
     * <p>
     * The first parameter is joined with <code>&amp;</code> when the URL base already
     * contains a <code>?</code>, and with <code>?</code> otherwise. The <code>fq</code>
     * parameter is omitted when the filter query is <code>null</code> or empty.
     *
     * @return the complete service URL
     */
    @Override
    protected String buildServiceUrl()
    {
        final String separator = serviceUrlBase.contains("?") ? "&" : "?";
        final String filterQuery = Strings.isNullOrEmpty(solrFilterQuery)
            ? ""
            : "&fq=" + urlEncode(solrFilterQuery);

        return serviceUrlBase
            + separator
            + "q=" + urlEncode(query)
            + filterQuery
            + "&start=" + start
            + "&rows=" + results
            + "&indent=off";
    }

    /**
     * Returns the stylesheet used to convert Solr's output.
     *
     * @return {@link #solrXsltAdapter} if one was provided, otherwise the
     *         <code>solr-to-c2.xsl</code> class resource of this class
     */
    @Override
    protected IResource getXsltResource()
    {
        if (solrXsltAdapter != null) {
            return solrXsltAdapter;
        }
        return new ClassResource(SolrDocumentSource.class, "solr-to-c2.xsl");
    }

    /**
     * Collects the field mapping and option attributes as stylesheet parameters.
     * A <code>null</code> {@link #solrIdFieldName} is passed as an empty string and
     * boolean options are passed as <code>"true"</code> or <code>"false"</code>.
     *
     * @return a new, mutable map of XSLT parameter names to their values
     */
    @Override
    protected Map<String, String> getXsltParameters()
    {
        final Map<String, String> parameters = Maps.newHashMap();

        parameters.put("solr.title-field", solrTitleFieldName);
        parameters.put("solr.summary-field", solrSummaryFieldName);
        parameters.put("solr.url-field", solrUrlFieldName);
        parameters.put("solr.id-field", Strings.nullToEmpty(solrIdFieldName));
        parameters.put("solr.use-highlighter-output", useHighlighterOutput ? "true" : "false");
        parameters.put("solr.copy-fields", copyFields ? "true" : "false");

        return parameters;
    }

    /**
     * Recursively copies a list of clusters, keeping only accepted documents.
     * <p>
     * Each copy is a fresh {@link Cluster} that receives the source cluster's phrases,
     * those of its documents accepted by <code>docFilter</code>, and sanitized copies of
     * its subclusters. Nothing else is transferred from the source cluster.
     *
     * @param in the clusters to copy; must not be <code>null</code>
     * @param docFilter decides which documents are retained in the copies
     * @return a new list with one copied cluster per input cluster, in the same order
     */
    private static List<Cluster> sanityCheck(List<Cluster> in, Predicate<Document> docFilter)
    {
        final List<Cluster> cloned = Lists.newArrayListWithCapacity(in.size());
        for (Cluster c : in) {
            final Cluster c2 = new Cluster();
            c2.addPhrases(c.getPhrases());
            c2.addDocuments(
                Iterables.filter(c.getDocuments(), docFilter));
            c2.addSubclusters(sanityCheck(c.getSubclusters(), docFilter));
            cloned.add(c2);
        }
        return cloned;
    }
}