All downloads are free. The search and download features use the official Maven repository.

org.carrot2.source.solr.SolrDocumentSource Maven / Gradle / Ivy


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.source.solr;

import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.source.SearchEngineResponse;
import org.carrot2.source.xml.RemoteXmlSimpleSearchEngineBase;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.Output;
import org.carrot2.util.attribute.constraint.ImplementingClasses;
import org.carrot2.util.resource.ClassLoaderResource;
import org.carrot2.util.resource.ClassResource;
import org.carrot2.util.resource.FileResource;
import org.carrot2.util.resource.IResource;

import com.google.common.base.Predicate;
import com.google.common.base.Strings;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

/**
 * Fetches documents from an instance of Solr.
 * 
 * @see <a href="https://solr.apache.org/">Apache Solr</a>
 */
@Bindable(prefix = "SolrDocumentSource")
public class SolrDocumentSource extends RemoteXmlSimpleSearchEngineBase
{
    /** Name of the attribute group that holds the Solr-to-Carrot2 field mapping attributes. */
    protected static final String FIELD_MAPPING = "Index field mapping";

    /**
     * Solr service URL base. The URL base can contain additional Solr parameters, 
     * for example: {@code http://localhost:8983/solr/select?fq=timestamp:[NOW-24HOUR TO NOW]}
     */
    @Input
    @Processing
    @Attribute
    @Label("Service URL")
    @Level(AttributeLevel.ADVANCED)
    @Group(SERVICE)
    public String serviceUrlBase = "http://localhost:8983/solr/select";

    /**
     * Filter query appended to {@link #serviceUrlBase}.
     */
    @Input
    @Init
    @Processing
    @Attribute
    @Label("Filter query")
    @Level(AttributeLevel.MEDIUM)
    @Group(SERVICE)
    public String solrFilterQuery = "";

    /**
     * Title field name. Name of the Solr field that will provide document titles.
     */
    @Input
    @Processing
    @Attribute
    @Label("Title field name")
    @Level(AttributeLevel.MEDIUM)
    @Group(FIELD_MAPPING)
    public String solrTitleFieldName = "title";

    /**
     * Summary field name. Name of the Solr field that will provide document summary.
     */
    @Input
    @Processing
    @Attribute
    @Label("Summary field name")
    @Level(AttributeLevel.MEDIUM)
    @Group(FIELD_MAPPING)
    public String solrSummaryFieldName = "description";

    /**
     * URL field name. Name of the Solr field that will provide document URLs.
     */
    @Input
    @Processing
    @Attribute
    @Label("URL field name")
    @Level(AttributeLevel.MEDIUM)
    @Group(FIELD_MAPPING)
    public String solrUrlFieldName = "url";

    /**
     * Document identifier field name (specified in Solr schema). This field is necessary
     * to connect Solr-side clusters or highlighter output to documents. 
     */
    @Input
    @Processing
    @Attribute
    @Label("ID field name")
    @Level(AttributeLevel.MEDIUM)
    @Group(FIELD_MAPPING)
    public String solrIdFieldName;

    /**
     * Provides a custom XSLT stylesheet for converting from Solr's output to
     * an XML format parsed by Carrot2. For performance reasons this attribute
     * can be provided at initialization time only (no processing-time overrides).  
     */
    @Input
    @Init
    @Attribute
    @Label("Custom XSLT adapter from Solr to Carrot2 format")
    @Level(AttributeLevel.ADVANCED)
    @Group(FIELD_MAPPING)
    @ImplementingClasses(classes =
    {
        ClassLoaderResource.class,
        FileResource.class
    }, strict = false)
    public IResource solrXsltAdapter;

    /**
     * If clusters are present in the Solr output they will be read and exposed to components
     * further down the processing chain. Note that {@link #solrIdFieldName} is required to match
     * document references.
     */
    @Input
    @Init 
    @Processing
    @Attribute
    @Label("Read Solr clusters if present")
    @Level(AttributeLevel.BASIC)
    @Group(FIELD_MAPPING)
    public boolean readClusters = false;

    /**
     * If highlighter fragments are present in the Solr output they will be used (and preferred) over full
     * field content. This may be used to decrease the memory required for clustering. In general if highlighter
     * is used the contents of full fields won't be emitted from Solr though (because it makes little sense).
     * 
     * <p>Setting this option to <code>false</code> will disable using the highlighter output
     * entirely.</p>
     */
    @Input
    @Init
    @Processing
    @Attribute
    @Label("Use highlighter output if present")
    @Level(AttributeLevel.BASIC)
    @Group(FIELD_MAPPING)
    public boolean useHighlighterOutput = true;

    /**
     * Copy Solr fields from the search result to Carrot2 {@link org.carrot2.core.Document}
     * instances (as fields).
     */
    @Input
    @Init
    @Processing
    @Attribute
    @Label("Copy Solr document fields")
    @Level(AttributeLevel.ADVANCED)
    @Group(FIELD_MAPPING)
    public boolean copyFields = false;

    /**
     * If {@link #readClusters} is <code>true</code> and clusters are present in the input
     * XML, they will be deserialized and exposed to components further down the processing
     * chain.
     */
    @Processing
    @Input
    @Output
    @Internal
    @Attribute(key = AttributeNames.CLUSTERS)
    @Label("Clusters")
    @Level(AttributeLevel.BASIC)
    @Group(DefaultGroups.RESULT_INFO)
    public List<Cluster> clusters;

    /**
     * When {@link #readClusters} is enabled and the processing result carries clusters,
     * stores a filtered copy of those clusters in {@link #clusters}. Cluster documents are
     * retained only if their string identifier matches the string identifier of one of
     * the documents in the processing result. If the result carries no clusters,
     * {@link #clusters} is left untouched.
     *
     * @param response the search engine response (not used by this implementation)
     * @param processingResult the result from which documents and clusters are read
     */
    @Override
    protected void afterFetch(SearchEngineResponse response,
        ProcessingResult processingResult)
    {
        if (!readClusters)
        {
            return;
        }

        final List<Cluster> fetchedClusters = processingResult.getClusters();
        if (fetchedClusters == null)
        {
            return;
        }

        List<Document> documents = processingResult.getDocuments();
        if (documents == null)
        {
            documents = Collections.emptyList();
        }

        // Identifiers of the documents that were actually fetched; clusters may only
        // reference these.
        final Set<String> ids = Sets.newHashSet();
        for (Document doc : documents)
        {
            ids.add(doc.getStringId());
        }

        final Predicate<Document> docFilter = new Predicate<Document>()
        {
            @Override
            public boolean apply(Document input)
            {
                return input != null && ids.contains(input.getStringId());
            }
        };

        this.clusters = sanityCheck(fetchedClusters, docFilter);
    }

    /**
     * Builds the request URL from {@link #serviceUrlBase}, appending the URL-encoded query,
     * the optional filter query ({@link #solrFilterQuery}), the paging parameters and
     * <code>indent=off</code>.
     *
     * @return the complete service URL
     */
    @Override
    protected String buildServiceUrl()
    {
        // The URL base may already carry parameters, so pick the right separator.
        final String separator = serviceUrlBase.contains("?") ? "&" : "?";
        final String filterQuery = Strings.isNullOrEmpty(solrFilterQuery)
            ? ""
            : "&fq=" + urlEncode(solrFilterQuery);

        return serviceUrlBase + separator
            + "q=" + urlEncode(query)
            + filterQuery
            + "&start=" + start
            + "&rows=" + results
            + "&indent=off";
    }

    /**
     * Returns the stylesheet used to convert Solr output: {@link #solrXsltAdapter} if
     * provided, otherwise the <code>solr-to-c2.xsl</code> class resource.
     *
     * @return the XSLT resource to apply
     */
    @Override
    protected IResource getXsltResource()
    {
        if (solrXsltAdapter == null)
        {
            return new ClassResource(SolrDocumentSource.class, "solr-to-c2.xsl");
        }
        else
        {
            return solrXsltAdapter;
        }
    }

    /**
     * Collects the field mapping and option attributes as string parameters for the
     * XSLT stylesheet. A <code>null</code> {@link #solrIdFieldName} is passed as an empty
     * string.
     *
     * @return a new, mutable map of stylesheet parameter names to values
     */
    @Override
    protected Map<String, String> getXsltParameters()
    {
        final Map<String, String> parameters = Maps.newHashMap();

        parameters.put("solr.title-field", solrTitleFieldName);
        parameters.put("solr.summary-field", solrSummaryFieldName);
        parameters.put("solr.url-field", solrUrlFieldName);
        parameters.put("solr.id-field", Strings.nullToEmpty(solrIdFieldName));
        parameters.put("solr.use-highlighter-output", useHighlighterOutput ? "true" : "false");
        parameters.put("solr.copy-fields", copyFields ? "true" : "false");

        return parameters;
    }

    /**
     * Recursively copies the given clusters, keeping only the documents accepted by
     * <code>docFilter</code>. Each copy receives the phrases, the filtered documents and
     * the (recursively copied) subclusters of its source cluster; nothing else is
     * transferred.
     *
     * @param in clusters to copy
     * @param docFilter decides which cluster documents are retained
     * @return a new list with one copied cluster per input cluster, in the same order
     */
    private static List<Cluster> sanityCheck(List<Cluster> in, Predicate<Document> docFilter)
    {
        final List<Cluster> cloned = Lists.newArrayListWithCapacity(in.size());
        for (Cluster c : in)
        {
            final Cluster c2 = new Cluster();
            c2.addPhrases(c.getPhrases());
            c2.addDocuments(Iterables.filter(c.getDocuments(), docFilter));
            c2.addSubclusters(sanityCheck(c.getSubclusters(), docFilter));
            cloned.add(c2);
        }
        return cloned;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy