com.norconex.committer.azuresearch.AzureSearchCommitter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of norconex-committer-azuresearch Show documentation
Microsoft Azure Search implementation of Norconex Committer.
There is a newer version: 2.0.0
Show newest version
/* Copyright 2017 Norconex Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.norconex.committer.azuresearch;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map.Entry;
import java.util.Objects;

import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.configuration.XMLConfiguration;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.WinHttpClients;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import com.norconex.committer.core.AbstractCommitter;
import com.norconex.committer.core.AbstractMappedCommitter;
import com.norconex.committer.core.CommitterException;
import com.norconex.committer.core.IAddOperation;
import com.norconex.committer.core.ICommitOperation;
import com.norconex.committer.core.IDeleteOperation;
import com.norconex.commons.lang.encrypt.EncryptionUtil;
import com.norconex.commons.lang.net.ProxySettings;
import com.norconex.commons.lang.time.DurationParser;
import com.norconex.commons.lang.xml.EnhancedXMLStreamWriter;

/**
 * 
 * Commits documents to Microsoft Azure Search.
 * 
 * 
 * Document reference encoding
 * 
 * By default the document reference (Azure Search Document Key) is
 * encoded using URL-safe Base64 encoding. This is Azure Search recommended
 * approach when a document unique id can contain special characters
 * (e.g. a URL).  If you know your document references to be safe
 * (e.g. a sequence number), you can 
 * set {@link #setDisableReferenceEncoding(boolean)} to true.
 * To otherwise store a reference value un-encoded, you can additionally 
 * store it in a field other than your reference ("id") field.
 *   
 * 
 * Field names and errors
 * 
 * Azure Search will produce an error if any of the documents in a submitted 
 * batch contains one or more fields with invalid characters.  To prevent
 * sending those in vain, the committer will validate your fields
 * and throw an exception upon encountering an invalid one.
 * To prevent exceptions from being thrown, you can set 
 * {@link #setIgnoreValidationErrors(boolean)} to true to
 * log those errors instead.
 * 
 * 
 * An exception will also be thrown for errors returned by Azure Search 
 * (e.g. a field is not defined in your
 * Azure Search schema). To also log those errors instead of throwing an
 * exception, you can set {@link #setIgnoreResponseErrors(boolean)}
 * to true. 
 * 
 * Field naming rules
 * 
 * These are the field naming rules mandated by Azure Search (in force
 * for Azure Search API version 2016-09-01):
 * 
 * 
 *   Document reference (ID): Letters, numbers, dashes ("-"), 
 *       underscores ("_"), and equal signs ("="). First character cannot be
 *       an underscore.
 *   Document field name: Letters, numbers, underscores ("_"). First
 *       character must be a letter. Cannot start with "azureSearch". 
 *       Maximum length is 128 characters.
 * 
 * 
 * Password encryption in XML configuration:
 * 
 * The proxyPassword can take a password that has been
 * encrypted using {@link EncryptionUtil} (or command-line encrypt.[bat|sh]).
 * In order for the password to be decrypted properly by the crawler, you need
 * to specify the encryption key used to encrypt it. The key can be stored
 * in a few supported locations and a combination of
 * proxyPasswordKey
 * and proxyPasswordKeySource must be specified to properly
 * locate the key. The supported sources are:
 * 
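 * <table border="1" summary="">
 *   <tr>
 *     <th><code>proxyPasswordKeySource</code></th>
 *     <th><code>proxyPasswordKey</code></th>
 *   </tr>
 *   <tr>
 *     <td><code>key</code></td>
 *     <td>The actual encryption key.</td>
 *   </tr>
 *   <tr>
 *     <td><code>file</code></td>
 *     <td>Path to a file containing the encryption key.</td>
 *   </tr>
 *   <tr>
 *     <td><code>environment</code></td>
 *     <td>Name of an environment variable containing the key.</td>
 *   </tr>
 *   <tr>
 *     <td><code>property</code></td>
 *     <td>Name of a JVM system property containing the key.</td>
 *   </tr>
 * </table>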
 * 
 * XML configuration usage:
 *  <committer class="com.norconex.committer.azuresearch.AzureSearchCommitter">
 *      <endpoint>(Azure Search endpoint)</endpoint>
 *      <apiVersion>(Optional Azure Search API version to use)</apiVersion>
 *      <apiKey>(Azure Search API admin key)</apiKey>
 *      <indexName>(Name of the index to use)</indexName>
 *      <disableReferenceEncoding>[false|true]</disableReferenceEncoding>
 *      <ignoreValidationErrors>[false|true]</ignoreValidationErrors>
 *      <ignoreResponseErrors>[false|true]</ignoreResponseErrors>
 *      <useWindowsAuth>[false|true]</useWindowsAuth>
 *
 *      <proxyHost>...</proxyHost>
 *      <proxyPort>...</proxyPort>
 *      <proxyRealm>...</proxyRealm>
 *      <proxyScheme>...</proxyScheme>
 *      <proxyUsername>...</proxyUsername>
 *      <proxyPassword>...</proxyPassword>
 *      <!-- Use the following if password is encrypted. -->
 *      <proxyPasswordKey>(the encryption key or a reference to it)</proxyPasswordKey>
 *      <proxyPasswordKeySource>[key|file|environment|property]</proxyPasswordKeySource>
 *
 *      <sourceReferenceField keep="[false|true]">
 *         (Optional name of field that contains the document reference, when 
 *         the default document reference is not used.  The reference value
 *         will be mapped to the Azure Search ID field. 
 *         Once re-mapped, this metadata source field is 
 *         deleted, unless "keep" is set to true.)
 *      </sourceReferenceField>
 *      <targetReferenceField>
 *         (Name of the Azure Search target field in which to store the
 *         document unique identifier (sourceReferenceField).  If not
 *         specified, default is "id".)
 *      </targetReferenceField>
 *      <sourceContentField keep="[false|true]">
 *         (If you wish to use a metadata field to act as the document 
 *         "content", you can specify that field here.  Default 
 *         does not take a metadata field but rather the document content.
 *         Once re-mapped, the metadata source field is deleted,
 *         unless "keep" is set to true.)
 *      </sourceContentField>
 *      <targetContentField>
 *         (Target repository field name for a document content/body.
 *          Default is "content".)
 *      </targetContentField>
 *      <commitBatchSize>
 *          (Max number of documents to send to Azure Search at once.
 *           Maximum is 1000.)
 *      </commitBatchSize>
 *      <queueDir>(optional path where to queue files)</queueDir>
 *      <queueSize>(max queue size before committing)</queueSize>
 *      <maxRetries>(max retries upon commit failures)</maxRetries>
 *      <maxRetryWait>(max delay in milliseconds between retries)</maxRetryWait>
 *  </committer>
 * 
 * 
 * XML configuration entries expecting millisecond durations
 * can be provided in human-readable format (English only), as per 
 * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s").
 * 
 * 
 * Usage example:
 * 
 * The following example uses the minimum required settings:
 *
 *  <committer class="com.norconex.committer.azuresearch.AzureSearchCommitter">
 *      <endpoint>https://example.search.windows.net</endpoint>
 *      <apiKey>1234567890ABCDEF1234567890ABCDEF</apiKey>
 *      <indexName>sample-index</indexName>
 *  </committer>
 * 
 *  
 * @author Pascal Essiembre
 */
public class AzureSearchCommitter extends AbstractMappedCommitter {

    /** Logger used for all messages emitted by this committer. */
    private static final Logger LOG = 
            LogManager.getLogger(AzureSearchCommitter.class);

    /** Default Azure Search API version */
    public static final String DEFAULT_API_VERSION = "2016-09-01"; 
    /** Default Azure Search document key field */
    public static final String DEFAULT_AZURE_ID_FIELD = "id";
    /** Default Azure Search content field */
    public static final String DEFAULT_AZURE_CONTENT_FIELD = "content";

    // Azure Search endpoint (https://[service name].search.windows.net).
    private String endpoint;
    // API version appended to the REST URL as the "api-version" parameter.
    private String apiVersion = DEFAULT_API_VERSION;
    // Admin key sent with every request in the "api-key" header.
    private String apiKey;
    // Name of the index documents are posted to.
    private String indexName;
    // When true, document references are validated and sent as is
    // instead of being Base64 (URL-safe) encoded.
    private boolean disableReferenceEncoding;
    // When true, validation errors are logged instead of thrown.
    private boolean ignoreValidationErrors;
    // When true, error responses are logged instead of thrown.
    private boolean ignoreResponseErrors;
    // Proxy configuration; applied when the HTTP client is built.
    private final ProxySettings proxySettings = new ProxySettings();
    
    // HTTP client, created on demand by nullSafeHttpClient() and reset
    // to null by close().
    private CloseableHttpClient client;
    // Document indexing URL; assigned together with the client.
    private String restURL;
    // Whether to request a Windows-authentication-capable HTTP client.
    private boolean useWindowsAuth;
    
    /**
     * Creates a committer whose target reference field defaults to 
     * {@link #DEFAULT_AZURE_ID_FIELD} and whose target content field
     * defaults to {@link #DEFAULT_AZURE_CONTENT_FIELD}.
     */
    public AzureSearchCommitter() {
        // The parent no-arg constructor is invoked implicitly.
        setTargetReferenceField(DEFAULT_AZURE_ID_FIELD);
        setTargetContentField(DEFAULT_AZURE_CONTENT_FIELD);
    }
    
	/**
     * Returns the name of the Azure Search index documents are sent to.
     * @return index name
     */
    public String getIndexName() {
        return this.indexName;
    }
    /**
     * Defines the name of the Azure Search index documents are sent to.
     * @param indexName the index name
     */
    public void setIndexName(final String indexName) {
        this.indexName = indexName;
    }

    /**
     * Returns the Azure Search endpoint, in the form
     * https://[service name].search.windows.net.
     * @return Azure Search endpoint
     */
    public String getEndpoint() {
        return this.endpoint;
    }
    /**
     * Defines the Azure Search endpoint, in the form
     * https://[service name].search.windows.net.
     * @param endpoint Azure Search endpoint
     */
    public void setEndpoint(final String endpoint) {
        this.endpoint = endpoint;
    }

    /**
     * Returns the Azure API version in use. Unless changed, this is
     * {@link #DEFAULT_API_VERSION}.
     * @return the Azure API version
     */
    public String getApiVersion() {
        return this.apiVersion;
    }
    /**
     * Defines the Azure API version to use. When set to 
     * <code>null</code>, {@link #DEFAULT_API_VERSION} is used when
     * building the request URL.
     * @param apiVersion Azure API version
     */
    public void setApiVersion(final String apiVersion) {
        this.apiVersion = apiVersion;
    }

    /**
     * Returns the Azure API admin key sent with each request.
     * @return Azure API admin key
     */
    public String getApiKey() {
        return this.apiKey;
    }
    /**
     * Defines the Azure API admin key sent with each request.
     * @param apiKey Azure API admin key
     */
    public void setApiKey(final String apiKey) {
        this.apiKey = apiKey;
    }
    
    /**
     * Tells whether document reference encoding is disabled. 
     * References are encoded with a URL-safe Base64 encoding unless this 
     * returns <code>true</code>, in which case they are sent as is, 
     * provided they pass validation.
     * @return <code>true</code> if disabling reference encoding
     */
    public boolean isDisableReferenceEncoding() {
        return this.disableReferenceEncoding;
    }
    /**
     * Defines whether document reference encoding is disabled. 
     * With <code>false</code>, references are encoded with a URL-safe 
     * Base64 encoding.  With <code>true</code>, references are sent as 
     * is, provided they pass validation.
     * @param disableReferenceEncoding <code>true</code> if disabling 
     *        reference encoding
     */
    public void setDisableReferenceEncoding(
            final boolean disableReferenceEncoding) {
        this.disableReferenceEncoding = disableReferenceEncoding;
    }

    /**
     * Tells whether validation errors are ignored.  Unless this returns
     * <code>true</code>, an exception is thrown when a document contains 
     * a field or reference that fails validation. When ignored, the 
     * error is logged and the faulty field or document is left out of 
     * the commit.
     * @return <code>true</code> when ignoring validation errors
     */
    public boolean isIgnoreValidationErrors() {
        return this.ignoreValidationErrors;
    }
    /**
     * Defines whether validation errors are ignored.
     * With <code>false</code>, an exception is thrown when a document 
     * contains a field or reference that fails validation.  
     * With <code>true</code>, the error is logged and the faulty field 
     * or document is left out of the commit.
     * @param ignoreValidationErrors <code>true</code> when ignoring 
     *        validation errors
     */
    public void setIgnoreValidationErrors(
            final boolean ignoreValidationErrors) {
        this.ignoreValidationErrors = ignoreValidationErrors;
    }
    
    /**
     * Tells whether error responses from Azure Search are ignored.
     * Unless this returns <code>true</code>, an exception is thrown 
     * when the Azure Search response indicates an error; otherwise the 
     * error is only logged.
     * @return <code>true</code> when ignoring response errors
     */
    public boolean isIgnoreResponseErrors() {
        return this.ignoreResponseErrors;
    }
    /**
     * Defines whether error responses from Azure Search are ignored.
     * With <code>false</code>, an exception is thrown when the Azure 
     * Search response indicates an error.  With <code>true</code>, the
     * error is only logged.
     * @param ignoreResponseErrors <code>true</code> when ignoring 
     *        response errors
     */
    public void setIgnoreResponseErrors(final boolean ignoreResponseErrors) {
        this.ignoreResponseErrors = ignoreResponseErrors;
    }

    /**
     * Returns the proxy settings applied when the HTTP client is built. 
     * The returned instance is never <code>null</code>.
     * @return proxy settings
     * @since 1.1.0
     */
    public ProxySettings getProxySettings() {
        return this.proxySettings;
    }

    /**
     * Tells whether integrated Windows Authentication should be used 
     * (if applicable).
     * @return <code>true</code> if using Windows Authentication
     */
    public boolean isUseWindowsAuth() {
        return this.useWindowsAuth;
    }
    /**
     * Defines whether integrated Windows Authentication should be used 
     * (if applicable).
     * @param useWindowsAuth <code>true</code> if using Windows 
     *        Authentication
     */
    public void setUseWindowsAuth(final boolean useWindowsAuth) {
        this.useWindowsAuth = useWindowsAuth;
    }

    /**
     * Delegates the commit to the parent implementation, then closes the
     * HTTP client unless the call is found to originate from 
     * <code>AbstractCommitter#commitIfReady</code> (see 
     * <code>closeIfDone()</code>).
     */
    @Override
    public void commit() {
        super.commit();
        // Must run after the parent commit so pending batches are sent
        // before the client is released.
        closeIfDone();
    }

    //TODO Workaround for Committers not yet having a close() method
    // (or equivalent). The HTTP client is closed only when commit() was 
    // not triggered through AbstractCommitter#commitIfReady, i.e., when 
    // the caller should be the parent framework, which in theory calls
    // commit() only once. Worst case scenario: a new client gets 
    // re-created on the next batch.
    // Remove this method once proper init/close is added to Committers.
    private void closeIfDone() {
        final String committerClass = AbstractCommitter.class.getName();
        for (StackTraceElement frame 
                : Thread.currentThread().getStackTrace()) {
            boolean viaCommitIfReady = 
                    committerClass.equals(frame.getClassName())
                    && "commitIfReady".equals(frame.getMethodName());
            if (viaCommitIfReady) {
                // More commits may follow: keep the client open.
                return;
            }
        }
        close();
    }
    /**
     * Quietly closes the HTTP client (if any) and clears the reference 
     * to it, so a new client is created the next time one is needed.
     */
    protected void close() {
        IOUtils.closeQuietly(this.client);
        this.client = null;
        LOG.info("Azure Search REST API Http Client closed.");
    }
    
    /**
     * Sends a batch of commit operations to Azure Search in a single
     * JSON POST request.
     * <p>
     * Each addition or deletion is converted to a JSON action. 
     * Operations yielding no JSON (e.g., rejected by validation while
     * validation errors are ignored) are left out.  If no operation 
     * remains, a warning is logged and no request is sent.
     * </p>
     * <p>
     * When building or sending the request fails, the HTTP client is 
     * closed before the exception is propagated.
     * </p>
     * @param batch the commit operations to send
     * @throws CommitterException if an operation type is unsupported or
     *         the batch could not be committed
     */
    @Override
    protected void commitBatch(List<ICommitOperation> batch) {
        HttpClient safeClient = nullSafeHttpClient();
        
        LOG.info("Sending " + batch.size() 
                + " commit operations to Azure Search.");
        try {
            StringBuilder json = new StringBuilder();
            for (ICommitOperation op : batch) {
                String toAppend;
                if (op instanceof IAddOperation) {
                    toAppend = buildAddOperationJSON((IAddOperation) op);
                } else if (op instanceof IDeleteOperation) {
                    toAppend = buildDeleteOperationJSON((IDeleteOperation) op); 
                } else {
                    // No explicit close() here: the CommitterException
                    // handler below already takes care of it.
                    throw new CommitterException("Unsupported operation:" + op);
                }
                if (StringUtils.isNotBlank(toAppend)) {
                    // Only non-blank entries are appended, so a non-empty
                    // buffer means at least one entry precedes this one.
                    if (json.length() > 0) {
                        json.append(",\n");
                    }
                    json.append(toAppend);
                }
            }
            
            if (json.length() == 0) {
                LOG.warn("No documents were valid. Nothing committed.");
                return;
            }

            json.insert(0, "{\"value\":[\n");
            json.append("\n]}\n");
            
            if (LOG.isTraceEnabled()) {
                LOG.trace("JSON POST:\n" + StringUtils.trim(json.toString()));
            }
            StringEntity requestEntity = new StringEntity(
                    json.toString(), ContentType.APPLICATION_JSON);

            HttpPost post = new HttpPost(restURL);
            post.addHeader("api-key", getApiKey());
            post.setEntity(requestEntity);
            try {
                handleResponse(safeClient.execute(post));
            } finally {
                // Always give the connection back, including when the 
                // request fails or the response is rejected.
                post.releaseConnection();
            }
            LOG.info("Done sending commit operations to Azure Search.");
        } catch (CommitterException e) {
            close();
            throw e;
        } catch (Exception e) {
            close();
            throw new CommitterException(
                    "Could not commit JSON batch to Azure Search.", e);
        }
    }

    /**
     * Reads the Azure Search response and reports errors.
     * <p>
     * Any status other than 200 (OK) or 201 (Created) is an error: it is
     * logged when response errors are ignored, otherwise the HTTP client
     * is closed and an exception is thrown.  Successful responses are 
     * logged at debug/trace level only.
     * </p>
     * @param response the HTTP response to the batch request
     * @throws IOException if the response body could not be read
     * @throws CommitterException on an error status, unless response 
     *         errors are ignored
     */
    private void handleResponse(HttpResponse response) 
            throws IOException {
        String responseAsString = "";
        HttpEntity respEntity = response.getEntity();
        if (respEntity != null) {
            // Close the content stream even if reading it fails, so the 
            // underlying connection is not left dangling.
            try (java.io.InputStream content = respEntity.getContent()) {
                responseAsString = IOUtils.toString(
                        content, StandardCharsets.UTF_8);
            }
        }
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode != HttpStatus.SC_OK 
                && statusCode != HttpStatus.SC_CREATED) {
            String error = "Invalid HTTP response: \""
                    + response.getStatusLine()
                    + "\". Azure Response: " + responseAsString;
            if (isIgnoreResponseErrors()) {
                LOG.error(error);
            } else {
                close();
                throw new CommitterException(error);
            }            
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Azure Search response status: " 
                        + response.getStatusLine());
            }
            if (LOG.isTraceEnabled()) {
                LOG.trace("Azure Search response:\n" + responseAsString);
            }
        }
    }
    
    private String buildAddOperationJSON(IAddOperation add) {
        String docId = add.getMetadata().getString(getTargetReferenceField());
        if (StringUtils.isBlank(docId)) {
            docId = add.getReference();
        }
        
        // if allow unsafe... do not encode
        if (disableReferenceEncoding) {
            if (!validateDocumentKey(docId)) {
                return null;
            }
        } else {
            docId = Base64.encodeBase64URLSafeString(docId.getBytes());
        }
        
        StringBuilder json = new StringBuilder();
        json.append("{\"@search.action\": \"upload\",");
        append(json, getTargetReferenceField(), docId);
        for (Entry> entry : add.getMetadata().entrySet()) {
            String field = entry.getKey();
            
            // Since target ID was already added (needs to be first), we do 
            // not add it again here
            if (Objects.equals(getTargetReferenceField(), field)) {
                continue;
            }
            if (validateFieldName(field)) {
                json.append(',');
                append(json, field, entry.getValue());
            }
        }
        json.append("}");
        return json.toString();
    }

    private boolean validateFieldName(String field) {
        if (field.startsWith("azureSearch")) {
            return validationError("Document field cannot begin "
                    + "with \"azureSearch\": " + field);
        }
        if (!field.matches("[A-Za-z0-9_]+")) {
            return validationError("Document field cannot have "
                    + "one or more characters other than letters, "
                    + "numbers and underscores: " + field);
        }
        if (field.length() > 128) {
            return validationError("Document field cannot be "
                    + "longer than 128 characters: " + field);
        }
        return true;
    }
    private boolean validateDocumentKey(String docId) {
        if (docId.startsWith("_")) {
            return validationError("Document reference cannot start "
                    + "with an underscore character: " + docId);
        }
        if (!docId.matches("[A-Za-z0-9_\\-=]+")) {
            return validationError("Document reference cannot have one or more "
                    + "characters other than letters, numbers, dashes, "
                    + "underscores, and equal signs: " + docId);
        }
        return true;
    }
    
    private boolean validationError(String error) {
        if (isIgnoreValidationErrors()) {
            LOG.error(error);
            return false;
        }
        throw new CommitterException(error);
    }

    private String buildDeleteOperationJSON(IDeleteOperation del) {
        String docId = Base64.encodeBase64URLSafeString(
                del.getReference().getBytes());
        StringBuilder json = new StringBuilder();
        json.append("{\"@search.action\": \"delete\",");
        append(json, getTargetReferenceField(), docId);
        json.append("}");
        return json.toString();
    }

    /**
     * Appends a field and its values to the JSON being built.  A single
     * value is written as a JSON string; any other number of values 
     * (including none) is written as a JSON array of strings.
     * @param json the JSON being built
     * @param field the field name
     * @param values the field values
     */
    private void append(
            StringBuilder json, String field, List<String> values) {
        if (values.size() == 1) {
            // Typed list: resolves to the single-value (String) overload.
            append(json, field, values.get(0));
            return;
        }
        json.append('"')
            .append(StringEscapeUtils.escapeJson(field))
            .append("\":[");
        boolean first = true;
        for (String value : values) {
            if (!first) {
                json.append(',');
            }
            json.append('"')
            .append(StringEscapeUtils.escapeJson(value))
            .append("\"");
            first = false;
        }
        json.append(']');
    }
    
    /**
     * Appends a field and its single value to the JSON being built, as 
     * a <code>"field":"value"</code> pair with both parts JSON-escaped.
     * @param json the JSON being built
     * @param field the field name
     * @param value the field value
     */
    private void append(StringBuilder json, String field, String value) {
        final String escapedField = StringEscapeUtils.escapeJson(field);
        final String escapedValue = StringEscapeUtils.escapeJson(value);
        json.append('"');
        json.append(escapedField);
        json.append("\":\"");
        json.append(escapedValue);
        json.append("\"");
    }
    
    /**
     * Returns the HTTP client, creating it first if there is none.
     * Creation validates the configuration (endpoint, API key, index 
     * name, batch size) and also computes the REST URL used to post 
     * documents.
     * @return the HTTP client
     * @throws CommitterException if the configuration is invalid
     */
    private synchronized CloseableHttpClient nullSafeHttpClient() {
        if (client != null) {
            return client;
        }

        if (StringUtils.isBlank(getEndpoint())) {
            throw new CommitterException("Endpoint is undefined.");
        }
        if (StringUtils.isBlank(getApiKey())) {
            throw new CommitterException("API admin key is undefined.");
        }
        if (StringUtils.isBlank(getIndexName())) {
            throw new CommitterException("Index name is undefined.");
        }
        if (getCommitBatchSize() > 1000) {
            throw new CommitterException(
                    "Commit batch size cannot be greater than 1000.");
        }

        // Fall back to the default version when none is configured.
        String version = getApiVersion();
        if (version == null) {
            version = DEFAULT_API_VERSION;
        }
        LOG.debug("Azure Search API Version: " + version);

        HttpClientBuilder httpBuilder = 
                (useWindowsAuth && WinHttpClients.isWinAuthAvailable())
                        ? WinHttpClients.custom()
                        : HttpClientBuilder.create();
        buildHttpClient(httpBuilder);
        client = httpBuilder.build();
        restURL = getEndpoint() + "/indexes/" + getIndexName()
                + "/docs/index?api-version=" + version;
        return client;
    }

    /**
     * Configures the HTTP client builder: applies the proxy host and 
     * credentials when proxy settings are set, then caps connections at 
     * 20 in total and 10 per route.
     * @param builder the HTTP client builder to configure
     */
    protected void buildHttpClient(HttpClientBuilder builder) {
        final boolean useProxy = proxySettings.isSet();
        if (useProxy) {
            builder.setProxy(proxySettings.createHttpHost())
                   .setDefaultCredentialsProvider(
                            proxySettings.createCredentialsProvider());
        }
        builder.setMaxConnTotal(20).setMaxConnPerRoute(10);
    }
    
    /**
     * Writes the settings specific to this committer as XML elements, 
     * followed by the proxy settings.
     * @param writer the XML writer
     * @throws XMLStreamException if the XML could not be written
     */
    @Override
    protected void saveToXML(XMLStreamWriter writer) throws XMLStreamException {
        final EnhancedXMLStreamWriter out = 
                new EnhancedXMLStreamWriter(writer);

        // Connection settings
        out.writeElementString("endpoint", getEndpoint());
        out.writeElementString("apiKey", getApiKey());
        out.writeElementString("apiVersion", getApiVersion());
        out.writeElementString("indexName", getIndexName());

        // Behavior flags
        out.writeElementBoolean("useWindowsAuth", isUseWindowsAuth());
        out.writeElementBoolean(
                "disableReferenceEncoding", isDisableReferenceEncoding());
        out.writeElementBoolean(
                "ignoreValidationErrors", isIgnoreValidationErrors());
        out.writeElementBoolean(
                "ignoreResponseErrors", isIgnoreResponseErrors());

        proxySettings.saveProxyToXML(out);
    }

    /**
     * Loads the settings specific to this committer from XML, followed 
     * by the proxy settings.  A setting absent from the XML keeps its 
     * current value.
     * @param xml the XML configuration
     */
    @Override
    protected void loadFromXml(XMLConfiguration xml) {
        // Connection settings
        final String xmlEndpoint = xml.getString("endpoint", getEndpoint());
        setEndpoint(xmlEndpoint);
        final String xmlApiKey = xml.getString("apiKey", getApiKey());
        setApiKey(xmlApiKey);
        final String xmlApiVersion = 
                xml.getString("apiVersion", getApiVersion());
        setApiVersion(xmlApiVersion);
        final String xmlIndexName = xml.getString("indexName", getIndexName());
        setIndexName(xmlIndexName);

        // Behavior flags
        final boolean winAuth = 
                xml.getBoolean("useWindowsAuth", isUseWindowsAuth());
        setUseWindowsAuth(winAuth);
        final boolean noEncoding = xml.getBoolean(
                "disableReferenceEncoding", isDisableReferenceEncoding());
        setDisableReferenceEncoding(noEncoding);
        final boolean skipValidationErrors = xml.getBoolean(
                "ignoreValidationErrors", isIgnoreValidationErrors());
        setIgnoreValidationErrors(skipValidationErrors);
        final boolean skipResponseErrors = xml.getBoolean(
                "ignoreResponseErrors", isIgnoreResponseErrors());
        setIgnoreResponseErrors(skipResponseErrors);

        proxySettings.loadProxyFromXML(xml);
    }
    
    /**
     * Compares this committer with another object field by field, 
     * leaving out the runtime-only "client" and "restURL" fields.
     * @param other the object to compare with
     * @return <code>true</code> if both are considered equal
     */
    @Override
    public boolean equals(final Object other) {
        final String[] runtimeOnlyFields = { "client", "restURL" };
        return EqualsBuilder.reflectionEquals(this, other, runtimeOnlyFields);
    }
    /**
     * Computes a hash code from all fields except the runtime-only 
     * "client" and "restURL" fields.
     * @return the hash code
     */
    @Override
    public int hashCode() {
        final String[] runtimeOnlyFields = { "client", "restURL" };
        return HashCodeBuilder.reflectionHashCode(this, runtimeOnlyFields);
    }
    /**
     * Returns a string representation of this committer built from its 
     * fields. The runtime-only "client" and "restURL" fields are left 
     * out, and so is "apiKey": it is an admin secret and must not end up 
     * in logs through an incidental <code>toString()</code>.
     * @return string representation
     */
    @Override
    public String toString() {
        return new ReflectionToStringBuilder(
                this, ToStringStyle.SHORT_PREFIX_STYLE)
            .setExcludeFieldNames("client", "restURL", "apiKey").toString();
    }    
}
`proxyPasswordKeySource`	`proxyPasswordKey`
`key`	The actual encryption key.
`file`	Path to a file containing the encryption key.
`environment`	Name of an environment variable containing the key.
`property`	Name of a JVM system property containing the key.