// Page header residue: "JAR search and dependency download from the Maven repository".
// Class: com.norconex.committer.azuresearch.AzureSearchCommitter (Maven / Gradle / Ivy)
// Artifact: norconex-committer-azuresearch
/* Copyright 2017 Norconex Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.norconex.committer.azuresearch;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map.Entry;
import java.util.Objects;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.configuration.XMLConfiguration;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.WinHttpClients;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import com.norconex.committer.core.AbstractCommitter;
import com.norconex.committer.core.AbstractMappedCommitter;
import com.norconex.committer.core.CommitterException;
import com.norconex.committer.core.IAddOperation;
import com.norconex.committer.core.ICommitOperation;
import com.norconex.committer.core.IDeleteOperation;
import com.norconex.commons.lang.encrypt.EncryptionUtil;
import com.norconex.commons.lang.net.ProxySettings;
import com.norconex.commons.lang.time.DurationParser;
import com.norconex.commons.lang.xml.EnhancedXMLStreamWriter;
/**
*
* Commits documents to Microsoft Azure Search.
*
*
* Document reference encoding
*
* By default the document reference (Azure Search Document Key) is
* encoded using URL-safe Base64 encoding. This is Azure Search recommended
* approach when a document unique id can contain special characters
* (e.g. a URL). If you know your document references to be safe
* (e.g. a sequence number), you can
* set {@link #setDisableReferenceEncoding(boolean)} to <code>true</code>.
* To otherwise store a reference value un-encoded, you can additionally
* store it in a field other than your reference ("id") field.
*
*
* Field names and errors
*
* Azure Search will produce an error if any of the documents in a submitted
* batch contains one or more fields with invalid characters. To prevent
* sending those in vain, the committer will validate your fields
* and throw an exception upon encountering an invalid one.
* To prevent exceptions from being thrown, you can set
* {@link #setIgnoreValidationErrors(boolean)} to <code>true</code> to
* log those errors instead.
*
*
* An exception will also be thrown for errors returned by Azure Search
* (e.g. a field is not defined in your
* Azure Search schema). To also log those errors instead of throwing an
* exception, you can set {@link #setIgnoreResponseErrors(boolean)}
* to <code>true</code>.
*
* Field naming rules
*
* Those are the field naming rules mandated for Azure Search (in force
* for Azure Search version 2016-09-01):
* Search version
*
*
* - Document reference (ID): Letters, numbers, dashes ("-"),
* underscores ("_"), and equal signs ("="). First character cannot be
* an underscore.
* - Document field name: Letters, numbers, underscores ("_"). First
* character must be a letter. Cannot start with "azureSearch".
* Maximum length is 128 characters.
*
*
* Password encryption in XML configuration:
*
* The <code>proxyPassword</code> can take a password that has been
* encrypted using {@link EncryptionUtil} (or command-line encrypt.[bat|sh]).
* In order for the password to be decrypted properly by the crawler, you need
* to specify the encryption key used to encrypt it. The key can be stored
* in a few supported locations and a combination of
* <code>proxyPasswordKey</code>
* and <code>proxyPasswordKeySource</code> must be specified to properly
* locate the key. The supported sources are:
*
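* <table border="1" summary="">
* <tr>
* <th><code>proxyPasswordKeySource</code></th>
* <th><code>proxyPasswordKey</code></th>
* </tr>
* <tr>
* <td><code>key</code></td>
* <td>The actual encryption key.</td>
* </tr>
* <tr>
* <td><code>file</code></td>
* <td>Path to a file containing the encryption key.</td>
* </tr>
* <tr>
* <td><code>environment</code></td>
* <td>Name of an environment variable containing the key.</td>
* </tr>
* <tr>
* <td><code>property</code></td>
* <td>Name of a JVM system property containing the key.</td>
* </tr>
* </table>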
* XML configuration usage:
* <pre>
* &lt;committer class="com.norconex.committer.azuresearch.AzureSearchCommitter"&gt;
*     &lt;endpoint&gt;(Azure Search endpoint)&lt;/endpoint&gt;
*     &lt;apiVersion&gt;(Optional Azure Search API version to use)&lt;/apiVersion&gt;
*     &lt;apiKey&gt;(Azure Search API admin key)&lt;/apiKey&gt;
*     &lt;indexName&gt;(Name of the index to use)&lt;/indexName&gt;
*     &lt;disableReferenceEncoding&gt;[false|true]&lt;/disableReferenceEncoding&gt;
*     &lt;ignoreValidationErrors&gt;[false|true]&lt;/ignoreValidationErrors&gt;
*     &lt;ignoreResponseErrors&gt;[false|true]&lt;/ignoreResponseErrors&gt;
*     &lt;useWindowsAuth&gt;[false|true]&lt;/useWindowsAuth&gt;
*
*     &lt;proxyHost&gt;...&lt;/proxyHost&gt;
*     &lt;proxyPort&gt;...&lt;/proxyPort&gt;
*     &lt;proxyRealm&gt;...&lt;/proxyRealm&gt;
*     &lt;proxyScheme&gt;...&lt;/proxyScheme&gt;
*     &lt;proxyUsername&gt;...&lt;/proxyUsername&gt;
*     &lt;proxyPassword&gt;...&lt;/proxyPassword&gt;
*     &lt;!-- Use the following if password is encrypted. --&gt;
*     &lt;proxyPasswordKey&gt;(the encryption key or a reference to it)&lt;/proxyPasswordKey&gt;
*     &lt;proxyPasswordKeySource&gt;[key|file|environment|property]&lt;/proxyPasswordKeySource&gt;
*
*     &lt;sourceReferenceField keep="[false|true]"&gt;
*         (Optional name of field that contains the document reference, when
*         the default document reference is not used. The reference value
*         will be mapped to the Azure Search ID field.
*         Once re-mapped, this metadata source field is
*         deleted, unless "keep" is set to <code>true</code>.)
*     &lt;/sourceReferenceField&gt;
*     &lt;targetReferenceField&gt;
*         (Name of Azure Search target field where to store a document unique
*         identifier (sourceReferenceField). If not specified,
*         default is "id".)
*     &lt;/targetReferenceField&gt;
*     &lt;sourceContentField keep="[false|true]"&gt;
*         (If you wish to use a metadata field to act as the document
*         "content", you can specify that field here. Default
*         does not take a metadata field but rather the document content.
*         Once re-mapped, the metadata source field is deleted,
*         unless "keep" is set to <code>true</code>.)
*     &lt;/sourceContentField&gt;
*     &lt;targetContentField&gt;
*         (Target repository field name for a document content/body.
*         Default is "content".)
*     &lt;/targetContentField&gt;
*     &lt;commitBatchSize&gt;
*         (Max number of documents to send to Azure Search at once.
*         Maximum is 1000.)
*     &lt;/commitBatchSize&gt;
*     &lt;queueDir&gt;(optional path where to queue files)&lt;/queueDir&gt;
*     &lt;queueSize&gt;(max queue size before committing)&lt;/queueSize&gt;
*     &lt;maxRetries&gt;(max retries upon commit failures)&lt;/maxRetries&gt;
*     &lt;maxRetryWait&gt;(max delay in milliseconds between retries)&lt;/maxRetryWait&gt;
* &lt;/committer&gt;
* </pre>
*
* XML configuration entries expecting millisecond durations
* can be provided in human-readable format (English only), as per
* {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s").
*
*
* Usage example:
*
* The following example uses the minimum required settings:
*
* <pre>
* &lt;committer class="com.norconex.committer.azuresearch.AzureSearchCommitter"&gt;
*     &lt;endpoint&gt;https://example.search.windows.net&lt;/endpoint&gt;
*     &lt;apiKey&gt;1234567890ABCDEF1234567890ABCDEF&lt;/apiKey&gt;
*     &lt;indexName&gt;sample-index&lt;/indexName&gt;
* &lt;/committer&gt;
* </pre>
*
* @author Pascal Essiembre
*/
public class AzureSearchCommitter extends AbstractMappedCommitter {
private static final Logger LOG =
LogManager.getLogger(AzureSearchCommitter.class);
/** Default Azure Search API version */
public static final String DEFAULT_API_VERSION = "2016-09-01";
/** Default Azure Search document key field */
public static final String DEFAULT_AZURE_ID_FIELD = "id";
/** Default Azure Search content field */
public static final String DEFAULT_AZURE_CONTENT_FIELD = "content";
private String endpoint;
private String apiVersion = DEFAULT_API_VERSION;
private String apiKey;
private String indexName;
private boolean disableReferenceEncoding;
private boolean ignoreValidationErrors;
private boolean ignoreResponseErrors;
private final ProxySettings proxySettings = new ProxySettings();
private CloseableHttpClient client;
private String restURL;
private boolean useWindowsAuth;
/**
* Constructor.
*/
public AzureSearchCommitter() {
super();
setTargetReferenceField(DEFAULT_AZURE_ID_FIELD);
setTargetContentField(DEFAULT_AZURE_CONTENT_FIELD);
}
/**
* Gets the index name.
* @return index name
*/
public String getIndexName() {
return indexName;
}
/**
* Sets the index name.
* @param indexName the index name
*/
public void setIndexName(String indexName) {
this.indexName = indexName;
}
/**
* Gets the Azure Search endpoint
* (https://[service name].search.windows.net).
* @return Azure Search endpoint
*/
public String getEndpoint() {
return endpoint;
}
/**
* Sets the Azure Search endpoint
* (https://[service name].search.windows.net).
* @param endpoint Azure Search endpoint
*/
public void setEndpoint(String endpoint) {
this.endpoint = endpoint;
}
/**
* Gets the Azure API version. Default is {@link #DEFAULT_API_VERSION}.
* @return the Azure API version
*/
public String getApiVersion() {
return apiVersion;
}
/**
* Sets the Azure API version.
* @param apiVersion Azure API version
*/
public void setApiVersion(String apiVersion) {
this.apiVersion = apiVersion;
}
/**
* Gets the Azure API admin key.
* @return Azure API admin key
*/
public String getApiKey() {
return apiKey;
}
/**
* Sets the Azure API admin key.
* @param apiKey Azure API admin key
*/
public void setApiKey(String apiKey) {
this.apiKey = apiKey;
}
/**
* Whether to disable document reference encoding. By default, references
* are encoded using a URL-safe Base64 encoding. When true
,
* document references will be sent as is if they pass validation.
* @return true
if disabling reference encoding
*/
public boolean isDisableReferenceEncoding() {
return disableReferenceEncoding;
}
/**
* Sets whether to disable document reference encoding. When
* false
, references are encoded using a URL-safe Base64
* encoding. When true
, document references will be sent as
* is if they pass validation.
* @param disableReferenceEncoding true
if disabling
* reference encoding
*/
public void setDisableReferenceEncoding(boolean disableReferenceEncoding) {
this.disableReferenceEncoding = disableReferenceEncoding;
}
/**
* Whether to ignore validation errors. By default, an exception is
* thrown if a document contains a field that Azure Search will reject.
* When true
the validation errors are logged
* instead and the faulty field or document is not committed.
* @return true
when ignoring validation errors
*/
public boolean isIgnoreValidationErrors() {
return ignoreValidationErrors;
}
/**
* Sets whether to ignore validation errors.
* When false
, an exception is
* thrown if a document contains a field that Azure Search will reject.
* When true
the validation errors are logged
* instead and the faulty field or document is not committed.
* @param ignoreValidationErrors true
when ignoring validation
* errors
*/
public void setIgnoreValidationErrors(boolean ignoreValidationErrors) {
this.ignoreValidationErrors = ignoreValidationErrors;
}
/**
* Whether to ignore response errors. By default, an exception is
* thrown if the Azure Search response contains an error.
* When true
the errors are logged instead.
* @return true
when ignoring response errors
*/
public boolean isIgnoreResponseErrors() {
return ignoreResponseErrors;
}
/**
* Sets whether to ignore response errors.
* When false
, an exception is
* thrown if the Azure Search response contains an error.
* When true
the errors are logged instead.
* @param ignoreResponseErrors true
when ignoring response
* errors
*/
public void setIgnoreResponseErrors(boolean ignoreResponseErrors) {
this.ignoreResponseErrors = ignoreResponseErrors;
}
/**
* Gets the proxy settings. Never null
.
* @return proxy settings
* @since 1.1.0
*/
public ProxySettings getProxySettings() {
return proxySettings;
}
/**
* Whether to use integrated Windows Authentication (if applicable).
* @return true
if using Windows Authentication
*/
public boolean isUseWindowsAuth() {
return useWindowsAuth;
}
/**
* Sets whether to use integrated Windows Authentication (if applicable).
* @param useWindowsAuth true
if using Windows Authentication
*/
public void setUseWindowsAuth(boolean useWindowsAuth) {
this.useWindowsAuth = useWindowsAuth;
}
@Override
public void commit() {
super.commit();
closeIfDone();
}
//TODO The following is a workaround to not having
// a close() method (or equivalent) on the Committers yet.
// So we check that the caller is not itself, which means it should
// be the parent framework, which should in theory, call this only
// once. This is safe to do as the worst case scenario is that a new
// client is re-created.
// Remove this method once proper init/close is added to Committers
private void closeIfDone() {
StackTraceElement[] els = Thread.currentThread().getStackTrace();
for (StackTraceElement el : els) {
if (AbstractCommitter.class.getName().equals(el.getClassName())
&& "commitIfReady".equals(el.getMethodName())) {
return;
}
}
close();
}
protected void close() {
IOUtils.closeQuietly(client);
client = null;
LOG.info("Azure Search REST API Http Client closed.");
}
@Override
protected void commitBatch(List batch) {
HttpClient safeClient = nullSafeHttpClient();
LOG.info("Sending " + batch.size()
+ " commit operations to Azure Search.");
try {
boolean first = true;
StringBuilder json = new StringBuilder();
for (ICommitOperation op : batch) {
String toAppend;
if (op instanceof IAddOperation) {
toAppend = buildAddOperationJSON((IAddOperation) op);
} else if (op instanceof IDeleteOperation) {
toAppend = buildDeleteOperationJSON((IDeleteOperation) op);
} else {
close();
throw new CommitterException("Unsupported operation:" + op);
}
if (StringUtils.isNotBlank(toAppend)) {
if (!first) {
json.append(",\n");
}
json.append(toAppend);
first = false;
}
}
if (json.length() == 0) {
LOG.warn("No documents were valid. Nothing committed.");
return;
}
json.insert(0, "{\"value\":[\n");
json.append("\n]}\n");
if (LOG.isTraceEnabled()) {
LOG.trace("JSON POST:\n" + StringUtils.trim(json.toString()));
}
StringEntity requestEntity = new StringEntity(
json.toString(), ContentType.APPLICATION_JSON);
HttpPost post = new HttpPost(restURL);
post.addHeader("api-key", getApiKey());
post.setEntity(requestEntity);
HttpResponse response = safeClient.execute(post);
handleResponse(response);
post.releaseConnection();
LOG.info("Done sending commit operations to Azure Search.");
} catch (CommitterException e) {
close();
throw e;
} catch (Exception e) {
close();
throw new CommitterException(
"Could not commit JSON batch to Azure Search.", e);
}
}
private void handleResponse(HttpResponse response)
throws IOException {
HttpEntity respEntity = response.getEntity();
String responseAsString = "";
if (respEntity != null) {
responseAsString = IOUtils.toString(
respEntity.getContent(), StandardCharsets.UTF_8);
}
int statusCode = response.getStatusLine().getStatusCode();
if (statusCode != HttpStatus.SC_OK
&& statusCode != HttpStatus.SC_CREATED) {
String error = "Invalid HTTP response: \""
+ response.getStatusLine()
+ "\". Azure Response: " + responseAsString;
if (isIgnoreResponseErrors()) {
LOG.error(error);
} else {
close();
throw new CommitterException(error);
}
} else {
if (LOG.isDebugEnabled()) {
LOG.debug("Azure Search response status: "
+ response.getStatusLine());
}
if (LOG.isTraceEnabled()) {
LOG.trace("Azure Search response:\n" + responseAsString);
}
}
}
private String buildAddOperationJSON(IAddOperation add) {
String docId = add.getMetadata().getString(getTargetReferenceField());
if (StringUtils.isBlank(docId)) {
docId = add.getReference();
}
// if allow unsafe... do not encode
if (disableReferenceEncoding) {
if (!validateDocumentKey(docId)) {
return null;
}
} else {
docId = Base64.encodeBase64URLSafeString(docId.getBytes());
}
StringBuilder json = new StringBuilder();
json.append("{\"@search.action\": \"upload\",");
append(json, getTargetReferenceField(), docId);
for (Entry> entry : add.getMetadata().entrySet()) {
String field = entry.getKey();
// Since target ID was already added (needs to be first), we do
// not add it again here
if (Objects.equals(getTargetReferenceField(), field)) {
continue;
}
if (validateFieldName(field)) {
json.append(',');
append(json, field, entry.getValue());
}
}
json.append("}");
return json.toString();
}
private boolean validateFieldName(String field) {
if (field.startsWith("azureSearch")) {
return validationError("Document field cannot begin "
+ "with \"azureSearch\": " + field);
}
if (!field.matches("[A-Za-z0-9_]+")) {
return validationError("Document field cannot have "
+ "one or more characters other than letters, "
+ "numbers and underscores: " + field);
}
if (field.length() > 128) {
return validationError("Document field cannot be "
+ "longer than 128 characters: " + field);
}
return true;
}
private boolean validateDocumentKey(String docId) {
if (docId.startsWith("_")) {
return validationError("Document reference cannot start "
+ "with an underscore character: " + docId);
}
if (!docId.matches("[A-Za-z0-9_\\-=]+")) {
return validationError("Document reference cannot have one or more "
+ "characters other than letters, numbers, dashes, "
+ "underscores, and equal signs: " + docId);
}
return true;
}
private boolean validationError(String error) {
if (isIgnoreValidationErrors()) {
LOG.error(error);
return false;
}
throw new CommitterException(error);
}
private String buildDeleteOperationJSON(IDeleteOperation del) {
String docId = Base64.encodeBase64URLSafeString(
del.getReference().getBytes());
StringBuilder json = new StringBuilder();
json.append("{\"@search.action\": \"delete\",");
append(json, getTargetReferenceField(), docId);
json.append("}");
return json.toString();
}
private void append(StringBuilder json, String field, List values) {
if (values.size() == 1) {
append(json, field, values.get(0));
return;
}
json.append('"')
.append(StringEscapeUtils.escapeJson(field))
.append("\":[");
boolean first = true;
for (String value : values) {
if (!first) {
json.append(',');
}
json.append('"')
.append(StringEscapeUtils.escapeJson(value))
.append("\"");
first = false;
}
json.append(']');
}
private void append(StringBuilder json, String field, String value) {
json.append('"')
.append(StringEscapeUtils.escapeJson(field))
.append("\":\"")
.append(StringEscapeUtils.escapeJson(value))
.append("\"");
}
private synchronized CloseableHttpClient nullSafeHttpClient() {
if (client == null) {
if (StringUtils.isBlank(getEndpoint())) {
throw new CommitterException("Endpoint is undefined.");
}
if (StringUtils.isBlank(getApiKey())) {
throw new CommitterException("API admin key is undefined.");
}
if (StringUtils.isBlank(getIndexName())) {
throw new CommitterException("Index name is undefined.");
}
if (getCommitBatchSize() > 1000) {
throw new CommitterException(
"Commit batch size cannot be greater than 1000.");
}
String version = ObjectUtils.defaultIfNull(
getApiVersion(), DEFAULT_API_VERSION);
LOG.debug("Azure Search API Version: " + version);
HttpClientBuilder httpBuilder;
if (useWindowsAuth && WinHttpClients.isWinAuthAvailable()) {
httpBuilder = WinHttpClients.custom();
} else {
httpBuilder = HttpClientBuilder.create();
}
buildHttpClient(httpBuilder);
client = httpBuilder.build();
restURL = getEndpoint() + "/indexes/" + getIndexName()
+ "/docs/index?api-version=" + version;
}
return client;
}
protected void buildHttpClient(HttpClientBuilder builder) {
if (proxySettings.isSet()) {
builder.setProxy(proxySettings.createHttpHost());
builder.setDefaultCredentialsProvider(
proxySettings.createCredentialsProvider());
}
builder.setMaxConnTotal(20);
builder.setMaxConnPerRoute(10);
}
@Override
protected void saveToXML(XMLStreamWriter writer) throws XMLStreamException {
EnhancedXMLStreamWriter w = new EnhancedXMLStreamWriter(writer);
w.writeElementString("endpoint", getEndpoint());
w.writeElementString("apiKey", getApiKey());
w.writeElementString("apiVersion", getApiVersion());
w.writeElementString("indexName", getIndexName());
w.writeElementBoolean("useWindowsAuth", isUseWindowsAuth());
w.writeElementBoolean(
"disableReferenceEncoding", isDisableReferenceEncoding());
w.writeElementBoolean(
"ignoreValidationErrors", isIgnoreValidationErrors());
w.writeElementBoolean("ignoreResponseErrors", isIgnoreResponseErrors());
proxySettings.saveProxyToXML(w);
}
@Override
protected void loadFromXml(XMLConfiguration xml) {
setEndpoint(xml.getString("endpoint", getEndpoint()));
setApiKey(xml.getString("apiKey", getApiKey()));
setApiVersion(xml.getString("apiVersion", getApiVersion()));
setIndexName(xml.getString("indexName", getIndexName()));
setUseWindowsAuth(xml.getBoolean("useWindowsAuth", isUseWindowsAuth()));
setDisableReferenceEncoding(xml.getBoolean("disableReferenceEncoding",
isDisableReferenceEncoding()));
setIgnoreValidationErrors(xml.getBoolean("ignoreValidationErrors",
isIgnoreValidationErrors()));
setIgnoreResponseErrors(xml.getBoolean(
"ignoreResponseErrors", isIgnoreResponseErrors()));
proxySettings.loadProxyFromXML(xml);
}
@Override
public boolean equals(final Object other) {
return EqualsBuilder.reflectionEquals(this, other, "client", "restURL");
}
@Override
public int hashCode() {
return HashCodeBuilder.reflectionHashCode(this, "client", "restURL");
}
@Override
public String toString() {
return new ReflectionToStringBuilder(
this, ToStringStyle.SHORT_PREFIX_STYLE)
.setExcludeFieldNames("client", "restURL").toString();
}
}