com.norconex.committer.cloudsearch.CloudSearchCommitter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of norconex-committer-cloudsearch Show documentation
Show all versions of norconex-committer-cloudsearch Show documentation
Amazon CloudSearch implementation of Norconex Committer.
/* Copyright 2016 Norconex Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.norconex.committer.cloudsearch;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import org.apache.commons.configuration.XMLConfiguration;
import org.apache.commons.lang3.CharEncoding;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.json.JSONObject;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.auth.DefaultAWSCredentialsProviderChain;
import com.amazonaws.services.cloudsearchdomain.AmazonCloudSearchDomainClient;
import com.amazonaws.services.cloudsearchdomain.model.UploadDocumentsRequest;
import com.amazonaws.services.cloudsearchdomain.model.UploadDocumentsResult;
import com.norconex.committer.core.AbstractMappedCommitter;
import com.norconex.committer.core.CommitterException;
import com.norconex.committer.core.IAddOperation;
import com.norconex.committer.core.ICommitOperation;
import com.norconex.committer.core.IDeleteOperation;
import com.norconex.commons.lang.map.Properties;
/**
*
* Commits documents to Amazon CloudSearch.
*
* Authentication:
* An access key and security key are required to connect to interact with
* CloudSearch. For enhanced security, it is best to use one of the methods
* described in {@link DefaultAWSCredentialsProviderChain} for setting them
* (environment variables, system properties, profile file, etc).
* Do not explicitely set "accessKey" and "secretKey" on this class relying on
* safer methods.
*
* XML configuration usage:
*
*
* <committer class="com.norconex.committer.cloudsearch.CloudSearchCommitter">
*
* <-- Mandatory: -->
* <documentEndpoint>(CloudSearch document endpoint)</documentEndpoint>
*
* <-- Mandatory if not configured elsewhere: -->
* <accessKey>
* (Optional CloudSearch access key. Will be taken from environment
* when blank.)
* </accessKey>
* <secretKey>
* (Optional CloudSearch secret key. Will be taken from environment
* when blank.)
* </secretKey>
*
* <-- Optional settings: -->
* <sourceReferenceField keep="[false|true]">
* (Optional name of field that contains the document reference, when
* the default document reference is not used. The reference value
* will be mapped to CloudSearch "id" field, which is mandatory.
* Once re-mapped, this metadata source field is
* deleted, unless "keep" is set to true
.)
* </sourceReferenceField>
* <sourceContentField keep="[false|true]">
* (If you wish to use a metadata field to act as the document
* "content", you can specify that field here. Default
* does not take a metadata field but rather the document content.
* Once re-mapped, the metadata source field is deleted,
* unless "keep" is set to true
.)
* </sourceContentField>
* <targetContentField>
* (CloudSearch target field name for a document content/body.
* Default is: content)
* </targetContentField>
* <commitBatchSize>
* (Max number of docs to send CloudSearch at once. If you experience
* memory problems, lower this number. Default is 100.)
* </commitBatchSize>
* <queueDir>(Optional path where to queue files)</queueDir>
* <queueSize>
* (Max queue size before committing. Default is 1000.)
* </queueSize>
* <maxRetries>
* (Max retries upon commit failures. Default is 0.)
* </maxRetries>
* <maxRetryWait>
* (Max delay between retries. Default is 0.)
* </maxRetryWait>
* </committer>
*
*
* @author El-Hebri Khiari
* @author Pascal Essiembre
*/
public class CloudSearchCommitter extends AbstractMappedCommitter {
private static final Logger LOG =
LogManager.getLogger(CloudSearchCommitter.class);
/** CloudSearch mandatory ID field */
public static final String COULDSEARCH_ID_FIELD = "id";
/** Default CloudSearch content field */
public static final String DEFAULT_COULDSEARCH_CONTENT_FIELD = "content";
private static final String TEMP_TARGET_ID_FIELD = "__nx.cloudsearch.id";
private AmazonCloudSearchDomainClient awsClient;
private boolean needNewAwsClient = true;
private String documentEndpoint;
private String accessKey;
private String secretKey;
public CloudSearchCommitter() {
this(null);
}
public CloudSearchCommitter(String documentEndpoint) {
super();
this.documentEndpoint = documentEndpoint;
setTargetContentField(DEFAULT_COULDSEARCH_CONTENT_FIELD);
super.setTargetReferenceField(TEMP_TARGET_ID_FIELD);
}
/**
* Gets the CloudSearch document endpoint.
* @return document endpoint
*/
public String getDocumentEndpoint() {
return documentEndpoint;
}
/**
* Sets the CloudSearch document endpoint.
* @param documentEndpoint document endpoint
*/
public void setDocumentEndpoint(String documentEndpoint) {
this.documentEndpoint = documentEndpoint;
needNewAwsClient = true;
}
/**
* Gets the CloudSearch access key. If null
, the access key
* will be obtained from the environment, as detailed in
* {@link DefaultAWSCredentialsProviderChain}.
* @return the access key
*/
public String getAccessKey() {
return accessKey;
}
/**
* Sets the CloudSearch access key. If null
, the access key
* will be obtained from the environment, as detailed in
* {@link DefaultAWSCredentialsProviderChain}.
* @param accessKey the access key
*/
public void setAccessKey(String accessKey) {
this.accessKey = accessKey;
needNewAwsClient = true;
}
/**
* Gets the CloudSearch secret key. If null
, the secret key
* will be obtained from the environment, as detailed in
* {@link DefaultAWSCredentialsProviderChain}.
* @return the secret key
*/
public String getSecretKey() {
return secretKey;
}
/**
* Sets the CloudSearch secret key. If null
, the secret key
* will be obtained from the environment, as detailed in
* {@link DefaultAWSCredentialsProviderChain}.
* @param secretKey the secret key
*/
public void setSecretKey(String secretKey) {
this.secretKey = secretKey;
needNewAwsClient = true;
}
/**
* This method is not supported and will throw an
* {@link UnsupportedOperationException} if invoked. With CloudSearch,
* the target field for a document unique id is always "id".
* @param targetReferenceField the target field
*/
@Override
public void setTargetReferenceField(String targetReferenceField) {
if (StringUtils.isNotBlank(targetReferenceField)) {
LOG.warn("Target reference field is always \"id\" "
+ "and cannot be changed.");
}
}
@Override
protected void commitBatch(List batch) {
LOG.info("Sending " + batch.size()
+ " documents to AWS CloudSearch for addition/deletion.");
List documentBatch = new ArrayList<>();
for (ICommitOperation op : batch) {
if (op instanceof IAddOperation) {
documentBatch.add(buildJsonDocumentAddition(
((IAddOperation) op).getMetadata()));
} else if (op instanceof IDeleteOperation) {
documentBatch.add(buildJsonDocumentDeletion(
((IDeleteOperation) op).getReference()));
} else {
throw new CommitterException("Unsupported operation:" + op);
}
}
uploadBatchToCloudSearch(documentBatch);
}
private void uploadBatchToCloudSearch(List documentBatch) {
// Convert the JSON list to String and read it as a stream from memory
// (for increased performance), for it to be usable by the AWS
// CloudSearch UploadRequest. If memory becomes a concern, consider
// streaming to file.
// ArrayList.toString() joins the elements in a JSON-compliant way.
byte[] bytes;
try {
bytes = documentBatch.toString().getBytes(CharEncoding.UTF_8);
} catch (UnsupportedEncodingException e) {
throw new CommitterException("UTF-8 not supported by OS.", e);
}
try (ByteArrayInputStream is = new ByteArrayInputStream(bytes)) {
UploadDocumentsRequest uploadRequest = new UploadDocumentsRequest();
uploadRequest.setContentType("application/json");
uploadRequest.setDocuments(is);
uploadRequest.setContentLength((long) bytes.length);
ensureAWSClient();
UploadDocumentsResult result =
awsClient.uploadDocuments(uploadRequest);
LOG.info(result.getAdds() + " Add requests and "
+ result.getDeletes() + " Delete requests "
+ "sent to the AWS CloudSearch domain.");
} catch (IOException e) {
throw new CommitterException(
"Could not upload request to CloudSearch.", e);
}
}
private synchronized void ensureAWSClient() {
if (StringUtils.isBlank(getDocumentEndpoint())) {
throw new CommitterException("Document endpoint is undefined.");
}
if (!needNewAwsClient) {
return;
}
if (StringUtils.isAnyBlank(accessKey, secretKey)) {
awsClient = new AmazonCloudSearchDomainClient(
new DefaultAWSCredentialsProviderChain());
} else {
awsClient = new AmazonCloudSearchDomainClient(
new BasicAWSCredentials(accessKey, secretKey));
}
awsClient.setEndpoint(documentEndpoint);
needNewAwsClient = false;
}
private JSONObject buildJsonDocumentAddition(Properties fields) {
if (fields.isEmpty()) {
throw new CommitterException(
"Attempting to commit an empty document.");
}
Map documentMap = new HashMap<>();
documentMap.put("type", "add");
documentMap.put("id", fields.getString(TEMP_TARGET_ID_FIELD));
fields.remove(TEMP_TARGET_ID_FIELD);
Map fieldMap = new HashMap<>();
for (String key : fields.keySet()) {
List values = fields.getStrings(key);
if (!StringUtils.equals(key, "id")) {
/*size = 1 : non-empty single-valued field
size > 1 : non-empty multi-valued field
size = 0 : empty field
*/
if (values.size() == 1) {
fieldMap.put(key, values.get(0));
} else if (values.size() > 1){
fieldMap.put(key, values);
} else {
fieldMap.put(key, "");
}
}
}
documentMap.put("fields", fieldMap);
return new JSONObject(documentMap);
}
private JSONObject buildJsonDocumentDeletion(String reference) {
Map documentMap = new HashMap<>();
documentMap.put("type", "delete");
documentMap.put("id", reference);
return new JSONObject(documentMap);
}
@Override
protected void saveToXML(XMLStreamWriter writer) throws XMLStreamException {
writer.writeStartElement("documentEndpoint");
writer.writeCharacters(documentEndpoint);
writer.writeEndElement();
writer.writeStartElement("accessKey");
writer.writeCharacters(accessKey);
writer.writeEndElement();
writer.writeStartElement("secretKey");
writer.writeCharacters(secretKey);
writer.writeEndElement();
}
@Override
protected void loadFromXml(XMLConfiguration xml) {
setDocumentEndpoint(xml.getString(
"documentEndpoint", getDocumentEndpoint()));
setAccessKey(xml.getString("accessKey", getAccessKey()));
setSecretKey(xml.getString("secretKey", getSecretKey()));
}
@Override
public int hashCode() {
return new HashCodeBuilder()
.appendSuper(super.hashCode())
.append(documentEndpoint)
.append(accessKey)
.append(secretKey)
.toHashCode();
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (!(obj instanceof CloudSearchCommitter)) {
return false;
}
CloudSearchCommitter other = (CloudSearchCommitter) obj;
return new EqualsBuilder()
.appendSuper(super.equals(obj))
.append(documentEndpoint, other.documentEndpoint)
.append(accessKey, other.accessKey)
.append(secretKey, other.secretKey)
.isEquals();
}
@Override
public String toString() {
return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE)
.appendSuper(super.toString())
.append("documentEndpoint", documentEndpoint)
.append("accessKey", accessKey)
.append("secretKey", secretKey)
.toString();
}
}