All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.norconex.committer.cloudsearch.CloudSearchCommitter Maven / Gradle / Ivy

There is a newer version: 2.0.0
Show newest version
/* Copyright 2016 Norconex Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.norconex.committer.cloudsearch;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;

import org.apache.commons.configuration.XMLConfiguration;
import org.apache.commons.lang3.CharEncoding;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.json.JSONObject;

import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.auth.DefaultAWSCredentialsProviderChain;
import com.amazonaws.services.cloudsearchdomain.AmazonCloudSearchDomainClient;
import com.amazonaws.services.cloudsearchdomain.model.UploadDocumentsRequest;
import com.amazonaws.services.cloudsearchdomain.model.UploadDocumentsResult;
import com.norconex.committer.core.AbstractMappedCommitter;
import com.norconex.committer.core.CommitterException;
import com.norconex.committer.core.IAddOperation;
import com.norconex.committer.core.ICommitOperation;
import com.norconex.committer.core.IDeleteOperation;
import com.norconex.commons.lang.map.Properties;

/**
 * 

* Commits documents to Amazon CloudSearch. *

*

Authentication:

* An access key and security key are required to connect to interact with * CloudSearch. For enhanced security, it is best to use one of the methods * described in {@link DefaultAWSCredentialsProviderChain} for setting them * (environment variables, system properties, profile file, etc). * Do not explicitely set "accessKey" and "secretKey" on this class relying on * safer methods. * *

XML configuration usage:

* *
 *  <committer class="com.norconex.committer.cloudsearch.CloudSearchCommitter">
 *  
 *      <-- Mandatory: -->
 *      <documentEndpoint>(CloudSearch document endpoint)</documentEndpoint>
 *      
 *      <-- Mandatory if not configured elsewhere: -->
 *      <accessKey>
 *         (Optional CloudSearch access key. Will be taken from environment 
 *          when blank.)
 *      </accessKey>
 *      <secretKey>
 *         (Optional CloudSearch secret key. Will be taken from environment
 *          when blank.)
 *      </secretKey>
 *      
 *      <-- Optional settings: -->
 *      <sourceReferenceField keep="[false|true]">
 *         (Optional name of field that contains the document reference, when 
 *         the default document reference is not used.  The reference value
 *         will be mapped to CloudSearch "id" field, which is mandatory.
 *         Once re-mapped, this metadata source field is 
 *         deleted, unless "keep" is set to true.)
 *      </sourceReferenceField>
 *      <sourceContentField keep="[false|true]">
 *         (If you wish to use a metadata field to act as the document 
 *         "content", you can specify that field here.  Default 
 *         does not take a metadata field but rather the document content.
 *         Once re-mapped, the metadata source field is deleted,
 *         unless "keep" is set to true.)
 *      </sourceContentField>
 *      <targetContentField>
 *         (CloudSearch target field name for a document content/body.
 *          Default is: content)
 *      </targetContentField>
 *      <commitBatchSize>
 *          (Max number of docs to send CloudSearch at once. If you experience
 *           memory problems, lower this number.  Default is 100.)
 *      </commitBatchSize>
 *      <queueDir>(Optional path where to queue files)</queueDir>
 *      <queueSize>
 *          (Max queue size before committing. Default is 1000.)
 *      </queueSize>
 *      <maxRetries>
 *          (Max retries upon commit failures. Default is 0.)
 *      </maxRetries>
 *      <maxRetryWait>
 *          (Max delay between retries. Default is 0.)
 *      </maxRetryWait>
 *  </committer>
 * 
* * @author El-Hebri Khiari * @author Pascal Essiembre */ public class CloudSearchCommitter extends AbstractMappedCommitter { private static final Logger LOG = LogManager.getLogger(CloudSearchCommitter.class); /** CloudSearch mandatory ID field */ public static final String COULDSEARCH_ID_FIELD = "id"; /** Default CloudSearch content field */ public static final String DEFAULT_COULDSEARCH_CONTENT_FIELD = "content"; private static final String TEMP_TARGET_ID_FIELD = "__nx.cloudsearch.id"; private AmazonCloudSearchDomainClient awsClient; private boolean needNewAwsClient = true; private String documentEndpoint; private String accessKey; private String secretKey; public CloudSearchCommitter() { this(null); } public CloudSearchCommitter(String documentEndpoint) { super(); this.documentEndpoint = documentEndpoint; setTargetContentField(DEFAULT_COULDSEARCH_CONTENT_FIELD); super.setTargetReferenceField(TEMP_TARGET_ID_FIELD); } /** * Gets the CloudSearch document endpoint. * @return document endpoint */ public String getDocumentEndpoint() { return documentEndpoint; } /** * Sets the CloudSearch document endpoint. * @param documentEndpoint document endpoint */ public void setDocumentEndpoint(String documentEndpoint) { this.documentEndpoint = documentEndpoint; needNewAwsClient = true; } /** * Gets the CloudSearch access key. If null, the access key * will be obtained from the environment, as detailed in * {@link DefaultAWSCredentialsProviderChain}. * @return the access key */ public String getAccessKey() { return accessKey; } /** * Sets the CloudSearch access key. If null, the access key * will be obtained from the environment, as detailed in * {@link DefaultAWSCredentialsProviderChain}. * @param accessKey the access key */ public void setAccessKey(String accessKey) { this.accessKey = accessKey; needNewAwsClient = true; } /** * Gets the CloudSearch secret key. If null, the secret key * will be obtained from the environment, as detailed in * {@link DefaultAWSCredentialsProviderChain}. * @return the secret key */ public String getSecretKey() { return secretKey; } /** * Sets the CloudSearch secret key. If null, the secret key * will be obtained from the environment, as detailed in * {@link DefaultAWSCredentialsProviderChain}. * @param secretKey the secret key */ public void setSecretKey(String secretKey) { this.secretKey = secretKey; needNewAwsClient = true; } /** * This method is not supported and will throw an * {@link UnsupportedOperationException} if invoked. With CloudSearch, * the target field for a document unique id is always "id". * @param targetReferenceField the target field */ @Override public void setTargetReferenceField(String targetReferenceField) { if (StringUtils.isNotBlank(targetReferenceField)) { LOG.warn("Target reference field is always \"id\" " + "and cannot be changed."); } } @Override protected void commitBatch(List batch) { LOG.info("Sending " + batch.size() + " documents to AWS CloudSearch for addition/deletion."); List documentBatch = new ArrayList<>(); for (ICommitOperation op : batch) { if (op instanceof IAddOperation) { documentBatch.add(buildJsonDocumentAddition( ((IAddOperation) op).getMetadata())); } else if (op instanceof IDeleteOperation) { documentBatch.add(buildJsonDocumentDeletion( ((IDeleteOperation) op).getReference())); } else { throw new CommitterException("Unsupported operation:" + op); } } uploadBatchToCloudSearch(documentBatch); } private void uploadBatchToCloudSearch(List documentBatch) { // Convert the JSON list to String and read it as a stream from memory // (for increased performance), for it to be usable by the AWS // CloudSearch UploadRequest. If memory becomes a concern, consider // streaming to file. // ArrayList.toString() joins the elements in a JSON-compliant way. byte[] bytes; try { bytes = documentBatch.toString().getBytes(CharEncoding.UTF_8); } catch (UnsupportedEncodingException e) { throw new CommitterException("UTF-8 not supported by OS.", e); } try (ByteArrayInputStream is = new ByteArrayInputStream(bytes)) { UploadDocumentsRequest uploadRequest = new UploadDocumentsRequest(); uploadRequest.setContentType("application/json"); uploadRequest.setDocuments(is); uploadRequest.setContentLength((long) bytes.length); ensureAWSClient(); UploadDocumentsResult result = awsClient.uploadDocuments(uploadRequest); LOG.info(result.getAdds() + " Add requests and " + result.getDeletes() + " Delete requests " + "sent to the AWS CloudSearch domain."); } catch (IOException e) { throw new CommitterException( "Could not upload request to CloudSearch.", e); } } private synchronized void ensureAWSClient() { if (StringUtils.isBlank(getDocumentEndpoint())) { throw new CommitterException("Document endpoint is undefined."); } if (!needNewAwsClient) { return; } if (StringUtils.isAnyBlank(accessKey, secretKey)) { awsClient = new AmazonCloudSearchDomainClient( new DefaultAWSCredentialsProviderChain()); } else { awsClient = new AmazonCloudSearchDomainClient( new BasicAWSCredentials(accessKey, secretKey)); } awsClient.setEndpoint(documentEndpoint); needNewAwsClient = false; } private JSONObject buildJsonDocumentAddition(Properties fields) { if (fields.isEmpty()) { throw new CommitterException( "Attempting to commit an empty document."); } Map documentMap = new HashMap<>(); documentMap.put("type", "add"); documentMap.put("id", fields.getString(TEMP_TARGET_ID_FIELD)); fields.remove(TEMP_TARGET_ID_FIELD); Map fieldMap = new HashMap<>(); for (String key : fields.keySet()) { List values = fields.getStrings(key); if (!StringUtils.equals(key, "id")) { /*size = 1 : non-empty single-valued field size > 1 : non-empty multi-valued field size = 0 : empty field */ if (values.size() == 1) { fieldMap.put(key, values.get(0)); } else if (values.size() > 1){ fieldMap.put(key, values); } else { fieldMap.put(key, ""); } } } documentMap.put("fields", fieldMap); return new JSONObject(documentMap); } private JSONObject buildJsonDocumentDeletion(String reference) { Map documentMap = new HashMap<>(); documentMap.put("type", "delete"); documentMap.put("id", reference); return new JSONObject(documentMap); } @Override protected void saveToXML(XMLStreamWriter writer) throws XMLStreamException { writer.writeStartElement("documentEndpoint"); writer.writeCharacters(documentEndpoint); writer.writeEndElement(); writer.writeStartElement("accessKey"); writer.writeCharacters(accessKey); writer.writeEndElement(); writer.writeStartElement("secretKey"); writer.writeCharacters(secretKey); writer.writeEndElement(); } @Override protected void loadFromXml(XMLConfiguration xml) { setDocumentEndpoint(xml.getString( "documentEndpoint", getDocumentEndpoint())); setAccessKey(xml.getString("accessKey", getAccessKey())); setSecretKey(xml.getString("secretKey", getSecretKey())); } @Override public int hashCode() { return new HashCodeBuilder() .appendSuper(super.hashCode()) .append(documentEndpoint) .append(accessKey) .append(secretKey) .toHashCode(); } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (!(obj instanceof CloudSearchCommitter)) { return false; } CloudSearchCommitter other = (CloudSearchCommitter) obj; return new EqualsBuilder() .appendSuper(super.equals(obj)) .append(documentEndpoint, other.documentEndpoint) .append(accessKey, other.accessKey) .append(secretKey, other.secretKey) .isEquals(); } @Override public String toString() { return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) .appendSuper(super.toString()) .append("documentEndpoint", documentEndpoint) .append("accessKey", accessKey) .append("secretKey", secretKey) .toString(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy