
/*
 * Copyright 2003-2019 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.mapreduce;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipInputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import com.marklogic.mapreduce.utilities.AssignmentManager;
import com.marklogic.mapreduce.utilities.AssignmentPolicy;
import com.marklogic.mapreduce.utilities.ForestHost;
import com.marklogic.mapreduce.utilities.InternalUtilities;
import com.marklogic.mapreduce.utilities.StatisticalAssignmentPolicy;
import com.marklogic.xcc.Content;
import com.marklogic.xcc.ContentCapability;
import com.marklogic.xcc.ContentCreateOptions;
import com.marklogic.xcc.ContentFactory;
import com.marklogic.xcc.ContentPermission;
import com.marklogic.xcc.ContentSource;
import com.marklogic.xcc.DocumentFormat;
import com.marklogic.xcc.DocumentRepairLevel;
import com.marklogic.xcc.RequestOptions;
import com.marklogic.xcc.Session;
import com.marklogic.xcc.impl.SessionImpl;
import com.marklogic.xcc.Session.TransactionMode;
import com.marklogic.xcc.exceptions.ContentInsertException;
import com.marklogic.xcc.exceptions.QueryException;
import com.marklogic.xcc.exceptions.RequestException;
import com.marklogic.xcc.exceptions.RequestServerException;
import com.marklogic.xcc.exceptions.RetryableQueryException;

/**
 * MarkLogicRecordWriter that inserts content into MarkLogic Server.
 * 
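 * <p>A hypothetical job-setup sketch (host, credentials, and directory are
 * placeholder values) showing how this writer is normally obtained, via
 * {@link ContentOutputFormat}:</p>
 * <pre>{@code
 * Configuration conf = new Configuration();
 * conf.set(MarkLogicConstants.OUTPUT_HOST, "localhost");
 * conf.set(MarkLogicConstants.OUTPUT_USERNAME, "user");
 * conf.set(MarkLogicConstants.OUTPUT_PASSWORD, "password");
 * conf.set(MarkLogicConstants.OUTPUT_DIRECTORY, "/loaded/");
 * Job job = Job.getInstance(conf);
 * job.setOutputFormatClass(ContentOutputFormat.class);
 * job.setOutputKeyClass(DocumentURI.class);
 * job.setOutputValueClass(Text.class);
 * }</pre>
 *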
 * @author jchen
 *
 */
public class ContentWriter<VALUEOUT> 
extends MarkLogicRecordWriter<DocumentURI, VALUEOUT> 
implements MarkLogicConstants {
    public static final Log LOG = LogFactory.getLog(ContentWriter.class);
    
    public static final String ID_PREFIX = "#";

    /**
     * Directory of the output documents.
     */
    protected String outputDir;
    
    /**
     * Content options of the output documents.
     */
    protected ContentCreateOptions options;
    
    /**
     * A map from a host to a ContentSource. 
     */
    protected Map<String, ContentSource> hostSourceMap;
    
    /**
     * Content lists for each forest.
     */
    protected Content[][] forestContents;
    
    /**
     * An array of forest/host ids
     */
    protected String[] forestIds;

    /**
     * Current replica index for each forest.
     */
    protected int[] curReplica;

    /**
     * Blacklist flags for hosts.
     */
    protected boolean[] blacklist;
    
    /** 
     * Counts of documents per forest.
     */
    protected int[] counts;
    
    /**
     * URIs of the documents in a batch.
     */
    protected HashMap<Content, DocumentURI>[] pendingUris;
    
    /**
     * URIs of the documents to be committed.
     */
    protected List<DocumentURI>[] commitUris;
    
    /**
     * Whether in fast load mode.
     */
    protected boolean fastLoad;
    
    /**
     * Batch size.
     */
    protected int batchSize;
    
    /**
     * Counts of requests per forest.
     */
    protected int[] stmtCounts;
    
    /**
     * Sessions per forest.
     */
    protected Session[] sessions;
    
    private boolean formatNeeded;
    
    private FileSystem fs;
    
    protected InputStream is;
    
    private boolean streaming;
    
    private RequestOptions requestOptions;

    protected AssignmentManager am;
    
    // forest index cached for the statistical policy
    protected int sfId;
    
    protected boolean countBased;
    
    /** role-id -> role-name mapping **/
    protected LinkedMapWritable roleMap;
    
    protected HashMap<String, ContentPermission[]> permsMap;
    
    protected int succeeded = 0;
    
    protected int failed = 0;
    
    protected boolean needCommit = false;
    
    protected int hostId = 0;
    
    protected boolean isCopyColls;
    protected boolean isCopyQuality;
    protected boolean isCopyMeta;
    
    protected long effectiveVersion;
    
    protected boolean isTxnCompatible = false;

    /**
     * Retry count for failover.
     */
    protected int retry;

    protected final int maxRetries = 15;

    /**
     * Sleep time in ms between retries; doubled after each attempt.
     */
    protected int sleepTime;

    protected final int maxSleepTime = 30000;
    
    public ContentWriter(Configuration conf, 
        Map<String, ContentSource> hostSourceMap, boolean fastLoad) {
        this(conf, hostSourceMap, fastLoad, null);
    }
    
    public ContentWriter(Configuration conf,
        Map<String, ContentSource> hostSourceMap, boolean fastLoad,
        AssignmentManager am) {
        super(conf, null);
   
        // am is assumed non-null here (callers normally pass the
        // AssignmentManager singleton)
        effectiveVersion = am.getEffectiveVersion();
        isTxnCompatible = effectiveVersion == 0;
        
        this.fastLoad = fastLoad;
        
        this.hostSourceMap = hostSourceMap;
        
        this.am = am;

        requestOptions = new RequestOptions();
        requestOptions.setMaxAutoRetry(0);
        
        permsMap = new HashMap<>();
        
        // key order in key set is guaranteed by LinkedHashMap,
        // i.e., the order keys are inserted

        int srcMapSize;

        if (fastLoad) {
            forestIds = am.getMasterIds().clone();
            srcMapSize = forestIds.length;
            curReplica = new int[srcMapSize];
            for (int i = 0; i < srcMapSize; i++) {
               curReplica[i] = 0;
            }
        } else {
            srcMapSize = hostSourceMap.size();
            forestIds = new String[srcMapSize];
            forestIds = hostSourceMap.keySet().toArray(forestIds);
            blacklist = new boolean[srcMapSize];
            for (int i = 0; i < srcMapSize; i++) {
               blacklist[i] = false;
            }
        }

        // start at a random host so concurrent writer tasks spread load
        hostId = (int)(Math.random() * srcMapSize);
        
        // arraySize is the number of forests in fast load mode; 1 otherwise.
        int arraySize = fastLoad ? srcMapSize : 1;
        sessions = new Session[arraySize];
        stmtCounts = new int[arraySize];
        
        outputDir = conf.get(OUTPUT_DIRECTORY);
        batchSize = conf.getInt(BATCH_SIZE, DEFAULT_BATCH_SIZE);
        
        pendingUris = new HashMap[arraySize];
        for (int i = 0; i < arraySize; i++) {
            pendingUris[i] = new HashMap<>();
        }

        if (fastLoad
            && (am.getPolicy().getPolicyKind() == AssignmentPolicy.Kind.STATISTICAL
            || am.getPolicy().getPolicyKind() == AssignmentPolicy.Kind.RANGE
            || am.getPolicy().getPolicyKind() == AssignmentPolicy.Kind.QUERY)) {
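            // statistical/range/query policies choose the target forest
            // per batch, so a single shared buffer is used (count-based)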
            countBased = true;
            forestContents = new Content[1][batchSize];
            counts = new int[1];
            sfId = -1;
        } else {
            forestContents = new Content[arraySize][batchSize];
            counts = new int[arraySize];                
            sfId = 0;
        }
        
        String[] perms = conf.getStrings(OUTPUT_PERMISSION);
        List<ContentPermission> permissions = null;
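        // OUTPUT_PERMISSION is parsed as (role-name, capability) pairs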
        if (perms != null && perms.length > 0) {
            int i = 0;
            while (i + 1 < perms.length) {
                String roleName = perms[i++];
                if (roleName == null || roleName.isEmpty()) {
                    LOG.error("Illegal role name: " + roleName);
                    continue;
                }
                String perm = perms[i].trim();
                ContentCapability capability = null;
                if (perm.equalsIgnoreCase(ContentCapability.READ.toString())) {
                    capability = ContentCapability.READ;
                } else if (perm.equalsIgnoreCase(ContentCapability.EXECUTE.toString())) {
                    capability = ContentCapability.EXECUTE;
                } else if (perm.equalsIgnoreCase(ContentCapability.INSERT.toString())) {
                    capability = ContentCapability.INSERT;
                } else if (perm.equalsIgnoreCase(ContentCapability.UPDATE.toString())) {
                    capability = ContentCapability.UPDATE;
                } else if (perm.equalsIgnoreCase(ContentCapability.NODE_UPDATE.toString())) {
                    capability = ContentCapability.NODE_UPDATE;
                } else {
                    LOG.error("Illegal permission: " + perm);
                }
                if (capability != null) {
                    if (permissions == null) {
                        permissions = new ArrayList<>();
                    }
                    permissions.add(new ContentPermission(capability, roleName));
                }
                i++;
            }
        }
        
        options = new ContentCreateOptions();
        String[] collections = conf.getStrings(OUTPUT_COLLECTION);
        if (collections != null) {
            for (int i = 0; i < collections.length; i++) {
                collections[i] = collections[i].trim();
            }
            options.setCollections(collections);
        }
        
        options.setQuality(conf.getInt(OUTPUT_QUALITY, 0));
        if (permissions != null) {
            options.setPermissions(permissions.toArray(
                    new ContentPermission[permissions.size()]));
        } 
        String contentTypeStr = conf.get(CONTENT_TYPE, DEFAULT_CONTENT_TYPE);
        ContentType contentType = ContentType.valueOf(contentTypeStr);
        if (contentType == ContentType.UNKNOWN) {
            formatNeeded = true;
        } else {
            options.setFormat(contentType.getDocumentFormat());
        }
        
        options.setLanguage(conf.get(OUTPUT_CONTENT_LANGUAGE));
        String repairLevel = conf.get(OUTPUT_XML_REPAIR_LEVEL,
                DEFAULT_OUTPUT_XML_REPAIR_LEVEL).toLowerCase();
        options.setNamespace(conf.get(OUTPUT_CONTENT_NAMESPACE));
        if (DocumentRepairLevel.DEFAULT.toString().equals(repairLevel)){
            options.setRepairLevel(DocumentRepairLevel.DEFAULT);
        }
        else if (DocumentRepairLevel.NONE.toString().equals(repairLevel)){
            options.setRepairLevel(DocumentRepairLevel.NONE);
        }
        else if (DocumentRepairLevel.FULL.toString().equals(repairLevel)){
            options.setRepairLevel(DocumentRepairLevel.FULL);
        }
        
        streaming = conf.getBoolean(OUTPUT_STREAMING, false);
        
        String encoding = conf.get(OUTPUT_CONTENT_ENCODING);
        if (encoding != null) {
            options.setEncoding(encoding);
        }
        
        options.setTemporalCollection(conf.get(TEMPORAL_COLLECTION));
        
        needCommit = needCommit();
        if (needCommit) {
            commitUris = new ArrayList[arraySize];
            for (int i = 0; i < arraySize; i++) {
                commitUris[i] = new ArrayList<>(txnSize * batchSize);
            }
        }
        
        isCopyColls = conf.getBoolean(COPY_COLLECTIONS, true);
        isCopyQuality = conf.getBoolean(COPY_QUALITY, true);
        isCopyMeta = conf.getBoolean(COPY_METADATA, true);
    }
    
    protected boolean needCommit() {
        return true;
    }
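
    /**
     * Create XCC Content from a key/value pair. Handles Text,
     * MarkLogicNode, ForestDocument, BytesWritable, CustomContent,
     * DatabaseDocument and StreamLocator values; returns null when the
     * content cannot be created (e.g. an unsupported compression codec).
     */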

    protected Content createContent(DocumentURI key, VALUEOUT value) 
            throws IOException {
        String uri = key.getUri();
        Content content = null;
        if (value instanceof Text) {
            if (formatNeeded) {
                options.setFormat(DocumentFormat.TEXT);
                formatNeeded = false;
            }
            options.setEncoding(DEFAULT_OUTPUT_CONTENT_ENCODING);
            content = ContentFactory.newContent(uri, 
                    ((Text) value).getBytes(), 0, 
                    ((Text)value).getLength(), options);
        } else if (value instanceof MarkLogicNode) {
            if (formatNeeded) {
                options.setFormat(DocumentFormat.XML);
                formatNeeded = false;
            }
            content = ContentFactory.newContent(uri, 
                    ((MarkLogicNode)value).get(), options);
        } else if (value instanceof ForestDocument) {
            ContentCreateOptions newOptions = options;
            if (isCopyColls || isCopyMeta || isCopyQuality) {
                newOptions = (ContentCreateOptions)options.clone();
            }
            content = ((ForestDocument)value).createContent(uri, newOptions,
                    isCopyColls, isCopyMeta, isCopyQuality);
        } else if (value instanceof BytesWritable) {
            if (formatNeeded) {
                options.setFormat(DocumentFormat.BINARY);
                formatNeeded = false;
            }
            content = ContentFactory.newContent(uri, 
                    ((BytesWritable) value).getBytes(), 0, 
                    ((BytesWritable) value).getLength(), options);               
        } else if (value instanceof CustomContent) { 
            ContentCreateOptions newOptions = 
                    (ContentCreateOptions)options.clone();
            content = ((CustomContent) value).getContent(conf, newOptions, 
                    uri);
        } else if (value instanceof DatabaseDocument) {
            DatabaseDocument doc = (DatabaseDocument)value;
            if (formatNeeded) {
                options.setFormat(doc.getContentType().getDocumentFormat());
                formatNeeded = false;
            }
            options.setEncoding(DEFAULT_OUTPUT_CONTENT_ENCODING);
            if (doc.getContentType() == ContentType.BINARY) {
                content = ContentFactory.newContent(uri, 
                        doc.getContentAsByteArray(), options);
            } else {
                content = ContentFactory.newContent(uri, 
                        doc.getContentAsText().getBytes(), options);
            }
        } else if (value instanceof StreamLocator) {
            Path path = ((StreamLocator)value).getPath();
            if (fs == null) {         
                URI fileUri = path.toUri();
                fs = FileSystem.get(fileUri, conf);
            }
            switch (((StreamLocator)value).getCodec()) {
                case GZIP:
                    InputStream fileIn = fs.open(path);
                    is = new GZIPInputStream(fileIn);
                    break;
                case ZIP:
                    if (is == null) {
                        InputStream zipfileIn = fs.open(path);
                        ZipInputStream zis = new ZipInputStream(zipfileIn);
                        is = new ZipEntryInputStream(zis, path.toString());
                    }
                    break;
                case NONE:
                    is = fs.open(path);
                    break;
                default:
                    LOG.error("Unsupported compression codec: " + 
                        ((StreamLocator)value).getCodec() + " for document " +
                        key);
                    return content;
            }
            if (streaming) {
                content = ContentFactory.newUnBufferedContent(uri, is, 
                        options);
            } else {
                content = ContentFactory.newContent(uri, is, options);
            }

        } else {
            throw new UnsupportedOperationException(value.getClass()
                    + " is not supported.");
        }
        return content;
    }

    /**
     * Insert batch, log errors and update stats.
     * 
     * @param batch batch of content to insert
     * @param id forest Id
     */
    protected void insertBatch(Content[] batch, int id) 
    throws IOException {
        retry = 0;
        sleepTime = 500;
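        // exponential backoff: start at 500 ms, double after each failed
        // attempt, cap at maxSleepTime; give up after maxRetries attempts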
        while (retry < maxRetries) {
            try {
                if (retry == 1) { 
                    LOG.info("Retrying document insert");
                }
                List<RequestException> errors = 
                    sessions[id].insertContentCollectErrors(batch);
                if (errors != null) {
                    for (RequestException ex : errors) {
                        Throwable cause = ex.getCause();
                        if (cause != null) {
                            if (cause instanceof QueryException) {
                                LOG.error(
                                    ((QueryException)cause).getFormatString());
                            } else {
                                LOG.error(cause.getMessage());
                            }
                        }
                        if (ex instanceof ContentInsertException) {
                            Content content = 
                                    ((ContentInsertException)ex).getContent();
                            DocumentURI failedUri = 
                                    pendingUris[id].remove(content);
                            failed++;
                            if (failedUri != null) {
                                LOG.warn("Failed document " + failedUri);
                            }
                        }
                    }
                }
                if (retry > 0) { 
                    LOG.debug("Retry successful");
                }
            } catch (Exception e) {
                boolean retryable = true;
                if (e instanceof QueryException) {
                    LOG.error("QueryException:" + ((QueryException)e).getFormatString());
                    retryable = ((QueryException)e).isRetryable();
                } else if (e instanceof RequestServerException) {
                    // server-side error: leave retryable=true so the next
                    // attempt can fail over to a replica
                    LOG.error("RequestServerException:" + e.getMessage());
                } else {
                    LOG.error("Exception:" + e.getMessage());
                }

                // necessary to roll back in certain scenarios.
                if (needCommit) {
                    rollback(id);
                }

                if (retryable && ++retry < maxRetries) {
                    // necessary to close the session too.
                    sessions[id].close();
                    try {
                        InternalUtilities.sleep(sleepTime);
                    } catch (Exception e2) {
                        // interrupted sleep; proceed with the retry
                    }
                    sleepTime = sleepTime * 2;
                    if (sleepTime > maxSleepTime) 
                        sleepTime = maxSleepTime;
                    
                    // We need to switch host even for RetryableQueryException
                    // because it could be "sync replicating" (bug:45785)
                    sessions[id] = getSession(id, true);
                    continue;
                } else if (retryable) {
                    LOG.info("Exceeded max retry");
                }
                failed += batch.length;
                // remove the failed content from pendingUris
                for (Content fc : batch) {
                    DocumentURI failedUri = pendingUris[id].remove(fc);
                    LOG.warn("Failed document " + failedUri);
                }
                throw new IOException(e);
            } 
            break;
        }

        if (needCommit) {
            // move uris from pending to commit
            for (DocumentURI uri : pendingUris[id].values()) {
                commitUris[id].add(uri);
            }
        } else {
            succeeded += pendingUris[id].size();
        }
        pendingUris[id].clear();
    }

    protected void rollback(int id) throws IOException {
        try {
            sessions[id].rollback();
        } catch (Exception e) {
            LOG.error("Error rolling back transaction " + e.getMessage());
            if (LOG.isDebugEnabled()) {
                LOG.debug(e);
            }
        } finally {
            if (countBased) {
                rollbackCount(id);
            }
            failed += commitUris[id].size();
            for (DocumentURI failedUri : commitUris[id]) {
                LOG.warn("Failed document: " + failedUri);
            }
            commitUris[id].clear();
        }
    }

    protected void commit(int id) throws IOException {
        try {
            sessions[id].commit();
            succeeded += commitUris[id].size();
            commitUris[id].clear();
        } catch (Exception e) {
            LOG.error("Error commiting transaction " + e.getMessage());
            rollback(id);
            sessions[id].close();
            sessions[id] = null;
        }
    }
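
    /**
     * Buffer the document in its target forest's batch; flush the batch
     * when it reaches batchSize, and commit once txnSize batch-insert
     * requests have completed in the current transaction.
     */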
    
    @Override
    public void write(DocumentURI key, VALUEOUT value) 
    throws IOException, InterruptedException {
        InternalUtilities.getUriWithOutputDir(key, outputDir);
        int fId = 0;
        if (fastLoad) {
            if(!countBased) {
                // placement for legacy or bucket
                fId = am.getPlacementForestIndex(key);
                sfId = fId;
            } else {
                if (sfId == -1) {
                    sfId = am.getPlacementForestIndex(key);
                }
                fId = sfId;
            }
        } 
        int sid = fId;
        Content content = createContent(key,value); 
        if (content == null) {
            failed++;
            return;
        }
        if (countBased) {
            fId = 0;
        }
        pendingUris[sid].put(content, (DocumentURI)key.clone());
        boolean inserted = false;
        forestContents[fId][counts[fId]++] = content;
        if (counts[fId] == batchSize) {
            if (sessions[sid] == null) {
                sessions[sid] = getSession(sid, false);
            }  
            insertBatch(forestContents[fId], sid);
            stmtCounts[sid]++;
                
            //reset forest index for statistical
            if (countBased) {
                sfId = -1;
            }
            counts[fId] = 0;
            inserted = true;
        }
        boolean committed = false;
        if (needCommit && stmtCounts[sid] == txnSize) {
            commit(sid);  
            stmtCounts[sid] = 0;         
            committed = true;
        }
        if ((!fastLoad) && ((inserted && (!needCommit)) || committed)) { 
            // rotate to next host and reset session
            int oldHostId = hostId;
            for (;;) {
                hostId = (hostId + 1)%forestIds.length;
                if (!blacklist[hostId]) {
                  break;
                }
                if (hostId == oldHostId) {
                    for (int i = 0; i < blacklist.length; i++) {
                        blacklist[i] = false;
                    }
                }
            }
            sessions[0] = null;
        }
    }

    protected void rollbackCount(int fId) {
        ((StatisticalAssignmentPolicy)am.getPolicy()).updateStats(fId, 
                -batchSize);
    }
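
    /**
     * Open a session for the given forest index. In fast-load mode the
     * session is bound to a specific forest, advancing to the next
     * replica on failover; otherwise it connects to the next
     * non-blacklisted host.
     */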
    
    protected Session getSession(int fId, boolean nextReplica, TransactionMode mode) {
        Session session = null;
        if (fastLoad) {
            List<ForestHost> replicas = am.getReplicas(forestIds[fId]);

            if (nextReplica) {
                curReplica[fId] = (curReplica[fId] + 1)%replicas.size();
            }

            // a "#<forest-id>" content base (ID_PREFIX) pins the session
            // to a specific forest
            ContentSource cs = hostSourceMap.get(
                    replicas.get(curReplica[fId]).getHostName());
            String forestId = replicas.get(curReplica[fId]).getForest();
            session = cs.newSession(ID_PREFIX + forestId);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Connect to forest " + forestId + " on "
                    + session.getConnectionUri().getHost());
            }
        } else {
            if (nextReplica) {
                blacklist[hostId] = true;
                int oldHostId = hostId;
                for (;;) {
                    hostId = (hostId + 1)%forestIds.length;
                    if (!blacklist[hostId])
                        break;
                    if (hostId == oldHostId) {
                        for (int i = 0; i < blacklist.length; i++) {
                            blacklist[i] = false;
                        }
                    }
                }
            }
            // in non-fastload mode, forestIds holds host names
            String hostName = forestIds[hostId];
            ContentSource cs = hostSourceMap.get(hostName);
            session = cs.newSession();
            if (LOG.isDebugEnabled()) {
                LOG.debug("Connect to " + session.getConnectionUri().getHost());
            }
        }      
        session.setTransactionMode(mode);
        session.setDefaultRequestOptions(requestOptions);
        ((SessionImpl)session).setCompatibleMode(isTxnCompatible);
        return session;
    }
    
    protected Session getSession(int fId, boolean nextReplica) {
        TransactionMode mode = TransactionMode.AUTO;
        if (needCommit) {
            mode = TransactionMode.UPDATE;
        }
        return getSession(fId, nextReplica, mode);
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException,
    InterruptedException {
        if (batchSize > 1) {
            int len, sid;
            if (countBased) {
                len = 1;
                sid = sfId;
            } else {
                len = fastLoad ? forestIds.length : 1;
                sid = 0;
            }
            for (int i = 0; i < len; i++, sid++) {             
                if (sid != -1 && pendingUris[sid].size() > 0) {
                    Content[] remainder = new Content[counts[i]];
                    System.arraycopy(forestContents[i], 0, remainder, 0, 
                            counts[i]);
                    if (sessions[sid] == null) {
                        sessions[sid] = getSession(sid, false);
                    }
                    try {
                        insertBatch(remainder, sid);
                    } catch (Throwable e) {
                        LOG.error("Error caught inserting documents: ", e);
                    }
                    stmtCounts[sid]++;
                }
            }
        }
        for (int i = 0; i < sessions.length; i++) {
            if (sessions[i] != null) {
                if (stmtCounts[i] > 0 && needCommit) {
                    try {
                        commit(i);
                        if (sessions[i] != null) {
                            sessions[i].close();
                        }
                    } catch (Throwable e) {
                        LOG.error("Error committing transaction: ", e);
                    }
                } else {
                    sessions[i].close();
                }
            }
        }
        if (is != null) {
            is.close();
            if (is instanceof ZipEntryInputStream) {
                ((ZipEntryInputStream)is).closeZipInputStream();
            }
        }
        context.getCounter(MarkLogicCounter.OUTPUT_RECORDS_COMMITTED)
                .increment(succeeded);
        context.getCounter(MarkLogicCounter.OUTPUT_RECORDS_FAILED)
                .increment(failed);
    }
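
    /**
     * Default sizing illustration: with TXN_SIZE unset and a batch size
     * of 100, the transaction size is 1000 / 100 = 10 requests, i.e.
     * roughly 1,000 documents per commit.
     */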
    
    @Override
    public int getTransactionSize(Configuration conf) {
        // return the specified txn size
        if (conf.get(TXN_SIZE) != null) {
            int txnSize = conf.getInt(TXN_SIZE, 0);
            return txnSize <= 0 ? 1 : txnSize;
        } 
        // default: aim for roughly 1000 documents per transaction; guard
        // against a batch size over 1000 yielding a zero transaction size
        return Math.max(1,
                1000 / conf.getInt(BATCH_SIZE, DEFAULT_BATCH_SIZE));
    }
    
}