All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.marklogic.contentpump.DatabaseContentWriter Maven / Gradle / Ivy

There is a newer version: 11.3.1
Show newest version
/*
 * Copyright (c) 2019 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.contentpump;

import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.marklogic.mapreduce.MarkLogicConstants;
import com.marklogic.mapreduce.MarkLogicCounter;
import com.marklogic.mapreduce.ContentType;
import com.marklogic.mapreduce.ContentWriter;
import com.marklogic.mapreduce.DocumentURI;
import com.marklogic.mapreduce.MarkLogicDocument;
import com.marklogic.mapreduce.utilities.AssignmentManager;
import com.marklogic.mapreduce.utilities.InternalUtilities;
import com.marklogic.xcc.exceptions.RequestException;
import com.marklogic.xcc.AdhocQuery;
import com.marklogic.xcc.Content;
import com.marklogic.xcc.ContentFactory;
import com.marklogic.xcc.ContentCreateOptions;
import com.marklogic.xcc.ContentPermission;
import com.marklogic.xcc.ContentSource;
import com.marklogic.xcc.Session;
import com.marklogic.xcc.ValueFactory;
import com.marklogic.xcc.types.ValueType;
import com.marklogic.xcc.types.XName;
import com.marklogic.xcc.types.XdmValue;
/**
 * MarkLogicRecordWriter that can 
 * 1) insert content from Archive to MarkLogic Server
 * 2) copy content from source MarkLogic Server to destination MarkLogic Server
 * 
 * @author ali
 * 
 */
public class DatabaseContentWriter extends
    ContentWriter implements ConfigConstants {
    public static final Log LOG = 
            LogFactory.getLog(DatabaseContentWriter.class);

    private URIMetadata[][] metadatas;
    protected XdmValue[][] propertyUris = null;
    protected XdmValue[][] propertyXmlStrings = null;
    protected int[] propertyCounts = null;

    protected boolean isCopyProps;
    protected boolean isCopyPerms;
    
    public static final String XQUERY_VERSION_1_0_ML = 
            "xquery version \"1.0-ml\";\n";

    public DatabaseContentWriter(Configuration conf,
        Map hostSourceMap, boolean fastLoad) {
        this(conf, hostSourceMap, fastLoad, null); 
    }
    
    public DatabaseContentWriter(Configuration conf,
            Map hostSourceMap, boolean fastLoad,
            AssignmentManager am) {
        super(conf, hostSourceMap, fastLoad, am);
        
        int arraySize = countBased ? 1 : forestIds.length;
        metadatas = new URIMetadata[arraySize][batchSize];
        isCopyProps = conf.getBoolean(CONF_COPY_PROPERTIES, true);
        isCopyPerms = conf.getBoolean(CONF_COPY_PERMISSIONS, true);

        if (effectiveVersion >= BATCH_MIN_VERSION && isCopyProps) {
            propertyUris = new XdmValue[arraySize][batchSize];
            propertyXmlStrings = new XdmValue[arraySize][batchSize];
            propertyCounts = new int[arraySize];
        }
    }

    /**
     * fetch the options information from conf and metadata, set to the field
     * "options"
     */
    protected static ContentCreateOptions newContentCreateOptions(
            DocumentMetadata meta, ContentCreateOptions options, 
            boolean isCopyColls, boolean isCopyQuality, boolean isCopyMeta,
            boolean isCopyPerms, long effectiveVersion) {
        ContentCreateOptions opt = (ContentCreateOptions)options.clone();
        if (meta != null) {
            if (isCopyQuality && opt.getQuality() == 0) {
                opt.setQuality(meta.quality);
            }
            if (isCopyColls) {
                if (opt.getCollections() != null) {
                    HashSet colSet = 
                            new HashSet(meta.collectionsList);
                    // union copy_collection and output_collection
                    for (String s : opt.getCollections()) {
                        colSet.add(s);
                    }
                    opt.setCollections(
                            colSet.toArray(new String[colSet.size()]));
                } else {
                    opt.setCollections(meta.getCollections());
                }
            }      
            if (isCopyPerms) {
                if (effectiveVersion < MarkLogicConstants.MIN_NODEUPDATE_VERSION &&
                        meta.isNakedProps()) {
                    boolean reset = false;
                    Vector perms = new Vector<>();
                    for (ContentPermission perm : meta.permissionsList) {
                        if (!perm.getCapability().toString().equals(
                                ContentPermission.NODE_UPDATE.toString())) {
                            perms.add(perm);
                        } else {
                            reset = true;
                        }
                    }
                    if (reset) {
                        meta.clearPermissions();
                        meta.addPermissions(perms);
                        meta.setPermString(null);
                    }
                }
                if (opt.getPermissions() != null) {
                    HashSet pSet = 
                         new HashSet(meta.permissionsList);
                    // union of output_permission & copy_permission
                    for (ContentPermission p : opt.getPermissions()) {
                        pSet.add(p);
                    }
                    opt.setPermissions(
                            pSet.toArray(new ContentPermission[pSet.size()]));
                } else {
                    opt.setPermissions(meta.getPermissions());
                }
            }
            if (isCopyMeta) {
                opt.setMetadata(meta.meta);
            }
        }       
        return opt;
    }

    @Override
    public void write(DocumentURI key, VALUE value) throws IOException,
        InterruptedException {       
        String uri = InternalUtilities.getUriWithOutputDir(key, outputDir);
        int fId = 0;
        if (fastLoad) {
            if(!countBased) {
                // placement for legacy or bucket
                fId = am.getPlacementForestIndex(key);
                sfId = fId;
            } else {
                if (sfId == -1) {
                    sfId = am.getPlacementForestIndex(key);
                }
                fId = sfId;
            }
        } 
        int sid = fId;
        
        Content content = null;
        DocumentMetadata meta = null;
        if (value instanceof DatabaseDocumentWithMeta) {
            try {
                meta = ((DatabaseDocumentWithMeta) value).getMeta();
                ContentCreateOptions opt = newContentCreateOptions(meta, options,
                    isCopyColls, isCopyQuality, isCopyMeta, isCopyPerms,
                    effectiveVersion);
                MarkLogicDocument doc = (MarkLogicDocument)value;
                if (meta == null || !meta.isNakedProps()) {
                    opt.setFormat(doc.getContentType().getDocumentFormat());
                    if (doc.getContentType() == ContentType.BINARY) {
                        content = ContentFactory.newContent(uri,
                                doc.getContentAsByteArray(), opt);
                    } else {
                        content = ContentFactory.newContent(uri,
                                doc.getContentAsText().toString(), opt);
                    }
                }
            } catch (Exception e) {
                failed++;
                LOG.warn("Failed document: " + uri);
                return;
            }
        } else {
            throw new UnsupportedOperationException(value.getClass()
                    + " is not supported.");
        }      
        if(countBased) {
            fId = 0;
        }
        pendingUris[sid].put(content, new DocumentURI(key));
        boolean inserted = false;
        if (meta == null || !meta.isNakedProps()) {
            // add new content
            forestContents[fId][counts[fId]] = content;
            // add properties
            if (propertyUris != null && meta != null && 
                    meta.getProperties() != null) {
                propertyUris[fId][propertyCounts[fId]] = 
                        ValueFactory.newValue(ValueType.XS_STRING, uri);
                propertyXmlStrings[fId][propertyCounts[fId]] = 
                        ValueFactory.newValue(ValueType.XS_STRING, 
                                meta.getProperties());
                propertyCounts[fId]++;
                counts[fId]++;
            } else {
                metadatas[fId][counts[fId]++] = new URIMetadata(uri, meta);
            }
        } else if (isCopyProps) { // naked properties
            if (sessions[sid] == null) {
                sessions[sid] = getSession(sid, false);
            }
            boolean suc = setDocumentProperties(uri, meta.getProperties(),
                    isCopyPerms?meta.getPermString():null,
                    isCopyColls?meta.getCollectionString():null,
                    isCopyQuality?meta.getQualityString():null, 
                    isCopyMeta?meta.getMeta():null, sessions[sid]);
            stmtCounts[sid]++;
            if (suc) { 
                commitUris[sid].add(key);
            } else {
                failed++;
            }
        }
        if (counts[fId] == batchSize) {
            if (sessions[sid] == null) {
                sessions[sid] = getSession(sid, false);
            }    
            insertBatch(forestContents[fId], sid);     
            stmtCounts[sid]++;
            if (isCopyProps) { //prop batch insert
                if (propertyUris != null) {
                    setBatchProperties(fId, sessions[sid]);
                    stmtCounts[sid]++;
                    propertyCounts[fId] = 0;
                } else {
                    for (int i = 0; i < counts[fId]; i++) {
                        DocumentMetadata m = metadatas[fId][i].getMeta();
                        String u = metadatas[fId][i].getUri();
                        if (m != null && m.getProperties() != null) {
                            boolean suc = setDocumentProperties(u,
                                    m.getProperties(), null, null, null, null,
                                    sessions[sid]);
                            stmtCounts[sid]++;
                            if (suc) {
                                commitUris[sid].add(key);
                            }
                        }
                    }
                }
            }
            //reset forest index for statistical
            if (countBased) {
                sfId = -1;
            }
            counts[fId] = 0;
            inserted = true;
        }
        boolean committed = false;
        if (stmtCounts[sid] == txnSize && needCommit) {
            commit(sid);
            stmtCounts[sid] = 0;
            committed = true;
        }
        if ((!fastLoad) && ((inserted && (!needCommit)) || committed)) {
            // rotate to next host and reset session
            hostId = (hostId + 1)%forestIds.length;
            sessions[0] = null;
        }
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException,
        InterruptedException {
        if (batchSize > 1) {
            int len, sid;
            if (countBased) {
                len = 1;
                sid = sfId;
            } else {
                len = fastLoad ? forestIds.length : 1;
                sid = 0;
            }
            for (int i = 0; i < len; i++, sid++) {
                if (pendingUris[sid].size() > 0) {
                    Content[] remainder = new Content[counts[i]];
                    System.arraycopy(forestContents[i], 0, remainder, 0,
                            counts[i]);
                    if (sessions[sid] == null) {
                        sessions[sid] = getSession(i, false);
                    }
                    try {
                        insertBatch(remainder, sid);
                    } catch (Exception e) {
                    }
                    stmtCounts[sid]++;
                    if (!isCopyProps) {
                        continue;
                    }
                    if (propertyUris != null && propertyCounts[i] > 0) {
                        setBatchProperties(i, sessions[sid]);
                        stmtCounts[sid]++;
                    } else if (propertyUris == null) { 
                        // non-batch insert props
                        for (int j = 0; j < counts[i]; j++) {
                            DocumentMetadata m = metadatas[i][j].getMeta();
                            String u = metadatas[i][j].getUri();
                            if (m != null && m.getProperties() != null) {
                                setDocumentProperties(u, m.getProperties(),
                                        null, null, null, null, sessions[sid]);
                                stmtCounts[sid]++;
                            }
                        }
                    }
                }
            }
        }
        for (int i = 0; i < sessions.length; i++) {
            if (sessions[i] != null) {
                if (stmtCounts[i] > 0 && needCommit) {
                    try {
                        sessions[i].commit();
                        succeeded += commitUris[i].size();
                    } catch (RequestException e) {
                        // log error and continue on RequestServerException
                        LOG.error("Error commiting transaction", e);
                        failed += commitUris[i].size();
                        for (DocumentURI failedUri : commitUris[i]) {
                            LOG.warn("Failed document " + failedUri);
                        }
                        commitUris[i].clear();
                    } finally {
                        sessions[i].close();
                    }
                } else {
                    sessions[i].close();
                }
            }
        }
        context.getCounter(
                MarkLogicCounter.OUTPUT_RECORDS_COMMITTED).increment(succeeded);
        context.getCounter(
                MarkLogicCounter.OUTPUT_RECORDS_FAILED).increment(failed);
    }

    protected boolean setBatchProperties(int i, Session s) {
        if (propertyCounts[i] == 0) return true;
        
        String query = XQUERY_VERSION_1_0_ML
                + "declare variable $URI as xs:string* external;\n" +
                "declare variable $XML-STRING as xs:string* external;\n" +
                "for $docuri in $URI\n" +
                "return(       \n" +
                "xdmp:document-set-properties($docuri, xdmp:unquote(fn:head($XML-STRING))/prop:properties/node() ),\n" +
                "    xdmp:set($XML-STRING, fn:tail($XML-STRING))\n" +
                "    ); ";
        if (LOG.isDebugEnabled()) {
            LOG.debug(query);
        }
        AdhocQuery req = s.newAdhocQuery(query);
        
        XdmValue[] uriArray = propertyUris[i];
        XdmValue[] xmlStringArray = propertyXmlStrings[i]; 
        if (propertyCounts[i] < batchSize) {
            uriArray = new XdmValue[propertyCounts[i]];
            xmlStringArray = new XdmValue[propertyCounts[i]];
            System.arraycopy(propertyUris[i], 0, 
                    uriArray, 0, propertyCounts[i]);
            System.arraycopy(propertyXmlStrings[i], 0, 
                    xmlStringArray, 0, propertyCounts[i]);
        }

        req.setNewVariables(new XName("URI"), uriArray);
        req.setNewVariables(new XName("XML-STRING"), xmlStringArray);

        try {
            s.submitRequest(req);
            return true;
        } catch (RequestException ex) {
            for (XdmValue failedUri : uriArray) {
                LOG.error("Error batch setting document properties for: " + 
                        failedUri.asString() );
            }
            LOG.error( ex.getMessage());
            return false;
        }
    }

    /**
     *
     * @param uri uri of the document whose property is to be set
     * @param xmlString property in xml string
     * @param permString
     * @param collString
     * @param quality
     * @param meta
     * @param s
     * @return
     */
    protected static boolean setDocumentProperties(String uri,
                                                   String xmlString, String permString, String collString, String quality,
                                                   Map meta, Session s) {
        String query = XQUERY_VERSION_1_0_ML
                + "declare variable $URI as xs:string external;\n"
                + "declare variable $XML-STRING as xs:string external;\n"
                + "declare variable $PERM-STRING as xs:string external;\n"
                + "declare variable $COLL-STRING as xs:string external;\n"
                + "declare variable $QUALITY-STRING as xs:string external;\n"
                + (meta == null ?
                "" : "declare variable $META as map:map external;\n")
                + "xdmp:document-set-properties($URI,\n"
                + "  xdmp:unquote($XML-STRING)/prop:properties/node() )\n"
                + ", if('' eq ($PERM-STRING)) then () else \n"
                + "xdmp:document-set-permissions($URI, \n"
                + "xdmp:unquote($PERM-STRING)/node()/sec:permission)\n"
                + ", if('' eq ($COLL-STRING)) then () else \n"
                + "let $f := fn:function-lookup(xs:QName('xdmp:from-json-string'), 1)\n"
                + "return if (fn:exists($f)) then \n"
                + "xdmp:document-set-collections($URI,json:array-values($f($COLL-STRING)))\n"
                + "else xdmp:document-set-collections($URI,json:array-values(xdmp:from-json($COLL-STRING)))\n"
                + ", if('' eq ($QUALITY-STRING)) then () else xdmp:document-set-quality($URI,xs:integer($QUALITY-STRING))\n"
                + (meta == null ?
                "" : ", (let $f := fn:function-lookup(xs:QName('xdmp:document-set-metadata'),2)\n"
                + "return if (exists($f)) then $f($URI,$META) else ())\n");
        if (LOG.isDebugEnabled()) {
            LOG.debug(query);
        }
        AdhocQuery req = s.newAdhocQuery(query);
        req.setNewStringVariable("URI", uri);
        req.setNewStringVariable("XML-STRING", xmlString);
        req.setNewStringVariable("PERM-STRING",
                permString==null?"":permString);
        req.setNewStringVariable("COLL-STRING",
                collString==null||collString.isEmpty()?"":collString);
        req.setNewStringVariable("QUALITY-STRING",
                quality==null?"":quality);
        if (meta != null) {
            ObjectMapper mapper = new ObjectMapper();
            ObjectNode node = mapper.createObjectNode();
            for (Map.Entry entry : meta.entrySet()) {
                node.put(entry.getKey(), entry.getValue());
            }
            req.setNewVariable("META", ValueType.JS_OBJECT, node);
        }

        try {
            s.submitRequest(req);
            return true;
        } catch (RequestException ex) {
            LOG.error("Error setting document properties for " + uri + ": " +
                    ex.getMessage());
            return false;
        }
    }
}

class URIMetadata {
    String uri;
    DocumentMetadata meta;

    public URIMetadata(String uri, DocumentMetadata meta) {
        super();
        this.uri = uri;
        this.meta = meta;
    }

    public String getUri() {
        return uri;
    }

    public DocumentMetadata getMeta() {
        return meta;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy