All downloads are free. Search and download functionality uses the official Maven repository.

com.mongodb.spark.pickle.BSONPickler Maven / Gradle / Ivy

package com.mongodb.spark.pickle;

import com.mongodb.DBRef;
import net.razorvine.pickle.IObjectPickler;
import net.razorvine.pickle.Opcodes;
import net.razorvine.pickle.PickleException;
import net.razorvine.pickle.Pickler;
import org.bson.types.BSONTimestamp;
import org.bson.types.Binary;
import org.bson.types.Code;
import org.bson.types.CodeWScope;
import org.bson.types.CodeWithScope;
import org.bson.types.MaxKey;
import org.bson.types.MinKey;
import org.bson.types.ObjectId;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.regex.Pattern;

/**
 * Implementation of {@link net.razorvine.pickle.IObjectPickler} that pickles
 * BSON types only, so that they can be correctly unpickled into PyMongo
 * objects.
 *
 * Every supported type is written as an explicit sequence of pickle opcodes:
 * a GLOBAL naming the PyMongo class, the constructor arguments, NEWOBJ, and
 * then the object's state followed by BUILD.
 *
 * For documentation on Python's pickle protocol, see the following:
 * - https://docs.python.org/2/library/pickle.html
 * - http://svn.python.org/projects/python/trunk/Lib/pickletools.py
 */
public class BSONPickler implements IObjectPickler {

    /**
     * Charset used for every string written directly to the stream. It is
     * pinned explicitly so the emitted bytes never depend on the platform
     * default encoding of the JVM.
     */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    // Numeric values of the flag constants in Python's "re" module.
    private static final int PY_RE_IGNORECASE = 2;
    private static final int PY_RE_MULTILINE = 8;
    private static final int PY_RE_DOTALL = 16;
    private static final int PY_RE_UNICODE = 32;
    private static final int PY_RE_VERBOSE = 64;

    // 0x100 == Pattern.UNICODE_CHARACTER_CLASS in Java >= 7. The literal is
    // used so that this class does not require that constant to exist.
    private static final int JAVA_UNICODE_CHARACTER_CLASS = 0x100;

    /**
     * Write a byte string, choosing the shortest opcode that can hold it.
     *
     * @param bytes the raw bytes of the string
     * @param out the OutputStream to which to write
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    private void putBinstring(final byte[] bytes, final OutputStream out)
      throws IOException {
        final int length = bytes.length;
        if (length <= 0xff) {
            // Length fits in a single byte.
            out.write(Opcodes.SHORT_BINSTRING);
            out.write(length);
        } else {
            // Length is written as four bytes, least significant first.
            out.write(Opcodes.BINSTRING);
            out.write(length & 0xff);
            out.write((length >>> 8) & 0xff);
            out.write((length >>> 16) & 0xff);
            out.write((length >>> 24) & 0xff);
        }
        out.write(bytes);
    }

    /**
     * Write a String as a byte string, encoded as UTF-8.
     *
     * @param string the string to write
     * @param out the OutputStream to which to write
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    private void putBinstring(final String string, final OutputStream out)
      throws IOException {
        putBinstring(string.getBytes(UTF_8), out);
    }

    /**
     * Write a GLOBAL opcode that refers to a Python class by module and name.
     *
     * @param module the Python module containing the class
     * @param name the name of the class within that module
     * @param out the OutputStream to which to write
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    private void putGlobal(final String module, final String name,
                           final OutputStream out)
      throws IOException {
        out.write(Opcodes.GLOBAL);
        // The argument of GLOBAL is two newline-terminated strings.
        out.write((module + "\n" + name + "\n").getBytes(UTF_8));
    }

    /**
     * Write an object that is created with no constructor arguments and is
     * given an empty dict as its state.
     *
     * @param module the Python module containing the class
     * @param name the name of the class within that module
     * @param out the OutputStream to which to write
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    private void putStatelessObject(final String module, final String name,
                                    final OutputStream out)
      throws IOException {
        putGlobal(module, name, out);
        out.write(Opcodes.EMPTY_TUPLE);
        out.write(Opcodes.NEWOBJ);
        out.write(Opcodes.EMPTY_DICT);
        out.write(Opcodes.BUILD);
    }

    /**
     * Translate flags from java.util.regex.Pattern into their respective
     * values in Python's "re" library.
     *
     * @param javaFlags flags from a java.util.regex.Pattern
     * @return equivalent flags in Python
     */
    private int translateRegexFlags(final int javaFlags) {
        int pyFlags = 0;
        if ((javaFlags & Pattern.CASE_INSENSITIVE) != 0) {
            pyFlags |= PY_RE_IGNORECASE;
        }
        if ((javaFlags & Pattern.COMMENTS) != 0) {
            pyFlags |= PY_RE_VERBOSE;
        }
        if ((javaFlags & Pattern.DOTALL) != 0) {
            pyFlags |= PY_RE_DOTALL;
        }
        if ((javaFlags & Pattern.MULTILINE) != 0) {
            pyFlags |= PY_RE_MULTILINE;
        }
        // Python doesn't have separate flags for UNICODE_CASE and
        // UNICODE_CHARACTER_CLASS. Even if only one of these is set, enabling
        // the UNICODE flag is still probably the closest approximation in
        // Python.
        final int javaUnicodeFlags =
          Pattern.UNICODE_CASE | JAVA_UNICODE_CHARACTER_CLASS;
        if ((javaFlags & javaUnicodeFlags) != 0) {
            pyFlags |= PY_RE_UNICODE;
        }
        return pyFlags;
    }

    /**
     * Write a regular expression as a bson.regex.Regex whose state holds the
     * pattern source and the translated flags.
     *
     * @param pattern the compiled pattern to pickle
     * @param out the OutputStream to which to write
     * @param pickler the current Pickler instance, used for nested values
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    private void pickleRegex(final Pattern pattern, final OutputStream out,
                             final Pickler pickler)
      throws IOException {
        putGlobal("bson.regex", "Regex", out);
        out.write(Opcodes.EMPTY_TUPLE);
        out.write(Opcodes.NEWOBJ);
        out.write(Opcodes.EMPTY_DICT);
        out.write(Opcodes.MARK);
        putBinstring("pattern", out);
        pickler.save(pattern.pattern());
        putBinstring("flags", out);
        pickler.save(translateRegexFlags(pattern.flags()));
        out.write(Opcodes.SETITEMS);
        out.write(Opcodes.BUILD);
    }

    /**
     * Write a BSONTimestamp as a bson.timestamp.Timestamp whose state holds
     * the time and increment values.
     *
     * @param timestamp the timestamp to pickle
     * @param out the OutputStream to which to write
     * @param pickler the current Pickler instance, used for nested values
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    private void pickleBSONTimestamp(final BSONTimestamp timestamp,
                                     final OutputStream out,
                                     final Pickler pickler)
      throws IOException {
        putGlobal("bson.timestamp", "Timestamp", out);
        out.write(Opcodes.EMPTY_TUPLE);
        out.write(Opcodes.NEWOBJ);
        out.write(Opcodes.EMPTY_DICT);
        out.write(Opcodes.MARK);
        putBinstring("_Timestamp__time", out);
        pickler.save(timestamp.getTime());
        putBinstring("_Timestamp__inc", out);
        pickler.save(timestamp.getInc());
        out.write(Opcodes.SETITEMS);
        out.write(Opcodes.BUILD);
    }

    /**
     * Write a Code object as a bson.code.Code. The code text is the single
     * constructor argument and the scope is stored in the object's state.
     *
     * @param code the code object to pickle, with or without a scope
     * @param out the OutputStream to which to write
     * @param pickler the current Pickler instance, used for nested values
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    private void pickleCode(final Code code, final OutputStream out,
                            final Pickler pickler)
      throws IOException {
        putGlobal("bson.code", "Code", out);
        pickler.save(code.getCode());
        out.write(Opcodes.TUPLE1);
        out.write(Opcodes.NEWOBJ);

        // PyMongo's bson.code.Code always has a scope, even if it is empty.
        out.write(Opcodes.EMPTY_DICT);
        putBinstring("_Code__scope", out);
        if (code instanceof CodeWithScope) {
            pickler.save(((CodeWithScope) code).getScope());
        } else if (code instanceof CodeWScope) {
            pickler.save(((CodeWScope) code).getScope().toMap());
        } else {
            out.write(Opcodes.EMPTY_DICT);
        }
        out.write(Opcodes.SETITEM);

        out.write(Opcodes.BUILD);
    }

    /**
     * Write a bson.min_key.MinKey.
     *
     * @param out the OutputStream to which to write
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    private void writeMinKey(final OutputStream out)
      throws IOException {
        putStatelessObject("bson.min_key", "MinKey", out);
    }

    /**
     * Write a bson.max_key.MaxKey.
     *
     * @param out the OutputStream to which to write
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    private void writeMaxKey(final OutputStream out)
      throws IOException {
        putStatelessObject("bson.max_key", "MaxKey", out);
    }

    /**
     * Write a DBRef as a bson.dbref.DBRef. The collection name and id are
     * taken from the given object; the database is written as None and the
     * extra keyword arguments as an empty dict.
     *
     * @param dbref the database reference to pickle
     * @param out the OutputStream to which to write
     * @param pickler the current Pickler instance, used for nested values
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    private void pickleDBRef(final DBRef dbref, final OutputStream out,
                             final Pickler pickler)
      throws IOException {
        putGlobal("bson.dbref", "DBRef", out);
        out.write(Opcodes.EMPTY_TUPLE);
        out.write(Opcodes.NEWOBJ);
        out.write(Opcodes.EMPTY_DICT);
        out.write(Opcodes.MARK);
        putBinstring("_DBRef__kwargs", out);
        out.write(Opcodes.EMPTY_DICT);
        putBinstring("_DBRef__collection", out);
        pickler.save(dbref.getCollectionName());
        putBinstring("_DBRef__database", out);
        // org.bson.types.DBRef stores neither database name nor extra "kwargs".
        out.write(Opcodes.NONE);
        putBinstring("_DBRef__id", out);
        // Not saving this in memo, because a DBRef that uses itself as its own
        // id can't be saved to MongoDB anyway.
        pickler.save(dbref.getId());
        out.write(Opcodes.SETITEMS);
        out.write(Opcodes.BUILD);
    }

    /**
     * Write a Binary as a bson.binary.Binary. The data and subtype are the
     * constructor arguments, and the subtype is written again as the
     * object's state.
     *
     * @param binary the binary value to pickle
     * @param out the OutputStream to which to write
     * @param pickler the current Pickler instance, used for nested values
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    private void pickleBinary(final Binary binary, final OutputStream out,
                              final Pickler pickler)
      throws IOException {
        putGlobal("bson.binary", "Binary", out);
        putBinstring(binary.getData(), out);
        pickler.save(binary.getType());
        out.write(Opcodes.TUPLE2);
        out.write(Opcodes.NEWOBJ);
        out.write(Opcodes.EMPTY_DICT);
        putBinstring("_Binary__subtype", out);
        pickler.save(binary.getType());
        out.write(Opcodes.SETITEM);
        out.write(Opcodes.BUILD);
    }

    /**
     * Write an ObjectId as a bson.objectid.ObjectId whose state is the raw
     * bytes of the id.
     *
     * @param oid the ObjectId to pickle
     * @param out the OutputStream to which to write
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    private void pickleObjectId(final ObjectId oid, final OutputStream out)
      throws IOException {
        putGlobal("bson.objectid", "ObjectId", out);
        out.write(Opcodes.EMPTY_TUPLE);
        out.write(Opcodes.NEWOBJ);
        // The length prefix is derived from the array itself rather than
        // being hard-coded.
        putBinstring(oid.toByteArray(), out);
        out.write(Opcodes.BUILD);
    }

    /**
     * Write the Python "pickle" representation of a BSON type.
     *
     * If the given object is a BSONValueBox, the value it holds is pickled
     * instead of the box itself.
     *
     * @param obj the object to be pickled
     * @param out the OutputStream to which to write
     * @param currentPickler the current Pickler instance
     * @throws PickleException if the object is not one of the BSON types
     * handled by this class
     * @throws IOException if an issue is encountered writing to the
     * OutputStream
     */
    @Override
    public void pickle(
      final Object obj, final OutputStream out, final Pickler currentPickler)
      throws IOException {

        Object o = obj;
        if (obj instanceof BSONValueBox) {
            o = ((BSONValueBox) obj).get();
        }

        if (o instanceof ObjectId) {
            pickleObjectId((ObjectId) o, out);
        } else if (o instanceof Binary) {
            pickleBinary((Binary) o, out, currentPickler);
        } else if (o instanceof DBRef) {
            pickleDBRef((DBRef) o, out, currentPickler);
        } else if (o instanceof MaxKey) {
            writeMaxKey(out);
        } else if (o instanceof MinKey) {
            writeMinKey(out);
        } else if (o instanceof Code) {
            pickleCode((Code) o, out, currentPickler);
        } else if (o instanceof BSONTimestamp) {
            pickleBSONTimestamp((BSONTimestamp) o, out, currentPickler);
        } else if (o instanceof Pattern) {
            // Since the Hadoop connector is in Java, regular expressions will
            // be of this class, rather than scala.util.matching.Regex.
            pickleRegex((Pattern) o, out, currentPickler);
        } else {
            throw new PickleException("Can't pickle this: " + o);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy