// NOTE(review): the following lines are artifacts of the web page this file was scraped from
// (a Maven repository browser); they are not part of the original source and are commented
// out so the file compiles.
// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
// org.broadinstitute.hellbender.utils.python.StreamingPythonScriptExecutor Maven / Gradle / Ivy
// The newest version!
package org.broadinstitute.hellbender.utils.python;

import com.google.common.annotations.VisibleForTesting;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.runtime.*;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Future;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * Python executor used to interact with a cooperative, keep-alive Python process. The executor issues commands
 * to call Python functions in the {@code tool} module in {@code gatktool} Python package. These include functions
 * for managing an acknowledgement FIFO that is used to signal completion of Python commands, and a data FIFO that
 * can be used to stream data to Python.
 *
 *  - construct the executor
 *  - start the remote process ({@link #start}).
 *  - optionally call {@link #initStreamWriter} to initialize and create a data transfer fifo.
 *  - send one or more synchronous or asynchronous commands to be executed in Python
 *  - optionally send data one or more times of type {@code T} through the async writer
 *  - execute python code to close the data fifo
 *  - terminate the executor {@link #terminate}
 *
 * Guidelines for writing GATK tools that use Python interactively:
 *
 *   - Program correctness should not rely on consumption of anything written by Python to stdout/stderr. All
 *     data should be transferred through the stream writer or a file.
 *   - Python code should write errors to stderr.
 *   - Prefer single line commands that run a script, vs. multi-line Python code embedded in Java
 *   - Terminate commands with a newline.
 *   - Try not to be chatty (maximize use of the fifo buffer by writing to it in batches before reading from Python)
 *
 * @param <T> type of data that will be streamed to the Python process
 */
public class StreamingPythonScriptExecutor<T> extends PythonExecutorBase {
    private static final Logger logger = LogManager.getLogger(StreamingPythonScriptExecutor.class);
    private static final String NL = System.lineSeparator();

    // Copy of the command line args used to start the remote process, retained for error reporting.
    private final List<String> curatedCommandLineArgs = new ArrayList<>();

    private StreamingProcessController spController;
    private ProcessSettings processSettings;

    // Data-transfer FIFO state; all of these remain null until initStreamWriter is called.
    private File dataTransferFIFOFile;
    private FileOutputStream dataTransferFIFOWriter;
    private AsynchronousStreamWriter<T> asyncWriter;

    // When non-null, Python-side profiling is enabled and results are written to this file
    // during terminate().
    private File profileResults;

    // Python commands that are executed in the companion python process. The functions called
    // here live in the {@code tool} module in {@code gatktool} Python package.
    private final static String PYTHON_IMPORT_GATK = "from gatktool import tool" + NL;
    private final static String PYTHON_INITIALIZE_GATK = "tool.initializeGATK('%s')" + NL;
    private final static String PYTHON_START_PROFILING = "tool.startProfiling()" + NL;
    private final static String PYTHON_TERMINATE_GATK = "tool.terminateGATK()" + NL;
    private final static String PYTHON_INITIALIZE_DATA_FIFO = "tool.initializeDataFIFO('%s')" + NL;
    private final static String PYTHON_CLOSE_DATA_FIFO = "tool.closeDataFIFO()" + NL;
    private final static String PYTHON_SEND_ACK_REQUEST = "tool.sendAck()" + NL;
    private final static String PYTHON_END_PROFILING = "tool.endProfiling('%s')" + NL;

    // keep track of when an ack request has been made and reject attempts to send another ack
    // request until the previous one has been handled
    private boolean isAckRequestOutstanding = false;

    /**
     * The start method must be called to actually start the remote executable.
     *
     * @param ensureExecutableExists throw if the python executable cannot be located
     */
    public StreamingPythonScriptExecutor(boolean ensureExecutableExists) {
        this(PythonExecutableName.PYTHON, ensureExecutableExists);
    }

    /**
     * The start method must be called to actually start the remote executable.
     *
     * @param pythonExecutableName name of the python executable to start
     * @param ensureExecutableExists throw if the python executable cannot be found
     */
    public StreamingPythonScriptExecutor(final PythonExecutableName pythonExecutableName, final boolean ensureExecutableExists) {
        super(pythonExecutableName, ensureExecutableExists);
    }

    /**
     * Start the Python process.
     *
     * @param pythonProcessArgs args to be passed to the python process
     * @return true if the process is successfully started
     */
    public boolean start(final List<String> pythonProcessArgs) {
        return start(pythonProcessArgs, false, null);
    }

    /**
     * Start the Python process.
     *
     * @param pythonProcessArgs args to be passed to the python process
     * @param enableJournaling true to enable Journaling, which records all interprocess IO to a file. This is
     *                         expensive and should only be used for debugging purposes.
     * @param profileResults file to which Python cProfile results should be written, or null to disable profiling
     * @return true if the process is successfully started
     */
    public boolean start(final List<String> pythonProcessArgs, final boolean enableJournaling, final File profileResults) {

        // Since the error reporting mechanism used by this class is dependent on the GATK Python environment
        // having been properly established, we need to use an out-of-band mechanism to verify the environment
        // before we start executing commands (otherwise the commands will hang because the error reporting mechanism
        // isn't in place). So use the non-streaming Python executor, which has no requirements on the environment,
        // to validate that the "gatktool" package is present. If it is, any subsequent environment errors will
        // be propagated through the StreamingPythonExecutor's message passing mechanism.
        PythonScriptExecutor.checkPythonEnvironmentForPackage("gatktool");

        this.profileResults = profileResults;
        final List<String> args = new ArrayList<>();
        args.add(externalScriptExecutableName);
        args.add("-u");  // unbuffered stdin/stdout/stderr
        args.add("-i");  // interactive mode, so the process stays alive awaiting commands
        if (pythonProcessArgs != null) {
            args.addAll(pythonProcessArgs);
        }

        curatedCommandLineArgs.addAll(args);

        final InputStreamSettings isSettings = new InputStreamSettings();
        final OutputStreamSettings stdOutSettings = new OutputStreamSettings();
        stdOutSettings.setBufferSize(-1);
        final OutputStreamSettings stdErrSettings = new OutputStreamSettings();
        stdErrSettings.setBufferSize(-1);

        processSettings = new ProcessSettings(
                args.toArray(new String[args.size()]),
                false,  // redirect error
                null,           // directory
                null,         // env
                isSettings,
                stdOutSettings,
                stdErrSettings
        );

        // start the process, initialize the python code, and do the ack fifo handshake
        spController = new StreamingProcessController(processSettings, enableJournaling);
        final File ackFIFOFile = spController.start();
        if (ackFIFOFile == null) {
            return false;
        }
        initializeTool(ackFIFOFile);
        return true;
    }

    /**
     * Send a command to Python, and wait for an ack, returning all accumulated output
     * since the last call to either {@link #sendSynchronousCommand} or {@link #getAccumulatedOutput}.
     * This is a blocking call - if no acknowledgment is received from the remote process, it will
     * block indefinitely. If an exception is raised in the Python code, or a negative acknowledgment
     * is received, a PythonScriptExecutorException will be thrown.
     *
     * The caller is required to terminate commands with the correct number of newline(s) as appropriate for
     * the command being issued. Since white space is significant in Python, failure to do so properly can
     * leave the Python parser blocked waiting for more newlines to terminate indented code blocks.
     *
     * @param line data to be sent to the remote process
     * @return ProcessOutput
     * @throws UserException if a timeout occurs
     */
    public ProcessOutput sendSynchronousCommand(final String line) {
        if (!line.endsWith(NL)) {
            throw new IllegalArgumentException(
                    "Python commands must be newline-terminated in order to be executed. " +
                            "Indented Python code blocks must be terminated with additional newlines");
        }
        spController.writeProcessInput(line);
        sendAckRequest();
        return waitForAck();
    }

    /**
     * Send a command to the remote process without waiting for a response. This method should only
     * be used for responses that will block the remote process.
     *
     * NOTE: Before executing further synchronous statements after calling this method, getAccumulatedOutput
     * should be called to enforce a synchronization point.
     *
     * The caller is required to terminate commands with the correct number of newline(s) as appropriate for
     * the command being issued. Since white space is significant in Python, failure to do so properly can
     * leave the Python parser blocked waiting for more newlines to terminate indented code blocks.
     *
     * @param line data to send to the remote process
     */
    public void sendAsynchronousCommand(final String line) {
        if (!line.endsWith(NL)) {
            throw new IllegalArgumentException("Python commands must be newline-terminated");
        }
        spController.writeProcessInput(line);
        sendAckRequest(); // but don't wait for it..the caller should subsequently call waitForAck
    }

    /**
     * Wait for an acknowledgement (which must have been previously requested).
     * @return {@link ProcessOutput} when positive acknowledgement (ack) has been received, otherwise throws
     * @throws PythonScriptExecutorException if a nack was received
     */
    public ProcessOutput waitForAck() {
        if (!isAckRequestOutstanding) {
            throw new GATKException("No ack request is outstanding. An ack request must be issued first");
        }
        final ProcessControllerAckResult pcAckResult = spController.waitForAck();
        isAckRequestOutstanding = false;
        // At every ack receipt, we want to retrieve the stdout/stderr output in case we're journaling
        final ProcessOutput po = getAccumulatedOutput();
        if (!pcAckResult.isPositiveAck()) {
            throw new PythonScriptExecutorException(
                    String.format(
                            "A nack was received from the Python process (most likely caused by a raised exception caused by): %s",
                            pcAckResult.getDisplayMessage()
                    )
            );
        }
        return po;
    }

    /**
     * Return a (not necessarily executable) string representing the current command line for this executor
     * for error reporting purposes.
     * @return A string representing the command line used for this executor.
     */
    public String getApproximateCommandLine() {
        return String.join(" ", curatedCommandLineArgs);
    }

    /**
     * Obtain a stream writer that serializes and writes batches of items of type {@code T} on a background thread.
     * @param itemSerializer {@code Function} that accepts items of type {@code T} and converts them to a
     *                                       {@code ByteArrayOutputStream} that is subsequently written to the stream
     */
    public void initStreamWriter(final Function<T, ByteArrayOutputStream> itemSerializer) {
        Utils.nonNull(itemSerializer, "An item serializer must be provided for the async writer service");

        dataTransferFIFOFile = spController.createDataFIFO();

        // Open the FIFO for writing. Opening a FIFO for read or write will block until there is a reader/writer
        // on the other end, so before we open it, send a non blocking, ASYNCHRONOUS command to the Python process
        // to open the FIFO for reading. The Python process will then block until we open the FIFO below.
        sendAsynchronousCommand(String.format(PYTHON_INITIALIZE_DATA_FIFO, dataTransferFIFOFile.getAbsolutePath()));
        try {
            dataTransferFIFOWriter = new FileOutputStream(dataTransferFIFOFile);
            asyncWriter = spController.getAsynchronousStreamWriter(dataTransferFIFOWriter, itemSerializer);
            // synchronize on an ack for the async command sent above before returning
            waitForAck();
        } catch ( IOException e ) {
            throw new GATKException("Failure opening FIFO for writing", e);
        }
    }

    /**
     * Request that a batch of items be written to the stream on a background thread. Any previously requested batch
     * must have already been completed and retrieved via {@link #waitForPreviousBatchCompletion}.
     *
     * @param pythonCommand command that will be executed asynchronously to consume the data written to the stream
     * @param batchList a list of items to be written
     */
    public void startBatchWrite(final String pythonCommand, final List<T> batchList) {
        Utils.nonNull(pythonCommand);
        Utils.nonNull(batchList);
        Utils.nonEmpty(batchList);
        sendAsynchronousCommand(pythonCommand);
        asyncWriter.startBatchWrite(batchList);
    }

    /**
     * Waits for a batch that was previously initiated via {@link #startBatchWrite(String, List)}}
     * to complete, flushes the target stream and returns the corresponding completed Future. The Future representing
     * a given batch can only be obtained via this method once. If no work is outstanding, and/or the previous batch
     * has already been retrieved, null is returned.
     * @return returns null if no previous work to complete, otherwise a completed Future
     */
    public Future<Integer> waitForPreviousBatchCompletion() {
        // Rather than waiting for the asyncWriter Future to complete first, and THEN waiting for
        // the ack, call waitForAck() first instead, because it will detect and propagate any
        // exception that occurs on the python side that causes it to stop pulling data from the
        // FIFO (which in turn can result in the background thread blocking, thereby preventing the
        // asyncWriter Future from ever completing). This is safer than waiting for the Future first,
        // since the Future might never complete if the async writer thread is blocked.
        waitForAck();
        // now that we have the ack, verify that the async batch write completed
        final Future<Integer> numberOfItemsWritten = asyncWriter.waitForPreviousBatchCompletion();
        return numberOfItemsWritten;
    }

    /**
     * Get the Process object associated with this executor. For testing only.
     *
     * @return the underlying {@link Process} for the remote Python process
     */
    @VisibleForTesting
    protected Process getProcess() {
        return spController.getProcess();
    }

    /**
     * Terminate the remote process, closing the fifo if any.
     */
    public void terminate() {
        if (profileResults != null) {
            spController.writeProcessInput(String.format(PYTHON_END_PROFILING, profileResults.getAbsolutePath()));
            sendAckRequest();
            waitForAck();
        }
        if (dataTransferFIFOWriter != null) {
            if (asyncWriter != null) {
                if(!asyncWriter.terminate()){
                    throw new GATKException("failed to close asyncWriter");
                }
            }
            spController.writeProcessInput(PYTHON_CLOSE_DATA_FIFO);
            sendAckRequest();
            waitForAck();
            try {
                dataTransferFIFOWriter.close();
                dataTransferFIFOWriter = null;
                dataTransferFIFOFile = null;
            } catch (IOException e) {
                throw new GATKException("IOException closing fifo", e);
            }
        }

        // we can't get an ack for this, since it closes down the ack fifo
        spController.writeProcessInput(PYTHON_TERMINATE_GATK);
        spController.terminate();
    }

    /**
     * Return all data accumulated since the last call to {@link #getAccumulatedOutput} (either directly, or
     * indirectly through {@link #sendSynchronousCommand}).
     *
     * Note that the output returned is somewhat non-deterministic, in that there is no guaranty that all of
     * the output from the previous command has been flushed at the time this call is made.
     *
     * @return ProcessOutput containing all accumulated output from stdout/stderr
     * @throws UserException if a timeout occurs waiting for output
     * @throws PythonScriptExecutorException if a traceback is detected in the output
     */
    public ProcessOutput getAccumulatedOutput() {
        return spController.getProcessOutput();
    }

    // Import the gatktool module, initialize the ack FIFO handshake, and optionally start profiling.
    private void initializeTool(final File ackFIFOFile) {
        //  first we need to import the module; no ack expected yet as we haven't initialized the ack fifo
        spController.writeProcessInput(PYTHON_IMPORT_GATK); // no ack generated
        spController.writeProcessInput(String.format(PYTHON_INITIALIZE_GATK, ackFIFOFile.getAbsolutePath()));
        sendAckRequest(); // queue up an ack request
        // open the FIFO to unblock the remote caller (which should be blocked on open for read), and then
        // wait for the ack to be sent
        spController.openAckFIFOForRead();
        waitForAck();
        if (profileResults != null) {
            spController.writeProcessInput(PYTHON_START_PROFILING);
            sendAckRequest();
            waitForAck();
        }
    }

    // Ask the Python process to write an ack; rejects overlapping requests since only one ack
    // can be in flight at a time.
    private void sendAckRequest() {
        if (isAckRequestOutstanding) {
            throw new GATKException("An ack request is already outstanding. The previous ack request must be retrieved" +
                    " before a new ack request can be issued");
        }
        spController.writeProcessInput(PYTHON_SEND_ACK_REQUEST);
        isAckRequestOutstanding = true;
    }

}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy  (scraper footer — not part of the original source)