All downloads are free. The search and download functionality uses the official Maven repository.

nextflow.dag.DAG.groovy Maven / Gradle / Ivy

Go to download

A DSL modelled around the UNIX pipe concept, that simplifies writing parallel and scalable pipelines in a portable manner

There is a newer version: 24.11.0-edge
Show newest version
/*
 * Copyright 2013-2024, Seqera Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package nextflow.dag

import groovy.transform.MapConstructor
import groovy.transform.PackageScope
import groovy.transform.ToString
import groovy.util.logging.Slf4j
import groovyx.gpars.dataflow.DataflowBroadcast
import groovyx.gpars.dataflow.DataflowQueue
import groovyx.gpars.dataflow.DataflowReadChannel
import groovyx.gpars.dataflow.DataflowWriteChannel
import groovyx.gpars.dataflow.expression.DataflowExpression
import groovyx.gpars.dataflow.operator.DataflowProcessor
import nextflow.NF
import nextflow.extension.CH
import nextflow.extension.DataflowHelper
import nextflow.processor.TaskProcessor
import nextflow.script.params.DefaultInParam
import nextflow.script.params.DefaultOutParam
import nextflow.script.params.EachInParam
import nextflow.script.params.InParam
import nextflow.script.params.InputsList
import nextflow.script.params.OutParam
import nextflow.script.params.OutputsList
import nextflow.script.params.TupleInParam
import nextflow.script.params.TupleOutParam

import java.util.concurrent.atomic.AtomicLong

/**
 * Models a directed acyclic graph of the pipeline execution.
 *
 * @author Paolo Di Tommaso
 */
@Slf4j
class DAG {

    /**
     * The kinds of vertex that can appear in the graph
     */
    @PackageScope
    static enum Type {
        // vertex created by `addProcessNode`
        PROCESS,
        // vertex created by `addOperatorNode`
        OPERATOR,
        // vertex created by `addSourceNode`, or synthesised for an edge that has no `from` vertex
        ORIGIN,
        // vertex synthesised by `normalizeMissingVertices` for an edge that has no `to` vertex
        NODE
    }

    /**
     * The list of edges in the graph, in the order they were added
     */
    private List edges = new ArrayList<>(50)

    /**
     * The ordered list of vertices. The position of a vertex in this
     * list is the value reported by `indexOf`
     */
    private List vertices = new ArrayList<>(50)

    /**
     * Contains mappings of ReadChannel nodes to DataflowBroadcast nodes.
     * This is needed for DataflowBroadcast operators that get added to the
     * DAG as ReadChannels - we need to get back to the DataflowBroadcast
     * given the ReadChannel. Entries are added by `addDataflowBroadcastPair`
     * and read by `normalizeChannels`.
     */
    private Map dataflowBroadcastLookup = new HashMap()

    /**
     * Registers the DataflowBroadcast node associated to a ReadChannel node,
     * so that the former can be looked up given the latter.
     *
     * @param readChannel The ReadChannel node used as lookup key
     * @param broadcastChannel The DataflowBroadcast node to associate to it
     */
    void addDataflowBroadcastPair(readChannel, broadcastChannel) {
        dataflowBroadcastLookup[readChannel] = broadcastChannel
    }

    /**
     * @return The ordered list of vertices held by this graph (the internal list, not a copy)
     */
    List getVertices() {
        return vertices
    }

    /**
     * @return The list of edges held by this graph (the internal list, not a copy)
     */
    List getEdges() {
        return edges
    }

    /**
     * @return {@code true} when the graph holds neither edges nor vertices
     */
    boolean isEmpty() {
        return edges.isEmpty() && vertices.isEmpty()
    }

    /**
     * Registers a computing `process` as a new vertex of the DAG
     *
     * @param label The label associated to the process
     * @param inputs The list of inputs entering in the process
     * @param outputs The list of outputs leaving the process
     * @param process The task processor to attach to the vertex, if any
     */
    void addProcessNode( String label, InputsList inputs, OutputsList outputs, TaskProcessor process=null ) {
        assert label
        assert inputs
        assert outputs

        final inbounds = normalizeInputs(inputs)
        final outbounds = normalizeOutputs(outputs)
        addVertex( Type.PROCESS, label, inbounds, outbounds, process )
    }

    /**
     * Registers a dataflow operator as a new vertex of the DAG
     *
     * @param label The operator label
     * @param inputs The operator input(s). It can be either a single channel or a list of channels.
     * @param outputs The operator output(s). It can be either a single channel, a list of channels or {@code null} if the operator has no output.
     * @param operators The list of operators to attach to the vertex, if any
     */
    void addOperatorNode( String label, inputs, outputs, List operators=null )  {
        assert label
        assert inputs

        final inbounds = normalizeChannels(inputs)
        final outbounds = normalizeChannels(outputs)
        addVertex( Type.OPERATOR, label, inbounds, outbounds, operators )
    }

    /**
     * Registers a dataflow channel source as a new vertex of the DAG.
     * The vertex has no inbound channels.
     *
     * @param label The node description
     * @param source Either a dataflow channel or a list of channels.
     */
    void addSourceNode( String label, source )  {
        assert label
        assert source

        final outbounds = normalizeChannels(source)
        addVertex( Type.ORIGIN, label, null, outbounds )
    }

    /**
     * Creates a vertex, adds it to the DAG and links it to the given channels
     *
     * @param type A {@link Type} value
     * @param label The vertex description
     * @param inbounds The inbounds channels to this vertex
     * @param outbounds The outbounds channels leaving the vertex
     * @param extra An optional object forwarded as-is to `createVertex`
     */
    @PackageScope
    void addVertex( Type type, String label, List inbounds, List outbounds, Object extra=null) {

        final node = createVertex( type, label, extra )

        // entering channels are linked first ...
        for( ChannelHandler handler in inbounds ) {
            inbound( node, handler )
        }

        // ... then the leaving ones
        for( ChannelHandler handler in outbounds ) {
            outbound( node, handler )
        }
    }

    /**
     * Creates a DAG vertex object and appends it to the list of vertices
     *
     * @param type The vertex type
     * @param label The vertex label
     * @param extra
     *      An optional object to attach to the vertex: either a {@link TaskProcessor}, in which
     *      case the vertex references it along with its operator, or a list of operators, which
     *      is copied into the vertex
     * @return A {@link Vertex} object
     * @throws IllegalArgumentException
     *      When `extra` is neither {@code null}, a {@link TaskProcessor} nor a list
     */
    @PackageScope
    Vertex createVertex( Type type, String label, extra=null ) {
        def result = new Vertex(type, label)
        if( extra instanceof TaskProcessor ) {
            result.process = extra
            result.operators = [ extra.operator ]
        }
        else if( extra instanceof List ) {
            // shallow copy through the copy-constructor: `List` does not declare a public
            // `clone()` method, hence cloning fails for list implementations that are not cloneable
            result.operators = new ArrayList(extra)
        }
        else if( extra != null ) {
            // use `getClass()` instead of `.class`: on a Map the latter is resolved as a key
            // lookup and yields null, turning this error into a NullPointerException
            throw new IllegalArgumentException("Not a valid DAG vertex parameter: [${extra.getClass().name}] $extra")
        }

        vertices << result
        return result
    }

    /**
     * Links a channel entering the given vertex to the graph, either creating a new
     * edge, completing an existing one or forking it.
     *
     * @param vertex The vertex the channel enters in
     * @param entering The handler holding the channel and its label
     * @throws MultipleInputChannelException
     *      When the channel already enters another vertex and it is not forkable
     */
    private void inbound( Vertex vertex, ChannelHandler entering )  {

        final channel = entering.channel

        // look for an existing edge for the given dataflow channel
        def edge = findEdge(channel)

        // if it does not exist just create it
        if( !edge ) {
            edges << new Edge(channel: channel, to: vertex, label: entering.label)
            return
        }

        // the edge has no destination yet: link it to the given vertex
        if( edge.to == null ) {
            edge.to = vertex
            return
        }

        // the same channel - apart the forkable ones handled below - cannot be used
        // multiple times as an input thus throws an exception
        if( !isForkable(channel) ) {
            final name = getChannelName(entering)
            log.debug "Before MultipleInputChannelException: entering=$entering; name=$name; vertex=$vertex; edge.to=$edge.to"
            throw new MultipleInputChannelException(name, vertex, edge.to)
        }

        // handle the special case of forkable channels e.g. dataflow variables
        // this kind of channel can be used more than one time as an input
        if( !edge.from ) {
            // make sure the forked edges share an origin, placed before the
            // current destination when the latter is found in the vertices list
            final origin = new Vertex(Type.ORIGIN)
            edge.from = origin
            int p = vertices.indexOf(edge.to)
            if( p == -1 )
                vertices.add(origin)
            else
                vertices.add(p, origin)
        }
        edges << new Edge(channel: channel, from: edge.from, to: vertex, label: entering.label)
    }

    /**
     * @param obj A channel object
     * @return
     *      {@code true} when the object is a {@link DataflowExpression}, a {@link DataflowBroadcast}
     *      or a {@link DataflowQueue} for which `CH.isBridge` holds
     */
    private boolean isForkable(obj) {
        final expressionOrBroadcast = obj instanceof DataflowExpression || obj instanceof DataflowBroadcast
        return expressionOrBroadcast || ( obj instanceof DataflowQueue && CH.isBridge(obj) )
    }

    /**
     * Links a channel leaving the given vertex to the graph, either creating a new
     * edge or completing an existing one.
     *
     * @param vertex The vertex the channel leaves from
     * @param leaving The handler holding the channel and its label
     * @throws MultipleOutputChannelException
     *      When the channel already leaves from another vertex
     */
    private void outbound( Vertex vertex, ChannelHandler leaving) {

        final channel = leaving.channel

        // look for an existing edge for the given dataflow channel
        final edge = findEdge(channel)

        // no edge yet: create one starting from the vertex
        if( !edge ) {
            edges << new Edge(channel: channel, from: vertex, label: leaving.label)
            return
        }

        // the edge has no source yet: link it to the vertex
        if( edge.from == null ) {
            edge.from = vertex
            return
        }

        // the same channel cannot be used multiple times as an output
        // thus throws an exception
        final name = getChannelName(leaving)
        throw new MultipleOutputChannelException(name, vertex, edge.from)
    }

    /**
     * Converts the process inputs to a list of {@link ChannelHandler}, skipping
     * the entries of type {@link DefaultInParam}
     *
     * @param inputs The process inputs
     * @return A list holding a channel handler for each retained input
     */
    private List normalizeInputs( InputsList inputs ) {

        final retained = inputs.findAll { param -> !( param instanceof DefaultInParam ) }

        return retained.collect { InParam param ->
            new ChannelHandler(channel: param.rawChannel, label: inputName0(param))
        }
    }

    /**
     * @param param A process input parameter
     * @return
     *      The parameter name, or {@code null} when it is a {@link TupleInParam}
     *      or an {@link EachInParam}
     */
    private String inputName0(InParam param) {
        final unnamed = param instanceof TupleInParam || param instanceof EachInParam
        return unnamed ? null : param.name
    }

    /**
     * Converts the process outputs to a list of {@link ChannelHandler}. The scan stops
     * at the first {@link DefaultOutParam} and outputs having no channel are skipped.
     *
     * @param outputs The process outputs
     * @return A list holding a channel handler for each retained output
     */
    private List normalizeOutputs( OutputsList outputs ) {

        final handlers = []
        for( OutParam param in outputs ) {
            if( param instanceof DefaultOutParam )
                break

            final channel = param.getOutChannel()
            if( channel == null )
                continue

            // tuple outputs carry no label
            final label = param instanceof TupleOutParam ? null : param.name
            handlers << new ChannelHandler(channel: channel, label: label)
        }

        return handlers
    }

    /**
     * Converts one or more channels to a list of {@link ChannelHandler}. Each channel
     * registered in the broadcast lookup table is replaced by the associated entry.
     *
     * @param entry
     *      Either {@code null}, a single dataflow read/write channel, a collection
     *      or an array of channels
     * @return
     *      The list of channel handlers, empty when `entry` is {@code null}
     * @throws IllegalArgumentException
     *      When `entry` is none of the supported types
     */
    private List normalizeChannels( entry ) {
        if( entry == null ) {
            return Collections.emptyList()
        }

        if( entry instanceof DataflowReadChannel || entry instanceof DataflowWriteChannel ) {
            return [ new ChannelHandler(channel: dataflowBroadcastLookup.getOrDefault(entry, entry)) ]
        }

        if( entry instanceof Collection || entry instanceof Object[] ) {
            return entry
                .collect( it -> dataflowBroadcastLookup.getOrDefault(it, it) )
                .unique() // removes duplicate DataflowBroadcast channels
                .collect( it -> new ChannelHandler(channel: it) )
        }

        // use `getClass()` instead of `.class`: on a Map the latter is resolved as a key
        // lookup and yields null, turning this error into a NullPointerException
        throw new IllegalArgumentException("Not a valid channel type: [${entry.getClass().name}]")
    }

    /**
     * Finds the first edge bound to the given channel, compared by identity
     *
     * @param channel The channel object to look for
     * @return The matching {@link Edge} or {@code null} when there is none
     */
    @PackageScope
    Edge findEdge( channel ) {
        for( candidate in edges ) {
            if( candidate.channel.is(channel) )
                return candidate
        }
        return null
    }

    /**
     * @param v A vertex
     * @return The position of the vertex in the vertices list, as reported by `List.indexOf`
     */
    @PackageScope
    int indexOf(Vertex v) {
        return vertices.indexOf(v)
    }

    /**
     * Completes the edges lacking one of the two ends: an edge without a source gets
     * a new {@code ORIGIN} vertex placed right before its destination, an edge without
     * a destination gets a new {@code NODE} vertex placed right after its source.
     *
     * When the vertex used as anchor is not found in the vertices list, the new
     * vertex is appended at the end of the list, as done when forking an inbound edge.
     */
    @PackageScope
    void normalizeMissingVertices() {
        for( Edge e : edges ) {
            assert e.from || e.to, 'Missing source and termination vertices for edge'

            if( !e.from ) {
                // creates the missing origin vertex
                def vertex = e.from = new Vertex(Type.ORIGIN)
                int p = vertices.indexOf( e.to )
                // guard the not-found index, `add(-1, ..)` would throw IndexOutOfBoundsException
                if( p != -1 ) vertices.add( p, vertex )
                else vertices.add( vertex )
            }
            else if( !e.to ) {
                // creates the missing termination vertex
                def vertex = e.to = new Vertex(Type.NODE)
                int p = vertices.indexOf( e.from )
                // guard the not-found index, `p+1` would silently insert at the head of the list
                if( p != -1 ) vertices.add( p+1, vertex )
                else vertices.add( vertex )
            }
        }
    }

    /**
     * Replaces the label of each edge with the name returned by `lookupVariable`
     * for the edge channel, when such a name is available
     */
    @PackageScope
    void resolveEdgeNames() {
        // iterate over a copy of the edges list
        final snapshot = new ArrayList<>(edges)
        for( Edge edge in snapshot ) {
            final resolved = lookupVariable(edge.channel)
            if( !resolved )
                continue
            edge.label = resolved
        }
    }

    /**
     * Delegates to `NF.lookupVariable` for the given object
     *
     * @param obj The object to look up
     * @return The value returned by `NF.lookupVariable`
     */
    @PackageScope
    String lookupVariable(obj) {
        return NF.lookupVariable(obj)
    }

    /**
     * Finds the key of the first map entry whose value is the given channel,
     * compared by identity
     *
     * @param map The map to search in
     * @param channel The channel object to look for
     * @return The key of the matching entry, or {@code null} when there is none
     */
    @PackageScope
    String resolveChannelName( Map map, channel ) {
        return map.find { key, value -> value.is(channel) }?.key
    }

    /**
     * @param handler A channel handler
     * @return
     *      The name returned by `NF.lookupVariable` for the handler channel or,
     *      when that is missing, the handler label
     */
    @PackageScope
    String getChannelName( ChannelHandler handler ) {
        final resolved = NF.lookupVariable(handler.channel)
        return resolved ? resolved : handler.label
    }

    /**
     * Normalizes the graph: first adds the vertices missing at the ends
     * of the edges, then resolves the edge names
     */
    void normalize() {
        this.normalizeMissingVertices()
        this.resolveEdgeNames()
    }

    /**
     * Normalizes the graph and reports the nodes still active in it.
     *
     * @return
     *      A string listing the current active processes/operators in the
     *      dataflow network represented by this DAG, or {@code null} when
     *      no node is active
     */
    String dumpActiveNodes() {
        normalize()

        // active processes take precedence: dump their termination status, one per line
        final running = vertices.findAll { it.process && it.isActive() }
        if( running ) {
            final report = new StringBuilder()
            for( int i=0; i<running.size(); i++ ) {
                if( i>0 ) report << '\n'
                report << running[i].process.dumpTerminationStatus()
            }
            return report.toString()
        }

        // otherwise fallback on other nodes
        final nodes = vertices.findAll { it.active }
        if( !nodes )
            return null

        final report = new StringBuilder()
        for( node in nodes ) {
            report << '  [' << node.type.toString().toLowerCase() << "] " << (node.label ?: node.name) << '\n'
        }

        return report.toString()
    }

    /**
     * A vertex of the DAG. It holds a type, an optional label and, optionally,
     * the process and the operators associated to it.
     *
     * @author Paolo Di Tommaso
     */
    @ToString(includeNames = true, includes = 'label,type', includePackage=false)
    class Vertex {

        // counter providing the vertex identifiers
        static private AtomicLong nextID = new AtomicLong()

        /** The vertex label */
        String label

        /** The vertex type */
        Type type

        /** One or more {@link DataflowProcessor} associated to this graph node */
        List operators

        /** The task processor associated to this graph node, if any */
        TaskProcessor process

        /** The vertex unique Id */
        final long id = nextID.getAndIncrement()

        /**
         * Create a DAG vertex instance
         *
         * @param type A {@link Type} value, it cannot be null
         * @param label A descriptive string to label this vertex
         */
        Vertex( Type type, String label = null ) {
            assert type
            this.type = type
            this.label = label
        }

        /**
         * @return The position of this vertex in the DAG vertices list
         */
        int getOrder() {
            return indexOf(this)
        }

        /**
         * @return The node name, made of the letter `v` followed by the vertex order
         */
        String getName() {
            return 'v' + getOrder()
        }

        /**
         * @return {@code true} when at least one of the vertex operators is reported active
         */
        boolean isActive() {
            if( operators == null )
                return false
            return operators.any { DataflowHelper.isProcessorActive(it) }
        }

    }

    /**
     * Models an edge in the DAG i.e. a dataflow channel connecting two vertices.
     * Either end can be unset until the corresponding vertex is linked to the channel.
     *
     * @author Paolo Di Tommaso
     */
    @ToString(includeNames = true, includes = 'label,from,to', includePackage=false)
    @MapConstructor
    class Edge {

        // counter providing the edge identifiers
        static private AtomicLong nextID = new AtomicLong()

        /**
         * The Dataflow channel that originated this graph edge
         */
        Object channel

        /**
         * The vertex *from* where the edge starts
         */
        Vertex from

        /**
         * The vertex *to* where the edge ends
         */
        Vertex to

        /**
         * A descriptive label
         */
        String label

        /**
         * unique Id, taken from the counter shared by the edges
         */
        final long id = nextID.getAndIncrement()

    }

    /**
     * A simple wrapper object to handle a channel and the associated label.
     * Only the label is included in the string representation.
     */
    @ToString(includeNames = true, includes = 'label', includePackage=false)
    static class ChannelHandler {

        /**
         * The {@link groovyx.gpars.dataflow.DataflowChannel} that originated this graph edge
         */
        Object channel

        /**
         * The edge label, it can be null
         */
        String label

    }


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy