nextflow.splitter.AbstractSplitter.groovy

A DSL modelled around the UNIX pipe concept that simplifies writing parallel and scalable pipelines in a portable manner.

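For orientation before the source: concrete subclasses of AbstractSplitter back the split* operators of the Nextflow DSL. A minimal usage sketch from a pipeline script (splitText and splitFasta are real Nextflow operators; the file names are placeholders):

    // split a text file into chunks of 10 lines each, emitted on a channel
    Channel.fromPath('data.txt').splitText(by: 10)

    // parse each FASTA entry into a record map instead of raw text
    Channel.fromPath('sample.fa').splitFasta(record: [id: true, seqString: true])
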
/*
 * Copyright 2013-2024, Seqera Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package nextflow.splitter

import java.nio.file.Files
import java.nio.file.Path
import java.util.zip.GZIPInputStream

import groovy.transform.CompileStatic
import groovy.transform.PackageScope
import groovy.util.logging.Slf4j
import groovyx.gpars.dataflow.DataflowBroadcast
import groovyx.gpars.dataflow.DataflowQueue
import groovyx.gpars.dataflow.DataflowWriteChannel
import nextflow.Channel
import nextflow.exception.StopSplitIterationException
import nextflow.extension.CH
import nextflow.util.CheckHelper
/**
 * Generic data splitter that provides the main splitting methods/interfaces
 *
 * @author Paolo Di Tommaso 
 */
@Slf4j
@CompileStatic
abstract class AbstractSplitter implements SplitterStrategy {

    protected Map fOptionsMap

    protected def into

    protected Closure closure

    protected boolean recordMode

    protected Map recordFields

    protected boolean autoClose = true

    protected Path sourceFile

    protected decompress

    protected String operatorName

    protected long limit

    protected Integer elem

    private targetObj

    private CollectorStrategy collector

    protected boolean multiSplit

    protected EntryCounter counter = new EntryCounter(1)

    AbstractSplitter() { }

    /**
     * Create a splitter object for the specified operator name
     *
     * @param name The name of an operator invoking the splitter. This value
     * is meant to be used only for reporting a meaningful error message
     */
    AbstractSplitter( String name ) {
        this.operatorName = name
    }

    /**
     * Create a splitter object with the specified option parameters
     *
     * See {@link #options(java.util.Map)}
     *
     * @param opt A map of named parameters
     */
    protected AbstractSplitter( Map opt ) {
        options(opt)
    }

    /**
     * @return A string representing the operator invoking the splitter
     */
    String getOperatorName() { operatorName ?: this.class.simpleName }

    /**
     * @return The splitter raw target object
     */
    protected Object getTargetObj() { targetObj }

    /**
     * @return The target object that receives the split chunks. It can be a {@link groovyx.gpars.dataflow.DataflowChannel} or a {@code List}
     */
    def getInto() { into }

    /**
     * @return Whether each split is parsed into a record object or kept as a chunk in the native format i.e. text line(s) or bytes
     */
    boolean getRecordMode() { recordMode }

    /**
     * @return The fields to be included in each parsed record
     */
    Map getRecordFields() { recordFields }

    AbstractSplitter setRecordFields( Map fields ) {
        recordMode = true
        recordFields = fields
        return this
    }
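
    /*
     * Illustrative note (not part of the original source): record mode is
     * normally driven by the `record` option, e.g. for a FASTA splitter
     *
     *     splitter.options(record: [id: true, seqString: true])
     *
     * sets recordMode=true and recordFields accordingly, so each chunk is
     * parsed into a map such as [id:'seq1', seqString:'ACGT...'] instead of
     * being emitted as raw text.
     */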

    AbstractSplitter setMultiSplit(boolean value) {
        this.multiSplit = value
        return this
    }

    /**
     * Apply the splitting operation on the given object
     *
     * @return Either a {@link groovyx.gpars.dataflow.DataflowChannel} or a {@code List} which holds the split chunks
     */
    final apply() {

        def result = null
        def source = targetObj instanceof List ? findSource((List)targetObj) : targetObj

        setSource(source)


        final chunks = collector = createCollector()
        if( chunks instanceof CacheableCollector && chunks.checkCached() ) {
            log.debug "Operator `$operatorName` reusing cached chunks at path: ${chunks.getBaseFile()}"
            result = resumeFromCache(chunks)
        }

        else {
            try {
                def stream = normalizeSource(source)
                result = process(stream)
            }
            catch ( StopSplitIterationException e ) {
                log.trace 'Split iteration interrupted'
            }
        }


        /*
         * now close and return the result
     * - when the target is a channel, send a stop message
         * - when it's a list return it
         * - otherwise return the last value
         */
        if( into instanceof DataflowWriteChannel && autoClose ) {
            append(into, Channel.STOP)
            return into
        }
        if( into != null )
            return into

        return result
    }
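
    /*
     * Illustrative note (not part of the original source): the `into` target
     * decides what apply() returns. Assuming `splitter` is an instance of a
     * concrete subclass:
     *
     *     def chunks = splitter.target(path).list()     // into = []          -> returns a List
     *     def ch     = splitter.target(path).channel()  // into = CH.create() -> returns a channel;
     *                                                   // Channel.STOP is sent when autoClose is true
     */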

    /**
     * @param tuple
     *      A non-empty list of objects
     * @return
     *      The item at the position defined by the attribute {@code #elem} in the given list.
     *      When {@code #elem} is equal to -1, the first occurrence of a file object is returned.
     *      If no file is available the first item in the list is returned
     */
    @PackageScope
    def findSource( List tuple ) {

        if( elem >= 0 )
            return tuple.get(elem)

        // find the elem-th item having Path or File type
        int pos = elem != null ? -elem : 1
        int count = 0
        for( int i=0; i<tuple.size(); i++ ) {
            if( tuple.get(i) instanceof Path || tuple.get(i) instanceof File ) {
                if( ++count == pos ) {
                    elem = i
                    return tuple.get(i)
                }
            }
        }

        // no file object was found -- fall back to the first item as documented above
        elem = 0
        return tuple.get(0)
    }

    /**
     * Set the splitter options with the given map of named parameters.
     * <p>
     * Supported parameters are:
     * <li>{@code by}: Defines the splitting interval e.g. how many lines are in each chunk when splitting a text file
     *
     * <li>{@code into}: The receiving object, it can be a {@link List} instance or a {@link DataflowQueue} instance
     *
     * <li>{@code each}: The transforming closure invoked on each splitting chunk
     *
     * <li>{@code record}:
     *          When {@code true} the splitting chunk is parsed into a record object; alternatively use it to specify
     *          the required field names with a map of booleans
     *
     * <li>{@code autoClose}:
     *          When the {@code into} parameter is a {@link DataflowQueue}, use this parameter to enable/disable the splitter
     *          to close the channel by sending a {@link nextflow.Channel#STOP} message when complete (default: {@code true})
     *
     * @param options The map holding the named parameters
     * @return The object itself
     */
    AbstractSplitter options( Map options ) {
        CheckHelper.checkParams(getOperatorName(), options, validOptions())

        fOptionsMap = options

        closure = (Closure)options.each

        if( options.by )
            counter = new EntryCounter(options.by as Integer)

        into = options.into

        recordMode = isTrueOrMap(options.record)

        if( options.record instanceof Map )
            recordFields = (Map)options.record

        if( options.autoClose instanceof Boolean )
            autoClose = options.autoClose as boolean

        if( options.decompress != null )
            decompress = options.decompress

        if( options.limit )
            limit = options.limit as long

        if( options.elem )
            elem = options.elem as int

        return this
    }

    /**
     * @return A map representing the valid options for the splitter. The map keys define the
     * accepted parameter names, the values the valid values for each of them
     */
    protected Map validOptions() {
        [
                each: Closure,
                by: Integer,
                into: [ Collection, DataflowQueue, DataflowBroadcast ],
                autoClose: Boolean,
                limit: Integer,
                elem: Integer,
                decompress: Boolean
        ]
    }

    /**
     * Set the target object to be split. This method invokes {@link #normalizeSource(java.lang.Object)}
     *
     * @param obj The object to be split
     * @return The object itself
     */
    AbstractSplitter target( obj ) {
        targetObj = obj
        return this
    }

    /**
     * Start the splitting
     */
    def split() {
        apply()
    }

    /**
     * Apply the specified closure to each chunk in the target object
     *
     * @param closure A closure object
     */
    void each( Closure closure ) {
        this.closure = closure
        apply()
    }

    /**
     * @return The number of chunks in the target object
     */
    long count() {
        long result = 0
        closure = { result++ }
        apply()
        return result
    }

    /**
     * @return Split the target object and return a list containing all chunks
     */
    List list() {
        into = []
        (List) apply()
    }

    /**
     * @return Split the target object and return a channel emitting the produced chunks
     */
    DataflowWriteChannel channel() {
        into = CH.create()
        (DataflowWriteChannel) apply()
    }

    /**
     * Invoke the each closure
     *
     * @param closure The transforming closure, or {@code null} when none was given
     * @param chunk The current chunk
     * @return The closure result, or the chunk itself when no closure is provided
     */
    @PackageScope
    final invokeEachClosure( Closure closure, Object chunk ) {

        def result
        if( targetObj instanceof List ) {
            result = new ArrayList((List)targetObj)
            result.set(elem, chunk)
        }
        else {
            result = chunk
        }

        if( closure ) {
            result = closure.call(result)
        }

        if( into != null )
            append(into, result)

        return result
    }

    private int debugCount = 0

    /**
     * Add a generic value to a target container, that can be either a {@code Collection}
     * or a {@code DataflowWriteChannel} instance
     *
     * @param into The target container, either a {@code Collection} or a {@code DataflowWriteChannel} instance
     * @param value Any value
     * @throws IllegalArgumentException whenever the parameter {@code into} is not a valid object
     */
    protected void append( into, value ) {
        log.trace "Splitter value: ${debugCount++}"

        if( into instanceof Collection )
            into.add(value)

        else if( into instanceof DataflowWriteChannel )
            into.bind(value)

        else
            throw new IllegalArgumentException("Not a valid 'into' target object: ${into?.class?.name}")
    }

    /**
     * @param value An object to check
     * @return {@code true} if the value is an instance of {@link Map} or a boolean value equal to {@code true}
     */
    static protected boolean isTrueOrMap( value ) {
        if( value instanceof Map )
            return true

        return value instanceof Boolean && (value as Boolean)
    }

    /**
     * Given a {@link Path} return a new {@link InputStream} associated to it.
     * When the file name ends with {@code .gz} the stream is filtered by a {@link GZIPInputStream}
     *
     * @param path A path to an existing file
     * @return The {@link InputStream} object for the given file
     */
    protected InputStream newInputStream( Path path ) {

        def result = Files.newInputStream(path)

        if( decompress == null && path.name.endsWith('.gz') )
            decompress = true

        if( decompress ) {
            log.debug "Creating gzip splitter for: $path"
            return new GZIPInputStream(result)
        }

        return result
    }

    /**
     * @return The current {@link CollectorStrategy} object
     */
    final protected CollectorStrategy getCollector() { collector }

    /**
     * @return Create a new {@link CollectorStrategy} object. Subclasses must implement a valid
     * strategy
     */
    abstract protected CollectorStrategy createCollector()

}
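
For orientation, here is a minimal hypothetical sketch of a concrete subclass. It assumes that `normalizeSource(obj)` and `process(stream)` — both invoked by `apply()` above but not declared in this excerpt — are the hooks a subclass overrides, and that `CharSequenceCollector` is one of the `CollectorStrategy` implementations in this package. The class itself is not part of the Nextflow code base:

    package nextflow.splitter

    import java.nio.file.Files
    import java.nio.file.Path

    import groovy.transform.CompileStatic

    // hypothetical example: split a text source into single lines
    @CompileStatic
    class EachLineSplitter extends AbstractSplitter {

        @Override
        protected CollectorStrategy createCollector() {
            // accumulate characters until each chunk is complete
            new CharSequenceCollector()
        }

        // assumed hook: turn the raw target object into a readable stream,
        // reusing the inherited gzip handling for file paths
        protected BufferedReader normalizeSource( obj ) {
            if( obj instanceof Path )
                return new BufferedReader(new InputStreamReader(newInputStream((Path)obj)))
            if( obj instanceof CharSequence )
                return new BufferedReader(new StringReader(obj.toString()))
            throw new IllegalArgumentException("Object of type ${obj?.class?.name} cannot be split")
        }

        // assumed hook: iterate the source, emitting one chunk per line
        protected process( BufferedReader reader ) {
            def result = null
            String line
            while( (line = reader.readLine()) != null )
                result = invokeEachClosure(closure, line)
            reader.close()
            return result
        }
    }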



