nextflow.splitter.AbstractTextSplitter.groovy
/*
* Copyright 2013-2024, Seqera Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nextflow.splitter
import java.nio.charset.Charset
import java.nio.charset.CharsetDecoder
import java.nio.file.Path
import groovy.transform.CompileStatic
import groovy.transform.InheritConstructors
import groovy.transform.PackageScope
import groovy.util.logging.Slf4j
import nextflow.Global
import nextflow.Nextflow
import nextflow.exception.AbortOperationException
import nextflow.util.CacheHelper
import nextflow.util.CharsetHelper
import nextflow.splitter.TextFileCollector.CachePath
/**
 * Implements the common text-splitting logic shared by the concrete splitter types
 *
 * @author Paolo Di Tommaso
 */
@Slf4j
@CompileStatic
@InheritConstructors
abstract class AbstractTextSplitter extends AbstractSplitter {

    protected Charset charset = Charset.defaultCharset()

    Charset getCharset() { charset }

    // when true, chunks are collected into files instead of being emitted as in-memory values
    protected boolean fileMode

    // the file or directory where chunk files are collected
    protected Path collectPath

    // the base name used for the collected chunk files
    protected String collectName

    // when true, chunk files are compressed
    protected boolean compress

    // the number of records processed so far
    private long itemsCount

    AbstractTextSplitter options(Map options) {
        super.options(options)

        // the charset used to parse the file
        charset = CharsetHelper.getCharset(options.charset)

        // collect chunks to temporary files
        if( options.file instanceof Boolean )
            fileMode = options.file as Boolean
        else if( options.file instanceof Path ) {
            collectPath = (Path)options.file
            fileMode = true
        }
        else if( options.file instanceof CharSequence ) {
            collectName = options.file.toString()
            fileMode = true
        }

        if( options.compress?.toString() == 'true' )
            compress = options.compress as boolean

        // file mode and record mode cannot be used at the same time
        if( fileMode && recordMode )
            throw new AbortOperationException("Parameters `file` and `record` conflict -- check operator `$operatorName`")

        if( !fileMode && compress )
            throw new AbortOperationException("Parameter `compress` also requires the use of parameter `file: true` -- check operator `$operatorName`")

        return this
    }
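
    /*
     * Usage note (illustrative, not part of the original source): the `file`
     * option accepts a Boolean, a Path, or a name string. For example, in a
     * pipeline script:
     *
     *   Channel.fromPath('sample.fa').splitFasta(by: 100, file: true)
     *   Channel.fromPath('sample.fa').splitFasta(by: 100, file: 'chunks')
     *
     * Both forms enable `fileMode`; the second one also sets the base name
     * used for the collected chunk files.
     */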

    /**
     * @return A map representing the valid options for the splitter. The map keys define the
     *      accepted parameter names, the values the valid types for each of them.
     */
    @Override
    protected Map validOptions() {
        def result = super.validOptions()
        result.charset = [ Charset, Map, String ]
        result.file = [Boolean, Path, CharSequence]
        result.compress = [Boolean]
        return result
    }
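
    /*
     * Illustrative example (not part of the original source): given the
     * declaration above, an options map such as
     *
     *   [by: 100, charset: 'UTF-8', file: true, compress: true]
     *
     * passes validation, while an unknown key or a value of the wrong type
     * is rejected by the validation performed in the parent class.
     */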

    /**
     * Creates a {@link Reader} for the given object.
     *
     * @param obj An instance of {@link Reader}, {@link CharSequence}, {@link Path}, {@link File},
     *      {@link InputStream} or {@code char[]}
     * @return A {@link Reader} for the given object.
     * @throws IllegalArgumentException if the specified object is of a type that is not supported
     */
    protected Reader normalizeSource( obj ) {
        Reader reader = normalizeSource0( obj )
        // detect whether the stream starts with a BOM and, if so, position the reader just after it
        int consume = positionAfterBOM(reader)
        if( consume ) {
            char[] tmp = new char[consume]
            reader.read(tmp)
        }
        return reader
    }

    protected Reader normalizeSource0( obj ) {
        if( obj instanceof Reader )
            return (Reader)obj

        if( obj instanceof CharSequence )
            return new StringReader(obj.toString())

        if( obj instanceof Path )
            return newReader((Path)obj, charset)

        if( obj instanceof InputStream )
            return new InputStreamReader((InputStream)obj, charset)

        if( obj instanceof File )
            return newReader(obj.toPath(), charset)

        if( obj instanceof char[] )
            return new StringReader(new String((char[])obj))

        throw new IllegalArgumentException("Object of class '${obj.class.name}' does not support 'splitter' methods")
    }
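
    /*
     * Illustrative example (not part of the original source): each supported
     * source type is normalized to a plain Reader, e.g.
     *
     *   normalizeSource('foo\nbar\n')            // -> StringReader
     *   normalizeSource(Paths.get('sample.txt')) // -> BufferedReader over the file
     *   normalizeSource(new FileInputStream(f))  // -> InputStreamReader
     */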

    /**
     * Create a new reader object for the given path
     *
     * @param path The file to be read
     * @param charset The charset used to decode the file content
     * @return A buffered reader over the given file
     */
    protected Reader newReader( Path path, Charset charset ) {
        def source = newInputStream(path)
        CharsetDecoder decoder = charset.newDecoder()
        Reader reader = new InputStreamReader(source, decoder)
        return new BufferedReader(reader)
    }

    /**
     * Wrap the target object with a {@link BufferedReader} filter stream
     *
     * @param targetObject The target stream to be split
     * @return A {@link BufferedReader} instance
     */
    protected BufferedReader wrapReader( Reader targetObject ) {
        if( targetObject instanceof BufferedReader )
            return (BufferedReader)targetObject
        else
            return new BufferedReader(targetObject)
    }

    protected boolean isCollectorEnabled() {
        return (counter.isEnabled() || fileMode)
    }

    /**
     * Process the object to split
     *
     * @param targetObject The reader over the object to be split
     * @return The value returned by the last invocation of the user splitting closure
     */
    protected process( Reader targetObject ) {
        def result = null
        BufferedReader reader = wrapReader(targetObject)

        counter.reset() // <-- make sure to start from a clean state
        itemsCount = 0
        try {
            while( true ) {
                // -- parse a record object
                final record = fetchRecord( reader )
                if( record == null )
                    break

                // -- apply the splitting logic for the fetched record
                result = processChunk( record )

                // -- check if the limit of allowed records has been reached
                if( limit>0 && ++itemsCount == limit )
                    break
            }

            // make sure to process the remaining collected entries
            if ( collector && collector.hasChunk() ) {
                result = invokeEachClosure(closure, collector.nextChunk())
            }
        }
        finally {
            reader.closeQuietly()
            if( collector instanceof Closeable )
                collector.closeQuietly()
        }

        return result
    }
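
    /*
     * Behaviour sketch (illustrative, not part of the original source): with
     * a chunk size of two, e.g. `splitText(by: 2)`, a five-line source emits
     * three chunks -- two complete chunks of two lines each, plus a final
     * partial chunk holding the remaining line, which is flushed by the
     * `collector.hasChunk()` check above.
     */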

    /**
     * Add the fetched record to the current collection and emit a new chunk
     *
     * @param record The read record object
     * @return A value returned by the user splitting closure
     */
    protected processChunk( record ) {
        def result = null
        if ( isCollectorEnabled() ) {
            // -- append to the list buffer
            collector.add(record)

            // -- emit the chunk once the configured number of records has been collected
            if( counter.isChunckComplete() ) {
                result = invokeEachClosure(closure, collector.nextChunk())
                counter.reset()
            }
        }
        else {
            result = invokeEachClosure(closure, record)
        }
        return result
    }

    /**
     * @return A {@link CollectorStrategy} object implementing a concrete
     *      strategy according to the user-provided options
     */
    protected CollectorStrategy createCollector() {
        if( !isCollectorEnabled() )
            return null

        if( recordMode )
            return new ObjectListCollector()

        if( fileMode ) {
            def base = getCollectorBaseFile()
            return new TextFileCollector(base, charset, compress)
        }

        return new CharSequenceCollector()
    }

    protected String getCollectFileName() {
        if( collectName ) {
            return multiSplit ? "${collectName}_${elem}" : collectName
        }

        if( sourceFile ) {
            def fileName = sourceFile.getName()
            // drop the trailing `.gz` suffix, if any
            if( fileName.endsWith('.gz') )
                fileName = fileName[0..-4]
            return fileName
        }

        return multiSplit ? "chunk_$elem" : 'chunk'
    }
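
    /*
     * Naming examples (illustrative, not part of the original source):
     *
     *   collectName = 'chunks'           -> 'chunks' (or 'chunks_1', 'chunks_2', ... when multiple sources are split)
     *   sourceFile  = /data/sample.fa.gz -> 'sample.fa' (the `.gz` suffix is dropped)
     *   no name, no source file          -> 'chunk' (or 'chunk_1', 'chunk_2', ...)
     */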

    /**
     * @return A path where file chunks are cached
     */
    @PackageScope
    CachePath getCollectorBaseFile() {
        final fileName = getCollectFileName()
        log.trace "Splitter collector file name: $fileName"

        CachePath result
        if( collectPath ) {
            final file = sourceFile ?: targetObj
            final path = collectPath.isDirectory() ? collectPath.resolve(fileName) : collectPath
            final keys = [Global?.session?.uniqueId, file, getCacheableOptions()]
            final hash = CacheHelper.hasher(keys).hash()
            result = new CachePath(path, hash)
        }
        else if( sourceFile ) {
            final path = Nextflow.cacheableFile( [sourceFile, getCacheableOptions()], fileName )
            result = new CachePath(path)
        }
        else {
            final path = Nextflow.cacheableFile( [targetObj, getCacheableOptions()], fileName )
            result = new CachePath(path)
        }

        log.debug "Splitter `$operatorName` collector path: $result"
        return result
    }

    /**
     * @return A map of the options that contribute to creating the unique caching key
     */
    protected Map getCacheableOptions() {
        def result = new HashMap( fOptionsMap )
        result.remove('into')
        result.remove('each')
        result.charset = charset.toString()
        return result
    }

    /**
     * Read and parse a new record from the reader object
     *
     * @param reader A reader over the object to split
     * @return A logical record read from the underlying stream, or {@code null} when no more records are available
     */
    abstract protected fetchRecord( BufferedReader reader )
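
    /*
     * Implementation sketch (illustrative, not part of the original source):
     * a minimal line-oriented splitter could implement `fetchRecord` as
     *
     *   @Override
     *   protected fetchRecord( BufferedReader reader ) {
     *       // a null return value signals the end of the stream to `process`
     *       reader.readLine()
     *   }
     */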

    /**
     * Detect a byte order mark (BOM) at the beginning of the stream
     *
     * @param reader The reader over the source stream
     * @return The number of characters to skip to position the stream after the BOM, or zero when no BOM is present
     */
    protected int positionAfterBOM( Reader reader ) {
        if( !reader.markSupported() )
            return 0

        // peek at the first characters without consuming them
        char[] beginner = new char[3]
        reader.mark(beginner.length)
        int read = reader.read(beginner)
        reader.reset()

        if( read > 0 ) {
            // a UTF BOM is decoded to a single leading character
            if( beginner[0] == '\ufeff' || beginner[0] == '\ufffe' ) {
                return 1
            }
            // include more BOM types if required
        }
        return 0
    }
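
    /*
     * Usage note (illustrative, not part of the original source): a UTF-8
     * file starting with the byte order mark EF BB BF is decoded to a single
     * leading `\ufeff` character, so `positionAfterBOM` returns 1 and
     * `normalizeSource` skips that character before any record is parsed.
     */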
}