com.github.goldin.plugins.gradle.crawler.CrawlerTask.groovy

package com.github.goldin.plugins.gradle.crawler

import static com.github.goldin.plugins.gradle.crawler.CrawlerConstants.*
import com.github.goldin.plugins.gradle.common.BaseTask
import com.github.goldin.plugins.gradle.common.HttpResponse
import org.gcontracts.annotations.Ensures
import org.gcontracts.annotations.Requires
import org.gradle.api.GradleException
import org.gradle.api.logging.LogLevel
import java.util.concurrent.*
import java.util.concurrent.atomic.AtomicLong


/**
 * {@link CrawlerPlugin} task.
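 *
 * A hypothetical configuration sketch (the {@code crawler} extension block name is assumed
 * here; the property names are the ones verified in {@link #verifyUpdateExtension}):
 * <pre>
 * crawler {
 *     baseUrl        = 'example.com'
 *     userAgent      = 'Mozilla/5.0 (compatible; link checker)'
 *     threadPoolSize = 5
 *     rootLinks      = [ 'http://example.com/' ]
 * }
 * </pre>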
 */
class CrawlerTask extends BaseTask
{
    @Override
    Class extensionType (){ CrawlerExtension }


    private final Queue<Future> futures         = new ConcurrentLinkedQueue<Future>()
    private final AtomicLong    bytesDownloaded = new AtomicLong( 0L )
    private final AtomicLong    linksProcessed  = new AtomicLong( 0L )

    private volatile boolean            crawlingAborted = false // If ever set to true - crawling process is aborted immediately
    private          ThreadPoolExecutor threadPool
    private          LinksStorage       linksStorage


    /**
     * Verifies {@link CrawlerExtension} contains proper settings and updates it with additional properties.
     */
    @Override
    void verifyUpdateExtension ( String description )
    {
        assert ( ! ext.rootUrl             ), "'rootUrl' should not be used in $description - private area"
        assert ( ! ext.internalLinkPattern ), "'internalLinkPattern' should not be used in $description - private area"

        ext.baseUrl             = ext.baseUrl?.trim()?.replace( '\\', '/' )?.replaceAll( '^.+?:/+', '' ) // Protocol part removed
        ext.rootUrl             = ext.baseUrl?.replaceAll( '/.*', '' )                                   // Path part removed
        ext.internalLinkPattern = ~/(?:('|")|>)(https?:\/\/\Q${ ext.baseUrl }\E.*?)(?:\1|<)/
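        // For baseUrl 'example.com' the pattern matches internal links that are either quoted,
        // e.g. href="http://example.com/page", or enclosed in tag text, e.g. >http://example.com/page<.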

        assert ext.baseUrl, "'baseUrl' should be defined in $description"
        assert ext.rootUrl && ( ! ext.rootUrl.endsWith( '/' )) && ext.internalLinkPattern

        assert ext.userAgent,                 "'userAgent' should be defined in $description"
        assert ext.threadPoolSize       >  0, "'threadPoolSize' [${ ext.threadPoolSize }] in $description should be positive"
        assert ext.checksumsChunkSize   >  0, "'checksumsChunkSize' [${ ext.checksumsChunkSize }] in $description should be positive"
        assert ext.futuresPollingPeriod >  0, "'futuresPollingPeriod' [${ ext.futuresPollingPeriod }] in $description should be positive"
        assert ext.connectTimeout       > -1, "'connectTimeout' [${ ext.connectTimeout }] in $description should not be negative"
        assert ext.readTimeout          > -1, "'readTimeout' [${ ext.readTimeout }] in $description should not be negative"
        assert ext.retries              > -1, "'retries' [${ ext.retries }] in $description should not be negative"
        assert ext.retryDelay           > -1, "'retryDelay' [${ ext.retryDelay }] in $description should not be negative"
        assert ext.requestDelay         > -1, "'requestDelay' [${ ext.requestDelay }] in $description should not be negative"

        assert ext.retryStatusCodes.every { it }, "'retryStatusCodes' should not contain nulls in $description"
        assert ext.retryExceptions. every { it }, "'retryExceptions' should not contain nulls in $description"

        ext.rootLinks = ( ext.rootLinks?.grep()?.toSet() ?: [ "http://$ext.baseUrl" ]).collect {
            String rootLink ->
            final isGoodEnough = rootLink && rootLink.with { startsWith( 'http://' ) || startsWith( 'https://' )}
            final noSlash      = (( ! rootLink ) || ext.baseUrl.endsWith( '/' ) || rootLink.startsWith( '/' ))
            isGoodEnough ? rootLink : "http://${ ext.baseUrl }${ noSlash ? '' : '/' }${ rootLink ?: '' }"
        }
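        // E.g. with baseUrl 'example.com': 'docs/index.html' becomes 'http://example.com/docs/index.html',
        // 'http://example.com/' is kept as-is, and an empty rootLinks falls back to 'http://example.com'.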

        assert ext.rootLinks && ext.rootLinks.every{ it }
    }


    @Override
    void taskAction ()
    {
        this.threadPool   = Executors.newFixedThreadPool( ext.threadPoolSize ) as ThreadPoolExecutor
        this.linksStorage = new LinksStorage( ext )

        if ( ext.log ){ delete( ext.log )}

        printStartBanner()
        submitRootLinks()
        waitForIdle()

        printFinishReport()
        writeLinksMapFiles()
        archiveLogFiles()
        checkIfBuildShouldFail()
    }


    @Requires({ ch && input && alternative })
    @Ensures({ result })
    String removeAllAfter( String ch, String input, String alternative )
    {
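        // E.g. removeAllAfter( '?', 'a/b?x=1', 'a/b?x=1' ) => 'a/b';
        // the alternative is returned when ch is missing or found at index zero.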
        final j = input.indexOf( ch )
        ( j > 0 ? new String( input.substring( 0, j )) : alternative )
    }


    /**
     * Logs message returned by the closure provided.
     *
     * @param logLevel           message log level
     * @param error              error thrown
     * @param logMessageCallback closure returning message text
     */
    @Requires({ logLevel && logMessageCallback })
    void crawlerLog ( LogLevel logLevel = LogLevel.INFO, Throwable error = null, Closure logMessageCallback )
    {
        String logText = log( logLevel, error, logMessageCallback )

        if ( ext.log )
        {
            logText = logText ?: logMessageCallback()
            assert logText

            ext.log.append( logText + '\n' )

            if ( error )
            {
                final os = new ByteArrayOutputStream()
                error.printStackTrace( new PrintWriter( os, true ))
                ext.log.append( os.toString() + '\n' )
            }
        }
    }


    /**
     * Prints startup banner.
     */
    void printStartBanner ()
    {
        crawlerLog {
            final ipAddress     = (( ext.rootUrl ==~ /^\d+\.\d+\.\d+\.\d+$/ ) ? '' : " (${ InetAddress.getByName( ext.rootUrl.replaceFirst( /:\d+$/, '' )).hostAddress })" )
            final bannerMessage = "Checking [$ext.baseUrl]${ ipAddress } links with [${ ext.threadPoolSize }] thread${ s( ext.threadPoolSize ) }"
            final bannerLine    = "-" * ( bannerMessage.size() + 2 )
            final os            = new ByteArrayOutputStream()
            final writer        = new PrintWriter( os, true )

            writer.println( bannerLine )
            writer.println( " $bannerMessage" )
            writer.println( " Root link${ s( ext.rootLinks )}:" )
            ext.rootLinks.sort().each { writer.println( " * [$it]" )}
            writer.println( bannerLine )

            os.toString()
        }
    }


    /**
     * Submits root links for checking and starts the crawling process.
     */
    void submitRootLinks ()
    {
        for ( link in linksStorage.addLinksToProcess( '', ext.rootLinks ).sort())
        {
            final String pageUrl = ( ext.linkTransformers ?: [] ).inject( link ){ String l, Closure c -> c( l )}
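            // Each linkTransformer is a closure of type String => String, applied in order;
            // e.g. a hypothetical transformer { it.replace( ':8080', '' )} would strip a port number.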
            futures << threadPool.submit({ checkLinks( pageUrl, 'Root link', true, 0 )} as Runnable )
        }
    }


    /**
     * Blocks until there is no more activity in the thread pool, meaning all links are checked.
     */
    void waitForIdle ()
    {
        while (( ! crawlingAborted ) && futures.any{ ! it.done } )
        {
            sleep( ext.futuresPollingPeriod )
            futures.removeAll { it.done }

            if ( ext.teamcityMessages )
            {
                final processed = linksProcessed.get()
                final queued    = threadPool.queue.size()
                final broken    = linksStorage.brokenLinksNumber()
                logTeamCityProgressMessage( "$processed link${ s( processed ) } processed, $broken broken, $queued queued" )
            }
        }

        threadPool.queue.clear()
        futures.clear()
        linksStorage.lock()
        threadPool.shutdown()
        threadPool.awaitTermination( 30L, TimeUnit.SECONDS )
    }


    @Requires({ message })
    void logTeamCityProgressMessage ( String message )
    {
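        // Emits a TeamCity service message, e.g.
        // ##teamcity[progressMessage '10 links processed, 0 broken, 4 queued']
        // (single quotes in the message are escaped as |').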
        log( LogLevel.WARN ){ "##teamcity[progressMessage '${ message.replace( "'", "|'" ) }']" }
    }


    @Requires({ status && message })
    void logTeamCityBuildStatusMessage ( String status, String message )
    {
        log( LogLevel.WARN ){ "##teamcity[buildStatus status='${ status.replace( "'", "|'" ) }' text='${ message.replace( "'", "|'" ) }']" }
    }


    /**
     * Prints finish report after all links are checked.
     */
    void printFinishReport ()
    {
        if ( ext.teamcityMessages ) { logTeamCityProgressMessage( 'Writing report' )}

        final processedLinks = linksProcessed.get()
        final brokenLinks    = linksStorage.brokenLinksNumber()
        final isSuccess      = ( ! brokenLinks ) && ( ! crawlingAborted )
        final mbDownloaded   = ( long )( bytesDownloaded.get() / ( 1024 * 1024 ))
        final kbDownloaded   = ( long )( bytesDownloaded.get() / ( 1024 ))
        final downloaded     = "[${ mbDownloaded ?: kbDownloaded }] ${ mbDownloaded ? 'Mb' : 'Kb' } downloaded"
        final logLevel       = ( brokenLinks ? LogLevel.ERROR : ext.displaySummary ? LogLevel.WARN : LogLevel.INFO )


        crawlerLog( logLevel ){ "\n\n[$processedLinks] link${ s( processedLinks ) } processed in " +
                                "${( long )(( System.currentTimeMillis() - startTime ) / 1000 )} sec, $downloaded" +
                                ( ext.displayLinks ? ':' : '' )}

        if ( ext.displayLinks )
        {
            final processedLinksLines = toMultiLines( linksStorage.processedLinks())
            crawlerLog( logLevel ){ processedLinksLines }
        }

        crawlerLog( logLevel ){ "\n[$brokenLinks] broken link${ s( brokenLinks ) } found${ brokenLinks ? ':\n' : isSuccess ? ' - thumbs up!' : '' }" }

        if ( brokenLinks )
        {
            final joinLines = { Collection c, String delim = '' -> '\n\n[' + c.join( "]\n$delim[" ) + ']\n\n' }

            for ( brokenLink in linksStorage.brokenLinks().sort())
            {
                final referrers   = linksStorage.brokenLinkReferrers( brokenLink )
                final linkMessage =
                    "[$brokenLink]\n\n" +
                    ( ext.displayLinksPath ? "Path:${ joinLines( linksStorage.linkPath( brokenLink ), '=>\n' )}" : '' ) +
                    ( "Referred by [${ referrers.size()}] resource${ s( referrers ) }:${ joinLines( referrers ) }" )

                crawlerLog( logLevel ){ "- ${ linkMessage.readLines().join( '\n  ' )}\n" }
            }
        }

        if ( ext.teamcityMessages )
        {
            final status  = (( crawlingAborted || ( ext.failOnBrokenLinks && brokenLinks )) ? 'FAILURE' : 'SUCCESS' )
            final message = "$processedLinks link${ s( processedLinks )}, $brokenLinks broken${ crawlingAborted ?  ', crawling aborted' : '' }"

            logTeamCityBuildStatusMessage( status, message )
        }
    }


    /**
     * Writes "links map" files.
     */
    void writeLinksMapFiles ()
    {
        final print = {
            File file, Map<String, Collection<String>> linksMap, String title ->

            assert file && ( linksMap != null ) && title

            final linksMapReport = linksMap.keySet().
                                   collect { String pageUrl -> "[$pageUrl]:\n${ toMultiLines( linksMap[ pageUrl ] ) }" }.
                                   join( '\n' )

            write( file, linksMapReport )

            crawlerLog {
                "$title is written to [${ file.canonicalPath }], [${ linksMap.size() }] entr${ s( linksMap.size(), 'y', 'ies' )}"
            }
        }

        if ( ext.linksMapFile    ) { print( ext.linksMapFile,    linksStorage.linksMap(),    'Links map'     )}
        if ( ext.newLinksMapFile ) { print( ext.newLinksMapFile, linksStorage.newLinksMap(), 'New links map' )}
    }


    /**
     * Archives log files if needed.
     */
    void archiveLogFiles()
    {
        final logFiles = [ ext.log, ext.linksMapFile, ext.newLinksMapFile ].grep()

        if ( ext.zipLogFiles && logFiles )
        {
            logFiles.each { zip( it ); delete( it )}
        }
    }


    /**
     * Checks if build should fail and fails it if required.
     */
    void checkIfBuildShouldFail()
    {
        final brokenLinks = linksStorage.brokenLinksNumber()

        if ( crawlingAborted )
        {
            throw new GradleException(
                'Crawling process was aborted, see above for more details' )
        }

        if ( ext.failOnBrokenLinks && brokenLinks )
        {
            throw new GradleException(
                "[$brokenLinks] broken link${ s( brokenLinks )} found, see above for more details" )
        }

        if ( linksProcessed.get() < ext.minimumLinks )
        {
            throw new GradleException(
                "Only [$linksProcessed] link${ s( linksProcessed.get())} checked, " +
                "[${ ext.minimumLinks }] link${ s( ext.minimumLinks )} at least required." )
        }

        if ( bytesDownloaded.get() < ext.minimumBytes )
        {
            throw new GradleException(
                "Only [$bytesDownloaded] byte${ s( bytesDownloaded.get())} downloaded, " +
                "[${ ext.minimumBytes }] byte${ s( ext.minimumBytes )} at least required." )
        }
    }


    /**
     * Invoked in a thread pool worker - checks links in the page specified.
     *
     * @param pageUrl     URL of the page whose links are checked
     * @param referrerUrl URL of another page referring to the one being checked
     * @param isRootLink  whether the URL submitted is a root link
     * @param pageDepth   current page depth
     */
    @SuppressWarnings([ 'GroovyMultipleReturnPointsPerMethod' ])
    @Requires({ pageUrl && referrerUrl && ( pageDepth > -1 ) && linksStorage && threadPool })
    void checkLinks ( String pageUrl, String referrerUrl, boolean isRootLink, int pageDepth )
    {
        if ( crawlingAborted ) { return }

        assert (( ext.maxDepth < 0 ) || ( pageDepth <= ext.maxDepth ))
        delay  ( ext.requestDelay )

        try
        {
            final response = readResponse( pageUrl, referrerUrl, isRootLink )

            if ( response.isRedirect )
            {
                final  actualUrlList = filterTransformLinks([ response.actualUrl ]) // List of one element, transformed link
                assert actualUrlList.size().with {( delegate == 0 ) || ( delegate == 1 )}
                if ( actualUrlList && linksStorage.addLinksToProcess( pageUrl, actualUrlList ))
                {
                    checkLinks( actualUrlList.first(), referrerUrl, isRootLink, pageDepth )
                }

                return
            }

            assert pageUrl == response.actualUrl
            final processed = linksProcessed.incrementAndGet()

            if ( ! response.content ){ return }

            final pageContent        = response.asString()
            final pageIgnored        = ( ext.ignoredContent ?: [] ).any { it ( pageUrl, pageContent )}
            final verificationPassed = ( ext.verifyContent  ? verificationPassed( pageUrl, pageContent, ext.verifyContent ) : true )

            if ( ! verificationPassed )
            {
                abortCrawling( "Verification of [$pageUrl] has failed" )
                return
            }

            if ( pageIgnored ){ return }
            if ( pageDepth == ext.maxDepth ){ return }

            final List<String> pageLinks = readLinks( pageUrl, pageContent )
            final List<String> newLinks  = ( pageLinks ? linksStorage.addLinksToProcess( pageUrl, pageLinks ) : [] )
            final queued                 = threadPool.queue.size()

            linksStorage.updateBrokenLinkReferrers( pageUrl, pageLinks )

            if ( ext.linksMapFile    && pageLinks ) { linksStorage.updateLinksMap   ( pageUrl, pageLinks )}
            if ( ext.newLinksMapFile && newLinks  ) { linksStorage.updateNewLinksMap( pageUrl, newLinks  )}

            crawlerLog {
                final linksMessage    = pageLinks ? ", ${ newLinks.size() } new"     : ''
                final newLinksMessage = newLinks  ? ": ${ toMultiLines( newLinks )}" : ''

                "[$pageUrl] - depth [$pageDepth], ${ pageLinks.size() } link${ s( pageLinks ) } found$linksMessage, " +
                "$processed processed, $queued queued$newLinksMessage"
            }

            for ( link in newLinks )
            {
                final String linkUrl = link // Otherwise, various invocations share the same "link" instance when invoked
                futures << threadPool.submit({ checkLinks( linkUrl, pageUrl, false, pageDepth + 1 )} as Runnable )
            }
        }
        catch( Throwable error )
        {
            final message = "Unexpected error while reading [$pageUrl], referrer [$referrerUrl]"
            if ( ext.failOnFailure ) { abortCrawling( message, error ) }
            else                     { crawlerLog( LogLevel.ERROR, error ){ message }}
        }
    }


    @Requires({ errorMessage })
    void abortCrawling ( String errorMessage, Throwable error = null )
    {
        crawlingAborted = true
        log( LogLevel.ERROR, error ){ "! $errorMessage, aborting the crawling process" }
        threadPool.shutdownNow()
    }


    /**
     * Invokes the verifiers for the page specified and determines whether any of them returns {@code false} or throws.
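     * A verifier is a closure invoked as {@code verifier( pageUrl, pageContent )};
     * a {@code null} result is treated as success.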
     *
     * @param pageUrl     url of the page being checked
     * @param pageContent content of the page being checked
     * @param verifiers   verifiers to invoke
     * @return true if all verifiers returned true when invoked, false otherwise
     */
    @Requires({ pageUrl && pageContent && verifiers })
    boolean verificationPassed( String pageUrl, String pageContent, List<Closure> verifiers )
    {
        try
        {
            verifiers.every { final Object result = it( pageUrl, pageContent ); (( result == null ) || ( result )) }
        }
        catch ( Throwable error )
        {
            log( LogLevel.ERROR, error ){ "Error thrown while verifying [$pageUrl]" }
            false
        }
    }


    /**
     * Reads all hyperlinks in the content specified.
     *
     * @param pageUrl     URL of the page the content belongs to
     * @param pageContent content of the page downloaded previously
     * @return all links found in the page content
     */
    @Requires({ pageUrl && pageContent })
    @Ensures({ result != null })
    List<String> readLinks ( String pageUrl, String pageContent )
    {
        String cleanContent = (( String )( ext.pageTransformers ?: [] ).inject( pageContent ){
            String content, Closure transformer -> transformer( pageUrl, content )
        }).replace( '\\', '/' )

        if ( ext.replaceSpecialCharacters )
        {
            cleanContent = cleanContent.replace( '%3A',    ':' ).
                                        replace( '%2F',    '/' ).
                                        replace( '&lt;',   '<' ).
                                        replace( '&gt;',   '>' ).
                                        replace( '&quot;', '"' ).
                                        replace( '&amp;amp;', '&amp;' ).
                                        replace( '&amp;',  '&' )
        }

        if ( ext.removeHtmlComments )
        {
            cleanContent = cleanContent.replaceAll( htmlCommentPattern, '' )
        }

        final List<String> links = findAll( cleanContent, ext.internalLinkPattern, 2 )

        if ( ext.checkExternalLinks ) {
            final  externalLinks = findAll ( cleanContent, externalLinkPattern, 2 )
            assert externalLinks.every { it.startsWith( 'http://' ) || it.startsWith( 'https://' ) }

            links.addAll( externalLinks )
        }

        if ( ext.checkAbsoluteLinks ) {
            final  absoluteLinks = findAll ( cleanContent, absoluteLinkPattern, 2 )
            assert absoluteLinks.every{ it.startsWith( '/' ) }

            links.addAll( absoluteLinks.collect{( it.startsWith( '//' ) ? "http://${ it.replaceAll( slashesPattern, '' )}" :
                                                                          "http://$ext.rootUrl$it" ).toString() })
        }

        if ( ext.checkRelativeLinks ) {

            final pageBaseUrl    = pageUrl.replaceFirst( relativeLinkReminderPattern, '' )
            final requestBaseUrl = removeAllAfter( '?', pageUrl, pageBaseUrl )
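            // Assuming relativeLinkReminderPattern strips the trailing path segment: for pageUrl
            // 'http://host/docs/page.html?v=1', pageBaseUrl is 'http://host/docs' and requestBaseUrl is
            // 'http://host/docs/page.html', so 'a.html' => 'http://host/docs/a.html' and
            // '?v=2' => 'http://host/docs/page.html?v=2'.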
            final relativeLinks  = findAll ( cleanContent, relativeLinkPattern, 2 )
            assert ( ! pageBaseUrl.endsWith( '/' )) && ( ! requestBaseUrl.endsWith( '?' )) && relativeLinks.every { ! it.startsWith( '/' )}

            links.addAll( relativeLinks.collect{( it.startsWith( '?' ) ? "$requestBaseUrl$it" : "$pageBaseUrl/$it" ).toString() })
        }

        assert links.every{ it }
        filterTransformLinks( links )
    }


    @Requires({ links != null })
    @Ensures({ result != null })
    List<String> filterTransformLinks ( Collection<String> links )
    {
        ( List<String> ) links.collect { normalizeUrl( removeAllAfter( '#', it, it )) }.
                               toSet().
                               findAll { String link -> ( ! ( ext.ignoredLinks ?: [] ).any { it( link ) }) }.
                               collect { String link -> ( ext.linkTransformers ?: [] ).inject( link ){ String l, Closure c -> c( l ) }}
    }


    @Requires({ pageUrl })
    @Ensures({ result })
    String normalizeUrl( String pageUrl )
    {
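        // E.g. 'http://example.com/a/../b' => 'http://example.com/b';
        // the URL is returned unchanged when it cannot be parsed.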
        try              { return pageUrl.toURI().normalize().toURL().toString() }
        catch ( ignored ){ return pageUrl }
    }


    /**
     * Retrieves {@code byte[]} content of the link specified.
     *
     * @param pageUrl         URL of a link to read
     * @param referrerUrl     URL of link referrer
     * @param forceGetRequest whether a link should be GET-requested regardless of its type
     * @param attempt         Number of the current attempt, starts from 1
     *
     * @return response data container
     */
    @Requires({ pageUrl && referrerUrl && linksStorage && ( attempt > 0 ) })
    @Ensures({ result })
    CrawlerHttpResponse readResponse ( final String  pageUrl,
                                       final String  referrerUrl,
                                       final boolean forceGetRequest,
                                       final int     attempt = 1 )
    {
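        // A link counts as HTML unless its extension is listed as non-HTML (htmlExtensions wins
        // on conflict) or one of the nonHtmlLinks closures claims it.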
        final        htmlLink          = ( ! pageUrl.toLowerCase().with{ ( ext.nonHtmlExtensions - ext.htmlExtensions ).any{ endsWith( ".$it" ) || contains( ".$it?" ) }} ) &&
                                         ( ! ( ext.nonHtmlLinks ?: [] ).any{ it( pageUrl ) })
        final        readFullContent   = ( htmlLink && pageUrl.with { startsWith( "http://${ ext.baseUrl  }" ) ||
                                                                      startsWith( "https://${ ext.baseUrl }" ) })
        final        isHeadRequest     = (( ! forceGetRequest ) && ( ! readFullContent ))
        final        requestMethod     = ( isHeadRequest ? 'HEAD' : 'GET' )
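        // HEAD is enough for links that only need an existence check (non-HTML or external);
        // internal HTML pages are GET-requested in full so their links can be extracted.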
        final        linksStorageLocal = linksStorage // So that the closure that follows can access it
        final        crawlerResponse   = { HttpResponse r -> new CrawlerHttpResponse( r, referrerUrl, linksStorageLocal, attempt )}
        CrawlerHttpResponse response   = crawlerResponse( new HttpResponse( pageUrl, requestMethod ))

        try
        {
            final t  = System.currentTimeMillis()
            crawlerLog{ "[$pageUrl] - sending $requestMethod request .." }
            response = crawlerResponse ( httpRequest( pageUrl,
                                                      requestMethod,
                                                      [ 'User-Agent' : ext.userAgent, 'Connection': 'keep-alive' ],
                                                      ext.connectTimeout,
                                                      ext.readTimeout,
                                                      true, true, null, null,
                                                      { HttpResponse r -> ( readFullContent && ( ! r.isRedirect ))}))

            if (( response.data != null ) && ( response.content != null ))
            {   // Response was read, but it can be empty
                final responseSize         = response.data.length
                final contentSize          = response.content.length
                final totalBytesDownloaded = bytesDownloaded.addAndGet( responseSize )

                crawlerLog {
                    "[$pageUrl] - [$responseSize${ ( responseSize != contentSize ) ? ' => ' + contentSize : '' }] " +
                    "byte${ s( Math.max( responseSize, contentSize )) }, [${ System.currentTimeMillis() - t }] ms"
                }

                checkDownloadLimits( pageUrl, responseSize, totalBytesDownloaded )
            }
            else
            {   // Response wasn't read
                assert response.inputStream
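                // One probe byte is counted as downloaded unless this was a HEAD request,
                // a redirect, or the stream turned out to be empty.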
                bytesDownloaded.addAndGet(( isHeadRequest || response.isRedirect || ( response.inputStream.read() == -1 )) ? 0 : 1 )
                response.inputStream.close()

                crawlerLog {
                    "[$pageUrl] - " +
                    ( response.isRedirect ? "redirected to [$response.actualUrl], " : 'can be read, ' ) +
                    "[${ System.currentTimeMillis() - t }] ms"
                }
            }

            response
        }
        catch ( Throwable error )
        {
            handleError( response, error )
        }
    }


    @Requires({ pageUrl && ( responseSize >= 0 ) && ( totalBytesDownloaded >= 0 ) })
    void checkDownloadLimits( String pageUrl, long responseSize, long totalBytesDownloaded )
    {
        if (( ext.pageDownloadLimit > 0 ) && ( responseSize > ext.pageDownloadLimit ))
        {
            abortCrawling( "[$pageUrl] - response size of [$responseSize] byte${ s( responseSize ) } " +
                           "exceeds the per page download limit of [$ext.pageDownloadLimit] byte${ s( ext.pageDownloadLimit ) }" )
            return
        }

        if (( ext.totalDownloadLimit > 0 ) && ( totalBytesDownloaded > ext.totalDownloadLimit ))
        {
            abortCrawling( "Total amount of bytes download [$totalBytesDownloaded] " +
                           "exceeds the total download limit of [$ext.totalDownloadLimit] byte${ s( ext.totalDownloadLimit ) }" )
        }
    }


    /**
     * Handles the error thrown while reading the response.
     *
     * @param response response data container
     * @param error error thrown
     * @return new response data (if request was retried) or the same instance that was specified
     */
    @Requires({ response && error })
    @Ensures({ result })
    CrawlerHttpResponse handleError ( CrawlerHttpResponse response, Throwable error )
    {
        response.with {
            final isRetryMatch = ( ext.retryStatusCodes?.any { it == statusCode } ||
                                   ext.retryExceptions?. any { it.isInstance( error ) || it.isInstance( statusCode ) })
            final isRetry      = ( isHeadRequest || ( isRetryMatch && ( attempt < ext.retries )))
            final isAttempt    = (( ! isHeadRequest ) && ( ext.retries > 1 ) && ( isRetryMatch ))
            String errorMessage = "! [$actualUrl] - $error, status code [${ ( statusCode instanceof Integer ) ? statusCode : 'unknown' }]"

            if ( isRetry )
            {
                assert ( isHeadRequest || isAttempt )
                crawlerLog { "$errorMessage, ${ isHeadRequest ? 'will be retried as GET request' : 'attempt ' + attempt }" }

                delay( ext.retryDelay )
                readResponse( actualUrl, referrerUrl, true, isHeadRequest ? 1 : attempt + 1 )
            }
            else
            {
                errorMessage = "$errorMessage${ isAttempt ? ', attempt ' + attempt : '' }"

                if (( ext.ignoredBrokenLinks ?: [] ).any{ it ( actualUrl )})
                {
                    crawlerLog{ "$errorMessage, not registered as broken link - filtered out by ignoredBrokenLinks" }
                }
                else
                {
                    crawlerLog{ "$errorMessage, registered as broken link" }
                    linksStorage.addBrokenLink( originalUrl, referrerUrl )
                }

                response
            }
        }
    }


    /**
     * Converts collection specified to multi-line String.
     * @param c Collection to convert.
     * @param delimiter Delimiter to use on every line.
     *
     * @return collection specified converted to multi-line String
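     * For example, {@code toMultiLines([ 'b', 'a' ])} yields {@code '\n* [a]\n* [b]\n'}.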
     */
    @Requires({ c != null })
    @Ensures({ result })
    String toMultiLines( Collection c, String delimiter = '*' ){ "\n$delimiter [${ c.sort().join( "]\n$delimiter [" ) }]\n" }
}



