All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.modules.seeds.TextSeedModule Maven / Gradle / Ivy

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.seeds;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.util.Iterator;
import java.util.concurrent.CountDownLatch;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.io.ReadSource;
import org.archive.modules.CrawlURI;
import org.archive.modules.SchedulingConstants;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.spring.WriteTarget;
import org.archive.util.ArchiveUtils;
import org.archive.util.DevUtils;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexLineIterator;
import org.springframework.beans.factory.annotation.Required;

/**
 * Module that announces a list of seeds from a text source (such
 * as a ConfigFile or ConfigString), and provides a mechanism for
 * adding seeds after a crawl has begun.
 *
 * @author gojomo
 */
public class TextSeedModule extends SeedModule 
implements ReadSource {
    private static final long serialVersionUID = 3L;

    private static final Logger logger =
        Logger.getLogger(TextSeedModule.class.getName());

    /**
     * Text from which to extract seeds
     */
    protected ReadSource textSource = null;
    public ReadSource getTextSource() {
        return textSource;
    }
    @Required
    public void setTextSource(ReadSource seedsSource) {
        this.textSource = seedsSource;
    }
    
    /**
     * Number of lines of seeds-source to read on initial load before proceeding
     * with crawl. Default is -1, meaning all. Any other value will cause that
     * number of lines to be loaded before fetching begins, while all extra
     * lines continue to be processed in the background. Generally, this should
     * only be changed when working with very large seed lists, and scopes that
     * do *not* depend on reading all seeds. 
     */
    protected int blockAwaitingSeedLines = -1;
    public int getBlockAwaitingSeedLines() {
        return blockAwaitingSeedLines;
    }
    public void setBlockAwaitingSeedLines(int blockAwaitingSeedLines) {
        this.blockAwaitingSeedLines = blockAwaitingSeedLines;
    }

    public TextSeedModule() {
    }

    /**
     * Announce all seeds from configured source to SeedListeners 
     * (including nonseed lines mixed in). 
     * @see org.archive.modules.seeds.SeedModule#announceSeeds()
     */
    public void announceSeeds() {
        if(getBlockAwaitingSeedLines()>-1) {
            final CountDownLatch latch = new CountDownLatch(getBlockAwaitingSeedLines());
            new Thread(){
                @Override
                public void run() {
                    announceSeeds(latch); 
                    while(latch.getCount()>0) {
                        latch.countDown();
                    }
                }
            }.start();
            try {
                latch.await();
            } catch (InterruptedException e) {
                // do nothing
            } 
        } else {
            announceSeeds(null); 
        }
    }
    
    protected void announceSeeds(CountDownLatch latchOrNull) {
        BufferedReader reader = new BufferedReader(textSource.obtainReader());       
        try {
            announceSeedsFromReader(reader,latchOrNull);    
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }
            
    /**
     * Announce all seeds (and nonseed possible-directive lines) from
     * the given Reader
     * @param reader source of seed/directive lines
     * @param latchOrNull if non-null, sent countDown after each line, allowing 
     * another thread to proceed after a configurable number of lines processed
     */
    protected void announceSeedsFromReader(BufferedReader reader, CountDownLatch latchOrNull) {
        String s;
        Iterator iter = 
            new RegexLineIterator(
                    new LineReadingIterator(reader),
                    RegexLineIterator.COMMENT_LINE,
                    RegexLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
                    RegexLineIterator.ENTRY);

        int count = 0; 
        while (iter.hasNext()) {
            s = (String) iter.next();
            if(Character.isLetterOrDigit(s.charAt(0))) {
                // consider a likely URI
                seedLine(s);
                count++;
                if(count%20000==0) {
                    System.runFinalization();
                }
            } else {
                // report just in case it's a useful directive
                nonseedLine(s);
            }
            if(latchOrNull!=null) {
                latchOrNull.countDown(); 
            }
        }
        publishConcludedSeedBatch(); 
    }
    
    /**
     * Handle a read line that is probably a seed.
     * 
     * @param uri String seed-containing line
     */
    protected void seedLine(String uri) {
        String originalUri = uri;
        if (!uri.matches("[a-zA-Z][\\w+\\-]+:.*")) { // Rfc2396 s3.1 scheme,
                                                     // minus '.'
            // Does not begin with scheme, so try http://
            uri = "http://" + uri;
        }
        try {
            UURI uuri = UURIFactory.getInstance(uri);
            CrawlURI curi = new CrawlURI(uuri);
            curi.setSeed(true);
            curi.setSchedulingDirective(SchedulingConstants.MEDIUM);
            if (getSourceTagSeeds()) {
                curi.setSourceTag(originalUri);
            }
            publishAddedSeed(curi);
        } catch (URIException e) {
            // try as nonseed line as fallback
            nonseedLine(uri);
        }
    }
    
    /**
     * Handle a read line that is not a seed, but may still have
     * meaning to seed-consumers (such as scoping beans). 
     */
    protected void nonseedLine(String line) {
        publishNonSeedLine(line);
    }
    
    /**
     * Treat the given file as a source of additional seeds,
     * announcing to SeedListeners.
     * 
     * @see org.archive.modules.seeds.SeedModule#actOn(java.io.File)
     */
    public void actOn(File f) {
        BufferedReader reader = null;
        try {
            reader = ArchiveUtils.getBufferedReader(f);
            announceSeedsFromReader(reader, null);    
        } catch (IOException ioe) {
            logger.log(Level.SEVERE,"problem reading seed file "+f,ioe);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Add a new seed to scope. By default, simply appends
     * to seeds file, though subclasses may handle differently.
     *
     * 

This method is *not* sufficient to get the new seed * scheduled in the Frontier for crawling -- it only * affects the Scope's seed record (and decisions which * flow from seeds). * * @param curi CandidateUri to add */ @Override public synchronized void addSeed(final CrawlURI curi) { if(!(textSource instanceof WriteTarget)) { // TODO: do something else to log seed update logger.warning("nowhere to log added seed: "+curi); } else { // TODO: determine if this modification to seeds file means // TextSeedModule should (again) be Checkpointable try { Writer fw = ((WriteTarget)textSource).obtainWriter(true); // Write to new (last) line the URL. fw.write("\n"); fw.write("# Heritrix added seed " + ((curi.getVia() != null) ? "redirect from " + curi.getVia(): "(JMX)") + ".\n"); fw.write(curi.toString()); fw.flush(); fw.close(); } catch (IOException e) { DevUtils.warnHandle(e, "problem writing new seed"); } } publishAddedSeed(curi); } public Reader obtainReader() { return textSource.obtainReader(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy