org.archive.modules.seeds.TextSeedModule Maven / Gradle / Ivy
Show all versions of heritrix-modules Show documentation
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.seeds;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.util.Iterator;
import java.util.concurrent.CountDownLatch;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.io.ReadSource;
import org.archive.modules.CrawlURI;
import org.archive.modules.SchedulingConstants;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.spring.WriteTarget;
import org.archive.util.ArchiveUtils;
import org.archive.util.DevUtils;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexLineIterator;
import org.springframework.beans.factory.annotation.Required;
/**
* Module that announces a list of seeds from a text source (such
* as a ConfigFile or ConfigString), and provides a mechanism for
* adding seeds after a crawl has begun.
*
* @author gojomo
*/
public class TextSeedModule extends SeedModule
implements ReadSource {
private static final long serialVersionUID = 3L;
private static final Logger logger =
Logger.getLogger(TextSeedModule.class.getName());
/**
* Text from which to extract seeds
*/
protected ReadSource textSource = null;
public ReadSource getTextSource() {
return textSource;
}
@Required
public void setTextSource(ReadSource seedsSource) {
this.textSource = seedsSource;
}
/**
* Number of lines of seeds-source to read on initial load before proceeding
* with crawl. Default is -1, meaning all. Any other value will cause that
* number of lines to be loaded before fetching begins, while all extra
* lines continue to be processed in the background. Generally, this should
* only be changed when working with very large seed lists, and scopes that
* do *not* depend on reading all seeds.
*/
protected int blockAwaitingSeedLines = -1;
public int getBlockAwaitingSeedLines() {
return blockAwaitingSeedLines;
}
public void setBlockAwaitingSeedLines(int blockAwaitingSeedLines) {
this.blockAwaitingSeedLines = blockAwaitingSeedLines;
}
public TextSeedModule() {
}
/**
* Announce all seeds from configured source to SeedListeners
* (including nonseed lines mixed in).
* @see org.archive.modules.seeds.SeedModule#announceSeeds()
*/
public void announceSeeds() {
if(getBlockAwaitingSeedLines()>-1) {
final CountDownLatch latch = new CountDownLatch(getBlockAwaitingSeedLines());
new Thread(){
@Override
public void run() {
announceSeeds(latch);
while(latch.getCount()>0) {
latch.countDown();
}
}
}.start();
try {
latch.await();
} catch (InterruptedException e) {
// do nothing
}
} else {
announceSeeds(null);
}
}
protected void announceSeeds(CountDownLatch latchOrNull) {
BufferedReader reader = new BufferedReader(textSource.obtainReader());
try {
announceSeedsFromReader(reader,latchOrNull);
} finally {
IOUtils.closeQuietly(reader);
}
}
/**
* Announce all seeds (and nonseed possible-directive lines) from
* the given Reader
* @param reader source of seed/directive lines
* @param latchOrNull if non-null, sent countDown after each line, allowing
* another thread to proceed after a configurable number of lines processed
*/
protected void announceSeedsFromReader(BufferedReader reader, CountDownLatch latchOrNull) {
String s;
Iterator iter =
new RegexLineIterator(
new LineReadingIterator(reader),
RegexLineIterator.COMMENT_LINE,
RegexLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
RegexLineIterator.ENTRY);
int count = 0;
while (iter.hasNext()) {
s = (String) iter.next();
if(Character.isLetterOrDigit(s.charAt(0))) {
// consider a likely URI
seedLine(s);
count++;
if(count%20000==0) {
System.runFinalization();
}
} else {
// report just in case it's a useful directive
nonseedLine(s);
}
if(latchOrNull!=null) {
latchOrNull.countDown();
}
}
publishConcludedSeedBatch();
}
/**
* Handle a read line that is probably a seed.
*
* @param uri String seed-containing line
*/
protected void seedLine(String uri) {
String originalUri = uri;
if (!uri.matches("[a-zA-Z][\\w+\\-]+:.*")) { // Rfc2396 s3.1 scheme,
// minus '.'
// Does not begin with scheme, so try http://
uri = "http://" + uri;
}
try {
UURI uuri = UURIFactory.getInstance(uri);
CrawlURI curi = new CrawlURI(uuri);
curi.setSeed(true);
curi.setSchedulingDirective(SchedulingConstants.MEDIUM);
if (getSourceTagSeeds()) {
curi.setSourceTag(originalUri);
}
publishAddedSeed(curi);
} catch (URIException e) {
// try as nonseed line as fallback
nonseedLine(uri);
}
}
/**
* Handle a read line that is not a seed, but may still have
* meaning to seed-consumers (such as scoping beans).
*/
protected void nonseedLine(String line) {
publishNonSeedLine(line);
}
/**
* Treat the given file as a source of additional seeds,
* announcing to SeedListeners.
*
* @see org.archive.modules.seeds.SeedModule#actOn(java.io.File)
*/
public void actOn(File f) {
BufferedReader reader = null;
try {
reader = ArchiveUtils.getBufferedReader(f);
announceSeedsFromReader(reader, null);
} catch (IOException ioe) {
logger.log(Level.SEVERE,"problem reading seed file "+f,ioe);
} finally {
IOUtils.closeQuietly(reader);
}
}
/**
* Add a new seed to scope. By default, simply appends
* to seeds file, though subclasses may handle differently.
*
* This method is *not* sufficient to get the new seed
* scheduled in the Frontier for crawling -- it only
* affects the Scope's seed record (and decisions which
* flow from seeds).
*
* @param curi CandidateUri to add
*/
@Override
public synchronized void addSeed(final CrawlURI curi) {
if(!(textSource instanceof WriteTarget)) {
// TODO: do something else to log seed update
logger.warning("nowhere to log added seed: "+curi);
} else {
// TODO: determine if this modification to seeds file means
// TextSeedModule should (again) be Checkpointable
try {
Writer fw = ((WriteTarget)textSource).obtainWriter(true);
// Write to new (last) line the URL.
fw.write("\n");
fw.write("# Heritrix added seed " +
((curi.getVia() != null) ? "redirect from " + curi.getVia():
"(JMX)") + ".\n");
fw.write(curi.toString());
fw.flush();
fw.close();
} catch (IOException e) {
DevUtils.warnHandle(e, "problem writing new seed");
}
}
publishAddedSeed(curi);
}
public Reader obtainReader() {
return textSource.obtainReader();
}
}