org.archive.modules.extractor.ContentExtractorTestBase Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-modules Show documentation
Show all versions of heritrix-modules Show documentation
This project contains some of the configurable modules used within the
Heritrix application to crawl the web. The modules in this project can
be used in applications other than Heritrix, however.
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.extractor;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessorTestBase;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.Recorder;
/**
* Abstract base class for unit testing ContentExtractor implementations.
*
* @author pjack
*/
public abstract class ContentExtractorTestBase extends ProcessorTestBase {

    /**
     * The extractor under test; assigned by {@link #setUp()} from
     * {@link #makeExtractor()}.
     */
    protected Extractor extractor;

    /**
     * Sets up the {@link #extractor} by asking the subclass for a new
     * instance via {@link #makeExtractor()}.
     */
    public final void setUp() {
        extractor = makeExtractor();
    }

    /**
     * Supplies the module under test; this is simply a new extractor from
     * {@link #makeExtractor()}.
     *
     * @return a newly made extractor
     */
    @Override
    protected Object makeModule() {
        return makeExtractor();
    }

    /**
     * Subclasses should return an Extractor instance to test.
     *
     * @return an Extractor instance to test
     */
    protected abstract Extractor makeExtractor();

    /**
     * Returns a CrawlURI for testing purposes. The URI is
     * {@code http://www.archive.org/start/} with a
     * {@link LinkContext#NAVLINK_MISC} link context; the two middle
     * constructor arguments are passed as {@code null}.
     *
     * @return a CrawlURI
     * @throws Exception just in case
     */
    protected CrawlURI defaultURI() throws Exception {
        UURI uuri = UURIFactory.getInstance("http://www.archive.org/start/");
        return new CrawlURI(uuri, null, null, LinkContext.NAVLINK_MISC);
    }

    /**
     * Tests that a URI with a zero content length has no links extracted.
     *
     * @throws Exception just in case
     */
    public void testZeroContent() throws Exception {
        CrawlURI uri = defaultURI();
        // Empty content encodes to zero bytes in any charset, so an explicit
        // charset is used here instead of the deprecated platform-default
        // overload.
        Recorder recorder = createRecorder("", "UTF-8");
        uri.setContentType("text/plain");
        uri.setRecorder(recorder);
        extractor.process(uri);
        assertEquals(0, uri.getOutLinks().size());
        assertNoSideEffects(uri);
    }

    /**
     * Tests that a URI on which {@code linkExtractorFinished()} has already
     * been called has no links extracted.
     *
     * @throws Exception just in case
     */
    public void testFinished() throws Exception {
        CrawlURI uri = defaultURI();
        uri.linkExtractorFinished();
        extractor.process(uri);
        assertEquals(0, uri.getOutLinks().size());
        assertNoSideEffects(uri);
    }

    /**
     * Asserts that the given URI has recorded no non-fatal failures and
     * carries no annotations.
     *
     * @param uri the URI to test
     */
    protected static void assertNoSideEffects(CrawlURI uri) {
        assertEquals(0, uri.getNonFatalFailures().size());
        assertTrue(uri.getAnnotations().isEmpty());
    }

    /**
     * Creates a Recorder holding the given content, encoded with the
     * platform default charset.
     *
     * @param content the text to record
     * @return a Recorder that has consumed the encoded content
     * @throws IOException if the content cannot be recorded
     * @deprecated the result depends on the platform default charset; use
     *             {@link #createRecorder(String, String)} with an explicit
     *             charset instead
     */
    @Deprecated
    public static Recorder createRecorder(String content) throws IOException {
        return createRecorder(content, Charset.defaultCharset().name());
    }

    /**
     * Creates a Recorder and feeds it the given content, encoded with the
     * named charset, by reading the encoded bytes to the end through the
     * stream returned by {@code Recorder.inputWrap}.
     *
     * @param content the text to record
     * @param charset the name of the charset used to encode {@code content}
     * @return a Recorder that has consumed the encoded content
     * @throws IOException if the charset is unsupported, the temporary file
     *         cannot be created, or reading through the recorder fails
     */
    public static Recorder createRecorder(String content, String charset)
            throws IOException {
        // Encode first: an unsupported charset name then fails before any
        // temporary file has been created on disk.
        byte[] encoded = content.getBytes(charset);

        File temp = File.createTempFile("test", ".tmp");
        // The file is never removed explicitly by this helper, so make sure
        // repeated test runs do not pile up files in the temp directory.
        temp.deleteOnExit();

        // NOTE(review): the two 1024 arguments are presumably buffer sizes;
        // confirm against the Recorder constructor.
        Recorder recorder = new Recorder(temp, 1024, 1024);
        InputStream is = recorder.inputWrap(new ByteArrayInputStream(encoded));
        try {
            recorder.markContentBegin();
            // Drain the stream; the bytes themselves are not needed here,
            // only the side effect of passing them through the recorder.
            while (is.read() >= 0) {
                // intentionally empty
            }
        } finally {
            // Close even when reading fails so the wrapped stream is released.
            is.close();
        }
        return recorder;
    }
}