All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.crawler.reporting.HostsReport Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
 
package org.archive.crawler.reporting;

import java.io.PrintWriter;
import java.util.Collection;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.bdb.DisposableStoredSortedMap;
import org.archive.modules.net.CrawlHost;

/**
 * The "Hosts Report", tallies by host.
 *
 * <p>Writes one header line followed by one line of per-host counters for
 * every host key known to the {@link StatisticsTracker}'s server cache.
 * Two settings tune the output: {@link #setMaxSortSize(int)} and
 * {@link #setSuppressEmptyHosts(boolean)}.
 *
 * @author gojomo
 */
public class HostsReport extends Report {

    private final static Logger logger =
            Logger.getLogger(HostsReport.class.getName());

    /** Column header line written at the top of every report. */
    private static final String HEADER =
            "[#urls] [#bytes] [host] [#robots] [#remaining] [#novel-urls] [#novel-bytes] [#dup-by-hash-urls] [#dup-by-hash-bytes] [#not-modified-urls] [#not-modified-bytes]\n";

    int maxSortSize = -1;

    /**
     * @return the current sort-size limit; see {@link #setMaxSortSize(int)}
     */
    public int getMaxSortSize() {
        return maxSortSize;
    }

    /**
     * The maximum number of hosts allowed in a report while still sorting it.
     * The report is sorted only while the number of hosts is strictly less
     * than this value; once the host count reaches or exceeds it, the report
     * is written unsorted. A negative value signifies no limit (always sort).
     * A value of zero means never sort. Default -1, always sort. This matches
     * the behavior before this parameter was introduced.
     *
     * This value can not be overridden by a sheet. It may be safely edited at runtime.
     *
     * @param maxSortSize the new limit; negative for "always sort", zero for
     *        "never sort"
     */
    public void setMaxSortSize(int maxSortSize) {
        this.maxSortSize = maxSortSize;
    }

    boolean suppressEmptyHosts = false;

    /**
     * @return whether hosts without any successful fetch are left out of the
     *         report; see {@link #setSuppressEmptyHosts(boolean)}
     */
    public boolean isSuppressEmptyHosts() {
        return suppressEmptyHosts;
    }

    /**
     * If true, hosts for whom no URLs have been fetched will be suppressed in this report.
     * Such hosts are recorded when the crawler encounters an URL for a host but has not yet
     * (and may never) processed any URL for the host. This can happen for many reasons,
     * related to scoping and queue budgeting among others.
     * Default behavior is to include these non-crawled hosts.
     *
     * This value can not be overridden by a sheet. It may be safely edited at runtime.
     *
     * @param suppressEmptyHosts true to omit hosts with zero fetch successes
     */
    public void setSuppressEmptyHosts(boolean suppressEmptyHosts) {
        this.suppressEmptyHosts = suppressEmptyHosts;
    }

    /**
     * Writes the header line and then one line per host.
     *
     * <p>When sorting is enabled for the current number of hosts, host names
     * are taken from the reverse-sorted hosts distribution computed by the
     * tracker; otherwise they are taken directly from the server cache's host
     * keys. A failure while tallying a single host is logged at WARNING level
     * and that host is skipped; the remaining hosts are still written.
     *
     * @param writer destination for the report text
     * @param stats tracker supplying the server cache and the sorted
     *        distribution
     */
    @Override
    public void write(final PrintWriter writer, StatisticsTracker stats) {
        DisposableStoredSortedMap<Long, String> hd = null;
        try {
            final Collection<String> keys;
            // The size() call stays behind the short-circuit so it is not
            // evaluated at all when sorting is unconditional (negative limit).
            if (maxSortSize < 0 || maxSortSize > stats.serverCache.hostKeys().size()) {
                hd = stats.calcReverseSortedHostsDistribution();
                // Sorted map: key is -count, value is hostname; only the
                // host names are needed here.
                keys = hd.values();
            } else {
                keys = stats.serverCache.hostKeys();
            }
            writer.print(HEADER);
            for (String key : keys) {
                // key is a host name, used to look the host up in the server cache
                try {
                    CrawlHost host = stats.serverCache.getHostFor(key);
                    long fetchSuccesses = host.getSubstats().getFetchSuccesses();
                    if (!suppressEmptyHosts || fetchSuccesses > 0) {
                        writeReportLine(writer,
                                fetchSuccesses,
                                host.getSubstats().getTotalBytes(),
                                host.fixUpName(),
                                host.getSubstats().getRobotsDenials(),
                                host.getSubstats().getRemaining(),
                                host.getSubstats().getNovelUrls(),
                                host.getSubstats().getNovelBytes(),
                                host.getSubstats().getDupByHashUrls(),
                                host.getSubstats().getDupByHashBytes(),
                                host.getSubstats().getNotModifiedUrls(),
                                host.getSubstats().getNotModifiedBytes());
                    }
                } catch (Exception e) {
                    // Best effort: one bad host must not abort the whole report.
                    logger.log(Level.WARNING, "unable to tally host stats for " + key, e);
                }
            }
        } finally {
            // Dispose the sorted map even if report generation fails part-way.
            if (hd != null) {
                hd.dispose();
            }
        }
    }

    /**
     * Writes a single report line: every field followed by one space, then a
     * newline. The trailing space before the newline is part of the format.
     *
     * @param writer destination for the line
     * @param fields values to print, in column order
     */
    protected void writeReportLine(PrintWriter writer, Object... fields) {
        for (Object field : fields) {
            writer.print(field);
            writer.print(" ");
        }
        writer.print("\n");
    }

    /**
     * @return the file name this report is written to, "hosts-report.txt"
     */
    @Override
    public String getFilename() {
        return "hosts-report.txt";
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy