All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.modules.net.CrawlHost Maven / Gradle / Ivy

Go to download

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.

There is a newer version: 3.5.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.net;

import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.net.Inet4Address;
import java.net.InetAddress;
import java.net.URLEncoder;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.bdb.AutoKryo;
import org.archive.modules.fetcher.FetchStats;
import org.archive.util.IdentityCacheable;
import org.archive.util.InetAddressUtil;
import org.archive.util.ObjectIdentityCache;

import com.esotericsoftware.kryo.Serializer;

/** 
 * Represents a single remote "host".
 *
 * An host is a name for which there is a dns record or an IP-address. This
 * might be a machine or a virtual host.
 *
 * @author gojomo
 */
public class CrawlHost implements Serializable, FetchStats.HasFetchStats, IdentityCacheable {

    private static final long serialVersionUID = -5494573967890942895L;

    private static final Logger logger = Logger.getLogger(CrawlHost.class.getName());
    /** Flag value indicating always-valid IP */
    public static final long IP_NEVER_EXPIRES = -1;
    /** Flag value indicating an IP has not yet been looked up */
    public static final long IP_NEVER_LOOKED_UP = -2;
    private String hostname;
    private String countryCode;
    private InetAddress ip;
    private long ipFetched = IP_NEVER_LOOKED_UP;
    protected FetchStats substats = new FetchStats(); 
    /**
     * TTL gotten from dns record.
     *
     * From rfc2035:
     * 
     * TTL       a 32 bit unsigned integer that specifies the time
     *           interval (in seconds) that the resource record may be
     *           cached before it should be discarded.  Zero values are
     *           interpreted to mean that the RR can only be used for the
     *           transaction in progress, and should not be cached.
     * 
*/ private long ipTTL = IP_NEVER_LOOKED_UP; // Used when bandwith constraint are used private long earliestNextURIEmitTime = 0; /** * Create a new CrawlHost object. * * @param hostname the host name for this host. */ public CrawlHost(String hostname) { this(hostname, null); } /** * Create a new CrawlHost object. * * @param hostname the host name for this host. * @param countryCode the country code for this host. */ public CrawlHost(String hostname, String countryCode) { this.hostname = hostname; this.countryCode = countryCode; InetAddress tmp = InetAddressUtil.getIPHostAddress(hostname); if (tmp != null) { setIP(tmp, IP_NEVER_EXPIRES); } } /** Return true if the IP for this host has been looked up. * * Returns true even if the lookup failed. * * @return true if the IP for this host has been looked up. */ public boolean hasBeenLookedUp() { return ipFetched != IP_NEVER_LOOKED_UP; } /** * Set the IP address for this host. * * @param address * @param ttl the TTL from the dns record in seconds or -1 if it should live * forever (is a numeric IP). */ public void setIP(InetAddress address, long ttl) { this.ip = address; // Assume that a lookup as occurred by the time // a caller decides to set this (even to null) this.ipFetched = System.currentTimeMillis(); this.ipTTL = ttl; if (logger.isLoggable(Level.FINE)) { logger.fine(hostname + ": " + ((address != null)? address.toString(): "null")); } } /** Get the IP address for this host. * * @return the IP address for this host. */ public InetAddress getIP() { return ip; } /** Get the time when the IP address for this host was last looked up. * * @return the time when the IP address for this host was last looked up. */ public long getIpFetched() { return ipFetched; } /** * Get the TTL value from the dns record for this host. * * @return the TTL value from the dns record for this host -- in seconds -- * or -1 if this lookup should be valid forever (numeric ip). */ public long getIpTTL() { return this.ipTTL; } public String toString() { return "CrawlHost<" + hostname + "(ip:" + ip + ")>"; } @Override public int hashCode() { return this.hostname != null ? this.hostname.hashCode() : 0; } @Override public boolean equals(Object obj) { if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } final CrawlHost other = (CrawlHost) obj; if (this.hostname != other.hostname // identity compare && (this.hostname == null || !this.hostname.equals(other.hostname))) { return false; } return true; } /** * Get the host name. * @return Returns the host name. */ public String getHostName() { return hostname; } /** * Get the earliest time a URI for this host could be emitted. * This only has effect if constraints on bandwidth per host is set. * * @return Returns the earliestNextURIEmitTime. */ public long getEarliestNextURIEmitTime() { return earliestNextURIEmitTime; } /** * Set the earliest time a URI for this host could be emitted. * This only has effect if constraints on bandwidth per host is set. * * @param earliestNextURIEmitTime The earliestNextURIEmitTime to set. */ public void setEarliestNextURIEmitTime(long earliestNextURIEmitTime) { this.earliestNextURIEmitTime = earliestNextURIEmitTime; } /** * Get country code of this host * * @return Retruns country code or null if not availabe */ public String getCountryCode() { return countryCode; } /** * Set country code for this hos * * @param countryCode The country code of this host */ public void setCountryCode(String countryCode) { this.countryCode = countryCode; } /* (non-Javadoc) * @see org.archive.crawler.datamodel.CrawlSubstats.HasCrawlSubstats#getSubstats() */ public FetchStats getSubstats() { return substats; } // Kryo support public static void autoregisterTo(final AutoKryo kryo) { kryo.register(CrawlHost.class); kryo.autoregister(FetchStats.class); /* * Custom serializer because default serialization doesn't work. Any * non-null IP address comes back as 0.0.0.0. XXX Inet4Address also * holds hostname, but heritrix doesn't use that; and retrieving it can * result in dns lookup, so we don't serialize it. */ kryo.register(Inet4Address.class, new Serializer() { @Override public void writeObjectData(ByteBuffer buffer, Object object) { Inet4Address i4a = (Inet4Address) object; kryo.writeObject(buffer, i4a.getAddress()); } @Override @SuppressWarnings("unchecked") public T readObjectData(ByteBuffer buffer, Class type) { byte[] address = kryo.readObject(buffer, byte[].class); try { return (T) InetAddress.getByAddress(address); } catch (UnknownHostException e) { throw new RuntimeException(e); } } }); kryo.autoregister(byte[].class); kryo.setRegistrationOptional(true); } // // IdentityCacheable support // transient private ObjectIdentityCache cache; @Override public String getKey() { return getHostName(); } @Override public void makeDirty() { cache.dirtyKey(getKey()); } @Override public void setIdentityCache(ObjectIdentityCache cache) { this.cache = cache; } public String fixUpName() { if ("dns:".equals(getHostName()) || "whois:".equals(getHostName())) { return getHostName(); } else { try { return URLEncoder.encode(getHostName(), "UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy