All downloads are FREE. The search and download functionality uses the official Maven repository.

org.archive.modules.fetcher.AbstractCookieStore Maven / Gradle / Ivy

Go to download

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.

There is a newer version: 3.6.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.fetcher;

import it.unimi.dsi.mg4j.util.MutableString;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.CookieStore;
import org.apache.http.cookie.Cookie;
import org.apache.http.cookie.CookieIdentityComparator;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.archive.checkpointing.Checkpointable;
import org.archive.modules.CrawlURI;
import org.archive.spring.ConfigFile;
import org.archive.spring.ConfigPath;
import org.springframework.context.Lifecycle;

abstract public class AbstractCookieStore implements Lifecycle, Checkpointable,
        CookieStore, FetchHTTPCookieStore {

    public static final int MAX_COOKIES_FOR_DOMAIN = 50;
    
    protected final Logger logger =
            Logger.getLogger(AbstractCookieStore.class.getName());

    protected static final Comparator cookieComparator = new CookieIdentityComparator();

    protected ConfigFile cookiesLoadFile = null;
    public ConfigFile getCookiesLoadFile() {
        return cookiesLoadFile;
    }
    public void setCookiesLoadFile(ConfigFile cookiesLoadFile) {
        this.cookiesLoadFile = cookiesLoadFile;
    }

    protected ConfigPath cookiesSaveFile = null;
    public ConfigPath getCookiesSaveFile() {
        return cookiesSaveFile;
    }
    public void setCookiesSaveFile(ConfigPath cookiesSaveFile) {
        this.cookiesSaveFile = cookiesSaveFile;
    }

    protected boolean isRunning = false;

    @Override
    public void start() {
        if (isRunning()) {
            return;
        }
        prepare();
        if (getCookiesLoadFile()!=null) {
            loadCookies(getCookiesLoadFile());
        }
        isRunning = true;
    }

    @Override
    public void stop() {
        isRunning = false;
    }

    @Override
    public boolean isRunning() {
        return isRunning;
    }

    public void saveCookies() {
        if (getCookiesSaveFile() != null) {
            saveCookies(getCookiesSaveFile().getFile().getAbsolutePath());
        }
    }

    protected void loadCookies(ConfigFile file) {
        Reader reader = null;
        try {
            reader = file.obtainReader();
            loadCookies(reader);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    protected void loadCookies(Reader reader) {
        Collection loadedCookies = readCookies(reader);
        for (Cookie cookie: loadedCookies) {
            addCookie(cookie);
        }
    }

    public void saveCookies(String saveCookiesFile) {
        // Do nothing if cookiesFile is not specified.
        if (saveCookiesFile == null || saveCookiesFile.length() <= 0) {
            return;
        }

        FileOutputStream out = null;
        try {
            out = new FileOutputStream(new File(saveCookiesFile));
            String tab ="\t";
            out.write("# Heritrix Cookie File\n".getBytes());
            out.write("# This file is the Netscape cookies.txt format\n\n".getBytes());
            for (Cookie cookie: new ArrayList(getCookies())) {
                // Guess an initial size
                MutableString line = new MutableString(1024 * 2);
                line.append(cookie.getDomain());
                line.append(tab);
                // XXX line.append(cookie.isDomainAttributeSpecified() ? "TRUE" : "FALSE");
                line.append("TRUE");
                line.append(tab);
                line.append(cookie.getPath() != null ? cookie.getPath() : "/");
                line.append(tab);
                line.append(cookie.isSecure() ? "TRUE" : "FALSE");
                line.append(tab);
                line.append(cookie.getExpiryDate() != null ? cookie.getExpiryDate().getTime() / 1000 : -1);
                line.append(tab);
                line.append(cookie.getName());
                line.append(tab);
                line.append(cookie.getValue() != null ? cookie.getValue() : "");
                line.append("\n");
                out.write(line.toString().getBytes());
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Unable to write " + saveCookiesFile, e);
        } finally {
            IOUtils.closeQuietly(out);
        }
    }

    /**
     * Load cookies. The input is text in the Netscape's 'cookies.txt' file
     * format. Example entry of cookies.txt file:
     * 

* www.archive.org FALSE / FALSE 1311699995 details-visit texts-cralond *

*

* Each line has 7 tab-separated fields: *

*
    *
  1. DOMAIN: The domain that created and have access to the cookie value.
  2. *
  3. FLAG: A TRUE or FALSE value indicating if hosts within the given * domain can access the cookie value.
  4. *
  5. PATH: The path within the domain that the cookie value is valid for.
  6. *
  7. SECURE: A TRUE or FALSE value indicating if to use a secure * connection to access the cookie value.
  8. *
  9. EXPIRATION: The expiration time of the cookie value, or -1 for no * expiration
  10. *
  11. NAME: The name of the cookie value
  12. *
  13. VALUE: The cookie value
  14. *
* * @param reader * input in the Netscape's 'cookies.txt' format. */ protected Collection readCookies(Reader reader) { LinkedList cookies = new LinkedList(); BufferedReader br = new BufferedReader(reader); try { String line; int lineNo = 1; while ((line = br.readLine()) != null) { if (!line.matches("\\s*(?:#.*)?")) { // skip blank links and comments String[] tokens = line.split("\\t"); if (tokens.length == 7) { long epochSeconds = Long.parseLong(tokens[4]); Date expirationDate = (epochSeconds >= 0 ? new Date(epochSeconds * 1000) : null); BasicClientCookie cookie = new BasicClientCookie(tokens[5], tokens[6]); cookie.setDomain(tokens[0]); cookie.setExpiryDate(expirationDate); cookie.setSecure(Boolean.valueOf(tokens[3]).booleanValue()); cookie.setPath(tokens[2]); // XXX httpclient cookie doesn't have this thing? // cookie.setDomainAttributeSpecified(Boolean.valueOf(tokens[1]).booleanValue()); logger.fine("Adding cookie: domain " + cookie.getDomain() + " cookie " + cookie); cookies.add(cookie); } else { logger.warning("cookies input line " + lineNo + " invalid, expected 7 tab-delimited tokens"); } } lineNo++; } } catch (IOException e) { logger.log(Level.WARNING,e.getMessage(), e); } return cookies; } protected class LimitedCookieStoreFacade implements CookieStore { private final List cookies; protected LimitedCookieStoreFacade(List cookies) { this.cookies = cookies; } @Override public List getCookies() { return cookies; } @Override public boolean clearExpired(Date date) { throw new RuntimeException("not implemented"); } @Override public void clear() { throw new RuntimeException("not implemented"); } @Override public void addCookie(Cookie cookie) { AbstractCookieStore.this.addCookie(cookie); } } /** * Returns a string that uniquely identifies the cookie, The format The * format of the key is {@code "normalizedDomain;name;path"}. Adapted from * {@link CookieIdentityComparator#compare(Cookie, Cookie)}. 
*/ protected String sortableKey(Cookie cookie) { String normalizedDomain = normalizeHost(cookie.getDomain()); // use ";" as delimiter since it is the delimiter in the cookie header, // so presumably can't appear in any of these values StringBuilder buf = new StringBuilder(normalizedDomain); buf.append(";").append(cookie.getName()); buf.append(";").append(cookie.getPath() != null ? cookie.getPath() : "/"); return buf.toString(); } protected String normalizeHost(String host) { if (host == null) { host = ""; } if (host.startsWith(".")) { host = host.substring(1); } host = host.toLowerCase(Locale.ENGLISH); return host; } public CookieStore cookieStoreFor(CrawlURI curi) throws URIException { String normalizedHost = normalizeHost(curi.getUURI().getHost()); return cookieStoreFor(normalizedHost); } public boolean isCookieCountMaxedForDomain(String domain) { CookieStore cookieStore = cookieStoreFor(normalizeHost(domain)); return (cookieStore != null && cookieStore.getCookies().size() >= MAX_COOKIES_FOR_DOMAIN); } public void addCookie(Cookie cookie) { if (isCookieCountMaxedForDomain(cookie.getDomain())) { logger.log( Level.FINEST, "Maximum number of cookies reached for domain " + cookie.getDomain() + ". Will not add new cookie " + cookie.getName() + " with value " + cookie.getValue()); return; } addCookieImpl(cookie); } abstract protected void addCookieImpl(Cookie cookie); abstract public void clear(); abstract protected void prepare(); }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy