All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.modules.fetcher.AbstractCookieStore Maven / Gradle / Ivy

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.fetcher;

import it.unimi.dsi.mg4j.util.MutableString;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.CookieStore;
import org.apache.http.cookie.Cookie;
import org.apache.http.cookie.CookieIdentityComparator;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.archive.checkpointing.Checkpointable;
import org.archive.modules.CrawlURI;
import org.archive.spring.ConfigFile;
import org.archive.spring.ConfigPath;
import org.springframework.context.Lifecycle;

abstract public class AbstractCookieStore implements Lifecycle, Checkpointable,
        CookieStore, FetchHTTPCookieStore {

    public static final int MAX_COOKIES_FOR_DOMAIN = 50;
    
    protected final Logger logger =
            Logger.getLogger(AbstractCookieStore.class.getName());

    protected static final Comparator cookieComparator = new CookieIdentityComparator();

    protected ConfigFile cookiesLoadFile = null;
    public ConfigFile getCookiesLoadFile() {
        return cookiesLoadFile;
    }
    public void setCookiesLoadFile(ConfigFile cookiesLoadFile) {
        this.cookiesLoadFile = cookiesLoadFile;
    }

    protected ConfigPath cookiesSaveFile = null;
    public ConfigPath getCookiesSaveFile() {
        return cookiesSaveFile;
    }
    public void setCookiesSaveFile(ConfigPath cookiesSaveFile) {
        this.cookiesSaveFile = cookiesSaveFile;
    }

    protected boolean isRunning = false;

    @Override
    public void start() {
        if (isRunning()) {
            return;
        }
        prepare();
        if (getCookiesLoadFile()!=null) {
            loadCookies(getCookiesLoadFile());
        }
        isRunning = true;
    }

    @Override
    public void stop() {
        isRunning = false;
    }

    @Override
    public boolean isRunning() {
        return isRunning;
    }

    public void saveCookies() {
        if (getCookiesSaveFile() != null) {
            saveCookies(getCookiesSaveFile().getFile().getAbsolutePath());
        }
    }

    protected void loadCookies(ConfigFile file) {
        Reader reader = null;
        try {
            reader = file.obtainReader();
            loadCookies(reader);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    protected void loadCookies(Reader reader) {
        Collection loadedCookies = readCookies(reader);
        for (Cookie cookie: loadedCookies) {
            addCookie(cookie);
        }
    }

    public void saveCookies(String saveCookiesFile) {
        // Do nothing if cookiesFile is not specified.
        if (saveCookiesFile == null || saveCookiesFile.length() <= 0) {
            return;
        }

        FileOutputStream out = null;
        try {
            out = new FileOutputStream(new File(saveCookiesFile));
            String tab ="\t";
            out.write("# Heritrix Cookie File\n".getBytes());
            out.write("# This file is the Netscape cookies.txt format\n\n".getBytes());
            for (Cookie cookie: new ArrayList(getCookies())) {
                // Guess an initial size
                MutableString line = new MutableString(1024 * 2);
                line.append(cookie.getDomain());
                line.append(tab);
                // XXX line.append(cookie.isDomainAttributeSpecified() ? "TRUE" : "FALSE");
                line.append("TRUE");
                line.append(tab);
                line.append(cookie.getPath() != null ? cookie.getPath() : "/");
                line.append(tab);
                line.append(cookie.isSecure() ? "TRUE" : "FALSE");
                line.append(tab);
                line.append(cookie.getExpiryDate() != null ? cookie.getExpiryDate().getTime() / 1000 : -1);
                line.append(tab);
                line.append(cookie.getName());
                line.append(tab);
                line.append(cookie.getValue() != null ? cookie.getValue() : "");
                line.append("\n");
                out.write(line.toString().getBytes());
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Unable to write " + saveCookiesFile, e);
        } finally {
            IOUtils.closeQuietly(out);
        }
    }

    /**
     * Load cookies. The input is text in the Netscape's 'cookies.txt' file
     * format. Example entry of cookies.txt file:
     * 

* www.archive.org FALSE / FALSE 1311699995 details-visit texts-cralond *

*

* Each line has 7 tab-separated fields: *

*
    *
  1. DOMAIN: The domain that created and have access to the cookie value.
  2. *
  3. FLAG: A TRUE or FALSE value indicating if hosts within the given * domain can access the cookie value.
  4. *
  5. PATH: The path within the domain that the cookie value is valid for.
  6. *
  7. SECURE: A TRUE or FALSE value indicating if to use a secure * connection to access the cookie value.
  8. *
  9. EXPIRATION: The expiration time of the cookie value, or -1 for no * expiration
  10. *
  11. NAME: The name of the cookie value
  12. *
  13. VALUE: The cookie value
  14. *
* * @param reader * input in the Netscape's 'cookies.txt' format. */ protected Collection readCookies(Reader reader) { LinkedList cookies = new LinkedList(); BufferedReader br = new BufferedReader(reader); try { String line; int lineNo = 1; while ((line = br.readLine()) != null) { if (!line.matches("\\s*(?:#.*)?")) { // skip blank links and comments String[] tokens = line.split("\\t"); if (tokens.length == 7) { long epochSeconds = Long.parseLong(tokens[4]); Date expirationDate = (epochSeconds >= 0 ? new Date(epochSeconds * 1000) : null); BasicClientCookie cookie = new BasicClientCookie(tokens[5], tokens[6]); cookie.setDomain(tokens[0]); cookie.setExpiryDate(expirationDate); cookie.setSecure(Boolean.valueOf(tokens[3]).booleanValue()); cookie.setPath(tokens[2]); // XXX httpclient cookie doesn't have this thing? // cookie.setDomainAttributeSpecified(Boolean.valueOf(tokens[1]).booleanValue()); logger.fine("Adding cookie: domain " + cookie.getDomain() + " cookie " + cookie); cookies.add(cookie); } else { logger.warning("cookies input line " + lineNo + " invalid, expected 7 tab-delimited tokens"); } } lineNo++; } } catch (IOException e) { logger.log(Level.WARNING,e.getMessage(), e); } return cookies; } protected class LimitedCookieStoreFacade implements CookieStore { private final List cookies; protected LimitedCookieStoreFacade(List cookies) { this.cookies = cookies; } @Override public List getCookies() { return cookies; } @Override public boolean clearExpired(Date date) { throw new RuntimeException("not implemented"); } @Override public void clear() { throw new RuntimeException("not implemented"); } @Override public void addCookie(Cookie cookie) { AbstractCookieStore.this.addCookie(cookie); } } /** * Returns a string that uniquely identifies the cookie, The format The * format of the key is {@code "normalizedDomain;name;path"}. Adapted from * {@link CookieIdentityComparator#compare(Cookie, Cookie)}. */ protected String sortableKey(Cookie cookie) { String normalizedDomain = normalizeHost(cookie.getDomain()); // use ";" as delimiter since it is the delimiter in the cookie header, // so presumably can't appear in any of these values StringBuilder buf = new StringBuilder(normalizedDomain); buf.append(";").append(cookie.getName()); buf.append(";").append(cookie.getPath() != null ? cookie.getPath() : "/"); return buf.toString(); } protected String normalizeHost(String host) { if (host == null) { host = ""; } if (host.startsWith(".")) { host = host.substring(1); } host = host.toLowerCase(Locale.ENGLISH); return host; } public CookieStore cookieStoreFor(CrawlURI curi) throws URIException { String normalizedHost = normalizeHost(curi.getUURI().getHost()); return cookieStoreFor(normalizedHost); } public boolean isCookieCountMaxedForDomain(String domain) { CookieStore cookieStore = cookieStoreFor(normalizeHost(domain)); return (cookieStore != null && cookieStore.getCookies().size() >= MAX_COOKIES_FOR_DOMAIN); } public void addCookie(Cookie cookie) { if (isCookieCountMaxedForDomain(cookie.getDomain())) { logger.log( Level.FINEST, "Maximum number of cookies reached for domain " + cookie.getDomain() + ". Will not add new cookie " + cookie.getName() + " with value " + cookie.getValue()); return; } addCookieImpl(cookie); } abstract protected void addCookieImpl(Cookie cookie); abstract public void clear(); abstract protected void prepare(); }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy