be.bagofwords.http.URLDownloader Maven / Gradle / Ivy

Go to download
package be.bagofwords.http;

import be.bagofwords.ui.UI;
import be.bagofwords.util.URLUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import sun.security.ssl.SSLSocketImpl;

import javax.net.ssl.SSLException;
import javax.net.ssl.SSLParameters;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
import java.io.*;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;

/**
 * @author jan ∧ koen
 * @version 1.1
 */
public class URLDownloader {

    /** User-Agent header value used when the caller supplies none. */
    private static final String DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0";

    // accessible fields
    private String method;
    private String url;
    private String userAgent;
    // Absolute urls of every redirect followed so far, in order.
    private final ArrayList<String> redirectedUrls;
    // Socket connect/read timeout in milliseconds.
    private final long timeOut;
    private final long maxSize;
    private int status;
    private String postData;
    private String content;
    private long totalBytesDownloaded;

    // internal settings
    private final String defaultEncoding = "UTF-8";
    private final int maxNbRedirects = 10;

    // Complete raw response (headers + body) of the most recent request.
    private byte[] buffer;
    private byte[] rawcontent, raw;
    // Complete "Name: value" header lines appended to every request.
    private List<String> extraRequestHeaders;
    // Cookie name -> value, collected from Set-Cookie response headers.
    private Map<String, String> savedCookies;
    private String[] responseHeaders;
    private int endOfHeader;

	/*
     * statusses:
	 *  -10 not a valid url
	 *  -11 infinite redirect
	 *  -12 length of extracted header contains 0 lines
	 *  -14 can't decode raw content as defaultEncoding
	 *  -15 can't decode rawcontent with found encoding
	 */

    /**
     * Creates a downloader that issues a GET request for the given url.
     * Delegates to the five-argument constructor with a null user agent (so the
     * default user agent is used), a timeout of 5000 and a maxSize of -1.
     *
     * @param url the url to be downloaded
     */

    public URLDownloader(String url) {
        // NOTE(review): -1 presumably means "no size limit"; maxSize is stored but never read in this file - confirm.
        this("GET", url, null, 5000, -1);
    }

    /**
     * Creates a downloader that issues a GET request for the given url with the given user agent.
     * Delegates to the five-argument constructor with a timeout of 5000 and a maxSize of 1024 * 1024.
     *
     * @param url       the url to be downloaded
     * @param userAgent the User-Agent header value; the default user agent is used when empty
     */
    public URLDownloader(String url, String userAgent) {
        this("GET", url, userAgent, 5000, 1024 * 1024);
    }

    /**
     * Creates a downloader that issues a request with the given method.
     * Delegates to the five-argument constructor with a timeout of 5000 and a maxSize of 1024 * 1024.
     *
     * @param method    the request method written on the request line (for example "GET")
     * @param url       the url to be downloaded
     * @param userAgent the User-Agent header value; the default user agent is used when empty
     */
    public URLDownloader(String method, String url, String userAgent) {
        this(method, url, userAgent, 5000, 1024 * 1024);
    }

    /**
     * Sets the request body that is written after the request headers.
     * The data is trimmed and terminated with a single CRLF before it is stored.
     *
     * @param postData the body to send; {@code null} clears any previously set body
     */
    public void setPostData(String postData) {
        if (postData == null) {
            // Previously a NullPointerException; treat null as "no body".
            this.postData = null;
            return;
        }
        this.postData = postData.trim() + "\r\n";
    }

    /**
     * Creates a downloader with every setting given explicitly.
     *
     * @param method    the request method written on the request line (for example "GET")
     * @param url       the url to be downloaded; its protocol must be http or https
     * @param userAgent the User-Agent header value; the default user agent is used when null or empty
     * @param timeout   the socket connect and read timeout in milliseconds
     * @param maxSize   the size limit to remember (NOTE(review): stored but not enforced anywhere in this class - confirm intended use)
     * @throws RuntimeException if the url does not use the http or https protocol
     */
    public URLDownloader(String method, String url, String userAgent, long timeout, long maxSize) {
        String protocol = URLUtils.getProtocol(url);
        // Constant-first comparison: a url without a recognisable protocol is reported as
        // unsupported instead of failing with a NullPointerException.
        if (!"http".equals(protocol) && !"https".equals(protocol)) {
            throw new RuntimeException("Unsupported protocol in url " + url);
        }
        this.url = url;
        if (!StringUtils.isEmpty(userAgent)) {
            this.userAgent = userAgent;
        } else {
            this.userAgent = DEFAULT_USER_AGENT;
        }
        this.method = method;
        this.timeOut = timeout;
        this.maxSize = maxSize;
        this.totalBytesDownloaded = 0;
        this.redirectedUrls = new ArrayList<>();
        this.extraRequestHeaders = new ArrayList<>();
        this.savedCookies = new HashMap<>();
    }

    /**
     * Connects to the webserver and downloads the content, following redirects.
     * <p>
     * After each raw download the response is split into headers and body, cookies are saved and
     * chunked transfer coding is undone. A 200 response is decoded into text; a 301, 302, 303,
     * 307 or 308 response is followed (at most {@code maxNbRedirects} times); every other status
     * is reported as a failure. The numeric status can be retrieved by calling getStatus().
     *
     * @return the result of the last request; it is a failure when the connection failed, the
     * status was not 200 after following redirects, or the body could not be decoded
     */
    public DownloadResult download() {
        DownloadResult result = null;
        while (result == null) {
            result = downloadRaw();
            if (!result.isSuccess()) {
                break;
            }
            extractHeaderAndRawContent();
            extractAndSaveCookies();
            if (hasChunkedEncoding()) {
                fixChunkedEncoding();
            }
            switch (status) {
                case 200:
                    decodeBuffer();
                    if (status != 200) {
                        // decodeBuffer() replaces the status with a negative code when decoding
                        // fails; do not report such a download as successful.
                        result = new DownloadResult(false, "could not decode content (status " + status + ")");
                    }
                    break;
                case 301:
                case 302:
                case 303:
                case 307:
                case 308:
                    result = prepareRedirect(); // null means: loop again for the redirect target
                    break;
                case 404:
                    result = new DownloadResult(false, "404 not found");
                    break;
                default:
                    result = new DownloadResult(false, "unknown status " + status);
            }
        }
        return result;
    }

    /**
     * Points this downloader at the redirect target announced by the current response.
     *
     * @return null when the next loop iteration should download the new url, or a failure result
     * when the redirect limit is reached or no redirect target is present
     */
    private DownloadResult prepareRedirect() {
        if (redirectedUrls.size() >= maxNbRedirects) {
            status = -11;
            return new DownloadResult(false, "too many redirects");
        }
        // 307 and 308 require the request to be repeated unchanged; the other redirect
        // statuses are followed with a body-less GET.
        if (status != 307 && status != 308) {
            method = "GET";
            postData = null;
        }
        String newurl = extractRedirect();
        if (StringUtils.isEmpty(newurl)) {
            return new DownloadResult(false, "could not find redirect");
        }
        Matcher pM = URLUtils.protocolP.matcher(newurl);
        if (!pM.find()) {
            // No protocol in the target: resolve it against the current url.
            newurl = URLUtils.makeAbsolute(newurl, url);
        }
        redirectedUrls.add(newurl);
        url = newurl;
        return null;
    }

    /**
     * Stores the name/value pair of every Set-Cookie response header in savedCookies.
     * Cookie attributes (everything after the first ';') are ignored; a later cookie with the
     * same name overwrites an earlier one.
     */
    private void extractAndSaveCookies() {
        for (String header : getResponseHeaders()) {
            int indOfColon = header.indexOf(':');
            if (indOfColon < 0) {
                continue;
            }
            // Header names are case-insensitive.
            String headerName = header.substring(0, indOfColon).trim();
            if (!"Set-Cookie".equalsIgnoreCase(headerName)) {
                continue;
            }
            // Do not assume exactly one space after the colon (or any value at all).
            String cookieAndAttributes = header.substring(indOfColon + 1).trim();
            int indOfSemicolon = cookieAndAttributes.indexOf(';');
            String cookie = indOfSemicolon < 0
                    ? cookieAndAttributes
                    : cookieAndAttributes.substring(0, indOfSemicolon);
            int indOfEquals = cookie.indexOf('=');
            if (indOfEquals != -1) {
                String cookieName = cookie.substring(0, indOfEquals).trim();
                String cookieValue = cookie.substring(indOfEquals + 1).trim();
                savedCookies.put(cookieName, cookieValue);
            }
        }
    }

    /**
     * Returns the status of the most recent response.
     *
     * @return the status code parsed from the response status line, or one of the negative
     * internal codes this class assigns (-11, -12, -14, -15); 0 if no response has been parsed yet
     */
    public int getStatus() {
        return status;
    }

    /**
     * Returns the decoded body of the downloaded page.
     * The content is only set after a response with status 200 has been decoded; carriage
     * returns are removed during decoding.
     *
     * @return the decoded content, or null if no response has been decoded yet
     */
    public String getContent() {
        return content;
    }

    /**
     * Returns the url this downloader currently targets: initially the url passed to the
     * constructor (stored unchanged), and after download() followed a redirect, the url of the
     * last redirect target.
     *
     * @return the url
     */
    public String getURL() {
        return url;
    }

    /**
     * Returns the urls of all redirects followed so far, in the order they were followed.
     *
     * @return the live internal list (not a copy)
     */
    public ArrayList<String> getRedirectedURLs() {
        return redirectedUrls;
    }


    /**
     * Breaks a url into the protocol, host and path needed to open a connection.
     *
     * @param url the url to split
     * @return the parts as reported by URLUtils
     */
    private UrlParts splitUrl(String url) {
        String protocolPart = URLUtils.getProtocol(url);
        String hostPart = URLUtils.getDomain(url);
        String pathPart = URLUtils.getPath(url);
        return new UrlParts(protocolPart, hostPart, pathPart);
    }

    /**
     * Turns the raw body bytes into text.
     * The charset is taken from the response headers, else from the content itself, else guessed,
     * else the default encoding. On success the content field is set (with carriage returns
     * removed); if the charset name is not supported the status becomes -15 and content is left
     * untouched.
     */
    private void decodeBuffer() {
        String charsetName = extractEncodingFromHeader();
        if (StringUtils.isEmpty(charsetName)) {
            charsetName = extractEncodingFromContent();
            if (StringUtils.isEmpty(charsetName)) {
                charsetName = extractEncodingFromGuess();
                if (StringUtils.isEmpty(charsetName)) {
                    charsetName = defaultEncoding;
                }
            }
        }
        // Drop quotes and semicolons that may surround the charset name.
        charsetName = charsetName.replace("\"", "").replace(";", "");
        try {
            content = new String(rawcontent, charsetName).replace("\r", "");
        } catch (UnsupportedEncodingException e) {
            UI.writeError("Problem while reading url " + getURL() + " with encoding " + charsetName, e);
            status = -15;
        }
    }

    /**
     * Guesses the encoding by decoding the raw content with candidate encodings and picking the
     * first one that yields no replacement character (U+FFFD, 65533). Only the first four
     * candidates are tried. An unsupported candidate sets the status to -14 and is skipped.
     *
     * @return the first encoding that decodes cleanly, or null if none does
     */
    private String extractEncodingFromGuess() {
        String[] candidates = {"UTF-8", "ISO-8859-1", "ISO-8859-15", "US-ASCII", "UTF-16BE", "UTF-16LE", "UTF-16"};
        int limit = Math.min(4, candidates.length);
        for (int idx = 0; idx < limit; idx++) {
            String candidate = candidates[idx];
            try {
                String decoded = new String(rawcontent, candidate);
                boolean clean = decoded.indexOf('\uFFFD') < 0;
                if (clean) {
                    return candidate;
                }
            } catch (UnsupportedEncodingException ue) {
                status = -14;
            }
        }
        return null;
    }

    /**
     * Looks for a "charset=" declaration in the body (decoded with defaultEncoding) and returns
     * the charset name that follows it.
     * <p>
     * Handles both {@code content="text/html; charset=utf-8"} and the HTML5 forms
     * {@code <meta charset="utf-8">} / {@code <meta charset=utf-8>}: an opening quote directly
     * after the '=' is skipped and the name ends at the first quote, whitespace, ';', '>' or '/'.
     *
     * @return the declared charset name (possibly empty), or null if there is no declaration or
     * it is not terminated before the end of the content
     */
    private String extractEncodingFromContent() {
        try {
            String decoded = new String(rawcontent, defaultEncoding);
            final String marker = "charset=";
            int markerPos = decoded.indexOf(marker);
            if (markerPos < 0) {
                return null;
            }
            int start = markerPos + marker.length();
            // Skip the opening quote of charset="..." so the name is not cut off at length 0.
            if (start < decoded.length() && (decoded.charAt(start) == '"' || decoded.charAt(start) == '\'')) {
                start++;
            }
            int end = start;
            while (end < decoded.length() && !isCharsetTerminator(decoded.charAt(end))) {
                end++;
            }
            if (end == decoded.length()) {
                // Ran off the end without a terminator: not a usable declaration.
                return null;
            }
            return decoded.substring(start, end).trim();
        } catch (UnsupportedEncodingException e) {
            status = -14;
        }
        return null;
    }

    /** Tells whether the character ends a charset name inside markup. */
    private static boolean isCharsetTerminator(char c) {
        return c == '"' || c == '\'' || c == ';' || c == '>' || c == '/' || Character.isWhitespace(c);
    }

    /**
     * Returns the charset announced by the first response header that carries a "charset="
     * parameter (matched case-insensitively).
     * Surrounding double quotes are removed and the value ends at the first ';' or whitespace,
     * so further parameters on the same header line are not mistaken for part of the name.
     *
     * @return the charset name (possibly empty), or null if no header declares one
     */
    private String extractEncodingFromHeader() {
        for (String aHeader : responseHeaders) {
            int markerPos = indexOfCharsetMarker(aHeader);
            if (markerPos < 0) {
                continue;
            }
            String value = aHeader.substring(markerPos + "charset=".length()).trim();
            if (value.startsWith("\"")) {
                int closingQuote = value.indexOf('"', 1);
                return (closingQuote < 0 ? value.substring(1) : value.substring(1, closingQuote)).trim();
            }
            int end = 0;
            while (end < value.length() && value.charAt(end) != ';' && !Character.isWhitespace(value.charAt(end))) {
                end++;
            }
            return value.substring(0, end);
        }
        return null;
    }

    /** Finds "charset=" in the header ignoring case; returns its index or -1. */
    private static int indexOfCharsetMarker(String header) {
        final String marker = "charset=";
        for (int i = 0; i + marker.length() <= header.length(); i++) {
            if (header.regionMatches(true, i, marker, 0, marker.length())) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Returns the redirect target from the Location response header.
     * The header name is matched case-insensitively and the space after the colon is optional.
     *
     * @return the trimmed value of the first Location header, or null if there is none
     */
    private String extractRedirect() {
        final String headerName = "Location:";
        for (String aHeader : responseHeaders) {
            if (aHeader.regionMatches(true, 0, headerName, 0, headerName.length())) {
                return aHeader.substring(headerName.length()).trim();
            }
        }
        return null;
    }

    /**
     * Splits the raw response in buffer into header lines and body.
     * <p>
     * Sets responseHeaders (trimmed lines, status line first), status (parsed from the status
     * line, or -12 when there is no parsable status line), rawcontent (the body) and raw (a copy
     * of the whole response).
     */
    private void extractHeaderAndRawContent() {
        // The header ends at the first empty line, i.e. CR LF CR LF.
        endOfHeader = 0;
        for (int i = 3; i < buffer.length; i++) {
            if (buffer[i - 3] == '\r' && buffer[i - 2] == '\n' && buffer[i - 1] == '\r' && buffer[i] == '\n') {
                endOfHeader = i;
                break;
            }
        }
        // extraction of header
        StringBuilder sb = new StringBuilder(endOfHeader);
        for (int i = 0; i < endOfHeader; i++) {
            // Mask to 0..255 so bytes >= 0x80 are not sign-extended into unrelated characters.
            sb.append((char) (buffer[i] & 0xFF));
        }
        responseHeaders = sb.toString().trim().split("\n");
        for (int i = 0; i < responseHeaders.length; i++) {
            responseHeaders[i] = responseHeaders[i].trim();
        }
        // split() always yields at least one (possibly empty) element, so [0] is safe. A response
        // consisting of only a status line is valid and is parsed like any other.
        status = parseStatusCode(responseHeaders[0]);
        if (status == -12) {
            System.err.println("unexpected error: found no header");
        }
        //moving endOfHeader forward until we clear the newlines (not 100% accurate, since the newlines can be considered part of the html, but there are advantages).
        while (endOfHeader < buffer.length && (buffer[endOfHeader] == '\n' || buffer[endOfHeader] == '\r')) {
            endOfHeader++;
        }
        // extraction of content
        rawcontent = new byte[buffer.length - endOfHeader];
        System.arraycopy(buffer, endOfHeader, rawcontent, 0, rawcontent.length);
        raw = buffer.clone();
    }

    /**
     * Extracts the numeric status code from a status line such as "HTTP/1.0 200 OK".
     *
     * @return the status code, or -12 if the line has no parsable second token
     */
    private static int parseStatusCode(String statusLine) {
        String[] parts = statusLine.trim().split("\\s+");
        if (parts.length < 2) {
            return -12;
        }
        try {
            return Integer.parseInt(parts[1]);
        } catch (NumberFormatException e) {
            return -12;
        }
    }

    /**
     * Tells whether the response body uses chunked transfer coding.
     * The Transfer-Encoding header name and the "chunked" token are matched case-insensitively
     * and independent of the default locale; the header may list several codings.
     *
     * @return true if a Transfer-Encoding header lists "chunked"
     */
    private boolean hasChunkedEncoding() {
        for (String aHeader : responseHeaders) {
            int colon = aHeader.indexOf(':');
            if (colon < 0 || !"Transfer-Encoding".equalsIgnoreCase(aHeader.substring(0, colon).trim())) {
                continue;
            }
            for (String coding : aHeader.substring(colon + 1).split(",")) {
                if ("chunked".equalsIgnoreCase(coding.trim())) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Undoes chunked transfer coding: replaces rawcontent by the concatenation of the data of
     * all chunks.
     * <p>
     * Each chunk is a hexadecimal size line terminated by CRLF, followed by that many data bytes
     * and a CRLF. Decoding stops at the zero-size chunk, or - instead of throwing - at the first
     * missing or unparsable size line or when the data is truncated; whatever was decoded up to
     * that point is kept.
     */
    private void fixChunkedEncoding() {
        ByteArrayOutputStream decoded = new ByteArrayOutputStream(rawcontent.length);
        int pos = 0;
        while (pos < rawcontent.length) {
            int sizeLineEnd = indexOfCrlf(rawcontent, pos);
            if (sizeLineEnd < 0) {
                break; // truncated: no complete size line
            }
            int chunkLen = parseChunkSize(rawcontent, pos, sizeLineEnd);
            if (chunkLen <= 0) {
                break; // last chunk (size 0) or unusable size
            }
            int dataStart = sizeLineEnd + 2; // skip CR LF after the size line
            int available = Math.min(chunkLen, rawcontent.length - dataStart);
            decoded.write(rawcontent, dataStart, available);
            if (available < chunkLen) {
                break; // truncated chunk data
            }
            pos = dataStart + chunkLen + 2; // skip the chunk data and its trailing CR LF
        }
        rawcontent = decoded.toByteArray();
    }

    /** Returns the index of the first CR LF pair at or after from, or -1 if there is none. */
    private static int indexOfCrlf(byte[] data, int from) {
        for (int i = from; i + 1 < data.length; i++) {
            if (data[i] == '\r' && data[i + 1] == '\n') {
                return i;
            }
        }
        return -1;
    }

    /**
     * Parses the leading hexadecimal digits of data[from..to) as a chunk size. Parsing stops at
     * the first non-hex byte, so chunk extensions (";...") and stray text after a space are ignored.
     *
     * @return the size, 0 if there are no leading hex digits, or -1 if the value overflows an int
     */
    private static int parseChunkSize(byte[] data, int from, int to) {
        int size = 0;
        for (int i = from; i < to; i++) {
            int digit = Character.digit((char) (data[i] & 0xFF), 16);
            if (digit < 0) {
                break;
            }
            if (size > (Integer.MAX_VALUE - digit) / 16) {
                return -1;
            }
            size = size * 16 + digit;
        }
        return size;
    }

    /**
     * Performs one HTTP/1.0 request for the current url and stores the complete raw response
     * (headers and body) in buffer. Plain connections use port 80, https connections port 443.
     *
     * @return a success result when a response was read completely; otherwise a failure result
     * describing the problem (invalid url, unknown host, socket timeout, ssl problem, or the
     * message of any other exception)
     */
    private DownloadResult downloadRaw() {
        UrlParts urlParts = splitUrl(url);
        if (!urlParts.isValid()) {
            return new DownloadResult(false, "invalid url");
        }
        PrintStream ps = null;
        InputStream is = null;

        Socket s = null;
        try {
            InetAddress address = ExtraDNSCache.getAddress(urlParts.getHost());
            if (urlParts.isHttps()) {
                SSLSocketFactory ssf = (SSLSocketFactory) SSLSocketFactory.getDefault();
                SSLSocket sslSocket = (SSLSocket) ssf.createSocket();
                // Register the socket before configuring it, so the finally block also closes it
                // when connecting or the handshake fails.
                s = sslSocket;
                configureSSLSocket(address, sslSocket);
            } else {
                s = new Socket();
                s.setSoTimeout((int) timeOut);
                s.connect(new InetSocketAddress(address, 80), (int) timeOut); //Allow for a timeout even during connect.
            }
            ps = new PrintStream(s.getOutputStream());
            ps.print(method + " " + urlParts.getPath().trim() + " HTTP/1.0\r\n");
            printRequestHeaders(urlParts, ps);
            if (!StringUtils.isEmpty(postData)) {
                ps.print(postData);
            }
            // The request asks for "Connection: close", so the response is read until end of stream.
            is = s.getInputStream();
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            IOUtils.copy(is, bos);
            buffer = bos.toByteArray();
            totalBytesDownloaded += buffer.length;
            return new DownloadResult(true, "");
        } catch (UnknownHostException exp) {
            return new DownloadResult(false, "unknown host");
        } catch (java.net.SocketTimeoutException e) {
            return new DownloadResult(false, "socket timeout");
        } catch (SSLException e) {
            return new DownloadResult(false, "ssl problem");
        } catch (Exception e) {
            // Some exceptions carry no message; fall back to the exception's description.
            String message = e.getMessage();
            return new DownloadResult(false, message != null ? message : e.toString());
        } finally {
            IOUtils.closeQuietly(ps);
            IOUtils.closeQuietly(is);
            IOUtils.closeQuietly(s);
        }
    }

    /**
     * Connects the given SSL socket to port 443 of the address, enables HTTPS endpoint
     * identification (hostname verification) and performs the TLS handshake.
     * The configured timeout applies to the connect as well as to subsequent reads, matching the
     * plain-socket branch of the download.
     *
     * @param address   the resolved address of the server
     * @param sslSocket an unconnected SSL socket
     * @throws IOException if connecting or the handshake fails or times out
     */
    private void configureSSLSocket(InetAddress address, SSLSocket sslSocket) throws IOException {
        // Pass the timeout to connect(); without it an unresponsive host blocks indefinitely.
        sslSocket.connect(new InetSocketAddress(address.getHostName(), 443), (int) timeOut);
        if (sslSocket instanceof SSLSocketImpl) {
            ((SSLSocketImpl) sslSocket).setHost(address.getHostName());
        }
        SSLParameters sslParameters = sslSocket.getSSLParameters();
        sslParameters.setEndpointIdentificationAlgorithm("HTTPS");
        sslSocket.setSSLParameters(sslParameters);
        sslSocket.setSoTimeout((int) timeOut);
        sslSocket.startHandshake();
    }

    /**
     * Writes the request header block, including the empty line that ends it.
     * Every header line is terminated by CRLF; previously Accept-Charset and Content-Length
     * lacked the terminator and ran into the header that followed them.
     *
     * @param urlParts the parts of the url being requested (supplies the Host header)
     * @param ps       the stream connected to the server
     */
    private void printRequestHeaders(UrlParts urlParts, PrintStream ps) {
        ps.print("Host: " + urlParts.getHost().trim() + "\r\n");
        ps.print("User-Agent: " + userAgent + "\r\n");
        ps.print("Accept-Charset: utf-8\r\n");

        for (String header : extraRequestHeaders) {
            ps.print(header + "\r\n");
        }
        if (!savedCookies.isEmpty()) {
            ps.print("Cookie: " + createCookieHeader() + "\r\n");
        }
        if (!StringUtils.isEmpty(postData)) {
            // Length in bytes using the platform default charset, which is also what the
            // PrintStream uses to encode the body.
            ps.print("Content-Length: " + postData.getBytes().length + "\r\n");
        }
        ps.print("Connection: close\r\n");
        ps.print("\r\n");
    }

    /**
     * Builds the value of the Cookie request header from the saved cookies.
     *
     * @return the cookies as "name=value" pairs separated by "; ", or an empty string when no
     * cookies are saved
     */
    private String createCookieHeader() {
        StringBuilder result = new StringBuilder();
        for (Map.Entry<String, String> cookie : savedCookies.entrySet()) {
            if (result.length() > 0) {
                result.append("; ");
            }
            result.append(cookie.getKey()).append('=').append(cookie.getValue());
        }
        return result.toString();
    }

    /**
     * Returns the media type of the response, without parameters such as the charset.
     * The Content-Type header name is matched case-insensitively.
     *
     * @return the part of the Content-Type value before the first space or ';', or "text/html"
     * when there is no Content-Type header or no response has been parsed yet
     */
    public String getContentType() {
        final String headerName = "Content-Type:";
        if (responseHeaders != null) {
            for (String field : responseHeaders) {
                if (field.regionMatches(true, 0, headerName, 0, headerName.length())) {
                    String value = field.substring(headerName.length()).trim();
                    int end = 0;
                    while (end < value.length() && value.charAt(end) != ' ' && value.charAt(end) != ';') {
                        end++;
                    }
                    return value.substring(0, end);
                }
            }
        }
        return "text/html"; //Return default ...
    }

    /**
     * Returns the number of bytes received so far: the sum of the sizes of all raw responses
     * (headers and body) read by this downloader, including responses that were redirects.
     *
     * @return the total number of bytes downloaded
     */
    public long getTotalBytesDownloaded() {
        return totalBytesDownloaded;
    }

    /**
     * Returns the undecoded body bytes of the most recent response (after removal of chunked
     * transfer coding, if it was used).
     *
     * @return the internal byte array (not a copy), or null if no response has been parsed yet
     */
    public byte[] getRawContent() {
        return rawcontent;
    }

    /**
     * Returns the header lines of the most recent response, each trimmed; the first element is
     * the status line.
     *
     * @return the internal array (not a copy), or null if no response has been parsed yet
     */
    public String[] getResponseHeaders() {
        return responseHeaders;
    }

    /**
     * Returns the value of the first response header with the given name.
     * Header names are compared case-insensitively and the value is trimmed, so a missing space
     * after the colon does not cut off the first character.
     *
     * @param name the header name, for example "Content-Length"
     * @return the header value, or null if the header is absent, name is null, or no response
     * has been parsed yet
     */
    public String getResponseHeader(String name) {
        String[] headers = getResponseHeaders();
        if (name == null || headers == null) {
            return null;
        }
        for (String header : headers) {
            int indOfColon = header.indexOf(':');
            if (indOfColon > -1 && name.equalsIgnoreCase(header.substring(0, indOfColon).trim())) {
                return header.substring(indOfColon + 1).trim();
            }
        }
        return null;
    }

    /**
     * Returns the cookies collected from Set-Cookie response headers, keyed by cookie name.
     * These cookies are sent back in the Cookie header of every later request.
     *
     * @return the live internal map (not a copy)
     */
    public Map<String, String> getSavedCookies() {
        return savedCookies;
    }

    /**
     * Returns the additional request headers. Each element is a complete header line without the
     * line terminator (for example "X-Forwarded-For: 1.2.3.4") and is sent with every request.
     *
     * @return the live internal list (not a copy); add to it to send more headers
     */
    public List<String> getExtraRequestHeaders() {
        return extraRequestHeaders;
    }

    /**
     * Adds an "X-Forwarded-For" header carrying the given ip to the extra request headers.
     * Each call appends another header line; earlier ones are not replaced.
     *
     * @param ip the ip address to put in the header
     */
    public void setForwardingIP(String ip) {
        getExtraRequestHeaders().add("X-Forwarded-For: " + ip);
    }

    private class UrlParts {
        private String protocol;
        private String host;
        private String path;

        private UrlParts(String protocol, String host, String path) {
            this.protocol = protocol;
            this.host = host;
            this.path = path;
        }

        public String getProtocol() {
            return protocol;
        }

        public String getHost() {
            return host;
        }

        public String getPath() {
            return path;
        }

        public boolean isHttps() {
            return protocol.equals("https");
        }

        public boolean isValid() {
            return !StringUtils.isEmpty(protocol) && !StringUtils.isEmpty(host) && !StringUtils.isEmpty(path);
        }
    }

}