com.squareup.okhttp.sample.Crawler

/*
 * Copyright (C) 2014 Square, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.squareup.okhttp.sample;

import com.squareup.okhttp.Cache;
import com.squareup.okhttp.HttpUrl;
import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;
import com.squareup.okhttp.internal.NamedRunnable;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Fetches HTML from a requested URL, follows the links, and repeats.
 */
public final class Crawler {
  private final OkHttpClient client;
  private final Set<HttpUrl> fetchedUrls = Collections.synchronizedSet(
      new LinkedHashSet<HttpUrl>());
  private final LinkedBlockingQueue<HttpUrl> queue = new LinkedBlockingQueue<>();
  private final ConcurrentHashMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>();

  public Crawler(OkHttpClient client) {
    this.client = client;
  }

  // Starts threadCount workers that drain the shared queue concurrently.
  // shutdown() only stops new tasks from being submitted; the workers block
  // in queue.take() and run until the process exits.
  private void parallelDrainQueue(int threadCount) {
    ExecutorService executor = Executors.newFixedThreadPool(threadCount);
    for (int i = 0; i < threadCount; i++) {
      executor.execute(new NamedRunnable("Crawler %s", i) {
        @Override protected void execute() {
          try {
            drainQueue();
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      });
    }
    executor.shutdown();
  }

  private void drainQueue() throws Exception {
    // take() blocks until a URL is available and never returns null, so this
    // loop only ends when the process exits.
    for (HttpUrl url; (url = queue.take()) != null; ) {
      // Set.add() returns false for URLs that have already been crawled.
      if (!fetchedUrls.add(url)) {
        continue;
      }

      try {
        fetch(url);
      } catch (IOException e) {
        System.out.printf("XXX: %s %s%n", url, e);
      }
    }
  }

  public void fetch(HttpUrl url) throws IOException {
    // Skip hosts that we've visited many times: each host is capped at 100
    // fetches. putIfAbsent publishes one counter per host race-free across
    // workers; see the Java 8 sketch after the listing for a compact variant.
    AtomicInteger hostnameCount = new AtomicInteger();
    AtomicInteger previous = hostnames.putIfAbsent(url.host(), hostnameCount);
    if (previous != null) hostnameCount = previous;
    if (hostnameCount.incrementAndGet() > 100) return;

    Request request = new Request.Builder()
        .url(url)
        .build();
    Response response = client.newCall(request).execute();
    String responseSource = response.networkResponse() != null
        ? ("(network: " + response.networkResponse().code() + " over " + response.protocol() + ")")
        : "(cache)";
    int responseCode = response.code();

    System.out.printf("%03d: %s %s%n", responseCode, url, responseSource);

    String contentType = response.header("Content-Type");
    if (responseCode != 200 || contentType == null) {
      // Close the body to release its connection back to the pool.
      response.body().close();
      return;
    }

    // Parse the page and enqueue each outgoing link. resolve() returns null
    // for hrefs that aren't well-formed HTTP or HTTPS URLs; those are skipped.
    Document document = Jsoup.parse(response.body().string(), url.toString());
    for (Element element : document.select("a[href]")) {
      String href = element.attr("href");
      HttpUrl link = response.request().httpUrl().resolve(href);
      if (link != null) queue.add(link);
    }
  }

  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.out.println("Usage: Crawler  ");
      return;
    }

    int threadCount = 20;
    long cacheByteCount = 1024L * 1024L * 100L; // 100 MiB response cache

    OkHttpClient client = new OkHttpClient();
    Cache cache = new Cache(new File(args[0]), cacheByteCount);
    client.setCache(cache);

    Crawler crawler = new Crawler(client);
    crawler.queue.add(HttpUrl.parse(args[1]));
    crawler.parallelDrainQueue(threadCount);
  }
}
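
For comparison, the per-host cap in fetch() uses the two-step putIfAbsent idiom because this 2014-era sample runs on pre-Java 8 JVMs. On Java 8 and later, ConcurrentHashMap.computeIfAbsent does the same thing in one atomic call. The sketch below is hypothetical and not part of the published sample: HostLimiter and tryAcquire are invented names, and the map mirrors the hostnames field declared above.

import com.squareup.okhttp.HttpUrl;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

// Hypothetical Java 8+ variant of the per-host cap in fetch().
final class HostLimiter {
  private final ConcurrentHashMap<String, AtomicInteger> hostnames =
      new ConcurrentHashMap<>();

  // Atomically creates a counter the first time a host is seen, then
  // increments it. Returns false once a host exceeds 100 fetches.
  boolean tryAcquire(HttpUrl url) {
    AtomicInteger count =
        hostnames.computeIfAbsent(url.host(), host -> new AtomicInteger());
    return count.incrementAndGet() <= 100;
  }
}

The behavior matches the putIfAbsent form: both guarantee exactly one counter per host even when several workers encounter a new host at the same time.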