com.soulgalore.crawler.run.CrawlToPlainTxtOnlyMatching Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of crawler Show documentation
Simple java (1.6) crawler to crawl web pages on one and same domain.
The newest version!
/******************************************************
 * Web crawler
 * 
 * 
 * Copyright (C) 2012 by Peter Hedenskog (http://peterhedenskog.com)
 * 
 ****************************************************** 
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 * 
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 * 
 ******************************************************* 
 */
package com.soulgalore.crawler.run;

import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

import com.google.inject.Guice;
import com.google.inject.Injector;
import com.soulgalore.crawler.core.Crawler;
import com.soulgalore.crawler.core.CrawlerResult;
import com.soulgalore.crawler.core.HTMLPageResponse;
import com.soulgalore.crawler.guice.CrawlModule;

/**
 * Crawl and print urls that contains specific keyword in the HTML body.
 * 
 */
public class CrawlToPlainTxtOnlyMatching extends AbstractCrawl {

  private final String keyword;

  CrawlToPlainTxtOnlyMatching(String[] args) throws ParseException {
    super(args);
    keyword = getLine().getOptionValue("keyword");

  }

  /**
   * Run.
   * 
   * @param args the args
   */
  public static void main(String[] args) {

    try {

      final CrawlToPlainTxtOnlyMatching crawl = new CrawlToPlainTxtOnlyMatching(args);
      crawl.crawl();

    } catch (ParseException e) {
      System.out.print(e.getMessage());
    } catch (IllegalArgumentException e) {
      System.out.println(e.getMessage());
    }

  }

  private void crawl() {
    final Injector injector = Guice.createInjector(new CrawlModule());
    final Crawler crawler = injector.getInstance(Crawler.class);

    final CrawlerResult result = crawler.getUrls(getConfiguration());
    for (HTMLPageResponse response : result.getVerifiedURLResponses()) {

      if (response.getBody().toString().contains(keyword)) {
        System.out.println(response.getUrl());
      }
    }

    crawler.shutdown();
  }

  /**
   * Get the options.
   * 
   * @return the specific CrawlToCsv options
   */
  @Override
  protected Options getOptions() {
    final Options options = super.getOptions();

    final Option filenameOption =
        new Option("k", "the keyword to search for in the page  [required]");
    filenameOption.setArgName("KEYWORD");
    filenameOption.setLongOpt("keyword");
    filenameOption.setRequired(true);
    filenameOption.setArgs(1);

    options.addOption(filenameOption);

    return options;

  }
}