All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.signalcollect.examples.WebCrawler.scala Maven / Gradle / Ivy

The newest version!
/*
 *  @author Philip Stutz
 *
 *  Copyright 2011 University of Zurich
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

package com.signalcollect.examples

import com.signalcollect._

/**
 *  Regular expression to match links in Html strings.
 *
 *  `hyperlink` matches the opening part of an anchor tag up to and including
 *  its quoted `href` attribute. Capture group 1 holds the link target without
 *  the surrounding quotes; both single- and double-quoted values are accepted
 *  and tag/attribute names are matched case-insensitively.
 */
object Regex {
  // Group 1 must stay the link target: Webpage reads it via `group(1)`.
  val hyperlink = """(?i)<a\s[^>]*?href\s*=\s*["']([^"']+)["']""".r
}

/**
 *  Proof of concept combination of the PageRank vertices with a WebCrawler.
 *
 *  Builds a graph, seeds it with a single start page that is crawled to a
 *  depth of 2, executes the computation, prints every vertex followed by the
 *  execution statistics, and finally shuts the graph down.
 *
 *  Do not use on a larger scale:
 *    - does not respect robots.txt
 *    - lacks proper user agent string
 */
object WebCrawler extends App {
  val graph = GraphBuilder.build
  // The graph is shut down even if seeding, execution or printing fails.
  val stats = try {
    graph.addVertex(new Webpage("http://www.ifi.uzh.ch/ddis/", 2))
    val executionStats = graph.execute
    graph.foreachVertex(println(_))
    println(executionStats)
    executionStats
  } finally {
    graph.shutdown
  }
}

/**
 *  Adds linked webpages as vertices to the graph and connects them with a link edge.
 *
 *  @param id            URL of the page; used both as the vertex id and as the address to download
 *  @param crawlDepth    number of link hops that may still be followed from this page;
 *                       values of 0 or less disable crawling for this vertex
 *  @param dampingFactor forwarded to [[PageRankVertex]] and to the vertices created for linked pages
 */
class Webpage(id: String, crawlDepth: Int, dampingFactor: Double = 0.85) extends PageRankVertex(id, dampingFactor) {

  /**
   *  This method gets called by the framework after the vertex has been fully initialized.
   *
   *  When `crawlDepth` is positive the page is downloaded (decoded as ISO-8859-1), every
   *  link found by `Regex.hyperlink` is added as a new [[Webpage]] vertex with a reduced
   *  crawl depth, and a [[PageRankEdge]] from this page to the linked page is added.
   *  Crawling is best effort: non-fatal failures are reported on stderr and otherwise ignored.
   */
  override def afterInitialization(graphEditor: GraphEditor[Any, Any]): Unit = {
    import scala.util.control.NonFatal

    super.afterInitialization(graphEditor)
    if (crawlDepth > 0) {
      try {
        val source = io.Source.fromURL(id, "ISO-8859-1")
        // Read the page fully, then release the underlying stream right away.
        val webpage = try source.mkString finally source.close()
        Regex.hyperlink.findAllIn(webpage).matchData
          // Skip matches without a usable first capture group instead of failing mid-page.
          .collect { case m if m.groupCount >= 1 && m.group(1) != null => m.group(1) }
          .foreach { linked =>
            // Linked pages use this page's damping factor rather than the default.
            graphEditor.addVertex(new Webpage(linked, crawlDepth - 1, dampingFactor))
            graphEditor.addEdge(id, new PageRankEdge(linked))
          }
      } catch {
        // Unreachable or malformed pages must not abort the crawl, but fatal
        // errors and interrupts are allowed to propagate.
        case NonFatal(e) =>
          Console.err.println(s"Could not crawl $id: $e")
      }
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy