All downloads are FREE. Search and download functionality uses the official Maven repository.

com.tribbloids.spookystuff.pipeline.transformer.google.GoogleSearchTransformer.scala Maven / Gradle / Ivy

The newest version!
package com.tribbloids.spookystuff.pipeline.transformer.google

import java.util.UUID

import com.tribbloids.spookystuff.actions._
import com.tribbloids.spookystuff.dsl
import com.tribbloids.spookystuff.http.HttpUtils
import com.tribbloids.spookystuff.pipeline.DynamicSetter
import com.tribbloids.spookystuff.sparkbinding.PageRowRDD


/**
 * Pipeline stage that runs a Google search for the configured input and joins
 * the pages behind the result links.
 *
 * The stage opens `http://www.google.com/`, types the value of [[InputCol]] into
 * the `q` input, submits the `btnG` button, explores the "Next" links in the
 * page footer up to [[Pages]] deep, and finally joins on the targets of the
 * `.g h3.r a` result links.
 *
 * @param uid identifier of this stage; defaults to the canonical class name
 *            followed by `_` and a random UUID
 */
class GoogleSearchTransformer(
    override val uid: String =
      classOf[GoogleSearchTransformer].getCanonicalName + "_" + UUID.randomUUID().toString
) extends DynamicSetter {

  import dsl._
  import org.apache.spark.ml.param._

  /**
   * Param for input column name. No default is registered for it in this class.
   * @group param
   */
  final val InputCol: Param[String] = new Param[String](this, "inputCol", "input column name")

  /**
   * Param for the number of pages; passed as `maxDepth` when following "Next" links.
   * Defaults to 0.
   * @group param
   */
  final val Pages: Param[Int] = new Param[Int](this, "Pages", "number of pages")

  /**
   * Param for the output page number column name; passed as `depthKey`.
   * Defaults to null.
   * @group param
   */
  final val PageCol: Param[String] = new Param[String](this, "PageCol", "output page number column name")

  /**
   * Param for the output index number column name; passed as `ordinalKey`.
   * Defaults to null.
   * @group param
   */
  final val IndexCol: Param[String] = new Param[String](this, "IndexCol", "output index number column name")

  setDefault(Pages -> 0, PageCol -> null, IndexCol -> null)

  /**
   * Performs the search and joins the pages behind the result links.
   *
   * @param dataset rows to run the search for
   * @return the result of the fetch / explore / join chain applied to `dataset`
   */
  override def transform(dataset: PageRowRDD): PageRowRDD = {

    dataset
      .fetch(
        Visit("http://www.google.com/") +>
          TextInput("input[name=\"q\"]", $(InputCol)) +>
          Submit("input[name=\"btnG\"]")
      )
      .wgetExplore(
        S"div#foot a:contains(Next)",
        maxDepth = $(Pages),
        depthKey = $(PageCol),
        optimizer = Narrow
      )
      .wgetJoin(
        // NOTE(review): the extraction logic is deliberately kept inside this lambda so
        // that it references no members of the transformer, as in the original closure.
        S".g h3.r a".hrefs.flatMap { uri =>
          val target = Option(HttpUtils.uri(uri).getQuery) match {
            // No query string at all: the href is used as-is.
            case None => Some(uri)
            // Redirect-style link ("/url?..."): the target is the value of the `q` parameter.
            // Only the leading "q=" key is stripped; the previous replaceAll("q=", "")
            // also removed any "q=" occurring inside the target itself (e.g. "?faq=1").
            case Some(query) if uri.contains("/url?") =>
              query.split("&").find(_.startsWith("q=")).map(_.stripPrefix("q="))
            // Any other link that carries a query string is dropped.
            case Some(_) => None
          }
          target
        },
        ordinalKey = $(IndexCol),
        failSafe = 2 //not all links are viable
      )
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy