All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.dbpedia.extraction.mappings.wikitemplate.WiktionaryUtils.scala Maven / Gradle / Ivy

The newest version!
package org.dbpedia.extraction.mappings.wikitemplate

import org.dbpedia.extraction.mappings.WiktionaryPageExtractor
import org.dbpedia.extraction.mappings.wikitemplate.MyLinkNode._
import org.dbpedia.extraction.mappings.wikitemplate.MyNode._
import org.dbpedia.extraction.mappings.wikitemplate.MyNodeList._
import org.dbpedia.extraction.sources.WikiPage
import org.dbpedia.extraction.util.{Language, UriUtils}
import org.dbpedia.extraction.wikiparser._

import scala.collection.mutable.{ListBuffer, Stack}
import scala.language.{implicitConversions, reflectiveCalls}
import scala.util.control.Breaks._

/**
 * Exception that carries, besides its message, a set of variable bindings and optionally a node.
 *
 * @param s              the message; also handed to the [[Exception]] constructor
 * @param vars           the bindings attached to this exception
 * @param unexpectedNode the node attached to this exception, if any
 */
case class WiktionaryException(s: String, vars: VarBindingsHierarchical, unexpectedNode: Option[Node])
  extends Exception(s)

/**
 * Exception that only carries a message.
 *
 * @param s the message; also handed to the [[Exception]] constructor
 */
case class ContinueException(s: String)
  extends Exception(s)
/**
 * Extend the Stack class (by using a wrapper and an implicit conversion - Scala magic).
 */
class MyStack(s : Stack[Node]) {
  // The wrapped stack. The methods of this class operate on it, and the implicit
  // MyStack2Stack conversion in the companion object unwraps it again.
  val stack : Stack[Node] = s

  // Prepends `str` to the text found at the top of the wrapped stack.
  // NOTE(review): this copy looks truncated - a second `else` follows without a visible inner `if`,
  // `newhead` is never pushed back, and the closing braces are missing. Confirm against the original file.
  def prependString(str : String) : Unit  = {
    if(stack.size == 0){
      // empty stack: `str` becomes the only node, with line number 0
      stack push new TextNode(str,0)
    } else {
        // the top node is popped and cast to TextNode; `str` is put in front of its text, its line is kept
        val head = stack.pop
        val newhead = new TextNode(str + head.asInstanceOf[TextNode].text, head.line)
      } else {
        // here the top node stays on the stack; only its line number is reused for the new TextNode
        val newhead = new TextNode(str, stack.head.line)

  // Returns a String. NOTE(review): the body is not visible in this copy, so the format cannot be documented here.
  def toShortDumpString : String = {

  // Returns a String. NOTE(review): the body is not visible in this copy, so the format cannot be documented here.
  def toWikiText : String = {

  // Returns a String. NOTE(review): the body is not visible in this copy, so the format cannot be documented here.
  def toReadableString : String = {

  /**
   * Reverse a stack.
   * The reverse function returns (for some unknown reason) not a stack, but a Seq...
   */
  // Pushes all elements of `stack.reverse` onto a freshly created Stack[Node] and returns that new stack.
  // The wrapped stack itself is not modified. The element type is written out (as it is elsewhere in
  // this class) so the result type does not depend on inference.
  def reversed : Stack[Node]  = {
     new Stack[Node]().pushAll(stack.reverse)
  }

  /**
   * Return all nodes until we see a "list-end" node.
   * If "list-start" nodes occur on the way, we need to keep track of them, so we don't mistake their "lower-level" "list-end" nodes for the list end we are searching for.
   */
  // Pops nodes from the wrapped stack until the "list-end" marker that closes the current list is found,
  // and returns the popped nodes as a new stack. The closing marker itself is popped but not returned.
  // Markers are TemplateNodes titled "Extractiontpl" whose property "1" holds the marker type.
  // Throws if such a template has no property "1" or if its first child is not a TextNode.
  def getList : Stack[Node] = {
      val list = new ListBuffer[Node]()
      // nesting depth: number of inner "list-start" markers whose "list-end" has not been seen yet
      var i = 0
      breakable {
        while (stack.size > 0){
          val cur = stack.pop
          cur match {
            case tn : TemplateNode => {
              if(tn.title.decoded == "Extractiontpl"){
                val tplType = tn.property("1").get.children(0).asInstanceOf[TextNode].text
                tplType match {
                  case "list-start" =>     i += 1
                  case "list-end" =>       {
                    if(i == 0){
                      break  //return inner of the list. the list start was consumed before calling this function, now we return before appending the end marker
                    } else {
                      i -= 1
                    }
                  }
                  case _ =>
                }
              }
            }
            case _ =>
          }
          // every node except the closing marker is collected; prepending keeps the most recently popped node first
          list prepend cur
        }
      }
      val st = new Stack[Node]() pushAll list
      st
  }

  /**
   * Get the next "normal" node (no var node etc.).
   */
  // Returns an Option[Node].
  // NOTE(review): only the predicate is visible in this copy; the call it is passed to
  // (presumably a search over `stack`) is missing - confirm against the original file.
  def findNextNonTplNode() : Option[Node] =
      // predicate: true for a node that is neither an "Extractiontpl" TemplateNode nor a TextNode with empty text
      node => !(node.isInstanceOf[TemplateNode] && node.asInstanceOf[TemplateNode].title.decoded == "Extractiontpl") && !(node.isInstanceOf[TextNode] && node.asInstanceOf[TextNode].text == "")

  /**
   * Filters newline text nodes that come between an extractiontpl template and a section node.
   * Example page:
   * == sec ==
   * === sec2 ==
   * ...
   * It should be parsed with the template:
   * {{extractiontpl|list-start}}
   * == sec ==
   * {{extractiontpl|list-start}}
   * === sec 2 ==
   * ...
   * On the parsed page there will be only two section nodes.
   * On the parsed template there will be this:
   * (the listing that followed here is missing from this copy)
   */
  // Copies the nodes of the wrapped stack onto `otherStack`. A TextNode that starts with "\n", is longer
  // than one character and sits between a SectionNode and an "Extractiontpl" TemplateNode (in either
  // order) is copied with its first character removed. The first and the last node are copied unchanged.
  // NOTE(review): the `if (` opening the three-line condition, the operator joining its first two lines
  // (presumably `||`), the closing braces and whatever is done with `otherStack` afterwards are not
  // visible in this copy - confirm against the original file.
  def filterNewLines() = {
    val otherStack = new Stack[Node]()
    val list = stack.toList
    for(i <- list.indices) {
      // only inner positions have both a predecessor and a successor to inspect
      if(i > 0 && i < list.indices.last){
            (list(i-1).isInstanceOf[SectionNode] && list(i+1).isInstanceOf[TemplateNode] && list(i+1).asInstanceOf[TemplateNode].title.decoded == "Extractiontpl")
            (list(i+1).isInstanceOf[SectionNode] && list(i-1).isInstanceOf[TemplateNode] && list(i-1).asInstanceOf[TemplateNode].title.decoded == "Extractiontpl")
            && list(i).isInstanceOf[TextNode] && list(i).asInstanceOf[TextNode].text.startsWith("\n") && list(i).asInstanceOf[TextNode].text.length > 1
           otherStack push list(i).asInstanceOf[TextNode].copy(text=list(i).asInstanceOf[TextNode].text.substring(1)) //strip that additional newline
        } else {
          otherStack push list(i) //ok
      } else otherStack push list(i)

  // Walks a reversed copy of the wrapped stack and pushes nodes onto `stack`, leaving out empty text.
  // NOTE(review): no clearing of `stack` before the pushes is visible in this copy, and the closing
  // braces are missing - confirm against the original file.
  def filterEmptyTextNodes() = {
    val otherStack = stack.reverse
    //println("filterEmptyTextNodes called")
    otherStack.foreach(i=> {
      i match {
        // a TextNode is pushed only when its text is not empty
        case tn : TextNode => if(!tn.text.equals("")){stack.push(i)}
        // a SectionNode is pushed as a copy whose children no longer contain TextNodes with empty text
        case sn : SectionNode => stack.push(sn.copy(children=sn.children.filter((c) => !c.isInstanceOf[TextNode] || !c.asInstanceOf[TextNode].text.equals(""))))
        // every other node is pushed unchanged
        case _ => stack push i

  /**
   * Filter whitespace.
   */
  // Copies the nodes of the wrapped stack onto `otherStack`, rewriting the text of every TextNode.
  // NOTE(review): what happens to `otherStack` after the loop is not visible in this copy, and the
  // closing braces are missing - confirm against the original file.
  def filterSpaces() = {
    val otherStack = new Stack[Node]()
    //println("filter spaces called")
    stack.foreach(i=> {
      i match {
        // one replace pass each: two spaces become one space, then a space before "\n" is dropped
        case tn : TextNode => otherStack push tn.copy(text=tn.text.replace("  ", " ").replace(" \n", "\n"))
        // every other node is copied unchanged
        case _ => otherStack push i

  // Takes a predicate on Node. NOTE(review): the body is not visible in this copy, so its effect on
  // the stack and its result cannot be documented here.
  def dropUntilAndPop(f : (Node) => Boolean) = {

  // Copies the nodes of the wrapped stack onto `otherStack`. Each "list-start" / "list-end" marker
  // (an "Extractiontpl" TemplateNode) is replaced by the next marker popped from `thisReverse`.
  // NOTE(review): this copy is truncated in several places - the grouping parentheses and the closing
  // `)` of the filter predicate are missing, the `tplType` line has lost the start of its right-hand
  // side (presumably `tn.property("1")`, as in the predicate), the closing braces are missing, and it
  // is not visible what is done with `otherStack` afterwards. Confirm against the original file.
  def reverseSwapList() = {
    val otherStack = new Stack[Node]()
    // the markers of the stack in reversed order
    val thisReverse = stack.reverse.filter( (n : Node)=>
        n.isInstanceOf[TemplateNode] && 
        n.asInstanceOf[TemplateNode].title.decoded == "Extractiontpl" && 
            n.asInstanceOf[TemplateNode].property("1").get.children(0).asInstanceOf[TextNode].text == "list-start" || 
            n.asInstanceOf[TemplateNode].property("1").get.children(0).asInstanceOf[TextNode].text == "list-end"
    // not used in the part of the method visible here
    val thisClone = stack.clone

    stack.foreach(i=> {

      i match {
        case tn : TemplateNode => if(tn.title.decoded == "Extractiontpl"){
                val tplType ="1").get.children(0).asInstanceOf[TextNode].text
                tplType match {
                  case "list-start" => {
                    // a start marker is replaced by the next marker taken from the reversed marker list
                    val correspondingEnd = thisReverse.pop
                    otherStack push correspondingEnd
                  case "list-end" => {
                    // an end marker is replaced the same way
                    val correspondingStart = thisReverse.pop
                    otherStack push correspondingStart
                  case _ => otherStack push i
        } else {otherStack push i}
        case _ => otherStack push i

object MyStack {
  /**
   * These functions tell how to convert to the wrapper implicitly.
   */
  // Implicit view: wraps a plain Stack[Node] in a MyStack so the extra methods can be called on it.
  implicit def Stack2MyStack(s : Stack[Node]) : MyStack =
    new MyStack(s)
  // Implicit view: unwraps a MyStack, giving back the Stack[Node] it holds.
  implicit def MyStack2Stack(s : MyStack) : Stack[Node] =
    s.stack

  // Parser obtained from WikiParser.getInstance() when this object is initialised; fromString applies it.
  val parser = WikiParser.getInstance()
  /**
   * Parse a string as wiki syntax and return the nodes as a stack.
   */
  // Parses `in` with `parser` and returns the child nodes of the resulting page as a stack.
  // Throws Exception("Parser Error") when the parser result is empty.
  // NOTE(review): this copy is truncated - the closing parentheses of the `parser(new WikiPage(...))`
  // call, the body of the `else` branch, the body of the `prependedNewline` check, the returned value
  // and the closing braces are not visible. Confirm against the original file.
  def fromString(in : String) : Stack[Node] = {

    //fix restrictive parsing of sections (must be \n== xy ==\n - but in case start of file or end of file, the newlines are omitted)
    //force leading and trailing \n
    // the two flags record which newline was added, so the matching TextNode can be removed again below
    var prependedNewline = false
    var appendedNewline = false
    //and useless whitespaces
    val str = 
        (if(in.startsWith("=")){prependedNewline = true; "\n"} else {""}) +
        in +
        (if(in.endsWith("=")){appendedNewline = true; "\n"} else {""})

    //println("after normalizations >"+str+"<")
    // the text is parsed as a page in the main namespace with a fixed dummy title and English as language
    val page : PageNode = parser(
        new WikiPage(
          new WikiTitle("wiktionary extraction subtemplate", Namespace.Main, Language.English), str //parsing
    ).getOrElse(throw new Exception("Parser Error") )
    val nodes = new Stack[Node]()

    // if a newline was appended and the page ends with a "\n" TextNode, that node is left out
    if(appendedNewline && (page.children.last match {case TextNode("\n", _)=>true; case _ => false})){
      nodes.pushAll(page.children.reverse.tail) //without the last
    } else {
    // same check for the prepended newline, against the node on top of `nodes`
    if(prependedNewline && (nodes.head match {case TextNode("\n", _)=>true; case _ => false})){


    //println("dumping subtemplate ")
    //nodes.foreach((n: Node) => println(n.dumpStrShort))


  /**
   * Read a file containing wiki syntax and return the nodes as a stack.
   * Currently not used.
   */
  // Takes a name and returns a Stack[Node]. NOTE(review): the body is not visible in this copy.
  def fromParsedFile(name : String) : Stack[Node] = {


/**
 * Scala makes it possible to _kind of_ extend the language with your own constructs:
 * here the "keywords" measure and report are "defined".
 * Code within the measure block is executed with timekeeping (how many milliseconds the execution took).
 * The result is handed over to the report block, which needs to be a function (which prints it or so).
 */
object TimeMeasurement {
  /**
   * Result of [[measure]]: holds the code block, which is not executed until [[report]] is called.
   * A named class is used instead of an anonymous structural type so that calling `report`
   * does not go through reflection.
   */
  final class Measurement private[TimeMeasurement] (code : => Unit) {
    /**
     * Runs the code block once and hands the elapsed wall-clock time to `reporterFunc`.
     * If the block throws, the exception propagates and `reporterFunc` is not called.
     *
     * @param reporterFunc receives the duration in milliseconds
     */
    def report(reporterFunc : Long => Unit) : Unit = {
      val before = System.currentTimeMillis
      code
      val after = System.currentTimeMillis
      val duration = after - before
      reporterFunc(duration)
    }
  }

  /**
   * Starts a `measure { ... } report { ms => ... }` expression.
   *
   * @param code the block to time; evaluated by `report`, not by this call
   */
  def measure(code : => Unit) : Measurement = new Measurement(code)
}

/**
 * Extend the String class with some "inner-trim" functionality.
 */
class MyString(val str : String){
  /**
   * Collapses whitespace in `str`, applying four regex replacements in this order:
   *  1. a whitespace-only stretch between two line breaks becomes "\n\n",
   *  2. leading whitespace followed by a line break becomes "\n",
   *  3. a line break followed by trailing whitespace becomes "\n",
   *  4. every run of two or more whitespace characters becomes a single space.
   *
   * Line breaks count as whitespace, so step 4 also turns the "\n\n" produced by step 1 into one
   * space. Despite the name no final trim is performed: a single leading or trailing whitespace
   * character is kept.
   *
   * @return the collapsed string
   */
  def fullTrim() : String =
    str
      .replaceAll("\\r?\\n\\s{1,}\\r?\\n", "\n\n")
      .replaceAll("^\\s{1,}\\r?\\n", "\n")
      .replaceAll("\\r?\\n\\s{1,}$", "\n")
      .replaceAll("\\s{2,}", " ")
}

object MyString {
  // Implicit view: wraps a String in a MyString so that fullTrim() can be called on it.
  implicit def String2MyString(s : String) : MyString = new MyString(s)
  // Implicit view: unwraps a MyString, giving back the String it holds.
  implicit def MyString2String(s : MyString) : String = s.str
}

object Logging {
  var level = 0  // will be read from config and then overwritten
  val st_depth_start = new Exception("").getStackTrace.length + 1

  //print info about a function call, and the template and page (the first n nodes)
  def printFuncDump(name : String, tplIt : Stack[Node], pageIt : Stack[Node], thisLevel : Int) : Unit = {
    if(thisLevel <= level){
      val st_depth = new Exception("").getStackTrace.length - st_depth_start
      val prefix =  " " * st_depth
      println(prefix + "------------")
      println(prefix + "")
      println(prefix + "