All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.dbpedia.extraction.mappings.WiktionaryPageExtractor.scala Maven / Gradle / Ivy

There is a newer version: 4.1
Show newest version
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.wikiparser._
import org.dbpedia.extraction.util.Language
import java.util.Locale
import org.dbpedia.extraction.destinations.{Quad, Dataset}
import org.openrdf.model.{Literal, URI, Resource, Value, Statement }
import org.openrdf.model.impl.{ValueFactoryImpl }
import util.control.Breaks._
import java.io.FileNotFoundException
import java.io.FileWriter
import java.net.URLEncoder
import java.net.URL
import java.lang.StringBuffer
import xml.{XML, Node => XMLNode, NodeSeq}
import scala.util.matching.Regex
import collection.mutable.{HashMap, Stack, ListBuffer, HashSet}
import org.dbpedia.extraction.mappings.wikitemplate._
import org.dbpedia.extraction.mappings.wikitemplate.wiktionary.bindinghandler._
import org.dbpedia.extraction.mappings.wikitemplate.wiktionary.postprocessor._

//some of my utilities
import org.dbpedia.extraction.mappings.wikitemplate.MyNodeList._
import org.dbpedia.extraction.mappings.wikitemplate.MyNode._
import org.dbpedia.extraction.mappings.wikitemplate.MyStack._
import org.dbpedia.extraction.mappings.wikitemplate.TimeMeasurement._
import org.dbpedia.extraction.mappings.wikitemplate.VarBinder._
import org.dbpedia.extraction.mappings.wikitemplate.Logging._
import org.dbpedia.extraction.mappings.wikitemplate.MyLinkNode._
import Matcher._
import WiktionaryPageExtractor._ //companion


/**
 * parses (wiktionary) wiki pages
 * is meant to be configurable for multiple languages
 *
 * is even meant to be usable for non-wiktionary wikis -> arbitrary wikis, but where all pages follow a common schema
 * but in contrast to infobox-focused extraction, we *aim* to be more flexible:
 * dbpedia core is hardcoded extraction. here we try to use a meta-language describing the information to be extracted
 * this is done via xml containing wikisyntax snippets (called templates) containing placeholders (called variables), which are then bound
 *
 * we also extended this approach to match the wiktionary schema
 * a page can contain information about multiple entities (sequential blocks)
 *
 * @author Jonas Brekle 
 * @author Sebastian Hellmann 
 */

class WiktionaryPageExtractor(
  context : {
   def redirects : Redirects
   def language : Language
  }
)
extends PageNodeExtractor {
  override val datasets = Set(datasetURI) //new Dataset("wiktionary"))

  override def extract(page: PageNode, subjectUri: String, pageContext: PageContext): Seq[Quad] =
  {
    //return new Graph()
    val cache = new Cache

    Logging.printMsg("start "+subjectUri+" threadID="+Thread.currentThread().getId(),2)
    // wait a random number of seconds. kills parallelism - otherwise debug output from different threads is mixed
    if(logLevel > 0){
      val r = new scala.util.Random
      Thread sleep r.nextInt(10)*1000
    }

    val quads = new ListBuffer[Statement]()

    val entityId = page.title.decoded
    cache.savedVars("entityId") = entityId

    //skip some useless pages
    //println(entityId)
    for(start <- ignoreStart){
      if(entityId.startsWith(start)){
        Logging.printMsg("ignored "+entityId,1)
        return quads.map(mapStatement(_))
      }
    }
    for(end <- ignoreEnd){
      if(entityId.endsWith(end)){
        Logging.printMsg("ignored "+entityId,1)
        return quads.map(mapStatement(_))
      }
    }

    Logging.printMsg("processing "+entityId, 1)

    measure {
      //this is also the base-url (all nested blocks will get uris with this as a prefix)
      cache.blockURIs("page") = vf.createURI(resourceNS + urify(entityId))
      cache.savedVars("block") = cache.blockURIs("page").toString

      //generate template-unrelated triples that are configured for the page
      try {
        quads appendAll makeTriples(pageConfig.rt, new HashMap[String, List[Node]], pageConfig.name, cache)
      } catch {
        case e : WiktionaryException => Logging.printMsg("#error generating page triples: "+e.toString, 2)
      }

      //println(page.children)

      val pageStack = new Stack[org.dbpedia.extraction.wikiparser.Node]().pushAll(page.children.reverse).filterSpaces
      //apply "first" nodehandlers
      beforeNodeHandlers.foreach((nh : NodeHandler) => {
        var result = nh.process(pageStack, cache.blockURIs("page").toString, cache, Map(), pageConfig)
        if(result.isInstanceOf[NodeHandlerTriplesResult]){
          quads appendAll result.asInstanceOf[NodeHandlerTriplesResult].triples
        }
      })

      val proAndEpilogBindings : ListBuffer[Tuple2[Tpl, wikitemplate.VarBindingsHierarchical]] = new ListBuffer
      //handle prolog (beginning) (e.g. "see also") - not related to blocks, but to the main entity of the page
      for(prolog <- languageConfig \ "page" \ "prologs" \ "template"){
        val prologtpl = Tpl.fromNode(prolog)
        Logging.printMsg("try "+prologtpl.name, 2)
        try {
          proAndEpilogBindings.append( (prologtpl, VarBinder.parseNodesWithTemplate(prologtpl.wiki.clone, pageStack)) )
        } catch {
          case e : WiktionaryException => proAndEpilogBindings.append( (prologtpl, e.vars) )
        }
      }

      //handle epilog (ending) (e.g. "links to other languages") by parsing the page backwards
      val rev = new Stack[Node] pushAll pageStack //reversed
      if(rev.headOption.isDefined && (!rev.head.isInstanceOf[TextNode] || (rev.head.isInstanceOf[TextNode] && !rev.head.asInstanceOf[TextNode].text.equals("\n")))){
        rev.push(new TextNode("\n",0))
      }
      for(epilog <- languageConfig \ "page" \ "epilogs" \ "template"){
        val epilogtpl = Tpl.fromNode(epilog)
        try {
          proAndEpilogBindings.append( (epilogtpl, VarBinder.parseNodesWithTemplate(epilogtpl.wiki.reverseSwapList, rev)) )
        } catch {
          case e : WiktionaryException => proAndEpilogBindings.append( (epilogtpl, e.vars) )
        }
      }
      Logging.printMsg(proAndEpilogBindings.size+ " prologs and epilogs found", 2)

      //apply consumed nodes (from the reversed page) to pageStack  (unreversed)
      pageStack.clear
      pageStack pushAll rev

      //handle the bindings from pro- and epilog
      proAndEpilogBindings.foreach({case (tpl : Tpl, tplBindings : VarBindingsHierarchical) => {
        try {
          quads appendAll handleFlatBindings(tplBindings.getFlat(), pageConfig, tpl, cache, cache.blockURIs("page").getURI)
        } catch { case _ : Throwable => } //returned bindings are wrong
      }})
      Logging.printMsg("pro- and epilog bindings handled", 2)

      //keep track where we are in the page block hierarchy
      val curOpenBlocks = new ListBuffer[Block]()
      curOpenBlocks append pageConfig
      var curBlock : Block = pageConfig
      var counter = 0
      //keep track if we consumed at least one node in this while run - if not, drop one node at the end
      var consumed = false
      while(pageStack.size > 0){
        counter += 1
        if(counter % 100 == 0){
          Logging.printMsg(""+counter+"/"+page.children.size+" nodes inspected "+entityId,1)
        }
        Logging.printMsg("page node: "+pageStack.head.toWikiText, 2)
        consumed = false

        //collect open blocks. reverse them, so innermost blocks are regarded first (when matching their templates and indicator templates)
        val possibleBlocks = curOpenBlocks.map(_.blocks).reverse.flatten //:: pageConfig
        //val possibleTemplates = curOpenBlocks.foldLeft(List[Tpl]()){(all,cur)=>all ::: cur.templates}
        //println("possibleTemplates="+ possibleTemplates.map(t=>t.name) )
        //println("possibleBlocks="+ possibleBlocks.map(_.name) )

        //try matching this blocks templates
        var triedTemplatesCounter = 0
        while(triedTemplatesCounter < curBlock.templates.size && pageStack.size > 0){
          for(tpl <- curBlock.templates){
            triedTemplatesCounter += 1
            //println(""+triedTemplatesCounter+"/"+curBlock.templates.size)
            //for(tpl <- possibleTemplates){
            Logging.printMsg("trying template "+tpl.name, 3)
            val pageCopy = pageStack.clone
            //println(pageStack.take(1).map(_.dumpStrShort).mkString)
            try {
              //println("vs")
              //println(block.indTpl.tpl.map(_.dumpStrShort).mkString )

              val blockBindings = VarBinder.parseNodesWithTemplate(tpl.wiki.clone, pageStack)

              //generate triples
              //println(tpl.name +": "+ blockBindings.dump())
              quads appendAll handleFlatBindings(blockBindings.getFlat(), curBlock, tpl, cache, cache.blockURIs(curBlock.name).getURI)

              //no exception -> success -> stuff below here will be executed on success
              consumed = true
              //reset counter, so all templates need to be tried again
              triedTemplatesCounter = 0
              Logging.printMsg("finished template "+tpl.name+" successfully", 3)
            } catch {
              case e : WiktionaryException => restore(pageStack, pageCopy) //did not match
            }
          }
        }

        if(pageStack.size > 0){
          parsingNodeHandlers.foreach((nh : NodeHandler) => {
            var result = nh.process(pageStack, cache.blockURIs("page").toString, cache, Map(), pageConfig)
            if(result.isInstanceOf[NodeHandlerTriplesResult]){
              quads appendAll result.asInstanceOf[NodeHandlerTriplesResult].triples
            }
          })
        }

        if(pageStack.size > 0){
          // try recognizing block starts of blocks. if recognized we go somewhere UP the hierarchy (the block ended) or one step DOWN (new sub block)
          // each block has a "indicator-template" (indTpl)
          // when it matches, the block starts. and from that template we get bindings that describe the block

          Logging.printMsg("trying block indicator templates. page node: "+pageStack.head.toWikiText, 3)
          breakable {
            for(block <- possibleBlocks){
              for(blockIndTpl <- block.indTpl){
                //println("name="+blockIndTpl.name)
                //these two are need in case of exeption to undo stuff
                val pageCopy = pageStack.clone
                val blockURICopy = cache.savedVars("block")
                val consumedCopy = consumed
                try {
                  //println("vs")
                  //println(blockIndTpl.tpl.map(_.dumpStrShort).mkString )
                  val blockIndBindings = VarBinder.parseNodesWithTemplate(blockIndTpl.wiki.clone, pageStack)

                  val parentBlockUri = if(!curOpenBlocks.contains(block)){
                    cache.blockURIs(curBlock.name).getURI
                  } else {
                    cache.blockURIs(block.parent.name).getURI
                  }
                  cache.savedVars("block") = parentBlockUri

                  quads appendAll handleFlatBindings(blockIndBindings.getFlat(), block, blockIndTpl, cache, parentBlockUri)

                  //no exception -> success -> stuff below here will be executed on success
                  consumed = true

                  curBlock = block //switch to new block
                  Logging.printMsg("block indicator template "+blockIndTpl.name+" matched", 2)

                  //check where in the hierarchy the new opended block is
                  if(!curOpenBlocks.contains(block)){
                    // the new block is not up in the hierarchy
                    // go one step down/deeper
                    //println("new block detected: "+block.name+" in curOpenBlocks= "+curOpenBlocks.map(_.name))
                    curOpenBlocks append block
                  } else {
                    //the new block somewhere up the hierarchy
                    //go to the parent of that
                    //println("parent block detected: "+block.name+" in curOpenBlocks= "+curOpenBlocks.map(_.name))
                    val newOpen = curOpenBlocks.takeWhile(_ != block)
                    curOpenBlocks.clear()
                    curOpenBlocks.appendAll(newOpen)
                    //take a new turn
                    curOpenBlocks.append(block)// up
                  }

                  //generate template-unrelated triples that are configured for the block
                  try {
                    quads appendAll makeTriples(block.rt, new HashMap[String, List[Node]], block.name, cache)
                  } catch {
                    case e : WiktionaryException => Logging.printMsg("error generating block triples: "+e.toString, 2)
                  }
                  break; //dont match another block indicator template right away (continue with this blocks templates)
                } catch {
                  case e : WiktionaryException => {
                    //did not match
                    Logging.printMsg("exception while trying to match template "+e.toString, 2)
                    restore(pageStack, pageCopy) //restore the page -> dont consume anything
                    cache.savedVars("block") = blockURICopy //restore the current block URI
                    consumed = consumedCopy
                  }
                }
              }
            }
          }
        }
        if(!consumed){
          var anyMatched = false
          afterNodeHandlers.foreach((nh : NodeHandler) => {
            var result = nh.process(pageStack, cache.blockURIs("page").getURI, cache, Map(), pageConfig)
            if(result.isInstanceOf[NodeHandlerTriplesResult]){
              anyMatched = true
              quads appendAll result.asInstanceOf[NodeHandlerTriplesResult].triples
            }
          })

          if(!anyMatched){
            Logging.printMsg("skipping unconsumable node ", 2)

            val unconsumeableNode = pageStack.pop
            unconsumeableNode match {
              case tn : TextNode => if(tn.text.startsWith(" ") || tn.text.startsWith("\n")){
                val afterSpace = tn.text.substring(1)
                if(afterSpace.length != 0){
                  pageStack.push(tn.copy(text=afterSpace))
                }
              } else {
                //skip this line
                val lineEnd = tn.text.indexOf("\n")
                val afterFirstBreak = tn.text.substring(tn.text.indexOf("\n")+1)
                if(lineEnd >= 0 && afterFirstBreak.length != 0){
                  pageStack.push(tn.copy(text=afterFirstBreak))
                }
              }
              case _ =>
            }
          }
        }
      }

    } report {
      duration : Long => Logging.printMsg("took "+ duration +"ms", 1)
    }

    //remove duplicate triples
    val quadsDistinct = quads.groupBy(_.toString).map(_._2.head).toList

    //postprocessing (schema transformation)
    val quadsPostProcessed = if(postprocessor.isDefined && quadsDistinct.size > 0){
      postprocessor.get.process(quadsDistinct, cache.blockURIs("page").getURI).groupBy(_.toString).map(_._2.head).toList
    } else quadsDistinct

    val quadsFiltered = quadsPostProcessed.filter((q : Statement) =>
           !q.getSubject.toString().equals(resourceNS)
        && !q.getSubject.toString().equals(resourceNS+"-")
        && !q.getObject.toString().equals("")
        && !q.getPredicate.toString().equals(""))

    //sorting (by length, if equal length: alphabetical)
    val quadsSorted = quadsFiltered.sortWith((q1 : Statement, q2 : Statement)=>{
      val subjComp = myCompare(q1.getSubject.toString(), q2.getSubject.toString())
      if(subjComp == 0){
        myCompare(q1.getPredicate.toString(), q2.getPredicate.toString()) < 0
      } else subjComp < 0
    })

    val statQuad = vf.createStatement(cache.blockURIs("page"), vf.createURI(termsNS+"statistics"), vf.createLiteral(""+quadsSorted.size+"-"+(page.toWikiText.count(_.equals('\n'))+1-countEmptyLines(page.toWikiText))))
    val quadsWithStat = quadsSorted ::: List(statQuad)

    Logging.printMsg(""+quadsSorted.size+" quads extracted for "+entityId, 1)
    quadsWithStat.foreach( q => { Logging.printMsg(q.toString, 1) } )
    Logging.printMsg("finish "+subjectUri+" threadID="+Thread.currentThread().getId(), 2)

    quadsWithStat.map(mapStatement(_))
  }
}

object WiktionaryPageExtractor {

  val emptyLinePat = java.util.regex.Pattern.compile( """^\s*$""", java.util.regex.Pattern.MULTILINE )
  private def countEmptyLines(s:String) : Int = {
    val m= emptyLinePat.matcher(s)
    var i = 0
    while(m.find()){
      i = i+1
    }
    i
  }

  def handleFlatBindings(bindings : VarBindings, block : Block, tpl : Tpl, cache : Cache, thisBlockURI : String) : List[Statement] = {
    val quads = new ListBuffer[Statement]

    if(tpl.pp.isDefined){
        val handler = Class.forName(tpl.pp.get.clazz).newInstance.asInstanceOf[BindingHandler]
        return handler.process(bindings, thisBlockURI, cache, tpl.pp.get.parameters)
    }
    var bindingsMatchedAnyResultTemplate = false
    bindings.foreach( (binding : HashMap[String, List[Node]]) => {
      Logging.printMsg("bindings "+binding, 2)
      Logging.printMsg("", 2)
      tpl.resultTemplates.foreach( (rt : ResultTemplate) => {
        try {
          quads appendAll makeTriples(rt, binding, block.name, cache) 
          bindingsMatchedAnyResultTemplate = true
        } catch {
          case e1:java.util.NoSuchElementException => Logging.printMsg("missing binding: "+e1, 3)
          case e : Throwable => Logging.printMsg("exception while processing bindings: "+e, 3)
        }
      })
    })
    if(!bindingsMatchedAnyResultTemplate){
        throw new WiktionaryException("result templates were not satisfiable", new VarBindingsHierarchical, None)    
    }
    quads.toList
  }

  def escape(s:String) : String = s.replace("(", escapeSeqOpenBrace).replace(")", escapeSeqCloseBrace).replace(",", escapeSeqComma)
  def unescape(s:String) : String = s.replace(escapeSeqOpenBrace, "(").replace(escapeSeqCloseBrace, ")").replace(escapeSeqComma, ",")

  def makeTriples(rt : ResultTemplate, binding : HashMap[String, List[org.dbpedia.extraction.wikiparser.Node]], blockName : String, cache : Cache) : List[Statement] = {
          Logging.printMsg("bindings "+binding, 2)
          val quads = new ListBuffer[Statement]
          rt.triples.foreach( (tt : TripleTemplate) => {
            try{
            //apply the same placeholder-processing to s, p and o => DRY:
            val in = Map("s"->tt.s, "p"->tt.p, "o"->tt.o,"oLang"->tt.oLang, "oDatatype"->tt.oDatatype)
            val out = in.map( kv => {
                if(kv._2 == null)
                  (kv._1, null)
                else {
                  val replacedVars = varPattern.replaceAllIn( kv._2, (m) => {
                    val varName = m.matched.substring(1);//exclude the $ symbol
                    val replacement = if(binding.contains(varName)){
                        binding(varName).toReadableString
                    } else if(cache.savedVars.contains(varName)){
                        cache.savedVars(varName)
                    } else if(!tt.optional){
                        throw new Exception("missing binding for "+varName)
                    } else {
                        //println("continue")
                        throw new ContinueException("skip optional triple")
                    }
                    //to prevent, that recorded ) symbols mess up the regex of map and uri
                    val varValue = escape(replacement)
                    if(saveVars.contains(varName)){
                        cache.savedVars(varName.toUpperCase) = varValue
                    }
                    varValue
                  })
                  //println("replacedVars: "+replacedVars)
                  var functionsResolved = replacedVars
                  var matched = false
                  var i = 0
                  do{ //replace function calls inside out (the pattern each time only matches the innermost fun)
                      i += 1
                      matched = false
                      functionsResolved = funPattern.replaceAllIn(functionsResolved, (m) => {
                          matched = true
                          val funName = m.group(1)
                          val funArgEscaped = m.group(2)
                          val funArg = unescape(funArgEscaped)
                          val res = funName match {
                            case "uri" => urify(funArg)
                            case "clean" => clean(funArg)
                            case "map" => map(funArg)
                            case "lastword" => funArg.split(" ").last.trim
                            case "assertMapped" => if(!hasMapping(funArg)){throw new Exception("assertion failed: existing key in mapings for "+funArg)} else {funArg}
                            case "assertNumeric" => if(!funArg.forall(_.isDigit)){throw new Exception("assertion failed: numeric "+funArg)} else {funArg}
                            case "getId" => cache.matcher.getId(funArg)
                            case "makeId" => cache.matcher.makeId(funArg)
                            case "getOrMakeId" => cache.matcher.getOrMakeId(funArg)
                            case "saveId" => {val args = funArgEscaped.split(", "); val str = args(0); val id = args(1); cache.matcher.saveId(id, str); id}
                            case _ =>  {matched = false; funArg} // if unknown keep arg only
                          }
                          res
                      })
                  } while (matched && i < 10)
                  (kv._1, unescape(functionsResolved))
                }
            })
            
            val s = out("s")
            val p = out("p")
            val o = out("o")
            val oDatatype = out("oDatatype")
            val oLang = out("oLang")

            //to trigger exceptions if URL doesnt start with http:// etc, i dont use a real regex because its to heavy for every URI            
            new URL(s)
            if(s.contains(" ")){
                throw new Exception("invalid subject URI <"+s+">")
            }
            new URL(p)
            if(p.contains(" ")){
                throw new Exception("invalid predicate URI <"+p+">")
            }
            Logging.printMsg("emmiting triple <"+s+"> <"+p+"> /"+o+"/", 2)
            //determine if o is literal or URI
            val oObj = if(tt.oType == "URI"){
                new URL(o)
                if(o.contains(" ")){
                    throw new Exception("invalid object URI <"+o+">")
                }
                val oURI = vf.createURI(o)
                if(tt.oNewBlock){
                  cache.blockURIs(blockName) = oURI //save for later
                  //println(blockURIs)
                  Logging.printMsg("entering "+oURI, 2)
                  cache.savedVars("block") = o
                }
                oURI
              } else {
                if (oLang == null)
                  vf.createLiteral(o.trim, language)
                else if(oLang.equals("null"))
                  vf.createLiteral(o.trim)
                else
                  vf.createLiteral(o.trim, oLang)
              }

            quads += vf.createStatement(vf.createURI(s), vf.createURI(p), oObj)
            } catch {
              case ce : ContinueException => //skip
              case e : Exception => throw e //propagate
            }
          })
    quads.toList
  }

  //load config from xml
  private val config = XML.loadFile("config/config.xml")

  val properties : Map[String, String] = (config \ "properties" \ "property").map(
      (n : XMLNode) =>
        ( (n \ "@name").text,
          (n \ "@value").text
        )
      ).toMap

  val language = properties("language")
  val logLevel = properties("logLevel").toInt
  
  val ns = properties.get("ns").getOrElse("http://undefined.com/")
  val resourceNS = ns +"resource/"
  val termsNS = ns +"terms/"

  /**
  * OpenRDF factory
  */
  val vf = new ValueFactoryImpl()

  /**
  * Quad needs some objects, store them here to reuse them
  */

  /**
  * the name of the dataset
  */
  val datasetURI : Dataset = new Dataset("wiktionary.dbpedia.org")
  
  /**
  * the graph of all produced quads
  */
  val tripleContext = vf.createURI(ns.replace("http://", "http://"+language+"."))
  
  Logging.level = logLevel
  Logging.printMsg("wiktionary loglevel = "+logLevel,0)


  val varPattern = new Regex("\\$[a-zA-Z0-9]+")  

  val funPattern = new Regex("([a-zA-Z0-9]+)\\(([^)(]*)\\)")

  val escapeSeqOpenBrace = "%#*+~&" //an unlikely sequence of characters (duh)
  val escapeSeqCloseBrace = "%#*+~/" //an unlikely sequence of characters (duh)
  val escapeSeqComma = "%#*+~§" //an unlikely sequence of characters (duh)

  val matchingStrategy = properties.get("matchingStrategy").getOrElse("levenshtein")
  val matchingThreshold = properties.get("matchingThreshold").getOrElse("levenshtein")

  private val languageConfig = XML.loadFile("config/config-"+language+".xml")

  private val mappings : Map[String, String] = (languageConfig \\ "mapping").map(
      (n : XMLNode) => //language specific mappings (from language ("Spanisch") to global vocabulary ("spanish"))
        ( (n \ "@from").text,
          (n \ "@to").text
        )
      ).toMap ++ (config \ "mappings"\ "mapping").map(
      (n : XMLNode) => // general mappings (from language codes ("es") to global vocabulary ("spanish"))
        ( (n \ "@from").text,
          (n \ "@to").text
        )
      ).toMap

  /**
  * a list of var names that should be saved
  */
  val saveVars = (languageConfig \ "saveVars" \ "var").map((n : XMLNode) => (n \ "@name").text)

  /**
  * a list of strings, that - if the page title starts with it - makes the extractor skip it
  */
  val ignoreStart = (languageConfig \ "ignore" \ "page").filter(_.attribute("startsWith").isDefined).map((n : XMLNode) => (n \ "@startsWith").text)
  
  /**
  * a list of strings, that - if the page title ends with it - makes the extractor skip it
  */
  val ignoreEnd = (languageConfig \ "ignore" \ "page").filter(_.attribute("endsWith").isDefined).map((n : XMLNode) => (n \ "@endsWith").text)

  /**
  * a optional PostProcessor instance, it can transform the generated Quads 
  */
  val postprocessor : Option[PostProcessor] = if((languageConfig \ "postprocessing" \ "@enabled").text.equals("true")){Some(Class.forName((languageConfig \ "postprocessing" \ "@ppClass").text).getConstructor(classOf[NodeSeq]).newInstance((languageConfig \ "postprocessing" \ "config")).asInstanceOf[PostProcessor])} else None

  /**
  * load nodeHandlers from config
  */
  val nodeHandlers: Map[String, List[NodeHandler]] = (languageConfig \ "nodeHandlers" \ "nodeHandler").map(_.head).toList.groupBy((n : XMLNode) => (n \ "@order").text).mapValues( (configs : List[XMLNode]) => {
      configs.map( (n : XMLNode) => {

        if(n.attribute("nhClass").isDefined){
            Class.forName((n \ "@nhClass").text).getConstructor(classOf[NodeSeq]).newInstance((n)).asInstanceOf[NodeHandler]
        } else {
            if(n.attribute("builtin").isDefined){
                (n \ "@builtin").text match {
                  case "InfoBoxMapper" => new InfoBoxMapper(n)
                 case _ => throw new Exception("unknown nodeHandler. builtin type unknown.")
                }
            } else {
                throw new Exception("unknown nodeHandler. use nhClass or builtin attributes.")
            }
        }
    })
  })

  val parsingNodeHandlers : List[NodeHandler] = if(nodeHandlers.contains("parsing")){nodeHandlers("parsing")} else List[NodeHandler]()
  val beforeNodeHandlers : List[NodeHandler] = if(nodeHandlers.contains("before")){nodeHandlers("before")} else List[NodeHandler]()
  val afterNodeHandlers : List[NodeHandler] = if(nodeHandlers.contains("after")){nodeHandlers("after")} else List[NodeHandler]()

  val templateRepresentativeProperty = (languageConfig \ "templateRepresentativeProperties" \ "templateRepresentativeProperty").map(n=> ((n \ "@tplName").text, (n \ "@pKey").text) ).toMap
  
  /**
  * the root of the config
  */
  val pageConfig = new Page((languageConfig \ "page")(0))


  /**
  * urify
  * do some pseudo-urlencoding 
  */
  def urify(in:String):String = in.replace("	"," ")
    .replace(" ", "_")
    .replace("'", "")
    .replace("(", "")
    .replace(")", "")
    .replace("[ ", "")
    .replace("]", "")
    .replace("{", "")
    .replace("}", "")
    .replace("*", "")
    .replace("+", "")
    .replace(";", "")
    .replace("#", "")
    .replace("/", "")
    .replace("\\", "")
    .replace("<", "")
    .replace(">", "")
  //dont use real encoder because Wiktioanary uses UTF-8 characters in URLs too
  //URLEncoder.encode(Matcher.clean(in), "UTF-8")

  val cleanPattern = new Regex("[^\\p{L} ]")
  def getCleaned(dirty:String) = cleanPattern.replaceAllIn(dirty, "").trim

  def myCompare(s1 : String, s2 : String) : Int = {
      val s1l = s1.length
      val s2l = s2.length 
      if(s1l == s2l){
        s1.compareTo(s2) 
      } else {
        s1l - s2l
      }
  }

  def hasMapping(in:String):Boolean = {
    val clean = getCleaned(in)
    mappings.contains(clean)
  }

  def map(in:String):String = {
    val clean = getCleaned(in)
    if(!mappings.contains(clean))
      Logging.printMsg("missing mapping for \""+clean+"\"", 0)
    mappings.getOrElse(clean, clean)
  }

  def mapStatement(s:Statement):Quad = {
    new Quad(
      if(s.getObject.isInstanceOf[Literal] && s.getObject.asInstanceOf[Literal].getLanguage != null){
        s.getObject.asInstanceOf[Literal].getLanguage
      } else {
        null
      },
      datasetURI.name,
      s.getSubject.toString(),
      s.getPredicate.toString(),
      s.getObject.toString(),
      tripleContext.toString(),
      if(s.getObject.isInstanceOf[Literal] && s.getObject.asInstanceOf[Literal].getDatatype != null){
        s.getObject.asInstanceOf[Literal].getDatatype.toString
      } else if(s.getObject.isInstanceOf[Literal]) {
        "http://www.w3.org/2001/XMLSchema#string"
      } else {
        null
      }
    )
  }
}

class Matcher {
  /**
   * a map from id to used glosses for that id
   */
  val ids = new HashMap[String, ListBuffer[String]]()

  /**
   * a map from gloss to id
   */
  val cache = new HashMap[String, String]()

  /**
   * threshold for glosses to match
   * sim(glossA, glossB) >= threshold => id(glossA) := id(glossB)
   */
  val threshold = WiktionaryPageExtractor.matchingThreshold.toFloat


  /**
   * find best id for gloss
   * may return ("", 0) if all glosses are equally bad
   */
  def matchId(gloss:String) : Tuple2[String, Float] = {
    var max = 0.0f
    var maxId = ""
    val glossCleaned = clean(gloss)

    for(id <- ids.keySet){
      for(source <- ids(id)){
        val cur = sim(glossCleaned, source)
        if(cur > max){
          max = cur
          maxId = id
        }
      }
    }
    (maxId, max)
  }

  /**
   * get best id for gloss, None if below threshold
   */
  def getIdOption(gloss : String) : Option[String] = {
    val glossCleaned = clean(gloss)
    if(cache.contains(glossCleaned)){return Some(cache(glossCleaned))}
    val matched = matchId(gloss)
    if(matched._2 < threshold)
        None
    else {
        saveId(matched._1, gloss)
        Some(matched._1)
    }
  }

  /**
   * find best id for gloss, make one if all equally bad
   */
  def getId(gloss : String) : String = {
    val idOption = getIdOption(gloss)
    if(!idOption.isDefined){
      makeId(gloss)
    } else {
      idOption.get
    }
  }

  /**
   * get id if one exists that matches over threshold, create new if not
   */
  def getOrMakeId(gloss : String) : String = {
    val glossCleaned = clean(gloss)
    if(cache.contains(glossCleaned)){return cache(glossCleaned)}
    val id = getIdOption(gloss)
    if(!id.isDefined){
      makeId(gloss)
    } else {
      saveId(id.get, gloss)
      id.get
    }
  }

  /**
   * create new (incremental) id
   */
  def makeId(gloss : String) : String = {
    val id = (ids.size + 1).toString
    saveId(id, gloss)
    id
  }

  /**
   * add gloss for id, create id if not exists yet
   */
  def saveId(id : String, gloss : String) : Unit = {
    val glossCleaned = clean(gloss)
    if(!ids.contains(id)){
       ids(id) = new ListBuffer[String]() //create
    }
    //avoid double add, but should be harmless
    if(!ids(id).contains(glossCleaned)){
        //add
        ids(id).append(glossCleaned) 
    }
    cache(glossCleaned) = id
  }
  def clear = ids.clear
}
object Matcher{
  def sim(s1 : String, s2 : String) : Float = {
    if(s1.equals(s2)){
      //equal match
      1.0f
    } else if (s1.containsSlice(s2) || s2.containsSlice(s1)){
      //substring match
      0.9f
    } else {
      //harder to computer similarity
      WiktionaryPageExtractor.matchingStrategy match {
        case "levenshtein" => levenshtein(s1, s2)
        case "dice" =>        dice(ngrams(s1,3), ngrams(s2,3))
        case "jaccard" =>     jaccard(ngrams(s1,3), ngrams(s2,3))
        case "overlap" =>     overlap(ngrams(s1,3), ngrams(s2,3))
        case _ => levenshtein(s1, s2)
      }
    }
  }

  def ngrams(s:String, n:Int) : Set[String] = {
    (("#"*(n-1))+s+("#"*(n-1))).sliding(n).toSet
  }

  def jaccard(s1ngrams : Set[String], s2ngrams: Set[String]) : Float = {
    val intersectSize = s1ngrams.intersect(s2ngrams).size
    val unionSize = s1ngrams.union(s2ngrams).size
    if(unionSize == 0){
      1.0f
    } else {
      intersectSize.toFloat / unionSize
    }
  }
  def dice(s1ngrams : Set[String], s2ngrams: Set[String]) : Float = {
    val intersectSize = s1ngrams.intersect(s2ngrams).size
    val summedSize = s1ngrams.size + s2ngrams.size
    if(summedSize == 0){
      1.0f
    } else {
      (2 * intersectSize).toFloat / summedSize
    }
  }
  def overlap(s1ngrams : Set[String], s2ngrams: Set[String]) : Float = {
    val intersectSize = s1ngrams.intersect(s2ngrams).size
    val s1Size = s1ngrams.size
    val s2Size = s2ngrams.size
    val minSize = if(s1Size < s2Size) s1Size else s2Size
    if(minSize == 0){
      0.0f
    } else {
      intersectSize.toFloat / minSize
    }
  }
  def levenshtein_dist(s1: String, s2: String) : Int = {
    val memo = scala.collection.mutable.Map[(List[Char],List[Char]),Int]()
    def min(a:Int, b:Int, c:Int) = scala.math.min( scala.math.min( a, b ), c)
    def sd(s1: List[Char], s2: List[Char]): Int = {
      if (memo.contains((s1,s2)) == false)
        memo((s1,s2)) = (s1, s2) match {
          case (_, Nil) => s1.length
          case (Nil, _) => s2.length
          case (c1::t1, c2::t2)  => min( sd(t1,s2) + 1, sd(s1,t2) + 1,
            sd(t1,t2) + (if (c1==c2) 0 else 1) )
        }
      memo((s1,s2))
    }

    sd( s1.toList, s2.toList )
  }
  def levenshtein(s1: String, s2: String) : Float = {
    val s1Size = s1.size
    val s2Size = s2.size
    1 - (levenshtein_dist(s1, s2).toFloat / (scala.math.max(s1Size, s2Size) ))
  }

  val cleaner = "\\W".r

  def clean(in:String) = cleaner.replaceAllIn(in.toLowerCase,"")
}

class Cache {
  //to cache last used blockURIs (from block name to its uri)
  val blockURIs = new HashMap[String, URI]
  //to save variable bindings
  val savedVars = new HashMap[String, String]()
  //which variables to save
  val matcher = new Matcher
}






© 2015 - 2025 Weber Informatics LLC | Privacy Policy