All downloads are free. Search and download functionality uses the official Maven repository.

org.dbpedia.extraction.scripts.FixNTriplesEncoding.scala Maven / Gradle / Ivy

package org.dbpedia.extraction.scripts

import java.io.File
import org.dbpedia.extraction.util.RichFile.wrapFile
import org.dbpedia.extraction.util.StringUtils.prettyMillis
import org.dbpedia.extraction.util.NumberUtils
import scala.Console.err
import org.dbpedia.extraction.util.IOUtils
import java.lang.StringBuilder


/**
 * Encodes non-ASCII chars in N-Triples files.
 * DOES NOT ESCAPE DOUBLE QUOTES (") AND BACKSLASHES (\) - we assume that the file is mostly
 * in correct N-Triples format and just contains a few non-ASCII chars.
 *
 * For every input name, the file `<dir>/<name><suffix>` is read line by line and the escaped
 * lines are written to `<dir>/<name><extension><suffix>`. Progress is reported on stderr.
 *
 * Example call:
 * ../run FixNTriplesEncoding /data/dbpedia/links bbcwildlife,italian-public-schools _fixed _links.nt.gz
 */
object FixNTriplesEncoding {

  /** Number of processed lines between two progress messages. */
  private val LogInterval = 1000000L

  /**
   * Splits a comma-separated argument into its trimmed, non-empty parts.
   * @param arg comma-separated list, must not be null
   */
  private def split(arg: String): Array[String] =
    arg.split(",").map(_.trim).filter(_.nonEmpty)

  /**
   * Entry point.
   * @param args exactly four values: directory, comma-separated input names,
   * output name extension, file suffix
   * @throws IllegalArgumentException if the arguments are missing or empty
   */
  def main(args: Array[String]): Unit = {

    require(args != null && args.length == 4,
      "need four args: "+
      /*0*/ "directory, "+
      /*1*/ "comma-separated names of input files (e.g. 'bbcwildlife,bookmashup'), "+
      /*2*/ "output dataset name extension (e.g. '_fixed'), "+
      /*3*/ "file extension (e.g. '_links.nt.gz')"
    )

    val dir = new File(args(0))

    val inputs = split(args(1))
    require(inputs.nonEmpty, "no input file names")

    val extension = args(2)
    require(extension.nonEmpty, "no output name extension")

    // Suffix of input/output files, for example "_links.nt.gz"
    // This script works with .nt or .nq files using URIs, NOT with .ttl or .tql files and NOT with IRIs.
    val suffix = args(3)
    require(suffix.nonEmpty, "no input/output file suffix")

    for (input <- inputs) {
      val inFile = new File(dir, input + suffix)
      val outFile = new File(dir, input + extension + suffix)
      err.println(s"reading $inFile ...")
      err.println(s"writing $outFile ...")

      // Long counters: an Int would wrap around on inputs with more than 2^31 - 1 lines
      // and turn the progress report into negative numbers.
      var lineCount = 0L
      var changeCount = 0L
      val start = System.nanoTime
      val writer = IOUtils.writer(outFile)
      try {
        IOUtils.readLines(inFile) { line =>
          // NOTE(review): readLines apparently may hand over null (presumably as an
          // end-of-input marker) - confirm against IOUtils.readLines.
          if (line != null) {
            // TurtleEscaper instances carry mutable per-call state, so use a fresh one per line.
            val escaped = new TurtleEscaper().escapeTurtle(line)
            writer.write(escaped)
            writer.write('\n')
            // escapeTurtle returns the very same String instance when nothing had to be
            // escaped, so a reference comparison is enough to detect a change.
            if (! line.eq(escaped)) changeCount += 1
            lineCount += 1
            if (lineCount % LogInterval == 0) log(lineCount, changeCount, start)
          }
        }
      }
      finally writer.close()
      log(lineCount, changeCount, start)
    }

  }

  /**
   * Prints a progress message to stderr.
   * @param lines number of lines read so far
   * @param changed number of lines that were modified by escaping
   * @param start value of System.nanoTime when processing of the current file started
   */
  private def log(lines: Long, changed: Long, start: Long): Unit = {
    val micros = (System.nanoTime - start) / 1000
    // Without any lines the average would be NaN or Infinity, so leave it out.
    val perLine = if (lines > 0) " ("+(micros.toFloat / lines)+" micros per line)" else ""
    err.println("read "+lines+" lines, changed "+changed+" lines in "+prettyMillis(micros / 1000)+perLine)
  }

}

/**
 * Escapes a Unicode string for use in N-Triples / N-Quads files: line feed, carriage return
 * and tab become backslash escapes, every other code point outside the printable ASCII range
 * (0x20 to 0x7E) becomes a hexadecimal Unicode escape with four or eight digits.
 * DOES NOT ESCAPE DOUBLE QUOTES (") AND BACKSLASHES (\) - we assume that the file is mostly
 * in correct N-Triples format and just contains a few non-ASCII chars.
 *
 * The state of the current call is kept in unsynchronized private fields. An instance may be
 * reused for any number of sequential calls, but must not be shared between threads.
 */
class TurtleEscaper {

  /** Output buffer; stays null until the first character of the current input is escaped. */
  private var builder: java.lang.StringBuilder = null

  /** The string currently being escaped. */
  private var input: String = null

  /** Index in `input` of the first char that has not been copied to `builder` yet. */
  private var last = 0

  /**
   * Escapes a Unicode string according to Turtle / N-Triples format.
   * @param str must not be null
   * @return the escaped string, or the very same instance as `str` if nothing had to be
   * escaped (callers may rely on reference equality to detect changes)
   */
  def escapeTurtle(str: String): String = {
    input = str
    // Drop the buffer of any previous call. Without this reset a reused instance would
    // prepend the previous result to this one, or return stale content for clean input.
    builder = null
    last = 0
    var index = 0
    while (index < input.length)
    {
      val code = input.codePointAt(index)
      val replaced = escapeTurtle(index, code)
      // Supplementary code points occupy two chars; step over both.
      index += Character.charCount(code)
      if (replaced) last = index
    }
    // Flush the unescaped tail, if anything was escaped at all.
    if (builder == null) input else builder.append(input, last, index).toString
  }

  /**
   * Escapes a Unicode code point according to Turtle / N-Triples format.
   * @param index position of the code point in `input`
   * @param code Unicode code point
   * @return true if an escape sequence was appended, false if the code point can be
   * copied unchanged
   */
  private def escapeTurtle(index: Int, code: Int): Boolean = {
    // TODO: use a lookup table for c <= 0xA0? c <= 0xFF?
         if (code == '\n') append(index, "\\n")
    else if (code == '\r') append(index, "\\r")
    else if (code == '\t') append(index, "\\t")
    else if (code >= 0x0020 && code < 0x007F) false
    else if (code <= 0xFFFF) appendHex(index, 'u', code, 4)
    else appendHex(index, 'U', code, 8)
  }

  /**
   * Appends the pending unescaped chars, then a backslash, the escape letter and the
   * hexadecimal form of the code point.
   * @param index position of the code point in `input`
   * @param esc escape letter following the backslash
   * @param code Unicode code point
   * @param digits number of hex digits to write
   * @return always true
   */
  private def appendHex(index: Int, esc: Char, code: Int, digits: Int): Boolean = {
    append(index)
    builder.append('\\').append(esc)
    NumberUtils.intToHex(builder, code, digits)
    true
  }

  /**
   * Appends the pending unescaped chars, then the given escape sequence.
   * @param index position of the escaped code point in `input`
   * @param str escape sequence to append
   * @return always true
   */
  private def append(index: Int, str: String): Boolean = {
    append(index)
    builder.append(str)
    true
  }

  /**
   * Creates the buffer on first use and copies the chars of `input` between `last`
   * (inclusive) and `index` (exclusive) into it.
   * @param index position of the code point that is about to be escaped
   */
  private def append(index: Int): Unit = {
    if (builder == null) builder = new java.lang.StringBuilder
    builder.append(input, last, index)
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy