All Downloads are FREE. Search and download functionalities are using the official Maven repository.

input.parsers.FileInputParser.scala Maven / Gradle / Ivy

There is a newer version: 1.0.0-RC23
Show newest version
package avrohugger
package input
package parsers

import org.apache.avro.{ Protocol, Schema }
import org.apache.avro.Schema.Parser
import org.apache.avro.Schema.Type.{ RECORD, UNION, ENUM }
import org.apache.avro.compiler.idl.Idl
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord }
import org.apache.avro.file.DataFileReader

import java.io.File

import scala.collection.JavaConverters._

/**
 * Reads Avro input files and extracts the schemas and/or protocols they
 * define.
 *
 * The file type is chosen from the file extension: `.avro` (binary data
 * file), `.avsc` (JSON schema), `.avpr` (JSON protocol) or `.avdl` (IDL).
 */
class FileInputParser {
  /**
   * Schema parser used for `.avsc` files. The same instance is reused for
   * every call made on this `FileInputParser`.
   * NOTE(review): presumably shared so that types defined in earlier files can
   * be referenced by later ones -- confirm against callers.
   */
  lazy val parser: Parser = new Parser()

  /**
   * Parses `infile` into the schemas and/or protocols it defines.
   *
   * @param infile the Avro file to read; its extension selects the parser
   * @return for `.avro` and `.avsc`, a single `Left(schema)`; for `.avpr`, the
   *         protocol followed by those of its types whose namespace differs
   *         from the protocol's; for `.avdl`, the protocol followed by
   *         everything obtained from its imported files
   * @throws java.lang.RuntimeException if a top-level schema is not a record,
   *         an enum, or a union containing exactly one type
   * @throws java.lang.Exception if the file extension is not supported
   */
  def getSchemaOrProtocols(infile: File): List[Either[Schema, Protocol]] = {

    // Reduces a top-level schema to something that maps onto a definition.
    def unUnion(schema: Schema): Schema = {
      schema.getType match {
        // A top-level type may be wrapped in a union with no other types.
        case UNION =>
          val types = schema.getTypes.asScala.toList
          if (types.length == 1) types.head
          else sys.error("""Unions, beyond nullable fields, are not supported. 
            |Found a union of more than one type: """.trim.stripMargin + types)
        case RECORD | ENUM => schema
        case _ => sys.error("""Neither a record, enum nor a union of either. 
          |Nothing to map to a definition.""".trim.stripMargin)
      }
    }

    /*
     * Avro files may contain imported types from other namespaces. For .avdl
     * files, it is possible to see where the imported types come from, and if
     * they come from a protocol or idl, then they should be generated as part
     * of an ADT. For .avpr and .avsc files, it is NOT possible to determine if
     * imported types are part of an external idl or protocol, and they are
     * generated as simple stand-alone classes or enums.
     */
    infile.getName.split("\\.").last match {
      case "avro" =>
        val gdr = new GenericDatumReader[GenericRecord]
        val dfr = new DataFileReader(infile, gdr)
        // Only the header schema is needed: always close the reader so the
        // underlying file handle is released, even if unUnion fails.
        val schema = try unUnion(dfr.getSchema) finally dfr.close()
        List(Left(schema))
      case "avsc" =>
        val schema = unUnion(parser.parse(infile))
        List(Left(schema))
      case "avpr" =>
        val protocol = Protocol.parse(infile)
        val schemas = protocol.getTypes.asScala.toList
        // Types living in another namespace are emitted on their own, next to
        // the protocol itself.
        val otherNamespaceSchemaOrProtocols = schemas.filterNot(schema => {
          schema.getNamespace == protocol.getNamespace
        }).map(schema => Left(schema))
        Right(protocol) +: otherNamespaceSchemaOrProtocols
      case "avdl" =>
        val idlParser = new Idl(infile)
        // The Idl parser holds an open stream on the file: close it once the
        // protocol has been built, even if parsing fails.
        val protocol = try idlParser.CompilationUnit() finally idlParser.close()
        val importedFiles = IdlImportParser.getImportedFiles(infile)
        // Imported files are parsed recursively with the same rules.
        val importedSchemaOrProtocols = importedFiles.flatMap(imported => {
          getSchemaOrProtocols(imported)
        }).toList
        Right(protocol) +: importedSchemaOrProtocols
      case _ =>
        throw new Exception("""File must end in ".avpr" for protocol files, 
          |".avsc" for plain text json files, ".avdl" for IDL files, or .avro 
          |for binary.""".trim.stripMargin)
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy