com.microsoft.accumulo.MLeapUtil.scala (Accumulo Connector for Apache Spark)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.microsoft.accumulo
import org.apache.spark.sql.types.StructField
import ml.combust.bundle.BundleFile
import ml.combust.mleap.runtime.MleapSupport._
import ml.combust.mleap.runtime.MleapContext.defaultContext
import org.apache.spark.sql.mleap.TypeConverters
import java.io.File
import java.util.{Base64, HashMap}
import java.net.URI
import java.nio.file.{Files, FileSystem, FileSystems, Path, StandardOpenOption}
import resource._
import ml.combust.mleap.core.types.ScalarType
import com.google.common.jimfs.{Jimfs, Configuration}
import com.microsoft.accumulo.zipfs.{ZipFileSystem, ZipFileSystemProvider}
@SerialVersionUID(1L)
object MLeapUtil {
  // Decode a base64-encoded MLeap bundle and map its output schema to Spark
  // Catalyst StructFields. Returns an empty sequence when no model is given.
  def mleapSchemaToCatalyst(modelBase64: String): Seq[StructField] = {
    if (modelBase64.isEmpty)
      Seq.empty[StructField]
    else {
      // Decode the bundle bytes and stage them as a zip file on an in-memory
      // (Jimfs) file system so nothing touches the local disk.
      val mleapBundleArr = Base64.getDecoder().decode(modelBase64)
      val fs = Jimfs.newFileSystem(Configuration.unix())
      val mleapFilePath = fs.getPath("/mleap.zip")
      Files.write(mleapFilePath, mleapBundleArr, StandardOpenOption.CREATE)
      // Why do we construct the ZipFileSystem directly instead of using the public APIs?
      // 1. MLeap only exposes a FileSystem layer to load models.
      // 2. We don't want to write to the local file system.
      //    2a. We use Google Jimfs as the in-memory file system.
      //    2b. We can't use https://github.com/marschall/memoryfilesystem as it has a 16MB file size limit.
      //    2c. We can't use Apache Commons VFS as it doesn't support directory listing.
      // 3. Usually one triggers the zip file system implementation by prefixing the URI with "jar:".
      //    Unfortunately, on Spark the file system provider disappears from the installed list
      //    (https://stackoverflow.com/questions/39500445/filesystem-provider-disappearing-in-spark),
      //    so the zip implementation cannot resolve the jimfs: scheme of the inner URI.
      // 4. The public entry points (e.g. FileSystems.newFileSystem(), new ZipFileSystemProvider().newFileSystem())
      //    have checks that restrict which FileSystemProvider may back the zip file.
      // Attempt 10: find the installed "jar" provider, but then we don't know whether the same methods exist :(
      // val zfsProvider = FileSystemProvider.installedProviders().asScala.filter(_.getScheme == "jar")
      // FileSystemProvider.installedProviders().asScala.foreach(p => println(p.getScheme))
      // Attempt 9: hard dependency on the Oracle JDK; fails on OpenJDK
      // (package-private constructor... *sigh*)
      // import com.sun.nio.zipfs.{ZipFileSystem, ZipFileSystemProvider}
      // val zfsCtor = classOf[ZipFileSystem].getDeclaredConstructor(
      //   classOf[ZipFileSystemProvider],
      //   classOf[java.nio.file.Path],
      //   classOf[java.util.Map[String, Object]])
      // zfsCtor.setAccessible(true)
      // val zfs = zfsCtor.newInstance(new ZipFileSystemProvider, mleapFilePath, new java.util.HashMap[String, Object])
      // Final approach: a modified copy of the OpenJDK ZipFileSystem whose constructor we can call.
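      // For reference, a sketch of the standard route described in point 3 above; this is
      // the call that fails on Spark because the zip provider can no longer resolve
      // jimfs: URIs (the empty HashMap is just the zip file system's options map):
      //   val zfs = FileSystems.newFileSystem(
      //     URI.create("jar:" + mleapFilePath.toUri), new HashMap[String, Object])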
      val zfs = new ZipFileSystem(new ZipFileSystemProvider, mleapFilePath, new HashMap[String, Object])
      // Load the bundle through scala-arm: managed(...) closes the BundleFile when the
      // for-comprehension completes, and .tried.get rethrows any failure.
      val mleapPipeline = (for (bf <- managed(BundleFile(zfs, zfs.getPath("/")))) yield {
        bf.loadMleapBundle().get.root
      }).tried.get
      // TODO: also process mleapPipeline.inputSchema to determine the required fields
      // Keep only scalar output fields; non-scalar types (e.g. tensors, lists) have
      // no direct Catalyst mapping here and are dropped.
      mleapPipeline.outputSchema.fields.flatMap { mleapField =>
        mleapField.dataType match {
          case _: ScalarType => Some(TypeConverters.mleapFieldToSparkField(mleapField))
          case _ => None
        }
      }
    }
  }
}
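
// A minimal usage sketch (not part of the connector): take a previously exported
// MLeap bundle zip, base64-encode it in the wire format mleapSchemaToCatalyst
// expects, and print the Catalyst fields of the pipeline's scalar outputs.
// The demo object and the bundle path are illustrative assumptions.
object MLeapUtilDemo {
  import java.nio.file.Paths

  def main(args: Array[String]): Unit = {
    // Hypothetical location of an MLeap bundle saved earlier.
    val bundleBytes = Files.readAllBytes(Paths.get("/tmp/pipeline.zip"))
    val modelBase64 = Base64.getEncoder().encodeToString(bundleBytes)

    // Recover the Spark-side schema of the model's scalar output columns.
    MLeapUtil.mleapSchemaToCatalyst(modelBase64)
      .foreach(f => println(s"${f.name}: ${f.dataType}"))
  }
}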