org.apache.lens.rdd.LensRDDClient
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.lens.rdd;

import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.apache.lens.api.LensException;
import org.apache.lens.api.query.*;
import org.apache.lens.client.LensClient;
import org.apache.lens.client.LensClientResultSet;
import org.apache.lens.ml.algo.spark.HiveTableRDD;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.plan.AddPartitionDesc;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;

/**
 * <p>
 * Create an RDD from a Lens query. The user can poll the returned query handle with isReadyForRDD() until the RDD
 * is ready to be used.
 * </p>
 *
 * <p>Example -</p>
 *
 * <pre>
 *   LensRDDClient client = new LensRDDClient(javaSparkContext);
 *   QueryHandle query = client.createLensRDDAsync("SELECT msr1 from TEST_CUBE WHERE ...");
 *
 *   while (!client.isReadyForRDD(query)) {
 *     Thread.sleep(1000);
 *   }
 *
 *   JavaRDD<ResultRow> rdd = client.getRDD(query).toJavaRDD();
 *
 *   // Consume RDD here -
 *   rdd.map(...);
 * </pre>
 *
 * <p>Alternatively, in blocking mode -</p>
 *
 * <pre>
 *   JavaRDD<ResultRow> rdd = client.createLensRDD("SELECT msr1 from TEST_CUBE WHERE ...");
 * </pre>
 */
public class LensRDDClient {
  /** The Constant LOG. */
  public static final Log LOG = LogFactory.getLog(LensRDDClient.class);

  // Default input format for the table created from a Lens result set.
  /** The Constant INPUT_FORMAT. */
  private static final String INPUT_FORMAT = TextInputFormat.class.getName();

  // Default output format.
  /** The Constant OUTPUT_FORMAT. */
  private static final String OUTPUT_FORMAT = TextOutputFormat.class.getName();

  // Name of the partition column and its value. There is always exactly one partition in the table created from
  // the result set.
  /** The Constant TEMP_TABLE_PART_COL. */
  private static final String TEMP_TABLE_PART_COL = "dummy_partition_column";

  /** The Constant TEMP_TABLE_PART_VAL. */
  private static final String TEMP_TABLE_PART_VAL = "placeholder_value";

  /** The Constant HIVE_CONF. */
  protected static final HiveConf HIVE_CONF = new HiveConf();

  static {
    // Use a local embedded Derby metastore rather than a remote metastore service.
    HIVE_CONF.setVar(HiveConf.ConfVars.METASTOREURIS, "");
    HIVE_CONF.set("javax.jdo.option.ConnectionURL", "jdbc:derby:;databaseName=./metastore_db;create=true");
    HIVE_CONF.set("javax.jdo.option.ConnectionDriverName", "org.apache.derby.jdbc.EmbeddedDriver");
    HIVE_CONF.setBoolean("hive.metastore.local", true);
    HIVE_CONF.set("hive.metastore.warehouse.dir", "file://${user.dir}/warehouse");
  }

  /** The spark context. */
  private final JavaSparkContext sparkContext;

  /** The lens client. Initialized lazily. */
  private LensClient lensClient;

  /**
   * Create an RDD client with the given spark context.
   *
   * @param sparkContext the spark context
   */
  public LensRDDClient(JavaSparkContext sparkContext) {
    this.sparkContext = sparkContext;
  }

  /**
   * Create an RDD client with the given spark context.
   *
   * @param sc the spark context
   */
  public LensRDDClient(SparkContext sc) {
    this(new JavaSparkContext(sc));
  }

  /**
   * Instantiates a new Lens RDD client.
   *
   * @param sparkContext the spark context
   * @param lensClient   the lens client
   */
  public LensRDDClient(JavaSparkContext sparkContext, LensClient lensClient) {
    this.sparkContext = sparkContext;
    this.lensClient = lensClient;
  }

  /**
   * Instantiates a new Lens RDD client.
   *
   * @param sparkContext the spark context
   * @param lensClient   the lens client
   */
  public LensRDDClient(SparkContext sparkContext, LensClient lensClient) {
    this(new JavaSparkContext(sparkContext), lensClient);
  }

  private synchronized LensClient getClient() {
    if (lensClient == null) {
      lensClient = new LensClient();
    }
    return lensClient;
  }

  /**
   * API for non-blocking use.
   *
   * @param query the query
   * @return the query handle
   * @throws LensException the lens exception
   */
  public QueryHandle createLensRDDAsync(String query) throws LensException {
    return getClient().executeQueryAsynch(query, "");
  }

  /**
   * Check if the RDD is created. The RDD will be created as soon as the underlying Lens query is complete.
   *
   * @param queryHandle the query handle
   * @return true, if the RDD is ready
   * @throws LensException the lens exception
   */
  public boolean isReadyForRDD(QueryHandle queryHandle) throws LensException {
    QueryStatus status = getClient().getQueryStatus(queryHandle);
    return status.finished();
  }

  /**
   * Allows cancelling the underlying query in case of non-blocking RDD creation.
   *
   * @param queryHandle the query handle
   * @throws LensException the lens exception
   */
  public void cancelRDD(QueryHandle queryHandle) throws LensException {
    getClient().killQuery(queryHandle);
  }
  /**
   * Get the RDD created for the query. This should be used only if isReadyForRDD returns true.
   *
   * @param queryHandle the query handle
   * @return the RDD result
   * @throws LensException the lens exception
   */
  public LensRDDResult getRDD(QueryHandle queryHandle) throws LensException {
    QueryStatus status = getClient().getQueryStatus(queryHandle);
    if (!status.finished() && !status.isResultSetAvailable()) {
      throw new LensException(queryHandle.getHandleId() + " query not finished or result unavailable");
    }

    LensClient.LensClientResultSetWithStats result = getClient().getAsyncResults(queryHandle);
    if (result.getResultSet() == null) {
      throw new LensException("Result set not available for query " + queryHandle.getHandleId());
    }

    LensClientResultSet resultSet = result.getResultSet();
    QueryResultSetMetadata metadata = result.getResultSet().getResultSetMetadata();

    // TODO allow creating RDD from in-memory result sets
    if (!(resultSet.getResult() instanceof PersistentQueryResult)) {
      throw new LensException("RDDs only supported for persistent result sets");
    }

    PersistentQueryResult persistentQueryResult = (PersistentQueryResult) resultSet.getResult();

    String tempTableName;
    try {
      tempTableName = createTempMetastoreTable(persistentQueryResult.getPersistedURI(), metadata);
    } catch (HiveException e) {
      throw new LensException("Error creating temp table from result set", e);
    }

    // Now create one RDD backed by the temp table's single partition.
    JavaPairRDD<WritableComparable, HCatRecord> rdd = null;
    try {
      rdd = HiveTableRDD.createHiveTableRDD(sparkContext, HIVE_CONF, "default", tempTableName,
        TEMP_TABLE_PART_COL + "='" + TEMP_TABLE_PART_VAL + "'");
      LOG.info("Created RDD " + rdd.name() + " for table " + tempTableName);
    } catch (IOException e) {
      throw new LensException("Error creating RDD for table " + tempTableName, e);
    }

    return new LensRDDResult(rdd.map(new HCatRecordToObjectListMapper()).rdd(), queryHandle, tempTableName);
  }

  // Create a temp table with the schema of the result set, pointing at the persisted result location.
  /**
   * Creates the temp metastore table.
   *
   * @param dataLocation the data location
   * @param metadata     the metadata
   * @return name of the created table
   * @throws HiveException the hive exception
   */
  protected String createTempMetastoreTable(String dataLocation, QueryResultSetMetadata metadata)
    throws HiveException {
    String tableName = "lens_rdd_" + UUID.randomUUID().toString().replace("-", "_");

    Hive hiveClient = Hive.get(HIVE_CONF);
    Table tbl = hiveClient.newTable("default." + tableName);
    tbl.setTableType(TableType.MANAGED_TABLE);
    tbl.setInputFormatClass(INPUT_FORMAT);

    // Add columns
    for (ResultColumn rc : metadata.getColumns()) {
      tbl.getCols().add(new FieldSchema(rc.getName(), toHiveType(rc.getType()), "default"));
      LOG.debug("Adding column " + rc.getName() + " of type " + toHiveType(rc.getType()));
    }
    tbl.getPartCols().add(new FieldSchema(TEMP_TABLE_PART_COL, "string", "default"));

    hiveClient.createTable(tbl);
    LOG.info("Table " + tableName + " created");

    // Add the single placeholder partition to the table.
    AddPartitionDesc partitionDesc = new AddPartitionDesc("default", tableName, false);
    Map<String, String> partSpec = new HashMap<String, String>();
    partSpec.put(TEMP_TABLE_PART_COL, TEMP_TABLE_PART_VAL);
    partitionDesc.addPartition(partSpec, dataLocation);
    hiveClient.createPartitions(partitionDesc);
    LOG.info("Created partition in " + tableName + " for data in " + dataLocation);

    return tableName;
  }
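  // For illustration only (not in the original source): assuming a result set with columns
  // (msr1 double, dim1 string), the method above is roughly equivalent to the following HiveQL,
  // where <uuid> stands for the generated table-name suffix and <uri> for the persisted result
  // location passed in as dataLocation:
  //
  //   CREATE TABLE default.lens_rdd_<uuid> (msr1 double, dim1 string)
  //   PARTITIONED BY (dummy_partition_column string)
  //   STORED AS TEXTFILE;
  //
  //   ALTER TABLE default.lens_rdd_<uuid>
  //   ADD PARTITION (dummy_partition_column = 'placeholder_value') LOCATION '<uri>';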
  // Convert a Lens data type to the corresponding Hive data type.
  /**
   * To hive type.
   *
   * @param type the type
   * @return the string
   */
  private String toHiveType(ResultColumnType type) {
    return type.name().toLowerCase();
  }

  /**
   * Blocking call to create an RDD from a Lens query. Returns only when the query is complete.
   *
   * @param query the query
   * @return the lens RDD result
   * @throws LensException the lens exception
   */
  public LensRDDResult createLensRDD(String query) throws LensException {
    QueryHandle queryHandle = createLensRDDAsync(query);
    while (!isReadyForRDD(queryHandle)) {
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
        LOG.warn("Interrupted while waiting for query", e);
        break;
      }
    }
    return getRDD(queryHandle);
  }

  /**
   * Container object to store the RDD and the corresponding Lens query handle.
   */
  public static class LensRDDResult implements Serializable {

    /** The result RDD. Transient, so it is not serialized with the container. */
    private transient RDD<List<Object>> resultRDD;

    /** The lens query. */
    private QueryHandle lensQuery;

    /** The temp table name. */
    private String tempTableName;

    /**
     * Instantiates a new Lens RDD result.
     *
     * @param rdd           the rdd
     * @param lensQuery     the lens query
     * @param tempTableName the temp table name
     */
    public LensRDDResult(RDD<List<Object>> rdd, QueryHandle lensQuery, String tempTableName) {
      this.resultRDD = rdd;
      this.lensQuery = lensQuery;
      this.tempTableName = tempTableName;
    }

    /**
     * Instantiates a new Lens RDD result.
     */
    public LensRDDResult() {
    }

    public QueryHandle getLensQuery() {
      return lensQuery;
    }

    public RDD<List<Object>> getRDD() {
      return resultRDD;
    }

    /**
     * Recreate the RDD. This works if the result object was saved: as long as the metastore and the corresponding
     * HDFS directory are available, the result object should be able to recreate the RDD.
     *
     * @param sparkContext the spark context
     * @return the rdd
     * @throws LensException the lens exception
     */
    public RDD<List<Object>> recreateRDD(JavaSparkContext sparkContext) throws LensException {
      if (resultRDD == null) {
        try {
          JavaPairRDD<WritableComparable, HCatRecord> javaPairRDD = HiveTableRDD.createHiveTableRDD(sparkContext,
            HIVE_CONF, "default", tempTableName, TEMP_TABLE_PART_COL + "='" + TEMP_TABLE_PART_VAL + "'");
          resultRDD = javaPairRDD.map(new HCatRecordToObjectListMapper()).rdd();
          LOG.info("Created RDD " + resultRDD.name() + " for table " + tempTableName);
        } catch (IOException e) {
          throw new LensException("Error creating RDD for table " + tempTableName, e);
        }
      }
      return resultRDD;
    }

    public String getTempTableName() {
      return tempTableName;
    }

    /**
     * Delete the temp table. This should be done to release the underlying temp table.
     *
     * @throws LensException the lens exception
     */
    public void deleteTempTable() throws LensException {
      try {
        Hive hiveClient = Hive.get(HIVE_CONF);
        hiveClient.dropTable("default." + tempTableName);
        LOG.info("Dropped temp table " + tempTableName);
      } catch (HiveException e) {
        throw new LensException(e);
      }
    }
  }
}
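
Below is a minimal usage sketch, separate from the file above. It assumes a running Lens server reachable through the default LensClient configuration and a local Spark master; the cube name and the WHERE clause are placeholders in the style of the Javadoc example.

import java.util.List;

import org.apache.lens.api.LensException;
import org.apache.lens.rdd.LensRDDClient;
import org.apache.lens.rdd.LensRDDClient.LensRDDResult;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class LensRDDExample {
  public static void main(String[] args) throws LensException {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "lens-rdd-example");
    LensRDDClient client = new LensRDDClient(jsc);

    // Blocking call: polls the query status every second until the Lens query finishes.
    LensRDDResult result = client.createLensRDD("SELECT msr1 FROM test_cube WHERE ...");

    // Each row of the persisted result set is exposed as a List<Object> of column values.
    JavaRDD<List<Object>> rows = result.getRDD().toJavaRDD();
    System.out.println("Fetched " + rows.count() + " rows");

    // Release the temp Hive table backing the RDD once the data has been consumed.
    result.deleteTempTable();
    jsc.stop();
  }
}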



