
com.splout.db.hadoop.SQLiteOutputFormat


Splout is a read-only, horizontally scalable SQL database that plays well with Hadoop.

package com.splout.db.hadoop;

/*
 * #%L
 * Splout SQL Hadoop library
 * %%
 * Copyright (C) 2012 Datasalt Systems S.L.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.IOException;
import java.io.Serializable;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.collections.MapUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.splout.db.common.HeartBeater;

/**
 * Low-level Pangool OutputFormat that can be used to generate partitioned SQL views in Hadoop. It accepts Tuples that
 * have "sql" strings and "partition" integers. Each partition will generate a different .db file named <partition>.db
 * <p>
 * Furthermore, the OutputFormat accepts a list of initial SQL statements that will be executed for each partition when
 * the database is created (e.g. CREATE TABLE and such). It also accepts finalization statements (e.g. CREATE INDEX).
 * <p>
 * This OutputFormat can be used as a basis for creating more complex OutputFormats such as
 * {@link TupleSQLite4JavaOutputFormat}.
 * <p>
 * Moreover, using this OutputFormat directly can result in poor-performing Jobs as it can't cache PreparedStatements
 * (it has to create a new Statement for every SQL it receives).
 */
@SuppressWarnings("serial")
public class SQLiteOutputFormat extends FileOutputFormat<ITuple, NullWritable> implements Serializable {

  public static Log LOG = LogFactory.getLog(SQLiteOutputFormat.class);

  // The Pangool Tuple Schema that will be received as output: SQL and partition number to write to
  public final static Schema SCHEMA = new Schema("sql", Fields.parse("sql:string, partition:int"));

  private static AtomicLong FILE_SEQUENCE = new AtomicLong(0);

  // Number of SQL inserts that will be performed before committing a transaction.
  // Recommended around 1000000 for performance.
  private int batchSize;

  List<String> initSqlStatements = null;
  List<String> endSqlStatements = null;

  public SQLiteOutputFormat(String[] initSqlStatements, String[] endSqlStatements, int batchSize) {
    if(initSqlStatements != null) {
      this.initSqlStatements = Arrays.asList(initSqlStatements);
    }
    if(endSqlStatements != null) {
      this.endSqlStatements = Arrays.asList(endSqlStatements);
    }
    this.batchSize = batchSize;
  }

  /*
   * The main complexity here resides in maintaining separate transactions for each partition. We use transactions for
   * efficiency (bulk inserting). For that purpose we cache the Statement, Connection and number of executed SQL
   * actions for each partition (statementPool, connectionPool, executedMap).
   */
  public class SQLRecordWriter extends RecordWriter<ITuple, NullWritable> {

    // Temporary and permanent Paths for properly writing Hadoop output files
    private Map<Integer, Path> permPool = new HashMap<Integer, Path>();
    private Map<Integer, Path> tempPool = new HashMap<Integer, Path>();

    // Transaction cache
    private Map<Integer, Statement> statementPool = new HashMap<Integer, Statement>();
    private Map<Integer, Connection> connectionPool = new HashMap<Integer, Connection>();
    private Map<Integer, Integer> executedMap = new HashMap<Integer, Integer>();

    private Configuration conf;
    private HeartBeater heartBeater;
    private TaskAttemptContext context;
    private FileSystem fs;

    public SQLRecordWriter(TaskAttemptContext context) throws IOException {
      this.context = context;
      long waitTimeHeartBeater = context.getConfiguration().getLong(HeartBeater.WAIT_TIME_CONF, 5000);
      heartBeater = new HeartBeater(context, waitTimeHeartBeater);
      heartBeater.needHeartBeat();
      conf = context.getConfiguration();
    }

    // This method is called one time per each partition
    protected void initSql(int partition) throws IOException {
      Path outPath = FileOutputFormat.getOutputPath(context);
      fs = outPath.getFileSystem(conf);
      Path perm = new Path(FileOutputFormat.getOutputPath(context), partition + ".db");
      Path temp = conf.getLocalPath("mapred.local.dir", partition + "." + FILE_SEQUENCE.incrementAndGet());
      fs.delete(perm, true); // delete old, if any
      fs.delete(temp, true); // delete old, if any
      Path local = fs.startLocalOutput(perm, temp);
      try {
        permPool.put(partition, perm);
        tempPool.put(partition, temp);
        // Load the sqlite-JDBC driver using the current class loader
        LOG.info("Initializing SQL connection [" + partition + "]");
        Class.forName("org.sqlite.JDBC");
        // Get a Connection
        Connection conn = DriverManager.getConnection("jdbc:sqlite:" + local.toString());
        connectionPool.put(partition, conn);
        // Create a Statement
        Statement st = conn.createStatement();
        statementPool.put(partition, st);
        // Execute initial statements
        if(initSqlStatements != null) {
          LOG.info("Executing initial SQL statements.");
          for(String sql : initSqlStatements) {
            LOG.info("Executing: " + sql);
            st.execute(sql);
          }
        }
        // PRAGMA optimizations as per http://www.sqlite.org/pragma.html#pragma_count_changes
        st.execute("PRAGMA synchronous=OFF");
        st.execute("PRAGMA journal_mode=OFF");
        st.execute("PRAGMA temp_store=FILE");
        // Init transaction
        st.execute("BEGIN");
      } catch(SQLException e) {
        throw new IOException(e);
      } catch(ClassNotFoundException e) {
        throw new IOException(e);
      }
    }

    // Subclasses may need access to the connection pool
    protected Map<Integer, Connection> getConnectionPool() {
      return connectionPool;
    }

    // Subclasses may need access to the statement pool
    public Map<Integer, Statement> getStatementPool() {
      return statementPool;
    }

    // Finalize: finish pending transactions, execute end statements
    @Override
    public void close(TaskAttemptContext ctx) throws IOException, InterruptedException {
      try {
        if(ctx != null) {
          heartBeater.setProgress(ctx);
        }
        for(Map.Entry<Integer, Statement> entry : statementPool.entrySet()) {
          LOG.info("Closing SQL connection [" + entry.getKey() + "]");
          Statement st = entry.getValue();
          st.execute("COMMIT");
          if(endSqlStatements != null) {
            LOG.info("Executing end SQL statements.");
            for(String sql : endSqlStatements) {
              LOG.info("Executing: " + sql);
              st.execute(sql);
            }
          }
          // Close the Statement only after the end statements have been executed against it
          st.close();
          // Hadoop - completeLocalOutput()
          fs.completeLocalOutput(permPool.get(entry.getKey()), tempPool.get(entry.getKey()));
        }
      } catch(SQLException e) {
        throw new IOException(e);
      } finally {
        // in any case, destroy the HeartBeater
        heartBeater.cancelHeartBeat();
      }
    }

    @Override
    public void write(ITuple tuple, NullWritable ignore) throws IOException, InterruptedException {
      String sql = tuple.get("sql").toString();
      int partition = (Integer) tuple.get("partition");
      try {
        if(statementPool.get(partition) == null) {
          // Execute warm-up for this partition
          initSql(partition);
        }
        Statement st = statementPool.get(partition);
        // Execute the SQL
        st.execute(sql);
        Integer exec = MapUtils.getInteger(executedMap, partition, 0);
        // Increment the number of statements executed for this partition
        exec++;
        if(exec % batchSize == 0) {
          // Close transaction and begin another one if > BATCH_SIZE
          st.execute("COMMIT");
          st.execute("BEGIN");
        }
        executedMap.put(partition, exec);
      } catch(SQLException e) {
        throw new IOException(e);
      } finally {
        heartBeater.cancelHeartBeat();
      }
    }
  }

  @Override
  public RecordWriter<ITuple, NullWritable> getRecordWriter(final TaskAttemptContext context) throws IOException,
      InterruptedException {
    return new SQLRecordWriter(context);
  }
}
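
The sketch below illustrates the contract described in the class javadoc: the OutputFormat is constructed with optional initial and finalization statements plus a batch size, and the tuples fed to its RecordWriter must follow SQLiteOutputFormat.SCHEMA (a "sql" string plus an integer "partition"). It is not part of the library: the logs table, the statements and the batch size are made-up values, and it assumes Pangool's Tuple(Schema) constructor and ITuple.set(field, value) API; in a real job the tuples would be emitted through the Pangool/MapReduce machinery rather than built standalone.

// Usage sketch (illustrative only, not part of the Splout source).
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Tuple;
import com.splout.db.hadoop.SQLiteOutputFormat;

public class SQLiteOutputFormatUsageSketch {

  public static void main(String[] args) {
    // Hypothetical per-partition schema setup (run when each .db file is created)
    // and finalization statements (run before each .db file is completed).
    String[] initStatements = { "CREATE TABLE logs (msg TEXT, category TEXT);" };
    String[] endStatements = { "CREATE INDEX idx_logs_category ON logs(category);" };

    // Commit a transaction every 1000000 inserts, in line with the batchSize recommendation above.
    SQLiteOutputFormat outputFormat = new SQLiteOutputFormat(initStatements, endStatements, 1000000);

    // Tuples handed to the RecordWriter must match SQLiteOutputFormat.SCHEMA:
    // the SQL string to execute and the partition whose <partition>.db file receives it.
    ITuple tuple = new Tuple(SQLiteOutputFormat.SCHEMA);
    tuple.set("sql", "INSERT INTO logs VALUES ('hello', 'greetings');");
    tuple.set("partition", 0);

    // In a real job the framework would now call the writer with (tuple, NullWritable.get()),
    // which lazily opens the SQLite connection for partition 0 and executes the INSERT
    // inside the current transaction.
  }
}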




