/*
* Copyright (C) 2014 Dell, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dell.doradus.client.utils;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.helpers.MessageFormatter;
import com.dell.doradus.client.ApplicationSession;
import com.dell.doradus.client.Client;
import com.dell.doradus.client.OLAPSession;
import com.dell.doradus.client.QueryResult;
import com.dell.doradus.common.ApplicationDefinition;
import com.dell.doradus.common.DBObject;
import com.dell.doradus.common.FieldDefinition;
import com.dell.doradus.common.FieldType;
import com.dell.doradus.common.TableDefinition;
import com.dell.doradus.common.Utils;
/**
* CSVDumper dumps all objects in a given Doradus application to CSV files. It is the
* counterpart to {@link CSVLoader}. CSVDumper opens a Doradus database and dumps all
* objects for each table to its own CSV file. It uses a configurable number of workers to
* dump tables in parallel. Each file is called "{table name}.csv" and is written into the
* current directory. Existing files, if present, are replaced. Each CSV record holds all
* scalar and link field values for one object. The values of scalar collections and link
* fields are concatenated into a single value, separated by "~". For example, the scalar
* collection Colors could become a column with the value "red~blue~green". All column
* names match the corresponding field name except for the _ID field, whose name can be
* configured. The default is "Key".
*
* Default options are defined in {@link CSVConfig} and can be overridden
* programmatically via {@link CSVConfig#set(String, String)} or via parameters to
* {@link #main(String[])}. For example, the parameter "-optimize true" causes only one
* link in each bi-directional relationship to be dumped, thereby reducing file size.
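*
* For example, a command-line invocation might look like the following (the application
* name and host shown here are hypothetical; see {@link CSVConfig} for all recognized
* parameters):
* <pre>{@code
* java com.dell.doradus.client.utils.CSVDumper -app MyApp -host localhost -optimize true
* }</pre>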
*
* Because this utility only dumps fields named in the schema, it will skip any unnamed
* fields. The files created by CSVDumper can be loaded back into another database using
* CSVLoader as long as the same schema is used and all fields are named.
*
* This utility is suitable for small databases (millions of objects), but dumping larger
* databases to CSV files is probably impractical.
*/
public class CSVDumper {
// Default values.
private static final int LINK_FANOUT_SAMPLE_SIZE = 100;
// Members:
private CSVConfig m_config = CSVConfig.instance();
private Client m_client;
private ApplicationSession m_session;
private Map<String, Set<String>> m_dbSuppressedFieldMap;
private AtomicLong m_totalTables = new AtomicLong();
private AtomicLong m_totalObjects = new AtomicLong();
private AtomicLong m_totalBytes = new AtomicLong();
private ApplicationDefinition m_appDef;
private Iterator<TableDefinition> m_tableIterator;
private List<DumpWorker> m_workerList = new ArrayList<DumpWorker>();
// Logging interface:
private Logger m_logger = LoggerFactory.getLogger(getClass().getSimpleName());
//----- Public methods
/**
* Run with "-?" to get details on program arguments. Also, see {@link CSVConfig} for
* a description of parameters.
*
* @param args Program arguments.
*/
public static void main(String[] args) {
try {
CSVDumper app = new CSVDumper();
app.parseArgs(args);
app.run();
System.exit(0);
}
catch (Exception e) {
e.printStackTrace();
System.exit(1);
}
} // main
/**
* Performs the CSV dump as defined by the {@link CSVConfig} singleton object.
* This method can be called instead of {@link #main(String[])} for programmatic
* access. Parameters should be set first by direct access to the CSVConfig
* object or by calling {@link CSVConfig#set(String, String)}.
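*
* A minimal programmatic sketch, assuming {@link CSVConfig#set(String, String)} accepts
* the same keys as the command-line flags without the leading dash (as {@code parseArgs}
* passes them); the application name is hypothetical:
* <pre>{@code
* CSVConfig config = CSVConfig.instance();
* config.set("app", "MyApp");      // hypothetical application name
* config.set("optimize", "true");  // dump only one link of each bi-directional pair
* new CSVDumper().run();
* }</pre>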
*/
public void run() {
loadSchema();
m_tableIterator = m_appDef.getTableDefinitions().values().iterator();
m_logger.info("Starting {} workers", m_config.workers);
long startTime = (new Date()).getTime();
for (int workerNo = 1; workerNo <= m_config.workers; workerNo++) {
DumpWorker dumpWorker = new DumpWorker(workerNo);
dumpWorker.start();
m_workerList.add(dumpWorker);
}
for (DumpWorker dumpWorker : m_workerList) {
try {
dumpWorker.join();
} catch (InterruptedException e) {
// ignore
}
}
long stopTime = System.currentTimeMillis();
m_logger.info("Total tables dumped: {}", m_totalTables.get());
m_logger.info("Total objects dumped: {}", m_totalObjects.get());
m_logger.info("Total bytes written: {}", m_totalBytes.get());
long totalMillis = stopTime - startTime;
m_logger.info("Total time for dump: {}", Utils.formatElapsedTime(totalMillis));
m_logger.info("Average objects/sec: {}", ((m_totalObjects.get() * 1000) / totalMillis));
m_logger.info("Average bytes/sec: {}", ((m_totalBytes.get() * 1000) / totalMillis));
} // run
//----- Package private methods used by DumpWorker
// Return the next table to be dumped.
synchronized TableDefinition nextTableDef() {
if (m_tableIterator.hasNext()) {
m_totalTables.incrementAndGet();
return m_tableIterator.next();
}
return null;
} // nextTableDef
// Increment the number of objects dumped
void incrementObjectCount(long count) {
m_totalObjects.addAndGet(count);
} // incrementObjectCount
// Increment the number of bytes dumped
void incrementObjectByteCount(long count) {
m_totalBytes.addAndGet(count);
} // incrementObjectByteCount
/**
* Return true if the given {@link FieldDefinition} should be suppressed when its
* table is dumped to a CSV file. Currently, this will only occur for link fields
* when -optimize is specified.
*
* @param fieldDef {@link FieldDefinition} of a field.
* @return True if the field should be suppressed.
*/
boolean shouldSuppressField(FieldDefinition fieldDef) {
if (!m_config.optimize) {
return false;
}
Set<String> suppressedFields = m_dbSuppressedFieldMap.get(fieldDef.getTableName());
return suppressedFields != null && suppressedFields.contains(fieldDef.getName());
} // shouldSuppressField
//----- Private methods
private static void usage() {
display("Usage: CSVDumper ");
display("where are:");
display(" -app Doradus application name. Default is: {}", CSVConfig.DEFAULT_APPNAME);
display(" -batchsize <#> Batch size. Default is: {}", CSVConfig.DEFAULT_BATCH_SIZE);
display(" -compress [T|F] Compress messages. Default is: {}", CSVConfig.DEFAULT_COMPRESS);
display(" -host Doradus server host name. Default is: {}", CSVConfig.DEFAULT_HOST);
display(" -id Column name of ID field. Default is: {}", CSVConfig.DEFAULT_ID_FIELD);
display(" -port Doradus server port. Default is: {}", CSVConfig.DEFAULT_PORT);
display(" -root Root folder of CSV files. Default is: {}", CSVConfig.DEFAULT_ROOT);
display(" -shard (OLAP only): Name of shard to load. Default is: {}", CSVConfig.DEFAULT_SHARD);
display(" -workers <#> # of worker threads. Default is: {}", CSVConfig.DEFAULT_WORKERS);
display("Reads all records in all tables for the given OLAP or Spider application and dumps");
display("them to CSV files found in 'root' folder. TLS options are also available.");
System.exit(0);
} // usage
// Write the given message to stdout only. Uses {}-style parameters
private static void display(String format, Object... args) {
System.out.println(MessageFormatter.arrayFormat(format, args).getMessage());
} // display
// Parse args into CSVConfig object.
private void parseArgs(String[] args) {
int index = 0;
while (index < args.length) {
String name = args[index];
if (name.equals("-?") || name.equalsIgnoreCase("-help")) {
usage();
}
if (name.charAt(0) != '-') {
m_logger.error("Unrecognized parameter: {}", name);
usage();
}
if (++index >= args.length) {
m_logger.error("Another parameter expected after '{}'", name);
usage();
}
String value = args[index];
try {
m_config.set(name.substring(1), value);
} catch (Exception e) {
m_logger.error(e.toString());
usage();
}
index++;
}
if (!m_config.root.endsWith(File.separator)) {
m_config.root += File.separator;
}
} // parseArgs
// Connect to the Doradus server and download the requested application's schema.
private void loadSchema() {
m_logger.info("Loading schema for application: {}", m_config.app);
m_client = new Client(m_config.host, m_config.port, m_config.getTLSParams());
m_session = m_client.openApplication(m_config.app); // throws if unknown app
m_appDef = m_session.getAppDef();
if (m_config.optimize) {
computeLinkFanouts();
}
} // loadSchema
// Simple class for holding mutable float objects.
static class MutableFloat {
float m_value;
MutableFloat(float value) {
m_value = value;
}
} // static class MutableFloat
// Compute approximate link fanouts for each table by querying up to 100 objects each.
private void computeLinkFanouts() {
// Table name -> link field name -> average number of links per owning object.
Map<String, Map<String, MutableFloat>> dbLinkFanoutMap =
new HashMap<String, Map<String, MutableFloat>>();
m_logger.info("Computing link fanouts");
for (TableDefinition tableDef : m_appDef.getTableDefinitions().values()) {
Map<String, MutableFloat> tableLinkFanoutMap = new HashMap<String, MutableFloat>();
dbLinkFanoutMap.put(tableDef.getTableName(), tableLinkFanoutMap);
computeLinkFanouts(tableDef, tableLinkFanoutMap);
}
m_dbSuppressedFieldMap = new HashMap<String, Set<String>>();
Set<FieldDefinition> decidedLinkSet = new HashSet<FieldDefinition>();
for (TableDefinition tableDef : m_appDef.getTableDefinitions().values()) {
Set<String> tableSuppFieldSet = m_dbSuppressedFieldMap.get(tableDef.getTableName());
if (tableSuppFieldSet == null) {
tableSuppFieldSet = new HashSet<String>();
m_dbSuppressedFieldMap.put(tableDef.getTableName(), tableSuppFieldSet);
}
Map<String, MutableFloat> tableLinkFanoutMap =
dbLinkFanoutMap.get(tableDef.getTableName());
for (FieldDefinition fieldDef : tableDef.getFieldDefinitions()) {
if (!fieldDef.isLinkField() || decidedLinkSet.contains(fieldDef)) {
continue;
}
float linkFanout = 0;
if (tableLinkFanoutMap != null) {
MutableFloat avgLinks = tableLinkFanoutMap.get(fieldDef.getName());
if (avgLinks != null) {
linkFanout = avgLinks.m_value;
}
}
TableDefinition invTableDef = tableDef.getLinkExtentTableDef(fieldDef);
FieldDefinition invFieldDef = invTableDef.getFieldDef(fieldDef.getLinkInverse());
float invLinkFanout = 0;
Map<String, MutableFloat> invTableLinkFanoutMap =
dbLinkFanoutMap.get(invTableDef.getTableName());
if (invTableLinkFanoutMap != null) {
MutableFloat avgLinks = invTableLinkFanoutMap.get(invFieldDef.getName());
if (avgLinks != null) {
invLinkFanout = avgLinks.m_value;
}
}
// If this link's fanout is higher than or equal to that of its inverse,
// we'll suppress this link, otherwise the inverse. When the fanouts are
// the same or very close, it's arbitrary which one we pick, really.
if (linkFanout >= invLinkFanout) {
// Suppress this link.
tableSuppFieldSet.add(fieldDef.getName());
m_logger.info("Will suppress {}.{} and dump {}.{}",
new Object[] {tableDef.getTableName(), fieldDef.getName(),
invTableDef.getTableName(), invFieldDef.getName()});
} else {
// Suppress the inverse.
Set<String> invTableSuppFieldSet = m_dbSuppressedFieldMap.get(invTableDef.getTableName());
if (invTableSuppFieldSet == null) {
invTableSuppFieldSet = new HashSet<String>();
m_dbSuppressedFieldMap.put(invTableDef.getTableName(), invTableSuppFieldSet);
}
invTableSuppFieldSet.add(invFieldDef.getName());
m_logger.info("Will suppress {}.{} and dump {}.{}",
new Object[]{invTableDef.getTableName(), invFieldDef.getName(),
tableDef.getTableName(), fieldDef.getName()});
}
decidedLinkSet.add(fieldDef);
decidedLinkSet.add(invFieldDef);
}
}
} // computeLinkFanouts
// Compute link fanouts for the given table
private void computeLinkFanouts(TableDefinition tableDef,
Map<String, MutableFloat> tableLinkFanoutMap) {
m_logger.info("Computing link field fanouts for table: {}", tableDef.getTableName());
StringBuilder buffer = new StringBuilder();
for (FieldDefinition fieldDef : tableDef.getFieldDefinitions()) {
if (fieldDef.isLinkField()) {
if (buffer.length() > 0) {
buffer.append(",");
}
buffer.append(fieldDef.getName());
}
}
if (buffer.length() == 0) {
return;
}
Map<String, String> queryParams = new HashMap<>();
queryParams.put("q", "*");
queryParams.put("f", buffer.toString());
queryParams.put("s", Integer.toString(LINK_FANOUT_SAMPLE_SIZE));
if (m_session instanceof OLAPSession) {
queryParams.put("shards", m_config.shard);
}
QueryResult qResult = m_session.objectQuery(tableDef.getTableName(), queryParams);
Collection<DBObject> objectSet = qResult.getResultObjects();
if (objectSet.size() == 0) {
return;
}
Map<String, AtomicInteger> linkValueCounts = new HashMap<String, AtomicInteger>();
int totalObjs = 0;
for (DBObject dbObj : objectSet) {
totalObjs++;
for (String fieldName : dbObj.getFieldNames()) {
if (tableDef.isLinkField(fieldName)) {
Collection<String> linkValues = dbObj.getFieldValues(fieldName);
AtomicInteger totalLinkValues = linkValueCounts.get(fieldName);
if (totalLinkValues == null) {
linkValueCounts.put(fieldName, new AtomicInteger(linkValues.size()));
} else {
totalLinkValues.addAndGet(linkValues.size());
}
}
}
}
for (String fieldName : linkValueCounts.keySet()) {
AtomicInteger totalLinkValues = linkValueCounts.get(fieldName);
float linkFanout = totalLinkValues.get() / (float)totalObjs; // may round to 0
m_logger.info("Average fanout for link {}: {}", fieldName, linkFanout);
tableLinkFanoutMap.put(fieldName, new MutableFloat(linkFanout));
}
} // computeLinkFanouts(tableDef)
/**
* Each DumpWorker grabs a table from CSVDumper and dumps the whole table.
*/
final class DumpWorker extends Thread {
// Number of objects we should request at one time:
private static final int BATCH_SIZE = 1000;
// Members:
private final int m_workerNo;
private final ApplicationSession m_session;
private final CSVConfig m_config;
private int m_totalObjects;
private int m_totalBytes;
// Logging interface:
private Logger m_logger = LoggerFactory.getLogger(getClass().getSimpleName());
// Create a DumpWorker object that belongs to the given application. A client object
// is created, which causes the database to be opened.
DumpWorker(int workerNo) {
m_workerNo = workerNo;
m_config = CSVConfig.instance();
m_logger.info("Worker {}: Opening session to application: {}",
new Object[]{m_workerNo, m_config.app});
Client client = new Client(m_config.host, m_config.port, m_config.getTLSParams());
m_session = client.openApplication(m_config.app); // throws if unknown
client.close();
} // constructor
// run contains the main get-and-dump loop.
@Override
public void run() {
String tableName = "";
try {
for (TableDefinition tableDef = nextTableDef(); tableDef != null; tableDef = nextTableDef()) {
tableName = tableDef.getTableName();
m_totalObjects = 0;
m_totalBytes = 0;
dumpTable(tableDef);
incrementObjectCount(m_totalObjects);
incrementObjectByteCount(m_totalBytes);
}
} catch (Exception ex) {
m_logger.error("Worker {}: Error dumping table '{}'", m_workerNo, tableName);
m_logger.error("Worker {}", m_workerNo, ex);
} finally {
m_session.close();
m_logger.info("Worker {}: Thread shutting down", m_workerNo);
}
} // run
// Dump the given table to a CSV file.
private void dumpTable(TableDefinition tableDef) throws IOException {
m_logger.info("Worker {}: Dumping table: {}", m_workerNo, tableDef.getTableName());
File csvFile = new File(m_config.root + tableDef.getTableName() + ".csv");
if (csvFile.exists()) {
if (!csvFile.delete()) {
throw new IOException("Could not delete existing CSV file: " + csvFile.getAbsolutePath());
}
}
BufferedWriter writer = null;
try {
writer = new BufferedWriter(new FileWriter(csvFile));
dumpTable(tableDef, writer);
writer.close();
m_totalBytes += csvFile.length();
} finally {
Utils.close(writer);
}
} // dumpTable
// Get all objects for the given table and write to the given writer in CSV format.
private void dumpTable(TableDefinition tableDef, BufferedWriter writer) throws IOException {
StringBuilder fieldParam = new StringBuilder("*");
StringBuilder csvHeader = new StringBuilder(m_config.id);
// Build a list of fields in the order they'll be dumped.
List<FieldDefinition> fieldDefList = new ArrayList<FieldDefinition>();
for (FieldDefinition fieldDef : tableDef.getFieldDefinitions()) {
if (!(fieldDef.isScalarField() || fieldDef.isLinkField()) ||
shouldSuppressField(fieldDef)) {
continue;
}
fieldDefList.add(fieldDef);
csvHeader.append(',');
csvHeader.append(fieldDef.getName());
if (fieldDef.isLinkField()) {
fieldParam.append(',');
fieldParam.append(fieldDef.getName());
}
}
writer.write(csvHeader.toString());
writer.newLine();
Map<String, String> queryParams = new HashMap<>();
queryParams.put("q", "*");
queryParams.put("f", fieldParam.toString());
queryParams.put("s", Integer.toString(BATCH_SIZE));
if (m_session instanceof OLAPSession) {
queryParams.put("shards", m_config.shard);
}
String contToken = null;
int objsDumped = 0;
while (true) {
if (m_session instanceof OLAPSession) {
if (objsDumped > 0) {
queryParams.put("k", Integer.toString(objsDumped));
}
} else if (contToken != null) {
queryParams.put("g", contToken);
}
QueryResult qResult = m_session.objectQuery(tableDef.getTableName(), queryParams);
Collection<DBObject> objectSet = qResult.getResultObjects();
for (DBObject dbObj : objectSet) {
dumpObject(fieldDefList, dbObj, writer);
objsDumped++;
}
contToken = qResult.getContinuationToken();
if (objectSet.size() == 0 || Utils.isEmpty(contToken)) {
break;
}
}
} // dumpTable
// Write a CSV record to the given writer containing all fields of the given object.
// The fields must appear in the order of the given field list, with empty commas for
// fields that have no values.
private void dumpObject(List fieldDefList, DBObject dbObj, BufferedWriter writer)
throws IOException {
StringBuilder buffer = new StringBuilder();
buffer.append("\"");
buffer.append(dbObj.getObjectID());
buffer.append("\"");
for (FieldDefinition fieldDef : fieldDefList) {
buffer.append(",");
if (fieldDef.isLinkField()) {
Collection<String> linkValues = dbObj.getFieldValues(fieldDef.getName());
if (linkValues != null && linkValues.size() > 0) {
buffer.append("\"");
buffer.append(Utils.concatenate(linkValues, "~"));
buffer.append("\"");
}
} else if (fieldDef.isCollection()) {
Collection<String> collValues = dbObj.getFieldValues(fieldDef.getName());
if (collValues != null && collValues.size() > 0) {
// Quote the value if it might contain commas.
if (fieldDef.getType() == FieldType.TEXT) {
buffer.append("\"");
}
buffer.append(Utils.concatenate(collValues, "~"));
if (fieldDef.getType() == FieldType.TEXT) {
buffer.append("\"");
}
}
} else {
assert fieldDef.isScalarField();
String fieldValue = dbObj.getFieldValue(fieldDef.getName());
if (fieldValue != null && fieldValue.length() > 0) {
if (fieldDef.getType() == FieldType.TEXT) {
buffer.append("\"");
}
buffer.append(fieldValue);
if (fieldDef.getType() == FieldType.TEXT) {
buffer.append("\"");
}
}
}
}
m_totalObjects++;
writer.write(buffer.toString());
writer.newLine();
} // dumpObject
} // class DumpWorker
} // class CSVDumper