Commit 635fbcbc authored by Jonathan Mace

Update use of settings

parent 02c31e3a
......@@ -6,36 +6,43 @@ import org.apache.spark.sql.hive.HiveContext;
import com.databricks.spark.sql.perf.tpcds.Tables;
import edu.brown.cs.systems.tpcds.Dsdgen;
public class SparkTPCDSDataGenerator {
/** Generate data using the default dataset settings */
public static void generateData() {
generateData(TPCDSSettings.dataLocation(), TPCDSSettings.dataFormat(),
TPCDSSettings.scaleFactor());
generateData(TPCDSSettings.createWithDefaults());
}
/**
* Generate data using the default dataset settings, overriding the
* data location, data format, and scale factor with the specified values
*/
public static void generateData(String location, String format, int scaleFactor) {
generateData(location, format, scaleFactor, TPCDSSettings.overwrite(),
TPCDSSettings.partitionTables(), TPCDSSettings.useDoubleForDecimal(),
TPCDSSettings.clusterByPartitionColumns(), TPCDSSettings.filterOutNullPartitionValues());
TPCDSSettings settings = TPCDSSettings.createWithDefaults();
settings.dataLocation = location;
settings.dataFormat = format;
settings.scaleFactor = scaleFactor;
generateData(settings);
}
public static void generateData(String location, String format, int scaleFactor,
boolean overwrite, boolean partitionTables, boolean useDoubleForDecimal, boolean clusterByPartitionColumns,
boolean filterOutNullPartitionValues) {
/** Generate data using the specified dataset settings */
public static void generateData(TPCDSSettings settings) {
SparkConf conf = new SparkConf().setAppName("TPC-DS generateData");
SparkContext sc = new SparkContext(conf);
HiveContext sql = new HiveContext(sc);
Tables tables = new Tables(sql, scaleFactor);
tables.genData(location, format, overwrite, partitionTables, useDoubleForDecimal, clusterByPartitionColumns,
filterOutNullPartitionValues, "");
HiveContext sqlContext = new HiveContext(sc);
Tables tables = new Tables(sqlContext, settings.scaleFactor);
tables.genData(settings.dataLocation, settings.dataFormat, settings.overwrite, settings.partitionTables,
settings.useDoubleForDecimal, settings.clusterByPartitionColumns,
settings.filterOutNullPartitionValues, "");
sc.stop();
}
public static void main(String[] args) {
generateData();
TPCDSSettings settings = TPCDSSettings.createWithDefaults();
System.out.println("Creating TPC-DS data using spark, with default settings:");
System.out.println(settings);
generateData(settings);
}
}
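For context, a minimal caller sketch of the new settings-based API, assuming the imports shown in the diff above (the override values below are illustrative, not repo defaults):

TPCDSSettings settings = TPCDSSettings.createWithDefaults();
settings.scaleFactor = 10; // illustrative override
settings.dataLocation = "hdfs://127.0.0.1:9000/tpcds/sf10"; // illustrative path
SparkTPCDSDataGenerator.generateData(settings);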
......@@ -30,24 +30,24 @@ public class SparkTPCDSWorkloadGenerator {
* @return SQL context with tables loaded
*/
public static SQLContext spinUpWithDefaults() {
SparkConf c = new SparkConf().setAppName("SparkTPCDSWorkloadGenerator");
return spinUp("SparkTPCDSWorkloadGenerator", TPCDSSettings.createWithDefaults());
}
public static SQLContext spinUp(String name, TPCDSSettings settings) {
SparkConf c = new SparkConf().setAppName(name);
SparkContext sc = new SparkContext(c);
SQLContext sql = new HiveContext(sc);
loadExistingTablesIntoMemory(sql, TPCDSSettings.dataLocation(), TPCDSSettings.dataFormat());
return sql;
SQLContext sqlContext = new HiveContext(sc);
loadExistingTablesIntoMemory(sqlContext, settings);
return sqlContext;
}
/** Loads tpcds tables into memory on Spark from a source location, eg from HDFS.
*
* @param sql A SQLContext to load the tables
* @param dataLocation The location of the TPC-DS data, eg "/Users/jon/tpcds/data", "hdfs://127.0.0.1:9000/tpcds/data", etc.
* @param dataFormat The format of the generated data, eg "text", "parquet", etc.
*/
public static void loadExistingTablesIntoMemory(SQLContext sql, String dataLocation, String dataFormat) {
/** Loads TPC-DS tables into memory in Spark from a source location, e.g. from HDFS.
* Only uses the dataLocation and dataFormat settings. */
public static void loadExistingTablesIntoMemory(SQLContext sqlContext, TPCDSSettings settings) {
/* Tables constructor takes dsdgenDir and scaleFactor, but they are not used when loading existing data.
* So we just use default values for these instead of adding them as confusing and unused parameters */
Tables tables = new Tables(sql, TPCDSSettings.scaleFactor());
tables.createTemporaryTables(dataLocation, dataFormat, "");
Tables tables = new Tables(sqlContext, settings.scaleFactor);
tables.createTemporaryTables(settings.dataLocation, settings.dataFormat, "");
}
......
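As a usage sketch, spinning up a context over previously generated data and running an ad-hoc query (the app name and query are illustrative; the table name comes from the standard TPC-DS schema):

SQLContext sqlContext = SparkTPCDSWorkloadGenerator.spinUp("MyTPCDSWorkload", TPCDSSettings.createWithDefaults());
sqlContext.sql("SELECT count(*) FROM store_sales").show(); // illustrative query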
......@@ -3,72 +3,57 @@ package edu.brown.cs.systems.tpcds.spark;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
/** Settings for a TPC-DS dataset */
public class TPCDSSettings {
public static class SettingsInstance {
public final int scaleFactor;
public final String dataLocation;
public final String dataFormat;
public final boolean overwrite;
public final boolean useDoubleForDecimal;
public final boolean partitionTables;
public final boolean clusterByPartitionColumns;
public final boolean filterOutNullPartitionValues;
public SettingsInstance(Config config) {
scaleFactor = config.getInt("scaleFactor");
dataLocation = config.getString("dataLocation");
dataFormat = config.getString("dataFormat");
overwrite = config.getBoolean("overwrite");
useDoubleForDecimal = config.getBoolean("useDoubleForDecimal");
partitionTables = config.getBoolean("partitionTables");
clusterByPartitionColumns = config.getBoolean("clusterByPartitionColumns");
filterOutNullPartitionValues = config.getBoolean("filterOutNullPartitionValues");
}
}
private static SettingsInstance defaults = null;
public static SettingsInstance defaults() {
if (defaults == null) {
synchronized(TPCDSSettings.class) {
if (defaults == null) {
defaults = new SettingsInstance(ConfigFactory.load().getConfig("tpcds"));
}
}
}
return defaults;
}
public static int scaleFactor() {
return defaults().scaleFactor;
}
public static String dataLocation() {
return defaults().dataLocation;
}
public static String dataFormat() {
return defaults().dataFormat;
}
public static boolean overwrite() {
return defaults().overwrite;
}
public static boolean useDoubleForDecimal() {
return defaults().useDoubleForDecimal;
}
public static boolean partitionTables() {
return defaults().partitionTables;
}
public static boolean clusterByPartitionColumns() {
return defaults().clusterByPartitionColumns;
public int scaleFactor;
public String dataLocation;
public String dataFormat;
public boolean overwrite;
public boolean useDoubleForDecimal;
public boolean partitionTables;
public boolean clusterByPartitionColumns;
public boolean filterOutNullPartitionValues;
private TPCDSSettings(Config config) {
scaleFactor = config.getInt("scaleFactor");
dataLocation = config.getString("dataLocation");
dataFormat = config.getString("dataFormat");
overwrite = config.getBoolean("overwrite");
useDoubleForDecimal = config.getBoolean("useDoubleForDecimal");
partitionTables = config.getBoolean("partitionTables");
clusterByPartitionColumns = config.getBoolean("clusterByPartitionColumns");
filterOutNullPartitionValues = config.getBoolean("filterOutNullPartitionValues");
}
public static boolean filterOutNullPartitionValues() {
return defaults().filterOutNullPartitionValues;
/**
* Create TPC-DS settings, taking values from the default typesafe config.
* This call is equivalent to
* {@code createFromConfig(ConfigFactory.load().getConfig("tpcds"))}
*/
public static TPCDSSettings createWithDefaults() {
return createFromConfig(ConfigFactory.load().getConfig("tpcds"));
}
/**
* Create TPC-DS settings, taking values from the provided typesafe config
* object. The default values are contained under the "tpcds" root of the
* reference config.
*/
public static TPCDSSettings createFromConfig(Config config) {
return new TPCDSSettings(config);
}
@Override
public String toString() {
StringBuilder b = new StringBuilder();
// newline separators so the printed settings are readable
b.append("scaleFactor: " + scaleFactor + "\n");
b.append("dataLocation: " + dataLocation + "\n");
b.append("dataFormat: " + dataFormat + "\n");
b.append("overwrite: " + overwrite + "\n");
b.append("useDoubleForDecimal: " + useDoubleForDecimal + "\n");
b.append("partitionTables: " + partitionTables + "\n");
b.append("clusterByPartitionColumns: " + clusterByPartitionColumns + "\n");
b.append("filterOutNullPartitionValues: " + filterOutNullPartitionValues);
return b.toString();
}
}
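Since createFromConfig accepts any typesafe Config, callers can layer overrides on top of the shipped defaults using standard typesafe-config calls; a sketch with illustrative override values:

Config custom = ConfigFactory.parseString("scaleFactor = 100\noverwrite = true")
.withFallback(ConfigFactory.load().getConfig("tpcds"));
TPCDSSettings settings = TPCDSSettings.createFromConfig(custom);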
......@@ -2,7 +2,7 @@ tpcds {
scaleFactor = 1
dataLocation = "hdfs://127.0.0.1:9000/tpcds"
dataFormat = "parquet"
overwrite = true
overwrite = false
partitionTables = false
useDoubleForDecimal = false
clusterByPartitionColumns = false
......
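Because the settings are read via ConfigFactory.load(), these reference values can also be overridden with JVM system properties (e.g. -Dtpcds.overwrite=true), which typesafe-config merges over the reference config; a sketch of the same override done programmatically:

// illustrative: set overrides before the config is loaded
System.setProperty("tpcds.overwrite", "true");
System.setProperty("tpcds.scaleFactor", "10");
ConfigFactory.invalidateCaches(); // ConfigFactory.load() caches results
TPCDSSettings settings = TPCDSSettings.createWithDefaults();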