Commit 97e64dce authored by Jonathan Mace
Browse files

Updates to QueryUtils and JarUtils with bugfixes and improvements to command line utils

parent b29fbfbd
......@@ -167,7 +167,7 @@
<extraJvmArguments>-Xmx16m</extraJvmArguments>
<programs>
<program>
<mainClass>edu.brown.cs.systems.tpcds.Queries</mainClass>
<mainClass>edu.brown.cs.systems.tpcds.QueryUtils</mainClass>
<id>list-queries</id>
</program>
<program>
......
......@@ -19,43 +19,84 @@ import com.google.common.collect.Lists;
public class JarUtils {
/**
 * Lists the names of the entries directly inside a directory that is visible
 * as a classpath resource on the local filesystem.
 *
 * @param dir resource name of the directory, looked up through the current
 *            thread's context class loader
 * @return a new, mutable list of entry names; empty when the resource does
 *         not exist, is not located on the filesystem, or is not a readable
 *         directory
 */
public static List<String> listFilesystemDir(String dir) {
    List<String> contents = new java.util.ArrayList<String>();
    URL benchmarkFolder = Thread.currentThread().getContextClassLoader().getResource(dir);
    // Only file: URLs can be converted to a File; new File(URI) rejects any other scheme
    if (benchmarkFolder == null || !"file".equals(benchmarkFolder.getProtocol())) {
        return contents;
    }
    try {
        File[] children = new File(benchmarkFolder.toURI()).listFiles();
        // listFiles() returns null when the path is not a directory or cannot be read
        if (children != null) {
            for (File resource : children) {
                contents.add(resource.getName());
            }
        }
    } catch (URISyntaxException ignored) {
        // Best effort: a resource URL that is not a valid URI is treated as an empty directory
    }
    return contents;
}
/**
 * Lists the contents of a directory that resides within a jarfile.
 *
 * Every jar entry is scanned; for each entry below {@code dir} the first
 * path element after {@code dir} is reported once, in the order in which it
 * is first encountered.
 *
 * @param jar the open jar file to scan; it is not closed by this method
 * @param dir the directory prefix inside the jar. Platform separators are
 *            converted to '/', and a trailing '/' is appended when a
 *            non-empty prefix lacks one
 * @return a new, mutable list of the distinct names found directly in {@code dir}
 */
public static List<String> listDirFromJarFile(JarFile jar, String dir) {
    // Jar entry names are always '/'-separated, whatever the platform separator is
    String prefix = dir.replace(File.separatorChar, '/');
    if (!prefix.isEmpty() && !prefix.endsWith("/")) {
        prefix = prefix + "/";
    }
    // Insertion-ordered set: keeps first-seen order and de-duplicates in constant time
    java.util.Set<String> directoryEntries = new java.util.LinkedHashSet<String>();
    Enumeration<JarEntry> entries = jar.entries(); // gives ALL entries in jar
    while (entries.hasMoreElements()) {
        String name = entries.nextElement().getName();
        // Ignore anything outside the dir, and the dir's own entry
        if (!name.startsWith(prefix) || name.length() == prefix.length()) {
            continue;
        }
        // Keep only the first path element below the dir
        String remainder = name.substring(prefix.length());
        int slash = remainder.indexOf('/');
        String nameInDir = slash < 0 ? remainder : remainder.substring(0, slash);
        if (!nameInDir.isEmpty()) {
            directoryEntries.add(nameInDir);
        }
    }
    return new java.util.ArrayList<String>(directoryEntries);
}
/**
 * Lists a directory that may exist within a jar file or on the classpath somewhere.
 * Only valid for resources within this project: the jar that is inspected is
 * the code source of {@code JarUtils} itself.
 *
 * @param dir resource name of the directory; platform separators are
 *            converted to '/' before the lookup
 * @return the names of the entries directly inside {@code dir}
 * @throws IOException if the project's jar file cannot be opened or closed
 */
public static List<String> listDir(String dir) throws IOException {
    // Resource names and jar entry names are '/'-separated on every platform
    String resourceDir = dir.replace(File.separatorChar, '/');

    // Figure out whether we're loading from a JAR or from file.
    // NOTE(review): URL.getPath() is not percent-decoded, so a code source whose
    // path contains escaped characters (e.g. spaces) will not be seen as a file.
    File jarFile = new File(JarUtils.class.getProtectionDomain().getCodeSource().getLocation().getPath());
    if (!jarFile.isFile()) {
        return listFilesystemDir(resourceDir);
    }

    JarFile jar = new JarFile(jarFile);
    try {
        return listDirFromJarFile(jar, resourceDir + "/");
    } finally {
        // Close the jar even when listing fails
        jar.close();
    }
}
/**
 * Reads a file that may exist within a jar file or on the classpath somewhere.
 * Only valid for resources within this project.
 *
 * @param fileName resource name of the file, resolved through the class
 *                 loader of {@code QueryUtils}
 * @return the file's text, or the empty string when the resource does not
 *         exist or has no content
 */
public static String readFile(String fileName) {
    InputStream rsrc = QueryUtils.class.getClassLoader().getResourceAsStream(fileName);
    if (rsrc == null) {
        return "";
    }
    Scanner s = new Scanner(rsrc);
    try {
        // \Z matches at the end of input (before a final line terminator, if any),
        // so the whole file comes back as a single token
        s.useDelimiter("\\Z");
        return s.hasNext() ? s.next() : "";
    } finally {
        // Closing the scanner also closes the underlying resource stream
        s.close();
    }
}
public static void copyDirectory(String resourceDirName, File destDir) throws IOException {
......
package edu.brown.cs.systems.tpcds;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists;
/**
 * Static helpers for discovering and loading the query files stored under the
 * {@code queries} resource folder, one sub-folder per benchmark.
 */
public class Queries {

    /** Name of the resource folder that holds one sub-folder per benchmark */
    private static final String QUERIES_ROOT = "queries";

    /**
     * Multiple different people have implemented TPC-DS queries. This function
     * returns the names of the variants available in this package. Most queries
     * don't actually work. View the documentation to see which queries do work
     * and which ones to use.
     *
     * @return the benchmark names, or an empty list if the listing fails
     */
    public static List<String> availableBenchmarks() {
        try {
            return JarUtils.listDir(QUERIES_ROOT);
        } catch (IOException e) {
            // Best effort: an unreadable code source is reported as "no benchmarks"
            return Lists.<String>newArrayList();
        }
    }

    /**
     * Lists the names of queries in a named benchmark
     *
     * @param benchmark name of the benchmark folder
     * @return the names of the files in the benchmark folder, without the README
     * @throws IOException if the benchmark folder cannot be listed
     */
    public static List<String> queriesInBenchmark(String benchmark) throws IOException {
        List<String> files = JarUtils.listDir(resourceName(benchmark));
        files.remove("README");
        return files;
    }

    /**
     * Returns the benchmark's README text
     *
     * @param benchmark name of the benchmark folder
     */
    public static String description(String benchmark) {
        return JarUtils.readFile(resourceName(benchmark, "README"));
    }

    /**
     * Builds a resource name below the queries folder. Resource names are
     * always '/'-separated, independent of the platform's file separator.
     */
    private static String resourceName(String... elements) {
        StringBuilder name = new StringBuilder(QUERIES_ROOT);
        for (String element : elements) {
            name.append('/').append(element);
        }
        return name.toString();
    }

    /** Strips comment lines, "exit;" lines, blank lines and trailing semicolons from query */
    private static String formatQuery(String query) {
        String[] lines = query.split("\n");
        List<String> newLines = Lists.newArrayListWithExpectedSize(lines.length);
        for (String line : lines) {
            line = line.trim();
            // Ignore comment lines and the client's exit command
            if (line.startsWith("--") || line.equals("exit;")) {
                continue;
            }
            // Drop every trailing semicolon
            while (line.endsWith(";")) {
                line = line.substring(0, line.length() - 1);
            }
            if (!line.isEmpty()) {
                newLines.add(line);
            }
        }
        return StringUtils.join(newLines, "\n");
    }

    /**
     * Finds a named query on the classpath, loads it, and strips out comments
     * and trailing semicolons
     *
     * @param benchmark name of the benchmark folder
     * @param queryName file name of the query inside the benchmark folder
     * @return the cleaned-up query text
     * @throws FileNotFoundException declared for callers; not thrown by the code in this method
     */
    public static String loadQuery(String benchmark, String queryName) throws FileNotFoundException {
        return formatQuery(JarUtils.readFile(resourceName(benchmark, queryName)));
    }

    /** Prints every benchmark's name, description and query names */
    public static void main(String[] args) throws IOException {
        for (String benchmark : availableBenchmarks()) {
            System.out.println(benchmark);
            System.out.println(description(benchmark));
            for (String query : queriesInBenchmark(benchmark)) {
                System.out.println(query);
            }
        }
    }
}
package edu.brown.cs.systems.tpcds;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import edu.brown.cs.systems.tpcds.QueryUtils.Benchmark.Query;
/**
 * Provides utility methods for loading benchmarks and constituent queries from
 * files, which are contained within the jars or files on the classpath
 */
public class QueryUtils {

    static final Logger log = LoggerFactory.getLogger(QueryUtils.class);

    /** Name of the resource folder that holds one sub-folder per benchmark */
    private static final String QUERIES_ROOT = "queries";

    /** Simple representation of a set of queries */
    public static class Benchmark {

        public String benchmarkName;
        public String benchmarkDescription;
        /** Queries of this benchmark, keyed by query name */
        public Map<String, Query> benchmarkQueries = Maps.newHashMap();

        public Benchmark(String name, String description) {
            this.benchmarkName = name;
            this.benchmarkDescription = description;
        }

        /** Registers a query under its name, replacing any query with the same name */
        public void addQuery(String queryName, String queryText) {
            benchmarkQueries.put(queryName, new Query(queryName, queryText));
        }

        @Override
        public String toString() {
            return String.format("Benchmark %s with %d queries", benchmarkName, benchmarkQueries.size());
        }

        /** Returns the summary line, the description, and the full text of every query */
        public String toLongString() {
            StringBuilder b = new StringBuilder();
            b.append(this.toString());
            b.append("\n");
            b.append(benchmarkDescription);
            for (Query q : benchmarkQueries.values()) {
                b.append("\n");
                b.append(q.toLongString());
            }
            return b.toString();
        }

        /** A single named query; an inner class so it can report its benchmark's name */
        public class Query {

            public String queryName;
            public String queryText;

            public Query(String queryName, String queryText) {
                this.queryName = queryName;
                this.queryText = queryText;
            }

            /** Returns "benchmark/query:" followed by the query text */
            public String toLongString() {
                return String.format("%s/%s:\n\n%s", benchmarkName, queryName, queryText);
            }

            @Override
            public String toString() {
                return String.format("%s/%s", benchmarkName, queryName);
            }
        }
    }

    /**
     * Finds all available benchmarks and their constituent queries. Looks
     * inside the 'queries' folder for subfolders. Each folder contains text
     * files with queries inside. Looks at the README in each folder for a
     * description of the benchmark.
     *
     * @return the benchmarks that contain at least one query, keyed by benchmark name
     */
    public static Map<String, Benchmark> load() {
        Map<String, Benchmark> allBenchmarks = Maps.newHashMap();
        for (String benchmarkName : availableBenchmarks()) {
            Benchmark benchmark = new Benchmark(benchmarkName, description(benchmarkName));

            // Skip queries that we fail to load
            try {
                for (String queryName : queriesInBenchmark(benchmarkName)) {
                    try {
                        benchmark.addQuery(queryName, loadQuery(benchmarkName, queryName));
                    } catch (FileNotFoundException e) {
                        log.warn("Unable to load query " + queryName + " in benchmark " + benchmarkName, e);
                    }
                }
            } catch (IOException e) {
                log.warn("Unable to load queries for benchmark " + benchmarkName, e);
            }

            // Only include benchmarks with at least one query
            if (!benchmark.benchmarkQueries.isEmpty()) {
                allBenchmarks.put(benchmarkName, benchmark);
            }
        }
        return allBenchmarks;
    }

    /**
     * Multiple different people have implemented TPC-DS queries. This function
     * returns the names of the variants available in this package. Most queries
     * don't actually work. View the documentation to see which queries do work
     * and which ones to use.
     */
    private static List<String> availableBenchmarks() {
        try {
            return JarUtils.listDir(QUERIES_ROOT);
        } catch (IOException e) {
            // Best effort: an unreadable code source is reported as "no benchmarks"
            return Lists.<String> newArrayList();
        }
    }

    /** Lists the names of queries in a named benchmark */
    private static List<String> queriesInBenchmark(String benchmark) throws IOException {
        List<String> files = JarUtils.listDir(resourceName(benchmark));
        files.remove("README");
        return files;
    }

    /** Returns the benchmark's README text, or the empty string if no text */
    private static String description(String benchmark) {
        return JarUtils.readFile(resourceName(benchmark, "README"));
    }

    /**
     * Builds a resource name below the queries folder. Resource names are
     * always '/'-separated, independent of the platform's file separator.
     */
    private static String resourceName(String... elements) {
        StringBuilder name = new StringBuilder(QUERIES_ROOT);
        for (String element : elements) {
            name.append('/').append(element);
        }
        return name.toString();
    }

    /** Strips comment lines, "exit;" lines, blank lines and trailing semicolons from query */
    private static String formatQuery(String query) {
        String[] lines = query.split("\n");
        List<String> newLines = Lists.newArrayListWithExpectedSize(lines.length);
        for (String line : lines) {
            line = line.trim();
            // Ignore comment lines and the client's exit command
            if (line.startsWith("--") || line.equals("exit;")) {
                continue;
            }
            // Drop every trailing semicolon
            while (line.endsWith(";")) {
                line = line.substring(0, line.length() - 1);
            }
            if (!line.isEmpty()) {
                newLines.add(line);
            }
        }
        return StringUtils.join(newLines, "\n");
    }

    /** Loads, from file, a specific query from a benchmark */
    private static String loadQuery(String benchmark, String queryName) throws FileNotFoundException {
        return formatQuery(JarUtils.readFile(resourceName(benchmark, queryName)));
    }

    /**
     * Simple util for printing queries and benchmarks.
     *
     * With no arguments, prints a summary line per benchmark. With
     * {@code benchmark}, prints that benchmark's description and query list.
     * With {@code benchmark<separator>query}, prints the query text. The
     * separator is the platform's file separator.
     */
    public static void main(String[] args) throws IOException {
        // Get all benchmark data
        Map<String, Benchmark> benchmarks = load();

        // No args? Print all available benchmarks
        if (args.length == 0) {
            System.out.println(StringUtils.join(benchmarks.values(), "\n"));
            return;
        }

        // Split arg on file separator; either benchmark or query name.
        // split() takes a regex, and the Windows separator "\" is not a valid one, so quote it.
        String[] splits = args[0].split(java.util.regex.Pattern.quote(File.separator));

        // An argument made only of separators splits into nothing at all
        String requestedBenchmark = splits.length > 0 ? splits[0] : "";

        // Get the benchmark, check existence
        Benchmark b = benchmarks.get(requestedBenchmark);
        if (b == null) {
            System.out.println("Unknown benchmark " + requestedBenchmark);
            return;
        }

        // No query specified? Print benchmark's query list
        if (splits.length == 1) {
            System.out.println(b);
            System.out.println(b.benchmarkDescription);
            System.out.println(StringUtils.join(b.benchmarkQueries.values(), "\n"));
            return;
        }

        // Second element is query name
        Query q = b.benchmarkQueries.get(splits[1]);
        if (q == null) {
            System.out.printf("Unknown query %s\n", args[0]);
            return;
        }

        // Print query
        System.out.println(q.toLongString());
    }
}
......@@ -2,7 +2,7 @@ package edu.brown.cs.systems.tpcds.spark;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import com.databricks.spark.sql.perf.tpcds.Tables;
......@@ -26,7 +26,7 @@ public class SparkTPCDSDataGenerator {
boolean filterOutNullPartitionValues) {
SparkConf conf = new SparkConf().setAppName("TPC-DS generateData");
SparkContext sc = new SparkContext(conf);
SQLContext sql = new SQLContext(sc);
HiveContext sql = new HiveContext(sc);
Tables tables = new Tables(sql, scaleFactor);
tables.genData(location, format, overwrite, partitionTables, useDoubleForDecimal, clusterByPartitionColumns,
filterOutNullPartitionValues, "");
......
......@@ -15,7 +15,9 @@ import org.slf4j.LoggerFactory;
import com.google.common.collect.Maps;
import edu.brown.cs.systems.tpcds.Queries;
import edu.brown.cs.systems.tpcds.QueryUtils;
import edu.brown.cs.systems.tpcds.QueryUtils.Benchmark;
import edu.brown.cs.systems.tpcds.QueryUtils.Benchmark.Query;
import com.databricks.spark.sql.perf.tpcds.Tables;
......@@ -53,12 +55,14 @@ public class SparkTPCDSWorkloadGenerator {
System.out.println("Starting SparkTPCDSWorkloadGenerator");
SQLContext sql = spinUpWithDefaults();
String benchmark = "impala-tpcds-modified-queries";
String query = "q19.sql";
String q = Queries.loadQuery(benchmark, query);
System.out.printf("Running query %s/%s\n", benchmark, query);
System.out.println(q);
Row[] rows = sql.sql(q).collect();
Map<String, Benchmark> allBenchmarks = QueryUtils.load();
String benchmarkName = "impala-tpcds-modified-queries";
String queryName = "q19.sql";
Benchmark benchmark = allBenchmarks.get(benchmarkName);
Query query = benchmark.benchmarkQueries.get(queryName);
System.out.printf("Running query %s/%s\n", query);
System.out.println(query.queryText);
Row[] rows = sql.sql(query.queryText).collect();
for (Row r : rows) {
System.out.println(r);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment