Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: cld/ml/clockwork
Commits on Source (16)
Showing with 372 additions and 33 deletions
@@ -6,7 +6,8 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -O2")
set(CXX "g++-8")
set(CC "gcc-8")
set(THREADS_PREFER_PTHREAD_FLAG ON)
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_BUILD_TYPE Release)
#set(CMAKE_BUILD_TYPE Debug)
find_package(Threads REQUIRED)
find_package(Boost COMPONENTS system filesystem REQUIRED)
@@ -71,6 +72,8 @@ add_library( clockwork
src/clockwork/controller/direct_controller.cpp
src/clockwork/network/client.cpp
$ENV{TVM_HOME}/src/runtime/meta_data.h
### No point maintaining the below any more
# src/clockwork/alternatives/runtime_model.cpp
# src/clockwork/alternatives/model_manager.cpp
@@ -90,8 +93,32 @@ target_link_libraries( clockwork PUBLIC
${Boost_FILESYSTEM_LIBRARY}
)
# Converts from TVM models to clockwork models
include_directories(clockwork-convert)
add_executable (convert
src/clockwork-convert/tvm/decoupled_graph_runtime.cc
src/clockwork-convert/tvm_model.cpp
src/clockwork-convert/tvm_abstract_model.cpp
src/clockwork-convert/convert.cpp
)
target_link_libraries( convert
clockwork
clockwork_proto
Threads::Threads
dl
cuda
cudart
tvm_runtime
tbb
nvidia-ml
${Boost_SYSTEM_LIBRARY}
${Boost_FILESYSTEM_LIBRARY}
)
# cudafatbin is temporary / hacky
add_executable (cudafatbin src/cudafatbin.cc )
add_executable (cudafatbin src/clockwork-convert/cudafatbin.cc )
target_link_libraries( cudafatbin
clockwork
clockwork_proto
@@ -106,17 +133,11 @@ target_link_libraries( cudafatbin
${Boost_FILESYSTEM_LIBRARY}
)
# Converts from TVM models to clockwork models
include_directories(clockwork-convert)
add_executable (convert
src/clockwork/tvm/decoupled_graph_runtime.cc
src/clockwork-convert/tvm_model.cpp
src/clockwork-convert/tvm_abstract_model.cpp
src/clockwork-convert/convert.cpp
add_executable (check_model
src/check_model.cpp
)
target_link_libraries( convert
target_link_libraries( check_model
clockwork
clockwork_proto
Threads::Threads
......
@@ -109,17 +109,17 @@ List available GPU clock frequencies
```
nvidia-smi -q -d SUPPORTED_CLOCKS
```
Pick a memory and graphics clock frequency (usually the highest), e.g.
Pick a memory and graphics clock frequency (usually the highest), e.g. on Volta machines:
```
Supported Clocks
Memory : 3004 MHz
Graphics : 1114 MHz
Memory : 877 MHz
Graphics : 1380 MHz
```
Set the default application clock and system clock to those highest values
Set the default application clock and system clock to those highest values, e.g. on Volta machines:
```
nvidia-smi -ac 3004,1114
nvidia-smi -lgc 1114
nvidia-smi -ac 877,1380
nvidia-smi -lgc 1380
```
FYI:
@@ -132,6 +132,8 @@ Some of these values can be checked by running the Clockwork profiler with:
```
./profile [check]
```
IMPORTANT: when you restart your machine, you will need to set persistence mode again. Run the checker frequently!
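For reference, checks like these can also be done programmatically through NVML, the library behind `nvidia-smi`. The sketch below is illustrative only, not Clockwork's actual `profile` implementation:

```
#include <nvml.h>
#include <cstdio>

// Illustrative only: query the settings the checker verifies.
int main() {
    nvmlInit();
    nvmlDevice_t device;
    nvmlDeviceGetHandleByIndex(0, &device);

    nvmlEnableState_t persistence;
    nvmlDeviceGetPersistenceMode(device, &persistence);
    printf("persistence mode: %s\n",
           persistence == NVML_FEATURE_ENABLED ? "on" : "off");

    unsigned int mem_mhz, graphics_mhz;
    nvmlDeviceGetApplicationsClock(device, NVML_CLOCK_MEM, &mem_mhz);
    nvmlDeviceGetApplicationsClock(device, NVML_CLOCK_GRAPHICS, &graphics_mhz);
    printf("application clocks: mem %u MHz, graphics %u MHz\n", mem_mhz, graphics_mhz);

    nvmlShutdown();
    return 0;
}
```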
# Troubleshooting
Currently, the CMakeLists assumes CUDA lives in either `/usr/local/cuda/lib64` (the default location on Ubuntu 14.x) or `/usr/lib/x86_64-linux-gnu/nvidia/current` (the default location for MPI cluster machines). If you get build errors saying CUDA or nvidia-ml cannot be found, you'll need to update the `include_directories` and `link_directories` directives in CMakeLists.txt with the CUDA location on your machine.
......
import argparse
import os
import subprocess

"""
Clockwork must be built before running this script.
This script invokes the `convert` binary and expects it to exist in the `build` folder.
"""

parser = argparse.ArgumentParser(description='Convert a TVM model into a Clockwork model')
parser.add_argument("input_dir", metavar="INDIR", type=str, help="Base directory where TVM models exist. The utility expects multiple models, one per batch size, each in a subdirectory.")
parser.add_argument("output_dir", metavar="OUTDIR", type=str, help="Output directory. Directory will be created if it does not exist.")
parser.add_argument('-p', "--page_size", type=int, default=16777216, help="Page size to use for compiled models. 16MB by default.")
parser.add_argument("--subdir_prefix", type=str, default="b", help="Within input_dir, a prefix for how subdirectories are named. Default \"b\" followed by the batch size.")

def find_tvm_models(path):
    dir_contents = os.listdir(path)
    dir_contents_paths = [os.path.join(path, c) for c in dir_contents]
    dir_files = [c for c in dir_contents_paths if os.path.isfile(c)]
    so_files = [f[:-3] for f in dir_files if f.endswith(".so")]  # strip ".so" to get the model file prefix
    model_choices = [f for f in so_files if is_model(f)]
    return model_choices

def is_model(path_prefix):
    # A valid model is a matching triple of .so, .params, and .json files
    suffixes = ["so", "params", "json"]
    for suffix in suffixes:
        if not os.path.exists("%s.%s" % (path_prefix, suffix)):
            return False
    return True

def find_models(path, subdir_prefix):
    found_models = []
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        if not os.path.isdir(entry_path):
            print("Ignoring non-directory %s" % entry_path)
            continue
        if not entry.startswith(subdir_prefix):
            print("Skipping non-matching (prefix=\"%s\") directory %s" % (subdir_prefix, entry_path))
            continue
        candidates = find_tvm_models(entry_path)
        if len(candidates) == 0:
            print("Skipping directory with no valid models (expect .so .json and .params with matching names) %s" % entry_path)
            continue
        if len(candidates) > 1:
            print("Skipping directory with multiple valid models %s" % entry_path)
            continue
        batch_size = int(entry[len(subdir_prefix):])
        found_models.append((batch_size, candidates[0]))
    return sorted(found_models)

def convert(args):
    models = find_models(args.input_dir, args.subdir_prefix)
    print("Found %d models in input directory %s:" % (len(models), args.input_dir))
    for batch_size, model in models:
        print(" %d %s" % (batch_size, model))

    # Report whether the output directory needs to be created
    if not os.path.exists(args.output_dir):
        print("Output directory %s will be created" % args.output_dir)
    else:
        print("Will output to existing directory %s" % args.output_dir)

    print("The following command will run:")
    pargs = [str(v) for v in [
        "../build/convert",
        "-o", args.output_dir,
        "-p", args.page_size
    ] + [x for m in models for x in m]]
    print(" ".join(pargs))

    print("Press <return> to continue or CTRL-C to abort")
    input()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
        print("Created output directory %s" % args.output_dir)

    popen = subprocess.Popen(pargs)
    popen.wait()

if __name__ == '__main__':
    args = parser.parse_args()
    exit(convert(args))
\ No newline at end of file
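A hypothetical invocation, assuming the script above is saved as `convert_models.py` and the TVM models live in `~/models/resnet50/b1`, `~/models/resnet50/b2`, and `~/models/resnet50/b4`: `python3 convert_models.py ~/models/resnet50 ~/models/resnet50/clockwork`.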
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file common.h
* \brief Common utilities for CUDA
*/
#ifndef _CLOCKWORK_CUDA_COMMON_H_
#define _CLOCKWORK_CUDA_COMMON_H_
#include <cuda.h>
#include <cuda_runtime.h>
#include <dmlc/logging.h>
#include <string>
namespace clockwork {
#define CUDA_DRIVER_CALL(x) \
{ \
CUresult result = x; \
if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
const char *msg; \
cuGetErrorName(result, &msg); \
LOG(FATAL) \
<< "CUDAError: " #x " failed with error: " << msg; \
} \
}
#define CUDA_CALL(func) \
{ \
cudaError_t e = (func); \
CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
<< "CUDA: " << cudaGetErrorString(e); \
}
}
#endif // _CLOCKWORK_CUDA_COMMON_H_
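For reference, a minimal usage sketch of the two macros above (a hypothetical caller, not part of this diff):

```
#include <cuda.h>
#include <cuda_runtime.h>
#include "clockwork/cuda_common.h"

void example(size_t size) {
    // CUDA_CALL wraps runtime-API calls; it tolerates cudaErrorCudartUnloading
    // so that teardown during process exit does not abort.
    void* device_ptr = nullptr;
    CUDA_CALL(cudaMalloc(&device_ptr, size));
    CUDA_CALL(cudaFree(device_ptr));

    // CUDA_DRIVER_CALL wraps driver-API calls and similarly tolerates
    // CUDA_ERROR_DEINITIALIZED.
    CUDA_DRIVER_CALL(cuInit(0));
    CUdevice device;
    CUDA_DRIVER_CALL(cuDeviceGet(&device, 0));
}
```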
@@ -76,13 +76,19 @@ struct PageMappedDLTensorDef {
uint64_t page_offset;
uint64_t size;
std::vector<int64_t> shape;
int code = 2U; // kDLFloat
int bits = 32;
int lanes = 1;
PODS_SERIALIZABLE(1,
PODS_MDR(base_offset),
PODS_MDR(page),
PODS_MDR(page_offset),
PODS_MDR(size),
PODS_MDR(shape)
PODS_MDR(shape),
PODS_OPT(code),
PODS_OPT(bits),
PODS_OPT(lanes)
)
};
......
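For context, the new `code`, `bits`, and `lanes` fields mirror DLPack's `DLDataType`; the defaults (`code = 2`, i.e. `kDLFloat`, with `bits = 32` and `lanes = 1`) describe an ordinary float32 tensor. A minimal sketch of the correspondence, where the helper function is hypothetical but the `DLDataType` layout follows DLPack:

```
#include <dlpack/dlpack.h>

// Hypothetical helper: build a DLPack data type from the deserialized fields.
DLDataType to_dldatatype(int code, int bits, int lanes) {
    DLDataType t;
    t.code = static_cast<uint8_t>(code);    // kDLInt=0, kDLUInt=1, kDLFloat=2
    t.bits = static_cast<uint8_t>(bits);    // bits per lane, e.g. 32
    t.lanes = static_cast<uint16_t>(lanes); // vector lanes; 1 for scalar types
    return t;
}
```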
@@ -49,7 +49,7 @@ void check_environment() {
for (unsigned i = 0; i < num_gpus; i++) {
if (!util::is_gpu_exclusive(i)) { // TODO: check all GPUs
std::cout << " ✘ GPU " << i << " is not in exclusive mode; set with `nvidia-smi -i " << i << " -c 3` or set for all GPUs with `nvndia-smi -c 3`" << std::endl;
std::cout << " ✘ GPU " << i << " is not in exclusive mode; set with `nvidia-smi -i " << i << " -c 3` or set for all GPUs with `nvidia-smi -c 3`" << std::endl;
} else {
std::cout << " ✔ GPU " << i << " is in exclusive mode" << std::endl;
}
......
(6 binary files changed; no preview available)
#include "clockwork/worker.h"
#include "clockwork/network/worker.h"
#include "clockwork/runtime.h"
#include "clockwork/model/batched.h"
#include "clockwork/memory.h"
#include "clockwork/cache.h"
#include "clockwork/cuda_common.h"
using namespace clockwork;
void show_usage() {
std::cout << "USAGE" << std::endl;
std::cout << " ./check_model [MODEL]" << std::endl;
std::cout << "DESCRIPTION" << std::endl;
std::cout << " Will load and run an inference on a specified Clockwork model" << std::endl;
std::cout << "OPTIONS" << std::endl;
std::cout << " -h, --help" << std::endl;
std::cout << " Print this message" << std::endl;
std::cout << " -p, --page_size" << std::endl;
std::cout << " Weights page size used by Clockwork. Defaults to 16MB. You shouldn't" << std::endl;
std::cout << " need to set this because we are using 16MB pages." << std::endl;
}
model::BatchedModel* load_model(std::string model) {
return model::BatchedModel::loadFromDisk(model, 0);
}
void check_model(int page_size, std::string model_path) {
std::cout << "Checking " << model_path << std::endl;
util::setCudaFlags();
util::initializeCudaStream();
clockwork::model::BatchedModel* model = load_model(model_path);
auto batch_sizes = model->implemented_batch_sizes();
for (unsigned batch_size : batch_sizes) {
std::cout << " loaded batch size " << batch_size << std::endl;
}
model->instantiate_models_on_host();
size_t weights_page_size = page_size;
size_t weights_cache_size = model->num_weights_pages(weights_page_size) * weights_page_size;
PageCache* weights_cache = make_GPU_cache(weights_cache_size, weights_page_size, GPU_ID_0);
cudaError_t status;
model->instantiate_models_on_device();
std::shared_ptr<Allocation> weights = weights_cache->alloc(model->num_weights_pages(weights_page_size), []{});
model->transfer_weights_to_device(weights->page_pointers, util::Stream());
for (unsigned batch_size : batch_sizes) {
// Create inputs and outputs
char* input = new char[model->input_size(batch_size)];
char* output = new char[model->output_size(batch_size)];
// Create and allocate io_memory on GPU
size_t io_memory_size = model->io_memory_size(batch_size);
MemoryPool* io_pool = CUDAMemoryPool::create(io_memory_size, GPU_ID_0);
char* io_memory = io_pool->alloc(io_memory_size);
// Create and allocate intermediate GPU memory workspace
size_t workspace_size = model->workspace_memory_size(batch_size);
MemoryPool* workspace_pool = CUDAMemoryPool::create(workspace_size, GPU_ID_0);
char* workspace_memory = workspace_pool->alloc(workspace_size);
// Now execute each step of model
model->transfer_input_to_device(batch_size, input, io_memory, util::Stream());
// Time the call
int warmups = 20;
for (int i = 0; i < warmups; i++) {
model->call(batch_size, weights->page_pointers, io_memory, workspace_memory, util::Stream());
}
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
auto before = util::now();
int iterations = 100;
for (int i = 0; i < iterations; i++) {
model->call(batch_size, weights->page_pointers, io_memory, workspace_memory, util::Stream());
}
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
auto after = util::now();
printf(" b%d: %.2f ms per call\n", batch_size, ((float) (after-before)) / (iterations * 1000000.0));
model->transfer_output_from_device(batch_size, output, io_memory, util::Stream());
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
delete input;
delete output;
io_pool->free(io_memory);
delete io_pool;
workspace_pool->free(workspace_memory);
delete workspace_pool;
}
weights_cache->unlock(weights);
weights_cache->free(weights);
delete weights_cache;
model->uninstantiate_models_on_device();
model->uninstantiate_models_on_host();
delete model;
}
int main(int argc, char *argv[]) {
std::vector<std::string> non_argument_strings;
int page_size = 16 * 1024 * 1024;
for (int i = 1; i < argc; ++i) {
std::string arg = argv[i];
if ((arg == "-h") || (arg == "--help")) {
show_usage();
return 0;
} else if ((arg == "-p") || (arg == "--page_size")) {
page_size = atoi(argv[++i]);
} else {
non_argument_strings.push_back(arg);
}
}
if (non_argument_strings.size() != 1) {
std::cerr << "Expecting a model as input" << std::endl;
return 1;
}
std::string model_path = non_argument_strings[0];
check_model(page_size, model_path);
}
@@ -14,7 +14,6 @@
#include <pods/binary.h>
#include <pods/buffers.h>
#include <pods/streams.h>
#include "clockwork/tvm/decoupled_graph_runtime.h"
#include "clockwork-convert/tvm_model.h"
#include "clockwork-convert/tvm_abstract_model.h"
@@ -114,8 +113,38 @@ void convert(ConvertConfig config) {
}
void show_usage() {
std::cout << "Provide the name of a model, to convert it" << std::endl;
std::cout << "Specify page size with -p flag" << std::endl;
std::cout << "USAGE" << std::endl;
std::cout << " ./convert [OPTIONS] [MODELS]" << std::endl;
std::cout << "DESCRIPTION" << std::endl;
std::cout << " This utility converts models compiled for TVM into models compatible with" << std::endl;
std::cout << " Clockwork. All models provided to this command must be variants of the SAME" << std::endl;
std::cout << " model but with DIFFERENT batch sizes" << std::endl;
std::cout << "MODELS" << std::endl;
std::cout << " Specify a model with two arguments [model_batchsize] [model_file_prefix]" << std::endl;
std::cout << " model_batchsize" << std::endl;
std::cout << " The specific batch size TVM compiled for this model" << std::endl;
std::cout << " model_file_prefix" << std::endl;
std::cout << " TVM outputs three files when it compiles a model: a .so, a .params," << std::endl;
std::cout << " and a .json file. model_file_prefix specifies the path to these files" << std::endl;
std::cout << "OPTIONS" << std::endl;
std::cout << " -h, --help" << std::endl;
std::cout << " Print this message" << std::endl;
std::cout << " -o, --output" << std::endl;
std::cout << " Directory to output all compiled models to. Defaults to 'model' in the" << std::endl;
std::cout << " current directory. You should probably set this." << std::endl;
std::cout << " -p, --page_size" << std::endl;
std::cout << " Weights page size used by Clockwork. Defaults to 16MB. You shouldn't" << std::endl;
std::cout << " need to set this because we are using 16MB pages." << std::endl;
std::cout << "EXAMPLE" << std::endl;
std::cout << " Suppose you have compiled resnet50 using TVM, for batch sizes 1, 2, and 4," << std::endl;
std::cout << " located in ~/models/resnet50/batchsize1, ~/models/resnet50/batchsize2, and " << std::endl;
std::cout << " ~/models/resnet50/batchsize4 respectively. The following command will" << std::endl;
std::cout << " combine the models and convert them into a clockwork model, putting the" << std::endl;
std::cout << " output in ~/models/resnet50/clockwork" << std::endl;
std::cout << " ./convert -o ~/models/resnet50/clockwork \\" << std::endl;
std::cout << " 1 ~/models/resnet50/batchsize1/tvm-model \\" << std::endl;
std::cout << " 2 ~/models/resnet50/batchsize2/tvm-model \\" << std::endl;
std::cout << " 4 ~/models/resnet50/batchsize4/tvm-model" << std::endl;
}
int main(int argc, char *argv[]) {
@@ -131,8 +160,6 @@ int main(int argc, char *argv[]) {
if ((arg == "-h") || (arg == "--help")) {
show_usage();
return 0;
} else if ((arg == "--weights_page_size")) {
config.weights_page_size = atoi(argv[++i]);
} else if ((arg == "-o") || (arg == "--output")) {
config.output_dir = argv[++i];
} else if ((arg == "-p") || (arg == "--page_size")) {
@@ -142,8 +169,8 @@ int main(int argc, char *argv[]) {
}
}
if (non_argument_strings.size() < 1) {
std::cerr << "Expected input model filename, none given." << std::endl
if (non_argument_strings.size() < 2) {
std::cerr << "Each input model should be specified as <batch_size> <filename>, e.g. 1 ~/models/resnet50/batchsize1" << std::endl
<< "Execute with --help for usage information." << std::endl;
return 1;
}
......
@@ -3,7 +3,6 @@
#include <cuda_runtime.h>
#include "clockwork/tvm/pack_args.h"
#include <tvm/runtime/cuda_common.h>
#include "clockwork/util.h"
#include <iostream>
#include "tbb/task_scheduler_init.h"
@@ -21,10 +20,9 @@
#include <pods/binary.h>
#include <pods/buffers.h>
#include <pods/streams.h>
#include "clockwork/tvm/decoupled_graph_runtime.h"
#include <cuda_runtime.h>
#include <chrono>
#include <tvm/runtime/cuda_common.h>
#include "clockwork/cuda_common.h"
#include "clockwork/cache.h"
#include "clockwork/util.h"
#include "clockwork/model/so.h"
......
@@ -2,7 +2,7 @@
* Copyright (c) 2017 by Contributors
* \file graph_runtime.cc
*/
#include "clockwork/tvm/decoupled_graph_runtime.h"
#include "clockwork-convert/tvm/decoupled_graph_runtime.h"
#include <tvm/runtime/managed_cuda_device_api.h>
#include <tvm/runtime/ndarray.h>
......
@@ -16,7 +16,7 @@
#include <tvm/runtime/module.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/packed_func.h>
#include "clockwork/tvm/decoupled_graph_runtime.h"
#include "clockwork-convert/tvm/decoupled_graph_runtime.h"
namespace tvm_model {
......
@@ -7,7 +7,7 @@
#include <memory>
#include <atomic>
#include <cuda_runtime.h>
#include "tvm/runtime/cuda_common.h"
#include "clockwork/cuda_common.h"
#include "clockwork/telemetry.h"
#include "clockwork/cache.h"
#include "clockwork/model/model.h"
......
#include "clockwork/cache.h"
#include <dmlc/logging.h>
#include "tvm/runtime/cuda_common.h"
#include "clockwork/cuda_common.h"
namespace clockwork {
......