
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: cld/ml/clockwork
Commits on Source (10)
Showing 279 additions and 1571 deletions
......@@ -8,6 +8,7 @@ set(CC "gcc-8")
set(THREADS_PREFER_PTHREAD_FLAG ON)
set(CMAKE_BUILD_TYPE Release)
#set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
find_package(Threads REQUIRED)
find_package(Boost COMPONENTS system filesystem REQUIRED)
......@@ -89,6 +90,7 @@ target_link_libraries( clockwork PUBLIC
tvm_runtime
tbb
nvidia-ml
rt
${Boost_SYSTEM_LIBRARY}
${Boost_FILESYSTEM_LIBRARY}
)
......@@ -97,7 +99,6 @@ target_link_libraries( clockwork PUBLIC
# Converts from TVM models to clockwork models
include_directories(clockwork-convert)
add_executable (convert
src/clockwork-convert/tvm/decoupled_graph_runtime.cc
src/clockwork-convert/tvm_model.cpp
src/clockwork-convert/tvm_abstract_model.cpp
src/clockwork-convert/convert.cpp
......
......@@ -88,13 +88,25 @@ DefaultLimitNOFILE=65535
```
4. Restart
## 3. Increase mmap limits
Clockwork loads a large number of shared objects, so the mmap limit needs to be increased. As root, run:
```
/usr/sbin/sysctl -w vm.max_map_count=10000000
```
In general you can check mmap limits with:
```
sysctl vm.max_map_count
```
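If you also want a startup-time sanity check, a minimal sketch along these lines reads the limit from `/proc` (the `check_max_map_count` helper and the 10000000 threshold are illustrative, not part of Clockwork's existing environment checks):
```
// Sketch (illustrative, not part of Clockwork): read vm.max_map_count and
// warn if it is below the value recommended above.
#include <fstream>
#include <iostream>

bool check_max_map_count(long required = 10000000) {
    std::ifstream f("/proc/sys/vm/max_map_count");
    long current = 0;
    if (!(f >> current)) {
        std::cerr << "Could not read /proc/sys/vm/max_map_count" << std::endl;
        return false;
    }
    if (current < required) {
        std::cerr << "vm.max_map_count is " << current << "; run `sysctl -w vm.max_map_count="
                  << required << "` as root" << std::endl;
        return false;
    }
    return true;
}
```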
## 3. Disable CUDA JIT
None of the models we feed to Clockwork should contain CUDA PTX code (JIT'able code). To make sure, set the `CUDA_CACHE_DISABLE=1` environment variable.
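If you would rather enforce this from inside the process than rely on the shell environment, a hedged sketch is to set the variable before the first CUDA call (`setenv` is POSIX; the placement before any CUDA API use is the important part):
```
// Sketch (illustrative): disable the CUDA JIT cache from within the process.
// Must run before any CUDA runtime or driver API call.
#include <cstdlib>

int main() {
    setenv("CUDA_CACHE_DISABLE", "1", /*overwrite=*/1);
    // ... initialize CUDA and start Clockwork as usual ...
}
```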
## 4. Disable GPU frequency autoscaling
Enable persistence mode
Enable persistence mode. NOTE: This must be done on every restart
```
nvidia-smi -pm 1
```
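Clockwork's environment checks (see the `check_environment` change below) verify this via NVML; a standalone sketch of the same idea, assuming NVML headers and `-lnvidia-ml` are available, looks roughly like:
```
// Sketch (illustrative): query persistence mode for each GPU via NVML.
// Error handling is abbreviated; link with -lnvidia-ml.
#include <nvml.h>
#include <iostream>

int main() {
    nvmlInit();
    unsigned device_count = 0;
    nvmlDeviceGetCount(&device_count);
    for (unsigned i = 0; i < device_count; i++) {
        nvmlDevice_t device;
        nvmlDeviceGetHandleByIndex(i, &device);
        nvmlEnableState_t mode;
        nvmlDeviceGetPersistenceMode(device, &mode);
        std::cout << "GPU " << i << " persistence mode: "
                  << (mode == NVML_FEATURE_ENABLED ? "enabled" : "disabled") << std::endl;
    }
    nvmlShutdown();
}
```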
......@@ -140,4 +152,11 @@ Currently, the CMakeLists assumes CUDA lives in either `/usr/local/cuda/lib64` (
Undefined reference to tvm::runtime::ManagedCuda... -- this probably means you didn't build TVM properly. Make sure you haven't modified or deleted the file `build/config.cmake` in the TVM repository. `make clean` and `make` TVM again.
Unable to set number of open files with ulimit: default values are picked up from conf files, e.g. /etc/security/limits.conf, but they may be overwritten by files in a subdirectory, e.g. /etc/security/limits.d/*.conf
\ No newline at end of file
Unable to set number of open files with ulimit: default values are picked up from conf files, e.g. /etc/security/limits.conf, but they may be overwritten by files in a subdirectory, e.g. /etc/security/limits.d/mpi.conf
If you are loading lots of models, you might see the following:
* `src/clockwork/model/so.cpp:20: Check failed: lib_handle_ != nullptr: Failed to load SO /proc/26344/fd/14656/proc/26344/fd/14656: cannot apply additional memory protection after relocation: Cannot allocate memory`
* `src/clockwork/model/so.cpp:20: Check failed: lib_handle_ != nullptr: Failed to load SO /proc/12386/fd/11804/proc/12386/fd/11804: failed to map segment from shared object`
To solve this you need to increase your mmap limits, described above.
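When debugging these failures it can also help to see how many mappings the process already holds relative to the limit; a rough sketch (each line of `/proc/self/maps` is one mapping) is:
```
// Sketch (illustrative): count the current process's memory mappings and
// compare the number against vm.max_map_count.
#include <fstream>
#include <iostream>
#include <string>

int main() {
    std::ifstream maps("/proc/self/maps");
    std::string line;
    size_t count = 0;
    while (std::getline(maps, line)) count++;
    std::cout << "Current mappings: " << count << std::endl;
}
```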
......@@ -9,6 +9,9 @@ Clockwork must be built in order to run this script.
This script uses the `convert` binary and expects it to exist in the `build` folder
"""
convert_exec = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../build/convert")
parser = argparse.ArgumentParser(description='Convert a TVM model into a Clockwork model')
parser.add_argument("input_dir", metavar="INDIR", type=str, help="Base directory where TVM models exist. The utility expects multiple models, one per batch size, each in a subdirectory.")
parser.add_argument("output_dir", metavar="OUTDIR", type=str, help="Output directory. Directory will be created if it does not exist.")
......@@ -75,7 +78,7 @@ def convert(args):
print("The following command will run:")
pargs = [str(v) for v in [
"../build/convert",
convert_exec,
"-o", args.output_dir,
"-p", args.page_size
] + [x for m in models for x in m]]
......
......@@ -2,7 +2,7 @@
#include <sys/resource.h>
#include <catch2/catch.hpp>
#include <cuda_runtime.h>
#include "tvm/runtime/cuda_common.h"
#include "clockwork/cuda_common.h"
#include "clockwork/util.h"
#include "clockwork/test/util.h"
#include "clockwork/model/so.h"
......@@ -54,6 +54,12 @@ void check_environment() {
std::cout << " ✔ GPU " << i << " is in exclusive mode" << std::endl;
}
if (!util::is_persistence_mode_enabled_on_gpu(i)) { // TODO: check all GPUs
std::cout << " ✘ GPU " << i << " does not have persistent mode enabled; set with `nvidia-smi -i " << i << " -pm 1` or set for all GPUs with `nvidia-smi -pm 1`" << std::endl;
} else {
std::cout << " ✔ GPU " << i << " persistent mode is enabled" << std::endl;
}
}
......
#include <catch2/catch.hpp>
#include <cuda_runtime.h>
#include "tvm/runtime/cuda_common.h"
#include "clockwork/cuda_common.h"
#include "clockwork/util.h"
#include "clockwork/test/util.h"
#include "clockwork/model/so.h"
......
......@@ -5,7 +5,7 @@
#include <thread>
#include <catch2/catch.hpp>
#include <cuda_runtime.h>
#include "tvm/runtime/cuda_common.h"
#include "clockwork/cuda_common.h"
#include "clockwork/util.h"
#include "clockwork/test/util.h"
#include "clockwork/model/so.h"
......@@ -480,6 +480,25 @@ TEST_CASE("Warmup works", "[profile] [warmup]") {
}
}
TEST_CASE("Test max concurrent models", "[so_limits]") {
util::setCudaFlags();
util::initializeCudaStream();
std::string model_path = "/home/jcmace/clockwork-modelzoo-volta/others/cifar_resnet20_v1/model";
model::Model* model = load_model_from_disk(model_path);
std::vector<model::Model*> copies;
for (unsigned i = 0; i < 100000; i++) {
model::Model* copy = duplicate(model, false);
// model::Model* copy = load_model_from_disk(model_path);
copy->instantiate_model_on_host();
//copy->instantiate_model_on_device();
copies.push_back(copy);
std::cout << i << std::endl;
}
}
void runMultiClientExperiment(int num_execs, int models_per_exec, bool duplicate_weights, int iterations, bool with_module_load) {
util::setCudaFlags();
for (unsigned i = 0; i < 3; i++) {
......
......@@ -20,13 +20,15 @@ void show_usage() {
std::cout << " -p, --page_size" << std::endl;
std::cout << " Weights page size used by Clockwork. Defaults to 16MB. You shouldn't" << std::endl;
std::cout << " need to set this because we are using 16MB pages." << std::endl;
std::cout << " -i, --iterations" << std::endl;
std::cout << " Number of iterations to measure" << std::endl;
}
model::BatchedModel* load_model(std::string model) {
return model::BatchedModel::loadFromDisk(model, 0);
}
void check_model(int page_size, std::string model_path) {
void check_model(int page_size, int iterations, std::string model_path) {
std::cout << "Checking " << model_path << std::endl;
util::setCudaFlags();
......@@ -35,9 +37,6 @@ void check_model(int page_size, std::string model_path) {
clockwork::model::BatchedModel* model = load_model(model_path);
auto batch_sizes = model->implemented_batch_sizes();
for (unsigned batch_size : batch_sizes) {
std::cout << " loaded batch size " << batch_size << std::endl;
}
model->instantiate_models_on_host();
......@@ -47,9 +46,32 @@ void check_model(int page_size, std::string model_path) {
cudaError_t status;
model->instantiate_models_on_device();
std::shared_ptr<Allocation> weights = weights_cache->alloc(model->num_weights_pages(weights_page_size), []{});
model->transfer_weights_to_device(weights->page_pointers, util::Stream());
unsigned num_pages = model->num_weights_pages(weights_page_size);
std::shared_ptr<Allocation> weights = weights_cache->alloc(num_pages, []{});
int warmups = 10;
// Time the transfer
for (unsigned i = 0; i < warmups; i++) {
model->transfer_weights_to_device(weights->page_pointers, util::Stream());
}
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
auto before_transfer = util::now();
for (unsigned i = 0; i < iterations; i++) {
model->transfer_weights_to_device(weights->page_pointers, util::Stream());
}
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
auto after_transfer = util::now();
std::cout << " input_size: " << model->input_size(1) << std::endl;
std::cout << " output_size: " << model->output_size(1) << std::endl;
std::cout << " workspace: " << model->workspace_memory_size(1) << std::endl;
std::cout << " weights size paged (non-paged) [num_pages]: " << (weights_page_size * num_pages) << " (" << model->weights_size << ") [" << num_pages << "]" << std::endl;
printf(" weights transfer latency: %.2f ms\n", ((float) (after_transfer-before_transfer)) / (iterations * 1000000.0));
std::cout << " execution latency:" << std::endl;
for (unsigned batch_size : batch_sizes) {
// Create inputs and outputs
......@@ -70,21 +92,19 @@ void check_model(int page_size, std::string model_path) {
model->transfer_input_to_device(batch_size, input, io_memory, util::Stream());
// Time the call
int warmups = 20;
for (int i = 0; i < warmups; i++) {
model->call(batch_size, weights->page_pointers, io_memory, workspace_memory, util::Stream());
}
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
auto before = util::now();
int iterations = 100;
for (int i = 0; i < iterations; i++) {
model->call(batch_size, weights->page_pointers, io_memory, workspace_memory, util::Stream());
}
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
auto after = util::now();
printf(" b%d: %.2f ms per call\n", batch_size, ((float) (after-before)) / (iterations * 1000000.0));
printf(" b%d: %.2f ms\n", batch_size, ((float) (after-before)) / (iterations * 1000000.0));
model->transfer_output_from_device(batch_size, output, io_memory, util::Stream());
......@@ -114,6 +134,7 @@ void check_model(int page_size, std::string model_path) {
int main(int argc, char *argv[]) {
std::vector<std::string> non_argument_strings;
int iterations = 100;
int page_size = 16 * 1024 * 1024;
for (int i = 1; i < argc; ++i) {
std::string arg = argv[i];
......@@ -122,6 +143,8 @@ int main(int argc, char *argv[]) {
return 0;
} else if ((arg == "-p") || (arg == "--page_size")) {
page_size = atoi(argv[++i]);
} else if ((arg == "-i") || (arg == "--iterations")) {
iterations = atoi(argv[++i]);
} else {
non_argument_strings.push_back(arg);
}
......@@ -134,5 +157,5 @@ int main(int argc, char *argv[]) {
std::string model_path = non_argument_strings[0];
check_model(page_size, model_path);
check_model(page_size, iterations, model_path);
}
......@@ -2,6 +2,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include "clockwork/tvm/pack_args.h"
#include "dmlc/memory_io.h"
#include "clockwork/util.h"
#include <iostream>
......
/*!
* Copyright (c) 2017 by Contributors
*
* \brief Tiny graph runtime that can run graph
* containing only tvm PackedFunc.
* \file graph_runtime.h
*/
#ifndef CLOCKWORK_TVM_DECOUPLED_RUNTIME_GRAPH_GRAPH_RUNTIME_H_
#define CLOCKWORK_TVM_DECOUPLED_RUNTIME_GRAPH_GRAPH_RUNTIME_H_
//#include "graph_runtime.h"
#include <dlpack/dlpack.h>
#include <dmlc/memory_io.h>
#include <dmlc/json.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/managed_cuda_device_api.h>
#include "clockwork/modeldef.h"
#include <vector>
#include <string>
namespace tvm {
namespace runtime {
/*! \brief macro to do C API call */
#define TVM_CCALL(func) \
{ \
int ret = (func); \
CHECK_EQ(ret, 0) \
<< TVMGetLastError(); \
}
/*! \brief Magic number for NDArray list file */
constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7;
/*! \brief operator attributes about tvm op */
struct TVMOpParam {
std::string func_name;
uint32_t num_inputs;
uint32_t num_outputs;
uint32_t flatten_data;
};
/*!
* \brief Tiny graph runtime with interfaces defined to isolate resource usage.
*
* This runtime can be accessed in various languages via the
* TVM runtime PackedFunc API.
*/
class DecoupledGraphRuntime : public ModuleNode {
public:
// GraphRuntime() : ModuleNode() {
// std::cout << "Creating GraphRuntime" << std::endl;
// }
virtual ~DecoupledGraphRuntime() {
if (tempParams_ != nullptr) {
for (size_t i = 0; i < this->paramsSize_; ++i) {
delete this->tempParams_[i];
}
delete this->copyParamsToEIDs_;
delete this->tempParams_;
} else {
delete contiguous_input_memory.tempParamsArray;
}
}
/*!
* \brief Get member function to front-end
* \param name The name of the function.
* \param sptr_to_self The pointer to the module node.
* \return The corresponding member function.
*/
virtual PackedFunc GetFunction(const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self);
/*!
* \return The type key of the executor.
*/
const char* type_key() const final {
return "DecoupledGraphRuntime";
}
void Run();
/*!
* \brief Initialize the graph executor with graph and context.
* \param graph_json The execution graph.
* \param module The module containing the compiled functions for the host
* processor.
* \param ctxs The context of the host and devices where graph nodes will be
* executed on.
*/
void Init(const std::string& graph_json,
tvm::runtime::Module module,
const std::vector<TVMContext>& ctxs,
bool contiguous);
/*!
* \brief Get the input index given the name of input.
* \param name The name of the input.
* \return The index of input.
*/
int GetInputIndex(const std::string& name);
/*!
* \brief set index-th input to the graph.
* \param index The input index.
* \param data_in The input data.
*/
void SetInput(int index, DLTensor* data_in);
/*!
* \brief Get the number of outputs
*
* \return The number of outputs from graph.
*/
int NumOutputs() const;
/*!
* \brief Return NDArray for given input index.
* \param index The input index.
*
* \return NDArray corresponding to given input node index.
*/
NDArray GetInput(int index) const;
/*!
* \brief Return NDArray for given output index.
* \param index The output index.
*
* \return NDArray corresponding to given output node index.
*/
NDArray GetOutput(int index) const;
/*!
* \brief Copy index-th output to data_out.
* \param index The output index.
* \param data_out the output data.
*/
void CopyOutputTo(int index, DLTensor* data_out);
/*!
* \brief Load parameters from parameter blob to cpu memory
* \param strm The input stream.
*/
void LoadParamsContiguously(const std::string& param_blob);
/*!
* \brief Load parameters from binary stream to cpu memory
* \param strm The input stream.
*/
void LoadParams(dmlc::Stream* strm);
/*!
* \brief Load parameters from parameter blob to cpu memory.
* \param param_blob A binary blob of parameter.
*/
void LoadParams(const std::string& param_blob);
/*!
* \brief Do necessary transfers to device.
* \param param_blob A binary blob of parameter.
*/
void* LoadToDevice();
/*!
* \brief Get the tensor vector pointer.
*/
std::vector<NDArray>& data_entry() {
return data_entry_;
}
/*!
* \brief Get the execution function pointer.
*/
std::vector<std::function<void()> >& op_execs() {
return op_execs_;
}
/*!
* \brief Get node entry index.
* \param nid Node id.
* \param index Index of the nodes.
*/
uint32_t GetEntryId(uint32_t nid, uint32_t index) const {
return node_row_ptr_[nid] + index;
}
/*!
* \brief Get total number of nodes.
* \return Total number of nodes.
*/
uint32_t GetNumOfNodes() const {
return static_cast<uint32_t>(nodes_.size());
}
std::string GetNodeName(uint32_t nid) const {
return nodes_[nid].name;
}
private:
// Memory pool entry.
struct PoolEntry {
size_t size;
int device_type;
int storage_id;
PoolEntry(int s, int dev_type) : size(s), device_type(dev_type), storage_id(0) {}
PoolEntry(int s, int dev_type, int storage_id) : size(s), device_type(dev_type), storage_id(storage_id) {}
};
// Node entry
struct NodeEntry {
uint32_t node_id;
uint32_t index;
uint32_t version;
// JSON Loader
void Load(dmlc::JSONReader *reader) {
reader->BeginArray();
CHECK(reader->NextArrayItem()) << "invalid json format";
reader->Read(&node_id);
CHECK(reader->NextArrayItem()) << "invalid json format";
reader->Read(&index);
if (reader->NextArrayItem()) {
reader->Read(&version);
CHECK(!reader->NextArrayItem()) << "invalid json format";
} else {
version = 0;
}
}
};
// Node
struct Node {
// operator type in string
std::string op_type;
// name of the op
std::string name;
// parameters
TVMOpParam param;
// inputs
std::vector<NodeEntry> inputs;
// control deps
std::vector<uint32_t> control_deps;
// JSON Loader
void LoadAttrs(dmlc::JSONReader *reader, TVMOpParam* param) {
int bitmask = 0;
std::string key, value;
reader->BeginObject();
while (reader->NextObjectItem(&key)) {
reader->Read(&value);
if (key == "func_name") {
param->func_name = value;
bitmask |= 1;
} else if (key == "num_inputs") {
param->num_inputs = strtoul(value.c_str(), nullptr, 10);
bitmask |= 2;
} else if (key == "num_outputs") {
param->num_outputs = strtoul(value.c_str(), nullptr, 10);
bitmask |= 4;
} else if (key == "flatten_data") {
param->flatten_data = strtoul(value.c_str(), nullptr, 10);
bitmask |= 8;
}
}
CHECK_EQ(bitmask, 1|2|4|8) << "invalid format";
}
// JSON Loader
void Load(dmlc::JSONReader *reader) {
reader->BeginObject();
int bitmask = 0;
std::string key;
while (reader->NextObjectItem(&key)) {
if (key == "op") {
reader->Read(&op_type);
bitmask |= 1;
} else if (key == "name") {
reader->Read(&name);
bitmask |= 2;
} else if (key == "inputs") {
reader->Read(&inputs);
bitmask |= 4;
} else if (key == "attr" || key == "attrs") {
this->LoadAttrs(reader, &param);
} else if (key == "control_deps") {
reader->Read(&control_deps);
} else {
LOG(FATAL) << "do not support key " << key;
}
}
CHECK_EQ(bitmask, 1|2|4) << "invalid format";
}
};
struct GraphAttr {
size_t storage_num_not_alloctaed{0};
std::vector<int> storage_id;
std::vector<int> device_index;
std::vector<std::string> dltype;
std::vector<std::vector<int64_t> > shape;
// The graph attribute fields.
void Load(dmlc::JSONReader *reader) {
reader->BeginObject();
int bitmask = 0;
std::string key, type;
while (reader->NextObjectItem(&key)) {
if (key == "dltype") {
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_str");
CHECK(reader->NextArrayItem());
reader->Read(&dltype);
CHECK(!reader->NextArrayItem());
bitmask |= 1;
} else if (key == "storage_id") {
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_int");
CHECK(reader->NextArrayItem());
reader->Read(&storage_id);
CHECK(!reader->NextArrayItem());
bitmask |= 2;
} else if (key == "shape") {
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_shape");
CHECK(reader->NextArrayItem());
reader->Read(&shape);
CHECK(!reader->NextArrayItem());
bitmask |= 4;
} else if (key == "device_index") {
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_int");
CHECK(reader->NextArrayItem());
reader->Read(&device_index);
CHECK(!reader->NextArrayItem());
} else {
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
if (type == "list_int") {
CHECK(reader->NextArrayItem());
std::vector<int> temp;
reader->Read(&temp);
} else if (type == "size_t") {
CHECK(reader->NextArrayItem());
size_t temp;
reader->Read(&temp);
} else {
LOG(FATAL) << "cannot skip graph attr " << key;
}
CHECK(!reader->NextArrayItem());
}
}
CHECK_EQ(bitmask, 1|2|4) << "invalid format";
}
};
// The graph attribute fields.
void Load(dmlc::JSONReader *reader) {
reader->BeginObject();
int bitmask = 0;
std::string key;
while (reader->NextObjectItem(&key)) {
if (key == "nodes") {
reader->Read(&nodes_);
bitmask |= 1;
} else if (key == "arg_nodes") {
reader->Read(&input_nodes_);
bitmask |= 2;
} else if (key == "node_row_ptr") {
reader->Read(&node_row_ptr_);
bitmask |= 4;
} else if (key == "heads") {
reader->Read(&outputs_);
bitmask |= 8;
} else if (key == "attrs") {
reader->Read(&attrs_);
bitmask |= 16;
} else {
LOG(FATAL) << "key " << key << " is not supported";
}
}
CHECK_EQ(bitmask, 1|2|4|8|16) << "invalid format";
}
/*! \brief Creates description of the temporal memory */
void SetupStorage();
void SetupStorageContiguous();
/*! \brief Allocates memory needed as described in
pool_entries_ and contiguous_memory_input */
void AllocateStorageSpace();
/*!
* \brief Load previously prepped parameters to the device.
* \param param_blob A binary blob of parameter.
*/
void UploadParams();
std::string SaveParams();
/*! \brief Setup the executors. */
void SetupOpExecs();
NDArray GetConstParams();
void SetConstParams(NDArray params);
clockwork::model::ModelDef* ExtractModelSpec();
std::vector<std::vector<WorkspaceAlloc>>* ExtractWorkspaceAllocs();
/*!
* \brief Create an execution function given input.
* \param attrs The node attributes.
* \param args The arguments to the functor, including inputs and outputs.
* \param num_inputs Number of inputs.
* \param dev_type The device type of the tvm_op.
* \return The created executor.
*/
std::function<void()> CreateTVMOp(const TVMOpParam& attrs,
const std::vector<DLTensor>& args,
size_t num_inputs);
// Get node entry index.
uint32_t entry_id(uint32_t nid, uint32_t index) const {
return node_row_ptr_[nid] + index;
}
// Get node entry index.
uint32_t entry_id(const NodeEntry& e) const {
return entry_id(e.node_id, e.index);
}
// Number of node entries.
uint32_t num_node_entries() const {
return node_row_ptr_.back();
}
/*! \brief The graph nodes. */
std::vector<Node> nodes_;
/*! \brief The argument nodes. */
std::vector<uint32_t> input_nodes_;
/*! \brief Used for quick entry indexing. */
std::vector<uint32_t> node_row_ptr_;
/*! \brief Output entries. */
std::vector<NodeEntry> outputs_;
/*! \brief Additional graph attributes. */
GraphAttr attrs_;
/*! \brief The code module that contains both host and device code. */
tvm::runtime::Module module_;
/*! \brief Execution context of all devices including the host. */
std::vector<TVMContext> ctxs_;
/*! \brief Common storage pool for all devices. */
std::vector<NDArray> storage_pool_;
/*! \brief Data entry of each node. */
std::vector<NDArray> data_entry_;
/*! \brief Operator on each node. */
std::vector<std::function<void()> > op_execs_;
/*! \brief Parameters loaded in cpu memory. */
NDArray** tempParams_ = nullptr;
/*! \brief Place to copy params on device memory. */
uint32_t* copyParamsToEIDs_ = nullptr;
/*! \brief Number of parameters loaded in cpu memory. */
size_t paramsSize_ = 0;
typedef struct contiguous_memory_allocation {
contiguous_memory_allocation() : size(0), tempParamsArray(nullptr) {}
uint64_t size;
uint64_t paramsSize; // Size inside this memory allocation of the const params
std::vector<uint32_t> storage_ids;
std::vector<uint32_t> offsets;
NDArray backing_array;
NDArray backing_array_params_view;
NDArray* tempParamsArray;
TVMContext ctx;
} contiguous_memory_allocation;
/*! \brief Allocate input memory as a single contiguous region to enable fast copy. */
contiguous_memory_allocation contiguous_input_memory;
// Size and device type of each storage pool entry.
std::vector<PoolEntry> pool_entries_;
};
Module DecoupledGraphRuntimeCreate(const std::string& sym_json,
const tvm::runtime::Module& m, int device_type, int device_id);
std::shared_ptr<DecoupledGraphRuntime> DecoupledGraphRuntimeCreateDirect(const std::string& sym_json,
const tvm::runtime::Module& m, int device_type, int device_id);
// std::vector<TVMContext> GetAllContext(const TVMArgs& args);
} // namespace runtime
} // namespace tvm
#endif  // CLOCKWORK_TVM_DECOUPLED_RUNTIME_GRAPH_GRAPH_RUNTIME_H_
......@@ -6,6 +6,7 @@
#include <cstring>
#include "clockwork-convert/tvm_model.h"
#include <dmlc/logging.h>
#include <tvm/runtime/packed_func.h>
#include <dlpack/dlpack.h>
......@@ -50,7 +51,7 @@ Model Model::fromTVM(tvm_model::Model &model, tvm_model::Params &params, tvm_mod
for (unsigned i = 0; i < model.attrs_.storage_id.size(); i++) {
Tensor* tensor = new Tensor();
tensor->id = i;
tensor->dltype = tvm::runtime::String2TVMType(model.attrs_.dltype[i]);
tensor->dltype = tvm::runtime::String2DLDataType(model.attrs_.dltype[i]);
tensor->shape = model.attrs_.shape[i];
tensor->storage = storage_locations[model.attrs_.storage_id[i]];
tensors.push_back(tensor);
......@@ -63,12 +64,19 @@ Model Model::fromTVM(tvm_model::Model &model, tvm_model::Params &params, tvm_mod
weights->data = p.second->dataptr();
weights->size = p.second->Size();
weights->tensor = nullptr;
CHECK(out.weights.find(p.first) == out.weights.end()) << "Found duplicate layers with name " << p.first;
out.weights[p.first] = weights;
}
std::vector<std::string> seen_names;
for (unsigned i = 0; i < model.nodes_.size(); i++) {
tvm_model::Node &node = model.nodes_[i];
CHECK(std::find(seen_names.begin(), seen_names.end(), node.name) == seen_names.end()) << "Found duplicate node " << node.name;
seen_names.push_back(node.name);
if (node.op_type == "null") {
int input_index = model.node_row_ptr_[i];
int input_offset = 0;
......@@ -202,7 +210,8 @@ void make_weights_lookup_table(Model &model, PageMappedStorage* mapped) {
// Store the actual weights data in the storage lookup table.
auto data = std::string(static_cast<char*>(weights->data), weights->size);
mapped->weights_lookup[data] = std::make_pair(page_number, index_in_page);
mapped->weights_lookup[data].push_back(PreMappedIndices{page_number, index_in_page});
index_in_page++;
}
......@@ -212,6 +221,8 @@ void make_weights_lookup_table(Model &model, PageMappedStorage* mapped) {
}
std::vector<Page*> replicate_weights_mapping(Model &model, PageMappedStorage* existing_weights_mapping) {
std::unordered_map<std::string, unsigned> current_index;
std::vector<Page*> pages;
for (unsigned i = 0; i < existing_weights_mapping->weights.size(); i++) {
Page* page = new Page();
......@@ -224,11 +235,12 @@ std::vector<Page*> replicate_weights_mapping(Model &model, PageMappedStorage* ex
for (auto &p : model.weights) {
LayerWeights* weights = p.second;
auto data = std::string(static_cast<char*>(weights->data), weights->size);
if (existing_weights_mapping->weights_lookup.find(data) == existing_weights_mapping->weights_lookup.end()) {
throw "Error: " + weights->name + " not found in existing weights mapping";
}
auto assignment = existing_weights_mapping->weights_lookup[data];
pages[assignment.first]->used_by[assignment.second] = weights->tensor->storage;
CHECK(existing_weights_mapping->weights_lookup.find(data) != existing_weights_mapping->weights_lookup.end())
<< "Error: " + weights->name + " not found in existing weights mapping";
unsigned position = current_index[data]++;
PreMappedIndices indices = existing_weights_mapping->weights_lookup[data][position];
pages[indices.page_index]->used_by[indices.location_index] = weights->tensor->storage;
}
for (unsigned i = 0; i < pages.size(); i++) {
......@@ -257,8 +269,12 @@ PageMappedStorage* PageMappedStorage::calculate(Model &model, size_t weights_pag
CHECK(p.second->tensor != nullptr);
CHECK(p.second->tensor->storage != nullptr);
weights_storage.push_back(p.second->tensor->storage);
seen.push_back(p.second->tensor->storage);
StorageLocation* l = p.second->tensor->storage;
CHECK(std::find(seen.begin(), seen.end(), l) == seen.end()) << "Found duplicate storage location";
weights_storage.push_back(l);
seen.push_back(l);
}
// Pull out storage locations that are model inputs
......@@ -309,6 +325,12 @@ PageMappedStorage* PageMappedStorage::calculate(Model &model, size_t weights_pag
// Pack the weights onto pages, or use existing mapping if provided
if (existing_weights_mapping == nullptr) {
mapped->weights = pack(weights_storage, weights_page_size);
int count = 0;
for (Page* page : mapped->weights) {
count += page->used_by.size();
}
make_weights_lookup_table(model, mapped);
} else {
mapped->weights = replicate_weights_mapping(model, existing_weights_mapping);
......@@ -491,6 +513,9 @@ void makeModelDef(Model &model, size_t weights_page_size, clockwork::model::Page
tensordef.page_offset = pageptr.page_offset;
tensordef.size = tensor->Size();
tensordef.shape = tensor->shape;
tensordef.code = tensor->dltype.code;
tensordef.bits = tensor->dltype.bits;
tensordef.lanes = tensor->dltype.lanes;
opdef.inputs.push_back(tensordef);
}
......
......@@ -90,6 +90,12 @@ public:
size_t Size();
};
class PreMappedIndices {
public:
unsigned page_index;
unsigned location_index;
};
class PageMappedStorage {
public:
std::vector<Page*> weights;
......@@ -97,7 +103,7 @@ public:
std::vector<StorageLocation*> workspace_storage;
size_t transient_memory;
std::unordered_map<std::string, std::pair<unsigned, unsigned>> weights_lookup;
std::unordered_map<std::string, std::vector<PreMappedIndices>> weights_lookup;
static PageMappedStorage* calculate(Model &model, size_t weights_page_size, PageMappedStorage* existing_weights_mapping = nullptr);
};
......
#include "clockwork-convert/tvm_model.h"
#include <tvm/runtime/device_api.h>
using namespace tvm_model;
......@@ -177,6 +178,7 @@ void Params::Load(dmlc::Stream* strm) {
CHECK(size == names.size()) << "Invalid parameters file format";
for (std::string name : names) {
CHECK(data.find(name) == data.end()) << "Duplicate params for " << name;
data[name] = new tvm::runtime::NDArray();
data[name]->Load(strm);
}
......@@ -198,7 +200,7 @@ Allocs Allocs::ProfileModel(std::string model_so, std::string model_json, std::s
const int device_type = kDLGPU;
const int device_id = 0;
const tvm::runtime::PackedFunc load_module(*tvm::runtime::Registry::Get("module.loadfile_so"));
const tvm::runtime::PackedFunc load_module(*tvm::runtime::Registry::Get("runtime.module.loadfile_so"));
tvm::runtime::Module mod_syslib = load_module(model_so, "so");
// Graph structure
......@@ -207,8 +209,8 @@ Allocs Allocs::ProfileModel(std::string model_so, std::string model_json, std::s
json_in.close();
// Construct TVM runtime
std::shared_ptr<tvm::runtime::DecoupledGraphRuntime> rt = DecoupledGraphRuntimeCreateDirect(json_data, mod_syslib, device_type, device_id);
tvm::runtime::Module mod = tvm::runtime::Module(rt);
const tvm::runtime::PackedFunc create_graph_runtime(*tvm::runtime::Registry::Get("tvm.graph_runtime.create"));
tvm::runtime::Module mod = create_graph_runtime(json_data, mod_syslib, device_type, device_id);
// const tvm::runtime::PackedFunc create_graph_runtime(*tvm::runtime::Registry::Get("tvm.decoupled_graph_runtime.create_contiguous"));
// tvm::runtime::Module mod = create_graph_runtime(json_data, mod_syslib, device_type, device_id);
......@@ -231,12 +233,8 @@ Allocs Allocs::ProfileModel(std::string model_so, std::string model_json, std::s
// tvm::runtime::PackedFunc set_const_params = mod.GetFunction("set_const_params");
// tvm::runtime::NDArray const_params = get_const_params();
// load the model onto device
tvm::runtime::PackedFunc load_to_device = mod.GetFunction("load_to_device");
load_to_device();
tvm::runtime::PackedFunc extract_allocs = mod.GetFunction("extract_workspace_allocs");
tvm::runtime::PackedFunc extract_allocs = mod.GetFunction("profile_workspace_allocs");
std::vector<std::vector<tvm::runtime::WorkspaceAlloc>>* alloc_vector = static_cast<std::vector<std::vector<tvm::runtime::WorkspaceAlloc>>*>((void*) extract_allocs());
Allocs allocs;
......
......@@ -16,7 +16,6 @@
#include <tvm/runtime/module.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/packed_func.h>
#include "clockwork-convert/tvm/decoupled_graph_runtime.h"
namespace tvm_model {
......
......@@ -2,6 +2,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include "clockwork/tvm/pack_args.h"
#include "dmlc/memory_io.h"
#include "clockwork/cuda_common.h"
#include "clockwork/util.h"
......
......@@ -3,14 +3,30 @@
#include <sys/syscall.h>
#include "dmlc/logging.h"
#include "clockwork/model/memfile.h"
#include <iostream>
#include <sys/mman.h>
#include <fcntl.h>
#include <sstream>
#include <cstdio>
namespace clockwork {
int make_shmemfd(std::string &name) {
int fd = shm_open(name.c_str(), O_RDWR | O_CREAT, S_IRWXU);
if (fd < 0) {
std::cout << fd << std::endl;
perror("shm_open");
CHECK(fd < 0) << "Could not create memfd using shm_open";
}
return fd;
}
int make_memfd() {
int fd = syscall(__NR_memfd_create, std::string("").c_str(), 0);
if (fd < 0) {
std::cout << fd << std::endl;
perror("memfd_create");
CHECK(fd < 0) << "ModelCodeReader could not create memfile";
CHECK(fd < 0) << "Could not create memfd using memfd_create";
}
return fd;
}
......@@ -21,7 +37,7 @@ std::string memfd_filename(int memfd) {
return ss.str();
}
Memfile Memfile::readFrom(const std::string &filename) {
MemfdFile* MemfdFile::readFrom(const std::string &filename) {
int memfd = make_memfd();
std::string memfilename = memfd_filename(memfd);
std::ofstream dst(memfilename, std::ios::binary);
......@@ -34,18 +50,46 @@ Memfile Memfile::readFrom(const std::string &filename) {
src.close();
dst.close();
return Memfile(memfd, memfilename);
return new MemfdFile(memfd, memfilename);
}
MemfileReader::MemfileReader(const Memfile &memfile) :
memfile(memfile), dst(memfile.filename, std::ios::binary) {
int MemfdFile::close() {
std::cout << "Closing " << memfd << std::endl;
return ::close(memfd);
}
unsigned shmem_counter = 0;
ShmemFile* ShmemFile::readFrom(const std::string &filename) {
// Filename of the shmem file
std::stringstream name;
name << "/clockwork-" << getpid() << "-" << shmem_counter++;
std::string shmem_name = name.str();
int shmem_fd = make_shmemfd(shmem_name);
// Path to the shmem file
std::string shmem_path = "/dev/shm" + shmem_name;
// Remove existing file
std::remove(shmem_path.c_str());
std::ofstream dst(shmem_path, std::ios::binary);
CHECK(dst.good()) << "Bad memfile " << shmem_path;
std::ifstream src(filename, std::ios::binary);
CHECK(src.good()) << "Unable to open file " << filename;
dst << src.rdbuf();
src.close();
dst.close();
return new ShmemFile(shmem_fd, shmem_path, shmem_name);
}
unsigned MemfileReader::readsome(std::istream stream, unsigned amount) {
char buf[amount];
int amountRead = stream.readsome(buf, amount);
dst.write(buf, amountRead);
return amountRead;
int ShmemFile::close() {
::close(memfd);
int status = shm_unlink(name.c_str());
return status;
}
}
\ No newline at end of file
......@@ -9,30 +9,66 @@
namespace clockwork {
/** An in-memory file */
class Memfile {
class MemfileImpl {
public:
const int memfd;
const std::string filename;
Memfile(const int &memfd, const std::string &filename) :
MemfileImpl(const int &memfd, const std::string &filename) :
memfd(memfd), filename(filename) {}
// Copy another file into a memfile
static Memfile readFrom(const std::string &filename);
virtual int close() = 0;
};
/** An in-memory file */
class MemfdFile : public MemfileImpl {
public:
MemfdFile(const int &memfd, const std::string &filename) :
MemfileImpl(memfd, filename) {}
// Copy another file into a MemfdFile
static MemfdFile* readFrom(const std::string &filename);
virtual int close();
};
/** Used for reading data from a stream into a memfile */
class MemfileReader {
class ShmemFile : public MemfileImpl {
public:
const Memfile &memfile;
MemfileReader(const Memfile &memfile);
const std::string name;
ShmemFile(const int &fd, const std::string &filename, const std::string &name) :
MemfileImpl(fd, filename), name(name) {}
// Copy another file into a ShmemFile
static ShmemFile* readFrom(const std::string &filename);
virtual int close();
};
class Memfile {
private:
std::ofstream dst;
MemfileImpl* impl;
public:
const int memfd;
const std::string filename;
Memfile(MemfileImpl* impl) : impl(impl), memfd(impl->memfd), filename(impl->filename) {}
unsigned readsome(std::istream stream, unsigned amount);
// Copy another file into the default memfile impl
static Memfile readFrom(const std::string &filename) {
return Memfile(ShmemFile::readFrom(filename));
}
int close() {
if (impl != nullptr) {
return impl->close();
} else {
return -1;
}
}
};
}
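With this refactor, `Memfile` keeps its old `readFrom`/`filename` interface but delegates to a `MemfileImpl` (a `ShmemFile` under `/dev/shm` by default), and callers are expected to `close()` it once the file is no longer needed. A hedged usage sketch (the `model.so` path is illustrative):
```
// Sketch (illustrative): copy a shared object into an in-memory file,
// use its path, then close() to release/unlink the backing /dev/shm entry.
#include "clockwork/model/memfile.h"
#include <iostream>

int main() {
    clockwork::Memfile memfile = clockwork::Memfile::readFrom("model.so"); // illustrative path
    std::cout << "In-memory copy at " << memfile.filename << std::endl;
    // ... e.g. dlopen(memfile.filename.c_str(), ...) ...
    memfile.close();
}
```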
......
......@@ -2,6 +2,7 @@
#include "clockwork/cuda_common.h"
#include "clockwork/util.h"
#include "clockwork/model/model.h"
#include <unistd.h>
using namespace clockwork::model;
......@@ -42,12 +43,25 @@ void Model::instantiate_model_on_host() {
weights_pages_count = spec->weights_pages.size();
io_size = spec->io_memory;
workspace_size = spec->workspace_memory;
inputs_size = 0;
for (auto &input : spec->inputs) {
inputs_size += input.size;
}
outputs_size = 0;
for (auto &output : spec->outputs) {
outputs_size += output.size;
}
// 3: setup model executor
op_execs = new std::vector<OpExec>(spec->ops.size());
for (unsigned i = 0; i < spec->ops.size(); i++) {
make_op_exec(spec->ops[i], (*op_execs)[i]);
}
// Close original so_memfile
so_memfile.close();
}
void Model::uninstantiate_model_on_host() {
......@@ -111,17 +125,18 @@ void Model::transfer_weights_to_device(std::vector<char*> &weights_pages, cudaSt
stream
)
)
if (i > MAX_OUTSTANDING_MEMCPY_EVENTS) {
CUDA_CALL(cudaEventSynchronize(rate_limit_events[i % MAX_OUTSTANDING_MEMCPY_EVENTS]));
if (rate_limit) {
if (i > MAX_OUTSTANDING_MEMCPY_EVENTS) {
CUDA_CALL(cudaEventSynchronize(rate_limit_events[i % MAX_OUTSTANDING_MEMCPY_EVENTS]));
}
CUDA_CALL(cudaEventRecord(rate_limit_events[i % MAX_OUTSTANDING_MEMCPY_EVENTS], stream));
}
CUDA_CALL(cudaEventRecord(rate_limit_events[i % MAX_OUTSTANDING_MEMCPY_EVENTS], stream));
}
}
size_t Model::input_size() {
/** TODO: for now, a model only has one input */
CHECK(spec != nullptr) << "input_size spec is nullptr";
return spec->inputs[0].size;
return inputs_size;
}
/* Preconditions: instantiate_model_on_host && set_workspace_pages */
......@@ -131,9 +146,10 @@ void Model::transfer_input_to_device(const char* input_ptr, char* &dst_io_memory
void Model::transfer_input_to_device(size_t input_size, const char* input_ptr, char* &dst_io_memory, cudaStream_t stream) {
CHECK(spec != nullptr) << "transfer_input_to_device spec is nullptr";
CHECK(input_size <= spec->inputs[0].size) << "transfer_input_to_device tried to transfer more bytes than allowed";
CHECK(input_size <= inputs_size) << "transfer_input_to_device tried to transfer more bytes than allowed";
CHECK(spec->inputs[0].page == weights_pages_count) << "transfer_input_to_device expected input on page " << weights_pages_count;
void* dst_ptr = dst_io_memory + spec->inputs[0].page_offset;
CHECK(spec->inputs[0].page_offset == 0) << "transfer_input_to_device expected inputs to start at offset 0 on io_memory but found " << spec->inputs[0].page_offset;
void* dst_ptr = dst_io_memory;
CUDA_CALL(cudaSetDevice(gpu_id));
CUDA_CALL(
cudaMemcpyAsync(
......@@ -148,9 +164,8 @@ void Model::transfer_input_to_device(size_t input_size, const char* input_ptr, c
/* Preconditions: instantiate_model_on_host */
size_t Model::output_size() {
/** TODO: for now, a model only has one output */
CHECK(spec != nullptr) << "output_size spec is nullptr";
return spec->outputs[0].size;
return outputs_size;
}
/* Preconditions: instantiate_model_on_host */
......@@ -160,9 +175,10 @@ void Model::transfer_output_from_device(char* output_ptr, char* &src_io_memory,
void Model::transfer_output_from_device(size_t output_size, char* output_ptr, char* &src_io_memory, cudaStream_t stream) {
CHECK(spec != nullptr) << "transfer_output_from_device spec is nullptr";
CHECK(output_size <= spec->outputs[0].size) << "transfer_output_from_device tried to transfer more bytes than allowed";
CHECK(spec->inputs[0].page == weights_pages_count) << "transfer_output_from_device expected output on page " << weights_pages_count;
void* src_ptr = src_io_memory + spec->outputs[0].page_offset;
CHECK(output_size <= outputs_size) << "transfer_output_from_device tried to transfer more bytes than allowed";
CHECK(spec->outputs[0].page == weights_pages_count) << "transfer_output_from_device expected output on page " << weights_pages_count;
CHECK(spec->outputs[0].page_offset == inputs_size) << "transfer_output_from_device expected outputs to come after inputs";
void* src_ptr = src_io_memory + inputs_size;
CUDA_CALL(cudaSetDevice(gpu_id));
CUDA_CALL(
cudaMemcpyAsync(
......@@ -189,10 +205,12 @@ void Model::call(std::vector<char*> &weights_pages, char* &io_memory, char* &wor
for (unsigned i = 0; i < op_execs->size(); i++) {
call_op_exec((*op_execs)[i], pages);
if (i > MAX_OUTSTANDING_EXEC_EVENTS) {
CUDA_CALL(cudaEventSynchronize(rate_limit_events[i % MAX_OUTSTANDING_EXEC_EVENTS]));
if (rate_limit) {
if (i > MAX_OUTSTANDING_EXEC_EVENTS) {
CUDA_CALL(cudaEventSynchronize(rate_limit_events[i % MAX_OUTSTANDING_EXEC_EVENTS]));
}
CUDA_CALL(cudaEventRecord(rate_limit_events[i % MAX_OUTSTANDING_EXEC_EVENTS], stream));
}
CUDA_CALL(cudaEventRecord(rate_limit_events[i % MAX_OUTSTANDING_EXEC_EVENTS], stream));
}
}
......@@ -217,7 +235,7 @@ void Model::make_op_exec(PageMappedOpDef &spec, OpExec &op) {
tensor.strides = nullptr;
tensor.byte_offset = 0;
op.func_inputs[i].v_handle = &tensor;
op.func_tcodes[i] = kArrayHandle;
op.func_tcodes[i] = kTVMDLTensorHandle;
}
op.workspace_ptrs.resize(spec.workspace_allocs.size());
......
......@@ -9,8 +9,8 @@
#include <cuda_runtime.h>
#include "clockwork/util.h"
#define MAX_OUTSTANDING_EVENTS 4
#define MAX_OUTSTANDING_EXEC_EVENTS 4
#define MAX_OUTSTANDING_EVENTS 16
#define MAX_OUTSTANDING_EXEC_EVENTS 16
#define MAX_OUTSTANDING_MEMCPY_EVENTS 2
namespace clockwork{
......@@ -36,9 +36,10 @@ struct OpExec {
class Model {
public:
unsigned gpu_id;
bool rate_limit = true;
// Cool
const Memfile so_memfile;
Memfile so_memfile;
std::string serialized_spec;
int weights_size;
char* weights_pinned_host_memory; // alloced with cudaMallocHost
......@@ -61,7 +62,7 @@ private:
// Warm
model::PageMappedModelDef* spec = nullptr;
unsigned weights_pages_count;
size_t io_size, workspace_size;
size_t io_size, workspace_size, inputs_size, outputs_size;
std::vector<OpExec>* op_execs = nullptr;
so::TVMWarmSharedObject* warm_so = nullptr;
......
......@@ -17,7 +17,7 @@ void* SharedObject::GetSymbol(const char* symbolName) {
SharedObject::SharedObject(const std::string &name) : name(name) {
lib_handle_ = dlopen(name.c_str(), RTLD_LOCAL | RTLD_NOW);
CHECK(lib_handle_ != nullptr) << "Failed to load SO " << name << dlerror();
CHECK(lib_handle_ != nullptr) << "Failed to load SO " << name << ": " << dlerror();
}
SharedObject::~SharedObject() {
......