
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: cld/ml/clockwork
Commits on Source (10)
Showing 279 additions and 1571 deletions
......@@ -8,6 +8,7 @@ set(CC "gcc-8")
set(THREADS_PREFER_PTHREAD_FLAG ON)
set(CMAKE_BUILD_TYPE Release)
#set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
find_package(Threads REQUIRED)
find_package(Boost COMPONENTS system filesystem REQUIRED)
......@@ -89,6 +90,7 @@ target_link_libraries( clockwork PUBLIC
tvm_runtime
tbb
nvidia-ml
rt
${Boost_SYSTEM_LIBRARY}
${Boost_FILESYSTEM_LIBRARY}
)
......@@ -97,7 +99,6 @@ target_link_libraries( clockwork PUBLIC
# Converts from TVM models to clockwork models
include_directories(clockwork-convert)
add_executable (convert
src/clockwork-convert/tvm/decoupled_graph_runtime.cc
src/clockwork-convert/tvm_model.cpp
src/clockwork-convert/tvm_abstract_model.cpp
src/clockwork-convert/convert.cpp
......
......@@ -88,13 +88,25 @@ DefaultLimitNOFILE=65535
```
4. Restart
## 3. Increase mmap limits
Clockwork loads a large number of shared objects, so the mmap limit needs to be increased. As root, run:
```
/usr/sbin/sysctl -w vm.max_map_count=10000000
```
In general you can check mmap limits with:
```
sysctl vm.max_map_count
```
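If you also want a startup-time sanity check, a minimal sketch along these lines reads the limit from `/proc` (the `check_max_map_count` helper and the 10000000 threshold are illustrative, not part of Clockwork's existing environment checks):
```
// Sketch (illustrative, not part of Clockwork): read vm.max_map_count and
// warn if it is below the value recommended above.
#include <fstream>
#include <iostream>

bool check_max_map_count(long required = 10000000) {
    std::ifstream f("/proc/sys/vm/max_map_count");
    long current = 0;
    if (!(f >> current)) {
        std::cerr << "Could not read /proc/sys/vm/max_map_count" << std::endl;
        return false;
    }
    if (current < required) {
        std::cerr << "vm.max_map_count is " << current << "; run `sysctl -w vm.max_map_count="
                  << required << "` as root" << std::endl;
        return false;
    }
    return true;
}
```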
## 3. Disable CUDA JIT
None of the models we feed to Clockwork should contain CUDA PTX code (JIT'able code). To make sure, set the `CUDA_CACHE_DISABLE=1` environment variable.
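If you would rather enforce this from inside the process than rely on the shell environment, a hedged sketch is to set the variable before the first CUDA call (`setenv` is POSIX; the placement before any CUDA API use is the important part):
```
// Sketch (illustrative): disable the CUDA JIT cache from within the process.
// Must run before any CUDA runtime or driver API call.
#include <cstdlib>

int main() {
    setenv("CUDA_CACHE_DISABLE", "1", /*overwrite=*/1);
    // ... initialize CUDA and start Clockwork as usual ...
}
```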
## 4. Disable GPU frequency autoscaling
Enable persistence mode
Enable persistence mode. NOTE: This must be done on every restart
```
nvidia-smi -pm 1
```
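Clockwork's environment checks (see the `check_environment` change below) verify this via NVML; a standalone sketch of the same idea, assuming NVML headers and `-lnvidia-ml` are available, looks roughly like:
```
// Sketch (illustrative): query persistence mode for each GPU via NVML.
// Error handling is abbreviated; link with -lnvidia-ml.
#include <nvml.h>
#include <iostream>

int main() {
    nvmlInit();
    unsigned device_count = 0;
    nvmlDeviceGetCount(&device_count);
    for (unsigned i = 0; i < device_count; i++) {
        nvmlDevice_t device;
        nvmlDeviceGetHandleByIndex(i, &device);
        nvmlEnableState_t mode;
        nvmlDeviceGetPersistenceMode(device, &mode);
        std::cout << "GPU " << i << " persistence mode: "
                  << (mode == NVML_FEATURE_ENABLED ? "enabled" : "disabled") << std::endl;
    }
    nvmlShutdown();
}
```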
......@@ -140,4 +152,11 @@ Currently, the CMakeLists assumes CUDA lives in either `/usr/local/cuda/lib64` (
Undefined reference to tvm::runtime::ManagedCuda... -- this probably means you didn't build TVM properly. Make sure you haven't modified or deleted the file `build/config.cmake` in the TVM repository. `make clean` and `make` TVM again.
Unable to set number of open files with ulimit: default values are picked up from conf files, e.g. /etc/security/limits.conf, but they may be overwritten by files in a subdirectory, e.g. /etc/security/limits.d/*.conf
\ No newline at end of file
Unable to set number of open files with ulimit: default values are picked up from conf files, e.g. /etc/security/limits.conf, but they may be overwritten by files in a subdirectory, e.g. /etc/security/limits.d/mpi.conf
If you are loading lots of models, you might see the following:
* `src/clockwork/model/so.cpp:20: Check failed: lib_handle_ != nullptr: Failed to load SO /proc/26344/fd/14656/proc/26344/fd/14656: cannot apply additional memory protection after relocation: Cannot allocate memory`
* `src/clockwork/model/so.cpp:20: Check failed: lib_handle_ != nullptr: Failed to load SO /proc/12386/fd/11804/proc/12386/fd/11804: failed to map segment from shared object`
To solve this you need to increase your mmap limits, described above.
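When debugging these failures it can also help to see how many mappings the process already holds relative to the limit; a rough sketch (each line of `/proc/self/maps` is one mapping) is:
```
// Sketch (illustrative): count the current process's memory mappings and
// compare the number against vm.max_map_count.
#include <fstream>
#include <iostream>
#include <string>

int main() {
    std::ifstream maps("/proc/self/maps");
    std::string line;
    size_t count = 0;
    while (std::getline(maps, line)) count++;
    std::cout << "Current mappings: " << count << std::endl;
}
```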
......@@ -9,6 +9,9 @@ Clockwork must be built in order to run this script.
This script uses the `convert` binary and expects it to exist in the `build` folder
"""
convert_exec = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../build/convert")
parser = argparse.ArgumentParser(description='Convert a TVM model into a Clockwork model')
parser.add_argument("input_dir", metavar="INDIR", type=str, help="Base directory where TVM models exist. The utility expects multiple models, one per batch size, each in a subdirectory.")
parser.add_argument("output_dir", metavar="OUTDIR", type=str, help="Output directory. Directory will be created if it does not exist.")
......@@ -75,7 +78,7 @@ def convert(args):
print("The following command will run:")
pargs = [str(v) for v in [
"../build/convert",
convert_exec,
"-o", args.output_dir,
"-p", args.page_size
] + [x for m in models for x in m]]
......
......@@ -2,7 +2,7 @@
#include <sys/resource.h>
#include <catch2/catch.hpp>
#include <cuda_runtime.h>
#include "tvm/runtime/cuda_common.h"
#include "clockwork/cuda_common.h"
#include "clockwork/util.h"
#include "clockwork/test/util.h"
#include "clockwork/model/so.h"
......@@ -54,6 +54,12 @@ void check_environment() {
std::cout << " ✔ GPU " << i << " is in exclusive mode" << std::endl;
}
if (!util::is_persistence_mode_enabled_on_gpu(i)) { // TODO: check all GPUs
std::cout << " ✘ GPU " << i << " does not have persistent mode enabled; set with `nvidia-smi -i " << i << " -pm 1` or set for all GPUs with `nvidia-smi -pm 1`" << std::endl;
} else {
std::cout << " ✔ GPU " << i << " persistent mode is enabled" << std::endl;
}
}
......
#include <catch2/catch.hpp>
#include <cuda_runtime.h>
#include "tvm/runtime/cuda_common.h"
#include "clockwork/cuda_common.h"
#include "clockwork/util.h"
#include "clockwork/test/util.h"
#include "clockwork/model/so.h"
......
......@@ -5,7 +5,7 @@
#include <thread>
#include <catch2/catch.hpp>
#include <cuda_runtime.h>
#include "tvm/runtime/cuda_common.h"
#include "clockwork/cuda_common.h"
#include "clockwork/util.h"
#include "clockwork/test/util.h"
#include "clockwork/model/so.h"
......@@ -480,6 +480,25 @@ TEST_CASE("Warmup works", "[profile] [warmup]") {
}
}
TEST_CASE("Test max concurrent models", "[so_limits]") {
util::setCudaFlags();
util::initializeCudaStream();
std::string model_path = "/home/jcmace/clockwork-modelzoo-volta/others/cifar_resnet20_v1/model";
model::Model* model = load_model_from_disk(model_path);
std::vector<model::Model*> copies;
for (unsigned i = 0; i < 100000; i++) {
model::Model* copy = duplicate(model, false);
// model::Model* copy = load_model_from_disk(model_path);
copy->instantiate_model_on_host();
//copy->instantiate_model_on_device();
copies.push_back(copy);
std::cout << i << std::endl;
}
}
void runMultiClientExperiment(int num_execs, int models_per_exec, bool duplicate_weights, int iterations, bool with_module_load) {
util::setCudaFlags();
for (unsigned i = 0; i < 3; i++) {
......
......@@ -20,13 +20,15 @@ void show_usage() {
std::cout << " -p, --page_size" << std::endl;
std::cout << " Weights page size used by Clockwork. Defaults to 16MB. You shouldn't" << std::endl;
std::cout << " need to set this because we are using 16MB pages." << std::endl;
std::cout << " -i, --iterations" << std::endl;
std::cout << " Number of iterations to measure" << std::endl;
}
model::BatchedModel* load_model(std::string model) {
return model::BatchedModel::loadFromDisk(model, 0);
}
void check_model(int page_size, std::string model_path) {
void check_model(int page_size, int iterations, std::string model_path) {
std::cout << "Checking " << model_path << std::endl;
util::setCudaFlags();
......@@ -35,9 +37,6 @@ void check_model(int page_size, std::string model_path) {
clockwork::model::BatchedModel* model = load_model(model_path);
auto batch_sizes = model->implemented_batch_sizes();
for (unsigned batch_size : batch_sizes) {
std::cout << " loaded batch size " << batch_size << std::endl;
}
model->instantiate_models_on_host();
......@@ -47,9 +46,32 @@ void check_model(int page_size, std::string model_path) {
cudaError_t status;
model->instantiate_models_on_device();
std::shared_ptr<Allocation> weights = weights_cache->alloc(model->num_weights_pages(weights_page_size), []{});
model->transfer_weights_to_device(weights->page_pointers, util::Stream());
unsigned num_pages = model->num_weights_pages(weights_page_size);
std::shared_ptr<Allocation> weights = weights_cache->alloc(num_pages, []{});
int warmups = 10;
// Time the transfer
for (unsigned i = 0; i < warmups; i++) {
model->transfer_weights_to_device(weights->page_pointers, util::Stream());
}
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
auto before_transfer = util::now();
for (unsigned i = 0; i < iterations; i++) {
model->transfer_weights_to_device(weights->page_pointers, util::Stream());
}
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
auto after_transfer = util::now();
std::cout << " input_size: " << model->input_size(1) << std::endl;
std::cout << " output_size: " << model->output_size(1) << std::endl;
std::cout << " workspace: " << model->workspace_memory_size(1) << std::endl;
std::cout << " weights size paged (non-paged) [num_pages]: " << (weights_page_size * num_pages) << " (" << model->weights_size << ") [" << num_pages << "]" << std::endl;
printf(" weights transfer latency: %.2f ms\n", ((float) (after_transfer-before_transfer)) / (iterations * 1000000.0));
std::cout << " execution latency:" << std::endl;
for (unsigned batch_size : batch_sizes) {
// Create inputs and outputs
......@@ -70,21 +92,19 @@ void check_model(int page_size, std::string model_path) {
model->transfer_input_to_device(batch_size, input, io_memory, util::Stream());
// Time the call
int warmups = 20;
for (int i = 0; i < warmups; i++) {
model->call(batch_size, weights->page_pointers, io_memory, workspace_memory, util::Stream());
}
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
auto before = util::now();
int iterations = 100;
for (int i = 0; i < iterations; i++) {
model->call(batch_size, weights->page_pointers, io_memory, workspace_memory, util::Stream());
}
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
auto after = util::now();
printf(" b%d: %.2f ms per call\n", batch_size, ((float) (after-before)) / (iterations * 1000000.0));
printf(" b%d: %.2f ms\n", batch_size, ((float) (after-before)) / (iterations * 1000000.0));
model->transfer_output_from_device(batch_size, output, io_memory, util::Stream());
......@@ -114,6 +134,7 @@ void check_model(int page_size, std::string model_path) {
int main(int argc, char *argv[]) {
std::vector<std::string> non_argument_strings;
int iterations = 100;
int page_size = 16 * 1024 * 1024;
for (int i = 1; i < argc; ++i) {
std::string arg = argv[i];
......@@ -122,6 +143,8 @@ int main(int argc, char *argv[]) {
return 0;
} else if ((arg == "-p") || (arg == "--page_size")) {
page_size = atoi(argv[++i]);
} else if ((arg == "-i") || (arg == "--iterations")) {
iterations = atoi(argv[++i]);
} else {
non_argument_strings.push_back(arg);
}
......@@ -134,5 +157,5 @@ int main(int argc, char *argv[]) {
std::string model_path = non_argument_strings[0];
check_model(page_size, model_path);
check_model(page_size, iterations, model_path);
}
......@@ -2,6 +2,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include "clockwork/tvm/pack_args.h"
#include "dmlc/memory_io.h"
#include "clockwork/util.h"
#include <iostream>
......
/*!
* Copyright (c) 2017 by Contributors
*
* \brief Tiny graph runtime that can run graph
* containing only tvm PackedFunc.
* \file graph_runtime.h
*/
#ifndef CLOCKWORK_TVM_DECOUPLED_RUNTIME_GRAPH_GRAPH_RUNTIME_H_
#define CLOCKWORK_TVM_DECOUPLED_RUNTIME_GRAPH_GRAPH_RUNTIME_H_
//#include "graph_runtime.h"
#include <dlpack/dlpack.h>
#include <dmlc/memory_io.h>
#include <dmlc/json.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/managed_cuda_device_api.h>
#include "clockwork/modeldef.h"
#include <vector>
#include <string>
namespace tvm {
namespace runtime {
/*! \brief macro to do C API call */
#define TVM_CCALL(func) \
{ \
int ret = (func); \
CHECK_EQ(ret, 0) \
<< TVMGetLastError(); \
}
/*! \brief Magic number for NDArray list file */
constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7;
/*! \brief operator attributes about tvm op */
struct TVMOpParam {
std::string func_name;
uint32_t num_inputs;
uint32_t num_outputs;
uint32_t flatten_data;
};
/*!
* \brief Tiny graph runtime with interfaces defined to isolate resource usage.
*
* This runtime can be accessed in various languages via the
* TVM runtime PackedFunc API.
*/
class DecoupledGraphRuntime : public ModuleNode {
public:
// GraphRuntime() : ModuleNode() {
// std::cout << "Creating GraphRuntime" << std::endl;
// }
virtual ~DecoupledGraphRuntime() {
if (tempParams_ != nullptr) {
for (size_t i = 0; i < this->paramsSize_; ++i) {
delete this->tempParams_[i];
}
delete this->copyParamsToEIDs_;
delete this->tempParams_;
} else {
delete contiguous_input_memory.tempParamsArray;
}
}
/*!
* \brief Get member function to front-end
* \param name The name of the function.
* \param sptr_to_self The pointer to the module node.
* \return The corresponding member function.
*/
virtual PackedFunc GetFunction(const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self);
/*!
* \return The type key of the executor.
*/
const char* type_key() const final {
return "DecoupledGraphRuntime";
}
void Run();
/*!
* \brief Initialize the graph executor with graph and context.
* \param graph_json The execution graph.
* \param module The module containing the compiled functions for the host
* processor.
* \param ctxs The context of the host and devices where graph nodes will be
* executed on.
*/
void Init(const std::string& graph_json,
tvm::runtime::Module module,
const std::vector<TVMContext>& ctxs,
bool contiguous);
/*!
* \brief Get the input index given the name of input.
* \param name The name of the input.
* \return The index of input.
*/
int GetInputIndex(const std::string& name);
/*!
* \brief set index-th input to the graph.
* \param index The input index.
* \param data_in The input data.
*/
void SetInput(int index, DLTensor* data_in);
/*!
* \brief Get the number of outputs
*
* \return The number of outputs from graph.
*/
int NumOutputs() const;
/*!
* \brief Return NDArray for given input index.
* \param index The input index.
*
* \return NDArray corresponding to given input node index.
*/
NDArray GetInput(int index) const;
/*!
* \brief Return NDArray for given output index.
* \param index The output index.
*
* \return NDArray corresponding to given output node index.
*/
NDArray GetOutput(int index) const;
/*!
* \brief Copy index-th output to data_out.
* \param index The output index.
* \param data_out the output data.
*/
void CopyOutputTo(int index, DLTensor* data_out);
/*!
* \brief Load parameters from parameter blob to cpu memory
* \param strm The input stream.
*/
void LoadParamsContiguously(const std::string& param_blob);
/*!
* \brief Load parameters from binary stream to cpu memory
* \param strm The input stream.
*/
void LoadParams(dmlc::Stream* strm);
/*!
* \brief Load parameters from parameter blob to cpu memory.
* \param param_blob A binary blob of parameter.
*/
void LoadParams(const std::string& param_blob);
/*!
* \brief Do necessary transfers to device.
* \param param_blob A binary blob of parameter.
*/
void* LoadToDevice();
/*!
* \brief Get the tensor vector pointer.
*/
std::vector<NDArray>& data_entry() {
return data_entry_;
}
/*!
* \brief Get the execution function pointer.
*/
std::vector<std::function<void()> >& op_execs() {
return op_execs_;
}
/*!
* \brief Get node entry index.
* \param nid Node id.
* \param index Index of the nodes.
*/
uint32_t GetEntryId(uint32_t nid, uint32_t index) const {
return node_row_ptr_[nid] + index;
}
/*!
* \brief Get total number of nodes.
* \return Total number of nodes.
*/
uint32_t GetNumOfNodes() const {
return static_cast<uint32_t>(nodes_.size());
}
std::string GetNodeName(uint32_t nid) const {
return nodes_[nid].name;
}
private:
// Memory pool entry.
struct PoolEntry {
size_t size;
int device_type;
int storage_id;
PoolEntry(int s, int dev_type) : size(s), device_type(dev_type), storage_id(0) {}
PoolEntry(int s, int dev_type, int storage_id) : size(s), device_type(dev_type), storage_id(storage_id) {}
};
// Node entry
struct NodeEntry {
uint32_t node_id;
uint32_t index;
uint32_t version;
// JSON Loader
void Load(dmlc::JSONReader *reader) {
reader->BeginArray();
CHECK(reader->NextArrayItem()) << "invalid json format";
reader->Read(&node_id);
CHECK(reader->NextArrayItem()) << "invalid json format";
reader->Read(&index);
if (reader->NextArrayItem()) {
reader->Read(&version);
CHECK(!reader->NextArrayItem()) << "invalid json format";
} else {
version = 0;
}
}
};
// Node
struct Node {
// operator type in string
std::string op_type;
// name of the op
std::string name;
// parameters
TVMOpParam param;
// inputs
std::vector<NodeEntry> inputs;
// control deps
std::vector<uint32_t> control_deps;
// JSON Loader
void LoadAttrs(dmlc::JSONReader *reader, TVMOpParam* param) {
int bitmask = 0;
std::string key, value;
reader->BeginObject();
while (reader->NextObjectItem(&key)) {
reader->Read(&value);
if (key == "func_name") {
param->func_name = value;
bitmask |= 1;
} else if (key == "num_inputs") {
param->num_inputs = strtoul(value.c_str(), nullptr, 10);
bitmask |= 2;
} else if (key == "num_outputs") {
param->num_outputs = strtoul(value.c_str(), nullptr, 10);
bitmask |= 4;
} else if (key == "flatten_data") {
param->flatten_data = strtoul(value.c_str(), nullptr, 10);
bitmask |= 8;
}
}
CHECK_EQ(bitmask, 1|2|4|8) << "invalid format";
}
// JSON Loader
void Load(dmlc::JSONReader *reader) {
reader->BeginObject();
int bitmask = 0;
std::string key;
while (reader->NextObjectItem(&key)) {
if (key == "op") {
reader->Read(&op_type);
bitmask |= 1;
} else if (key == "name") {
reader->Read(&name);
bitmask |= 2;
} else if (key == "inputs") {
reader->Read(&inputs);
bitmask |= 4;
} else if (key == "attr" || key == "attrs") {
this->LoadAttrs(reader, &param);
} else if (key == "control_deps") {
reader->Read(&control_deps);
} else {
LOG(FATAL) << "do not support key " << key;
}
}
CHECK_EQ(bitmask, 1|2|4) << "invalid format";
}
};
struct GraphAttr {
size_t storage_num_not_alloctaed{0};
std::vector<int> storage_id;
std::vector<int> device_index;
std::vector<std::string> dltype;
std::vector<std::vector<int64_t> > shape;
// The graph attribute fields.
void Load(dmlc::JSONReader *reader) {
reader->BeginObject();
int bitmask = 0;
std::string key, type;
while (reader->NextObjectItem(&key)) {
if (key == "dltype") {
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_str");
CHECK(reader->NextArrayItem());
reader->Read(&dltype);
CHECK(!reader->NextArrayItem());
bitmask |= 1;
} else if (key == "storage_id") {
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_int");
CHECK(reader->NextArrayItem());
reader->Read(&storage_id);
CHECK(!reader->NextArrayItem());
bitmask |= 2;
} else if (key == "shape") {
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_shape");
CHECK(reader->NextArrayItem());
reader->Read(&shape);
CHECK(!reader->NextArrayItem());
bitmask |= 4;
} else if (key == "device_index") {
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_int");
CHECK(reader->NextArrayItem());
reader->Read(&device_index);
CHECK(!reader->NextArrayItem());
} else {
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
if (type == "list_int") {
CHECK(reader->NextArrayItem());
std::vector<int> temp;
reader->Read(&temp);
} else if (type == "size_t") {
CHECK(reader->NextArrayItem());
size_t temp;
reader->Read(&temp);
} else {
LOG(FATAL) << "cannot skip graph attr " << key;
}
CHECK(!reader->NextArrayItem());
}
}
CHECK_EQ(bitmask, 1|2|4) << "invalid format";
}
};
// The graph attribute fields.
void Load(dmlc::JSONReader *reader) {
reader->BeginObject();
int bitmask = 0;
std::string key;
while (reader->NextObjectItem(&key)) {
if (key == "nodes") {
reader->Read(&nodes_);
bitmask |= 1;
} else if (key == "arg_nodes") {
reader->Read(&input_nodes_);
bitmask |= 2;
} else if (key == "node_row_ptr") {
reader->Read(&node_row_ptr_);
bitmask |= 4;
} else if (key == "heads") {
reader->Read(&outputs_);
bitmask |= 8;
} else if (key == "attrs") {
reader->Read(&attrs_);
bitmask |= 16;
} else {
LOG(FATAL) << "key " << key << " is not supported";
}
}
CHECK_EQ(bitmask, 1|2|4|8|16) << "invalid format";
}
/*! \brief Creates description of the temporal memory */
void SetupStorage();
void SetupStorageContiguous();
/*! \brief Allocates memory needed as described in
pool_entries_ and contiguous_memory_input */
void AllocateStorageSpace();
/*!
* \brief Load previously prepped parameters to the device.
* \param param_blob A binary blob of parameter.
*/
void UploadParams();
std::string SaveParams();
/*! \brief Setup the executors. */
void SetupOpExecs();
NDArray GetConstParams();
void SetConstParams(NDArray params);
clockwork::model::ModelDef* ExtractModelSpec();
std::vector<std::vector<WorkspaceAlloc>>* ExtractWorkspaceAllocs();
/*!
* \brief Create an execution function given input.
* \param attrs The node attributes.
* \param args The arguments to the functor, including inputs and outputs.
* \param num_inputs Number of inputs.
* \param dev_type The device type of the tvm_op.
* \return The created executor.
*/
std::function<void()> CreateTVMOp(const TVMOpParam& attrs,
const std::vector<DLTensor>& args,
size_t num_inputs);
// Get node entry index.
uint32_t entry_id(uint32_t nid, uint32_t index) const {
return node_row_ptr_[nid] + index;
}
// Get node entry index.
uint32_t entry_id(const NodeEntry& e) const {
return entry_id(e.node_id, e.index);
}
// Number of node entries.
uint32_t num_node_entries() const {
return node_row_ptr_.back();
}
/*! \brief The graph nodes. */
std::vector<Node> nodes_;
/*! \brief The argument nodes. */
std::vector<uint32_t> input_nodes_;
/*! \brief Used for quick entry indexing. */
std::vector<uint32_t> node_row_ptr_;
/*! \brief Output entries. */
std::vector<NodeEntry> outputs_;
/*! \brief Additional graph attributes. */
GraphAttr attrs_;
/*! \brief The code module that contains both host and device code. */
tvm::runtime::Module module_;
/*! \brief Execution context of all devices including the host. */
std::vector<TVMContext> ctxs_;
/*! \brief Common storage pool for all devices. */
std::vector<NDArray> storage_pool_;
/*! \brief Data entry of each node. */
std::vector<NDArray> data_entry_;
/*! \brief Operator on each node. */
std::vector<std::function<void()> > op_execs_;
/*! \brief Parameters loaded in cpu memory. */
NDArray** tempParams_ = nullptr;
/*! \brief Place to copy params on device memory. */
uint32_t* copyParamsToEIDs_ = nullptr;
/*! \brief Number of parameters loaded in cpu memory. */
size_t paramsSize_ = 0;
typedef struct contiguous_memory_allocation {
contiguous_memory_allocation() : size(0), tempParamsArray(nullptr) {}
uint64_t size;
uint64_t paramsSize; // Size inside this memory allocation of the const params
std::vector<uint32_t> storage_ids;
std::vector<uint32_t> offsets;
NDArray backing_array;
NDArray backing_array_params_view;
NDArray* tempParamsArray;
TVMContext ctx;
} contiguous_memory_allocation;
/*! \brief Allocate input memory as a single contiguous region to enable fast copy. */
contiguous_memory_allocation contiguous_input_memory;
// Size and device type of each storage pool entry.
std::vector<PoolEntry> pool_entries_;
};
Module DecoupledGraphRuntimeCreate(const std::string& sym_json,
const tvm::runtime::Module& m, int device_type, int device_id);
std::shared_ptr<DecoupledGraphRuntime> DecoupledGraphRuntimeCreateDirect(const std::string& sym_json,
const tvm::runtime::Module& m, int device_type, int device_id);
// std::vector<TVMContext> GetAllContext(const TVMArgs& args);
} // namespace runtime
} // namespace tvm
#endif  // CLOCKWORK_TVM_DECOUPLED_RUNTIME_GRAPH_GRAPH_RUNTIME_H_
......@@ -6,6 +6,7 @@
#include <cstring>
#include "clockwork-convert/tvm_model.h"
#include <dmlc/logging.h>
#include <tvm/runtime/packed_func.h>
#include <dlpack/dlpack.h>
......@@ -50,7 +51,7 @@ Model Model::fromTVM(tvm_model::Model &model, tvm_model::Params &params, tvm_mod
for (unsigned i = 0; i < model.attrs_.storage_id.size(); i++) {
Tensor* tensor = new Tensor();
tensor->id = i;
tensor->dltype = tvm::runtime::String2TVMType(model.attrs_.dltype[i]);
tensor->dltype = tvm::runtime::String2DLDataType(model.attrs_.dltype[i]);
tensor->shape = model.attrs_.shape[i];
tensor->storage = storage_locations[model.attrs_.storage_id[i]];
tensors.push_back(tensor);
......@@ -63,12 +64,19 @@ Model Model::fromTVM(tvm_model::Model &model, tvm_model::Params &params, tvm_mod
weights->data = p.second->dataptr();
weights->size = p.second->Size();
weights->tensor = nullptr;
CHECK(out.weights.find(p.first) == out.weights.end()) << "Found duplicate layers with name " << p.first;
out.weights[p.first] = weights;
}
std::vector<std::string> seen_names;
for (unsigned i = 0; i < model.nodes_.size(); i++) {
tvm_model::Node &node = model.nodes_[i];
CHECK(std::find(seen_names.begin(), seen_names.end(), node.name) == seen_names.end()) << "Found duplicate node " << node.name;
seen_names.push_back(node.name);
if (node.op_type == "null") {
int input_index = model.node_row_ptr_[i];
int input_offset = 0;
......@@ -202,7 +210,8 @@ void make_weights_lookup_table(Model &model, PageMappedStorage* mapped) {
// Store the actual weights data in the storage lookup table.
auto data = std::string(static_cast<char*>(weights->data), weights->size);
mapped->weights_lookup[data] = std::make_pair(page_number, index_in_page);
mapped->weights_lookup[data].push_back(PreMappedIndices{page_number, index_in_page});
index_in_page++;
}
......@@ -212,6 +221,8 @@ void make_weights_lookup_table(Model &model, PageMappedStorage* mapped) {
}
std::vector<Page*> replicate_weights_mapping(Model &model, PageMappedStorage* existing_weights_mapping) {
std::unordered_map<std::string, unsigned> current_index;
std::vector<Page*> pages;
for (unsigned i = 0; i < existing_weights_mapping->weights.size(); i++) {
Page* page = new Page();
......@@ -224,11 +235,12 @@ std::vector<Page*> replicate_weights_mapping(Model &model, PageMappedStorage* ex
for (auto &p : model.weights) {
LayerWeights* weights = p.second;
auto data = std::string(static_cast<char*>(weights->data), weights->size);
if (existing_weights_mapping->weights_lookup.find(data) == existing_weights_mapping->weights_lookup.end()) {
throw "Error: " + weights->name + " not found in existing weights mapping";
}
auto assignment = existing_weights_mapping->weights_lookup[data];
pages[assignment.first]->used_by[assignment.second] = weights->tensor->storage;
CHECK(existing_weights_mapping->weights_lookup.find(data) != existing_weights_mapping->weights_lookup.end())
<< "Error: " + weights->name + " not found in existing weights mapping";
unsigned position = current_index[data]++;
PreMappedIndices indices = existing_weights_mapping->weights_lookup[data][position];
pages[indices.page_index]->used_by[indices.location_index] = weights->tensor->storage;
}
for (unsigned i = 0; i < pages.size(); i++) {
......@@ -257,8 +269,12 @@ PageMappedStorage* PageMappedStorage::calculate(Model &model, size_t weights_pag
CHECK(p.second->tensor != nullptr);
CHECK(p.second->tensor->storage != nullptr);
weights_storage.push_back(p.second->tensor->storage);
seen.push_back(p.second->tensor->storage);
StorageLocation* l = p.second->tensor->storage;
CHECK(std::find(seen.begin(), seen.end(), l) == seen.end()) << "Found duplicate storage location";
weights_storage.push_back(l);
seen.push_back(l);
}
// Pull out storage locations that are model inputs
......@@ -309,6 +325,12 @@ PageMappedStorage* PageMappedStorage::calculate(Model &model, size_t weights_pag
// Pack the weights onto pages, or use existing mapping if provided
if (existing_weights_mapping == nullptr) {
mapped->weights = pack(weights_storage, weights_page_size);
int count = 0;
for (Page* page : mapped->weights) {
count += page->used_by.size();
}
make_weights_lookup_table(model, mapped);
} else {
mapped->weights = replicate_weights_mapping(model, existing_weights_mapping);
......@@ -491,6 +513,9 @@ void makeModelDef(Model &model, size_t weights_page_size, clockwork::model::Page
tensordef.page_offset = pageptr.page_offset;
tensordef.size = tensor->Size();
tensordef.shape = tensor->shape;
tensordef.code = tensor->dltype.code;
tensordef.bits = tensor->dltype.bits;
tensordef.lanes = tensor->dltype.lanes;
opdef.inputs.push_back(tensordef);
}
......
......@@ -90,6 +90,12 @@ public:
size_t Size();
};
class PreMappedIndices {
public:
unsigned page_index;
unsigned location_index;
};
class PageMappedStorage {
public:
std::vector<Page*> weights;
......@@ -97,7 +103,7 @@ public:
std::vector<StorageLocation*> workspace_storage;
size_t transient_memory;
std::unordered_map<std::string, std::pair<unsigned, unsigned>> weights_lookup;
std::unordered_map<std::string, std::vector<PreMappedIndices>> weights_lookup;
static PageMappedStorage* calculate(Model &model, size_t weights_page_size, PageMappedStorage* existing_weights_mapping = nullptr);
};
......
#include "clockwork-convert/tvm_model.h"
#include <tvm/runtime/device_api.h>
using namespace tvm_model;
......@@ -177,6 +178,7 @@ void Params::Load(dmlc::Stream* strm) {
CHECK(size == names.size()) << "Invalid parameters file format";
for (std::string name : names) {
CHECK(data.find(name) == data.end()) << "Duplicate params for " << name;
data[name] = new tvm::runtime::NDArray();
data[name]->Load(strm);
}
......@@ -198,7 +200,7 @@ Allocs Allocs::ProfileModel(std::string model_so, std::string model_json, std::s
const int device_type = kDLGPU;
const int device_id = 0;
const tvm::runtime::PackedFunc load_module(*tvm::runtime::Registry::Get("module.loadfile_so"));
const tvm::runtime::PackedFunc load_module(*tvm::runtime::Registry::Get("runtime.module.loadfile_so"));
tvm::runtime::Module mod_syslib = load_module(model_so, "so");
// Graph structure
......@@ -207,8 +209,8 @@ Allocs Allocs::ProfileModel(std::string model_so, std::string model_json, std::s
json_in.close();
// Construct TVM runtime
std::shared_ptr<tvm::runtime::DecoupledGraphRuntime> rt = DecoupledGraphRuntimeCreateDirect(json_data, mod_syslib, device_type, device_id);
tvm::runtime::Module mod = tvm::runtime::Module(rt);
const tvm::runtime::PackedFunc create_graph_runtime(*tvm::runtime::Registry::Get("tvm.graph_runtime.create"));
tvm::runtime::Module mod = create_graph_runtime(json_data, mod_syslib, device_type, device_id);
// const tvm::runtime::PackedFunc create_graph_runtime(*tvm::runtime::Registry::Get("tvm.decoupled_graph_runtime.create_contiguous"));
// tvm::runtime::Module mod = create_graph_runtime(json_data, mod_syslib, device_type, device_id);
......@@ -231,12 +233,8 @@ Allocs Allocs::ProfileModel(std::string model_so, std::string model_json, std::s
// tvm::runtime::PackedFunc set_const_params = mod.GetFunction("set_const_params");
// tvm::runtime::NDArray const_params = get_const_params();
// load the model onto device
tvm::runtime::PackedFunc load_to_device = mod.GetFunction("load_to_device");
load_to_device();
tvm::runtime::PackedFunc extract_allocs = mod.GetFunction("extract_workspace_allocs");
tvm::runtime::PackedFunc extract_allocs = mod.GetFunction("profile_workspace_allocs");
std::vector<std::vector<tvm::runtime::WorkspaceAlloc>>* alloc_vector = static_cast<std::vector<std::vector<tvm::runtime::WorkspaceAlloc>>*>((void*) extract_allocs());
Allocs allocs;
......
......@@ -16,7 +16,6 @@
#include <tvm/runtime/module.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/packed_func.h>
#include "clockwork-convert/tvm/decoupled_graph_runtime.h"
namespace tvm_model {
......
......@@ -2,6 +2,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include "clockwork/tvm/pack_args.h"
#include "dmlc/memory_io.h"
#include "clockwork/cuda_common.h"
#include "clockwork/util.h"
......
......@@ -3,14 +3,30 @@
#include <sys/syscall.h>
#include "dmlc/logging.h"
#include "clockwork/model/memfile.h"
#include <iostream>
#include <sys/mman.h>
#include <fcntl.h>
#include <sstream>
#include <cstdio>
namespace clockwork {
int make_shmemfd(std::string &name) {
int fd = shm_open(name.c_str(), O_RDWR | O_CREAT, S_IRWXU);
if (fd < 0) {
std::cout << fd << std::endl;
perror("shm_open");
CHECK(fd < 0) << "Could not create memfd using shm_open";
}
return fd;
}
int make_memfd() {
int fd = syscall(__NR_memfd_create, std::string("").c_str(), 0);
if (fd < 0) {
std::cout << fd << std::endl;
perror("memfd_create");
CHECK(fd < 0) << "ModelCodeReader could not create memfile";
CHECK(fd < 0) << "Could not create memfd using memfd_create";
}
return fd;
}
......@@ -21,7 +37,7 @@ std::string memfd_filename(int memfd) {
return ss.str();
}
Memfile Memfile::readFrom(const std::string &filename) {
MemfdFile* MemfdFile::readFrom(const std::string &filename) {
int memfd = make_memfd();
std::string memfilename = memfd_filename(memfd);
std::ofstream dst(memfilename, std::ios::binary);
......@@ -34,18 +50,46 @@ Memfile Memfile::readFrom(const std::string &filename) {
src.close();
dst.close();
return Memfile(memfd, memfilename);
return new MemfdFile(memfd, memfilename);
}
MemfileReader::MemfileReader(const Memfile &memfile) :
memfile(memfile), dst(memfile.filename, std::ios::binary) {
int MemfdFile::close() {
std::cout << "Closing " << memfd << std::endl;
return ::close(memfd);
}
unsigned shmem_counter = 0;
ShmemFile* ShmemFile::readFrom(const std::string &filename) {
// Filename of the shmem file
std::stringstream name;
name << "/clockwork-" << getpid() << "-" << shmem_counter++;
std::string shmem_name = name.str();
int shmem_fd = make_shmemfd(shmem_name);
// Path to the shmem file
std::string shmem_path = "/dev/shm" + shmem_name;
// Remove existing file
std::remove(shmem_path.c_str());
std::ofstream dst(shmem_path, std::ios::binary);
CHECK(dst.good()) << "Bad memfile " << shmem_path;
std::ifstream src(filename, std::ios::binary);
CHECK(src.good()) << "Unable to open file " << filename;
dst << src.rdbuf();
src.close();
dst.close();
return new ShmemFile(shmem_fd, shmem_path, shmem_name);
}
unsigned MemfileReader::readsome(std::istream stream, unsigned amount) {
char buf[amount];
int amountRead = stream.readsome(buf, amount);
dst.write(buf, amountRead);
return amountRead;
int ShmemFile::close() {
::close(memfd);
int status = shm_unlink(name.c_str());
return status;
}
}
\ No newline at end of file
......@@ -9,30 +9,66 @@
namespace clockwork {
/** An in-memory file */
class Memfile {
class MemfileImpl {
public:
const int memfd;
const std::string filename;
Memfile(const int &memfd, const std::string &filename) :
MemfileImpl(const int &memfd, const std::string &filename) :
memfd(memfd), filename(filename) {}
// Copy another file into a memfile
static Memfile readFrom(const std::string &filename);
virtual int close() = 0;
};
/** An in-memory file */
class MemfdFile : public MemfileImpl {
public:
MemfdFile(const int &memfd, const std::string &filename) :
MemfileImpl(memfd, filename) {}
// Copy another file into a MemfdFile
static MemfdFile* readFrom(const std::string &filename);
virtual int close();
};
/** Used for reading data from a stream into a memfile */
class MemfileReader {
class ShmemFile : public MemfileImpl {
public:
const Memfile &memfile;
MemfileReader(const Memfile &memfile);
const std::string name;
ShmemFile(const int &fd, const std::string &filename, const std::string &name) :
MemfileImpl(fd, filename), name(name) {}
// Copy another file into a ShmemFile
static ShmemFile* readFrom(const std::string &filename);
virtual int close();
};
class Memfile {
private:
std::ofstream dst;
MemfileImpl* impl;
public:
const int memfd;
const std::string filename;
Memfile(MemfileImpl* impl) : impl(impl), memfd(impl->memfd), filename(impl->filename) {}
unsigned readsome(std::istream stream, unsigned amount);
// Copy another file into the default memfile impl
static Memfile readFrom(const std::string &filename) {
return Memfile(ShmemFile::readFrom(filename));
}
int close() {
if (impl != nullptr) {
return impl->close();
} else {
return -1;
}
}
};
}
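With this refactor, `Memfile` keeps its old `readFrom`/`filename` interface but delegates to a `MemfileImpl` (a `ShmemFile` under `/dev/shm` by default), and callers are expected to `close()` it once the file is no longer needed. A hedged usage sketch (the `model.so` path is illustrative):
```
// Sketch (illustrative): copy a shared object into an in-memory file,
// use its path, then close() to release/unlink the backing /dev/shm entry.
#include "clockwork/model/memfile.h"
#include <iostream>

int main() {
    clockwork::Memfile memfile = clockwork::Memfile::readFrom("model.so"); // illustrative path
    std::cout << "In-memory copy at " << memfile.filename << std::endl;
    // ... e.g. dlopen(memfile.filename.c_str(), ...) ...
    memfile.close();
}
```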
......
......@@ -2,6 +2,7 @@
#include "clockwork/cuda_common.h"
#include "clockwork/util.h"
#include "clockwork/model/model.h"
#include <unistd.h>
using namespace clockwork::model;
......@@ -42,12 +43,25 @@ void Model::instantiate_model_on_host() {
weights_pages_count = spec->weights_pages.size();
io_size = spec->io_memory;
workspace_size = spec->workspace_memory;
inputs_size = 0;
for (auto &input : spec->inputs) {
inputs_size += input.size;
}
outputs_size = 0;
for (auto &output : spec->outputs) {
outputs_size += output.size;
}
// 3: setup model executor
op_execs = new std::vector<OpExec>(spec->ops.size());
for (unsigned i = 0; i < spec->ops.size(); i++) {
make_op_exec(spec->ops[i], (*op_execs)[i]);
}
// Close original so_memfile
so_memfile.close();
}
void Model::uninstantiate_model_on_host() {
......@@ -111,17 +125,18 @@ void Model::transfer_weights_to_device(std::vector<char*> &weights_pages, cudaSt
stream
)
)
if (i > MAX_OUTSTANDING_MEMCPY_EVENTS) {
CUDA_CALL(cudaEventSynchronize(rate_limit_events[i % MAX_OUTSTANDING_MEMCPY_EVENTS]));
if (rate_limit) {
if (i > MAX_OUTSTANDING_MEMCPY_EVENTS) {
CUDA_CALL(cudaEventSynchronize(rate_limit_events[i % MAX_OUTSTANDING_MEMCPY_EVENTS]));
}
CUDA_CALL(cudaEventRecord(rate_limit_events[i % MAX_OUTSTANDING_MEMCPY_EVENTS], stream));
}
CUDA_CALL(cudaEventRecord(rate_limit_events[i % MAX_OUTSTANDING_MEMCPY_EVENTS], stream));
}
}
size_t Model::input_size() {
/** TODO: for now, a model only has one input */
CHECK(spec != nullptr) << "input_size spec is nullptr";
return spec->inputs[0].size;
return inputs_size;
}
/* Preconditions: instantiate_model_on_host && set_workspace_pages */
......@@ -131,9 +146,10 @@ void Model::transfer_input_to_device(const char* input_ptr, char* &dst_io_memory
void Model::transfer_input_to_device(size_t input_size, const char* input_ptr, char* &dst_io_memory, cudaStream_t stream) {
CHECK(spec != nullptr) << "transfer_input_to_device spec is nullptr";
CHECK(input_size <= spec->inputs[0].size) << "transfer_input_to_device tried to transfer more bytes than allowed";
CHECK(input_size <= inputs_size) << "transfer_input_to_device tried to transfer more bytes than allowed";
CHECK(spec->inputs[0].page == weights_pages_count) << "transfer_input_to_device expected input on page " << weights_pages_count;
void* dst_ptr = dst_io_memory + spec->inputs[0].page_offset;
CHECK(spec->inputs[0].page_offset == 0) << "transfer_input_to_device expected inputs to start at offset 0 on io_memory but found " << spec->inputs[0].page_offset;
void* dst_ptr = dst_io_memory;
CUDA_CALL(cudaSetDevice(gpu_id));
CUDA_CALL(
cudaMemcpyAsync(
......@@ -148,9 +164,8 @@ void Model::transfer_input_to_device(size_t input_size, const char* input_ptr, c
/* Preconditions: instantiate_model_on_host */
size_t Model::output_size() {
/** TODO: for now, a model only has one output */
CHECK(spec != nullptr) << "output_size spec is nullptr";
return spec->outputs[0].size;
return outputs_size;
}
/* Preconditions: instantiate_model_on_host */
......@@ -160,9 +175,10 @@ void Model::transfer_output_from_device(char* output_ptr, char* &src_io_memory,
void Model::transfer_output_from_device(size_t output_size, char* output_ptr, char* &src_io_memory, cudaStream_t stream) {
CHECK(spec != nullptr) << "transfer_output_from_device spec is nullptr";
CHECK(output_size <= spec->outputs[0].size) << "transfer_output_from_device tried to transfer more bytes than allowed";
CHECK(spec->inputs[0].page == weights_pages_count) << "transfer_output_from_device expected output on page " << weights_pages_count;
void* src_ptr = src_io_memory + spec->outputs[0].page_offset;
CHECK(output_size <= outputs_size) << "transfer_output_from_device tried to transfer more bytes than allowed";
CHECK(spec->outputs[0].page == weights_pages_count) << "transfer_output_from_device expected output on page " << weights_pages_count;
CHECK(spec->outputs[0].page_offset == inputs_size) << "transfer_output_from_device expected outputs to come after inputs";
void* src_ptr = src_io_memory + inputs_size;
CUDA_CALL(cudaSetDevice(gpu_id));
CUDA_CALL(
cudaMemcpyAsync(
......@@ -189,10 +205,12 @@ void Model::call(std::vector<char*> &weights_pages, char* &io_memory, char* &wor
for (unsigned i = 0; i < op_execs->size(); i++) {
call_op_exec((*op_execs)[i], pages);
if (i > MAX_OUTSTANDING_EXEC_EVENTS) {
CUDA_CALL(cudaEventSynchronize(rate_limit_events[i % MAX_OUTSTANDING_EXEC_EVENTS]));
if (rate_limit) {
if (i > MAX_OUTSTANDING_EXEC_EVENTS) {
CUDA_CALL(cudaEventSynchronize(rate_limit_events[i % MAX_OUTSTANDING_EXEC_EVENTS]));
}
CUDA_CALL(cudaEventRecord(rate_limit_events[i % MAX_OUTSTANDING_EXEC_EVENTS], stream));
}
CUDA_CALL(cudaEventRecord(rate_limit_events[i % MAX_OUTSTANDING_EXEC_EVENTS], stream));
}
}
......@@ -217,7 +235,7 @@ void Model::make_op_exec(PageMappedOpDef &spec, OpExec &op) {
tensor.strides = nullptr;
tensor.byte_offset = 0;
op.func_inputs[i].v_handle = &tensor;
op.func_tcodes[i] = kArrayHandle;
op.func_tcodes[i] = kTVMDLTensorHandle;
}
op.workspace_ptrs.resize(spec.workspace_allocs.size());
......
......@@ -9,8 +9,8 @@
#include <cuda_runtime.h>
#include "clockwork/util.h"
#define MAX_OUTSTANDING_EVENTS 4
#define MAX_OUTSTANDING_EXEC_EVENTS 4
#define MAX_OUTSTANDING_EVENTS 16
#define MAX_OUTSTANDING_EXEC_EVENTS 16
#define MAX_OUTSTANDING_MEMCPY_EVENTS 2
namespace clockwork{
......@@ -36,9 +36,10 @@ struct OpExec {
class Model {
public:
unsigned gpu_id;
bool rate_limit = true;
// Cool
const Memfile so_memfile;
Memfile so_memfile;
std::string serialized_spec;
int weights_size;
char* weights_pinned_host_memory; // alloced with cudaMallocHost
......@@ -61,7 +62,7 @@ private:
// Warm
model::PageMappedModelDef* spec = nullptr;
unsigned weights_pages_count;
size_t io_size, workspace_size;
size_t io_size, workspace_size, inputs_size, outputs_size;
std::vector<OpExec>* op_execs = nullptr;
so::TVMWarmSharedObject* warm_so = nullptr;
......
......@@ -17,7 +17,7 @@ void* SharedObject::GetSymbol(const char* symbolName) {
SharedObject::SharedObject(const std::string &name) : name(name) {
lib_handle_ = dlopen(name.c_str(), RTLD_LOCAL | RTLD_NOW);
CHECK(lib_handle_ != nullptr) << "Failed to load SO " << name << dlerror();
CHECK(lib_handle_ != nullptr) << "Failed to load SO " << name << ": " << dlerror();
}
SharedObject::~SharedObject() {
......