Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: cld/ml/clockwork
Commits on Source (16)
Showing with 372 additions and 33 deletions
@@ -6,7 +6,8 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -O2")
set(CXX "g++-8")
set(CC "gcc-8")
set(THREADS_PREFER_PTHREAD_FLAG ON)
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_BUILD_TYPE Release)
#set(CMAKE_BUILD_TYPE Debug)
find_package(Threads REQUIRED)
find_package(Boost COMPONENTS system filesystem REQUIRED)
@@ -71,6 +72,8 @@ add_library( clockwork
src/clockwork/controller/direct_controller.cpp
src/clockwork/network/client.cpp
$ENV{TVM_HOME}/src/runtime/meta_data.h
### No point maintaining the below any more
# src/clockwork/alternatives/runtime_model.cpp
# src/clockwork/alternatives/model_manager.cpp
@@ -90,8 +93,32 @@ target_link_libraries( clockwork PUBLIC
${Boost_FILESYSTEM_LIBRARY}
)
# Converts from TVM models to clockwork models
include_directories(clockwork-convert)
add_executable (convert
src/clockwork-convert/tvm/decoupled_graph_runtime.cc
src/clockwork-convert/tvm_model.cpp
src/clockwork-convert/tvm_abstract_model.cpp
src/clockwork-convert/convert.cpp
)
target_link_libraries( convert
clockwork
clockwork_proto
Threads::Threads
dl
cuda
cudart
tvm_runtime
tbb
nvidia-ml
${Boost_SYSTEM_LIBRARY}
${Boost_FILESYSTEM_LIBRARY}
)
# cudafatbin is temporary / hacky
add_executable (cudafatbin src/cudafatbin.cc )
add_executable (cudafatbin src/clockwork-convert/cudafatbin.cc )
target_link_libraries( cudafatbin
clockwork
clockwork_proto
@@ -106,17 +133,11 @@ target_link_libraries( cudafatbin
${Boost_FILESYSTEM_LIBRARY}
)
# Converts from TVM models to clockwork models
include_directories(clockwork-convert)
add_executable (convert
src/clockwork/tvm/decoupled_graph_runtime.cc
src/clockwork-convert/tvm_model.cpp
src/clockwork-convert/tvm_abstract_model.cpp
src/clockwork-convert/convert.cpp
add_executable (check_model
src/check_model.cpp
)
target_link_libraries( convert
target_link_libraries( check_model
clockwork
clockwork_proto
Threads::Threads
......
@@ -109,17 +109,17 @@ List available GPU clock frequencies
```
nvidia-smi -q -d SUPPORTED_CLOCKS
```
Pick a memory and graphics clock frequency (usually the highest), e.g.
Pick a memory and graphics clock frequency (usually the highest), e.g. on Volta machines:
```
Supported Clocks
Memory : 3004 MHz
Graphics : 1114 MHz
Memory : 877 MHz
Graphics : 1380 MHz
```
Set the default application clock and system clock to those highest values
Set the default application clock and system clock to those highest values, e.g. on Volta machines:
```
nvidia-smi -ac 3004,1114
nvidia-smi -lgc 1114
nvidia-smi -ac 877,1380
nvidia-smi -lgc 1380
```
FYI:
@@ -132,6 +132,8 @@ Some of these values can be checked by running the Clockwork profiler with:
```
./profile [check]
```
IMPORTANT: when you restart your machine, you will need to set persistence mode again. Run the checker frequently!
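For reference, checks like these can also be done programmatically through NVML, the library behind `nvidia-smi`. The sketch below is illustrative only, not Clockwork's actual `profile` implementation:

```
#include <nvml.h>
#include <cstdio>

// Illustrative only: query the settings the checker verifies.
int main() {
    nvmlInit();
    nvmlDevice_t device;
    nvmlDeviceGetHandleByIndex(0, &device);

    nvmlEnableState_t persistence;
    nvmlDeviceGetPersistenceMode(device, &persistence);
    printf("persistence mode: %s\n",
           persistence == NVML_FEATURE_ENABLED ? "on" : "off");

    unsigned int mem_mhz, graphics_mhz;
    nvmlDeviceGetApplicationsClock(device, NVML_CLOCK_MEM, &mem_mhz);
    nvmlDeviceGetApplicationsClock(device, NVML_CLOCK_GRAPHICS, &graphics_mhz);
    printf("application clocks: mem %u MHz, graphics %u MHz\n", mem_mhz, graphics_mhz);

    nvmlShutdown();
    return 0;
}
```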
# Troubleshooting
Currently, the CMakeLists assumes CUDA lives in either `/usr/local/cuda/lib64` (the default location on Ubuntu 14.x) or `/usr/lib/x86_64-linux-gnu/nvidia/current` (the default location for MPI cluster machines). If you get build errors saying CUDA or nvidia-ml cannot be found, you'll need to update the `include_directories` and `link_directories` directives in CMakeLists.txt with the CUDA location on your machine.
......
import argparse
import os
import subprocess

"""
Clockwork must be built before running this script.
This script invokes the `convert` binary and expects it to exist in the `build` folder.
"""

parser = argparse.ArgumentParser(description='Convert a TVM model into a Clockwork model')
parser.add_argument("input_dir", metavar="INDIR", type=str, help="Base directory where TVM models exist. The utility expects multiple models, one per batch size, each in a subdirectory.")
parser.add_argument("output_dir", metavar="OUTDIR", type=str, help="Output directory. Directory will be created if it does not exist.")
parser.add_argument('-p', "--page_size", type=int, default=16777216, help="Page size to use for compiled models. 16MB by default.")
parser.add_argument("--subdir_prefix", type=str, default="b", help="Within input_dir, a prefix for how subdirectories are named. Default \"b\" followed by the batch size.")

def find_tvm_models(path):
    dir_contents = os.listdir(path)
    dir_contents_paths = [os.path.join(path, c) for c in dir_contents]
    dir_files = [c for c in dir_contents_paths if os.path.isfile(c)]
    so_files = [f[:-3] for f in dir_files if f.endswith(".so")]  # strip ".so" to get the model file prefix
    model_choices = [f for f in so_files if is_model(f)]
    return model_choices

def is_model(path_prefix):
    # A valid model is a matching triple of .so, .params, and .json files
    suffixes = ["so", "params", "json"]
    for suffix in suffixes:
        if not os.path.exists("%s.%s" % (path_prefix, suffix)):
            return False
    return True

def find_models(path, subdir_prefix):
    found_models = []
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        if not os.path.isdir(entry_path):
            print("Ignoring non-directory %s" % entry_path)
            continue
        if not entry.startswith(subdir_prefix):
            print("Skipping non-matching (prefix=\"%s\") directory %s" % (subdir_prefix, entry_path))
            continue
        candidates = find_tvm_models(entry_path)
        if len(candidates) == 0:
            print("Skipping directory with no valid models (expect .so .json and .params with matching names) %s" % entry_path)
            continue
        if len(candidates) > 1:
            print("Skipping directory with multiple valid models %s" % entry_path)
            continue
        batch_size = int(entry[len(subdir_prefix):])
        found_models.append((batch_size, candidates[0]))
    return sorted(found_models)

def convert(args):
    models = find_models(args.input_dir, args.subdir_prefix)
    print("Found %d models in input directory %s:" % (len(models), args.input_dir))
    for batch_size, model in models:
        print(" %d %s" % (batch_size, model))

    # Report whether the output directory needs to be created
    if not os.path.exists(args.output_dir):
        print("Output directory %s will be created" % args.output_dir)
    else:
        print("Will output to existing directory %s" % args.output_dir)

    print("The following command will run:")
    pargs = [str(v) for v in [
        "../build/convert",
        "-o", args.output_dir,
        "-p", args.page_size
    ] + [x for m in models for x in m]]
    print(" ".join(pargs))

    print("Press <return> to continue or CTRL-C to abort")
    input()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
        print("Created output directory %s" % args.output_dir)

    popen = subprocess.Popen(pargs)
    popen.wait()

if __name__ == '__main__':
    args = parser.parse_args()
    exit(convert(args))
\ No newline at end of file
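A hypothetical invocation, assuming the script above is saved as `convert_models.py` and the TVM models live in `~/models/resnet50/b1`, `~/models/resnet50/b2`, and `~/models/resnet50/b4`: `python3 convert_models.py ~/models/resnet50 ~/models/resnet50/clockwork`.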
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file common.h
* \brief Common utilities for CUDA
*/
#ifndef _CLOCKWORK_CUDA_COMMON_H_
#define _CLOCKWORK_CUDA_COMMON_H_
#include <cuda.h>
#include <cuda_runtime.h>
#include <dmlc/logging.h>
#include <string>
namespace clockwork {
#define CUDA_DRIVER_CALL(x) \
{ \
CUresult result = x; \
if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
const char *msg; \
cuGetErrorName(result, &msg); \
LOG(FATAL) \
<< "CUDAError: " #x " failed with error: " << msg; \
} \
}
#define CUDA_CALL(func) \
{ \
cudaError_t e = (func); \
CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
<< "CUDA: " << cudaGetErrorString(e); \
}
}
#endif // _CLOCKWORK_CUDA_COMMON_H_
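For reference, a minimal usage sketch of the two macros above (a hypothetical caller, not part of this diff):

```
#include <cuda.h>
#include <cuda_runtime.h>
#include "clockwork/cuda_common.h"

void example(size_t size) {
    // CUDA_CALL wraps runtime-API calls; it tolerates cudaErrorCudartUnloading
    // so that teardown during process exit does not abort.
    void* device_ptr = nullptr;
    CUDA_CALL(cudaMalloc(&device_ptr, size));
    CUDA_CALL(cudaFree(device_ptr));

    // CUDA_DRIVER_CALL wraps driver-API calls and similarly tolerates
    // CUDA_ERROR_DEINITIALIZED.
    CUDA_DRIVER_CALL(cuInit(0));
    CUdevice device;
    CUDA_DRIVER_CALL(cuDeviceGet(&device, 0));
}
```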
@@ -76,13 +76,19 @@ struct PageMappedDLTensorDef {
uint64_t page_offset;
uint64_t size;
std::vector<int64_t> shape;
int code = 2U; // kDLFloat
int bits = 32;
int lanes = 1;
PODS_SERIALIZABLE(1,
PODS_MDR(base_offset),
PODS_MDR(page),
PODS_MDR(page_offset),
PODS_MDR(size),
PODS_MDR(shape)
PODS_MDR(shape),
PODS_OPT(code),
PODS_OPT(bits),
PODS_OPT(lanes)
)
};
......
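For context, the new `code`, `bits`, and `lanes` fields mirror DLPack's `DLDataType`; the defaults (`code = 2`, i.e. `kDLFloat`, with `bits = 32` and `lanes = 1`) describe an ordinary float32 tensor. A minimal sketch of the correspondence, where the helper function is hypothetical but the `DLDataType` layout follows DLPack:

```
#include <dlpack/dlpack.h>

// Hypothetical helper: build a DLPack data type from the deserialized fields.
DLDataType to_dldatatype(int code, int bits, int lanes) {
    DLDataType t;
    t.code = static_cast<uint8_t>(code);    // kDLInt=0, kDLUInt=1, kDLFloat=2
    t.bits = static_cast<uint8_t>(bits);    // bits per lane, e.g. 32
    t.lanes = static_cast<uint16_t>(lanes); // vector lanes; 1 for scalar types
    return t;
}
```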
@@ -49,7 +49,7 @@ void check_environment() {
for (unsigned i = 0; i < num_gpus; i++) {
if (!util::is_gpu_exclusive(i)) { // TODO: check all GPUs
std::cout << " ✘ GPU " << i << " is not in exclusive mode; set with `nvidia-smi -i " << i << " -c 3` or set for all GPUs with `nvndia-smi -c 3`" << std::endl;
std::cout << " ✘ GPU " << i << " is not in exclusive mode; set with `nvidia-smi -i " << i << " -c 3` or set for all GPUs with `nvidia-smi -c 3`" << std::endl;
} else {
std::cout << " ✔ GPU " << i << " is in exclusive mode" << std::endl;
}
......
(6 binary files changed; no preview available)
#include "clockwork/worker.h"
#include "clockwork/network/worker.h"
#include "clockwork/runtime.h"
#include "clockwork/model/batched.h"
#include "clockwork/memory.h"
#include "clockwork/cache.h"
#include "clockwork/cuda_common.h"
using namespace clockwork;
void show_usage() {
std::cout << "USAGE" << std::endl;
std::cout << " ./check_model [MODEL]" << std::endl;
std::cout << "DESCRIPTION" << std::endl;
std::cout << " Will load and run an inference on a specified Clockwork model" << std::endl;
std::cout << "OPTIONS" << std::endl;
std::cout << " -h, --help" << std::endl;
std::cout << " Print this message" << std::endl;
std::cout << " -p, --page_size" << std::endl;
std::cout << " Weights page size used by Clockwork. Defaults to 16MB. You shouldn't" << std::endl;
std::cout << " need to set this because we are using 16MB pages." << std::endl;
}
model::BatchedModel* load_model(std::string model) {
return model::BatchedModel::loadFromDisk(model, 0);
}
void check_model(int page_size, std::string model_path) {
std::cout << "Checking " << model_path << std::endl;
util::setCudaFlags();
util::initializeCudaStream();
clockwork::model::BatchedModel* model = load_model(model_path);
auto batch_sizes = model->implemented_batch_sizes();
for (unsigned batch_size : batch_sizes) {
std::cout << " loaded batch size " << batch_size << std::endl;
}
model->instantiate_models_on_host();
size_t weights_page_size = page_size;
size_t weights_cache_size = model->num_weights_pages(weights_page_size) * weights_page_size;
PageCache* weights_cache = make_GPU_cache(weights_cache_size, weights_page_size, GPU_ID_0);
cudaError_t status;
model->instantiate_models_on_device();
std::shared_ptr<Allocation> weights = weights_cache->alloc(model->num_weights_pages(weights_page_size), []{});
model->transfer_weights_to_device(weights->page_pointers, util::Stream());
for (unsigned batch_size : batch_sizes) {
// Create inputs and outputs
char* input = new char[model->input_size(batch_size)];
char* output = new char[model->output_size(batch_size)];
// Create and allocate io_memory on GPU
size_t io_memory_size = model->io_memory_size(batch_size);
MemoryPool* io_pool = CUDAMemoryPool::create(io_memory_size, GPU_ID_0);
char* io_memory = io_pool->alloc(io_memory_size);
// Create and allocate intermediate GPU memory workspace
size_t workspace_size = model->workspace_memory_size(batch_size);
MemoryPool* workspace_pool = CUDAMemoryPool::create(workspace_size, GPU_ID_0);
char* workspace_memory = workspace_pool->alloc(workspace_size);
// Now execute each step of model
model->transfer_input_to_device(batch_size, input, io_memory, util::Stream());
// Time the call
int warmups = 20;
for (int i = 0; i < warmups; i++) {
model->call(batch_size, weights->page_pointers, io_memory, workspace_memory, util::Stream());
}
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
auto before = util::now();
int iterations = 100;
for (int i = 0; i < iterations; i++) {
model->call(batch_size, weights->page_pointers, io_memory, workspace_memory, util::Stream());
}
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
auto after = util::now();
printf(" b%d: %.2f ms per call\n", batch_size, ((float) (after-before)) / (iterations * 1000000.0));
model->transfer_output_from_device(batch_size, output, io_memory, util::Stream());
status = cudaStreamSynchronize(util::Stream());
CHECK(status == cudaSuccess);
delete input;
delete output;
io_pool->free(io_memory);
delete io_pool;
workspace_pool->free(workspace_memory);
delete workspace_pool;
}
weights_cache->unlock(weights);
weights_cache->free(weights);
delete weights_cache;
model->uninstantiate_models_on_device();
model->uninstantiate_models_on_host();
delete model;
}
int main(int argc, char *argv[]) {
std::vector<std::string> non_argument_strings;
int page_size = 16 * 1024 * 1024;
for (int i = 1; i < argc; ++i) {
std::string arg = argv[i];
if ((arg == "-h") || (arg == "--help")) {
show_usage();
return 0;
} else if ((arg == "-p") || (arg == "--page_size")) {
page_size = atoi(argv[++i]);
} else {
non_argument_strings.push_back(arg);
}
}
if (non_argument_strings.size() != 1) {
std::cerr << "Expecting a model as input" << std::endl;
return 1;
}
std::string model_path = non_argument_strings[0];
check_model(page_size, model_path);
}
@@ -14,7 +14,6 @@
#include <pods/binary.h>
#include <pods/buffers.h>
#include <pods/streams.h>
#include "clockwork/tvm/decoupled_graph_runtime.h"
#include "clockwork-convert/tvm_model.h"
#include "clockwork-convert/tvm_abstract_model.h"
@@ -114,8 +113,38 @@ void convert(ConvertConfig config) {
}
void show_usage() {
std::cout << "Provide the name of a model, to convert it" << std::endl;
std::cout << "Specify page size with -p flag" << std::endl;
std::cout << "USAGE" << std::endl;
std::cout << " ./convert [OPTIONS] [MODELS]" << std::endl;
std::cout << "DESCRIPTION" << std::endl;
std::cout << " This utility converts models compiled for TVM into models compatible with" << std::endl;
std::cout << " Clockwork. All models provided to this command must be variants of the SAME" << std::endl;
std::cout << " model but with DIFFERENT batch sizes" << std::endl;
std::cout << "MODELS" << std::endl;
std::cout << " Specify a model with two arguments [model_batchsize] [model_file_prefix]" << std::endl;
std::cout << " model_batchsize" << std::endl;
std::cout << " The specific batch size TVM compiled for this model" << std::endl;
std::cout << " model_file_prefix" << std::endl;
std::cout << " TVM outputs three files when it compiles a model: a .so, a .params," << std::endl;
std::cout << " and a .json file. model_file_prefix specifies the path to these files" << std::endl;
std::cout << "OPTIONS" << std::endl;
std::cout << " -h, --help" << std::endl;
std::cout << " Print this message" << std::endl;
std::cout << " -o, --output" << std::endl;
std::cout << " Directory to output all compiled models to. Defaults to 'model' in the" << std::endl;
std::cout << " current directory. You should probably set this." << std::endl;
std::cout << " -p, --page_size" << std::endl;
std::cout << " Weights page size used by Clockwork. Defaults to 16MB. You shouldn't" << std::endl;
std::cout << " need to set this because we are using 16MB pages." << std::endl;
std::cout << "EXAMPLE" << std::endl;
std::cout << " Suppose you have compiled resnet50 using TVM, for batch sizes 1, 2, and 4," << std::endl;
std::cout << " located in ~/models/resnet50/batchsize1, ~/models/resnet50/batchsize2, and " << std::endl;
std::cout << " ~/models/resnet50/batchsize4 respectively. The following command will" << std::endl;
std::cout << " combine the models and convert them into a clockwork model, putting the" << std::endl;
std::cout << " output in ~/models/resnet50/clockwork" << std::endl;
std::cout << " ./convert -o ~/models/resnet50/clockwork \\" << std::endl;
std::cout << " 1 ~/models/resnet50/batchsize1/tvm-model \\" << std::endl;
std::cout << " 2 ~/models/resnet50/batchsize2/tvm-model \\" << std::endl;
std::cout << " 4 ~/models/resnet50/batchsize4/tvm-model" << std::endl;
}
int main(int argc, char *argv[]) {
@@ -131,8 +160,6 @@ int main(int argc, char *argv[]) {
if ((arg == "-h") || (arg == "--help")) {
show_usage();
return 0;
} else if ((arg == "--weights_page_size")) {
config.weights_page_size = atoi(argv[++i]);
} else if ((arg == "-o") || (arg == "--output")) {
config.output_dir = argv[++i];
} else if ((arg == "-p") || (arg == "--page_size")) {
@@ -142,8 +169,8 @@ int main(int argc, char *argv[]) {
}
}
if (non_argument_strings.size() < 1) {
std::cerr << "Expected input model filename, none given." << std::endl
if (non_argument_strings.size() < 2) {
std::cerr << "Each input model should be specified as <batch_size> <filename>, e.g. 1 ~/models/resnet50/batchsize1" << std::endl
<< "Execute with --help for usage information." << std::endl;
return 1;
}
......
@@ -3,7 +3,6 @@
#include <cuda_runtime.h>
#include "clockwork/tvm/pack_args.h"
#include <tvm/runtime/cuda_common.h>
#include "clockwork/util.h"
#include <iostream>
#include "tbb/task_scheduler_init.h"
@@ -21,10 +20,9 @@
#include <pods/binary.h>
#include <pods/buffers.h>
#include <pods/streams.h>
#include "clockwork/tvm/decoupled_graph_runtime.h"
#include <cuda_runtime.h>
#include <chrono>
#include <tvm/runtime/cuda_common.h>
#include "clockwork/cuda_common.h"
#include "clockwork/cache.h"
#include "clockwork/util.h"
#include "clockwork/model/so.h"
......
@@ -2,7 +2,7 @@
* Copyright (c) 2017 by Contributors
* \file graph_runtime.cc
*/
#include "clockwork/tvm/decoupled_graph_runtime.h"
#include "clockwork-convert/tvm/decoupled_graph_runtime.h"
#include <tvm/runtime/managed_cuda_device_api.h>
#include <tvm/runtime/ndarray.h>
......
@@ -16,7 +16,7 @@
#include <tvm/runtime/module.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/packed_func.h>
#include "clockwork/tvm/decoupled_graph_runtime.h"
#include "clockwork-convert/tvm/decoupled_graph_runtime.h"
namespace tvm_model {
......
@@ -7,7 +7,7 @@
#include <memory>
#include <atomic>
#include <cuda_runtime.h>
#include "tvm/runtime/cuda_common.h"
#include "clockwork/cuda_common.h"
#include "clockwork/telemetry.h"
#include "clockwork/cache.h"
#include "clockwork/model/model.h"
......
#include "clockwork/cache.h"
#include <dmlc/logging.h>
#include "tvm/runtime/cuda_common.h"
#include "clockwork/cuda_common.h"
namespace clockwork {
......