Compare revisions: cld/ml/clockwork

......@@ -65,7 +65,7 @@ Set the "performance" governor to prevent CPU clock scaling
echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
```
## 2. Increase file and memlock limits
## 2. Increase file, memlock, and rtprio limits
Limits on the number of open files and the amount of page-locked memory reduce the total number of DNNs Clockwork can keep in memory at any point in time. A limit of 1024 is too low; a limit of 16k or higher is acceptable.
......@@ -79,6 +79,8 @@ Increase the `RLIMIT_NOFILE` (number of open files) and `RLIMIT_MEMLOCK` (amount
* soft memlock unlimited
* hard nofile unlimited
* soft nofile unlimited
* hard rtprio unlimited
* soft rtprio unlimited
```
Note: for MPI cluster machines with the default Debian distribution, you will also need to modify `/etc/security/limits.d/mpiprefs.conf`
......
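As a quick companion to these settings, a process can also raise its own soft limits up to the configured hard limits at startup. The following is a minimal standalone sketch using getrlimit/setrlimit (the helper name is illustrative, not part of this change); the limits.conf entries above are still what allow the hard limits to be unlimited:
```
#include <sys/resource.h>
#include <cstdio>

// Best-effort: raise the soft limit of a resource up to its hard limit.
// Raising the hard limit itself needs privileges, which is why the
// limits.conf entries above are still required.
static bool raise_soft_limit(int resource, const char* name) {
    struct rlimit rlim;
    if (getrlimit(resource, &rlim) != 0) { std::perror("getrlimit"); return false; }
    rlim.rlim_cur = rlim.rlim_max;
    if (setrlimit(resource, &rlim) != 0) { std::perror("setrlimit"); return false; }
    std::printf("%s soft limit raised to %llu\n", name,
                (unsigned long long) rlim.rlim_cur);
    return true;
}

int main() {
    raise_soft_limit(RLIMIT_NOFILE, "RLIMIT_NOFILE");
    raise_soft_limit(RLIMIT_MEMLOCK, "RLIMIT_MEMLOCK");
    raise_soft_limit(RLIMIT_RTPRIO, "RLIMIT_RTPRIO");
    return 0;
}
```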
......@@ -37,11 +37,34 @@ void check_environment() {
}
getrlimit(RLIMIT_MEMLOCK, &rlim);
if (rlim.rlim_cur < 1024L * 1024L * 1024L * 1024L) {
if (rlim.rlim_cur != RLIM_INFINITY) {
std::cout << "✘ Resource limit on memlocked pages is " << rlim.rlim_cur << ", require unlimited" << std::endl;
environmentIsOK = false;
} else {
std::cout << "✔ RLIMIT_MEMLOCK is " << rlim.rlim_cur << std::endl;
std::cout << "✔ RLIMIT_MEMLOCK is unlimited" << std::endl;
}
getrlimit(RLIMIT_RTPRIO, &rlim);
if (rlim.rlim_cur != RLIM_INFINITY) {
std::cout << "✘ rtprio is not unlimited; this will prevent setting thread priority. Current value: " << rlim.rlim_cur << std::endl;
environmentIsOK = false;
} else {
std::cout << "✔ RLIMIT_RTPRIO is unlimited" << std::endl;
}
FILE* statusf = fopen("/proc/sys/vm/max_map_count", "r");
if (!statusf) {
std::cout << "✘ Unable to read /proc/sys/vm/max_map_count. Manually check. Value should be > 1000000" << std::endl;
environmentIsOK = false;
} else {
char line[100];
int max_map_count = 0;
if (fgets(line, sizeof(line), statusf) != nullptr) max_map_count = atoi(line); // avoid passing nullptr to atoi on a failed read
fclose(statusf);
if (max_map_count < 10000000) {
std::cout << "✘ vm.max_map_count is " << max_map_count << ", require at least 10000000" << std::endl;
environmentIsOK = false;
} else {
std::cout << "✔ vm.max_map_count is " << max_map_count << std::endl;
}
}
unsigned num_gpus = util::get_num_gpus();
......
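The max_map_count probe above can be factored into a small null-safe helper; the sketch below is illustrative only (read_proc_long is a hypothetical name, not part of this change):
```
#include <cstdio>
#include <cstdlib>

// Hypothetical helper (not part of this change): read an integer from a
// /proc or /sys file, returning `fallback` if it cannot be read or parsed.
static long read_proc_long(const char* path, long fallback) {
    FILE* f = fopen(path, "r");
    if (f == nullptr) return fallback;
    char line[64];
    long value = fallback;
    if (fgets(line, sizeof(line), f) != nullptr) {
        value = strtol(line, nullptr, 10);
    }
    fclose(f);
    return value;
}

// Example, mirroring the check above:
//   long max_map_count = read_proc_long("/proc/sys/vm/max_map_count", -1);
//   bool ok = max_map_count >= 10000000;
```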
......@@ -33,12 +33,16 @@ void check_model(int page_size, int iterations, std::string model_path) {
util::setCudaFlags();
util::initializeCudaStream();
util::setCurrentThreadMaxPriority();
clockwork::model::BatchedModel* model = load_model(model_path);
auto batch_sizes = model->implemented_batch_sizes();
model->instantiate_models_on_host();
for (auto &p : model->models) {
p.second->rate_limit = false;
}
size_t weights_page_size = page_size;
size_t weights_cache_size = model->num_weights_pages(weights_page_size) * weights_page_size;
......
......@@ -6,6 +6,31 @@
using namespace clockwork::model;
class PerGPULimiters {
public:
const unsigned num_events;
const unsigned skip;
std::vector<CudaRateLimiter*> limiters;
PerGPULimiters(unsigned num_events, unsigned skip) : num_events(num_events), skip(skip) {
}
CudaRateLimiter* get(unsigned gpu_id) {
if (gpu_id >= limiters.size()) {
limiters.resize(gpu_id+1, nullptr);
}
if (limiters[gpu_id] == nullptr) {
CUDA_CALL(cudaSetDevice(gpu_id));
limiters[gpu_id] = new CudaRateLimiter(num_events, skip);
}
return limiters[gpu_id];
}
};
thread_local PerGPULimiters exec_limiters(2, 5);
thread_local PerGPULimiters transfer_limiters(2, 0);
Model::Model(Memfile so_memfile, std::string &serialized_spec, int weights_size,
char* weights_pinned_host_memory, unsigned gpu_id):
so_memfile(so_memfile),
......@@ -13,20 +38,13 @@ Model::Model(Memfile so_memfile, std::string &serialized_spec, int weights_size,
weights_size(weights_size),
weights_pinned_host_memory(weights_pinned_host_memory),
gpu_id(gpu_id) {
CUDA_CALL(cudaSetDevice(gpu_id));
for (unsigned i = 0; i < rate_limit_events.size(); i++) {
CUDA_CALL(cudaEventCreateWithFlags(&rate_limit_events[i], cudaEventDisableTiming));
}
exec_limiter = exec_limiters.get(gpu_id);
transfer_limiter = transfer_limiters.get(gpu_id);
}
Model::~Model() {
if (hot_so != nullptr) uninstantiate_model_on_device();
if (warm_so != nullptr) uninstantiate_model_on_host();
CUDA_CALL(cudaSetDevice(gpu_id));
for (unsigned i = 0; i < rate_limit_events.size(); i++) {
CUDA_CALL(cudaEventDestroy(rate_limit_events[i]));
}
}
void Model::instantiate_model_on_host() {
......@@ -79,6 +97,7 @@ void Model::uninstantiate_model_on_host() {
void Model::instantiate_model_on_device() {
CHECK(hot_so == nullptr) << "instantiate_model_on_device hot_so is not nullptr";
/* 1: load the CUDA module onto device, which ultimately calls cuModuleLoad
cuModuleLoad requires a barrier on kernel execution, and will block until
current outstanding kernels have completed. It will also block submission
......@@ -125,12 +144,7 @@ void Model::transfer_weights_to_device(std::vector<char*> &weights_pages, cudaSt
stream
)
)
if (rate_limit) {
if (i > MAX_OUTSTANDING_MEMCPY_EVENTS) {
CUDA_CALL(cudaEventSynchronize(rate_limit_events[i % MAX_OUTSTANDING_MEMCPY_EVENTS]));
}
CUDA_CALL(cudaEventRecord(rate_limit_events[i % MAX_OUTSTANDING_MEMCPY_EVENTS], stream));
}
if (rate_limit) transfer_limiter->limit(stream);
}
}
......@@ -205,12 +219,7 @@ void Model::call(std::vector<char*> &weights_pages, char* &io_memory, char* &wor
for (unsigned i = 0; i < op_execs->size(); i++) {
call_op_exec((*op_execs)[i], pages);
if (rate_limit) {
if (i > MAX_OUTSTANDING_EXEC_EVENTS) {
CUDA_CALL(cudaEventSynchronize(rate_limit_events[i % MAX_OUTSTANDING_EXEC_EVENTS]));
}
CUDA_CALL(cudaEventRecord(rate_limit_events[i % MAX_OUTSTANDING_EXEC_EVENTS], stream));
}
if (rate_limit) exec_limiter->limit(stream);
}
}
......@@ -230,7 +239,11 @@ void Model::make_op_exec(PageMappedOpDef &spec, OpExec &op) {
tensor.data = nullptr;
tensor.ctx = DLContext{kDLGPU, 0}; // TODO: multiple devices
tensor.ndim = tspec.shape.size();
tensor.dtype = DLDataType{tspec.code, tspec.bits, tspec.lanes};
tensor.dtype = DLDataType{
static_cast<uint8_t>(tspec.code),
static_cast<uint8_t>(tspec.bits),
static_cast<uint16_t>(tspec.lanes)
};
tensor.shape = tspec.shape.data();
tensor.strides = nullptr;
tensor.byte_offset = 0;
......
......@@ -8,6 +8,7 @@
#include "clockwork/model/so.h"
#include <cuda_runtime.h>
#include "clockwork/util.h"
#include "clockwork/cuda_common.h"
#define MAX_OUTSTANDING_EVENTS 16
#define MAX_OUTSTANDING_EXEC_EVENTS 16
......@@ -16,6 +17,39 @@
namespace clockwork{
namespace model {
// Rate-limits cuda calls on a stream
class CudaRateLimiter {
private:
const unsigned num_events, skip;
unsigned position, count;
public:
std::vector<cudaEvent_t> events;
CudaRateLimiter(unsigned num_events, unsigned skip) :
num_events(num_events), skip(skip), position(0), count(0) {
events.resize(num_events);
for (unsigned i = 0; i < num_events; i++) {
CUDA_CALL(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
}
}
~CudaRateLimiter() {
for (unsigned i = 0; i < num_events; i++) {
CUDA_CALL(cudaEventDestroy(events[i]));
}
}
void limit(cudaStream_t stream) {
if (count++ == skip) {
CUDA_CALL(cudaEventSynchronize(events[position]));
CUDA_CALL(cudaEventRecord(events[position], stream));
position = (position+1) % num_events;
count = 0;
}
}
};
// TVM Function signature for generated packed function in shared library
typedef int (*OpFunc)(void* args, int* type_codes, int num_args);
......@@ -47,16 +81,17 @@ public:
Model(Memfile so_memfile, std::string &serialized_spec, int weights_size,
char* weights_pinned_host_memory, unsigned gpu_id);
private:
/* These events are used to rate-limit submission of asynchronous CUDA operations.
Executing a model comprises potentially dozens of CUDA kernels. With paged memory,
copying model weights comprises on the order of a dozen asynchronous memcpys.
Internally, CUDA has very short queues for managing submitted asynchronous tasks,
and will block ALL asynchronous submissions surprisingly quickly if too many are
outstanding, even submissions on completely independent streams */
std::array<cudaEvent_t, MAX_OUTSTANDING_EVENTS> rate_limit_events;
CudaRateLimiter* exec_limiter;
CudaRateLimiter* transfer_limiter;
private:
// Warm
......
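For context, here is a hedged, self-contained sketch of how a CudaRateLimiter is meant to interleave with a loop of asynchronous submissions; the function name and buffers are illustrative, and the include path for CudaRateLimiter/CUDA_CALL is assumed:
```
#include <vector>
#include <cuda_runtime.h>
#include "clockwork/model/model.h"   // assumed header for CudaRateLimiter / CUDA_CALL

using clockwork::model::CudaRateLimiter;

// Hedged sketch: copy a set of pinned host pages to the device on one stream,
// using a CudaRateLimiter to bound the number of outstanding async copies.
// With num_events=2 and skip=0 (as in transfer_limiters above), each call to
// limit() synchronizes on the event recorded two submissions earlier.
void copy_pages_rate_limited(const std::vector<char*> &host_pages,
                             const std::vector<char*> &device_pages,
                             size_t page_size, cudaStream_t stream) {
    CudaRateLimiter limiter(2, 0);
    for (size_t i = 0; i < host_pages.size(); i++) {
        CUDA_CALL(cudaMemcpyAsync(device_pages[i], host_pages[i], page_size,
                                  cudaMemcpyHostToDevice, stream));
        limiter.limit(stream);
    }
}
```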
......@@ -5,8 +5,92 @@
#include <mutex>
#include <condition_variable>
#include <queue>
#include "tbb/concurrent_queue.h"
namespace clockwork {
/* This is a priority queue with the same semantics as time_release_priority_queue,
but it is only safe when a single thread is reading. It combines a thread-safe
concurrent queue (written by producers) with a non-thread-safe priority queue
maintained by the reader */
template <typename T> class single_reader_priority_queue {
private:
struct container {
T* element;
uint64_t priority;
uint64_t version;
friend bool operator < (const container& lhs, const container &rhs) {
return lhs.priority < rhs.priority ||
(lhs.priority == rhs.priority && lhs.version < rhs.version);
}
friend bool operator > (const container& lhs, const container &rhs) {
return lhs.priority > rhs.priority ||
(lhs.priority == rhs.priority && lhs.version > rhs.version);
}
};
std::atomic_bool alive;
tbb::concurrent_queue<container> queue;
uint64_t version;
std::priority_queue<container, std::vector<container>, std::greater<container>> reader_queue;
void pull_new_elements() {
container next;
while (queue.try_pop(next)) {
next.version = version++;
reader_queue.push(next);
}
}
public:
single_reader_priority_queue() : alive(true) {}
bool enqueue(T* element, uint64_t priority) {
if (alive) {
queue.push(container{element, priority, 0});
}
return alive;
}
bool try_dequeue(T* &element) {
pull_new_elements();
if (!alive || reader_queue.empty() || reader_queue.top().priority > util::now()) {
return false;
}
element = reader_queue.top().element;
reader_queue.pop();
return true;
}
T* dequeue() {
T* element = nullptr;
while (alive && !try_dequeue(element));
return element;
}
std::vector<T*> drain() {
pull_new_elements();
std::vector<T*> elements;
while (!reader_queue.empty()) {
elements.push_back(reader_queue.top().element);
reader_queue.pop();
}
return elements;
}
void shutdown() {
alive = false;
}
};
/* This is a priority queue, but one where each priority also defines the earliest
time at which an enqueued task becomes eligible to be dequeued. The queue will
block if no eligible tasks are available */
......
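A hedged usage sketch of the single_reader_priority_queue added above: one producer enqueues items keyed by an eligible-time priority in nanoseconds (as returned by util::now()), and a single reader dequeues each item once its time has passed. The include path and the Work type are assumptions for illustration:
```
#include <thread>
#include "clockwork/priority_queue.h"   // assumed header for single_reader_priority_queue

struct Work { int id; };

void example() {
    clockwork::single_reader_priority_queue<Work> q;

    // Producer: items become eligible 10ms apart; lower priority values dequeue first.
    std::thread producer([&q]() {
        for (int i = 0; i < 4; i++) {
            q.enqueue(new Work{i}, clockwork::util::now() + i * 10000000UL);
        }
    });

    // Single reader: dequeue() spins until an item is eligible or shutdown() is called.
    std::thread reader([&q]() {
        for (int i = 0; i < 4; i++) {
            Work* w = q.dequeue();
            delete w;
        }
    });

    producer.join();
    reader.join();
    q.shutdown();
}
```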
......@@ -29,8 +29,8 @@ CPUExecutor::CPUExecutor(TaskType type, std::vector<unsigned> cores) : BaseExecu
void CPUExecutor::executorMain(unsigned executor_id, unsigned core) {
std::cout << TaskTypeName(type) << "-" << executor_id << " binding to core " << core << std::endl;
util::set_core(core);
util::setCurrentThreadMaxPriority();
// util::set_core(core);
// util::setCurrentThreadMaxPriority();
while (alive.load()) {
// TODO: possibility of too many outstanding async tasks
......@@ -64,8 +64,8 @@ GPUExecutorShared::GPUExecutorShared(TaskType type, std::vector<unsigned> cores,
void GPUExecutorShared::executorMain(unsigned executor_id, unsigned core) {
std::cout << TaskTypeName(type) << "-" << executor_id << " binding to core " << core << std::endl;
util::set_core(core);
util::setCurrentThreadMaxPriority();
// util::set_core(core);
// util::setCurrentThreadMaxPriority();
unsigned priority = 0;
std::vector<cudaStream_t> streams;
......@@ -119,8 +119,8 @@ GPUExecutorExclusive::GPUExecutorExclusive(TaskType type, std::vector<unsigned>
void GPUExecutorExclusive::executorMain(unsigned executor_id, unsigned core) {
std::cout << TaskTypeName(type) << "-" << executor_id << " binding to core " << core << std::endl;
util::set_core(core);
util::setCurrentThreadMaxPriority();
// util::set_core(core);
// util::setCurrentThreadMaxPriority();
util::initializeCudaStream(gpu_id);
cudaStream_t stream = util::Stream();
......@@ -170,8 +170,8 @@ void AsyncTaskChecker::join() {
void AsyncTaskChecker::executorMain(unsigned executor_id, unsigned core) {
std::cout << "Checker-" << executor_id << " binding to core " << core << std::endl;
util::set_core(core);
util::setCurrentThreadMaxPriority();
// util::set_core(core);
// util::setCurrentThreadMaxPriority();
//util::initializeCudaStream(GPU_ID_0); // TODO Is this call necessary?
std::vector<AsyncTask*> pending_tasks;
......
......@@ -87,6 +87,7 @@ public:
void executorMain(unsigned executor_id, unsigned core);
};
class ClockworkRuntime {
public:
unsigned num_gpus;
......@@ -136,26 +137,60 @@ public:
void join();
protected:
// Utility class for allocating cores
class CoreAllocator {
public:
std::vector<unsigned> usage_count;
CoreAllocator() {
usage_count.resize(util::get_num_cores(), 0);
}
int try_acquire(unsigned gpu_id) {
std::vector<unsigned> preferred = util::get_gpu_core_affinity(gpu_id);
for (int i = (int) preferred.size() - 1; i >= 0; i--) { // signed index: with unsigned, i >= 0 is always true and i-- underflows
unsigned core = preferred[i];
if (usage_count[core] == 0) {
usage_count[core]++;
return core;
}
}
for (unsigned core = 0; core < usage_count.size(); core++) {
if (usage_count[core] == 0) {
usage_count[core]++;
return core;
}
}
return -1;
}
unsigned acquire(unsigned gpu_id) {
int core = try_acquire(gpu_id);
CHECK(core >= 0) << "Unable to acquire core for GPU " << gpu_id << "; all cores exhausted";
return static_cast<unsigned>(core);
}
};
void initialize(ClockworkWorkerSettings settings) {
num_gpus = settings.num_gpus;
manager = new MemoryManager(settings);
CoreAllocator cores;
for (unsigned gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
event_pools.push_back(new CudaEventPool(gpu_id));
auto cores = util::get_gpu_core_affinity(gpu_id);
int i = cores.size()-1;
gpu_executors.push_back(new GPUExecutorExclusive(GPU, {cores[i--]}, gpu_id)); // Type 3
gpu_executors.push_back(new GPUExecutorExclusive(GPU, {cores.acquire(gpu_id)}, gpu_id)); // Type 3
if (gpu_id == 0) {
load_model_executor = new CPUExecutor(CPU, {cores[i--]}); // Type 0
weights_executor = new GPUExecutorShared(PCIe_H2D_Weights, {cores[i--]}, num_gpus); // Type 1
inputs_executor = new GPUExecutorShared(PCIe_H2D_Inputs, {cores[i--]}, num_gpus); // Type 2
outputs_executor = new GPUExecutorShared(PCIe_D2H_Output, {cores[i--]}, num_gpus); // Type 4
checker = new AsyncTaskChecker({cores[i--]});
load_model_executor = new CPUExecutor(CPU, {cores.acquire(gpu_id)}); // Type 0
weights_executor = new GPUExecutorShared(PCIe_H2D_Weights, {cores.acquire(gpu_id)}, num_gpus); // Type 1
inputs_executor = new GPUExecutorShared(PCIe_H2D_Inputs, {cores.acquire(gpu_id)}, num_gpus); // Type 2
outputs_executor = new GPUExecutorShared(PCIe_D2H_Output, {cores.acquire(gpu_id)}, num_gpus); // Type 4
checker = new AsyncTaskChecker({cores.acquire(gpu_id)});
}
}
std::string task_file_name = settings.task_telemetry_log_dir;
......
......@@ -155,16 +155,19 @@ char* getGPUModelToBuffer(int deviceNumber, char* buf) {
void setCurrentThreadMaxPriority() {
pthread_t thId = pthread_self();
pthread_attr_t thAttr;
int policy = 0;
int max_prio_for_policy = 0;
pthread_attr_init(&thAttr);
pthread_attr_getschedpolicy(&thAttr, &policy);
max_prio_for_policy = sched_get_priority_max(policy);
struct sched_param params;
params.sched_priority = sched_get_priority_max(SCHED_RR);
int ret = pthread_setschedparam(thId, SCHED_RR, &params);
if (ret != 0) {
std::cout << "Warning! Cannot set thread priority. Don't forget to set rtprio unlimited in limits.conf. See README for details." << std::endl;
return;
}
pthread_setschedprio(thId, max_prio_for_policy);
pthread_attr_destroy(&thAttr);
int policy = 0;
ret = pthread_getschedparam(thId, &policy, &params);
CHECK(ret == 0) << "Could not retrieve thread scheduler parameters for setting thread priority";
CHECK(policy == SCHED_RR) << "Inconsistent thread scheduler parameters encountered";
}
......
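A minimal standalone sketch that exercises the revised helper (header path and namespace are assumed); with rtprio unlimited the thread runs under SCHED_RR at maximum priority, otherwise only the warning above is printed:
```
#include <thread>
#include "clockwork/util.h"   // assumed header declaring setCurrentThreadMaxPriority

int main() {
    std::thread t([]() {
        // Attempts SCHED_RR max priority for this thread; if RLIMIT_RTPRIO is
        // not unlimited the call prints a warning and the thread keeps the
        // default scheduling policy.
        clockwork::util::setCurrentThreadMaxPriority();
    });
    t.join();
    return 0;
}
```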
......@@ -271,3 +271,267 @@ TEST_CASE("Priority Queue Drain after Shutdown", "[queue] [shutdown]") {
INFO("Unable to drain pending elements from queue after shutdown");
REQUIRE(drained.size() == 3);
}
TEST_CASE("Single Reader Priority Queue Simple Dequeue Order", "[queue]") {
using namespace clockwork;
single_reader_priority_queue<int> q;
std::vector<int*> elements;
for (unsigned i = 0; i < 10; i++) {
int* element = new int();
q.enqueue(element, i);
elements.push_back(element);
}
for (unsigned i = 0; i < 10; i++) {
int* element = q.dequeue();
REQUIRE(element == elements[i]);
}
}
TEST_CASE("Single Reader Priority Queue Reverse Dequeue Order", "[queue]") {
using namespace clockwork;
single_reader_priority_queue<int> q;
std::vector<int*> elements;
for (unsigned i = 0; i < 10; i++) {
int* element = new int();
q.enqueue(element, 9-i);
elements.push_back(element);
}
for (unsigned i = 0; i < 10; i++) {
int* element = q.dequeue();
REQUIRE(element == elements[9-i]);
}
}
TEST_CASE("Single Reader Priority Queue ZigZag Dequeue Order", "[queue]") {
using namespace clockwork;
single_reader_priority_queue<int> q;
std::vector<int> priorities = { 10, 0, 5, 8, 3, 7, 11, 1};
std::unordered_map<int, int*> elems;
for (int &priority : priorities) {
int* element = new int();
q.enqueue(element, priority);
elems[priority] = element;
}
std::sort(priorities.begin(), priorities.end());
for (int &priority : priorities) {
int* element = q.dequeue();
REQUIRE(element == elems[priority]);
}
}
TEST_CASE("Single Reader Priority Queue Multiple Identical Priorities", "[queue]") {
using namespace clockwork;
single_reader_priority_queue<int> q;
std::vector<int*> low;
std::vector<int*> high;
for (unsigned i = 0; i < 10; i++) {
int* elow = new int();
q.enqueue(elow, 10);
low.push_back(elow);
int* ehigh = new int();
q.enqueue(ehigh, 20);
high.push_back(ehigh);
}
for (unsigned i = 0; i < low.size(); i++) {
int* e = q.dequeue();
REQUIRE(std::find(low.begin(), low.end(), e) != low.end());
REQUIRE(std::find(high.begin(), high.end(), e) == high.end());
}
for (unsigned i = 0; i < high.size(); i++) {
int* e = q.dequeue();
REQUIRE(std::find(low.begin(), low.end(), e) == low.end());
REQUIRE(std::find(high.begin(), high.end(), e) != high.end());
}
}
TEST_CASE("Single Reader Priority Queue Eligible Time", "[queue]") {
using namespace clockwork;
single_reader_priority_queue<int> q;
uint64_t now = clockwork::util::now();
std::vector<int*> elements;
std::vector<uint64_t> priorities;
for (int i = -10; i < 10; i++) {
int* e = new int();
uint64_t priority = now + i * 200000000L; // 200ms * i
q.enqueue(e, priority);
elements.push_back(e);
priorities.push_back(priority);
}
for (int i = 0; i < elements.size(); i++) {
int* e = q.dequeue();
REQUIRE(e == elements[i]);
REQUIRE(clockwork::util::now() >= priorities[i]);
}
}
class ShutdownSignaller2 {
public:
std::atomic_bool signalled_shutdown;
std::thread thread;
clockwork::single_reader_priority_queue<int> &q;
ShutdownSignaller2(clockwork::single_reader_priority_queue<int> &q) :
q(q), signalled_shutdown(false), thread(&ShutdownSignaller2::run, this) {
}
~ShutdownSignaller2() {
while (!signalled_shutdown);
thread.join();
}
void run() {
using namespace clockwork;
uint64_t start = util::now();
while (util::now() < start + 100000000UL) {
usleep(1000);
}
signalled_shutdown = true;
q.shutdown();
}
};
TEST_CASE("Single Reader Priority Queue Blocking Dequeue", "[queue] [shutdown]") {
using namespace clockwork;
single_reader_priority_queue<int> q;
ShutdownSignaller2 s(q);
int* e = q.dequeue();
{
INFO("Blocking dequeue returned before shutdown signaller completed");
REQUIRE(s.signalled_shutdown);
}
{
INFO("Blocking dequeue should have returned a nullptr");
REQUIRE(e == nullptr);
}
}
class Dequeuer2 {
public:
std::thread thread;
clockwork::single_reader_priority_queue<int> &q;
std::atomic_bool complete;
std::vector<int*> dequeued;
Dequeuer2(clockwork::single_reader_priority_queue<int> &q) :
q(q), complete(false), thread(&Dequeuer2::run, this) {
}
~Dequeuer2() {
while (!complete);
thread.join();
}
void run() {
using namespace clockwork;
uint64_t start = util::now();
while (true) {
int* element = q.dequeue();
if (element == nullptr) {
break;
} else {
dequeued.push_back(element);
}
}
complete = true;
}
};
TEST_CASE("Single Reader Priority Queue Shutdown", "[queue] [shutdown]") {
using namespace clockwork;
single_reader_priority_queue<int> q;
Dequeuer2 d(q);
std::vector<int*> elements;
for (unsigned i = 0; i < 10; i++) {
int* element = new int();
q.enqueue(element, i);
elements.push_back(element);
}
uint64_t now = util::now();
while (util::now() < now + 100000000) { // Wait 100ms
INFO("Dequeuer thread completed before it was signalled");
REQUIRE(!d.complete.load());
usleep(1000);
}
INFO("Dequeuer thread dequeued " << d.dequeued.size() << " elements");
REQUIRE(d.dequeued.size() == 10);
INFO("Dequeuer thread completed before it was signalled");
REQUIRE(!d.complete.load());
q.shutdown();
now = util::now();
while (util::now() < now + 1000000000) { // Max 1s
if (d.complete.load()) {
break;
}
usleep(1000);
}
INFO("Dequeuer thread never unblocked");
REQUIRE(d.complete.load());
}
TEST_CASE("Single Reader Priority Queue Enqueue after Shutdown", "[queue] [shutdown]") {
using namespace clockwork;
single_reader_priority_queue<int> q;
REQUIRE(q.enqueue(new int(), 0));
q.shutdown();
INFO("Should not be able to enqueue new elements after shutdown");
REQUIRE(!q.enqueue(new int(), 0));
}
TEST_CASE("Single Reader Priority Queue Drain after Shutdown", "[queue] [shutdown]") {
using namespace clockwork;
single_reader_priority_queue<int> q;
uint64_t now = util::now();
REQUIRE(q.enqueue(new int(), now + 1000000000UL));
REQUIRE(q.enqueue(new int(), now + 1000000000UL));
REQUIRE(q.enqueue(new int(), now + 1000000000UL));
q.shutdown();
int* dequeued = q.dequeue();
INFO("Shouldn't be able to dequeue after queue shutdown");
REQUIRE(dequeued == nullptr);
std::vector<int*> drained = q.drain();
INFO("Unable to drain pending elements from queue after shutdown");
REQUIRE(drained.size() == 3);
}
\ No newline at end of file
......@@ -305,8 +305,6 @@ public:
cudaStream_t stream;
unsigned gpu_id = 0;
Autostream(unsigned gpu_id = 0): gpu_id(gpu_id) {
util::set_core(0);
util::setCurrentThreadMaxPriority();
REQUIRE(cudaSetDevice(gpu_id) == cudaSuccess);
REQUIRE(cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, 0)
== cudaSuccess);
......