Skip to content
Snippets Groups Projects

Added ActionTelemetry and Implemented closed-loop Controller

Merged Safya Alzayat requested to merge salzayat/clockwork:master into master
Files
31
@@ -5,16 +5,27 @@
@@ -5,16 +5,27 @@
#include <pods/binary.h>
#include <pods/binary.h>
#include <pods/buffers.h>
#include <pods/buffers.h>
#include <chrono>
#include <chrono>
 
#include "clockwork/util.h"
namespace clockwork {
namespace clockwork {
struct TaskTelemetry {
struct TaskTelemetry {
int task_type, executor_id, model_id, action_id;
int action_type, task_type, executor_id, gpu_id, model_id, action_id, batch_size, status;
std::chrono::high_resolution_clock::time_point created, enqueued,
std::chrono::high_resolution_clock::time_point enqueued,
eligible_for_dequeue, dequeued, exec_complete, async_complete;
dequeued, exec_complete, async_complete;
 
uint64_t eligible_for_dequeue;
float async_wait, async_duration;
float async_wait, async_duration;
 
TaskTelemetry() : enqueued(util::hrt()){}
};
};
 
// In-memory telemetry record for one action event.
//
// Plain aggregate: there is no constructor and no default member
// initializers, so the fields hold indeterminate values unless the object is
// value-initialized (`ActionTelemetry t{};`) or every field is assigned.
// NOTE(review): the meaning of the integer codes (telemetry_type, action_type,
// status) is not defined in this header — confirm against the code that fills
// them in.
struct ActionTelemetry {
    // Tag describing what kind of telemetry record this is.
    int telemetry_type;

    // Identifier of the action this record refers to.
    int action_id;
    // Type code of that action.
    int action_type;

    // Status code recorded for the action.
    int status;

    // Time point, on std::chrono::high_resolution_clock, attached to the record.
    std::chrono::high_resolution_clock::time_point timestamp;
};
 
 
struct ExecutorTelemetry {
struct ExecutorTelemetry {
int task_type, executor_id;
int task_type, executor_id;
std::chrono::high_resolution_clock::time_point next_task_begin, slot_available, task_dequeued, task_complete;
std::chrono::high_resolution_clock::time_point next_task_begin, slot_available, task_dequeued, task_complete;
@@ -28,16 +39,19 @@ struct RequestTelemetry {
@@ -28,16 +39,19 @@ struct RequestTelemetry {
};
};
struct SerializedTaskTelemetry {
struct SerializedTaskTelemetry {
int task_type, executor_id, model_id, action_id;
int action_type, task_type, executor_id, gpu_id, model_id, action_id, batch_size, status;
uint64_t created, enqueued, eligible_for_dequeue, dequeued, exec_complete, async_complete;
uint64_t created, enqueued, eligible_for_dequeue, dequeued, exec_complete, async_complete;
uint64_t async_wait, async_duration;
uint64_t async_wait, async_duration;
PODS_SERIALIZABLE(1,
PODS_SERIALIZABLE(1,
 
PODS_MDR(action_type),
PODS_MDR(task_type),
PODS_MDR(task_type),
PODS_MDR(executor_id),
PODS_MDR(executor_id),
 
PODS_MDR(gpu_id),
PODS_MDR(model_id),
PODS_MDR(model_id),
PODS_MDR(action_id),
PODS_MDR(action_id),
PODS_MDR(created),
PODS_MDR(batch_size),
 
PODS_MDR(status),
PODS_MDR(enqueued),
PODS_MDR(enqueued),
PODS_MDR(eligible_for_dequeue),
PODS_MDR(eligible_for_dequeue),
PODS_MDR(dequeued),
PODS_MDR(dequeued),
@@ -48,6 +62,20 @@ struct SerializedTaskTelemetry {
@@ -48,6 +62,20 @@ struct SerializedTaskTelemetry {
)
)
};
};
 
struct SerializedActionTelemetry {
 
int telemetry_type;
 
int action_id, action_type, status;
 
uint64_t timestamp;
 
 
PODS_SERIALIZABLE(1,
 
PODS_MDR(telemetry_type),
 
PODS_MDR(action_id),
 
PODS_MDR(action_type),
 
PODS_MDR(status),
 
PODS_MDR(timestamp)
 
)
 
};
 
struct SerializedExecutorTelemetry {
struct SerializedExecutorTelemetry {
int task_type, executor_id;
int task_type, executor_id;
uint64_t next_task_begin, slot_available, task_dequeued, task_complete;
uint64_t next_task_begin, slot_available, task_dequeued, task_complete;
@@ -81,4 +109,4 @@ struct SerializedRequestTelemetry {
@@ -81,4 +109,4 @@ struct SerializedRequestTelemetry {
}
}
#endif
#endif
\ No newline at end of file
Loading