Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • antoinek/perf-profiler
  • mstolet/perf-profiler
2 results
Show changes
Commits on Source (14)
CFLAGS= -O3 -Wall -g
LDFLAGS= -g
all: pfmtest
all: pfmprof
pfmtest: pfmtest.o config.o events.o symbols.o
pfmtest: LDLIBS+=-lpfm -lbfd
pfmprof: pfmprof.o config.o events.o symbols.o
pfmprof: LDLIBS+=-lpfm -lbfd
clean:
rm -f *.o pfmtest
rm -f *.o pfmprof
.PHONY: all clean
......@@ -97,6 +97,7 @@ size_t config_procs_num = 0;
struct config_thread *config_threads = NULL;
size_t config_threads_num = 0;
uint64_t config_duration = 0;
static int is_pid(const char *dname)
{
......@@ -237,9 +238,10 @@ int config_init(int argc, char *argv[])
int c, opt_idx, done;
static struct option long_opts[] = {
{"process", required_argument, NULL, 'p'},
{"duration", required_argument, NULL, 'd'},
{NULL, 0, NULL, 0},
};
static const char *short_opts = "p:";
static const char *short_opts = "p:d:";
done = 0;
while (!done) {
......@@ -249,6 +251,10 @@ int config_init(int argc, char *argv[])
find_processes(optarg);
break;
case 'd':
config_duration = strtoul(optarg, NULL, 10);
break;
case -1:
done = 1;
break;
......
......@@ -13,6 +13,7 @@
struct tid_group {
struct event_desc **descs;
volatile struct perf_event_mmap_page *mp;
pid_t tid;
};
struct event_desc {
......@@ -23,6 +24,7 @@ struct event_desc {
uint64_t last_value;
uint64_t last_time_enabled;
uint64_t last_time_running;
};
static struct tid_group *tid_groups;
......@@ -102,6 +104,7 @@ static struct event_desc *event_open(const char *str, pid_t pid, int output_fd,
ed->last_value = 0;
ed->last_time_enabled = 0;
ed->last_time_running = 0;
return ed;
}
......@@ -116,14 +119,6 @@ static void event_enable(struct event_desc *ed)
}
}
#if 0
static void event_close(struct event_desc *ed)
{
close(ed->fd);
free(ed);
}
#endif
static struct event_desc *event_lookup_id(struct tid_group *tg, uint64_t id)
{
size_t i;
......@@ -170,75 +165,96 @@ static void event_sample(struct tid_group *tg, struct perf_event_header *hdr)
{
struct sample_event *se = (struct sample_event *) hdr;
struct event_desc *ed;
//struct target *t;
if (hdr->size != sizeof(struct sample_event)) {
if (se->tid != tg->tid) {
fprintf(stderr, "event_sample: event tid does not match (%u != %u)\n",
se->tid, tg->tid);
return;
}
if (se->hdr.size != sizeof(struct sample_event)) {
fprintf(stderr, "event_sample: event smaller than expected\n");
abort();
}
ed = event_lookup_id(tg, se->sample_id);
/*t = target_lookup(se->tid, se->ip);
t->counters[ed->my_id].value += ed->last_value - se->value;
t->counters[ed->my_id].time_enabled += ed->last_time_enabled - se->time_enabled*/
/*printf(" sample_id=%lu ip=%lx"
" pid=%u tid=%u time=%lu id=%lu cpu=%u res=%u value=%lx time_enabled=%lu"
" time_running=%lu\n", se->sample_id, se->ip, se->pid,
se->tid, se->time, se->id, se->cpu, se->res, se->value, se->time_enabled,
se->time_running);*/
/*printf(" pid=%u tid=%u cpu=%u ip=%lx %d val=%lx\n", se->pid, se->tid,
* se->cpu, se->ip, ed->my_id, se->value); */
profile_sample(se->pid, se->tid, se->cpu, se->ip, se->value - ed->last_value,
se->time_enabled - ed->last_time_enabled, ed->my_id);
se->time_enabled - ed->last_time_enabled,
se->time_running - ed->last_time_running,
ed->my_id);
ed->last_value = se->value;
ed->last_time_enabled = se->time_enabled;
ed->last_time_running = se->time_running;
}
/*
 * Read the producer position (data_head) of a perf mmap ring buffer.
 *
 * The load has acquire semantics: every read of ring-buffer contents the
 * caller performs after this call is ordered after the data_head load, by
 * both the compiler and the CPU.  The previous version issued only a
 * compiler barrier, and issued it *before* the load, which does not order
 * the load against the data reads that follow it.
 *
 * mp: mapped perf_event_mmap_page control page.
 * Returns the current value of mp->data_head.
 */
static inline uint64_t mp_head_read(volatile struct perf_event_mmap_page *mp)
{
  return __atomic_load_n(&mp->data_head, __ATOMIC_ACQUIRE);
}
/*
 * Store a new consumer position into mp->data_tail.
 *
 * mp:   mapped perf_event_mmap_page control page.
 * tail: value written to mp->data_tail.
 */
static inline void mp_tail_write(volatile struct perf_event_mmap_page *mp,
uint64_t tail)
{
/* Empty asm with a "memory" clobber: emits no instruction, but stops the
 * compiler from moving memory accesses across this point, so accesses
 * written before the call are not sunk below the data_tail store.
 * NOTE(review): no CPU fence is issued here; presumably this relies on a
 * strongly ordered target such as x86 - confirm before porting. */
asm volatile("" ::: "memory");
mp->data_tail = tail;
}
static unsigned tid_group_poll(struct tid_group *tg, unsigned n_max)
{
unsigned n;
volatile struct perf_event_mmap_page *mp = tg->mp;
uint64_t pos = mp->data_tail;
uint64_t size = mp->data_size;
uint64_t head = mp->data_head;
uint8_t *data = (uint8_t *) mp + mp->data_offset;
struct perf_event_header *hdr;
uint8_t buf [256];
size_t off;
uint64_t pos, head, ev_size;
for (n = 0; n < n_max; n++) {
head = mp_head_read(mp);
pos = mp->data_tail;
if (pos == head)
break;
off = pos % size;
hdr = (struct perf_event_header *) (data + off);
ev_size = hdr->size;
if ((off + ev_size) % size < off) {
fprintf(stderr, "tid_group_poll: event wraps\n");
if (ev_size > sizeof(buf)) {
fprintf(stderr, "tid_group_poll: event too large (%lu)\n", ev_size);
abort();
}
memcpy(buf, hdr, size - off);
memcpy(buf + (size - off), data, ev_size - (size - off));
hdr = (struct perf_event_header *) buf;
}
if (hdr->type == PERF_RECORD_SAMPLE) {
event_sample(tg, hdr);
} else {
printf("hdr->type=%u size=%u\n", hdr->type, hdr->size);
fprintf(stderr, "hdr->type=%u size=%u\n", hdr->type, hdr->size);
}
pos += hdr->size;
mp_tail_write(mp, pos + ev_size);
}
mp->data_tail = pos;
return n;
}
/*static volatile struct perf_event_mmap_page *events_initialize(pid_t pid*/
static int tid_group_init(pid_t pid, struct tid_group *tg)
{
size_t i;
int fd;
tg->tid = pid;
for (i = 0; i < config_counter_num; i++) {
fd = (i == 0 ? -1 : tg->descs[0]->fd);
tg->descs[i] = event_open(config_counters[i].counter, pid, fd, i,
......
......@@ -5,6 +5,7 @@
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>
#include <perfmon/pfmlib_perf_event.h>
......@@ -21,6 +22,7 @@ struct target {
struct {
uint64_t value;
uint64_t time_enabled;
uint64_t time_running;
} counters[];
};
......@@ -81,22 +83,25 @@ static inline struct target *target_lookup(uint64_t ip, uint32_t pid,
}
void profile_sample(pid_t pid, pid_t tid, uint16_t cpu, uint64_t ip,
uint64_t value, uint64_t time_enabled, uint32_t event)
uint64_t value, uint64_t time_enabled, uint64_t time_running,
uint32_t event)
{
struct target *t = target_lookup(ip, pid, tid);
t->counters[event].value += value;
t->counters[event].time_enabled += time_enabled;
t->counters[event].time_running += time_running;
}
static void target_dump(struct target *t)
{
size_t i;
uint64_t symoff;
char *symbol;
char *symbol, *dso;
if (symbols_lookup(t->pid, t->ip, &symbol, &symoff) != 0) {
if (symbols_lookup(t->pid, t->ip, &symbol, &dso, &symoff) != 0) {
symbol = "";
dso = "";
symoff = 0;
}
......@@ -107,32 +112,79 @@ static void target_dump(struct target *t)
" \"tid\": %u,\n"
" \"symbol\": \"%s\",\n"
" \"symoff\": %lu,\n"
" \"dso\": \"%s\",\n"
" \"counters\": {\n",
t->ip, t->pid, t->tid, symbol, symoff);
t->ip, t->pid, t->tid, symbol, symoff, dso);
for (i = 0; i < config_counter_num; i++) {
if (t->counters[i].value == 0)
continue;
printf(
" \"%s\": { \"value\": %lu, \"time_enabled\": %lu }",
" \"%s\": { \"value\": %lu, \"time_enabled\": %lu, "
"\"time_running\": %lu },\n",
config_counters[i].counter,
t->counters[i].value, t->counters[i].time_enabled);
t->counters[i].value, t->counters[i].time_enabled,
t->counters[i].time_running);
if (i != config_counter_num - 1)
printf(",\n");
else
printf("\n");
}
printf(" \"\": { }\n");
printf(
" }\n"
" },\n");
}
static void profile_dump(void)
static void profile_dump(double dur)
{
struct target *t;
size_t i;
struct config_process *p;
struct config_thread *th;
size_t i, j;
printf("{\n");
/* dump out duration */
printf("\"duration\": %lf,\n", dur);
/* dump out processes */
printf("\"procs\": {\n");
for (i = 0, p = config_procs; p != NULL; i++, p = p->next) {
printf(" \"%u\": {\n", p->pid);
printf(" \"comm\": \"%s\",\n", p->comm);
/* dump out threads */
printf(" \"threads\": {\n");
for (j = 0, th = p->threads; th != NULL; j++, th = th->next_proc) {
printf(" \"%u\": {\n", th->tid);
printf(" \"comm\": \"%s\"\n", th->comm);
if (j != p->num_threads - 1)
printf(" },\n");
else
printf(" }\n");
}
printf(" }\n");
printf("[\n");
if (i != config_procs_num - 1)
printf(" },\n");
else
printf(" }\n");
}
printf("},\n");
/* dump out counters */
printf("\"event_descs\": {\n");
for (i = 0; i < config_counter_num; i++) {
printf(" \"%s\": { \"period\": %lu }", config_counters[i].counter,
config_counters[i].period);
if (i != config_counter_num - 1)
printf(",\n");
else
printf("\n");
}
printf("},\n");
/* dump out events */
printf("\"events\": [\n");
for (i = 0; i < HT_SIZE; i++) {
t = target_table[i];
while (t != NULL) {
......@@ -142,6 +194,7 @@ static void profile_dump(void)
}
printf(" {}\n");
printf("]\n");
printf("}\n");
}
static void int_handler(int dummy)
......@@ -152,6 +205,8 @@ static void int_handler(int dummy)
int main(int argc, char *argv[])
{
size_t i;
struct timespec ts_start, ts_end;
uint64_t td;
signal(SIGINT, int_handler);
......@@ -163,6 +218,8 @@ int main(int argc, char *argv[])
return -1;
}
clock_gettime(CLOCK_MONOTONIC, &ts_start);
if (events_init() != 0) {
return -1;
}
......@@ -176,9 +233,21 @@ int main(int argc, char *argv[])
while (!exited) {
events_poll();
i++;
if (i % 10000 == 0 && config_duration != 0) {
clock_gettime(CLOCK_MONOTONIC, &ts_end);
td = 1000000000ULL * (ts_end.tv_sec - ts_start.tv_sec) +
(ts_end.tv_nsec - ts_start.tv_nsec);
if (td / 1000000000ULL >= config_duration)
exited = 1;
}
}
profile_dump();
clock_gettime(CLOCK_MONOTONIC, &ts_end);
td = 1000000000ULL * (ts_end.tv_sec - ts_start.tv_sec) +
(ts_end.tv_nsec - ts_start.tv_nsec);
profile_dump(((double) td) / 1000000000ULL);
return 0;
}
from profiling import *
from topdown import *
from report_utils import *
from splittcp import *
import sys
#(md, sgs) = load_exp_file(sys.argv[1])
(md, sgs) = load_file(sys.argv[1])
# Select the thread ids to keep and the symbol classifier for the requested
# mode (second command-line argument).
#
# The original chain tested for 'ix' twice; the second test (the only one
# that selected classify_ix) could never be reached, so IX traces were
# classified with the Linux classifier.  The two branches are merged here
# and 'ix' uses classify_ix.  An unrecognised mode now stops with a clear
# message instead of failing later with a NameError on `tids`.
mode = sys.argv[2]
if mode == 'linux':
    tids = find_tids_prefix(md, 'echo-w')
    classifier = classify_linux
elif mode == 'ix':
    tids = find_tids_prefix(md, 'ix')
    classifier = classify_ix
elif mode == 'stcp-fp':
    tids = find_tids_prefix(md, 'stcp-fp-')
    classifier = classify_stcp
elif mode == 'stcp-app':
    tids = find_tids_prefix(md, 'echo-w')
    classifier = classify_stcp_app
elif mode == 'stcp':
    tids_app = find_tids_prefix(md, 'echo-w')
    tids_stcp = find_tids_prefix(md, 'stcp-fp-')
    tids = tids_app + tids_stcp
    stcp_pid = find_pid(md, 'splittcp')
    # Samples from the splittcp process get the fast-path classifier,
    # everything else the application-side classifier.
    classifier = lambda a: ('stcp-' + classify_stcp(a)
                            if a['pid'] == stcp_pid else
                            'app-' + classify_stcp_app(a))
else:
    sys.exit('unknown mode %r (expected linux, ix, stcp-fp, stcp-app or stcp)'
             % mode)
sgs = sgs_filter(sgs, lambda a: a['tid'] in tids)
sgs = sgs_aggregate(sgs, lambda a: a.mask_out('tid'))
# estimate 'real' values for counters
balance_counters(sgs)
# scale to per-request
scale_counters(sgs, 1 / (float(md['duration'] * md['throughput'])))
print('Icache footprint:', calculate_icache_footprint(sgs, cutoff_frac=1) * 64 / 1024, 'KB')
total_sg = sgs_aggregate(sgs, lambda a: '')[0]
print('Cycles per request: %d' % (total_sg.get('CPU_CLK_UNHALTED')))
print('Instructions per request: %d' % (total_sg.get('INST_RETIRED:ANY_P')))
print('\n')
td = TopDown(total_sg)
print('CPI: %5.2f' % (td.get('cpi')))
print('Retiring: %5.2f%%' % (td.get('l1_retiring') * 100))
print(' General Retirement: %5.2f%%' % (td.get('l2_general_retirement') * 100))
print(' Microcode Sequence: %5.2f%%' % (td.get('l2_microcode_sequencer') * 100))
print('Frontend Bound: %5.2f%%' % (td.get('l1_frontend_bound') * 100))
print(' Latency: %5.2f%%' % (td.get('l2_frontend_latency') * 100))
print(' Bandwidth: %5.2f%%' % (td.get('l2_frontend_bandwidth') * 100))
print('Backend Bound: %5.2f%%' % (td.get('l1_backend_bound') * 100))
print(' Memory Bound: %5.2f%%' % (td.get('l2_memory_bound') * 100))
print(' Core Bound: %5.2f%%' % (td.get('l2_core_bound') * 100))
print('Bad Speculation: %5.2f%%' % (td.get('l1_bad_speculation') * 100))
print(' Branch Mispredicts: %5.2f%%' % (td.get('l2_branch_mispredicts') * 100))
print(' Machine Clears: %5.2f%%' % (td.get('l2_machine_clears') * 100))
print('\n\n\n')
td = TopDownAbs(total_sg)
print('Retiring: %15u' % (td.get('l1_retiring') / 4))
print(' General Retirement: %15u' % (td.get('general_retirement') / 4))
print(' Microcode Sequence: %15u' % (td.get('microcode_sequencer') / 4))
print('Frontend Bound: %15u' % (td.get('l1_frontend_bound') / 4))
print(' Latency: %15u' % (td.get('frontend_latency') / 4))
print(' Bandwidth: %15u' % (td.get('frontend_bandwidth') / 4))
print('Backend Bound: %15u' % (td.get('l1_backend_bound') / 4))
print(' Memory Bound: %15u' % (td.get('memory_bound') / 4))
print(' Core Bound: %15u' % (td.get('core_bound') / 4))
print('Bad Speculation: %15u' % (td.get('l1_bad_speculation') / 4))
print(' Branch Mispredicts: %15u' % (td.get('branch_mispredicts') / 4))
print(' Machine Clears: %15u' % (td.get('machine_clears') / 4))
print('\n\n\n')
print('Breakdown to categories (cycles, cpi)')
cat_sgs = sgs_aggregate(sgs, classifier)
metric_table(cat_sgs, 'CPU_CLK_UNHALTED', ['CPU_CLK_UNHALTED', 'cpi'], TopDown,
col_format=['%5d', '%5.2f'])
print('\n\n\n')
print('Top 15 symbols')
sym_sgs = sgs_aggregate(sgs, lambda a: (a['dso'], a['symbol']))
metric_table(sym_sgs, 'CPU_CLK_UNHALTED', ['CPU_CLK_UNHALTED'], TopDown,
max_rows=15,
attr_map=lambda a: a[0] + ':' + a[1])
#sys.exit(0)
print('\n\n\n')
print('Top 10 symbols Backend bound')
sym_sgs = sgs_aggregate(sgs, lambda a: (a['dso'], a['symbol']))
metric_table(sym_sgs, 'l1_backend_bound', ['l1_backend_bound'], TopDownAbs,
max_rows=10,
attr_map=lambda a: a[0] + ':' + a[1])
print('\n\n\n')
print('Top 10 symbols Frontend bound')
sym_sgs = sgs_aggregate(sgs, lambda a: (a['dso'], a['symbol']))
metric_table(sym_sgs, 'l1_frontend_bound', ['l1_frontend_bound'], TopDownAbs,
max_rows=10,
attr_map=lambda a: a[0] + ':' + a[1])
print('\n\n\n')
print('Top 10 symbols Bad Speculation')
sym_sgs = sgs_aggregate(sgs, lambda a: (a['dso'], a['symbol']))
metric_table(sym_sgs, 'l1_bad_speculation', ['l1_bad_speculation'], TopDownAbs,
max_rows=10,
attr_map=lambda a: a[0] + ':' + a[1])
sgs_other = sgs_aggregate(
sgs_filter(sgs, lambda a: classify_linux(a) == 'k-other'),
lambda a: a['symbol'])
metric_table(sgs_other, 'CPU_CLK_UNHALTED', ['CPU_CLK_UNHALTED'], TopDown,
col_format='%04.3f', cutoff_frac=100)
print('\n\n\n')
sgs_dns = sgs_aggregate(
sgs_filter(sgs, lambda a: a['symbol'] == 'exit_dns_resolver'),
lambda a: hex(a['ip']))
metric_table(sgs_dns, 'CPU_CLK_UNHALTED', ['CPU_CLK_UNHALTED'], TopDown)
sys.exit(0)
metric_table(sym_sgs, 'CPU_CLK_UNHALTED', ['CPU_CLK_UNHALTED',
'CYCLE_ACTIVITY:STALLS_MEM_ANY', 'cpi'], TopDown, cutoff_frac=99,
col_format=['%15u', '%15u', '%05.2f'],
attr_width=50, attr_map=lambda x: x[0] + '.' + x[1])
print('\n\n\n')
metric_table(sym_sgs, 'CPU_CLK_UNHALTED',
['l1_retiring', 'l1_frontend_bound', 'l1_backend_bound', 'l1_bad_speculation'],
TopDown, cutoff_frac=99, col_format='%04.3f',
attr_width=50, attr_map=lambda x: x[0] + '.' + x[1])
print('\n\n\n')
metric_table(sym_sgs, 'CYCLE_ACTIVITY:STALLS_MEM_ANY',
['CYCLE_ACTIVITY:STALLS_MEM_ANY'], TopDown, cutoff_frac=99, col_format='%15u',
attr_width=50, attr_map=lambda x: x[0] + '.' + x[1])
print('\n\n\n')
metric_table(sym_sgs, 'l1_backend_bound',
['l1_backend_bound', 'memory_bound', 'core_bound'], TopDownAbs, cutoff_frac=99,
col_format='%15u',
attr_width=50, attr_map=lambda x: x[0] + '.' + x[1])
print('\n\n\n')
metric_table(sgs, 'l1_backend_bound',
['l1_backend_bound', 'memory_bound', 'core_bound', 'CPU_CLK_UNHALTED'],
TopDownAbs, cutoff_frac=95,
col_format='%15u',
attr_width=50, attr_map=lambda x: x['symbol'] + '.' + hex(x['symoff']))
from profiling import *
from topdown import *
from report_utils import *
from splittcp import *
import sys
def linux_component(a):
    """Top-level component of a sample: 'kernel' for vmlinux, else 'app'."""
    return 'kernel' if a['dso'] == 'vmlinux' else 'app'
def linux_subcomponent(a):
    """Second-level category of a sample taken on a Linux system.

    Kernel samples are delegated to classify_linux_kernel; user-space
    samples are bucketed by the prefix of their DSO name.
    """
    dso = a['dso']
    if dso == 'vmlinux':
        return classify_linux_kernel(a)
    if dso.startswith(('libc', 'libpthread')):
        return 'libc'
    if dso.startswith(('echo', 'flexkvs')):
        return 'app'
    return 'other'
def ix_component(a):
    """Top-level component of a sample from an IX run, keyed by DSO name."""
    dso = a['dso']
    if dso == 'ix':
        return 'ix'
    if dso == 'echoserver':
        return 'app'
    return 'other'
def ix_subcomponent(a):
    """Second-level category of a sample from an IX run.

    Samples inside the 'ix' DSO are delegated to classify_ix_symbol;
    'echoserver' samples are 'app'; everything else is 'other'.
    """
    dso = a['dso']
    if dso == 'ix':
        return classify_ix_symbol(a)
    if dso == 'echoserver':
        return 'app'
    return 'other'
# Classifier wrapper: forwards the attribute record `a` to classify_stcp and
# returns its result unchanged.
def tas_component(a):
return classify_stcp(a)
# Classifier wrapper: forwards the attribute record `a` to classify_stcp_app
# and returns its result unchanged.
def tas_app_component(a):
return classify_stcp_app(a)
# Classifier that groups samples by the 'symbol' entry of the attribute
# record `a`.
def class_symbol(a):
return a['symbol']
# Classifier that groups samples by the 'symoff' entry of the attribute
# record `a`, rendered with hex().
def class_symoff(a):
return hex(a['symoff'])
#(md, sgs) = load_exp_file(sys.argv[1])
(md, sgs) = load_file(sys.argv[1])
# Select the thread ids to keep and the list of classifiers (outermost level
# first) for the requested mode (second command-line argument).  An
# unrecognised mode stops with a clear message; previously it fell through
# and failed later with a NameError on `classifiers`.
mode = sys.argv[2]
if mode == 'linux':
    tids = find_tids_prefix(md, 'echo-w')
    classifiers = [linux_component, linux_subcomponent]
elif mode == 'ix':
    tids = find_tids_prefix(md, 'ix')
    classifiers = [ix_component, ix_subcomponent]
elif mode == 'tas-fp':
    tids = find_tids_prefix(md, 'stcp-fp')
    classifiers = [tas_component]
elif mode == 'tas-app':
    tids = find_tids_prefix(md, 'flexkvs')
    classifiers = [tas_app_component]
elif mode == 'tas':
    #tids_app = find_tids_prefix(md, 'flexkvs-w')
    tids_app = find_tids_prefix(md, 'flexkvs-ll')
    tids_fp = find_tids_prefix(md, 'stcp')
    tids = tids_app + tids_fp
    # First level: which binary the sample came from; second level: the
    # classifier matching that binary.
    disc_app_fp = lambda a: 'tas' if a['dso'] == 'splittcp' else 'app'
    disc_comp = lambda a: tas_component(a) if disc_app_fp(a) == 'tas' else tas_app_component(a)
    classifiers = [disc_app_fp, disc_comp]
else:
    sys.exit('unknown mode %r (expected linux, ix, tas-fp, tas-app or tas)'
             % mode)
#classifiers += [class_symbol, class_symoff]
classifiers += [class_symbol]
sgs = sgs_filter(sgs, lambda a: a['tid'] in tids)
sgs = sgs_aggregate(sgs, lambda a: a.mask_out('tid'))
# estimate 'real' values for counters
balance_counters(sgs)
# scale to per-request
scale_counters(sgs, 1 / (float(md['duration'] * md['throughput'])))
cols = ['l1_retiring', 'l1_frontend_bound', 'l1_backend_bound',
'l1_bad_speculation']
#cols = []
hierarchic_table(sgs, 'CPU_CLK_UNHALTED', cols, TopDownAbs, classifiers, cutoff=1)
#hierarchic_table(sgs, 'INST_RETIRED:ANY_P', TopDownAbs, classifiers, cutoff=1)
#hierarchic_table(sgs, 'l1_backend_bound', TopDownAbs, classifiers, cutoff=1)
import json
import collections.abc
class SGAttrs(collections.abc.Hashable):
    """Hashable record of the attributes that identify a sample group.

    The record has seven named fields (see ``_FIELDS``).  Instances are never
    modified by their own methods: ``replace``, ``mask_out`` and ``subset``
    all return a new ``SGAttrs``.  Fields are read with ``attrs['name']``.
    Equality and hashing are based on the tuple of all field values, so
    instances can be used as dictionary keys.
    """

    # Field names, in constructor-argument order.
    _FIELDS = ('ip', 'pid', 'tid', 'symbol', 'symoff', 'dso', 'cust')

    def __init__(self, ip, pid, tid, symbol, symoff, dso, cust=None):
        self._ip = ip
        self._pid = pid
        self._tid = tid
        self._symbol = symbol
        self._symoff = symoff
        self._dso = dso
        self._cust = cust

    def mask_out(self, field):
        """Return a copy with `field` set to None.

        Raises KeyError if `field` is not a known field name.
        """
        return self.replace(field, None)

    def subset(self, fields):
        """Return a copy keeping only the named fields; all others are None.

        Raises KeyError if any entry of `fields` is not a known field name
        (the original attempted ``raise 'Bad field'``, which is itself a
        TypeError in Python 3).
        """
        kept = dict.fromkeys(self._FIELDS)
        for field in fields:
            kept[field] = self[field]
        return SGAttrs(**kept)

    def replace(self, field, value):
        """Return a copy with `field` set to `value`.

        Raises KeyError if `field` is not a known field name.
        """
        if field not in self._FIELDS:
            raise KeyError(field)
        values = dict(zip(self._FIELDS, self._tuple()))
        values[field] = value
        return SGAttrs(**values)

    def _tuple(self):
        """All field values as a tuple, in ``_FIELDS`` order."""
        return (self._ip, self._pid, self._tid, self._symbol, self._symoff,
                self._dso, self._cust)

    def __getitem__(self, field):
        """Return the value of `field`; raises KeyError for unknown names."""
        if field not in self._FIELDS:
            raise KeyError(field)
        return getattr(self, '_' + field)

    def __eq__(self, other):
        # Comparing against an unrelated type (e.g. a str or tuple key that
        # shares a dict with SGAttrs keys) must not raise AttributeError.
        if not isinstance(other, SGAttrs):
            return NotImplemented
        return self._tuple() == other._tuple()

    def __hash__(self):
        return hash(self._tuple())
class SampleGroup(object):
    """Counter values accumulated for one group of samples.

    ``events`` maps a counter name to a dict with keys 'value', 'time_e'
    and 'time_r'; ``attrs`` holds whatever identifies the group.
    """

    def __init__(self):
        self.events = {}
        self.attrs = {}

    def merge_in(self, other):
        """Add every counter of `other` into this group."""
        for name, src in other.events.items():
            if name not in self.events:
                self.events[name] = {'value': 0, 'time_e': 0, 'time_r': 0}
            dst = self.events[name]
            dst['value'] += src['value']
            dst['time_r'] += src['time_r']
            dst['time_e'] += src['time_e']

    def add_event(self, ev, value, time_e, time_r):
        """Add one reading for counter `ev`, creating the entry if needed."""
        if ev in self.events:
            entry = self.events[ev]
            entry['value'] += value
            entry['time_e'] += time_e
            entry['time_r'] += time_r
        else:
            self.events[ev] = {'value': value, 'time_e': time_e,
                               'time_r': time_r}

    def get(self, ev):
        """Return the accumulated value of counter `ev`, or None if absent."""
        if ev in self.events:
            return self.events[ev]['value']
        return None
class SGAnalysis(object):
    """Base class for metrics derived from one sample group.

    ``get(name)`` returns the raw counter when the group has it; otherwise
    it calls the method ``get_<name>`` on this object.  The arithmetic
    helpers propagate None so a missing counter yields a missing metric.
    """

    def __init__(self, sg):
        self.sg = sg

    def ctr(self, ev):
        """Raw counter value from the sample group (None if absent)."""
        return self.sg.get(ev)

    def get(self, ev):
        """Raw counter `ev` if present, else the result of ``get_<ev>()``."""
        raw = self.sg.get(ev)
        if raw is None:
            return getattr(self, 'get_' + ev)()
        return raw

    def add(self, a, b):
        """a + b, or None if either operand is None."""
        return None if (a is None or b is None) else a + b

    def sub(self, a, b):
        """a - b, or None if either operand is None."""
        return None if (a is None or b is None) else a - b

    def mul(self, a, b):
        """a * b, or None if either operand is None."""
        return None if (a is None or b is None) else a * b

    def div(self, a, b):
        """a / b as float; None if an operand is None, inf if b is zero."""
        if a is None or b is None:
            return None
        if b == 0:
            return float('inf')
        return float(a) / float(b)
def load_file(path):
    """Load a trace from the JSON file at `path`.

    Returns a (metadata, list of sample groups) pair, as produced by
    load_from_object on the file's top-level object.
    """
    with open(path, 'r') as trace_file:
        raw = json.load(trace_file)
    return load_from_object(raw)
def load_exp_file(path):
    """Load a trace embedded in the JSON file at `path`.

    The trace object is read from ``['profiler']['data']`` of the file's
    top-level object and passed to load_from_object.
    """
    with open(path, 'r') as exp_file:
        raw = json.load(exp_file)
    return load_from_object(raw['profiler']['data'])
def load_from_object(data):
    """Convert a decoded trace object into (metadata, sample groups).

    Metadata is every top-level entry except 'events'.  Each non-empty
    entry of data['events'] becomes one SampleGroup carrying an SGAttrs
    record and one reading per counter named in data['event_descs'];
    counters missing from an event are recorded as zero.  The resulting
    list is passed through patch_raw_sgs before being returned.
    """
    metadata = {key: data[key] for key in data if key != 'events'}
    counters = list(data['event_descs'].keys())

    sgs = []
    for ev in data['events']:
        if len(ev.keys()) == 0:
            # Empty placeholder object in the events array.
            continue

        sg = SampleGroup()
        sg.attrs = SGAttrs(ev['ip'], ev['pid'], ev['tid'], ev['symbol'],
                           ev['symoff'], ev['dso'], None)
        for cnt in counters:
            recorded = ev['counters']
            if cnt in recorded.keys():
                reading = recorded[cnt]
                sg.add_event(cnt, reading['value'], reading['time_enabled'],
                             reading['time_running'])
            else:
                sg.add_event(cnt, 0, 0, 0)
        sgs.append(sg)

    return (metadata, patch_raw_sgs(sgs))
def patch_raw_sgs(sgs):
    """Patch known mis-attributions in a raw sample group list (in place).

    * Symbol '__nss_group_lookup' is renamed to 'memcpy' (the glibc symbol
      reported for these samples is off for an unknown reason).
    * Samples whose ip lies in [0xfffffe0000000000, 0xffffff0000000000)
      are attributed to '__entry_SYSCALL_64_trampoline' in 'vmlinux'; the
      symbol is missing from the symbol table of current kernels (maybe
      present from 4.16 on).

    Returns the same list object.
    """
    range_lo = 0xfffffe0000000000
    range_hi = 0xffffff0000000000
    for sg in sgs:
        attrs = sg.attrs
        if attrs['symbol'] == '__nss_group_lookup':
            sg.attrs = attrs.replace('symbol', 'memcpy')
            continue
        if range_lo <= attrs['ip'] < range_hi:
            renamed = attrs.replace('symbol', '__entry_SYSCALL_64_trampoline')
            sg.attrs = renamed.replace('dso', 'vmlinux')
    return sgs
def balance_counters(sgs):
    """Normalize all counters to the same enabled time (modifies in place).

    For each counter the total enabled and running times over all groups
    give one factor, enabled / running (0 when the counter never ran).
    Every group's 'value' and 'time_r' for that counter are multiplied by
    the factor and truncated to int.
    """
    total_running = {}
    total_enabled = {}
    for sg in sgs:
        for name, ev in sg.events.items():
            if name not in total_running:
                total_running[name] = 0
                total_enabled[name] = 0
            total_running[name] += ev['time_r']
            total_enabled[name] += ev['time_e']

    fracs = {}
    for name, ran in total_running.items():
        if ran != 0:
            fracs[name] = float(total_enabled[name]) / float(ran)
        else:
            fracs[name] = 0

    for sg in sgs:
        for name, ev in sg.events.items():
            factor = fracs[name]
            ev['value'] = int(ev['value'] * factor)
            ev['time_r'] = int(ev['time_r'] * factor)
def scale_counters(sgs, fac):
    """Multiply every counter's value and both times by `fac`, in place."""
    for sg in sgs:
        for ev in sg.events.values():
            for field in ('value', 'time_r', 'time_e'):
                ev[field] *= fac
def find_tid(md, tname):
    """Return the id (as int) of the first thread whose comm equals `tname`.

    Searches every process in md['procs']; returns None if nothing matches.
    """
    for proc in md['procs'].values():
        for tid, thread in proc['threads'].items():
            if thread['comm'] == tname:
                return int(tid)
    return None
def find_tids_prefix(md, tname):
    """Return the ids (as ints) of all threads whose comm starts with `tname`.

    Threads are visited process by process in md['procs'] order.
    """
    return [int(tid)
            for proc in md['procs'].values()
            for tid, thread in proc['threads'].items()
            if thread['comm'].startswith(tname)]
def find_pid(md, pname):
    """Return the pid (as int) of the first process whose comm equals `pname`.

    Returns None when no process in md['procs'] matches.
    """
    for pid, proc in md['procs'].items():
        if proc['comm'] == pname:
            return int(pid)
    return None
def find_tids_proc(md, pname):
    """Return the ids (as ints) of all threads of processes named `pname`.

    Every process in md['procs'] whose comm equals `pname` contributes all
    of its threads, in md order.  Returns an empty list when no process
    matches.  (An unused pid lookup that the original performed first has
    been removed; the result is unchanged.)
    """
    tids = []
    for proc in md['procs'].values():
        if proc['comm'] != pname:
            continue
        tids.extend(int(tid) for tid in proc['threads'].keys())
    return tids
def sgs_filter(sgs, pred):
    """Return a new list of the groups whose attrs satisfy `pred`."""
    return [sg for sg in sgs if pred(sg.attrs)]
def sgs_aggregate(sgs, mapper):
    """Merge sample groups that `mapper` maps to the same key.

    `mapper` receives each group's attrs and returns a hashable key.  One
    new SampleGroup is created per distinct key, with that key as its
    attrs and the counters of all matching groups merged in.  Groups are
    returned in order of first appearance of their key.
    """
    merged_by_key = {}
    for sg in sgs:
        key = mapper(sg.attrs)
        if key in merged_by_key:
            target = merged_by_key[key]
        else:
            target = SampleGroup()
            target.attrs = key
            merged_by_key[key] = target
        target.merge_in(sg)
    return list(merged_by_key.values())
def calculate_icache_footprint(sgs, primary='CPU_CLK_UNHALTED',
        cutoff_frac=.99):
    """Count the distinct 64-byte-aligned code addresses covering the samples.

    Groups are visited from the largest `primary` counter value down; each
    contributes its ip with the low six bits cleared.  Counting stops once
    the visited groups account for more than `cutoff_frac` of the total
    `primary` value.

    A group without the `primary` counter is treated as having value 0
    (previously this raised TypeError while sorting).  When the total is 0
    the cutoff cannot be evaluated, so every group is counted (previously
    this raised ZeroDivisionError).

    Returns the number of distinct aligned addresses (0 for empty input).
    """
    def weight(sg):
        val = sg.get(primary)
        return 0 if val is None else val

    total_primary = sum(weight(sg) for sg in sgs)

    lines = set()
    covered = 0
    for sg in sorted(sgs, key=weight, reverse=True):
        lines.add(sg.attrs['ip'] & ~63)
        covered += weight(sg)
        if total_primary != 0 and float(covered) / total_primary > cutoff_frac:
            break
    return len(lines)
from profiling import *
def metric_table(sgs, primary, cols, analysis, cutoff_frac=None,
        attr_map=lambda x:x, attr_width=50, col_format='%15u',
        max_rows=None):
    """Print one row per sample group, largest `primary` metric first.

    Each row holds the group's label (``attr_map(sg.attrs)`` left-justified
    to `attr_width`), its share of the total `primary` metric in percent,
    and one column per entry of `cols`.  `analysis` is called on a group to
    obtain an object whose ``get(name)`` yields metric values.  `col_format`
    is either one %-format string used for all columns or a sequence with
    one format per column.  Output stops after the row that pushes the
    cumulative percentage above `cutoff_frac`, or after `max_rows` rows.
    """
    total = 0
    for sg in sgs:
        val = analysis(sg).get(primary)
        if val is not None:
            total += val

    def primary_of(group):
        return analysis(group).get(primary)

    single_format = isinstance(col_format, str)
    cumulative = 0
    ranked = sorted(sgs, key=primary_of, reverse=True)
    for shown, sg in enumerate(ranked, start=1):
        an = analysis(sg)
        share = float(an.get(primary)) / total * 100

        row = attr_map(sg.attrs).ljust(attr_width)
        row += ' %05.2f%% ' % (share)
        for idx, col in enumerate(cols):
            fmt = col_format if single_format else col_format[idx]
            row += (' ' + fmt) % (an.get(col))
        print(row)

        cumulative += share
        if cutoff_frac is not None and cumulative > cutoff_frac:
            break
        if max_rows is not None and shown >= max_rows:
            break
def hierarchic_table(sgs, primary, cols, analysis, classifiers, cutoff=None):
    """Print a tab-indented breakdown of `sgs` by successive classifiers.

    All groups are merged into a single 'top' group, which becomes the
    root row; hierarchic_helper then prints it and recurses through
    `classifiers` level by level.
    """
    everything = SampleGroup()
    for sg in sgs:
        everything.merge_in(sg)

    hierarchic_helper(sgs, primary, cols, analysis, 'top', everything,
                      classifiers, 0, cutoff)
def hierarchic_helper(sgs, primary, cols, analysis, top_class, top_sg,
        classifiers, indent, cutoff=None):
    """Print the row for one node of the hierarchy, then its children.

    `top_sg` is the merged group for this node and `top_class` its label.
    The first entry of `classifiers` (if any) splits `sgs` into child
    nodes, which are printed in descending order of the `primary` metric,
    one tab deeper, using the remaining classifiers.  A node whose primary
    metric is below `cutoff` (a missing metric counts as -1) is skipped
    together with its children.
    """
    classifier = classifiers[0] if len(classifiers) >= 1 else None

    # label -> (merged group for the label, member groups)
    buckets = {}
    if classifier:
        for sg in sgs:
            label = classifier(sg.attrs)
            if label not in buckets:
                buckets[label] = (SampleGroup(), [])
            merged, members = buckets[label]
            merged.merge_in(sg)
            members.append(sg)

    prefix = '\t' * indent
    an = analysis(top_sg)
    pri = an.get(primary)
    if pri is None:
        pri = -1

    if cutoff is not None and pri < cutoff:
        return

    cells = ''
    for col in cols:
        cells += ('\t%d') % (an.get(col))
    print('%s%s\t%15d%s' % (prefix, top_class, pri, cells))

    if len(classifiers) >= 1:
        def bucket_primary(item):
            return analysis(item[1][0]).get(primary)

        ranked = sorted(buckets.items(), key=bucket_primary, reverse=True)
        for label, (merged, members) in ranked:
            hierarchic_helper(members, primary, cols, analysis, label, merged,
                              classifiers[1:], indent + 1, cutoff)
def classify_stcp(attrs):
    """Map a sample's symbol to a category name, 'other' if unlisted.

    Categories are checked in the order they are listed below.
    """
    symbol = attrs['symbol']
    categories = {
        'tx_phase1': {
            'poll_queues',
            'fast_appctx_poll_pf',
            'fast_appctx_poll_fetch',
            'fast_appctx_poll_bump',
            'qman_set',
            'utils_rng_gen32',
            'fast_flows_bump',
            'poll_qman_fwd.isra.6',
        },
        'tx_phase2': {
            'qman_poll',
            'poll_qman',
            'fast_flows_qman',
            'fast_flows_qman_pfbufs',
            'fast_flows_qman_pf',
            'flow_tx_read',
        },
        'rx': {
            'fast_flows_packet',
            'fast_actx_rxq_probe',
            'fast_flows_packet_fss',
            'poll_rx',
            'fast_flows_packet_parse',
            'arx_cache_flush',
            'fast_actx_rxq_alloc',
        },
        'common': {
            'dataplane_loop',
            'bufcache_prealloc',
            'qman_timestamp',
            'rte_memcpy_generic',
            'common_ring_mc_dequeue',
            'common_ring_mp_enqueue',
        },
        'driver': {
            'i40e_recv_pkts_vec',
            'i40e_xmit_pkts',
        },
        'kernel': {
            'poll_kernel',
            'fast_kernel_poll',
        },
        'slowpath': {
            'nicif_poll',
            'nicif_connection_stats',
            'cc_poll',
            'nicif_connection_setrate',
            'appif_ctx_poll',
        },
    }

    for category, symbols in categories.items():
        if symbol in symbols:
            return category
    return 'other'
def classify_stcp_alt(attrs):
    """Alternative symbol-to-category mapping, 'other' if unlisted.

    Uses the categories tx / rx / qman / common / kernel, checked in the
    order they are listed below.
    """
    symbol = attrs['symbol']
    categories = {
        'tx': {
            'poll_queues',
            'fast_actx_rxq_probe',
            'fast_appctx_poll_pf',
            'fast_appctx_poll_fetch',
            'fast_appctx_poll_bump',
        },
        'rx': {
            'fast_flows_packet',
            'i40e_recv_pkts_vec',
            'fast_flows_packet_fss',
            'poll_rx',
            'fast_flows_packet_parse',
            'arx_cache_flush',
            'fast_actx_rxq_alloc',
        },
        'qman': {
            'fast_flows_bump',
            'qman_poll',
            'qman_set',
            'fast_flows_qman',
            'i40e_xmit_pkts',
            'flow_tx_read',
            'poll_qman',
            'fast_flows_qman_pfbufs',
            'poll_qman_fwd.isra.6',
            'utils_rng_gen32',
            'fast_flows_qman_pf',
        },
        'common': {
            'dataplane_loop',
            'bufcache_prealloc',
            'qman_timestamp',
            'rte_memcpy_generic',
            'common_ring_mc_dequeue',
            'common_ring_mp_enqueue',
        },
        'kernel': {
            'poll_kernel',
            'fast_kernel_poll',
        },
    }

    for category, symbols in categories.items():
        if symbol in symbols:
            return category
    return 'other'
def classify_stcp_app(attrs):
    """Map an application-side symbol to a category name.

    Returns 'sockets', 'stack', 'app' or 'copies' (for memcpy) when the
    symbol is listed, otherwise 'other'.  Categories are checked in that
    order.
    """
    symbol = attrs['symbol']
    ordered_categories = (
        ('sockets', {
            'lwip_epoll_wait',
            'lwip_read',
            'lwip_write',
            'flextcp_epoll_set',
            'flextcp_sockctx_poll',
            'flextcp_fd_slookup',
            'flextcp_sockctx_get',
            'flextcp_fd_release',
            'write',
            'read',
            '__tls_get_addr',
            'flextcp_epoll_clear',
            'flextcp_sockctx_poll_n',
            'epoll_wait',
            'flextcp_fd_elookup',
            '.plt',
        }),
        ('stack', {
            'flextcp_context_poll',
            'flextcp_context_tx_alloc',
            'flextcp_context_tx_done',
            'flextcp_connection_tx_send',
            'flextcp_connection_rx_done',
            'flextcp_connection_tx_alloc2',
            'flextcp_conn_txbuf_available',
            'flextcp_connection_tx_possible',
            'fastpath_poll_vec',
            'util_timeout_time_us',
            'conns_bump',
            'txq_probe',
            'kernel_poll',
        }),
        ('app', {
            'thread_run',
        }),
        ('copies', {
            'memcpy',
        }),
    )

    for category, symbols in ordered_categories:
        if symbol in symbols:
            return category
    return 'other'
def classify_linux(attrs):
    """Category of a sample from a Linux run, keyed by its DSO.

    Kernel samples get 'k-' plus the result of classify_linux_kernel;
    'echoserver_linux' is 'app'; libc and libpthread are 'libc';
    everything else is 'other'.
    """
    dso = attrs['dso']
    if dso == 'vmlinux':
        return 'k-' + classify_linux_kernel(attrs)
    if dso == 'echoserver_linux':
        return 'app'
    if dso.startswith(('libc', 'libpthread')):
        return 'libc'
    return 'other'
def classify_ix(attrs):
    """Category of a sample from an IX run, keyed by its DSO.

    'echoserver' is 'app', 'ix' is 'ix', libc and libpthread are 'libc',
    everything else is 'other'.
    """
    dso = attrs['dso']
    if dso == 'echoserver':
        return 'app'
    if dso == 'ix':
        return 'ix'
    if dso.startswith(('libc', 'libpthread')):
        return 'libc'
    return 'other'
# Linux kernel symbol names grouped by the category they are reported under.
# Built once at import time instead of on every classify_linux_kernel() call.
# Lookup is by exact string, so compiler-suffixed names (".isra.N", ".part.N",
# ".constprop.N") are listed verbatim and only match when the suffix is
# identical.
_LINUX_KERNEL_CATEGORIES = {
    'tcp_tx': frozenset([
        'tcp_clean_rtx_queue',
        'tcp_transmit_skb',
        'tcp_sendmsg',
        'tcp_write_xmit',
        'tcp_schedule_loss_probe',
        'tcp_wfree',
        'tcp_event_new_data_sent',
        'tcp_established_options',
        '__tcp_select_window',
        'skb_entail',
        '__tcp_v4_send_check',
        'tcp_push',
        'tcp_send_delayed_ack',
        'tcp_send_mss',
        'tcp_options_write',
        'tcp_current_mss',
        '__tcp_push_pending_frames',
        'tcp_v4_send_check',
        'tcp_nagle_check',
        'tcp_init_tso_segs',
        'tcp_send_ack',
        'tcp_delack_timer_handler',
        'tcp_delack_timer',
        'tcp_sendmsg_locked',
        'tcp_tso_segs',
        'tcp_rate_check_app_limited',
        'tcp_rate_skb_sent',
        'tcp_small_queue_check.isra.28',
    ]),
    'tcp_rx': frozenset([
        'tcp_ack',
        'tcp_v4_rcv',
        'tcp_recvmsg',
        'tcp_rcv_established',
        'tcp_poll',
        'tcp_gro_receive',
        'tcp_rearm_rto',
        'tcp_event_data_recv',
        'tcp_cleanup_rbuf',
        'tcp_queue_rcv',
        'tcp_update_pacing_rate',
        'tcp4_gro_receive',
        'dctcp_update_alpha',
        'dctcp_cwnd_event',
        'tcp_v4_early_demux',
        'tcp_v4_inbound_md5_hash',
        'tcp_rcv_space_adjust',
        'tcp_check_space',
        '__tcp_ecn_check_ce',
        'tcp_md5_do_lookup',
        'tcp_stream_memory_free',
        'tcp_filter',
        'tcp_v4_do_rcv',
        'tcp_prequeue',
        'tcp_release_cb',
        'tcp_parse_md5sig_option',
        'tcp_reno_cong_avoid',
        'tcp_rack_advance',
        'tcp_v4_md5_lookup',
        '__tcp_ack_snd_check',
        'tcp_parse_aligned_timestamp.part.41',
        'bictcp_cwnd_event',
        'bictcp_cong_avoid',
        'tcp_data_queue',
        'tcp_ack_update_rtt.isra.33',
        'bictcp_acked',
        'tcp_rate_gen',
        'tcp_v4_fill_cb',
        'tcp_rate_skb_delivered',
    ]),
    'ip': frozenset([
        'ip_rcv',
        'ip_finish_output',
        'ip_finish_output2',
        'inet_gro_receive',
        'ip_queue_xmit',
        '__inet_lookup_established',
        'inet_ehashfn',
        'inet_recvmsg',
        'inet_sendmsg',
        '__ip_local_out',
        'ip_rcv_finish',
        'ipv4_mtu',
        'ip_local_deliver_finish',
        'ip_output',
        'ip_local_out',
        'ip_local_deliver',
        'ipv4_dst_check',
        'ip_send_check',
        'ip_copy_addrs',
        'raw_local_deliver',
        # questionably IP
        'packet_rcv',
        '__netif_receive_skb_core',
        'netif_skb_features',
        'netif_receive_skb_internal',
        'validate_xmit_skb.isra.103.part.104',
        'netdev_pick_tx',
        '__netdev_pick_tx',
        'dev_gro_receive',
        'sch_direct_xmit',
        'eth_type_trans',
        'skb_network_protocol',
    ]),
    'timers': frozenset([
        'mod_timer',
        'internal_add_timer',
        '__internal_add_timer',
        'lock_timer_base.isra.34',
        'ktime_get_with_offset',
        'sk_reset_timer',
        'detach_if_pending',
        'get_nohz_timer_target',
        'read_tsc',
        'native_sched_clock',
        '__usecs_to_jiffies',
        'tcp_chrono_start',
        'tcp_rearm_rto.part.61',
        'sched_clock',
    ]),
    'bufmgt': frozenset([
        '__slab_free',
        'skb_release_data',
        '__skb_clone',
        'kmem_cache_alloc',
        'kmem_cache_free',
        '__alloc_skb',
        'skb_release_head_state',
        'kmem_cache_alloc_node',
        '__kmalloc_node_track_caller',
        '__copy_skb_header',
        '___slab_alloc',
        '__build_skb',
        'fput',
        'consume_skb',
        'cmpxchg_double_slab.isra.55',
        '__cmpxchg_double_slab.isra.47',
        'ksize',
        '__free_page_frag',
        'kfree_skbmem',
        'kmalloc_slab',
        'skb_release_all',
        'skb_clone',
        'skb_copy_datagram_iter',
        'kfree',
        'sk_stream_alloc_skb',
        '__slab_alloc',
        '__kfree_skb',
        'swiotlb_map_page',
        'skb_push',
        'sk_free',
        'put_cpu_partial',
        'skb_put',
        'iommu_should_identity_map',
        'get_page_from_freelist',
        'intel_map_page',
        'intel_mapping_error',
        'free_one_page',
        'iommu_no_mapping',
        'dma_get_required_mask',
        '__check_heap_object',
        'prefetch_freepointer',
        'device_has_rmrr',
        'device_is_rmrr_locked',
        'page_frag_free',
        'intel_unmap',
        '__free_pages_ok',
        '__intel_map_single',
        '__alloc_pages_nodemask',
        'build_skb',
        'intel_unmap_page',
        '__kmalloc_reserve.isra.43',
    ]),
    'sync': frozenset([
        '_raw_spin_lock',
        '_raw_spin_lock_bh',
        '_raw_spin_unlock_irqrestore',
        '_raw_spin_lock_irqsave',
        'lock_sock_nested',
        'release_sock',
        '__local_bh_enable_ip',
        '__wake_up_common',
        'rcu_irq_exit',
        'wake_up_nohz_cpu',
        'cmpxchg_double_slab.isra.61',
        '__cmpxchg_double_slab.isra.51',
        'lock_timer_base',
        '__wake_up_common_lock',
        'rcu_all_qs',
        'native_queued_spin_lock_slowpath',
        '_cond_resched',
    ]),
    'sockets': frozenset([
        'fsnotify',
        'aa_label_sk_perm',
        '__fsnotify_parent',
        'ep_send_events_proc',
        '__fget',
        'copy_user_generic_string',
        'aa_file_perm',
        'aa_sk_perm',
        'aa_sock_msg_perm',
        'common_file_perm',
        'sock_read_iter',
        'rw_verify_area',
        'new_sync_write',
        'new_sync_read',
        'security_file_permission',
        'sk_filter_trim_cap',
        '__fget_light',
        'sock_write_iter',
        'vfs_read',
        'copy_from_iter',
        'copy_to_iter',
        'sock_def_readable',
        'sock_poll',
        'sock_put',
        'vfs_write',
        'sock_rfree',
        'ep_poll_callback',
        '__sk_dst_check',
        'SyS_write',
        '__vfs_write',
        'security_socket_recvmsg',
        'apparmor_file_permission',
        'iov_iter_init',
        'security_socket_sendmsg',
        '__bpf_prog_run',
        '__cgroup_bpf_run_filter_skb',
        'entry_SYSCALL_64',
        'entry_SYSCALL_64_after_swapgs',
        'entry_SYSCALL_64_fastpath',
        'sock_sendmsg',
        'SyS_read',
        'security_sock_rcv_skb',
        '__memcpy',
        '__memmove',
        'sock_recvmsg',
        'syscall_return_via_sysret',
        'copy_user_enhanced_fast_string',
        'entry_SYSCALL_64_stage2',
        '__check_object_size',
        '_copy_from_iter_full',
        'do_syscall_64',
        'ep_item_poll.isra.10',
        'entry_SYSCALL_64_after_hwframe',
        '__indirect_thunk_start',
        '__virt_addr_valid',
        'sys_read',
        'sys_write',
        'apparmor_socket_sendmsg',
        '_copy_to_iter',
        'apparmor_socket_sock_rcv_skb',
        'iov_iter_advance',
        'copyout',
        'copyin',
        'apparmor_socket_recvmsg',
        '__entry_SYSCALL_64_trampoline',
        'ep_scan_ready_list.constprop.17',
        '__fdget_pos',
        'check_stack_object',
    ]),
    'driver_tx': frozenset([
        'ixgbe_xmit_frame_ring',
        'ixgbe_select_queue',
        'ixgbe_tx_ctxtdesc',
        'ixgbe_xmit_frame',
        'ixgbe_features_check',
        'ixgbe_fdir_add_signature_filter_82599',
        'validate_xmit_skb_list',
        'validate_xmit_skb',
        'dev_hard_start_xmit',
        '__dev_queue_xmit',
        'napi_consume_skb',
        'i40e_xmit_frame_ring',
        'i40e_lan_xmit_frame',
        'i40e_features_check',
    ]),
    'driver_rx': frozenset([
        'ixgbe_clean_rx_irq',
        'ixgbe_poll',
        'napi_gro_receive',
        'ixgbe_alloc_rx_buffers',
        'net_rx_action',
        'ixgbe_update_itr.isra.66',
        'irq_entries_start',
        'do_IRQ',
        'handle_irq_event_percpu',
        'handle_edge_irq',
        '__do_softirq',
        '__netif_receive_skb',
        'i40e_clean_rx_irq',
        'i40e_napi_poll',
        'i40e_alloc_rx_buffers',
        'i40e_msix_clean_rings',
    ]),
}


def classify_linux_kernel(attrs):
    """Map a Linux kernel symbol to a coarse cost category.

    attrs is a mapping that must contain 'symbol' (the symbol name).

    Returns the key of the first category in _LINUX_KERNEL_CATEGORIES whose
    set contains that exact name, or 'other' when no category lists it.
    Raises KeyError if attrs has no 'symbol' entry.
    """
    sym = attrs['symbol']
    for category, symbols in _LINUX_KERNEL_CATEGORIES.items():
        if sym in symbols:
            return category
    return 'other'
# IX symbol names grouped by the category they are reported under.
# Built once at import time instead of on every classify_ix_symbol() call.
# Lookup is by exact string, so suffixed names such as 'pbuf_free.part.2'
# only match when the suffix is identical.
_IX_CATEGORIES = {
    'tcp_tx': frozenset([
        'tcp_output',
        'tcp_write',
        'tcp_output_packet',
    ]),
    'tcp_rx': frozenset([
        'tcp_input',
        'tcp_receive',
        'tcp_recved',
        'lwip_tcp_event',
        'tcp_parseopt',
        'tcp_update_rcv_ann_wnd',
        'tcp_input_tmp',
    ]),
    'ip': frozenset([
        'eth_process_recv',
        'eth_input',
        'ip_send_one',
        'arp_lookup_mac',
        'eth_recv_handle_fg_transition',
        'inet_chksum_pseudo',
        'eth_process_reclaim',
        'eth_process_send',
        'eth_process_poll',
    ]),
    'timers': frozenset([
        'timer_now',
        'timer_add_abs',
        'timer_run',
        'tcp_unified_timer_handler',
        'tcpip_tcp_timer',
        'timer_add',
    ]),
    'bufmgt': frozenset([
        'pbuf_free.part.2',
        'pbuf_alloc',
        'pbuf_clen',
        'pbuf_header',
        'tcp_seg_free',
        'mbuf_default_done',
        'pbuf_cat',
        'mem_free',
        'pbuf_split_64k',
        'pbuf_free',
    ]),
    'interface': frozenset([
        'bsys_dispatch.part.0',
        'bsys_tcp_sendv',
        'bsys_tcp_recv_done',
        'sys_bpoll',
        'dune_syscall_handler',
        'syscall_handler',
        'do_syscall',
        'entry_SYSCALL_64_fastpath',
        'entry_SYSCALL_64_after_swapgs',
        'entry_SYSCALL_64',
    ]),
    # No symbols are assigned to 'sync'; the key is kept so the category
    # set stays the same as before.
    'sync': frozenset(),
    'driver_tx': frozenset([
        'i40e_tx_xmit',
        'i40e_tx_reclaim',
    ]),
    'driver_rx': frozenset([
        'i40e_rx_poll',
    ]),
}


def classify_ix_symbol(attrs):
    """Map an IX symbol to a coarse cost category.

    attrs is a mapping that must contain 'symbol' (the symbol name).

    Returns the key of the first category in _IX_CATEGORIES whose set
    contains that exact name, or 'other' when no category lists it.
    Raises KeyError if attrs has no 'symbol' entry.
    """
    sym = attrs['symbol']
    for category, symbols in _IX_CATEGORIES.items():
        if sym in symbols:
            return category
    return 'other'
from profiling import *
from topdown import *
from report_utils import *
from splittcp import *
import sys
# Top-down report restricted to a single symbol.
#   argv[1]: profile file handed to load_file()
#   argv[2]: symbol name; only sample groups with this exact 'symbol' are kept
if len(sys.argv) < 3:
    # Fail with a readable message instead of an IndexError further down.
    sys.exit('Usage: %s PROFILE_FILE SYMBOL' % sys.argv[0])

(md, sgs) = load_file(sys.argv[1])
sgs = sgs_filter(sgs, lambda a: a['symbol'] == sys.argv[2])
balance_counters(sgs)
# A constant key puts every remaining sample group into one aggregate.
total_sg = sgs_aggregate(sgs, lambda a: '')[0]

# Relative breakdown: every metric except IPC is printed as a percentage.
td = TopDown(total_sg)
print('IPC: %5.2f' % (td.get('ipc')))
for fmt, metric in [
        ('Retiring: %5.2f%%', 'l1_retiring'),
        (' General Retirement: %5.2f%%', 'l2_general_retirement'),
        (' Microcode Sequence: %5.2f%%', 'l2_microcode_sequencer'),
        ('Frontend Bound: %5.2f%%', 'l1_frontend_bound'),
        (' Latency: %5.2f%%', 'l2_frontend_latency'),
        (' Bandwidth: %5.2f%%', 'l2_frontend_bandwidth'),
        ('Backend Bound: %5.2f%%', 'l1_backend_bound'),
        (' Memory Bound: %5.2f%%', 'l2_memory_bound'),
        (' Core Bound: %5.2f%%', 'l2_core_bound'),
        ('Bad Speculation: %5.2f%%', 'l1_bad_speculation'),
        (' Branch Mispredicts: %5.2f%%', 'l2_branch_mispredicts'),
        (' Machine Clears: %5.2f%%', 'l2_machine_clears'),
]:
    print(fmt % (td.get(metric) * 100))

print('\n\n\n')

# Absolute breakdown.
# NOTE(review): the division by 4 presumably converts slot counts into cycles
# (TopDownAbs.get_pipeline_width() returns 4) -- confirm.
td = TopDownAbs(total_sg)
for fmt, metric in [
        ('Retiring: %15u', 'l1_retiring'),
        (' General Retirement: %15u', 'general_retirement'),
        (' Microcode Sequence: %15u', 'microcode_sequencer'),
        ('Frontend Bound: %15u', 'l1_frontend_bound'),
        (' Latency: %15u', 'frontend_latency'),
        (' Bandwidth: %15u', 'frontend_bandwidth'),
        ('Backend Bound: %15u', 'l1_backend_bound'),
        (' Memory Bound: %15u', 'memory_bound'),
        (' Core Bound: %15u', 'core_bound'),
        ('Bad Speculation: %15u', 'l1_bad_speculation'),
        (' Branch Mispredicts: %15u', 'branch_mispredicts'),
        (' Machine Clears: %15u', 'machine_clears'),
]:
    print(fmt % (td.get(metric) / 4))

# Per-offset tables: aggregate by offset within the symbol and print each
# offset in hex.
print('\n\nTop 15 ips')
ip_sgs = sgs_aggregate(sgs, lambda a: a['symoff'])
metric_table(ip_sgs, 'CPU_CLK_UNHALTED',
             ['CPU_CLK_UNHALTED', 'l1_backend_bound', 'l1_frontend_bound',
              'l1_bad_speculation'],
             TopDownAbs, max_rows=15, attr_map=hex)

print('\n\nTop 15 backend bound ips')
metric_table(ip_sgs, 'l1_backend_bound', ['l1_backend_bound'], TopDownAbs,
             max_rows=15, attr_map=hex)
from profiling import *
from topdown import *
from report_utils import *
from splittcp import *
import sys
# Per-thread top-down report.
#   argv[1]: profile file handed to load_file()
#   argv[2]: 'app' or 'linux' analyse thread 'echo-w0' with the matching
#            classifier; any other value analyses thread 'stcp-fp-0'.
(md, sgs) = load_file(sys.argv[1])

mode = sys.argv[2]
if mode == 'app':
    tname, classifier = 'echo-w0', classify_stcp_app
elif mode == 'linux':
    tname, classifier = 'echo-w0', classify_linux
else:
    tname, classifier = 'stcp-fp-0', classify_stcp

tid = find_tid(md, tname)
print(md['duration'])

# Restrict to the chosen thread, then build the three aggregations used below:
# per (dso, symbol), per category, and a single grand total.
sgs = sgs_filter(sgs, lambda a: a['tid'] == tid)
balance_counters(sgs)
sym_sgs = sgs_aggregate(sgs, lambda a: (a['dso'], a['symbol']))
cat_sgs = sgs_aggregate(sgs, classifier)
total_sg = sgs_aggregate(sgs, lambda a: '')[0]


def _dso_symbol(key):
    """Render a (dso, symbol) aggregation key as 'dso.symbol'."""
    return key[0] + '.' + key[1]


# Layout shared by every per-symbol table.
sym_layout = dict(attr_width=50, attr_map=_dso_symbol)
l1_metrics = ['l1_retiring', 'l1_frontend_bound', 'l1_backend_bound',
              'l1_bad_speculation']

metric_table(sym_sgs, 'CPU_CLK_UNHALTED',
             ['CPU_CLK_UNHALTED', 'CYCLE_ACTIVITY:STALLS_MEM_ANY', 'cpi'],
             TopDown, cutoff_frac=99,
             col_format=['%15u', '%15u', '%05.2f'],
             **sym_layout)

print('\n\n\n')

# Relative breakdown: every metric except IPC is printed as a percentage.
relative = TopDown(total_sg)
print('IPC: %5.2f' % (relative.get('ipc')))
for fmt, metric in [
        ('Retiring: %5.2f%%', 'l1_retiring'),
        (' General Retirement: %5.2f%%', 'l2_general_retirement'),
        (' Microcode Sequence: %5.2f%%', 'l2_microcode_sequencer'),
        ('Frontend Bound: %5.2f%%', 'l1_frontend_bound'),
        (' Latency: %5.2f%%', 'l2_frontend_latency'),
        (' Bandwidth: %5.2f%%', 'l2_frontend_bandwidth'),
        ('Backend Bound: %5.2f%%', 'l1_backend_bound'),
        (' Memory Bound: %5.2f%%', 'l2_memory_bound'),
        (' Core Bound: %5.2f%%', 'l2_core_bound'),
        ('Bad Speculation: %5.2f%%', 'l1_bad_speculation'),
        (' Branch Mispredicts: %5.2f%%', 'l2_branch_mispredicts'),
        (' Machine Clears: %5.2f%%', 'l2_machine_clears'),
]:
    print(fmt % (relative.get(metric) * 100))

print('\n\n\n')

# Absolute breakdown, printed as unscaled TopDownAbs values.
absolute = TopDownAbs(total_sg)
for fmt, metric in [
        ('Retiring: %15u', 'l1_retiring'),
        (' General Retirement: %15u', 'general_retirement'),
        (' Microcode Sequence: %15u', 'microcode_sequencer'),
        ('Frontend Bound: %15u', 'l1_frontend_bound'),
        (' Latency: %15u', 'frontend_latency'),
        (' Bandwidth: %15u', 'frontend_bandwidth'),
        ('Backend Bound: %15u', 'l1_backend_bound'),
        (' Memory Bound: %15u', 'memory_bound'),
        (' Core Bound: %15u', 'core_bound'),
        ('Bad Speculation: %15u', 'l1_bad_speculation'),
        (' Branch Mispredicts: %15u', 'branch_mispredicts'),
        (' Machine Clears: %15u', 'machine_clears'),
]:
    print(fmt % (absolute.get(metric)))

print('\n\n\n')

# L1 fractions per category.
metric_table(cat_sgs, 'CPU_CLK_UNHALTED', l1_metrics, TopDown,
             col_format='%04.3f')

print('\n\n\n')

# L1 fractions per symbol.
metric_table(sym_sgs, 'CPU_CLK_UNHALTED', l1_metrics, TopDown,
             cutoff_frac=99, col_format='%04.3f', **sym_layout)

print('\n\n\n')

# Memory stall cycles per symbol.
metric_table(sym_sgs, 'CYCLE_ACTIVITY:STALLS_MEM_ANY',
             ['CYCLE_ACTIVITY:STALLS_MEM_ANY'], TopDown,
             cutoff_frac=99, col_format='%15u', **sym_layout)

print('\n\n\n')

# Backend-bound breakdown per symbol.
metric_table(sym_sgs, 'l1_backend_bound',
             ['l1_backend_bound', 'memory_bound', 'core_bound'], TopDownAbs,
             cutoff_frac=99, col_format='%15u', **sym_layout)

print('\n\n\n')

# Backend-bound breakdown on the un-aggregated sample groups, labelled
# 'symbol.<hex offset>'.
metric_table(sgs, 'l1_backend_bound',
             ['l1_backend_bound', 'memory_bound', 'core_bound',
              'CPU_CLK_UNHALTED'],
             TopDownAbs, cutoff_frac=95, col_format='%15u',
             attr_width=50,
             attr_map=lambda x: x['symbol'] + '.' + hex(x['symoff']))
from profiling import *
class TopDown(SGAnalysis):
    """Top-down pipeline breakdown expressed as fractions of all issue slots.

    Every l1_*/l2_* metric here is a ratio relative to 'slots'; TopDownAbs
    provides the same breakdown as absolute counts.

    Values are fetched with self.get(name), inherited from SGAnalysis (not
    shown in this file). It is used both for raw event names such as
    'UOPS_ISSUED:ANY' and for the derived names implemented by the get_<name>
    methods below -- presumably get() dispatches to those methods; confirm in
    the profiling module.
    """

    # Set to True when counters were collected with SMT enabled. Only
    # recovery_cycles handles that case; core_clks is not implemented for it.
    smt = False

    #
    # L1
    #
    def get_l1_frontend_bound(self):
        """Fraction of slots in which the frontend delivered no uop."""
        return self.get('IDQ_UOPS_NOT_DELIVERED:CORE') / self.get('slots')

    def get_l1_bad_speculation(self):
        """Fraction of slots lost to uops issued but not retired, plus recovery."""
        return (self.get('UOPS_ISSUED:ANY') - self.get('UOPS_RETIRED:RETIRE_SLOTS')
                + self.get('pipeline_width') * self.get('recovery_cycles')) / \
            self.get('slots')

    def get_l1_retiring(self):
        """Fraction of slots that retired a uop."""
        return self.get('UOPS_RETIRED:RETIRE_SLOTS') / self.get('slots')

    def get_l1_backend_bound(self):
        """Remainder once the other three L1 fractions are subtracted from 1."""
        return 1 - (self.get('l1_frontend_bound') +
                    self.get('l1_bad_speculation') +
                    self.get('l1_retiring'))

    #
    # L2
    #
    def get_l2_frontend_latency(self):
        """Fraction of slots in cycles where the frontend delivered zero uops."""
        return self.get('pipeline_width') * \
            self.get('IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE') / \
            self.get('slots')

    def get_l2_frontend_bandwidth(self):
        """Frontend-bound share not accounted for by frontend latency."""
        return self.get('l1_frontend_bound') - self.get('l2_frontend_latency')

    def get_l2_branch_mispredicts(self):
        """Bad-speculation share attributed to branch mispredictions."""
        return self.get('mispred_clears_fraction') * self.get('l1_bad_speculation')

    def get_l2_machine_clears(self):
        """Bad-speculation share not attributed to branch mispredictions."""
        return self.get('l1_bad_speculation') - self.get('l2_branch_mispredicts')

    def get_l2_microcode_sequencer(self):
        """Fraction of slots retiring uops that came from the microcode sequencer."""
        return self.get('retire_uop_fraction') * self.get('IDQ:MS_UOPS') / self.get('slots')

    def get_l2_general_retirement(self):
        """Retiring share not attributed to the microcode sequencer."""
        return self.get('l1_retiring') - self.get('l2_microcode_sequencer')

    def get_l2_memory_bound(self):
        """Backend-bound share attributed to memory stalls."""
        return self.get('memory_bound_fraction') * self.get('l1_backend_bound')

    def get_l2_core_bound(self):
        """Backend-bound share not attributed to memory stalls."""
        return self.get('l1_backend_bound') - self.get('l2_memory_bound')

    #
    # Auxiliary
    #
    def get_slots(self):
        """Total issue slots: pipeline width times core clock cycles."""
        return self.get('pipeline_width') * self.get('core_clks')

    def get_pipeline_width(self):
        """Issue slots per cycle (fixed at 4)."""
        return 4

    def get_core_clks(self):
        """Core clock cycles; only implemented for the non-SMT case.

        Raises NotImplementedError when self.smt is set.
        """
        if self.smt:
            # The original `raise 'TODO'` is itself an error: a str cannot be
            # raised, so callers got a confusing TypeError instead.
            raise NotImplementedError('TODO: core_clks with SMT enabled')
        return self.get('clks')

    def get_clks(self):
        """Unhalted thread clock cycles."""
        return self.get('CPU_CLK_UNHALTED:THREAD_P')

    def get_ipc(self):
        """Instructions retired per clock cycle."""
        return self.get('INST_RETIRED:ANY_P') / self.get('clks')

    def get_cpi(self):
        """Cycles per instruction, computed through the inherited div() helper.

        NOTE(review): div() is presumably a zero-safe division -- confirm in
        SGAnalysis.
        """
        return self.div(1, self.get('ipc'))

    def get_recovery_cycles(self):
        """Recovery cycles; the raw count is halved when self.smt is set."""
        if self.smt:
            return self.get('INT_MISC:RECOVERY_CYCLES_ANY') / 2
        return self.get('INT_MISC:RECOVERY_CYCLES_ANY')

    def get_memory_bound_fraction(self):
        """Share of backend-bound cycles that are memory or store stalls."""
        return (self.get('CYCLE_ACTIVITY:STALLS_MEM_ANY') +
                self.get('EXE_ACTIVITY:BOUND_ON_STORES')) / \
            self.get('backend_bound_cycles')

    def get_backend_bound_cycles(self):
        """Cycles counted as backend bound: low port use plus memory/store stalls."""
        return (self.get('EXE_ACTIVITY:EXE_BOUND_0_PORTS') +
                self.get('EXE_ACTIVITY:1_PORTS_UTIL') +
                self.get('few_uops_executed_threshold')) + \
            (self.get('CYCLE_ACTIVITY:STALLS_MEM_ANY') +
             self.get('EXE_ACTIVITY:BOUND_ON_STORES'))

    def get_few_uops_executed_threshold(self):
        """2-port-utilisation cycles, counted only when IPC exceeds 1.8; else 0."""
        if self.get('ipc') > 1.8:
            return self.get('EXE_ACTIVITY:2_PORTS_UTIL')
        return 0

    def get_retire_uop_fraction(self):
        """Ratio of retired uop slots to issued uops."""
        return self.get('UOPS_RETIRED:RETIRE_SLOTS') / \
            self.get('UOPS_ISSUED:ANY')

    def get_mispred_clears_fraction(self):
        """Mispredicted branches as a share of mispredicts plus machine clears."""
        return self.get('BR_MISP_RETIRED:ALL_BRANCHES') / \
            (self.get('BR_MISP_RETIRED:ALL_BRANCHES') +
             self.get('MACHINE_CLEARS:COUNT'))
class TopDownAbs(SGAnalysis):
    """Top-down pipeline breakdown expressed as absolute slot counts.

    The l1_* metrics and the unprefixed L2 metrics are counts of issue slots
    (nothing is divided by 'slots'); TopDown provides the same breakdown as
    fractions. ipc, cpi and the *_fraction helpers are still ratios.

    Values are fetched with self.get(name), inherited from SGAnalysis (not
    shown in this file). It is used both for raw event names such as
    'UOPS_ISSUED:ANY' and for the derived names implemented by the get_<name>
    methods below -- presumably get() dispatches to those methods; confirm in
    the profiling module.
    """

    # Set to True when counters were collected with SMT enabled. Only
    # recovery_cycles handles that case; core_clks is not implemented for it.
    smt = False

    #
    # L1
    #
    def get_l1_frontend_bound(self):
        """Slots in which the frontend delivered no uop."""
        return self.get('IDQ_UOPS_NOT_DELIVERED:CORE')

    def get_l1_bad_speculation(self):
        """Slots lost to uops issued but not retired, plus recovery."""
        return (self.get('UOPS_ISSUED:ANY') - self.get('UOPS_RETIRED:RETIRE_SLOTS')
                + self.get('pipeline_width') * self.get('recovery_cycles'))

    def get_l1_retiring(self):
        """Slots that retired a uop."""
        return self.get('UOPS_RETIRED:RETIRE_SLOTS')

    def get_l1_backend_bound(self):
        """Slots left once the other three L1 counts are subtracted from 'slots'."""
        return self.get('slots') - (self.get('l1_frontend_bound') +
                                    self.get('l1_bad_speculation') +
                                    self.get('l1_retiring'))

    #
    # L2
    #
    def get_frontend_latency(self):
        """Slots in cycles where the frontend delivered zero uops."""
        return self.get('pipeline_width') * \
            self.get('IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE')

    def get_frontend_bandwidth(self):
        """Frontend-bound slots not accounted for by frontend latency."""
        return self.get('l1_frontend_bound') - self.get('frontend_latency')

    def get_branch_mispredicts(self):
        """Bad-speculation slots attributed to branch mispredictions."""
        return self.get('mispred_clears_fraction') * self.get('l1_bad_speculation')

    def get_machine_clears(self):
        """Bad-speculation slots not attributed to branch mispredictions."""
        return self.get('l1_bad_speculation') - self.get('branch_mispredicts')

    def get_microcode_sequencer(self):
        """Retired uops that came from the microcode sequencer."""
        return self.get('retire_uop_fraction') * self.get('IDQ:MS_UOPS')

    def get_general_retirement(self):
        """Retiring slots not attributed to the microcode sequencer."""
        return self.get('l1_retiring') - self.get('microcode_sequencer')

    def get_memory_bound(self):
        """Backend-bound slots attributed to memory stalls."""
        return self.get('memory_bound_fraction') * self.get('l1_backend_bound')

    def get_core_bound(self):
        """Backend-bound slots not attributed to memory stalls."""
        return self.get('l1_backend_bound') - self.get('memory_bound')

    #
    # Auxiliary
    #
    def get_slots(self):
        """Total issue slots: pipeline width times core clock cycles."""
        return self.get('pipeline_width') * self.get('core_clks')

    def get_pipeline_width(self):
        """Issue slots per cycle (fixed at 4)."""
        return 4

    def get_core_clks(self):
        """Core clock cycles; only implemented for the non-SMT case.

        Raises NotImplementedError when self.smt is set.
        """
        if self.smt:
            # The original `raise 'TODO'` is itself an error: a str cannot be
            # raised, so callers got a confusing TypeError instead.
            raise NotImplementedError('TODO: core_clks with SMT enabled')
        return self.get('clks')

    def get_clks(self):
        """Unhalted thread clock cycles."""
        return self.get('CPU_CLK_UNHALTED:THREAD_P')

    def get_ipc(self):
        """Instructions retired per clock cycle."""
        return self.get('INST_RETIRED:ANY_P') / self.get('clks')

    def get_cpi(self):
        """Cycles per instruction, computed through the inherited div() helper.

        NOTE(review): div() is presumably a zero-safe division -- confirm in
        SGAnalysis.
        """
        return self.div(1, self.get('ipc'))

    def get_recovery_cycles(self):
        """Recovery cycles; the raw count is halved when self.smt is set."""
        if self.smt:
            return self.get('INT_MISC:RECOVERY_CYCLES_ANY') / 2
        return self.get('INT_MISC:RECOVERY_CYCLES_ANY')

    def get_memory_bound_fraction(self):
        """Share of backend-bound cycles that are memory or store stalls."""
        return (self.get('CYCLE_ACTIVITY:STALLS_MEM_ANY') +
                self.get('EXE_ACTIVITY:BOUND_ON_STORES')) / \
            self.get('backend_bound_cycles')

    def get_backend_bound_cycles(self):
        """Cycles counted as backend bound: low port use plus memory/store stalls."""
        return (self.get('EXE_ACTIVITY:EXE_BOUND_0_PORTS') +
                self.get('EXE_ACTIVITY:1_PORTS_UTIL') +
                self.get('few_uops_executed_threshold')) + \
            (self.get('CYCLE_ACTIVITY:STALLS_MEM_ANY') +
             self.get('EXE_ACTIVITY:BOUND_ON_STORES'))

    def get_few_uops_executed_threshold(self):
        """2-port-utilisation cycles, counted only when IPC exceeds 1.8; else 0."""
        if self.get('ipc') > 1.8:
            return self.get('EXE_ACTIVITY:2_PORTS_UTIL')
        return 0

    def get_retire_uop_fraction(self):
        """Ratio of retired uop slots to issued uops."""
        return self.get('UOPS_RETIRED:RETIRE_SLOTS') / \
            self.get('UOPS_ISSUED:ANY')

    def get_mispred_clears_fraction(self):
        """Mispredicted branches as a share of mispredicts plus machine clears."""
        return self.get('BR_MISP_RETIRED:ALL_BRANCHES') / \
            (self.get('BR_MISP_RETIRED:ALL_BRANCHES') +
             self.get('MACHINE_CLEARS:COUNT'))
......@@ -37,16 +37,20 @@ extern size_t config_procs_num;
extern struct config_thread *config_threads;
extern size_t config_threads_num;
extern uint64_t config_duration;
int config_init(int argc, char *argv[]);
void profile_sample(pid_t pid, pid_t tid, uint16_t cpu, uint64_t ip,
uint64_t value, uint64_t time_enabled, uint32_t event);
uint64_t value, uint64_t time_enabled, uint64_t time_running,
uint32_t event);
int events_init(void);
int events_enable(void);
unsigned events_poll(void);
int symbols_init(void);
int symbols_lookup(pid_t pid, uint64_t ip, char **sym, uint64_t *off);
int symbols_lookup(pid_t pid, uint64_t ip, char **sym, char **dso,
uint64_t *off);
#endif
#include <ctype.h>
#include <libgen.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
......@@ -10,6 +11,7 @@
/** One entry of a symbol map: an address with its symbol and DSO names. */
struct symbol {
/* Address recorded for the symbol; symbols_lookup() reports the offset of
 * the looked-up ip relative to this value. */
uint64_t ip;
/* Symbol name; symmap_add() stores its own strdup() copy. */
char *symname;
/* DSO label (e.g. "vmlinux" for kernel symbols, the basename of the mapped
 * file for executables). symmap_add() stores the caller's pointer as-is,
 * without copying it. */
char *dso;
};
struct symbol_map {
......@@ -49,7 +51,8 @@ static struct symbol_map *symmap_alloc(void)
}
/** Add new symbol to symbol map */
static int symmap_add(struct symbol_map *m, uint64_t ip, const char *sym)
static int symmap_add(struct symbol_map *m, uint64_t ip, const char *sym,
char *dso)
{
if (m->sym_num == m->sym_alloc) {
if ((m->symbols = realloc(m->symbols,
......@@ -63,6 +66,7 @@ static int symmap_add(struct symbol_map *m, uint64_t ip, const char *sym)
}
m->symbols[m->sym_num].ip = ip;
m->symbols[m->sym_num].dso = dso;
if ((m->symbols[m->sym_num].symname = strdup(sym)) == NULL) {
perror("strdup failed");
return -1;
......@@ -153,7 +157,7 @@ static struct symbol_map *kernel_symbols(void)
while (*q != 0 && !isspace(*q)) q++;
*q = 0;
if (symmap_add(m, n, p) != 0)
if (symmap_add(m, n, p, "vmlinux") != 0)
return NULL;
}
......@@ -161,12 +165,16 @@ static struct symbol_map *kernel_symbols(void)
m->pid = 0;
/* sort symbols */
symmap_sort(m);
return m;
}
/** Add all symbols from bfd symbol table */
static int symtab_process(struct symbol_map *m, uint64_t start_addr,
uint64_t len, uint64_t file_off, asymbol **symbol_table, size_t num)
uint64_t len, uint64_t file_off, asymbol **symbol_table, size_t num,
char *dso)
{
asymbol *sym;
asection *sec;
......@@ -192,7 +200,7 @@ static int symtab_process(struct symbol_map *m, uint64_t start_addr,
addr = start_addr + filepos - file_off;
if (symmap_add(m, addr, sym->name) != 0)
if (symmap_add(m, addr, sym->name, dso) != 0)
return -1;
}
......@@ -206,6 +214,12 @@ static int map_load_exec(struct symbol_map *m, const char *path,
bfd *f;
asymbol **symbol_table;
ssize_t storage, num_symbols;
char *dso, *p;
/* get dso name for symbols */
p = strdup(path);
dso = strdup(basename(p));
free(p);
if ((f = bfd_openr(path, NULL)) == NULL) {
fprintf(stderr, "map_load_exec: bfd_openr(%s) failed\n", path);
......@@ -235,7 +249,7 @@ static int map_load_exec(struct symbol_map *m, const char *path,
}
if (symtab_process(m, start_addr, len, file_off, symbol_table,
num_symbols) != 0)
num_symbols, dso) != 0)
{
return -1;
}
......@@ -260,7 +274,7 @@ static int map_load_exec(struct symbol_map *m, const char *path,
}
if (symtab_process(m, start_addr, len, file_off, symbol_table,
num_symbols) != 0)
num_symbols, dso) != 0)
{
return -1;
}
......@@ -292,10 +306,12 @@ static int map_load_builtin(struct symbol_map *m, const char *path,
}
return 0;
} else if (!strcmp(path, "")) {
return symmap_add(m, start_addr, "[anon]", strdup("[anon]"));
}
out:
return symmap_add(m, start_addr, path);
return symmap_add(m, start_addr, path, strdup(path));
}
/** Find or create symbol map for pid */
......@@ -393,7 +409,7 @@ static struct symbol_map *pid_map(pid_t pid)
*q = 0;
/* skip non-files */
if (p[0] == '[')
if (p[0] == '[' || p[0] == 0)
map_load_builtin(m, p, start_addr, end_addr - start_addr, off);
else
map_load_exec(m, p, start_addr, end_addr - start_addr, off);
......@@ -411,7 +427,8 @@ static struct symbol_map *pid_map(pid_t pid)
}
/** Lookup symbol */
int symbols_lookup(pid_t pid, uint64_t ip, char **sym, uint64_t *off)
int symbols_lookup(pid_t pid, uint64_t ip, char **sym, char **dso,
uint64_t *off)
{
struct symbol *s;
struct symbol_map *m;
......@@ -432,6 +449,7 @@ int symbols_lookup(pid_t pid, uint64_t ip, char **sym, uint64_t *off)
}
*sym = s->symname;
*dso = s->dso;
*off = ip - s->ip;
return 0;
......