Browse Source
Sometimes it is useful to have a quick histogram to represent a certain
distribution -- for example, when investigating a performance regression
in a hash table due to inadequate hashing.
The appended patch allows us to easily represent a distribution using Unicode
characters. Further, the data structure keeping track of the distribution
is so simple that obtaining its values for off-line processing is trivial.
Example, taking the last 10 commits to QEMU:
Characters in commit title Count
-----------------------------------
39 1
48 1
53 1
54 2
57 1
61 1
67 1
78 1
80 1
qdist_init(&dist);
qdist_inc(&dist, 39);
[...]
qdist_inc(&dist, 80);
char *str = qdist_pr(&dist, 9, QDIST_PR_LABELS);
// -> [39.0,43.6)▂▂ █▂ ▂ ▄[75.4,80.0]
g_free(str);
char *str = qdist_pr(&dist, 4, QDIST_PR_LABELS);
// -> [39.0,49.2)▁█▁▁[69.8,80.0]
g_free(str);
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1465412133-3029-9-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
pull/42/head
committed by
Richard Henderson
3 changed files with 459 additions and 0 deletions
@ -0,0 +1,63 @@ |
|||
/*
|
|||
* Copyright (C) 2016, Emilio G. Cota <cota@braap.org> |
|||
* |
|||
* License: GNU GPL, version 2 or later. |
|||
* See the COPYING file in the top-level directory. |
|||
*/ |
|||
#ifndef QEMU_QDIST_H |
|||
#define QEMU_QDIST_H |
|||
|
|||
#include "qemu/osdep.h" |
|||
#include "qemu-common.h" |
|||
#include "qemu/bitops.h" |
|||
|
|||
/*
|
|||
* Samples with the same 'x value' end up in the same qdist_entry, |
|||
* e.g. inc(0.1) and inc(0.1) end up as {x=0.1, count=2}. |
|||
* |
|||
* Binning happens only at print time, so that we retain the flexibility to |
|||
* choose the binning. This might not be ideal for workloads that do not care |
|||
* much about precision and insert many samples all with different x values; |
|||
* in that case, pre-binning (e.g. entering both 0.115 and 0.097 as 0.1) |
|||
* should be considered. |
|||
*/ |
|||
/* One distinct x value together with how many samples were recorded for it. */
struct qdist_entry { |
|||
/* the sample value; entries are looked up and sorted by this field */
double x; |
|||
/* number of samples accumulated for this x value */
unsigned long count; |
|||
}; |
|||
|
|||
/*
 * A distribution: a growable array of entries that qdist_add() keeps sorted
 * by x value. Set up with qdist_init(), released with qdist_destroy().
 */
struct qdist { |
|||
/* heap-allocated array of entries */
struct qdist_entry *entries; |
|||
/* number of entries currently in use */
size_t n; |
|||
/* number of entries the array has room for; doubled by qdist_add() when full */
size_t size; |
|||
}; |
|||
|
|||
/* Option bits for qdist_pr(); combine with bitwise OR. */
/* put a "|" on each side of the histogram */
#define QDIST_PR_BORDER BIT(0) |
|||
/* print a label on the left and on the right of the histogram */
#define QDIST_PR_LABELS BIT(1) |
|||
/* the remaining options only work if PR_LABELS is set */ |
|||
/* print label values with no decimals instead of one */
#define QDIST_PR_NODECIMAL BIT(2) |
|||
/* append "%" to each label */
#define QDIST_PR_PERCENT BIT(3) |
|||
/* multiply label values by 100 before printing them */
#define QDIST_PR_100X BIT(4) |
|||
/* print only the leftmost/rightmost x value rather than a bin range */
#define QDIST_PR_NOBINRANGE BIT(5) |
|||
|
|||
void qdist_init(struct qdist *dist); |
|||
void qdist_destroy(struct qdist *dist); |
|||
|
|||
void qdist_add(struct qdist *dist, double x, long count); |
|||
void qdist_inc(struct qdist *dist, double x); |
|||
double qdist_xmin(const struct qdist *dist); |
|||
double qdist_xmax(const struct qdist *dist); |
|||
double qdist_avg(const struct qdist *dist); |
|||
unsigned long qdist_sample_count(const struct qdist *dist); |
|||
size_t qdist_unique_entries(const struct qdist *dist); |
|||
|
|||
/* callers must free the returned string with g_free() */ |
|||
char *qdist_pr_plain(const struct qdist *dist, size_t n_groups); |
|||
|
|||
/* callers must free the returned string with g_free() */ |
|||
char *qdist_pr(const struct qdist *dist, size_t n_groups, uint32_t opt); |
|||
|
|||
/* Only qdist code and test code should ever call this function */ |
|||
void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n); |
|||
|
|||
#endif /* QEMU_QDIST_H */ |
|||
@ -0,0 +1,395 @@ |
|||
/*
|
|||
* qdist.c - QEMU helpers for handling frequency distributions of data. |
|||
* |
|||
* Copyright (C) 2016, Emilio G. Cota <cota@braap.org> |
|||
* |
|||
* License: GNU GPL, version 2 or later. |
|||
* See the COPYING file in the top-level directory. |
|||
*/ |
|||
#include "qemu/qdist.h" |
|||
|
|||
#include <math.h> |
|||
#ifndef NAN |
|||
#define NAN (0.0 / 0.0) |
|||
#endif |
|||
|
|||
void qdist_init(struct qdist *dist) |
|||
{ |
|||
dist->entries = g_malloc(sizeof(*dist->entries)); |
|||
dist->size = 1; |
|||
dist->n = 0; |
|||
} |
|||
|
|||
void qdist_destroy(struct qdist *dist) |
|||
{ |
|||
g_free(dist->entries); |
|||
} |
|||
|
|||
/*
 * Three-way comparison of two doubles.
 *
 * Returns 1 if @a > @b, -1 if @a < @b and 0 otherwise (which includes the
 * case where neither comparison holds).
 */
static inline int qdist_cmp_double(double a, double b)
{
    /* each relational expression yields 0 or 1, so the difference is -1/0/1 */
    return (a > b) - (a < b);
}
|||
|
|||
static int qdist_cmp(const void *ap, const void *bp) |
|||
{ |
|||
const struct qdist_entry *a = ap; |
|||
const struct qdist_entry *b = bp; |
|||
|
|||
return qdist_cmp_double(a->x, b->x); |
|||
} |
|||
|
|||
void qdist_add(struct qdist *dist, double x, long count) |
|||
{ |
|||
struct qdist_entry *entry = NULL; |
|||
|
|||
if (dist->n) { |
|||
struct qdist_entry e; |
|||
|
|||
e.x = x; |
|||
entry = bsearch(&e, dist->entries, dist->n, sizeof(e), qdist_cmp); |
|||
} |
|||
|
|||
if (entry) { |
|||
entry->count += count; |
|||
return; |
|||
} |
|||
|
|||
if (unlikely(dist->n == dist->size)) { |
|||
dist->size *= 2; |
|||
dist->entries = g_realloc(dist->entries, |
|||
sizeof(*dist->entries) * (dist->size)); |
|||
} |
|||
dist->n++; |
|||
entry = &dist->entries[dist->n - 1]; |
|||
entry->x = x; |
|||
entry->count = count; |
|||
qsort(dist->entries, dist->n, sizeof(*entry), qdist_cmp); |
|||
} |
|||
|
|||
/* Record a single sample with value @x; shorthand for qdist_add(dist, x, 1). */
void qdist_inc(struct qdist *dist, double x)
{
    const long one_sample = 1;

    qdist_add(dist, x, one_sample);
}
|||
|
|||
/*
|
|||
* Unicode for block elements. See: |
|||
* https://en.wikipedia.org/wiki/Block_Elements
|
|||
*/ |
|||
/* Bar glyphs ordered from shortest to tallest. */
static const gunichar qdist_blocks[] = {
    [0] = 0x2581, /* lower one eighth block */
    [1] = 0x2582, /* lower one quarter block */
    [2] = 0x2583, /* lower three eighths block */
    [3] = 0x2584, /* lower half block */
    [4] = 0x2585, /* lower five eighths block */
    [5] = 0x2586, /* lower three quarters block */
    [6] = 0x2587, /* lower seven eighths block */
    [7] = 0x2588, /* full block */
};

/* number of distinct bar heights available */
#define QDIST_NR_BLOCK_CODES ARRAY_SIZE(qdist_blocks)
|||
|
|||
/*
|
|||
* Print a distribution into a string. |
|||
* |
|||
* This function assumes that appropriate binning has been done on the input; |
|||
* see qdist_bin__internal() and qdist_pr_plain(). |
|||
* |
|||
* Callers must free the returned string with g_free(). |
|||
*/ |
|||
static char *qdist_pr_internal(const struct qdist *dist) |
|||
{ |
|||
double min, max; |
|||
GString *s = g_string_new(""); |
|||
size_t i; |
|||
|
|||
/* if only one entry, its printout will be either full or empty */ |
|||
if (dist->n == 1) { |
|||
if (dist->entries[0].count) { |
|||
g_string_append_unichar(s, qdist_blocks[QDIST_NR_BLOCK_CODES - 1]); |
|||
} else { |
|||
g_string_append_c(s, ' '); |
|||
} |
|||
goto out; |
|||
} |
|||
|
|||
/* get min and max counts */ |
|||
min = dist->entries[0].count; |
|||
max = min; |
|||
for (i = 0; i < dist->n; i++) { |
|||
struct qdist_entry *e = &dist->entries[i]; |
|||
|
|||
if (e->count < min) { |
|||
min = e->count; |
|||
} |
|||
if (e->count > max) { |
|||
max = e->count; |
|||
} |
|||
} |
|||
|
|||
for (i = 0; i < dist->n; i++) { |
|||
struct qdist_entry *e = &dist->entries[i]; |
|||
int index; |
|||
|
|||
/* make an exception with 0; instead of using block[0], print a space */ |
|||
if (e->count) { |
|||
/* divide first to avoid loss of precision when e->count == max */ |
|||
index = (e->count - min) / (max - min) * (QDIST_NR_BLOCK_CODES - 1); |
|||
g_string_append_unichar(s, qdist_blocks[index]); |
|||
} else { |
|||
g_string_append_c(s, ' '); |
|||
} |
|||
} |
|||
out: |
|||
return g_string_free(s, FALSE); |
|||
} |
|||
|
|||
/*
|
|||
* Bin the distribution in @from into @n bins of consecutive, non-overlapping |
|||
* intervals, copying the result to @to. |
|||
* |
|||
* This function is internal to qdist: only this file and test code should |
|||
* ever call it. |
|||
* |
|||
* Note: calling this function on an already-binned qdist is a bug. |
|||
* |
|||
* If @n == 0 or @from->n == 1, use @from->n. |
|||
*/ |
|||
void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n) |
|||
{ |
|||
double xmin, xmax; |
|||
double step; |
|||
size_t i, j; |
|||
|
|||
qdist_init(to); |
|||
|
|||
if (from->n == 0) { |
|||
return; |
|||
} |
|||
if (n == 0 || from->n == 1) { |
|||
n = from->n; |
|||
} |
|||
|
|||
/* set equally-sized bins between @from's left and right */ |
|||
xmin = qdist_xmin(from); |
|||
xmax = qdist_xmax(from); |
|||
step = (xmax - xmin) / n; |
|||
|
|||
if (n == from->n) { |
|||
/* if @from's entries are equally spaced, no need to re-bin */ |
|||
for (i = 0; i < from->n; i++) { |
|||
if (from->entries[i].x != xmin + i * step) { |
|||
goto rebin; |
|||
} |
|||
} |
|||
/* they're equally spaced, so copy the dist and bail out */ |
|||
to->entries = g_new(struct qdist_entry, from->n); |
|||
to->n = from->n; |
|||
memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n); |
|||
return; |
|||
} |
|||
|
|||
rebin: |
|||
j = 0; |
|||
for (i = 0; i < n; i++) { |
|||
double x; |
|||
double left, right; |
|||
|
|||
left = xmin + i * step; |
|||
right = xmin + (i + 1) * step; |
|||
|
|||
/* Add x, even if it might not get any counts later */ |
|||
x = left; |
|||
qdist_add(to, x, 0); |
|||
|
|||
/*
|
|||
* To avoid double-counting we capture [left, right) ranges, except for |
|||
* the righmost bin, which captures a [left, right] range. |
|||
*/ |
|||
while (j < from->n && (from->entries[j].x < right || i == n - 1)) { |
|||
struct qdist_entry *o = &from->entries[j]; |
|||
|
|||
qdist_add(to, x, o->count); |
|||
j++; |
|||
} |
|||
} |
|||
} |
|||
|
|||
/*
|
|||
* Print @dist into a string, after re-binning it into @n bins of consecutive, |
|||
* non-overlapping intervals. |
|||
* |
|||
* If @n == 0, use @orig->n. |
|||
* |
|||
* Callers must free the returned string with g_free(). |
|||
*/ |
|||
char *qdist_pr_plain(const struct qdist *dist, size_t n) |
|||
{ |
|||
struct qdist binned; |
|||
char *ret; |
|||
|
|||
if (dist->n == 0) { |
|||
return NULL; |
|||
} |
|||
qdist_bin__internal(&binned, dist, n); |
|||
ret = qdist_pr_internal(&binned); |
|||
qdist_destroy(&binned); |
|||
return ret; |
|||
} |
|||
|
|||
static char *qdist_pr_label(const struct qdist *dist, size_t n_bins, |
|||
uint32_t opt, bool is_left) |
|||
{ |
|||
const char *percent; |
|||
const char *lparen; |
|||
const char *rparen; |
|||
GString *s; |
|||
double x1, x2, step; |
|||
double x; |
|||
double n; |
|||
int dec; |
|||
|
|||
s = g_string_new(""); |
|||
if (!(opt & QDIST_PR_LABELS)) { |
|||
goto out; |
|||
} |
|||
|
|||
dec = opt & QDIST_PR_NODECIMAL ? 0 : 1; |
|||
percent = opt & QDIST_PR_PERCENT ? "%" : ""; |
|||
|
|||
n = n_bins ? n_bins : dist->n; |
|||
x = is_left ? qdist_xmin(dist) : qdist_xmax(dist); |
|||
step = (qdist_xmax(dist) - qdist_xmin(dist)) / n; |
|||
|
|||
if (opt & QDIST_PR_100X) { |
|||
x *= 100.0; |
|||
step *= 100.0; |
|||
} |
|||
if (opt & QDIST_PR_NOBINRANGE) { |
|||
lparen = rparen = ""; |
|||
x1 = x; |
|||
x2 = x; /* unnecessary, but a dumb compiler might not figure it out */ |
|||
} else { |
|||
lparen = "["; |
|||
rparen = is_left ? ")" : "]"; |
|||
if (is_left) { |
|||
x1 = x; |
|||
x2 = x + step; |
|||
} else { |
|||
x1 = x - step; |
|||
x2 = x; |
|||
} |
|||
} |
|||
g_string_append_printf(s, "%s%.*f", lparen, dec, x1); |
|||
if (!(opt & QDIST_PR_NOBINRANGE)) { |
|||
g_string_append_printf(s, ",%.*f%s", dec, x2, rparen); |
|||
} |
|||
g_string_append(s, percent); |
|||
out: |
|||
return g_string_free(s, FALSE); |
|||
} |
|||
|
|||
/*
|
|||
* Print the distribution's histogram into a string. |
|||
* |
|||
* See also: qdist_pr_plain(). |
|||
* |
|||
* Callers must free the returned string with g_free(). |
|||
*/ |
|||
char *qdist_pr(const struct qdist *dist, size_t n_bins, uint32_t opt) |
|||
{ |
|||
const char *border = opt & QDIST_PR_BORDER ? "|" : ""; |
|||
char *llabel, *rlabel; |
|||
char *hgram; |
|||
GString *s; |
|||
|
|||
if (dist->n == 0) { |
|||
return NULL; |
|||
} |
|||
|
|||
s = g_string_new(""); |
|||
|
|||
llabel = qdist_pr_label(dist, n_bins, opt, true); |
|||
rlabel = qdist_pr_label(dist, n_bins, opt, false); |
|||
hgram = qdist_pr_plain(dist, n_bins); |
|||
g_string_append_printf(s, "%s%s%s%s%s", |
|||
llabel, border, hgram, border, rlabel); |
|||
g_free(llabel); |
|||
g_free(rlabel); |
|||
g_free(hgram); |
|||
|
|||
return g_string_free(s, FALSE); |
|||
} |
|||
|
|||
static inline double qdist_x(const struct qdist *dist, int index) |
|||
{ |
|||
if (dist->n == 0) { |
|||
return NAN; |
|||
} |
|||
return dist->entries[index].x; |
|||
} |
|||
|
|||
/*
 * Return the smallest x value in @dist, or NAN if it has no entries.
 * Entries are kept sorted by x, so this is the first one.
 */
double qdist_xmin(const struct qdist *dist)
{
    const int first = 0;

    return qdist_x(dist, first);
}
|||
|
|||
double qdist_xmax(const struct qdist *dist) |
|||
{ |
|||
return qdist_x(dist, dist->n - 1); |
|||
} |
|||
|
|||
size_t qdist_unique_entries(const struct qdist *dist) |
|||
{ |
|||
return dist->n; |
|||
} |
|||
|
|||
unsigned long qdist_sample_count(const struct qdist *dist) |
|||
{ |
|||
unsigned long count = 0; |
|||
size_t i; |
|||
|
|||
for (i = 0; i < dist->n; i++) { |
|||
struct qdist_entry *e = &dist->entries[i]; |
|||
|
|||
count += e->count; |
|||
} |
|||
return count; |
|||
} |
|||
|
|||
static double qdist_pairwise_avg(const struct qdist *dist, size_t index, |
|||
size_t n, unsigned long count) |
|||
{ |
|||
/* amortize the recursion by using a base case > 2 */ |
|||
if (n <= 8) { |
|||
size_t i; |
|||
double ret = 0; |
|||
|
|||
for (i = 0; i < n; i++) { |
|||
struct qdist_entry *e = &dist->entries[index + i]; |
|||
|
|||
ret += e->x * e->count / count; |
|||
} |
|||
return ret; |
|||
} else { |
|||
size_t n2 = n / 2; |
|||
|
|||
return qdist_pairwise_avg(dist, index, n2, count) + |
|||
qdist_pairwise_avg(dist, index + n2, n - n2, count); |
|||
} |
|||
} |
|||
|
|||
double qdist_avg(const struct qdist *dist) |
|||
{ |
|||
unsigned long count; |
|||
|
|||
count = qdist_sample_count(dist); |
|||
if (!count) { |
|||
return NAN; |
|||
} |
|||
return qdist_pairwise_avg(dist, 0, dist->n, count); |
|||
} |
|||
Loading…
Reference in new issue