qdist: add module to represent frequency distributions of data
Sometimes it is useful to have a quick histogram to represent a certain distribution -- for example, when investigating a performance regression in a hash table due to inadequate hashing. The appended allows us to easily represent a distribution using Unicode characters. Further, the data structure keeping track of the distribution is so simple that obtaining its values for off-line processing is trivial. Example, taking the last 10 commits to QEMU: Characters in commit title Count ----------------------------------- 39 1 48 1 53 1 54 2 57 1 61 1 67 1 78 1 80 1 qdist_init(&dist); qdist_inc(&dist, 39); [...] qdist_inc(&dist, 80); char *str = qdist_pr(&dist, 9, QDIST_PR_LABELS); // -> [39.0,43.6)▂▂ █▂ ▂ ▄[75.4,80.0] g_free(str); char *str = qdist_pr(&dist, 4, QDIST_PR_LABELS); // -> [39.0,49.2)▁█▁▁[69.8,80.0] g_free(str); Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Message-Id: <1465412133-3029-9-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson <rth@twiddle.net>
This commit is contained in:
parent
42bd32287f
commit
bf3afd5f41
63
include/qemu/qdist.h
Normal file
63
include/qemu/qdist.h
Normal file
@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (C) 2016, Emilio G. Cota <cota@braap.org>
|
||||
*
|
||||
* License: GNU GPL, version 2 or later.
|
||||
* See the COPYING file in the top-level directory.
|
||||
*/
|
||||
#ifndef QEMU_QDIST_H
|
||||
#define QEMU_QDIST_H
|
||||
|
||||
#include "qemu/osdep.h"
|
||||
#include "qemu-common.h"
|
||||
#include "qemu/bitops.h"
|
||||
|
||||
/*
|
||||
* Samples with the same 'x value' end up in the same qdist_entry,
|
||||
* e.g. inc(0.1) and inc(0.1) end up as {x=0.1, count=2}.
|
||||
*
|
||||
* Binning happens only at print time, so that we retain the flexibility to
|
||||
* choose the binning. This might not be ideal for workloads that do not care
|
||||
* much about precision and insert many samples all with different x values;
|
||||
* in that case, pre-binning (e.g. entering both 0.115 and 0.097 as 0.1)
|
||||
* should be considered.
|
||||
*/
|
||||
struct qdist_entry {
|
||||
double x;
|
||||
unsigned long count;
|
||||
};
|
||||
|
||||
struct qdist {
|
||||
struct qdist_entry *entries;
|
||||
size_t n;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
#define QDIST_PR_BORDER BIT(0)
|
||||
#define QDIST_PR_LABELS BIT(1)
|
||||
/* the remaining options only work if PR_LABELS is set */
|
||||
#define QDIST_PR_NODECIMAL BIT(2)
|
||||
#define QDIST_PR_PERCENT BIT(3)
|
||||
#define QDIST_PR_100X BIT(4)
|
||||
#define QDIST_PR_NOBINRANGE BIT(5)
|
||||
|
||||
void qdist_init(struct qdist *dist);
|
||||
void qdist_destroy(struct qdist *dist);
|
||||
|
||||
void qdist_add(struct qdist *dist, double x, long count);
|
||||
void qdist_inc(struct qdist *dist, double x);
|
||||
double qdist_xmin(const struct qdist *dist);
|
||||
double qdist_xmax(const struct qdist *dist);
|
||||
double qdist_avg(const struct qdist *dist);
|
||||
unsigned long qdist_sample_count(const struct qdist *dist);
|
||||
size_t qdist_unique_entries(const struct qdist *dist);
|
||||
|
||||
/* callers must free the returned string with g_free() */
|
||||
char *qdist_pr_plain(const struct qdist *dist, size_t n_groups);
|
||||
|
||||
/* callers must free the returned string with g_free() */
|
||||
char *qdist_pr(const struct qdist *dist, size_t n_groups, uint32_t opt);
|
||||
|
||||
/* Only qdist code and test code should ever call this function */
|
||||
void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n);
|
||||
|
||||
#endif /* QEMU_QDIST_H */
|
@ -32,3 +32,4 @@ util-obj-y += buffer.o
|
||||
util-obj-y += timed-average.o
|
||||
util-obj-y += base64.o
|
||||
util-obj-y += log.o
|
||||
util-obj-y += qdist.o
|
||||
|
395
util/qdist.c
Normal file
395
util/qdist.c
Normal file
@ -0,0 +1,395 @@
|
||||
/*
|
||||
* qdist.c - QEMU helpers for handling frequency distributions of data.
|
||||
*
|
||||
* Copyright (C) 2016, Emilio G. Cota <cota@braap.org>
|
||||
*
|
||||
* License: GNU GPL, version 2 or later.
|
||||
* See the COPYING file in the top-level directory.
|
||||
*/
|
||||
#include "qemu/qdist.h"
|
||||
|
||||
#include <math.h>
|
||||
#ifndef NAN
|
||||
#define NAN (0.0 / 0.0)
|
||||
#endif
|
||||
|
||||
void qdist_init(struct qdist *dist)
|
||||
{
|
||||
dist->entries = g_malloc(sizeof(*dist->entries));
|
||||
dist->size = 1;
|
||||
dist->n = 0;
|
||||
}
|
||||
|
||||
void qdist_destroy(struct qdist *dist)
|
||||
{
|
||||
g_free(dist->entries);
|
||||
}
|
||||
|
||||
static inline int qdist_cmp_double(double a, double b)
|
||||
{
|
||||
if (a > b) {
|
||||
return 1;
|
||||
} else if (a < b) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int qdist_cmp(const void *ap, const void *bp)
|
||||
{
|
||||
const struct qdist_entry *a = ap;
|
||||
const struct qdist_entry *b = bp;
|
||||
|
||||
return qdist_cmp_double(a->x, b->x);
|
||||
}
|
||||
|
||||
void qdist_add(struct qdist *dist, double x, long count)
|
||||
{
|
||||
struct qdist_entry *entry = NULL;
|
||||
|
||||
if (dist->n) {
|
||||
struct qdist_entry e;
|
||||
|
||||
e.x = x;
|
||||
entry = bsearch(&e, dist->entries, dist->n, sizeof(e), qdist_cmp);
|
||||
}
|
||||
|
||||
if (entry) {
|
||||
entry->count += count;
|
||||
return;
|
||||
}
|
||||
|
||||
if (unlikely(dist->n == dist->size)) {
|
||||
dist->size *= 2;
|
||||
dist->entries = g_realloc(dist->entries,
|
||||
sizeof(*dist->entries) * (dist->size));
|
||||
}
|
||||
dist->n++;
|
||||
entry = &dist->entries[dist->n - 1];
|
||||
entry->x = x;
|
||||
entry->count = count;
|
||||
qsort(dist->entries, dist->n, sizeof(*entry), qdist_cmp);
|
||||
}
|
||||
|
||||
void qdist_inc(struct qdist *dist, double x)
|
||||
{
|
||||
qdist_add(dist, x, 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Unicode for block elements. See:
|
||||
* https://en.wikipedia.org/wiki/Block_Elements
|
||||
*/
|
||||
static const gunichar qdist_blocks[] = {
|
||||
0x2581,
|
||||
0x2582,
|
||||
0x2583,
|
||||
0x2584,
|
||||
0x2585,
|
||||
0x2586,
|
||||
0x2587,
|
||||
0x2588
|
||||
};
|
||||
|
||||
#define QDIST_NR_BLOCK_CODES ARRAY_SIZE(qdist_blocks)
|
||||
|
||||
/*
|
||||
* Print a distribution into a string.
|
||||
*
|
||||
* This function assumes that appropriate binning has been done on the input;
|
||||
* see qdist_bin__internal() and qdist_pr_plain().
|
||||
*
|
||||
* Callers must free the returned string with g_free().
|
||||
*/
|
||||
static char *qdist_pr_internal(const struct qdist *dist)
|
||||
{
|
||||
double min, max;
|
||||
GString *s = g_string_new("");
|
||||
size_t i;
|
||||
|
||||
/* if only one entry, its printout will be either full or empty */
|
||||
if (dist->n == 1) {
|
||||
if (dist->entries[0].count) {
|
||||
g_string_append_unichar(s, qdist_blocks[QDIST_NR_BLOCK_CODES - 1]);
|
||||
} else {
|
||||
g_string_append_c(s, ' ');
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* get min and max counts */
|
||||
min = dist->entries[0].count;
|
||||
max = min;
|
||||
for (i = 0; i < dist->n; i++) {
|
||||
struct qdist_entry *e = &dist->entries[i];
|
||||
|
||||
if (e->count < min) {
|
||||
min = e->count;
|
||||
}
|
||||
if (e->count > max) {
|
||||
max = e->count;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < dist->n; i++) {
|
||||
struct qdist_entry *e = &dist->entries[i];
|
||||
int index;
|
||||
|
||||
/* make an exception with 0; instead of using block[0], print a space */
|
||||
if (e->count) {
|
||||
/* divide first to avoid loss of precision when e->count == max */
|
||||
index = (e->count - min) / (max - min) * (QDIST_NR_BLOCK_CODES - 1);
|
||||
g_string_append_unichar(s, qdist_blocks[index]);
|
||||
} else {
|
||||
g_string_append_c(s, ' ');
|
||||
}
|
||||
}
|
||||
out:
|
||||
return g_string_free(s, FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Bin the distribution in @from into @n bins of consecutive, non-overlapping
|
||||
* intervals, copying the result to @to.
|
||||
*
|
||||
* This function is internal to qdist: only this file and test code should
|
||||
* ever call it.
|
||||
*
|
||||
* Note: calling this function on an already-binned qdist is a bug.
|
||||
*
|
||||
* If @n == 0 or @from->n == 1, use @from->n.
|
||||
*/
|
||||
void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n)
|
||||
{
|
||||
double xmin, xmax;
|
||||
double step;
|
||||
size_t i, j;
|
||||
|
||||
qdist_init(to);
|
||||
|
||||
if (from->n == 0) {
|
||||
return;
|
||||
}
|
||||
if (n == 0 || from->n == 1) {
|
||||
n = from->n;
|
||||
}
|
||||
|
||||
/* set equally-sized bins between @from's left and right */
|
||||
xmin = qdist_xmin(from);
|
||||
xmax = qdist_xmax(from);
|
||||
step = (xmax - xmin) / n;
|
||||
|
||||
if (n == from->n) {
|
||||
/* if @from's entries are equally spaced, no need to re-bin */
|
||||
for (i = 0; i < from->n; i++) {
|
||||
if (from->entries[i].x != xmin + i * step) {
|
||||
goto rebin;
|
||||
}
|
||||
}
|
||||
/* they're equally spaced, so copy the dist and bail out */
|
||||
to->entries = g_new(struct qdist_entry, from->n);
|
||||
to->n = from->n;
|
||||
memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n);
|
||||
return;
|
||||
}
|
||||
|
||||
rebin:
|
||||
j = 0;
|
||||
for (i = 0; i < n; i++) {
|
||||
double x;
|
||||
double left, right;
|
||||
|
||||
left = xmin + i * step;
|
||||
right = xmin + (i + 1) * step;
|
||||
|
||||
/* Add x, even if it might not get any counts later */
|
||||
x = left;
|
||||
qdist_add(to, x, 0);
|
||||
|
||||
/*
|
||||
* To avoid double-counting we capture [left, right) ranges, except for
|
||||
* the righmost bin, which captures a [left, right] range.
|
||||
*/
|
||||
while (j < from->n && (from->entries[j].x < right || i == n - 1)) {
|
||||
struct qdist_entry *o = &from->entries[j];
|
||||
|
||||
qdist_add(to, x, o->count);
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Print @dist into a string, after re-binning it into @n bins of consecutive,
|
||||
* non-overlapping intervals.
|
||||
*
|
||||
* If @n == 0, use @orig->n.
|
||||
*
|
||||
* Callers must free the returned string with g_free().
|
||||
*/
|
||||
char *qdist_pr_plain(const struct qdist *dist, size_t n)
|
||||
{
|
||||
struct qdist binned;
|
||||
char *ret;
|
||||
|
||||
if (dist->n == 0) {
|
||||
return NULL;
|
||||
}
|
||||
qdist_bin__internal(&binned, dist, n);
|
||||
ret = qdist_pr_internal(&binned);
|
||||
qdist_destroy(&binned);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static char *qdist_pr_label(const struct qdist *dist, size_t n_bins,
|
||||
uint32_t opt, bool is_left)
|
||||
{
|
||||
const char *percent;
|
||||
const char *lparen;
|
||||
const char *rparen;
|
||||
GString *s;
|
||||
double x1, x2, step;
|
||||
double x;
|
||||
double n;
|
||||
int dec;
|
||||
|
||||
s = g_string_new("");
|
||||
if (!(opt & QDIST_PR_LABELS)) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
dec = opt & QDIST_PR_NODECIMAL ? 0 : 1;
|
||||
percent = opt & QDIST_PR_PERCENT ? "%" : "";
|
||||
|
||||
n = n_bins ? n_bins : dist->n;
|
||||
x = is_left ? qdist_xmin(dist) : qdist_xmax(dist);
|
||||
step = (qdist_xmax(dist) - qdist_xmin(dist)) / n;
|
||||
|
||||
if (opt & QDIST_PR_100X) {
|
||||
x *= 100.0;
|
||||
step *= 100.0;
|
||||
}
|
||||
if (opt & QDIST_PR_NOBINRANGE) {
|
||||
lparen = rparen = "";
|
||||
x1 = x;
|
||||
x2 = x; /* unnecessary, but a dumb compiler might not figure it out */
|
||||
} else {
|
||||
lparen = "[";
|
||||
rparen = is_left ? ")" : "]";
|
||||
if (is_left) {
|
||||
x1 = x;
|
||||
x2 = x + step;
|
||||
} else {
|
||||
x1 = x - step;
|
||||
x2 = x;
|
||||
}
|
||||
}
|
||||
g_string_append_printf(s, "%s%.*f", lparen, dec, x1);
|
||||
if (!(opt & QDIST_PR_NOBINRANGE)) {
|
||||
g_string_append_printf(s, ",%.*f%s", dec, x2, rparen);
|
||||
}
|
||||
g_string_append(s, percent);
|
||||
out:
|
||||
return g_string_free(s, FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Print the distribution's histogram into a string.
|
||||
*
|
||||
* See also: qdist_pr_plain().
|
||||
*
|
||||
* Callers must free the returned string with g_free().
|
||||
*/
|
||||
char *qdist_pr(const struct qdist *dist, size_t n_bins, uint32_t opt)
|
||||
{
|
||||
const char *border = opt & QDIST_PR_BORDER ? "|" : "";
|
||||
char *llabel, *rlabel;
|
||||
char *hgram;
|
||||
GString *s;
|
||||
|
||||
if (dist->n == 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
s = g_string_new("");
|
||||
|
||||
llabel = qdist_pr_label(dist, n_bins, opt, true);
|
||||
rlabel = qdist_pr_label(dist, n_bins, opt, false);
|
||||
hgram = qdist_pr_plain(dist, n_bins);
|
||||
g_string_append_printf(s, "%s%s%s%s%s",
|
||||
llabel, border, hgram, border, rlabel);
|
||||
g_free(llabel);
|
||||
g_free(rlabel);
|
||||
g_free(hgram);
|
||||
|
||||
return g_string_free(s, FALSE);
|
||||
}
|
||||
|
||||
static inline double qdist_x(const struct qdist *dist, int index)
|
||||
{
|
||||
if (dist->n == 0) {
|
||||
return NAN;
|
||||
}
|
||||
return dist->entries[index].x;
|
||||
}
|
||||
|
||||
double qdist_xmin(const struct qdist *dist)
|
||||
{
|
||||
return qdist_x(dist, 0);
|
||||
}
|
||||
|
||||
double qdist_xmax(const struct qdist *dist)
|
||||
{
|
||||
return qdist_x(dist, dist->n - 1);
|
||||
}
|
||||
|
||||
size_t qdist_unique_entries(const struct qdist *dist)
|
||||
{
|
||||
return dist->n;
|
||||
}
|
||||
|
||||
unsigned long qdist_sample_count(const struct qdist *dist)
|
||||
{
|
||||
unsigned long count = 0;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < dist->n; i++) {
|
||||
struct qdist_entry *e = &dist->entries[i];
|
||||
|
||||
count += e->count;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
static double qdist_pairwise_avg(const struct qdist *dist, size_t index,
|
||||
size_t n, unsigned long count)
|
||||
{
|
||||
/* amortize the recursion by using a base case > 2 */
|
||||
if (n <= 8) {
|
||||
size_t i;
|
||||
double ret = 0;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
struct qdist_entry *e = &dist->entries[index + i];
|
||||
|
||||
ret += e->x * e->count / count;
|
||||
}
|
||||
return ret;
|
||||
} else {
|
||||
size_t n2 = n / 2;
|
||||
|
||||
return qdist_pairwise_avg(dist, index, n2, count) +
|
||||
qdist_pairwise_avg(dist, index + n2, n - n2, count);
|
||||
}
|
||||
}
|
||||
|
||||
double qdist_avg(const struct qdist *dist)
|
||||
{
|
||||
unsigned long count;
|
||||
|
||||
count = qdist_sample_count(dist);
|
||||
if (!count) {
|
||||
return NAN;
|
||||
}
|
||||
return qdist_pairwise_avg(dist, 0, dist->n, count);
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user