Optimize CPU and memory use of cdbw(3)

Reduce memory footprint and processing time by dropping the vertex parts
of the edges kept during the peeling. Hook up the
division-by-multiplication logic to help older platforms.
This commit is contained in:
joerg 2021-01-07 14:41:50 +00:00
parent 6bcc22484e
commit bb682c3920
1 changed files with 164 additions and 89 deletions

View File

@ -1,4 +1,4 @@
/* $NetBSD: cdbw.c,v 1.6 2017/11/11 18:05:31 alnsn Exp $ */
/* $NetBSD: cdbw.c,v 1.7 2021/01/07 14:41:50 joerg Exp $ */
/*-
* Copyright (c) 2009, 2010, 2015 The NetBSD Foundation, Inc.
* All rights reserved.
@ -36,7 +36,7 @@
#endif
#include <sys/cdefs.h>
__RCSID("$NetBSD: cdbw.c,v 1.6 2017/11/11 18:05:31 alnsn Exp $");
__RCSID("$NetBSD: cdbw.c,v 1.7 2021/01/07 14:41:50 joerg Exp $");
#include "namespace.h"
@ -49,6 +49,74 @@ __RCSID("$NetBSD: cdbw.c,v 1.6 2017/11/11 18:05:31 alnsn Exp $");
#include <string.h>
#include <unistd.h>
#if !HAVE_NBTOOL_CONFIG_H
#include <sys/bitops.h>
#else
/*
 * Find the last (most significant) set bit of a 32-bit value.
 *
 * Returns the 1-based position of the highest set bit, i.e. 1 for
 * bit 0 and 32 for bit 31, or 0 if no bit is set at all.
 */
static inline int
my_fls32(uint32_t n)
{
	int pos = 32;
	unsigned int width;

	if (n == 0)
		return 0;

	/*
	 * Binary search: whenever the top `width` bits are all clear,
	 * shift them out and lower the answer by the same amount.
	 * The widths tried are 16, 8, 4, 2 and 1.
	 */
	for (width = 16; width != 0; width >>= 1) {
		if ((n >> (32 - width)) == 0) {
			n <<= width;
			pos -= (int)width;
		}
	}
	return pos;
}
/*
 * Precompute the constants that let fast_divide32() and
 * fast_remainder32() replace a division by div with a multiplication
 * and two shifts.
 *
 * div must be non-zero, as it is used as a divisor below.
 * On return *m holds the multiplier and *s1 / *s2 the two shift counts;
 * all three have to be passed unchanged to the fast_* helpers.
 */
static inline void
fast_divide32_prepare(uint32_t div, uint32_t * m,
    uint8_t *s1, uint8_t *s2)
{
	/* Position of the highest set bit of div - 1. */
	const int log2_up = my_fls32(div - 1);
	const uint64_t pow2 = 1ULL << log2_up;
	const uint64_t scaled = (uint64_t)(0x100000000ULL * (pow2 - div));

	*m = (uint32_t)(scaled / div + 1);

	/* The first shift is at most one bit ... */
	if (log2_up > 1)
		*s1 = 1U;
	else
		*s1 = (uint8_t)log2_up;

	/* ... and the second one covers the remaining bits. */
	if (log2_up == 0)
		*s2 = 0;
	else
		*s2 = (uint8_t)(log2_up - 1);
}
/*
 * Compute v / div without a division instruction.
 *
 * m, s1 and s2 must be the values fast_divide32_prepare() produced for
 * div.  The div argument itself is not used by the computation.
 */
static inline uint32_t
fast_divide32(uint32_t v, uint32_t div, uint32_t m, uint8_t s1,
    uint8_t s2)
{
	/* Upper 32 bits of the 64-bit product v * m. */
	const uint64_t product = (uint64_t)v * m;
	const uint32_t high = (uint32_t)(product >> 32);
	const uint32_t rest = (v - high) >> s1;

	return (high + rest) >> s2;
}
/*
 * Compute v % div without a division instruction.
 *
 * m, s1 and s2 must be the values fast_divide32_prepare() produced for
 * div.  The remainder is obtained by subtracting div times the fast
 * quotient from v.
 */
static inline uint32_t
fast_remainder32(uint32_t v, uint32_t div, uint32_t m, uint8_t s1,
    uint8_t s2)
{
	const uint32_t quotient = fast_divide32(v, div, m, s1, s2);

	return v - quotient * div;
}
#endif
#ifdef __weak_alias
__weak_alias(cdbw_close,_cdbw_close)
__weak_alias(cdbw_open,_cdbw_open)
@ -279,30 +347,29 @@ cdbw_stable_seeder(void)
}
/*
* The algorithm below is based on paper
* Cache-Oblivious Peeling of Random Hypergraphs by Djamal Belazzougui,
* Paolo Boldi, Giuseppe Ottaviano, Rossano Venturini, and Sebastiano
* Vigna.
* http://zola.di.unipi.it/rossano/wp-content/papercite-data/pdf/dcc14.pdf
* For each vertex in the 3-graph, the incidence list needs to be kept.
* Avoid storing the full list by just XORing the indices of the still
* incident edges and remember the number of such edges as that's all
* the peeling computation needs. This is inspired by:
* Cache-Oblivious Peeling of Random Hypergraphs by Djamal Belazzougui,
* Paolo Boldi, Giuseppe Ottaviano, Rossano Venturini, and Sebastiano
* Vigna. https://arxiv.org/abs/1312.0526
*
* Unlike in the paper, we don't care about external storage and have
* the edge list at hand all the time. As such, no ordering is necessary
* and the vertices of the edge don't have to be copied.
*
* The core observation of the paper above is that for a degree of one,
* the incident edge can be obtained directly.
*/
/*
* Data type for a valid oriented edge (v0, v1, v2), v1 < v2.
* The first vertex v0 is implicit and is determined by an index
* of the corresponding element in the state->oedges array.
* If the degree of v0 is greater than 1, other members don't
* make sense because they're a result of XORing multiple values.
*/
struct oedge {
uint32_t degree; /* Degree of v0. */
uint32_t verts[2]; /* v1 and v2 */
uint32_t edge;
struct vertex {
uint32_t degree;
uint32_t edges;
};
struct edge {
uint32_t vertices[3];
uint32_t idx;
uint32_t left, middle, right;
};
struct state {
@ -314,48 +381,40 @@ struct state {
uint32_t *g;
char *visited;
struct oedge *oedges;
struct vertex *vertices;
struct edge *edges;
uint32_t output_index;
uint32_t *output_order;
};
/*
* Add (delta == 1) or remove (delta == -1) the edge e from vertex v0.
* Add (delta == 1) or remove (delta == -1) the edge e
* from the incidence lists.
*/
static inline void
add_remove_edge(struct oedge *o, int delta, uint32_t e,
uint32_t v0, uint32_t v1, uint32_t v2)
change_edge(struct state *state, int delta, uint32_t e)
{
int i;
struct vertex *v;
struct edge *e_ = &state->edges[e];
o[v0].verts[v1 < v2 ? 0 : 1] ^= v1;
o[v0].verts[v1 < v2 ? 1 : 0] ^= v2;
o[v0].degree += delta;
o[v0].edge ^= e;
for (i = 0; i < 3; ++i) {
v = &state->vertices[e_->vertices[i]];
v->edges ^= e;
v->degree += delta;
}
}
static inline void
add_edge(struct oedge *o, uint32_t e,
uint32_t v0, uint32_t v1, uint32_t v2)
remove_vertex(struct state *state, uint32_t v)
{
struct vertex *v_ = &state->vertices[v];
uint32_t e;
add_remove_edge(o, 1, e, v0, v1, v2);
}
static inline void
remove_vertex(struct state *state, uint32_t v0)
{
uint32_t e, v1, v2;
struct oedge *o = state->oedges;
if (o[v0].degree == 1) {
e = o[v0].edge;
v1 = o[v0].verts[0];
v2 = o[v0].verts[1];
o[v0].degree = 0;
add_remove_edge(o, -1, e, v1, v0, v2);
add_remove_edge(o, -1, e, v2, v0, v1);
if (v_->degree == 1) {
e = v_->edges;
state->output_order[--state->output_index] = e;
change_edge(state, -1, e);
}
}
@ -365,39 +424,49 @@ build_graph(struct cdbw *cdbw, struct state *state)
struct key_hash_head *head;
struct key_hash *key_hash;
struct edge *e;
uint32_t entries_m;
uint8_t entries_s1, entries_s2;
uint32_t hashes[3];
size_t i;
int j;
memset(state->oedges, 0, sizeof(struct oedge) * state->entries);
memset(state->vertices, 0, sizeof(*state->vertices) * state->entries);
e = state->edges;
fast_divide32_prepare(state->entries, &entries_m, &entries_s1,
&entries_s2);
for (i = 0; i < cdbw->hash_size; ++i) {
head = &cdbw->hash[i];
SLIST_FOREACH(key_hash, head, link) {
e->idx = key_hash->idx;
mi_vector_hash(key_hash->key, key_hash->keylen,
state->seed, hashes);
e->left = hashes[0] % state->entries;
e->middle = hashes[1] % state->entries;
e->right = hashes[2] % state->entries;
if (e->left == e->middle)
return -1;
add_edge(state->oedges, e - state->edges,
e->right, e->left, e->middle);
if (e->left == e->right)
return -1;
add_edge(state->oedges, e - state->edges,
e->middle, e->left, e->right);
if (e->middle == e->right)
return -1;
add_edge(state->oedges, e - state->edges,
e->left, e->middle, e->right);
for (j = 0; j < 3; ++j) {
e->vertices[j] = fast_remainder32(hashes[j],
state->entries, entries_m, entries_s1,
entries_s2);
}
if (e->vertices[0] == e->vertices[1])
return -1;
if (e->vertices[0] == e->vertices[2])
return -1;
if (e->vertices[1] == e->vertices[2])
return -1;
e->idx = key_hash->idx;
++e;
}
}
/*
* Do the edge processing separately as there is a good chance
* the degenerate edge case above will happen; this avoids
* unnecessary work.
*/
for (i = 0; i < state->keys; ++i)
change_edge(state, 1, i);
state->output_index = state->keys;
for (i = 0; i < state->entries; ++i)
remove_vertex(state, i);
@ -406,9 +475,8 @@ build_graph(struct cdbw *cdbw, struct state *state)
while (i > 0 && i > state->output_index) {
--i;
e = state->edges + state->output_order[i];
remove_vertex(state, e->left);
remove_vertex(state, e->middle);
remove_vertex(state, e->right);
for (j = 0; j < 3; ++j)
remove_vertex(state, e->vertices[j]);
}
return state->output_index == 0 ? 0 : -1;
@ -420,28 +488,35 @@ assign_nodes(struct state *state)
struct edge *e;
size_t i;
uint32_t v0, v1, v2, entries_m;
uint8_t entries_s1, entries_s2;
fast_divide32_prepare(state->data_entries, &entries_m, &entries_s1,
&entries_s2);
for (i = 0; i < state->keys; ++i) {
e = state->edges + state->output_order[i];
if (!state->visited[e->left]) {
state->g[e->left] =
(2 * state->data_entries + e->idx
- state->g[e->middle] - state->g[e->right])
% state->data_entries;
} else if (!state->visited[e->middle]) {
state->g[e->middle] =
(2 * state->data_entries + e->idx
- state->g[e->left] - state->g[e->right])
% state->data_entries;
if (!state->visited[e->vertices[0]]) {
v0 = e->vertices[0];
v1 = e->vertices[1];
v2 = e->vertices[2];
} else if (!state->visited[e->vertices[1]]) {
v0 = e->vertices[1];
v1 = e->vertices[0];
v2 = e->vertices[2];
} else {
state->g[e->right] =
(2 * state->data_entries + e->idx
- state->g[e->left] - state->g[e->middle])
% state->data_entries;
v0 = e->vertices[2];
v1 = e->vertices[0];
v2 = e->vertices[1];
}
state->visited[e->left] = 1;
state->visited[e->middle] = 1;
state->visited[e->right] = 1;
state->g[v0] =
fast_remainder32((2 * state->data_entries + e->idx
- state->g[v1] - state->g[v2]),
state->data_entries, entries_m,
entries_s1, entries_s2);
state->visited[v0] = 1;
state->visited[v1] = 1;
state->visited[v2] = 1;
}
}
@ -559,13 +634,13 @@ cdbw_output(struct cdbw *cdbw, int fd, const char descr[16],
#define NALLOC(var, n) var = calloc(sizeof(*var), n)
NALLOC(state.g, state.entries);
NALLOC(state.visited, state.entries);
NALLOC(state.oedges, state.entries);
NALLOC(state.vertices, state.entries);
NALLOC(state.edges, state.keys);
NALLOC(state.output_order, state.keys);
#undef NALLOC
if (state.g == NULL || state.visited == NULL || state.oedges == NULL ||
state.edges == NULL || state.output_order == NULL) {
if (state.g == NULL || state.visited == NULL || state.edges == NULL ||
state.vertices == NULL || state.output_order == NULL) {
rv = -1;
goto release;
}
@ -584,7 +659,7 @@ cdbw_output(struct cdbw *cdbw, int fd, const char descr[16],
release:
free(state.g);
free(state.visited);
free(state.oedges);
free(state.vertices);
free(state.edges);
free(state.output_order);