improve and document numa support

This commit is contained in:
daan 2019-11-02 10:30:16 -07:00
parent 2c12d7f223
commit a69016c33e
2 changed files with 30 additions and 11 deletions


@@ -854,8 +854,11 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) {
   void* p = mi_unix_mmap(NULL, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
   if (p == NULL) return NULL;
 #ifdef MI_HAS_NUMA
-  if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) {
+  if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
     uintptr_t numa_mask = (1UL << numa_node);
+    // TODO: does `mbind` work correctly for huge OS pages? should we
+    // use `set_mempolicy` before calling mmap instead?
+    // see: <https://lkml.org/lkml/2017/2/9/875>
     long err = mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
     if (err != 0) {
       _mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno));
@@ -883,6 +886,9 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize) {
   return p;
 }
 
+/* ----------------------------------------------------------------------------
+Support NUMA aware allocation
+-----------------------------------------------------------------------------*/
 #ifdef WIN32
 static int mi_os_numa_nodex() {
   PROCESSOR_NUMBER pnum;
@@ -902,6 +908,9 @@ static int mi_os_numa_node_countx(void) {
 #include <stdlib.h>
 #include <numaif.h>
 static int mi_os_numa_nodex(void) {
+#define MI_NUMA_NODE_SLOW // too slow, so cache it
+  // TODO: perhaps use RDTSCP instruction on x64?
+  // see <https://stackoverflow.com/questions/16862620/numa-get-current-node-core>
 #define MI_MAX_MASK (4) // support at most 256 nodes
   unsigned long mask[MI_MAX_MASK];
   memset(mask,0,MI_MAX_MASK*sizeof(long));
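
Note: the RDTSCP TODO above points at cheaper ways to find the current node than a `get_mempolicy` call. One commonly suggested option (discussed in the linked Stack Overflow question) is the `getcpu` system call, which reports the current cpu and its NUMA node. A hedged sketch; `current_node_sketch` is not a name from this commit.

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>

// Hypothetical sketch (not commit code): read the current NUMA node via the
// getcpu system call; glibc 2.29+ also has a getcpu() wrapper that can go
// through the vDSO (RDTSCP on x86-64) for a cheaper lookup.
static int current_node_sketch(void) {
  unsigned cpu = 0, node = 0;
  if (syscall(SYS_getcpu, &cpu, &node, NULL) != 0) return 0;  // default to node 0
  return (int)node;
}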
@@ -945,7 +954,7 @@ static int mi_os_numa_node_countx(void) {
 #endif
 
 int _mi_os_numa_node_count(void) {
-  static int numa_node_count = 0;
+  static int numa_node_count = 0; // cache the node count
   if (mi_unlikely(numa_node_count <= 0)) {
     int ncount = mi_os_numa_node_countx();
     // never more than max numa node and at least 1
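
Note: the diff truncates the Linux `mi_os_numa_node_countx` body. For orientation only, one plausible way to count nodes on Linux is to ask the kernel for the set of nodes the thread may allocate from; this sketch is an assumption, not the commit's implementation.

#include <string.h>
#include <numaif.h>  // get_mempolicy; link with -lnuma

// Hypothetical sketch (not commit code): take the highest allowed node + 1
// from the MPOL_F_MEMS_ALLOWED node mask.
static int numa_node_count_sketch(void) {
  unsigned long mask[4];  // room for 256 nodes, matching MI_MAX_MASK above
  memset(mask, 0, sizeof(mask));
  if (get_mempolicy(NULL, mask, sizeof(mask)*8, NULL, MPOL_F_MEMS_ALLOWED) != 0) {
    return 1;  // on error, behave as a single-node machine
  }
  int count = 0;
  const int bits = (int)(sizeof(unsigned long)*8);
  for (int i = 0; i < 4*bits; i++) {
    if ((mask[i/bits] >> (i%bits)) & 1UL) count = i + 1;
  }
  return (count == 0 ? 1 : count);
}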
@@ -959,14 +968,24 @@ int _mi_os_numa_node_count(void) {
 }
 
 int _mi_os_numa_node(mi_os_tld_t* tld) {
+  int numa_node;
+#ifndef MI_NUMA_NODE_SLOW
+  UNUSED(tld);
+  numa_node = mi_os_numa_nodex();
+#else
   if (mi_unlikely(tld->numa_node < 0)) {
-    int nnode = mi_os_numa_nodex();
-    // never more than the node count
-    int ncount = _mi_os_numa_node_count();
-    if (nnode >= ncount) { nnode = nnode % ncount; }
-    if (nnode < 0) nnode = 0;
-    tld->numa_node = nnode;
+    // Cache the NUMA node of the thread if the call is slow.
+    // This may not be correct as threads can migrate to another cpu on
+    // another node -- however, for memory allocation this just means we keep
+    // using the same 'node id' for its allocations; new OS allocations
+    // naturally come from the actual node so in practice this may be fine.
+    tld->numa_node = mi_os_numa_nodex();
   }
-  mi_assert_internal(tld->numa_node >= 0 && tld->numa_node < _mi_os_numa_node_count());
-  return tld->numa_node;
+  numa_node = tld->numa_node;
+#endif
+  // never more than the node count and >= 0
+  int numa_count = _mi_os_numa_node_count();
+  if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
+  if (numa_node < 0) numa_node = 0;
+  return numa_node;
 }
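
Note: the caching comment in the hunk above trades exactness for speed: a thread that migrates keeps its first-seen node id. The same pattern, illustrated standalone with C11 thread-local storage (hypothetical; it reuses `current_node_sketch` from the earlier sketch, whereas mimalloc caches in its per-thread `tld` instead).

// Hypothetical sketch (not commit code): run the slow node query once per
// thread and reuse the result, accepting a stale id after migration.
static _Thread_local int cached_numa_node = -1;

static int cached_node_sketch(void) {
  if (cached_numa_node < 0) {
    cached_numa_node = current_node_sketch();  // slow lookup, once per thread
  }
  return cached_numa_node;
}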


@@ -24,7 +24,7 @@ public:
 
 int main() {
-  //mi_stats_reset(); // ignore earlier allocations
+  mi_stats_reset(); // ignore earlier allocations
   atexit(free_p);
   void* p1 = malloc(78);
   void* p2 = mi_malloc_aligned(16,24);