Brice, apologies — I didn't explain it very well. I do make sure that if the tile size 256*8 < 4096 (the page size), then I double the number of tiles per page; I just wanted to keep the explanation simple.
Here are some code snippets to give you the flavour of it. Initializing the helper struct: matrix_numa_binder(std::size_t Ncols, std::size_t Nrows, std::size_t Ntile, std::size_t Ntiles_per_domain, std::size_t Ncolprocs=1, std::size_t Nrowprocs=1, std::string pool_name="default" ) : cols_(Ncols), rows_(Nrows), tile_size_(Ntile), tiles_per_domain_(Ntiles_per_domain), colprocs_(Ncolprocs), rowprocs_(Nrowprocs) { using namespace hpx::compute::host; binding_helper<T>::pool_name_ = pool_name; const int CACHE_LINE_SIZE = sysconf (_SC_LEVEL1_DCACHE_LINESIZE); const int PAGE_SIZE = sysconf(_SC_PAGE_SIZE); const int ALIGNMENT = std::max(PAGE_SIZE,CACHE_LINE_SIZE); const int ELEMS_ALIGN = (ALIGNMENT/sizeof(T)); rows_page_ = ELEMS_ALIGN; leading_dim_ = ELEMS_ALIGN*((rows_*sizeof(T) + ALIGNMENT-1)/ALIGNMENT); tiles_per_domain_ = std::max(tiles_per_domain_, ELEMS_ALIGN/tile_size_); } The operator called by the allocator, which returns the domain index to bind a page to: virtual std::size_t operator ()( const T * const base_ptr, const T * const page_ptr, const std::size_t pagesize, const std::size_t domains) const override { std::size_t offset = (page_ptr - base_ptr); std::size_t col = (offset / leading_dim_); std::size_t row = (offset % leading_dim_); std::size_t index = (col / (tile_size_ * tiles_per_domain_)); if ((tile_size_*tiles_per_domain_*sizeof(T))>=pagesize) { index += (row / (tile_size_ * tiles_per_domain_)); } else { HPX_ASSERT(0); } return index % domains; } This loop is executed by each thread (one per NUMA domain); if the domain returned by the page query matches the domain ID of the thread/task, then the first memory location on the page is written to: for (size_type i=0; i<num_pages; ++i) { // we pass the base pointer and current page pointer size_type dom = helper->operator()(p, page_ptr, pagesize, nodesets.size()); if (dom==numa_domain) { // trigger a memory read and rewrite without changing contents volatile char* vaddr = (volatile char*) page_ptr; *vaddr = T(0); // 
*vaddr; } page_ptr += pageN; } All of this has been debugged quite extensively; I can write numbers to memory and read them back, and the patterns always match the expected domains. This function, called after all data has been written, attempts to verify (and display) the patterns above: int topology::get_numa_domain(const void *addr) const { #if HWLOC_API_VERSION >= 0x00010b06 hpx_hwloc_bitmap_wrapper *nodeset = topology::bitmap_storage_.get(); if (nullptr == nodeset) { hwloc_bitmap_t nodeset_ = hwloc_bitmap_alloc(); topology::bitmap_storage_.reset(new hpx_hwloc_bitmap_wrapper(nodeset_)); nodeset = topology::bitmap_storage_.get(); } // hwloc_nodeset_t ns = reinterpret_cast<hwloc_nodeset_t>(nodeset->get_bmp()); int ret = hwloc_get_area_memlocation(topo, addr, 1, ns, HWLOC_MEMBIND_BYNODESET); if (ret<0) { std::string msg(strerror(errno)); HPX_THROW_EXCEPTION(kernel_error , "hpx::threads::topology::get_numa_domain" , "hwloc_get_area_memlocation failed " + msg); return -1; } // this uses hwloc directly // int bit = hwloc_bitmap_first(ns); // return bit // this uses an alternative method, both give the same result AFAICT threads::mask_type mask = bitmap_to_mask(ns, HWLOC_OBJ_NUMANODE); return static_cast<int>(threads::find_first(mask)); #else return 0; #endif } Thanks for taking the time to look it over JB _______________________________________________ hwloc-users mailing list hwloc-users@lists.open-mpi.org https://lists.open-mpi.org/mailman/listinfo/hwloc-users