Hi David, Do you plan to send a v4? 17/09/2020 13:13, Burakov, Anatoly: > On 10-Aug-20 10:07 PM, David Christensen wrote: > > The SPAPR IOMMU requires that a DMA window size be defined before memory > > can be mapped for DMA. Current code dynamically modifies the DMA window > > size in response to every new memory allocation which is potentially > > dangerous because all existing mappings need to be unmapped/remapped in > > order to resize the DMA window, leaving hardware holding IOVA addresses > > that are temporarily unmapped. The new SPAPR code statically assigns > > the DMA window size on first use, using the largest physical memory > > address when IOVA=PA and the highest existing memseg virtual > > address when IOVA=VA. > > > > Signed-off-by: David Christensen <d...@linux.vnet.ibm.com> > > --- > > <snip> > > > +struct spapr_size_walk_param { > > + uint64_t max_va; > > + uint64_t page_sz; > > + int external; > > +}; > > + > > +/* > > + * In order to set the DMA window size required for the SPAPR IOMMU > > + * we need to walk the existing virtual memory allocations as well as > > + * find the hugepage size used. > > + */ > > static int > > -vfio_spapr_unmap_walk(const struct rte_memseg_list *msl, > > - const struct rte_memseg *ms, void *arg) > > +vfio_spapr_size_walk(const struct rte_memseg_list *msl, void *arg) > > { > > - int *vfio_container_fd = arg; > > + struct spapr_size_walk_param *param = arg; > > + uint64_t max = (uint64_t) msl->base_va + (uint64_t) msl->len; > > > > - /* skip external memory that isn't a heap */ > > - if (msl->external && !msl->heap) > > - return 0; > > + if (msl->external) { > > + param->external++; > > + if (!msl->heap) > > + return 0; > > + } > > It would be nice to have some comments in the code explaining what we're > skipping and why. > > Also, seems that you're using param->external as bool? This is a > non-public API so using stdbool is not an issue here, perhaps replace it > with bool param->has_external? 
> > > > - /* skip any segments with invalid IOVA addresses */ > > - if (ms->iova == RTE_BAD_IOVA) > > - return 0; > > + if (max > param->max_va) { > > + param->page_sz = msl->page_sz; > > + param->max_va = max; > > + } > > > > - return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, > > - ms->len, 0); > > + return 0; > > } > > > > -struct spapr_walk_param { > > - uint64_t window_size; > > - uint64_t hugepage_sz; > > -}; > > - > > +/* > > + * The SPAPRv2 IOMMU supports 2 DMA windows with starting > > + * address at 0 or 1<<59. By default, a DMA window is set > > + * at address 0, 2GB long, with a 4KB page. For DPDK we > > + * must remove the default window and setup a new DMA window > > + * based on the hugepage size and memory requirements of > > + * the application before we can map memory for DMA. > > + */ > > static int > > -vfio_spapr_window_size_walk(const struct rte_memseg_list *msl, > > - const struct rte_memseg *ms, void *arg) > > +spapr_dma_win_size(void) > > { > > - struct spapr_walk_param *param = arg; > > - uint64_t max = ms->iova + ms->len; > > + struct spapr_size_walk_param param; > > > > - /* skip external memory that isn't a heap */ > > - if (msl->external && !msl->heap) > > + /* only create DMA window once */ > > + if (spapr_dma_win_len > 0) > > return 0; > > > > - /* skip any segments with invalid IOVA addresses */ > > - if (ms->iova == RTE_BAD_IOVA) > > - return 0; > > + /* walk the memseg list to find the page size/max VA address */ > > + memset(&param, 0, sizeof(param)); > > + if (rte_memseg_list_walk(vfio_spapr_size_walk, &param) < 0) { > > + RTE_LOG(ERR, EAL, "Failed to walk memseg list for DMA " > > + "window size\n"); > > + return -1; > > + } > > + > > + /* We can't be sure if DMA window covers external memory */ > > + if (param.external > 0) > > + RTE_LOG(WARNING, EAL, "Detected external memory which may " > > + "not be managed by the IOMMU\n"); > > + > > + /* find the maximum IOVA address for setting the DMA window size */ > > 
+ if (rte_eal_iova_mode() == RTE_IOVA_PA) { > > + static const char proc_iomem[] = "/proc/iomem"; > > + static const char str_sysram[] = "System RAM"; > > + uint64_t start, end, max = 0; > > + char *line = NULL; > > + char *dash, *space; > > + size_t line_len; > > + > > + /* > > + * Example "System RAM" in /proc/iomem: > > + * 00000000-1fffffffff : System RAM > > + * 200000000000-201fffffffff : System RAM > > + */ > > + FILE *fd = fopen(proc_iomem, "r"); > > + if (fd == NULL) { > > + RTE_LOG(ERR, EAL, "Cannot open %s\n", proc_iomem); > > + return -1; > > + } > > + /* Scan /proc/iomem for the highest PA in the system */ > > + while (getline(&line, &line_len, fd) != -1) { > > + if (strstr(line, str_sysram) == NULL) > > + continue; > > + > > + space = strstr(line, " "); > > + dash = strstr(line, "-"); > > + > > + /* Validate the format of the memory string */ > > + if (space == NULL || dash == NULL || space < dash) { > > + RTE_LOG(ERR, EAL, "Can't parse line \"%s\" in " > > + "file %s\n", line, proc_iomem); > > + continue; > > + } > > + > > + start = strtoull(line, NULL, 16); > > + end = strtoull(dash + 1, NULL, 16); > > + RTE_LOG(DEBUG, EAL, "Found system RAM from 0x%" > > + PRIx64 " to 0x%" PRIx64 "\n", start, end); > > + if (end > max) > > + max = end; > > + } > > + free(line); > > + fclose(fd); > > I would've put all of this file reading business into a separate > function, as otherwise it's a bit hard to follow the mix of file ops and > using the results. Something like > > value = get_value_from_iomem(); > if (value > ...) > ... 
> > is much easier on the eyes :) > > > > > - if (max > param->window_size) { > > - param->hugepage_sz = ms->hugepage_sz; > > - param->window_size = max; > > + if (max == 0) { > > + RTE_LOG(ERR, EAL, "Failed to find valid \"System RAM\" " > > + "entry in file %s\n", proc_iomem); > > + return -1; > > + } > > + > > + spapr_dma_win_len = rte_align64pow2(max + 1); > > + RTE_LOG(DEBUG, EAL, "Setting DMA window size to 0x%" > > + PRIx64 "\n", spapr_dma_win_len); > > + } else if (rte_eal_iova_mode() == RTE_IOVA_VA) { > > + RTE_LOG(DEBUG, EAL, "Highest VA address in memseg list is 0x%" > > + PRIx64 "\n", param.max_va); > > + spapr_dma_win_len = rte_align64pow2(param.max_va); > > + RTE_LOG(DEBUG, EAL, "Setting DMA window size to 0x%" > > + PRIx64 "\n", spapr_dma_win_len); > > + } else { > > + RTE_LOG(ERR, EAL, "Unsupported IOVA mode\n"); > > + return -1; > > } > > > > + spapr_dma_win_page_sz = param.page_sz; > > + rte_mem_set_dma_mask(__builtin_ctzll(spapr_dma_win_len)); > > return 0; > > } > > > > static int > > -vfio_spapr_create_new_dma_window(int vfio_container_fd, > > - struct vfio_iommu_spapr_tce_create *create) { > > +vfio_spapr_create_dma_window(int vfio_container_fd) > > +{ > > + struct vfio_iommu_spapr_tce_create create = { > > + .argsz = sizeof(create), }; > > struct vfio_iommu_spapr_tce_remove remove = { > > - .argsz = sizeof(remove), > > - }; > > + .argsz = sizeof(remove), }; > > struct vfio_iommu_spapr_tce_info info = { > > - .argsz = sizeof(info), > > - }; > > + .argsz = sizeof(info), }; > > int ret; > > > > - /* query spapr iommu info */ > > + ret = spapr_dma_win_size(); > > + if (ret < 0) > > + return ret; > > + > > ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); > > if (ret) { > > - RTE_LOG(ERR, EAL, " cannot get iommu info, " > > - "error %i (%s)\n", errno, strerror(errno)); > > Here and in other similar places, no need to split strings into multiline. 
> > Overall, since these changes are confined to PPC64 I can't really test > these, but with the above changes: > > Reviewed-by: Anatoly Burakov <anatoly.bura...@intel.com> > >