Re: [PATCH] RDMA/core: Fix error return in _ib_modify_qp()

2020-10-18 Thread Maor Gottlieb



On 10/16/2020 10:58 AM, Jing Xiangfeng wrote:

Fix to return error code PTR_ERR() from the error handling case instead of
0.

Fixes: 51aab12631dd ("RDMA/core: Get xmit slave for LAG")
Signed-off-by: Jing Xiangfeng 
---
  drivers/infiniband/core/verbs.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 307886737646..bf63c7561e8c 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1685,8 +1685,10 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
 		slave = rdma_lag_get_ah_roce_slave(qp->device,
 						   &attr->ah_attr,
 						   GFP_KERNEL);
-		if (IS_ERR(slave))
+		if (IS_ERR(slave)) {
+			ret = PTR_ERR(slave);
 			goto out_av;
+		}
 		attr->xmit_slave = slave;
 	}
 }
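For context, rdma_lag_get_ah_roce_slave() reports failure by encoding an
errno into the returned pointer, so the caller has to decode it with
PTR_ERR() rather than fall through with ret still set to 0. A minimal
sketch of the idiom (generic example with a placeholder foo_alloc(), not
the driver code):

	struct foo *obj;
	int ret = 0;

	obj = foo_alloc();		/* may return ERR_PTR(-ENOMEM) etc. */
	if (IS_ERR(obj)) {
		ret = PTR_ERR(obj);	/* propagate the encoded errno */
		goto out;		/* without this assignment, ret stays 0 */
	}
	/* ... use obj ... */
out:
	return ret;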


Looks good,
Reviewed-by: Maor Gottlieb 



Re: [PATCH rdma-next v4 1/4] lib/scatterlist: Add support in dynamic allocation of SG table from pages

2020-10-02 Thread Maor Gottlieb



On 10/2/2020 6:02 PM, Jason Gunthorpe wrote:

On Sun, Sep 27, 2020 at 09:46:44AM +0300, Leon Romanovsky wrote:

+struct scatterlist *__sg_alloc_table_from_pages(struct sg_table *sgt,
+   struct page **pages, unsigned int n_pages, unsigned int offset,
+   unsigned long size, unsigned int max_segment,
+   struct scatterlist *prv, unsigned int left_pages,
+   gfp_t gfp_mask)
  {
-   unsigned int chunks, cur_page, seg_len, i;
+   unsigned int chunks, cur_page, seg_len, i, prv_len = 0;
+   struct scatterlist *s = prv;
+   unsigned int table_size;
+   unsigned int tmp_nents;
int ret;
-   struct scatterlist *s;

if (WARN_ON(!max_segment || offset_in_page(max_segment)))
-   return -EINVAL;
+   return ERR_PTR(-EINVAL);
+   if (IS_ENABLED(CONFIG_ARCH_NO_SG_CHAIN) && prv)
+   return ERR_PTR(-EOPNOTSUPP);
+
+   tmp_nents = prv ? sgt->nents : 0;
+
+   if (prv &&
+   page_to_pfn(sg_page(prv)) + (prv->length >> PAGE_SHIFT) ==

This calculation of the end doesn't consider sg->offset


Right, should be fixed.
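A sketch of what an offset-aware end calculation could look like (an
assumed form for illustration only; the actual respin may differ):

	if (prv) {
		unsigned long end_pfn;

		/* pfn in which the previous entry ends, including its offset */
		end_pfn = (page_to_pfn(sg_page(prv)) * PAGE_SIZE +
			   prv->offset + prv->length) / PAGE_SIZE;
		if (end_pfn == page_to_pfn(pages[0]))
			prv_len = prv->length;
	}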



+   page_to_pfn(pages[0]))
+   prv_len = prv->length;

/* compute number of contiguous chunks */
chunks = 1;
@@ -410,13 +461,17 @@ int __sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages,
}
}

-   ret = sg_alloc_table(sgt, chunks, gfp_mask);
-   if (unlikely(ret))
-   return ret;
+   if (!prv) {
+   /* Only the last allocation could be less than the maximum */
+   table_size = left_pages ? SG_MAX_SINGLE_ALLOC : chunks;
+   ret = sg_alloc_table(sgt, table_size, gfp_mask);
+   if (unlikely(ret))
+   return ERR_PTR(ret);
+   }

This is basically redundant, right? Now that get_next_sg() can allocate
SGs it can just build them one by one, no need to preallocate.

Actually all the changes to the allocation seem like overkill, just
allocate a single new array directly in get_next_sg() whenever it
needs one.


No, only the last allocation can be less than the maximum (as written in
the comment).

I would prefer to stick with the current implementation and fix the offset.


Something like this:

@@ -365,6 +372,37 @@ int sg_alloc_table(struct sg_table *table, unsigned int nents, gfp_t gfp_mask)
  }
  EXPORT_SYMBOL(sg_alloc_table);
  
+static struct scatterlist *get_next_sg(struct sg_table *table,
+		struct scatterlist *cur, unsigned long needed_sges,
+		gfp_t gfp_mask)
+{
+   struct scatterlist *new_sg;
+   unsigned int alloc_size;
+
+   if (cur) {
+   struct scatterlist *next_sg = sg_next(cur);
+
+		/* Check if last entry should be kept for chaining */
+   if (!sg_is_last(next_sg) || needed_sges == 1)
+   return next_sg;
+   }
+
+   alloc_size = min_t(unsigned long, needed_sges, SG_MAX_SINGLE_ALLOC);
+   new_sg = sg_kmalloc(alloc_size, gfp_mask);
+   if (!new_sg)
+   return ERR_PTR(-ENOMEM);
+   sg_init_table(new_sg, alloc_size);
+   if (cur) {
+   __sg_chain(cur, new_sg);
+   table->orig_nents += alloc_size - 1;
+   } else {
+   table->sgl = new_sg;
+   table->orig_nents = alloc_size;
+   table->nents = 0;
+   }
+   return new_sg;
+}
+
  /**
   * __sg_alloc_table_from_pages - Allocate and initialize an sg table from
   * an array of pages
@@ -374,29 +412,64 @@ EXPORT_SYMBOL(sg_alloc_table);
   * @offset:     Offset from start of the first page to the start of a buffer
   * @size:       Number of valid bytes in the buffer (after offset)
   * @max_segment: Maximum size of a scatterlist node in bytes (page aligned)
+ * @prv:         Last populated sge in sgt
+ * @left_pages:  Pages left for the caller to set after this call
   * @gfp_mask:   GFP allocation mask
   *
- *  Description:
- *    Allocate and initialize an sg table from a list of pages. Contiguous
- *    ranges of the pages are squashed into a single scatterlist node up to the
- *    maximum size specified in @max_segment. A user may provide an offset at a
- *    start and a size of valid data in a buffer specified by the page array.
- *    The returned sg table is released by sg_free_table.
+ * Description:
+ *    If @prv is NULL, allocate and initialize an sg table from a list of pages,
+ *    else reuse the scatterlist passed in at @prv.
+ *    Contiguous ranges of the pages are squashed into a single scatterlist
+ *    entry up to the maximum size specified in @max_segment.  A user may
+ *    provide an offset at a start and a size of valid data in a buffer
+ *    specified by the page array.
   *
   * Returns:
- *   0 on success, negative error on failure
+ *   Last SGE in sgt on success, PTR_ERR() otherwise.

Re: [PATCH rdma-next v4 4/4] RDMA/umem: Move to allocate SG table from pages

2020-09-30 Thread Maor Gottlieb



On 9/30/2020 6:14 PM, Jason Gunthorpe wrote:

On Wed, Sep 30, 2020 at 06:05:15PM +0300, Maor Gottlieb wrote:

This is right only for the last iteration. E.g. in the first iteration,
if there are more pages (left_pages), we allocate SG_MAX_SINGLE_ALLOC.
We don't know how many pages from the second iteration will be squashed
into the SGE from the first iteration.

Well, it is 0 or 1 SGEs. Check if the first page is mergeable and
subtract one from the required length?

I dislike this sg_mark_end(); it is something that should be internal,
IMHO.


I can move it to __sg_alloc_table_from_pages:

 	sgt->nents = tmp_nents;
+	if (!left_pages)
+		sg_mark_end(s);
 out:
 	return s;
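With the marking done inside __sg_alloc_table_from_pages(), the caller in
patch 4 could then drop its explicit call, roughly (a sketch, not an
actual hunk from the series):

 	}
 
-	sg_mark_end(sg);
 	/* continue to DMA mapping of umem->sg_head as before */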



Jason


Re: [PATCH rdma-next v4 4/4] RDMA/umem: Move to allocate SG table from pages

2020-09-30 Thread Maor Gottlieb



On 9/30/2020 2:58 PM, Jason Gunthorpe wrote:

On Wed, Sep 30, 2020 at 02:53:58PM +0300, Maor Gottlieb wrote:

On 9/30/2020 2:45 PM, Jason Gunthorpe wrote:

On Wed, Sep 30, 2020 at 12:53:21PM +0300, Leon Romanovsky wrote:

On Tue, Sep 29, 2020 at 04:59:29PM -0300, Jason Gunthorpe wrote:

On Sun, Sep 27, 2020 at 09:46:47AM +0300, Leon Romanovsky wrote:

@@ -296,11 +223,17 @@ static struct ib_umem *__ib_umem_get(struct ib_device *device,
 			goto umem_release;
 
 		cur_base += ret * PAGE_SIZE;
-		npages   -= ret;
-
-		sg = ib_umem_add_sg_table(sg, page_list, ret,
-			dma_get_max_seg_size(device->dma_device),
-			&umem->sg_nents);
+		npages -= ret;
+		sg = __sg_alloc_table_from_pages(
+			&umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT,
+			dma_get_max_seg_size(device->dma_device), sg, npages,
+			GFP_KERNEL);
+		umem->sg_nents = umem->sg_head.nents;
+		if (IS_ERR(sg)) {
+			unpin_user_pages_dirty_lock(page_list, ret, 0);
+			ret = PTR_ERR(sg);
+			goto umem_release;
+		}
}

sg_mark_end(sg);

Does it still need the sg_mark_end?

It is preserved here for correctness; the release logic doesn't rely on
this marker, but it is better to leave it.

I mean, my read of __sg_alloc_table_from_pages() is that it already
placed it; the final __alloc_table() does it?

It marks the last allocated sge, but not the last populated sge (with a page).

Why are those different?

It looks like the last iteration calls __alloc_table() with an exact
number of sges

+   if (!prv) {
+   /* Only the last allocation could be less than the maximum */
+   table_size = left_pages ? SG_MAX_SINGLE_ALLOC : chunks;
+   ret = sg_alloc_table(sgt, table_size, gfp_mask);
+   if (unlikely(ret))
+   return ERR_PTR(ret);
+   }

Jason


This is right only for the last iteration. E.g. in the first iteration,
if there are more pages (left_pages), we allocate SG_MAX_SINGLE_ALLOC.
We don't know how many pages from the second iteration will be squashed
into the SGE from the first iteration.
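For example (hypothetical numbers): pinning 1000 pages in two calls of 512
and 488 pages, the first call sees left_pages = 488 and therefore sizes the
allocation to SG_MAX_SINGLE_ALLOC, because it cannot yet know whether the
first page of the second batch will merge into its last SGE. Only the final
call, with left_pages == 0, can size the allocation exactly to the number
of chunks.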




Re: [PATCH rdma-next v4 4/4] RDMA/umem: Move to allocate SG table from pages

2020-09-30 Thread Maor Gottlieb



On 9/30/2020 2:45 PM, Jason Gunthorpe wrote:

On Wed, Sep 30, 2020 at 12:53:21PM +0300, Leon Romanovsky wrote:

On Tue, Sep 29, 2020 at 04:59:29PM -0300, Jason Gunthorpe wrote:

On Sun, Sep 27, 2020 at 09:46:47AM +0300, Leon Romanovsky wrote:

@@ -296,11 +223,17 @@ static struct ib_umem *__ib_umem_get(struct ib_device *device,
 			goto umem_release;
 
 		cur_base += ret * PAGE_SIZE;
-		npages   -= ret;
-
-		sg = ib_umem_add_sg_table(sg, page_list, ret,
-			dma_get_max_seg_size(device->dma_device),
-			&umem->sg_nents);
+		npages -= ret;
+		sg = __sg_alloc_table_from_pages(
+			&umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT,
+			dma_get_max_seg_size(device->dma_device), sg, npages,
+			GFP_KERNEL);
+		umem->sg_nents = umem->sg_head.nents;
+		if (IS_ERR(sg)) {
+			unpin_user_pages_dirty_lock(page_list, ret, 0);
+			ret = PTR_ERR(sg);
+			goto umem_release;
+		}
}

sg_mark_end(sg);

Does it still need the sg_mark_end?

It is preserved here for correctness; the release logic doesn't rely on
this marker, but it is better to leave it.

I mean, my read of __sg_alloc_table_from_pages() is that it already
placed it; the final __alloc_table() does it?

Jason



It marks the last allocated sge, but not the last populated sge (with a page).



Re: [PATCH rdma-next 3/4] lib/scatterlist: Add support in dynamic allocation of SG table from pages

2020-09-07 Thread Maor Gottlieb



On 9/7/2020 10:29 AM, Christoph Hellwig wrote:

On Thu, Sep 03, 2020 at 03:18:52PM +0300, Leon Romanovsky wrote:

+struct sg_append {
+   struct scatterlist *prv; /* Previous entry to append */
+   unsigned int left_pages; /* Left pages to add to table */
+};

I don't really see the point in this structure.   Either pass it as
two separate arguments, or switch sg_alloc_table_append and the
internal helper to pass all arguments as a struct.


I did it to avoid having more than 8 arguments to this function; I will
change it to 9 if that is fine with you.
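In other words, the choice is roughly between the following two prototypes
(a sketch only; the function name is the one used in this version of the
series):

	/* option 1: keep the small helper struct */
	struct scatterlist *sg_alloc_table_append(struct sg_table *sgt,
			struct page **pages, unsigned int n_pages,
			unsigned int offset, unsigned long size,
			unsigned int max_segment, gfp_t gfp_mask,
			struct sg_append *append);

	/* option 2: pass the two fields directly, at the cost of 9 arguments */
	struct scatterlist *sg_alloc_table_append(struct sg_table *sgt,
			struct page **pages, unsigned int n_pages,
			unsigned int offset, unsigned long size,
			unsigned int max_segment, struct scatterlist *prv,
			unsigned int left_pages, gfp_t gfp_mask);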



+ *    A user may provide an offset at a start and a size of valid data in a buffer
+ *    specified by the page array. A user may provide @append to chain pages to

This adds a few pointless > 80 char lines.


Will fix.



+struct scatterlist *
+sg_alloc_table_append(struct sg_table *sgt, struct page **pages,
+ unsigned int n_pages, unsigned int offset,
+ unsigned long size, unsigned int max_segment,
+ gfp_t gfp_mask, struct sg_append *append)
+{
+#ifdef CONFIG_ARCH_NO_SG_CHAIN
+   if (append->left_pages)
+   return ERR_PTR(-EOPNOTSUPP);
+#endif

Which makes this API entirely useless for CONFIG_ARCH_NO_SG_CHAIN,
doesn't it?  Wouldn't it make more sense to not provide it for that
case and add an explicit dependency in the callers?


The current implementation allows us to support small memory registrations
which do not require chaining. I am not aware of which archs have SG_CHAIN
support, and I don't want to break them, so I can't add it as a dependency
in the Kconfig. Another option is to do the logic in the caller, but that
isn't clean.
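For reference, a runtime check avoids the #ifdef entirely while still
failing gracefully on chaining-incapable archs; a later revision of the
series does roughly this (modulo how the argument ends up being named):

	if (IS_ENABLED(CONFIG_ARCH_NO_SG_CHAIN) && append->left_pages)
		return ERR_PTR(-EOPNOTSUPP);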





+   return alloc_from_pages_common(sgt, pages, n_pages, offset, size,
+  max_segment, gfp_mask, append);

And if we somehow manage to sort that out we can merge
sg_alloc_table_append and alloc_from_pages_common, reducing the number
of wrappers that just make it too hard to follow the code.


+EXPORT_SYMBOL(sg_alloc_table_append);

EXPORT_SYMBOL_GPL, please.


Sure


Re: [PATCH rdma-next 2/4] lib/scatterlist: Add support in dynamically allocation of SG entries

2020-09-07 Thread Maor Gottlieb



On 9/7/2020 10:29 AM, Christoph Hellwig wrote:

+static inline void _sg_chain(struct scatterlist *chain_sg,
+struct scatterlist *sgl)
+{
+   /*
+* offset and length are unused for chain entry. Clear them.
+*/
+   chain_sg->offset = 0;
+   chain_sg->length = 0;
+
+   /*
+* Set lowest bit to indicate a link pointer, and make sure to clear
+* the termination bit if it happens to be set.
+*/
+   chain_sg->page_link = ((unsigned long) sgl | SG_CHAIN) & ~SG_END;
+}

Please call this __sg_chain to stick with our normal kernel naming
convention.


Will do.
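For context, a minimal sketch of how the renamed helper is consumed by the
dynamic allocator (mirroring the get_next_sg() excerpt earlier in this
series):

	/* chain the freshly allocated array after the current last entry;
	 * the chain entry itself consumes one slot, hence the -1
	 */
	__sg_chain(cur, new_sg);
	table->orig_nents += alloc_size - 1;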


Re: [PATCH rdma-next 1/4] lib/scatterlist: Refactor sg_alloc_table_from_pages

2020-09-07 Thread Maor Gottlieb



On 9/7/2020 10:29 AM, Christoph Hellwig wrote:

On Thu, Sep 03, 2020 at 06:54:34PM +0300, Leon Romanovsky wrote:

From: Maor Gottlieb 

Currently, sg_alloc_table_from_pages doesn't support dynamic chaining of
SG entries. Therefore it requires the user to allocate all the pages in
advance and hold them in a large buffer. Such a buffer consumes a lot of
temporary memory in HPC systems which do very large memory registrations.

The next patches introduce an API for dynamic allocation from pages, which
requires us to do the following:
  * Extract the code to alloc_from_pages_common.
  * Change the build of the table to iterate on the chunks and not on the
SGEs. This will allow dynamic allocation of more SGEs.

Since sg_alloc_table_from_pages allocates exactly the number of chunks,
the number of chunks is equal to the number of SG entries.

Given how few users __sg_alloc_table_from_pages has, what about just
switching it to your desired calling conventions without another helper?


I tried it now; it didn't save a lot. Please give me your decision and,
if needed, I will update accordingly.
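For readers following the refactor, the chunk counting that the rebuilt
loop iterates over looks roughly like this (paraphrased from
lib/scatterlist.c; a sketch rather than the exact hunk):

	/* compute number of contiguous chunks */
	chunks = 1;
	seg_len = 0;
	for (i = 1; i < n_pages; i++) {
		seg_len += PAGE_SIZE;
		if (seg_len >= max_segment ||
		    page_to_pfn(pages[i]) != page_to_pfn(pages[i - 1]) + 1) {
			chunks++;
			seg_len = 0;
		}
	}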


Re: linux-next: manual merge of the rdma tree with the net-next tree

2016-03-19 Thread Maor Gottlieb
2016-03-16 2:58 GMT+02:00 Stephen Rothwell :
> Hi all,
>
> Today's linux-next merge of the rdma tree got a conflict in:
>
>   drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
>
> between commit:
>
>   60ab4584f5bf ("net/mlx5_core: Set flow steering dest only for forward 
> rules")
>
> from the net-next tree and commit:
>
>   b3638e1a7664 ("net/mlx5_core: Introduce forward to next priority action")
>
> from the rdma tree.
>
> I fixed it up (see below) and can carry the fix as necessary (no action
> is required).
>
> --
> Cheers,
> Stephen Rothwell
>
> diff --cc drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
> index e848d708d2b7,bf3446794bd5..
> --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
> @@@ -73,10 -73,13 +73,13 @@@
>   #define BY_PASS_MIN_LEVEL (KENREL_MIN_LEVEL + MLX5_BY_PASS_NUM_PRIOS +\
>LEFTOVERS_MAX_FT)
>
>  -#define KERNEL_MAX_FT 2
>  -#define KERNEL_NUM_PRIOS 1
>  +#define KERNEL_MAX_FT 3
>  +#define KERNEL_NUM_PRIOS 2
>   #define KENREL_MIN_LEVEL 2
>
> + #define ANCHOR_MAX_FT 1
> + #define ANCHOR_NUM_PRIOS 1
> + #define ANCHOR_MIN_LEVEL (BY_PASS_MIN_LEVEL + 1)
>   struct node_caps {
> size_t  arr_sz;
> long*caps;
> @@@ -360,8 -367,13 +367,13 @@@ static void del_rule(struct fs_node *no
> memcpy(match_value, fte->val, sizeof(fte->val));
> fs_get_obj(ft, fg->node.parent);
> 	list_del(&rule->node.list);
> +	if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
> +		mutex_lock(&rule->dest_attr.ft->lock);
> +		list_del(&rule->next_ft);
> +		mutex_unlock(&rule->dest_attr.ft->lock);
> +	}
>  -  fte->dests_size--;
>  -  if (fte->dests_size) {
>  +  if ((fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) &&
>  +  --fte->dests_size) {
> err = mlx5_cmd_update_fte(dev, ft,
>   fg->id, fte);
> if (err)
> @@@ -762,9 -835,9 +835,10 @@@ static struct mlx5_flow_rule *alloc_rul
> if (!rule)
> return NULL;
>
> +	INIT_LIST_HEAD(&rule->next_ft);
> 	rule->node.type = FS_TYPE_FLOW_DEST;
>  -	memcpy(&rule->dest_attr, dest, sizeof(*dest));
>  +	if (dest)
>  +		memcpy(&rule->dest_attr, dest, sizeof(*dest));
>
> return rule;
>   }
> @@@ -783,12 -856,16 +857,17 @@@ static struct mlx5_flow_rule *add_rule_
> return ERR_PTR(-ENOMEM);
>
> fs_get_obj(ft, fg->node.parent);
> -   /* Add dest to dests list- added as first element after the head */
> +   /* Add dest to dests list- we need flow tables to be in the
> +* end of the list for forward to next prio rules.
> +*/
> 	tree_init_node(&rule->node, 1, del_rule);
> -	list_add_tail(&rule->node.list, &fte->node.children);
> +	if (dest && dest->type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
> +		list_add(&rule->node.list, &fte->node.children);
> +	else
> +		list_add_tail(&rule->node.list, &fte->node.children);
>  -  fte->dests_size++;
>  -  if (fte->dests_size == 1)
>  +  if (dest)
>  +  fte->dests_size++;
>  +  if (fte->dests_size == 1 || !dest)
> 		err = mlx5_cmd_create_fte(get_dev(&ft->node),
>   ft, fg->id, fte);
> else

Hi Stephen,

I reviewed your merge and it's fine.

Thanks,
Maor

