[PATCH net-next, 6/6] net/mlx5e: Add mlx5e HV VHCA stats agent

2019-08-14 Thread Haiyang Zhang
From: Eran Ben Elisha 

HV VHCA stats agent is responsible for running a periodic rx/tx
packets/bytes stats update. Currently the supported format is version
MLX5_HV_VHCA_STATS_VERSION. Block ID 1 is dedicated for statistics data
transfer from the VF to the PF.

The reporter fetches the statistics data from all opened channels, fills it
into a buffer and sends it to mlx5_hv_vhca_write_agent.

As the stats layer should include some metadata per block (sequence and
offset), the HV VHCA layer shall modify the buffer before actually sending it
over block 1.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  13 ++
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 6 files changed, 205 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index e0a1056..1e8ade9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -36,6 +36,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o 
en/port_buffer.o
 mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o en/tc_tun.o 
lib/port_tun.o lag_mp.o \
lib/geneve.o en/tc_tun_vxlan.o 
en/tc_tun_gre.o \
en/tc_tun_geneve.o
+mlx5_core-$(CONFIG_PCI_HYPERV_MINI)  += en/hv_vhca_stats.o
 
 #
 # Core extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 8fc5107..fc41653 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -54,6 +54,7 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 #include "en/fs.h"
+#include "lib/hv_vhca.h"
 
 extern const struct net_device_ops mlx5e_netdev_ops;
 struct page_pool;
@@ -777,6 +778,15 @@ struct mlx5e_modify_sq_param {
int rl_index;
 };
 
+#if IS_ENABLED(CONFIG_PCI_HYPERV_MINI)
+struct mlx5e_hv_vhca_stats_agent {
+   struct mlx5_hv_vhca_agent *agent;
+   struct delayed_work        work;
+   u16                        delay;
+   void  *buf;
+};
+#endif
+
 struct mlx5e_xsk {
/* UMEMs are stored separately from channels, because we don't want to
 * lose them when channels are recreated. The kernel also stores UMEMs,
@@ -848,6 +858,9 @@ struct mlx5e_priv {
struct devlink_health_reporter *tx_reporter;
struct devlink_health_reporter *rx_reporter;
struct mlx5e_xsk   xsk;
+#if IS_ENABLED(CONFIG_PCI_HYPERV_MINI)
+   struct mlx5e_hv_vhca_stats_agent stats_agent;
+#endif
 };
 
 struct mlx5e_profile {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
new file mode 100644
index 0000000..c37b4ac
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include "en.h"
+#include "en/hv_vhca_stats.h"
+#include "lib/hv_vhca.h"
+#include "lib/hv.h"
+
+struct mlx5e_hv_vhca_per_ring_stats {
+   u64 rx_packets;
+   u64 rx_bytes;
+   u64 tx_packets;
+   u64 tx_bytes;
+};
+
+static void
+mlx5e_hv_vhca_fill_ring_stats(struct mlx5e_priv *priv, int ch,
+ struct mlx5e_hv_vhca_per_ring_stats *data)
+{
+   struct mlx5e_channel_stats *stats;
+   int tc;
+
+   stats = &priv->channel_stats[ch];
+   data->rx_packets = stats->rq.packets;
+   data->rx_bytes   = stats->rq.bytes;
+
+   for (tc = 0; tc < priv->max_opened_tc; tc++) {
+   data->tx_packets += stats->sq[tc].packets;
+   data->tx_bytes   += stats->sq[tc].bytes;
+   }
+}
+
+static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, u64 *data,
+int buf_len)
+{
+   int ch, i = 0;
+
+   for (ch = 0; ch < priv->max_nch; ch++) {
+   u64 *buf = data + i;
+
+   if (WARN_ON_ONCE(buf +
+sizeof(struct mlx5e_hv_vhca_per_ring_stats) >
+data + buf_len))
+   return;
+
+   mlx5e_hv_vhca_fill_ring_stats(priv, ch,
+ (struct 
mlx5e_hv_vhca_per_ring_stats *)buf);
+   i += sizeof(struct mlx5e_hv_vhca_per_ring_stats) / sizeof(u64);
+   }
+}
+
+static int 

[PATCH net-next, 2/6] PCI: hv: Add a Hyper-V PCI mini driver for software backchannel interface

2019-08-14 Thread Haiyang Zhang
This mini driver is a helper driver that allows other drivers to
have a common interface with the Hyper-V PCI frontend driver.

Signed-off-by: Haiyang Zhang 
Signed-off-by: Saeed Mahameed 
---
 MAINTAINERS  |  1 +
 drivers/pci/Kconfig  |  1 +
 drivers/pci/controller/Kconfig   |  7 
 drivers/pci/controller/Makefile  |  1 +
 drivers/pci/controller/pci-hyperv-mini.c | 70 
 drivers/pci/controller/pci-hyperv.c  | 12 --
 include/linux/hyperv.h   | 30 ++
 7 files changed, 111 insertions(+), 11 deletions(-)
 create mode 100644 drivers/pci/controller/pci-hyperv-mini.c

diff --git a/MAINTAINERS b/MAINTAINERS
index e352550..c4962b9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7453,6 +7453,7 @@ F:drivers/hid/hid-hyperv.c
 F: drivers/hv/
 F: drivers/input/serio/hyperv-keyboard.c
 F: drivers/pci/controller/pci-hyperv.c
+F: drivers/pci/controller/pci-hyperv-mini.c
 F: drivers/net/hyperv/
 F: drivers/scsi/storvsc_drv.c
 F: drivers/uio/uio_hv_generic.c
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 2ab9240..bb852f5 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -182,6 +182,7 @@ config PCI_LABEL
 config PCI_HYPERV
 tristate "Hyper-V PCI Frontend"
 depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+   select PCI_HYPERV_MINI
 help
   The PCI device frontend driver allows the kernel to import arbitrary
   PCI devices from a PCI backend to support PCI driver domains.
diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig
index fe9f9f1..8e31cba 100644
--- a/drivers/pci/controller/Kconfig
+++ b/drivers/pci/controller/Kconfig
@@ -281,5 +281,12 @@ config VMD
  To compile this driver as a module, choose M here: the
  module will be called vmd.
 
+config PCI_HYPERV_MINI
+   tristate "Hyper-V PCI Mini"
+   depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+   help
+ The Hyper-V PCI Mini is a helper driver allows other drivers to
+ have a common interface with the Hyper-V PCI frontend driver.
+
 source "drivers/pci/controller/dwc/Kconfig"
 endmenu
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index d56a507..77e0132 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host.o
 obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep.o
 obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
 obj-$(CONFIG_PCI_HYPERV) += pci-hyperv.o
+obj-$(CONFIG_PCI_HYPERV_MINI) += pci-hyperv-mini.o
 obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o
 obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o
 obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o
diff --git a/drivers/pci/controller/pci-hyperv-mini.c 
b/drivers/pci/controller/pci-hyperv-mini.c
new file mode 100644
index 0000000..9b6cd1c
--- /dev/null
+++ b/drivers/pci/controller/pci-hyperv-mini.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Author:
+ *   Haiyang Zhang 
+ *
+ * This mini driver is a helper driver allows other drivers to
+ * have a common interface with the Hyper-V PCI frontend driver.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+
+struct hyperv_pci_block_ops hvpci_block_ops;
+EXPORT_SYMBOL(hvpci_block_ops);
+
+int hyperv_read_cfg_blk(struct pci_dev *dev, void *buf, unsigned int buf_len,
+   unsigned int block_id, unsigned int *bytes_returned)
+{
+   if (!hvpci_block_ops.read_block)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.read_block(dev, buf, buf_len, block_id,
+ bytes_returned);
+}
+EXPORT_SYMBOL(hyperv_read_cfg_blk);
+
+int hyperv_write_cfg_blk(struct pci_dev *dev, void *buf, unsigned int len,
+unsigned int block_id)
+{
+   if (!hvpci_block_ops.write_block)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.write_block(dev, buf, len, block_id);
+}
+EXPORT_SYMBOL(hyperv_write_cfg_blk);
+
+int hyperv_reg_block_invalidate(struct pci_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask))
+{
+   if (!hvpci_block_ops.reg_blk_invalidate)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.reg_blk_invalidate(dev, context,
+ block_invalidate);
+}
+EXPORT_SYMBOL(hyperv_reg_block_invalidate);
+
+static void __exit exit_hv_pci_mini(void)
+{
+   pr_info("unloaded\n");
+}
+
+static int __init init_hv_pci_mini(void)
+{
+   pr_info("loa

[PATCH net-next, 1/6] PCI: hv: Add a paravirtual backchannel in software

2019-08-14 Thread Haiyang Zhang
From: Dexuan Cui 

Windows SR-IOV provides a backchannel mechanism in software for communication
between a VF driver and a PF driver.  These "configuration blocks" are
similar in concept to PCI configuration space, but instead of doing reads and
writes in 32-bit chunks through a very slow path, packets of up to 128 bytes
can be sent or received asynchronously.

Nearly every SR-IOV device contains just such a communications channel in
hardware, so using this one in software is usually optional.  Using the
software channel, however, allows driver implementers to leverage software
tools that fuzz the communications channel looking for vulnerabilities.

The usage model for these packets puts the responsibility for reading or
writing on the VF driver.  The VF driver sends a read or a write packet,
indicating which "block" is being referred to by number.

If the PF driver wishes to initiate communication, it can "invalidate" one or
more of the first 64 blocks.  This invalidation is delivered via a callback
supplied by the VF driver to this driver.

No protocol is implied, except that supplied by the PF and VF drivers.

Signed-off-by: Jake Oshins 
Signed-off-by: Dexuan Cui 
Cc: Haiyang Zhang 
Cc: K. Y. Srinivasan 
Cc: Stephen Hemminger 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 drivers/pci/controller/pci-hyperv.c | 302 
 include/linux/hyperv.h  |  15 ++
 2 files changed, 317 insertions(+)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 40b6254..57adeca 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -365,6 +365,39 @@ struct pci_delete_interrupt {
struct tran_int_desc int_desc;
 } __packed;
 
+/*
+ * Note: the VM must pass a valid block id, wslot and bytes_requested.
+ */
+struct pci_read_block {
+   struct pci_message message_type;
+   u32 block_id;
+   union win_slot_encoding wslot;
+   u32 bytes_requested;
+} __packed;
+
+struct pci_read_block_response {
+   struct vmpacket_descriptor hdr;
+   u32 status;
+   u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+/*
+ * Note: the VM must pass a valid block id, wslot and byte_count.
+ */
+struct pci_write_block {
+   struct pci_message message_type;
+   u32 block_id;
+   union win_slot_encoding wslot;
+   u32 byte_count;
+   u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+struct pci_dev_inval_block {
+   struct pci_incoming_message incoming;
+   union win_slot_encoding wslot;
+   u64 block_mask;
+} __packed;
+
 struct pci_dev_incoming {
struct pci_incoming_message incoming;
union win_slot_encoding wslot;
@@ -499,6 +532,9 @@ struct hv_pci_dev {
struct hv_pcibus_device *hbus;
struct work_struct wrk;
 
+   void (*block_invalidate)(void *context, u64 block_mask);
+   void *invalidate_context;
+
/*
 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
 * read it back, for each of the BAR offsets within config space.
@@ -817,6 +853,256 @@ static int hv_pcifront_write_config(struct pci_bus *bus, 
unsigned int devfn,
.write = hv_pcifront_write_config,
 };
 
+/*
+ * Paravirtual backchannel
+ *
+ * Hyper-V SR-IOV provides a backchannel mechanism in software for
+ * communication between a VF driver and a PF driver.  These
+ * "configuration blocks" are similar in concept to PCI configuration space,
+ * but instead of doing reads and writes in 32-bit chunks through a very slow
+ * path, packets of up to 128 bytes can be sent or received asynchronously.
+ *
+ * Nearly every SR-IOV device contains just such a communications channel in
+ * hardware, so using this one in software is usually optional.  Using the
+ * software channel, however, allows driver implementers to leverage software
+ * tools that fuzz the communications channel looking for vulnerabilities.
+ *
+ * The usage model for these packets puts the responsibility for reading or
+ * writing on the VF driver.  The VF driver sends a read or a write packet,
+ * indicating which "block" is being referred to by number.
+ *
+ * If the PF driver wishes to initiate communication, it can "invalidate" one 
or
+ * more of the first 64 blocks.  This invalidation is delivered via a callback
+ * supplied by the VF driver by this driver.
+ *
+ * No protocol is implied, except that supplied by the PF and VF drivers.
+ */
+
+struct hv_read_config_compl {
+   struct hv_pci_compl comp_pkt;
+   void *buf;
+   unsigned int len;
+   unsigned int bytes_returned;
+};
+
+/**
+ * hv_pci_read_config_compl() - Invoked when a response packet
+ * for a read config block operation arrives.
+ * @context:   Identifies the read config operation
+ * @resp:  The response packet itself
+ * @resp_packet_size:  Size in bytes of the res

[PATCH net-next, 3/6] net/mlx5: Add wrappers for HyperV PCIe operations

2019-08-14 Thread Haiyang Zhang
From: Eran Ben Elisha 

Add wrapper functions for HyperV PCIe read / write /
block_invalidate_register operations.  This will be used as an
infrastructure in the downstream patch for software communication.

This will be enabled by default if CONFIG_PCI_HYPERV_MINI is set.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c | 64 
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h | 22 
 3 files changed, 87 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8b7edaa..a8950b1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,6 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o 
eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)  += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)  += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
+mlx5_core-$(CONFIG_PCI_HYPERV_MINI) += lib/hv.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
new file mode 100644
index 0000000..cf08d02
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include 
+#include "mlx5_core.h"
+#include "lib/hv.h"
+
+static int mlx5_hv_config_common(struct mlx5_core_dev *dev, void *buf, int len,
+int offset, bool read)
+{
+   int rc = -EOPNOTSUPP;
+   int bytes_returned;
+   int block_id;
+
+   if (offset % HV_CONFIG_BLOCK_SIZE_MAX || len % HV_CONFIG_BLOCK_SIZE_MAX)
+   return -EINVAL;
+
+   block_id = offset / HV_CONFIG_BLOCK_SIZE_MAX;
+
+   rc = read ?
+hyperv_read_cfg_blk(dev->pdev, buf,
+HV_CONFIG_BLOCK_SIZE_MAX, block_id,
+&bytes_returned) :
+hyperv_write_cfg_blk(dev->pdev, buf,
+ HV_CONFIG_BLOCK_SIZE_MAX, block_id);
+
+   /* Make sure len bytes were read successfully  */
+   if (read)
+   rc |= !(len == bytes_returned);
+
+   if (rc) {
+   mlx5_core_err(dev, "Failed to %s hv config, err = %d, len = %d, 
offset = %d\n",
+ read ? "read" : "write", rc, len,
+ offset);
+   return rc;
+   }
+
+   return 0;
+}
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+   int offset)
+{
+   return mlx5_hv_config_common(dev, buf, len, offset, true);
+}
+
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+int offset)
+{
+   return mlx5_hv_config_common(dev, buf, len, offset, false);
+}
+
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask))
+{
+   return hyperv_reg_block_invalidate(dev->pdev, context,
+  block_invalidate);
+}
+
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev)
+{
+   hyperv_reg_block_invalidate(dev->pdev, NULL, NULL);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
new file mode 100644
index 0000000..7f69771
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __LIB_HV_H__
+#define __LIB_HV_H__
+
+#if IS_ENABLED(CONFIG_PCI_HYPERV_MINI)
+
+#include 
+#include 
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+   int offset);
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+int offset);
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask));
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev);
+#endif
+
+#endif /* __LIB_HV_H__ */
-- 
1.8.3.1



[PATCH net-next, 4/6] net/mlx5: Add HV VHCA infrastructure

2019-08-14 Thread Haiyang Zhang
From: Eran Ben Elisha 

HV VHCA is a layer which provides a PF to VF communication channel based on
the HyperV PCI config channel. It implements Mellanox's Inter VHCA control
communication protocol. The protocol contains a control block in order to
pass messages between the PF and VF drivers, and data blocks in order to
pass actual data.

The infrastructure is agent based. Each agent will be responsible for
contiguous buffer blocks in the VHCA config space. This infrastructure will
bind agents to their blocks, and those agents can only read/write
the buffer blocks assigned to them. Each agent will provide three
callbacks (control, invalidate, cleanup). Control will be invoked when
block-0 is invalidated with a command that concerns this agent. Invalidate
callback will be invoked if one of the blocks assigned to this agent was
invalidated. Cleanup will be invoked before the agent is being freed in
order to clean all of its open resources or deferred works.

Block-0 serves as the control block. All execution commands from the PF
will be written by the PF over this block. VF will ack on those by
writing on block-0 as well. Its format is described by struct
mlx5_hv_vhca_control_block layout.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 247 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 102 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   7 +
 include/linux/mlx5/driver.h|   2 +
 5 files changed, 359 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index a8950b1..e0a1056 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,7 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o 
eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)  += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)  += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
-mlx5_core-$(CONFIG_PCI_HYPERV_MINI) += lib/hv.o
+mlx5_core-$(CONFIG_PCI_HYPERV_MINI)+= lib/hv.o lib/hv_vhca.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
new file mode 100644
index 0000000..b2eebdf
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include 
+#include "mlx5_core.h"
+#include "lib/hv.h"
+#include "lib/hv_vhca.h"
+
+struct mlx5_hv_vhca {
+   struct mlx5_core_dev   *dev;
+   struct workqueue_struct    *work_queue;
+   struct mlx5_hv_vhca_agent  *agents[MLX5_HV_VHCA_AGENT_MAX];
+   struct mutex                agents_lock; /* Protect agents array */
+};
+
+struct mlx5_hv_vhca_work {
+   struct work_struct invalidate_work;
+   struct mlx5_hv_vhca   *hv_vhca;
+   u64                    block_mask;
+};
+
+struct mlx5_hv_vhca_data_block {
+   u16 sequence;
+   u16 offset;
+   u8  reserved[4];
+   u64 data[15];
+};
+
+struct mlx5_hv_vhca_agent {
+   enum mlx5_hv_vhca_agent_type type;
+   struct mlx5_hv_vhca *hv_vhca;
+   void*priv;
+   int  seq;
+   void (*control)(struct mlx5_hv_vhca_agent *agent,
+   struct mlx5_hv_vhca_control_block *block);
+   void (*invalidate)(struct mlx5_hv_vhca_agent *agent,
+  u64 block_mask);
+   void (*cleanup)(struct mlx5_hv_vhca_agent *agent);
+};
+
+struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
+{
+   struct mlx5_hv_vhca *hv_vhca = NULL;
+
+   hv_vhca = kzalloc(sizeof(*hv_vhca), GFP_KERNEL);
+   if (!hv_vhca)
+   return ERR_PTR(-ENOMEM);
+
+   hv_vhca->work_queue = create_singlethread_workqueue("mlx5_hv_vhca");
+   if (!hv_vhca->work_queue) {
+   kfree(hv_vhca);
+   return ERR_PTR(-ENOMEM);
+   }
+
+   hv_vhca->dev = dev;
+   mutex_init(&hv_vhca->agents_lock);
+
+   return hv_vhca;
+}
+
+void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
+{
+   if (IS_ERR_OR_NULL(hv_vhca))
+   return;
+
+   flush_workqueue(hv_vhca->work_queue);
+   destroy_workqueue(hv_vhca->work_queue);
+   kfree(hv_vhca);
+}
+
+static void mlx5_hv_vhca_invalidate_work(struct work_struct *work)
+{
+   struct mlx5_hv_vhca_work *hwork;
+   struct mlx5_hv_vhca *hv_vhca;
+   int i;
+
+   hwork = container_of(work, struct mlx5_hv_vhca_work, 

[PATCH net-next, 0/6] Add software backchannel and mlx5e HV VHCA stats

2019-08-14 Thread Haiyang Zhang
This patch set adds paravirtual backchannel in software in pci_hyperv,
which is required by the mlx5e driver HV VHCA stats agent. 

The stats agent is responsible for running a periodic rx/tx packets/bytes
stats update.

Dexuan Cui (1):
  PCI: hv: Add a paravirtual backchannel in software

Eran Ben Elisha (4):
  net/mlx5: Add wrappers for HyperV PCIe operations
  net/mlx5: Add HV VHCA infrastructure
  net/mlx5: Add HV VHCA control agent
  net/mlx5e: Add mlx5e HV VHCA stats agent

Haiyang Zhang (1):
  PCI: hv: Add a Hyper-V PCI mini driver for software backchannel
interface

 MAINTAINERS|   1 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  13 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c   |  64 
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h   |  22 ++
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 365 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 104 ++
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   7 +
 drivers/pci/Kconfig|   1 +
 drivers/pci/controller/Kconfig |   7 +
 drivers/pci/controller/Makefile|   1 +
 drivers/pci/controller/pci-hyperv-mini.c   |  70 
 drivers/pci/controller/pci-hyperv.c| 308 +
 include/linux/hyperv.h |  29 ++
 include/linux/mlx5/driver.h|   2 +
 18 files changed, 1186 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
 create mode 100644 drivers/pci/controller/pci-hyperv-mini.c

-- 
1.8.3.1



[PATCH v5,1/2] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-14 Thread Haiyang Zhang
Currently in Azure cloud, for passthrough devices, the host sets the device
instance ID's bytes 8 - 15 to a value derived from the host HWID, which is
the same on all devices in a VM. So, the device instance ID's bytes 8 and 9
provided by the host are no longer unique. This affects all Azure hosts
since last year, and can cause device passthrough to VMs to fail because
the bytes 8 and 9 are used as PCI domain number. Collision of domain
numbers will cause the second device with the same domain number to fail to
load.

In the cases of collision, we will detect and find another number that is
not in use.

Suggested-by: Michael Kelley 
Signed-off-by: Haiyang Zhang 
Acked-by: Sasha Levin 
---
 drivers/pci/controller/pci-hyperv.c | 92 +++--
 1 file changed, 79 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 40b6254..31b8fd5 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -2510,6 +2510,48 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
complete(&hbus->remove_event);
 }
 
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
+
+/*
+ * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
+ * as invalid for passthrough PCI devices of this driver.
+ */
+#define HVPCI_DOM_INVALID 0
+
+/**
+ * hv_get_dom_num() - Get a valid PCI domain number
+ * Check if the PCI domain number is in use, and return another number if
+ * it is in use.
+ *
+ * @dom: Requested domain number
+ *
+ * return: domain number on success, HVPCI_DOM_INVALID on failure
+ */
+static u16 hv_get_dom_num(u16 dom)
+{
+   unsigned int i;
+
+   if (test_and_set_bit(dom, hvpci_dom_map) == 0)
+   return dom;
+
+   for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
+   if (test_and_set_bit(i, hvpci_dom_map) == 0)
+   return i;
+   }
+
+   return HVPCI_DOM_INVALID;
+}
+
+/**
+ * hv_put_dom_num() - Mark the PCI domain number as free
+ * @dom: Domain number to be freed
+ */
+static void hv_put_dom_num(u16 dom)
+{
+   clear_bit(dom, hvpci_dom_map);
+}
+
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:  VMBus's tracking struct for this root PCI bus
@@ -2521,6 +2563,7 @@ static int hv_pci_probe(struct hv_device *hdev,
const struct hv_vmbus_device_id *dev_id)
 {
struct hv_pcibus_device *hbus;
+   u16 dom_req, dom;
int ret;
 
/*
@@ -2535,19 +2578,34 @@ static int hv_pci_probe(struct hv_device *hdev,
hbus->state = hv_pcibus_init;
 
/*
-* The PCI bus "domain" is what is called "segment" in ACPI and
-* other specs.  Pull it from the instance ID, to get something
-* unique.  Bytes 8 and 9 are what is used in Windows guests, so
-* do the same thing for consistency.  Note that, since this code
-* only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
-* that (1) the only domain in use for something that looks like
-* a physical PCI bus (which is actually emulated by the
-* hypervisor) is domain 0 and (2) there will be no overlap
-* between domains derived from these instance IDs in the same
-* VM.
+* The PCI bus "domain" is what is called "segment" in ACPI and other
+* specs. Pull it from the instance ID, to get something usually
+* unique. In rare cases of collision, we will find out another number
+* not in use.
+*
+* Note that, since this code only runs in a Hyper-V VM, Hyper-V
+* together with this guest driver can guarantee that (1) The only
+* domain used by Gen1 VMs for something that looks like a physical
+* PCI bus (which is actually emulated by the hypervisor) is domain 0.
+* (2) There will be no overlap between domains (after fixing possible
+* collisions) in the same VM.
 */
-   hbus->sysdata.domain = hdev->dev_instance.b[9] |
-  hdev->dev_instance.b[8] << 8;
+   dom_req = hdev->dev_instance.b[8] << 8 | hdev->dev_instance.b[9];
+   dom = hv_get_dom_num(dom_req);
+
+   if (dom == HVPCI_DOM_INVALID) {
+   dev_err(&hdev->device,
+   "Unable to use dom# 0x%hx or other numbers", dom_req);
+   ret = -EINVAL;
+   goto free_bus;
+   }
+
+   if (dom != dom_req)
+   dev_info(&hdev->device,
+"PCI dom# 0x%hx has collision, using 0x%hx",
+dom_req, dom);
+
+   hbus->sysdata.domain = dom;
 
hbus->hdev = hdev;
refcount_set(&hbus->remove_lock, 1);
@@ -2562,7 +2620,7 @@ static int hv_pci_probe(str

[PATCH v5,2/2] PCI: hv: Use bytes 4 and 5 from instance ID as the PCI domain numbers

2019-08-14 Thread Haiyang Zhang
As recommended by Azure host team, the bytes 4, 5 have more uniqueness
(info entropy) than bytes 8, 9. So now we use bytes 4, 5 as the PCI domain
numbers. On older hosts, bytes 4, 5 can also be used -- no backward
compatibility issues here. The chance of collision is greatly reduced.

In the rare cases of collision, the driver code detects and finds another
number that is not in use.

Suggested-by: Michael Kelley 
Signed-off-by: Haiyang Zhang 
Acked-by: Sasha Levin 
---
 drivers/pci/controller/pci-hyperv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 31b8fd5..4f3d97e 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -2590,7 +2590,7 @@ static int hv_pci_probe(struct hv_device *hdev,
 * (2) There will be no overlap between domains (after fixing possible
 * collisions) in the same VM.
 */
-   dom_req = hdev->dev_instance.b[8] << 8 | hdev->dev_instance.b[9];
+   dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
dom = hv_get_dom_num(dom_req);
 
if (dom == HVPCI_DOM_INVALID) {
-- 
1.8.3.1



RE: [PATCH v4,1/2] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-14 Thread Haiyang Zhang



> -----Original Message-----
> From: Bjorn Helgaas 
> Sent: Wednesday, August 14, 2019 12:34 AM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; lorenzo.pieral...@arm.com; linux-
> hyp...@vger.kernel.org; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; vkuznets ; linux-
> ker...@vger.kernel.org
> Subject: Re: [PATCH v4,1/2] PCI: hv: Detect and fix Hyper-V PCI domain
> number collision
> 
> Thanks for splitting these; I think that makes more sense.
> 
> On Wed, Aug 14, 2019 at 12:38:54AM +0000, Haiyang Zhang wrote:
> > Currently in Azure cloud, for passthrough devices including GPU, the host
> > sets the device instance ID's bytes 8 - 15 to a value derived from the host
> > HWID, which is the same on all devices in a VM. So, the device instance
> > ID's bytes 8 and 9 provided by the host are no longer unique. This can
> > cause device passthrough to VMs to fail because the bytes 8 and 9 are used
> > as PCI domain number. Collision of domain numbers will cause the second
> > device with the same domain number to fail to load.
> 
> I think this patch is fine.  I could be misunderstanding the commit
> log, but when you say "the ID bytes 8 and 9 are *no longer* unique",
> that suggests that they *used* to be unique but stopped being unique
> at some point, which of course raises the question of *when* they
> became non-unique.
> 
> The specific information about that point would be useful to have in
> the commit log, e.g., is this related to a specific version of Azure,
> a configuration change, etc?
The host side change happened last year, rolled out to all azure hosts.
I will put "all current azure hosts" in the commit log.

> Does this problem affect GPUs more than other passthrough devices?  If
> all passthrough devices are affected, why mention GPUs in particular?
> I can't tell whether that information is relevant or superfluous.

We found this issue initially on multiple passthrough GPUs, I mentioned this
just as an example. I will remove this word, because any PCI devices may
be affected.

Thanks,
- Haiyang


[PATCH v4,1/2] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-13 Thread Haiyang Zhang
Currently in Azure cloud, for passthrough devices including GPU, the host
sets the device instance ID's bytes 8 - 15 to a value derived from the host
HWID, which is the same on all devices in a VM. So, the device instance
ID's bytes 8 and 9 provided by the host are no longer unique. This can
cause device passthrough to VMs to fail because the bytes 8 and 9 are used
as PCI domain number. Collision of domain numbers will cause the second
device with the same domain number to fail to load.

In the cases of collision, we will detect and find another number that is
not in use.

Suggested-by: Michael Kelley 
Signed-off-by: Haiyang Zhang 
Acked-by: Sasha Levin 
---
 drivers/pci/controller/pci-hyperv.c | 92 +++--
 1 file changed, 79 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 40b6254..31b8fd5 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -2510,6 +2510,48 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
complete(&hbus->remove_event);
 }
 
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
+
+/*
+ * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
+ * as invalid for passthrough PCI devices of this driver.
+ */
+#define HVPCI_DOM_INVALID 0
+
+/**
+ * hv_get_dom_num() - Get a valid PCI domain number
+ * Check if the PCI domain number is in use, and return another number if
+ * it is in use.
+ *
+ * @dom: Requested domain number
+ *
+ * return: domain number on success, HVPCI_DOM_INVALID on failure
+ */
+static u16 hv_get_dom_num(u16 dom)
+{
+   unsigned int i;
+
+   if (test_and_set_bit(dom, hvpci_dom_map) == 0)
+   return dom;
+
+   for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
+   if (test_and_set_bit(i, hvpci_dom_map) == 0)
+   return i;
+   }
+
+   return HVPCI_DOM_INVALID;
+}
+
+/**
+ * hv_put_dom_num() - Mark the PCI domain number as free
+ * @dom: Domain number to be freed
+ */
+static void hv_put_dom_num(u16 dom)
+{
+   clear_bit(dom, hvpci_dom_map);
+}
+
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:  VMBus's tracking struct for this root PCI bus
@@ -2521,6 +2563,7 @@ static int hv_pci_probe(struct hv_device *hdev,
const struct hv_vmbus_device_id *dev_id)
 {
struct hv_pcibus_device *hbus;
+   u16 dom_req, dom;
int ret;
 
/*
@@ -2535,19 +2578,34 @@ static int hv_pci_probe(struct hv_device *hdev,
hbus->state = hv_pcibus_init;
 
/*
-* The PCI bus "domain" is what is called "segment" in ACPI and
-* other specs.  Pull it from the instance ID, to get something
-* unique.  Bytes 8 and 9 are what is used in Windows guests, so
-* do the same thing for consistency.  Note that, since this code
-* only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
-* that (1) the only domain in use for something that looks like
-* a physical PCI bus (which is actually emulated by the
-* hypervisor) is domain 0 and (2) there will be no overlap
-* between domains derived from these instance IDs in the same
-* VM.
+* The PCI bus "domain" is what is called "segment" in ACPI and other
+* specs. Pull it from the instance ID, to get something usually
+* unique. In rare cases of collision, we will find out another number
+* not in use.
+*
+* Note that, since this code only runs in a Hyper-V VM, Hyper-V
+* together with this guest driver can guarantee that (1) The only
+* domain used by Gen1 VMs for something that looks like a physical
+* PCI bus (which is actually emulated by the hypervisor) is domain 0.
+* (2) There will be no overlap between domains (after fixing possible
+* collisions) in the same VM.
 */
-   hbus->sysdata.domain = hdev->dev_instance.b[9] |
-  hdev->dev_instance.b[8] << 8;
+   dom_req = hdev->dev_instance.b[8] << 8 | hdev->dev_instance.b[9];
+   dom = hv_get_dom_num(dom_req);
+
+   if (dom == HVPCI_DOM_INVALID) {
+   dev_err(&hdev->device,
+   "Unable to use dom# 0x%hx or other numbers", dom_req);
+   ret = -EINVAL;
+   goto free_bus;
+   }
+
+   if (dom != dom_req)
+   dev_info(&hdev->device,
+"PCI dom# 0x%hx has collision, using 0x%hx",
+dom_req, dom);
+
+   hbus->sysdata.domain = dom;
 
hbus->hdev = hdev;
refcount_set(&hbus->remove_lock, 1);
@@ -2562,7 +2620,7 @@ static int hv_pci_probe(struct hv_device *hdev,

[PATCH v4,2/2] PCI: hv: Use bytes 4 and 5 from instance ID as the PCI domain numbers

2019-08-13 Thread Haiyang Zhang
As recommended by Azure host team, the bytes 4, 5 have more uniqueness
(info entropy) than bytes 8, 9. So now we use bytes 4, 5 as the PCI domain
numbers. On older hosts, bytes 4, 5 can also be used -- no backward
compatibility issues here. The chance of collision is greatly reduced.

In the rare cases of collision, the driver code detects and finds another
number that is not in use.

Suggested-by: Michael Kelley 
Signed-off-by: Haiyang Zhang 
Acked-by: Sasha Levin 
---
 drivers/pci/controller/pci-hyperv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 31b8fd5..4f3d97e 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -2590,7 +2590,7 @@ static int hv_pci_probe(struct hv_device *hdev,
 * (2) There will be no overlap between domains (after fixing possible
 * collisions) in the same VM.
 */
-   dom_req = hdev->dev_instance.b[8] << 8 | hdev->dev_instance.b[9];
+   dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
dom = hv_get_dom_num(dom_req);
 
if (dom == HVPCI_DOM_INVALID) {
-- 
1.8.3.1



RE: [PATCH v3] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-13 Thread Haiyang Zhang



> -Original Message-
> From: Lorenzo Pieralisi 
> Sent: Tuesday, August 13, 2019 10:26 AM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; bhelg...@google.com; linux-
> hyp...@vger.kernel.org; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; vkuznets ; linux-
> ker...@vger.kernel.org
> Subject: Re: [PATCH v3] PCI: hv: Detect and fix Hyper-V PCI domain number
> collision
> 
> On Tue, Aug 13, 2019 at 12:55:59PM +0000, Haiyang Zhang wrote:
> >
> >
> > > -Original Message-
> > > From: Lorenzo Pieralisi 
> > > Sent: Tuesday, August 13, 2019 6:14 AM
> > > To: Haiyang Zhang 
> > > Cc: sas...@kernel.org; bhelg...@google.com; linux-
> > > hyp...@vger.kernel.org; linux-...@vger.kernel.org; KY Srinivasan
> > > ; Stephen Hemminger
> ;
> > > o...@aepfle.de; vkuznets ; linux-
> > > ker...@vger.kernel.org
> > > Subject: Re: [PATCH v3] PCI: hv: Detect and fix Hyper-V PCI domain
> number
> > > collision
> > >
> > > On Mon, Aug 12, 2019 at 06:20:53PM +0000, Haiyang Zhang wrote:
> > > > Currently in Azure cloud, for passthrough devices including GPU, the
> host
> > > > sets the device instance ID's bytes 8 - 15 to a value derived from the
> host
> > > > HWID, which is the same on all devices in a VM. So, the device instance
> > > > ID's bytes 8 and 9 provided by the host are no longer unique. This can
> > > > cause device passthrough to VMs to fail because the bytes 8 and 9 are
> used
> > > > as PCI domain number. Collision of domain numbers will cause the
> second
> > > > device with the same domain number fail to load.
> > > >
> > > > As recommended by Azure host team, the bytes 4, 5 have more
> uniqueness
> > > > (info entropy) than bytes 8, 9. So now we use bytes 4, 5 as the PCI
> domain
> > > > numbers. On older hosts, bytes 4, 5 can also be used -- no backward
> > > > compatibility issues here. The chance of collision is greatly reduced. 
> > > > In
> > > > the rare cases of collision, we will detect and find another number that
> is
> > > > not in use.
> > >
> > > I have not explained what I meant correctly. This patch fixes an
> > > issue and the "find another number" fallback can be also applied
> > > to the current kernel without changing the bytes you use for
> > > domain numbers.
> > >
> > > This patch would leave old kernels susceptible to breakage.
> > >
> > > Again, I have no Azure knowledge but it seems better to me to
> > > add a fallback "find another number" allocation on top of mainline
> > > and send it to stable kernels. Then we can add another patch to
> > > change the bytes you use to reduce the number of collision.
> > >
> > > Please let me know what you think, thanks.
> >
> > Thanks for your clarification.
> > Actually, I hope the stable kernel will be patched to use bytes 4,5 too,
> > because host provided numbers are persistent across reboots, we like
> > to use them if possible.
> >
> > I think we can either --
> > 1) Apply this patch for mainline and stable kernels as well.
> > 2) Or, break this patch into two patches, and apply both of them for
> > Mainline and stable kernels.
> 
> (2) since one patch is a fix and the other one an (optional - however
> important it is) change.
> 
> This way if the optional change needs reverting we still have a working
> kernel.
> 
> In the end it is up to you - I am just expressing what I think is the
> most sensible way forward.

Sure, I agree with you, and will break the patch into two, and resubmit.

Thanks,
- Haiyang


RE: [PATCH v3] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-13 Thread Haiyang Zhang



> -Original Message-
> From: Lorenzo Pieralisi 
> Sent: Tuesday, August 13, 2019 6:14 AM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; bhelg...@google.com; linux-
> hyp...@vger.kernel.org; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; vkuznets ; linux-
> ker...@vger.kernel.org
> Subject: Re: [PATCH v3] PCI: hv: Detect and fix Hyper-V PCI domain number
> collision
> 
> On Mon, Aug 12, 2019 at 06:20:53PM +0000, Haiyang Zhang wrote:
> > Currently in Azure cloud, for passthrough devices including GPU, the host
> > sets the device instance ID's bytes 8 - 15 to a value derived from the host
> > HWID, which is the same on all devices in a VM. So, the device instance
> > ID's bytes 8 and 9 provided by the host are no longer unique. This can
> > cause device passthrough to VMs to fail because the bytes 8 and 9 are used
> > as PCI domain number. Collision of domain numbers will cause the second
> > device with the same domain number fail to load.
> >
> > As recommended by Azure host team, the bytes 4, 5 have more uniqueness
> > (info entropy) than bytes 8, 9. So now we use bytes 4, 5 as the PCI domain
> > numbers. On older hosts, bytes 4, 5 can also be used -- no backward
> > compatibility issues here. The chance of collision is greatly reduced. In
> > the rare cases of collision, we will detect and find another number that is
> > not in use.
> 
> I have not explained what I meant correctly. This patch fixes an
> issue and the "find another number" fallback can be also applied
> to the current kernel without changing the bytes you use for
> domain numbers.
> 
> This patch would leave old kernels susceptible to breakage.
> 
> Again, I have no Azure knowledge but it seems better to me to
> add a fallback "find another number" allocation on top of mainline
> and send it to stable kernels. Then we can add another patch to
> change the bytes you use to reduce the number of collision.
> 
> Please let me know what you think, thanks.

Thanks for your clarification.
Actually, I hope the stable kernel will be patched to use bytes 4,5 too,
because host-provided numbers are persistent across reboots, and we would
like to use them if possible.

I think we can either --
1) Apply this patch for mainline and stable kernels as well.
2) Or, break this patch into two patches, and apply both of them for
mainline and stable kernels.

Which way do you prefer?

Thanks,
- Haiyang



[PATCH v3] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-12 Thread Haiyang Zhang
Currently in Azure cloud, for passthrough devices including GPU, the host
sets the device instance ID's bytes 8 - 15 to a value derived from the host
HWID, which is the same on all devices in a VM. So, the device instance
ID's bytes 8 and 9 provided by the host are no longer unique. This can
cause device passthrough to VMs to fail because the bytes 8 and 9 are used
as PCI domain number. Collision of domain numbers will cause the second
device with the same domain number to fail to load.

As recommended by Azure host team, the bytes 4, 5 have more uniqueness
(info entropy) than bytes 8, 9. So now we use bytes 4, 5 as the PCI domain
numbers. On older hosts, bytes 4, 5 can also be used -- no backward
compatibility issues here. The chance of collision is greatly reduced. In
the rare cases of collision, we will detect and find another number that is
not in use.

Suggested-by: Michael Kelley 
Signed-off-by: Haiyang Zhang 
Acked-by: Sasha Levin 
---
 drivers/pci/controller/pci-hyperv.c | 92 +++--
 1 file changed, 79 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 40b6254..4f3d97e 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -2510,6 +2510,48 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
complete(&hbus->remove_event);
 }
 
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
+
+/*
+ * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
+ * as invalid for passthrough PCI devices of this driver.
+ */
+#define HVPCI_DOM_INVALID 0
+
+/**
+ * hv_get_dom_num() - Get a valid PCI domain number
+ * Check if the PCI domain number is in use, and return another number if
+ * it is in use.
+ *
+ * @dom: Requested domain number
+ *
+ * return: domain number on success, HVPCI_DOM_INVALID on failure
+ */
+static u16 hv_get_dom_num(u16 dom)
+{
+   unsigned int i;
+
+   if (test_and_set_bit(dom, hvpci_dom_map) == 0)
+   return dom;
+
+   for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
+   if (test_and_set_bit(i, hvpci_dom_map) == 0)
+   return i;
+   }
+
+   return HVPCI_DOM_INVALID;
+}
+
+/**
+ * hv_put_dom_num() - Mark the PCI domain number as free
+ * @dom: Domain number to be freed
+ */
+static void hv_put_dom_num(u16 dom)
+{
+   clear_bit(dom, hvpci_dom_map);
+}
+
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:  VMBus's tracking struct for this root PCI bus
@@ -2521,6 +2563,7 @@ static int hv_pci_probe(struct hv_device *hdev,
const struct hv_vmbus_device_id *dev_id)
 {
struct hv_pcibus_device *hbus;
+   u16 dom_req, dom;
int ret;
 
/*
@@ -2535,19 +2578,34 @@ static int hv_pci_probe(struct hv_device *hdev,
hbus->state = hv_pcibus_init;
 
/*
-* The PCI bus "domain" is what is called "segment" in ACPI and
-* other specs.  Pull it from the instance ID, to get something
-* unique.  Bytes 8 and 9 are what is used in Windows guests, so
-* do the same thing for consistency.  Note that, since this code
-* only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
-* that (1) the only domain in use for something that looks like
-* a physical PCI bus (which is actually emulated by the
-* hypervisor) is domain 0 and (2) there will be no overlap
-* between domains derived from these instance IDs in the same
-* VM.
+* The PCI bus "domain" is what is called "segment" in ACPI and other
+* specs. Pull it from the instance ID, to get something usually
+* unique. In rare cases of collision, we will find out another number
+* not in use.
+*
+* Note that, since this code only runs in a Hyper-V VM, Hyper-V
+* together with this guest driver can guarantee that (1) The only
+* domain used by Gen1 VMs for something that looks like a physical
+* PCI bus (which is actually emulated by the hypervisor) is domain 0.
+* (2) There will be no overlap between domains (after fixing possible
+* collisions) in the same VM.
 */
-   hbus->sysdata.domain = hdev->dev_instance.b[9] |
-  hdev->dev_instance.b[8] << 8;
+   dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
+   dom = hv_get_dom_num(dom_req);
+
+   if (dom == HVPCI_DOM_INVALID) {
+   dev_err(&hdev->device,
+   "Unable to use dom# 0x%hx or other numbers", dom_req);
+   ret = -EINVAL;
+   goto free_bus;
+   }
+
+   if (dom != dom_req)
+   dev_info(&hdev->device,
+"PCI 

RE: [PATCH v2] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-12 Thread Haiyang Zhang



> -Original Message-
> From: Lorenzo Pieralisi 
> Sent: Monday, August 12, 2019 11:39 AM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; bhelg...@google.com; linux-
> hyp...@vger.kernel.org; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; vkuznets ; linux-
> ker...@vger.kernel.org
> Subject: Re: [PATCH v2] PCI: hv: Detect and fix Hyper-V PCI domain number
> collision
> 
> On Tue, Aug 06, 2019 at 11:52:11PM +0000, Haiyang Zhang wrote:
> > Currently in Azure cloud, for passthrough devices including GPU, the
> > host sets the device instance ID's bytes 8 - 15 to a value derived from
> > the host HWID, which is the same on all devices in a VM. So, the device
> > instance ID's bytes 8 and 9 provided by the host are no longer unique.
> >
> > This can cause device passthrough to VMs to fail because the bytes 8 and
> > 9 is used as PCI domain number. So, as recommended by Azure host team,
> > we now use the bytes 4 and 5 which usually contain unique numbers as PCI
> > domain. The chance of collision is greatly reduced. In the rare cases of
> > collision, we will detect and find another number that is not in use.
> 
> This is not clear at all. Why "finding another number" is fine with
> this patch while it is not with current kernel code ? Also does this
> have backward compatibility issues ?
The bytes 4, 5 have more uniqueness (info entropy) than bytes 8, 9, so we use
bytes 4, 5. On older hosts, bytes 4, 5 can also be used -- so it has no backward
compatibility issues.
 
> I do not understand if a collision is a problem or not from the
> log above.
Collision will cause the second device with the same domain number to fail to
load.
I will include this info in the patch description.

> 
> > Thanks to Michael Kelley  for proposing this
> idea.
> 
> Add it as Suggested-by: tag.
I will add this line.

Thanks,
- Haiyang


RE: [PATCH v2] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-12 Thread Haiyang Zhang



> -Original Message-
> From: LKML haiyangz  On Behalf Of Haiyang
> Zhang
> Sent: Tuesday, August 6, 2019 7:52 PM
> To: sas...@kernel.org; bhelg...@google.com; lorenzo.pieral...@arm.com;
> linux-hyp...@vger.kernel.org; linux-...@vger.kernel.org
> Cc: Haiyang Zhang ; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; vkuznets ; linux-
> ker...@vger.kernel.org
> Subject: [PATCH v2] PCI: hv: Detect and fix Hyper-V PCI domain number
> collision
> 
> Currently in Azure cloud, for passthrough devices including GPU, the
> host sets the device instance ID's bytes 8 - 15 to a value derived from
> the host HWID, which is the same on all devices in a VM. So, the device
> instance ID's bytes 8 and 9 provided by the host are no longer unique.
> 
> This can cause device passthrough to VMs to fail because the bytes 8 and
> 9 is used as PCI domain number. So, as recommended by Azure host team,
> we now use the bytes 4 and 5 which usually contain unique numbers as PCI
> domain. The chance of collision is greatly reduced. In the rare cases of
> collision, we will detect and find another number that is not in use.
> 
> Thanks to Michael Kelley  for proposing this idea.
> 
> Signed-off-by: Haiyang Zhang 
> Acked-by: Sasha Levin 
> ---
>  drivers/pci/controller/pci-hyperv.c | 92

Hi Lorenzo,

This patch has been updated based on Bjorn's comments. Do you have any further
comments? Could you take it through your tree?

Thanks,
- Haiyang


[PATCH v2] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-06 Thread Haiyang Zhang
Currently in Azure cloud, for passthrough devices including GPU, the
host sets the device instance ID's bytes 8 - 15 to a value derived from
the host HWID, which is the same on all devices in a VM. So, the device
instance ID's bytes 8 and 9 provided by the host are no longer unique.

This can cause device passthrough to VMs to fail because the bytes 8 and
9 are used as PCI domain number. So, as recommended by Azure host team,
we now use the bytes 4 and 5 which usually contain unique numbers as PCI
domain. The chance of collision is greatly reduced. In the rare cases of
collision, we will detect and find another number that is not in use.

Thanks to Michael Kelley  for proposing this idea.

Signed-off-by: Haiyang Zhang 
Acked-by: Sasha Levin 
---
 drivers/pci/controller/pci-hyperv.c | 92 +++--
 1 file changed, 79 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 40b6254..4f3d97e 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -2510,6 +2510,48 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
complete(&hbus->remove_event);
 }
 
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
+
+/*
+ * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
+ * as invalid for passthrough PCI devices of this driver.
+ */
+#define HVPCI_DOM_INVALID 0
+
+/**
+ * hv_get_dom_num() - Get a valid PCI domain number
+ * Check if the PCI domain number is in use, and return another number if
+ * it is in use.
+ *
+ * @dom: Requested domain number
+ *
+ * return: domain number on success, HVPCI_DOM_INVALID on failure
+ */
+static u16 hv_get_dom_num(u16 dom)
+{
+   unsigned int i;
+
+   if (test_and_set_bit(dom, hvpci_dom_map) == 0)
+   return dom;
+
+   for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
+   if (test_and_set_bit(i, hvpci_dom_map) == 0)
+   return i;
+   }
+
+   return HVPCI_DOM_INVALID;
+}
+
+/**
+ * hv_put_dom_num() - Mark the PCI domain number as free
+ * @dom: Domain number to be freed
+ */
+static void hv_put_dom_num(u16 dom)
+{
+   clear_bit(dom, hvpci_dom_map);
+}
+
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:  VMBus's tracking struct for this root PCI bus
@@ -2521,6 +2563,7 @@ static int hv_pci_probe(struct hv_device *hdev,
const struct hv_vmbus_device_id *dev_id)
 {
struct hv_pcibus_device *hbus;
+   u16 dom_req, dom;
int ret;
 
/*
@@ -2535,19 +2578,34 @@ static int hv_pci_probe(struct hv_device *hdev,
hbus->state = hv_pcibus_init;
 
/*
-* The PCI bus "domain" is what is called "segment" in ACPI and
-* other specs.  Pull it from the instance ID, to get something
-* unique.  Bytes 8 and 9 are what is used in Windows guests, so
-* do the same thing for consistency.  Note that, since this code
-* only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
-* that (1) the only domain in use for something that looks like
-* a physical PCI bus (which is actually emulated by the
-* hypervisor) is domain 0 and (2) there will be no overlap
-* between domains derived from these instance IDs in the same
-* VM.
+* The PCI bus "domain" is what is called "segment" in ACPI and other
+* specs. Pull it from the instance ID, to get something usually
+* unique. In rare cases of collision, we will find out another number
+* not in use.
+*
+* Note that, since this code only runs in a Hyper-V VM, Hyper-V
+* together with this guest driver can guarantee that (1) The only
+* domain used by Gen1 VMs for something that looks like a physical
+* PCI bus (which is actually emulated by the hypervisor) is domain 0.
+* (2) There will be no overlap between domains (after fixing possible
+* collisions) in the same VM.
 */
-   hbus->sysdata.domain = hdev->dev_instance.b[9] |
-  hdev->dev_instance.b[8] << 8;
+   dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
+   dom = hv_get_dom_num(dom_req);
+
+   if (dom == HVPCI_DOM_INVALID) {
+   dev_err(&hdev->device,
+   "Unable to use dom# 0x%hx or other numbers", dom_req);
+   ret = -EINVAL;
+   goto free_bus;
+   }
+
+   if (dom != dom_req)
+   dev_info(&hdev->device,
+"PCI dom# 0x%hx has collision, using 0x%hx",
+dom_req, dom);
+
+   hbus->sysdata.domain = dom;
 
hbus->hdev = hdev;
  

RE: [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-06 Thread Haiyang Zhang



> -Original Message-
> From: Bjorn Helgaas 
> Sent: Tuesday, August 6, 2019 2:55 PM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; lorenzo.pieral...@arm.com; linux-
> hyp...@vger.kernel.org; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; vkuznets ; linux-
> ker...@vger.kernel.org
> Subject: Re: [PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number
> collision
> 
> On Fri, Aug 02, 2019 at 06:52:56PM +0000, Haiyang Zhang wrote:
> > Due to Azure host agent settings, the device instance ID's bytes 8 and
> > 9 are no longer unique. This causes some of the PCI devices not
> > showing up in VMs with multiple passthrough devices, such as GPUs. So,
> > as recommended by Azure host team, we now use the bytes 4 and 5 which
> > usually provide unique numbers.
> 
> What does "Azure host agent settings" mean?  Would it be useful to say
> something more specific, so users could read this and say "oh, I'm using the
> Azure host agent settings mentioned here, so I need this patch"?  Is this
> related to a specific Azure host agent commit or release?
> 
> "This causes some of the PCI devices ..." is not a sentence.  I think I
> understand what you're saying -- "This sometimes causes device passthrough
> to VMs to fail." Is there something about GPUs that makes them more
> susceptible to this problem?
> 
> I think there are really two changes in this patch:
> 
>   1) Start with a domain number from bytes 4-5 instead of bytes 8-9.
> 
>   2) If the domain number is not unique, allocate another one using
>   the bitmap.
> 
> It sounds like part 2) by itself would be enough to solve the problem, and
> including part 1) just reduces the likelihood of having to allocate another
> domain number.
> 
> > In the rare cases of collision, we will detect and find another number
> > that is not in use.
> > Thanks to Michael Kelley  for proposing this
> idea.
> 
> This looks like two paragraphs and should have a blank line between them.
> 
> > Signed-off-by: Haiyang Zhang 
> > ---
> >  drivers/pci/controller/pci-hyperv.c | 91
> > +++--
> >  1 file changed, 78 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/pci/controller/pci-hyperv.c
> > b/drivers/pci/controller/pci-hyperv.c
> > index 82acd61..6b9cc6e60a 100644
> > --- a/drivers/pci/controller/pci-hyperv.c
> > +++ b/drivers/pci/controller/pci-hyperv.c
> > @@ -37,6 +37,8 @@
> >   * the PCI back-end driver in Hyper-V.
> >   */
> >
> > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> > +
> >  #include 
> >  #include 
> >  #include 
> > @@ -2507,6 +2509,47 @@ static void put_hvpcibus(struct
> hv_pcibus_device *hbus)
> > complete(&hbus->remove_event);
> >  }
> >
> > +#define HVPCI_DOM_MAP_SIZE (64 * 1024) static
> > +DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
> > +
> > +/* PCI domain number 0 is used by emulated devices on Gen1 VMs, so
> > +define 0
> > + * as invalid for passthrough PCI devices of this driver.
> > + */
> 
> Please use the usual multi-line comment style:
> 
>   /*
>* PCI domain number ...
>*/
> 
> > +#define HVPCI_DOM_INVALID 0
> > +
> > +/**
> > + * hv_get_dom_num() - Get a valid PCI domain number
> > + * Check if the PCI domain number is in use, and return another
> > +number if
> > + * it is in use.
> > + *
> > + * @dom: Requested domain number
> > + *
> > + * return: domain number on success, HVPCI_DOM_INVALID on failure  */
> > +static u16 hv_get_dom_num(u16 dom) {
> > +   unsigned int i;
> > +
> > +   if (test_and_set_bit(dom, hvpci_dom_map) == 0)
> > +   return dom;
> > +
> > +   for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
> > +   if (test_and_set_bit(i, hvpci_dom_map) == 0)
> > +   return i;
> > +   }
> > +
> > +   return HVPCI_DOM_INVALID;
> > +}
> > +
> > +/**
> > + * hv_put_dom_num() - Mark the PCI domain number as free
> > + * @dom: Domain number to be freed
> > + */
> > +static void hv_put_dom_num(u16 dom)
> > +{
> > +   clear_bit(dom, hvpci_dom_map);
> > +}
> > +
> >  /**
> >   * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
> >   * @hdev:  VMBus's tracking struct for this root PCI bus
> > @@ -2518,6 +2561,7 @@ static int hv_pci_probe(struct hv_device *hdev,
> > const struct hv_vmbus_device_id *dev_id)  {

[PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-02 Thread Haiyang Zhang
Due to Azure host agent settings, the device instance ID's bytes 8 and 9
are no longer unique. This causes some of the PCI devices to not show up
in VMs with multiple passthrough devices, such as GPUs. So, as recommended
by Azure host team, we now use the bytes 4 and 5 which usually provide
unique numbers.

In the rare cases of collision, we will detect and find another number
that is not in use.
Thanks to Michael Kelley  for proposing this idea.

Signed-off-by: Haiyang Zhang 
---
 drivers/pci/controller/pci-hyperv.c | 91 +++--
 1 file changed, 78 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 82acd61..6b9cc6e60a 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -37,6 +37,8 @@
  * the PCI back-end driver in Hyper-V.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include 
 #include 
 #include 
@@ -2507,6 +2509,47 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
complete(&hbus->remove_event);
 }
 
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
+
+/* PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
+ * as invalid for passthrough PCI devices of this driver.
+ */
+#define HVPCI_DOM_INVALID 0
+
+/**
+ * hv_get_dom_num() - Get a valid PCI domain number
+ * Check if the PCI domain number is in use, and return another number if
+ * it is in use.
+ *
+ * @dom: Requested domain number
+ *
+ * return: domain number on success, HVPCI_DOM_INVALID on failure
+ */
+static u16 hv_get_dom_num(u16 dom)
+{
+   unsigned int i;
+
+   if (test_and_set_bit(dom, hvpci_dom_map) == 0)
+   return dom;
+
+   for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
+   if (test_and_set_bit(i, hvpci_dom_map) == 0)
+   return i;
+   }
+
+   return HVPCI_DOM_INVALID;
+}
+
+/**
+ * hv_put_dom_num() - Mark the PCI domain number as free
+ * @dom: Domain number to be freed
+ */
+static void hv_put_dom_num(u16 dom)
+{
+   clear_bit(dom, hvpci_dom_map);
+}
+
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:  VMBus's tracking struct for this root PCI bus
@@ -2518,6 +2561,7 @@ static int hv_pci_probe(struct hv_device *hdev,
const struct hv_vmbus_device_id *dev_id)
 {
struct hv_pcibus_device *hbus;
+   u16 dom_req, dom;
int ret;
 
/*
@@ -2532,19 +2576,32 @@ static int hv_pci_probe(struct hv_device *hdev,
hbus->state = hv_pcibus_init;
 
/*
-* The PCI bus "domain" is what is called "segment" in ACPI and
-* other specs.  Pull it from the instance ID, to get something
-* unique.  Bytes 8 and 9 are what is used in Windows guests, so
-* do the same thing for consistency.  Note that, since this code
-* only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
-* that (1) the only domain in use for something that looks like
-* a physical PCI bus (which is actually emulated by the
-* hypervisor) is domain 0 and (2) there will be no overlap
-* between domains derived from these instance IDs in the same
-* VM.
+* The PCI bus "domain" is what is called "segment" in ACPI and other
+* specs. Pull it from the instance ID, to get something usually
+* unique. In rare cases of collision, we will find out another number
+* not in use.
+* Note that, since this code only runs in a Hyper-V VM, Hyper-V
+* together with this guest driver can guarantee that (1) The only
+* domain used by Gen1 VMs for something that looks like a physical
+* PCI bus (which is actually emulated by the hypervisor) is domain 0.
+* (2) There will be no overlap between domains (after fixing possible
+* collisions) in the same VM.
 */
-   hbus->sysdata.domain = hdev->dev_instance.b[9] |
-  hdev->dev_instance.b[8] << 8;
+   dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
+   dom = hv_get_dom_num(dom_req);
+
+   if (dom == HVPCI_DOM_INVALID) {
+   pr_err("Unable to use dom# 0x%hx or other numbers",
+  dom_req);
+   ret = -EINVAL;
+   goto free_bus;
+   }
+
+   if (dom != dom_req)
+   pr_info("PCI dom# 0x%hx has collision, using 0x%hx",
+   dom_req, dom);
+
+   hbus->sysdata.domain = dom;
 
hbus->hdev = hdev;
refcount_set(&hbus->remove_lock, 1);
@@ -2559,7 +2616,7 @@ static int hv_pci_probe(struct hv_device *hdev,
   hbus->sysdata.domain);
if (!hbus->wq) {

[PATCH net-next] Name NICs based on vmbus offer and enable async probe by default

2019-07-23 Thread Haiyang Zhang
Previously, async probing caused NICs to be named in random order.

The patch adds a dev_num field in vmbus channel structure. It’s assigned
to the first available number when the channel is offered. So netvsc can
use it for NIC naming based on channel offer sequence. Now we re-enable
the async probing mode by default for faster probing.

Also added a module parameter, probe_type, to set sync probing mode if
a user wants to.

Fixes: af0a5646cb8d ("use the new async probing feature for the hyperv drivers")
Signed-off-by: Haiyang Zhang 
---
 drivers/hv/channel_mgmt.c   | 46 +++--
 drivers/net/hyperv/netvsc_drv.c | 33 ++---
 include/linux/hyperv.h  |  4 
 3 files changed, 78 insertions(+), 5 deletions(-)

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index addcef5..ab7c05b 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -304,6 +304,8 @@ bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
 
 EXPORT_SYMBOL_GPL(vmbus_prep_negotiate_resp);
 
+#define HV_DEV_NUM_INVALID (-1)
+
 /*
  * alloc_channel - Allocate and initialize a vmbus channel object
  */
@@ -315,6 +317,8 @@ static struct vmbus_channel *alloc_channel(void)
if (!channel)
return NULL;
 
+   channel->dev_num = HV_DEV_NUM_INVALID;
+
spin_lock_init(>lock);
init_completion(>rescind_event);
 
@@ -533,6 +537,42 @@ static void vmbus_add_channel_work(struct work_struct 
*work)
 }
 
 /*
+ * Get the first available device number of its type, then
+ * record it in the channel structure.
+ */
+static void hv_set_devnum(struct vmbus_channel *newchannel)
+{
+   struct vmbus_channel *channel;
+   unsigned int i = 0;
+   bool found;
+
+   BUG_ON(!mutex_is_locked(_connection.channel_mutex));
+
+   /* Only HV_NIC uses this number for now */
+   if (hv_get_dev_type(newchannel) != HV_NIC)
+   return;
+
+next:
+   found = false;
+
+   list_for_each_entry(channel, _connection.chn_list, listentry) {
+   if (i == channel->dev_num &&
+   guid_equal(>offermsg.offer.if_type,
+  >offermsg.offer.if_type)) {
+   found = true;
+   break;
+   }
+   }
+
+   if (found) {
+   i++;
+   goto next;
+   }
+
+   newchannel->dev_num = i;
+}
+
+/*
  * vmbus_process_offer - Process the offer by creating a channel/device
  * associated with this offer
  */
@@ -561,10 +601,12 @@ static void vmbus_process_offer(struct vmbus_channel 
*newchannel)
}
}
 
-   if (fnew)
+   if (fnew) {
+   hv_set_devnum(newchannel);
+
list_add_tail(>listentry,
  _connection.chn_list);
-   else {
+   } else {
/*
 * Check to see if this is a valid sub-channel.
 */
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index afdcc56..af53690 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -57,6 +57,10 @@
 module_param(debug, int, 0444);
 MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
 
+static unsigned int probe_type __ro_after_init = PROBE_PREFER_ASYNCHRONOUS;
+module_param(probe_type, uint, 0444);
+MODULE_PARM_DESC(probe_type, "Probe type: 1=async(default), 2=sync");
+
 static LIST_HEAD(netvsc_dev_list);
 
 static void netvsc_change_rx_flags(struct net_device *net, int change)
@@ -2233,10 +2237,19 @@ static int netvsc_probe(struct hv_device *dev,
struct net_device_context *net_device_ctx;
struct netvsc_device_info *device_info = NULL;
struct netvsc_device *nvdev;
+   char name[IFNAMSIZ];
int ret = -ENOMEM;
 
-   net = alloc_etherdev_mq(sizeof(struct net_device_context),
-   VRSS_CHANNEL_MAX);
+   if (probe_type == PROBE_PREFER_ASYNCHRONOUS) {
+   snprintf(name, IFNAMSIZ, "eth%d", dev->channel->dev_num);
+   net = alloc_netdev_mqs(sizeof(struct net_device_context), name,
+  NET_NAME_ENUM, ether_setup,
+  VRSS_CHANNEL_MAX, VRSS_CHANNEL_MAX);
+   } else {
+   net = alloc_etherdev_mq(sizeof(struct net_device_context),
+   VRSS_CHANNEL_MAX);
+   }
+
if (!net)
goto no_net;
 
@@ -2323,6 +2336,14 @@ static int netvsc_probe(struct hv_device *dev,
net->max_mtu = ETH_DATA_LEN;
 
ret = register_netdevice(net);
+
+   if (ret == -EEXIST) {
+   pr_info("NIC name %s exists, request another name.\n",
+   net->name);
+   strlcpy(net->name, &q

[PATCH net] hv_netvsc: Fix extra rcu_read_unlock in netvsc_recv_callback()

2019-07-19 Thread Haiyang Zhang
There is an extra rcu_read_unlock left in netvsc_recv_callback(),
after a previous patch that removes RCU from this function.
This patch removes the extra RCU unlock.

Fixes: 345ac08990b8 ("hv_netvsc: pass netvsc_device to receive callback")
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index afdcc56..3544e19 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -836,7 +836,6 @@ int netvsc_recv_callback(struct net_device *net,
 
if (unlikely(!skb)) {
++net_device_ctx->eth_stats.rx_no_memory;
-   rcu_read_unlock();
return NVSP_STAT_FAIL;
}
 
-- 
1.8.3.1



RE: [PATCH] PCI: pci-hyperv: fix build errors on non-SYSFS config

2019-07-12 Thread Haiyang Zhang


> -Original Message-
> From: Randy Dunlap 
> Sent: Friday, July 12, 2019 11:53 AM
> To: linux-pci ; LKML  ker...@vger.kernel.org>
> Cc: Matthew Wilcox ; Jake Oshins
> ; KY Srinivasan ; Haiyang
> Zhang ; Stephen Hemminger
> ; Stephen Hemminger
> ; Sasha Levin ; Bjorn
> Helgaas ; Dexuan Cui 
> Subject: [PATCH] PCI: pci-hyperv: fix build errors on non-SYSFS config
> 
> From: Randy Dunlap 
> 
> Fix build errors when building almost-allmodconfig but with SYSFS
> not set (not enabled).  Fixes these build errors:
> 
> ERROR: "pci_destroy_slot" [drivers/pci/controller/pci-hyperv.ko] undefined!
> ERROR: "pci_create_slot" [drivers/pci/controller/pci-hyperv.ko] undefined!
> 
> drivers/pci/slot.o is only built when SYSFS is enabled, so
> pci-hyperv.o has an implicit dependency on SYSFS.
> Make that explicit.
> 
> Also, depending on X86 && X86_64 is not needed, so just change that
> to depend on X86_64.
> 
> Fixes: a15f2c08c708 ("PCI: hv: support reporting serial number as slot
> information")
> 
> Signed-off-by: Randy Dunlap 
> Cc: Matthew Wilcox 
> Cc: Jake Oshins 
> Cc: "K. Y. Srinivasan" 
> Cc: Haiyang Zhang 
> Cc: Stephen Hemminger 
> Cc: Stephen Hemminger 
> Cc: Sasha Levin 
> Cc: Bjorn Helgaas 
> Cc: linux-...@vger.kernel.org
> Cc: linux-hyp...@vger.kernel.org
> Cc: Dexuan Cui 
> ---
> v3: corrected Fixes: tag [Dexuan Cui ]
> This is the Microsoft-preferred version of the patch.
> 
>  drivers/pci/Kconfig |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> --- lnx-52.orig/drivers/pci/Kconfig
> +++ lnx-52/drivers/pci/Kconfig
> @@ -181,7 +181,7 @@ config PCI_LABEL
> 
>  config PCI_HYPERV
>  tristate "Hyper-V PCI Frontend"
> -depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN
> && X86_64
> +    depends on X86_64 && HYPERV && PCI_MSI &&
> PCI_MSI_IRQ_DOMAIN && SYSFS
>  help
>The PCI device frontend driver allows the kernel to import 
> arbitrary
>PCI devices from a PCI backend to support PCI driver domains.
> 

Reviewed-by: Haiyang Zhang 



RE: [PATCH net-next] Name NICs based on vmbus offer and enable async probe by default

2019-07-09 Thread Haiyang Zhang



> -Original Message-
> From: linux-hyperv-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of David Miller
> Sent: Tuesday, July 9, 2019 8:30 PM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; linux-hyp...@vger.kernel.org;
> net...@vger.kernel.org; KY Srinivasan ; Stephen
> Hemminger ; o...@aepfle.de; vkuznets
> ; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH net-next] Name NICs based on vmbus offer and enable
> async probe by default
> 
> 
> The net-next tree, if you are reading netdev today, has been closed.
I will re-submit when the tree re-opens.
Thanks,
- Haiyang


RE: [PATCH net-next] Name NICs based on vmbus offer and enable async probe by default

2019-07-09 Thread Haiyang Zhang



> -Original Message-
> From: Stephen Hemminger 
> Sent: Tuesday, July 9, 2019 7:47 PM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; linux-hyp...@vger.kernel.org;
> net...@vger.kernel.org; KY Srinivasan ; Stephen
> Hemminger ; o...@aepfle.de; vkuznets
> ; da...@davemloft.net; linux-
> ker...@vger.kernel.org
> Subject: Re: [PATCH net-next] Name NICs based on vmbus offer and enable
> async probe by default
> 
> On Tue, 9 Jul 2019 22:56:30 +0000
> Haiyang Zhang  wrote:
> 
> > -   VRSS_CHANNEL_MAX);
> > +   if (probe_type == PROBE_PREFER_ASYNCHRONOUS) {
> > +   snprintf(name, IFNAMSIZ, "eth%d", dev->channel->dev_num);
> 
> What about PCI passthrough or VF devices that are also being probed and
> consuming the ethN names.  Won't there be a collision?

VF usually shows up a few seconds later than the synthetic NIC. Faster probing
will reduce the probability of collision.
Even if a collision happens, the code below will re-register the NIC with 
"eth%d":
+   if (ret == -EEXIST) {
+   pr_info("NIC name %s exists, request another name.\n",
+   net->name);
+   strlcpy(net->name, "eth%d", IFNAMSIZ);
+   ret = register_netdevice(net);
+   }

Thanks,
- Haiyang


[PATCH net-next] Name NICs based on vmbus offer and enable async probe by default

2019-07-09 Thread Haiyang Zhang
Previously the async probing caused NIC naming in random order.

The patch adds a dev_num field in vmbus channel structure. It’s assigned
to the first available number when the channel is offered. So netvsc can
use it for NIC naming based on channel offer sequence. Now we re-enable
the async probing mode by default for faster probing.

Also added a module parameter, probe_type, to set sync probing mode if
a user wants to.

Fixes: af0a5646cb8d ("use the new async probing feature for the hyperv drivers")
Signed-off-by: Haiyang Zhang 
---
 drivers/hv/channel_mgmt.c   | 46 +++--
 drivers/net/hyperv/netvsc_drv.c | 33 ++---
 include/linux/hyperv.h  |  4 
 3 files changed, 78 insertions(+), 5 deletions(-)

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index addcef5..ab7c05b 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -304,6 +304,8 @@ bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
 
 EXPORT_SYMBOL_GPL(vmbus_prep_negotiate_resp);
 
+#define HV_DEV_NUM_INVALID (-1)
+
 /*
  * alloc_channel - Allocate and initialize a vmbus channel object
  */
@@ -315,6 +317,8 @@ static struct vmbus_channel *alloc_channel(void)
if (!channel)
return NULL;
 
+   channel->dev_num = HV_DEV_NUM_INVALID;
+
spin_lock_init(>lock);
init_completion(>rescind_event);
 
@@ -533,6 +537,42 @@ static void vmbus_add_channel_work(struct work_struct 
*work)
 }
 
 /*
+ * Get the first available device number of its type, then
+ * record it in the channel structure.
+ */
+static void hv_set_devnum(struct vmbus_channel *newchannel)
+{
+   struct vmbus_channel *channel;
+   unsigned int i = 0;
+   bool found;
+
+   BUG_ON(!mutex_is_locked(_connection.channel_mutex));
+
+   /* Only HV_NIC uses this number for now */
+   if (hv_get_dev_type(newchannel) != HV_NIC)
+   return;
+
+next:
+   found = false;
+
+   list_for_each_entry(channel, _connection.chn_list, listentry) {
+   if (i == channel->dev_num &&
+   guid_equal(>offermsg.offer.if_type,
+  >offermsg.offer.if_type)) {
+   found = true;
+   break;
+   }
+   }
+
+   if (found) {
+   i++;
+   goto next;
+   }
+
+   newchannel->dev_num = i;
+}
+
+/*
  * vmbus_process_offer - Process the offer by creating a channel/device
  * associated with this offer
  */
@@ -561,10 +601,12 @@ static void vmbus_process_offer(struct vmbus_channel 
*newchannel)
}
}
 
-   if (fnew)
+   if (fnew) {
+   hv_set_devnum(newchannel);
+
list_add_tail(>listentry,
  _connection.chn_list);
-   else {
+   } else {
/*
 * Check to see if this is a valid sub-channel.
 */
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index afdcc56..af53690 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -57,6 +57,10 @@
 module_param(debug, int, 0444);
 MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
 
+static unsigned int probe_type __ro_after_init = PROBE_PREFER_ASYNCHRONOUS;
+module_param(probe_type, uint, 0444);
+MODULE_PARM_DESC(probe_type, "Probe type: 1=async(default), 2=sync");
+
 static LIST_HEAD(netvsc_dev_list);
 
 static void netvsc_change_rx_flags(struct net_device *net, int change)
@@ -2233,10 +2237,19 @@ static int netvsc_probe(struct hv_device *dev,
struct net_device_context *net_device_ctx;
struct netvsc_device_info *device_info = NULL;
struct netvsc_device *nvdev;
+   char name[IFNAMSIZ];
int ret = -ENOMEM;
 
-   net = alloc_etherdev_mq(sizeof(struct net_device_context),
-   VRSS_CHANNEL_MAX);
+   if (probe_type == PROBE_PREFER_ASYNCHRONOUS) {
+   snprintf(name, IFNAMSIZ, "eth%d", dev->channel->dev_num);
+   net = alloc_netdev_mqs(sizeof(struct net_device_context), name,
+  NET_NAME_ENUM, ether_setup,
+  VRSS_CHANNEL_MAX, VRSS_CHANNEL_MAX);
+   } else {
+   net = alloc_etherdev_mq(sizeof(struct net_device_context),
+   VRSS_CHANNEL_MAX);
+   }
+
if (!net)
goto no_net;
 
@@ -2323,6 +2336,14 @@ static int netvsc_probe(struct hv_device *dev,
net->max_mtu = ETH_DATA_LEN;
 
ret = register_netdevice(net);
+
+   if (ret == -EEXIST) {
+   pr_info("NIC name %s exists, request another name.\n",
+   net->name);
+   strlcpy(net->name, &q

RE: [PATCH v2] PCI: hv: fix pci-hyperv build when SYSFS not enabled

2019-07-03 Thread Haiyang Zhang


> -Original Message-
> From: Randy Dunlap 
> Sent: Wednesday, July 3, 2019 12:59 PM
> To: LKML ; linux-pci  p...@vger.kernel.org>
> Cc: Matthew Wilcox ; Jake Oshins
> ; KY Srinivasan ; Haiyang
> Zhang ; Stephen Hemminger
> ; Sasha Levin ; Bjorn
> Helgaas ; linux-hyp...@vger.kernel.org; Dexuan
> Cui ; Yuehaibing 
> Subject: [PATCH v2] PCI: hv: fix pci-hyperv build when SYSFS not enabled
> 
> From: Randy Dunlap 
> 
> Fix build of drivers/pci/controller/pci-hyperv.o when
> CONFIG_SYSFS is not set/enabled by adding stubs for
> pci_create_slot() and pci_destroy_slot().
> 
> Fixes these build errors:
> 
> ERROR: "pci_destroy_slot" [drivers/pci/controller/pci-hyperv.ko] undefined!
> ERROR: "pci_create_slot" [drivers/pci/controller/pci-hyperv.ko] undefined!
> 
> Fixes: a15f2c08c708 ("PCI: hv: support reporting serial number as slot
> information")
> 
> Signed-off-by: Randy Dunlap 
> Cc: Matthew Wilcox 
> Cc: Jake Oshins 
> Cc: "K. Y. Srinivasan" 
> Cc: Haiyang Zhang 
> Cc: Stephen Hemminger 
> Cc: Sasha Levin 
> Cc: Bjorn Helgaas 
> Cc: linux-...@vger.kernel.org
> Cc: linux-hyp...@vger.kernel.org
> Cc: Dexuan Cui 
> Cc: Yuehaibing 
> ---
> v2:
> - provide non-CONFIG_SYSFS stubs for pci_create_slot() and
>   pci_destroy_slot() [suggested by Matthew Wilcox ]
> - use the correct Fixes: tag [Dexuan Cui ]
> 
>  include/linux/pci.h |   12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> --- lnx-52-rc7.orig/include/linux/pci.h
> +++ lnx-52-rc7/include/linux/pci.h
> @@ -25,6 +25,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -947,14 +948,21 @@ int pci_scan_root_bus_bridge(struct pci_
>  struct pci_bus *pci_add_new_bus(struct pci_bus *parent, struct pci_dev
> *dev,
>   int busnr);
>  void pcie_update_link_speed(struct pci_bus *bus, u16 link_status);
> +#ifdef CONFIG_SYSFS
> +void pci_dev_assign_slot(struct pci_dev *dev);
>  struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr,
>const char *name,
>struct hotplug_slot *hotplug);
>  void pci_destroy_slot(struct pci_slot *slot);
> -#ifdef CONFIG_SYSFS
> -void pci_dev_assign_slot(struct pci_dev *dev);
>  #else
>  static inline void pci_dev_assign_slot(struct pci_dev *dev) { }
> +static inline struct pci_slot *pci_create_slot(struct pci_bus *parent,
> +int slot_nr,
> +const char *name,
> +struct hotplug_slot *hotplug) {
> + return ERR_PTR(-EINVAL);
> +}
> +static inline void pci_destroy_slot(struct pci_slot *slot) { }
>  #endif
>  int pci_scan_slot(struct pci_bus *bus, int devfn);
>  struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn);
> 

The serial number in slot info is used to match VF NIC with Synthetic NIC.
Without selecting SYSFS, the SRIOV feature will fail on VM on Hyper-V and
Azure. The first version of this patch should be used.

@Stephen Hemminger what do you think?

Thanks,
- Haiyang


[PATCH net] hv_netvsc: Set probe mode to sync

2019-06-13 Thread Haiyang Zhang
For better consistency of synthetic NIC names, we set the probe mode to
PROBE_FORCE_SYNCHRONOUS. So the names can be aligned with the vmbus
channel offer sequence.

Fixes: af0a5646cb8d ("use the new async probing feature for the hyperv drivers")
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 03ea5a7..afdcc56 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2407,7 +2407,7 @@ static int netvsc_remove(struct hv_device *dev)
.probe = netvsc_probe,
.remove = netvsc_remove,
.driver = {
-   .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+   .probe_type = PROBE_FORCE_SYNCHRONOUS,
},
 };
 
-- 
1.8.3.1



[PATCH] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-05-19 Thread Haiyang Zhang
Due to Azure host agent settings, the device instance ID's bytes 8 and 9
are no longer unique. This causes some of the PCI devices to not show up
in VMs with multiple passthrough devices, such as GPUs. So, as recommended
by Azure host team, we now use the bytes 4 and 5 which usually provide
unique numbers.

In the rare cases of collision, we will detect and find another number
that is not in use.
Thanks to Michael Kelley  for proposing this idea.

Signed-off-by: Haiyang Zhang 
---
 drivers/pci/controller/pci-hyperv.c | 91 +++--
 1 file changed, 78 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 82acd61..6b9cc6e60a 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -37,6 +37,8 @@
  * the PCI back-end driver in Hyper-V.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include 
 #include 
 #include 
@@ -2507,6 +2509,47 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
complete(>remove_event);
 }
 
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
+
+/* PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
+ * as invalid for passthrough PCI devices of this driver.
+ */
+#define HVPCI_DOM_INVALID 0
+
+/**
+ * hv_get_dom_num() - Get a valid PCI domain number
+ * Check if the PCI domain number is in use, and return another number if
+ * it is in use.
+ *
+ * @dom: Requested domain number
+ *
+ * return: domain number on success, HVPCI_DOM_INVALID on failure
+ */
+static u16 hv_get_dom_num(u16 dom)
+{
+   unsigned int i;
+
+   if (test_and_set_bit(dom, hvpci_dom_map) == 0)
+   return dom;
+
+   for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
+   if (test_and_set_bit(i, hvpci_dom_map) == 0)
+   return i;
+   }
+
+   return HVPCI_DOM_INVALID;
+}
+
+/**
+ * hv_put_dom_num() - Mark the PCI domain number as free
+ * @dom: Domain number to be freed
+ */
+static void hv_put_dom_num(u16 dom)
+{
+   clear_bit(dom, hvpci_dom_map);
+}
+
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:  VMBus's tracking struct for this root PCI bus
@@ -2518,6 +2561,7 @@ static int hv_pci_probe(struct hv_device *hdev,
const struct hv_vmbus_device_id *dev_id)
 {
struct hv_pcibus_device *hbus;
+   u16 dom_req, dom;
int ret;
 
/*
@@ -2532,19 +2576,32 @@ static int hv_pci_probe(struct hv_device *hdev,
hbus->state = hv_pcibus_init;
 
/*
-* The PCI bus "domain" is what is called "segment" in ACPI and
-* other specs.  Pull it from the instance ID, to get something
-* unique.  Bytes 8 and 9 are what is used in Windows guests, so
-* do the same thing for consistency.  Note that, since this code
-* only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
-* that (1) the only domain in use for something that looks like
-* a physical PCI bus (which is actually emulated by the
-* hypervisor) is domain 0 and (2) there will be no overlap
-* between domains derived from these instance IDs in the same
-* VM.
+* The PCI bus "domain" is what is called "segment" in ACPI and other
+* specs. Pull it from the instance ID, to get something usually
+* unique. In rare cases of collision, we will find out another number
+* not in use.
+* Note that, since this code only runs in a Hyper-V VM, Hyper-V
+* together with this guest driver can guarantee that (1) The only
+* domain used by Gen1 VMs for something that looks like a physical
+* PCI bus (which is actually emulated by the hypervisor) is domain 0.
+* (2) There will be no overlap between domains (after fixing possible
+* collisions) in the same VM.
 */
-   hbus->sysdata.domain = hdev->dev_instance.b[9] |
-  hdev->dev_instance.b[8] << 8;
+   dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
+   dom = hv_get_dom_num(dom_req);
+
+   if (dom == HVPCI_DOM_INVALID) {
+   pr_err("Unable to use dom# 0x%hx or other numbers",
+  dom_req);
+   ret = -EINVAL;
+   goto free_bus;
+   }
+
+   if (dom != dom_req)
+   pr_info("PCI dom# 0x%hx has collision, using 0x%hx",
+   dom_req, dom);
+
+   hbus->sysdata.domain = dom;
 
hbus->hdev = hdev;
refcount_set(>remove_lock, 1);
@@ -2559,7 +2616,7 @@ static int hv_pci_probe(struct hv_device *hdev,
   hbus->sysdata.domain);
if (!hbus->wq) {

[PATCH hyperv-fixes] hv_netvsc: Fix unwanted wakeup after tx_disable

2019-03-28 Thread Haiyang Zhang
From: Haiyang Zhang 

After a queue is stopped, the wakeup mechanism may wake it up again
when ring buffer usage is lower than a threshold. This may cause
send path panic on NULL pointer when we stopped all tx queues in
netvsc_detach and start removing the netvsc device.

This patch fixes it by adding a tx_disable flag to prevent unwanted
queue wakeup.

Fixes: 7b2ee50c0cd5 ("hv_netvsc: common detach logic")
Reported-by: Mohammed Gamal 
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/hyperv_net.h |  1 +
 drivers/net/hyperv/netvsc.c |  6 --
 drivers/net/hyperv/netvsc_drv.c | 32 ++--
 3 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index e859ae2..49f41b6 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -987,6 +987,7 @@ struct netvsc_device {
 
wait_queue_head_t wait_drain;
bool destroy;
+   bool tx_disable; /* if true, do not wake up queue again */
 
/* Receive buffer allocated by us but manages by NetVSP */
void *recv_buf;
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 813d195..e0dce37 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -110,6 +110,7 @@ static struct netvsc_device *alloc_net_device(void)
 
init_waitqueue_head(_device->wait_drain);
net_device->destroy = false;
+   net_device->tx_disable = false;
 
net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT;
net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT;
@@ -719,7 +720,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
} else {
struct netdev_queue *txq = netdev_get_tx_queue(ndev, q_idx);
 
-   if (netif_tx_queue_stopped(txq) &&
+   if (netif_tx_queue_stopped(txq) && !net_device->tx_disable &&
(hv_get_avail_to_write_percent(>outbound) >
 RING_AVAIL_PERCENT_HIWATER || queue_sends < 1)) {
netif_tx_wake_queue(txq);
@@ -874,7 +875,8 @@ static inline int netvsc_send_pkt(
} else if (ret == -EAGAIN) {
netif_tx_stop_queue(txq);
ndev_ctx->eth_stats.stop_queue++;
-   if (atomic_read(>queue_sends) < 1) {
+   if (atomic_read(>queue_sends) < 1 &&
+   !net_device->tx_disable) {
netif_tx_wake_queue(txq);
ndev_ctx->eth_stats.wake_queue++;
ret = -ENOSPC;
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 1a08679..0824155 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -109,6 +109,15 @@ static void netvsc_set_rx_mode(struct net_device *net)
rcu_read_unlock();
 }
 
+static inline void netvsc_tx_enable(struct netvsc_device *nvscdev,
+   struct net_device *ndev)
+{
+   nvscdev->tx_disable = false;
+   mb(); /* ensure queue wake up mechanism is on */
+
+   netif_tx_wake_all_queues(ndev);
+}
+
 static int netvsc_open(struct net_device *net)
 {
struct net_device_context *ndev_ctx = netdev_priv(net);
@@ -129,7 +138,7 @@ static int netvsc_open(struct net_device *net)
rdev = nvdev->extension;
if (!rdev->link_state) {
netif_carrier_on(net);
-   netif_tx_wake_all_queues(net);
+   netvsc_tx_enable(nvdev, net);
}
 
if (vf_netdev) {
@@ -184,6 +193,17 @@ static int netvsc_wait_until_empty(struct netvsc_device 
*nvdev)
}
 }
 
+static inline void netvsc_tx_disable(struct netvsc_device *nvscdev,
+struct net_device *ndev)
+{
+   if (nvscdev) {
+   nvscdev->tx_disable = true;
+   mb(); /* ensure txq will not wake up after stop */
+   }
+
+   netif_tx_disable(ndev);
+}
+
 static int netvsc_close(struct net_device *net)
 {
struct net_device_context *net_device_ctx = netdev_priv(net);
@@ -192,7 +212,7 @@ static int netvsc_close(struct net_device *net)
struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
int ret;
 
-   netif_tx_disable(net);
+   netvsc_tx_disable(nvdev, net);
 
/* No need to close rndis filter if it is removed already */
if (!nvdev)
@@ -918,7 +938,7 @@ static int netvsc_detach(struct net_device *ndev,
 
/* If device was up (receiving) then shutdown */
if (netif_running(ndev)) {
-   netif_tx_disable(ndev);
+   netvsc_tx_disable(nvdev, ndev);
 
ret = rndis_filter_close(nvdev);
if (ret) {
@@ -1906,7 +1926,7 @@ static void netvsc_link_change(struct work_struct *w)
if (rdev->

RE: [PATCH hyperv-fixes] hv_netvsc: Fix IP header checksum for coalesced packets

2019-02-23 Thread Haiyang Zhang



> -Original Message-
> From: Stephen Hemminger 
> Sent: Saturday, February 23, 2019 11:46 AM
> To: Haiyang Zhang 
> Cc: Haiyang Zhang ; sas...@kernel.org; linux-
> hyp...@vger.kernel.org; KY Srinivasan ; Stephen
> Hemminger ; o...@aepfle.de; vkuznets
> ; da...@davemloft.net; net...@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH hyperv-fixes] hv_netvsc: Fix IP header checksum for
> coalesced packets
> 
> On Fri, 22 Feb 2019 18:25:03 +0000
> Haiyang Zhang  wrote:
> 
> > From: Haiyang Zhang 
> >
> > Incoming packets may have IP header checksum verified by the host.
> > They may not have IP header checksum computed after coalescing.
> > This patch re-computes the checksum when necessary, otherwise the
> > packets may be dropped, because Linux network stack always checks it.
> >
> > Signed-off-by: Haiyang Zhang 
> > ---
> >  drivers/net/hyperv/netvsc_drv.c | 22 +++---
> >  1 file changed, 19 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/net/hyperv/netvsc_drv.c
> > b/drivers/net/hyperv/netvsc_drv.c index 256adbd044f5..cf4897043e83
> > 100644
> > --- a/drivers/net/hyperv/netvsc_drv.c
> > +++ b/drivers/net/hyperv/netvsc_drv.c
> > @@ -744,6 +744,14 @@ void netvsc_linkstatus_callback(struct net_device
> *net,
> > schedule_delayed_work(_ctx->dwork, 0);  }
> >
> > +static void netvsc_comp_ipcsum(struct sk_buff *skb) {
> > +   struct iphdr *iph = (struct iphdr *)skb->data;
> 
> Can you use iphdr(skb) here?
This skb is just allocated by netvsc, the skb->network_header is not set yet.

> 
> > +
> > +   iph->check = 0;
> > +   iph->check = ip_fast_csum(iph, iph->ihl); }
> > +
> >  static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
> >  struct netvsc_channel *nvchan)
> { @@ -770,9 +778,17 @@
> > static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
> > /* skb is already created with CHECKSUM_NONE */
> > skb_checksum_none_assert(skb);
> >
> > -   /*
> > -* In Linux, the IP checksum is always checked.
> > -* Do L4 checksum offload if enabled and present.
> > +   /* Incoming packets may have IP header checksum verified by the
> host.
> > +* They may not have IP header checksum computed after coalescing.
> > +* We compute it here if the flags are set, because on Linux, the IP
> > +* checksum is always checked.
> > +*/
> > +   if (csum_info && csum_info->receive.ip_checksum_value_invalid &&
> > +   csum_info->receive.ip_checksum_succeeded &&
> > +   skb->protocol == htons(ETH_P_IP))
> > +   netvsc_comp_ipcsum(skb);
> 
> Does this still handle for coalesced and non-coalesced packets which are
> received with bad IP checksum?  My concern is that you are potentially
> correcting the checksum for a packet whose received checksum was bad.

The Windows networking team told me that the flags above indicate that the host
side has already verified the checksum. The online doc is here:
https://docs.microsoft.com/en-us/windows-hardware/drivers/network/indicating-coalesced-segments
If the NIC or miniport driver validates the TCP and IPv4 checksums but does not 
recompute them for the coalesced segment, it must set the 
TcpChecksumValueInvalid and IpChecksumValueInvalid flags in the 
NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO structure. Additionally, in this case 
the NIC or miniport driver may optionally zero out the TCP and IPv4 header 
checksum values in the segment.

The NIC and miniport driver must always set the IpChecksumSucceeded and 
TcpChecksumSucceeded flags in the NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO 
structure before indicating the coalesced segment.

Thanks,
- Haiyang



[PATCH hyperv-fixes] hv_netvsc: Fix IP header checksum for coalesced packets

2019-02-22 Thread Haiyang Zhang
From: Haiyang Zhang 

Incoming packets may have IP header checksum verified by the host.
They may not have IP header checksum computed after coalescing.
This patch re-computes the checksum when necessary, otherwise the
packets may be dropped, because Linux network stack always checks it.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c | 22 +++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 256adbd044f5..cf4897043e83 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -744,6 +744,14 @@ void netvsc_linkstatus_callback(struct net_device *net,
schedule_delayed_work(_ctx->dwork, 0);
 }
 
+static void netvsc_comp_ipcsum(struct sk_buff *skb)
+{
+   struct iphdr *iph = (struct iphdr *)skb->data;
+
+   iph->check = 0;
+   iph->check = ip_fast_csum(iph, iph->ihl);
+}
+
 static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
 struct netvsc_channel *nvchan)
 {
@@ -770,9 +778,17 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct 
net_device *net,
/* skb is already created with CHECKSUM_NONE */
skb_checksum_none_assert(skb);
 
-   /*
-* In Linux, the IP checksum is always checked.
-* Do L4 checksum offload if enabled and present.
+   /* Incoming packets may have IP header checksum verified by the host.
+* They may not have IP header checksum computed after coalescing.
+* We compute it here if the flags are set, because on Linux, the IP
+* checksum is always checked.
+*/
+   if (csum_info && csum_info->receive.ip_checksum_value_invalid &&
+   csum_info->receive.ip_checksum_succeeded &&
+   skb->protocol == htons(ETH_P_IP))
+   netvsc_comp_ipcsum(skb);
+
+   /* Do L4 checksum offload if enabled and present.
 */
if (csum_info && (net->features & NETIF_F_RXCSUM)) {
if (csum_info->receive.tcp_checksum_succeeded ||
-- 
2.19.1



[PATCH v2] MAINTAINERS: Change mailing list for Hyper-V CORE AND DRIVERS

2019-02-20 Thread Haiyang Zhang
From: Haiyang Zhang 

The new mailing list is: linux-hyp...@vger.kernel.org

Signed-off-by: Haiyang Zhang 
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 86aa227b5782..403d6e4b8257 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7137,7 +7137,7 @@ M:Haiyang Zhang 
 M: Stephen Hemminger 
 M: Sasha Levin 
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git
-L: de...@linuxdriverproject.org
+L: linux-hyp...@vger.kernel.org
 S: Supported
 F: Documentation/networking/device_drivers/microsoft/netvsc.txt
 F: arch/x86/include/asm/mshyperv.h
-- 
2.19.1



[PATCH] MAINTAINERS: Add mailing list for Hyper-V CORE AND DRIVERS

2019-02-20 Thread Haiyang Zhang
From: Haiyang Zhang 

The new mailing list is: linux-hyp...@vger.kernel.org

Signed-off-by: Haiyang Zhang 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 86aa227b5782..ef65de3cfe1b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7137,6 +7137,7 @@ M:Haiyang Zhang 
 M: Stephen Hemminger 
 M: Sasha Levin 
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git
+L: linux-hyp...@vger.kernel.org
 L: de...@linuxdriverproject.org
 S: Supported
 F: Documentation/networking/device_drivers/microsoft/netvsc.txt
-- 
2.19.1



RE: [PATCH] MAINTAINERS: Add mailing list for Hyper-V CORE AND DRIVERS

2019-02-20 Thread Haiyang Zhang



> -Original Message-
> From: Greg KH 
> Sent: Wednesday, February 20, 2019 2:52 PM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; linux-hyp...@vger.kernel.org;
> de...@linuxdriverproject.org; Stephen Hemminger
> ; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] MAINTAINERS: Add mailing list for Hyper-V CORE AND
> DRIVERS
> 
> On Wed, Feb 20, 2019 at 07:48:23PM +, Haiyang Zhang wrote:
> > From: Haiyang Zhang 
> >
> > The new mailing list is: linux-hyp...@vger.kernel.org
> >
> > Signed-off-by: Haiyang Zhang 
> > ---
> >  MAINTAINERS | 1 +
> >  1 file changed, 1 insertion(+)
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS index
> > 86aa227b5782..ef65de3cfe1b 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -7137,6 +7137,7 @@ M:Haiyang Zhang 
> >  M: Stephen Hemminger 
> >  M: Sasha Levin 
> >  T: git git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git
> > +L: linux-hyp...@vger.kernel.org
> >  L: de...@linuxdriverproject.org
> 
> Why not just drop the linuxdriverproject one when you do this?
> 
> I for one will not mind :)

I will. Thanks,
- Haiyang


[PATCH hyperv-fixes,3/3] Fix hash key value reset after other ops

2019-01-14 Thread Haiyang Zhang
From: Haiyang Zhang 

Operations that change the MTU, channels, or buffer sizes call
netvsc_attach() and rndis_set_subchannel(), which always reset the hash
key to its default value. That overrides any hash key set previously.
This patch fixes the problem by saving the hash key, then restoring it
when we re-add the netvsc device.

Fixes: ff4a44199012 ("netvsc: allow get/set of RSS indirection table")
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/hyperv_net.h   | 10 +++---
 drivers/net/hyperv/netvsc.c   |  2 +-
 drivers/net/hyperv/netvsc_drv.c   |  5 -
 drivers/net/hyperv/rndis_filter.c |  9 +++--
 4 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index ef6f766f6389..e598a684700b 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -144,6 +144,8 @@ struct hv_netvsc_packet {
u32 total_data_buflen;
 };
 
+#define NETVSC_HASH_KEYLEN 40
+
 struct netvsc_device_info {
unsigned char mac_adr[ETH_ALEN];
u32  num_chn;
@@ -151,6 +153,8 @@ struct netvsc_device_info {
u32  recv_sections;
u32  send_section_size;
u32  recv_section_size;
+
+   u8 rss_key[NETVSC_HASH_KEYLEN];
 };
 
 enum rndis_device_state {
@@ -160,8 +164,6 @@ enum rndis_device_state {
RNDIS_DEV_DATAINITIALIZED,
 };
 
-#define NETVSC_HASH_KEYLEN 40
-
 struct rndis_device {
struct net_device *ndev;
 
@@ -209,7 +211,9 @@ int netvsc_recv_callback(struct net_device *net,
 void netvsc_channel_cb(void *context);
 int netvsc_poll(struct napi_struct *napi, int budget);
 
-int rndis_set_subchannel(struct net_device *ndev, struct netvsc_device *nvdev);
+int rndis_set_subchannel(struct net_device *ndev,
+struct netvsc_device *nvdev,
+struct netvsc_device_info *dev_info);
 int rndis_filter_open(struct netvsc_device *nvdev);
 int rndis_filter_close(struct netvsc_device *nvdev);
 struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 922054c1d544..1910810e55bd 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -84,7 +84,7 @@ static void netvsc_subchan_work(struct work_struct *w)
 
rdev = nvdev->extension;
if (rdev) {
-   ret = rndis_set_subchannel(rdev->ndev, nvdev);
+   ret = rndis_set_subchannel(rdev->ndev, nvdev, NULL);
if (ret == 0) {
netif_device_attach(rdev->ndev);
} else {
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index f424327f7206..e281829a04ef 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -877,6 +877,9 @@ static struct netvsc_device_info *netvsc_devinfo_get
dev_info->send_section_size = nvdev->send_section_size;
dev_info->recv_sections = nvdev->recv_section_cnt;
dev_info->recv_section_size = nvdev->recv_section_size;
+
+   memcpy(dev_info->rss_key, nvdev->extension->rss_key,
+  NETVSC_HASH_KEYLEN);
} else {
dev_info->num_chn = VRSS_CHANNEL_DEFAULT;
dev_info->send_sections = NETVSC_DEFAULT_TX;
@@ -939,7 +942,7 @@ static int netvsc_attach(struct net_device *ndev,
return PTR_ERR(nvdev);
 
if (nvdev->num_chn > 1) {
-   ret = rndis_set_subchannel(ndev, nvdev);
+   ret = rndis_set_subchannel(ndev, nvdev, dev_info);
 
/* if unavailable, just proceed with one queue */
if (ret) {
diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index a4661d396e3c..db81378e6624 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1134,7 +1134,9 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
  * This breaks overlap of processing the host message for the
  * new primary channel with the initialization of sub-channels.
  */
-int rndis_set_subchannel(struct net_device *ndev, struct netvsc_device *nvdev)
+int rndis_set_subchannel(struct net_device *ndev,
+struct netvsc_device *nvdev,
+struct netvsc_device_info *dev_info)
 {
struct nvsp_message *init_packet = >channel_init_pkt;
struct net_device_context *ndev_ctx = netdev_priv(ndev);
@@ -1175,7 +1177,10 @@ int rndis_set_subchannel(struct net_device *ndev, struct 
netvsc_device *nvdev)
   atomic_read(>open_chn) == nvdev->num_chn);
 
/* ignore failues from setting rss parameters, still have channels */
-   rndis_filter_set_rss_param(rdev, netvsc_hash_key);
+   if (dev_info)
+   rndis_filter_set_rss_param(rdev, dev_info->rss_key);
+   

[PATCH hyperv-fixes,2/3] Refactor assignments of struct netvsc_device_info

2019-01-14 Thread Haiyang Zhang
From: Haiyang Zhang 

These assignments occur in multiple places. The patch refactors them
into a function for simplicity. It also moves the struct to the heap
for future expansion.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c | 134 
 1 file changed, 85 insertions(+), 49 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 91ed15ea5883..f424327f7206 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -858,6 +858,36 @@ static void netvsc_get_channels(struct net_device *net,
}
 }
 
+/* Alloc struct netvsc_device_info, and initialize it from either existing
+ * struct netvsc_device, or from default values.
+ */
+static struct netvsc_device_info *netvsc_devinfo_get
+   (struct netvsc_device *nvdev)
+{
+   struct netvsc_device_info *dev_info;
+
+   dev_info = kzalloc(sizeof(*dev_info), GFP_ATOMIC);
+
+   if (!dev_info)
+   return NULL;
+
+   if (nvdev) {
+   dev_info->num_chn = nvdev->num_chn;
+   dev_info->send_sections = nvdev->send_section_cnt;
+   dev_info->send_section_size = nvdev->send_section_size;
+   dev_info->recv_sections = nvdev->recv_section_cnt;
+   dev_info->recv_section_size = nvdev->recv_section_size;
+   } else {
+   dev_info->num_chn = VRSS_CHANNEL_DEFAULT;
+   dev_info->send_sections = NETVSC_DEFAULT_TX;
+   dev_info->send_section_size = NETVSC_SEND_SECTION_SIZE;
+   dev_info->recv_sections = NETVSC_DEFAULT_RX;
+   dev_info->recv_section_size = NETVSC_RECV_SECTION_SIZE;
+   }
+
+   return dev_info;
+}
+
 static int netvsc_detach(struct net_device *ndev,
 struct netvsc_device *nvdev)
 {
@@ -943,7 +973,7 @@ static int netvsc_set_channels(struct net_device *net,
struct net_device_context *net_device_ctx = netdev_priv(net);
struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
unsigned int orig, count = channels->combined_count;
-   struct netvsc_device_info device_info;
+   struct netvsc_device_info *device_info;
int ret;
 
/* We do not support separate count for rx, tx, or other */
@@ -962,24 +992,26 @@ static int netvsc_set_channels(struct net_device *net,
 
orig = nvdev->num_chn;
 
-   memset(_info, 0, sizeof(device_info));
-   device_info.num_chn = count;
-   device_info.send_sections = nvdev->send_section_cnt;
-   device_info.send_section_size = nvdev->send_section_size;
-   device_info.recv_sections = nvdev->recv_section_cnt;
-   device_info.recv_section_size = nvdev->recv_section_size;
+   device_info = netvsc_devinfo_get(nvdev);
+
+   if (!device_info)
+   return -ENOMEM;
+
+   device_info->num_chn = count;
 
ret = netvsc_detach(net, nvdev);
if (ret)
-   return ret;
+   goto out;
 
-   ret = netvsc_attach(net, _info);
+   ret = netvsc_attach(net, device_info);
if (ret) {
-   device_info.num_chn = orig;
-   if (netvsc_attach(net, _info))
+   device_info->num_chn = orig;
+   if (netvsc_attach(net, device_info))
netdev_err(net, "restoring channel setting failed\n");
}
 
+out:
+   kfree(device_info);
return ret;
 }
 
@@ -1048,48 +1080,45 @@ static int netvsc_change_mtu(struct net_device *ndev, 
int mtu)
struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
int orig_mtu = ndev->mtu;
-   struct netvsc_device_info device_info;
+   struct netvsc_device_info *device_info;
int ret = 0;
 
if (!nvdev || nvdev->destroy)
return -ENODEV;
 
+   device_info = netvsc_devinfo_get(nvdev);
+
+   if (!device_info)
+   return -ENOMEM;
+
/* Change MTU of underlying VF netdev first. */
if (vf_netdev) {
ret = dev_set_mtu(vf_netdev, mtu);
if (ret)
-   return ret;
+   goto out;
}
 
-   memset(_info, 0, sizeof(device_info));
-   device_info.num_chn = nvdev->num_chn;
-   device_info.send_sections = nvdev->send_section_cnt;
-   device_info.send_section_size = nvdev->send_section_size;
-   device_info.recv_sections = nvdev->recv_section_cnt;
-   device_info.recv_section_size = nvdev->recv_section_size;
-
ret = netvsc_detach(ndev, nvdev);
if (ret)
goto rollback_vf;
 
ndev->mtu = mtu;
 
-   ret = netvsc_attach(ndev, _info);
-   if (ret)
-   goto rollback;
-
-   r

[PATCH hyperv-fixes,0/3] fixes for hash key setting issues

2019-01-14 Thread Haiyang Zhang
From: Haiyang Zhang 

Using ethtool to change the hash key failed on Linux VMs running on
Hyper-V. This patch set fixes these issues.
It targets Hyper-V tree, hyperv-fixes branch managed by 
Sasha Levin .

Haiyang Zhang (3):
  Fix ethtool change hash key error
  Refactor assignments of struct netvsc_device_info
  Fix hash key value reset after other ops

 drivers/net/hyperv/hyperv_net.h   |  10 ++-
 drivers/net/hyperv/netvsc.c   |   2 +-
 drivers/net/hyperv/netvsc_drv.c   | 139 +++---
 drivers/net/hyperv/rndis_filter.c |  34 ++--
 4 files changed, 123 insertions(+), 62 deletions(-)

-- 
2.19.1



[PATCH hyperv-fixes,1/3] Fix ethtool change hash key error

2019-01-14 Thread Haiyang Zhang
From: Haiyang Zhang 

Hyper-V hosts require us to disable RSS before changing the RSS key;
otherwise the change request will fail. This patch fixes the
coding error.

Fixes: ff4a44199012 ("netvsc: allow get/set of RSS indirection table")
Reported-by: Wei Hu 
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/rndis_filter.c | 25 +++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index 8b537a049c1e..a4661d396e3c 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -774,8 +774,8 @@ rndis_filter_set_offload_params(struct net_device *ndev,
return ret;
 }
 
-int rndis_filter_set_rss_param(struct rndis_device *rdev,
-  const u8 *rss_key)
+static int rndis_set_rss_param_msg(struct rndis_device *rdev,
+  const u8 *rss_key, u16 flag)
 {
struct net_device *ndev = rdev->ndev;
struct rndis_request *request;
@@ -804,7 +804,7 @@ int rndis_filter_set_rss_param(struct rndis_device *rdev,
rssp->hdr.type = NDIS_OBJECT_TYPE_RSS_PARAMETERS;
rssp->hdr.rev = NDIS_RECEIVE_SCALE_PARAMETERS_REVISION_2;
rssp->hdr.size = sizeof(struct ndis_recv_scale_param);
-   rssp->flag = 0;
+   rssp->flag = flag;
rssp->hashinfo = NDIS_HASH_FUNC_TOEPLITZ | NDIS_HASH_IPV4 |
 NDIS_HASH_TCP_IPV4 | NDIS_HASH_IPV6 |
 NDIS_HASH_TCP_IPV6;
@@ -829,9 +829,12 @@ int rndis_filter_set_rss_param(struct rndis_device *rdev,
 
wait_for_completion(>wait_event);
set_complete = >response_msg.msg.set_complete;
-   if (set_complete->status == RNDIS_STATUS_SUCCESS)
-   memcpy(rdev->rss_key, rss_key, NETVSC_HASH_KEYLEN);
-   else {
+   if (set_complete->status == RNDIS_STATUS_SUCCESS) {
+   if (!(flag & NDIS_RSS_PARAM_FLAG_DISABLE_RSS) &&
+   !(flag & NDIS_RSS_PARAM_FLAG_HASH_KEY_UNCHANGED))
+   memcpy(rdev->rss_key, rss_key, NETVSC_HASH_KEYLEN);
+
+   } else {
netdev_err(ndev, "Fail to set RSS parameters:0x%x\n",
   set_complete->status);
ret = -EINVAL;
@@ -842,6 +845,16 @@ int rndis_filter_set_rss_param(struct rndis_device *rdev,
return ret;
 }
 
+int rndis_filter_set_rss_param(struct rndis_device *rdev,
+  const u8 *rss_key)
+{
+   /* Disable RSS before change */
+   rndis_set_rss_param_msg(rdev, rss_key,
+   NDIS_RSS_PARAM_FLAG_DISABLE_RSS);
+
+   return rndis_set_rss_param_msg(rdev, rss_key, 0);
+}
+
 static int rndis_filter_query_device_link_status(struct rndis_device *dev,
 struct netvsc_device 
*net_device)
 {
-- 
2.19.1



RE: [PATCH] hv_netvsc: fix typos in code comments

2019-01-03 Thread Haiyang Zhang



> -Original Message-
> From: Adrian Vladu 
> Sent: Thursday, January 3, 2019 2:43 PM
> To: linux-kernel@vger.kernel.org
> Cc: Adrian Vladu ; KY Srinivasan
> ; Haiyang Zhang ; Stephen
> Hemminger ; Sasha Levin ;
> David S. Miller ; apilotti
> 
> Subject: [PATCH] hv_netvsc: fix typos in code comments
> 
> Fix all typos from hyperv netvsc code comments.
> 
> Signed-off-by: Adrian Vladu 
> 
> Cc: "K. Y. Srinivasan" 
> Cc: Haiyang Zhang 
> Cc: Stephen Hemminger 
> Cc: Sasha Levin 
> Cc: "David S. Miller" 
> Cc: "Alessandro Pilotti" 
> ---

Thanks.

Reviewed-by: Haiyang Zhang 


RE: [PATCH V2 3/5] Drivers: hv: kvp: Fix the recent regression caused by incorrect clean-up

2018-11-01 Thread Haiyang Zhang



> -Original Message-
> From: k...@linuxonhyperv.com 
> Sent: Thursday, October 18, 2018 1:10 AM
> To: gre...@linuxfoundation.org; linux-kernel@vger.kernel.org;
> de...@linuxdriverproject.org; o...@aepfle.de; a...@canonical.com;
> jasow...@redhat.com; Stephen Hemminger ;
> Michael Kelley ; vkuznets 
> Cc: Dexuan Cui ; KY Srinivasan ;
> Haiyang Zhang ; sta...@vger.kernel.org
> Subject: [PATCH V2 3/5] Drivers: hv: kvp: Fix the recent regression caused by
> incorrect clean-up
> 
> From: Dexuan Cui 
> 
> In kvp_send_key(), we do need call process_ib_ipinfo() if
> message->kvp_hdr.operation is KVP_OP_GET_IP_INFO, because it turns out
> the userland hv_kvp_daemon needs the info of operation, adapter_id and
> addr_family. With the incorrect fc62c3b1977d, the host can't get the VM's IP
> via KVP.
> 
> And, fc62c3b1977d added a "break;", but actually forgot to initialize the
> key_size/value in the case of KVP_OP_SET, so the default key_size of
> 0 is passed to the kvp daemon, and the pool files
> /var/lib/hyperv/.kvp_pool_* can't be updated.
> 
> This patch effectively rolls back the previous fc62c3b1977d, and correctly 
> fixes
> the "this statement may fall through" warnings.
> 
> This patch is tested on WS 2012 R2 and 2016.
> 
> Fixes: fc62c3b1977d ("Drivers: hv: kvp: Fix two "this statement may fall
> through" warnings")
> Signed-off-by: Dexuan Cui 
> Cc: K. Y. Srinivasan 
> Cc: Haiyang Zhang 
> Cc: Stephen Hemminger 
> Cc: 
> Signed-off-by: K. Y. Srinivasan 

Signed-off-by: Haiyang Zhang 

Thanks!



RE: [PATCH V2 3/5] Drivers: hv: kvp: Fix the recent regression caused by incorrect clean-up

2018-11-01 Thread Haiyang Zhang



> -Original Message-
> From: k...@linuxonhyperv.com 
> Sent: Thursday, October 18, 2018 1:10 AM
> To: gre...@linuxfoundation.org; linux-kernel@vger.kernel.org;
> de...@linuxdriverproject.org; o...@aepfle.de; a...@canonical.com;
> jasow...@redhat.com; Stephen Hemminger ;
> Michael Kelley ; vkuznets 
> Cc: Dexuan Cui ; KY Srinivasan ;
> Haiyang Zhang ; sta...@vger.kernel.org
> Subject: [PATCH V2 3/5] Drivers: hv: kvp: Fix the recent regression caused by
> incorrect clean-up
> 
> From: Dexuan Cui 
> 
> In kvp_send_key(), we do need call process_ib_ipinfo() if
> message->kvp_hdr.operation is KVP_OP_GET_IP_INFO, because it turns out
> the userland hv_kvp_daemon needs the info of operation, adapter_id and
> addr_family. With the incorrect fc62c3b1977d, the host can't get the VM's IP
> via KVP.
> 
> And, fc62c3b1977d added a "break;", but actually forgot to initialize the
> key_size/value in the case of KVP_OP_SET, so the default key_size of
> 0 is passed to the kvp daemon, and the pool files
> /var/lib/hyperv/.kvp_pool_* can't be updated.
> 
> This patch effectively rolls back the previous fc62c3b1977d, and correctly 
> fixes
> the "this statement may fall through" warnings.
> 
> This patch is tested on WS 2012 R2 and 2016.
> 
> Fixes: fc62c3b1977d ("Drivers: hv: kvp: Fix two "this statement may fall
> through" warnings")
> Signed-off-by: Dexuan Cui 
> Cc: K. Y. Srinivasan 
> Cc: Haiyang Zhang 
> Cc: Stephen Hemminger 
> Cc: 
> Signed-off-by: K. Y. Srinivasan 

Signed-off-by: Haiyang Zhang 

Thanks!



RE: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial number

2018-10-17 Thread Haiyang Zhang



> -Original Message-
> From: Greg Kroah-Hartman 
> Sent: Wednesday, October 17, 2018 9:17 AM
> To: Haiyang Zhang 
> Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; Stephen Hemminger
> ; David S. Miller ; Sasha
> Levin 
> Subject: Re: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial number
> 
> On Wed, Oct 17, 2018 at 01:02:17PM +, Haiyang Zhang wrote:
> >
> >
> > > -Original Message-
> > > From: Greg Kroah-Hartman 
> > > Sent: Wednesday, October 17, 2018 3:46 AM
> > > To: Haiyang Zhang 
> > > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; Stephen
> > > Hemminger ; David S. Miller
> > > ; Sasha Levin 
> > > Subject: Re: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial
> > > number
> > >
> > > On Tue, Oct 16, 2018 at 07:13:12PM +, Haiyang Zhang wrote:
> > > >
> > > >
> > > > > -Original Message-
> > > > > From: linux-kernel-ow...@vger.kernel.org  > > > > ow...@vger.kernel.org> On Behalf Of Greg Kroah-Hartman
> > > > > Sent: Tuesday, October 16, 2018 1:06 PM
> > > > > To: linux-kernel@vger.kernel.org
> > > > > Cc: Greg Kroah-Hartman ;
> > > > > sta...@vger.kernel.org; Stephen Hemminger
> > > > > ; David S. Miller ;
> > > > > Sasha Levin 
> > > > > Subject: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial
> > > > > number
> > > > >
> > > > > 4.18-stable review patch.  If anyone has any objections, please let me
> know.
> > > > >
> > > > > --
> > > > >
> > > > > From: Stephen Hemminger 
> > > > >
> > > > > [ Upstream commit 00d7ddba1143623b31bc2c15d18216e2da031b14 ]
> > > > >
> > > > > Matching network device based on MAC address is problematic
> > > > > since a non VF network device can be creted with a duplicate MAC
> > > > > address causing confusion and problems.  The VMBus API does
> > > > > provide a serial number that is a better matching method.
> > > > >
> > > > > Signed-off-by: Stephen Hemminger 
> > > > > Signed-off-by: David S. Miller 
> > > > > Signed-off-by: Sasha Levin 
> > > > > Signed-off-by: Greg Kroah-Hartman 
> > > > > ---
> > > > >  drivers/net/hyperv/netvsc.c |3 ++
> > > > >  drivers/net/hyperv/netvsc_drv.c |   58 ++---
> -
> > > -
> > > > > -
> > > > >  2 files changed, 36 insertions(+), 25 deletions(-)
> > > > >
> > > > > --- a/drivers/net/hyperv/netvsc.c
> > > > > +++ b/drivers/net/hyperv/netvsc.c
> > > > > @@ -1203,6 +1203,9 @@ static void netvsc_send_vf(struct net_de
> > > > >
> > > > >   net_device_ctx->vf_alloc = nvmsg-
> >msg.v4_msg.vf_assoc.allocated;
> > > > >   net_device_ctx->vf_serial = nvmsg-
> >msg.v4_msg.vf_assoc.serial;
> > > > > + netdev_info(ndev, "VF slot %u %s\n",
> > > > > + net_device_ctx->vf_serial,
> > > > > + net_device_ctx->vf_alloc ? "added" : "removed");
> > > > >  }
> > > > >
> > > > >  static  void netvsc_receive_inband(struct net_device *ndev,
> > > > > --- a/drivers/net/hyperv/netvsc_drv.c
> > > > > +++ b/drivers/net/hyperv/netvsc_drv.c
> > > > > @@ -1794,20 +1794,6 @@ out_unlock:
> > > > >   rtnl_unlock();
> > > > >  }
> > > > >
> > > > > -static struct net_device *get_netvsc_bymac(const u8 *mac) -{
> > > > > - struct net_device_context *ndev_ctx;
> > > > > -
> > > > > - list_for_each_entry(ndev_ctx, _dev_list, list) {
> > > > > - struct net_device *dev = hv_get_drvdata(ndev_ctx-
> > > > > >device_ctx);
> > > > > -
> > > > > - if (ether_addr_equal(mac, dev->perm_addr))
> > > > > - return dev;
> > > > > - }
> > > > > -
> > > > > - return NULL;
> > > > > -}
> > > > > -
> > > > >  static struct net_device *get_netvsc_byref(struct net_device
> *vf_netdev)  {
> > > 

RE: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial number

2018-10-17 Thread Haiyang Zhang



> -Original Message-
> From: Greg Kroah-Hartman 
> Sent: Wednesday, October 17, 2018 9:17 AM
> To: Haiyang Zhang 
> Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; Stephen Hemminger
> ; David S. Miller ; Sasha
> Levin 
> Subject: Re: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial number
> 
> On Wed, Oct 17, 2018 at 01:02:17PM +, Haiyang Zhang wrote:
> >
> >
> > > -Original Message-
> > > From: Greg Kroah-Hartman 
> > > Sent: Wednesday, October 17, 2018 3:46 AM
> > > To: Haiyang Zhang 
> > > Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; Stephen
> > > Hemminger ; David S. Miller
> > > ; Sasha Levin 
> > > Subject: Re: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial
> > > number
> > >
> > > On Tue, Oct 16, 2018 at 07:13:12PM +, Haiyang Zhang wrote:
> > > >
> > > >
> > > > > -Original Message-
> > > > > From: linux-kernel-ow...@vger.kernel.org  > > > > ow...@vger.kernel.org> On Behalf Of Greg Kroah-Hartman
> > > > > Sent: Tuesday, October 16, 2018 1:06 PM
> > > > > To: linux-kernel@vger.kernel.org
> > > > > Cc: Greg Kroah-Hartman ;
> > > > > sta...@vger.kernel.org; Stephen Hemminger
> > > > > ; David S. Miller ;
> > > > > Sasha Levin 
> > > > > Subject: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial
> > > > > number
> > > > >
> > > > > 4.18-stable review patch.  If anyone has any objections, please let me
> know.
> > > > >
> > > > > --
> > > > >
> > > > > From: Stephen Hemminger 
> > > > >
> > > > > [ Upstream commit 00d7ddba1143623b31bc2c15d18216e2da031b14 ]
> > > > >
> > > > > Matching network device based on MAC address is problematic
> > > > > since a non VF network device can be creted with a duplicate MAC
> > > > > address causing confusion and problems.  The VMBus API does
> > > > > provide a serial number that is a better matching method.
> > > > >
> > > > > Signed-off-by: Stephen Hemminger 
> > > > > Signed-off-by: David S. Miller 
> > > > > Signed-off-by: Sasha Levin 
> > > > > Signed-off-by: Greg Kroah-Hartman 
> > > > > ---
> > > > >  drivers/net/hyperv/netvsc.c |3 ++
> > > > >  drivers/net/hyperv/netvsc_drv.c |   58 ++---
> -
> > > -
> > > > > -
> > > > >  2 files changed, 36 insertions(+), 25 deletions(-)
> > > > >
> > > > > --- a/drivers/net/hyperv/netvsc.c
> > > > > +++ b/drivers/net/hyperv/netvsc.c
> > > > > @@ -1203,6 +1203,9 @@ static void netvsc_send_vf(struct net_de
> > > > >
> > > > >   net_device_ctx->vf_alloc = nvmsg-
> >msg.v4_msg.vf_assoc.allocated;
> > > > >   net_device_ctx->vf_serial = nvmsg-
> >msg.v4_msg.vf_assoc.serial;
> > > > > + netdev_info(ndev, "VF slot %u %s\n",
> > > > > + net_device_ctx->vf_serial,
> > > > > + net_device_ctx->vf_alloc ? "added" : "removed");
> > > > >  }
> > > > >
> > > > >  static  void netvsc_receive_inband(struct net_device *ndev,
> > > > > --- a/drivers/net/hyperv/netvsc_drv.c
> > > > > +++ b/drivers/net/hyperv/netvsc_drv.c
> > > > > @@ -1794,20 +1794,6 @@ out_unlock:
> > > > >   rtnl_unlock();
> > > > >  }
> > > > >
> > > > > -static struct net_device *get_netvsc_bymac(const u8 *mac) -{
> > > > > - struct net_device_context *ndev_ctx;
> > > > > -
> > > > > - list_for_each_entry(ndev_ctx, _dev_list, list) {
> > > > > - struct net_device *dev = hv_get_drvdata(ndev_ctx-
> > > > > >device_ctx);
> > > > > -
> > > > > - if (ether_addr_equal(mac, dev->perm_addr))
> > > > > - return dev;
> > > > > - }
> > > > > -
> > > > > - return NULL;
> > > > > -}
> > > > > -
> > > > >  static struct net_device *get_netvsc_byref(struct net_device
> *vf_netdev)  {
> > > 

RE: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial number

2018-10-17 Thread Haiyang Zhang



> -Original Message-
> From: Greg Kroah-Hartman 
> Sent: Wednesday, October 17, 2018 3:46 AM
> To: Haiyang Zhang 
> Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; Stephen Hemminger
> ; David S. Miller ; Sasha
> Levin 
> Subject: Re: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial number
> 
> On Tue, Oct 16, 2018 at 07:13:12PM +, Haiyang Zhang wrote:
> >
> >
> > > -Original Message-
> > > From: linux-kernel-ow...@vger.kernel.org  > > ow...@vger.kernel.org> On Behalf Of Greg Kroah-Hartman
> > > Sent: Tuesday, October 16, 2018 1:06 PM
> > > To: linux-kernel@vger.kernel.org
> > > Cc: Greg Kroah-Hartman ;
> > > sta...@vger.kernel.org; Stephen Hemminger ;
> > > David S. Miller ; Sasha Levin
> > > 
> > > Subject: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial
> > > number
> > >
> > > 4.18-stable review patch.  If anyone has any objections, please let me 
> > > know.
> > >
> > > --
> > >
> > > From: Stephen Hemminger 
> > >
> > > [ Upstream commit 00d7ddba1143623b31bc2c15d18216e2da031b14 ]
> > >
> > > Matching network device based on MAC address is problematic since a
> > > non VF network device can be creted with a duplicate MAC address
> > > causing confusion and problems.  The VMBus API does provide a serial
> > > number that is a better matching method.
> > >
> > > Signed-off-by: Stephen Hemminger 
> > > Signed-off-by: David S. Miller 
> > > Signed-off-by: Sasha Levin 
> > > Signed-off-by: Greg Kroah-Hartman 
> > > ---
> > >  drivers/net/hyperv/netvsc.c |3 ++
> > >  drivers/net/hyperv/netvsc_drv.c |   58 ++
> -
> > > -
> > >  2 files changed, 36 insertions(+), 25 deletions(-)
> > >
> > > --- a/drivers/net/hyperv/netvsc.c
> > > +++ b/drivers/net/hyperv/netvsc.c
> > > @@ -1203,6 +1203,9 @@ static void netvsc_send_vf(struct net_de
> > >
> > >   net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated;
> > >   net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial;
> > > + netdev_info(ndev, "VF slot %u %s\n",
> > > + net_device_ctx->vf_serial,
> > > + net_device_ctx->vf_alloc ? "added" : "removed");
> > >  }
> > >
> > >  static  void netvsc_receive_inband(struct net_device *ndev,
> > > --- a/drivers/net/hyperv/netvsc_drv.c
> > > +++ b/drivers/net/hyperv/netvsc_drv.c
> > > @@ -1794,20 +1794,6 @@ out_unlock:
> > >   rtnl_unlock();
> > >  }
> > >
> > > -static struct net_device *get_netvsc_bymac(const u8 *mac) -{
> > > - struct net_device_context *ndev_ctx;
> > > -
> > > - list_for_each_entry(ndev_ctx, _dev_list, list) {
> > > - struct net_device *dev = hv_get_drvdata(ndev_ctx-
> > > >device_ctx);
> > > -
> > > - if (ether_addr_equal(mac, dev->perm_addr))
> > > - return dev;
> > > - }
> > > -
> > > - return NULL;
> > > -}
> > > -
> > >  static struct net_device *get_netvsc_byref(struct net_device *vf_netdev) 
> > >  {
> > >   struct net_device_context *net_device_ctx; @@ -1936,26 +1922,48
> @@
> > > static void netvsc_vf_setup(struct work_
> > >   rtnl_unlock();
> > >  }
> > >
> > > +/* Find netvsc by VMBus serial number.
> > > + * The PCI hyperv controller records the serial number as the slot.
> > > + */
> > > +static struct net_device *get_netvsc_byslot(const struct net_device
> > > +*vf_netdev) {
> > > + struct device *parent = vf_netdev->dev.parent;
> > > + struct net_device_context *ndev_ctx;
> > > + struct pci_dev *pdev;
> > > +
> > > + if (!parent || !dev_is_pci(parent))
> > > + return NULL; /* not a PCI device */
> > > +
> > > + pdev = to_pci_dev(parent);
> > > + if (!pdev->slot) {
> > > + netdev_notice(vf_netdev, "no PCI slot information\n");
> > > + return NULL;
> > > + }
> > > +
> > > + list_for_each_entry(ndev_ctx, _dev_list, list) {
> > > + if (!ndev_ctx->vf_alloc)
> > > + continue;
> > > +
> > > + if (ndev_ctx->vf_serial == pdev->slot->number)
> > > +   

RE: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial number

2018-10-17 Thread Haiyang Zhang



> -Original Message-
> From: Greg Kroah-Hartman 
> Sent: Wednesday, October 17, 2018 3:46 AM
> To: Haiyang Zhang 
> Cc: linux-kernel@vger.kernel.org; sta...@vger.kernel.org; Stephen Hemminger
> ; David S. Miller ; Sasha
> Levin 
> Subject: Re: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial number
> 
> On Tue, Oct 16, 2018 at 07:13:12PM +, Haiyang Zhang wrote:
> >
> >
> > > -Original Message-
> > > From: linux-kernel-ow...@vger.kernel.org  > > ow...@vger.kernel.org> On Behalf Of Greg Kroah-Hartman
> > > Sent: Tuesday, October 16, 2018 1:06 PM
> > > To: linux-kernel@vger.kernel.org
> > > Cc: Greg Kroah-Hartman ;
> > > sta...@vger.kernel.org; Stephen Hemminger ;
> > > David S. Miller ; Sasha Levin
> > > 
> > > Subject: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial
> > > number
> > >
> > > 4.18-stable review patch.  If anyone has any objections, please let me 
> > > know.
> > >
> > > --
> > >
> > > From: Stephen Hemminger 
> > >
> > > [ Upstream commit 00d7ddba1143623b31bc2c15d18216e2da031b14 ]
> > >
> > > Matching network device based on MAC address is problematic since a
> > > non VF network device can be creted with a duplicate MAC address
> > > causing confusion and problems.  The VMBus API does provide a serial
> > > number that is a better matching method.
> > >
> > > Signed-off-by: Stephen Hemminger 
> > > Signed-off-by: David S. Miller 
> > > Signed-off-by: Sasha Levin 
> > > Signed-off-by: Greg Kroah-Hartman 
> > > ---
> > >  drivers/net/hyperv/netvsc.c |3 ++
> > >  drivers/net/hyperv/netvsc_drv.c |   58 ++
> -
> > > -
> > >  2 files changed, 36 insertions(+), 25 deletions(-)
> > >
> > > --- a/drivers/net/hyperv/netvsc.c
> > > +++ b/drivers/net/hyperv/netvsc.c
> > > @@ -1203,6 +1203,9 @@ static void netvsc_send_vf(struct net_de
> > >
> > >   net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated;
> > >   net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial;
> > > + netdev_info(ndev, "VF slot %u %s\n",
> > > + net_device_ctx->vf_serial,
> > > + net_device_ctx->vf_alloc ? "added" : "removed");
> > >  }
> > >
> > >  static  void netvsc_receive_inband(struct net_device *ndev,
> > > --- a/drivers/net/hyperv/netvsc_drv.c
> > > +++ b/drivers/net/hyperv/netvsc_drv.c
> > > @@ -1794,20 +1794,6 @@ out_unlock:
> > >   rtnl_unlock();
> > >  }
> > >
> > > -static struct net_device *get_netvsc_bymac(const u8 *mac) -{
> > > - struct net_device_context *ndev_ctx;
> > > -
> > > - list_for_each_entry(ndev_ctx, _dev_list, list) {
> > > - struct net_device *dev = hv_get_drvdata(ndev_ctx-
> > > >device_ctx);
> > > -
> > > - if (ether_addr_equal(mac, dev->perm_addr))
> > > - return dev;
> > > - }
> > > -
> > > - return NULL;
> > > -}
> > > -
> > >  static struct net_device *get_netvsc_byref(struct net_device *vf_netdev) 
> > >  {
> > >   struct net_device_context *net_device_ctx; @@ -1936,26 +1922,48
> @@
> > > static void netvsc_vf_setup(struct work_
> > >   rtnl_unlock();
> > >  }
> > >
> > > +/* Find netvsc by VMBus serial number.
> > > + * The PCI hyperv controller records the serial number as the slot.
> > > + */
> > > +static struct net_device *get_netvsc_byslot(const struct net_device
> > > +*vf_netdev) {
> > > + struct device *parent = vf_netdev->dev.parent;
> > > + struct net_device_context *ndev_ctx;
> > > + struct pci_dev *pdev;
> > > +
> > > + if (!parent || !dev_is_pci(parent))
> > > + return NULL; /* not a PCI device */
> > > +
> > > + pdev = to_pci_dev(parent);
> > > + if (!pdev->slot) {
> > > + netdev_notice(vf_netdev, "no PCI slot information\n");
> > > + return NULL;
> > > + }
> > > +
> > > + list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
> > > + if (!ndev_ctx->vf_alloc)
> > > + continue;
> > > +
> > > + if (ndev_ctx->vf_serial == pdev->slot->number)
> > > +   

RE: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial number

2018-10-16 Thread Haiyang Zhang


> -Original Message-
> From: linux-kernel-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Greg Kroah-Hartman
> Sent: Tuesday, October 16, 2018 1:06 PM
> To: linux-kernel@vger.kernel.org
> Cc: Greg Kroah-Hartman ;
> sta...@vger.kernel.org; Stephen Hemminger ;
> David S. Miller ; Sasha Levin
> 
> Subject: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial number
> 
> 4.18-stable review patch.  If anyone has any objections, please let me know.
> 
> --
> 
> From: Stephen Hemminger 
> 
> [ Upstream commit 00d7ddba1143623b31bc2c15d18216e2da031b14 ]
> 
> Matching network device based on MAC address is problematic since a non VF
> network device can be created with a duplicate MAC address causing confusion
> and problems.  The VMBus API does provide a serial number that is a better
> matching method.
> 
> Signed-off-by: Stephen Hemminger 
> Signed-off-by: David S. Miller 
> Signed-off-by: Sasha Levin 
> Signed-off-by: Greg Kroah-Hartman 
> ---
>  drivers/net/hyperv/netvsc.c |3 ++
>  drivers/net/hyperv/netvsc_drv.c |   58 
> ++-
> -
>  2 files changed, 36 insertions(+), 25 deletions(-)
> 
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -1203,6 +1203,9 @@ static void netvsc_send_vf(struct net_de
> 
>   net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated;
>   net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial;
> + netdev_info(ndev, "VF slot %u %s\n",
> + net_device_ctx->vf_serial,
> + net_device_ctx->vf_alloc ? "added" : "removed");
>  }
> 
>  static  void netvsc_receive_inband(struct net_device *ndev,
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -1794,20 +1794,6 @@ out_unlock:
>   rtnl_unlock();
>  }
> 
> -static struct net_device *get_netvsc_bymac(const u8 *mac) -{
> - struct net_device_context *ndev_ctx;
> -
> - list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
> - struct net_device *dev = hv_get_drvdata(ndev_ctx-
> >device_ctx);
> -
> - if (ether_addr_equal(mac, dev->perm_addr))
> - return dev;
> - }
> -
> - return NULL;
> -}
> -
>  static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)  {
>   struct net_device_context *net_device_ctx; @@ -1936,26 +1922,48
> @@ static void netvsc_vf_setup(struct work_
>   rtnl_unlock();
>  }
> 
> +/* Find netvsc by VMBus serial number.
> + * The PCI hyperv controller records the serial number as the slot.
> + */
> +static struct net_device *get_netvsc_byslot(const struct net_device
> +*vf_netdev) {
> + struct device *parent = vf_netdev->dev.parent;
> + struct net_device_context *ndev_ctx;
> + struct pci_dev *pdev;
> +
> + if (!parent || !dev_is_pci(parent))
> + return NULL; /* not a PCI device */
> +
> + pdev = to_pci_dev(parent);
> + if (!pdev->slot) {
> + netdev_notice(vf_netdev, "no PCI slot information\n");
> + return NULL;
> + }
> +
> + list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
> + if (!ndev_ctx->vf_alloc)
> + continue;
> +
> + if (ndev_ctx->vf_serial == pdev->slot->number)
> + return hv_get_drvdata(ndev_ctx->device_ctx);

Greg,

I have a fix for this patch. If it is not already queued, could you include the
following fix together with it?
hv_netvsc: fix vf serial matching with pci slot info
https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=005479556197f80139771960dda0dfdcd2d2aad5

Thanks,
- Haiyang



RE: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial number

2018-10-16 Thread Haiyang Zhang


> -Original Message-
> From: linux-kernel-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Greg Kroah-Hartman
> Sent: Tuesday, October 16, 2018 1:06 PM
> To: linux-kernel@vger.kernel.org
> Cc: Greg Kroah-Hartman ;
> sta...@vger.kernel.org; Stephen Hemminger ;
> David S. Miller ; Sasha Levin
> 
> Subject: [PATCH 4.18 101/135] hv_netvsc: pair VF based on serial number
> 
> 4.18-stable review patch.  If anyone has any objections, please let me know.
> 
> --
> 
> From: Stephen Hemminger 
> 
> [ Upstream commit 00d7ddba1143623b31bc2c15d18216e2da031b14 ]
> 
> Matching network device based on MAC address is problematic since a non VF
> network device can be created with a duplicate MAC address causing confusion
> and problems.  The VMBus API does provide a serial number that is a better
> matching method.
> 
> Signed-off-by: Stephen Hemminger 
> Signed-off-by: David S. Miller 
> Signed-off-by: Sasha Levin 
> Signed-off-by: Greg Kroah-Hartman 
> ---
>  drivers/net/hyperv/netvsc.c |3 ++
>  drivers/net/hyperv/netvsc_drv.c |   58 
> ++-
> -
>  2 files changed, 36 insertions(+), 25 deletions(-)
> 
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -1203,6 +1203,9 @@ static void netvsc_send_vf(struct net_de
> 
>   net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated;
>   net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial;
> + netdev_info(ndev, "VF slot %u %s\n",
> + net_device_ctx->vf_serial,
> + net_device_ctx->vf_alloc ? "added" : "removed");
>  }
> 
>  static  void netvsc_receive_inband(struct net_device *ndev,
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -1794,20 +1794,6 @@ out_unlock:
>   rtnl_unlock();
>  }
> 
> -static struct net_device *get_netvsc_bymac(const u8 *mac) -{
> - struct net_device_context *ndev_ctx;
> -
> - list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
> - struct net_device *dev = hv_get_drvdata(ndev_ctx-
> >device_ctx);
> -
> - if (ether_addr_equal(mac, dev->perm_addr))
> - return dev;
> - }
> -
> - return NULL;
> -}
> -
>  static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)  {
>   struct net_device_context *net_device_ctx; @@ -1936,26 +1922,48
> @@ static void netvsc_vf_setup(struct work_
>   rtnl_unlock();
>  }
> 
> +/* Find netvsc by VMBus serial number.
> + * The PCI hyperv controller records the serial number as the slot.
> + */
> +static struct net_device *get_netvsc_byslot(const struct net_device
> +*vf_netdev) {
> + struct device *parent = vf_netdev->dev.parent;
> + struct net_device_context *ndev_ctx;
> + struct pci_dev *pdev;
> +
> + if (!parent || !dev_is_pci(parent))
> + return NULL; /* not a PCI device */
> +
> + pdev = to_pci_dev(parent);
> + if (!pdev->slot) {
> + netdev_notice(vf_netdev, "no PCI slot information\n");
> + return NULL;
> + }
> +
> + list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
> + if (!ndev_ctx->vf_alloc)
> + continue;
> +
> + if (ndev_ctx->vf_serial == pdev->slot->number)
> + return hv_get_drvdata(ndev_ctx->device_ctx);

Greg,

I have a fix for this patch. If it is not already queued, could you include the
following fix together with it?
hv_netvsc: fix vf serial matching with pci slot info
https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=005479556197f80139771960dda0dfdcd2d2aad5

Thanks,
- Haiyang



RE: [PATCH net-next, v2] hv_netvsc: fix vf serial matching with pci slot info

2018-10-12 Thread Haiyang Zhang



> -Original Message-
> From: Stephen Hemminger 
> Sent: Friday, October 12, 2018 6:21 PM
> To: Haiyang Zhang 
> Cc: Haiyang Zhang ; da...@davemloft.net;
> net...@vger.kernel.org; o...@aepfle.de; linux-kernel@vger.kernel.org;
> de...@linuxdriverproject.org; vkuznets 
> Subject: Re: [PATCH net-next, v2] hv_netvsc: fix vf serial matching with pci 
> slot
> info
> 
> On Fri, 12 Oct 2018 20:55:15 +
> Haiyang Zhang  wrote:
> 
> Thanks for fixing this.
> 
> 
> > +   if (kstrtou32(kobject_name(&pdev->slot->kobj), 10, &serial)) {
> > +   netdev_notice(vf_netdev, "Invalid vf serial:%s\n",
> > + pdev->slot->kobj.name);
> > +   return NULL;
> > +   }
> 
> Shouldn't this use kobject_name() in the message as well.
> 
> Looking at the pci.h code there is already an API to get name from slot (it 
> uses
> kobject_name()). So please use that one.

Sure, I will look for that api. Thanks.



RE: [PATCH net-next, v2] hv_netvsc: fix vf serial matching with pci slot info

2018-10-12 Thread Haiyang Zhang



> -Original Message-
> From: Stephen Hemminger 
> Sent: Friday, October 12, 2018 6:21 PM
> To: Haiyang Zhang 
> Cc: Haiyang Zhang ; da...@davemloft.net;
> net...@vger.kernel.org; o...@aepfle.de; linux-kernel@vger.kernel.org;
> de...@linuxdriverproject.org; vkuznets 
> Subject: Re: [PATCH net-next, v2] hv_netvsc: fix vf serial matching with pci 
> slot
> info
> 
> On Fri, 12 Oct 2018 20:55:15 +
> Haiyang Zhang  wrote:
> 
> Thanks for fixing this.
> 
> 
> > +   if (kstrtou32(kobject_name(&pdev->slot->kobj), 10, &serial)) {
> > +   netdev_notice(vf_netdev, "Invalid vf serial:%s\n",
> > + pdev->slot->kobj.name);
> > +   return NULL;
> > +   }
> 
> Shouldn't this use kobject_name() in the message as well.
> 
> Looking at the pci.h code there is already an API to get name from slot (it 
> uses
> kobject_name()). So please use that one.

Sure, I will look for that api. Thanks.



[PATCH net,v2] hv_netvsc: Fix napi reschedule while receive completion is busy

2018-07-17 Thread Haiyang Zhang
From: Haiyang Zhang 

If out ring is full temporarily and receive completion cannot go out,
we may still need to reschedule napi if certain conditions are met.
Otherwise the napi poll might be stopped forever, and cause network
disconnect.

Fixes: 7426b1a51803 ("netvsc: optimize receive completions")
Signed-off-by: Stephen Hemminger 
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc.c | 17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 8e9d0ee1572b..31c3d77b4733 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1274,6 +1274,7 @@ int netvsc_poll(struct napi_struct *napi, int budget)
struct hv_device *device = netvsc_channel_to_device(channel);
struct net_device *ndev = hv_get_drvdata(device);
int work_done = 0;
+   int ret;
 
/* If starting a new interval */
if (!nvchan->desc)
@@ -1285,16 +1286,18 @@ int netvsc_poll(struct napi_struct *napi, int budget)
nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
}
 
-   /* If send of pending receive completions suceeded
-*   and did not exhaust NAPI budget this time
-*   and not doing busy poll
+   /* Send any pending receive completions */
+   ret = send_recv_completions(ndev, net_device, nvchan);
+
+   /* If it did not exhaust NAPI budget this time
+*  and not doing busy poll
 * then re-enable host interrupts
-* and reschedule if ring is not empty.
+*  and reschedule if ring is not empty
+*   or sending receive completion failed.
 */
-   if (send_recv_completions(ndev, net_device, nvchan) == 0 &&
-   work_done < budget &&
+   if (work_done < budget &&
napi_complete_done(napi, work_done) &&
-   hv_end_read(&channel->inbound) &&
+   (ret || hv_end_read(&channel->inbound)) &&
napi_schedule_prep(napi)) {
hv_begin_read(&channel->inbound);
__napi_schedule(napi);
-- 
2.17.1



[PATCH net,v2] hv_netvsc: Fix napi reschedule while receive completion is busy

2018-07-17 Thread Haiyang Zhang
From: Haiyang Zhang 

If out ring is full temporarily and receive completion cannot go out,
we may still need to reschedule napi if certain conditions are met.
Otherwise the napi poll might be stopped forever, and cause network
disconnect.

Fixes: 7426b1a51803 ("netvsc: optimize receive completions")
Signed-off-by: Stephen Hemminger 
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc.c | 17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 8e9d0ee1572b..31c3d77b4733 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1274,6 +1274,7 @@ int netvsc_poll(struct napi_struct *napi, int budget)
struct hv_device *device = netvsc_channel_to_device(channel);
struct net_device *ndev = hv_get_drvdata(device);
int work_done = 0;
+   int ret;
 
/* If starting a new interval */
if (!nvchan->desc)
@@ -1285,16 +1286,18 @@ int netvsc_poll(struct napi_struct *napi, int budget)
nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
}
 
-   /* If send of pending receive completions suceeded
-*   and did not exhaust NAPI budget this time
-*   and not doing busy poll
+   /* Send any pending receive completions */
+   ret = send_recv_completions(ndev, net_device, nvchan);
+
+   /* If it did not exhaust NAPI budget this time
+*  and not doing busy poll
 * then re-enable host interrupts
-* and reschedule if ring is not empty.
+*  and reschedule if ring is not empty
+*   or sending receive completion failed.
 */
-   if (send_recv_completions(ndev, net_device, nvchan) == 0 &&
-   work_done < budget &&
+   if (work_done < budget &&
napi_complete_done(napi, work_done) &&
-   hv_end_read(&channel->inbound) &&
+   (ret || hv_end_read(&channel->inbound)) &&
napi_schedule_prep(napi)) {
hv_begin_read(&channel->inbound);
__napi_schedule(napi);
-- 
2.17.1



RE: [PATCH net] hv_netvsc: Fix napi reschedule while receive completion is busy

2018-07-09 Thread Haiyang Zhang



> -Original Message-
> From: Stephen Hemminger 
> Sent: Monday, July 9, 2018 2:15 PM
> To: Haiyang Zhang 
> Cc: Haiyang Zhang ; da...@davemloft.net;
> net...@vger.kernel.org; o...@aepfle.de; Stephen Hemminger
> ; linux-kernel@vger.kernel.org;
> de...@linuxdriverproject.org; vkuzn...@redhat.com
> Subject: Re: [PATCH net] hv_netvsc: Fix napi reschedule while receive
> completion is busy
> 
> On Mon,  9 Jul 2018 16:43:19 +
> Haiyang Zhang  wrote:
> 
> > From: Haiyang Zhang 
> >
> > If out ring is full temporarily and receive completion cannot go out,
> > we may still need to reschedule napi if other conditions are met.
> > Otherwise the napi poll might be stopped forever, and cause network
> > disconnect.
> >
> > Fixes: 7426b1a51803 ("netvsc: optimize receive completions")
> > Signed-off-by: Haiyang Zhang 
> > ---
> >  drivers/net/hyperv/netvsc.c | 8 
> >  1 file changed, 4 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> > index 8e9d0ee1572b..caaf5054f446 100644
> > --- a/drivers/net/hyperv/netvsc.c
> > +++ b/drivers/net/hyperv/netvsc.c
> > @@ -1285,14 +1285,14 @@ int netvsc_poll(struct napi_struct *napi, int
> budget)
> > nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
> > }
> >
> > -   /* If send of pending receive completions suceeded
> > -*   and did not exhaust NAPI budget this time
> > +   send_recv_completions(ndev, net_device, nvchan);
> > +
> > +   /* If it did not exhaust NAPI budget this time
> >  *   and not doing busy poll
> >  * then re-enable host interrupts
> >  * and reschedule if ring is not empty.
> >  */
> > -   if (send_recv_completions(ndev, net_device, nvchan) == 0 &&
> > -   work_done < budget &&
> > +   if (work_done < budget &&
> > napi_complete_done(napi, work_done) &&
> > > hv_end_read(&channel->inbound) &&
> > napi_schedule_prep(napi)) {
> 
> This patch doesn't look right. I think the existing code works as written.
> 
> If send_receive_completions is unable to send because ring is full then
> vmbus_sendpacket will return -EBUSY which gets returned from
> send_receive_completions.  Because the return is non-zero, the driver will not
> call napi_complete_done.
> Since napi_complete_done was not called, NAPI will reschedule the napi poll
> routine.

With the existing code, we found in testing that the rx_comp_busy counter
increased and one of the in-ring masks was 1, but the guest was not reading it.
With this patch, the pending receive completion stays in the buffer (no loss)
and is sent next time.
It solves the disconnection problem seen with a high number of connections.

If napi_complete_done() is not called, the upper layer should guarantee
napi_schedule. So it seems the upper NAPI code may have a bug -- the automatic
rescheduling did not happen in this case. I will check it further.

Thanks,
- Haiyang



RE: [PATCH net] hv_netvsc: Fix napi reschedule while receive completion is busy

2018-07-09 Thread Haiyang Zhang



> -Original Message-
> From: Stephen Hemminger 
> Sent: Monday, July 9, 2018 2:15 PM
> To: Haiyang Zhang 
> Cc: Haiyang Zhang ; da...@davemloft.net;
> net...@vger.kernel.org; o...@aepfle.de; Stephen Hemminger
> ; linux-kernel@vger.kernel.org;
> de...@linuxdriverproject.org; vkuzn...@redhat.com
> Subject: Re: [PATCH net] hv_netvsc: Fix napi reschedule while receive
> completion is busy
> 
> On Mon,  9 Jul 2018 16:43:19 +
> Haiyang Zhang  wrote:
> 
> > From: Haiyang Zhang 
> >
> > If out ring is full temporarily and receive completion cannot go out,
> > we may still need to reschedule napi if other conditions are met.
> > Otherwise the napi poll might be stopped forever, and cause network
> > disconnect.
> >
> > Fixes: 7426b1a51803 ("netvsc: optimize receive completions")
> > Signed-off-by: Haiyang Zhang 
> > ---
> >  drivers/net/hyperv/netvsc.c | 8 
> >  1 file changed, 4 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> > index 8e9d0ee1572b..caaf5054f446 100644
> > --- a/drivers/net/hyperv/netvsc.c
> > +++ b/drivers/net/hyperv/netvsc.c
> > @@ -1285,14 +1285,14 @@ int netvsc_poll(struct napi_struct *napi, int
> budget)
> > nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
> > }
> >
> > -   /* If send of pending receive completions suceeded
> > -*   and did not exhaust NAPI budget this time
> > +   send_recv_completions(ndev, net_device, nvchan);
> > +
> > +   /* If it did not exhaust NAPI budget this time
> >  *   and not doing busy poll
> >  * then re-enable host interrupts
> >  * and reschedule if ring is not empty.
> >  */
> > -   if (send_recv_completions(ndev, net_device, nvchan) == 0 &&
> > -   work_done < budget &&
> > +   if (work_done < budget &&
> > napi_complete_done(napi, work_done) &&
> > > hv_end_read(&channel->inbound) &&
> > napi_schedule_prep(napi)) {
> 
> This patch doesn't look right. I think the existing code works as written.
> 
> If send_receive_completions is unable to send because ring is full then
> vmbus_sendpacket will return -EBUSY which gets returned from
> send_receive_completions.  Because the return is non-zero, the driver will not
> call napi_complete_done.
> Since napi_complete_done was not called, NAPI will reschedule the napi poll
> routine.

With the existing code, we found in testing that the rx_comp_busy counter
increased and one of the in-ring masks was 1, but the guest was not reading it.
With this patch, the pending receive completion stays in the buffer (no loss)
and is sent next time.
It solves the disconnection problem seen with a high number of connections.

If napi_complete_done() is not called, the upper layer should guarantee
napi_schedule. So it seems the upper NAPI code may have a bug -- the automatic
rescheduling did not happen in this case. I will check it further.

Thanks,
- Haiyang



[PATCH net] hv_netvsc: Fix napi reschedule while receive completion is busy

2018-07-09 Thread Haiyang Zhang
From: Haiyang Zhang 

If out ring is full temporarily and receive completion cannot go out,
we may still need to reschedule napi if other conditions are met.
Otherwise the napi poll might be stopped forever, and cause network
disconnect.

Fixes: 7426b1a51803 ("netvsc: optimize receive completions")
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 8e9d0ee1572b..caaf5054f446 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1285,14 +1285,14 @@ int netvsc_poll(struct napi_struct *napi, int budget)
nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
}
 
-   /* If send of pending receive completions suceeded
-*   and did not exhaust NAPI budget this time
+   send_recv_completions(ndev, net_device, nvchan);
+
+   /* If it did not exhaust NAPI budget this time
 *   and not doing busy poll
 * then re-enable host interrupts
 * and reschedule if ring is not empty.
 */
-   if (send_recv_completions(ndev, net_device, nvchan) == 0 &&
-   work_done < budget &&
+   if (work_done < budget &&
napi_complete_done(napi, work_done) &&
hv_end_read(&channel->inbound) &&
napi_schedule_prep(napi)) {
-- 
2.17.1



[PATCH net] hv_netvsc: Fix napi reschedule while receive completion is busy

2018-07-09 Thread Haiyang Zhang
From: Haiyang Zhang 

If out ring is full temporarily and receive completion cannot go out,
we may still need to reschedule napi if other conditions are met.
Otherwise the napi poll might be stopped forever, and cause network
disconnect.

Fixes: 7426b1a51803 ("netvsc: optimize receive completions")
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 8e9d0ee1572b..caaf5054f446 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1285,14 +1285,14 @@ int netvsc_poll(struct napi_struct *napi, int budget)
nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
}
 
-   /* If send of pending receive completions suceeded
-*   and did not exhaust NAPI budget this time
+   send_recv_completions(ndev, net_device, nvchan);
+
+   /* If it did not exhaust NAPI budget this time
 *   and not doing busy poll
 * then re-enable host interrupts
 * and reschedule if ring is not empty.
 */
-   if (send_recv_completions(ndev, net_device, nvchan) == 0 &&
-   work_done < budget &&
+   if (work_done < budget &&
napi_complete_done(napi, work_done) &&
hv_end_read(&channel->inbound) &&
napi_schedule_prep(napi)) {
-- 
2.17.1



RE: [PATCH] PCI: hv: Fix a __local_bh_enable_ip warning in hv_compose_msi_msg()

2018-05-25 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, May 22, 2018 8:18 PM
> To: Lorenzo Pieralisi <lorenzo.pieral...@arm.com>; Bjorn Helgaas
> <bhelg...@google.com>; linux-...@vger.kernel.org; KY Srinivasan
> <k...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang <haiya...@microsoft.com>; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com
> Subject: [PATCH] PCI: hv: Fix a __local_bh_enable_ip warning in
> hv_compose_msi_msg()
> 
> 
> Commit de0aa7b2f97d ("PCI: hv: Fix 2 hang issues in hv_compose_msi_msg()")
> uses local_bh_disable()/enable(), because hv_pci_onchannelcallback() can also
> run in tasklet context as the channel event callback.
> 
> With CONFIG_PROVE_LOCKING=y in the latest mainline, or old kernels that
> don't have commit f71b74bca637 ("irq/softirqs: Use lockdep to assert IRQs are
> disabled/enabled"), it turns out we can trigger a warning at the beginning of
> __local_bh_enable_ip(), because the upper layer irq code can call
> hv_compose_msi_msg() with local irqs disabled.
> 
> Let's fix the warning by switching to local_irq_save()/restore(). This is not 
> an
> issue because hv_pci_onchannelcallback() is not slow, and it is not a hot path.
> 
> Fixes: de0aa7b2f97d ("PCI: hv: Fix 2 hang issues in hv_compose_msi_msg()")
> Signed-off-by: Dexuan Cui <de...@microsoft.com>
> Cc: <sta...@vger.kernel.org>
> Cc: Stephen Hemminger <sthem...@microsoft.com>
> Cc: K. Y. Srinivasan <k...@microsoft.com>
> ---

Reviewed-by: Haiyang Zhang <haiya...@microsoft.com>

Thank you.



RE: [PATCH] PCI: hv: Fix a __local_bh_enable_ip warning in hv_compose_msi_msg()

2018-05-25 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, May 22, 2018 8:18 PM
> To: Lorenzo Pieralisi ; Bjorn Helgaas
> ; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang ; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com
> Subject: [PATCH] PCI: hv: Fix a __local_bh_enable_ip warning in
> hv_compose_msi_msg()
> 
> 
> Commit de0aa7b2f97d ("PCI: hv: Fix 2 hang issues in hv_compose_msi_msg()")
> uses local_bh_disable()/enable(), because hv_pci_onchannelcallback() can also
> run in tasklet context as the channel event callback.
> 
> With CONFIG_PROVE_LOCKING=y in the latest mainline, or old kernels that
> don't have commit f71b74bca637 ("irq/softirqs: Use lockdep to assert IRQs are
> disabled/enabled"), it turns out we can trigger a warning at the beginning of
> __local_bh_enable_ip(), because the upper layer irq code can call
> hv_compose_msi_msg() with local irqs disabled.
> 
> Let's fix the warning by switching to local_irq_save()/restore(). This is not 
> an
> issue because hv_pci_onchannelcallback() is not slow, and it is not a hot path.
> 
> Fixes: de0aa7b2f97d ("PCI: hv: Fix 2 hang issues in hv_compose_msi_msg()")
> Signed-off-by: Dexuan Cui 
> Cc: 
> Cc: Stephen Hemminger 
> Cc: K. Y. Srinivasan 
> ---

Reviewed-by: Haiyang Zhang 

Thank you.



RE: [PATCH] PCI: hv: Do not wait forever on a device that has disappeared

2018-05-25 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Wednesday, May 23, 2018 5:12 PM
> To: 'Lorenzo Pieralisi' <lorenzo.pieral...@arm.com>; 'Bjorn Helgaas'
> <bhelg...@google.com>; 'linux-...@vger.kernel.org'  p...@vger.kernel.org>; KY Srinivasan <k...@microsoft.com>; Stephen
> Hemminger <sthem...@microsoft.com>; 'o...@aepfle.de' <o...@aepfle.de>;
> 'a...@canonical.com' <a...@canonical.com>; 'jasow...@redhat.com'
> <jasow...@redhat.com>
> Cc: 'linux-kernel@vger.kernel.org' <linux-kernel@vger.kernel.org>; 'driverdev-
> de...@linuxdriverproject.org' <driverdev-de...@linuxdriverproject.org>;
> Haiyang Zhang <haiya...@microsoft.com>; 'vkuzn...@redhat.com'
> <vkuzn...@redhat.com>; 'marcelo.ce...@canonical.com'
> <marcelo.ce...@canonical.com>
> Subject: [PATCH] PCI: hv: Do not wait forever on a device that has disappeared
> 
> 
> Before the guest finishes the device initialization, the device can be removed
> anytime by the host, and after that the host won't respond to the guest's
> request, so the guest should be prepared to handle this case.
> 
> Signed-off-by: Dexuan Cui <de...@microsoft.com>
> Cc: Stephen Hemminger <sthem...@microsoft.com>
> Cc: K. Y. Srinivasan <k...@microsoft.com>
> ---
>  drivers/pci/host/pci-hyperv.c | 46 ---
> 

Reviewed-by: Haiyang Zhang <haiya...@microsoft.com>

Thank you!



RE: [PATCH] PCI: hv: Do not wait forever on a device that has disappeared

2018-05-25 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Wednesday, May 23, 2018 5:12 PM
> To: 'Lorenzo Pieralisi' ; 'Bjorn Helgaas'
> ; 'linux-...@vger.kernel.org'  p...@vger.kernel.org>; KY Srinivasan ; Stephen
> Hemminger ; 'o...@aepfle.de' ;
> 'a...@canonical.com' ; 'jasow...@redhat.com'
> 
> Cc: 'linux-kernel@vger.kernel.org' ; 'driverdev-
> de...@linuxdriverproject.org' ;
> Haiyang Zhang ; 'vkuzn...@redhat.com'
> ; 'marcelo.ce...@canonical.com'
> 
> Subject: [PATCH] PCI: hv: Do not wait forever on a device that has disappeared
> 
> 
> Before the guest finishes the device initialization, the device can be removed
> anytime by the host, and after that the host won't respond to the guest's
> request, so the guest should be prepared to handle this case.
> 
> Signed-off-by: Dexuan Cui 
> Cc: Stephen Hemminger 
> Cc: K. Y. Srinivasan 
> ---
>  drivers/pci/host/pci-hyperv.c | 46 ---
> 

Reviewed-by: Haiyang Zhang 

Thank you!



[PATCH net-next] hv_netvsc: Add handlers for ethtool get/set msg level

2018-05-22 Thread Haiyang Zhang
From: Haiyang Zhang <haiya...@microsoft.com>

The handlers for ethtool get/set msg level are missing from netvsc.
This patch adds them.

Signed-off-by: Haiyang Zhang <haiya...@microsoft.com>
---
 drivers/net/hyperv/netvsc_drv.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index da07ccdf84bf..60a5769ef5a1 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1618,8 +1618,24 @@ static int netvsc_set_ringparam(struct net_device *ndev,
return ret;
 }
 
+static u32 netvsc_get_msglevel(struct net_device *ndev)
+{
+   struct net_device_context *ndev_ctx = netdev_priv(ndev);
+
+   return ndev_ctx->msg_enable;
+}
+
+static void netvsc_set_msglevel(struct net_device *ndev, u32 val)
+{
+   struct net_device_context *ndev_ctx = netdev_priv(ndev);
+
+   ndev_ctx->msg_enable = val;
+}
+
 static const struct ethtool_ops ethtool_ops = {
.get_drvinfo= netvsc_get_drvinfo,
+   .get_msglevel   = netvsc_get_msglevel,
+   .set_msglevel   = netvsc_set_msglevel,
.get_link   = ethtool_op_get_link,
.get_ethtool_stats = netvsc_get_ethtool_stats,
.get_sset_count = netvsc_get_sset_count,
-- 
2.17.0



[PATCH net-next] hv_netvsc: Add handlers for ethtool get/set msg level

2018-05-22 Thread Haiyang Zhang
From: Haiyang Zhang 

The handlers for ethtool get/set msg level are missing from netvsc.
This patch adds them.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index da07ccdf84bf..60a5769ef5a1 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1618,8 +1618,24 @@ static int netvsc_set_ringparam(struct net_device *ndev,
return ret;
 }
 
+static u32 netvsc_get_msglevel(struct net_device *ndev)
+{
+   struct net_device_context *ndev_ctx = netdev_priv(ndev);
+
+   return ndev_ctx->msg_enable;
+}
+
+static void netvsc_set_msglevel(struct net_device *ndev, u32 val)
+{
+   struct net_device_context *ndev_ctx = netdev_priv(ndev);
+
+   ndev_ctx->msg_enable = val;
+}
+
 static const struct ethtool_ops ethtool_ops = {
.get_drvinfo= netvsc_get_drvinfo,
+   .get_msglevel   = netvsc_get_msglevel,
+   .set_msglevel   = netvsc_set_msglevel,
.get_link   = ethtool_op_get_link,
.get_ethtool_stats = netvsc_get_ethtool_stats,
.get_sset_count = netvsc_get_sset_count,
-- 
2.17.0



[PATCH net-next] hv_netvsc: Add NetVSP v6 and v6.1 into version negotiation

2018-04-17 Thread Haiyang Zhang
From: Haiyang Zhang <haiya...@microsoft.com>

This patch adds the NetVSP v6 and 6.1 message structures, and includes
these versions into NetVSC/NetVSP version negotiation process.

Signed-off-by: Haiyang Zhang <haiya...@microsoft.com>
---
 drivers/net/hyperv/hyperv_net.h | 164 
 drivers/net/hyperv/netvsc.c |   3 +-
 2 files changed, 166 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 960f06141472..6ebe39a3dde6 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -237,6 +237,8 @@ void netvsc_switch_datapath(struct net_device *nv_dev, bool 
vf);
 #define NVSP_PROTOCOL_VERSION_2    0x30002
 #define NVSP_PROTOCOL_VERSION_4    0x40000
 #define NVSP_PROTOCOL_VERSION_5    0x50000
+#define NVSP_PROTOCOL_VERSION_6    0x60000
+#define NVSP_PROTOCOL_VERSION_61   0x60001
 
 enum {
NVSP_MSG_TYPE_NONE = 0,
@@ -308,6 +310,12 @@ enum {
NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE,
 
NVSP_MSG5_MAX = NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE,
+
+   /* Version 6 messages */
+   NVSP_MSG6_TYPE_PD_API,
+   NVSP_MSG6_TYPE_PD_POST_BATCH,
+
+   NVSP_MSG6_MAX = NVSP_MSG6_TYPE_PD_POST_BATCH
 };
 
 enum {
@@ -619,12 +627,168 @@ union nvsp_5_message_uber {
struct nvsp_5_send_indirect_table send_table;
 } __packed;
 
+enum nvsp_6_pd_api_op {
+   PD_API_OP_CONFIG = 1,
+   PD_API_OP_SW_DATAPATH, /* Switch Datapath */
+   PD_API_OP_OPEN_PROVIDER,
+   PD_API_OP_CLOSE_PROVIDER,
+   PD_API_OP_CREATE_QUEUE,
+   PD_API_OP_FLUSH_QUEUE,
+   PD_API_OP_FREE_QUEUE,
+   PD_API_OP_ALLOC_COM_BUF, /* Allocate Common Buffer */
+   PD_API_OP_FREE_COM_BUF, /* Free Common Buffer */
+   PD_API_OP_MAX
+};
+
+struct grp_affinity {
+   u64 mask;
+   u16 grp;
+   u16 reserved[3];
+} __packed;
+
+struct nvsp_6_pd_api_req {
+   u32 op;
+
+   union {
+   /* MMIO information is sent from the VM to VSP */
+   struct __packed {
+   u64 mmio_pa; /* MMIO Physical Address */
+   u32 mmio_len;
+
+   /* Number of PD queues a VM can support */
+   u16 num_subchn;
+   } config;
+
+   /* Switch Datapath */
+   struct __packed {
+   /* Host Datapath Is PacketDirect */
+   u8 host_dpath_is_pd;
+
+   /* Guest PacketDirect Is Enabled */
+   u8 guest_pd_enabled;
+   } sw_dpath;
+
+   /* Open Provider*/
+   struct __packed {
+   u32 prov_id; /* Provider id */
+   u32 flag;
+   } open_prov;
+
+   /* Close Provider */
+   struct __packed {
+   u32 prov_id;
+   } cls_prov;
+
+   /* Create Queue*/
+   struct __packed {
+   u32 prov_id;
+   u16 q_id;
+   u16 q_size;
+   u8 is_recv_q;
+   u8 is_rss_q;
+   u32 recv_data_len;
+   struct grp_affinity affy;
+   } cr_q;
+
+   /* Delete Queue*/
+   struct __packed {
+   u32 prov_id;
+   u16 q_id;
+   } del_q;
+
+   /* Flush Queue */
+   struct __packed {
+   u32 prov_id;
+   u16 q_id;
+   } flush_q;
+
+   /* Allocate Common Buffer */
+   struct __packed {
+   u32 len;
+   u32 pf_node; /* Preferred Node */
+   u16 region_id;
+   } alloc_com_buf;
+
+   /* Free Common Buffer */
+   struct __packed {
+   u32 len;
+   u64 pa; /* Physical Address */
+   u32 pf_node; /* Preferred Node */
+   u16 region_id;
+   u8 cache_type;
+   } free_com_buf;
+   } __packed;
+} __packed;
+
+struct nvsp_6_pd_api_comp {
+   u32 op;
+   u32 status;
+
+   union {
+   struct __packed {
+   /* actual number of PD queues allocated to the VM */
+   u16 num_pd_q;
+
+   /* Num Receive Rss PD Queues */
+   u8 num_rss_q;
+
+   u8 is_supported; /* Is supported by VSP */
+   u8 is_enabled; /* Is enabled by VSP */
+   } config;
+
+   /* Open Provider */
+   struct __packed {
+   u32 prov_id;
+   } open_prov;
+
+   /* Create Queue */
+ 

[PATCH net-next] hv_netvsc: Add NetVSP v6 and v6.1 into version negotiation

2018-04-17 Thread Haiyang Zhang
From: Haiyang Zhang 

This patch adds the NetVSP v6 and 6.1 message structures, and includes
these versions into NetVSC/NetVSP version negotiation process.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/hyperv_net.h | 164 
 drivers/net/hyperv/netvsc.c |   3 +-
 2 files changed, 166 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 960f06141472..6ebe39a3dde6 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -237,6 +237,8 @@ void netvsc_switch_datapath(struct net_device *nv_dev, bool 
vf);
 #define NVSP_PROTOCOL_VERSION_2   0x30002
 #define NVSP_PROTOCOL_VERSION_4   0x40000
 #define NVSP_PROTOCOL_VERSION_5   0x50000
+#define NVSP_PROTOCOL_VERSION_6   0x60000
+#define NVSP_PROTOCOL_VERSION_61   0x60001
 
 enum {
NVSP_MSG_TYPE_NONE = 0,
@@ -308,6 +310,12 @@ enum {
NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE,
 
NVSP_MSG5_MAX = NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE,
+
+   /* Version 6 messages */
+   NVSP_MSG6_TYPE_PD_API,
+   NVSP_MSG6_TYPE_PD_POST_BATCH,
+
+   NVSP_MSG6_MAX = NVSP_MSG6_TYPE_PD_POST_BATCH
 };
 
 enum {
@@ -619,12 +627,168 @@ union nvsp_5_message_uber {
struct nvsp_5_send_indirect_table send_table;
 } __packed;
 
+enum nvsp_6_pd_api_op {
+   PD_API_OP_CONFIG = 1,
+   PD_API_OP_SW_DATAPATH, /* Switch Datapath */
+   PD_API_OP_OPEN_PROVIDER,
+   PD_API_OP_CLOSE_PROVIDER,
+   PD_API_OP_CREATE_QUEUE,
+   PD_API_OP_FLUSH_QUEUE,
+   PD_API_OP_FREE_QUEUE,
+   PD_API_OP_ALLOC_COM_BUF, /* Allocate Common Buffer */
+   PD_API_OP_FREE_COM_BUF, /* Free Common Buffer */
+   PD_API_OP_MAX
+};
+
+struct grp_affinity {
+   u64 mask;
+   u16 grp;
+   u16 reserved[3];
+} __packed;
+
+struct nvsp_6_pd_api_req {
+   u32 op;
+
+   union {
+   /* MMIO information is sent from the VM to VSP */
+   struct __packed {
+   u64 mmio_pa; /* MMIO Physical Address */
+   u32 mmio_len;
+
+   /* Number of PD queues a VM can support */
+   u16 num_subchn;
+   } config;
+
+   /* Switch Datapath */
+   struct __packed {
+   /* Host Datapath Is PacketDirect */
+   u8 host_dpath_is_pd;
+
+   /* Guest PacketDirect Is Enabled */
+   u8 guest_pd_enabled;
+   } sw_dpath;
+
+   /* Open Provider*/
+   struct __packed {
+   u32 prov_id; /* Provider id */
+   u32 flag;
+   } open_prov;
+
+   /* Close Provider */
+   struct __packed {
+   u32 prov_id;
+   } cls_prov;
+
+   /* Create Queue*/
+   struct __packed {
+   u32 prov_id;
+   u16 q_id;
+   u16 q_size;
+   u8 is_recv_q;
+   u8 is_rss_q;
+   u32 recv_data_len;
+   struct grp_affinity affy;
+   } cr_q;
+
+   /* Delete Queue*/
+   struct __packed {
+   u32 prov_id;
+   u16 q_id;
+   } del_q;
+
+   /* Flush Queue */
+   struct __packed {
+   u32 prov_id;
+   u16 q_id;
+   } flush_q;
+
+   /* Allocate Common Buffer */
+   struct __packed {
+   u32 len;
+   u32 pf_node; /* Preferred Node */
+   u16 region_id;
+   } alloc_com_buf;
+
+   /* Free Common Buffer */
+   struct __packed {
+   u32 len;
+   u64 pa; /* Physical Address */
+   u32 pf_node; /* Preferred Node */
+   u16 region_id;
+   u8 cache_type;
+   } free_com_buf;
+   } __packed;
+} __packed;
+
+struct nvsp_6_pd_api_comp {
+   u32 op;
+   u32 status;
+
+   union {
+   struct __packed {
+   /* actual number of PD queues allocated to the VM */
+   u16 num_pd_q;
+
+   /* Num Receive Rss PD Queues */
+   u8 num_rss_q;
+
+   u8 is_supported; /* Is supported by VSP */
+   u8 is_enabled; /* Is enabled by VSP */
+   } config;
+
+   /* Open Provider */
+   struct __packed {
+   u32 prov_id;
+   } open_prov;
+
+   /* Create Queue */
+   struct __packed {
+   u32 prov_id

RE: [PATCH v7] Revert "PCI: hv: Use device serial number as PCI domain"

2018-04-12 Thread Haiyang Zhang


> -Original Message-
> From: Sridhar Pitchai
> Sent: Thursday, April 12, 2018 11:14 AM
> To: Bjorn Helgaas <helg...@kernel.org>; Lorenzo Pieralisi
> <lorenzo.pieral...@arm.com>
> Cc: Haiyang Zhang <haiya...@microsoft.com>; Dexuan Cui
> <de...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> Jake Oshins <ja...@microsoft.com>; KY Srinivasan <k...@microsoft.com>;
> Michael Kelley (EOSG) <michael.h.kel...@microsoft.com>;
> de...@linuxdriverproject.org; linux-...@vger.kernel.org; linux-
> ker...@vger.kernel.org
> Subject: Re: [PATCH v7] Revert "PCI: hv: Use device serial number as PCI
> domain"
> 
> >> I am still not happy with this patch.
> >>
> >> -  You do not explain at all the dependency on commit 0c195567a8f6 and
> >>you should because that's fundamental, if that patch is not present
> >>this revert breaks the kernel as per previous discussions[1].
> >> -  You are sending this patch to all stable kernels that contain the
> >>commit you are fixing - some that may not contain the commit above
> >>(that was merged in v4.14), you are breaking those kernels, if not
> >>explain me why please
> 
> >If there's a dependency on 0c195567a8f6, I totally agree that
> >needs to be cleared up.  I was assuming that turned out to be
> >irrelevant.
> That is right. There is no dependency on 0c195567a8f6. We just need to revert
> 4a9b0933bdfc.

This patch should only be applied to versions that include commit 0c195567a8f6
(transparent VF). Otherwise it causes long & random names of VF NICs for
bonding. That will make bonding config difficult, especially for auto config.

Thanks,
- Haiyang


RE: [PATCH v7] Revert "PCI: hv: Use device serial number as PCI domain"

2018-04-12 Thread Haiyang Zhang


> -Original Message-
> From: Sridhar Pitchai
> Sent: Thursday, April 12, 2018 11:14 AM
> To: Bjorn Helgaas ; Lorenzo Pieralisi
> 
> Cc: Haiyang Zhang ; Dexuan Cui
> ; Stephen Hemminger ;
> Jake Oshins ; KY Srinivasan ;
> Michael Kelley (EOSG) ;
> de...@linuxdriverproject.org; linux-...@vger.kernel.org; linux-
> ker...@vger.kernel.org
> Subject: Re: [PATCH v7] Revert "PCI: hv: Use device serial number as PCI
> domain"
> 
> >> I am still not happy with this patch.
> >>
> >> -  You do not explain at all the dependency on commit 0c195567a8f6 and
> >>you should because that's fundamental, if that patch is not present
> >>this revert breaks the kernel as per previous discussions[1].
> >> -  You are sending this patch to all stable kernels that contain the
> >>commit you are fixing - some that may not contain the commit above
> >>(that was merged in v4.14), you are breaking those kernels, if not
> >>explain me why please
> 
> >If there's a dependency on 0c195567a8f6, I totally agree that
> >needs to be cleared up.  I was assuming that turned out to be
> >irrelevant.
> That is right. There is no dependency on 0c195567a8f6. We just need to revert
> 4a9b0933bdfc.

This patch should only be applied to versions that include commit 0c195567a8f6
(transparent VF). Otherwise it causes long & random names of VF NICs for
bonding. That will make bonding config difficult, especially for auto config.

Thanks,
- Haiyang


[PATCH net-next] hv_netvsc: Add NetVSP v6 into version negotiation

2018-04-05 Thread Haiyang Zhang
From: Haiyang Zhang <haiya...@microsoft.com>

This patch adds the NetVSP v6 message structures, and includes this
version into NetVSC/NetVSP version negotiation process.

Signed-off-by: Haiyang Zhang <haiya...@microsoft.com>
---
 drivers/net/hyperv/hyperv_net.h | 33 +
 drivers/net/hyperv/netvsc.c |  3 ++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 960f06141472..036cd55c66fe 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -237,6 +237,7 @@ void netvsc_switch_datapath(struct net_device *nv_dev, bool 
vf);
 #define NVSP_PROTOCOL_VERSION_2   0x30002
 #define NVSP_PROTOCOL_VERSION_4   0x40000
 #define NVSP_PROTOCOL_VERSION_5   0x50000
+#define NVSP_PROTOCOL_VERSION_6   0x60000
 
 enum {
NVSP_MSG_TYPE_NONE = 0,
@@ -308,6 +309,11 @@ enum {
NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE,
 
NVSP_MSG5_MAX = NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE,
+
+   /* Version 6 messages */
+   NVSP_MSG6_TYPE_PD_API,
+
+   NVSP_MSG6_MAX = NVSP_MSG6_TYPE_PD_API
 };
 
 enum {
@@ -619,12 +625,39 @@ union nvsp_5_message_uber {
struct nvsp_5_send_indirect_table send_table;
 } __packed;
 
+enum nvsp6_pd_api_op {
+   PD_API_OP_NOTIFY_VSP = 0,
+   PD_API_OP_CONFIG,
+   PD_API_OP_MAX
+};
+
+struct nvsp_6_pd_api_req {
+   u32 op;
+   u64 mmio_pa; /* MMIO Physical Address */
+   u32 mmio_len;
+   u32 num_subchn; /* Number of PD subchannels */
+} __packed;
+
+struct nvsp_6_pd_api_comp {
+   u32 op;
+   u32 status;
+   u32 num_subchn; /* Number of PD subchannels */
+   u8 is_supported; /* Is supported by VSP */
+   u8 is_enabled; /* Is enabled by VSP */
+} __packed;
+
+union nvsp_6_message_uber {
+   struct nvsp_6_pd_api_req pd_req;
+   struct nvsp_6_pd_api_comp pd_comp;
+} __packed;
+
 union nvsp_all_messages {
union nvsp_message_init_uber init_msg;
union nvsp_1_message_uber v1_msg;
union nvsp_2_message_uber v2_msg;
union nvsp_4_message_uber v4_msg;
union nvsp_5_message_uber v5_msg;
+   union nvsp_6_message_uber v6_msg;
 } __packed;
 
 /* ALL Messages */
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index c9910c33e671..3abe57bd85bb 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -509,7 +509,8 @@ static int netvsc_connect_vsp(struct hv_device *device,
struct net_device *ndev = hv_get_drvdata(device);
static const u32 ver_list[] = {
NVSP_PROTOCOL_VERSION_1, NVSP_PROTOCOL_VERSION_2,
-   NVSP_PROTOCOL_VERSION_4, NVSP_PROTOCOL_VERSION_5
+   NVSP_PROTOCOL_VERSION_4, NVSP_PROTOCOL_VERSION_5,
+   NVSP_PROTOCOL_VERSION_6
};
struct nvsp_message *init_packet;
int ndis_version, i, ret;
-- 
2.15.1



[PATCH net-next] hv_netvsc: Add NetVSP v6 into version negotiation

2018-04-05 Thread Haiyang Zhang
From: Haiyang Zhang 

This patch adds the NetVSP v6 message structures, and includes this
version into NetVSC/NetVSP version negotiation process.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/hyperv_net.h | 33 +
 drivers/net/hyperv/netvsc.c |  3 ++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 960f06141472..036cd55c66fe 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -237,6 +237,7 @@ void netvsc_switch_datapath(struct net_device *nv_dev, bool 
vf);
 #define NVSP_PROTOCOL_VERSION_2   0x30002
 #define NVSP_PROTOCOL_VERSION_4   0x40000
 #define NVSP_PROTOCOL_VERSION_5   0x50000
+#define NVSP_PROTOCOL_VERSION_6   0x60000
 
 enum {
NVSP_MSG_TYPE_NONE = 0,
@@ -308,6 +309,11 @@ enum {
NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE,
 
NVSP_MSG5_MAX = NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE,
+
+   /* Version 6 messages */
+   NVSP_MSG6_TYPE_PD_API,
+
+   NVSP_MSG6_MAX = NVSP_MSG6_TYPE_PD_API
 };
 
 enum {
@@ -619,12 +625,39 @@ union nvsp_5_message_uber {
struct nvsp_5_send_indirect_table send_table;
 } __packed;
 
+enum nvsp6_pd_api_op {
+   PD_API_OP_NOTIFY_VSP = 0,
+   PD_API_OP_CONFIG,
+   PD_API_OP_MAX
+};
+
+struct nvsp_6_pd_api_req {
+   u32 op;
+   u64 mmio_pa; /* MMIO Physical Address */
+   u32 mmio_len;
+   u32 num_subchn; /* Number of PD subchannels */
+} __packed;
+
+struct nvsp_6_pd_api_comp {
+   u32 op;
+   u32 status;
+   u32 num_subchn; /* Number of PD subchannels */
+   u8 is_supported; /* Is supported by VSP */
+   u8 is_enabled; /* Is enabled by VSP */
+} __packed;
+
+union nvsp_6_message_uber {
+   struct nvsp_6_pd_api_req pd_req;
+   struct nvsp_6_pd_api_comp pd_comp;
+} __packed;
+
 union nvsp_all_messages {
union nvsp_message_init_uber init_msg;
union nvsp_1_message_uber v1_msg;
union nvsp_2_message_uber v2_msg;
union nvsp_4_message_uber v4_msg;
union nvsp_5_message_uber v5_msg;
+   union nvsp_6_message_uber v6_msg;
 } __packed;
 
 /* ALL Messages */
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index c9910c33e671..3abe57bd85bb 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -509,7 +509,8 @@ static int netvsc_connect_vsp(struct hv_device *device,
struct net_device *ndev = hv_get_drvdata(device);
static const u32 ver_list[] = {
NVSP_PROTOCOL_VERSION_1, NVSP_PROTOCOL_VERSION_2,
-   NVSP_PROTOCOL_VERSION_4, NVSP_PROTOCOL_VERSION_5
+   NVSP_PROTOCOL_VERSION_4, NVSP_PROTOCOL_VERSION_5,
+   NVSP_PROTOCOL_VERSION_6
};
struct nvsp_message *init_packet;
int ndis_version, i, ret;
-- 
2.15.1



[PATCH net-next] hv_netvsc: Clean up extra parameter from rndis_filter_receive_data()

2018-03-30 Thread Haiyang Zhang
From: Haiyang Zhang <haiya...@microsoft.com>

The variables, msg and data, have the same value. This patch removes
the extra one.

Signed-off-by: Haiyang Zhang <haiya...@microsoft.com>
---
 drivers/net/hyperv/rndis_filter.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index 4a4952363e8a..e2b68d9328a7 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -365,14 +365,15 @@ static inline void *rndis_get_ppi(struct rndis_packet 
*rpkt, u32 type)
 
 static int rndis_filter_receive_data(struct net_device *ndev,
 struct netvsc_device *nvdev,
-struct rndis_message *msg,
 struct vmbus_channel *channel,
-void *data, u32 data_buflen)
+struct rndis_message *msg,
+u32 data_buflen)
 {
struct rndis_packet *rndis_pkt = &msg->msg.pkt;
const struct ndis_tcp_ip_checksum_info *csum_info;
const struct ndis_pkt_8021q_info *vlan;
u32 data_offset;
+   void *data;
 
/* Remove the rndis header and pass it back up the stack */
data_offset = RNDIS_HEADER_SIZE + rndis_pkt->data_offset;
@@ -393,14 +394,15 @@ static int rndis_filter_receive_data(struct net_device 
*ndev,
 
vlan = rndis_get_ppi(rndis_pkt, IEEE_8021Q_INFO);
 
+   csum_info = rndis_get_ppi(rndis_pkt, TCPIP_CHKSUM_PKTINFO);
+
+   data = (void *)msg + data_offset;
+
/*
 * Remove the rndis trailer padding from rndis packet message
 * rndis_pkt->data_len tell us the real data length, we only copy
 * the data packet to the stack, without the rndis trailer padding
 */
-   data = (void *)((unsigned long)data + data_offset);
-   csum_info = rndis_get_ppi(rndis_pkt, TCPIP_CHKSUM_PKTINFO);
-
return netvsc_recv_callback(ndev, nvdev, channel,
data, rndis_pkt->data_len,
csum_info, vlan);
@@ -419,8 +421,8 @@ int rndis_filter_receive(struct net_device *ndev,
 
switch (rndis_msg->ndis_msg_type) {
case RNDIS_MSG_PACKET:
-   return rndis_filter_receive_data(ndev, net_dev, rndis_msg,
-channel, data, buflen);
+   return rndis_filter_receive_data(ndev, net_dev, channel,
+rndis_msg, buflen);
case RNDIS_MSG_INIT_C:
case RNDIS_MSG_QUERY_C:
case RNDIS_MSG_SET_C:
-- 
2.15.1



[PATCH net-next] hv_netvsc: Clean up extra parameter from rndis_filter_receive_data()

2018-03-30 Thread Haiyang Zhang
From: Haiyang Zhang 

The variables, msg and data, have the same value. This patch removes
the extra one.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/rndis_filter.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index 4a4952363e8a..e2b68d9328a7 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -365,14 +365,15 @@ static inline void *rndis_get_ppi(struct rndis_packet 
*rpkt, u32 type)
 
 static int rndis_filter_receive_data(struct net_device *ndev,
 struct netvsc_device *nvdev,
-struct rndis_message *msg,
 struct vmbus_channel *channel,
-void *data, u32 data_buflen)
+struct rndis_message *msg,
+u32 data_buflen)
 {
struct rndis_packet *rndis_pkt = &msg->msg.pkt;
const struct ndis_tcp_ip_checksum_info *csum_info;
const struct ndis_pkt_8021q_info *vlan;
u32 data_offset;
+   void *data;
 
/* Remove the rndis header and pass it back up the stack */
data_offset = RNDIS_HEADER_SIZE + rndis_pkt->data_offset;
@@ -393,14 +394,15 @@ static int rndis_filter_receive_data(struct net_device 
*ndev,
 
vlan = rndis_get_ppi(rndis_pkt, IEEE_8021Q_INFO);
 
+   csum_info = rndis_get_ppi(rndis_pkt, TCPIP_CHKSUM_PKTINFO);
+
+   data = (void *)msg + data_offset;
+
/*
 * Remove the rndis trailer padding from rndis packet message
 * rndis_pkt->data_len tell us the real data length, we only copy
 * the data packet to the stack, without the rndis trailer padding
 */
-   data = (void *)((unsigned long)data + data_offset);
-   csum_info = rndis_get_ppi(rndis_pkt, TCPIP_CHKSUM_PKTINFO);
-
return netvsc_recv_callback(ndev, nvdev, channel,
data, rndis_pkt->data_len,
csum_info, vlan);
@@ -419,8 +421,8 @@ int rndis_filter_receive(struct net_device *ndev,
 
switch (rndis_msg->ndis_msg_type) {
case RNDIS_MSG_PACKET:
-   return rndis_filter_receive_data(ndev, net_dev, rndis_msg,
-channel, data, buflen);
+   return rndis_filter_receive_data(ndev, net_dev, channel,
+rndis_msg, buflen);
case RNDIS_MSG_INIT_C:
case RNDIS_MSG_QUERY_C:
case RNDIS_MSG_SET_C:
-- 
2.15.1



RE: [PATCH net-next, 2/2] hv_netvsc: Add range checking for rx packet offset and length

2018-03-27 Thread Haiyang Zhang


> -Original Message-
> From: Stephen Hemminger <step...@networkplumber.org>
> Sent: Tuesday, March 27, 2018 11:23 AM
> To: Haiyang Zhang <haiya...@linuxonhyperv.com>
> Cc: Haiyang Zhang <haiya...@microsoft.com>; da...@davemloft.net;
> net...@vger.kernel.org; o...@aepfle.de; Stephen Hemminger
> <sthem...@microsoft.com>; linux-kernel@vger.kernel.org;
> de...@linuxdriverproject.org; vkuzn...@redhat.com
> Subject: Re: [PATCH net-next, 2/2] hv_netvsc: Add range checking for rx packet
> offset and length
> 
> On Thu, 22 Mar 2018 12:01:14 -0700
> Haiyang Zhang <haiya...@linuxonhyperv.com> wrote:
> 
> > From: Haiyang Zhang <haiya...@microsoft.com>
> >
> > This patch adds range checking for rx packet offset and length.
> > It may only happen if there is a host side bug.
> >
> > Signed-off-by: Haiyang Zhang <haiya...@microsoft.com>
> > ---
> >  drivers/net/hyperv/hyperv_net.h |  1 +
> >  drivers/net/hyperv/netvsc.c | 17 +++--
> >  2 files changed, 16 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/hyperv/hyperv_net.h
> > b/drivers/net/hyperv/hyperv_net.h index 0db3bd1ea06f..49c05ac894e5
> > 100644
> > --- a/drivers/net/hyperv/hyperv_net.h
> > +++ b/drivers/net/hyperv/hyperv_net.h
> > @@ -793,6 +793,7 @@ struct netvsc_device {
> >
> > /* Receive buffer allocated by us but manages by NetVSP */
> > void *recv_buf;
> > +   u32 recv_buf_size; /* allocated bytes */
> > u32 recv_buf_gpadl_handle;
> > u32 recv_section_cnt;
> > u32 recv_section_size;
> > diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> > index 1ddb2c39b6e4..a6700d65f206 100644
> > --- a/drivers/net/hyperv/netvsc.c
> > +++ b/drivers/net/hyperv/netvsc.c
> > @@ -289,6 +289,8 @@ static int netvsc_init_buf(struct hv_device *device,
> > goto cleanup;
> > }
> >
> > +   net_device->recv_buf_size = buf_size;
> > +
> > /*
> >  * Establish the gpadl handle for this buffer on this
> >  * channel.  Note: This call uses the vmbus connection rather @@
> > -1095,11 +1097,22 @@ static int netvsc_receive(struct net_device
> > *ndev,
> >
> > /* Each range represents 1 RNDIS pkt that contains 1 ethernet frame */
> > for (i = 0; i < count; i++) {
> > -   void *data = recv_buf
> > -   + vmxferpage_packet->ranges[i].byte_offset;
> > +   u32 offset = vmxferpage_packet->ranges[i].byte_offset;
> > u32 buflen = vmxferpage_packet->ranges[i].byte_count;
> > +   void *data;
> > int ret;
> >
> > +   if (unlikely(offset + buflen > net_device->recv_buf_size)) {
> > +   status = NVSP_STAT_FAIL;
> > +   netif_err(net_device_ctx, rx_err, ndev,
> > + "Packet offset:%u + len:%u too big\n",
> > + offset, buflen);
> > +
> > +   continue;
> > +   }
> > +
> 
> If one part of the RNDIS packet is wrong then the whole receive buffer is
> damaged. Just return, don't continue.
> 
> It could really just be a statistic and a one shot log message.

I will let the loop terminate and send an NVSP failure status to the host.

As for statistics: this range check is meant to catch potential host-side
issues, just like these earlier checks in the same function:
/* Make sure this is a valid nvsp packet */
if (unlikely(nvsp->hdr.msg_type != NVSP_MSG1_TYPE_SEND_RNDIS_PKT)) {
netif_err(net_device_ctx, rx_err, ndev,
  "Unknown nvsp packet type received %u\n",
  nvsp->hdr.msg_type);
return 0;
}

if (unlikely(vmxferpage_packet->xfer_pageset_id != 
NETVSC_RECEIVE_BUFFER_ID)) {
netif_err(net_device_ctx, rx_err, ndev,
  "Invalid xfer page set id - expecting %x got %x\n",
  NETVSC_RECEIVE_BUFFER_ID,
  vmxferpage_packet->xfer_pageset_id);
return 0;
}

If each of these kinds of errors needs its own statistic, there will be many
stat variables... Maybe we should just create one stat variable for all of the
"invalid format from host" errors?

Thanks,
- Haiyang



RE: [PATCH net-next, 2/2] hv_netvsc: Add range checking for rx packet offset and length

2018-03-27 Thread Haiyang Zhang


> -Original Message-
> From: Stephen Hemminger 
> Sent: Tuesday, March 27, 2018 11:23 AM
> To: Haiyang Zhang 
> Cc: Haiyang Zhang ; da...@davemloft.net;
> net...@vger.kernel.org; o...@aepfle.de; Stephen Hemminger
> ; linux-kernel@vger.kernel.org;
> de...@linuxdriverproject.org; vkuzn...@redhat.com
> Subject: Re: [PATCH net-next, 2/2] hv_netvsc: Add range checking for rx packet
> offset and length
> 
> On Thu, 22 Mar 2018 12:01:14 -0700
> Haiyang Zhang  wrote:
> 
> > From: Haiyang Zhang 
> >
> > This patch adds range checking for rx packet offset and length.
> > It may only happen if there is a host side bug.
> >
> > Signed-off-by: Haiyang Zhang 
> > ---
> >  drivers/net/hyperv/hyperv_net.h |  1 +
> >  drivers/net/hyperv/netvsc.c | 17 +++--
> >  2 files changed, 16 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/hyperv/hyperv_net.h
> > b/drivers/net/hyperv/hyperv_net.h index 0db3bd1ea06f..49c05ac894e5
> > 100644
> > --- a/drivers/net/hyperv/hyperv_net.h
> > +++ b/drivers/net/hyperv/hyperv_net.h
> > @@ -793,6 +793,7 @@ struct netvsc_device {
> >
> > /* Receive buffer allocated by us but manages by NetVSP */
> > void *recv_buf;
> > +   u32 recv_buf_size; /* allocated bytes */
> > u32 recv_buf_gpadl_handle;
> > u32 recv_section_cnt;
> > u32 recv_section_size;
> > diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> > index 1ddb2c39b6e4..a6700d65f206 100644
> > --- a/drivers/net/hyperv/netvsc.c
> > +++ b/drivers/net/hyperv/netvsc.c
> > @@ -289,6 +289,8 @@ static int netvsc_init_buf(struct hv_device *device,
> > goto cleanup;
> > }
> >
> > +   net_device->recv_buf_size = buf_size;
> > +
> > /*
> >  * Establish the gpadl handle for this buffer on this
> >  * channel.  Note: This call uses the vmbus connection rather @@
> > -1095,11 +1097,22 @@ static int netvsc_receive(struct net_device
> > *ndev,
> >
> > /* Each range represents 1 RNDIS pkt that contains 1 ethernet frame */
> > for (i = 0; i < count; i++) {
> > -   void *data = recv_buf
> > -   + vmxferpage_packet->ranges[i].byte_offset;
> > +   u32 offset = vmxferpage_packet->ranges[i].byte_offset;
> > u32 buflen = vmxferpage_packet->ranges[i].byte_count;
> > +   void *data;
> > int ret;
> >
> > +   if (unlikely(offset + buflen > net_device->recv_buf_size)) {
> > +   status = NVSP_STAT_FAIL;
> > +   netif_err(net_device_ctx, rx_err, ndev,
> > + "Packet offset:%u + len:%u too big\n",
> > + offset, buflen);
> > +
> > +   continue;
> > +   }
> > +
> 
> If one part of the RNDIS packet is wrong then the whole receive buffer is
> damaged. Just return, don't continue.
> 
> It could really just be a statistic and a one shot log message.

I will let the loop terminate and send an NVSP failure status to the host.

As for statistics: this range check is meant to catch potential host-side
issues, just like these earlier checks in the same function:
/* Make sure this is a valid nvsp packet */
if (unlikely(nvsp->hdr.msg_type != NVSP_MSG1_TYPE_SEND_RNDIS_PKT)) {
netif_err(net_device_ctx, rx_err, ndev,
  "Unknown nvsp packet type received %u\n",
  nvsp->hdr.msg_type);
return 0;
}

if (unlikely(vmxferpage_packet->xfer_pageset_id != 
NETVSC_RECEIVE_BUFFER_ID)) {
netif_err(net_device_ctx, rx_err, ndev,
  "Invalid xfer page set id - expecting %x got %x\n",
  NETVSC_RECEIVE_BUFFER_ID,
  vmxferpage_packet->xfer_pageset_id);
return 0;
}

If each of these kinds of errors needs its own statistic, there will be many
stat variables... Maybe we should just create one stat variable for all of the
"invalid format from host" errors?

Thanks,
- Haiyang



RE: [PATCH net-next,1/2] hv_netvsc: Fix the return status in RX path

2018-03-24 Thread Haiyang Zhang


> -Original Message-
> From: Michael Kelley (EOSG)
> Sent: Saturday, March 24, 2018 12:48 PM
> To: Haiyang Zhang <haiya...@microsoft.com>; da...@davemloft.net;
> net...@vger.kernel.org
> Cc: KY Srinivasan <k...@microsoft.com>; Stephen Hemminger
> <sthem...@microsoft.com>; o...@aepfle.de; vkuzn...@redhat.com;
> de...@linuxdriverproject.org; linux-kernel@vger.kernel.org
> Subject: RE: [PATCH net-next,1/2] hv_netvsc: Fix the return status in RX path
> 
> > -Original Message-
> > From: linux-kernel-ow...@vger.kernel.org
> > <linux-kernel-ow...@vger.kernel.org> On Behalf Of Haiyang Zhang
> > Sent: Thursday, March 22, 2018 12:01 PM
> > To: da...@davemloft.net; net...@vger.kernel.org
> > Cc: Haiyang Zhang <haiya...@microsoft.com>; KY Srinivasan
> > <k...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> > o...@aepfle.de; vkuzn...@redhat.com; de...@linuxdriverproject.org;
> > linux-kernel@vger.kernel.org
> > Subject: [PATCH net-next,1/2] hv_netvsc: Fix the return status in RX
> > path
> >
> > From: Haiyang Zhang <haiya...@microsoft.com>
> >
> > As defined in hyperv_net.h, the NVSP_STAT_SUCCESS is one not zero.
> > Some functions returns 0 when it actually means NVSP_STAT_SUCCESS.
> > This patch fixes them.
> >
> > In netvsc_receive(), it puts the last RNDIS packet's receive status
> > for all packets in a vmxferpage which may contain multiple RNDIS
> > packets.
> > This patch puts NVSP_STAT_FAIL in the receive completion if one of the
> > packets in a vmxferpage fails.
> 
> This patch changes the status field that is being reported back to the Hyper-V
> host in the receive completion message in
> enq_receive_complete().   The current code reports 0 on success,
> and with the patch, it will report 1 on success.  So does this change affect
> anything on the Hyper-V side?  Or is Hyper-V just ignoring
> the value?   If this change doesn't have any impact on the
> interactions with Hyper-V, perhaps it would be good to explain why in the
> commit message.

Here is the definition of each status code for NetVSP. 
enum {
NVSP_STAT_NONE = 0,
NVSP_STAT_SUCCESS,
NVSP_STAT_FAIL,
NVSP_STAT_PROTOCOL_TOO_NEW,
NVSP_STAT_PROTOCOL_TOO_OLD,
NVSP_STAT_INVALID_RNDIS_PKT,
NVSP_STAT_BUSY,
NVSP_STAT_PROTOCOL_UNSUPPORTED,
NVSP_STAT_MAX,
};

Existing code returns NVSP_STAT_NONE = 0, and with this patch
we return NVSP_STAT_SUCCESS = 1. 
Based on testing, either way works for now. But for correctness
and future stability (e.g. host side becomes more stringent), we
should follow the protocol.

Thanks,
- Haiyang



RE: [PATCH net-next,1/2] hv_netvsc: Fix the return status in RX path

2018-03-24 Thread Haiyang Zhang


> -Original Message-
> From: Michael Kelley (EOSG)
> Sent: Saturday, March 24, 2018 12:48 PM
> To: Haiyang Zhang ; da...@davemloft.net;
> net...@vger.kernel.org
> Cc: KY Srinivasan ; Stephen Hemminger
> ; o...@aepfle.de; vkuzn...@redhat.com;
> de...@linuxdriverproject.org; linux-kernel@vger.kernel.org
> Subject: RE: [PATCH net-next,1/2] hv_netvsc: Fix the return status in RX path
> 
> > -Original Message-
> > From: linux-kernel-ow...@vger.kernel.org
> >  On Behalf Of Haiyang Zhang
> > Sent: Thursday, March 22, 2018 12:01 PM
> > To: da...@davemloft.net; net...@vger.kernel.org
> > Cc: Haiyang Zhang ; KY Srinivasan
> > ; Stephen Hemminger ;
> > o...@aepfle.de; vkuzn...@redhat.com; de...@linuxdriverproject.org;
> > linux-kernel@vger.kernel.org
> > Subject: [PATCH net-next,1/2] hv_netvsc: Fix the return status in RX
> > path
> >
> > From: Haiyang Zhang 
> >
> > As defined in hyperv_net.h, the NVSP_STAT_SUCCESS is one not zero.
> > Some functions returns 0 when it actually means NVSP_STAT_SUCCESS.
> > This patch fixes them.
> >
> > In netvsc_receive(), it puts the last RNDIS packet's receive status
> > for all packets in a vmxferpage which may contain multiple RNDIS
> > packets.
> > This patch puts NVSP_STAT_FAIL in the receive completion if one of the
> > packets in a vmxferpage fails.
> 
> This patch changes the status field that is being reported back to the Hyper-V
> host in the receive completion message in
> enq_receive_complete().   The current code reports 0 on success,
> and with the patch, it will report 1 on success.  So does this change affect
> anything on the Hyper-V side?  Or is Hyper-V just ignoring
> the value?   If this change doesn't have any impact on the
> interactions with Hyper-V, perhaps it would be good to explain why in the
> commit message.

Here is the definition of each status code for NetVSP. 
enum {
NVSP_STAT_NONE = 0,
NVSP_STAT_SUCCESS,
NVSP_STAT_FAIL,
NVSP_STAT_PROTOCOL_TOO_NEW,
NVSP_STAT_PROTOCOL_TOO_OLD,
NVSP_STAT_INVALID_RNDIS_PKT,
NVSP_STAT_BUSY,
NVSP_STAT_PROTOCOL_UNSUPPORTED,
NVSP_STAT_MAX,
};

The existing code returns NVSP_STAT_NONE = 0, and with this patch
we return NVSP_STAT_SUCCESS = 1.
Based on testing, either way works for now. But for correctness
and future stability (e.g. if the host side becomes more stringent), we
should follow the protocol.

Thanks,
- Haiyang



RE: [PATCH net-next,2/2] hv_netvsc: Add range checking for rx packet offset and length

2018-03-23 Thread Haiyang Zhang


> -----Original Message-----
> From: Vitaly Kuznetsov <vkuzn...@redhat.com>
> Sent: Friday, March 23, 2018 11:17 AM
> To: Haiyang Zhang <haiya...@linuxonhyperv.com>
> Cc: da...@davemloft.net; net...@vger.kernel.org; Haiyang Zhang
> <haiya...@microsoft.com>; KY Srinivasan <k...@microsoft.com>; Stephen
> Hemminger <sthem...@microsoft.com>; o...@aepfle.de;
> de...@linuxdriverproject.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH net-next,2/2] hv_netvsc: Add range checking for rx packet
> offset and length
> 
> Haiyang Zhang <haiya...@linuxonhyperv.com> writes:
> 
> > From: Haiyang Zhang <haiya...@microsoft.com>
> >
> > This patch adds range checking for rx packet offset and length.
> > It may only happen if there is a host side bug.
> >
> > Signed-off-by: Haiyang Zhang <haiya...@microsoft.com>
> > ---
> >  drivers/net/hyperv/hyperv_net.h |  1 +
> >  drivers/net/hyperv/netvsc.c | 17 +++--
> >  2 files changed, 16 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/hyperv/hyperv_net.h
> > b/drivers/net/hyperv/hyperv_net.h index 0db3bd1ea06f..49c05ac894e5
> > 100644
> > --- a/drivers/net/hyperv/hyperv_net.h
> > +++ b/drivers/net/hyperv/hyperv_net.h
> > @@ -793,6 +793,7 @@ struct netvsc_device {
> >
> > /* Receive buffer allocated by us but manages by NetVSP */
> > void *recv_buf;
> > +   u32 recv_buf_size; /* allocated bytes */
> > u32 recv_buf_gpadl_handle;
> > u32 recv_section_cnt;
> > u32 recv_section_size;
> > diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> > index 1ddb2c39b6e4..a6700d65f206 100644
> > --- a/drivers/net/hyperv/netvsc.c
> > +++ b/drivers/net/hyperv/netvsc.c
> > @@ -289,6 +289,8 @@ static int netvsc_init_buf(struct hv_device *device,
> > goto cleanup;
> > }
> >
> > +   net_device->recv_buf_size = buf_size;
> > +
> > /*
> >  * Establish the gpadl handle for this buffer on this
> >  * channel.  Note: This call uses the vmbus connection rather @@
> > -1095,11 +1097,22 @@ static int netvsc_receive(struct net_device
> > *ndev,
> >
> > /* Each range represents 1 RNDIS pkt that contains 1 ethernet frame */
> > for (i = 0; i < count; i++) {
> > -   void *data = recv_buf
> > -   + vmxferpage_packet->ranges[i].byte_offset;
> > +   u32 offset = vmxferpage_packet->ranges[i].byte_offset;
> > u32 buflen = vmxferpage_packet->ranges[i].byte_count;
> > +   void *data;
> > int ret;
> >
> > +   if (unlikely(offset + buflen > net_device->recv_buf_size)) {
> > +   status = NVSP_STAT_FAIL;
> > +   netif_err(net_device_ctx, rx_err, ndev,
> > + "Packet offset:%u + len:%u too big\n",
> > + offset, buflen);
> 
> This shouldn't happen, of course, but I'd rather ratelimit this error or even 
> used
> something like netdev_WARN_ONCE().

Actually, I thought about ratelimiting, but this range check is only there to
catch a host-side bug.
It should not happen.
But if it does happen, the VM should not be used anymore, and we need to debug
the host. Similarly, other checks of this kind in the same function do not use
ratelimit:

if (unlikely(nvsp->hdr.msg_type != NVSP_MSG1_TYPE_SEND_RNDIS_PKT)) {
netif_err(net_device_ctx, rx_err, ndev,
  "Unknown nvsp packet type received %u\n",
  nvsp->hdr.msg_type);

Thanks,
- Haiyang


RE: [PATCH net-next,2/2] hv_netvsc: Add range checking for rx packet offset and length

2018-03-23 Thread Haiyang Zhang


> -----Original Message-----
> From: Vitaly Kuznetsov 
> Sent: Friday, March 23, 2018 11:17 AM
> To: Haiyang Zhang 
> Cc: da...@davemloft.net; net...@vger.kernel.org; Haiyang Zhang
> ; KY Srinivasan ; Stephen
> Hemminger ; o...@aepfle.de;
> de...@linuxdriverproject.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH net-next,2/2] hv_netvsc: Add range checking for rx packet
> offset and length
> 
> Haiyang Zhang  writes:
> 
> > From: Haiyang Zhang 
> >
> > This patch adds range checking for rx packet offset and length.
> > It may only happen if there is a host side bug.
> >
> > Signed-off-by: Haiyang Zhang 
> > ---
> >  drivers/net/hyperv/hyperv_net.h |  1 +
> >  drivers/net/hyperv/netvsc.c | 17 +++--
> >  2 files changed, 16 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/hyperv/hyperv_net.h
> > b/drivers/net/hyperv/hyperv_net.h index 0db3bd1ea06f..49c05ac894e5
> > 100644
> > --- a/drivers/net/hyperv/hyperv_net.h
> > +++ b/drivers/net/hyperv/hyperv_net.h
> > @@ -793,6 +793,7 @@ struct netvsc_device {
> >
> > /* Receive buffer allocated by us but manages by NetVSP */
> > void *recv_buf;
> > +   u32 recv_buf_size; /* allocated bytes */
> > u32 recv_buf_gpadl_handle;
> > u32 recv_section_cnt;
> > u32 recv_section_size;
> > diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> > index 1ddb2c39b6e4..a6700d65f206 100644
> > --- a/drivers/net/hyperv/netvsc.c
> > +++ b/drivers/net/hyperv/netvsc.c
> > @@ -289,6 +289,8 @@ static int netvsc_init_buf(struct hv_device *device,
> > goto cleanup;
> > }
> >
> > +   net_device->recv_buf_size = buf_size;
> > +
> > /*
> >  * Establish the gpadl handle for this buffer on this
> >  * channel.  Note: This call uses the vmbus connection rather @@
> > -1095,11 +1097,22 @@ static int netvsc_receive(struct net_device
> > *ndev,
> >
> > /* Each range represents 1 RNDIS pkt that contains 1 ethernet frame */
> > for (i = 0; i < count; i++) {
> > -   void *data = recv_buf
> > -   + vmxferpage_packet->ranges[i].byte_offset;
> > +   u32 offset = vmxferpage_packet->ranges[i].byte_offset;
> > u32 buflen = vmxferpage_packet->ranges[i].byte_count;
> > +   void *data;
> > int ret;
> >
> > +   if (unlikely(offset + buflen > net_device->recv_buf_size)) {
> > +   status = NVSP_STAT_FAIL;
> > +   netif_err(net_device_ctx, rx_err, ndev,
> > + "Packet offset:%u + len:%u too big\n",
> > + offset, buflen);
> 
> This shouldn't happen, of course, but I'd rather ratelimit this error or even 
> used
> something like netdev_WARN_ONCE().

Actually, I thought about ratelimiting, but this range check is only there to
catch a host-side bug.
It should not happen.
But if it does happen, the VM should not be used anymore, and we need to debug
the host. Similarly, other checks of this kind in the same function do not use
ratelimit:

if (unlikely(nvsp->hdr.msg_type != NVSP_MSG1_TYPE_SEND_RNDIS_PKT)) {
netif_err(net_device_ctx, rx_err, ndev,
  "Unknown nvsp packet type received %u\n",
  nvsp->hdr.msg_type);

Thanks,
- Haiyang


RE: [PATCH 1/3] Vmbus: Add function to report available ring buffer to write in total ring size percentage

2018-03-23 Thread Haiyang Zhang


> -----Original Message-----
> From: Long Li <lon...@linuxonhyperv.com>
> Sent: Thursday, March 22, 2018 8:16 PM
> To: KY Srinivasan <k...@microsoft.com>; Haiyang Zhang
> <haiya...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> James E . J . Bottomley <jbottom...@odin.com>; Martin K . Petersen
> <martin.peter...@oracle.com>; de...@linuxdriverproject.org; linux-
> s...@vger.kernel.org; linux-kernel@vger.kernel.org
> Cc: Long Li <lon...@microsoft.com>
> Subject: [PATCH 1/3] Vmbus: Add function to report available ring buffer to
> write in total ring size percentage
> 
> From: Long Li <lon...@microsoft.com>
> 
> Netvsc has a similar function to calculate how much ring buffer in percentage
> is available to write. This function is useful for storvsc and other vmbus 
> devices.
> 
> Define a similar function in vmbus to be used by storvsc.
> 
> Signed-off-by: Long Li <lon...@microsoft.com>
> ---

Reviewed-by: Haiyang Zhang <haiya...@microsoft.com>



RE: [PATCH 1/3] Vmbus: Add function to report available ring buffer to write in total ring size percentage

2018-03-23 Thread Haiyang Zhang


> -----Original Message-----
> From: Long Li 
> Sent: Thursday, March 22, 2018 8:16 PM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> James E . J . Bottomley ; Martin K . Petersen
> ; de...@linuxdriverproject.org; linux-
> s...@vger.kernel.org; linux-kernel@vger.kernel.org
> Cc: Long Li 
> Subject: [PATCH 1/3] Vmbus: Add function to report available ring buffer to
> write in total ring size percentage
> 
> From: Long Li 
> 
> Netvsc has a similar function to calculate how much ring buffer in percentage
> is available to write. This function is useful for storvsc and other vmbus 
> devices.
> 
> Define a similar function in vmbus to be used by storvsc.
> 
> Signed-off-by: Long Li 
> ---

Reviewed-by: Haiyang Zhang 



RE: [PATCH 2/3] Netvsc: Use the vmbus functiton to calculate ring buffer percentage

2018-03-23 Thread Haiyang Zhang


> -----Original Message-----
> From: Long Li <lon...@linuxonhyperv.com>
> Sent: Thursday, March 22, 2018 8:16 PM
> To: KY Srinivasan <k...@microsoft.com>; Haiyang Zhang
> <haiya...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> James E . J . Bottomley <jbottom...@odin.com>; Martin K . Petersen
> <martin.peter...@oracle.com>; de...@linuxdriverproject.org; linux-
> s...@vger.kernel.org; linux-kernel@vger.kernel.org
> Cc: Long Li <lon...@microsoft.com>
> Subject: [PATCH 2/3] Netvsc: Use the vmbus functiton to calculate ring buffer
> percentage
> 
> From: Long Li <lon...@microsoft.com>
> 
> In Vmbus, we have defined a function to calculate available ring buffer
> percentage to write.
> 
> Use that function and remove duplicate netvsc code.
> 
> Signed-off-by: Long Li <lon...@microsoft.com>
> ---
>  drivers/net/hyperv/netvsc.c | 17 +++--
>  drivers/net/hyperv/netvsc_drv.c |  3 ---
>  2 files changed, 3 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index
> 0265d703eb03..8af0069e4d8c 100644
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -31,7 +31,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
> 
>  #include 
> 
> @@ -590,17 +589,6 @@ void netvsc_device_remove(struct hv_device *device)
> #define RING_AVAIL_PERCENT_HIWATER 20  #define
> RING_AVAIL_PERCENT_LOWATER 10
> 
> -/*
> - * Get the percentage of available bytes to write in the ring.
> - * The return value is in range from 0 to 100.
> - */
> -static u32 hv_ringbuf_avail_percent(const struct hv_ring_buffer_info
> *ring_info) -{
> - u32 avail_write = hv_get_bytes_to_write(ring_info);
> -
> - return reciprocal_divide(avail_write  * 100, netvsc_ring_reciprocal);
> -}
> -
>  static inline void netvsc_free_send_slot(struct netvsc_device *net_device,
>u32 index)
>  {
> @@ -649,7 +637,8 @@ static void netvsc_send_tx_complete(struct
> netvsc_device *net_device,
>   wake_up(_device->wait_drain);
> 
>   if (netif_tx_queue_stopped(netdev_get_tx_queue(ndev, q_idx)) &&
> - (hv_ringbuf_avail_percent(>outbound) >
> RING_AVAIL_PERCENT_HIWATER ||
> + (hv_get_avail_to_write_percent(>outbound) >
> +  RING_AVAIL_PERCENT_HIWATER ||
>queue_sends < 1)) {
>   netif_tx_wake_queue(netdev_get_tx_queue(ndev, q_idx));
>   ndev_ctx->eth_stats.wake_queue++;
> @@ -757,7 +746,7 @@ static inline int netvsc_send_pkt(
>   struct netdev_queue *txq = netdev_get_tx_queue(ndev, packet->q_idx);
>   u64 req_id;
>   int ret;
> - u32 ring_avail = hv_ringbuf_avail_percent(_channel->outbound);
> + u32 ring_avail =
> +hv_get_avail_to_write_percent(_channel->outbound);
> 
>   nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT;
>   if (skb)
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index faea0be18924..b0b1c2fd2b7b 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -35,7 +35,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
> 
>  #include 
>  #include 
> @@ -55,7 +54,6 @@ static unsigned int ring_size __ro_after_init = 128;
> module_param(ring_size, uint, S_IRUGO);  MODULE_PARM_DESC(ring_size,
> "Ring buffer size (# of pages)");  unsigned int netvsc_ring_bytes 
> __ro_after_init;
> -struct reciprocal_value netvsc_ring_reciprocal __ro_after_init;
> 
>  static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
>   NETIF_MSG_LINK | NETIF_MSG_IFUP |
> @@ -2186,7 +2184,6 @@ static int __init netvsc_drv_init(void)
>   ring_size);
>   }
>   netvsc_ring_bytes = ring_size * PAGE_SIZE;
> - netvsc_ring_reciprocal = reciprocal_value(netvsc_ring_bytes);
> 
>   ret = vmbus_driver_register(_drv);
>   if (ret)
> --


Please also remove netvsc_ring_reciprocal from hyperv_net.h
Thanks.

Reviewed-by: Haiyang Zhang <haiya...@microsoft.com>


RE: [PATCH 2/3] Netvsc: Use the vmbus functiton to calculate ring buffer percentage

2018-03-23 Thread Haiyang Zhang


> -----Original Message-----
> From: Long Li 
> Sent: Thursday, March 22, 2018 8:16 PM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> James E . J . Bottomley ; Martin K . Petersen
> ; de...@linuxdriverproject.org; linux-
> s...@vger.kernel.org; linux-kernel@vger.kernel.org
> Cc: Long Li 
> Subject: [PATCH 2/3] Netvsc: Use the vmbus functiton to calculate ring buffer
> percentage
> 
> From: Long Li 
> 
> In Vmbus, we have defined a function to calculate available ring buffer
> percentage to write.
> 
> Use that function and remove duplicate netvsc code.
> 
> Signed-off-by: Long Li 
> ---
>  drivers/net/hyperv/netvsc.c | 17 +++--
>  drivers/net/hyperv/netvsc_drv.c |  3 ---
>  2 files changed, 3 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index
> 0265d703eb03..8af0069e4d8c 100644
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -31,7 +31,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
> 
>  #include 
> 
> @@ -590,17 +589,6 @@ void netvsc_device_remove(struct hv_device *device)
> #define RING_AVAIL_PERCENT_HIWATER 20  #define
> RING_AVAIL_PERCENT_LOWATER 10
> 
> -/*
> - * Get the percentage of available bytes to write in the ring.
> - * The return value is in range from 0 to 100.
> - */
> -static u32 hv_ringbuf_avail_percent(const struct hv_ring_buffer_info
> *ring_info) -{
> - u32 avail_write = hv_get_bytes_to_write(ring_info);
> -
> - return reciprocal_divide(avail_write  * 100, netvsc_ring_reciprocal);
> -}
> -
>  static inline void netvsc_free_send_slot(struct netvsc_device *net_device,
>u32 index)
>  {
> @@ -649,7 +637,8 @@ static void netvsc_send_tx_complete(struct
> netvsc_device *net_device,
>   wake_up(_device->wait_drain);
> 
>   if (netif_tx_queue_stopped(netdev_get_tx_queue(ndev, q_idx)) &&
> - (hv_ringbuf_avail_percent(>outbound) >
> RING_AVAIL_PERCENT_HIWATER ||
> + (hv_get_avail_to_write_percent(>outbound) >
> +  RING_AVAIL_PERCENT_HIWATER ||
>queue_sends < 1)) {
>   netif_tx_wake_queue(netdev_get_tx_queue(ndev, q_idx));
>   ndev_ctx->eth_stats.wake_queue++;
> @@ -757,7 +746,7 @@ static inline int netvsc_send_pkt(
>   struct netdev_queue *txq = netdev_get_tx_queue(ndev, packet->q_idx);
>   u64 req_id;
>   int ret;
> - u32 ring_avail = hv_ringbuf_avail_percent(_channel->outbound);
> + u32 ring_avail =
> +hv_get_avail_to_write_percent(_channel->outbound);
> 
>   nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT;
>   if (skb)
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index faea0be18924..b0b1c2fd2b7b 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -35,7 +35,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
> 
>  #include 
>  #include 
> @@ -55,7 +54,6 @@ static unsigned int ring_size __ro_after_init = 128;
> module_param(ring_size, uint, S_IRUGO);  MODULE_PARM_DESC(ring_size,
> "Ring buffer size (# of pages)");  unsigned int netvsc_ring_bytes 
> __ro_after_init;
> -struct reciprocal_value netvsc_ring_reciprocal __ro_after_init;
> 
>  static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
>   NETIF_MSG_LINK | NETIF_MSG_IFUP |
> @@ -2186,7 +2184,6 @@ static int __init netvsc_drv_init(void)
>   ring_size);
>   }
>   netvsc_ring_bytes = ring_size * PAGE_SIZE;
> - netvsc_ring_reciprocal = reciprocal_value(netvsc_ring_bytes);
> 
>   ret = vmbus_driver_register(_drv);
>   if (ret)
> --


Please also remove netvsc_ring_reciprocal from hyperv_net.h
Thanks.

Reviewed-by: Haiyang Zhang 


[PATCH net-next,1/2] hv_netvsc: Fix the return status in RX path

2018-03-22 Thread Haiyang Zhang
From: Haiyang Zhang <haiya...@microsoft.com>

As defined in hyperv_net.h, NVSP_STAT_SUCCESS is one, not zero.
Some functions return 0 when they actually mean NVSP_STAT_SUCCESS.
This patch fixes them.

netvsc_receive() puts the last RNDIS packet's receive status into the
receive completion for all packets in a vmxferpage, which may contain
multiple RNDIS packets.
This patch puts NVSP_STAT_FAIL in the receive completion if any one of
the packets in a vmxferpage fails.

Signed-off-by: Haiyang Zhang <haiya...@microsoft.com>
---
 drivers/net/hyperv/netvsc.c   | 8 ++--
 drivers/net/hyperv/netvsc_drv.c   | 2 +-
 drivers/net/hyperv/rndis_filter.c | 4 ++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index aa95e81af6e5..1ddb2c39b6e4 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1098,12 +1098,16 @@ static int netvsc_receive(struct net_device *ndev,
void *data = recv_buf
+ vmxferpage_packet->ranges[i].byte_offset;
u32 buflen = vmxferpage_packet->ranges[i].byte_count;
+   int ret;
 
trace_rndis_recv(ndev, q_idx, data);
 
/* Pass it to the upper layer */
-   status = rndis_filter_receive(ndev, net_device,
- channel, data, buflen);
+   ret = rndis_filter_receive(ndev, net_device,
+  channel, data, buflen);
+
+   if (unlikely(ret != NVSP_STAT_SUCCESS))
+   status = NVSP_STAT_FAIL;
}
 
enq_receive_complete(ndev, net_device, q_idx,
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index cdb78eefab67..33607995be62 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -818,7 +818,7 @@ int netvsc_recv_callback(struct net_device *net,
u64_stats_update_end(&rx_stats->syncp);
 
napi_gro_receive(&nvchan->napi, skb);
-   return 0;
+   return NVSP_STAT_SUCCESS;
 }
 
 static void netvsc_get_drvinfo(struct net_device *net,
diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index 2dc00f714482..591fb8080f11 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -443,10 +443,10 @@ int rndis_filter_receive(struct net_device *ndev,
"unhandled rndis message (type %u len %u)\n",
   rndis_msg->ndis_msg_type,
   rndis_msg->msg_len);
-   break;
+   return NVSP_STAT_FAIL;
}
 
-   return 0;
+   return NVSP_STAT_SUCCESS;
 }
 
 static int rndis_filter_query_device(struct rndis_device *dev,
-- 
2.15.1



[PATCH net-next,1/2] hv_netvsc: Fix the return status in RX path

2018-03-22 Thread Haiyang Zhang
From: Haiyang Zhang 

As defined in hyperv_net.h, NVSP_STAT_SUCCESS is one, not zero.
Some functions return 0 when they actually mean NVSP_STAT_SUCCESS.
This patch fixes them.

netvsc_receive() puts the last RNDIS packet's receive status into the
receive completion for all packets in a vmxferpage, which may contain
multiple RNDIS packets.
This patch puts NVSP_STAT_FAIL in the receive completion if any one of
the packets in a vmxferpage fails.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc.c   | 8 ++--
 drivers/net/hyperv/netvsc_drv.c   | 2 +-
 drivers/net/hyperv/rndis_filter.c | 4 ++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index aa95e81af6e5..1ddb2c39b6e4 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1098,12 +1098,16 @@ static int netvsc_receive(struct net_device *ndev,
void *data = recv_buf
+ vmxferpage_packet->ranges[i].byte_offset;
u32 buflen = vmxferpage_packet->ranges[i].byte_count;
+   int ret;
 
trace_rndis_recv(ndev, q_idx, data);
 
/* Pass it to the upper layer */
-   status = rndis_filter_receive(ndev, net_device,
- channel, data, buflen);
+   ret = rndis_filter_receive(ndev, net_device,
+  channel, data, buflen);
+
+   if (unlikely(ret != NVSP_STAT_SUCCESS))
+   status = NVSP_STAT_FAIL;
}
 
enq_receive_complete(ndev, net_device, q_idx,
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index cdb78eefab67..33607995be62 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -818,7 +818,7 @@ int netvsc_recv_callback(struct net_device *net,
u64_stats_update_end(&rx_stats->syncp);
 
napi_gro_receive(&nvchan->napi, skb);
-   return 0;
+   return NVSP_STAT_SUCCESS;
 }
 
 static void netvsc_get_drvinfo(struct net_device *net,
diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index 2dc00f714482..591fb8080f11 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -443,10 +443,10 @@ int rndis_filter_receive(struct net_device *ndev,
"unhandled rndis message (type %u len %u)\n",
   rndis_msg->ndis_msg_type,
   rndis_msg->msg_len);
-   break;
+   return NVSP_STAT_FAIL;
}
 
-   return 0;
+   return NVSP_STAT_SUCCESS;
 }
 
 static int rndis_filter_query_device(struct rndis_device *dev,
-- 
2.15.1



[PATCH net-next,0/2] hv_netvsc: Fix/improve RX path error handling

2018-03-22 Thread Haiyang Zhang
From: Haiyang Zhang <haiya...@microsoft.com>

Fix the status code returned to the host. Also add range
check for rx packet offset and length.

Haiyang Zhang (2):
  hv_netvsc: Fix the return status in RX path
  hv_netvsc: Add range checking for rx packet offset and length

 drivers/net/hyperv/hyperv_net.h   |  1 +
 drivers/net/hyperv/netvsc.c   | 25 +
 drivers/net/hyperv/netvsc_drv.c   |  2 +-
 drivers/net/hyperv/rndis_filter.c |  4 ++--
 4 files changed, 25 insertions(+), 7 deletions(-)

-- 
2.15.1



[PATCH net-next,2/2] hv_netvsc: Add range checking for rx packet offset and length

2018-03-22 Thread Haiyang Zhang
From: Haiyang Zhang <haiya...@microsoft.com>

This patch adds range checking for the rx packet offset and length.
An out-of-range value can only occur if there is a host-side bug.

Signed-off-by: Haiyang Zhang <haiya...@microsoft.com>
---
 drivers/net/hyperv/hyperv_net.h |  1 +
 drivers/net/hyperv/netvsc.c | 17 +++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 0db3bd1ea06f..49c05ac894e5 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -793,6 +793,7 @@ struct netvsc_device {
 
/* Receive buffer allocated by us but manages by NetVSP */
void *recv_buf;
+   u32 recv_buf_size; /* allocated bytes */
u32 recv_buf_gpadl_handle;
u32 recv_section_cnt;
u32 recv_section_size;
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 1ddb2c39b6e4..a6700d65f206 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -289,6 +289,8 @@ static int netvsc_init_buf(struct hv_device *device,
goto cleanup;
}
 
+   net_device->recv_buf_size = buf_size;
+
/*
 * Establish the gpadl handle for this buffer on this
 * channel.  Note: This call uses the vmbus connection rather
@@ -1095,11 +1097,22 @@ static int netvsc_receive(struct net_device *ndev,
 
/* Each range represents 1 RNDIS pkt that contains 1 ethernet frame */
for (i = 0; i < count; i++) {
-   void *data = recv_buf
-   + vmxferpage_packet->ranges[i].byte_offset;
+   u32 offset = vmxferpage_packet->ranges[i].byte_offset;
u32 buflen = vmxferpage_packet->ranges[i].byte_count;
+   void *data;
int ret;
 
+   if (unlikely(offset + buflen > net_device->recv_buf_size)) {
+   status = NVSP_STAT_FAIL;
+   netif_err(net_device_ctx, rx_err, ndev,
+ "Packet offset:%u + len:%u too big\n",
+ offset, buflen);
+
+   continue;
+   }
+
+   data = recv_buf + offset;
+
trace_rndis_recv(ndev, q_idx, data);
 
/* Pass it to the upper layer */
-- 
2.15.1



[PATCH net-next,0/2] hv_netvsc: Fix/improve RX path error handling

2018-03-22 Thread Haiyang Zhang
From: Haiyang Zhang 

Fix the status code returned to the host. Also add range
check for rx packet offset and length.

Haiyang Zhang (2):
  hv_netvsc: Fix the return status in RX path
  hv_netvsc: Add range checking for rx packet offset and length

 drivers/net/hyperv/hyperv_net.h   |  1 +
 drivers/net/hyperv/netvsc.c   | 25 +
 drivers/net/hyperv/netvsc_drv.c   |  2 +-
 drivers/net/hyperv/rndis_filter.c |  4 ++--
 4 files changed, 25 insertions(+), 7 deletions(-)

-- 
2.15.1



[PATCH net-next,2/2] hv_netvsc: Add range checking for rx packet offset and length

2018-03-22 Thread Haiyang Zhang
From: Haiyang Zhang 

This patch adds range checking for the rx packet offset and length.
An out-of-range value can only occur if there is a host-side bug.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/hyperv_net.h |  1 +
 drivers/net/hyperv/netvsc.c | 17 +++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 0db3bd1ea06f..49c05ac894e5 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -793,6 +793,7 @@ struct netvsc_device {
 
/* Receive buffer allocated by us but manages by NetVSP */
void *recv_buf;
+   u32 recv_buf_size; /* allocated bytes */
u32 recv_buf_gpadl_handle;
u32 recv_section_cnt;
u32 recv_section_size;
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 1ddb2c39b6e4..a6700d65f206 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -289,6 +289,8 @@ static int netvsc_init_buf(struct hv_device *device,
goto cleanup;
}
 
+   net_device->recv_buf_size = buf_size;
+
/*
 * Establish the gpadl handle for this buffer on this
 * channel.  Note: This call uses the vmbus connection rather
@@ -1095,11 +1097,22 @@ static int netvsc_receive(struct net_device *ndev,
 
/* Each range represents 1 RNDIS pkt that contains 1 ethernet frame */
for (i = 0; i < count; i++) {
-   void *data = recv_buf
-   + vmxferpage_packet->ranges[i].byte_offset;
+   u32 offset = vmxferpage_packet->ranges[i].byte_offset;
u32 buflen = vmxferpage_packet->ranges[i].byte_count;
+   void *data;
int ret;
 
+   if (unlikely(offset + buflen > net_device->recv_buf_size)) {
+   status = NVSP_STAT_FAIL;
+   netif_err(net_device_ctx, rx_err, ndev,
+ "Packet offset:%u + len:%u too big\n",
+ offset, buflen);
+
+   continue;
+   }
+
+   data = recv_buf + offset;
+
trace_rndis_recv(ndev, q_idx, data);
 
/* Pass it to the upper layer */
-- 
2.15.1



RE: [PATCH v3 5/6] PCI: hv: hv_pci_devices_present(): only queue a new work when necessary

2018-03-09 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, March 6, 2018 1:22 PM
> To: bhelg...@google.com; linux-...@vger.kernel.org; KY Srinivasan
> <k...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang <haiya...@microsoft.com>; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com; Michael Kelley (EOSG)
> <michael.h.kel...@microsoft.com>; Dexuan Cui <de...@microsoft.com>; Jack
> Morgenstein <ja...@mellanox.com>; sta...@vger.kernel.org
> Subject: [PATCH v3 5/6] PCI: hv: hv_pci_devices_present(): only queue a new
> work when necessary
> 
> If there is a pending work, we just need to add the new dr into the dr_list.
> 
> This is suggested by Michael Kelley.
> 
> Signed-off-by: Dexuan Cui <de...@microsoft.com>
> Cc: Vitaly Kuznetsov <vkuzn...@redhat.com>
> Cc: Jack Morgenstein <ja...@mellanox.com>
> Cc: sta...@vger.kernel.org
> Cc: Stephen Hemminger <sthem...@microsoft.com>
> Cc: K. Y. Srinivasan <k...@microsoft.com>
> Cc: Michael Kelley (EOSG) <michael.h.kel...@microsoft.com>
> ---

Acked-by: Haiyang Zhang <haiya...@microsoft.com>


RE: [PATCH v3 5/6] PCI: hv: hv_pci_devices_present(): only queue a new work when necessary

2018-03-09 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, March 6, 2018 1:22 PM
> To: bhelg...@google.com; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang ; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com; Michael Kelley (EOSG)
> ; Dexuan Cui ; Jack
> Morgenstein ; sta...@vger.kernel.org
> Subject: [PATCH v3 5/6] PCI: hv: hv_pci_devices_present(): only queue a new
> work when necessary
> 
> If there is a pending work, we just need to add the new dr into the dr_list.
> 
> This is suggested by Michael Kelley.
> 
> Signed-off-by: Dexuan Cui 
> Cc: Vitaly Kuznetsov 
> Cc: Jack Morgenstein 
> Cc: sta...@vger.kernel.org
> Cc: Stephen Hemminger 
> Cc: K. Y. Srinivasan 
> Cc: Michael Kelley (EOSG) 
> ---

Acked-by: Haiyang Zhang 


RE: [PATCH v3 6/6] PCI: hv: fix 2 hang issues in hv_compose_msi_msg()

2018-03-09 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, March 6, 2018 1:22 PM
> To: bhelg...@google.com; linux-...@vger.kernel.org; KY Srinivasan
> <k...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang <haiya...@microsoft.com>; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com; Michael Kelley (EOSG)
> <michael.h.kel...@microsoft.com>; Dexuan Cui <de...@microsoft.com>;
> sta...@vger.kernel.org; Jack Morgenstein <ja...@mellanox.com>
> Subject: [PATCH v3 6/6] PCI: hv: fix 2 hang issues in hv_compose_msi_msg()
> 
> 1. With the patch "x86/vector/msi: Switch to global reservation mode"
> (4900be8360), the recent v4.15 and newer kernels always hang for 1-vCPU
> Hyper-V VM with SR-IOV. This is because when we reach
> hv_compose_msi_msg() by request_irq()  -> request_threaded_irq() ->
> __setup_irq()->irq_startup()  -> __irq_startup() -> irq_domain_activate_irq()
> -> ... ->
> msi_domain_activate() -> ... -> hv_compose_msi_msg(), local irq is disabled in
> __setup_irq().
> 
> Fix this by polling the channel.
> 
> 2. If the host is ejecting the VF device before we reach hv_compose_msi_msg(),
> in a UP VM, we can hang in hv_compose_msi_msg() forever, because at this
> time the host doesn't respond to the CREATE_INTERRUPT request. This issue
> also happens to old kernels like v4.14, v4.13, etc.
> 
> Fix this by polling the channel for the PCI_EJECT message and
> hpdev->state, and by checking the PCI vendor ID.
> 
> Note: actually the above issues also happen to a SMP VM, if
> "hbus->hdev->channel->target_cpu == smp_processor_id()" is true.
> 
> Signed-off-by: Dexuan Cui <de...@microsoft.com>
> Tested-by: Adrian Suhov <v-ads...@microsoft.com>
> Tested-by: Chris Valean <v-chv...@microsoft.com>
> Cc: sta...@vger.kernel.org
> Cc: Stephen Hemminger <sthem...@microsoft.com>
> Cc: K. Y. Srinivasan <k...@microsoft.com>
> Cc: Vitaly Kuznetsov <vkuzn...@redhat.com>
> Cc: Jack Morgenstein <ja...@mellanox.com>
> ---

Acked-by: Haiyang Zhang <haiya...@microsoft.com>


RE: [PATCH v3 6/6] PCI: hv: fix 2 hang issues in hv_compose_msi_msg()

2018-03-09 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, March 6, 2018 1:22 PM
> To: bhelg...@google.com; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang ; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com; Michael Kelley (EOSG)
> ; Dexuan Cui ;
> sta...@vger.kernel.org; Jack Morgenstein 
> Subject: [PATCH v3 6/6] PCI: hv: fix 2 hang issues in hv_compose_msi_msg()
> 
> 1. With the patch "x86/vector/msi: Switch to global reservation mode"
> (4900be8360), the recent v4.15 and newer kernels always hang for 1-vCPU
> Hyper-V VM with SR-IOV. This is because when we reach
> hv_compose_msi_msg() by request_irq()  -> request_threaded_irq() ->
> __setup_irq()->irq_startup()  -> __irq_startup() -> irq_domain_activate_irq()
> -> ... ->
> msi_domain_activate() -> ... -> hv_compose_msi_msg(), local irq is disabled in
> __setup_irq().
> 
> Fix this by polling the channel.
> 
> 2. If the host is ejecting the VF device before we reach hv_compose_msi_msg(),
> in a UP VM, we can hang in hv_compose_msi_msg() forever, because at this
> time the host doesn't respond to the CREATE_INTERRUPT request. This issue
> also happens to old kernels like v4.14, v4.13, etc.
> 
> Fix this by polling the channel for the PCI_EJECT message and
> hpdev->state, and by checking the PCI vendor ID.
> 
> Note: actually the above issues also happen to a SMP VM, if
> "hbus->hdev->channel->target_cpu == smp_processor_id()" is true.
> 
> Signed-off-by: Dexuan Cui 
> Tested-by: Adrian Suhov 
> Tested-by: Chris Valean 
> Cc: sta...@vger.kernel.org
> Cc: Stephen Hemminger 
> Cc: K. Y. Srinivasan 
> Cc: Vitaly Kuznetsov 
> Cc: Jack Morgenstein 
> ---

Acked-by: Haiyang Zhang 


RE: [PATCH v3 4/6] PCI: hv: remove hbus->enum_sem

2018-03-09 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, March 6, 2018 1:22 PM
> To: bhelg...@google.com; linux-...@vger.kernel.org; KY Srinivasan
> <k...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang <haiya...@microsoft.com>; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com; Michael Kelley (EOSG)
> <michael.h.kel...@microsoft.com>; Dexuan Cui <de...@microsoft.com>; Jack
> Morgenstein <ja...@mellanox.com>; sta...@vger.kernel.org
> Subject: [PATCH v3 4/6] PCI: hv: remove hbus->enum_sem
> 
> Since we serialize the present/eject work items now, we don't need the
> semaphore any more.
> 
> This is suggested by Michael Kelley.
> 
> Signed-off-by: Dexuan Cui <de...@microsoft.com>
> Cc: Vitaly Kuznetsov <vkuzn...@redhat.com>
> Cc: Jack Morgenstein <ja...@mellanox.com>
> Cc: sta...@vger.kernel.org
> Cc: Stephen Hemminger <sthem...@microsoft.com>
> Cc: K. Y. Srinivasan <k...@microsoft.com>
> Cc: Michael Kelley (EOSG) <michael.h.kel...@microsoft.com>
> ---

Acked-by: Haiyang Zhang <haiya...@microsoft.com>


RE: [PATCH v3 3/6] PCI: hv: serialize the present/eject work items

2018-03-09 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, March 6, 2018 1:22 PM
> To: bhelg...@google.com; linux-...@vger.kernel.org; KY Srinivasan
> <k...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang <haiya...@microsoft.com>; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com; Michael Kelley (EOSG)
> <michael.h.kel...@microsoft.com>; Dexuan Cui <de...@microsoft.com>; Jack
> Morgenstein <ja...@mellanox.com>; sta...@vger.kernel.org
> Subject: [PATCH v3 3/6] PCI: hv: serialize the present/eject work items
> 
> When we hot-remove the device, we first receive a PCI_EJECT message and
> then receive a PCI_BUS_RELATIONS message with bus_rel->device_count == 0.
> 
> The first message is offloaded to hv_eject_device_work(), and the second is
> offloaded to pci_devices_present_work(). Both the paths can be running
> list_del(&hpdev->list_entry), causing general protection fault, because
> system_wq can run them concurrently.
> 
> The patch eliminates the race condition.
> 
> Signed-off-by: Dexuan Cui <de...@microsoft.com>
> Tested-by: Adrian Suhov <v-ads...@microsoft.com>
> Tested-by: Chris Valean <v-chv...@microsoft.com>
> Cc: Vitaly Kuznetsov <vkuzn...@redhat.com>
> Cc: Jack Morgenstein <ja...@mellanox.com>
> Cc: sta...@vger.kernel.org
> Cc: Stephen Hemminger <sthem...@microsoft.com>
> Cc: K. Y. Srinivasan <k...@microsoft.com>
> ---

Acked-by: Haiyang Zhang <haiya...@microsoft.com>


RE: [PATCH v3 4/6] PCI: hv: remove hbus->enum_sem

2018-03-09 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, March 6, 2018 1:22 PM
> To: bhelg...@google.com; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang ; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com; Michael Kelley (EOSG)
> ; Dexuan Cui ; Jack
> Morgenstein ; sta...@vger.kernel.org
> Subject: [PATCH v3 4/6] PCI: hv: remove hbus->enum_sem
> 
> Since we serialize the present/eject work items now, we don't need the
> semaphore any more.
> 
> This is suggested by Michael Kelley.
> 
> Signed-off-by: Dexuan Cui 
> Cc: Vitaly Kuznetsov 
> Cc: Jack Morgenstein 
> Cc: sta...@vger.kernel.org
> Cc: Stephen Hemminger 
> Cc: K. Y. Srinivasan 
> Cc: Michael Kelley (EOSG) 
> ---

Acked-by: Haiyang Zhang 


RE: [PATCH v3 3/6] PCI: hv: serialize the present/eject work items

2018-03-09 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, March 6, 2018 1:22 PM
> To: bhelg...@google.com; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang ; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com; Michael Kelley (EOSG)
> ; Dexuan Cui ; Jack
> Morgenstein ; sta...@vger.kernel.org
> Subject: [PATCH v3 3/6] PCI: hv: serialize the present/eject work items
> 
> When we hot-remove the device, we first receive a PCI_EJECT message and
> then receive a PCI_BUS_RELATIONS message with bus_rel->device_count == 0.
> 
> The first message is offloaded to hv_eject_device_work(), and the second is
> offloaded to pci_devices_present_work(). Both the paths can be running
> list_del(&hpdev->list_entry), causing general protection fault, because
> system_wq can run them concurrently.
> 
> The patch eliminates the race condition.
> 
> Signed-off-by: Dexuan Cui 
> Tested-by: Adrian Suhov 
> Tested-by: Chris Valean 
> Cc: Vitaly Kuznetsov 
> Cc: Jack Morgenstein 
> Cc: sta...@vger.kernel.org
> Cc: Stephen Hemminger 
> Cc: K. Y. Srinivasan 
> ---

Acked-by: Haiyang Zhang 


RE: [PATCH v3 2/6] PCI: hv: hv_eject_device_work(): remove the bogus test

2018-03-09 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, March 6, 2018 1:22 PM
> To: bhelg...@google.com; linux-...@vger.kernel.org; KY Srinivasan
> <k...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang <haiya...@microsoft.com>; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com; Michael Kelley (EOSG)
> <michael.h.kel...@microsoft.com>; Dexuan Cui <de...@microsoft.com>; Jack
> Morgenstein <ja...@mellanox.com>; sta...@vger.kernel.org
> Subject: [PATCH v3 2/6] PCI: hv: hv_eject_device_work(): remove the bogus test
> 
> When we're in the function, hpdev->state must be hv_pcichild_ejecting:
> see hv_pci_eject_device().
> 
> Signed-off-by: Dexuan Cui <de...@microsoft.com>
> Cc: Vitaly Kuznetsov <vkuzn...@redhat.com>
> Cc: Jack Morgenstein <ja...@mellanox.com>
> Cc: sta...@vger.kernel.org
> Cc: Stephen Hemminger <sthem...@microsoft.com>
> Cc: K. Y. Srinivasan <k...@microsoft.com>
> Cc: Michael Kelley (EOSG) <michael.h.kel...@microsoft.com>
> ---

Acked-by: Haiyang Zhang <haiya...@microsoft.com>


RE: [PATCH v3 1/6] PCI: hv: fix a comment typo in _hv_pcifront_read_config()

2018-03-09 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, March 6, 2018 1:22 PM
> To: bhelg...@google.com; linux-...@vger.kernel.org; KY Srinivasan
> <k...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang <haiya...@microsoft.com>; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com; Michael Kelley (EOSG)
> <michael.h.kel...@microsoft.com>; Dexuan Cui <de...@microsoft.com>;
> sta...@vger.kernel.org
> Subject: [PATCH v3 1/6] PCI: hv: fix a comment typo in
> _hv_pcifront_read_config()
> 
> No functional change.
> 
> Signed-off-by: Dexuan Cui <de...@microsoft.com>
> Fixes: bdd74440d9e8 ("PCI: hv: Add explicit barriers to config space access")
> Cc: Vitaly Kuznetsov <vkuzn...@redhat.com>
> Cc: sta...@vger.kernel.org
> Cc: Stephen Hemminger <sthem...@microsoft.com>
> Cc: K. Y. Srinivasan <k...@microsoft.com>
> ---

Acked-by: Haiyang Zhang <haiya...@microsoft.com>



RE: [PATCH v3 2/6] PCI: hv: hv_eject_device_work(): remove the bogus test

2018-03-09 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, March 6, 2018 1:22 PM
> To: bhelg...@google.com; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang ; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com; Michael Kelley (EOSG)
> ; Dexuan Cui ; Jack
> Morgenstein ; sta...@vger.kernel.org
> Subject: [PATCH v3 2/6] PCI: hv: hv_eject_device_work(): remove the bogus test
> 
> When we're in the function, hpdev->state must be hv_pcichild_ejecting:
> see hv_pci_eject_device().
> 
> Signed-off-by: Dexuan Cui 
> Cc: Vitaly Kuznetsov 
> Cc: Jack Morgenstein 
> Cc: sta...@vger.kernel.org
> Cc: Stephen Hemminger 
> Cc: K. Y. Srinivasan 
> Cc: Michael Kelley (EOSG) 
> ---

Acked-by: Haiyang Zhang 


RE: [PATCH v3 1/6] PCI: hv: fix a comment typo in _hv_pcifront_read_config()

2018-03-09 Thread Haiyang Zhang


> -Original Message-
> From: Dexuan Cui
> Sent: Tuesday, March 6, 2018 1:22 PM
> To: bhelg...@google.com; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; a...@canonical.com; jasow...@redhat.com
> Cc: linux-kernel@vger.kernel.org; driverdev-de...@linuxdriverproject.org;
> Haiyang Zhang ; vkuzn...@redhat.com;
> marcelo.ce...@canonical.com; Michael Kelley (EOSG)
> ; Dexuan Cui ;
> sta...@vger.kernel.org
> Subject: [PATCH v3 1/6] PCI: hv: fix a comment typo in
> _hv_pcifront_read_config()
> 
> No functional change.
> 
> Signed-off-by: Dexuan Cui 
> Fixes: bdd74440d9e8 ("PCI: hv: Add explicit barriers to config space access")
> Cc: Vitaly Kuznetsov 
> Cc: sta...@vger.kernel.org
> Cc: Stephen Hemminger 
> Cc: K. Y. Srinivasan 
> ---

Acked-by: Haiyang Zhang 



RE: Any known soft lockup issue with vfs_write()->fsnotify()?

2018-03-08 Thread Haiyang Zhang
There was another report of the same issue on CoreOS, 4.14.11-coreos. The
host/guest is AWS G4. So the problem is not limited to Azure VMs. It doesn't
happen on older kernels like 4.4. Maybe the problem is related to some (recent)
changes to fsnotify or other fs code?

Soft lockup kernel panic reboot on AWS instance on fsnotify and vfs_write  #2356
https://github.com/coreos/bugs/issues/2356

Thanks,
- Haiyang

> -Original Message-
> From: Jan Kara <j...@suse.cz>
> Sent: Monday, March 5, 2018 3:49 PM
> To: Dexuan Cui <de...@microsoft.com>
> Cc: linux-fsde...@vger.kernel.org; Jan Kara <j...@suse.cz>; Amir Goldstein
> <amir7...@gmail.com>; Miklos Szeredi <mszer...@redhat.com>; Haiyang
> Zhang <haiya...@microsoft.com>; 'linux-kernel@vger.kernel.org' <linux-kernel@vger.kernel.org>; Jork Loeser <jork.loe...@microsoft.com>
> Subject: Re: Any known soft lockup issue with vfs_write()->fsnotify()?
> 
> Hi!
> 
> On Fri 02-03-18 22:28:50, Dexuan Cui wrote:
> > Recently people are getting a soft lock issue with vfs_write()->fsnotify().
> > The detailed calltrace is available at:
> > https://github.com/coreos/bugs/issues/2356
> > https://github.com/coreos/bugs/issues/2364
> 
> I didn't see them yet.
> 
> > The kernel versions showing up the issue are:
> > 4.14.11-coreos
> > 4.14.19-coreos
> > 4.13.0-1009 -- this is the kernel with which I'm personally seeing the 
> > lockup.
> >
> > I have not got a chance to try the latest mainline kernel yet.
> 
> It would be good to try 4.15 kernel to see whether recent fixes from Miklos
> didn't fix your problem. They should be present in 4.14.11/19 kernels as well
> but one never knows...
> 
> > Before the lockup error message suddenly appears, Linux has been
> > running fine for many hours.  I have NOT found a consistent way to
> > reproduce the lockup yet.
> >
> > Looks the kernel is stuck in fsnotify(), when it tries to get the
> > fsnotify_mark_srcu lock.
> 
> It is not possible that we would 'hang' in srcu_read_lock() - that is just a
> read of one variable and increment of another. We'd have to be looping somewhere
> and watchdog would have to happen to hit us always at that place. Weird. Are
> you sure RIP points to srcu_read_lock?
> 
> > "git log fs/notify/fsnotify.c" on the latest mainline shows that some
> > recent patches might help.
> >
> > I'd like to check if this is a known issue.
> 
> As I've mentioned above, so far I didn't see reports like this...
> 
>   Honza
> --
> Jan Kara <j...@suse.com>
> SUSE Labs, CR


RE: Any known soft lockup issue with vfs_write()->fsnotify()?

2018-03-08 Thread Haiyang Zhang
There was another report of the same issue on CoreOS, 4.14.11-coreos. The 
host/guest is AWS G4. So the problem is not limited to Azure VMs. It doesn't 
happen on older kernel like 4.4. Maybe the problem is related to some (recent) 
changes on fsnotify or other fs code?

Soft lockup kernel panic reboot on AWS instance on fsnotify and vfs_write  #2356
https://github.com/coreos/bugs/issues/2356

Thanks,
- Haiyang

> -Original Message-
> From: Jan Kara 
> Sent: Monday, March 5, 2018 3:49 PM
> To: Dexuan Cui 
> Cc: linux-fsde...@vger.kernel.org; Jan Kara ; Amir Goldstein
> ; Miklos Szeredi ; Haiyang
> Zhang ; 'linux-kernel@vger.kernel.org'  ker...@vger.kernel.org>; Jork Loeser 
> Subject: Re: Any known soft lockup issue with vfs_write()->fsnotify()?
> 
> Hi!
> 
> On Fri 02-03-18 22:28:50, Dexuan Cui wrote:
> > Recently people are getting a soft lock issue with vfs_write()->fsnotify().
> > The detailed calltrace is available at:
> > https://github.com/coreos/bugs/issues/2356
> > https://github.com/coreos/bugs/issues/2364
> 
> I didn't see them yet.
> 
> > The kernel versions showing up the issue are:
> > 4.14.11-coreos
> > 4.14.19-coreos
> > 4.13.0-1009 -- this is the kernel with which I'm personally seeing the 
> > lockup.
> >
> > I have not got a chance to try the latest mainline kernel yet.
> 
> It would be good to try 4.15 kernel to see whether recent fixes from Miklos
> didn't fix your problem. They should be present in 4.14.11/19 kernels as well
> but one never knows...
> 
> > Before the lockup error message suddenly appears, Linux has been
> > running fine for many hours.  I have NOT found a consistent way to
> > reproduce the lockup yet.
> >
> > Looks the kernel is stuck in fsnotify(), when it tries to get the
> > fsnotify_mark_srcu lock.
> 
> It is not possible that we would 'hang' in srcu_read_lock() - that is just a 
> read of
> one variable and increment of another. We'd have to be looping somewhere
> and watchdog would have to happen to hit us always at that place. Weird. Are
> you sure RIP points to srcu_read_lock?
> 
> > "git log fs/notify/fsnotify.c" on the latest mainline shows that some
> > recent patches might help.
> >
> > I'd like to check if this is a known issue.
> 
> As I've mentioned above, so far I didn't see reports like this...
> 
>   Honza
> --
> Jan Kara 
> SUSE Labs, CR


[PATCH] hv_vmbus: Correct the stale comments regarding cpu affinity

2018-01-26 Thread Haiyang Zhang
The comments don't match what the current code does, and also have a
typo. This patch corrects them.

Signed-off-by: Haiyang Zhang <haiya...@microsoft.com>
---
 drivers/hv/channel_mgmt.c | 6 ++
 include/linux/hyperv.h| 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index c21020b69114..c6d9d19bc04e 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -596,10 +596,8 @@ static int next_numa_node_id;
 /*
  * Starting with Win8, we can statically distribute the incoming
  * channel interrupt load by binding a channel to VCPU.
- * We do this in a hierarchical fashion:
- * First distribute the primary channels across available NUMA nodes
- * and then distribute the subchannels amongst the CPUs in the NUMA
- * node assigned to the primary channel.
+ * We distribute the interrupt loads to one or more NUMA nodes based on
+ * the channel's affinity_policy.
  *
  * For pre-win8 hosts or non-performance critical channels we assign the
  * first CPU in the first NUMA node.
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 93bd6fcd6e62..2048f3c3b68a 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -844,7 +844,7 @@ struct vmbus_channel {
 
/*
 * NUMA distribution policy:
-* We support teo policies:
+* We support two policies:
 * 1) Balanced: Here all performance critical channels are
 *distributed evenly amongst all the NUMA nodes.
 *This policy will be the default policy.
-- 
2.15.1


<    1   2   3   4   5   6   7   8   9   >