Thu, Mar 01, 2018 at 09:08:43PM CET, sridhar.samudr...@intel.com wrote:
>This patch enables virtio_net to switch over to a VF datapath when a VF
>netdev is present with the same MAC address. It allows live migration
>of a VM with a direct attached VF without the need to setup a bond/team
>between a VF and virtio net device in the guest.
>
>The hypervisor needs to enable only one datapath at any time so that
>packets don't get looped back to the VM over the other datapath. When a VF
>is plugged, the virtio datapath link state can be marked as down. The
>hypervisor needs to unplug the VF device from the guest on the source host
>and reset the MAC filter of the VF to initiate failover of datapath to
>virtio before starting the migration. After the migration is completed,
>the destination hypervisor sets the MAC filter on the VF and plugs it back
>to the guest to switch over to VF datapath.
>
>When BACKUP feature is enabled, an additional netdev (bypass netdev) is
>created that acts as a master device and tracks the state of the 2 lower
>netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
>passthru device with the same MAC is registered as 'active' netdev.
>
>This patch is based on the discussion initiated by Jesse on this thread.
>https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>
>Signed-off-by: Sridhar Samudrala <sridhar.samudr...@intel.com>
>Signed-off-by: Alexander Duyck <alexander.h.du...@intel.com>
>Reviewed-by: Jesse Brandeburg <jesse.brandeb...@intel.com>

[...]


>+static int virtnet_bypass_register_child(struct net_device *child_netdev)
>+{
>+      struct virtnet_bypass_info *vbi;
>+      struct net_device *dev;
>+      bool backup;
>+      int ret;
>+
>+      if (child_netdev->addr_len != ETH_ALEN)
>+              return NOTIFY_DONE;
>+
>+      /* We will use the MAC address to locate the virtnet_bypass netdev
>+       * to associate with the child netdev. If we don't find a matching
>+       * bypass netdev, move on.
>+       */
>+      dev = get_virtnet_bypass_bymac(dev_net(child_netdev),
>+                                     child_netdev->perm_addr);
>+      if (!dev)
>+              return NOTIFY_DONE;
>+
>+      vbi = netdev_priv(dev);
>+      backup = (child_netdev->dev.parent == dev->dev.parent);
>+      if (backup ? rtnl_dereference(vbi->backup_netdev) :
>+                      rtnl_dereference(vbi->active_netdev)) {
>+              netdev_info(dev,
>+                          "%s attempting to join bypass dev when %s already 
>present\n",
>+                          child_netdev->name, backup ? "backup" : "active");
>+              return NOTIFY_DONE;
>+      }
>+
>+      /* Avoid non pci devices as active netdev */
>+      if (!backup && (!child_netdev->dev.parent ||
>+                      !dev_is_pci(child_netdev->dev.parent)))
>+              return NOTIFY_DONE;
>+
>+      ret = netdev_rx_handler_register(child_netdev,
>+                                       virtnet_bypass_handle_frame, dev);
>+      if (ret != 0) {
>+              netdev_err(child_netdev,
>+                         "can not register bypass receive handler (err = 
>%d)\n",
>+                         ret);
>+              goto rx_handler_failed;
>+      }
>+
>+      ret = netdev_upper_dev_link(child_netdev, dev, NULL);
>+      if (ret != 0) {
>+              netdev_err(child_netdev,
>+                         "can not set master device %s (err = %d)\n",
>+                         dev->name, ret);
>+              goto upper_link_failed;
>+      }
>+
>+      child_netdev->flags |= IFF_SLAVE;
>+
>+      if (netif_running(dev)) {
>+              ret = dev_open(child_netdev);
>+              if (ret && (ret != -EBUSY)) {
>+                      netdev_err(dev, "Opening child %s failed ret:%d\n",
>+                                 child_netdev->name, ret);
>+                      goto err_interface_up;
>+              }
>+      }

Much of this function is copy of netvsc_vf_join, should be shared with
netvsc.


>+
>+      /* Align MTU of child with master */
>+      ret = dev_set_mtu(child_netdev, dev->mtu);
>+      if (ret) {
>+              netdev_err(dev,
>+                         "unable to change mtu of %s to %u register failed\n",
>+                         child_netdev->name, dev->mtu);
>+              goto err_set_mtu;
>+      }
>+
>+      call_netdevice_notifiers(NETDEV_JOIN, child_netdev);
>+
>+      netdev_info(dev, "registering %s\n", child_netdev->name);
>+
>+      dev_hold(child_netdev);
>+      if (backup) {
>+              rcu_assign_pointer(vbi->backup_netdev, child_netdev);
>+              dev_get_stats(vbi->backup_netdev, &vbi->backup_stats);
>+      } else {
>+              rcu_assign_pointer(vbi->active_netdev, child_netdev);
>+              dev_get_stats(vbi->active_netdev, &vbi->active_stats);
>+              dev->min_mtu = child_netdev->min_mtu;
>+              dev->max_mtu = child_netdev->max_mtu;
>+      }
>+
>+      return NOTIFY_OK;
>+
>+err_set_mtu:
>+      dev_close(child_netdev);
>+err_interface_up:
>+      netdev_upper_dev_unlink(child_netdev, dev);
>+      child_netdev->flags &= ~IFF_SLAVE;
>+upper_link_failed:
>+      netdev_rx_handler_unregister(child_netdev);
>+rx_handler_failed:
>+      return NOTIFY_DONE;
>+}
>+
>+static int virtnet_bypass_unregister_child(struct net_device *child_netdev)
>+{
>+      struct virtnet_bypass_info *vbi;
>+      struct net_device *dev, *backup;
>+
>+      dev = get_virtnet_bypass_byref(child_netdev);
>+      if (!dev)
>+              return NOTIFY_DONE;
>+
>+      vbi = netdev_priv(dev);
>+
>+      netdev_info(dev, "unregistering %s\n", child_netdev->name);
>+
>+      netdev_rx_handler_unregister(child_netdev);
>+      netdev_upper_dev_unlink(child_netdev, dev);
>+      child_netdev->flags &= ~IFF_SLAVE;
>+
>+      if (child_netdev->dev.parent == dev->dev.parent) {
>+              RCU_INIT_POINTER(vbi->backup_netdev, NULL);
>+      } else {
>+              RCU_INIT_POINTER(vbi->active_netdev, NULL);
>+              backup = rtnl_dereference(vbi->backup_netdev);
>+              if (backup) {
>+                      dev->min_mtu = backup->min_mtu;
>+                      dev->max_mtu = backup->max_mtu;
>+              }
>+      }
>+
>+      dev_put(child_netdev);
>+
>+      return NOTIFY_OK;
>+}
>+
>+static int virtnet_bypass_update_link(struct net_device *child_netdev)
>+{
>+      struct net_device *dev, *active, *backup;
>+      struct virtnet_bypass_info *vbi;
>+
>+      dev = get_virtnet_bypass_byref(child_netdev);
>+      if (!dev || !netif_running(dev))
>+              return NOTIFY_DONE;
>+
>+      vbi = netdev_priv(dev);
>+
>+      active = rtnl_dereference(vbi->active_netdev);
>+      backup = rtnl_dereference(vbi->backup_netdev);
>+
>+      if ((active && virtnet_bypass_xmit_ready(active)) ||
>+          (backup && virtnet_bypass_xmit_ready(backup))) {
>+              netif_carrier_on(dev);
>+              netif_tx_wake_all_queues(dev);
>+      } else {
>+              netif_carrier_off(dev);
>+              netif_tx_stop_all_queues(dev);
>+      }
>+
>+      return NOTIFY_OK;
>+}
>+
>+static int virtnet_bypass_event(struct notifier_block *this,
>+                              unsigned long event, void *ptr)
>+{
>+      struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
>+
>+      /* Skip our own events */
>+      if (event_dev->netdev_ops == &virtnet_bypass_netdev_ops)
>+              return NOTIFY_DONE;
>+
>+      /* Avoid non-Ethernet type devices */
>+      if (event_dev->type != ARPHRD_ETHER)
>+              return NOTIFY_DONE;
>+
>+      /* Avoid Vlan dev with same MAC registering as child dev */
>+      if (is_vlan_dev(event_dev))
>+              return NOTIFY_DONE;
>+
>+      /* Avoid Bonding master dev with same MAC registering as child dev */
>+      if ((event_dev->priv_flags & IFF_BONDING) &&
>+          (event_dev->flags & IFF_MASTER))
>+              return NOTIFY_DONE;
>+
>+      switch (event) {
>+      case NETDEV_REGISTER:
>+              return virtnet_bypass_register_child(event_dev);
>+      case NETDEV_UNREGISTER:
>+              return virtnet_bypass_unregister_child(event_dev);
>+      case NETDEV_UP:
>+      case NETDEV_DOWN:
>+      case NETDEV_CHANGE:
>+              return virtnet_bypass_update_link(event_dev);
>+      default:
>+              return NOTIFY_DONE;
>+      }
>+}

For example this function is a 1:1 copy of netvsc's, even with the same
comments and bugs (like the IFF_BONDING check).

This is also something that should be shared with netvsc.


>+
>+static struct notifier_block virtnet_bypass_notifier = {
>+      .notifier_call = virtnet_bypass_event,
>+};
>+
>+static int virtnet_bypass_create(struct virtnet_info *vi)
>+{
>+      struct net_device *backup_netdev = vi->dev;
>+      struct device *dev = &vi->vdev->dev;
>+      struct net_device *bypass_netdev;
>+      int res;
>+
>+      /* Alloc at least 2 queues, for now we are going with 16 assuming
>+       * that most devices being bonded won't have too many queues.
>+       */
>+      bypass_netdev = alloc_etherdev_mq(sizeof(struct virtnet_bypass_info),
>+                                        16);
>+      if (!bypass_netdev) {
>+              dev_err(dev, "Unable to allocate bypass_netdev!\n");
>+              return -ENOMEM;
>+      }
>+
>+      dev_net_set(bypass_netdev, dev_net(backup_netdev));
>+      SET_NETDEV_DEV(bypass_netdev, dev);
>+
>+      bypass_netdev->netdev_ops = &virtnet_bypass_netdev_ops;
>+      bypass_netdev->ethtool_ops = &virtnet_bypass_ethtool_ops;
>+
>+      /* Initialize the device options */
>+      bypass_netdev->flags |= IFF_MASTER;
>+      bypass_netdev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT |

No clue why you set IFF_BONDING here...





Reply via email to