[ovs-dev] [PATCH per-port ingress scheduling 2/2] ingress scheduling: Provide per interface ingress priority

2018-08-28 Thread Billy O'Mahony
Allow configuration to specify an ingress priority for interfaces.
Modify dpif-netdev datapath to act on this configuration so that packets
on interfaces with a higher priority will tend be processed ahead of
packets on lower priority interfaces.  This protects traffic on higher
priority interfaces from packet loss as PMDs get overloaded.

Signed-off-by: Billy O'Mahony 
---
 include/openvswitch/ofp-parse.h |   3 +
 lib/dpif-netdev.c   | 188 +---
 lib/netdev-dpdk.c   |  10 +++
 3 files changed, 170 insertions(+), 31 deletions(-)

diff --git a/include/openvswitch/ofp-parse.h b/include/openvswitch/ofp-parse.h
index 3fdd468..d77ab8f 100644
--- a/include/openvswitch/ofp-parse.h
+++ b/include/openvswitch/ofp-parse.h
@@ -33,6 +33,9 @@ extern "C" {
 struct match;
 struct mf_field;
 struct ofputil_port_map;
+struct tun_table;
+struct flow_wildcards;
+struct ofputil_port_map;
 
 struct ofp_protocol {
 const char *name;
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 807a462..3ed8e09 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -42,6 +43,7 @@
 #include "dpif.h"
 #include "dpif-netdev-perf.h"
 #include "dpif-provider.h"
+#include "netdev-provider.h"
 #include "dummy.h"
 #include "fat-rwlock.h"
 #include "flow.h"
@@ -49,7 +51,6 @@
 #include "id-pool.h"
 #include "latch.h"
 #include "netdev.h"
-#include "netdev-provider.h"
 #include "netdev-vport.h"
 #include "netlink.h"
 #include "odp-execute.h"
@@ -460,6 +461,7 @@ struct dp_netdev_port {
 struct ovs_mutex txq_used_mutex;
 char *type; /* Port type as requested by user. */
 char *rxq_affinity_list;/* Requested affinity of rx queues. */
+int ingress_prio;   /* 0 lowest to 3 highest. Default 0. */
 };
 
 /* Contained by struct dp_netdev_flow's 'stats' member.  */
@@ -572,6 +574,7 @@ static void dp_netdev_actions_free(struct dp_netdev_actions 
*);
 struct polled_queue {
 struct dp_netdev_rxq *rxq;
 odp_port_t port_no;
+uint8_t max_reads;
 };
 
 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
@@ -711,6 +714,10 @@ struct dpif_netdev {
 uint64_t last_port_seq;
 };
 
+static int
+dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
+   struct dp_netdev_rxq *rxq,
+   odp_port_t port_no);
 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
   struct dp_netdev_port **portp)
 OVS_REQUIRES(dp->port_mutex);
@@ -3847,6 +3854,36 @@ exit:
 return error;
 }
 
+static void
+set_need_reload_on_all_pmds_for_port(struct dp_netdev *dp, odp_port_t port_no)
+{
+/* Check each pmd to see if it is reading a queue belonging to
+   port_no and if so set need_reload of that pmd */
+struct dp_netdev_pmd_thread *pmd;
+CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+struct rxq_poll *poll;
+HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
+if (poll->rxq->port->port_no == port_no) {
+pmd->need_reload = true;
+}
+}
+}
+}
+
+static void
+reload_affected_pmds(struct dp_netdev *dp)
+{
+struct dp_netdev_pmd_thread *pmd;
+
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+if (pmd->need_reload) {
+flow_mark_flush(pmd);
+dp_netdev_reload_pmd__(pmd);
+pmd->need_reload = false;
+}
+}
+}
+
 /* Changes the affinity of port's rx queues.  The changes are actually applied
  * in dpif_netdev_run(). */
 static int
@@ -3859,20 +3896,41 @@ dpif_netdev_port_set_config(struct dpif *dpif, 
odp_port_t port_no,
 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
 
 ovs_mutex_lock(&dp->port_mutex);
+
 error = get_port_by_number(dp, port_no, &port);
-if (error || !netdev_is_pmd(port->netdev)
-|| nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
+if (error || !netdev_is_pmd(port->netdev)) {
 goto unlock;
 }
 
-error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
-if (error) {
-goto unlock;
+if (!nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
+error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
+if (!error) {
+free(port->rxq_affinity_list);
+port->rxq_affinity_list = nullable_xstrdup(affinity_list);
+dp_netdev_request_reconfigure(dp);
+}
+}
+
+const char *port_prio_str = smap_get(cfg, "port_prio");
+uint8_t port_prio;
+char *mallocd_err_str; /* str_to_x mallocs a str we'll need to free */
+if (port_prio_str) {
+

[ovs-dev] [PATCH per-port ingress scheduling 1/2] ingress scheduling: documentation

2018-08-28 Thread Billy O'Mahony
Signed-off-by: Billy O'Mahony 
---
 Documentation/howto/dpdk.rst | 15 +++
 vswitchd/vswitch.xml | 15 +++
 2 files changed, 30 insertions(+)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index ab3d576..83284e7 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -360,6 +360,21 @@ devices to bridge ``br0``. Once complete, follow the below 
steps:
 
$ cat /proc/interrupts | grep virtio
 
+Ingress Scheduling
+------------------
+
+The ingress scheduling feature is described in general in
+``ovs-vswitchd.conf.db (5)``.
+
+Ingress scheduling currently supports setting a priority for incoming packets
+for an entire interface. Priority levels 0 (lowest) to 3 (highest) are
+supported.  The default priority is 0.
+
+To prioritize packets on a particular port:
+
+$ ovs-vsctl set Interface dpdk0 \
+ingress_sched=port_prio=3
+
 .. _dpdk-flow-hardware-offload:
 
 Flow Hardware Offload (Experimental)
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 71bbe95..e88a69a 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -3196,6 +3196,21 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch 
options:peer=p1 \
   
 
 
+
+  
+   Configuration to allow received traffic to be prioritized on a per Interface
+   basis.
+  
+  
+
+  The ingress priority of the port: 0 (lowest) to 3 (highest). Higher
+  priority ports are read more frequently than lower priority ports.
+  This provides enhanced protection to packets ingressing high priority
+  ports against being dropped due to Rx queue overflow.
+
+  
+
+
 
   
 BFD, defined in RFC 5880 and RFC 5881, allows point-to-point
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH per-port ingress scheduling 0/2]

2018-08-28 Thread Billy O'Mahony
Hi All,

I've updated the patch to account for two sets of comments on the RFCv2 see
history below.

This patch set implements the 'preferential read' part of the feature of
ingress scheduling described at OvS 2017 Fall Conference
https://www.slideshare.net/LF_OpenvSwitch/lfovs17ingress-scheduling-82280320.

It allows configuration to specify an ingress priority for an entire
interface. This protects traffic on higher priority interfaces from loss and
latency as PMDs get overloaded.

Results for physical interfaces are excellent - higher priority ports suffer
much less loss:

Phy i/f:   |dpdk_0 dpdk_1 dpdk_2 dpdk_3
% Total Load   |   25%25%25%25%
Priority (3=Hi)| 0  1  2  3
---+---
Total Offered  |
Load (kpps)| Pkt Loss (kpps)
---
2100   | 0  0  0  0
2300   |23  0  0  0
2500   |   308  0  0  0
2900   |   628 24  0  0
3400   |   811370  8  0
3500   |   821391 52  0
4000   |   964565238 20

This also holds true to a great extent when the 'priority' port is carrying
most of the traffic:

Phy i/f:   |dpdk_0 dpdk_1 dpdk_2 dpdk_3
% Total Load   |   10%20%30%40%
Priority (3=Hi)| 0  1  2  3
---+---
Total Offered  |
Load (kpps)| Pkt Loss (kpps)
---
2300   | 8  0  0  0
2500   |   181  0  0  0
2550   |   213 13  0  0
2620   |   223 63  0  9
2700   |   230 82 10 52
3000   |   262143101172
3500   |   310242249370
4000   |   361341398569

For vhostuser ports VMs running iperf3 (TCP) benefit to an appreciable extent
from being on a 'priority' ports - without a drop
in overall throughput.

Scenario: 3 VM-pairs running iperf3 (baseline)
-
VM pair  | 1,23,45,6
priority |   0  0  0
Tput (Gbit/s)| 3.33.33.3

Scenario: 3 VM-pairs running iperf3 (one pair prioritized)
--
VM pair  | 1,23,45,6
priority |   0  0  0
Tput (Gbit/s)| 2.72.74.6

History:

v1:
* the configuration in only in dpif-netdev and will work with any polled
  netdev's not just dpdk netdevs.
* re-configuration of the priorities at run-time is supported.
* keep configuration in Interfaces other_config
* applies cleanly on 9b4f08c

RFCv2:
* Keep ingress prio config in netdev base rather than in each netdev type.
* Account for differing rxq lengths
* Applies cleanly to 4299145

RFCv1:
Initial version.


Billy O'Mahony (2):
  ingress scheduling: documentation
  ingress scheduling: Provide per interface ingress priority

 Documentation/howto/dpdk.rst|  15 
 include/openvswitch/ofp-parse.h |   3 +
 lib/dpif-netdev.c   | 188 +---
 lib/netdev-dpdk.c   |  10 +++
 vswitchd/vswitch.xml|  15 
 5 files changed, 200 insertions(+), 31 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3] OVS-DPDK: Change "dpdk-socket-mem" default value.

2018-05-09 Thread Billy O'Mahony
From: Marcin Rybka <marcinx.ry...@intel.com>

When "dpdk-socket-mem" and "dpdk-alloc-mem" are not specified,
"dpdk-socket-mem" will be set to allocate 1024MB on each NUMA node.
This change will prevent OVS from failing when NIC is attached on
NUMA node 1 and higher. Patch contains documentation update.

Signed-off-by: Marcin Rybka <marcinx.ry...@intel.com>
Co-authored-by: Billy O'Mahony <billy.o.mah...@intel.com>
Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 Documentation/intro/install/dpdk.rst |  3 ++-
 lib/dpdk.c   | 28 +++-
 vswitchd/vswitch.xml |  7 ---
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index fea4890..b68438d 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -228,7 +228,8 @@ listed below. Defaults will be provided for all values not 
explicitly set.
 
 ``dpdk-socket-mem``
   Comma separated list of memory to pre-allocate from hugepages on specific
-  sockets.
+  sockets. If not specified, 1024 MB will be set for each numa node by
+  default.
 
 ``dpdk-hugepage-dir``
   Directory where hugetlbfs is mounted
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 00dd974..40aa20f 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -35,6 +35,7 @@
 #include "netdev-dpdk.h"
 #include "openvswitch/dynamic-string.h"
 #include "openvswitch/vlog.h"
+#include "ovs-numa.h"
 #include "smap.h"
 
 VLOG_DEFINE_THIS_MODULE(dpdk);
@@ -163,6 +164,28 @@ construct_dpdk_options(const struct smap *ovs_other_config,
 return ret;
 }
 
+static char *
+construct_dpdk_socket_mem(void)
+{
+int numa;
+const char *def_value = "1024";
+int numa_nodes = ovs_numa_get_n_numas();
+
+if (numa_nodes == 0 || numa_nodes == OVS_NUMA_UNSPEC) {
+numa_nodes = 1;
+}
+/* Allocate enough memory for digits, comma-sep and terminator. */
+char *dpdk_socket_mem = xzalloc(numa_nodes * (strlen(def_value) + 1));
+
+strcat(dpdk_socket_mem, def_value);
+for (numa = 1; numa < numa_nodes; ++numa) {
+strcat(dpdk_socket_mem, ",");
+strcat(dpdk_socket_mem, def_value);
+}
+
+return dpdk_socket_mem;
+}
+
 #define MAX_DPDK_EXCL_OPTS 10
 
 static int
@@ -170,6 +193,7 @@ construct_dpdk_mutex_options(const struct smap 
*ovs_other_config,
  char ***argv, const int initial_size,
  char **extra_args, const size_t extra_argc)
 {
+char *default_dpdk_socket_mem = construct_dpdk_socket_mem();
 struct dpdk_exclusive_options_map {
 const char *category;
 const char *ovs_dpdk_options[MAX_DPDK_EXCL_OPTS];
@@ -180,7 +204,7 @@ construct_dpdk_mutex_options(const struct smap 
*ovs_other_config,
 {"memory type",
  {"dpdk-alloc-mem", "dpdk-socket-mem", NULL,},
  {"-m", "--socket-mem",NULL,},
- "1024,0", 1
+ default_dpdk_socket_mem, 1
 },
 };
 
@@ -227,6 +251,8 @@ construct_dpdk_mutex_options(const struct smap 
*ovs_other_config,
 }
 }
 
+free(default_dpdk_socket_mem);
+
 return ret;
 }
 
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 9c2a826..d74f28b 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -302,9 +302,10 @@
 
 
   If dpdk-socket-mem and dpdk-alloc-mem are not specified, 
dpdk-socket-mem
-  will be used and the default value is 1024,0. If dpdk-socket-mem and
-  dpdk-alloc-mem are specified at same time, dpdk-socket-mem will be
-  used as default. Changing this value requires restarting the daemon.
+  will be used and the default value is 1024 for each numa node. If
+  dpdk-socket-mem and dpdk-alloc-mem are specified at same time,
+  dpdk-socket-mem will be used as default. Changing this value
+  requires restarting the daemon.
 
   
 
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3] OVS-DPDK: Change "dpdk-socket-mem" default value

2018-05-09 Thread Billy O'Mahony
Hi All,

This change is the outcome of this discussion:
https://mail.openvswitch.org/pipermail/ovs-discuss/2017-April/044110.html.

I am posting this on behalf of Marcin who is the author but no longer works for
Intel.  I made a few changes hence the double sign-off but please remove my sig
if required.

Regards,
Billy.

v1 -> v2 Remove some debug logs
v2 -> v3 Add Signed-off-by tag for co-author

Marcin Rybka (1):
  OVS-DPDK: Change "dpdk-socket-mem" default value.

 Documentation/intro/install/dpdk.rst |  3 ++-
 lib/dpdk.c   | 28 +++-
 vswitchd/vswitch.xml |  7 ---
 3 files changed, 33 insertions(+), 5 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2] OVS-DPDK: Change "dpdk-socket-mem" default value.

2018-05-08 Thread Billy O'Mahony
From: Marcin Rybka <marcinx.ry...@intel.com>

When "dpdk-socket-mem" and "dpdk-alloc-mem" are not specified,
"dpdk-socket-mem" will be set to allocate 1024MB on each NUMA node.
This change will prevent OVS from failing when NIC is attached on
NUMA node 1 and higher. Patch contains documentation update.

Signed-off-by: Marcin Rybka <marcinx.ry...@intel.com>
Co-authored-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 Documentation/intro/install/dpdk.rst |  3 ++-
 lib/dpdk.c   | 29 -
 vswitchd/vswitch.xml |  7 ---
 3 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index fea4890..b68438d 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -228,7 +228,8 @@ listed below. Defaults will be provided for all values not 
explicitly set.
 
 ``dpdk-socket-mem``
   Comma separated list of memory to pre-allocate from hugepages on specific
-  sockets.
+  sockets. If not specified, 1024 MB will be set for each numa node by
+  default.
 
 ``dpdk-hugepage-dir``
   Directory where hugetlbfs is mounted
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 00dd974..733c67d 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -35,6 +35,7 @@
 #include "netdev-dpdk.h"
 #include "openvswitch/dynamic-string.h"
 #include "openvswitch/vlog.h"
+#include "ovs-numa.h"
 #include "smap.h"
 
 VLOG_DEFINE_THIS_MODULE(dpdk);
@@ -163,6 +164,29 @@ construct_dpdk_options(const struct smap *ovs_other_config,
 return ret;
 }
 
+static char *
+construct_dpdk_socket_mem(void)
+{
+int numa = 0;
+const char *def_value = "1024";
+int numa_nodes = ovs_numa_get_n_numas();
+
+if (numa_nodes == 0 || numa_nodes == OVS_NUMA_UNSPEC) {
+numa_nodes = 1;
+}
+
+/* Allocate enough memory for digits, comma-sep and terminator. */
+char *dpdk_socket_mem = xzalloc(numa_nodes * (strlen(def_value) + 1));
+
+strcat(dpdk_socket_mem, def_value);
+for (numa = 1; numa < numa_nodes; ++numa) {
+strcat(dpdk_socket_mem, ",");
+strcat(dpdk_socket_mem, def_value);
+}
+
+return dpdk_socket_mem;
+}
+
 #define MAX_DPDK_EXCL_OPTS 10
 
 static int
@@ -170,6 +194,7 @@ construct_dpdk_mutex_options(const struct smap 
*ovs_other_config,
  char ***argv, const int initial_size,
  char **extra_args, const size_t extra_argc)
 {
+char *default_dpdk_socket_mem = construct_dpdk_socket_mem();
 struct dpdk_exclusive_options_map {
 const char *category;
 const char *ovs_dpdk_options[MAX_DPDK_EXCL_OPTS];
@@ -180,7 +205,7 @@ construct_dpdk_mutex_options(const struct smap 
*ovs_other_config,
 {"memory type",
  {"dpdk-alloc-mem", "dpdk-socket-mem", NULL,},
  {"-m", "--socket-mem",NULL,},
- "1024,0", 1
+ default_dpdk_socket_mem, 1
 },
 };
 
@@ -227,6 +252,8 @@ construct_dpdk_mutex_options(const struct smap 
*ovs_other_config,
 }
 }
 
+free(default_dpdk_socket_mem);
+
 return ret;
 }
 
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 9c2a826..d74f28b 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -302,9 +302,10 @@
 
 
   If dpdk-socket-mem and dpdk-alloc-mem are not specified, 
dpdk-socket-mem
-  will be used and the default value is 1024,0. If dpdk-socket-mem and
-  dpdk-alloc-mem are specified at same time, dpdk-socket-mem will be
-  used as default. Changing this value requires restarting the daemon.
+  will be used and the default value is 1024 for each numa node. If
+  dpdk-socket-mem and dpdk-alloc-mem are specified at same time,
+  dpdk-socket-mem will be used as default. Changing this value
+  requires restarting the daemon.
 
   
 
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2] OVS-DPDK: Change "dpdk-socket-mem" default value

2018-05-08 Thread Billy O'Mahony
Hi All,

This change is the outcome of this discussion: 
https://mail.openvswitch.org/pipermail/ovs-discuss/2017-April/044110.html.

I am posting this on behalf of Marcin who is the author but no longer works for 
Intel.  I made a few changes hence the double sign-off but please remove my sig 
if required.

Regards,
Billy.

v1 -> v2 Remove some debug logs

Marcin Rybka (1):
  OVS-DPDK: Change "dpdk-socket-mem" default value.

 Documentation/intro/install/dpdk.rst |  3 ++-
 lib/dpdk.c   | 29 -
 vswitchd/vswitch.xml |  7 ---
 3 files changed, 34 insertions(+), 5 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] OVS-DPDK: Change "dpdk-socket-mem" default value.

2018-05-08 Thread Billy O'Mahony
From: Marcin Rybka <marcinx.ry...@intel.com>

When "dpdk-socket-mem" and "dpdk-alloc-mem" are not specified,
"dpdk-socket-mem" will be set to allocate 1024MB on each NUMA node.
This change will prevent OVS from failing when NIC is attached on
NUMA node 1 and higher. Patch contains documentation update.

Signed-off-by: Marcin Rybka <marcinx.ry...@intel.com>
Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 Documentation/intro/install/dpdk.rst |  3 ++-
 lib/dpdk.c   | 35 ++-
 vswitchd/vswitch.xml |  7 ---
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index fea4890..b68438d 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -228,7 +228,8 @@ listed below. Defaults will be provided for all values not 
explicitly set.
 
 ``dpdk-socket-mem``
   Comma separated list of memory to pre-allocate from hugepages on specific
-  sockets.
+  sockets. If not specified, 1024 MB will be set for each numa node by
+  default.
 
 ``dpdk-hugepage-dir``
   Directory where hugetlbfs is mounted
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 00dd974..125178e 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -35,6 +35,7 @@
 #include "netdev-dpdk.h"
 #include "openvswitch/dynamic-string.h"
 #include "openvswitch/vlog.h"
+#include "ovs-numa.h"
 #include "smap.h"
 
 VLOG_DEFINE_THIS_MODULE(dpdk);
@@ -163,6 +164,29 @@ construct_dpdk_options(const struct smap *ovs_other_config,
 return ret;
 }
 
+static char *
+construct_dpdk_socket_mem(void)
+{
+int numa = 0;
+const char *def_value = "1024";
+int numa_nodes = ovs_numa_get_n_numas();
+
+if (numa_nodes == 0 || numa_nodes == OVS_NUMA_UNSPEC) {
+numa_nodes = 1;
+}
+
+/* Allocate enough memory for digits, comma-sep and terminator. */
+char *dpdk_socket_mem = xzalloc(numa_nodes * (strlen(def_value) + 1));
+
+strcat(dpdk_socket_mem, def_value);
+for (numa = 1; numa < numa_nodes; ++numa) {
+strcat(dpdk_socket_mem, ",");
+strcat(dpdk_socket_mem, def_value);
+}
+
+return dpdk_socket_mem;
+}
+
 #define MAX_DPDK_EXCL_OPTS 10
 
 static int
@@ -170,6 +194,7 @@ construct_dpdk_mutex_options(const struct smap 
*ovs_other_config,
  char ***argv, const int initial_size,
  char **extra_args, const size_t extra_argc)
 {
+char *default_dpdk_socket_mem = construct_dpdk_socket_mem();
 struct dpdk_exclusive_options_map {
 const char *category;
 const char *ovs_dpdk_options[MAX_DPDK_EXCL_OPTS];
@@ -180,7 +205,7 @@ construct_dpdk_mutex_options(const struct smap 
*ovs_other_config,
 {"memory type",
  {"dpdk-alloc-mem", "dpdk-socket-mem", NULL,},
  {"-m", "--socket-mem",NULL,},
- "1024,0", 1
+ default_dpdk_socket_mem, 1
 },
 };
 
@@ -227,6 +252,8 @@ construct_dpdk_mutex_options(const struct smap 
*ovs_other_config,
 }
 }
 
+free(default_dpdk_socket_mem);
+
 return ret;
 }
 
@@ -420,6 +447,12 @@ dpdk_init__(const struct smap *ovs_other_config)
 argv_to_release[argc_tmp] = argv[argc_tmp];
 }
 
+{
+int i;
+for (i = 0; i <= argc; i++) {
+VLOG_WARN("BOM %d '%s'", argc, argv[i]);
+}
+}
 /* Make sure things are initialized ... */
 result = rte_eal_init(argc, argv);
 if (result < 0) {
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 9c2a826..d74f28b 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -302,9 +302,10 @@
 
 
   If dpdk-socket-mem and dpdk-alloc-mem are not specified, 
dpdk-socket-mem
-  will be used and the default value is 1024,0. If dpdk-socket-mem and
-  dpdk-alloc-mem are specified at same time, dpdk-socket-mem will be
-  used as default. Changing this value requires restarting the daemon.
+  will be used and the default value is 1024 for each numa node. If
+  dpdk-socket-mem and dpdk-alloc-mem are specified at same time,
+  dpdk-socket-mem will be used as default. Changing this value
+  requires restarting the daemon.
 
   
 
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] Change "dpdk-socket-mem" default value.

2018-05-08 Thread Billy O'Mahony
Hi All,

This change is the outcome of this discussion: 
https://mail.openvswitch.org/pipermail/ovs-discuss/2017-April/044110.html.

I am posting this on behalf of Marcin who is the author but no longer works for 
Intel.  I made a few changes hence the double sign-off but please remove my sig 
if required.

Regards,
Billy.

Marcin Rybka (1):
  OVS-DPDK: Change "dpdk-socket-mem" default value.

 Documentation/intro/install/dpdk.rst |  3 ++-
 lib/dpdk.c   | 35 ++-
 vswitchd/vswitch.xml |  7 ---
 3 files changed, 40 insertions(+), 5 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [RFC v2 2/2] ingress scheduling: Provide per interface ingress priority

2018-03-28 Thread Billy O'Mahony
Allow configuration to specify an ingress priority for interfaces.
Modify ovs-netdev datapath to act on this configuration so that packets
on interfaces with a higher priority will tend be processed ahead of
packets on lower priority interfaces.  This protects traffic on higher
priority interfaces from loss and latency as PMDs get overloaded.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 include/openvswitch/ofp-parse.h |   3 ++
 lib/dpif-netdev.c   | 103 +++-
 lib/netdev-bsd.c|   1 +
 lib/netdev-dpdk.c   |  13 -
 lib/netdev-dummy.c  |   1 +
 lib/netdev-linux.c  |   1 +
 lib/netdev-provider.h   |  11 -
 lib/netdev-vport.c  |   1 +
 lib/netdev.c|  42 
 lib/netdev.h|   2 +
 vswitchd/bridge.c   |   2 +
 11 files changed, 165 insertions(+), 15 deletions(-)

diff --git a/include/openvswitch/ofp-parse.h b/include/openvswitch/ofp-parse.h
index 3fdd468..d77ab8f 100644
--- a/include/openvswitch/ofp-parse.h
+++ b/include/openvswitch/ofp-parse.h
@@ -33,6 +33,9 @@ extern "C" {
 struct match;
 struct mf_field;
 struct ofputil_port_map;
+struct tun_table;
+struct flow_wildcards;
+struct ofputil_port_map;
 
 struct ofp_protocol {
 const char *name;
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index b07fc6b..736d0b6 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -42,6 +43,7 @@
 #include "dpif.h"
 #include "dpif-netdev-perf.h"
 #include "dpif-provider.h"
+#include "netdev-provider.h"
 #include "dummy.h"
 #include "fat-rwlock.h"
 #include "flow.h"
@@ -487,6 +489,7 @@ static void dp_netdev_actions_free(struct dp_netdev_actions 
*);
 struct polled_queue {
 struct dp_netdev_rxq *rxq;
 odp_port_t port_no;
+uint8_t priority;
 };
 
 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
@@ -626,6 +629,12 @@ struct dpif_netdev {
 uint64_t last_port_seq;
 };
 
+static void
+dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
+   struct dp_netdev_rxq *rxq,
+   odp_port_t port_no,
+   unsigned int *rxd_cnt,
+   unsigned int *txd_cnt);
 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
   struct dp_netdev_port **portp)
 OVS_REQUIRES(dp->port_mutex);
@@ -3259,15 +3268,16 @@ dp_netdev_pmd_flush_output_packets(struct 
dp_netdev_pmd_thread *pmd,
 return output_cnt;
 }
 
-static int
+static void
 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_rxq *rxq,
-   odp_port_t port_no)
+   odp_port_t port_no,
+   unsigned int *rxd_cnt,
+   unsigned int *txd_cnt)
 {
 struct dp_packet_batch batch;
 struct cycle_timer timer;
 int error;
-int batch_cnt = 0, output_cnt = 0;
 uint64_t cycles;
 
 /* Measure duration for polling and processing rx burst. */
@@ -3279,17 +3289,17 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 error = netdev_rxq_recv(rxq->rx, &batch);
 if (!error) {
 /* At least one packet received. */
+*rxd_cnt = batch.count;
 *recirc_depth_get() = 0;
 pmd_thread_ctx_time_update(pmd);
 
-batch_cnt = batch.count;
 dp_netdev_input(pmd, , port_no);
 
 /* Assign processing cycles to rx queue. */
 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
 
-output_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
+*txd_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
 } else {
 /* Discard cycles. */
 cycle_timer_stop(&pmd->perf_stats, &timer);
@@ -3299,11 +3309,11 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
 }
+*txd_cnt = 0;
 }
 
 pmd->ctx.last_rxq = NULL;
 
-return batch_cnt + output_cnt;
 }
 
 static struct tx_port *
@@ -3935,11 +3945,16 @@ dpif_netdev_run(struct dpif *dpif)
 HMAP_FOR_EACH (port, node, >ports) {
 if (!netdev_is_pmd(port->netdev)) {
 int i;
+unsigned int rxd_cnt;
+unsigned int txd_cnt;
 
 for (i = 0; i < port->n_rxq; i++) {
-if (dp_netdev_process_rxq_port(non_pmd,
-   >rxqs[

[ovs-dev] [RFC v2 1/2] ingress scheduling: schema and docs

2018-03-28 Thread Billy O'Mahony
Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 Documentation/howto/dpdk.rst | 18 ++
 vswitchd/vswitch.ovsschema   |  9 +++--
 vswitchd/vswitch.xml | 40 
 3 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index 79b626c..fca353a 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -237,6 +237,24 @@ respective parameter. To disable the flow control at tx 
side, run::
 
 $ ovs-vsctl set Interface dpdk-p0 options:tx-flow-ctrl=false
 
+Ingress Scheduling
+------------------
+
+The ingress scheduling feature is described in general in
+``ovs-vswitchd.conf.db (5)``.
+
+Ingress scheduling currently only supports setting a priority for incoming
+packets for an entire interface. Priority levels 0 (lowest) to 3 (highest) are
+supported.  The default priority is 0.
+
+Interfaces of type ``dpdk`` and ``dpdkvhostuserclient`` support ingress
+scheduling.
+
+To prioritize packets on a particular port:
+
+$ ovs-vsctl set Interface dpdk0 \
+ingress_sched=port_prio=3
+
 pdump
 -
 
diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema
index 90e50b6..5552d87 100644
--- a/vswitchd/vswitch.ovsschema
+++ b/vswitchd/vswitch.ovsschema
@@ -1,6 +1,6 @@
 {"name": "Open_vSwitch",
- "version": "7.15.1",
- "cksum": "3682332033 23608",
+ "version": "7.15.2",
+ "cksum": "2390903851 23814",
  "tables": {
"Open_vSwitch": {
  "columns": {
@@ -352,6 +352,11 @@
"minInteger": 1},
"min": 0,
"max": 1}},
+   "ingress_sched": {
+ "type": {"key": {"type": "string",
+  "enum": ["set", ["port_prio"]]},
+  "value": "string",
+  "min": 0, "max": 1}},
"error": {
  "type": {"key": "string", "min": 0, "max": 1}}},
  "indexes": [["name"]]},
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index f899a19..9ab3960 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -3071,6 +3071,33 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch 
options:peer=p1 \
   
 
 
+
+  
+   Configuration to allow certain traffic to be prioritized.  This means
+   some combination of:
+  
+  
+
+ prioritized packets are forwarded to their destination port before
+ non-prioritized
+
+
+ prioritized packets are less likely to be dropped in an overloaded
+ situation than non-prioritized packets
+
+  
+  
+   Ingress scheduling is supported with the best effort of the Interface
+   type and datapath.  Currently the only field supported is port_prio
+   which applies a priority to all incoming packets on the Interface.
+  
+  
+
+ Priority levels are 0 (lowest) to 3 (highest).
+
+  
+
+
 
   
 BFD, defined in RFC 5880 and RFC 5881, allows point-to-point
@@ -3627,6 +3654,19 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch 
options:peer=p1 \
   
 
 
+
+  
+Ingress Scheduling allows incoming packets on an interface to be
+prioritized relative to packets arriving on other interfaces.
+  
+
+  
+The ingress priority of the port: 0 lowest to 3 highest. Higher
+priority ports are read more frequently than lower priority ports. This
+provides enhanced protection against packets ingressing on those ports
+being dropped due to Rx queue overflow.
+  
+
+
 
   The overall purpose of these columns is described under Common
   Columns at the beginning of this document.
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [RFC v2 0/2] Ingress Scheduling

2018-03-28 Thread Billy O'Mahony
Hi All,

I've updated the original RFC patch to account for two sets of comments. See
below.

I had originally intended to make this a v1 patch but two items are
outstanding:
* further testing needs to be completed to determine the full effect on vports
* re-configuration of the priorities at run-time is not supported in RFCv2

I want to allow re-configuration of the port priorities without calling
netdev_request_reconfigure as that will, I understand, restart the
netdevs and drop packets.  I am thinking that adding the ingress_prio config
smap to ofproto_port_set_config call in bridge_reconfigure and then calling
dp_netdev_reload_pmd__ from dpif_netdev_port_set_config will end up forcing the
pmd to reload_queues_and_ports and giving effect to the new priorities.  But
I'd appreciate guidance on that approach.

Original Description:
This patch set implements the 'preferential read' part of the feature of
ingress scheduling described at OvS 2017 Fall Conference
https://www.slideshare.net/LF_OpenvSwitch/lfovs17ingress-scheduling-82280320.

It allows configuration to specify an ingress priority for an entire
interface. This protects traffic on higher priority interfaces from loss and
latency as PMDs get overloaded.

Results so far are very promising; For a uniform traffic distribution as
total offered load increases loss starts on the lowest priority port first and
the highest priority port last.

History:

RFCv1:
Initial version.

RFCv2:
* Keep ingress prio config in netdev base rather than in each netdev type.
* Account for differing rxq lengths
* Applies clean to 4299145

Billy O'Mahony (2):
  ingress scheduling: schema and docs
  ingress scheduling: Provide per interface ingress priority

 Documentation/howto/dpdk.rst|  18 +++
 include/openvswitch/ofp-parse.h |   3 ++
 lib/dpif-netdev.c   | 103 +++-
 lib/netdev-bsd.c|   1 +
 lib/netdev-dpdk.c   |  13 -
 lib/netdev-dummy.c  |   1 +
 lib/netdev-linux.c  |   1 +
 lib/netdev-provider.h   |  11 -
 lib/netdev-vport.c  |   1 +
 lib/netdev.c|  42 
 lib/netdev.h|   2 +
 vswitchd/bridge.c   |   2 +
 vswitchd/vswitch.ovsschema  |   9 +++-
 vswitchd/vswitch.xml|  40 
 14 files changed, 230 insertions(+), 17 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [RFC 2/2] ingress scheduling: Provide per interface ingress priority

2018-02-16 Thread Billy O'Mahony
Allow configuration to specify an ingress priority for interfaces.
Modify ovs-netdev datapath to act on this configuration so that packets
on interfaces with a higher priority will tend be processed ahead of
packets on lower priority interfaces.  This protects traffic on higher
priority interfaces from loss and latency as PMDs get overloaded.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 include/openvswitch/ofp-parse.h |  3 ++
 lib/dpif-netdev.c   | 47 +-
 lib/netdev-bsd.c|  1 +
 lib/netdev-dpdk.c   | 64 +++--
 lib/netdev-dummy.c  |  1 +
 lib/netdev-linux.c  |  1 +
 lib/netdev-provider.h   | 11 ++-
 lib/netdev-vport.c  |  1 +
 lib/netdev.c| 23 +++
 lib/netdev.h|  2 ++
 vswitchd/bridge.c   |  2 ++
 11 files changed, 140 insertions(+), 16 deletions(-)

diff --git a/include/openvswitch/ofp-parse.h b/include/openvswitch/ofp-parse.h
index 3fdd468..d77ab8f 100644
--- a/include/openvswitch/ofp-parse.h
+++ b/include/openvswitch/ofp-parse.h
@@ -33,6 +33,9 @@ extern "C" {
 struct match;
 struct mf_field;
 struct ofputil_port_map;
+struct tun_table;
+struct flow_wildcards;
+struct ofputil_port_map;
 
 struct ofp_protocol {
 const char *name;
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index d49c986..89d8229 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -42,6 +42,7 @@
 #include "dpif.h"
 #include "dpif-netdev-perf.h"
 #include "dpif-provider.h"
+#include "netdev-provider.h"
 #include "dummy.h"
 #include "fat-rwlock.h"
 #include "flow.h"
@@ -487,6 +488,7 @@ static void dp_netdev_actions_free(struct dp_netdev_actions 
*);
 struct polled_queue {
 struct dp_netdev_rxq *rxq;
 odp_port_t port_no;
+uint8_t priority;
 };
 
 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
@@ -626,6 +628,12 @@ struct dpif_netdev {
 uint64_t last_port_seq;
 };
 
+static void
+dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
+   struct dp_netdev_rxq *rxq,
+   odp_port_t port_no,
+   unsigned int *rxd_cnt,
+   unsigned int *txd_cnt);
 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
   struct dp_netdev_port **portp)
 OVS_REQUIRES(dp->port_mutex);
@@ -3259,15 +3267,16 @@ dp_netdev_pmd_flush_output_packets(struct 
dp_netdev_pmd_thread *pmd,
 return output_cnt;
 }
 
-static int
+static void
 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_rxq *rxq,
-   odp_port_t port_no)
+   odp_port_t port_no,
+   unsigned int *rxd_cnt,
+   unsigned int *txd_cnt)
 {
 struct dp_packet_batch batch;
 struct cycle_timer timer;
 int error;
-int batch_cnt = 0, output_cnt = 0;
 uint64_t cycles;
 
 /* Measure duration for polling and processing rx burst. */
@@ -3279,17 +3288,17 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 error = netdev_rxq_recv(rxq->rx, );
 if (!error) {
 /* At least one packet received. */
+*rxd_cnt = batch.count;
 *recirc_depth_get() = 0;
 pmd_thread_ctx_time_update(pmd);
 
-batch_cnt = batch.count;
 dp_netdev_input(pmd, , port_no);
 
 /* Assign processing cycles to rx queue. */
 cycles = cycle_timer_stop(>perf_stats, );
 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
 
-output_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
+*txd_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
 } else {
 /* Discard cycles. */
 cycle_timer_stop(>perf_stats, );
@@ -3299,11 +3308,11 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 VLOG_ERR_RL(, "error receiving data from %s: %s",
 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
 }
+*txd_cnt = 0;
 }
 
 pmd->ctx.last_rxq = NULL;
 
-return batch_cnt + output_cnt;
 }
 
 static struct tx_port *
@@ -3935,11 +3944,16 @@ dpif_netdev_run(struct dpif *dpif)
 HMAP_FOR_EACH (port, node, >ports) {
 if (!netdev_is_pmd(port->netdev)) {
 int i;
+unsigned int rxd_cnt;
+unsigned int txd_cnt;
 
 for (i = 0; i < port->n_rxq; i++) {
-if (dp_netdev_process_rxq_port(non_pmd,
-   >rxqs[i],
-   port->port_no)) {
+dp_netdev_proc

[ovs-dev] [RFC 1/2] ingress scheduling: schema and docs

2018-02-16 Thread Billy O'Mahony
Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 Documentation/howto/dpdk.rst | 18 ++
 vswitchd/vswitch.ovsschema   |  9 +++--
 vswitchd/vswitch.xml | 40 
 3 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index d717d2e..b9d55d8 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -237,6 +237,24 @@ respective parameter. To disable the flow control at tx 
side, run::
 
 $ ovs-vsctl set Interface dpdk-p0 options:tx-flow-ctrl=false
 
+Ingress Scheduling
+--
+
+The ingress scheduling feature is described in general in
+``ovs-vswitchd.conf.db (5)``.
+
+Ingress scheduling currently only supports setting a priority for incoming
+packets for an entire interface. Priority levels 0 (lowest) to 3 (highest) are
+supported.  The default priority is 0.
+
+Interfaces of type ``dpdk`` and ``dpdkvhostuserclient`` support ingress
+scheduling.
+
+To prioritize packets on a particular port:
+
+$ ovs-vsctl set Interface dpdk0 \
+ingress_sched=port_prio=3
+
 pdump
 -
 
diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema
index 90e50b6..5552d87 100644
--- a/vswitchd/vswitch.ovsschema
+++ b/vswitchd/vswitch.ovsschema
@@ -1,6 +1,6 @@
 {"name": "Open_vSwitch",
- "version": "7.15.1",
- "cksum": "3682332033 23608",
+ "version": "7.15.2",
+ "cksum": "2390903851 23814",
  "tables": {
"Open_vSwitch": {
  "columns": {
@@ -352,6 +352,11 @@
"minInteger": 1},
"min": 0,
"max": 1}},
+   "ingress_sched": {
+ "type": {"key": {"type": "string",
+  "enum": ["set", ["port_prio"]]},
+  "value": "string",
+  "min": 0, "max": 1}},
"error": {
  "type": {"key": "string", "min": 0, "max": 1}}},
  "indexes": [["name"]]},
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 0c6a43d..24bdc0b 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -3075,6 +3075,33 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch 
options:peer=p1 \
   
 
 
+
+  
+   Configuration to allow certain traffic to be prioritized.  This means
+   some combination of:
+  
+  
+
+ prioritized packets are forwarded to their destination port before
+ non-prioritized
+
+
+ prioritized packets are less likely to be dropped in an overloaded
+ situation than non-prioritized packets
+
+  
+  
+   Ingress scheduling is supported on a best-effort basis, depending on
+   the Interface type and datapath.  Currently the only field supported is port_prio
+   which applies a priority to all incoming packets on the Interface.
+  
+  
+
+ Priority levels are 0 (lowest) to 3 (highest).
+
+  
+
+
 
   
 BFD, defined in RFC 5880 and RFC 5881, allows point-to-point
@@ -3631,6 +3658,19 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch 
options:peer=p1 \
   
 
 
+
+  
+Ingress Scheduling allows incoming packets on an interface to be
+prioritized relative to packets arriving on other interfaces.
+  
+
+  
+The ingress priority of the port: 0 lowest to 3 highest. Higher
+priority ports are read more frequently than lower priority ports. This
+provides enhanced protection against packets ingressing on those ports
+being dropped due to Rx queue overflow.
+  
+
+
 
   The overall purpose of these columns is described under Common
   Columns at the beginning of this document.
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [RFC 0/2] Ingress Scheduling

2018-02-16 Thread Billy O'Mahony
This patch set implements the 'preferential read' part of the feature of
ingress scheduling described at OvS 2017 Fall Conference
https://www.slideshare.net/LF_OpenvSwitch/lfovs17ingress-scheduling-82280320.

It allows configuration to specify an ingress priority for an entire
interface. This protects traffic on higher priority interfaces from loss and
latency as PMDs get overloaded.

Results so far are very promising; For a uniform traffic distribution as
total offered load increases loss starts on the lowest priority port first and
the highest priority port last.

When using four physical ports with each port forwarded to one of the other
ports. The following packets loss is seen. The EMC was bypassed in this case
and a small delay loop was added to each packet to simulate more realistic per
packet processing cost of 1000cycles approx.

Port dpdk_0  dpdk_1  dpdk_2  dpdk_3
Traffic
Dist.   25% 25% 25% 25%
Priority  0   1   2   3
n_rxq 8   8   8   8

Total
Load Kpps   Loss Rate Per Port (Kpps)
2110  0   0   0   0
2120  5   0   0   0
2920676   0   0   0
2930677   5   0   0
3510854 411   0   0
3520860 415   3   0
4390   1083 716 348   0
4400   1088 720 354   1


Even in the case where most traffic is on the priority port this remains the
case:

Port dpdk_0  dpdk_1  dpdk_2  dpdk_3
Traffic
Dist.   10% 20% 30% 40%
Priority  0   1   2   3
n_rxq 8   8   8   8

Total
Load Kpps   Loss Rate Per Port (Kpps)
 2400 0   0   0   0
 2410 5   0   0   0
 2720   225   5   0   0
 2880   252 121   9   0
 3030   269 176  82   3
 4000   369 377 384 392
 5000   471 580 691 801

The latency characteristics of the traffic on the higher priority ports is also
protected.

Port dpdk_0  dpdk_1  dpdk_2  dpdk_3
Traffic
Dist.   10% 20% 30% 40%
Priority  0   1   2   3
n_rxq 8   8   8   8

Totaldpdk0dpdk1dpdk2dpdk3
Load Kpps
 2400  113  122  120  125
 241036117  571  577  560
 2720   32324214424 3265 3235
 2880   3914043350810075 4600
 3030   4125973545017061 7965
 4000   414729360701774011106
 5000   416801364451823311653

Some General setup notes:
Fortville. (X710 DA4. firmware-version: 6.01 0x800034af 1.1747.0)
Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30GHz
One pmd
Port fwding port 0<->1, 2 <-> 3
Frame 64B, UDP 221 streams per port.
OvS base - 4c80644 http://github.com/istokes/ovs dpdk_merge_2_9. Added 
600cycles approx pkt processing in order to bring per packet cost to ~1000 
cycles.
DPDK v17.11.1

Billy O'Mahony (2):
  ingress scheduling: schema and docs
  ingress scheduling: Provide per interface ingress priority

 Documentation/howto/dpdk.rst| 18 
 include/openvswitch/ofp-parse.h |  3 ++
 lib/dpif-netdev.c   | 47 +-
 lib/netdev-bsd.c|  1 +
 lib/netdev-dpdk.c   | 64 +++--
 lib/netdev-dummy.c  |  1 +
 lib/netdev-linux.c  |  1 +
 lib/netdev-provider.h   | 11 ++-
 lib/netdev-vport.c  |  1 +
 lib/netdev.c| 23 +++
 lib/netdev.h|  2 ++
 vswitchd/bridge.c   |  2 ++
 vswitchd/vswitch.ovsschema  |  9 --
 vswitchd/vswitch.xml| 40 ++
 14 files changed, 205 insertions(+), 18 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [RFC 1/2] dpif-netdev: Fix per packet cycles statistics.

2017-09-22 Thread Billy O'Mahony
From: Ilya Maximets 

DP_STAT_LOOKUP_HIT statistics used mistakenly for calculation
of total number of packets. This leads to completely wrong
per packet cycles statistics.

For example:

emc hits:0
megaflow hits:253702308
avg. subtable lookups per hit:1.50
miss:0
lost:0
avg cycles per packet: 248.32 (157498766585/634255770)

In this case 634255770 total_packets value used for avg
per packet calculation:

  total_packets = 'megaflow hits' + 'megaflow hits' * 1.5

The real value should be 524.38 (157498766585/253702308)

Fix that by summing only stats that reflects match/not match.
It's decided to make direct summing of required values instead of
disabling some stats in a loop to make calculations more clear and
avoid similar issues in the future.

CC: Jan Scheurich 
Fixes: 3453b4d62a98 ("dpif-netdev: dpcls per in_port with sorted subtables")
Signed-off-by: Ilya Maximets 
Acked-by: Jan Scheurich 
---
 lib/dpif-netdev.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 071ec14..e3a5590 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -796,7 +796,7 @@ pmd_info_show_stats(struct ds *reply,
 unsigned long long stats[DP_N_STATS],
 uint64_t cycles[PMD_N_CYCLES])
 {
-unsigned long long total_packets = 0;
+unsigned long long total_packets;
 uint64_t total_cycles = 0;
 int i;
 
@@ -812,13 +812,12 @@ pmd_info_show_stats(struct ds *reply,
 } else {
 stats[i] = 0;
 }
-
-if (i != DP_STAT_LOST) {
-/* Lost packets are already included in DP_STAT_MISS */
-total_packets += stats[i];
-}
 }
 
+/* Sum of all the matched and not matched packets gives the total.  */
+total_packets = stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT]
++ stats[DP_STAT_MISS];
+
 for (i = 0; i < PMD_N_CYCLES; i++) {
 if (cycles[i] > pmd->cycles_zero[i]) {
cycles[i] -= pmd->cycles_zero[i];
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [RFC 2/2] dpif-netdev: RFC EMC load-shedding

2017-09-22 Thread Billy O'Mahony
When EMC hit rate goes down start shedding load from the EMC.
---
 lib/dpif-netdev.c | 107 --
 1 file changed, 103 insertions(+), 4 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e3a5590..f77e79a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -158,6 +158,13 @@ struct netdev_flow_key {
 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
 DEFAULT_EM_FLOW_INSERT_INV_PROB)
 
+struct emc_shed_state {
+unsigned long long last_hit_cnt;
+unsigned long long last_miss_cnt;
+unsigned long long last_skip_cnt;
+uint32_t shed_threshold;
+};
+
 struct emc_entry {
 struct dp_netdev_flow *flow;
 struct netdev_flow_key key;   /* key.hash used for emc hash value. */
@@ -166,6 +173,7 @@ struct emc_entry {
 struct emc_cache {
 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
 int sweep_idx;/* For emc_cache_slow_sweep(). */
+struct emc_shed_state emc_shed_state;
 };
 
 /* Iterate in the exact match cache through every entry that might contain a
@@ -337,6 +345,7 @@ enum dp_stat_type {
 DP_STAT_LOST,   /* Packets not passed up to the client. */
 DP_STAT_LOOKUP_HIT, /* Number of subtable lookups for flow table
hits */
+DP_STAT_EXACT_SKIPPED,  /* Packets where EMC lookup skipped */
 DP_N_STATS
 };
 
@@ -733,6 +742,10 @@ emc_cache_init(struct emc_cache *flow_cache)
 int i;
 
 flow_cache->sweep_idx = 0;
+flow_cache->emc_shed_state.last_hit_cnt = 0;
+flow_cache->emc_shed_state.last_miss_cnt = 0;
+flow_cache->emc_shed_state.last_skip_cnt = 0;
+flow_cache->emc_shed_state.shed_threshold = 0;
 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
 flow_cache->entries[i].flow = NULL;
 flow_cache->entries[i].key.hash = 0;
@@ -749,6 +762,10 @@ emc_cache_uninit(struct emc_cache *flow_cache)
 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
 emc_clear_entry(_cache->entries[i]);
 }
+flow_cache->emc_shed_state.last_hit_cnt = 0;
+flow_cache->emc_shed_state.last_miss_cnt = 0;
+flow_cache->emc_shed_state.last_skip_cnt = 0;
+flow_cache->emc_shed_state.shed_threshold = 0;
 }
 
 /* Check and clear dead flow references slowly (one entry at each
@@ -839,11 +856,28 @@ pmd_info_show_stats(struct ds *reply,
 }
 ds_put_cstr(reply, ":\n");
 
+/* XXX some added items added here are for debug */
 ds_put_format(reply,
   "\temc hits:%llu\n\tmegaflow hits:%llu\n"
+  "\tshed thresh:0x%08X\n"
+  "\temc skips:%llu\n"
+  "\temc hit rate (nett) :%llu%%\n"
+  "\temc hit rate (gross):%llu%%\n"
   "\tavg. subtable lookups per hit:%.2f\n"
   "\tmiss:%llu\n\tlost:%llu\n",
   stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
+  pmd->flow_cache.emc_shed_state.shed_threshold,
+  stats[DP_STAT_EXACT_SKIPPED],
+  (stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT] -
+  stats[DP_STAT_EXACT_SKIPPED])
+  ? ((stats[DP_STAT_EXACT_HIT] * 100) /
+  (stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT] -
+  stats[DP_STAT_EXACT_SKIPPED]))
+  : 0,
+  (stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT])
+  ? ((stats[DP_STAT_EXACT_HIT] * 100) /
+  (stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT]))
+  : 0,
   stats[DP_STAT_MASKED_HIT] > 0
   ? (1.0*stats[DP_STAT_LOOKUP_HIT])/stats[DP_STAT_MASKED_HIT]
   : 0,
@@ -1470,6 +1504,8 @@ dpif_netdev_get_stats(const struct dpif *dpif, struct 
dpif_dp_stats *stats)
 stats->n_hit += n;
 atomic_read_relaxed(>stats.n[DP_STAT_EXACT_HIT], );
 stats->n_hit += n;
+atomic_read_relaxed(>stats.n[DP_STAT_EXACT_SKIPPED], );
+stats->n_hit += n;
 atomic_read_relaxed(>stats.n[DP_STAT_MISS], );
 stats->n_missed += n;
 atomic_read_relaxed(>stats.n[DP_STAT_LOST], );
@@ -4849,6 +4885,54 @@ dp_netdev_queue_batches(struct dp_packet *pkt,
 packet_batch_per_flow_update(batch, pkt, mf);
 }
 
+#define SHED_ADJ_INTERVAL_PKTS (3e6)
+#define SHED_ADJ_QUANTUM (0x1000)
+#define SHED_THRESH_MAX (SHED_ADJ_QUANTUM + \
+(SHED_ADJ_QUANTUM << 1) + \
+(SHED_ADJ_QUANTUM << 2) + \
+(SHED_ADJ_QUANTUM << 3))
+/* XXX use cost of EMC lookup & miss in cycles to replace hard bounds */
+#define SHED_HIT_RATE_LOWER_PC (50)
+#define SHED_HIT_RATE_UPPER_PC (70)
+
+
+static inline void
+adjust_emc_shedding (struct dp_netdev_pmd_thread *pmd)
+{
+struct emc_cache *emc = >flow_cache;
+

[ovs-dev] [RFC 0/2] EMC load-shedding

2017-09-22 Thread Billy O'Mahony
Hi All,

Please find attached RFC patch for EMC load-shedding [1] as promised [2].

This applies clean on 5ff834 "Increment ct packet counters..." It also uses
Ilya's patch "Fix per packet cycles statistics." [3] so I've included that in
the patch set as it wasn't merged when I started the RFC.

The main goal for this RFC is only to demonstrate the outline of the mechanism
and get feedback & advice for further work.

However I did some initial testing with promising results. For 8K to 64K flows
the cycles per packet drop from ~1200 to ~1100. For small numbers of flows
(~16) the cycles per packet remain at ~900 which I believe means no increase
but I didn't baseline that situation.

There are some TODOs commented in the patch with XXX.

For one I think the mechanism should take into account the expected cycle-cost
of EMC lookup and EMC miss (dpcls lookup) when deciding how much load to shed.
Rather than the heuristic in this patch which is to keep the emc hit rate (for
flow which have not been diverted from the EMC) between certain bounds.

Also we should decide on at least one flow distribution that would be useful
(i.e. realistic) for EMC testing. The tests above have either been carried out
with a random (uniform) flow distribution which doesn't play well with flow
caching or else a round-robin flow distribution which is actually adversarial
to flow caching. If I have an agreed flow distribution I can then figure out
how to produce it for testing :).

[1] https://mail.openvswitch.org/pipermail/ovs-dev/2017-August/336509.html
[2] https://mail.openvswitch.org/pipermail/ovs-dev/2017-September/338380.html
[3] https://mail.openvswitch.org/pipermail/ovs-dev/2017-August/337309.html

Billy O'Mahony (1):
  dpif-netdev: RFC EMC load-shedding

Ilya Maximets (1):
  dpif-netdev: Fix per packet cycles statistics.

 lib/dpif-netdev.c | 118 +-
 1 file changed, 108 insertions(+), 10 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2 4/4] docs: Document ingress scheduling feature

2017-08-17 Thread Billy O'Mahony
Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 Documentation/howto/dpdk.rst | 31 +++
 vswitchd/vswitch.xml | 31 +++
 2 files changed, 62 insertions(+)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index d7f6610..c7bfc01 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -188,6 +188,37 @@ respective parameter. To disable the flow control at tx 
side, run::
 
 $ ovs-vsctl set Interface dpdk-p0 options:tx-flow-ctrl=false
 
+Ingress Scheduling
+--
+
+The ingress scheduling feature is described in general in
+``ovs-vswitchd.conf.db (5)``.
+
+Interfaces of type ``dpdk`` support ingress scheduling only for
+either ether_type or else a fully specified combination of src and
+dst ip address and port numbers for TCP or UDP packets.
+
+To prioritize packets for Precision Time Protocol:
+
+$ ovs-vsctl set Interface dpdk-p0 \
+other_config:ingress_sched=eth_type=0x88F7
+
+To prioritize UDP packets between specific IP source and destination:
+
+$ ovs-vsctl set Interface dpdk-p0 \
+other_config:ingress_sched=udp,ip_src=1.1.1.1,ip_dst=2.2.2.2,\
+udp_src=11,udp_dst=22
+
+If unsupported ingress scheduling configuration is specified or it cannot be
+applied for any reason a warning message is logged and the Interface operates
+as if no ingress scheduling was configured.
+
+Interfaces of type ``dpdkvhostuserclient``, ``dpdkr`` and ``dpdkvhostuser`` do
+not support ingress scheduling.
+
+Currently only the match fields listed above are supported. No wildcarding of
+fields is supported.
+
 pdump
 -
 
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 074535b..4564536 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -2923,6 +2923,37 @@
   
 
 
+
+  
+   Packets matching the ingress_sched value are prioritized. This means
+   some combination of:
+  
+  
+
+ prioritized packets are forwarded to their destination port before
+ non-prioritized
+
+
+ prioritized packets are less likely to be dropped in an overloaded
+ situation than prioritized packets
+
+  
+  
+   Ingress scheduling is supported with the best effort of the Interface.
+   It may be dependent on the interface type and its supporting
+   implementation devices. Different interface types may have different
+   levels of support for the feature and the same interface type attached
+   to different devices (physical NICs or vhost ports, device driver,
+   NIC model) may also offer different levels of support.
+  
+  
+
+ The format of the ingress_sched field is specified in ovs-fields(7) in
+ the ``Matching'' and ``FIELD REFERENCE'' sections.
+
+  
+
+
 
   
 BFD, defined in RFC 5880 and RFC 5881, allows point-to-point
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2 3/4] dpif-netdev: Add rxq prioritization

2017-08-17 Thread Billy O'Mahony
If an rxq is marked as 'prioritized' then keep reading from this queue until
there are no packets available. Only then proceed to other queues.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 lib/dpif-netdev.c | 24 ++--
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 9ce3456..d9f014d 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -488,6 +488,7 @@ struct dp_netdev_pmd_cycles {
 struct polled_queue {
 struct netdev_rxq *rx;
 odp_port_t port_no;
+uint8_t is_priority;
 };
 
 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
@@ -3801,6 +3802,8 @@ pmd_load_queues_and_ports(struct dp_netdev_pmd_thread 
*pmd,
 HMAP_FOR_EACH (poll, node, >poll_list) {
 poll_list[i].rx = poll->rxq->rx;
 poll_list[i].port_no = poll->rxq->port->port_no;
+poll_list[i].is_priority = \
+(poll->rxq->rx->queue_id == poll->rxq->rx->netdev->priority_rxq);
 i++;
 }
 
@@ -3849,15 +3852,24 @@ reload:
 lc = UINT_MAX;
 }
 
+unsigned int log_cnt = 0;
+int streak_len;
+const unsigned int MAX_STREAK_LEN = 100;
 cycles_count_start(pmd);
 for (;;) {
+log_cnt++;
 for (i = 0; i < poll_cnt; i++) {
-process_packets =
-dp_netdev_process_rxq_port(pmd, poll_list[i].rx,
-   poll_list[i].port_no);
-cycles_count_intermediate(pmd,
-  process_packets ? PMD_CYCLES_PROCESSING
-  : PMD_CYCLES_IDLE);
+streak_len = 0;
+do {
+process_packets =
+dp_netdev_process_rxq_port(pmd, poll_list[i].rx,
+   poll_list[i].port_no);
+cycles_count_intermediate(pmd,
+   process_packets ? PMD_CYCLES_PROCESSING
+   : PMD_CYCLES_IDLE);
+streak_len++;
+} while (process_packets && poll_list[i].is_priority &&
+ streak_len < MAX_STREAK_LEN);
 }
 
 if (lc++ > 1024) {
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2 2/4] netdev-dpdk: Apply ingress_sched config to dpdk phy ports

2017-08-17 Thread Billy O'Mahony
Ingress scheduling configuration is given effect by way of Flow Director
filters. A small subset of the ingress scheduling possible is
implemented in this patch.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 include/openvswitch/ofp-parse.h |   3 +
 lib/dpif-netdev.c   |   1 +
 lib/netdev-dpdk.c   | 181 ++--
 vswitchd/bridge.c   |   2 +
 4 files changed, 180 insertions(+), 7 deletions(-)

diff --git a/include/openvswitch/ofp-parse.h b/include/openvswitch/ofp-parse.h
index 013a8f3..1991694 100644
--- a/include/openvswitch/ofp-parse.h
+++ b/include/openvswitch/ofp-parse.h
@@ -41,6 +41,9 @@ struct ofputil_table_mod;
 struct ofputil_bundle_msg;
 struct ofputil_tlv_table_mod;
 struct simap;
+struct tun_table;
+struct flow_wildcards;
+struct ofputil_port_map;
 enum ofputil_protocol;
 
 char *parse_ofp_str(struct ofputil_flow_mod *, int command, const char *str_,
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e2cd931..9ce3456 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -44,6 +44,7 @@
 #include "dp-packet.h"
 #include "dpif.h"
 #include "dpif-provider.h"
+#include "netdev-provider.h"
 #include "dummy.h"
 #include "fat-rwlock.h"
 #include "flow.h"
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1ffedd4..d9aab4f 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -49,6 +49,8 @@
 #include "openvswitch/list.h"
 #include "openvswitch/ofp-print.h"
 #include "openvswitch/vlog.h"
+#include "openvswitch/ofp-parse.h"
+#include "openvswitch/ofp-util.h"
 #include "ovs-numa.h"
 #include "ovs-thread.h"
 #include "ovs-rcu.h"
@@ -175,6 +177,10 @@ static const struct rte_eth_conf port_conf = {
 .txmode = {
 .mq_mode = ETH_MQ_TX_NONE,
 },
+.fdir_conf = {
+.mode = RTE_FDIR_MODE_PERFECT,
+},
+
 };
 
 /*
@@ -351,6 +357,11 @@ enum dpdk_hw_ol_features {
 NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
 };
 
+union ingress_filter {
+struct rte_eth_ethertype_filter ethertype;
+struct rte_eth_fdir_filter fdir;
+};
+
 struct netdev_dpdk {
 struct netdev up;
 dpdk_port_t port_id;
@@ -390,8 +401,11 @@ struct netdev_dpdk {
 /* If true, device was attached by rte_eth_dev_attach(). */
 bool attached;
 
-/* Ingress Scheduling config */
+/* Ingress Scheduling config & state. */
 char *ingress_sched_str;
+bool ingress_sched_changed;
+enum rte_filter_type ingress_filter_type;
+union ingress_filter ingress_filter;
 
 /* In dpdk_list. */
 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
@@ -674,6 +688,22 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int 
n_rxq, int n_txq)
 int i;
 struct rte_eth_conf conf = port_conf;
 
+/* Ingress scheduling uses an extra RX queue reserved for prioritized
+   frames but using RSS will 'pollute' that queue by distributing
+   non-priority packets on to it.  Therefore RSS is not compatible with
+   ingress scheduling.  Also requesting anything other than two queues
+   with ingress scheduling is wasteful as without RSS only two queues are
+   required.  Rather than force n_rxq to two here and overriding the
+   configured value it's less surprising for the user to issue a warning
+   (see dpdk_apply_ingress_scheduling()) and not enable ingress scheduling.
+*/
+if (dev->ingress_sched_str && n_rxq == 2) {
+conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
+}
+else {
+conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+}
+
 /* For some NICs (e.g. Niantic), scatter_rx mode needs to be explicitly
  * enabled. */
 if (dev->mtu > ETHER_MTU) {
@@ -757,6 +787,128 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) 
OVS_REQUIRES(dev->mutex)
 }
 }
 
+static void
+dpdk_apply_ingress_scheduling(struct netdev_dpdk *dev, int n_rxq)
+{
+if (!dev->ingress_sched_str) {
+return;
+}
+
+/* See dpdk_eth_dev_queue_setup for n_rxq requirement */
+if (n_rxq != 2) {
+VLOG_ERR("Interface %s: Ingress scheduling config ignored; " \
+ "Requires n_rxq==2.", dev->up.name);
+return;
+}
+
+int priority_q_id = n_rxq-1;
+char *key, *val, *str, *iter;
+
+ovs_be32 ip_src, ip_dst;
+ip_src = ip_dst = 0;
+
+uint16_t eth_type, port_src, port_dst;
+eth_type = port_src = port_dst = 0;
+uint8_t ip_proto = 0;
+int diag = 0;
+
+/* delete any existing filter */
+if (dev->ingress_filter_type == RTE_ETH_FILTER_FDIR) {
+diag = rte_eth_dev_filter_ctrl(dev->port_id, RTE_ETH_FILTER_FDIR,
+RTE_ETH_FILTER_DELETE, >ingress_filter.fdir);
+} else if (dev->ingress_filter_type == RTE_ETH_FILTER_ETHERTYPE) {
+diag = rte_eth_dev_filter_

[ovs-dev] [PATCH v2 1/4] netdev: Add set_ingress_sched to netdev api

2017-08-17 Thread Billy O'Mahony
Passes ingress_sched config item from other_config column of Interface
table to the netdev.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 lib/netdev-bsd.c  |  1 +
 lib/netdev-dpdk.c | 21 +
 lib/netdev-dummy.c|  1 +
 lib/netdev-linux.c|  1 +
 lib/netdev-provider.h | 10 ++
 lib/netdev-vport.c|  1 +
 lib/netdev.c  | 22 ++
 lib/netdev.h  |  1 +
 vswitchd/bridge.c |  2 ++
 9 files changed, 60 insertions(+)

diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
index 8a4cdb3..3943b7b 100644
--- a/lib/netdev-bsd.c
+++ b/lib/netdev-bsd.c
@@ -1506,6 +1506,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum 
netdev_flags off,
 netdev_bsd_get_etheraddr,\
 netdev_bsd_get_mtu,  \
 NULL, /* set_mtu */  \
+NULL, /* set_ingress_sched */\
 netdev_bsd_get_ifindex,  \
 netdev_bsd_get_carrier,  \
 NULL, /* get_carrier_resets */   \
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1aaf6f7..1ffedd4 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -390,6 +390,9 @@ struct netdev_dpdk {
 /* If true, device was attached by rte_eth_dev_attach(). */
 bool attached;
 
+/* Ingress Scheduling config */
+char *ingress_sched_str;
+
 /* In dpdk_list. */
 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
 
@@ -1081,6 +1084,9 @@ netdev_dpdk_destruct(struct netdev *netdev)
 }
 
 free(dev->devargs);
+if (dev->ingress_sched_str) {
+free(dev->ingress_sched_str);
+}
 common_destruct(dev);
 
 ovs_mutex_unlock(_mutex);
@@ -2004,6 +2010,20 @@ netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
 }
 
 static int
+netdev_dpdk_set_ingress_sched(struct netdev *netdev,
+  const char *ingress_sched_str)
+{
+struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+
+free(dev->ingress_sched_str);
+if (ingress_sched_str) {
+dev->ingress_sched_str = xstrdup(ingress_sched_str);
+}
+
+return 0;
+}
+
+static int
 netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
 
 static int
@@ -3342,6 +3362,7 @@ unlock:
 netdev_dpdk_get_etheraddr,\
 netdev_dpdk_get_mtu,  \
 netdev_dpdk_set_mtu,  \
+netdev_dpdk_set_ingress_sched,\
 netdev_dpdk_get_ifindex,  \
 GET_CARRIER,  \
 netdev_dpdk_get_carrier_resets,   \
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index f731af1..0c36d2d 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -1378,6 +1378,7 @@ netdev_dummy_update_flags(struct netdev *netdev_,
 netdev_dummy_get_etheraddr, \
 netdev_dummy_get_mtu,   \
 netdev_dummy_set_mtu,   \
+NULL,   /* set_ingress_sched */ \
 netdev_dummy_get_ifindex,   \
 NULL,   /* get_carrier */   \
 NULL,   /* get_carrier_resets */\
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 2ff3e2b..00cbe17 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -2847,6 +2847,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum 
netdev_flags off,
 netdev_linux_get_etheraddr, \
 netdev_linux_get_mtu,   \
 netdev_linux_set_mtu,   \
+NULL,   /* set_ingress_sched */ \
 netdev_linux_get_ifindex,   \
 netdev_linux_get_carrier,   \
 netdev_linux_get_carrier_resets,\
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index b3c57d5..6cbd066 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -34,6 +34,7 @@ extern "C" {
 
 struct netdev_tnl_build_header_params;
 #define NETDEV_NUMA_UNSPEC OVS_NUMA_UNSPEC
+#define NETDEV_PRIO_RXQ_UNSPEC (-1)
 
 /* A network device (e.g. an Ethernet device).
  *
@@ -76,6 +77,7 @@ struct netdev {
  * modify them. */
 int n_txq;
 int n_rxq;
+int priority_rxq;   /* id of prioritized rxq. -1 = None */
 int ref_cnt;/* Times this devices was opened. */
 struct shash_node *node;/* Pointer to element in global map. */
 struct ovs_list saved_flags_list; /* Contains "struct netdev_saved_flags". 
*/
@@ -412,6 +414,14 @@ struct netde

[ovs-dev] [PATCH 0/4] prioritizing latency sensitive traffic

2017-08-17 Thread Billy O'Mahony
Hi All,

v2: Addresses various review comments; Applies cleanly on 0bedb3d6.

This patch set provides a method to request ingress scheduling on interfaces.
It also provides an implementation of the same for DPDK physical ports.

This allows specific packet types to be:
* forwarded to their destination port ahead of other packets.
and/or
* be less likely to be dropped in an overloaded situation.

It was previously discussed
https://mail.openvswitch.org/pipermail/ovs-discuss/2017-May/044395.html
and RFC'd
https://mail.openvswitch.org/pipermail/ovs-dev/2017-July/335237.html

Limitations of this patch:
* The patch uses the Flow Director filter API in DPDK and has only been tested
 on Fortville (XL710) NIC.
* Prioritization is limited to:
** eth_type
** Fully specified 5-tuple src & dst ip and port numbers for UDP & TCP packets
* ovs-appctl dpif-netdev/pmd-*-show o/p should indicate rxq prioritization.
* any requirements for a more granular prioritization mechanism

Initial results:
* even when userspace OVS is very much overloaded and
  dropping significant numbers of packets the drop rate for prioritized traffic
  is running at 1/1000th of the drop rate for non-prioritized traffic.

* the latency profile of prioritized traffic through userspace OVS is also much
  improved

1e0 |*
|*
1e-1|* | Non-prioritized pkt latency
|* * Prioritized pkt latency
1e-2|*
|*
1e-3|*   |
|*   |
1e-4|*   | | |
|*   |*| |
1e-5|*   |*| | |
|*   |*|*| |  |
1e-6|*   |*|*|*|  |
|*   |*|*|*|* |
1e-7|*   |*|*|*|* |*
|*   |*|*|*|* |*
1e-8|*   |*|*|*|* |*
  0-1 1-20 20-40 40-50 50-60 60-70 ... 120-400
Latency (us)

 Proportion of packets per latency bin @ 80% Max Throughput
  (Log scale)


Regards,
Billy.

billy O'Mahony (4):
  netdev: Add set_ingress_sched to netdev api
  netdev-dpdk: Apply ingress_sched config to dpdk phy ports
  dpif-netdev: Add rxq prioritization
  docs: Document ingress scheduling feature

 Documentation/howto/dpdk.rst|  31 +++
 include/openvswitch/ofp-parse.h |   3 +
 lib/dpif-netdev.c   |  25 --
 lib/netdev-bsd.c|   1 +
 lib/netdev-dpdk.c   | 192 +++-
 lib/netdev-dummy.c  |   1 +
 lib/netdev-linux.c  |   1 +
 lib/netdev-provider.h   |  10 +++
 lib/netdev-vport.c  |   1 +
 lib/netdev.c|  22 +
 lib/netdev.h|   1 +
 vswitchd/bridge.c   |   4 +
 vswitchd/vswitch.xml|  31 +++
 13 files changed, 315 insertions(+), 8 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 3/4] dpif-netdev: Add rxq prioritization

2017-07-20 Thread Billy O'Mahony
If an rxq is marked as 'prioritized' then keep reading from this queue until
there are no packets available. Only then proceed to other queues.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 lib/dpif-netdev.c | 24 ++--
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index d35566f..3a67ce2 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -484,6 +484,7 @@ struct dp_netdev_pmd_cycles {
 struct polled_queue {
 struct netdev_rxq *rx;
 odp_port_t port_no;
+uint8_t is_priority;
 };
 
 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
@@ -3700,6 +3701,8 @@ pmd_load_queues_and_ports(struct dp_netdev_pmd_thread 
*pmd,
 HMAP_FOR_EACH (poll, node, >poll_list) {
 poll_list[i].rx = poll->rxq->rx;
 poll_list[i].port_no = poll->rxq->port->port_no;
+poll_list[i].is_priority = \
+(poll->rxq->rx->queue_id == poll->rxq->rx->netdev->priority_rxq);
 i++;
 }
 
@@ -3747,15 +3750,24 @@ reload:
 lc = UINT_MAX;
 }
 
+unsigned int log_cnt = 0;
+int streak_len;
+const unsigned int MAX_STREAK_LEN = 100;
 cycles_count_start(pmd);
 for (;;) {
+log_cnt++;
 for (i = 0; i < poll_cnt; i++) {
-process_packets =
-dp_netdev_process_rxq_port(pmd, poll_list[i].rx,
-   poll_list[i].port_no);
-cycles_count_intermediate(pmd,
-  process_packets ? PMD_CYCLES_PROCESSING
-  : PMD_CYCLES_IDLE);
+streak_len = 0;
+do {
+process_packets =
+dp_netdev_process_rxq_port(pmd, poll_list[i].rx,
+   poll_list[i].port_no);
+cycles_count_intermediate(pmd,
+   process_packets ? PMD_CYCLES_PROCESSING
+   : PMD_CYCLES_IDLE);
+streak_len++;
+} while (process_packets && poll_list[i].is_priority &&
+ streak_len < MAX_STREAK_LEN);
 }
 
 if (lc++ > 1024) {
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 4/4] docs: Document ingress scheduling feature

2017-07-20 Thread Billy O'Mahony
Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 Documentation/howto/dpdk.rst | 31 +++
 vswitchd/vswitch.xml | 31 +++
 2 files changed, 62 insertions(+)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index af01d3e..c08fe79 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -188,6 +188,37 @@ respective parameter. To disable the flow control at tx 
side, run::
 
 $ ovs-vsctl set Interface dpdk-p0 options:tx-flow-ctrl=false
 
+Ingress Scheduling
+--
+
+The ingress scheduling feature is described in general in
+``ovs-vswitchd.conf.db (5)``.
+
+Interfaces of type ``dpdk`` support ingress scheduling only for
+either ether_type or else a fully specified combination of src and
+dst ip address and port numbers for TCP or UDP packets.
+
+To prioritize packets for Precision Time Protocol:
+
+$ ovs-vsctl set Interface dpdk-p0 \
+other_config:ingress_sched=eth_type=0x88F7
+
+To prioritize UDP packets between specific IP source and destination:
+
+$ ovs-vsctl set Interface dpdk-p0 \
+other_config:ingress_sched=udp,ip_src=1.1.1.1,ip_dst=2.2.2.2,\
+udp_src=11,udp_dst=22
+
+If unsupported ingress scheduling configuration is specified or it cannot be
+applied for any reason a warning message is logged and the Interface operates
+as if no ingress scheduling was configured.
+
+Interfaces of type ``dpdkvhostuserclient``, ``dpdkr`` and ``dpdkvhostuser`` do
+not support ingress scheduling.
+
+Currently only the match fields listed above are supported. No wildcarding of
+fields is supported.
+
 pdump
 -
 
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 883ecd8..b5302b6 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -2924,6 +2924,37 @@
   
 
 
+
+  
+   Packets matching the ingress_sched value are prioritized. This means
+   some combination of:
+  
+  
+
+ prioritized packets are forwarded to their destination port before
+ non-prioritized
+
+
+ prioritized packets are less likely to be dropped in an overloaded
+ situation than non-prioritized packets
+
+  
+  
+   Ingress scheduling is supported with the best effort of the Interface.
+   It may be dependent on the interface type and its supporting
+   implementation devices. Different interface types may have different
+   levels of support for the feature and the same interface type attached
+   to different devices (physical NICs or vhost ports, device driver,
+   NIC model) may also offer different levels of support.
+  
+  
+
+ The format of the ingress_sched field is specified in ovs-fields(7) in
+ the ``Matching'' and ``FIELD REFERENCE'' sections.
+
+  
+
+
 
   
 BFD, defined in RFC 5880 and RFC 5881, allows point-to-point
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 2/4] netdev-dpdk: Apply ingress_sched config to dpdk phy ports

2017-07-20 Thread Billy O'Mahony
Ingress scheduling configuration is given effect by way of Flow Director
filters. A small subset of the ingress scheduling possible is
implemented in this patch.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 include/openvswitch/ofp-parse.h |   3 +
 lib/dpif-netdev.c   |   1 +
 lib/netdev-dpdk.c   | 167 ++--
 vswitchd/bridge.c   |   2 +
 4 files changed, 166 insertions(+), 7 deletions(-)

diff --git a/include/openvswitch/ofp-parse.h b/include/openvswitch/ofp-parse.h
index fc5784e..08d6086 100644
--- a/include/openvswitch/ofp-parse.h
+++ b/include/openvswitch/ofp-parse.h
@@ -37,6 +37,9 @@ struct ofputil_table_mod;
 struct ofputil_bundle_msg;
 struct ofputil_tlv_table_mod;
 struct simap;
+struct tun_table;
+struct flow_wildcards;
+struct ofputil_port_map;
 enum ofputil_protocol;
 
 char *parse_ofp_str(struct ofputil_flow_mod *, int command, const char *str_,
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 47a9fa0..d35566f 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -44,6 +44,7 @@
 #include "dp-packet.h"
 #include "dpif.h"
 #include "dpif-provider.h"
+#include "netdev-provider.h"
 #include "dummy.h"
 #include "fat-rwlock.h"
 #include "flow.h"
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index e74c50f..e393abf 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -33,6 +33,8 @@
 #include 
 #include 
 
+#include 
+#include 
 #include "dirs.h"
 #include "dp-packet.h"
 #include "dpdk.h"
@@ -169,6 +171,10 @@ static const struct rte_eth_conf port_conf = {
 .txmode = {
 .mq_mode = ETH_MQ_TX_NONE,
 },
+.fdir_conf = {
+.mode = RTE_FDIR_MODE_PERFECT,
+},
+
 };
 
 enum { DPDK_RING_SIZE = 256 };
@@ -330,6 +336,11 @@ enum dpdk_hw_ol_features {
 NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
 };
 
+union ingress_filter {
+struct rte_eth_ethertype_filter ethertype;
+struct rte_eth_fdir_filter fdir;
+};
+
 struct netdev_dpdk {
 struct netdev up;
 dpdk_port_t port_id;
@@ -369,8 +380,11 @@ struct netdev_dpdk {
 /* If true, device was attached by rte_eth_dev_attach(). */
 bool attached;
 
-/* Ingress Scheduling config */
+/* Ingress Scheduling config & state. */
 char *ingress_sched_str;
+bool ingress_sched_changed;
+enum rte_filter_type ingress_filter_type;
+union ingress_filter ingress_filter;
 
 /* In dpdk_list. */
 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
@@ -653,6 +667,15 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int 
n_rxq, int n_txq)
 int i;
 struct rte_eth_conf conf = port_conf;
 
+/* Ingress scheduling requires ETH_MQ_RX_NONE so limit it to when exactly
+ * two rxqs are defined. Otherwise MQ will not work as expected. */
+if (dev->ingress_sched_str && n_rxq == 2) {
+conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
+}
+else {
+conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+}
+
 if (dev->mtu > ETHER_MTU) {
 conf.rxmode.jumbo_frame = 1;
 conf.rxmode.max_rx_pkt_len = dev->max_packet_len;
@@ -730,6 +753,121 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) 
OVS_REQUIRES(dev->mutex)
 }
 }
 
+static void
+dpdk_apply_ingress_scheduling(struct netdev_dpdk *dev, int n_rxq)
+{
+if (!dev->ingress_sched_str) {
+return;
+}
+
+if (n_rxq != 2) {
+VLOG_ERR("Interface %s: Ingress scheduling config ignored; " \
+ "Requires n_rxq==2.", dev->up.name);
+}
+
+int priority_q_id = n_rxq-1;
+char *key, *val, *str, *iter;
+
+ovs_be32 ip_src, ip_dst;
+ip_src = ip_dst = 0;
+
+uint16_t eth_type, port_src, port_dst;
+eth_type = port_src = port_dst = 0;
+uint8_t ip_proto = 0;
+int diag = 0;
+
+/* delete any existing filter */
+if (dev->ingress_filter_type == RTE_ETH_FILTER_FDIR) {
+diag = rte_eth_dev_filter_ctrl(dev->port_id, RTE_ETH_FILTER_FDIR,
+RTE_ETH_FILTER_DELETE, >ingress_filter.fdir);
+} else if (dev->ingress_filter_type == RTE_ETH_FILTER_ETHERTYPE) {
+diag = rte_eth_dev_filter_ctrl(dev->port_id, RTE_ETH_FILTER_ETHERTYPE,
+RTE_ETH_FILTER_DELETE, >ingress_filter.ethertype);
+}
+
+char *mallocd_str; /* str_to_x returns malloc'd str we'll need to free */
+/* Parse the configuration into local vars */
+iter = str = xstrdup(dev->ingress_sched_str);
+while (ofputil_parse_key_value (, , )) {
+if (strcmp(key, "nw_src") == 0 || strcmp(key, "ip_src") == 0) {
+mallocd_str = str_to_ip(val, _src);
+} else if (strcmp(key, "nw_dst") == 0 || strcmp(key, "ip_dst") == 0) {
+mallocd_str = str_to_ip(val, _dst);
+} else if (strcmp(key, "dl_typ

[ovs-dev] [RFC v2 4/4] docs: Document ingress scheduling feature

2017-07-11 Thread Billy O'Mahony
Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 Documentation/howto/dpdk.rst | 31 +++
 vswitchd/vswitch.xml | 31 +++
 2 files changed, 62 insertions(+)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index 93248b4..07fb97d 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -188,6 +188,37 @@ respective parameter. To disable the flow control at tx 
side, run::
 
 $ ovs-vsctl set Interface dpdk-p0 options:tx-flow-ctrl=false
 
+Ingress Scheduling
+--
+
+The ingress scheduling feature is described in general in
+``ovs-vswitchd.conf.db (5)``.
+
+Interfaces of type ``dpdk`` support ingress scheduling only for
+either ether_type or else a fully specified combination of src and
+dst ip address and port numbers for TCP or UDP packets.
+
+To prioritize packets for Precision Time Protocol:
+
+$ ovs-vsctl set Interface dpdk-p0 \
+other_config:ingress_sched=eth_type=0x88F7
+
+To prioritize UDP packets between specific IP source and destination:
+
+$ ovs-vsctl set Interface dpdk-p0 \
+other_config:ingress_sched=udp,ip_src=1.1.1.1,ip_dst=2.2.2.2,\
+udp_src=11,udp_dst=22
+
+If unsupported ingress scheduling configuration is specified or it cannot be
+applied for any reason a warning message is logged and the Interface operates
+as if no ingress scheduling was configured.
+
+Interfaces of type ``dpdkvhostuserclient``, ``dpdkr`` and ``dpdkvhostuser`` do
+not support ingress scheduling.
+
+Currently only the match fields listed above are supported. No wildcarding of
+fields is supported.
+
 pdump
 -
 
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 0bf986d..299d725 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -2842,6 +2842,37 @@
   
 
 
+
+  
+   Packets matching the ingress_sched value are prioritized. This means
+   some combination of:
+  
+  
+
+ prioritized packets are forwarded to their destination port before
+ non-prioritized
+
+
+ prioritized packets are less likely to be dropped in an overloaded
+ situation than non-prioritized packets
+
+  
+  
+   Ingress scheduling is supported with the best effort of the Interface.
+   It may be dependent on the interface type and its supporting
+   implementation devices. Different interface types may have different
+   levels of support for the feature and the same interface type attached
+   to different devices (physical NICs or vhost ports, device driver,
+   NIC model) may also offer different levels of support.
+  
+  
+
+ The format of the ingress_sched field is specified in ovs-fields(7) in
+ the ``Matching'' and ``FIELD REFERENCE'' sections.
+
+  
+
+
 
   
 BFD, defined in RFC 5880 and RFC 5881, allows point-to-point
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [RFC v2 3/4] dpif-netdev: Add rxq prioritization

2017-07-11 Thread Billy O'Mahony
If an rxq is marked as 'prioritized' then keep reading from this queue until
there are no packets available. Only then proceed to other queues.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 lib/dpif-netdev.c | 23 ---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 66712c7..0dca0f5 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -480,6 +480,7 @@ struct dp_netdev_pmd_cycles {
 struct polled_queue {
 struct netdev_rxq *rx;
 odp_port_t port_no;
+uint8_t is_priority;
 };
 
 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
@@ -3080,19 +3081,21 @@ cycles_count_end(struct dp_netdev_pmd_thread *pmd,
 non_atomic_ullong_add(>cycles.n[type], interval);
 }
 
-static void
+static unsigned int
 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
struct netdev_rxq *rx,
odp_port_t port_no)
 {
 struct dp_packet_batch batch;
 int error;
+unsigned int pkt_cnt = 0;
 
 dp_packet_batch_init();
 cycles_count_start(pmd);
 error = netdev_rxq_recv(rx, );
 cycles_count_end(pmd, PMD_CYCLES_POLLING);
 if (!error) {
+ pkt_cnt = batch.count;
 *recirc_depth_get() = 0;
 
 cycles_count_start(pmd);
@@ -3104,6 +3107,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 VLOG_ERR_RL(, "error receiving data from %s: %s",
 netdev_rxq_get_name(rx), ovs_strerror(error));
 }
+return pkt_cnt;
 }
 
 static struct tx_port *
@@ -3685,6 +3689,8 @@ pmd_load_queues_and_ports(struct dp_netdev_pmd_thread 
*pmd,
 HMAP_FOR_EACH (poll, node, >poll_list) {
 poll_list[i].rx = poll->rxq->rx;
 poll_list[i].port_no = poll->rxq->port->port_no;
+poll_list[i].is_priority = \
+(poll->rxq->rx->queue_id == poll->rxq->rx->netdev->priority_rxq);
 i++;
 }
 
@@ -3731,10 +3737,21 @@ reload:
 lc = UINT_MAX;
 }
 
+unsigned int log_cnt = 0;
+int rxd_cnt;
+int streak_len;
+const unsigned int MAX_STREAK_LEN = 100;
+
 for (;;) {
+log_cnt++;
 for (i = 0; i < poll_cnt; i++) {
-dp_netdev_process_rxq_port(pmd, poll_list[i].rx,
-   poll_list[i].port_no);
+streak_len = 0;
+do {
+rxd_cnt = dp_netdev_process_rxq_port(pmd, poll_list[i].rx,
+ poll_list[i].port_no);
+streak_len++;
+} while (rxd_cnt && poll_list[i].is_priority &&
+ streak_len < MAX_STREAK_LEN);
 }
 
 if (lc++ > 1024) {
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [RFC v2 2/4] netdev-dpdk: Apply ingress_sched config to dpdk phy ports

2017-07-11 Thread Billy O'Mahony
Ingress scheduling configuration is given effect by way of Flow Director
filters. A small subset of the possible ingress scheduling is
implemented in this patch.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 include/openvswitch/ofp-parse.h |   3 ++
 lib/dpif-netdev.c   |   1 +
 lib/netdev-dpdk.c   | 117 
 3 files changed, 121 insertions(+)

diff --git a/include/openvswitch/ofp-parse.h b/include/openvswitch/ofp-parse.h
index fc5784e..08d6086 100644
--- a/include/openvswitch/ofp-parse.h
+++ b/include/openvswitch/ofp-parse.h
@@ -37,6 +37,9 @@ struct ofputil_table_mod;
 struct ofputil_bundle_msg;
 struct ofputil_tlv_table_mod;
 struct simap;
+struct tun_table;
+struct flow_wildcards;
+struct ofputil_port_map;
 enum ofputil_protocol;
 
 char *parse_ofp_str(struct ofputil_flow_mod *, int command, const char *str_,
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 2f224db..66712c7 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -44,6 +44,7 @@
 #include "dp-packet.h"
 #include "dpif.h"
 #include "dpif-provider.h"
+#include "netdev-provider.h"
 #include "dummy.h"
 #include "fat-rwlock.h"
 #include "flow.h"
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index d14c381..93556e7 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -33,6 +33,8 @@
 #include 
 #include 
 
+#include 
+#include 
 #include "dirs.h"
 #include "dp-packet.h"
 #include "dpdk.h"
@@ -168,6 +170,10 @@ static const struct rte_eth_conf port_conf = {
 .txmode = {
 .mq_mode = ETH_MQ_TX_NONE,
 },
+.fdir_conf = {
+.mode = RTE_FDIR_MODE_PERFECT,
+},
+
 };
 
 enum { DPDK_RING_SIZE = 256 };
@@ -652,6 +658,15 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int 
n_rxq, int n_txq)
 int i;
 struct rte_eth_conf conf = port_conf;
 
+/* Ingress scheduling requires ETH_MQ_RX_NONE so limit it to when exactly
+ * two rxqs are defined. Otherwise MQ will not work as expected. */
+if (dev->ingress_sched_str && n_rxq == 2) {
+conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
+}
+else {
+conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+}
+
 if (dev->mtu > ETHER_MTU) {
 conf.rxmode.jumbo_frame = 1;
 conf.rxmode.max_rx_pkt_len = dev->max_packet_len;
@@ -752,6 +767,106 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) 
OVS_REQUIRES(dev->mutex)
 }
 }
 
+static void
+dpdk_apply_ingress_scheduling(struct netdev_dpdk *dev, int n_rxq)
+{
+if (!dev->ingress_sched_str) {
+return;
+}
+
+if (n_rxq != 2) {
+VLOG_ERR("Interface %s: Ingress scheduling config ignored; " \
+ "Requires n_rxq==2.", dev->up.name);
+}
+
+int priority_q_id = n_rxq-1;
+char *key, *val, *str, *iter;
+
+ovs_be32 ip_src, ip_dst;
+ip_src = ip_dst = 0;
+
+uint16_t eth_type, port_src, port_dst;
+eth_type = port_src = port_dst = 0;
+uint8_t ip_proto = 0;
+
+char *mallocd_str; /* str_to_x returns malloc'd str we'll need to free */
+/* Parse the configuration into local vars */
+iter = str = xstrdup(dev->ingress_sched_str);
+while (ofputil_parse_key_value (, , )) {
+if (strcmp(key, "nw_src") == 0 || strcmp(key, "ip_src") == 0) {
+mallocd_str = str_to_ip(val, _src);
+} else if (strcmp(key, "nw_dst") == 0 || strcmp(key, "ip_dst") == 0) {
+mallocd_str = str_to_ip(val, _dst);
+} else if (strcmp(key, "dl_type") == 0 ||
+   strcmp(key, "eth_type") == 0) {
+mallocd_str = str_to_u16(val, "eth_type/dl_type", _type);
+} else if (strcmp(key, "tcp_src") == 0 ||
+  strcmp(key, "tp_src") == 0 ||
+  strcmp(key, "udp_src") == 0) {
+mallocd_str = str_to_u16(val, "tcp/udp_src", _src);
+} else if (strcmp(key, "tcp_dst") == 0 ||
+   strcmp(key, "tp_dst") == 0 ||
+   strcmp(key, "udp_dst") == 0) {
+mallocd_str = str_to_u16(val, "tcp/udp_dst", _dst);
+} else if (strcmp(key, "ip") == 0) {
+eth_type = 0x0800;
+} else if (strcmp(key, "udp") == 0) {
+eth_type = 0x0800;
+ip_proto = 17;
+} else if (strcmp(key, "tcp") == 0) {
+eth_type = 0x0800;
+ip_proto = 6;
+} else {
+VLOG_WARN("Ignoring unsupported ingress scheduling field '%s'", \
+  key);
+}
+if (mallocd_str) {
+VLOG_ERR ("%s", mallocd_str);
+free(mallocd_str);
+mallocd_str = NULL;
+}
+}

[ovs-dev] [RFC v2 1/4] netdev: Add set_ingress_sched to netdev api

2017-07-11 Thread Billy O'Mahony
Passes ingress_sched config item from other_config column of Interface
table to the netdev.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 lib/netdev-bsd.c  |  1 +
 lib/netdev-dpdk.c | 19 +++
 lib/netdev-dummy.c|  1 +
 lib/netdev-linux.c|  1 +
 lib/netdev-provider.h | 10 ++
 lib/netdev-vport.c|  1 +
 lib/netdev.c  | 22 ++
 lib/netdev.h  |  1 +
 vswitchd/bridge.c |  2 ++
 9 files changed, 58 insertions(+)

diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
index f863a18..5582b0f 100644
--- a/lib/netdev-bsd.c
+++ b/lib/netdev-bsd.c
@@ -1509,6 +1509,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum 
netdev_flags off,
 netdev_bsd_get_etheraddr,\
 netdev_bsd_get_mtu,  \
 NULL, /* set_mtu */  \
+NULL, /* set_ingress_sched */\
 netdev_bsd_get_ifindex,  \
 netdev_bsd_get_carrier,  \
 NULL, /* get_carrier_resets */   \
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 2c92bd3..d14c381 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -368,6 +368,9 @@ struct netdev_dpdk {
 /* If true, device was attached by rte_eth_dev_attach(). */
 bool attached;
 
+/* Ingress Scheduling config */
+char *ingress_sched_str;
+
 /* In dpdk_list. */
 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
 
@@ -1028,6 +1031,7 @@ netdev_dpdk_destruct(struct netdev *netdev)
 }
 
 free(dev->devargs);
+free(dev->ingress_sched_str);
 common_destruct(dev);
 
 ovs_mutex_unlock(_mutex);
@@ -1963,6 +1967,20 @@ netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
 }
 
 static int
+netdev_dpdk_set_ingress_sched(struct netdev *netdev,
+  const char *ingress_sched_str)
+{
+struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+
+free(dev->ingress_sched_str);
+if (ingress_sched_str) {
+dev->ingress_sched_str = xstrdup(ingress_sched_str);
+}
+
+return 0;
+}
+
+static int
 netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
 
 static int
@@ -3268,6 +3286,7 @@ unlock:
 netdev_dpdk_get_etheraddr,\
 netdev_dpdk_get_mtu,  \
 netdev_dpdk_set_mtu,  \
+netdev_dpdk_set_ingress_sched,\
 netdev_dpdk_get_ifindex,  \
 GET_CARRIER,  \
 netdev_dpdk_get_carrier_resets,   \
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index d189a86..9c8a9aa 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -1374,6 +1374,7 @@ netdev_dummy_update_flags(struct netdev *netdev_,
 netdev_dummy_get_etheraddr, \
 netdev_dummy_get_mtu,   \
 netdev_dummy_set_mtu,   \
+NULL,   /* set_ingress_sched */ \
 netdev_dummy_get_ifindex,   \
 NULL,   /* get_carrier */   \
 NULL,   /* get_carrier_resets */\
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 44dfac5..4186b24 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -2827,6 +2827,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum 
netdev_flags off,
 netdev_linux_get_etheraddr, \
 netdev_linux_get_mtu,   \
 netdev_linux_set_mtu,   \
+NULL,   /* set_ingress_sched */ \
 netdev_linux_get_ifindex,   \
 netdev_linux_get_carrier,   \
 netdev_linux_get_carrier_resets,\
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index 79143d2..36c1c5e 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -34,6 +34,7 @@ extern "C" {
 
 struct netdev_tnl_build_header_params;
 #define NETDEV_NUMA_UNSPEC OVS_NUMA_UNSPEC
+#define NETDEV_UNSPEC (-1)
 
 /* A network device (e.g. an Ethernet device).
  *
@@ -72,6 +73,7 @@ struct netdev {
  * modify them. */
 int n_txq;
 int n_rxq;
+int priority_rxq;   /* id of prioritized rxq. -1 = None */
 int ref_cnt;/* Times this devices was opened. */
 struct shash_node *node;/* Pointer to element in global map. */
 struct ovs_list saved_flags_list; /* Contains "struct netdev_saved_flags". 
*/
@@ -408,6 +410,14 @@ struct netdev_class {
  * null if it would always return EOPNOTSUPP

[ovs-dev] [RFC v2 0/4] prioritizing latency sensitive traffic

2017-07-11 Thread Billy O'Mahony
Hi All,

rather than waiting for more polished PATCH v1 next week as I suggested this
morning please find v2 RFC patchset updated with documentation changes and also
some results from testing with this patchset.

Initial results:
* even when userspace OVS is very much overloaded and
  dropping significant numbers of packets the drop rate for prioritized traffic
  is running at 1/1000th of the drop rate for non-prioritized traffic.

* the latency profile of prioritized traffic through userspace OVS is also much
  improved

1e0 |*
|*
1e-1|* | Non-prioritized pkt latency
|* * Prioritized pkt latency
1e-2|*
|*
1e-3|*   |
|*   |
1e-4|*   | | |
|*   |*| |
1e-5|*   |*| | |
|*   |*|*| |  |
1e-6|*   |*|*|*|  |
|*   |*|*|*|* |
1e-7|*   |*|*|*|* |*
|*   |*|*|*|* |*
1e-8|*   |*|*|*|* |*
  0-1 1-20 20-40 40-50 50-60 60-70 ... 120-400
Latency (us)

 Proportion of packets per latency bin @ 80% Max Throughput
  (Log scale)


The patch works for the supported field types but should not be considered
complete - at this stage I'd like to get an idea if people agree with
general layout of the implementation.

With this patchset prioritized ingress scheduling can be achieved based on:
* eth_type
* src & dst ip and port numbers for UDP & TCP packets

The patch uses Flow Director filter API in DPDK and has only been tested on a
Fortville (XL710) NIC.

This is RFC so there are things it does not take into account:
* reconfiguration - of the ingress_sched config of pmd masks and so on.
* ovs-appctl dpif-netdev/pmd-*-show o/p should indicate rxq prioritization.
* any requirements for a more granular prioritization mechanism
* ...

Regards,
Billy.


Billy O'Mahony (4):
  netdev: Add set_ingress_sched to netdev api
  netdev-dpdk: Apply ingress_sched config to dpdk phy ports
  dpif-netdev: Add rxq prioritization
  docs: Document ingress scheduling feature

 Documentation/howto/dpdk.rst|  31 +
 include/openvswitch/ofp-parse.h |   3 +
 lib/dpif-netdev.c   |  24 ++-
 lib/netdev-bsd.c|   1 +
 lib/netdev-dpdk.c   | 136 
 lib/netdev-dummy.c  |   1 +
 lib/netdev-linux.c  |   1 +
 lib/netdev-provider.h   |  10 +++
 lib/netdev-vport.c  |   1 +
 lib/netdev.c|  22 +++
 lib/netdev.h|   1 +
 vswitchd/bridge.c   |   2 +
 vswitchd/vswitch.xml|  31 +
 13 files changed, 261 insertions(+), 3 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v10] dpif-netdev: Assign ports to pmds on non-local numa node.

2017-07-10 Thread Billy O'Mahony
Previously if there is no available (non-isolated) pmd on the numa node
for a port then the port is not polled at all. This can result in a
non-operational system until such time as nics are physically
repositioned. It is preferable to operate with a pmd on the 'wrong' numa
node albeit with lower performance. Local pmds are still chosen when
available.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
Co-authored-by: Ilya Maximets <i.maxim...@samsung.com>
Tested-by: Ian Stokes <ian.sto...@intel.com>
Acked-by: Ian Stokes <ian.sto...@intel.com>
---
v10: tweak code comments, docs based on reviews
v9: v8 missed some comments on v7
v8: Some coding style issues; doc tweak
v7: Incorporate review comments on docs and implementation
v6: Change 'port' to 'queue' in a warning msg
v5: Fix warning msg; Update same in docs
v4: Fix a checkpatch error
v3: Fix warning messages not appearing when using multiqueue
v2: Add details of warning messages into docs

 Documentation/intro/install/dpdk.rst | 21 +++---
 lib/dpif-netdev.c| 42 +---
 2 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index e83f852..e1f1108 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -449,7 +449,7 @@ affinitized accordingly.
 
   A poll mode driver (pmd) thread handles the I/O of all DPDK interfaces
   assigned to it. A pmd thread shall poll the ports for incoming packets,
-  switch the packets and send to tx port.  pmd thread is CPU bound, and needs
+  switch the packets and send to tx port.  A pmd thread is CPU bound, and needs
   to be affinitized to isolated cores for optimum performance.
 
   By setting a bit in the mask, a pmd thread is created and pinned to the
@@ -458,8 +458,23 @@ affinitized accordingly.
   $ ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x4
 
   .. note::
-pmd thread on a NUMA node is only created if there is at least one DPDK
-interface from that NUMA node added to OVS.
+A pmd thread on a NUMA node is only created if there is at least one DPDK
+interface from that NUMA node added to OVS.  A pmd thread is created by
+default on a core of a NUMA node or when a specified pmd-cpu-mask has
+indicated so.  Even though a PMD thread may exist, the thread only starts
+consuming CPU cycles if there is at least one receive queue assigned to
+the pmd.
+
+  .. note::
+On NUMA systems PCI devices are also local to a NUMA node.  Unbound rx
+queues for a PCI device will be assigned to a pmd on its local NUMA node
+if a non-isolated PMD exists on that NUMA node.  If not, the queue will be
+assigned to a non-isolated pmd on a remote NUMA node.  This will result in
+reduced maximum throughput on that device and possibly on other devices
+assigned to that pmd thread. If such a queue assignment is made, a warning
+message will be logged: "There's no available (non-isolated) pmd thread on
+numa node N. Queue Q on port P will be assigned to the pmd on core C
+(numa node N'). Expect reduced performance."
 
 - QEMU vCPU thread Affinity
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 4e29085..07b2033 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3195,6 +3195,24 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
 return NULL;
 }
 
+/* Returns the next node in numa list following 'numa' in round-robin fashion.
+ * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
+ * Returns NULL if 'rr' numa list is empty. */
+static struct rr_numa *
+rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
+{
+struct hmap_node *node = NULL;
+
+if (numa) {
+node = hmap_next(&rr->numas, &numa->node);
+}
+if (!node) {
+node = hmap_first(&rr->numas);
+}
+
+return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
+}
+
 static void
 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
 {
@@ -3249,6 +3267,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 {
 struct dp_netdev_port *port;
 struct rr_numa_list rr;
+struct rr_numa *non_local_numa = NULL;
 
 rr_numa_list_populate(dp, &rr);
 
@@ -3281,11 +3300,28 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 }
 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
 if (!numa) {
-VLOG_WARN("There's no available (non isolated) pmd thread "
+/* There are no pmds on the queue's local NUMA node.
+   Round-robin on the NUMA nodes that do have pmds. */
+non_local_numa = r

[ovs-dev] [PATCH v9] dpif-netdev: Assign ports to pmds on non-local numa node.

2017-06-29 Thread Billy O'Mahony
Previously if there is no available (non-isolated) pmd on the numa node
for a port then the port is not polled at all. This can result in a
non-operational system until such time as nics are physically
repositioned. It is preferable to operate with a pmd on the 'wrong' numa
node albeit with lower performance. Local pmds are still chosen when
available.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
Co-authored-by: Ilya Maximets <i.maxim...@samsung.com>
---
v9: v8 missed some comments on v7
v8: Some coding style issues; doc tweak
v7: Incorporate review comments on docs and implementation
v6: Change 'port' to 'queue' in a warning msg
v5: Fix warning msg; Update same in docs
v4: Fix a checkpatch error
v3: Fix warning messages not appearing when using multiqueue
v2: Add details of warning messages into docs

 Documentation/intro/install/dpdk.rst | 21 +++---
 lib/dpif-netdev.c| 41 +---
 2 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index e83f852..89775d6 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -449,7 +449,7 @@ affinitized accordingly.
 
   A poll mode driver (pmd) thread handles the I/O of all DPDK interfaces
   assigned to it. A pmd thread shall poll the ports for incoming packets,
-  switch the packets and send to tx port.  pmd thread is CPU bound, and needs
+  switch the packets and send to tx port.  A pmd thread is CPU bound, and needs
   to be affinitized to isolated cores for optimum performance.
 
   By setting a bit in the mask, a pmd thread is created and pinned to the
@@ -458,8 +458,23 @@ affinitized accordingly.
   $ ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x4
 
   .. note::
-pmd thread on a NUMA node is only created if there is at least one DPDK
-interface from that NUMA node added to OVS.
+A pmd thread on a NUMA node is only created if there is at least one DPDK
+interface from that NUMA node added to OVS.  A pmd thread is created by
+default on a core of a NUMA node or when a specified pmd-cpu-mask has
+indicated so.  Even though a PMD thread may exist, the thread only starts
+consuming CPU cycles if there is at least one receive queue assigned to
+the pmd.
+
+  .. note::
+On NUMA systems PCI devices are also local to a NUMA node.  Unbound rx
+queues for a PCI device will be assigned to a pmd on its local NUMA node if a
+non-isolated PMD exists on that NUMA node.  If not, the queue will be
+assigned to a non-isolated pmd on a remote NUMA node.  This will result in
+reduced maximum throughput on that device and possibly on other devices
+assigned to that pmd thread. In case such a queue assignment is made, a
+warning message will be logged: "There's no available (non-isolated) pmd
+thread on numa node N. Queue Q on port P will be assigned to the pmd on
+core C (numa node N'). Expect reduced performance."
 
 - QEMU vCPU thread Affinity
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 4e29085..7557f32 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3195,6 +3195,23 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
 return NULL;
 }
 
+/* Returns next NUMA from rr list in round-robin fashion. Returns the first
+ * NUMA node if 'NULL' or the last node passed, and 'NULL' if list is empty. */
+static struct rr_numa *
+rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
+{
+struct hmap_node *node = NULL;
+
+if (numa) {
+node = hmap_next(&rr->numas, &numa->node);
+}
+if (!node) {
+node = hmap_first(&rr->numas);
+}
+
+return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
+}
+
 static void
 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
 {
@@ -3249,6 +3266,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 {
 struct dp_netdev_port *port;
 struct rr_numa_list rr;
+struct rr_numa *non_local_numa = NULL;
 
 rr_numa_list_populate(dp, &rr);
 
@@ -3281,11 +3299,28 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 }
 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
 if (!numa) {
-VLOG_WARN("There's no available (non isolated) pmd thread "
+/* There are no pmds on the queue's local NUMA node.
+   Round-robin on the NUMA nodes that do have pmds. */
+non_local_numa = rr_numa_list_next(&rr, non_local_numa);
+if (!non_local_numa) {
+VLOG_ERR("There is no available (non-isolated) pmd "
+   

[ovs-dev] [PATCH v8] dpif-netdev: Assign ports to pmds on non-local numa node.

2017-06-28 Thread Billy O'Mahony
Previously if there is no available (non-isolated) pmd on the numa node
for a port then the port is not polled at all. This can result in a
non-operational system until such time as nics are physically
repositioned. It is preferable to operate with a pmd on the 'wrong' numa
node albeit with lower performance. Local pmds are still chosen when
available.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
Co-authored-by: Ilya Maximets <i.maxim...@samsung.com>
---
v8: Some coding style issues; doc tweak
v7: Incorporate review comments on docs and implementation
v6: Change 'port' to 'queue' in a warning msg
v5: Fix warning msg; Update same in docs
v4: Fix a checkpatch error
v3: Fix warning messages not appearing when using multiqueue
v2: Add details of warning messages into docs

 Documentation/intro/install/dpdk.rst | 17 ---
 lib/dpif-netdev.c| 41 +---
 2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index e83f852..42d14d2 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -449,7 +449,7 @@ affinitized accordingly.
 
   A poll mode driver (pmd) thread handles the I/O of all DPDK interfaces
   assigned to it. A pmd thread shall poll the ports for incoming packets,
-  switch the packets and send to tx port.  pmd thread is CPU bound, and needs
+  switch the packets and send to tx port.  A pmd thread is CPU bound, and needs
   to be affinitized to isolated cores for optimum performance.
 
   By setting a bit in the mask, a pmd thread is created and pinned to the
@@ -458,8 +458,19 @@ affinitized accordingly.
   $ ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x4
 
   .. note::
-pmd thread on a NUMA node is only created if there is at least one DPDK
-interface from that NUMA node added to OVS.
+Even though a PMD thread may exist, the thread only starts consuming CPU
+cycles if there is at least one receive queue assigned to the pmd.
+
+  .. note::
+
+On NUMA systems PCI devices are also local to a NUMA node.  Unbound rx
+queues for PCI devices will be assigned to a pmd on its local NUMA node if
+pmd-cpu-mask has created a pmd thread on that NUMA node.  If not the queue
+will be assigned to a pmd on a remote NUMA node.  This will result in
+reduced maximum throughput on that device.  In case such a queue assignment
+is made a warning message will be logged: "There's no available (non-
+isolated) pmd thread on numa node N. Queue Q on port P will be assigned to
+the pmd on core C (numa node N'). Expect reduced performance."
 
 - QEMU vCPU thread Affinity
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 4e29085..7557f32 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3195,6 +3195,23 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
 return NULL;
 }
 
+/* Returns next NUMA from rr list in round-robin fashion. Returns the first
+ * NUMA node if 'NULL' or the last node passed, and 'NULL' if list is empty. */
+static struct rr_numa *
+rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
+{
+struct hmap_node *node = NULL;
+
+if (numa) {
+node = hmap_next(&rr->numas, &numa->node);
+}
+if (!node) {
+node = hmap_first(&rr->numas);
+}
+
+return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
+}
+
 static void
 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
 {
@@ -3249,6 +3266,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 {
 struct dp_netdev_port *port;
 struct rr_numa_list rr;
+struct rr_numa *non_local_numa = NULL;
 
 rr_numa_list_populate(dp, &rr);
 
@@ -3281,11 +3299,28 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 }
 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
 if (!numa) {
-VLOG_WARN("There's no available (non isolated) pmd thread "
+/* There are no pmds on the queue's local NUMA node.
+   Round-robin on the NUMA nodes that do have pmds. */
+non_local_numa = rr_numa_list_next(&rr, non_local_numa);
+if (!non_local_numa) {
+VLOG_ERR("There is no available (non-isolated) pmd "
+ "thread for port \'%s\' queue %d. This queue "
+ "will not be polled. Is pmd-cpu-mask set to "
+ "zero? Or are all PMDs isolated to other "
+ "queues?", netdev_get_name(port->netdev),
+ 

[ovs-dev] [PATCH v7] dpif-netdev: Assign ports to pmds on non-local numa node.

2017-06-27 Thread Billy O'Mahony
Previously if there is no available (non-isolated) pmd on the numa node
for a port then the port is not polled at all. This can result in a
non-operational system until such time as nics are physically
repositioned. It is preferable to operate with a pmd on the 'wrong' numa
node albeit with lower performance. Local pmds are still chosen when
available.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
Co-authored-by: Ilya Maximets <i.maxim...@samsung.com>
---
v7: Incorporate review comments on docs and implementation
v6: Change 'port' to 'queue' in a warning msg
v5: Fix warning msg; Update same in docs
v4: Fix a checkpatch error
v3: Fix warning messages not appearing when using multiqueue
v2: Add details of warning messages into docs

 Documentation/intro/install/dpdk.rst | 18 +---
 lib/dpif-netdev.c| 42 
 2 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index e83f852..a760fb6 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -449,7 +449,7 @@ affinitized accordingly.
 
   A poll mode driver (pmd) thread handles the I/O of all DPDK interfaces
   assigned to it. A pmd thread shall poll the ports for incoming packets,
-  switch the packets and send to tx port.  pmd thread is CPU bound, and needs
+  switch the packets and send to tx port.  A pmd thread is CPU bound, and needs
   to be affinitized to isolated cores for optimum performance.
 
   By setting a bit in the mask, a pmd thread is created and pinned to the
@@ -458,8 +458,20 @@ affinitized accordingly.
   $ ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x4
 
   .. note::
-pmd thread on a NUMA node is only created if there is at least one DPDK
-interface from that NUMA node added to OVS.
+While pmd threads are created based on pmd-cpu-mask, the thread only starts
+consuming CPU cycles if there is at least one receive queue assigned to the
+pmd.
+
+  .. note::
+
+On NUMA systems PCI devices are also local to a NUMA node.  Unbound Rx
+queues for PCI device will be assigned to a pmd on its local NUMA node if
+pmd-cpu-mask has created a pmd thread on that NUMA node.  If not the queue
+will be assigned to a pmd on a remote NUMA node.  This will result in
+reduced maximum throughput on that device.  In case such a queue assignment
+is made a warning message will be logged: "There's no available (non-
+isolated) pmd thread on numa node N. Queue Q on port P will be assigned to
+the pmd on core C (numa node N'). Expect reduced performance."
 
 - QEMU vCPU thread Affinity
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 4e29085..38a0fd3 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3195,6 +3195,23 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
 return NULL;
 }
 
+/* Returns next NUMA from rr list in round-robin fashion. Returns the first
+ * NUMA node if 'NULL' or the last node passed, and 'NULL' if list is empty. */
+static struct rr_numa *
+rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
+{
+struct hmap_node *node = NULL;
+
+if (numa) {
+node = hmap_next(&rr->numas, &numa->node);
+}
+if (!node) {
+node = hmap_first(&rr->numas);
+}
+
+return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
+}
+
 static void
 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
 {
@@ -3249,6 +3266,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 {
 struct dp_netdev_port *port;
 struct rr_numa_list rr;
+struct rr_numa * non_local_numa = NULL;
 
 rr_numa_list_populate(dp, &rr);
 
@@ -3262,7 +3280,6 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 
 numa_id = netdev_get_numa_id(port->netdev);
 numa = rr_numa_list_lookup(&rr, numa_id);
-
 for (int qid = 0; qid < port->n_rxq; qid++) {
 struct dp_netdev_rxq *q = &port->rxqs[qid];
 
@@ -3281,11 +3298,28 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 }
 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
 if (!numa) {
-VLOG_WARN("There's no available (non isolated) pmd thread "
+/* There are no pmds on the queue's local NUMA node.
+   Round-robin on the NUMA nodes that do have pmds. */
+non_local_numa = rr_numa_list_next(&rr, non_local_numa);
+if (!non_local_numa) {
+VLOG_ERR("There is no available (non-isolated) pmd "
+ "thread for port \'%s\' que

[ovs-dev] [PATCH v7] dpif-netdev: Assign ports to pmds on non-local numa node.

2017-06-27 Thread Billy O'Mahony
I have incorporated Darrell's doc comments and Ilya's implementation and
tested with various scenarios. I had to make a few small changes to ensure that
the warning messages were issued for each queue as per previous patch comments.

Billy O'Mahony (1):
  dpif-netdev: Assign ports to pmds on non-local numa node.

 Documentation/intro/install/dpdk.rst | 18 +---
 lib/dpif-netdev.c| 42 
 2 files changed, 53 insertions(+), 7 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [RFC PATCH 2/3] netdev-dpdk: Apply ingress_sched config to dpdk phy ports

2017-06-16 Thread Billy O'Mahony
Ingress scheduling configuration is given effect by way of Flow Director
filters. A small subset of the possible ingress scheduling possible is
implemented in this patch.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 include/openvswitch/ofp-parse.h |   3 ++
 lib/dpif-netdev.c   |   1 +
 lib/netdev-dpdk.c   | 117 
 3 files changed, 121 insertions(+)

diff --git a/include/openvswitch/ofp-parse.h b/include/openvswitch/ofp-parse.h
index fc5784e..08d6086 100644
--- a/include/openvswitch/ofp-parse.h
+++ b/include/openvswitch/ofp-parse.h
@@ -37,6 +37,9 @@ struct ofputil_table_mod;
 struct ofputil_bundle_msg;
 struct ofputil_tlv_table_mod;
 struct simap;
+struct tun_table;
+struct flow_wildcards;
+struct ofputil_port_map;
 enum ofputil_protocol;
 
 char *parse_ofp_str(struct ofputil_flow_mod *, int command, const char *str_,
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 2f224db..66712c7 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -44,6 +44,7 @@
 #include "dp-packet.h"
 #include "dpif.h"
 #include "dpif-provider.h"
+#include "netdev-provider.h"
 #include "dummy.h"
 #include "fat-rwlock.h"
 #include "flow.h"
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index d14c381..93556e7 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -33,6 +33,8 @@
 #include 
 #include 
 
+#include 
+#include 
 #include "dirs.h"
 #include "dp-packet.h"
 #include "dpdk.h"
@@ -168,6 +170,10 @@ static const struct rte_eth_conf port_conf = {
 .txmode = {
 .mq_mode = ETH_MQ_TX_NONE,
 },
+.fdir_conf = {
+.mode = RTE_FDIR_MODE_PERFECT,
+},
+
 };
 
 enum { DPDK_RING_SIZE = 256 };
@@ -652,6 +658,15 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int 
n_rxq, int n_txq)
 int i;
 struct rte_eth_conf conf = port_conf;
 
+/* Ingress scheduling requires ETH_MQ_RX_NONE so limit it to when exactly
+ * two rxqs are defined. Otherwise MQ will not work as expected. */
+if (dev->ingress_sched_str && n_rxq == 2) {
+conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
+}
+else {
+conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+}
+
 if (dev->mtu > ETHER_MTU) {
 conf.rxmode.jumbo_frame = 1;
 conf.rxmode.max_rx_pkt_len = dev->max_packet_len;
@@ -752,6 +767,106 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) 
OVS_REQUIRES(dev->mutex)
 }
 }
 
+static void
+dpdk_apply_ingress_scheduling(struct netdev_dpdk *dev, int n_rxq)
+{
+if (!dev->ingress_sched_str) {
+return;
+}
+
+if (n_rxq != 2) {
+VLOG_ERR("Interface %s: Ingress scheduling config ignored; " \
+ "Requires n_rxq==2.", dev->up.name);
+}
+
+int priority_q_id = n_rxq-1;
+char *key, *val, *str, *iter;
+
+ovs_be32 ip_src, ip_dst;
+ip_src = ip_dst = 0;
+
+uint16_t eth_type, port_src, port_dst;
+eth_type = port_src = port_dst = 0;
+uint8_t ip_proto = 0;
+
+char *mallocd_str; /* str_to_x returns malloc'd str we'll need to free */
+/* Parse the configuration into local vars */
+iter = str = xstrdup(dev->ingress_sched_str);
+while (ofputil_parse_key_value (&iter, &key, &val)) {
+if (strcmp(key, "nw_src") == 0 || strcmp(key, "ip_src") == 0) {
+mallocd_str = str_to_ip(val, &ip_src);
+} else if (strcmp(key, "nw_dst") == 0 || strcmp(key, "ip_dst") == 0) {
+mallocd_str = str_to_ip(val, &ip_dst);
+} else if (strcmp(key, "dl_type") == 0 ||
+   strcmp(key, "eth_type") == 0) {
+mallocd_str = str_to_u16(val, "eth_type/dl_type", &eth_type);
+} else if (strcmp(key, "tcp_src") == 0 ||
+  strcmp(key, "tp_src") == 0 ||
+  strcmp(key, "udp_src") == 0) {
+mallocd_str = str_to_u16(val, "tcp/udp_src", &port_src);
+} else if (strcmp(key, "tcp_dst") == 0 ||
+   strcmp(key, "tp_dst") == 0 ||
+   strcmp(key, "udp_dst") == 0) {
+mallocd_str = str_to_u16(val, "tcp/udp_dst", &port_dst);
+} else if (strcmp(key, "ip") == 0) {
+eth_type = 0x0800;
+} else if (strcmp(key, "udp") == 0) {
+eth_type = 0x0800;
+ip_proto = 17;
+} else if (strcmp(key, "tcp") == 0) {
+eth_type = 0x0800;
+ip_proto = 6;
+} else {
+VLOG_WARN("Ignoring unsupported ingress scheduling field '%s'", \
+  key);
+}
+if (mallocd_str) {
+VLOG_ERR ("%s", mallocd_str);
+free(mallocd_str);
+mallocd_str = NULL;
+}
+}

[ovs-dev] [RFC PATCH 1/3] netdev: Add set_ingress_sched to netdev api

2017-06-16 Thread Billy O'Mahony
Passes ingress_sched config item from other_config column of Interface
table to the netdev.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 lib/netdev-bsd.c  |  1 +
 lib/netdev-dpdk.c | 19 +++
 lib/netdev-dummy.c|  1 +
 lib/netdev-linux.c|  1 +
 lib/netdev-provider.h | 10 ++
 lib/netdev-vport.c|  1 +
 lib/netdev.c  | 22 ++
 lib/netdev.h  |  1 +
 vswitchd/bridge.c |  2 ++
 9 files changed, 58 insertions(+)

diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
index f863a18..5582b0f 100644
--- a/lib/netdev-bsd.c
+++ b/lib/netdev-bsd.c
@@ -1509,6 +1509,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum 
netdev_flags off,
 netdev_bsd_get_etheraddr,\
 netdev_bsd_get_mtu,  \
 NULL, /* set_mtu */  \
+NULL, /* set_ingress_sched */\
 netdev_bsd_get_ifindex,  \
 netdev_bsd_get_carrier,  \
 NULL, /* get_carrier_resets */   \
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 2c92bd3..d14c381 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -368,6 +368,9 @@ struct netdev_dpdk {
 /* If true, device was attached by rte_eth_dev_attach(). */
 bool attached;
 
+/* Ingress Scheduling config */
+char *ingress_sched_str;
+
 /* In dpdk_list. */
 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
 
@@ -1028,6 +1031,7 @@ netdev_dpdk_destruct(struct netdev *netdev)
 }
 
 free(dev->devargs);
+free(dev->ingress_sched_str);
 common_destruct(dev);
 
 ovs_mutex_unlock(_mutex);
@@ -1963,6 +1967,20 @@ netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
 }
 
 static int
+netdev_dpdk_set_ingress_sched(struct netdev *netdev,
+  const char *ingress_sched_str)
+{
+struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+
+free(dev->ingress_sched_str);
+if (ingress_sched_str) {
+dev->ingress_sched_str = xstrdup(ingress_sched_str);
+}
+
+return 0;
+}
+
+static int
 netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
 
 static int
@@ -3268,6 +3286,7 @@ unlock:
 netdev_dpdk_get_etheraddr,\
 netdev_dpdk_get_mtu,  \
 netdev_dpdk_set_mtu,  \
+netdev_dpdk_set_ingress_sched,\
 netdev_dpdk_get_ifindex,  \
 GET_CARRIER,  \
 netdev_dpdk_get_carrier_resets,   \
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index d189a86..9c8a9aa 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -1374,6 +1374,7 @@ netdev_dummy_update_flags(struct netdev *netdev_,
 netdev_dummy_get_etheraddr, \
 netdev_dummy_get_mtu,   \
 netdev_dummy_set_mtu,   \
+NULL,   /* set_ingress_sched */ \
 netdev_dummy_get_ifindex,   \
 NULL,   /* get_carrier */   \
 NULL,   /* get_carrier_resets */\
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 44dfac5..4186b24 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -2827,6 +2827,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum 
netdev_flags off,
 netdev_linux_get_etheraddr, \
 netdev_linux_get_mtu,   \
 netdev_linux_set_mtu,   \
+NULL,   /* set_ingress_sched */ \
 netdev_linux_get_ifindex,   \
 netdev_linux_get_carrier,   \
 netdev_linux_get_carrier_resets,\
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index 79143d2..36c1c5e 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -34,6 +34,7 @@ extern "C" {
 
 struct netdev_tnl_build_header_params;
 #define NETDEV_NUMA_UNSPEC OVS_NUMA_UNSPEC
+#define NETDEV_UNSPEC (-1)
 
 /* A network device (e.g. an Ethernet device).
  *
@@ -72,6 +73,7 @@ struct netdev {
  * modify them. */
 int n_txq;
 int n_rxq;
+int priority_rxq;   /* id of prioritized rxq. -1 = None */
 int ref_cnt;/* Times this devices was opened. */
 struct shash_node *node;/* Pointer to element in global map. */
 struct ovs_list saved_flags_list; /* Contains "struct netdev_saved_flags". 
*/
@@ -408,6 +410,14 @@ struct netdev_class {
  * null if it would always return EOPNOTSUPP

[ovs-dev] [PATCH v6] dpif-netdev: Assign ports to pmds on non-local numa node.

2017-05-10 Thread Billy O'Mahony
From: billyom <billy.o.mah...@intel.com>

Previously if there is no available (non-isolated) pmd on the numa node
for a port then the port is not polled at all. This can result in a
non-operational system until such time as nics are physically
repositioned. It is preferable to operate with a pmd on the 'wrong' numa
node albeit with lower performance. Local pmds are still chosen when
available.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
v6: Change 'port' to 'queue' in a warning msg
v5: Fix warning msg; Update same in docs
v4: Fix a checkpatch error
v3: Fix warning messages not appearing when using multiqueue
v2: Add details of warning messages into docs

 Documentation/intro/install/dpdk.rst | 10 +
 lib/dpif-netdev.c| 43 +++-
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index d1c0e65..7a66bff 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -460,6 +460,16 @@ affinitized accordingly.
 pmd thread on a NUMA node is only created if there is at least one DPDK
 interface from that NUMA node added to OVS.
 
+  .. note::
+   On NUMA systems PCI devices are also local to a NUMA node.  Rx queues for
+   PCI device will be assigned to a pmd on its local NUMA node if pmd-cpu-mask
+   has created a pmd thread on that NUMA node.  If not the queue will be
+   assigned to a pmd on a remote NUMA node.  This will result in reduced
+   maximum throughput on that device.  In the case such a queue assignment
+   is made, a warning message will be logged: "There's no available (non-
+   isolated) pmd thread on numa node N. Queue Q on port P will be assigned to
+   the pmd on core C (numa node N'). Expect reduced performance."
+
 - QEMU vCPU thread Affinity
 
   A VM performing simple packet forwarding or running complex packet pipelines
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index b3a0806..34f1963 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3149,10 +3149,13 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int 
numa_id)
 }
 
 static void
-rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
+rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr,
+  int *all_numa_ids, unsigned all_numa_ids_sz,
+  int *num_ids_written)
 {
 struct dp_netdev_pmd_thread *pmd;
 struct rr_numa *numa;
+unsigned idx = 0;
 
 hmap_init(&rr->numas);
 
@@ -3170,7 +3173,11 @@ rr_numa_list_populate(struct dp_netdev *dp, struct 
rr_numa_list *rr)
 numa->n_pmds++;
 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
 numa->pmds[numa->n_pmds - 1] = pmd;
+
+all_numa_ids[idx % all_numa_ids_sz] = pmd->numa_id;
+idx++;
 }
+*num_ids_written = idx;
 }
 
 static struct dp_netdev_pmd_thread *
@@ -3202,8 +3209,15 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 {
 struct dp_netdev_port *port;
 struct rr_numa_list rr;
+int all_numa_ids [64];
+int all_numa_ids_sz = sizeof all_numa_ids / sizeof all_numa_ids[0];
+unsigned all_numa_ids_idx = 0;
+int all_numa_ids_max_idx = 0;
+int num_numa_ids = 0;
 
-rr_numa_list_populate(dp, &rr);
+rr_numa_list_populate(dp, &rr, all_numa_ids, all_numa_ids_sz,
+  &num_numa_ids);
+all_numa_ids_max_idx = MIN(num_numa_ids - 1, all_numa_ids_sz - 1);
 
 HMAP_FOR_EACH (port, node, &dp->ports) {
 struct rr_numa *numa;
@@ -3234,10 +3248,29 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 }
 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
 if (!numa) {
-VLOG_WARN("There's no available (non isolated) pmd thread "
+if (all_numa_ids_max_idx < 0) {
+VLOG_ERR("There is no available (non-isolated) pmd "
+ "thread for port \'%s\' queue %d. This queue "
+ "will not be polled. Is pmd-cpu-mask set to "
+ "zero? Or are all PMDs isolated to other "
+ "queues?", netdev_get_name(port->netdev),
+ qid);
+continue;
+}
+int alt_numa_id = all_numa_ids[all_numa_ids_idx];
+struct rr_numa *alt_numa;
+alt_numa = rr_numa_list_lookup(&rr, alt_numa_id);
+q->pmd = rr_numa_get_pmd(alt_numa);
+VLOG_WARN("There's no available (non-isolated) pmd thread "
   "on numa n

[ovs-dev] [PATCH v5] dpif-netdev: Assign ports to pmds on non-local numa node.

2017-05-10 Thread Billy O'Mahony
From: billyom <billy.o.mah...@intel.com>

Previously if there is no available (non-isolated) pmd on the numa node
for a port then the port is not polled at all. This can result in a
non-operational system until such time as nics are physically
repositioned. It is preferable to operate with a pmd on the 'wrong' numa
node albeit with lower performance. Local pmds are still chosen when
available.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 Documentation/intro/install/dpdk.rst | 10 +
 lib/dpif-netdev.c| 43 +++-
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index d1c0e65..7a66bff 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -460,6 +460,16 @@ affinitized accordingly.
 pmd thread on a NUMA node is only created if there is at least one DPDK
 interface from that NUMA node added to OVS.
 
+  .. note::
+   On NUMA systems PCI devices are also local to a NUMA node.  Rx queues for
+   PCI device will be assigned to a pmd on its local NUMA node if pmd-cpu-mask
+   has created a pmd thread on that NUMA node.  If not the queue will be
+   assigned to a pmd on a remote NUMA node.  This will result in reduced
+   maximum throughput on that device.  In the case such a queue assignment
+   is made, a warning message will be logged: "There's no available (non-
+   isolated) pmd thread on numa node N. Queue Q on port P will be assigned to
+   the pmd on core C (numa node N'). Expect reduced performance."
+
 - QEMU vCPU thread Affinity
 
   A VM performing simple packet forwarding or running complex packet pipelines
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index b3a0806..466a818 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3149,10 +3149,13 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int 
numa_id)
 }
 
 static void
-rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
+rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr,
+  int *all_numa_ids, unsigned all_numa_ids_sz,
+  int *num_ids_written)
 {
 struct dp_netdev_pmd_thread *pmd;
 struct rr_numa *numa;
+unsigned idx = 0;
 
 hmap_init(&rr->numas);
 
@@ -3170,7 +3173,11 @@ rr_numa_list_populate(struct dp_netdev *dp, struct 
rr_numa_list *rr)
 numa->n_pmds++;
 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
 numa->pmds[numa->n_pmds - 1] = pmd;
+
+all_numa_ids[idx % all_numa_ids_sz] = pmd->numa_id;
+idx++;
 }
+*num_ids_written = idx;
 }
 
 static struct dp_netdev_pmd_thread *
@@ -3202,8 +3209,15 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 {
 struct dp_netdev_port *port;
 struct rr_numa_list rr;
+int all_numa_ids [64];
+int all_numa_ids_sz = sizeof all_numa_ids / sizeof all_numa_ids[0];
+unsigned all_numa_ids_idx = 0;
+int all_numa_ids_max_idx = 0;
+int num_numa_ids = 0;
 
-rr_numa_list_populate(dp, &rr);
+rr_numa_list_populate(dp, &rr, all_numa_ids, all_numa_ids_sz,
+  &num_numa_ids);
+all_numa_ids_max_idx = MIN(num_numa_ids - 1, all_numa_ids_sz - 1);
 
 HMAP_FOR_EACH (port, node, &dp->ports) {
 struct rr_numa *numa;
@@ -3234,10 +3248,29 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 }
 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
 if (!numa) {
-VLOG_WARN("There's no available (non isolated) pmd thread "
+if (all_numa_ids_max_idx < 0) {
+VLOG_ERR("There is no available (non-isolated) pmd "
+ "thread for port \'%s\' queue %d. This port "
+ "will not be polled. Is pmd-cpu-mask set to "
+ "zero? Or are all PMDs isolated to other "
+ "queues?", netdev_get_name(port->netdev),
+ qid);
+continue;
+}
+int alt_numa_id = all_numa_ids[all_numa_ids_idx];
+struct rr_numa *alt_numa;
+alt_numa = rr_numa_list_lookup(&rr, alt_numa_id);
+q->pmd = rr_numa_get_pmd(alt_numa);
+VLOG_WARN("There's no available (non-isolated) pmd thread "
   "on numa node %d. Queue %d on port \'%s\' will "
-  "not be polled.",
-  numa_id, qid, netdev_get_name(port->netdev));
+  "be 

[ovs-dev] [PATCH v4] dpif-netdev: Assign ports to pmds on non-local numa node.

2017-05-05 Thread Billy O'Mahony
From: billyom <billy.o.mah...@intel.com>

Previously if there is no available (non-isolated) pmd on the numa node
for a port then the port is not polled at all. This can result in a
non-operational system until such time as nics are physically
repositioned. It is preferable to operate with a pmd on the 'wrong' numa
node albeit with lower performance. Local pmds are still chosen when
available.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
v4: Fix a checkpatch error
v3: Fix warning messages not appearing when using multiqueue
v2: Add details of warning messages into docs

 Documentation/intro/install/dpdk.rst | 10 +
 lib/dpif-netdev.c| 40 
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index d1c0e65..ca73184 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -460,6 +460,16 @@ affinitized accordingly.
 pmd thread on a NUMA node is only created if there is at least one DPDK
 interface from that NUMA node added to OVS.
 
+  .. note::
+   On NUMA systems PCI devices are also local to a NUMA node.  Rx queues for
+   a PCI device will be assigned to a pmd on its local NUMA node if pmd-cpu-mask
+   has created a pmd thread on that NUMA node.  If not the queue will be
+   assigned to a pmd on a remote NUMA node.  This will result in reduced
+   maximum throughput on that device.  In the case such a queue assignment
+   is made a warning message will be logged: "There's no available
+   (non isolated) pmd thread on numa node N. Queue Q on port P will be assigned
+   to a pmd on numa node M. Expect reduced performance."
+
 - QEMU vCPU thread Affinity
 
   A VM performing simple packet forwarding or running complex packet pipelines
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index b3a0806..bcbd325 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3149,10 +3149,13 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int 
numa_id)
 }
 
 static void
-rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
+rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr,
+  int *all_numa_ids, unsigned all_numa_ids_sz,
+  int *num_ids_written)
 {
 struct dp_netdev_pmd_thread *pmd;
 struct rr_numa *numa;
+unsigned idx = 0;
 
 hmap_init(&rr->numas);
 
@@ -3170,7 +3173,11 @@ rr_numa_list_populate(struct dp_netdev *dp, struct 
rr_numa_list *rr)
 numa->n_pmds++;
 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
 numa->pmds[numa->n_pmds - 1] = pmd;
+
+all_numa_ids[idx % all_numa_ids_sz] = pmd->numa_id;
+idx++;
 }
+*num_ids_written = idx;
 }
 
 static struct dp_netdev_pmd_thread *
@@ -3202,8 +3209,15 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 {
 struct dp_netdev_port *port;
 struct rr_numa_list rr;
+int all_numa_ids [64];
+int all_numa_ids_sz = sizeof all_numa_ids / sizeof all_numa_ids[0];
+unsigned all_numa_ids_idx = 0;
+int all_numa_ids_max_idx = 0;
+int num_numa_ids = 0;
 
-rr_numa_list_populate(dp, &rr);
+rr_numa_list_populate(dp, &rr, all_numa_ids, all_numa_ids_sz,
+  &num_numa_ids);
+all_numa_ids_max_idx = MIN(num_numa_ids - 1, all_numa_ids_sz - 1);
 
 HMAP_FOR_EACH (port, node, &dp->ports) {
 struct rr_numa *numa;
@@ -3234,10 +3248,28 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 }
 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
 if (!numa) {
+if (all_numa_ids_max_idx < 0) {
+VLOG_ERR("There is no available (non-isolated) pmd "
+ "thread for port \'%s\'. This port will "
+ "not be polled. Is pmd-cpu-mask set to "
+ "zero? Or are all PMDs isolated to other "
+ "queues?", netdev_get_name(port->netdev));
+continue;
+}
+int alt_numa_id = all_numa_ids[all_numa_ids_idx];
+struct rr_numa *alt_numa;
+alt_numa = rr_numa_list_lookup(&rr, alt_numa_id);
+q->pmd = rr_numa_get_pmd(alt_numa);
 VLOG_WARN("There's no available (non isolated) pmd thread "
   "on numa node %d. Queue %d on port \'%s\' will "
-  "not be polled.",
-  numa_id, qid, netdev_get_name(port->netdev));
+  "be assigned to the pm

[ovs-dev] [PATCH v3] dpif-netdev: Assign ports to pmds on non-local numa node.

2017-05-04 Thread Billy O'Mahony
From: billyom <billy.o.mah...@intel.com>

Previously if there is no available (non-isolated) pmd on the numa node
for a port then the port is not polled at all. This can result in a
non-operational system until such time as nics are physically
repositioned. It is preferable to operate with a pmd on the 'wrong' numa
node albeit with lower performance. Local pmds are still chosen when
available.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 Documentation/intro/install/dpdk.rst | 10 +
 lib/dpif-netdev.c| 39 
 2 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index d1c0e65..ca73184 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -460,6 +460,16 @@ affinitized accordingly.
 pmd thread on a NUMA node is only created if there is at least one DPDK
 interface from that NUMA node added to OVS.
 
+  .. note::
+   On NUMA systems PCI devices are also local to a NUMA node.  Rx queues for
+   a PCI device will be assigned to a pmd on its local NUMA node if pmd-cpu-mask
+   has created a pmd thread on that NUMA node.  If not the queue will be
+   assigned to a pmd on a remote NUMA node.  This will result in reduced
+   maximum throughput on that device.  In the case such a queue assignment
+   is made a warning message will be logged: "There's no available
+   (non isolated) pmd thread on numa node N. Queue Q on port P will be assigned
+   to a pmd on numa node M. Expect reduced performance."
+
 - QEMU vCPU thread Affinity
 
   A VM performing simple packet forwarding or running complex packet pipelines
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index b3a0806..1745c6e 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3149,10 +3149,13 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int 
numa_id)
 }
 
 static void
-rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
+rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr,
+  int *all_numa_ids, unsigned all_numa_ids_sz,
+  int *num_ids_written)
 {
 struct dp_netdev_pmd_thread *pmd;
 struct rr_numa *numa;
+unsigned idx = 0;
 
 hmap_init(&rr->numas);
 
@@ -3170,7 +3173,11 @@ rr_numa_list_populate(struct dp_netdev *dp, struct 
rr_numa_list *rr)
 numa->n_pmds++;
 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
 numa->pmds[numa->n_pmds - 1] = pmd;
+
+all_numa_ids[idx % all_numa_ids_sz] = pmd->numa_id;
+idx++;
 }
+*num_ids_written = idx;
 }
 
 static struct dp_netdev_pmd_thread *
@@ -3202,8 +3209,15 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 {
 struct dp_netdev_port *port;
 struct rr_numa_list rr;
+int all_numa_ids [64];
+int all_numa_ids_sz = sizeof all_numa_ids / sizeof all_numa_ids[0];
+unsigned all_numa_ids_idx = 0;
+int all_numa_ids_max_idx = 0;
+int num_numa_ids = 0;
 
-rr_numa_list_populate(dp, &rr);
+rr_numa_list_populate(dp, &rr, all_numa_ids, all_numa_ids_sz,
+  &num_numa_ids);
+all_numa_ids_max_idx = MIN(num_numa_ids - 1, all_numa_ids_sz - 1);
 
 HMAP_FOR_EACH (port, node, &dp->ports) {
 struct rr_numa *numa;
@@ -3234,10 +3248,27 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 }
 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
 if (!numa) {
+if (all_numa_ids_max_idx < 0) {
+VLOG_ERR("There is no available (non-isolated) pmd "
+ "thread for port \'%s\'. This port will "
+ "not be polled. Is pmd-cpu-mask set to "
+ "zero? Or are all PMDs isolated to other "
+ "queues?", netdev_get_name(port->netdev));
+continue;
+}
+int alt_numa_id = all_numa_ids[all_numa_ids_idx];
+struct rr_numa *alt_numa = rr_numa_list_lookup(&rr,
alt_numa_id);
+q->pmd = rr_numa_get_pmd(alt_numa);
 VLOG_WARN("There's no available (non isolated) pmd thread "
   "on numa node %d. Queue %d on port \'%s\' will "
-  "not be polled.",
-  numa_id, qid, netdev_get_name(port->netdev));
+  "be assigned to the pmd on core %d"
+  " (numa node %d). Expect reduced performance.",
+  numa_id, qid

[ovs-dev] [PATCH v2] dpif-netdev: Assign ports to pmds on non-local numa node.

2017-03-30 Thread Billy O'Mahony
From: billyom <billy.o.mah...@intel.com>

Previously if there is no available (non-isolated) pmd on the numa node
for a port then the port is not polled at all. This can result in a
non-operational system until such time as nics are physically
repositioned. It is preferable to operate with a pmd on the 'wrong' numa
node albeit with lower performance. Local pmds are still chosen when
available.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 Documentation/intro/install/dpdk.rst | 10 ++
 lib/dpif-netdev.c| 36 
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index b947bd5..0e0b9f0 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -450,6 +450,16 @@ affinitized accordingly.
 pmd thread on a NUMA node is only created if there is at least one DPDK
 interface from that NUMA node added to OVS.
 
+  .. note::
+   On NUMA systems PCI devices are also local to a NUMA node.  Rx queues for
+   a PCI device will be assigned to a pmd on its local NUMA node if pmd-cpu-mask
+   has created a pmd thread on that NUMA node.  If not the queue will be
+   assigned to a pmd on a remote NUMA node.  This will result in reduced
+   maximum throughput on that device.  In the case such a queue assignment
+   is made a warning message will be logged: "There's no available
+   (non isolated) pmd thread on numa node N. Queue Q on port P will be assigned
+   to a pmd on numa node M. Expect reduced performance."
+
 - QEMU vCPU thread Affinity
 
   A VM performing simple packet forwarding or running complex packet pipelines
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index a14a2eb..c6570ba 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3149,10 +3149,13 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int 
numa_id)
 }
 
 static void
-rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
+rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr,
+  int *all_numa_ids, unsigned all_numa_ids_sz,
+  int *num_ids_written)
 {
 struct dp_netdev_pmd_thread *pmd;
 struct rr_numa *numa;
+unsigned idx = 0;
 
 hmap_init(&rr->numas);
 
@@ -3170,7 +3173,11 @@ rr_numa_list_populate(struct dp_netdev *dp, struct 
rr_numa_list *rr)
 numa->n_pmds++;
 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
 numa->pmds[numa->n_pmds - 1] = pmd;
+
+all_numa_ids[idx % all_numa_ids_sz] = pmd->numa_id;
+idx++;
 }
+*num_ids_written = idx;
 }
 
 static struct dp_netdev_pmd_thread *
@@ -3202,8 +3209,15 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 {
 struct dp_netdev_port *port;
 struct rr_numa_list rr;
+int all_numa_ids [64];
+int all_numa_ids_sz = sizeof all_numa_ids / sizeof all_numa_ids[0];
+unsigned all_numa_ids_idx = 0;
+int all_numa_ids_max_idx = 0;
+int num_numa_ids = 0;
 
-rr_numa_list_populate(dp, &rr);
+rr_numa_list_populate(dp, &rr, all_numa_ids, all_numa_ids_sz,
+  &num_numa_ids);
+all_numa_ids_max_idx = MIN(num_numa_ids - 1, all_numa_ids_sz - 1);
 
 HMAP_FOR_EACH (port, node, &dp->ports) {
 struct rr_numa *numa;
@@ -3234,10 +3248,24 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 }
 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
 if (!numa) {
+if (all_numa_ids_max_idx < 0) {
+VLOG_ERR("There are no pmd threads. "
+ "Is pmd-cpu-mask set to zero?");
+continue;
+}
 VLOG_WARN("There's no available (non isolated) pmd thread "
   "on numa node %d. Queue %d on port \'%s\' will "
-  "not be polled.",
-  numa_id, qid, netdev_get_name(port->netdev));
+  "be assigned to a pmd on numa node %d. Expect "
+  "reduced performance.",
+  numa_id, qid, netdev_get_name(port->netdev),
+  all_numa_ids[all_numa_ids_idx]);
+numa_id = all_numa_ids[all_numa_ids_idx];
+numa = rr_numa_list_lookup(&rr, numa_id);
+q->pmd = rr_numa_get_pmd(numa);
+all_numa_ids_idx++;
+if (all_numa_ids_idx > all_numa_ids_max_idx) {
+all_numa_ids_idx = 0;
+}
 } else {
 q->pmd = rr_numa_ge

[ovs-dev] [PATCH] dpif-netdev: emc comments

2017-03-30 Thread Billy O'Mahony
Add a concrete example of how a flow's hash determines the set of
possible storage locations in the EMC.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 lib/dpif-netdev.c | 24 +---
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 7d53a8d..d99eec7 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -124,16 +124,26 @@ struct netdev_flow_key {
 /* Exact match cache for frequently used flows
  *
  * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
- * search its entries for a miniflow that matches exactly the miniflow of the
- * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
+ * search its entries for the miniflow that matches exactly the miniflow of the
+ * packet. The dp_netdev_flow reference in the matching emc_entry also stores
+ * the dpcls_rule that corresponds to the miniflow.
  *
- * A cache entry holds a reference to its 'dp_netdev_flow'.
+ * A miniflow with a given hash can be stored in any one of EM_FLOW_HASH_SEGS
+ * different entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values
+ * (each of which is EM_FLOW_HASH_SHIFT bits wide - the remainder is thrown
+ * away). Each  value is the index of a cache entry where the miniflow could be
+ * stored.
  *
- * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
- * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
- * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
- * value is the index of a cache entry where the miniflow could be.
+ * For example, assuming that EM_FLOW_HASH_SHIFT is 13 and EM_FLOW_HASH_SEGS is
+ * 2, an entry with 32-bit hash of 0xDEADBEEF could be stored at either
+ * entries[0x156D] or entries[0x1EEF]. The indices are derived from bits 0-12
+ * and bits 13-25 of the 32-bit hash respectively. Bits 26-31 are ignored. Both
+ * entries have to be checked with netdev_flow_key_equal() to find the actual
+ * match. Note: bits 0 and 31 are the least and most significant bits
+ * respectively of the 32 bit hash value.
  *
+ * It follows from the search indices being generated from n bits that the
+ * number of entries in the cache must be a power of two.
  *
  * Thread-safety
  * =
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] netdev-dpdk: Enable INDIRECT_DESC on DPDK vHostUser.

2017-03-01 Thread Billy O'Mahony
This gives much better performance for linux apps in the guest without
affecting dpdk applications in the guest. Test details in cover letter.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 lib/netdev-dpdk.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index ee53c4c..f449508 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2769,8 +2769,7 @@ netdev_dpdk_vhost_class_init(void)
 rte_vhost_driver_callback_register(_net_device_ops);
 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
   | 1ULL << VIRTIO_NET_F_HOST_TSO6
-  | 1ULL << VIRTIO_NET_F_CSUM
-  | 1ULL << VIRTIO_RING_F_INDIRECT_DESC);
+  | 1ULL << VIRTIO_NET_F_CSUM);
 ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
 
 ovsthread_once_done();
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] netdev-dpdk: Enable INDIRECT_DESC on DPDK vHostUser.

2017-03-01 Thread Billy O'Mahony
Hi All,

I'm creating this patch on the basis of performance results outlined below. In
summary it appears that enabling INDIRECT_DESC on DPDK vHostUser ports leads to
very large increase in performance when using linux stack applications in the
guest with no noticable performance drop for DPDK based applications in the
guest.

Test#1 (VM-VM iperf3 performance)
 VMs use DPDK vhostuser ports
 OVS bridge is configured for normal action.
 OVS version 603381a (on 2.7.0 branch but before release,
 also seen on v2.6.0 and v2.6.1)
 DPDK v16.11
 QEMU v2.5.0 (also seen with v2.7.1)

 Results:
  INDIRECT_DESC enabled5.30 Gbit/s
  INDIRECT_DESC disabled   0.05 Gbit/s

Test#2  (Phy-VM-Phy RFC2544 Throughput)
 DPDK PMDs are polling NIC, DPDK loopback app running in guest.
 OVS bridge is configured with port forwarding to VM (via dpdkvhostuser ports).
 OVS version 603381a (on 2.7.0 branch but before release),
 other versions not tested.
 DPDK v16.11
 QEMU v2.5.0 (also seen with v2.7.1)

 Results:
  INDIRECT_DESC enabled2.75 Mpps @64B pkts (0.176 Gbit/s)
  INDIRECT_DESC disabled   2.75 Mpps @64B pkts (0.176 Gbit/s)


Billy O'Mahony (1):
  netdev-dpdk: Enable INDIRECT_DESC on DPDK vHostUser.

 lib/netdev-dpdk.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev