[PATCHv2 net-next 0/4] MV88E6390 batch two

2016-12-02 Thread Andrew Lunn
This is the second batch of patches adding support for the
MV88e6390. They are not sufficient to make it work properly.

The mv88e6390 has a much expanded set of priority maps. Refactor the
existing code, and implement basic support for the new device.

Similarly, the monitor control register has been reworked.

The mv88e6390 has something odd in its EDSA tagging implementation,
which means it is not possible to use it. So we need to use DSA
tagging. This is the first device with EDSA support where we need to
use DSA, and the code does not support this. So two patches refactor
the existing code. The two different register definitions are
separated out, and using DSA on an EDSA capable device is added.

v2:
Add port prefix
Add helper function for 6390
Add _IEEE_ into #defines
Split monitor_ctrl into a number of separate ops.
Remove 6390 code which is management, used in a later patch
s/EGREES/EGRESS/.
Broke up setup_port_dsa() and set_port_dsa() into a number of ops

v3:
Verify mandatory ops for port setup
Don't set ether type for DSA port.

Andrew Lunn (4):
  net: dsa: mv88e6xxx: Implement mv88e6390 tag remap
  net: dsa: mv88e6xxx: Monitor and Management tables
  net: dsa: mv88e6xxx: Move the tagging protocol into info
  net: dsa: mv88e6xxx: Refactor CPU and DSA port setup

 drivers/net/dsa/mv88e6xxx/chip.c  | 339 ++
 drivers/net/dsa/mv88e6xxx/global1.c   |  69 +++
 drivers/net/dsa/mv88e6xxx/global1.h   |   4 +
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  62 +--
 drivers/net/dsa/mv88e6xxx/port.c  | 181 ++
 drivers/net/dsa/mv88e6xxx/port.h  |  15 ++
 6 files changed, 583 insertions(+), 87 deletions(-)

-- 
2.10.2



Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs

2016-12-02 Thread Eric Dumazet
On Fri, 2016-12-02 at 19:42 -0800, Martin KaFai Lau wrote:
> On Fri, Dec 02, 2016 at 06:15:26PM -0800, Eric Dumazet wrote:
> > My question was more like :
> >
> > Can we double check all these patches won't break mlx4 driver (non XDP
> > path) on arches with PAGE_SIZE=64KB?
> The page/pkt requirement is not added by this patch.  The earlier
> XDP patch series has already ensured this page/pkt requirement
> is effective only when XDP prog is attached.
> 
> In the earlier XDP patches, MTU is limited to 1514 when
> XDP is active.   This patch is to allow full use of the
> page for a packet (and also only matters when XDP is active).

OK, thanks for the clarification.




[PATCH v1 net-next 1/5] net: dsa: mv88e6xxx: Reserved Management frames to CPU

2016-12-02 Thread Andrew Lunn
Older devices have a couple of registers in global2. The mv88e6390
family has a single register in global1 behind which hides similar
configuration. Implement an op for this.

Signed-off-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 35 
 drivers/net/dsa/mv88e6xxx/global1.c   | 27 ++
 drivers/net/dsa/mv88e6xxx/global1.h   |  1 +
 drivers/net/dsa/mv88e6xxx/global2.c   | 43 ---
 drivers/net/dsa/mv88e6xxx/global2.h   |  6 +
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  3 +++
 6 files changed, 97 insertions(+), 18 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 9c14aaad5103..b2b6fe3ef4bf 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2899,6 +2899,17 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
goto unlock;
}
 
+   /* Some generations have the configuration of sending reserved
+* management frames to the CPU in global2, others in
+* global1. Hence it does not fit the two setup functions
+* above.
+*/
+   if (chip->info->ops->mgmt_rsvd2cpu) {
+   err = chip->info->ops->mgmt_rsvd2cpu(chip);
+   if (err)
+   goto unlock;
+   }
+
 unlock:
mutex_unlock(&chip->reg_lock);
 
@@ -3221,6 +3232,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
.stats_get_stats = mv88e6095_stats_get_stats,
.g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
.g1_set_egress_port = mv88e6095_g1_set_egress_port,
+   .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu,
 };
 
 static const struct mv88e6xxx_ops mv88e6095_ops = {
@@ -3237,6 +3249,7 @@ static const struct mv88e6xxx_ops mv88e6095_ops = {
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
+   .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu,
 };
 
 static const struct mv88e6xxx_ops mv88e6097_ops = {
@@ -3257,6 +3270,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = {
.stats_get_stats = mv88e6095_stats_get_stats,
.g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
.g1_set_egress_port = mv88e6095_g1_set_egress_port,
+   .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu,
 };
 
 static const struct mv88e6xxx_ops mv88e6123_ops = {
@@ -3275,6 +3289,7 @@ static const struct mv88e6xxx_ops mv88e6123_ops = {
.stats_get_stats = mv88e6095_stats_get_stats,
.g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
.g1_set_egress_port = mv88e6095_g1_set_egress_port,
+   .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu,
 };
 
 static const struct mv88e6xxx_ops mv88e6131_ops = {
@@ -3295,6 +3310,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = {
.stats_get_stats = mv88e6095_stats_get_stats,
.g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
.g1_set_egress_port = mv88e6095_g1_set_egress_port,
+   .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu,
 };
 
 static const struct mv88e6xxx_ops mv88e6161_ops = {
@@ -3315,6 +3331,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = {
.stats_get_stats = mv88e6095_stats_get_stats,
.g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
.g1_set_egress_port = mv88e6095_g1_set_egress_port,
+   .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu,
 };
 
 static const struct mv88e6xxx_ops mv88e6165_ops = {
@@ -3331,6 +3348,7 @@ static const struct mv88e6xxx_ops mv88e6165_ops = {
.stats_get_stats = mv88e6095_stats_get_stats,
.g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
.g1_set_egress_port = mv88e6095_g1_set_egress_port,
+   .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu,
 };
 
 static const struct mv88e6xxx_ops mv88e6171_ops = {
@@ -3352,6 +3370,7 @@ static const struct mv88e6xxx_ops mv88e6171_ops = {
.stats_get_stats = mv88e6095_stats_get_stats,
.g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
.g1_set_egress_port = mv88e6095_g1_set_egress_port,
+   .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu,
 };
 
 static const struct mv88e6xxx_ops mv88e6172_ops = {
@@ -3375,6 +3394,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
.stats_get_stats = mv88e6095_stats_get_stats,
.g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
.g1_set_egress_port = mv88e6095_g1_set_egress_port,
+   .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu,
 };
 
 static const struct mv88e6xxx_ops mv88e6175_ops = {
@@ -3396,6 +3416,7 @@ static const struct mv88e6xxx_ops mv88e6175_ops = {
.stats_get_stats = mv88e6095_stats_get_stats,
.g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
.g1_set_egress_port = mv88e6095_g1_set_egress_port,
+   .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu,
 };
 
 static const struct mv88e6xxx_ops mv88e6176_ops = {
@@ -3419,6 +3440,7 @@ static const struct mv8

[PATCH v1 net-next 0/5] mv88e6390 batch 3

2016-12-02 Thread Andrew Lunn
More patches to support the MV88e6390. This is mostly refactoring
existing code and adding implementations for the mv88e6390.  This
patchset sets which reserved frames are sent to the CPU and the size of
jumbo frames that will be accepted, turns off egress rate limiting, and
configures pause frames.

Andrew Lunn (5):
  net: dsa: mv88e6xxx: Reserved Management frames to CPU
  net: dsa: mv88e6xxx: Refactor setting of jumbo frames
  net: dsa: mv88e6xxx: Refactor egress rate limiting
  net: dsa: mv88e6xxx: Refactor pause configuration
  net: dsa: mv88e6xxx: Implement mv88e6390 pause control

 drivers/net/dsa/mv88e6xxx/chip.c  | 125 +++---
 drivers/net/dsa/mv88e6xxx/global1.c   |  27 
 drivers/net/dsa/mv88e6xxx/global1.h   |   1 +
 drivers/net/dsa/mv88e6xxx/global2.c   |  43 +++-
 drivers/net/dsa/mv88e6xxx/global2.h   |   6 ++
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |   9 +++
 drivers/net/dsa/mv88e6xxx/port.c  |  50 ++
 drivers/net/dsa/mv88e6xxx/port.h  |   6 +-
 8 files changed, 225 insertions(+), 42 deletions(-)

-- 
2.10.2



[PATCH v1 net-next 4/5] net: dsa: mv88e6xxx: Refactor pause configuration

2016-12-02 Thread Andrew Lunn
The mv88e6390 has a different mechanism for configuring pause.
Refactor the code into an ops function; for the moment, don't add
any mv88e6390 code.

Signed-off-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 28 
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  1 +
 drivers/net/dsa/mv88e6xxx/port.c  | 11 +++
 drivers/net/dsa/mv88e6xxx/port.h  |  1 +
 4 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 1b0917e44809..3ddb1f79e709 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2625,17 +2625,15 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
if (err)
return err;
 
-   if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) ||
-   mv88e6xxx_6165_family(chip) || mv88e6xxx_6097_family(chip) ||
-   mv88e6xxx_6320_family(chip)) {
-   /* Do not limit the period of time that this port can
-* be paused for by the remote end or the period of
-* time that this port can pause the remote end.
-*/
-   err = mv88e6xxx_port_write(chip, port, PORT_PAUSE_CTRL, 0x);
+   if (chip->info->ops->port_pause_config) {
+   err = chip->info->ops->port_pause_config(chip, port);
if (err)
return err;
+   }
 
+   if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) ||
+   mv88e6xxx_6165_family(chip) || mv88e6xxx_6097_family(chip) ||
+   mv88e6xxx_6320_family(chip)) {
/* Port ATU control: disable limiting the number of
 * address database entries that this port is allowed
 * to use.
@@ -3220,6 +3218,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
+   .port_pause_config = mv88e6097_port_pause_config,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3260,6 +3259,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = {
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
.port_egress_rate_limiting = mv88e6095_port_egress_rate_limiting,
+   .port_pause_config = mv88e6097_port_pause_config,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3302,6 +3302,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = {
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
.port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
+   .port_pause_config = mv88e6097_port_pause_config,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3325,6 +3326,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = {
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
.port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
+   .port_pause_config = mv88e6097_port_pause_config,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3366,6 +3368,7 @@ static const struct mv88e6xxx_ops mv88e6171_ops = {
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
.port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
+   .port_pause_config = mv88e6097_port_pause_config,
.stats_snapshot = mv88e6320_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3392,6 +3395,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
.port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
+   .port_pause_config = mv88e6097_port_pause_config,
.stats_snapshot = mv88e6320_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3416,6 +3420,7 @@ static const struct mv88e6xxx_ops mv88e6175_ops = {
.port_set_ether_type = mv88e6351_por

[PATCH v1 net-next 2/5] net: dsa: mv88e6xxx: Refactor setting of jumbo frames

2016-12-02 Thread Andrew Lunn
Some switches support jumbo frames. Refactor this code into operations
in the ops structure.

Signed-off-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 26 ++
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  1 +
 drivers/net/dsa/mv88e6xxx/port.c  | 14 ++
 drivers/net/dsa/mv88e6xxx/port.h  |  2 +-
 4 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index b2b6fe3ef4bf..db1542e05e62 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2582,10 +2582,6 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
mv88e6xxx_6185_family(chip))
reg = PORT_CONTROL_2_MAP_DA;
 
-   if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) ||
-   mv88e6xxx_6165_family(chip) || mv88e6xxx_6320_family(chip))
-   reg |= PORT_CONTROL_2_JUMBO_10240;
-
if (mv88e6xxx_6095_family(chip) || mv88e6xxx_6185_family(chip)) {
/* Set the upstream port this port should use */
reg |= dsa_upstream_port(ds);
@@ -2604,6 +2600,12 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
return err;
}
 
+   if (chip->info->ops->port_jumbo_config) {
+   err = chip->info->ops->port_jumbo_config(chip, port);
+   if (err)
+   return err;
+   }
+
/* Port Association Vector: when learning source addresses
 * of packets, add the address to the address database using
 * a port bitmap that has only the bit for this port set and
@@ -2663,6 +2665,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
   0x0001);
if (err)
return err;
+
} else if (mv88e6xxx_6185_family(chip) || mv88e6xxx_6095_family(chip)) {
err = mv88e6xxx_port_write(chip, port, PORT_RATE_CONTROL,
   0x);
@@ -3264,6 +3267,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = {
.port_set_frame_mode = mv88e6351_port_set_frame_mode,
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
+   .port_jumbo_config = mv88e6165_port_jumbo_config,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3304,6 +3308,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = {
.port_set_frame_mode = mv88e6351_port_set_frame_mode,
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
+   .port_jumbo_config = mv88e6165_port_jumbo_config,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3325,6 +3330,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = {
.port_set_frame_mode = mv88e6351_port_set_frame_mode,
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
+   .port_jumbo_config = mv88e6165_port_jumbo_config,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3364,6 +3370,7 @@ static const struct mv88e6xxx_ops mv88e6171_ops = {
.port_set_frame_mode = mv88e6351_port_set_frame_mode,
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
+   .port_jumbo_config = mv88e6165_port_jumbo_config,
.stats_snapshot = mv88e6320_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3388,6 +3395,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
.port_set_frame_mode = mv88e6351_port_set_frame_mode,
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
+   .port_jumbo_config = mv88e6165_port_jumbo_config,
.stats_snapshot = mv88e6320_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3410,6 +3418,7 @@ static const struct mv88e6xxx_ops mv88e6175_ops = {
.port_set_frame_mode = mv88e6351_port_set_frame_mode,
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
+   .port_jumbo_config = mv88e6165_port_jumbo_config,
.stats_snapsho

[PATCH v1 net-next 5/5] net: dsa: mv88e6xxx: Implement mv88e6390 pause control

2016-12-02 Thread Andrew Lunn
The mv88e6390 has a number of flow control registers accessed via the
Flow Control register. Use these to set the pause control.

Signed-off-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c  |  7 +++
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  2 ++
 drivers/net/dsa/mv88e6xxx/port.c  | 13 +
 drivers/net/dsa/mv88e6xxx/port.h  |  1 +
 4 files changed, 23 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 3ddb1f79e709..ca453f3243cd 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3490,6 +3490,7 @@ static const struct mv88e6xxx_ops mv88e6190_ops = {
.port_set_frame_mode = mv88e6351_port_set_frame_mode,
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
+   .port_pause_config = mv88e6390_port_pause_config,
.stats_snapshot = mv88e6390_g1_stats_snapshot,
.stats_set_histogram = mv88e6390_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6320_stats_get_sset_count,
@@ -3513,6 +3514,7 @@ static const struct mv88e6xxx_ops mv88e6190x_ops = {
.port_set_frame_mode = mv88e6351_port_set_frame_mode,
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
+   .port_pause_config = mv88e6390_port_pause_config,
.stats_snapshot = mv88e6390_g1_stats_snapshot,
.stats_set_histogram = mv88e6390_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6320_stats_get_sset_count,
@@ -3536,6 +3538,7 @@ static const struct mv88e6xxx_ops mv88e6191_ops = {
.port_set_frame_mode = mv88e6351_port_set_frame_mode,
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
+   .port_pause_config = mv88e6390_port_pause_config,
.stats_snapshot = mv88e6390_g1_stats_snapshot,
.stats_set_histogram = mv88e6390_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6320_stats_get_sset_count,
@@ -3586,6 +3589,7 @@ static const struct mv88e6xxx_ops mv88e6290_ops = {
.port_set_frame_mode = mv88e6351_port_set_frame_mode,
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
+   .port_pause_config = mv88e6390_port_pause_config,
.stats_snapshot = mv88e6390_g1_stats_snapshot,
.stats_set_histogram = mv88e6390_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6320_stats_get_sset_count,
@@ -3739,6 +3743,7 @@ static const struct mv88e6xxx_ops mv88e6390_ops = {
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
.port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
+   .port_pause_config = mv88e6390_port_pause_config,
.stats_snapshot = mv88e6390_g1_stats_snapshot,
.stats_set_histogram = mv88e6390_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6320_stats_get_sset_count,
@@ -3764,6 +3769,7 @@ static const struct mv88e6xxx_ops mv88e6390x_ops = {
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
.port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
+   .port_pause_config = mv88e6390_port_pause_config,
.stats_snapshot = mv88e6390_g1_stats_snapshot,
.stats_set_histogram = mv88e6390_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6320_stats_get_sset_count,
@@ -3787,6 +3793,7 @@ static const struct mv88e6xxx_ops mv88e6391_ops = {
.port_set_frame_mode = mv88e6351_port_set_frame_mode,
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
+   .port_pause_config = mv88e6390_port_pause_config,
.stats_snapshot = mv88e6390_g1_stats_snapshot,
.stats_set_histogram = mv88e6390_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6320_stats_get_sset_count,
diff --git a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h 
b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
index 3b1f3ab490b9..13c7cc443454 100644
--- a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
@@ -78,6 +78,8 @@
 #define PORT_PCS_CTRL_SPEED_1  (0x03) /* 6390X */
 #define PORT_PCS_CTRL_SPEED_UNFORCED   (0x03)
 #define PORT_PAUSE_CTRL0x02
+#define PORT_FLOW_CTRL_LIMIT_IN((0x00 << 8) | BIT(15))
+#define PORT_FLOW_CTRL_LIMIT_OUT   ((0x01 << 8) | BIT(15))
 #define PORT_SWITCH_ID 0x03
 #define PORT_SWITCH_ID_PROD_NUM_6085   0x04a
 #define PORT_SWITCH_ID_PROD_NUM_6095   0x095
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index 8d14833b2e49..0db7fa0373ae 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/

[PATCH v1 net-next 3/5] net: dsa: mv88e6xxx: Refactor egress rate limiting

2016-12-02 Thread Andrew Lunn
There are two different rate limiting configurations, depending on the
switch generation. Refactor this into ops.

Signed-off-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 31 +++
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  2 ++
 drivers/net/dsa/mv88e6xxx/port.c  | 12 
 drivers/net/dsa/mv88e6xxx/port.h  |  2 ++
 4 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index db1542e05e62..1b0917e44809 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2657,18 +2657,8 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
return err;
}
 
-   /* Rate Control: disable ingress rate limiting. */
-   if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) ||
-   mv88e6xxx_6165_family(chip) || mv88e6xxx_6097_family(chip) ||
-   mv88e6xxx_6320_family(chip)) {
-   err = mv88e6xxx_port_write(chip, port, PORT_RATE_CONTROL,
-  0x0001);
-   if (err)
-   return err;
-
-   } else if (mv88e6xxx_6185_family(chip) || mv88e6xxx_6095_family(chip)) {
-   err = mv88e6xxx_port_write(chip, port, PORT_RATE_CONTROL,
-  0x);
+   if (chip->info->ops->port_egress_rate_limiting) {
+   err = chip->info->ops->port_egress_rate_limiting(chip, port);
if (err)
return err;
}
@@ -3229,6 +3219,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
.port_set_frame_mode = mv88e6351_port_set_frame_mode,
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
+   .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3268,6 +3259,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = {
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
+   .port_egress_rate_limiting = mv88e6095_port_egress_rate_limiting,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3309,6 +3301,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = {
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
+   .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3331,6 +3324,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = {
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
+   .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3371,6 +3365,7 @@ static const struct mv88e6xxx_ops mv88e6171_ops = {
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
+   .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
.stats_snapshot = mv88e6320_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3396,6 +3391,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
+   .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
.stats_snapshot = mv88e6320_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3419,6 +3415,7 @@ static const struct mv88e6xxx_ops mv88e6175_ops = {
.port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns,
.port_set_ether_type = mv88e6351_port_set_ether_type,
.port_jumbo_config = mv88e6165_port_jumbo_config,
+

Re: [PATCH net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog

2016-12-02 Thread Martin KaFai Lau
On Sat, Dec 03, 2016 at 01:22:05AM +0100, Daniel Borkmann wrote:
> On 12/03/2016 12:23 AM, Martin KaFai Lau wrote:
> >This patch allows XDP prog to extend/remove the packet
> >data at the head (like adding or removing header).  It is
> >done by adding a new XDP helper bpf_xdp_adjust_head().
> >
> >It also renames bpf_helper_changes_skb_data() to
> >bpf_helper_changes_pkt_data() to better reflect
> >that XDP prog does not work on skb.
> >
> >Signed-off-by: Martin KaFai Lau 
> [...]
> >diff --git a/net/core/filter.c b/net/core/filter.c
> >index 56b43587d200..6902e2f73e38 100644
> >--- a/net/core/filter.c
> >+++ b/net/core/filter.c
> >@@ -2234,7 +2234,34 @@ static const struct bpf_func_proto 
> >bpf_skb_change_head_proto = {
> > .arg3_type  = ARG_ANYTHING,
> >  };
> >
> >-bool bpf_helper_changes_skb_data(void *func)
> >+BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
> >+{
> >+/* Both mlx4 and mlx5 driver align each packet to PAGE_SIZE when
> >+ * XDP prog is set.
> >+ * If the above is not true for the other drivers to support
> >+ * bpf_xdp_adjust_head, struct xdp_buff can be extended.
> >+ */
> >+void *head = (void *)((unsigned long)xdp->data & PAGE_MASK);
> >+void *new_data = xdp->data + offset;
> >+
> >+if (new_data < head || new_data >= xdp->data_end)
> >+/* The packet length must be >=1 */
>
> Patch looks generally good to me. Should the min pkt len here be
> limited to ETH_HLEN instead of 1?
Make sense.  Will make the change.

>
> >+return -EINVAL;
> >+
> >+xdp->data = new_data;
> >+
> >+return 0;
> >+}
> >+
> >+static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
> >+.func   = bpf_xdp_adjust_head,
> >+.gpl_only   = false,
> >+.ret_type   = RET_INTEGER,
> >+.arg1_type  = ARG_PTR_TO_CTX,
> >+.arg2_type  = ARG_ANYTHING,
> >+};
> >+
> >+bool bpf_helper_changes_pkt_data(void *func)
> >  {
> > if (func == bpf_skb_vlan_push ||
> > func == bpf_skb_vlan_pop ||
> [...]


Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs

2016-12-02 Thread Martin KaFai Lau
On Fri, Dec 02, 2016 at 06:15:26PM -0800, Eric Dumazet wrote:
> On Fri, 2016-12-02 at 16:53 -0800, Alexei Starovoitov wrote:
> > On 12/2/16 4:38 PM, Eric Dumazet wrote:
> > > On Fri, 2016-12-02 at 15:23 -0800, Martin KaFai Lau wrote:
> > >> When XDP prog is attached, it is currently limiting
> > >> MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514
> > >> in x86.
> > >>
> > >> AFAICT, since mlx4 is doing one page per packet for XDP,
> > >> we can at least raise the MTU limitation up to
> > >> PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is
> > >> doing.  It will be useful in the next patch which allows
> > >> XDP program to extend the packet by adding new header(s).
> > >>
> > >> Signed-off-by: Martin KaFai Lau 
> > >> ---
> > >
> > > Have you tested your patch on a host with PAGE_SIZE = 64 KB ?
> > >
> > > Looks XDP really kills arches with bigger pages :(
> >
> > I'm afraid xdp mlx[45] support was not tested on arches
> > with 64k pages at all. Not just this patch.
> > I think people who care about such archs should test?
> > Note page per packet is not a hard requirement for all drivers
> > and all archs. For mlx[45] it was the easiest and the most
> > convenient way to achieve desired performance.
> > If there are ways to do the same performance differently,
> > I'm all ears :)
> >
>
> My question was more like :
>
> Can we double check all these patches won't break mlx4 driver (non XDP
> path) on arches with PAGE_SIZE=64KB?
The page/pkt requirement is not added by this patch.  The earlier
XDP patch series has already ensured this page/pkt requirement
is effective only when XDP prog is attached.

In the earlier XDP patches, MTU is limited to 1514 when
XDP is active.   This patch is to allow full use of the
page for a packet (and also only matters when XDP is active).


[PATCH v3 net-next 1/4] net: dsa: mv88e6xxx: Implement mv88e6390 tag remap

2016-12-02 Thread Andrew Lunn
The mv88e6390 does not have the two registers to set the frame
priority map. Instead it has indirection registers for setting a
number of different priority maps. Refactor the old code into a
function, implement the mv88e6390 version, and use an op to call the
right one.

Signed-off-by: Andrew Lunn 
Reviewed-by: Vivien Didelot 
---
v2:
Add port prefix
Add helper function for 6390
Add _IEEE_ into #defines
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 37 
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 12 +++
 drivers/net/dsa/mv88e6xxx/port.c  | 63 +++
 drivers/net/dsa/mv88e6xxx/port.h  |  2 ++
 4 files changed, 101 insertions(+), 13 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index ce2f7ff8066e..ff4bd2f74357 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2617,20 +2617,10 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
if (err)
return err;
}
+   }
 
-   /* Tag Remap: use an identity 802.1p prio -> switch
-* prio mapping.
-*/
-   err = mv88e6xxx_port_write(chip, port, PORT_TAG_REGMAP_0123,
-  0x3210);
-   if (err)
-   return err;
-
-   /* Tag Remap 2: use an identity 802.1p prio -> switch
-* prio mapping.
-*/
-   err = mv88e6xxx_port_write(chip, port, PORT_TAG_REGMAP_4567,
-  0x7654);
+   if (chip->info->ops->port_tag_remap) {
+   err = chip->info->ops->port_tag_remap(chip, port);
if (err)
return err;
}
@@ -3189,6 +3179,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
.port_set_link = mv88e6xxx_port_set_link,
.port_set_duplex = mv88e6xxx_port_set_duplex,
.port_set_speed = mv88e6185_port_set_speed,
+   .port_tag_remap = mv88e6095_port_tag_remap,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3217,6 +3208,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = {
.port_set_link = mv88e6xxx_port_set_link,
.port_set_duplex = mv88e6xxx_port_set_duplex,
.port_set_speed = mv88e6185_port_set_speed,
+   .port_tag_remap = mv88e6095_port_tag_remap,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3245,6 +3237,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = {
.port_set_link = mv88e6xxx_port_set_link,
.port_set_duplex = mv88e6xxx_port_set_duplex,
.port_set_speed = mv88e6185_port_set_speed,
+   .port_tag_remap = mv88e6095_port_tag_remap,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3259,6 +3252,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = {
.port_set_link = mv88e6xxx_port_set_link,
.port_set_duplex = mv88e6xxx_port_set_duplex,
.port_set_speed = mv88e6185_port_set_speed,
+   .port_tag_remap = mv88e6095_port_tag_remap,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3288,6 +3282,7 @@ static const struct mv88e6xxx_ops mv88e6171_ops = {
.port_set_duplex = mv88e6xxx_port_set_duplex,
.port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
.port_set_speed = mv88e6185_port_set_speed,
+   .port_tag_remap = mv88e6095_port_tag_remap,
.stats_snapshot = mv88e6320_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3305,6 +3300,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
.port_set_duplex = mv88e6xxx_port_set_duplex,
.port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
.port_set_speed = mv88e6352_port_set_speed,
+   .port_tag_remap = mv88e6095_port_tag_remap,
.stats_snapshot = mv88e6320_g1_stats_snapshot,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
@@ -3320,6 +3316,7 @@ static const struct mv88e6xxx_ops mv88e6175_ops = {
.port_set_duplex = mv88e6xxx_port_set_duplex,
.port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
.port_set_speed = mv88e6185_port_set_speed,
+   .port_tag_remap = mv88e6095_port_tag_remap,
.stats_snapshot = mv88e6320_g1_stats_snapshot,

[PATCH v3 net-next 2/4] net: dsa: mv88e6xxx: Monitor and Management tables

2016-12-02 Thread Andrew Lunn
The mv88e6390 changes the monitor control register into the Monitor
and Management control, which is an indirection register to various
registers.

Add ops to global1 to set the CPU port and the ingress/egress port for
both register layouts.

Signed-off-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 68 +-
 drivers/net/dsa/mv88e6xxx/global1.c   | 69 +++
 drivers/net/dsa/mv88e6xxx/global1.h   |  4 ++
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 13 +++
 4 files changed, 145 insertions(+), 9 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index ff4bd2f74357..6e981bedd028 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2747,15 +2747,17 @@ static int mv88e6xxx_g1_setup(struct mv88e6xxx_chip 
*chip)
if (err)
return err;
 
-   /* Configure the upstream port, and configure it as the port to which
-* ingress and egress and ARP monitor frames are to be sent.
-*/
-   reg = upstream_port << GLOBAL_MONITOR_CONTROL_INGRESS_SHIFT |
-   upstream_port << GLOBAL_MONITOR_CONTROL_EGRESS_SHIFT |
-   upstream_port << GLOBAL_MONITOR_CONTROL_ARP_SHIFT;
-   err = mv88e6xxx_g1_write(chip, GLOBAL_MONITOR_CONTROL, reg);
-   if (err)
-   return err;
+   if (chip->info->ops->g1_set_cpu_port) {
+   err = chip->info->ops->g1_set_cpu_port(chip, upstream_port);
+   if (err)
+   return err;
+   }
+
+   if (chip->info->ops->g1_set_egress_port) {
+   err = chip->info->ops->g1_set_egress_port(chip, upstream_port);
+   if (err)
+   return err;
+   }
 
/* Disable remote management, and set the switch's DSA device number. */
err = mv88e6xxx_g1_write(chip, GLOBAL_CONTROL_2,
@@ -3184,6 +3186,8 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
+   .g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
+   .g1_set_egress_port = mv88e6095_g1_set_egress_port,
 };
 
 static const struct mv88e6xxx_ops mv88e6095_ops = {
@@ -3213,6 +3217,8 @@ static const struct mv88e6xxx_ops mv88e6097_ops = {
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
+   .g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
+   .g1_set_egress_port = mv88e6095_g1_set_egress_port,
 };
 
 static const struct mv88e6xxx_ops mv88e6123_ops = {
@@ -3227,6 +3233,8 @@ static const struct mv88e6xxx_ops mv88e6123_ops = {
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
+   .g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
+   .g1_set_egress_port = mv88e6095_g1_set_egress_port,
 };
 
 static const struct mv88e6xxx_ops mv88e6131_ops = {
@@ -3242,6 +3250,8 @@ static const struct mv88e6xxx_ops mv88e6131_ops = {
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
+   .g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
+   .g1_set_egress_port = mv88e6095_g1_set_egress_port,
 };
 
 static const struct mv88e6xxx_ops mv88e6161_ops = {
@@ -3257,6 +3267,8 @@ static const struct mv88e6xxx_ops mv88e6161_ops = {
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
+   .g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
+   .g1_set_egress_port = mv88e6095_g1_set_egress_port,
 };
 
 static const struct mv88e6xxx_ops mv88e6165_ops = {
@@ -3271,6 +3283,8 @@ static const struct mv88e6xxx_ops mv88e6165_ops = {
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
+   .g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
+   .g1_set_egress_port = mv88e6095_g1_set_egress_port,
 };
 
 static const struct mv88e6xxx_ops mv88e6171_ops = {
@@ -3287,6 +3301,8 @@ static const struct mv88e6xxx_ops mv88e6171_ops = {
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
+   .g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
+   .g1_set_egress_port = mv88e6095_g1_set_egress_port,
 };
 
 static const struct mv88e6xxx_ops mv88e6172_ops = {
@@ -3305,6 +3321,8 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
.stats_get_sset_cou

[PATCH v3 net-next 4/4] net: dsa: mv88e6xxx: Refactor CPU and DSA port setup

2016-12-02 Thread Andrew Lunn
Older chips only support DSA tagging. Newer chips have both DSA and
EDSA tagging. Refactor the code by adding port functions for setting the
frame mode, egress mode, and whether to forward unknown frames.

This results in the helper mv88e6xxx_6065_family() becoming unused, so
remove it.

Signed-off-by: Andrew Lunn 
v3:
Verify mandatory ops for port setup
Don't set ether type for DSA port.
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 217 ++
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  20 
 drivers/net/dsa/mv88e6xxx/port.c  | 118 ++
 drivers/net/dsa/mv88e6xxx/port.h  |  13 ++
 4 files changed, 319 insertions(+), 49 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 80efee6f5e16..9c14aaad5103 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -677,11 +677,6 @@ static int mv88e6xxx_phy_ppu_write(struct mv88e6xxx_chip 
*chip, int addr,
return err;
 }
 
-static bool mv88e6xxx_6065_family(struct mv88e6xxx_chip *chip)
-{
-   return chip->info->family == MV88E6XXX_FAMILY_6065;
-}
-
 static bool mv88e6xxx_6095_family(struct mv88e6xxx_chip *chip)
 {
return chip->info->family == MV88E6XXX_FAMILY_6095;
@@ -2438,6 +2433,72 @@ static int mv88e6xxx_serdes_power_on(struct 
mv88e6xxx_chip *chip)
return err;
 }
 
+static int mv88e6xxx_setup_port_dsa(struct mv88e6xxx_chip *chip, int port,
+   int upstream_port)
+{
+   int err;
+
+   err = chip->info->ops->port_set_frame_mode(
+   chip, port, MV88E6XXX_FRAME_MODE_DSA);
+   if (err)
+   return err;
+
+   return chip->info->ops->port_set_egress_unknowns(
+   chip, port, port == upstream_port);
+}
+
+static int mv88e6xxx_setup_port_cpu(struct mv88e6xxx_chip *chip, int port)
+{
+   int err;
+
+   switch (chip->info->tag_protocol) {
+   case DSA_TAG_PROTO_EDSA:
+   err = chip->info->ops->port_set_frame_mode(
+   chip, port, MV88E6XXX_FRAME_MODE_ETHERTYPE);
+   if (err)
+   return err;
+
+   err = mv88e6xxx_port_set_egress_mode(
+   chip, port, PORT_CONTROL_EGRESS_ADD_TAG);
+   if (err)
+   return err;
+
+   if (chip->info->ops->port_set_ether_type)
+   err = chip->info->ops->port_set_ether_type(
+   chip, port, ETH_P_EDSA);
+   break;
+
+   case DSA_TAG_PROTO_DSA:
+   err = chip->info->ops->port_set_frame_mode(
+   chip, port, MV88E6XXX_FRAME_MODE_DSA);
+   if (err)
+   return err;
+
+   err = mv88e6xxx_port_set_egress_mode(
+   chip, port, PORT_CONTROL_EGRESS_UNMODIFIED);
+   break;
+   default:
+   err = -EINVAL;
+   }
+
+   if (err)
+   return err;
+
+   return chip->info->ops->port_set_egress_unknowns(chip, port, true);
+}
+
+static int mv88e6xxx_setup_port_normal(struct mv88e6xxx_chip *chip, int port)
+{
+   int err;
+
+   err = chip->info->ops->port_set_frame_mode(
+   chip, port, MV88E6XXX_FRAME_MODE_NORMAL);
+   if (err)
+   return err;
+
+   return chip->info->ops->port_set_egress_unknowns(chip, port, false);
+}
+
 static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
 {
struct dsa_switch *ds = chip->ds;
@@ -2473,44 +2534,23 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
 * If this is the upstream port for this switch, enable
 * forwarding of unknown unicasts and multicasts.
 */
-   reg = 0;
-   if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) ||
-   mv88e6xxx_6165_family(chip) || mv88e6xxx_6097_family(chip) ||
-   mv88e6xxx_6095_family(chip) || mv88e6xxx_6065_family(chip) ||
-   mv88e6xxx_6185_family(chip) || mv88e6xxx_6320_family(chip))
-   reg = PORT_CONTROL_IGMP_MLD_SNOOP |
+   reg = PORT_CONTROL_IGMP_MLD_SNOOP |
PORT_CONTROL_USE_TAG | PORT_CONTROL_USE_IP |
PORT_CONTROL_STATE_FORWARDING;
-   if (dsa_is_cpu_port(ds, port)) {
-   if (chip->info->tag_protocol == DSA_TAG_PROTO_EDSA)
-   reg |= PORT_CONTROL_FRAME_ETHER_TYPE_DSA |
-   PORT_CONTROL_FORWARD_UNKNOWN_MC;
-   else
-   reg |= PORT_CONTROL_DSA_TAG;
-   reg |= PORT_CONTROL_EGRESS_ADD_TAG |
-   PORT_CONTROL_FORWARD_UNKNOWN;
-   }
-   if (dsa_is_dsa_port(ds, port)) {
-   if (mv88e6xxx_6095_family(chip) ||
-   mv88e6xxx_6185_family(chip))
-   reg |= PORT_CONTROL_DSA_TAG;
-   if (mv88e6xxx_6352_family(chip) ||
-  

[PATCH v3 net-next 3/4] net: dsa: mv88e6xxx: Move the tagging protocol into info

2016-12-02 Thread Andrew Lunn
Older chips support a single tagging protocol, DSA. New chips support
both DSA and EDSA, an enhanced version. Having both as an option
changes the register layouts. Up until now, it has been assumed that
if EDSA is supported, it will be used. Hence the register layout has
been determined by which protocol should be used. However, mv88e6390
has a different implementation of EDSA, which requires us to use
DSA tagging. Hence separate the selection of the protocol from the
register layout.

Signed-off-by: Andrew Lunn 
Reviewed-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 33 +++--
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 17 -
 2 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 6e981bedd028..80efee6f5e16 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2482,7 +2482,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
PORT_CONTROL_USE_TAG | PORT_CONTROL_USE_IP |
PORT_CONTROL_STATE_FORWARDING;
if (dsa_is_cpu_port(ds, port)) {
-   if (mv88e6xxx_has(chip, MV88E6XXX_FLAG_EDSA))
+   if (chip->info->tag_protocol == DSA_TAG_PROTO_EDSA)
reg |= PORT_CONTROL_FRAME_ETHER_TYPE_DSA |
PORT_CONTROL_FORWARD_UNKNOWN_MC;
else
@@ -2611,7 +2611,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
/* Port Ethertype: use the Ethertype DSA Ethertype
 * value.
 */
-   if (mv88e6xxx_has(chip, MV88E6XXX_FLAG_EDSA)) {
+   if (chip->info->tag_protocol == DSA_TAG_PROTO_EDSA) {
err = mv88e6xxx_port_write(chip, port, PORT_ETH_TYPE,
   ETH_P_EDSA);
if (err)
@@ -3637,6 +3637,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.global1_addr = 0x1b,
.age_time_coeff = 15000,
.g1_irqs = 8,
+   .tag_protocol = DSA_TAG_PROTO_DSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6097,
.ops = &mv88e6085_ops,
},
@@ -3651,6 +3652,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.global1_addr = 0x1b,
.age_time_coeff = 15000,
.g1_irqs = 8,
+   .tag_protocol = DSA_TAG_PROTO_DSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6095,
.ops = &mv88e6095_ops,
},
@@ -3679,6 +3681,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.global1_addr = 0x1b,
.age_time_coeff = 15000,
.g1_irqs = 9,
+   .tag_protocol = DSA_TAG_PROTO_DSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6165,
.ops = &mv88e6123_ops,
},
@@ -3693,6 +3696,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.global1_addr = 0x1b,
.age_time_coeff = 15000,
.g1_irqs = 9,
+   .tag_protocol = DSA_TAG_PROTO_DSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6185,
.ops = &mv88e6131_ops,
},
@@ -3707,6 +3711,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.global1_addr = 0x1b,
.age_time_coeff = 15000,
.g1_irqs = 9,
+   .tag_protocol = DSA_TAG_PROTO_DSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6165,
.ops = &mv88e6161_ops,
},
@@ -3721,6 +3726,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.global1_addr = 0x1b,
.age_time_coeff = 15000,
.g1_irqs = 9,
+   .tag_protocol = DSA_TAG_PROTO_DSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6165,
.ops = &mv88e6165_ops,
},
@@ -3735,6 +3741,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.global1_addr = 0x1b,
.age_time_coeff = 15000,
.g1_irqs = 9,
+   .tag_protocol = DSA_TAG_PROTO_EDSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6351,
.ops = &mv88e6171_ops,
},
@@ -3749,6 +3756,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.global1_addr = 0x1b,
.age_time_coeff = 15000,
.g1_irqs = 9,
+   .tag_protocol = DSA_TAG_PROTO_EDSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6352,
.ops = &mv88e6172_ops,
},
@@ -3763,6 +3771,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.global1_addr = 0x1b,
.age_time_coeff = 15000,
.g1_irqs = 9,
+   .tag_protocol = DSA_TAG_PROTO_EDSA,
.

Re: [net-next PATCH v4 5/6] virtio_net: add XDP_TX support

2016-12-02 Thread John Fastabend
On 16-12-02 12:51 PM, John Fastabend wrote:
> This adds support for the XDP_TX action to virtio_net. When an XDP
> program is run and returns the XDP_TX action the virtio_net XDP
> implementation will transmit the packet on a TX queue that aligns
> with the current CPU that the XDP packet was processed on.
> 
> Before sending the packet the header is zeroed.  Also XDP is expected
> to handle checksum correctly so no checksum offload  support is
> provided.
> 
> Signed-off-by: John Fastabend 
> ---
>  drivers/net/virtio_net.c |   63 
> --
>  1 file changed, 60 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index b67203e..137caba 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -330,12 +330,43 @@ static struct sk_buff *page_to_skb(struct virtnet_info 
> *vi,
>   return skb;
>  }
>  
> +static void virtnet_xdp_xmit(struct virtnet_info *vi,
> +  unsigned int qnum, struct xdp_buff *xdp)
> +{
> + struct send_queue *sq = &vi->sq[qnum];
> + struct virtio_net_hdr_mrg_rxbuf *hdr;
> + unsigned int num_sg, len;
> + void *xdp_sent;
> + int err;
> +
> + /* Free up any pending old buffers before queueing new ones. */
> + while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> + struct page *page = virt_to_head_page(xdp_sent);
> +
> + put_page(page);
> + }
> +
> + /* Zero header and leave csum up to XDP layers */
> + hdr = xdp->data;
> + memset(hdr, 0, vi->hdr_len);
> +
> + num_sg = 1;
> + sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
> + err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
> +xdp->data, GFP_ATOMIC);
> + if (unlikely(err))
> + put_page(virt_to_head_page(xdp->data));
> + else
> + virtqueue_kick(sq->vq);
> +}
> +

Hi Michael,

Any idea why the above pattern

> + err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
> +xdp->data, GFP_ATOMIC);
> + if (unlikely(err))
> + put_page(virt_to_head_page(xdp->data));
> + else
> + virtqueue_kick(sq->vq);
> +}

would cause a hang but if I call the virtqueue_kick as below
even in the error case everything seems to be fine.

err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
   xdp->data, GFP_ATOMIC);
if (unlikely(err))
put_page(virt_to_head_page(xdp->data));

virtqueue_kick(sq->vq);


I'll take a look through the virtio code but thought I might ask in
case you know off-hand or it could be something else entirely.

I noticed virtio_input.c uses the second pattern and virtio_net.c
uses the above pattern but I'm guessing it never gets exercised due
to stack backoff.

Thanks,
John


Re: [PATCH 2/3] uapi: export tc_skbmod.h

2016-12-02 Thread kbuild test robot
Hi Stephen,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.9-rc7]
[cannot apply to next-20161202]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Stephen-Hemminger/UAPI-export-missing-headers/20161203-104831
config: i386-tinyconfig (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

>> scripts/Makefile.headersinst:55: *** Missing UAPI file 
>> include/uapi/linux/tc_act/tc_sbkmod.h.  Stop.
--
>> scripts/Makefile.headersinst:55: *** Missing UAPI file 
>> include/uapi/linux/tc_act/tc_sbkmod.h.  Stop.
   make[3]: *** [tc_act] Error 2
   make[3]: Target '__headersinst' not remade because of errors.
   make[2]: *** [linux] Error 2
   make[2]: Target '__headersinst' not remade because of errors.
   make[1]: *** [headers_install] Error 2
   make: *** [sub-make] Error 2

vim +55 scripts/Makefile.headersinst

d8ecc5cd Sam Ravnborg2011-04-27  39  
10b63956 David Howells   2012-10-02  40  srcdir:= $(srctree)/$(obj)
10b63956 David Howells   2012-10-02  41  gendir:= $(objtree)/$(gen)
10b63956 David Howells   2012-10-02  42  
10b63956 David Howells   2012-10-02  43  oldsrcdir := $(srctree)/$(subst 
/uapi,,$(obj))
10b63956 David Howells   2012-10-02  44  
7712401a Sam Ravnborg2008-06-15  45  # all headers files for this dir
d8ecc5cd Sam Ravnborg2011-04-27  46  header-y  := $(filter-out 
$(generic-y), $(header-y))
40f1d4c2 David Howells   2012-10-02  47  all-files := $(header-y) 
$(genhdr-y) $(wrapper-files)
10b63956 David Howells   2012-10-02  48  output-files  := $(addprefix 
$(installdir)/, $(all-files))
10b63956 David Howells   2012-10-02  49  
c0ff68f1 Nicolas Dichtel 2013-04-29  50  input-files1  := $(foreach hdr, 
$(header-y), \
c4619bc6 Sam Ravnborg2013-03-04  51$(if $(wildcard 
$(srcdir)/$(hdr)), \
c0ff68f1 Nicolas Dichtel 2013-04-29  52 $(wildcard 
$(srcdir)/$(hdr))) \
c0ff68f1 Nicolas Dichtel 2013-04-29  53)
c0ff68f1 Nicolas Dichtel 2013-04-29  54  input-files1-name := $(notdir 
$(input-files1))
c0ff68f1 Nicolas Dichtel 2013-04-29 @55  input-files2  := $(foreach hdr, 
$(header-y), \
c0ff68f1 Nicolas Dichtel 2013-04-29  56$(if  $(wildcard 
$(srcdir)/$(hdr)),, \
c4619bc6 Sam Ravnborg2013-03-04  57 $(if $(wildcard 
$(oldsrcdir)/$(hdr)), \
10b63956 David Howells   2012-10-02  58 
$(wildcard $(oldsrcdir)/$(hdr)), \
c4619bc6 Sam Ravnborg2013-03-04  59 $(error 
Missing UAPI file $(srcdir)/$(hdr))) \
c0ff68f1 Nicolas Dichtel 2013-04-29  60))
c0ff68f1 Nicolas Dichtel 2013-04-29  61  input-files2-name := $(notdir 
$(input-files2))
c0ff68f1 Nicolas Dichtel 2013-04-29  62  input-files3  := $(foreach hdr, 
$(genhdr-y), \
c4619bc6 Sam Ravnborg2013-03-04  63$(if $(wildcard 
$(gendir)/$(hdr)), \

:: The code at line 55 was first introduced by commit
:: c0ff68f1611d6855a06d672989ad5cfea160a4eb kbuild: fix make 
headers_install when path is too long

:: TO: Nicolas Dichtel 
:: CC: Michal Marek 

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH] net: wireless: realtek: constify rate_control_ops structures

2016-12-02 Thread Bhumika Goyal
On Sat, Dec 3, 2016 at 2:09 AM, Larry Finger  wrote:
> On 12/02/2016 03:50 AM, Bhumika Goyal wrote:
>>
>> The structures rate_control_ops are only passed as an argument to the
>> functions ieee80211_rate_control_{register/unregister}. This argument is
>> of type const, so rate_control_ops having this property can also be
>> declared as const.
>> Done using Coccinelle:
>>
>> @r1 disable optional_qualifier @
>> identifier i;
>> position p;
>> @@
>> static struct rate_control_ops i@p = {...};
>>
>> @ok1@
>> identifier r1.i;
>> position p;
>> @@
>> ieee80211_rate_control_register(&i@p)
>>
>> @ok2@
>> identifier r1.i;
>> position p;
>> @@
>> ieee80211_rate_control_unregister(&i@p)
>>
>> @bad@
>> position p!={r1.p,ok1.p,ok2.p};
>> identifier r1.i;
>> @@
>> i@p
>>
>> @depends on !bad disable optional_qualifier@
>> identifier r1.i;
>> @@
>> static
>> +const
>> struct rate_control_ops i={...};
>>
>> @depends on !bad disable optional_qualifier@
>> identifier r1.i;
>> @@
>> +const
>> struct rate_control_ops i;
>>
>> File size before:
>>    text    data     bss     dec     hex filename
>>    1991     104       0    2095     82f wireless/realtek/rtlwifi/rc.o
>>
>> File size after:
>>    text    data     bss     dec     hex filename
>>    2095       0       0    2095     82f wireless/realtek/rtlwifi/rc.o
>>
>> Signed-off-by: Bhumika Goyal 
>> ---
>>  drivers/net/wireless/realtek/rtlwifi/rc.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/wireless/realtek/rtlwifi/rc.c
>> b/drivers/net/wireless/realtek/rtlwifi/rc.c
>> index ce8621a..107c13c 100644
>> --- a/drivers/net/wireless/realtek/rtlwifi/rc.c
>> +++ b/drivers/net/wireless/realtek/rtlwifi/rc.c
>> @@ -284,7 +284,7 @@ static void rtl_rate_free_sta(void *rtlpriv,
>> kfree(rate_priv);
>>  }
>>
>> -static struct rate_control_ops rtl_rate_ops = {
>> +static const struct rate_control_ops rtl_rate_ops = {
>> .name = "rtl_rc",
>> .alloc = rtl_rate_alloc,
>> .free = rtl_rate_free,
>>
>
> The content of your patch is OK; however, your subject is not. By
> convention, "net: wireless: realtek:" is assumed. We do, however, include
> "rtlwifi:" to indicate which part of drivers/net/wireless/realtek/ is
> referenced.
>
Ok, I will send a v2 with the correct subject. Thanks for the input.

Thanks,
Bhumika

> NACK
>
> Larry
>


[PATCH net-next v2 1/4] bnxt_en: Re-factor bnxt_setup_tc().

2016-12-02 Thread Michael Chan
Add a new function bnxt_setup_mq_tc() to handle MQPRIO.  This new function
will be called during ETS setup when we add DCBNL in the next patch.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 18 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 +
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 0e4f168..7664281 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -6337,17 +6337,10 @@ static int bnxt_change_mtu(struct net_device *dev, int 
new_mtu)
return 0;
 }
 
-static int bnxt_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
-struct tc_to_netdev *ntc)
+int bnxt_setup_mq_tc(struct net_device *dev, u8 tc)
 {
struct bnxt *bp = netdev_priv(dev);
bool sh = false;
-   u8 tc;
-
-   if (ntc->type != TC_SETUP_MQPRIO)
-   return -EINVAL;
-
-   tc = ntc->tc;
 
if (tc > bp->max_tc) {
netdev_err(dev, "too many traffic classes requested: %d Max 
supported is %d\n",
@@ -6390,6 +6383,15 @@ static int bnxt_setup_tc(struct net_device *dev, u32 
handle, __be16 proto,
return 0;
 }
 
+static int bnxt_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+struct tc_to_netdev *ntc)
+{
+   if (ntc->type != TC_SETUP_MQPRIO)
+   return -EINVAL;
+
+   return bnxt_setup_mq_tc(dev, ntc->tc);
+}
+
 #ifdef CONFIG_RFS_ACCEL
 static bool bnxt_fltr_match(struct bnxt_ntuple_filter *f1,
struct bnxt_ntuple_filter *f2)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 47be789..fcd07ee 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1225,5 +1225,6 @@ static inline void bnxt_disable_poll(struct bnxt_napi 
*bnapi)
 int bnxt_hwrm_fw_set_time(struct bnxt *);
 int bnxt_open_nic(struct bnxt *, bool, bool);
 int bnxt_close_nic(struct bnxt *, bool, bool);
+int bnxt_setup_mq_tc(struct net_device *dev, u8 tc);
 int bnxt_get_max_rings(struct bnxt *, int *, int *, bool);
 #endif
-- 
1.8.3.1



[PATCH net-next v2 2/4] bnxt_en: Update firmware header file to latest 1.6.0.

2016-12-02 Thread Michael Chan
Latest interface has the latest DCB command structs.  Get and store the
max number of lossless TCs the hardware can support.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c   |   28 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h   |5 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h   | 1725 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c |8 +-
 4 files changed, 1069 insertions(+), 697 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 7664281..7ba5a99 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -186,11 +186,11 @@ enum board_idx {
 };
 
 static const u16 bnxt_async_events_arr[] = {
-   HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE,
-   HWRM_ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD,
-   HWRM_ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED,
-   HWRM_ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE,
-   HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE,
+   ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE,
+   ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD,
+   ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED,
+   ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE,
+   ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE,
 };
 
 static bool bnxt_vf_pciid(enum board_idx idx)
@@ -1476,8 +1476,8 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi 
*bnapi, u32 *raw_cons,
 }
 
 #define BNXT_GET_EVENT_PORT(data)  \
-   ((data) &   \
-HWRM_ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_MASK)
+   ((data) &   \
+ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_MASK)
 
 static int bnxt_async_event_process(struct bnxt *bp,
struct hwrm_async_event_cmpl *cmpl)
@@ -1486,7 +1486,7 @@ static int bnxt_async_event_process(struct bnxt *bp,
 
/* TODO CHIMP_FW: Define event id's for link change, error etc */
switch (event_id) {
-   case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE: {
+   case ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE: {
u32 data1 = le32_to_cpu(cmpl->event_data1);
struct bnxt_link_info *link_info = &bp->link_info;
 
@@ -1502,13 +1502,13 @@ static int bnxt_async_event_process(struct bnxt *bp,
set_bit(BNXT_LINK_SPEED_CHNG_SP_EVENT, &bp->sp_event);
/* fall thru */
}
-   case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE:
+   case ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE:
set_bit(BNXT_LINK_CHNG_SP_EVENT, &bp->sp_event);
break;
-   case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD:
+   case ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD:
set_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event);
break;
-   case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED: {
+   case ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED: {
u32 data1 = le32_to_cpu(cmpl->event_data1);
u16 port_id = BNXT_GET_EVENT_PORT(data1);
 
@@ -1521,7 +1521,7 @@ static int bnxt_async_event_process(struct bnxt *bp,
set_bit(BNXT_HWRM_PORT_MODULE_SP_EVENT, &bp->sp_event);
break;
}
-   case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE:
+   case ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE:
if (BNXT_PF(bp))
goto async_event_process_exit;
set_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event);
@@ -4261,12 +4261,16 @@ static int bnxt_hwrm_queue_qportcfg(struct bnxt *bp)
goto qportcfg_exit;
}
bp->max_tc = resp->max_configurable_queues;
+   bp->max_lltc = resp->max_configurable_lossless_queues;
if (bp->max_tc > BNXT_MAX_QUEUE)
bp->max_tc = BNXT_MAX_QUEUE;
 
if (resp->queue_cfg_info & QUEUE_QPORTCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG)
bp->max_tc = 1;
 
+   if (bp->max_lltc > bp->max_tc)
+   bp->max_lltc = bp->max_tc;
+
qptr = &resp->queue_id0;
for (i = 0; i < bp->max_tc; i++) {
bp->q_info[i].queue_id = *qptr++;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index fcd07ee..1f3d852 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -11,10 +11,10 @@
 #define BNXT_H
 
 #define DRV_MODULE_NAME"bnxt_en"
-#define DRV_MODULE_VERSION "1.5.0"
+#define DRV_MODULE_VERSION "1.6.0"
 
 #define DRV_VER_MAJ1
-#define DRV_VER_MIN5
+#define DRV_VER_MIN6
 #define DRV_VER_UPD0
 
 struct tx_bd {
@@ -1010,6 +1010,7 @@ struct bnxt {
u32 rss_hash_cfg;
 
u8  max_tc;
+   u8

[PATCH net-next v2 4/4] bnxt_en: Add PFC statistics.

2016-12-02 Thread Michael Chan
Report PFC statistics to ethtool -S and DCBNL.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  7 +++
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 14 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 23 ---
 3 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 2a714cf..b4abc1b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1124,6 +1124,13 @@ struct bnxt {
u32 lpi_tmr_hi;
 };
 
+#define BNXT_RX_STATS_OFFSET(counter)  \
+   (offsetof(struct rx_port_stats, counter) / 8)
+
+#define BNXT_TX_STATS_OFFSET(counter)  \
+   ((offsetof(struct tx_port_stats, counter) + \
+ sizeof(struct rx_port_stats) + 512) / 8)
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 static inline void bnxt_enable_poll(struct bnxt_napi *bnapi)
 {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
index f391b47..fdf2d8c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
@@ -347,8 +347,10 @@ static int bnxt_dcbnl_ieee_setets(struct net_device *dev, 
struct ieee_ets *ets)
 static int bnxt_dcbnl_ieee_getpfc(struct net_device *dev, struct ieee_pfc *pfc)
 {
struct bnxt *bp = netdev_priv(dev);
+   __le64 *stats = (__le64 *)bp->hw_rx_port_stats;
struct ieee_pfc *my_pfc = bp->ieee_pfc;
-   int rc;
+   long rx_off, tx_off;
+   int i, rc;
 
pfc->pfc_cap = bp->max_lltc;
 
@@ -369,6 +371,16 @@ static int bnxt_dcbnl_ieee_getpfc(struct net_device *dev, 
struct ieee_pfc *pfc)
pfc->mbc = my_pfc->mbc;
pfc->delay = my_pfc->delay;
 
+   if (!stats)
+   return 0;
+
+   rx_off = BNXT_RX_STATS_OFFSET(rx_pfc_ena_frames_pri0);
+   tx_off = BNXT_TX_STATS_OFFSET(tx_pfc_ena_frames_pri0);
+   for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++, rx_off++, tx_off++) {
+   pfc->requests[i] = le64_to_cpu(*(stats + tx_off));
+   pfc->indications[i] = le64_to_cpu(*(stats + rx_off));
+   }
+
return 0;
 }
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index fa6125e..784aa77 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -107,16 +107,9 @@ static int bnxt_set_coalesce(struct net_device *dev,
 
 #define BNXT_NUM_STATS 21
 
-#define BNXT_RX_STATS_OFFSET(counter)  \
-   (offsetof(struct rx_port_stats, counter) / 8)
-
 #define BNXT_RX_STATS_ENTRY(counter)   \
{ BNXT_RX_STATS_OFFSET(counter), __stringify(counter) }
 
-#define BNXT_TX_STATS_OFFSET(counter)  \
-   ((offsetof(struct tx_port_stats, counter) + \
- sizeof(struct rx_port_stats) + 512) / 8)
-
 #define BNXT_TX_STATS_ENTRY(counter)   \
{ BNXT_TX_STATS_OFFSET(counter), __stringify(counter) }
 
@@ -150,6 +143,14 @@ static int bnxt_set_coalesce(struct net_device *dev,
BNXT_RX_STATS_ENTRY(rx_tagged_frames),
BNXT_RX_STATS_ENTRY(rx_double_tagged_frames),
BNXT_RX_STATS_ENTRY(rx_good_frames),
+   BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri0),
+   BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri1),
+   BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri2),
+   BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri3),
+   BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri4),
+   BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri5),
+   BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri6),
+   BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri7),
BNXT_RX_STATS_ENTRY(rx_undrsz_frames),
BNXT_RX_STATS_ENTRY(rx_eee_lpi_events),
BNXT_RX_STATS_ENTRY(rx_eee_lpi_duration),
@@ -179,6 +180,14 @@ static int bnxt_set_coalesce(struct net_device *dev,
BNXT_TX_STATS_ENTRY(tx_fcs_err_frames),
BNXT_TX_STATS_ENTRY(tx_err),
BNXT_TX_STATS_ENTRY(tx_fifo_underruns),
+   BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri0),
+   BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri1),
+   BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri2),
+   BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri3),
+   BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri4),
+   BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri5),
+   BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri6),
+   BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri7),
BNXT_TX_STATS_ENTRY(tx_eee_lpi_events),
BNXT_TX_STATS_ENTRY(tx_eee_lpi_duration),
BNXT_TX_STATS_ENTRY(tx_total_collisions),
-- 
1.8.3.1



[PATCH net-next v2 3/4] bnxt_en: Implement DCBNL to support host-based DCBX.

2016-12-02 Thread Michael Chan
Support only IEEE DCBX initially.  Add IEEE DCBNL ops and functions to
get and set the hardware DCBX parameters.  The DCB code is conditional on
Kconfig CONFIG_BNXT_DCB.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/Kconfig |  10 +
 drivers/net/ethernet/broadcom/bnxt/Makefile   |   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c |   8 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |   9 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 490 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h |  41 +++
 6 files changed, 557 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h

diff --git a/drivers/net/ethernet/broadcom/Kconfig 
b/drivers/net/ethernet/broadcom/Kconfig
index bd8c80c..404c020 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -203,4 +203,14 @@ config BNXT_SRIOV
  Virtualization support in the NetXtreme-C/E products. This
  allows for virtual function acceleration in virtual environments.
 
+config BNXT_DCB
+   bool "Data Center Bridging (DCB) Support"
+   default n
+   depends on BNXT && DCB
+   ---help---
+ Say Y here if you want to use Data Center Bridging (DCB) in the
+ driver.
+
+ If unsure, say N.
+
 endif # NET_VENDOR_BROADCOM
diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile 
b/drivers/net/ethernet/broadcom/bnxt/Makefile
index 97e78e2..b233a86 100644
--- a/drivers/net/ethernet/broadcom/bnxt/Makefile
+++ b/drivers/net/ethernet/broadcom/bnxt/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_BNXT) += bnxt_en.o
 
-bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o
+bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 7ba5a99..e8ab5fd 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -54,6 +54,7 @@
 #include "bnxt.h"
 #include "bnxt_sriov.h"
 #include "bnxt_ethtool.h"
+#include "bnxt_dcb.h"
 
 #define BNXT_TX_TIMEOUT(5 * HZ)
 
@@ -4997,7 +4998,7 @@ static void bnxt_enable_napi(struct bnxt *bp)
}
 }
 
-static void bnxt_tx_disable(struct bnxt *bp)
+void bnxt_tx_disable(struct bnxt *bp)
 {
int i;
struct bnxt_tx_ring_info *txr;
@@ -5015,7 +5016,7 @@ static void bnxt_tx_disable(struct bnxt *bp)
netif_carrier_off(bp->dev);
 }
 
-static void bnxt_tx_enable(struct bnxt *bp)
+void bnxt_tx_enable(struct bnxt *bp)
 {
int i;
struct bnxt_tx_ring_info *txr;
@@ -6686,6 +6687,7 @@ static void bnxt_remove_one(struct pci_dev *pdev)
 
bnxt_hwrm_func_drv_unrgtr(bp);
bnxt_free_hwrm_resources(bp);
+   bnxt_dcb_free(bp);
pci_iounmap(pdev, bp->bar2);
pci_iounmap(pdev, bp->bar1);
pci_iounmap(pdev, bp->bar0);
@@ -6913,6 +6915,8 @@ static int bnxt_init_one(struct pci_dev *pdev, const 
struct pci_device_id *ent)
dev->min_mtu = ETH_ZLEN;
dev->max_mtu = 9500;
 
+   bnxt_dcb_init(bp);
+
 #ifdef CONFIG_BNXT_SRIOV
init_waitqueue_head(&bp->sriov_cfg_wait);
 #endif
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 1f3d852..2a714cf 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1026,6 +1026,13 @@ struct bnxt {
struct bnxt_irq *irq_tbl;
u8  mac_addr[ETH_ALEN];
 
+#ifdef CONFIG_BNXT_DCB
+   struct ieee_pfc *ieee_pfc;
+   struct ieee_ets *ieee_ets;
+   u8  dcbx_cap;
+   u8  default_pri;
+#endif /* CONFIG_BNXT_DCB */
+
u32 msg_enable;
 
u32 hwrm_spec_code;
@@ -1221,6 +1228,8 @@ static inline void bnxt_disable_poll(struct bnxt_napi 
*bnapi)
 int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
 int bnxt_hwrm_set_coal(struct bnxt *);
 int bnxt_hwrm_func_qcaps(struct bnxt *);
+void bnxt_tx_disable(struct bnxt *bp);
+void bnxt_tx_enable(struct bnxt *bp);
 int bnxt_hwrm_set_pause(struct bnxt *);
 int bnxt_hwrm_set_link_setting(struct bnxt *, bool, bool);
 int bnxt_hwrm_fw_set_time(struct bnxt *);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
new file mode 100644
index 000..f391b47
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
@@ -0,0 +1,490 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2014-2016 Broadcom Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "b

[PATCH net-next v2 0/4] bnxt_en: Add DCBNL support.

2016-12-02 Thread Michael Chan
This series adds DCBNL operations to support host-based IEEE DCBX.

v2: Updated to the latest firmware interface spec.

David, please consider this series for net-next.

Michael Chan (4):
  bnxt_en: Re-factor bnxt_setup_tc().
  bnxt_en: Update firmware header file to latest 1.6.0.
  bnxt_en: Implement DCBNL to support host-based DCBX.
  bnxt_en: Add PFC statistics.

 drivers/net/ethernet/broadcom/Kconfig |   10 +
 drivers/net/ethernet/broadcom/bnxt/Makefile   |2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c |   54 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |   22 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c |  502 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h |   41 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c |   23 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 1725 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c   |8 +-
 9 files changed, 1672 insertions(+), 715 deletions(-)
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h

-- 
1.8.3.1



Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs

2016-12-02 Thread Eric Dumazet
On Fri, 2016-12-02 at 16:53 -0800, Alexei Starovoitov wrote:
> On 12/2/16 4:38 PM, Eric Dumazet wrote:
> > On Fri, 2016-12-02 at 15:23 -0800, Martin KaFai Lau wrote:
> >> When XDP prog is attached, it is currently limiting
> >> MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514
> >> in x86.
> >>
> >> AFAICT, since mlx4 is doing one page per packet for XDP,
> >> we can at least raise the MTU limitation up to
> >> PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is
> >> doing.  It will be useful in the next patch which allows
> >> XDP program to extend the packet by adding new header(s).
> >>
> >> Signed-off-by: Martin KaFai Lau 
> >> ---
> >
> > Have you tested your patch on a host with PAGE_SIZE = 64 KB ?
> >
> > Looks XDP really kills arches with bigger pages :(
> 
> I'm afraid xdp mlx[45] support was not tested on arches
> with 64k pages at all. Not just this patch.
> I think people who care about such archs should test?
> Note page per packet is not a hard requirement for all drivers
> and all archs. For mlx[45] it was the easiest and the most
> convenient way to achieve desired performance.
> If there are ways to do the same performance differently,
> I'm all ears :)
> 

My question was more like :

Can we double check all these patches won't break the mlx4 driver (non XDP
path) on arches with PAGE_SIZE=64KB.

I have no plan using XDP before a while, but I certainly know some
customers are using mlx4 on powerpc.





[PATCH] net: ethernet: ti: cpdma: use desc_read in chan_process instead of raw read

2016-12-02 Thread Ivan Khoronzhuk
There is a desc_read() macro to read desc fields, so there is no need to
use __raw_readl().

Signed-off-by: Ivan Khoronzhuk 
---
Based on net-next/master

 drivers/net/ethernet/ti/davinci_cpdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c 
b/drivers/net/ethernet/ti/davinci_cpdma.c
index c776e45..d96dca5 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -1132,7 +1132,7 @@ static int __cpdma_chan_process(struct cpdma_chan *chan)
}
desc_dma = desc_phys(pool, desc);
 
-   status  = __raw_readl(&desc->hw_mode);
+   status = desc_read(desc, hw_mode);
outlen  = status & 0x7ff;
if (status & CPDMA_DESC_OWNER) {
chan->stats.busy_dequeue++;
-- 
2.7.4



Re: [PATCH net-next] liquidio: 'imply' ptp instead of 'select'

2016-12-02 Thread kbuild test robot
Hi Arnd,

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Arnd-Bergmann/liquidio-imply-ptp-instead-of-select/20161203-084019
config: x86_64-allmodconfig
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
make ARCH=x86_64  allmodconfig
make ARCH=x86_64 

All errors (new ones prefixed by >>):

>> drivers/net/ethernet/cavium/Kconfig:81: syntax error
>> drivers/net/ethernet/cavium/Kconfig:80: unknown option "imply"
   make[2]: *** [allmodconfig] Error 1
   make[1]: *** [allmodconfig] Error 2
   make: *** [sub-make] Error 2
--
>> drivers/net/ethernet/cavium/Kconfig:81: syntax error
>> drivers/net/ethernet/cavium/Kconfig:80: unknown option "imply"
   make[2]: *** [oldconfig] Error 1
   make[1]: *** [oldconfig] Error 2
   make: *** [sub-make] Error 2
--
>> drivers/net/ethernet/cavium/Kconfig:81: syntax error
>> drivers/net/ethernet/cavium/Kconfig:80: unknown option "imply"
   make[2]: *** [olddefconfig] Error 1
   make[2]: Target 'oldnoconfig' not remade because of errors.
   make[1]: *** [oldnoconfig] Error 2
   make: *** [sub-make] Error 2

vim +81 drivers/net/ethernet/cavium/Kconfig

d07a147f David Daney 2016-03-14  74   port on Cavium Networks' 
Octeon CN57XX, CN56XX, CN55XX,
d07a147f David Daney 2016-03-14  75   CN54XX, CN52XX, and CN6XXX 
chips.
d07a147f David Daney 2016-03-14  76  
111fc64a Raghu Vatsavayi 2016-11-28  77  config LIQUIDIO_VF
111fc64a Raghu Vatsavayi 2016-11-28  78 tristate "Cavium LiquidIO VF 
support"
111fc64a Raghu Vatsavayi 2016-11-28  79 depends on 64BIT && PCI_MSI
2d6e65ca Arnd Bergmann   2016-12-03 @80 imply PTP_1588_CLOCK
111fc64a Raghu Vatsavayi 2016-11-28 @81 ---help---
111fc64a Raghu Vatsavayi 2016-11-28  82   This driver supports Cavium 
LiquidIO Intelligent Server Adapter
111fc64a Raghu Vatsavayi 2016-11-28  83   based on CN23XX chips.
111fc64a Raghu Vatsavayi 2016-11-28  84  

:: The code at line 81 was first introduced by commit
:: 111fc64a237f231bc2d3187bdf8358eb7966e6a9 liquidio CN23XX: VF registration

:: TO: Raghu Vatsavayi 
:: CC: David S. Miller 

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


[PATCH] net: ping: check minimum size on ICMP header length

2016-12-02 Thread Kees Cook
Prior to commit c0371da6047a ("put iov_iter into msghdr") in v3.19, there
was no check that the iovec contained enough bytes for an ICMP header,
and the read loop would walk across neighboring stack contents. Since
the iov_iter conversion, bad arguments are noticed, but the returned
error is EFAULT. Returning EMSGSIZE is a clearer fix and solves the
problem prior to v3.19.

This was found using trinity with KASAN on v3.18:

BUG: KASAN: stack-out-of-bounds in memcpy_fromiovec+0x60/0x114 at addr 
ffc071077da0
Read of size 8 by task trinity-c2/9623
page:ffbe034b9a08 count:0 mapcount:0 mapping:  (null) index:0x0
flags: 0x0()
page dumped because: kasan: bad access detected
CPU: 0 PID: 9623 Comm: trinity-c2 Tainted: GBU 3.18.0-dirty #15
Hardware name: Google Tegra210 Smaug Rev 1,3+ (DT)
Call trace:
[] dump_backtrace+0x0/0x1ac arch/arm64/kernel/traps.c:90
[] show_stack+0x10/0x1c arch/arm64/kernel/traps.c:171
[< inline >] __dump_stack lib/dump_stack.c:15
[] dump_stack+0x7c/0xd0 lib/dump_stack.c:50
[< inline >] print_address_description mm/kasan/report.c:147
[< inline >] kasan_report_error mm/kasan/report.c:236
[] kasan_report+0x380/0x4b8 mm/kasan/report.c:259
[< inline >] check_memory_region mm/kasan/kasan.c:264
[] __asan_load8+0x20/0x70 mm/kasan/kasan.c:507
[] memcpy_fromiovec+0x5c/0x114 lib/iovec.c:15
[< inline >] memcpy_from_msg include/linux/skbuff.h:2667
[] ping_common_sendmsg+0x50/0x108 net/ipv4/ping.c:674
[] ping_v4_sendmsg+0xd8/0x698 net/ipv4/ping.c:714
[] inet_sendmsg+0xe0/0x12c net/ipv4/af_inet.c:749
[< inline >] __sock_sendmsg_nosec net/socket.c:624
[< inline >] __sock_sendmsg net/socket.c:632
[] sock_sendmsg+0x124/0x164 net/socket.c:643
[< inline >] SYSC_sendto net/socket.c:1797
[] SyS_sendto+0x178/0x1d8 net/socket.c:1761

CVE-2016-8399

Reported-by: Qidan He 
Fixes: c319b4d76b9e ("net: ipv4: add IPPROTO_ICMP socket kind")
Cc: sta...@vger.kernel.org
Signed-off-by: Kees Cook 
---
 net/ipv4/ping.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 205e2000d395..8257be3f032c 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -654,7 +654,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, 
size_t len,
void *user_icmph, size_t icmph_len) {
u8 type, code;
 
-   if (len > 0xFFFF)
+   if (len > 0xFFFF || len < icmph_len)
return -EMSGSIZE;
 
/*
-- 
2.7.4


-- 
Kees Cook
Nexus Security


Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs

2016-12-02 Thread Alexei Starovoitov

On 12/2/16 4:38 PM, Eric Dumazet wrote:

On Fri, 2016-12-02 at 15:23 -0800, Martin KaFai Lau wrote:

When XDP prog is attached, it is currently limiting
MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514
in x86.

AFAICT, since mlx4 is doing one page per packet for XDP,
we can at least raise the MTU limitation up to
PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is
doing.  It will be useful in the next patch which allows
XDP program to extend the packet by adding new header(s).

Signed-off-by: Martin KaFai Lau 
---


Have you tested your patch on a host with PAGE_SIZE = 64 KB ?

Looks XDP really kills arches with bigger pages :(


I'm afraid xdp mlx[45] support was not tested on arches
with 64k pages at all. Not just this patch.
I think people who care about such archs should test?
Note page per packet is not a hard requirement for all drivers
and all archs. For mlx[45] it was the easiest and the most
convenient way to achieve desired performance.
If there are ways to do the same performance differently,
I'm all ears :)



Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs

2016-12-02 Thread Eric Dumazet
On Fri, 2016-12-02 at 15:23 -0800, Martin KaFai Lau wrote:
> When XDP prog is attached, it is currently limiting
> MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514
> in x86.
> 
> AFAICT, since mlx4 is doing one page per packet for XDP,
> we can at least raise the MTU limitation up to
> PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is
> doing.  It will be useful in the next patch which allows
> XDP program to extend the packet by adding new header(s).
> 
> Signed-off-by: Martin KaFai Lau 
> ---

Have you tested your patch on a host with PAGE_SIZE = 64 KB ?

Looks XDP really kills arches with bigger pages :(

Thanks.




Re: [PATCH 5/7] Documentation: DT: net: cpsw: allow to specify descriptors pool size

2016-12-02 Thread Ivan Khoronzhuk
On Fri, Dec 02, 2016 at 11:22:28AM -0600, Grygorii Strashko wrote:
> 
> 
> On 12/02/2016 05:28 AM, Ivan Khoronzhuk wrote:
> > On Thu, Dec 01, 2016 at 05:34:30PM -0600, Grygorii Strashko wrote:
> >> Add optional property "descs_pool_size" to specify buffer descriptor's
> >> pool size. The "descs_pool_size" should define total number of CPDMA
> >> CPPI descriptors to be used for both ingress/egress packets
> >> processing. If not specified - the default value 256 will be used
> >> which will allow to place descriptor's pool into the internal CPPI
> >> RAM on most of TI SoC.
> >>
> >> Signed-off-by: Grygorii Strashko 
> >> ---
> >>  Documentation/devicetree/bindings/net/cpsw.txt | 5 +
> >>  1 file changed, 5 insertions(+)
> >>
> >> diff --git a/Documentation/devicetree/bindings/net/cpsw.txt 
> >> b/Documentation/devicetree/bindings/net/cpsw.txt
> >> index 5ad439f..b99d196 100644
> >> --- a/Documentation/devicetree/bindings/net/cpsw.txt
> >> +++ b/Documentation/devicetree/bindings/net/cpsw.txt
> >> @@ -35,6 +35,11 @@ Optional properties:
> >>  For example in dra72x-evm, pcf gpio has to be
> >>  driven low so that cpsw slave 0 and phy data
> >>  lines are connected via mux.
> >> +- descs_pool_size : total number of CPDMA CPPI descriptors to be used for
> >> +both ingress/egress packets processing. if not
> >> +specified the default value 256 will be used which
> >> +will allow to place descriptors pool into the
> >> +internal CPPI RAM.
> > Does it describe h/w? Why not a module parameter? Or even something like
> > ethtool num ring entries?
> > 
> 
> It can be module parameter too. in general this is expected to be 
>  one-time boot setting only.  
> 
> - OR
> So, do you propose to use 
>ethtool -g ethX
> 
>ethtool -G ethX [rx N] [tx N]
> ?
It has a little different names, but yes, why not?
No need, maybe, but it's just a proposition, at least I was thinking
about it after proposition from +cc Schuyler Patton to leave rx desc num
property. In this case it's possible to tune tx/rx desc num ratio, even
with SRAM descs.

> 
> Now cpdma has one pool for all RX/TX channels, so changing this settings
> by ethtool will require: pause interfaces, reallocate cpdma pool, 
Pause can lead to losses only for rx, and only for a very short time, so
it's not very bad, especially when user knows what he is doing.


> re-arrange buffers between channels, resume interface. Correct?
correct.

But, some alternative variants can be used, like replacing descriptors.
Shrink num of desc for every channels to 1, replace/add others, and expand.
In this case no losses, but it's harder to debug issues afterwards.

> 
> How do you think - we can move forward with one pool or better to have two 
> (Rx and Tx)?
I think one is enough, just split, if no harm on perf.

> 
> Wouldn't it be reasonable to still have DT (or module) parameter to avoid 
> cpdma reconfiguration on system startup (pause/resume interfaces) (faster 
> boot)?
Would be, your choice, but it's not flexible.

> 
> How about cpdma re-allocation policy (with expectation that is shouldn't 
> happen too often)?
> - increasing of Rx, Tx will grow total number of physically allocated buffers 
> (total_desc_num)
> - decreasing of Rx, Tx will just change number of available buffers (no 
> memory re-allocation)
> 
> - OR 
> Can we move forward with current patch (total number of CPDMA CPPI 
> descriptors defined in DT) 
> and add ethtool -G ethX [rx N] [tx N] which will allow to re-split descs 
> between RX and TX?
No objections, It anyway requires re-allocations. Re-split of Rx and Tx will
not have a lot changes as most code exists already.

> 
> 
> 
> -- 
> regards,
> -grygorii


Re: [PATCH net] geneve: avoid use-after-free of skb->data

2016-12-02 Thread Sabrina Dubroca
2016-12-02, 14:09:25 -0500, David Miller wrote:
> From: Sabrina Dubroca 
> Date: Fri,  2 Dec 2016 16:49:29 +0100
> 
> > geneve{,6}_build_skb can end up doing a pskb_expand_head(), which
> > makes the ip_hdr(skb) reference we stashed earlier stale. Since it's
> > only needed as an argument to ip_tunnel_ecn_encap(), move this
> > directly in the function call.
> > 
> > Fixes: 08399efc6319 ("geneve: ensure ECN info is handled properly in all 
> > tx/rx paths")
> > Signed-off-by: Sabrina Dubroca 
> 
> Applied and queued up for -stable, thanks.
> 
> This bug happens so many times that I think it might be time for
> a debugging mode for pskb_expand_head() that unconditionally
> reallocates the skb->data buffer regardless of whether it's
> necessary or not and somehow unmaps the previous buffer to
> force a trap on stale pointers.

The problem with that is you'd need to enable the "debugging mode" in
all wrappers, so that they don't bypass the actual call to
pskb_expand_head(). And that still leaves all the direct calls to
pskb_expand_head() that are guarded by some kind of check (just two
random hits without even looking very hard:
net/core/pktgen.c:process_ipsec, net/ipv4/ip_gre.c:gre_fb_xmit).

Then I think we could just rely on KASAN (that's how I noticed this
bug).


> Better ideas welcome, of course :)

May not be better ;)  but at least another idea:

I'd like to try something based on static analysis. We'd need a way to
tag cached pointers to skb->data (via ip_hdr() or whatever), and
propagate the notion that pskb_expand_head() makes these cached
pointers stale through layers of function calls.  I don't know how
feasible this is with the tools we have.

-- 
Sabrina


Re: [PATCH net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog

2016-12-02 Thread Daniel Borkmann

On 12/03/2016 12:23 AM, Martin KaFai Lau wrote:

This patch allows XDP prog to extend/remove the packet
data at the head (like adding or removing header).  It is
done by adding a new XDP helper bpf_xdp_adjust_head().

It also renames bpf_helper_changes_skb_data() to
bpf_helper_changes_pkt_data() to better reflect
that XDP prog does not work on skb.

Signed-off-by: Martin KaFai Lau 

[...]

diff --git a/net/core/filter.c b/net/core/filter.c
index 56b43587d200..6902e2f73e38 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2234,7 +2234,34 @@ static const struct bpf_func_proto 
bpf_skb_change_head_proto = {
.arg3_type  = ARG_ANYTHING,
  };

-bool bpf_helper_changes_skb_data(void *func)
+BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
+{
+   /* Both mlx4 and mlx5 driver align each packet to PAGE_SIZE when
+* XDP prog is set.
+* If the above is not true for the other drivers to support
+* bpf_xdp_adjust_head, struct xdp_buff can be extended.
+*/
+   void *head = (void *)((unsigned long)xdp->data & PAGE_MASK);
+   void *new_data = xdp->data + offset;
+
+   if (new_data < head || new_data >= xdp->data_end)
+   /* The packet length must be >=1 */


Patch looks generally good to me. Should the min pkt len here be
limited to ETH_HLEN instead of 1?


+   return -EINVAL;
+
+   xdp->data = new_data;
+
+   return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
+   .func   = bpf_xdp_adjust_head,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_PTR_TO_CTX,
+   .arg2_type  = ARG_ANYTHING,
+};
+
+bool bpf_helper_changes_pkt_data(void *func)
  {
if (func == bpf_skb_vlan_push ||
func == bpf_skb_vlan_pop ||

[...]


Re: bpf bounded loops. Was: [flamebait] xdp

2016-12-02 Thread Alexei Starovoitov
On Fri, Dec 02, 2016 at 11:42:15AM -0800, John Fastabend wrote:
> >> As far as pattern search for DNS packets...
> >> it was requested by Cloudflare guys back in March:
> >> https://github.com/iovisor/bcc/issues/471
> >> and it is useful for several tracing use cases as well.
> >> Unfortunately no one had time to implement it yet.
> > 
> > The string operations you proposed on the other hand, which would count
> > as one eBPF instructions, would give a lot more flexibility and allow
> > more cycles to burn, but don't help parsing binary protocols like IPv6
> > extension headers.

these are two separate things. we need pattern search regardless
of bounded loops. bpf program shouldn't be doing any complicated
algorithms. The main reasons to have loops are:
- speed up execution (smaller I-cache footprint)
- avoid forcing compiler to unroll loops (easier for users)
- support loops where unroll is not possible (like example below)

> My rough thinking on this was the verifier had to start looking for loop
> invariants and to guarantee termination. Sounds scary in general but
> LLVM could put these in some normal form for us and the verifier could
> only accept decreasing loops, the invariants could be required to be
> integers, etc. By simplifying the loop enough the problem becomes
> tractable.

yep. I think what Hannes was proposing earlier is straightforward
to implement for a compiler guy. The following:
for (int i = 0; i < (var & 0xff); i++)
  sum += map->value[i];  /* map value_size >= 0xff */
is obviously bounded and dataflow analysis can easily prove
that all memory operations are valid.
Static analysis tools do way way more than this.

> I think this would be better than new instructions and/or multiple
> verifiers.

agree that it's better than new instructions that would have
required JIT changes. Though there are pros to new insns too :)



[PATCH net-next 3/4] mlx4: xdp: Reserve headroom for receiving packet when XDP prog is active

2016-12-02 Thread Martin KaFai Lau
Reserve XDP_PACKET_HEADROOM when XDP prog is active.

Signed-off-by: Martin KaFai Lau 
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 17 +++--
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 23 +--
 drivers/net/ethernet/mellanox/mlx4/en_tx.c |  9 +
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |  3 ++-
 4 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 5df0bbd88d67..fb6d87dbc350 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -51,7 +51,8 @@
 #include "mlx4_en.h"
 #include "en_port.h"
 
-#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN)))
+#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \
+  XDP_PACKET_HEADROOM))
 
 int mlx4_en_setup_tc(struct net_device *dev, u8 up)
 {
@@ -1551,6 +1552,7 @@ int mlx4_en_start_port(struct net_device *dev)
struct mlx4_en_tx_ring *tx_ring;
int rx_index = 0;
int err = 0;
+   int mtu;
int i, t;
int j;
u8 mc_list[16] = {0};
@@ -1684,8 +1686,12 @@ int mlx4_en_start_port(struct net_device *dev)
}
 
/* Configure port */
+   mtu = priv->rx_skb_size + ETH_FCS_LEN;
+   if (priv->tx_ring_num[TX_XDP])
+   mtu += XDP_PACKET_HEADROOM;
+
err = mlx4_SET_PORT_general(mdev->dev, priv->port,
-   priv->rx_skb_size + ETH_FCS_LEN,
+   mtu,
priv->prof->tx_pause,
priv->prof->tx_ppp,
priv->prof->rx_pause,
@@ -2268,6 +2274,13 @@ static bool mlx4_en_check_xdp_mtu(struct net_device 
*dev, int mtu)
 {
struct mlx4_en_priv *priv = netdev_priv(dev);
 
+   if (mtu + XDP_PACKET_HEADROOM > priv->max_mtu) {
+   en_err(priv,
+  "Device max mtu:%d does not allow %d bytes reserved 
headroom for XDP prog\n",
+  priv->max_mtu, XDP_PACKET_HEADROOM);
+   return false;
+   }
+
if (mtu > MLX4_EN_MAX_XDP_MTU) {
en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n",
   mtu, MLX4_EN_MAX_XDP_MTU);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 23e9d04d1ef4..324771ac929e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -96,7 +96,6 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
const struct mlx4_en_frag_info *frag_info;
struct page *page;
-   dma_addr_t dma;
int i;
 
for (i = 0; i < priv->num_frags; i++) {
@@ -115,9 +114,10 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
 
for (i = 0; i < priv->num_frags; i++) {
frags[i] = ring_alloc[i];
-   dma = ring_alloc[i].dma + ring_alloc[i].page_offset;
+   frags[i].page_offset += priv->frag_info[i].rx_headroom;
+   rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
+   frags[i].page_offset);
ring_alloc[i] = page_alloc[i];
-   rx_desc->data[i].addr = cpu_to_be64(dma);
}
 
return 0;
@@ -250,7 +250,8 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv 
*priv,
 
if (ring->page_cache.index > 0) {
frags[0] = ring->page_cache.buf[--ring->page_cache.index];
-   rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
+   rx_desc->data[0].addr = cpu_to_be64(frags[0].dma +
+   frags[0].page_offset);
return 0;
}
 
@@ -889,6 +890,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct 
mlx4_en_cq *cq, int bud
if (xdp_prog) {
struct xdp_buff xdp;
dma_addr_t dma;
+   void *pg_addr, *orig_data;
u32 act;
 
dma = be64_to_cpu(rx_desc->data[0].addr);
@@ -896,11 +898,18 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct 
mlx4_en_cq *cq, int bud
priv->frag_info[0].frag_size,
DMA_FROM_DEVICE);
 
-   xdp.data = page_address(frags[0].page) +
-   frags[0].page_offset;
+   pg_addr = page_address(frags[0].page);
+   orig_data = pg_addr + frags[0].page_offset;
+   xdp.data = orig_data;
xdp.data_end = x

Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs

2016-12-02 Thread Rick Jones

On 12/02/2016 03:23 PM, Martin KaFai Lau wrote:

When XDP prog is attached, it is currently limiting
MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514
in x86.

AFAICT, since mlx4 is doing one page per packet for XDP,
we can at least raise the MTU limitation up to
PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is
doing.  It will be useful in the next patch which allows
XDP program to extend the packet by adding new header(s).


Is mlx4 the only driver doing page-per-packet?

rick jones



[PATCH net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog

2016-12-02 Thread Martin KaFai Lau
This patch allows XDP prog to extend/remove the packet
data at the head (like adding or removing header).  It is
done by adding a new XDP helper bpf_xdp_adjust_head().

It also renames bpf_helper_changes_skb_data() to
bpf_helper_changes_pkt_data() to better reflect
that XDP prog does not work on skb.

Signed-off-by: Martin KaFai Lau 
---
 arch/powerpc/net/bpf_jit_comp64.c |  4 ++--
 arch/s390/net/bpf_jit_comp.c  |  2 +-
 arch/x86/net/bpf_jit_comp.c   |  2 +-
 include/linux/filter.h|  2 +-
 include/uapi/linux/bpf.h  | 11 ++-
 kernel/bpf/core.c |  2 +-
 kernel/bpf/verifier.c |  2 +-
 net/core/filter.c | 34 --
 8 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 0fe98a567125..73a5cf18fd84 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -766,7 +766,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 
*image,
func = (u8 *) __bpf_call_base + imm;
 
/* Save skb pointer if we need to re-cache skb data */
-   if (bpf_helper_changes_skb_data(func))
+   if (bpf_helper_changes_pkt_data(func))
PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
 
bpf_jit_emit_func_call(image, ctx, (u64)func);
@@ -775,7 +775,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 
*image,
PPC_MR(b2p[BPF_REG_0], 3);
 
/* refresh skb cache */
-   if (bpf_helper_changes_skb_data(func)) {
+   if (bpf_helper_changes_pkt_data(func)) {
/* reload skb pointer to r3 */
PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
bpf_jit_emit_skb_loads(image, ctx);
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index bee281f3163d..167b31b186c1 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -981,7 +981,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, 
struct bpf_prog *fp, int i
EMIT2(0x0d00, REG_14, REG_W1);
/* lgr %b0,%r2: load return value into %b0 */
EMIT4(0xb904, BPF_REG_0, REG_2);
-   if (bpf_helper_changes_skb_data((void *)func)) {
+   if (bpf_helper_changes_pkt_data((void *)func)) {
jit->seen |= SEEN_SKB_CHANGE;
/* lg %b1,ST_OFF_SKBP(%r15) */
EMIT6_DISP_LH(0xe300, 0x0004, BPF_REG_1, REG_0,
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index fe04a04dab8e..e76d1af60f7a 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -853,7 +853,7 @@ xadd:   if (is_imm8(insn->off))
func = (u8 *) __bpf_call_base + imm32;
jmp_offset = func - (image + addrs[i]);
if (seen_ld_abs) {
-   reload_skb_data = 
bpf_helper_changes_skb_data(func);
+   reload_skb_data = 
bpf_helper_changes_pkt_data(func);
if (reload_skb_data) {
EMIT1(0x57); /* push %rdi */
jmp_offset += 22; /* pop, mov, sub, mov 
*/
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 97338134398f..3c02de77ad6a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -590,7 +590,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter 
*fp);
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
-bool bpf_helper_changes_skb_data(void *func);
+bool bpf_helper_changes_pkt_data(void *func);
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
   const struct bpf_insn *patch, u32 len);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6123d9b8e828..0eb0e87dbe9f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -424,6 +424,12 @@ union bpf_attr {
  * @len: length of header to be pushed in front
  * @flags: Flags (unused for now)
  * Return: 0 on success or negative error
+ *
+ * int bpf_xdp_adjust_head(xdp_md, delta)
+ * Adjust the xdp_md.data by delta
+ * @xdp_md: pointer to xdp_md
+ * @delta: An positive/negative integer to be added to xdp_md.data
+ * Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -469,7 +475,8 @@ union bpf_attr {
FN(csum_update),\
FN(set_hash_invalid),   \
FN(ge

[PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs

2016-12-02 Thread Martin KaFai Lau
When XDP prog is attached, it is currently limiting
MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514
in x86.

AFAICT, since mlx4 is doing one page per packet for XDP,
we can at least raise the MTU limitation up to
PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is
doing.  It will be useful in the next patch which allows
XDP program to extend the packet by adding new header(s).

Signed-off-by: Martin KaFai Lau 
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 28 +++-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 46 ++
 2 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 091b904262bc..5df0bbd88d67 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -51,6 +51,8 @@
 #include "mlx4_en.h"
 #include "en_port.h"
 
+#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN)))
+
 int mlx4_en_setup_tc(struct net_device *dev, u8 up)
 {
struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -2262,6 +2264,19 @@ void mlx4_en_destroy_netdev(struct net_device *dev)
free_netdev(dev);
 }
 
+static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu)
+{
+   struct mlx4_en_priv *priv = netdev_priv(dev);
+
+   if (mtu > MLX4_EN_MAX_XDP_MTU) {
+   en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n",
+  mtu, MLX4_EN_MAX_XDP_MTU);
+   return false;
+   }
+
+   return true;
+}
+
 static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
 {
struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -2271,11 +2286,10 @@ static int mlx4_en_change_mtu(struct net_device *dev, 
int new_mtu)
en_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n",
 dev->mtu, new_mtu);
 
-   if (priv->tx_ring_num[TX_XDP] && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) {
-   en_err(priv, "MTU size:%d requires frags but XDP running\n",
-  new_mtu);
-   return -EOPNOTSUPP;
-   }
+   if (priv->tx_ring_num[TX_XDP] &&
+   !mlx4_en_check_xdp_mtu(dev, new_mtu))
+   return -ENOTSUPP;
+
dev->mtu = new_mtu;
 
if (netif_running(dev)) {
@@ -2723,10 +2737,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct 
bpf_prog *prog)
return 0;
}
 
-   if (priv->num_frags > 1) {
-   en_err(priv, "Cannot set XDP if MTU requires multiple frags\n");
+   if (!mlx4_en_check_xdp_mtu(dev, dev->mtu))
return -EOPNOTSUPP;
-   }
 
tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
if (!tmp)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 6562f78b07f4..23e9d04d1ef4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1164,37 +1164,39 @@ static const int frag_sizes[] = {
 
 void mlx4_en_calc_rx_buf(struct net_device *dev)
 {
-   enum dma_data_direction dma_dir = PCI_DMA_FROMDEVICE;
struct mlx4_en_priv *priv = netdev_priv(dev);
int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
-   int order = MLX4_EN_ALLOC_PREFER_ORDER;
-   u32 align = SMP_CACHE_BYTES;
-   int buf_size = 0;
int i = 0;
 
/* bpf requires buffers to be set up as 1 packet per page.
 * This only works when num_frags == 1.
 */
if (priv->tx_ring_num[TX_XDP]) {
-   dma_dir = PCI_DMA_BIDIRECTIONAL;
-   /* This will gain efficient xdp frame recycling at the expense
-* of more costly truesize accounting
+   priv->frag_info[0].order = 0;
+   priv->frag_info[0].frag_size = eff_mtu;
+   priv->frag_info[0].frag_prefix_size = 0;
+   /* This will gain efficient xdp frame recycling at the
+* expense of more costly truesize accounting
 */
-   align = PAGE_SIZE;
-   order = 0;
-   }
-
-   while (buf_size < eff_mtu) {
-   priv->frag_info[i].order = order;
-   priv->frag_info[i].frag_size =
-   (eff_mtu > buf_size + frag_sizes[i]) ?
-   frag_sizes[i] : eff_mtu - buf_size;
-   priv->frag_info[i].frag_prefix_size = buf_size;
-   priv->frag_info[i].frag_stride =
-   ALIGN(priv->frag_info[i].frag_size, align);
-   priv->frag_info[i].dma_dir = dma_dir;
-   buf_size += priv->frag_info[i].frag_size;
-   i++;
+   priv->frag_info[0].frag_stride = PAGE_SIZE;
+   priv->frag_info[0].dma_dir = PCI_DMA_BIDIRECTIONAL;
+   i = 1;
+   } else {
+   int buf_size = 0;
+
+   while (buf_s

Re: [PATCH v2 0/7] stmmac: dwmac-meson8b: configurable RGMII TX delay

2016-12-02 Thread Martin Blumenstingl
On Mon, Nov 28, 2016 at 2:33 AM, David Miller  wrote:
> From: Martin Blumenstingl 
> Date: Fri, 25 Nov 2016 14:01:49 +0100
>
>> Currently the dwmac-meson8b stmmac glue driver uses a hardcoded 1/4
>> cycle TX clock delay. This seems to work fine for many boards (for
>> example Odroid-C2 or Amlogic's reference boards) but there are some
>> others where TX traffic is simply broken.
>> There are probably multiple reasons why it's working on some boards
>> while it's broken on others:
>> - some of Amlogic's reference boards are using a Micrel PHY
>> - hardware circuit design
>> - maybe more...
>
> The ARM arch file changes do not apply cleanly to net-next, you probably
> want to merge them via the ARM tree instead of mine, and respin this series
> to be without the .dts file changes.
done, v3 contains only the net-next changes while the dts changes can
be found here: [0]


Regards,
Martin


[0] http://lists.infradead.org/pipermail/linux-amlogic/2016-December/001836.html


[PATCH net-next 4/4] bpf: xdp: Add XDP example for head adjustment

2016-12-02 Thread Martin KaFai Lau
The XDP prog checks if the incoming packet matches any VIP:PORT
combination in the BPF hashmap.  If it is, it will encapsulate
the packet with a IPv4/v6 header as instructed by the value of
the BPF hashmap and then XDP_TX it out.

The VIP:PORT -> IP-Encap-Info can be specified by the cmd args
of the user prog.

Signed-off-by: Martin KaFai Lau 
---
 samples/bpf/Makefile  |   4 +
 samples/bpf/bpf_helpers.h |   2 +
 samples/bpf/bpf_load.c|  94 ++
 samples/bpf/bpf_load.h|   1 +
 samples/bpf/xdp1_user.c   |  93 --
 samples/bpf/xdp_tx_iptnl_common.h |  37 ++
 samples/bpf/xdp_tx_iptnl_kern.c   | 232 ++
 samples/bpf/xdp_tx_iptnl_user.c   | 253 ++
 8 files changed, 623 insertions(+), 93 deletions(-)
 create mode 100644 samples/bpf/xdp_tx_iptnl_common.h
 create mode 100644 samples/bpf/xdp_tx_iptnl_kern.c
 create mode 100644 samples/bpf/xdp_tx_iptnl_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index bfc2cb88a1f7..e4d6be8bd94b 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -32,6 +32,7 @@ hostprogs-y += trace_event
 hostprogs-y += sampleip
 hostprogs-y += tc_l2_redirect
 hostprogs-y += lwt_len_hist
+hostprogs-y += xdp_tx_iptnl
 
 test_lru_dist-objs := test_lru_dist.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
@@ -65,6 +66,7 @@ trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
 sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
 tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o
 lwt_len_hist-objs := bpf_load.o libbpf.o lwt_len_hist_user.o
+xdp_tx_iptnl-objs := bpf_load.o libbpf.o xdp_tx_iptnl_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -97,6 +99,7 @@ always += test_current_task_under_cgroup_kern.o
 always += trace_event_kern.o
 always += sampleip_kern.o
 always += lwt_len_hist_kern.o
+always += xdp_tx_iptnl_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
@@ -127,6 +130,7 @@ HOSTLOADLIBES_trace_event += -lelf
 HOSTLOADLIBES_sampleip += -lelf
 HOSTLOADLIBES_tc_l2_redirect += -l elf
 HOSTLOADLIBES_lwt_len_hist += -l elf
+HOSTLOADLIBES_xdp_tx_iptnl += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on 
cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc 
CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index a246c6122629..8e9dca50b73a 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -57,6 +57,8 @@ static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int 
size) =
(void *) BPF_FUNC_skb_set_tunnel_opt;
 static unsigned long long (*bpf_get_prandom_u32)(void) =
(void *) BPF_FUNC_get_prandom_u32;
+static int (*bpf_xdp_adjust_head)(void *ctx, int offset) =
+   (void *) BPF_FUNC_xdp_adjust_head;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 49b45ccbe153..e30b6de94f2e 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -12,6 +12,10 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -450,3 +454,93 @@ struct ksym *ksym_search(long key)
/* out of range. return _stext */
return &syms[0];
 }
+
+int set_link_xdp_fd(int ifindex, int fd)
+{
+   struct sockaddr_nl sa;
+   int sock, seq = 0, len, ret = -1;
+   char buf[4096];
+   struct nlattr *nla, *nla_xdp;
+   struct {
+   struct nlmsghdr  nh;
+   struct ifinfomsg ifinfo;
+   char attrbuf[64];
+   } req;
+   struct nlmsghdr *nh;
+   struct nlmsgerr *err;
+
+   memset(&sa, 0, sizeof(sa));
+   sa.nl_family = AF_NETLINK;
+
+   sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+   if (sock < 0) {
+   printf("open netlink socket: %s\n", strerror(errno));
+   return -1;
+   }
+
+   if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+   printf("bind to netlink: %s\n", strerror(errno));
+   goto cleanup;
+   }
+
+   memset(&req, 0, sizeof(req));
+   req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+   req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+   req.nh.nlmsg_type = RTM_SETLINK;
+   req.nh.nlmsg_pid = 0;
+   req.nh.nlmsg_seq = ++seq;
+   req.ifinfo.ifi_family = AF_UNSPEC;
+   req.ifinfo.ifi_index = ifindex;
+   nla = (struct nlattr *)(((char *)&req)
+   + NLMSG_ALIGN(req.nh.nlmsg_len));
+   nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
+
+   nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
+   nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
+   nla_xdp->nla_len = NLA_HD

[PATCH net-next 0/4]: Allow head adjustment in XDP prog

2016-12-02 Thread Martin KaFai Lau
This series adds a helper to allow head adjustment in XDP prog.  mlx4
driver has been modified to support this feature.  An example is written
to encapsulate a packet with an IPv4/v6 header and then XDP_TX it
out.

Thanks,
--Martin



Re: [PATCH net-next] liquidio: 'imply' ptp instead of 'select'

2016-12-02 Thread David Daney

On 12/02/2016 03:04 PM, Arnd Bergmann wrote:

ptp now depends on the optional POSIX_TIMERS setting and fails to build
if we select it without that:

warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet direct 
dependencies (NET && POSIX_TIMERS)
warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet direct 
dependencies (NET && POSIX_TIMERS)
ERROR: "posix_clock_unregister" [drivers/ptp/ptp.ko] undefined!
ERROR: "posix_clock_register" [drivers/ptp/ptp.ko] undefined!
ERROR: "pps_unregister_source" [drivers/ptp/ptp.ko] undefined!
ERROR: "pps_event" [drivers/ptp/ptp.ko] undefined!
ERROR: "pps_register_source" [drivers/ptp/ptp.ko] undefined!

It seems that two patches have collided here, the build failure
is a result of the combination. Changing the new option to 'imply'
as well fixes it.

Fixes: 111fc64a237f ("liquidio CN23XX: VF registration")
Fixes: d1cbfd771ce8 ("ptp_clock: Allow for it to be optional")
Signed-off-by: Arnd Bergmann 


I didn't know about this new "imply" thing.  This seems like a plausible 
fix, so...


Acked-by: David Daney 

Thanks for fixing this up.



---
 drivers/net/ethernet/cavium/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/Kconfig 
b/drivers/net/ethernet/cavium/Kconfig
index bbc8bd16cb97..dcbce6cac63e 100644
--- a/drivers/net/ethernet/cavium/Kconfig
+++ b/drivers/net/ethernet/cavium/Kconfig
@@ -77,7 +77,7 @@ config OCTEON_MGMT_ETHERNET
 config LIQUIDIO_VF
tristate "Cavium LiquidIO VF support"
depends on 64BIT && PCI_MSI
-   select PTP_1588_CLOCK
+   imply PTP_1588_CLOCK
---help---
  This driver supports Cavium LiquidIO Intelligent Server Adapter
  based on CN23XX chips.





Re: bpf bounded loops. Was: [flamebait] xdp

2016-12-02 Thread Alexei Starovoitov
On Fri, Dec 02, 2016 at 08:42:41PM +0100, Hannes Frederic Sowa wrote:
> On Fri, Dec 2, 2016, at 20:25, Hannes Frederic Sowa wrote:
> > On 02.12.2016 19:39, Alexei Starovoitov wrote:
> > > On Thu, Dec 01, 2016 at 10:27:12PM +0100, Hannes Frederic Sowa wrote:
> > >> like") and the problematic of parsing DNS packets in XDP due to string
> > >> processing and looping inside eBPF.
> > > 
> > > Hannes,
> > > Not too long ago you proposed a very interesting idea to add
> > > support for bounded loops without adding any new bpf instructions and
> > > changing llvm (which was way better than my 'rep' like instructions
> > > I was experimenting with). I thought systemtap guys also wanted bounded
> > > loops and you were cooperating on the design, so I gave up on my work and
> > > was expecting an imminent patch from you. I guess it sounds like you now
> > > believe that bounded loops are impossible or I misunderstand your 
> > > statement ?
> > 
> > Your argument was that it would need a new verifier as the current first
> > pass checks that we indeed can lay out the basic blocks as a DAG which
> > the second pass depends on. This would be violated.

yes. today the main part of verifier depends on cfg check that confirms DAG
property of the program. This was done as a simplification for the algorithm,
so any programmer that understands C can understand the verifier code.
It certainly was the case, since most of the people who hacked
verifier had zero compiler background.
Now I'm thinking to introduce proper compiler technologies to it.
On one side it will make the bar to understand higher and on the other
side it will cleanup the logic and reuse tens of years of data flow
analysis theory and will make verifier more robust and mathematically
solid.

> > Because eBPF is available by non privileged users this would need a lot
> > of effort to rewrite and verify (or indeed keep two verifiers in the
> > kernel for priv and non-priv). The verifier itself is exposed to
> > unprivileged users.

I certainly hear your concerns that people unfamiliar with it are simply
scared that more and more verification logic is being added. So I don't mind
freezing current verifier for unpriv and let proper data flow analysis
to be done in root only component.

> > Also, by design, if we keep the current limits, this would not give you
> > more instructions to operate on compared to the flattened version of the
> > program, it would merely reduce the numbers of optimizations in LLVM
> > that let the verifier reject the program.

I think we most likely will keep 4k insn limit (since there were no
requests to increase it). The bounded loops will improve performance
and reduce I-cache misses.

> The only solution to protect the verifier, which I saw, would be to
> limit it by time and space, thus making loading of eBPF programs
> depending on how fast and hot (thermal throttling) one CPU thread is.

the verifier already has time and space limits.
See no reason to rely on physical cpu sensors.

> Those are the complexity problems I am talking and concerned about.

Do you have concerns when people implement an encryption algorithm
that you're unfamiliar with?
Isn't it much bigger concern, since any bugs in the algorithm
are directly exploitable and when encryption is actually used
it's protecting sensitive data, whereas here the verifier
protects kernel from crashing.



[PATCH net-next v3 2/2] net: stmmac: dwmac-meson8b: make the RGMII TX delay configurable

2016-12-02 Thread Martin Blumenstingl
Prior to this patch we were using a hardcoded RGMII TX clock delay of
2ns (= 1/4 cycle of the 125MHz RGMII TX clock). This value works for
many boards, but unfortunately not for all (due to the way the actual
circuit is designed, sometimes because the TX delay is enabled in the
PHY, etc.). Making the TX delay on the MAC side configurable allows us
to support all possible hardware combinations.

This allows fixing a compatibility issue on some boards, where the
RTL8211F PHY is configured to generate the TX delay. We can now turn
off the TX delay in the MAC, because otherwise we would be applying the
delay twice (which results in non-working TX traffic).

Signed-off-by: Martin Blumenstingl 
Tested-by: Neil Armstrong 
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 21 +++--
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
index 250e4ce..dad31b0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
@@ -35,10 +35,6 @@
 
 #define PRG_ETH0_TXDLY_SHIFT   5
 #define PRG_ETH0_TXDLY_MASKGENMASK(6, 5)
-#define PRG_ETH0_TXDLY_OFF (0x0 << PRG_ETH0_TXDLY_SHIFT)
-#define PRG_ETH0_TXDLY_QUARTER (0x1 << PRG_ETH0_TXDLY_SHIFT)
-#define PRG_ETH0_TXDLY_HALF(0x2 << PRG_ETH0_TXDLY_SHIFT)
-#define PRG_ETH0_TXDLY_THREE_QUARTERS  (0x3 << PRG_ETH0_TXDLY_SHIFT)
 
 /* divider for the result of m250_sel */
 #define PRG_ETH0_CLK_M250_DIV_SHIFT7
@@ -69,6 +65,8 @@ struct meson8b_dwmac {
 
struct clk_divider  m25_div;
struct clk  *m25_div_clk;
+
+   u32 tx_delay_ns;
 };
 
 static void meson8b_dwmac_mask_bits(struct meson8b_dwmac *dwmac, u32 reg,
@@ -179,6 +177,7 @@ static int meson8b_init_prg_eth(struct meson8b_dwmac *dwmac)
 {
int ret;
unsigned long clk_rate;
+   u8 tx_dly_val;
 
switch (dwmac->phy_mode) {
case PHY_INTERFACE_MODE_RGMII:
@@ -196,9 +195,13 @@ static int meson8b_init_prg_eth(struct meson8b_dwmac 
*dwmac)
meson8b_dwmac_mask_bits(dwmac, PRG_ETH0,
PRG_ETH0_INVERTED_RMII_CLK, 0);
 
-   /* TX clock delay - all known boards use a 1/4 cycle delay */
+   /* TX clock delay in ns = "8ns / 4 * tx_dly_val" (where
+* 8ns are exactly one cycle of the 125MHz RGMII TX clock):
+* 0ns = 0x0, 2ns = 0x1, 4ns = 0x2, 6ns = 0x3
+*/
+   tx_dly_val = dwmac->tx_delay_ns >> 1;
meson8b_dwmac_mask_bits(dwmac, PRG_ETH0, PRG_ETH0_TXDLY_MASK,
-   PRG_ETH0_TXDLY_QUARTER);
+   tx_dly_val << PRG_ETH0_TXDLY_SHIFT);
break;
 
case PHY_INTERFACE_MODE_RMII:
@@ -277,6 +280,12 @@ static int meson8b_dwmac_probe(struct platform_device 
*pdev)
if (dwmac->phy_mode < 0) {
dev_err(&pdev->dev, "missing phy-mode property\n");
return -EINVAL;
+   } else if (dwmac->phy_mode != PHY_INTERFACE_MODE_RMII) {
+   /* ignore errors as this is an optional property - by default
+* we assume a TX delay of 0ns.
+*/
+   of_property_read_u32(pdev->dev.of_node, "amlogic,tx-delay-ns",
+&dwmac->tx_delay_ns);
}
 
ret = meson8b_init_clk(dwmac);
-- 
2.10.2



[PATCH net-next v3 1/2] net: dt-bindings: add RGMII TX delay configuration to meson8b-dwmac

2016-12-02 Thread Martin Blumenstingl
This allows configuring the RGMII TX clock delay. The RGMII clock is
generated by underlying hardware of the Meson 8b / GXBB DWMAC glue.
The configuration depends on the actual hardware (no delay may be
needed due to the design of the actual circuit, the PHY might add this
delay, etc.).

Signed-off-by: Martin Blumenstingl 
Tested-by: Neil Armstrong 
---
 Documentation/devicetree/bindings/net/meson-dwmac.txt | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/meson-dwmac.txt 
b/Documentation/devicetree/bindings/net/meson-dwmac.txt
index 89e62dd..f8bc540 100644
--- a/Documentation/devicetree/bindings/net/meson-dwmac.txt
+++ b/Documentation/devicetree/bindings/net/meson-dwmac.txt
@@ -25,6 +25,20 @@ Required properties on Meson8b and newer:
- "clkin0" - first parent clock of the internal mux
- "clkin1" - second parent clock of the internal mux
 
+Optional properties on Meson8b and newer:
+- amlogic,tx-delay-ns: The internal RGMII TX clock delay (provided
+   by this driver) in nanoseconds. Allowed values
+   are: 0ns, 2ns, 4ns, 6ns.
+   This must be configured when the phy-mode is
+   "rgmii" (typically a value of 2ns is used in
+   this case).
+   When phy-mode is set to "rgmii-id" or
+   "rgmii-txid" the TX clock delay is already
+   provided by the PHY. In that case this
+   property should be set to 0ns (which disables
+   the TX clock delay in the MAC to prevent the
+   clock from going off because both PHY and MAC
+   are adding a delay).
 
 Example for Meson6:
 
-- 
2.10.2



[PATCH net-next v3 0/2] stmmac: dwmac-meson8b: configurable RGMII TX delay

2016-12-02 Thread Martin Blumenstingl
Currently the dwmac-meson8b stmmac glue driver uses a hardcoded 1/4
cycle (= 2ns) TX clock delay. This seems to work fine for many boards
(for example Odroid-C2 or Amlogic's reference boards) but there are
some others where TX traffic is simply broken.
There are probably multiple reasons why it's working on some boards
while it's broken on others:
- some of Amlogic's reference boards are using a Micrel PHY
- hardware circuit design
- maybe more...

iperf3 results on my Mecool BB2 board (Meson GXM, RTL8211F PHY) with
TX clock delay disabled on the MAC (as it's enabled in the PHY driver).
TX throughput was virtually zero before:
$ iperf3 -c 192.168.1.100 -R
Connecting to host 192.168.1.100, port 5201
Reverse mode, remote host 192.168.1.100 is sending
[  4] local 192.168.1.206 port 52828 connected to 192.168.1.100 port 5201
[ ID] Interval   Transfer Bandwidth
[  4]   0.00-1.00   sec   108 MBytes   901 Mbits/sec
[  4]   1.00-2.00   sec  94.2 MBytes   791 Mbits/sec
[  4]   2.00-3.00   sec  96.5 MBytes   810 Mbits/sec
[  4]   3.00-4.00   sec  96.2 MBytes   808 Mbits/sec
[  4]   4.00-5.00   sec  96.6 MBytes   810 Mbits/sec
[  4]   5.00-6.00   sec  96.5 MBytes   810 Mbits/sec
[  4]   6.00-7.00   sec  96.6 MBytes   810 Mbits/sec
[  4]   7.00-8.00   sec  96.5 MBytes   809 Mbits/sec
[  4]   8.00-9.00   sec   105 MBytes   884 Mbits/sec
[  4]   9.00-10.00  sec   111 MBytes   934 Mbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval   Transfer Bandwidth   Retr
[  4]   0.00-10.00  sec  1000 MBytes   839 Mbits/sec0 sender
[  4]   0.00-10.00  sec   998 MBytes   837 Mbits/sec  receiver

iperf Done.
$ iperf3 -c 192.168.1.100
Connecting to host 192.168.1.100, port 5201
[  4] local 192.168.1.206 port 52832 connected to 192.168.1.100 port 5201
[ ID] Interval   Transfer Bandwidth   Retr  Cwnd
[  4]   0.00-1.01   sec  99.5 MBytes   829 Mbits/sec  117139 KBytes
[  4]   1.01-2.00   sec   105 MBytes   884 Mbits/sec  129   70.7 KBytes
[  4]   2.00-3.01   sec   107 MBytes   889 Mbits/sec  106187 KBytes
[  4]   3.01-4.01   sec   105 MBytes   878 Mbits/sec   92143 KBytes
[  4]   4.01-5.00   sec   105 MBytes   882 Mbits/sec  140129 KBytes
[  4]   5.00-6.01   sec   106 MBytes   883 Mbits/sec  115195 KBytes
[  4]   6.01-7.00   sec   102 MBytes   863 Mbits/sec  133   70.7 KBytes
[  4]   7.00-8.01   sec   106 MBytes   884 Mbits/sec  143   97.6 KBytes
[  4]   8.01-9.01   sec   104 MBytes   875 Mbits/sec  124107 KBytes
[  4]   9.01-10.01  sec   105 MBytes   876 Mbits/sec   90139 KBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval   Transfer Bandwidth   Retr
[  4]   0.00-10.01  sec  1.02 GBytes   874 Mbits/sec  1189 sender
[  4]   0.00-10.01  sec  1.02 GBytes   873 Mbits/sec  receiver

iperf Done.

I get similar TX throughput on my Meson GXBB "MXQ Pro+" board when I
disable the PHY's TX-delay and configure a 4ns TX-delay on the MAC.
So changes to at least the RTL8211F PHY driver are needed to get it
working properly in all situations.

Changes since v2:
- moved all .dts patches (3-7) to a separate series
- removed the default 2ns TX delay when phy-mode RGMII is specified
- (rebased against current net-next)

Changes since v1:
- renamed the devicetree property "amlogic,tx-delay" to
  "amlogic,tx-delay-ns", which makes the .dts easier to read as we can
  simply specify human-readable values instead of having "preprocessor
  defines and calculation in human brain". Thanks to Andrew Lunn for
  the suggestion!
- improved documentation to indicate when the MAC TX-delay should be
  configured and how to use the PHY's TX-delay
- changed the default TX-delay in the dwmac-meson8b driver from 2ns
  to 0ns when any of the rgmii-*id modes are used (the 2ns default
  value still applies for phy-mode "rgmii")
- added patches to properly reset the PHY on Meson GXBB devices and to
  use a configuration similar to the one we use on Meson GXL devices
  (by passing a phy-handle to stmmac and defining the PHY in the mdio0
  bus - patch 3-6)
- add the "amlogic,tx-delay-ns" property to all boards which are using
  the RGMII PHY (patch 7)


Martin Blumenstingl (2):
  net: dt-bindings: add RGMII TX delay configuration to meson8b-dwmac
  net: stmmac: dwmac-meson8b: make the RGMII TX delay configurable

 .../devicetree/bindings/net/meson-dwmac.txt | 14 ++
 drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 21 +++--
 2 files changed, 29 insertions(+), 6 deletions(-)

-- 
2.10.2



Re: [PATCH 2/7] net: ethernet: ti: cpdma: fix desc re-queuing

2016-12-02 Thread Ivan Khoronzhuk
On Fri, Dec 02, 2016 at 10:45:07AM -0600, Grygorii Strashko wrote:
> 
> 
> On 12/02/2016 05:03 AM, Ivan Khoronzhuk wrote:
> > On Thu, Dec 01, 2016 at 05:34:27PM -0600, Grygorii Strashko wrote:
> >> The currently processing cpdma descriptor with EOQ flag set may
> >> contain two values in Next Descriptor Pointer field:
> >> - valid pointer: means CPDMA missed addition of new desc in queue;
> > It shouldn't happen in normal circumstances, right?
> 
> it might happen, because desc push compete with desc pop.
> You can check stats values:
> chan->stats.misqueued
> chan->stats.requeue
>  under different types of net-loads.
I've done this, of-course.
By whole logic the misqueued counter has to cover all cases.
But that's not true.

> 
> TRM:
> "
> If the pNext pointer is initially NULL, and more packets need to be queued 
> for transmit, the software
> application may alter this pointer to point to a newly appended descriptor. 
> The EMAC will use the new
> pointer value and proceed to the next descriptor unless the pNext value has 
> already been read. In this
> latter case, the transmitter will halt on the transmit channel in question, 
> and the software application may
> restart it at that time. The software can detect this case by checking for an 
> end of queue (EOQ) condition
> flag on the updated packet descriptor when it is returned by the EMAC.
> "
That's true. No issues in desc.
In the code there is no place that updates next_desc except the submit function.

And this case is supposed to be caught here:
For submit:
cpdma_chan_submit()
spin_lock_irqsave(&chan->lock);
...
--->__cpdma_chan_submit()
...
--> desc_write(prev, hw_next, desc_dma); // here next pointer is updated, 
it can be not in time
...
--> mode = desc_read(prev, hw_mode); // pay attention, it's read after 
updating next pointer
--> if ((mode & CPDMA_DESC_EOQ) &&
--> (chan->state == CPDMA_STATE_ACTIVE)) { // here checked if it was late 
update
-> chan_write(chan, hdp, desc_dma); // here transmit is restarted, if 
needed

For process it only caught the fact of late update, but it has to be caught in
submit() already:
__cpdma_chan_process()
spin_lock_irqsave(&chan->lock);
--> if (mode & CPDMA_DESC_EOQ) // here transmit is restarted, if needed
-> chan_write(chan, hdp, desc_dma); // but w/o updating next pointer

Seems there is no place where hw_next is updated w/o updating hdp :-| in case
of late hw_next set. And that is strange. I know it happens, I've checked it
before of-course. Then I thought, maybe there is some problem with write order,
thus out of sync, nothing more.

> 
> 
> > So, why it happens only for egress channels? And Does that mean
> > there is some resynchronization between submit and process function,
> > or this is h/w issue?
> 
> no hw issues. this patch just removes one unnecessary I/O access 
No objections against the patch. Anyway it's better than before.
Just want to know the real reason why it happens, maybe there is something else.

> 
> > 
> >> - null: no more descriptors in queue.
> > >> In the latter case, it's not required to write to HDP register, but now
> >> CPDMA does it.
> >>
> >> Hence, add additional check for Next Descriptor Pointer != null in
> >> cpdma_chan_process() function before writing in HDP register.
> >>
> >> Signed-off-by: Grygorii Strashko 
> >> ---
> >>  drivers/net/ethernet/ti/davinci_cpdma.c | 2 +-
> >>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c 
> >> b/drivers/net/ethernet/ti/davinci_cpdma.c
> >> index 0924014..379314f 100644
> >> --- a/drivers/net/ethernet/ti/davinci_cpdma.c
> >> +++ b/drivers/net/ethernet/ti/davinci_cpdma.c
> >> @@ -1152,7 +1152,7 @@ static int __cpdma_chan_process(struct cpdma_chan 
> >> *chan)
> >>chan->count--;
> >>chan->stats.good_dequeue++;
> >>  
> >> -  if (status & CPDMA_DESC_EOQ) {
> >> +  if ((status & CPDMA_DESC_EOQ) && chan->head) {
> >>chan->stats.requeue++;
> >>chan_write(chan, hdp, desc_phys(pool, chan->head));
> >>}
> >> -- 
> >> 2.10.1
> >>
> 
> -- 
> regards,
> -grygorii


Re: [PATCH] iproute2: ss: escape all null bytes in abstract unix domain socket

2016-12-02 Thread Eric Dumazet
On Fri, 2016-12-02 at 15:18 -0800, Stephen Hemminger wrote:
> name[i] = '@';
> > 
> > ss.c: In function 'unix_show_sock':
> > ss.c:3128:4: error: 'for' loop initial declarations are only allowed in C99 
> > mode
> > ss.c:3128:4: note: use option -std=c99 or -std=gnu99 to compile your code
> > make[1]: *** [ss.o] Error 1
> > 
> > 
> > 
> 
> Thanks, fixed by patch from Simon

Right, thanks !




Re: [PATCH net-next] liquidio: 'imply' ptp instead of 'select'

2016-12-02 Thread Nicolas Pitre
On Sat, 3 Dec 2016, Arnd Bergmann wrote:

> ptp now depends on the optional POSIX_TIMERS setting and fails to build
> if we select it without that:
> 
> warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet 
> direct dependencies (NET && POSIX_TIMERS)
> warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet 
> direct dependencies (NET && POSIX_TIMERS)
> ERROR: "posix_clock_unregister" [drivers/ptp/ptp.ko] undefined!
> ERROR: "posix_clock_register" [drivers/ptp/ptp.ko] undefined!
> ERROR: "pps_unregister_source" [drivers/ptp/ptp.ko] undefined!
> ERROR: "pps_event" [drivers/ptp/ptp.ko] undefined!
> ERROR: "pps_register_source" [drivers/ptp/ptp.ko] undefined!
> 
> It seems that two patches have collided here, the build failure
> is a result of the combination. Changing the new option to 'imply'
> as well fixes it.
> 
> Fixes: 111fc64a237f ("liquidio CN23XX: VF registration")
> Fixes: d1cbfd771ce8 ("ptp_clock: Allow for it to be optional")
> Signed-off-by: Arnd Bergmann 

Acked-by: Nicolas Pitre 

> ---
>  drivers/net/ethernet/cavium/Kconfig | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/cavium/Kconfig 
> b/drivers/net/ethernet/cavium/Kconfig
> index bbc8bd16cb97..dcbce6cac63e 100644
> --- a/drivers/net/ethernet/cavium/Kconfig
> +++ b/drivers/net/ethernet/cavium/Kconfig
> @@ -77,7 +77,7 @@ config OCTEON_MGMT_ETHERNET
>  config LIQUIDIO_VF
>   tristate "Cavium LiquidIO VF support"
>   depends on 64BIT && PCI_MSI
> - select PTP_1588_CLOCK
> + imply PTP_1588_CLOCK
>   ---help---
> This driver supports Cavium LiquidIO Intelligent Server Adapter
> based on CN23XX chips.
> -- 
> 2.9.0
> 
> 


Re: [PATCH] iproute2: ss: escape all null bytes in abstract unix domain socket

2016-12-02 Thread Stephen Hemminger
On Fri, 02 Dec 2016 10:59:56 -0800
Eric Dumazet  wrote:

> On Sat, 2016-11-12 at 10:17 +0300, Stephen Hemminger wrote:
> > On Sat, 29 Oct 2016 22:20:19 +0300
> > Isaac Boukris  wrote:
> >   
> > > Abstract unix domain socket may embed null characters,
> > > these should be translated to '@' when printed by ss the
> > > same way the null prefix is currently being translated.
> > > 
> > > Signed-off-by: Isaac Boukris   
> > 
> > Applied  
> 
> Probably not a good idea to have :
> 
>for (int i = 0; i < len; i++)
>if (name[i] == '\0')
>name[i] = '@';
> 
> ss.c: In function 'unix_show_sock':
> ss.c:3128:4: error: 'for' loop initial declarations are only allowed in C99 
> mode
> ss.c:3128:4: note: use option -std=c99 or -std=gnu99 to compile your code
> make[1]: *** [ss.o] Error 1
> 
> 
> 

Thanks, fixed by patch from Simon


Re: iproute2 public git outdated?

2016-12-02 Thread Stephen Hemminger
On Thu, 1 Dec 2016 13:18:06 +0100
Phil Sutter  wrote:

> Hi,
> 
> I am using iproute2's public git repo at this URL:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git
> 
> To my surprise, neither master nor net-next branches have received new
> commits since end of October. Did the repo location change or was it
> just not updated for a while?
> 
> Thanks, Phil

I was on the road, and moving between houses. Sorry for the extended delay.


Avoid deadlock situation due to use of xmit_lock

2016-12-02 Thread Lino Sanfilippo
Hi,

after stumbling over a potential deadlock situation in the altera driver 
(see http://marc.info/?l=linux-netdev&m=148054615230447&w=2), I checked
all other ethernet drivers for the same issue and actually found it in 2
more, namely stmmac, and sxgbe. Please see the commit messages for a 
description of the problem.
These 2 patches fix the affected drivers.

Regards,
Lino



[PATCH 1/2] net: ethernet: sxgbe: do not use xmit_lock in tx completion handler

2016-12-02 Thread Lino Sanfilippo
The driver already uses its private lock for synchronization between the
xmit function and the xmit completion handler, making the additional use of
the xmit_lock unnecessary.
Furthermore the driver does not set NETIF_F_LLTX resulting in xmit to be
called with the xmit_lock held and then taking the private lock.
On the other hand the xmit completion handler uses the reverse locking
order, by first taking the private lock, and then the xmit_lock, which
leads to the potential danger of a deadlock.

Fix this issue by not taking the xmit_lock in the completion handler.
By doing this also remove an unnecessary double check for a stopped tx
queue.

Signed-off-by: Lino Sanfilippo 
---
 drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

Please note that this patch is only compile tested.

diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c 
b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
index 5dbe406..578cbec 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
@@ -782,14 +782,9 @@ static void sxgbe_tx_queue_clean(struct sxgbe_tx_queue 
*tqueue)
/* wake up queue */
if (unlikely(netif_tx_queue_stopped(dev_txq) &&
 sxgbe_tx_avail(tqueue, tx_rsize) > SXGBE_TX_THRESH(priv))) 
{
-   netif_tx_lock(priv->dev);
-   if (netif_tx_queue_stopped(dev_txq) &&
-   sxgbe_tx_avail(tqueue, tx_rsize) > SXGBE_TX_THRESH(priv)) {
-   if (netif_msg_tx_done(priv))
-   pr_debug("%s: restart transmit\n", __func__);
-   netif_tx_wake_queue(dev_txq);
-   }
-   netif_tx_unlock(priv->dev);
+   if (netif_msg_tx_done(priv))
+   pr_debug("%s: restart transmit\n", __func__);
+   netif_tx_wake_queue(dev_txq);
}
 
spin_unlock(&tqueue->tx_lock);
-- 
1.9.1



[PATCH 2/2] net: ethernet: stmmac: do not use xmit_lock in tx completion handler

2016-12-02 Thread Lino Sanfilippo
The driver already uses its private lock for synchronization between the
xmit function and the xmit completion handler, making the additional use of
the xmit_lock unnecessary.
Furthermore the driver does not set NETIF_F_LLTX resulting in xmit to be
called with the xmit_lock held and then taking the private lock.
On the other hand the xmit completion handler uses the reverse locking
order, by first taking the private lock, and then the xmit_lock, which
leads to the potential danger of a deadlock.

Fix this issue by not taking the xmit_lock in the completion handler.
By doing this also remove an unnecessary double check for a stopped tx
queue.

Signed-off-by: Lino Sanfilippo 
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

Please note that this patch is only compile tested.

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 48a4e84..8def423 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1380,14 +1380,9 @@ static void stmmac_tx_clean(struct stmmac_priv *priv)
 
if (unlikely(netif_queue_stopped(priv->dev) &&
 stmmac_tx_avail(priv) > STMMAC_TX_THRESH)) {
-   netif_tx_lock(priv->dev);
-   if (netif_queue_stopped(priv->dev) &&
-   stmmac_tx_avail(priv) > STMMAC_TX_THRESH) {
-   netif_dbg(priv, tx_done, priv->dev,
- "%s: restart transmit\n", __func__);
-   netif_wake_queue(priv->dev);
-   }
-   netif_tx_unlock(priv->dev);
+   netif_dbg(priv, tx_done, priv->dev,
+ "%s: restart transmit\n", __func__);
+   netif_wake_queue(priv->dev);
}
 
if ((priv->eee_enabled) && (!priv->tx_path_in_lpi_mode)) {
-- 
1.9.1



Re: [PATCH/RFC iproute2/net-next 1/3] tc: flower: update headers for TCA_FLOWER_KEY_ICMP*

2016-12-02 Thread Stephen Hemminger
On Fri,  2 Dec 2016 10:59:43 +0100
Simon Horman  wrote:

> These are proposed changes for net-next.
> 
> Signed-off-by: Simon Horman 
> ---
>  include/linux/pkt_cls.h | 10 ++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
> index a3d8a4f17d8e..fa435ea8ad21 100644
> --- a/include/linux/pkt_cls.h
> +++ b/include/linux/pkt_cls.h
> @@ -403,6 +403,16 @@ enum {
>   TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,   /* be16 */
>   TCA_FLOWER_KEY_ENC_UDP_DST_PORT,/* be16 */
>   TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,   /* be16 */
> +
> + TCA_FLOWER_KEY_ICMPV4_CODE, /* u8 */
> + TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */
> + TCA_FLOWER_KEY_ICMPV4_TYPE, /* u8 */
> + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */
> + TCA_FLOWER_KEY_ICMPV6_CODE, /* u8 */
> + TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */
> + TCA_FLOWER_KEY_ICMPV6_TYPE, /* u8 */
> + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */
> +
>   __TCA_FLOWER_MAX,
>  };
>  

I picked this up by updating from kernel headers.


Re: [[PATCH iproute2/net-next v2] 2/4] tc: flower: document SCTP ip_proto

2016-12-02 Thread Stephen Hemminger
On Fri,  2 Dec 2016 09:45:19 +0100
Simon Horman  wrote:

> Add SCTP ip_proto to help text and man page.
> 
> Signed-off-by: Simon Horman 

This doesn't apply cleanly to current net-next git.
Probably some of the other man page changes caused reject.



Re: [[PATCH iproute2/net-next v2] 1/4] tc: flower: remove references to eth_type in manpage

2016-12-02 Thread Stephen Hemminger
On Fri,  2 Dec 2016 09:45:18 +0100
Simon Horman  wrote:

> Remove references to eth_type and ether_type (spelling error) in
> the tc flower manpage.
> 
> Also correct formatting of boldface text with whitespace.
> 
> Cc: Paul Blakey 
> Signed-off-by: Simon Horman 

Applied this one. Later ones still need rebase.


[PATCH net-next] liquidio: 'imply' ptp instead of 'select'

2016-12-02 Thread Arnd Bergmann
ptp now depends on the optional POSIX_TIMERS setting and fails to build
if we select it without that:

warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet direct 
dependencies (NET && POSIX_TIMERS)
warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet direct 
dependencies (NET && POSIX_TIMERS)
ERROR: "posix_clock_unregister" [drivers/ptp/ptp.ko] undefined!
ERROR: "posix_clock_register" [drivers/ptp/ptp.ko] undefined!
ERROR: "pps_unregister_source" [drivers/ptp/ptp.ko] undefined!
ERROR: "pps_event" [drivers/ptp/ptp.ko] undefined!
ERROR: "pps_register_source" [drivers/ptp/ptp.ko] undefined!

It seems that two patches have collided here, the build failure
is a result of the combination. Changing the new option to 'imply'
as well fixes it.

Fixes: 111fc64a237f ("liquidio CN23XX: VF registration")
Fixes: d1cbfd771ce8 ("ptp_clock: Allow for it to be optional")
Signed-off-by: Arnd Bergmann 
---
 drivers/net/ethernet/cavium/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/Kconfig 
b/drivers/net/ethernet/cavium/Kconfig
index bbc8bd16cb97..dcbce6cac63e 100644
--- a/drivers/net/ethernet/cavium/Kconfig
+++ b/drivers/net/ethernet/cavium/Kconfig
@@ -77,7 +77,7 @@ config OCTEON_MGMT_ETHERNET
 config LIQUIDIO_VF
tristate "Cavium LiquidIO VF support"
depends on 64BIT && PCI_MSI
-   select PTP_1588_CLOCK
+   imply PTP_1588_CLOCK
---help---
  This driver supports Cavium LiquidIO Intelligent Server Adapter
  based on CN23XX chips.
-- 
2.9.0



[PATCH net-next] phy: add phy fixup unregister functions

2016-12-02 Thread Woojung.Huh
From: Woojung Huh 

Add functions to unregister phy fixup for modules.

phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask)
 Unregister phy fixup matches bus_id, phy_uid and phy_uid_mask
 from phy_fixup_list.
 Return 0 when find matched one and remove from the list.
 Return -ENODEV when fail to find it on the list.

phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask)
 Unregister phy fixup from phy_fixup_list.
 Use it for fixup registered by phy_register_fixup_for_uid()
 Return 0 when find matched one and remove from the list.
 Return -ENODEV when fail to find it on the list.

phy_unregister_fixup_for_id(const char *bus_id)
 Unregister phy fixup from phy_fixup_list.
 Use it for fixup registered by phy_register_fixup_for_id()
 Return 0 when find matched one and remove from the list.
 Return -ENODEV when fail to find it on the list.

Signed-off-by: Woojung Huh 
---
 Documentation/networking/phy.txt |  9 
 drivers/net/phy/phy_device.c | 47 
 include/linux/phy.h  |  4 
 3 files changed, 60 insertions(+)

diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt
index e017d93..16f90d8 100644
--- a/Documentation/networking/phy.txt
+++ b/Documentation/networking/phy.txt
@@ -407,6 +407,15 @@ Board Fixups
  The stubs set one of the two matching criteria, and set the other one to
  match anything.
 
+ When phy_register_fixup() or *_for_uid()/*_for_id() is called from a module,
+ the fixup must be unregistered, which also frees its allocated memory.
+
+ Call one of the following functions before unloading the module.
+
+ int phy_unregister_fixup(const char *phy_id, u32 phy_uid, u32 phy_uid_mask);
+ int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask);
+ int phy_unregister_fixup_for_id(const char *phy_id);
+
 Standards
 
  IEEE Standard 802.3: CSMA/CD Access Method and Physical Layer Specifications, 
Section Two:
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index aeaf1bc..32fa7c7 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -235,6 +235,53 @@ int phy_register_fixup_for_id(const char *bus_id,
 }
 EXPORT_SYMBOL(phy_register_fixup_for_id);
 
+/**
+ * phy_unregister_fixup - remove a phy_fixup from the list
+ * @bus_id: A string matches fixup->bus_id (or PHY_ANY_ID) in phy_fixup_list
+ * @phy_uid: A phy id matches fixup->phy_id (or PHY_ANY_UID) in phy_fixup_list
+ * @phy_uid_mask: Applied to phy_uid and fixup->phy_uid before comparison
+ */
+int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask)
+{
+   struct list_head *pos, *n;
+   struct phy_fixup *fixup;
+   int ret;
+
+   ret = -ENODEV;
+
+   mutex_lock(&phy_fixup_lock);
+   list_for_each_safe(pos, n, &phy_fixup_list) {
+   fixup = list_entry(pos, struct phy_fixup, list);
+
+   if ((!strcmp(fixup->bus_id, bus_id)) &&
+   ((fixup->phy_uid & phy_uid_mask) ==
+(phy_uid & phy_uid_mask))) {
+   list_del(&fixup->list);
+   kfree(fixup);
+   ret = 0;
+   break;
+   }
+   }
+   mutex_unlock(&phy_fixup_lock);
+
+   return ret;
+}
+EXPORT_SYMBOL(phy_unregister_fixup);
+
+/* Unregisters a fixup of any PHY with the UID in phy_uid */
+int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask)
+{
+   return phy_unregister_fixup(PHY_ANY_ID, phy_uid, phy_uid_mask);
+}
+EXPORT_SYMBOL(phy_unregister_fixup_for_uid);
+
+/* Unregisters a fixup of the PHY with id string bus_id */
+int phy_unregister_fixup_for_id(const char *bus_id)
+{
+   return phy_unregister_fixup(bus_id, PHY_ANY_UID, 0xffffffff);
+}
+EXPORT_SYMBOL(phy_unregister_fixup_for_id);
+
 /* Returns 1 if fixup matches phydev in bus_id and phy_uid.
  * Fixups can be set to match any in one or more fields.
  */
diff --git a/include/linux/phy.h b/include/linux/phy.h
index b53177f..745661d 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -859,6 +859,10 @@ int phy_register_fixup_for_id(const char *bus_id,
 int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask,
   int (*run)(struct phy_device *));
 
+int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask);
+int phy_unregister_fixup_for_id(const char *bus_id);
+int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask);
+
 int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable);
 int phy_get_eee_err(struct phy_device *phydev);
 int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data);
-- 
2.7.4


Re: [PATCH net-next v2 6/6] tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING

2016-12-02 Thread Yuchung Cheng
On Fri, Dec 2, 2016 at 2:30 PM, Paul Gortmaker
 wrote:
> On Mon, Nov 28, 2016 at 2:07 AM, Yuchung Cheng  wrote:
>> From: Francis Yan 
>>
>> This patch exports the sender chronograph stats via the socket
>> SO_TIMESTAMPING channel. Currently we can instrument how long a
>> particular application unit of data was queued in TCP by tracking
>> SOF_TIMESTAMPING_TX_SOFTWARE and SOF_TIMESTAMPING_TX_SCHED. Having
>
> Seems a new linux-next failure leads back to here ; I did not run a
> full bisect, since the variable seems confined to this commit:
>
> net/socket.c:701: error: 'SCM_TIMESTAMPING_OPT_STATS' undeclared
> (first use in this function)
>
> http://kisskb.ellerman.id.au/kisskb/buildresult/12875981/
oops didn't patch avr32 arch. Does this fix work?

>
> Paul.
> --
>
>> these sender chronograph stats exported simultaneously along with
>> these timestamps allow further breaking down the various sender
>> limitation.  For example, a video server can tell if a particular
>> chunk of video on a connection takes a long time to deliver because
>> TCP was experiencing small receive window. It is not possible to
>> tell before this patch without packet traces.
>>
>> To prepare these stats, the user needs to set
>> SOF_TIMESTAMPING_OPT_STATS and SOF_TIMESTAMPING_OPT_TSONLY flags
>> while requesting other SOF_TIMESTAMPING TX timestamps. When the
>> timestamps are available in the error queue, the stats are returned
>> in a separate control message of type SCM_TIMESTAMPING_OPT_STATS,
>> in a list of TLVs (struct nlattr) of types: TCP_NLA_BUSY_TIME,
>> TCP_NLA_RWND_LIMITED, TCP_NLA_SNDBUF_LIMITED. Unit is microsecond.
>>
>> Signed-off-by: Francis Yan 
>> Signed-off-by: Yuchung Cheng 
>> Signed-off-by: Soheil Hassas Yeganeh 
>> ---
>> ChangeLog since v1:
>>  - fix build break if CONFIG_INET is not defined
>>
>>  Documentation/networking/timestamping.txt | 10 ++
>>  arch/alpha/include/uapi/asm/socket.h  |  2 ++
>>  arch/frv/include/uapi/asm/socket.h|  2 ++
>>  arch/ia64/include/uapi/asm/socket.h   |  2 ++
>>  arch/m32r/include/uapi/asm/socket.h   |  2 ++
>>  arch/mips/include/uapi/asm/socket.h   |  2 ++
>>  arch/mn10300/include/uapi/asm/socket.h|  2 ++
>>  arch/parisc/include/uapi/asm/socket.h |  2 ++
>>  arch/powerpc/include/uapi/asm/socket.h|  2 ++
>>  arch/s390/include/uapi/asm/socket.h   |  2 ++
>>  arch/sparc/include/uapi/asm/socket.h  |  2 ++
>>  arch/xtensa/include/uapi/asm/socket.h |  2 ++
>>  include/linux/tcp.h   |  2 ++
>>  include/uapi/asm-generic/socket.h |  2 ++
>>  include/uapi/linux/net_tstamp.h   |  3 ++-
>>  include/uapi/linux/tcp.h  |  8 
>>  net/core/skbuff.c | 14 +++---
>>  net/core/sock.c   |  7 +++
>>  net/ipv4/tcp.c| 20 
>>  net/socket.c  |  7 ++-
>>  20 files changed, 90 insertions(+), 5 deletions(-)
>>
>> diff --git a/Documentation/networking/timestamping.txt 
>> b/Documentation/networking/timestamping.txt
>> index 671cccf..96f5069 100644
>> --- a/Documentation/networking/timestamping.txt
>> +++ b/Documentation/networking/timestamping.txt
>> @@ -182,6 +182,16 @@ SOF_TIMESTAMPING_OPT_TSONLY:
>>the timestamp even if sysctl net.core.tstamp_allow_data is 0.
>>This option disables SOF_TIMESTAMPING_OPT_CMSG.
>>
>> +SOF_TIMESTAMPING_OPT_STATS:
>> +
>> +  Optional stats that are obtained along with the transmit timestamps.
>> +  It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the
>> +  transmit timestamp is available, the stats are available in a
>> +  separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a
>> +  list of TLVs (struct nlattr) of types. These stats allow the
>> +  application to associate various transport layer stats with
>> +  the transmit timestamps, such as how long a certain block of
>> +  data was limited by peer's receiver window.
>>
>>  New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to
>>  disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate
>> diff --git a/arch/alpha/include/uapi/asm/socket.h 
>> b/arch/alpha/include/uapi/asm/socket.h
>> index 9e46d6e..afc901b 100644
>> --- a/arch/alpha/include/uapi/asm/socket.h
>> +++ b/arch/alpha/include/uapi/asm/socket.h
>> @@ -97,4 +97,6 @@
>>
>>  #define SO_CNX_ADVICE  53
>>
>> +#define SCM_TIMESTAMPING_OPT_STATS 54
>> +
>>  #endif /* _UAPI_ASM_SOCKET_H */
>> diff --git a/arch/frv/include/uapi/asm/socket.h 
>> b/arch/frv/include/uapi/asm/socket.h
>> index afbc98f0..81e0353 100644
>> --- a/arch/frv/include/uapi/asm/socket.h
>> +++ b/arch/frv/include/uapi/asm/socket.h
>> @@ -90,5 +90,7 @@
>>
>>  #define SO_CNX_ADVICE  53
>>
>> +#define SCM_TIMESTAMPING_OPT_STATS 54
>> +
>>  #endif /* _ASM_SOCKET_H */
>>
>> diff --git a/arch/ia64/include/uapi/asm/socket.h 
>> b/arch/ia64/include/uapi/asm/socket.h
>> index 0

[PATCH 3/3] uapi: export nf_log.h

2016-12-02 Thread Stephen Hemminger
File is in uapi directory but not being copied on
 make install_headers

Fixes commit 4ec9c8fbbc22 ("netfilter: nft_log: complete
NFTA_LOG_FLAGS attr support").

Signed-off-by: Stephen Hemminger 
---
 include/uapi/linux/netfilter/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/netfilter/Kbuild 
b/include/uapi/linux/netfilter/Kbuild
index cd26d7a..03f194a 100644
--- a/include/uapi/linux/netfilter/Kbuild
+++ b/include/uapi/linux/netfilter/Kbuild
@@ -5,6 +5,7 @@ header-y += nf_conntrack_ftp.h
 header-y += nf_conntrack_sctp.h
 header-y += nf_conntrack_tcp.h
 header-y += nf_conntrack_tuple_common.h
+header-y += nf_log.h
 header-y += nf_tables.h
 header-y += nf_tables_compat.h
 header-y += nf_nat.h
-- 
2.10.2



[PATCH 2/3] uapi: export tc_skbmod.h

2016-12-02 Thread Stephen Hemminger
Fixes commit 735cffe5d800 ("net_sched: Introduce skbmod action")
Not used by iproute2 but maybe in future.

Signed-off-by: Stephen Hemminger 
---
 include/uapi/linux/tc_act/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild
index 9611c7b..721433e 100644
--- a/include/uapi/linux/tc_act/Kbuild
+++ b/include/uapi/linux/tc_act/Kbuild
@@ -12,3 +12,4 @@ header-y += tc_bpf.h
 header-y += tc_connmark.h
 header-y += tc_ife.h
 header-y += tc_tunnel_key.h
+header-y += tc_sbkmod.h
-- 
2.10.2



[PATCH 1/3] uapi: export tc tunnel key file

2016-12-02 Thread Stephen Hemminger
Fixes commit 21609ae32aaf6c6fab0e ("net/sched: Introduce act_tunnel_key")
The file is necessary for iproute2 headers but was not being
copied by make install_headers

Signed-off-by: Stephen Hemminger 
---
 include/uapi/linux/tc_act/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild
index e3969bd..9611c7b 100644
--- a/include/uapi/linux/tc_act/Kbuild
+++ b/include/uapi/linux/tc_act/Kbuild
@@ -11,3 +11,4 @@ header-y += tc_vlan.h
 header-y += tc_bpf.h
 header-y += tc_connmark.h
 header-y += tc_ife.h
+header-y += tc_tunnel_key.h
-- 
2.10.2



[PATCH 0/3] UAPI export missing headers

2016-12-02 Thread Stephen Hemminger
Files not being exported by make install headers

Stephen Hemminger (3):
  tc: export tunnel key file
  uapi: export tc_skbmod.h
  uapi: export nf_log.h

 include/uapi/linux/netfilter/Kbuild | 1 +
 include/uapi/linux/tc_act/Kbuild| 2 ++
 2 files changed, 3 insertions(+)

-- 
2.10.2



[PATCH v2 0/3] uapi: add kbuild for some files

2016-12-02 Thread Stephen Hemminger
Some files which are in uapi but not being copied
by make headers_install

Stephen Hemminger (3):
  tc: export tunnel key file
  uapi: export tc_skbmod.h
  uapi: export nf_log.h

 include/uapi/linux/netfilter/Kbuild | 1 +
 include/uapi/linux/tc_act/Kbuild| 2 ++
 2 files changed, 3 insertions(+)

V2 - typo in s/sbkmod/skbmod/

-- 
2.10.2



[PATCH 2/3] uapi: export tc_skbmod.h

2016-12-02 Thread Stephen Hemminger
Fixes commit 735cffe5d800 ("net_sched: Introduce skbmod action")
Not used by iproute2 but maybe in future.

Signed-off-by: Stephen Hemminger 
---
 include/uapi/linux/tc_act/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild
index 9611c7b..e3db740 100644
--- a/include/uapi/linux/tc_act/Kbuild
+++ b/include/uapi/linux/tc_act/Kbuild
@@ -12,3 +12,4 @@ header-y += tc_bpf.h
 header-y += tc_connmark.h
 header-y += tc_ife.h
 header-y += tc_tunnel_key.h
+header-y += tc_skbmod.h
-- 
2.10.2



[PATCH 3/3] uapi: export nf_log.h

2016-12-02 Thread Stephen Hemminger
File is in uapi directory but not being copied on
 make install_headers

Fixes commit 4ec9c8fbbc22 ("netfilter: nft_log: complete
NFTA_LOG_FLAGS attr support").

Signed-off-by: Stephen Hemminger 
---
 include/uapi/linux/netfilter/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/netfilter/Kbuild 
b/include/uapi/linux/netfilter/Kbuild
index cd26d7a..03f194a 100644
--- a/include/uapi/linux/netfilter/Kbuild
+++ b/include/uapi/linux/netfilter/Kbuild
@@ -5,6 +5,7 @@ header-y += nf_conntrack_ftp.h
 header-y += nf_conntrack_sctp.h
 header-y += nf_conntrack_tcp.h
 header-y += nf_conntrack_tuple_common.h
+header-y += nf_log.h
 header-y += nf_tables.h
 header-y += nf_tables_compat.h
 header-y += nf_nat.h
-- 
2.10.2



[PATCH 1/3] uapi: export tc tunnel key file

2016-12-02 Thread Stephen Hemminger
Fixes commit 21609ae32aaf6c6fab0e ("net/sched: Introduce act_tunnel_key")
The file is necessary for iproute2 headers but was not being
copied by make install_headers

Signed-off-by: Stephen Hemminger 
---
 include/uapi/linux/tc_act/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild
index e3969bd..9611c7b 100644
--- a/include/uapi/linux/tc_act/Kbuild
+++ b/include/uapi/linux/tc_act/Kbuild
@@ -11,3 +11,4 @@ header-y += tc_vlan.h
 header-y += tc_bpf.h
 header-y += tc_connmark.h
 header-y += tc_ife.h
+header-y += tc_tunnel_key.h
-- 
2.10.2



[PATCH net v3] tcp: warn on bogus MSS and try to amend it

2016-12-02 Thread Marcelo Ricardo Leitner
There have been some reports lately about TCP connection stalls caused
by NIC drivers that aren't setting gso_size on aggregated packets on rx
path. This causes TCP to assume that the MSS is actually the size of the
aggregated packet, which is invalid.

Although the proper fix is to be done at each driver, it's often hard
and cumbersome for one to debug, come to such root cause and report/fix
it.

This patch amends this situation in two ways. First, it adds a warning
on when this situation occurs, so it gives a hint to those trying to
debug this. It also limits the maximum probed MSS to the advertised MSS,
as it should never be any higher than that.

The result is that the connection may not have the best performance ever
but it shouldn't stall, and the admin will have a hint on what to look
for.

Tested with virtio by forcing gso_size to 0.

v2: updated msg per David suggestion
v3: use skb_iif to find the interface and also log its name, per Eric
Dumazet suggestion. As the skb may be backlogged and the interface
gone by then, we need to check if the number still has a meaning.

Cc: Jonathan Maxwell 
Signed-off-by: Marcelo Ricardo Leitner 
---
 net/ipv4/tcp_input.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 
a27b9c0e27c08b4e4aeaff3d0bfdf3ae561ba4d8..042a8a895e97d04afbdc377830537e8fd3b15d1e
 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -144,7 +144,21 @@ static void tcp_measure_rcv_mss(struct sock *sk, const 
struct sk_buff *skb)
 */
len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
-   icsk->icsk_ack.rcv_mss = len;
+   static bool __once __read_mostly;
+
+   icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
+  tcp_sk(sk)->advmss);
+   if (icsk->icsk_ack.rcv_mss != len && !__once) {
+   struct net_device *dev;
+
+   __once = true;
+
+   rcu_read_lock();
+   dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
+   pr_warn_once("%s: Driver has suspect GRO 
implementation, TCP performance may be compromised.\n",
+dev ? dev->name : "Unknown driver");
+   rcu_read_unlock();
+   }
} else {
/* Otherwise, we make more careful check taking into account,
 * that SACKs block is variable.
-- 
2.9.3



Re: [PATCH net v2] tcp: warn on bogus MSS and try to amend it

2016-12-02 Thread Marcelo Ricardo Leitner
On Fri, Dec 02, 2016 at 06:45:24AM -0800, Eric Dumazet wrote:
> On Fri, 2016-12-02 at 08:55 -0200, Marcelo Ricardo Leitner wrote:
> > There have been some reports lately about TCP connection stalls caused
> > by NIC drivers that aren't setting gso_size on aggregated packets on rx
> > path. This causes TCP to assume that the MSS is actually the size of the
> > aggregated packet, which is invalid.
> > 
> > Although the proper fix is to be done at each driver, it's often hard
> > and cumbersome for one to debug, come to such root cause and report/fix
> > it.
> > 
> > This patch amends this situation in two ways. First, it adds a warning
> > when this situation occurs, so it gives a hint to those trying to
> > debug this. It also limits the maximum probed MSS to the advertised MSS,
> > as it should never be any higher than that.
> > 
> > The result is that the connection may not have the best performance ever
> > but it shouldn't stall, and the admin will have a hint on what to look
> > for.
> > 
> > Tested with virtio by forcing gso_size to 0.
> > 
> > Cc: Jonathan Maxwell 
> > Signed-off-by: Marcelo Ricardo Leitner 
> > ---
> > v2: Updated msg as suggested by David.
> > 
> >  net/ipv4/tcp_input.c | 5 -
> >  1 file changed, 4 insertions(+), 1 deletion(-)
> > 
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index 
> > a27b9c0e27c08b4e4aeaff3d0bfdf3ae561ba4d8..fd619eb93749b6de56a41669248b337c051d9fe2
> >  100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -144,7 +144,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const 
> > struct sk_buff *skb)
> >  */
> > len = skb_shinfo(skb)->gso_size ? : skb->len;
> > if (len >= icsk->icsk_ack.rcv_mss) {
> > -   icsk->icsk_ack.rcv_mss = len;
> > +   icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
> > +  tcp_sk(sk)->advmss);
> > +   if (icsk->icsk_ack.rcv_mss != len)
> > +   pr_warn_once("Driver has suspect GRO implementation, 
> > TCP performance may be compromised.\n");
> > } else {
> > /* Otherwise, we make more careful check taking into account,
> >  * that SACKs block is variable.
> 
> 
> skb->dev is indeed NULL, but it might be worth getting back the device
> using skb->skb_iif maybe ?
> 

Yes, then it's possible. But I have to add an extra check because it
involves a search (iif -> net_device) and I can't wrap that inside
pr_warn_once(). I hope it doesn't get too cluttered then. Posting v3 in
a few.. Thanks



Re: [PATCH net-next v2 6/6] tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING

2016-12-02 Thread Paul Gortmaker
On Mon, Nov 28, 2016 at 2:07 AM, Yuchung Cheng  wrote:
> From: Francis Yan 
>
> This patch exports the sender chronograph stats via the socket
> SO_TIMESTAMPING channel. Currently we can instrument how long a
> particular application unit of data was queued in TCP by tracking
> SOF_TIMESTAMPING_TX_SOFTWARE and SOF_TIMESTAMPING_TX_SCHED. Having

Seems a new linux-next failure leads back to here; I did not run a
full bisect, since the variable seems confined to this commit:

net/socket.c:701: error: 'SCM_TIMESTAMPING_OPT_STATS' undeclared
(first use in this function)

http://kisskb.ellerman.id.au/kisskb/buildresult/12875981/

Paul.
--

> these sender chronograph stats exported simultaneously along with
> these timestamps allow further breaking down the various sender
> limitation.  For example, a video server can tell if a particular
> chunk of video on a connection takes a long time to deliver because
> TCP was experiencing small receive window. It is not possible to
> tell before this patch without packet traces.
>
> To prepare these stats, the user needs to set
> SOF_TIMESTAMPING_OPT_STATS and SOF_TIMESTAMPING_OPT_TSONLY flags
> while requesting other SOF_TIMESTAMPING TX timestamps. When the
> timestamps are available in the error queue, the stats are returned
> in a separate control message of type SCM_TIMESTAMPING_OPT_STATS,
> in a list of TLVs (struct nlattr) of types: TCP_NLA_BUSY_TIME,
> TCP_NLA_RWND_LIMITED, TCP_NLA_SNDBUF_LIMITED. Unit is microsecond.
>
> Signed-off-by: Francis Yan 
> Signed-off-by: Yuchung Cheng 
> Signed-off-by: Soheil Hassas Yeganeh 
> ---
> ChangeLog since v1:
>  - fix build break if CONFIG_INET is not defined
>
>  Documentation/networking/timestamping.txt | 10 ++
>  arch/alpha/include/uapi/asm/socket.h  |  2 ++
>  arch/frv/include/uapi/asm/socket.h|  2 ++
>  arch/ia64/include/uapi/asm/socket.h   |  2 ++
>  arch/m32r/include/uapi/asm/socket.h   |  2 ++
>  arch/mips/include/uapi/asm/socket.h   |  2 ++
>  arch/mn10300/include/uapi/asm/socket.h|  2 ++
>  arch/parisc/include/uapi/asm/socket.h |  2 ++
>  arch/powerpc/include/uapi/asm/socket.h|  2 ++
>  arch/s390/include/uapi/asm/socket.h   |  2 ++
>  arch/sparc/include/uapi/asm/socket.h  |  2 ++
>  arch/xtensa/include/uapi/asm/socket.h |  2 ++
>  include/linux/tcp.h   |  2 ++
>  include/uapi/asm-generic/socket.h |  2 ++
>  include/uapi/linux/net_tstamp.h   |  3 ++-
>  include/uapi/linux/tcp.h  |  8 
>  net/core/skbuff.c | 14 +++---
>  net/core/sock.c   |  7 +++
>  net/ipv4/tcp.c| 20 
>  net/socket.c  |  7 ++-
>  20 files changed, 90 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/networking/timestamping.txt 
> b/Documentation/networking/timestamping.txt
> index 671cccf..96f5069 100644
> --- a/Documentation/networking/timestamping.txt
> +++ b/Documentation/networking/timestamping.txt
> @@ -182,6 +182,16 @@ SOF_TIMESTAMPING_OPT_TSONLY:
>the timestamp even if sysctl net.core.tstamp_allow_data is 0.
>This option disables SOF_TIMESTAMPING_OPT_CMSG.
>
> +SOF_TIMESTAMPING_OPT_STATS:
> +
> +  Optional stats that are obtained along with the transmit timestamps.
> +  It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the
> +  transmit timestamp is available, the stats are available in a
> +  separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a
> +  list of TLVs (struct nlattr) of types. These stats allow the
> +  application to associate various transport layer stats with
> +  the transmit timestamps, such as how long a certain block of
> +  data was limited by peer's receiver window.
>
>  New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to
>  disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate
> diff --git a/arch/alpha/include/uapi/asm/socket.h 
> b/arch/alpha/include/uapi/asm/socket.h
> index 9e46d6e..afc901b 100644
> --- a/arch/alpha/include/uapi/asm/socket.h
> +++ b/arch/alpha/include/uapi/asm/socket.h
> @@ -97,4 +97,6 @@
>
>  #define SO_CNX_ADVICE  53
>
> +#define SCM_TIMESTAMPING_OPT_STATS 54
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/frv/include/uapi/asm/socket.h 
> b/arch/frv/include/uapi/asm/socket.h
> index afbc98f0..81e0353 100644
> --- a/arch/frv/include/uapi/asm/socket.h
> +++ b/arch/frv/include/uapi/asm/socket.h
> @@ -90,5 +90,7 @@
>
>  #define SO_CNX_ADVICE  53
>
> +#define SCM_TIMESTAMPING_OPT_STATS 54
> +
>  #endif /* _ASM_SOCKET_H */
>
> diff --git a/arch/ia64/include/uapi/asm/socket.h 
> b/arch/ia64/include/uapi/asm/socket.h
> index 0018fad..57feb0c 100644
> --- a/arch/ia64/include/uapi/asm/socket.h
> +++ b/arch/ia64/include/uapi/asm/socket.h
> @@ -99,4 +99,6 @@
>
>  #define SO_CNX_ADVICE  53
>
> +#define SCM_TIMESTAMPING_OPT_STATS 54
> +
>  #en

Re: [PATCH iproute2/net-next] ss: initialise variables outside of for loop

2016-12-02 Thread Stephen Hemminger
On Fri,  2 Dec 2016 12:56:05 +0100
Simon Horman  wrote:

> Initialise for loop variables outside of for loops. GCC flags this as being
> out of spec unless C99 or C11 mode is used.
> 
> With this change the entire tree appears to compile cleanly with -Wall.
> 
> $ gcc --version
> gcc (Debian 4.9.2-10) 4.9.2
> ...
> $ make
> ...
> ss.c: In function ‘unix_show_sock’:
> ss.c:3128:4: error: ‘for’ loop initial declarations are only allowed in C99 
> or C11 mode
> ...
> 
> Signed-off-by: Simon Horman 

Applied.
Note, I used to have -Wall in Makefile but old GCC versions were broken and
would give aliasing warnings.


Re: [PATCH iproute2 V5 0/3] tc: Support for ip tunnel metadata set/unset/classify

2016-12-02 Thread Stephen Hemminger
On Fri,  2 Dec 2016 13:25:12 +0200
Amir Vadai  wrote:

> Hi,
> 
> This short series adds support for matching and setting metadata for ip tunnel
> shared device using the TC system, introduced in kernel 4.9 [1].
> 
> Applied and tested on top of commit b6c7fc61faab ("ss: print new tcp_info
> fields: busy, rwnd-limited, sndbuf-limited times")
> 
> 
> Example usage:
> 
> $ tc filter add dev vxlan0 protocol ip parent : \
> flower \
>   enc_src_ip 11.11.0.2 \
>   enc_dst_ip 11.11.0.1 \
>   enc_key_id 11 \
>   dst_ip 11.11.11.1 \
> action mirred egress redirect dev vnet0
> 
> $ tc filter add dev net0 protocol ip parent : \
> flower \
>   ip_proto 1 \
>   dst_ip 11.11.11.2 \
> action tunnel_key set \
>   src_ip 11.11.0.1 \
>   dst_ip 11.11.0.2 \
>   id 11 \
> action mirred egress redirect dev vxlan0
> 
> [1] - d1ba24feb466 ("Merge branch 'act_tunnel_key'")
> 
> Thanks,
> Amir
> 
> Changes from V4:
> - Fix rebase conflicts for net-next
> 
> Changes from V3:
> - Fix bad wording in the man page about the use of the 'unset' operation
> 
> Changes from V2:
> - Use const where needed
> - Don't lose return value
> - Introduce rta_getattr_be16() and rta_getattr_be32()
> 
> Changes from V1:
> - Updated Patch 2/2 ("tc/act_tunnel: Introduce ip tunnel action") commit log
>   and the man page tc-tunnel_key to reflect the fact that the 'unset'
>   operation is not mandatory,
>   and to describe when it might be needed.
> - Rename the 'release' operation to 'unset'
> 
> Amir Vadai (3):
>   libnetlink: Introduce rta_getattr_be*()
>   tc/cls_flower: Classify packet in ip tunnels
>   tc/act_tunnel: Introduce ip tunnel action
> 
> Amir Vadai (3):
>   libnetlink: Introduce rta_getattr_be*()
>   tc/cls_flower: Classify packet in ip tunnels
>   tc/act_tunnel: Introduce ip tunnel action
> 
>  bridge/fdb.c |   4 +-
>  include/libnetlink.h |   9 ++
>  include/linux/tc_act/tc_tunnel_key.h |  42 ++
>  ip/iplink_geneve.c   |   2 +-
>  ip/iplink_vxlan.c|   2 +-
>  man/man8/tc-flower.8 |  17 ++-
>  man/man8/tc-tunnel_key.8 | 112 +++
>  tc/Makefile  |   1 +
>  tc/f_flower.c|  84 +++-
>  tc/m_tunnel_key.c| 258 
> +++
>  10 files changed, 522 insertions(+), 9 deletions(-)
>  create mode 100644 include/linux/tc_act/tc_tunnel_key.h
>  create mode 100644 man/man8/tc-tunnel_key.8
>  create mode 100644 tc/m_tunnel_key.c
> 
 Series applied


Re: [PATCH net-next 3/4] tcp: tsq: add shortcut in tcp_tasklet_func()

2016-12-02 Thread Eric Dumazet
On Fri, 2016-12-02 at 10:25 -0800, Eric Dumazet wrote:
> Under high stress, I've seen tcp_tasklet_func() consuming
> ~700 usec, handling ~150 tcp sockets.
> 
> By setting TCP_TSQ_DEFERRED in tcp_wfree(), we give a chance
> for other cpus/threads entering tcp_write_xmit() to grab it,
> allowing tcp_tasklet_func() to skip sockets that already did
> an xmit cycle.
> 
> Signed-off-by: Eric Dumazet 

...

> @@ -884,7 +884,7 @@ void tcp_wfree(struct sk_buff *skb)
>   if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
>   goto out;
>  
> - nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
> + nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | 
> TCP_TSQ_DEFERRED;

Typo here...

Should be :
nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | 
TCPF_TSQ_DEFERRED;

>   nval = cmpxchg(&tp->tsq_flags, oval, nval);
>   if (nval != oval)
>   continue;




Re: [iproute PATCH v2 00/18] ss: Minor code review

2016-12-02 Thread Stephen Hemminger
On Fri,  2 Dec 2016 11:39:44 +0100
Phil Sutter  wrote:

> This is a series of misc changes to ss code which happened as fall-out
> when working on a unified output formatter (still unfinished).
> 
> Changes since v1:
> - Rebased onto current upstream, resolved conflicts in patch 4 generated
>   by previously added SCTP socket support.
> 
> Phil Sutter (18):
>   ss: Mark fall through in arg parsing switch()
>   ss: Drop empty lines in UDP output
>   ss: Add missing tab when printing UNIX details
>   ss: Use sockstat->type in all socket types
>   ss: introduce proc_ctx_print()
>   ss: Drop list traversal from unix_stats_print()
>   ss: Eliminate unix_use_proc()
>   ss: Turn generic_proc_open() wrappers into macros
>   ss: Make tmr_name local to tcp_timer_print()
>   ss: Make user_ent_hash_build_init local to user_ent_hash_build()
>   ss: Make some variables function-local
>   ss: Make slabstat_ids local to get_slabstat()
>   ss: Get rid of useless goto in handle_follow_request()
>   ss: Get rid of single-fielded struct snmpstat
>   ss: Make unix_state_map local to unix_show()
>   ss: Make sstate_name local to sock_state_print()
>   ss: Make sstate_namel local to scan_state()
>   ss: unix_show: No need to initialize members of calloc'ed structs
> 
>  misc/ss.c | 532 
> ++
>  1 file changed, 224 insertions(+), 308 deletions(-)
> 

Applied, thanks


Re: [PATCH iproute2 1/1] tc: updated man page to reflect handle-id use in filter GET command.

2016-12-02 Thread Stephen Hemminger
On Thu,  1 Dec 2016 15:20:44 -0500
Roman Mashak  wrote:

> Signed-off-by: Roman Mashak 
> ---
>  man/man8/tc.8 | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/man/man8/tc.8 b/man/man8/tc.8
> index 8a47a2b..d957ffa 100644
> --- a/man/man8/tc.8
> +++ b/man/man8/tc.8
> @@ -32,7 +32,9 @@ class-id ] qdisc
>  DEV
>  .B [ parent
>  qdisc-id
> -.B | root ] protocol
> +.B | root ] [ handle
> +handle-id ]
> +.B protocol
>  protocol
>  .B prio
>  priority filtertype
> @@ -577,7 +579,7 @@ it is created.
>  
>  .TP
>  get
> -Displays a single filter given the interface, parent ID, priority, protocol 
> and handle ID.
> +Displays a single filter given the interface, qdisc-id, priority, protocol 
> and handle-id.
>  
>  .TP
>  show

The proper syntax for man page usage section is to put keywords in bold and any
value that is variable in italic.

I know this whole man page doesn't do this correctly. But that doesn't mean
that new additions should continue with the mistake.

Please revise and resubmit. Extra bonus points for fixing the other bits.


Re: [PATCHv2 net-next 4/4] net: dsa: mv88e6xxx: Refactor CPU and DSA port setup

2016-12-02 Thread Vivien Didelot
Hi Andrew,

Andrew Lunn  writes:

>> The port's EgressMode, FrameMode and EtherType are really tied together
>> to compose the mode of the port.
>
> Setting the EtherType is somewhat separate. It is only needed on ports
> using EDSA. And that can only happen on a CPU port. Humm, actually, i
> set it when i should not. But putting this in a wrapper actually hides
> this.

Wrong. The datasheet says:

> This Ether Type is used for many features depending upon the mode
> of the port (as defined by the port’s EgressMode and FrameMode
> bits – in Port Control, port offset 0x04).

It says that in Normal Network mode, this register can be used to trap,
mirror, etc. Also used in Provider and EDSA modes.

That is why it would be better to wrap them together to ensure correct
values when configuring a port's mode.

>
>> Could you add a helper in chip.c like:
>> 
>> static int mv88e6xxx_set_port_mode(struct mv88e6xxx_chip *chip, int port,
>>enum mv88e6xxx_frame_mode frame_mode,
>>u16 egress_mode, bool egress_unknown,
>>u16 ethertype)
>> {
>> int err;
>> 
>> if (chip->info->ops->port_set_frame_mode) {
>> err = chip->info->ops->port_set_frame_mode(chip, port, 
>> frame_mode);
>> if (err)
>> return err;
>> }
>
> Ignoring that it is not implemented here is wrong. It must be
> implemented, or the device is not going to work. It is a question of,
> do we want an oops, or return an error code.

Since that is done at setup time, returning an error is enough IMO to
inform the DSA layer that something went wrong.

Thanks,

Vivien


[PATCH net-next] tools: hv: Enable network manager for bonding scripts on RHEL

2016-12-02 Thread Haiyang Zhang
From: Haiyang Zhang 

We found that network manager is necessary on RHEL for the synthetic
NIC / VF NIC bonding operations to be handled automatically. So, enable
network manager here.

Signed-off-by: Haiyang Zhang 
Reviewed-by: K. Y. Srinivasan 
---
 tools/hv/bondvf.sh |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/hv/bondvf.sh b/tools/hv/bondvf.sh
index 8e96023..4aa5369 100755
--- a/tools/hv/bondvf.sh
+++ b/tools/hv/bondvf.sh
@@ -74,8 +74,8 @@ function create_eth_cfg_redhat {
echo DEVICE=$1 >>$fn
echo TYPE=Ethernet >>$fn
echo BOOTPROTO=none >>$fn
+   echo UUID=`uuidgen` >>$fn
echo ONBOOT=yes >>$fn
-   echo NM_CONTROLLED=no >>$fn
echo PEERDNS=yes >>$fn
echo IPV6INIT=yes >>$fn
echo MASTER=$2 >>$fn
@@ -93,8 +93,8 @@ function create_bond_cfg_redhat {
echo DEVICE=$1 >>$fn
echo TYPE=Bond >>$fn
echo BOOTPROTO=dhcp >>$fn
+   echo UUID=`uuidgen` >>$fn
echo ONBOOT=yes >>$fn
-   echo NM_CONTROLLED=no >>$fn
echo PEERDNS=yes >>$fn
echo IPV6INIT=yes >>$fn
echo BONDING_MASTER=yes >>$fn
-- 
1.7.4.1



[PATCH net-next v5] ipv6 addrconf: Implemented enhanced DAD (RFC7527)

2016-12-02 Thread Erik Nordmark
Implemented RFC7527 Enhanced DAD.
IPv6 duplicate address detection can fail if there is some temporary
loopback of Ethernet frames. RFC7527 solves this by including a random
nonce in the NS messages used for DAD, and if an NS is received with the
same nonce it is assumed to be a looped back DAD probe and is ignored.
RFC7527 is enabled by default. Can be disabled by setting both of
conf/{all,interface}/enhanced_dad to zero.

Signed-off-by: Erik Nordmark 
Signed-off-by: Bob Gilligan 
Reviewed-by: Hannes Frederic Sowa 

---

v2: renamed sysctl and made it default to true, plus minor code review fixes
v3: respun with later net-next; fixed whitespace issues
v4: fixed kbuild test robot for route.c; added Reviewed-by
v5: using %pM for printk of nonce

 Documentation/networking/ip-sysctl.txt |  9 +
 include/linux/ipv6.h   |  1 +
 include/net/if_inet6.h |  1 +
 include/net/ndisc.h|  5 -
 include/uapi/linux/ipv6.h  |  1 +
 net/ipv6/addrconf.c| 22 +-
 net/ipv6/ndisc.c   | 29 ++---
 net/ipv6/route.c   |  2 +-
 8 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 5af48dd..d9ef566 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1729,6 +1729,15 @@ drop_unsolicited_na - BOOLEAN
 
By default this is turned off.
 
+enhanced_dad - BOOLEAN
+   Include a nonce option in the IPv6 neighbor solicitation messages used 
for
+   duplicate address detection per RFC7527. A received DAD NS will only 
signal
+   a duplicate address if the nonce is different. This avoids any false
+   detection of duplicates due to loopback of the NS messages that we send.
+   The nonce option will be sent on an interface unless both of
+   conf/{all,interface}/enhanced_dad are set to FALSE.
+   Default: TRUE
+
 icmp/*:
 ratelimit - INTEGER
Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 3f95233..671d014 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -68,6 +68,7 @@ struct ipv6_devconf {
 #ifdef CONFIG_IPV6_SEG6_HMAC
__s32   seg6_require_hmac;
 #endif
+   __u32   enhanced_dad;
 
struct ctl_table_header *sysctl_header;
 };
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index b0576cb..0fa4c32 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -55,6 +55,7 @@ struct inet6_ifaddr {
__u8stable_privacy_retry;
 
__u16   scope;
+   __u64   dad_nonce;
 
unsigned long   cstamp; /* created timestamp */
unsigned long   tstamp; /* updated timestamp */
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index be1fe228..d562a2f 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -31,6 +31,7 @@ enum {
ND_OPT_PREFIX_INFO = 3, /* RFC2461 */
ND_OPT_REDIRECT_HDR = 4,/* RFC2461 */
ND_OPT_MTU = 5, /* RFC2461 */
+   ND_OPT_NONCE = 14,  /* RFC7527 */
__ND_OPT_ARRAY_MAX,
ND_OPT_ROUTE_INFO = 24, /* RFC4191 */
ND_OPT_RDNSS = 25,  /* RFC5006 */
@@ -121,6 +122,7 @@ struct ndisc_options {
 #define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END]
 #define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR]
 #define nd_opts_mtu nd_opt_array[ND_OPT_MTU]
+#define nd_opts_nonce  nd_opt_array[ND_OPT_NONCE]
 #define nd_802154_opts_src_lladdr  
nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR]
 #define nd_802154_opts_tgt_lladdr  
nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR]
 
@@ -398,7 +400,8 @@ static inline struct neighbour *__ipv6_neigh_lookup(struct 
net_device *dev, cons
 int ndisc_rcv(struct sk_buff *skb);
 
 void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
-  const struct in6_addr *daddr, const struct in6_addr *saddr);
+  const struct in6_addr *daddr, const struct in6_addr *saddr,
+  u64 nonce);
 
 void ndisc_send_rs(struct net_device *dev,
   const struct in6_addr *saddr, const struct in6_addr *daddr);
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 53561be..eaf65dc 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -181,6 +181,7 @@ enum {
DEVCONF_RTR_SOLICIT_MAX_INTERVAL,
DEVCONF_SEG6_ENABLED,
DEVCONF_SEG6_REQUIRE_HMAC,
+   DEVCONF_ENHANCED_DAD,
DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4c387dc..c1e124b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2

Re: [PATCHv2 net-next 2/4] net: dsa: mv88e6xxx: Monitor and Management tables

2016-12-02 Thread Vivien Didelot
Hi Andrew,

Andrew Lunn  writes:

> On Fri, Dec 02, 2016 at 02:32:39PM -0500, Vivien Didelot wrote:
>> Hi Andrew,
>> 
>> Andrew Lunn  writes:
>> 
>> > @@ -3184,6 +3186,8 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
>> >.stats_get_sset_count = mv88e6095_stats_get_sset_count,
>> >.stats_get_strings = mv88e6095_stats_get_strings,
>> >.stats_get_stats = mv88e6095_stats_get_stats,
>> > +  .g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
>> > +  .g1_set_egress_port = mv88e6095_g1_set_egress_port,
>> >  };
>> 
>> I like the implementation in this version better. But please explain me
>> why you are prefixing these operations with g1_?
>
> The prefix gives some basic grouping. port_ indicates it operates on a
> port, and is likely to be found in port.c. stats_ indicates it
> operates on statistics, ppu_ that it operates on the PHY polling unit.

Yes, port_* operations operate on ports. But the port.c file is there to
implement the function of "Port Registers". "Port" can be confusing, but
it refers to the SMI internal device at address 0xsomething.

"port_", "ppu_", "stats_", in the mv88e6xxx_ops structure just give
implicit namespaces for the **features**, not their location!

> We are going to have some things which don't fall into a simple
> category, like these two. But it would however be nice to group them,
> so i picked which register bank they are in. These operations are
> always in g1. It is a useful hint as to where to find the different
> variants.

Absolutely not!

.set_egress_port = mv88e6095_g1_set_egress_port,

 ^
 That is the useful hint!

At the higher level of chip.c, we don't care about where the switch MAC
setter is implemented. We just have to call the correctly defined
.set_switch_mac routine.

However if you do care to know, its _ops.set_switch_mac pointer will
tell you (_g1 vs _g2 prefix).

>> But let's imagine we can set the CPU port in some Global 2 registers.
>> You are going to wrap this in chip.c with something like:
>> 
>> int mv88e6xxx_set_cpu_port(struct mv88e6xxx_chip *chip, int port)
>> {
>> if (chip->info->ops->g2_set_cpu_port)
>> return chip->info->ops->g2_set_cpu_port(chip, port);
>> else if (chip->info->ops->g1_set_cpu_port)
>> return chip->info->ops->g1_set_cpu_port(chip, port);
>> else
>> return -EOPNOTSUPP;
>> }
>
> I answered in one of my other emails. Frames with reserved MAC
> addresses can be forwarded to the CPU. For most devices, this is a g2
> operation. However, for 6390, it is a g1. In that case, my code does
> not use a prefix. Not having a prefix, when all the others do, also
> gives you information. It means the ops are spread around and you need
> to make a bigger effort to go find them.

Again, absolutely not. This is your interpretation of having a prefix or
not. A chip has only one way to access a feature, not two. Since you
seem to be focused on the Rsvd2CPU feature, here's an example with it:

What's the point of writing this:

/* Consider the given MAC as MGMT */
int mv88e6xxx_reserve_mac(struct mv88e6xxx_chip *chip, u8 *addr)
{
if (mac_is_0x(addr)) {
if (chip->info->ops->g1_set_rsvd2cpu0)
return chip->info->ops->g1_set_rsvd2cpu0(...);
else if (chip->info->ops->g2_set_rsvd2cpu0)
return chip->info->ops->g2_set_rsvd2cpu0(...);
} else if (mac_is_2x(addr)) {
if (chip->info->ops->g1_set_rsvd2cpu2)
return chip->info->ops->g1_set_rsvd2cpu2(...);
else if (chip->info->ops->g2_set_rsvd2cpu2)
return chip->info->ops->g2_set_rsvd2cpu2(...);
}

return mv88e6xxx_atu_load(chip, addr, MGMT);
}

Compared to this:

/* Consider the given MAC as MGMT */
int mv88e6xxx_reserve_mac(struct mv88e6xxx_chip *chip, u8 *addr)
{
if (mac_is_0x(addr)) {
if (chip->info->ops->set_rsvd2cpu0)
return chip->info->ops->set_rsvd2cpu0(...);
} else if (mac_is_2x(addr)) {
if (chip->info->ops->set_rsvd2cpu2)
return chip->info->ops->set_rsvd2cpu2(...);
}

return mv88e6xxx_atu_load(chip, addr, MGMT);
}

Your higher level API (chip.c) doesn't need to know where a given feature
is implemented. It just needs to know if it supports it or not.

Thanks,

Vivien


[PATCH] adm8211: add checks for dma mapping errors

2016-12-02 Thread Alexey Khoroshilov
The driver does not check whether mapping DMA memory succeeds.
The patch adds the checks and failure handling.

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Alexey Khoroshilov 
---
 drivers/net/wireless/admtek/adm8211.c | 24 ++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/admtek/adm8211.c 
b/drivers/net/wireless/admtek/adm8211.c
index 70ecd82d674d..2b4a3eb38dfa 100644
--- a/drivers/net/wireless/admtek/adm8211.c
+++ b/drivers/net/wireless/admtek/adm8211.c
@@ -413,6 +413,13 @@ static void adm8211_interrupt_rci(struct ieee80211_hw *dev)
   skb_tail_pointer(newskb),
   RX_PKT_SIZE,
   PCI_DMA_FROMDEVICE);
+   if (pci_dma_mapping_error(priv->pdev,
+  priv->rx_buffers[entry].mapping)) {
+   priv->rx_buffers[entry].skb = NULL;
+   dev_kfree_skb(newskb);
+   skb = NULL;
+   /* TODO: update rx dropped stats */
+   }
} else {
skb = NULL;
/* TODO: update rx dropped stats */
@@ -1450,6 +1457,12 @@ static int adm8211_init_rings(struct ieee80211_hw *dev)
  
skb_tail_pointer(rx_info->skb),
  RX_PKT_SIZE,
  PCI_DMA_FROMDEVICE);
+   if (pci_dma_mapping_error(priv->pdev, rx_info->mapping)) {
+   dev_kfree_skb(rx_info->skb);
+   rx_info->skb = NULL;
+   break;
+   }
+
desc->buffer1 = cpu_to_le32(rx_info->mapping);
desc->status = cpu_to_le32(RDES0_STATUS_OWN | RDES0_STATUS_SQL);
}
@@ -1613,7 +1626,7 @@ static void adm8211_calc_durations(int *dur, int *plcp, 
size_t payload_len, int
 }
 
 /* Transmit skb w/adm8211_tx_hdr (802.11 header created by hardware) */
-static void adm8211_tx_raw(struct ieee80211_hw *dev, struct sk_buff *skb,
+static int adm8211_tx_raw(struct ieee80211_hw *dev, struct sk_buff *skb,
   u16 plcp_signal,
   size_t hdrlen)
 {
@@ -1625,6 +1638,8 @@ static void adm8211_tx_raw(struct ieee80211_hw *dev, 
struct sk_buff *skb,
 
mapping = pci_map_single(priv->pdev, skb->data, skb->len,
 PCI_DMA_TODEVICE);
+   if (pci_dma_mapping_error(priv->pdev, mapping))
+   return -ENOMEM;
 
spin_lock_irqsave(&priv->lock, flags);
 
@@ -1657,6 +1672,8 @@ static void adm8211_tx_raw(struct ieee80211_hw *dev, 
struct sk_buff *skb,
 
/* Trigger transmit poll */
ADM8211_CSR_WRITE(TDR, 0);
+
+   return 0;
 }
 
 /* Put adm8211_tx_hdr on skb and transmit */
@@ -1710,7 +1727,10 @@ static void adm8211_tx(struct ieee80211_hw *dev,
 
txhdr->retry_limit = info->control.rates[0].count;
 
-   adm8211_tx_raw(dev, skb, plcp_signal, hdrlen);
+   if (adm8211_tx_raw(dev, skb, plcp_signal, hdrlen)) {
+   /* Drop packet */
+   ieee80211_free_txskb(dev, skb);
+   }
 }
 
 static int adm8211_alloc_rings(struct ieee80211_hw *dev)
-- 
2.7.4



Re: [PATCH next] dctcp: update cwnd on congestion event

2016-12-02 Thread Florian Westphal
Neal Cardwell  wrote:
> On Mon, Nov 14, 2016 at 10:42 AM, Florian Westphal  wrote:
> >
> > draft-ietf-tcpm-dctcp-02 says:
> >
> > ... when the sender receives an indication of congestion
> > (ECE), the sender SHOULD update cwnd as follows:
> >
> >  cwnd = cwnd * (1 - DCTCP.Alpha / 2)
> >
> > So, let's do this and reduce cwnd more smoothly (and faster), as per
> > current congestion estimate.
> 
> AFAICT this is doing a multiplicative decrease of cwnd on every ACK
> that has an ECE bit.
> 
> If I am reading the code correctly, then I would have two concerns:
> 
> 1) Has that been tested? That seems like an extremely dramatic
> decrease in cwnd. For example, if the cwnd is 80, and there are 40
> ACKs, and half the ACKs are ECE marked, then my back-of-the-envelope
> calculations seem to suggest that after just 11 ACKs the cwnd would be
> down to a minimal value of 2:
> 
> ack 1 cwnd=60
> ack 2 cwnd=45
> ack 3 cwnd=33
[..]

You are assuming alpha = 0.5?
Then, yes, looks correct.  Since some of these acks will most likely
also end an observation window, acks might also cause a change to alpha.

> 2) That seems to contradict another passage in the draft (v 02 or 03). Consider
>  https://tools.ietf.org/html/draft-ietf-tcpm-dctcp-03
> where it says
> 
>Just as specified in [RFC3168], DCTCP does not react to congestion
>indications more than once for every window of data.
> 
> So the draft seems to advocate not reacting to congestion indications
> more than once per window. Yet this patch reacts on every ECE-marked
> ACK within a window.
> 
> Am I reading something incorrectly?

No, I will raise this on tcpm next Monday (if you want you
can of course do this yourself).

Would be easy to make it so this cwnd update only happens once in each
observation cycle, but it would be even better if this would get input
from draft authors.

Thanks Neal!


Re: [PATCHv2 net-next 2/4] net: dsa: mv88e6xxx: Monitor and Management tables

2016-12-02 Thread Andrew Lunn
On Fri, Dec 02, 2016 at 02:32:39PM -0500, Vivien Didelot wrote:
> Hi Andrew,
> 
> Andrew Lunn  writes:
> 
> > @@ -3184,6 +3186,8 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
> > .stats_get_sset_count = mv88e6095_stats_get_sset_count,
> > .stats_get_strings = mv88e6095_stats_get_strings,
> > .stats_get_stats = mv88e6095_stats_get_stats,
> > +   .g1_set_cpu_port = mv88e6095_g1_set_cpu_port,
> > +   .g1_set_egress_port = mv88e6095_g1_set_egress_port,
> >  };
> 
> I like the implementation in this version better. But please explain me
> why you are prefixing these operations with g1_?

The prefix gives some basic grouping. port_ indicates it operates on a
port, and is likely to be found in port.c. stats_ indicates it
operates on statistics, ppu_ that it operates on the PHY polling unit.

We are going to have some things which don't fall into a simple
category, like these two. But it would however be nice to group them,
so i picked which register bank they are in. These operations are
always in g1. It is a useful hint as to where to find the different
variants.

> But let's imagine we can set the CPU port in some Global 2 registers.
> You are going to wrap this in chip.c with something like:
> 
> int mv88e6xxx_set_cpu_port(struct mv88e6xxx_chip *chip, int port)
> {
> if (chip->info->ops->g2_set_cpu_port)
> return chip->info->ops->g2_set_cpu_port(chip, port);
> else if (chip->info->ops->g1_set_cpu_port)
> return chip->info->ops->g1_set_cpu_port(chip, port);
> else
> return -EOPNOTSUPP;
> }

I answered in one of my other emails. Frames with reserved MAC
addresses can be forwarded to the CPU. For most devices, this is a g2
operation. However, for 6390, it is a g1. In that case, my code does
not use a prefix. Not having a prefix, when all the others do, also
gives you information. It means the ops are spread around and you need
to make a bigger effort to go find them.

   Andrew



Re: [PATCHv2 net-next 4/4] net: dsa: mv88e6xxx: Refactor CPU and DSA port setup

2016-12-02 Thread Andrew Lunn
> The port's EgressMode, FrameMode and EtherType are really tied together
> to compose the mode of the port.

Setting the EtherType is somewhat separate. It is only needed on ports
using EDSA. And that can only happen on a CPU port. Humm, actually, i
set it when i should not. But putting this in a wrapper actually hides
this.

> Could you add an helper in chip.c like:
> 
> static int mv88e6xxx_set_port_mode(struct mv88e6xxx_chip *chip, int port,
>enum mv88e6xxx_frame_mode frame_mode,
>u16 egress_mode, bool egress_unknown,
>u16 ethertype)
> {
> int err;
> 
> if (chip->info->ops->port_set_frame_mode) {
> err = chip->info->ops->port_set_frame_mode(chip, port, 
> frame_mode);
> if (err)
> return err;
> }

Ignoring that it is not implemented here is wrong. It must be
implemented, or the device is not going to work. It is a question of,
do we want an oops, or return an error code.

New version coming.

Andrew


Re: [PATCHv2 net-next 4/4] net: dsa: mv88e6xxx: Refactor CPU and DSA port setup

2016-12-02 Thread Vivien Didelot
Hi Andrew,

Andrew Lunn  writes:

> +static int mv88e6xxx_setup_port_dsa(struct mv88e6xxx_chip *chip, int port,
> + int upstream_port)
> +{
> + int err;
> +
> + err = chip->info->ops->port_set_frame_mode(
> + chip, port, MV88E6XXX_FRAME_MODE_DSA);
> + if (err)
> + return err;
> +
> + err = chip->info->ops->port_set_egress_unknowns(
> + chip, port, port == upstream_port);
> + if (err)
> + return err;
> +
> + if (chip->info->ops->port_set_ether_type)
> + return chip->info->ops->port_set_ether_type(
> + chip, port, ETH_P_EDSA);
> +
> + return 0;
> +}
> +
> +static int mv88e6xxx_setup_port_cpu(struct mv88e6xxx_chip *chip, int port)
> +{
> + int err;
> +
> + switch (chip->info->tag_protocol) {
> + case DSA_TAG_PROTO_EDSA:
> + err = chip->info->ops->port_set_frame_mode(
> + chip, port, MV88E6XXX_FRAME_MODE_ETHERTYPE);
> + if (err)
> + return err;
> +
> + err = mv88e6xxx_port_set_egress_mode(
> + chip, port, PORT_CONTROL_EGRESS_ADD_TAG);
> + if (err)
> + return err;
> +
> + if (chip->info->ops->port_set_ether_type)
> + err = chip->info->ops->port_set_ether_type(
> + chip, port, ETH_P_EDSA);
> + break;
> +
> + case DSA_TAG_PROTO_DSA:
> + err = chip->info->ops->port_set_frame_mode(
> + chip, port, MV88E6XXX_FRAME_MODE_DSA);
> + if (err)
> + return err;
> +
> + err = mv88e6xxx_port_set_egress_mode(
> + chip, port, PORT_CONTROL_EGRESS_UNMODIFIED);
> + break;
> + default:
> + err = -EINVAL;
> + }
> +
> + if (err)
> + return err;
> +
> + return chip->info->ops->port_set_egress_unknowns(chip, port, true);
> +}
> +
> +static int mv88e6xxx_setup_port_normal(struct mv88e6xxx_chip *chip, int port)
> +{
> + int err;
> +
> + err = chip->info->ops->port_set_frame_mode(
> + chip, port, MV88E6XXX_FRAME_MODE_NORMAL);
> + if (err)
> + return err;
> +
> + return chip->info->ops->port_set_egress_unknowns(chip, port, false);
> +}

The port's EgressMode, FrameMode and EtherType are really tied together
to compose the mode of the port. Could you add a helper in chip.c like:

static int mv88e6xxx_set_port_mode(struct mv88e6xxx_chip *chip, int port,
   enum mv88e6xxx_frame_mode frame_mode,
   u16 egress_mode, bool egress_unknown,
   u16 ethertype)
{
int err;

if (chip->info->ops->port_set_frame_mode) {
err = chip->info->ops->port_set_frame_mode(chip, port, frame_mode);
if (err)
return err;
}

err = mv88e6xxx_port_set_egress_mode(chip, port, egress_mode);
if (err)
return err;

if (chip->info->ops->port_set_egress_unknown) {
err = chip->info->ops->port_set_egress_unknown(chip, port, 
egress_unknown);
if (err)
return err;
}

if (chip->info->ops->port_set_ether_type) {
err = chip->info->ops->port_set_ether_type(chip, port, ethertype);
if (err)
return err;
}

return 0;
}

So that we correctly check for ops before calling them, and make
mv88e6xxx_setup_port_{dsa,cpu,normal} a bit more concise.

> +
>  static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
>  {
>   struct dsa_switch *ds = chip->ds;
> @@ -2473,44 +2542,25 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
> *chip, int port)
>* If this is the upstream port for this switch, enable
>* forwarding of unknown unicasts and multicasts.
>*/
> - reg = 0;
> - if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) ||
> - mv88e6xxx_6165_family(chip) || mv88e6xxx_6097_family(chip) ||
> - mv88e6xxx_6095_family(chip) || mv88e6xxx_6065_family(chip) ||
> - mv88e6xxx_6185_family(chip) || mv88e6xxx_6320_family(chip))
> - reg = PORT_CONTROL_IGMP_MLD_SNOOP |
> + reg = PORT_CONTROL_IGMP_MLD_SNOOP |
>   PORT_CONTROL_USE_TAG | PORT_CONTROL_USE_IP |
>   PORT_CONTROL_STATE_FORWARDING;
> + err = mv88e6xxx_port_write(chip, port, PORT_CONTROL, reg);
> + if (err)
> + return err;
> +
>   if (dsa_is_cpu_port(ds, port)) {
> - if (chip->info->tag_protocol == DSA_TAG_PROTO_EDSA)
> - reg |= PORT_CONTROL_FRAME_ETHER_TYPE_DSA |
> - PORT_CONTROL_FORWARD_UNKNOWN_MC;
> - else
> - reg |= PORT_CONTROL_DS

Re: [PATCH net-next 0/4] tcp: tsq: performance series

2016-12-02 Thread Eric Dumazet
On Fri, Dec 2, 2016 at 10:25 AM, Eric Dumazet  wrote:
> Under very high TX stress, CPU handling NIC TX completions can spend
> considerable amount of cycles handling TSQ (TCP Small Queues) logic.
>
> This patch series avoids some atomic operations, but more important
> patch is the 3rd one, allowing other cpus processing ACK packets and
> calling tcp_write_xmit() to grab TCP_TSQ_DEFERRED so that
> tcp_tasklet_func() can skip already processed sockets.
>
> This avoid lots of lock acquisitions and cache lines accesses,
> particularly under load.
>

Please do not merge this version.

I probably messed something up; I need to run more tests.

Thanks.


Re: [PATCH next] dctcp: update cwnd on congestion event

2016-12-02 Thread Neal Cardwell
On Mon, Nov 14, 2016 at 10:42 AM, Florian Westphal  wrote:
>
> draft-ietf-tcpm-dctcp-02 says:
>
> ... when the sender receives an indication of congestion
> (ECE), the sender SHOULD update cwnd as follows:
>
>  cwnd = cwnd * (1 - DCTCP.Alpha / 2)
>
> So, lets do this and reduce cwnd more smoothly (and faster), as per
> current congestion estimate.

AFAICT this is doing a multiplicative decrease of cwnd on every ACK
that has an ECE bit.

If I am reading the code correctly, then I would have two concerns:

1) Has that been tested? That seems like an extremely dramatic
decrease in cwnd. For example, if the cwnd is 80, and there are 40
ACKs, and half the ACKs are ECE marked, then my back-of-the-envelope
calculations seem to suggest that after just 11 ACKs the cwnd would be
down to a minimal value of 2:

ack 1 cwnd=60
ack 2 cwnd=45
ack 3 cwnd=33
ack 4 cwnd=24
ack 5 cwnd=18
ack 6 cwnd=13
ack 7 cwnd=9
ack 8 cwnd=6
ack 9 cwnd=4
ack 10 cwnd=3
ack 11 cwnd=2

2) That seems to contradict another passage in the draft (v 02 or 03). Consider
 https://tools.ietf.org/html/draft-ietf-tcpm-dctcp-03
where it says

   Just as specified in [RFC3168], DCTCP does not react to congestion
   indications more than once for every window of data.

So the draft seems to advocate not reacting to congestion indications
more than once per window. Yet this patch reacts on every ECE-marked
ACK within a window.

Am I reading something incorrectly?

cheers,
neal


Re: [PATCHv2 net-next 3/4] net: dsa: mv88e6xxx: Move the tagging protocol into info

2016-12-02 Thread Andrew Lunn
On Fri, Dec 02, 2016 at 02:41:08PM -0500, Vivien Didelot wrote:
> Hi Andrew,
> 
> Andrew Lunn  writes:
> 
> > @@ -3749,6 +3756,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] 
> > = {
> > .global1_addr = 0x1b,
> > .age_time_coeff = 15000,
> > .g1_irqs = 9,
> > +   .tag_protocol = DSA_TAG_PROTO_EDSA,
> > .flags = MV88E6XXX_FLAGS_FAMILY_6352,
> > .ops = &mv88e6172_ops,
> > },
> 
> Since some chips support several protocols, we will have to turn
> tag_protocol into a bitmask and introduce something like:

Why? We have made an implementation choice, this chip will be used in
this way. There is no strong reason to use it the other way. There is
a strong reason not to allow it to be configured, because it makes the
driver more complex and the DSA layer more complex, and no other
driver requires this complexity.

KISS.

Andrew


[net-next PATCH v4 6/6] virtio_net: xdp, add slowpath case for non contiguous buffers

2016-12-02 Thread John Fastabend
virtio_net XDP support expects receive buffers to be contiguous.
If this is not the case we enable a slowpath to allow connectivity
to continue but at a significant performance overhead associated with
linearizing data. To make users painfully aware that XDP is
running in a degraded mode we throw an xdp buffer error.

To linearize packets we allocate a page and copy the segments of
the data, including the header, into it. After this the page can be
handled by XDP code flow as normal.

Then depending on the return code the page is either freed or sent
to the XDP xmit path. There is no attempt to optimize this path.

This case is being handled simply as a precaution in case some
unknown backend were to generate packets in this form. To test this
I had to hack qemu and force it to generate these packets. I do not
expect this case to be generated by "real" backends.

Signed-off-by: John Fastabend 
---
 drivers/net/virtio_net.c |   77 +-
 1 file changed, 75 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 137caba..13f463d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -456,6 +456,64 @@ static struct sk_buff *receive_big(struct net_device *dev,
return NULL;
 }
 
+/* The conditions to enable XDP should preclude the underlying device from
+ * sending packets across multiple buffers (num_buf > 1). However per spec
+ * it does not appear to be illegal to do so but rather just against 
convention.
+ * So in order to avoid making a system unresponsive the packets are pushed
+ * into a page and the XDP program is run. This will be extremely slow and we
+ * push a warning to the user to fix this as soon as possible. Fixing this may
+ * require resolving the underlying hardware to determine why multiple buffers
+ * are being received or simply loading the XDP program in the ingress stack
+ * after the skb is built because there is no advantage to running it here
+ * anymore.
+ */
+static struct page *xdp_linearize_page(struct receive_queue *rq,
+  u16 num_buf,
+  struct page *p,
+  int offset,
+  unsigned int *len)
+{
+   struct page *page = alloc_page(GFP_ATOMIC);
+   unsigned int page_off = 0;
+
+   if (!page)
+   return NULL;
+
+   memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
+   page_off += *len;
+
+   while (--num_buf) {
+   unsigned int buflen;
+   unsigned long ctx;
+   void *buf;
+   int off;
+
+   ctx = (unsigned long)virtqueue_get_buf(rq->vq, &buflen);
+   if (unlikely(!ctx))
+   goto err_buf;
+
+   /* guard against a misconfigured or uncooperative backend that
+* is sending packet larger than the MTU.
+*/
+   if ((page_off + buflen) > PAGE_SIZE)
+   goto err_buf;
+
+   buf = mergeable_ctx_to_buf_address(ctx);
+   p = virt_to_head_page(buf);
+   off = buf - page_address(p);
+
+   memcpy(page_address(page) + page_off,
+  page_address(p) + off, buflen);
+   page_off += buflen;
+   }
+
+   *len = page_off;
+   return page;
+err_buf:
+   __free_pages(page, 0);
+   return NULL;
+}
+
 static struct sk_buff *receive_mergeable(struct net_device *dev,
 struct virtnet_info *vi,
 struct receive_queue *rq,
@@ -476,6 +534,7 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
if (xdp_prog) {
+   struct page *xdp_page;
u32 act;
 
/* No known backend devices should send packets with
@@ -485,7 +544,15 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
 */
if (unlikely(num_buf > 1)) {
bpf_warn_invalid_xdp_buffer();
-   goto err_xdp;
+
+   /* linearize data for XDP */
+   xdp_page = xdp_linearize_page(rq, num_buf,
+ page, offset, &len);
+   if (!xdp_page)
+   goto err_xdp;
+   offset = 0;
+   } else {
+   xdp_page = page;
}
 
/* Transient failure which in theory could occur if
@@ -496,15 +563,21 @@ static struct sk_buff *receive_mergeable(struct 
net_device *dev,
if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
goto err_xdp;
 
-   act = do_xdp_prog

[net-next PATCH v4 5/6] virtio_net: add XDP_TX support

2016-12-02 Thread John Fastabend
This adds support for the XDP_TX action to virtio_net. When an XDP
program is run and returns the XDP_TX action the virtio_net XDP
implementation will transmit the packet on a TX queue that aligns
with the current CPU that the XDP packet was processed on.

Before sending the packet the header is zeroed.  Also XDP is expected
to handle checksum correctly so no checksum offload support is
provided.

Signed-off-by: John Fastabend 
---
 drivers/net/virtio_net.c |   63 --
 1 file changed, 60 insertions(+), 3 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b67203e..137caba 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -330,12 +330,43 @@ static struct sk_buff *page_to_skb(struct virtnet_info 
*vi,
return skb;
 }
 
+static void virtnet_xdp_xmit(struct virtnet_info *vi,
+unsigned int qnum, struct xdp_buff *xdp)
+{
+   struct send_queue *sq = &vi->sq[qnum];
+   struct virtio_net_hdr_mrg_rxbuf *hdr;
+   unsigned int num_sg, len;
+   void *xdp_sent;
+   int err;
+
+   /* Free up any pending old buffers before queueing new ones. */
+   while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
+   struct page *page = virt_to_head_page(xdp_sent);
+
+   put_page(page);
+   }
+
+   /* Zero header and leave csum up to XDP layers */
+   hdr = xdp->data;
+   memset(hdr, 0, vi->hdr_len);
+
+   num_sg = 1;
+   sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
+   err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
+  xdp->data, GFP_ATOMIC);
+   if (unlikely(err))
+   put_page(virt_to_head_page(xdp->data));
+   else
+   virtqueue_kick(sq->vq);
+}
+
 static u32 do_xdp_prog(struct virtnet_info *vi,
   struct bpf_prog *xdp_prog,
   struct page *page, int offset, int len)
 {
int hdr_padded_len;
struct xdp_buff xdp;
+   unsigned int qp;
u32 act;
u8 *buf;
 
@@ -353,9 +384,15 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
switch (act) {
case XDP_PASS:
return XDP_PASS;
+   case XDP_TX:
+   qp = vi->curr_queue_pairs -
+   vi->xdp_queue_pairs +
+   smp_processor_id();
+   xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4);
+   virtnet_xdp_xmit(vi, qp, &xdp);
+   return XDP_TX;
default:
bpf_warn_invalid_xdp_action(act);
-   case XDP_TX:
case XDP_ABORTED:
case XDP_DROP:
return XDP_DROP;
@@ -391,8 +428,16 @@ static struct sk_buff *receive_big(struct net_device *dev,
if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
goto err_xdp;
act = do_xdp_prog(vi, xdp_prog, page, 0, len);
-   if (act == XDP_DROP)
+   switch (act) {
+   case XDP_PASS:
+   break;
+   case XDP_TX:
+   rcu_read_unlock();
+   goto xdp_xmit;
+   case XDP_DROP:
+   default:
goto err_xdp;
+   }
}
rcu_read_unlock();
 
@@ -407,6 +452,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
 err:
dev->stats.rx_dropped++;
give_pages(rq, page);
+xdp_xmit:
return NULL;
 }
 
@@ -425,6 +471,8 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
struct bpf_prog *xdp_prog;
unsigned int truesize;
 
+   head_skb = NULL;
+
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
if (xdp_prog) {
@@ -449,8 +497,16 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
goto err_xdp;
 
act = do_xdp_prog(vi, xdp_prog, page, offset, len);
-   if (act == XDP_DROP)
+   switch (act) {
+   case XDP_PASS:
+   break;
+   case XDP_TX:
+   rcu_read_unlock();
+   goto xdp_xmit;
+   case XDP_DROP:
+   default:
goto err_xdp;
+   }
}
rcu_read_unlock();
 
@@ -528,6 +584,7 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
 err_buf:
dev->stats.rx_dropped++;
dev_kfree_skb(head_skb);
+xdp_xmit:
return NULL;
 }
 



[net-next PATCH v4 4/6] virtio_net: add dedicated XDP transmit queues

2016-12-02 Thread John Fastabend
XDP requires using isolated transmit queues to avoid interference
with the normal networking stack (BQL, NETDEV_TX_BUSY, etc). This patch
adds an XDP queue per cpu when an XDP program is loaded and does not
expose the queues to the OS via the normal API call to
netif_set_real_num_tx_queues(). This way the stack will never push
an skb to these queues.

However virtio/vhost/qemu implementation only allows for creating
TX/RX queue pairs at this time so creating only TX queues was not
possible. And because the associated RX queues are being created I
went ahead and exposed these to the stack and let the backend use
them. This creates more RX queues visible to the network stack than
TX queues which is worth mentioning but does not cause any issues as
far as I can tell.

Signed-off-by: John Fastabend 
---
 drivers/net/virtio_net.c |   30 --
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 80b1cfc..b67203e 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -114,6 +114,9 @@ struct virtnet_info {
/* # of queue pairs currently used by the driver */
u16 curr_queue_pairs;
 
+   /* # of XDP queue pairs currently used by the driver */
+   u16 xdp_queue_pairs;
+
/* I like... big packets and I cannot lie! */
bool big_packets;
 
@@ -1552,7 +1555,8 @@ static int virtnet_xdp_set(struct net_device *dev, struct 
bpf_prog *prog)
unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
struct virtnet_info *vi = netdev_priv(dev);
struct bpf_prog *old_prog;
-   int i;
+   u16 xdp_qp = 0, curr_qp;
+   int i, err;
 
if ((dev->features & NETIF_F_LRO) && prog) {
netdev_warn(dev, "can't set XDP while LRO is on, disable LRO 
first\n");
@@ -1569,12 +1573,34 @@ static int virtnet_xdp_set(struct net_device *dev, 
struct bpf_prog *prog)
return -EINVAL;
}
 
+   curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
+   if (prog)
+   xdp_qp = nr_cpu_ids;
+
+   /* XDP requires extra queues for XDP_TX */
+   if (curr_qp + xdp_qp > vi->max_queue_pairs) {
+   netdev_warn(dev, "request %i queues but max is %i\n",
+   curr_qp + xdp_qp, vi->max_queue_pairs);
+   return -ENOMEM;
+   }
+
+   err = virtnet_set_queues(vi, curr_qp + xdp_qp);
+   if (err) {
+   dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
+   return err;
+   }
+
if (prog) {
prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
-   if (IS_ERR(prog))
+   if (IS_ERR(prog)) {
+   virtnet_set_queues(vi, curr_qp);
return PTR_ERR(prog);
+   }
}
 
+   vi->xdp_queue_pairs = xdp_qp;
+   netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
+
for (i = 0; i < vi->max_queue_pairs; i++) {
old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
rcu_assign_pointer(vi->rq[i].xdp_prog, prog);



[net-next PATCH v4 3/6] virtio_net: Add XDP support

2016-12-02 Thread John Fastabend
From: John Fastabend 

This adds XDP support to virtio_net. Some requirements must be
met for XDP to be enabled depending on the mode. First it will
only be supported with LRO disabled so that data is not pushed
across multiple buffers. Second the MTU must be less than a page
size to avoid having to handle XDP across multiple pages.

If mergeable receive is enabled this patch only supports the case
where header and data are in the same buf which we can check when
a packet is received by looking at num_buf. If the num_buf is
greater than 1 and an XDP program is loaded the packet is dropped
and a warning is thrown. When any_header_sg is set this does not
happen and both header and data are put in a single buffer as expected
so we check this when XDP programs are loaded.  Subsequent patches
will process the packet in a degraded mode to ensure connectivity
and correctness are not lost even if the backend pushes packets into
multiple buffers.

If big packets mode is enabled and MTU/LRO conditions above are
met then XDP is allowed.

This patch was tested with qemu with vhost=on and vhost=off where
mergeable and big_packet modes were forced via hard coding feature
negotiation. Multiple buffers per packet was forced via a small
test patch to vhost.c in the vhost=on qemu mode.

Suggested-by: Shrijeet Mukherjee 
Signed-off-by: John Fastabend 
---
 drivers/net/virtio_net.c |  175 +-
 1 file changed, 170 insertions(+), 5 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index d814e7cb..80b1cfc 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -81,6 +82,8 @@ struct receive_queue {
 
struct napi_struct napi;
 
+   struct bpf_prog __rcu *xdp_prog;
+
/* Chain pages by the private ptr. */
struct page *pages;
 
@@ -324,6 +327,38 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
return skb;
 }
 
+static u32 do_xdp_prog(struct virtnet_info *vi,
+  struct bpf_prog *xdp_prog,
+  struct page *page, int offset, int len)
+{
+   int hdr_padded_len;
+   struct xdp_buff xdp;
+   u32 act;
+   u8 *buf;
+
+   buf = page_address(page) + offset;
+
+   if (vi->mergeable_rx_bufs)
+   hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+   else
+   hdr_padded_len = sizeof(struct padded_vnet_hdr);
+
+   xdp.data = buf + hdr_padded_len;
+   xdp.data_end = xdp.data + (len - vi->hdr_len);
+
+   act = bpf_prog_run_xdp(xdp_prog, &xdp);
+   switch (act) {
+   case XDP_PASS:
+   return XDP_PASS;
+   default:
+   bpf_warn_invalid_xdp_action(act);
+   case XDP_TX:
+   case XDP_ABORTED:
+   case XDP_DROP:
+   return XDP_DROP;
+   }
+}
+
 static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, 
unsigned int len)
 {
struct sk_buff * skb = buf;
@@ -340,14 +375,32 @@ static struct sk_buff *receive_big(struct net_device *dev,
   void *buf,
   unsigned int len)
 {
+   struct bpf_prog *xdp_prog;
struct page *page = buf;
-   struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
+   struct sk_buff *skb;
 
+   rcu_read_lock();
+   xdp_prog = rcu_dereference(rq->xdp_prog);
+   if (xdp_prog) {
+   struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
+   u32 act;
+
+   if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
+   goto err_xdp;
+   act = do_xdp_prog(vi, xdp_prog, page, 0, len);
+   if (act == XDP_DROP)
+   goto err_xdp;
+   }
+   rcu_read_unlock();
+
+   skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
if (unlikely(!skb))
goto err;
 
return skb;
 
+err_xdp:
+   rcu_read_unlock();
 err:
dev->stats.rx_dropped++;
give_pages(rq, page);
@@ -365,11 +418,42 @@ static struct sk_buff *receive_mergeable(struct 
net_device *dev,
u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
struct page *page = virt_to_head_page(buf);
int offset = buf - page_address(page);
-   unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
+   struct sk_buff *head_skb, *curr_skb;
+   struct bpf_prog *xdp_prog;
+   unsigned int truesize;
+
+   rcu_read_lock();
+   xdp_prog = rcu_dereference(rq->xdp_prog);
+   if (xdp_prog) {
+   u32 act;
+
+   /* No known backend devices should send packets with
+* more than a single buffer when XDP conditions are
+* met. However it is not strictly illegal so the case
+* is handled as an exception and a warning is thrown.
+   

[net-next PATCH v4 2/6] net: xdp: add invalid buffer warning

2016-12-02 Thread John Fastabend
This adds a warning for drivers to use when encountering an invalid
buffer for XDP. In normal cases this should not happen, but a standard
warning is useful for catching behavior from the emulation layer in
virtual/qemu setups that I may not have expected.

Signed-off-by: John Fastabend 
---
 include/linux/filter.h |1 +
 net/core/filter.c  |6 ++
 2 files changed, 7 insertions(+)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7f246a2..90dfc3c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -595,6 +595,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter 
__user *filter,
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
   const struct bpf_insn *patch, u32 len);
 void bpf_warn_invalid_xdp_action(u32 act);
+void bpf_warn_invalid_xdp_buffer(void);
 
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
diff --git a/net/core/filter.c b/net/core/filter.c
index 698a262..7926dd0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2783,6 +2783,12 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
+void bpf_warn_invalid_xdp_buffer(void)
+{
+   WARN_ONCE(1, "Illegal XDP buffer encountered, expect throughput 
degradation\n");
+}
+EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_buffer);
+
 static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
int src_reg, int ctx_off,
struct bpf_insn *insn_buf,



[net-next PATCH v4 1/6] net: virtio dynamically disable/enable LRO

2016-12-02 Thread John Fastabend
This adds support for dynamically setting the LRO feature flag. The
message to control guest features in the backend uses the
CTRL_GUEST_OFFLOADS msg type.

Signed-off-by: John Fastabend 
---
 drivers/net/virtio_net.c |   45 -
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index a21d93a..d814e7cb 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1419,6 +1419,41 @@ static void virtnet_init_settings(struct net_device *dev)
.set_settings = virtnet_set_settings,
 };
 
+static int virtnet_set_features(struct net_device *netdev,
+   netdev_features_t features)
+{
+   struct virtnet_info *vi = netdev_priv(netdev);
+   struct virtio_device *vdev = vi->vdev;
+   struct scatterlist sg;
+   u64 offloads = 0;
+
+   if (features & NETIF_F_LRO)
+   offloads |= (1 << VIRTIO_NET_F_GUEST_TSO4) |
+   (1 << VIRTIO_NET_F_GUEST_TSO6);
+
+   if (features & NETIF_F_RXCSUM)
+   offloads |= (1 << VIRTIO_NET_F_GUEST_CSUM);
+
+   if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
+   sg_init_one(&sg, &offloads, sizeof(uint64_t));
+   if (!virtnet_send_command(vi,
+ VIRTIO_NET_CTRL_GUEST_OFFLOADS,
+ VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
+ &sg)) {
+   dev_warn(&netdev->dev,
+"Failed to set guest offloads by virtnet 
command.\n");
+   return -EINVAL;
+   }
+   } else if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) &&
+  !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+   dev_warn(&netdev->dev,
+"No support for setting offloads pre version_1.\n");
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static const struct net_device_ops virtnet_netdev = {
.ndo_open= virtnet_open,
.ndo_stop= virtnet_close,
@@ -1435,6 +1470,7 @@ static void virtnet_init_settings(struct net_device *dev)
 #ifdef CONFIG_NET_RX_BUSY_POLL
.ndo_busy_poll  = virtnet_busy_poll,
 #endif
+   .ndo_set_features   = virtnet_set_features,
 };
 
 static void virtnet_config_changed_work(struct work_struct *work)
@@ -1815,6 +1851,12 @@ static int virtnet_probe(struct virtio_device *vdev)
if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
dev->features |= NETIF_F_RXCSUM;
 
+   if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) &&
+   virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) {
+   dev->features |= NETIF_F_LRO;
+   dev->hw_features |= NETIF_F_LRO;
+   }
+
dev->vlan_features = dev->features;
 
/* MTU range: 68 - 65535 */
@@ -2057,7 +2099,8 @@ static int virtnet_restore(struct virtio_device *vdev)
VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
VIRTIO_NET_F_CTRL_MAC_ADDR, \
-   VIRTIO_NET_F_MTU
+   VIRTIO_NET_F_MTU, \
+   VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
 
 static unsigned int features[] = {
VIRTNET_FEATURES,



[net-next PATCH v4 0/6] XDP for virtio_net

2016-12-02 Thread John Fastabend
This implements XDP for virtio_net in the mergeable buffers and big_packet
modes. I tested this with vhost_net running on qemu and did not see
any issues. For testing num_buf > 1 I added a hack to vhost driver
to only use 100 bytes per buffer so that packets were pushed across
multiple buffers.

There are some restrictions for XDP to be enabled and work well
(see patch 3 for more details).

  1. LRO must be off
  2. MTU must be less than PAGE_SIZE
  3. queues must be available to dedicate to XDP
  4. num_bufs received in mergeable buffers must be 1
  5. big_packet mode must have all data on single page

Please review; any comments/feedback are welcome, as always.

---

John Fastabend (6):
  net: virtio dynamically disable/enable LRO
  net: xdp: add invalid buffer warning
  virtio_net: Add XDP support
  virtio_net: add dedicated XDP transmit queues
  virtio_net: add XDP_TX support
  virtio_net: xdp, add slowpath case for non contiguous buffers


 drivers/net/virtio_net.c |  376 +-
 include/linux/filter.h   |1 
 net/core/filter.c|6 +
 3 files changed, 377 insertions(+), 6 deletions(-)

--
Signature


Re: [PATCH] net: wireless: realtek: constify rate_control_ops structures

2016-12-02 Thread Larry Finger

On 12/02/2016 03:50 AM, Bhumika Goyal wrote:

The structures rate_control_ops are only passed as an argument to the
functions ieee80211_rate_control_{register/unregister}. This argument is
of type const, so rate_control_ops having this property can also be
declared as const.
Done using Coccinelle:

@r1 disable optional_qualifier @
identifier i;
position p;
@@
static struct rate_control_ops i@p = {...};

@ok1@
identifier r1.i;
position p;
@@
ieee80211_rate_control_register(&i@p)

@ok2@
identifier r1.i;
position p;
@@
ieee80211_rate_control_unregister(&i@p)

@bad@
position p!={r1.p,ok1.p,ok2.p};
identifier r1.i;
@@
i@p

@depends on !bad disable optional_qualifier@
identifier r1.i;
@@
static
+const
struct rate_control_ops i={...};

@depends on !bad disable optional_qualifier@
identifier r1.i;
@@
+const
struct rate_control_ops i;

File size before:
   text    data     bss     dec     hex filename
   1991     104       0    2095     82f wireless/realtek/rtlwifi/rc.o

File size after:
   text    data     bss     dec     hex filename
   2095       0       0    2095     82f wireless/realtek/rtlwifi/rc.o

Signed-off-by: Bhumika Goyal 
---
 drivers/net/wireless/realtek/rtlwifi/rc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/rc.c 
b/drivers/net/wireless/realtek/rtlwifi/rc.c
index ce8621a..107c13c 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rc.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rc.c
@@ -284,7 +284,7 @@ static void rtl_rate_free_sta(void *rtlpriv,
kfree(rate_priv);
 }

-static struct rate_control_ops rtl_rate_ops = {
+static const struct rate_control_ops rtl_rate_ops = {
.name = "rtl_rc",
.alloc = rtl_rate_alloc,
.free = rtl_rate_free,



The content of your patch is OK; however, your subject is not. By convention, 
"net: wireless: realtek:" is assumed. We do, however, include "rtlwifi:" to 
indicate which part of drivers/net/wireless/realtek/ is referenced.


NACK

Larry



[PATCH v3 05/13] net: ethernet: ti: cpts: fix registration order

2016-12-02 Thread Grygorii Strashko
The ptp clock is registered before the spinlock which protects it is
initialized, and before timecounter and cyclecounter initialization in
cpts_register().

So, ensure that the ptp clock is registered last, after everything
else is done.

Acked-by: Richard Cochran 
Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpts.c | 24 ++--
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c
index 61198f1..3dda6d5 100644
--- a/drivers/net/ethernet/ti/cpts.c
+++ b/drivers/net/ethernet/ti/cpts.c
@@ -356,15 +356,8 @@ int cpts_register(struct device *dev, struct cpts *cpts,
  u32 mult, u32 shift)
 {
int err, i;
-   unsigned long flags;
 
cpts->info = cpts_info;
-   cpts->clock = ptp_clock_register(&cpts->info, dev);
-   if (IS_ERR(cpts->clock)) {
-   err = PTR_ERR(cpts->clock);
-   cpts->clock = NULL;
-   return err;
-   }
spin_lock_init(&cpts->lock);
 
cpts->cc.read = cpts_systim_read;
@@ -382,15 +375,26 @@ int cpts_register(struct device *dev, struct cpts *cpts,
cpts_write32(cpts, CPTS_EN, control);
cpts_write32(cpts, TS_PEND_EN, int_enable);
 
-   spin_lock_irqsave(&cpts->lock, flags);
timecounter_init(&cpts->tc, &cpts->cc, ktime_to_ns(ktime_get_real()));
-   spin_unlock_irqrestore(&cpts->lock, flags);
 
INIT_DELAYED_WORK(&cpts->overflow_work, cpts_overflow_check);
-   schedule_delayed_work(&cpts->overflow_work, CPTS_OVERFLOW_PERIOD);
 
+   cpts->clock = ptp_clock_register(&cpts->info, dev);
+   if (IS_ERR(cpts->clock)) {
+   err = PTR_ERR(cpts->clock);
+   cpts->clock = NULL;
+   goto err_ptp;
+   }
cpts->phc_index = ptp_clock_index(cpts->clock);
+
+   schedule_delayed_work(&cpts->overflow_work, CPTS_OVERFLOW_PERIOD);
+
return 0;
+
+err_ptp:
+   if (cpts->refclk)
+   cpts_clk_release(cpts);
+   return err;
 }
 EXPORT_SYMBOL_GPL(cpts_register);
 
-- 
2.10.1



[PATCH v3 08/13] net: ethernet: ti: cpts: drop excessive writes to CTRL and INT_EN regs

2016-12-02 Thread Grygorii Strashko
CPTS module and IRQs are always enabled when CPTS is registered,
before starting overflow check work, and disabled during
deregistration, when overflow check work has been canceled already.
So, there is no need to (re)enable the CPTS module and IRQs in
cpts_overflow_check().

Acked-by: Richard Cochran 
Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpts.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c
index 8266459..a662c33 100644
--- a/drivers/net/ethernet/ti/cpts.c
+++ b/drivers/net/ethernet/ti/cpts.c
@@ -243,8 +243,6 @@ static void cpts_overflow_check(struct work_struct *work)
struct timespec64 ts;
struct cpts *cpts = container_of(work, struct cpts, overflow_work.work);
 
-   cpts_write32(cpts, CPTS_EN, control);
-   cpts_write32(cpts, TS_PEND_EN, int_enable);
cpts_ptp_gettime(&cpts->info, &ts);
pr_debug("cpts overflow check at %lld.%09lu\n", ts.tv_sec, ts.tv_nsec);
schedule_delayed_work(&cpts->overflow_work, CPTS_OVERFLOW_PERIOD);
-- 
2.10.1



[PATCH v3 01/13] net: ethernet: ti: cpts: switch to readl/writel_relaxed()

2016-12-02 Thread Grygorii Strashko
Switch to the readl/writel_relaxed() APIs, because these are the
recommended APIs and the CPTS IP is reused on Keystone 2 SoCs
where LE/BE modes are supported.

Acked-by: Richard Cochran 
Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpts.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c
index 85a55b4..a42c449 100644
--- a/drivers/net/ethernet/ti/cpts.c
+++ b/drivers/net/ethernet/ti/cpts.c
@@ -33,8 +33,8 @@
 
 #ifdef CONFIG_TI_CPTS
 
-#define cpts_read32(c, r)  __raw_readl(&c->reg->r)
-#define cpts_write32(c, v, r)  __raw_writel(v, &c->reg->r)
+#define cpts_read32(c, r)  readl_relaxed(&c->reg->r)
+#define cpts_write32(c, v, r)  writel_relaxed(v, &c->reg->r)
 
 static int event_expired(struct cpts_event *event)
 {
-- 
2.10.1



[PATCH v3 09/13] net: ethernet: ti: cpts: rework initialization/deinitialization

2016-12-02 Thread Grygorii Strashko
The current implementation of CPTS initialization and deinitialization
(represented by cpts_register/unregister()) does too much static
initialization from .ndo_open(), which is more reasonable to do once at
probe time instead, and also requires the caller to allocate memory for
struct cpts, which is internal to the CPTS driver in general.

This patch splits CPTS initialization and deinitialization into two parts:

- static initialization cpts_create()/cpts_release(), which is expected to
be executed when the parent driver is probed/removed;

- dynamic part cpts_register/unregister(), which is expected to be executed
when the network device is opened/closed.

As a result, the current code of the CPTS parent driver - CPSW - will be
simplified (and it will also simplify adding support for Keystone 2 devices
in the future), plus more initialization errors will be caught earlier. In
addition, this change allows cleaning up cpts.h for the case when CPTS is
disabled.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c |  24 +-
 drivers/net/ethernet/ti/cpts.c | 102 -
 drivers/net/ethernet/ti/cpts.h |  26 +--
 3 files changed, 95 insertions(+), 57 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index a6a93ad..6c28ef1 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1406,9 +1406,7 @@ static int cpsw_ndo_open(struct net_device *ndev)
if (ret < 0)
goto err_cleanup;
 
-   if (cpts_register(cpsw->dev, cpsw->cpts,
- cpsw->data.cpts_clock_mult,
- cpsw->data.cpts_clock_shift))
+   if (cpts_register(cpsw->cpts))
dev_err(priv->dev, "error registering cpts device\n");
 
}
@@ -2596,6 +2594,7 @@ static int cpsw_probe(struct platform_device *pdev)
struct cpdma_params dma_params;
struct cpsw_ale_params  ale_params;
void __iomem*ss_regs;
+   void __iomem*cpts_regs;
struct resource *res, *ss_res;
const struct of_device_id   *of_id;
struct gpio_descs   *mode;
@@ -2623,12 +2622,6 @@ static int cpsw_probe(struct platform_device *pdev)
priv->dev  = &ndev->dev;
priv->msg_enable = netif_msg_init(debug_level, CPSW_DEBUG);
cpsw->rx_packet_max = max(rx_packet_max, 128);
-   cpsw->cpts = devm_kzalloc(&pdev->dev, sizeof(struct cpts), GFP_KERNEL);
-   if (!cpsw->cpts) {
-   dev_err(&pdev->dev, "error allocating cpts\n");
-   ret = -ENOMEM;
-   goto clean_ndev_ret;
-   }
 
mode = devm_gpiod_get_array_optional(&pdev->dev, "mode", GPIOD_OUT_LOW);
if (IS_ERR(mode)) {
@@ -2716,7 +2709,7 @@ static int cpsw_probe(struct platform_device *pdev)
switch (cpsw->version) {
case CPSW_VERSION_1:
cpsw->host_port_regs = ss_regs + CPSW1_HOST_PORT_OFFSET;
-   cpsw->cpts->reg  = ss_regs + CPSW1_CPTS_OFFSET;
+   cpts_regs   = ss_regs + CPSW1_CPTS_OFFSET;
cpsw->hw_stats   = ss_regs + CPSW1_HW_STATS;
dma_params.dmaregs   = ss_regs + CPSW1_CPDMA_OFFSET;
dma_params.txhdp = ss_regs + CPSW1_STATERAM_OFFSET;
@@ -2730,7 +2723,7 @@ static int cpsw_probe(struct platform_device *pdev)
case CPSW_VERSION_3:
case CPSW_VERSION_4:
cpsw->host_port_regs = ss_regs + CPSW2_HOST_PORT_OFFSET;
-   cpsw->cpts->reg  = ss_regs + CPSW2_CPTS_OFFSET;
+   cpts_regs   = ss_regs + CPSW2_CPTS_OFFSET;
cpsw->hw_stats   = ss_regs + CPSW2_HW_STATS;
dma_params.dmaregs   = ss_regs + CPSW2_CPDMA_OFFSET;
dma_params.txhdp = ss_regs + CPSW2_STATERAM_OFFSET;
@@ -2796,6 +2789,14 @@ static int cpsw_probe(struct platform_device *pdev)
goto clean_dma_ret;
}
 
+   cpsw->cpts = cpts_create(cpsw->dev, cpts_regs,
+cpsw->data.cpts_clock_mult,
+cpsw->data.cpts_clock_shift);
+   if (IS_ERR(cpsw->cpts)) {
+   ret = PTR_ERR(cpsw->cpts);
+   goto clean_ale_ret;
+   }
+
ndev->irq = platform_get_irq(pdev, 1);
if (ndev->irq < 0) {
dev_err(priv->dev, "error getting irq resource\n");
@@ -2911,6 +2912,7 @@ static int cpsw_remove(struct platform_device *pdev)
unregister_netdev(cpsw->slaves[1].ndev);
unregister_netdev(ndev);
 
+   cpts_release(cpsw->cpts);
cpsw_ale_destroy(cpsw->ale);
cpdma_ctlr_destroy(cpsw->dma);
cpsw_remove_dt(pdev);
diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c
index a662c33..47831b2 100644
--- a/drivers/net/ethernet/ti/cpt

Re: [PATCH net-next 2/2] net/sched: cls_flower: Support matching on ICMP type and code

2016-12-02 Thread Simon Horman
Hi Jiri,

On Fri, Dec 02, 2016 at 08:17:13PM +0100, Simon Horman wrote:
> On Fri, Dec 02, 2016 at 07:38:48PM +0100, Jiri Pirko wrote:
> > Fri, Dec 02, 2016 at 07:05:51PM CET, simon.hor...@netronome.com wrote:
> > >Support matching on ICMP type and code.

...

> > This hunk looks like it should be squashed to the previous patch.
> 
> I included it in this patch as it is where these helpers are used
> for the first time. I can shuffle it into the first patch if you prefer;
> I agree it does make sense to put all the dissector changes there.

I moved things around as you suggested and posted v2.


[PATCH v3 12/13] net: ethernet: ti: cpts: calc mult and shift from refclk freq

2016-12-02 Thread Grygorii Strashko
The cyclecounter mult and shift values can be calculated based on the
CPTS refclk frequency, and the timekeeping framework provides the required
algorithms and APIs.

Hence, calculate mult and shift based on the CPTS refclk frequency if both
cpts_clock_shift and cpts_clock_mult properties are not provided in DT (the
basis of the calculation algorithm is borrowed from
__clocksource_update_freq_scale() commit 7d2f944a2b83 ("clocksource:
Provide a generic mult/shift factor calculation")). After this change the
cpts_clock_shift and cpts_clock_mult DT properties will become optional.

Cc: John Stultz 
Cc: Thomas Gleixner 
Signed-off-by: Grygorii Strashko 
---
 Documentation/devicetree/bindings/net/cpsw.txt |  8 ++--
 drivers/net/ethernet/ti/cpts.c | 53 +++---
 2 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/cpsw.txt 
b/Documentation/devicetree/bindings/net/cpsw.txt
index 5ad439f..ebda7c9 100644
--- a/Documentation/devicetree/bindings/net/cpsw.txt
+++ b/Documentation/devicetree/bindings/net/cpsw.txt
@@ -20,8 +20,6 @@ Required properties:
 - slaves   : Specifies number for slaves
 - active_slave : Specifies the slave to use for time stamping,
  ethtool and SIOCGMIIPHY
-- cpts_clock_mult  : Numerator to convert input clock ticks into 
nanoseconds
-- cpts_clock_shift : Denominator to convert input clock ticks into 
nanoseconds
 
 Optional properties:
 - ti,hwmods: Must be "cpgmac0"
@@ -35,7 +33,11 @@ Optional properties:
  For example in dra72x-evm, pcf gpio has to be
  driven low so that cpsw slave 0 and phy data
  lines are connected via mux.
-
+- cpts_clock_mult  : Numerator to convert input clock ticks into 
nanoseconds
+- cpts_clock_shift : Denominator to convert input clock ticks into 
nanoseconds
+ Mult and shift will be calculated basing on CPTS
+ rftclk frequency if both cpts_clock_shift and
+ cpts_clock_mult properties are not provided.
 
 Slave Properties:
 Required properties:
diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c
index 5d5c46d..806241b 100644
--- a/drivers/net/ethernet/ti/cpts.c
+++ b/drivers/net/ethernet/ti/cpts.c
@@ -409,21 +409,60 @@ void cpts_unregister(struct cpts *cpts)
 }
 EXPORT_SYMBOL_GPL(cpts_unregister);
 
+static void cpts_calc_mult_shift(struct cpts *cpts)
+{
+   u64 frac, maxsec, ns;
+   u32 freq, mult, shift;
+
+   freq = clk_get_rate(cpts->refclk);
+
+   /* Calc the maximum number of seconds which we can run before
+* wrapping around.
+*/
+   maxsec = cpts->cc.mask;
+   do_div(maxsec, freq);
+   /* limit conversation rate to 10 sec as higher values will produce
+* too small mult factors and so reduce the conversion accuracy
+*/
+   if (maxsec > 10)
+   maxsec = 10;
+
+   if (cpts->cc_mult || cpts->cc.shift)
+   return;
+
+   clocks_calc_mult_shift(&mult, &shift, freq, NSEC_PER_SEC, maxsec);
+
+   cpts->cc_mult = mult;
+   cpts->cc.mult = mult;
+   cpts->cc.shift = shift;
+
+   frac = 0;
+   ns = cyclecounter_cyc2ns(&cpts->cc, freq, cpts->cc.mask, &frac);
+
+   dev_info(cpts->dev,
+"CPTS: ref_clk_freq:%u calc_mult:%u calc_shift:%u error:%lld 
nsec/sec\n",
+freq, cpts->cc_mult, cpts->cc.shift, (ns - NSEC_PER_SEC));
+}
+
 static int cpts_of_parse(struct cpts *cpts, struct device_node *node)
 {
int ret = -EINVAL;
u32 prop;
 
-   if (of_property_read_u32(node, "cpts_clock_mult", &prop))
-   goto  of_error;
/* save cc.mult original value as it can be modified
 * by cpts_ptp_adjfreq().
 */
-   cpts->cc_mult = prop;
+   cpts->cc_mult = 0;
+   if (!of_property_read_u32(node, "cpts_clock_mult", &prop))
+   cpts->cc_mult = prop;
+
+   cpts->cc.shift = 0;
+   if (!of_property_read_u32(node, "cpts_clock_shift", &prop))
+   cpts->cc.shift = prop;
 
-   if (of_property_read_u32(node, "cpts_clock_shift", &prop))
-   goto  of_error;
-   cpts->cc.shift = prop;
+   if ((cpts->cc_mult && !cpts->cc.shift) ||
+   (!cpts->cc_mult && cpts->cc.shift))
+   goto of_error;
 
return 0;
 
@@ -463,6 +502,8 @@ struct cpts *cpts_create(struct device *dev, void __iomem 
*regs,
cpts->cc.mask = CLOCKSOURCE_MASK(32);
cpts->info = cpts_info;
 
+   cpts_calc_mult_shift(cpts);
+
return cpts;
 }
 EXPORT_SYMBOL_GPL(cpts_create);
-- 
2.10.1



  1   2   3   4   >