[PATCHv2 net-next 0/4] MV88E6390 batch two
This is the second batch of patches adding support for the MV88e6390. They are not sufficient to make it work properly. The mv88e6390 has a much expanded set of priority maps. Refactor the existing code, and implement basic support for the new device. Similarly, the monitor control register has been reworked. The mv88e6390 has something odd in its EDSA tagging implementation, which means it is not possible to use it. So we need to use DSA tagging. This is the first device with EDSA support where we need to use DSA, and the code does not support this. So two patches refactor the existing code. The two different register definitions are separated out, and using DSA on an EDSA capable device is added. v2: Add port prefix Add helper function for 6390 Add _IEEE_ into #defines Split monitor_ctrl into a number of separate ops. Remove 6390 code which is management, used in a later patch s/EGREES/EGRESS/. Broke up setup_port_dsa() and set_port_dsa() into a number of ops v3: Verify mandatory ops for port setup Don't set ether type for DSA port. Andrew Lunn (4): net: dsa: mv88e6xxx: Implement mv88e6390 tag remap net: dsa: mv88e6xxx: Monitor and Management tables net: dsa: mv88e6xxx: Move the tagging protocol into info net: dsa: mv88e6xxx: Refactor CPU and DSA port setup drivers/net/dsa/mv88e6xxx/chip.c | 339 ++ drivers/net/dsa/mv88e6xxx/global1.c | 69 +++ drivers/net/dsa/mv88e6xxx/global1.h | 4 + drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 62 +-- drivers/net/dsa/mv88e6xxx/port.c | 181 ++ drivers/net/dsa/mv88e6xxx/port.h | 15 ++ 6 files changed, 583 insertions(+), 87 deletions(-) -- 2.10.2
Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs
On Fri, 2016-12-02 at 19:42 -0800, Martin KaFai Lau wrote: > On Fri, Dec 02, 2016 at 06:15:26PM -0800, Eric Dumazet wrote: > > My question was more like : > > > > Can we double check all these patches won't break mlx4 driver (non XDP > > path) on arches with PAGE_SIZE=64KB. > The page/pkt requirement is not added by this patch. The earlier > XDP patch series has already ensured this page/pkt requirement > is effective only when XDP prog is attached. > > In the earlier XDP patches, MTU is limited to 1514 when > XDP is active. This patch is to allow full use of the > page for a packet (and also only matters when XDP is active). OK, thanks for the clarification.
[PATCH v1 net-next 1/5] net: dsa: mv88e6xxx: Reserved Management frames to CPU
Older devices have a couple of registers in global2. The mv88e6390 family has a single register in global1 behind which hides similar configuration. Implement an op for this. Signed-off-by: Andrew Lunn --- drivers/net/dsa/mv88e6xxx/chip.c | 35 drivers/net/dsa/mv88e6xxx/global1.c | 27 ++ drivers/net/dsa/mv88e6xxx/global1.h | 1 + drivers/net/dsa/mv88e6xxx/global2.c | 43 --- drivers/net/dsa/mv88e6xxx/global2.h | 6 + drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 3 +++ 6 files changed, 97 insertions(+), 18 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 9c14aaad5103..b2b6fe3ef4bf 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2899,6 +2899,17 @@ static int mv88e6xxx_setup(struct dsa_switch *ds) goto unlock; } + /* Some generations have the configuration of sending reserved +* management frames to the CPU in global2, others in +* global1. Hence it does not fit the two setup functions +* above. +*/ + if (chip->info->ops->mgmt_rsvd2cpu) { + err = chip->info->ops->mgmt_rsvd2cpu(chip); + if (err) + goto unlock; + } + unlock: mutex_unlock(&chip->reg_lock); @@ -3221,6 +3232,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = { .stats_get_stats = mv88e6095_stats_get_stats, .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, .g1_set_egress_port = mv88e6095_g1_set_egress_port, + .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu, }; static const struct mv88e6xxx_ops mv88e6095_ops = { @@ -3237,6 +3249,7 @@ static const struct mv88e6xxx_ops mv88e6095_ops = { .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, .stats_get_stats = mv88e6095_stats_get_stats, + .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu, }; static const struct mv88e6xxx_ops mv88e6097_ops = { @@ -3257,6 +3270,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = { .stats_get_stats = mv88e6095_stats_get_stats, .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, .g1_set_egress_port = 
mv88e6095_g1_set_egress_port, + .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu, }; static const struct mv88e6xxx_ops mv88e6123_ops = { @@ -3275,6 +3289,7 @@ static const struct mv88e6xxx_ops mv88e6123_ops = { .stats_get_stats = mv88e6095_stats_get_stats, .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, .g1_set_egress_port = mv88e6095_g1_set_egress_port, + .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu, }; static const struct mv88e6xxx_ops mv88e6131_ops = { @@ -3295,6 +3310,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = { .stats_get_stats = mv88e6095_stats_get_stats, .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, .g1_set_egress_port = mv88e6095_g1_set_egress_port, + .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu, }; static const struct mv88e6xxx_ops mv88e6161_ops = { @@ -3315,6 +3331,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .stats_get_stats = mv88e6095_stats_get_stats, .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, .g1_set_egress_port = mv88e6095_g1_set_egress_port, + .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu, }; static const struct mv88e6xxx_ops mv88e6165_ops = { @@ -3331,6 +3348,7 @@ static const struct mv88e6xxx_ops mv88e6165_ops = { .stats_get_stats = mv88e6095_stats_get_stats, .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, .g1_set_egress_port = mv88e6095_g1_set_egress_port, + .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu, }; static const struct mv88e6xxx_ops mv88e6171_ops = { @@ -3352,6 +3370,7 @@ static const struct mv88e6xxx_ops mv88e6171_ops = { .stats_get_stats = mv88e6095_stats_get_stats, .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, .g1_set_egress_port = mv88e6095_g1_set_egress_port, + .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu, }; static const struct mv88e6xxx_ops mv88e6172_ops = { @@ -3375,6 +3394,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = { .stats_get_stats = mv88e6095_stats_get_stats, .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, .g1_set_egress_port = mv88e6095_g1_set_egress_port, + .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu, }; 
static const struct mv88e6xxx_ops mv88e6175_ops = { @@ -3396,6 +3416,7 @@ static const struct mv88e6xxx_ops mv88e6175_ops = { .stats_get_stats = mv88e6095_stats_get_stats, .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, .g1_set_egress_port = mv88e6095_g1_set_egress_port, + .mgmt_rsvd2cpu = mv88e6095_g2_mgmt_rsvd2cpu, }; static const struct mv88e6xxx_ops mv88e6176_ops = { @@ -3419,6 +3440,7 @@ static const struct mv8
[PATCH v1 net-next 0/5] mv88e6390 batch 3
More patches to support the MV88e6390. This is mostly refactoring existing code and adding implementations for the mv88e6390. This patchset sets which reserved frames are sent to the CPU and the size of jumbo frames that will be accepted, turns off egress rate limiting, and configures pause frames. Andrew Lunn (5): net: dsa: mv88e6xxx: Reserved Management frames to CPU net: dsa: mv88e6xxx: Refactor setting of jumbo frames net: dsa: mv88e6xxx: Refactor egress rate limiting net: dsa: mv88e6xxx: Refactor pause configuration net: dsa: mv88e6xxx: Implement mv88e6390 pause control drivers/net/dsa/mv88e6xxx/chip.c | 125 +++--- drivers/net/dsa/mv88e6xxx/global1.c | 27 drivers/net/dsa/mv88e6xxx/global1.h | 1 + drivers/net/dsa/mv88e6xxx/global2.c | 43 +++- drivers/net/dsa/mv88e6xxx/global2.h | 6 ++ drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 9 +++ drivers/net/dsa/mv88e6xxx/port.c | 50 ++ drivers/net/dsa/mv88e6xxx/port.h | 6 +- 8 files changed, 225 insertions(+), 42 deletions(-) -- 2.10.2
[PATCH v1 net-next 4/5] net: dsa: mv88e6xxx: Refactor pause configuration
The mv88e6390 has a different mechanism for configuring pause. Refactor the code into an ops function, and for the moment, don't add any mv88e6390 code yet. Signed-off-by: Andrew Lunn --- drivers/net/dsa/mv88e6xxx/chip.c | 28 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 1 + drivers/net/dsa/mv88e6xxx/port.c | 11 +++ drivers/net/dsa/mv88e6xxx/port.h | 1 + 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 1b0917e44809..3ddb1f79e709 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2625,17 +2625,15 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) if (err) return err; - if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) || - mv88e6xxx_6165_family(chip) || mv88e6xxx_6097_family(chip) || - mv88e6xxx_6320_family(chip)) { - /* Do not limit the period of time that this port can -* be paused for by the remote end or the period of -* time that this port can pause the remote end. -*/ - err = mv88e6xxx_port_write(chip, port, PORT_PAUSE_CTRL, 0x); + if (chip->info->ops->port_pause_config) { + err = chip->info->ops->port_pause_config(chip, port); if (err) return err; + } + if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) || + mv88e6xxx_6165_family(chip) || mv88e6xxx_6097_family(chip) || + mv88e6xxx_6320_family(chip)) { /* Port ATU control: disable limiting the number of * address database entries that this port is allowed * to use. 
@@ -3220,6 +3218,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = { .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, + .port_pause_config = mv88e6097_port_pause_config, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3260,6 +3259,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = { .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, .port_egress_rate_limiting = mv88e6095_port_egress_rate_limiting, + .port_pause_config = mv88e6097_port_pause_config, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3302,6 +3302,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = { .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, + .port_pause_config = mv88e6097_port_pause_config, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3325,6 +3326,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, + .port_pause_config = mv88e6097_port_pause_config, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3366,6 +3368,7 @@ static const struct mv88e6xxx_ops mv88e6171_ops = { .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, 
.port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, + .port_pause_config = mv88e6097_port_pause_config, .stats_snapshot = mv88e6320_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3392,6 +3395,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = { .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, + .port_pause_config = mv88e6097_port_pause_config, .stats_snapshot = mv88e6320_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3416,6 +3420,7 @@ static const struct mv88e6xxx_ops mv88e6175_ops = { .port_set_ether_type = mv88e6351_por
[PATCH v1 net-next 2/5] net: dsa: mv88e6xxx: Refactor setting of jumbo frames
Some switches support jumbo frames. Refactor this code into operations in the ops structure. Signed-off-by: Andrew Lunn --- drivers/net/dsa/mv88e6xxx/chip.c | 26 ++ drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 1 + drivers/net/dsa/mv88e6xxx/port.c | 14 ++ drivers/net/dsa/mv88e6xxx/port.h | 2 +- 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index b2b6fe3ef4bf..db1542e05e62 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2582,10 +2582,6 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) mv88e6xxx_6185_family(chip)) reg = PORT_CONTROL_2_MAP_DA; - if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) || - mv88e6xxx_6165_family(chip) || mv88e6xxx_6320_family(chip)) - reg |= PORT_CONTROL_2_JUMBO_10240; - if (mv88e6xxx_6095_family(chip) || mv88e6xxx_6185_family(chip)) { /* Set the upstream port this port should use */ reg |= dsa_upstream_port(ds); @@ -2604,6 +2600,12 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) return err; } + if (chip->info->ops->port_jumbo_config) { + err = chip->info->ops->port_jumbo_config(chip, port); + if (err) + return err; + } + /* Port Association Vector: when learning source addresses * of packets, add the address to the address database using * a port bitmap that has only the bit for this port set and @@ -2663,6 +2665,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) 0x0001); if (err) return err; + } else if (mv88e6xxx_6185_family(chip) || mv88e6xxx_6095_family(chip)) { err = mv88e6xxx_port_write(chip, port, PORT_RATE_CONTROL, 0x); @@ -3264,6 +3267,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = { .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, + .port_jumbo_config = mv88e6165_port_jumbo_config, .stats_snapshot = 
mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3304,6 +3308,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = { .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, + .port_jumbo_config = mv88e6165_port_jumbo_config, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3325,6 +3330,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, + .port_jumbo_config = mv88e6165_port_jumbo_config, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3364,6 +3370,7 @@ static const struct mv88e6xxx_ops mv88e6171_ops = { .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, + .port_jumbo_config = mv88e6165_port_jumbo_config, .stats_snapshot = mv88e6320_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3388,6 +3395,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = { .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, + .port_jumbo_config = mv88e6165_port_jumbo_config, .stats_snapshot = mv88e6320_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3410,6 +3418,7 @@ static const struct mv88e6xxx_ops 
mv88e6175_ops = { .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, + .port_jumbo_config = mv88e6165_port_jumbo_config, .stats_snapsho
[PATCH v1 net-next 5/5] net: dsa: mv88e6xxx: Implement mv88e6390 pause control
The mv88e6390 has a number of flow control registers accessed via the Flow Control register. Use these to set the pause control. Signed-off-by: Andrew Lunn --- drivers/net/dsa/mv88e6xxx/chip.c | 7 +++ drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 2 ++ drivers/net/dsa/mv88e6xxx/port.c | 13 + drivers/net/dsa/mv88e6xxx/port.h | 1 + 4 files changed, 23 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 3ddb1f79e709..ca453f3243cd 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3490,6 +3490,7 @@ static const struct mv88e6xxx_ops mv88e6190_ops = { .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, + .port_pause_config = mv88e6390_port_pause_config, .stats_snapshot = mv88e6390_g1_stats_snapshot, .stats_set_histogram = mv88e6390_g1_stats_set_histogram, .stats_get_sset_count = mv88e6320_stats_get_sset_count, @@ -3513,6 +3514,7 @@ static const struct mv88e6xxx_ops mv88e6190x_ops = { .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, + .port_pause_config = mv88e6390_port_pause_config, .stats_snapshot = mv88e6390_g1_stats_snapshot, .stats_set_histogram = mv88e6390_g1_stats_set_histogram, .stats_get_sset_count = mv88e6320_stats_get_sset_count, @@ -3536,6 +3538,7 @@ static const struct mv88e6xxx_ops mv88e6191_ops = { .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, + .port_pause_config = mv88e6390_port_pause_config, .stats_snapshot = mv88e6390_g1_stats_snapshot, .stats_set_histogram = mv88e6390_g1_stats_set_histogram, .stats_get_sset_count = mv88e6320_stats_get_sset_count, @@ -3586,6 +3589,7 @@ static const struct mv88e6xxx_ops mv88e6290_ops 
= { .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, + .port_pause_config = mv88e6390_port_pause_config, .stats_snapshot = mv88e6390_g1_stats_snapshot, .stats_set_histogram = mv88e6390_g1_stats_set_histogram, .stats_get_sset_count = mv88e6320_stats_get_sset_count, @@ -3739,6 +3743,7 @@ static const struct mv88e6xxx_ops mv88e6390_ops = { .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, + .port_pause_config = mv88e6390_port_pause_config, .stats_snapshot = mv88e6390_g1_stats_snapshot, .stats_set_histogram = mv88e6390_g1_stats_set_histogram, .stats_get_sset_count = mv88e6320_stats_get_sset_count, @@ -3764,6 +3769,7 @@ static const struct mv88e6xxx_ops mv88e6390x_ops = { .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, + .port_pause_config = mv88e6390_port_pause_config, .stats_snapshot = mv88e6390_g1_stats_snapshot, .stats_set_histogram = mv88e6390_g1_stats_set_histogram, .stats_get_sset_count = mv88e6320_stats_get_sset_count, @@ -3787,6 +3793,7 @@ static const struct mv88e6xxx_ops mv88e6391_ops = { .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, + .port_pause_config = mv88e6390_port_pause_config, .stats_snapshot = mv88e6390_g1_stats_snapshot, .stats_set_histogram = mv88e6390_g1_stats_set_histogram, .stats_get_sset_count = mv88e6320_stats_get_sset_count, diff --git a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h index 3b1f3ab490b9..13c7cc443454 100644 --- a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h +++ b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h @@ -78,6 +78,8 @@ 
#define PORT_PCS_CTRL_SPEED_1 (0x03) /* 6390X */ #define PORT_PCS_CTRL_SPEED_UNFORCED (0x03) #define PORT_PAUSE_CTRL 0x02 +#define PORT_FLOW_CTRL_LIMIT_IN ((0x00 << 8) | BIT(15)) +#define PORT_FLOW_CTRL_LIMIT_OUT ((0x01 << 8) | BIT(15)) #define PORT_SWITCH_ID 0x03 #define PORT_SWITCH_ID_PROD_NUM_6085 0x04a #define PORT_SWITCH_ID_PROD_NUM_6095 0x095 diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c index 8d14833b2e49..0db7fa0373ae 100644 --- a/drivers/net/dsa/mv88e6xxx/port.c +++ b/
[PATCH v1 net-next 3/5] net: dsa: mv88e6xxx: Refactor egress rate limiting
There are two different rate limiting configurations, depending on the switch generation. Refactor this into ops. Signed-off-by: Andrew Lunn --- drivers/net/dsa/mv88e6xxx/chip.c | 31 +++ drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 2 ++ drivers/net/dsa/mv88e6xxx/port.c | 12 drivers/net/dsa/mv88e6xxx/port.h | 2 ++ 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index db1542e05e62..1b0917e44809 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2657,18 +2657,8 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) return err; } - /* Rate Control: disable ingress rate limiting. */ - if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) || - mv88e6xxx_6165_family(chip) || mv88e6xxx_6097_family(chip) || - mv88e6xxx_6320_family(chip)) { - err = mv88e6xxx_port_write(chip, port, PORT_RATE_CONTROL, - 0x0001); - if (err) - return err; - - } else if (mv88e6xxx_6185_family(chip) || mv88e6xxx_6095_family(chip)) { - err = mv88e6xxx_port_write(chip, port, PORT_RATE_CONTROL, - 0x); + if (chip->info->ops->port_egress_rate_limiting) { + err = chip->info->ops->port_egress_rate_limiting(chip, port); if (err) return err; } @@ -3229,6 +3219,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = { .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, + .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3268,6 +3259,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = { .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, + 
.port_egress_rate_limiting = mv88e6095_port_egress_rate_limiting, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3309,6 +3301,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = { .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, + .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3331,6 +3324,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, + .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3371,6 +3365,7 @@ static const struct mv88e6xxx_ops mv88e6171_ops = { .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, + .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, .stats_snapshot = mv88e6320_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3396,6 +3391,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = { .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, + .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, .stats_snapshot = mv88e6320_g1_stats_snapshot, .stats_get_sset_count = 
mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3419,6 +3415,7 @@ static const struct mv88e6xxx_ops mv88e6175_ops = { .port_set_egress_unknowns = mv88e6351_port_set_egress_unknowns, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_jumbo_config = mv88e6165_port_jumbo_config, +
Re: [PATCH net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
On Sat, Dec 03, 2016 at 01:22:05AM +0100, Daniel Borkmann wrote: > On 12/03/2016 12:23 AM, Martin KaFai Lau wrote: > >This patch allows XDP prog to extend/remove the packet > >data at the head (like adding or removing header). It is > >done by adding a new XDP helper bpf_xdp_adjust_head(). > > > >It also renames bpf_helper_changes_skb_data() to > >bpf_helper_changes_pkt_data() to better reflect > >that XDP prog does not work on skb. > > > >Signed-off-by: Martin KaFai Lau > [...] > >diff --git a/net/core/filter.c b/net/core/filter.c > >index 56b43587d200..6902e2f73e38 100644 > >--- a/net/core/filter.c > >+++ b/net/core/filter.c > >@@ -2234,7 +2234,34 @@ static const struct bpf_func_proto > >bpf_skb_change_head_proto = { > > .arg3_type = ARG_ANYTHING, > > }; > > > >-bool bpf_helper_changes_skb_data(void *func) > >+BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) > >+{ > >+/* Both mlx4 and mlx5 driver align each packet to PAGE_SIZE when > >+ * XDP prog is set. > >+ * If the above is not true for the other drivers to support > >+ * bpf_xdp_adjust_head, struct xdp_buff can be extended. > >+ */ > >+void *head = (void *)((unsigned long)xdp->data & PAGE_MASK); > >+void *new_data = xdp->data + offset; > >+ > >+if (new_data < head || new_data >= xdp->data_end) > >+/* The packet length must be >=1 */ > > Patch looks generally good to me. Should the min pkt len here be > limited to ETH_HLEN instead of 1? Make sense. Will make the change. > > >+return -EINVAL; > >+ > >+xdp->data = new_data; > >+ > >+return 0; > >+} > >+ > >+static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { > >+.func = bpf_xdp_adjust_head, > >+.gpl_only = false, > >+.ret_type = RET_INTEGER, > >+.arg1_type = ARG_PTR_TO_CTX, > >+.arg2_type = ARG_ANYTHING, > >+}; > >+ > >+bool bpf_helper_changes_pkt_data(void *func) > > { > > if (func == bpf_skb_vlan_push || > > func == bpf_skb_vlan_pop || > [...]
Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs
On Fri, Dec 02, 2016 at 06:15:26PM -0800, Eric Dumazet wrote: > On Fri, 2016-12-02 at 16:53 -0800, Alexei Starovoitov wrote: > > On 12/2/16 4:38 PM, Eric Dumazet wrote: > > > On Fri, 2016-12-02 at 15:23 -0800, Martin KaFai Lau wrote: > > >> When XDP prog is attached, it is currently limiting > > >> MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514 > > >> in x86. > > >> > > >> AFAICT, since mlx4 is doing one page per packet for XDP, > > >> we can at least raise the MTU limitation up to > > >> PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is > > >> doing. It will be useful in the next patch which allows > > >> XDP program to extend the packet by adding new header(s). > > >> > > >> Signed-off-by: Martin KaFai Lau > > >> --- > > > > > > Have you tested your patch on a host with PAGE_SIZE = 64 KB ? > > > > > > Looks XDP really kills arches with bigger pages :( > > > > I'm afraid xdp mlx[45] support was not tested on arches > > with 64k pages at all. Not just this patch. > > I think people who care about such archs should test? > > Note page per packet is not a hard requirement for all drivers > > and all archs. For mlx[45] it was the easiest and the most > > convenient way to achieve desired performance. > > If there are ways to do the same performance differently, > > I'm all ears :) > > > > My question was more like : > > Can we double check all these patches won't break mlx4 driver (non XDP > path) on arches with PAGE_SIZE=64KB. The page/pkt requirement is not added by this patch. The earlier XDP patch series has already ensured this page/pkt requirement is effective only when XDP prog is attached. In the earlier XDP patches, MTU is limited to 1514 when XDP is active. This patch is to allow full use of the page for a packet (and also only matters when XDP is active).
[PATCH v3 net-next 1/4] net: dsa: mv88e6xxx: Implement mv88e6390 tag remap
The mv88e6390 does not have the two registers to set the frame priority map. Instead it has an indirection register for setting a number of different priority maps. Refactor the old code into a function, implement the mv88e6390 version, and use an op to call the right one. Signed-off-by: Andrew Lunn Reviewed-by: Vivien Didelot --- v2: Add port prefix Add helper function for 6390 Add _IEEE_ into #defines --- drivers/net/dsa/mv88e6xxx/chip.c | 37 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 12 +++ drivers/net/dsa/mv88e6xxx/port.c | 63 +++ drivers/net/dsa/mv88e6xxx/port.h | 2 ++ 4 files changed, 101 insertions(+), 13 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index ce2f7ff8066e..ff4bd2f74357 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2617,20 +2617,10 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) if (err) return err; } + } - /* Tag Remap: use an identity 802.1p prio -> switch -* prio mapping. -*/ - err = mv88e6xxx_port_write(chip, port, PORT_TAG_REGMAP_0123, - 0x3210); - if (err) - return err; - - /* Tag Remap 2: use an identity 802.1p prio -> switch -* prio mapping. 
-*/ - err = mv88e6xxx_port_write(chip, port, PORT_TAG_REGMAP_4567, - 0x7654); + if (chip->info->ops->port_tag_remap) { + err = chip->info->ops->port_tag_remap(chip, port); if (err) return err; } @@ -3189,6 +3179,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = { .port_set_link = mv88e6xxx_port_set_link, .port_set_duplex = mv88e6xxx_port_set_duplex, .port_set_speed = mv88e6185_port_set_speed, + .port_tag_remap = mv88e6095_port_tag_remap, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3217,6 +3208,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = { .port_set_link = mv88e6xxx_port_set_link, .port_set_duplex = mv88e6xxx_port_set_duplex, .port_set_speed = mv88e6185_port_set_speed, + .port_tag_remap = mv88e6095_port_tag_remap, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3245,6 +3237,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = { .port_set_link = mv88e6xxx_port_set_link, .port_set_duplex = mv88e6xxx_port_set_duplex, .port_set_speed = mv88e6185_port_set_speed, + .port_tag_remap = mv88e6095_port_tag_remap, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3259,6 +3252,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .port_set_link = mv88e6xxx_port_set_link, .port_set_duplex = mv88e6xxx_port_set_duplex, .port_set_speed = mv88e6185_port_set_speed, + .port_tag_remap = mv88e6095_port_tag_remap, .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3288,6 +3282,7 @@ static const struct mv88e6xxx_ops mv88e6171_ops = { .port_set_duplex = mv88e6xxx_port_set_duplex, .port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay, 
.port_set_speed = mv88e6185_port_set_speed, + .port_tag_remap = mv88e6095_port_tag_remap, .stats_snapshot = mv88e6320_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3305,6 +3300,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = { .port_set_duplex = mv88e6xxx_port_set_duplex, .port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay, .port_set_speed = mv88e6352_port_set_speed, + .port_tag_remap = mv88e6095_port_tag_remap, .stats_snapshot = mv88e6320_g1_stats_snapshot, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -3320,6 +3316,7 @@ static const struct mv88e6xxx_ops mv88e6175_ops = { .port_set_duplex = mv88e6xxx_port_set_duplex, .port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay, .port_set_speed = mv88e6185_port_set_speed, + .port_tag_remap = mv88e6095_port_tag_remap, .stats_snapshot = mv88e6320_g1_stats_snapshot,
[PATCH v3 net-next 2/4] net: dsa: mv88e6xxx: Monitor and Management tables
The mv88e6390 changes the monitor control register into the Monitor and Management control, which is an indirection register to various registers. Add ops to global1 to set the CPU port and the ingress/egress port for both register layouts. Signed-off-by: Andrew Lunn --- drivers/net/dsa/mv88e6xxx/chip.c | 68 +- drivers/net/dsa/mv88e6xxx/global1.c | 69 +++ drivers/net/dsa/mv88e6xxx/global1.h | 4 ++ drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 13 +++ 4 files changed, 145 insertions(+), 9 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index ff4bd2f74357..6e981bedd028 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2747,15 +2747,17 @@ static int mv88e6xxx_g1_setup(struct mv88e6xxx_chip *chip) if (err) return err; - /* Configure the upstream port, and configure it as the port to which -* ingress and egress and ARP monitor frames are to be sent. -*/ - reg = upstream_port << GLOBAL_MONITOR_CONTROL_INGRESS_SHIFT | - upstream_port << GLOBAL_MONITOR_CONTROL_EGRESS_SHIFT | - upstream_port << GLOBAL_MONITOR_CONTROL_ARP_SHIFT; - err = mv88e6xxx_g1_write(chip, GLOBAL_MONITOR_CONTROL, reg); - if (err) - return err; + if (chip->info->ops->g1_set_cpu_port) { + err = chip->info->ops->g1_set_cpu_port(chip, upstream_port); + if (err) + return err; + } + + if (chip->info->ops->g1_set_egress_port) { + err = chip->info->ops->g1_set_egress_port(chip, upstream_port); + if (err) + return err; + } /* Disable remote management, and set the switch's DSA device number. 
*/ err = mv88e6xxx_g1_write(chip, GLOBAL_CONTROL_2, @@ -3184,6 +3186,8 @@ static const struct mv88e6xxx_ops mv88e6085_ops = { .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, .stats_get_stats = mv88e6095_stats_get_stats, + .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, + .g1_set_egress_port = mv88e6095_g1_set_egress_port, }; static const struct mv88e6xxx_ops mv88e6095_ops = { @@ -3213,6 +3217,8 @@ static const struct mv88e6xxx_ops mv88e6097_ops = { .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, .stats_get_stats = mv88e6095_stats_get_stats, + .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, + .g1_set_egress_port = mv88e6095_g1_set_egress_port, }; static const struct mv88e6xxx_ops mv88e6123_ops = { @@ -3227,6 +3233,8 @@ static const struct mv88e6xxx_ops mv88e6123_ops = { .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, .stats_get_stats = mv88e6095_stats_get_stats, + .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, + .g1_set_egress_port = mv88e6095_g1_set_egress_port, }; static const struct mv88e6xxx_ops mv88e6131_ops = { @@ -3242,6 +3250,8 @@ static const struct mv88e6xxx_ops mv88e6131_ops = { .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, .stats_get_stats = mv88e6095_stats_get_stats, + .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, + .g1_set_egress_port = mv88e6095_g1_set_egress_port, }; static const struct mv88e6xxx_ops mv88e6161_ops = { @@ -3257,6 +3267,8 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, .stats_get_stats = mv88e6095_stats_get_stats, + .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, + .g1_set_egress_port = mv88e6095_g1_set_egress_port, }; static const struct mv88e6xxx_ops mv88e6165_ops = { @@ -3271,6 +3283,8 @@ static 
const struct mv88e6xxx_ops mv88e6165_ops = { .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, .stats_get_stats = mv88e6095_stats_get_stats, + .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, + .g1_set_egress_port = mv88e6095_g1_set_egress_port, }; static const struct mv88e6xxx_ops mv88e6171_ops = { @@ -3287,6 +3301,8 @@ static const struct mv88e6xxx_ops mv88e6171_ops = { .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, .stats_get_stats = mv88e6095_stats_get_stats, + .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, + .g1_set_egress_port = mv88e6095_g1_set_egress_port, }; static const struct mv88e6xxx_ops mv88e6172_ops = { @@ -3305,6 +3321,8 @@ static const struct mv88e6xxx_ops mv88e6172_ops = { .stats_get_sset_cou
[PATCH v3 net-next 4/4] net: dsa: mv88e6xxx: Refactor CPU and DSA port setup
Older chips only support DSA tagging. Newer chips have both DSA and EDSA tagging. Refactor the code by adding port functions for setting the frame mode, egress mode, and whether to forward unknown frames. This results in the helper mv88e6xxx_6065_family() becoming unused, so remove it. Signed-off-by: Andrew Lunn v3: Verify mandatory ops for port setup Don't set ether type for DSA port. --- drivers/net/dsa/mv88e6xxx/chip.c | 217 ++ drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 20 drivers/net/dsa/mv88e6xxx/port.c | 118 ++ drivers/net/dsa/mv88e6xxx/port.h | 13 ++ 4 files changed, 319 insertions(+), 49 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 80efee6f5e16..9c14aaad5103 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -677,11 +677,6 @@ static int mv88e6xxx_phy_ppu_write(struct mv88e6xxx_chip *chip, int addr, return err; } -static bool mv88e6xxx_6065_family(struct mv88e6xxx_chip *chip) -{ - return chip->info->family == MV88E6XXX_FAMILY_6065; -} - static bool mv88e6xxx_6095_family(struct mv88e6xxx_chip *chip) { return chip->info->family == MV88E6XXX_FAMILY_6095; @@ -2438,6 +2433,72 @@ static int mv88e6xxx_serdes_power_on(struct mv88e6xxx_chip *chip) return err; } +static int mv88e6xxx_setup_port_dsa(struct mv88e6xxx_chip *chip, int port, + int upstream_port) +{ + int err; + + err = chip->info->ops->port_set_frame_mode( + chip, port, MV88E6XXX_FRAME_MODE_DSA); + if (err) + return err; + + return chip->info->ops->port_set_egress_unknowns( + chip, port, port == upstream_port); +} + +static int mv88e6xxx_setup_port_cpu(struct mv88e6xxx_chip *chip, int port) +{ + int err; + + switch (chip->info->tag_protocol) { + case DSA_TAG_PROTO_EDSA: + err = chip->info->ops->port_set_frame_mode( + chip, port, MV88E6XXX_FRAME_MODE_ETHERTYPE); + if (err) + return err; + + err = mv88e6xxx_port_set_egress_mode( + chip, port, PORT_CONTROL_EGRESS_ADD_TAG); + if (err) + return err; + + if 
(chip->info->ops->port_set_ether_type) + err = chip->info->ops->port_set_ether_type( + chip, port, ETH_P_EDSA); + break; + + case DSA_TAG_PROTO_DSA: + err = chip->info->ops->port_set_frame_mode( + chip, port, MV88E6XXX_FRAME_MODE_DSA); + if (err) + return err; + + err = mv88e6xxx_port_set_egress_mode( + chip, port, PORT_CONTROL_EGRESS_UNMODIFIED); + break; + default: + err = -EINVAL; + } + + if (err) + return err; + + return chip->info->ops->port_set_egress_unknowns(chip, port, true); +} + +static int mv88e6xxx_setup_port_normal(struct mv88e6xxx_chip *chip, int port) +{ + int err; + + err = chip->info->ops->port_set_frame_mode( + chip, port, MV88E6XXX_FRAME_MODE_NORMAL); + if (err) + return err; + + return chip->info->ops->port_set_egress_unknowns(chip, port, false); +} + static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) { struct dsa_switch *ds = chip->ds; @@ -2473,44 +2534,23 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) * If this is the upstream port for this switch, enable * forwarding of unknown unicasts and multicasts. */ - reg = 0; - if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) || - mv88e6xxx_6165_family(chip) || mv88e6xxx_6097_family(chip) || - mv88e6xxx_6095_family(chip) || mv88e6xxx_6065_family(chip) || - mv88e6xxx_6185_family(chip) || mv88e6xxx_6320_family(chip)) - reg = PORT_CONTROL_IGMP_MLD_SNOOP | + reg = PORT_CONTROL_IGMP_MLD_SNOOP | PORT_CONTROL_USE_TAG | PORT_CONTROL_USE_IP | PORT_CONTROL_STATE_FORWARDING; - if (dsa_is_cpu_port(ds, port)) { - if (chip->info->tag_protocol == DSA_TAG_PROTO_EDSA) - reg |= PORT_CONTROL_FRAME_ETHER_TYPE_DSA | - PORT_CONTROL_FORWARD_UNKNOWN_MC; - else - reg |= PORT_CONTROL_DSA_TAG; - reg |= PORT_CONTROL_EGRESS_ADD_TAG | - PORT_CONTROL_FORWARD_UNKNOWN; - } - if (dsa_is_dsa_port(ds, port)) { - if (mv88e6xxx_6095_family(chip) || - mv88e6xxx_6185_family(chip)) - reg |= PORT_CONTROL_DSA_TAG; - if (mv88e6xxx_6352_family(chip) || -
[PATCH v3 net-next 3/4] net: dsa: mv88e6xxx: Move the tagging protocol into info
Older chips support a single tagging protocol, DSA. New chips support both DSA and EDSA, an enhanced version. Having both as an option changes the register layouts. Up until now, it has been assumed that if EDSA is supported, it will be used. Hence the register layout has been determined by which protocol should be used. However, mv88e6390 has a different implementation of EDSA, which requires that we use DSA tagging. Hence separate the selection of the protocol from the register layout. Signed-off-by: Andrew Lunn Reviewed-by: Vivien Didelot --- drivers/net/dsa/mv88e6xxx/chip.c | 33 +++-- drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 17 - 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 6e981bedd028..80efee6f5e16 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2482,7 +2482,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) PORT_CONTROL_USE_TAG | PORT_CONTROL_USE_IP | PORT_CONTROL_STATE_FORWARDING; if (dsa_is_cpu_port(ds, port)) { - if (mv88e6xxx_has(chip, MV88E6XXX_FLAG_EDSA)) + if (chip->info->tag_protocol == DSA_TAG_PROTO_EDSA) reg |= PORT_CONTROL_FRAME_ETHER_TYPE_DSA | PORT_CONTROL_FORWARD_UNKNOWN_MC; else @@ -2611,7 +2611,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) /* Port Ethertype: use the Ethertype DSA Ethertype * value. 
*/ - if (mv88e6xxx_has(chip, MV88E6XXX_FLAG_EDSA)) { + if (chip->info->tag_protocol == DSA_TAG_PROTO_EDSA) { err = mv88e6xxx_port_write(chip, port, PORT_ETH_TYPE, ETH_P_EDSA); if (err) @@ -3637,6 +3637,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global1_addr = 0x1b, .age_time_coeff = 15000, .g1_irqs = 8, + .tag_protocol = DSA_TAG_PROTO_DSA, .flags = MV88E6XXX_FLAGS_FAMILY_6097, .ops = &mv88e6085_ops, }, @@ -3651,6 +3652,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global1_addr = 0x1b, .age_time_coeff = 15000, .g1_irqs = 8, + .tag_protocol = DSA_TAG_PROTO_DSA, .flags = MV88E6XXX_FLAGS_FAMILY_6095, .ops = &mv88e6095_ops, }, @@ -3679,6 +3681,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global1_addr = 0x1b, .age_time_coeff = 15000, .g1_irqs = 9, + .tag_protocol = DSA_TAG_PROTO_DSA, .flags = MV88E6XXX_FLAGS_FAMILY_6165, .ops = &mv88e6123_ops, }, @@ -3693,6 +3696,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global1_addr = 0x1b, .age_time_coeff = 15000, .g1_irqs = 9, + .tag_protocol = DSA_TAG_PROTO_DSA, .flags = MV88E6XXX_FLAGS_FAMILY_6185, .ops = &mv88e6131_ops, }, @@ -3707,6 +3711,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global1_addr = 0x1b, .age_time_coeff = 15000, .g1_irqs = 9, + .tag_protocol = DSA_TAG_PROTO_DSA, .flags = MV88E6XXX_FLAGS_FAMILY_6165, .ops = &mv88e6161_ops, }, @@ -3721,6 +3726,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global1_addr = 0x1b, .age_time_coeff = 15000, .g1_irqs = 9, + .tag_protocol = DSA_TAG_PROTO_DSA, .flags = MV88E6XXX_FLAGS_FAMILY_6165, .ops = &mv88e6165_ops, }, @@ -3735,6 +3741,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global1_addr = 0x1b, .age_time_coeff = 15000, .g1_irqs = 9, + .tag_protocol = DSA_TAG_PROTO_EDSA, .flags = MV88E6XXX_FLAGS_FAMILY_6351, .ops = &mv88e6171_ops, }, @@ -3749,6 +3756,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global1_addr = 0x1b, .age_time_coeff = 15000, 
.g1_irqs = 9, + .tag_protocol = DSA_TAG_PROTO_EDSA, .flags = MV88E6XXX_FLAGS_FAMILY_6352, .ops = &mv88e6172_ops, }, @@ -3763,6 +3771,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global1_addr = 0x1b, .age_time_coeff = 15000, .g1_irqs = 9, + .tag_protocol = DSA_TAG_PROTO_EDSA, .
Re: [net-next PATCH v4 5/6] virtio_net: add XDP_TX support
On 16-12-02 12:51 PM, John Fastabend wrote: > This adds support for the XDP_TX action to virtio_net. When an XDP > program is run and returns the XDP_TX action the virtio_net XDP > implementation will transmit the packet on a TX queue that aligns > with the current CPU that the XDP packet was processed on. > > Before sending the packet the header is zeroed. Also XDP is expected > to handle checksum correctly so no checksum offload support is > provided. > > Signed-off-by: John Fastabend > --- > drivers/net/virtio_net.c | 63 > -- > 1 file changed, 60 insertions(+), 3 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index b67203e..137caba 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -330,12 +330,43 @@ static struct sk_buff *page_to_skb(struct virtnet_info > *vi, > return skb; > } > > +static void virtnet_xdp_xmit(struct virtnet_info *vi, > + unsigned int qnum, struct xdp_buff *xdp) > +{ > + struct send_queue *sq = &vi->sq[qnum]; > + struct virtio_net_hdr_mrg_rxbuf *hdr; > + unsigned int num_sg, len; > + void *xdp_sent; > + int err; > + > + /* Free up any pending old buffers before queueing new ones. 
*/ > + while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) { > + struct page *page = virt_to_head_page(xdp_sent); > + > + put_page(page); > + } > + > + /* Zero header and leave csum up to XDP layers */ > + hdr = xdp->data; > + memset(hdr, 0, vi->hdr_len); > + > + num_sg = 1; > + sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data); > + err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, > +xdp->data, GFP_ATOMIC); > + if (unlikely(err)) > + put_page(virt_to_head_page(xdp->data)); > + else > + virtqueue_kick(sq->vq); > +} > + Hi Michael, Any idea why the above pattern > + err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, > +xdp->data, GFP_ATOMIC); > + if (unlikely(err)) > + put_page(virt_to_head_page(xdp->data)); > + else > + virtqueue_kick(sq->vq); > +} would cause a hang but if I call the virtqueue_kick as below even in the error case everything seems to be fine. err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, xdp->data, GFP_ATOMIC); if (unlikely(err)) put_page(virt_to_head_page(xdp->data)); virtqueue_kick(sq->vq); I'll take a look through the virtio code but thought I might ask in case you know off-hand or it could be something else entirely. I noticed virtio_input.c uses the second pattern and virtio_net.c uses the above pattern but I'm guessing it never gets exercised due to stack backoff. Thanks, John
Re: [PATCH 2/3] uapi: export tc_skbmod.h
Hi Stephen, [auto build test ERROR on linus/master] [also build test ERROR on v4.9-rc7] [cannot apply to next-20161202] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Stephen-Hemminger/UAPI-export-missing-headers/20161203-104831 config: i386-tinyconfig (attached as .config) compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: # save the attached .config to linux build tree make ARCH=i386 All errors (new ones prefixed by >>): >> scripts/Makefile.headersinst:55: *** Missing UAPI file >> include/uapi/linux/tc_act/tc_sbkmod.h. Stop. -- >> scripts/Makefile.headersinst:55: *** Missing UAPI file >> include/uapi/linux/tc_act/tc_sbkmod.h. Stop. make[3]: *** [tc_act] Error 2 make[3]: Target '__headersinst' not remade because of errors. make[2]: *** [linux] Error 2 make[2]: Target '__headersinst' not remade because of errors. make[1]: *** [headers_install] Error 2 make: *** [sub-make] Error 2 vim +55 scripts/Makefile.headersinst d8ecc5cd Sam Ravnborg2011-04-27 39 10b63956 David Howells 2012-10-02 40 srcdir:= $(srctree)/$(obj) 10b63956 David Howells 2012-10-02 41 gendir:= $(objtree)/$(gen) 10b63956 David Howells 2012-10-02 42 10b63956 David Howells 2012-10-02 43 oldsrcdir := $(srctree)/$(subst /uapi,,$(obj)) 10b63956 David Howells 2012-10-02 44 7712401a Sam Ravnborg2008-06-15 45 # all headers files for this dir d8ecc5cd Sam Ravnborg2011-04-27 46 header-y := $(filter-out $(generic-y), $(header-y)) 40f1d4c2 David Howells 2012-10-02 47 all-files := $(header-y) $(genhdr-y) $(wrapper-files) 10b63956 David Howells 2012-10-02 48 output-files := $(addprefix $(installdir)/, $(all-files)) 10b63956 David Howells 2012-10-02 49 c0ff68f1 Nicolas Dichtel 2013-04-29 50 input-files1 := $(foreach hdr, $(header-y), \ c4619bc6 Sam Ravnborg2013-03-04 51$(if $(wildcard $(srcdir)/$(hdr)), \ c0ff68f1 Nicolas Dichtel 2013-04-29 52 $(wildcard $(srcdir)/$(hdr))) \ c0ff68f1 Nicolas Dichtel 2013-04-29 
53) c0ff68f1 Nicolas Dichtel 2013-04-29 54 input-files1-name := $(notdir $(input-files1)) c0ff68f1 Nicolas Dichtel 2013-04-29 @55 input-files2 := $(foreach hdr, $(header-y), \ c0ff68f1 Nicolas Dichtel 2013-04-29 56$(if $(wildcard $(srcdir)/$(hdr)),, \ c4619bc6 Sam Ravnborg2013-03-04 57 $(if $(wildcard $(oldsrcdir)/$(hdr)), \ 10b63956 David Howells 2012-10-02 58 $(wildcard $(oldsrcdir)/$(hdr)), \ c4619bc6 Sam Ravnborg2013-03-04 59 $(error Missing UAPI file $(srcdir)/$(hdr))) \ c0ff68f1 Nicolas Dichtel 2013-04-29 60)) c0ff68f1 Nicolas Dichtel 2013-04-29 61 input-files2-name := $(notdir $(input-files2)) c0ff68f1 Nicolas Dichtel 2013-04-29 62 input-files3 := $(foreach hdr, $(genhdr-y), \ c4619bc6 Sam Ravnborg2013-03-04 63$(if $(wildcard $(gendir)/$(hdr)), \ :: The code at line 55 was first introduced by commit :: c0ff68f1611d6855a06d672989ad5cfea160a4eb kbuild: fix make headers_install when path is too long :: TO: Nicolas Dichtel :: CC: Michal Marek --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
Re: [PATCH] net: wireless: realtek: constify rate_control_ops structures
On Sat, Dec 3, 2016 at 2:09 AM, Larry Finger wrote: > On 12/02/2016 03:50 AM, Bhumika Goyal wrote: >> >> The structures rate_control_ops are only passed as an argument to the >> functions ieee80211_rate_control_{register/unregister}. This argument is >> of type const, so rate_control_ops having this property can also be >> declared as const. >> Done using Coccinelle: >> >> @r1 disable optional_qualifier @ >> identifier i; >> position p; >> @@ >> static struct rate_control_ops i@p = {...}; >> >> @ok1@ >> identifier r1.i; >> position p; >> @@ >> ieee80211_rate_control_register(&i@p) >> >> @ok2@ >> identifier r1.i; >> position p; >> @@ >> ieee80211_rate_control_unregister(&i@p) >> >> @bad@ >> position p!={r1.p,ok1.p,ok2.p}; >> identifier r1.i; >> @@ >> i@p >> >> @depends on !bad disable optional_qualifier@ >> identifier r1.i; >> @@ >> static >> +const >> struct rate_control_ops i={...}; >> >> @depends on !bad disable optional_qualifier@ >> identifier r1.i; >> @@ >> +const >> struct rate_control_ops i; >> >> File size before: >> text data bss dec hex filename >> 1991 104 0 2095 82f wireless/realtek/rtlwifi/rc.o >> >> File size after: >> text data bss dec hex filename >> 2095 0 0 2095 82f wireless/realtek/rtlwifi/rc.o >> >> Signed-off-by: Bhumika Goyal >> --- >> drivers/net/wireless/realtek/rtlwifi/rc.c | 2 +- >> 1 file changed, 1 insertion(+), 1 deletion(-) >> >> diff --git a/drivers/net/wireless/realtek/rtlwifi/rc.c >> b/drivers/net/wireless/realtek/rtlwifi/rc.c >> index ce8621a..107c13c 100644 >> --- a/drivers/net/wireless/realtek/rtlwifi/rc.c >> +++ b/drivers/net/wireless/realtek/rtlwifi/rc.c >> @@ -284,7 +284,7 @@ static void rtl_rate_free_sta(void *rtlpriv, >> kfree(rate_priv); >> } >> >> -static struct rate_control_ops rtl_rate_ops = { >> +static const struct rate_control_ops rtl_rate_ops = { >> .name = "rtl_rc", >> .alloc = rtl_rate_alloc, >> .free = rtl_rate_free, >> > > The content of your patch is OK; however, your subject is not. 
By > convention, "net: wireless: realtek:" is assumed. We do, however, include > "rtlwifi:" to indicate which part of drivers/net/wireless/realtek/ is > referenced. > Ok, I will send a v2 with the correct subject. Thanks for the input. Thanks, Bhumika > NACK > > Larry >
[PATCH net-next v2 1/4] bnxt_en: Re-factor bnxt_setup_tc().
Add a new function bnxt_setup_mq_tc() to handle MQPRIO. This new function will be called during ETS setup when we add DCBNL in the next patch. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 18 ++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 0e4f168..7664281 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6337,17 +6337,10 @@ static int bnxt_change_mtu(struct net_device *dev, int new_mtu) return 0; } -static int bnxt_setup_tc(struct net_device *dev, u32 handle, __be16 proto, -struct tc_to_netdev *ntc) +int bnxt_setup_mq_tc(struct net_device *dev, u8 tc) { struct bnxt *bp = netdev_priv(dev); bool sh = false; - u8 tc; - - if (ntc->type != TC_SETUP_MQPRIO) - return -EINVAL; - - tc = ntc->tc; if (tc > bp->max_tc) { netdev_err(dev, "too many traffic classes requested: %d Max supported is %d\n", @@ -6390,6 +6383,15 @@ static int bnxt_setup_tc(struct net_device *dev, u32 handle, __be16 proto, return 0; } +static int bnxt_setup_tc(struct net_device *dev, u32 handle, __be16 proto, +struct tc_to_netdev *ntc) +{ + if (ntc->type != TC_SETUP_MQPRIO) + return -EINVAL; + + return bnxt_setup_mq_tc(dev, ntc->tc); +} + #ifdef CONFIG_RFS_ACCEL static bool bnxt_fltr_match(struct bnxt_ntuple_filter *f1, struct bnxt_ntuple_filter *f2) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 47be789..fcd07ee 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1225,5 +1225,6 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi) int bnxt_hwrm_fw_set_time(struct bnxt *); int bnxt_open_nic(struct bnxt *, bool, bool); int bnxt_close_nic(struct bnxt *, bool, bool); +int bnxt_setup_mq_tc(struct net_device *dev, u8 tc); int 
bnxt_get_max_rings(struct bnxt *, int *, int *, bool); #endif -- 1.8.3.1
[PATCH net-next v2 2/4] bnxt_en: Update firmware header file to latest 1.6.0.
Latest interface has the latest DCB command structs. Get and store the max number of lossless TCs the hardware can support. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 28 +- drivers/net/ethernet/broadcom/bnxt/bnxt.h |5 +- drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 1725 ++- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c |8 +- 4 files changed, 1069 insertions(+), 697 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 7664281..7ba5a99 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -186,11 +186,11 @@ enum board_idx { }; static const u16 bnxt_async_events_arr[] = { - HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE, - HWRM_ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD, - HWRM_ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED, - HWRM_ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE, - HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE, + ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE, + ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD, + ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED, + ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE, + ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE, }; static bool bnxt_vf_pciid(enum board_idx idx) @@ -1476,8 +1476,8 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons, } #define BNXT_GET_EVENT_PORT(data) \ - ((data) & \ -HWRM_ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_MASK) + ((data) & \ +ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_MASK) static int bnxt_async_event_process(struct bnxt *bp, struct hwrm_async_event_cmpl *cmpl) @@ -1486,7 +1486,7 @@ static int bnxt_async_event_process(struct bnxt *bp, /* TODO CHIMP_FW: Define event id's for link change, error etc */ switch (event_id) { - case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE: { + case ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE: { u32 data1 = le32_to_cpu(cmpl->event_data1); 
struct bnxt_link_info *link_info = &bp->link_info; @@ -1502,13 +1502,13 @@ static int bnxt_async_event_process(struct bnxt *bp, set_bit(BNXT_LINK_SPEED_CHNG_SP_EVENT, &bp->sp_event); /* fall thru */ } - case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE: + case ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE: set_bit(BNXT_LINK_CHNG_SP_EVENT, &bp->sp_event); break; - case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD: + case ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD: set_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event); break; - case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED: { + case ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED: { u32 data1 = le32_to_cpu(cmpl->event_data1); u16 port_id = BNXT_GET_EVENT_PORT(data1); @@ -1521,7 +1521,7 @@ static int bnxt_async_event_process(struct bnxt *bp, set_bit(BNXT_HWRM_PORT_MODULE_SP_EVENT, &bp->sp_event); break; } - case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE: + case ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE: if (BNXT_PF(bp)) goto async_event_process_exit; set_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event); @@ -4261,12 +4261,16 @@ static int bnxt_hwrm_queue_qportcfg(struct bnxt *bp) goto qportcfg_exit; } bp->max_tc = resp->max_configurable_queues; + bp->max_lltc = resp->max_configurable_lossless_queues; if (bp->max_tc > BNXT_MAX_QUEUE) bp->max_tc = BNXT_MAX_QUEUE; if (resp->queue_cfg_info & QUEUE_QPORTCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG) bp->max_tc = 1; + if (bp->max_lltc > bp->max_tc) + bp->max_lltc = bp->max_tc; + qptr = &resp->queue_id0; for (i = 0; i < bp->max_tc; i++) { bp->q_info[i].queue_id = *qptr++; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index fcd07ee..1f3d852 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -11,10 +11,10 @@ #define BNXT_H #define DRV_MODULE_NAME"bnxt_en" -#define DRV_MODULE_VERSION "1.5.0" +#define DRV_MODULE_VERSION "1.6.0" #define DRV_VER_MAJ1 -#define 
DRV_VER_MIN5 +#define DRV_VER_MIN6 #define DRV_VER_UPD0 struct tx_bd { @@ -1010,6 +1010,7 @@ struct bnxt { u32 rss_hash_cfg; u8 max_tc; + u8
[PATCH net-next v2 4/4] bnxt_en: Add PFC statistics.
Report PFC statistics to ethtool -S and DCBNL. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 7 +++ drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 14 +- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 23 --- 3 files changed, 36 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 2a714cf..b4abc1b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1124,6 +1124,13 @@ struct bnxt { u32 lpi_tmr_hi; }; +#define BNXT_RX_STATS_OFFSET(counter) \ + (offsetof(struct rx_port_stats, counter) / 8) + +#define BNXT_TX_STATS_OFFSET(counter) \ + ((offsetof(struct tx_port_stats, counter) + \ + sizeof(struct rx_port_stats) + 512) / 8) + #ifdef CONFIG_NET_RX_BUSY_POLL static inline void bnxt_enable_poll(struct bnxt_napi *bnapi) { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c index f391b47..fdf2d8c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c @@ -347,8 +347,10 @@ static int bnxt_dcbnl_ieee_setets(struct net_device *dev, struct ieee_ets *ets) static int bnxt_dcbnl_ieee_getpfc(struct net_device *dev, struct ieee_pfc *pfc) { struct bnxt *bp = netdev_priv(dev); + __le64 *stats = (__le64 *)bp->hw_rx_port_stats; struct ieee_pfc *my_pfc = bp->ieee_pfc; - int rc; + long rx_off, tx_off; + int i, rc; pfc->pfc_cap = bp->max_lltc; @@ -369,6 +371,16 @@ static int bnxt_dcbnl_ieee_getpfc(struct net_device *dev, struct ieee_pfc *pfc) pfc->mbc = my_pfc->mbc; pfc->delay = my_pfc->delay; + if (!stats) + return 0; + + rx_off = BNXT_RX_STATS_OFFSET(rx_pfc_ena_frames_pri0); + tx_off = BNXT_TX_STATS_OFFSET(tx_pfc_ena_frames_pri0); + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++, rx_off++, tx_off++) { + pfc->requests[i] = le64_to_cpu(*(stats + tx_off)); + pfc->indications[i] = le64_to_cpu(*(stats + rx_off)); + } + 
return 0; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index fa6125e..784aa77 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -107,16 +107,9 @@ static int bnxt_set_coalesce(struct net_device *dev, #define BNXT_NUM_STATS 21 -#define BNXT_RX_STATS_OFFSET(counter) \ - (offsetof(struct rx_port_stats, counter) / 8) - #define BNXT_RX_STATS_ENTRY(counter) \ { BNXT_RX_STATS_OFFSET(counter), __stringify(counter) } -#define BNXT_TX_STATS_OFFSET(counter) \ - ((offsetof(struct tx_port_stats, counter) + \ - sizeof(struct rx_port_stats) + 512) / 8) - #define BNXT_TX_STATS_ENTRY(counter) \ { BNXT_TX_STATS_OFFSET(counter), __stringify(counter) } @@ -150,6 +143,14 @@ static int bnxt_set_coalesce(struct net_device *dev, BNXT_RX_STATS_ENTRY(rx_tagged_frames), BNXT_RX_STATS_ENTRY(rx_double_tagged_frames), BNXT_RX_STATS_ENTRY(rx_good_frames), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri0), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri1), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri2), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri3), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri4), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri5), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri6), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri7), BNXT_RX_STATS_ENTRY(rx_undrsz_frames), BNXT_RX_STATS_ENTRY(rx_eee_lpi_events), BNXT_RX_STATS_ENTRY(rx_eee_lpi_duration), @@ -179,6 +180,14 @@ static int bnxt_set_coalesce(struct net_device *dev, BNXT_TX_STATS_ENTRY(tx_fcs_err_frames), BNXT_TX_STATS_ENTRY(tx_err), BNXT_TX_STATS_ENTRY(tx_fifo_underruns), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri0), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri1), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri2), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri3), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri4), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri5), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri6), + 
BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri7), BNXT_TX_STATS_ENTRY(tx_eee_lpi_events), BNXT_TX_STATS_ENTRY(tx_eee_lpi_duration), BNXT_TX_STATS_ENTRY(tx_total_collisions), -- 1.8.3.1
[PATCH net-next v2 3/4] bnxt_en: Implement DCBNL to support host-based DCBX.
Support only IEEE DCBX initially. Add IEEE DCBNL ops and functions to get and set the hardware DCBX parameters. The DCB code is conditional on Kconfig CONFIG_BNXT_DCB. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/Kconfig | 10 + drivers/net/ethernet/broadcom/bnxt/Makefile | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 +- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 9 + drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 490 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h | 41 +++ 6 files changed, 557 insertions(+), 3 deletions(-) create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig index bd8c80c..404c020 100644 --- a/drivers/net/ethernet/broadcom/Kconfig +++ b/drivers/net/ethernet/broadcom/Kconfig @@ -203,4 +203,14 @@ config BNXT_SRIOV Virtualization support in the NetXtreme-C/E products. This allows for virtual function acceleration in virtual environments. +config BNXT_DCB + bool "Data Center Bridging (DCB) Support" + default n + depends on BNXT && DCB + ---help--- + Say Y here if you want to use Data Center Bridging (DCB) in the + driver. + + If unsure, say N. 
+ endif # NET_VENDOR_BROADCOM diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile index 97e78e2..b233a86 100644 --- a/drivers/net/ethernet/broadcom/bnxt/Makefile +++ b/drivers/net/ethernet/broadcom/bnxt/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_BNXT) += bnxt_en.o -bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o +bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 7ba5a99..e8ab5fd 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -54,6 +54,7 @@ #include "bnxt.h" #include "bnxt_sriov.h" #include "bnxt_ethtool.h" +#include "bnxt_dcb.h" #define BNXT_TX_TIMEOUT(5 * HZ) @@ -4997,7 +4998,7 @@ static void bnxt_enable_napi(struct bnxt *bp) } } -static void bnxt_tx_disable(struct bnxt *bp) +void bnxt_tx_disable(struct bnxt *bp) { int i; struct bnxt_tx_ring_info *txr; @@ -5015,7 +5016,7 @@ static void bnxt_tx_disable(struct bnxt *bp) netif_carrier_off(bp->dev); } -static void bnxt_tx_enable(struct bnxt *bp) +void bnxt_tx_enable(struct bnxt *bp) { int i; struct bnxt_tx_ring_info *txr; @@ -6686,6 +6687,7 @@ static void bnxt_remove_one(struct pci_dev *pdev) bnxt_hwrm_func_drv_unrgtr(bp); bnxt_free_hwrm_resources(bp); + bnxt_dcb_free(bp); pci_iounmap(pdev, bp->bar2); pci_iounmap(pdev, bp->bar1); pci_iounmap(pdev, bp->bar0); @@ -6913,6 +6915,8 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->min_mtu = ETH_ZLEN; dev->max_mtu = 9500; + bnxt_dcb_init(bp); + #ifdef CONFIG_BNXT_SRIOV init_waitqueue_head(&bp->sriov_cfg_wait); #endif diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 1f3d852..2a714cf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1026,6 +1026,13 @@ struct bnxt { struct bnxt_irq *irq_tbl; u8 mac_addr[ETH_ALEN]; 
+#ifdef CONFIG_BNXT_DCB + struct ieee_pfc *ieee_pfc; + struct ieee_ets *ieee_ets; + u8 dcbx_cap; + u8 default_pri; +#endif /* CONFIG_BNXT_DCB */ + u32 msg_enable; u32 hwrm_spec_code; @@ -1221,6 +1228,8 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi) int hwrm_send_message_silent(struct bnxt *, void *, u32, int); int bnxt_hwrm_set_coal(struct bnxt *); int bnxt_hwrm_func_qcaps(struct bnxt *); +void bnxt_tx_disable(struct bnxt *bp); +void bnxt_tx_enable(struct bnxt *bp); int bnxt_hwrm_set_pause(struct bnxt *); int bnxt_hwrm_set_link_setting(struct bnxt *, bool, bool); int bnxt_hwrm_fw_set_time(struct bnxt *); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c new file mode 100644 index 000..f391b47 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c @@ -0,0 +1,490 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2014-2016 Broadcom Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "b
[PATCH net-next v2 0/4] bnxt_en: Add DCBNL support.
This series adds DCBNL operations to support host-based IEEE DCBX. v2: Updated to the latest firmware interface spec. David, please consider this series for net-next. Michael Chan (4): bnxt_en: Re-factor bnxt_setup_tc(). bnxt_en: Update firmware header file to latest 1.6.0. bnxt_en: Implement DCBNL to support host-based DCBX. bnxt_en: Add PFC statistics. drivers/net/ethernet/broadcom/Kconfig | 10 + drivers/net/ethernet/broadcom/bnxt/Makefile |2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 54 +- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 22 +- drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 502 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h | 41 + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 23 +- drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 1725 + drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c |8 +- 9 files changed, 1672 insertions(+), 715 deletions(-) create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h -- 1.8.3.1
Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs
On Fri, 2016-12-02 at 16:53 -0800, Alexei Starovoitov wrote: > On 12/2/16 4:38 PM, Eric Dumazet wrote: > > On Fri, 2016-12-02 at 15:23 -0800, Martin KaFai Lau wrote: > >> When XDP prog is attached, it is currently limiting > >> MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514 > >> in x86. > >> > >> AFAICT, since mlx4 is doing one page per packet for XDP, > >> we can at least raise the MTU limitation up to > >> PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is > >> doing. It will be useful in the next patch which allows > >> XDP program to extend the packet by adding new header(s). > >> > >> Signed-off-by: Martin KaFai Lau > >> --- > > > > Have you tested your patch on a host with PAGE_SIZE = 64 KB ? > > > > Looks XDP really kills arches with bigger pages :( > > I'm afraid xdp mlx[45] support was not tested on arches > with 64k pages at all. Not just this patch. > I think people who care about such archs should test? > Note page per packet is not a hard requirement for all drivers > and all archs. For mlx[45] it was the easiest and the most > convenient way to achieve desired performance. > If there are ways to do the same performance differently, > I'm all ears :) > My question was more like: Can we double check that all these patches won't break the mlx4 driver (non-XDP path) on arches with PAGE_SIZE=64KB? I have no plans to use XDP for a while, but I certainly know some customers are using mlx4 on powerpc.
[PATCH] net: ethernet: ti: cpdma: use desc_read in chan_process instead of raw read
There is a desc_read() macro for reading desc fields, so there is no need to use __raw_readl(). Signed-off-by: Ivan Khoronzhuk --- Based on net-next/master drivers/net/ethernet/ti/davinci_cpdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c index c776e45..d96dca5 100644 --- a/drivers/net/ethernet/ti/davinci_cpdma.c +++ b/drivers/net/ethernet/ti/davinci_cpdma.c @@ -1132,7 +1132,7 @@ static int __cpdma_chan_process(struct cpdma_chan *chan) } desc_dma = desc_phys(pool, desc); - status = __raw_readl(&desc->hw_mode); + status = desc_read(desc, hw_mode); outlen = status & 0x7ff; if (status & CPDMA_DESC_OWNER) { chan->stats.busy_dequeue++; -- 2.7.4
Re: [PATCH net-next] liquidio: 'imply' ptp instead of 'select'
Hi Arnd, [auto build test ERROR on net-next/master] url: https://github.com/0day-ci/linux/commits/Arnd-Bergmann/liquidio-imply-ptp-instead-of-select/20161203-084019 config: x86_64-allmodconfig compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: make ARCH=x86_64 allmodconfig make ARCH=x86_64 All errors (new ones prefixed by >>): >> drivers/net/ethernet/cavium/Kconfig:81: syntax error >> drivers/net/ethernet/cavium/Kconfig:80: unknown option "imply" make[2]: *** [allmodconfig] Error 1 make[1]: *** [allmodconfig] Error 2 make: *** [sub-make] Error 2 -- >> drivers/net/ethernet/cavium/Kconfig:81: syntax error >> drivers/net/ethernet/cavium/Kconfig:80: unknown option "imply" make[2]: *** [oldconfig] Error 1 make[1]: *** [oldconfig] Error 2 make: *** [sub-make] Error 2 -- >> drivers/net/ethernet/cavium/Kconfig:81: syntax error >> drivers/net/ethernet/cavium/Kconfig:80: unknown option "imply" make[2]: *** [olddefconfig] Error 1 make[2]: Target 'oldnoconfig' not remade because of errors. make[1]: *** [oldnoconfig] Error 2 make: *** [sub-make] Error 2 vim +81 drivers/net/ethernet/cavium/Kconfig d07a147f David Daney 2016-03-14 74 port on Cavium Networks' Octeon CN57XX, CN56XX, CN55XX, d07a147f David Daney 2016-03-14 75 CN54XX, CN52XX, and CN6XXX chips. d07a147f David Daney 2016-03-14 76 111fc64a Raghu Vatsavayi 2016-11-28 77 config LIQUIDIO_VF 111fc64a Raghu Vatsavayi 2016-11-28 78 tristate "Cavium LiquidIO VF support" 111fc64a Raghu Vatsavayi 2016-11-28 79 depends on 64BIT && PCI_MSI 2d6e65ca Arnd Bergmann 2016-12-03 @80 imply PTP_1588_CLOCK 111fc64a Raghu Vatsavayi 2016-11-28 @81 ---help--- 111fc64a Raghu Vatsavayi 2016-11-28 82 This driver supports Cavium LiquidIO Intelligent Server Adapter 111fc64a Raghu Vatsavayi 2016-11-28 83 based on CN23XX chips. 111fc64a Raghu Vatsavayi 2016-11-28 84 :: The code at line 81 was first introduced by commit :: 111fc64a237f231bc2d3187bdf8358eb7966e6a9 liquidio CN23XX: VF registration :: TO: Raghu Vatsavayi :: CC: David S. 
Miller --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
[PATCH] net: ping: check minimum size on ICMP header length
Prior to commit c0371da6047a ("put iov_iter into msghdr") in v3.19, there was no check that the iovec contained enough bytes for a icmp header, and the read loop would walk across neighboring stack contents. Since the iov_iter conversion, bad arguments are noticed, but the returned error is EFAULT. Returning EMSGSIZE is a clearer fix and solves the problem prior to v3.19. This was found using trinity with KASAN on v3.18: BUG: KASAN: stack-out-of-bounds in memcpy_fromiovec+0x60/0x114 at addr ffc071077da0 Read of size 8 by task trinity-c2/9623 page:ffbe034b9a08 count:0 mapcount:0 mapping: (null) index:0x0 flags: 0x0() page dumped because: kasan: bad access detected CPU: 0 PID: 9623 Comm: trinity-c2 Tainted: GBU 3.18.0-dirty #15 Hardware name: Google Tegra210 Smaug Rev 1,3+ (DT) Call trace: [] dump_backtrace+0x0/0x1ac arch/arm64/kernel/traps.c:90 [] show_stack+0x10/0x1c arch/arm64/kernel/traps.c:171 [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0x7c/0xd0 lib/dump_stack.c:50 [< inline >] print_address_description mm/kasan/report.c:147 [< inline >] kasan_report_error mm/kasan/report.c:236 [] kasan_report+0x380/0x4b8 mm/kasan/report.c:259 [< inline >] check_memory_region mm/kasan/kasan.c:264 [] __asan_load8+0x20/0x70 mm/kasan/kasan.c:507 [] memcpy_fromiovec+0x5c/0x114 lib/iovec.c:15 [< inline >] memcpy_from_msg include/linux/skbuff.h:2667 [] ping_common_sendmsg+0x50/0x108 net/ipv4/ping.c:674 [] ping_v4_sendmsg+0xd8/0x698 net/ipv4/ping.c:714 [] inet_sendmsg+0xe0/0x12c net/ipv4/af_inet.c:749 [< inline >] __sock_sendmsg_nosec net/socket.c:624 [< inline >] __sock_sendmsg net/socket.c:632 [] sock_sendmsg+0x124/0x164 net/socket.c:643 [< inline >] SYSC_sendto net/socket.c:1797 [] SyS_sendto+0x178/0x1d8 net/socket.c:1761 CVE-2016-8399 Reported-by: Qidan He Fixes: c319b4d76b9e ("net: ipv4: add IPPROTO_ICMP socket kind") Cc: sta...@vger.kernel.org Signed-off-by: Kees Cook --- net/ipv4/ping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/net/ipv4/ping.c b/net/ipv4/ping.c index 205e2000d395..8257be3f032c 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -654,7 +654,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, void *user_icmph, size_t icmph_len) { u8 type, code; - if (len > 0xFFFF) + if (len > 0xFFFF || len < icmph_len) return -EMSGSIZE; /* -- 2.7.4 -- Kees Cook Nexus Security
Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs
On 12/2/16 4:38 PM, Eric Dumazet wrote: On Fri, 2016-12-02 at 15:23 -0800, Martin KaFai Lau wrote: When XDP prog is attached, it is currently limiting MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514 in x86. AFAICT, since mlx4 is doing one page per packet for XDP, we can at least raise the MTU limitation up to PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is doing. It will be useful in the next patch which allows XDP program to extend the packet by adding new header(s). Signed-off-by: Martin KaFai Lau --- Have you tested your patch on a host with PAGE_SIZE = 64 KB ? Looks XDP really kills arches with bigger pages :( I'm afraid xdp mlx[45] support was not tested on arches with 64k pages at all. Not just this patch. I think people who care about such archs should test? Note page per packet is not a hard requirement for all drivers and all archs. For mlx[45] it was the easiest and the most convenient way to achieve desired performance. If there are ways to do the same performance differently, I'm all ears :)
Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs
On Fri, 2016-12-02 at 15:23 -0800, Martin KaFai Lau wrote: > When XDP prog is attached, it is currently limiting > MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514 > in x86. > > AFAICT, since mlx4 is doing one page per packet for XDP, > we can at least raise the MTU limitation up to > PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is > doing. It will be useful in the next patch which allows > XDP program to extend the packet by adding new header(s). > > Signed-off-by: Martin KaFai Lau > --- Have you tested your patch on a host with PAGE_SIZE = 64 KB ? Looks XDP really kills arches with bigger pages :( Thanks.
Re: [PATCH 5/7] Documentation: DT: net: cpsw: allow to specify descriptors pool size
On Fri, Dec 02, 2016 at 11:22:28AM -0600, Grygorii Strashko wrote: > > > On 12/02/2016 05:28 AM, Ivan Khoronzhuk wrote: > > On Thu, Dec 01, 2016 at 05:34:30PM -0600, Grygorii Strashko wrote: > >> Add optional property "descs_pool_size" to specify buffer descriptor's > >> pool size. The "descs_pool_size" should define total number of CPDMA > >> CPPI descriptors to be used for both ingress/egress packets > >> processing. If not specified - the default value 256 will be used > >> which will allow to place descriptor's pool into the internal CPPI > >> RAM on most of TI SoC. > >> > >> Signed-off-by: Grygorii Strashko > >> --- > >> Documentation/devicetree/bindings/net/cpsw.txt | 5 + > >> 1 file changed, 5 insertions(+) > >> > >> diff --git a/Documentation/devicetree/bindings/net/cpsw.txt > >> b/Documentation/devicetree/bindings/net/cpsw.txt > >> index 5ad439f..b99d196 100644 > >> --- a/Documentation/devicetree/bindings/net/cpsw.txt > >> +++ b/Documentation/devicetree/bindings/net/cpsw.txt > >> @@ -35,6 +35,11 @@ Optional properties: > >> For example in dra72x-evm, pcf gpio has to be > >> driven low so that cpsw slave 0 and phy data > >> lines are connected via mux. > >> +- descs_pool_size : total number of CPDMA CPPI descriptors to be used for > >> +both ingress/egress packets processing. if not > >> +specified the default value 256 will be used which > >> +will allow to place descriptors pool into the > >> +internal CPPI RAM. > > Does it describe h/w? Why now module parameter? or even smth like ethtool > > num > > ring entries? > > > > It can be module parameter too. in general this is expected to be > one-time boot setting only. > > - OR > So, do you propose to use >ethtool -g ethX > >ethtool -G ethX [rx N] [tx N] > ? It has slightly different names, but yes, why not? No need, maybe, but it's just a proposition; at least I was thinking about it after the proposition from +cc Schuyler Patton to leave the rx desc num property.
In this case it's possible to tune the tx/rx desc num ratio, even with SRAM descs. > > Now cpdma has one pool for all RX/TX channels, so changing this settings > by ethtool will require: pause interfaces, reallocate cpdma pool, A pause can lead to losses only for rx, and only for a very short time, so it's not very bad, especially when the user knows what he is doing. > re-arrange buffers between channels, resume interface. Correct? Correct. But some alternative variants can be used, like replacing descriptors: shrink the number of descs for every channel to 1, replace/add others, and expand. In this case there are no losses, but it's harder to debug issues afterwards. > > How do you think - we can move forward with one pool or better to have two > (Rx and Tx)? I think one is enough, just split, if there is no harm to perf. > > Wouldn't it be reasonable to still have DT (or module) parameter to avoid > cpdma reconfiguration on system startup (pause/resume interfaces) (faster > boot)? It would be, your choice, but it's not flexible. > > How about cpdma re-allocation policy (with expectation that is shouldn't > happen too often)? > - increasing of Rx, Tx will grow total number of physically allocated buffers > (total_desc_num) > - decreasing of Rx, Tx will just change number of available buffers (no > memory re-allocation) > > - OR > Can we move forward with current patch (total number of CPDMA CPPI > descriptors defined in DT) > and add ethtool -G ethX [rx N] [tx N] which will allow to re-split descs > between RX and TX? No objections, it requires re-allocations anyway. Re-splitting Rx and Tx will not need a lot of changes, as most of the code exists already. > > > > -- > regards, > -grygorii
Re: [PATCH net] geneve: avoid use-after-free of skb->data
2016-12-02, 14:09:25 -0500, David Miller wrote: > From: Sabrina Dubroca > Date: Fri, 2 Dec 2016 16:49:29 +0100 > > > geneve{,6}_build_skb can end up doing a pskb_expand_head(), which > > makes the ip_hdr(skb) reference we stashed earlier stale. Since it's > > only needed as an argument to ip_tunnel_ecn_encap(), move this > > directly in the function call. > > > > Fixes: 08399efc6319 ("geneve: ensure ECN info is handled properly in all > > tx/rx paths") > > Signed-off-by: Sabrina Dubroca > > Applied and queued up for -stable, thanks. > > This bug happens so many times that I think it might be time for > a debugging mode for pskb_expand_head() that unconditionally > reallocates the skb->data buffer regardless of whether it's > necessary or not and somehow unmaps the previous buffer to > force a trap on stale pointers. The problem with that is you'd need to enable the "debugging mode" in all wrappers, so that they don't bypass the actual call to pskb_expand_head(). And that still leaves all the direct calls to pskb_expand_head() that are guarded by some kind of check (just two random hits without even looking very hard: net/core/pktgen.c:process_ipsec, net/ipv4/ip_gre.c:gre_fb_xmit). Then I think we could just rely on KASAN (that's how I noticed this bug). > Better ideas welcome, of course :) May not be better ;) but at least another idea: I'd like to try something based on static analysis. We'd need a way to tag cached pointers to skb->data (via ip_hdr() or whatever), and propagate the notion that pskb_expand_head() makes these cached pointers stale through layers of function calls. I don't know how feasible this is with the tools we have. -- Sabrina
Re: [PATCH net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
On 12/03/2016 12:23 AM, Martin KaFai Lau wrote: This patch allows XDP prog to extend/remove the packet data at the head (like adding or removing header). It is done by adding a new XDP helper bpf_xdp_adjust_head(). It also renames bpf_helper_changes_skb_data() to bpf_helper_changes_pkt_data() to better reflect that XDP prog does not work on skb. Signed-off-by: Martin KaFai Lau [...] diff --git a/net/core/filter.c b/net/core/filter.c index 56b43587d200..6902e2f73e38 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2234,7 +2234,34 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_skb_data(void *func) +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) +{ + /* Both mlx4 and mlx5 driver align each packet to PAGE_SIZE when +* XDP prog is set. +* If the above is not true for the other drivers to support +* bpf_xdp_adjust_head, struct xdp_buff can be extended. +*/ + void *head = (void *)((unsigned long)xdp->data & PAGE_MASK); + void *new_data = xdp->data + offset; + + if (new_data < head || new_data >= xdp->data_end) + /* The packet length must be >=1 */ Patch looks generally good to me. Should the min pkt len here be limited to ETH_HLEN instead of 1? + return -EINVAL; + + xdp->data = new_data; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { + .func = bpf_xdp_adjust_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || [...]
Re: bpf bounded loops. Was: [flamebait] xdp
On Fri, Dec 02, 2016 at 11:42:15AM -0800, John Fastabend wrote: > >> As far as pattern search for DNS packets... > >> it was requested by Cloudflare guys back in March: > >> https://github.com/iovisor/bcc/issues/471 > >> and it is useful for several tracing use cases as well. > >> Unfortunately no one had time to implement it yet. > > > > The string operations you proposed on the other hand, which would count > > as one eBPF instructions, would give a lot more flexibility and allow > > more cycles to burn, but don't help parsing binary protocols like IPv6 > > extension headers. These are two separate things. We need pattern search regardless of bounded loops. A bpf program shouldn't be doing any complicated algorithms. The main reasons to have loops are: - speed up execution (smaller I-cache footprint) - avoid forcing compiler to unroll loops (easier for users) - support loops where unroll is not possible (like example below) > My rough thinking on this was the verifier had to start looking for loop > invariants and to guarantee termination. Sounds scary in general but > LLVM could put these in some normal form for us and the verifier could > only accept decreasing loops, the invariants could be required to be > integers, etc. By simplifying the loop enough the problem becomes > tractable. Yep. I think what Hannes was proposing earlier is straightforward to implement for a compiler guy. The following: for (int i = 0; i < (var & 0xff); i++) sum += map->value[i]; /* map value_size >= 0xff */ is obviously bounded and dataflow analysis can easily prove that all memory operations are valid. Static analysis tools do way way more than this. > I think this would be better than new instructions and/or multiple > verifiers. Agree that it's better than new instructions that would have required JIT changes. Though there are pros to new insns too :)
[PATCH net-next 3/4] mlx4: xdp: Reserve headroom for receiving packet when XDP prog is active
Reserve XDP_PACKET_HEADROOM when XDP prog is active. Signed-off-by: Martin KaFai Lau --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 17 +++-- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 23 +-- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 9 + drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 ++- 4 files changed, 39 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 5df0bbd88d67..fb6d87dbc350 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -51,7 +51,8 @@ #include "mlx4_en.h" #include "en_port.h" -#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN))) +#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \ + XDP_PACKET_HEADROOM)) int mlx4_en_setup_tc(struct net_device *dev, u8 up) { @@ -1551,6 +1552,7 @@ int mlx4_en_start_port(struct net_device *dev) struct mlx4_en_tx_ring *tx_ring; int rx_index = 0; int err = 0; + int mtu; int i, t; int j; u8 mc_list[16] = {0}; @@ -1684,8 +1686,12 @@ int mlx4_en_start_port(struct net_device *dev) } /* Configure port */ + mtu = priv->rx_skb_size + ETH_FCS_LEN; + if (priv->tx_ring_num[TX_XDP]) + mtu += XDP_PACKET_HEADROOM; + err = mlx4_SET_PORT_general(mdev->dev, priv->port, - priv->rx_skb_size + ETH_FCS_LEN, + mtu, priv->prof->tx_pause, priv->prof->tx_ppp, priv->prof->rx_pause, @@ -2268,6 +2274,13 @@ static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu) { struct mlx4_en_priv *priv = netdev_priv(dev); + if (mtu + XDP_PACKET_HEADROOM > priv->max_mtu) { + en_err(priv, + "Device max mtu:%d does not allow %d bytes reserved headroom for XDP prog\n", + priv->max_mtu, XDP_PACKET_HEADROOM); + return false; + } + if (mtu > MLX4_EN_MAX_XDP_MTU) { en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n", mtu, MLX4_EN_MAX_XDP_MTU); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 23e9d04d1ef4..324771ac929e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -96,7 +96,6 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS]; const struct mlx4_en_frag_info *frag_info; struct page *page; - dma_addr_t dma; int i; for (i = 0; i < priv->num_frags; i++) { @@ -115,9 +114,10 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv, for (i = 0; i < priv->num_frags; i++) { frags[i] = ring_alloc[i]; - dma = ring_alloc[i].dma + ring_alloc[i].page_offset; + frags[i].page_offset += priv->frag_info[i].rx_headroom; + rx_desc->data[i].addr = cpu_to_be64(frags[i].dma + + frags[i].page_offset); ring_alloc[i] = page_alloc[i]; - rx_desc->data[i].addr = cpu_to_be64(dma); } return 0; @@ -250,7 +250,8 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv, if (ring->page_cache.index > 0) { frags[0] = ring->page_cache.buf[--ring->page_cache.index]; - rx_desc->data[0].addr = cpu_to_be64(frags[0].dma); + rx_desc->data[0].addr = cpu_to_be64(frags[0].dma + + frags[0].page_offset); return 0; } @@ -889,6 +890,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud if (xdp_prog) { struct xdp_buff xdp; dma_addr_t dma; + void *pg_addr, *orig_data; u32 act; dma = be64_to_cpu(rx_desc->data[0].addr); @@ -896,11 +898,18 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud priv->frag_info[0].frag_size, DMA_FROM_DEVICE); - xdp.data = page_address(frags[0].page) + - frags[0].page_offset; + pg_addr = page_address(frags[0].page); + orig_data = pg_addr + frags[0].page_offset; + xdp.data = orig_data; xdp.data_end = x
Re: [PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs
On 12/02/2016 03:23 PM, Martin KaFai Lau wrote: When XDP prog is attached, it is currently limiting MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514 in x86. AFAICT, since mlx4 is doing one page per packet for XDP, we can at least raise the MTU limitation up to PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is doing. It will be useful in the next patch which allows XDP program to extend the packet by adding new header(s). Is mlx4 the only driver doing page-per-packet? rick jones
[PATCH net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
This patch allows XDP prog to extend/remove the packet data at the head (like adding or removing header). It is done by adding a new XDP helper bpf_xdp_adjust_head(). It also renames bpf_helper_changes_skb_data() to bpf_helper_changes_pkt_data() to better reflect that XDP prog does not work on skb. Signed-off-by: Martin KaFai Lau --- arch/powerpc/net/bpf_jit_comp64.c | 4 ++-- arch/s390/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- include/linux/filter.h| 2 +- include/uapi/linux/bpf.h | 11 ++- kernel/bpf/core.c | 2 +- kernel/bpf/verifier.c | 2 +- net/core/filter.c | 34 -- 8 files changed, 49 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 0fe98a567125..73a5cf18fd84 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -766,7 +766,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, func = (u8 *) __bpf_call_base + imm; /* Save skb pointer if we need to re-cache skb data */ - if (bpf_helper_changes_skb_data(func)) + if (bpf_helper_changes_pkt_data(func)) PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_func_call(image, ctx, (u64)func); @@ -775,7 +775,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, PPC_MR(b2p[BPF_REG_0], 3); /* refresh skb cache */ - if (bpf_helper_changes_skb_data(func)) { + if (bpf_helper_changes_pkt_data(func)) { /* reload skb pointer to r3 */ PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_skb_loads(image, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index bee281f3163d..167b31b186c1 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -981,7 +981,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT2(0x0d00, REG_14, REG_W1); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb904, BPF_REG_0, REG_2); - if (bpf_helper_changes_skb_data((void *)func)) { + if 
(bpf_helper_changes_pkt_data((void *)func)) { jit->seen |= SEEN_SKB_CHANGE; /* lg %b1,ST_OFF_SKBP(%r15) */ EMIT6_DISP_LH(0xe300, 0x0004, BPF_REG_1, REG_0, diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index fe04a04dab8e..e76d1af60f7a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -853,7 +853,7 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - reload_skb_data = bpf_helper_changes_skb_data(func); + reload_skb_data = bpf_helper_changes_pkt_data(func); if (reload_skb_data) { EMIT1(0x57); /* push %rdi */ jmp_offset += 22; /* pop, mov, sub, mov */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 97338134398f..3c02de77ad6a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -590,7 +590,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); -bool bpf_helper_changes_skb_data(void *func); +bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6123d9b8e828..0eb0e87dbe9f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -424,6 +424,12 @@ union bpf_attr { * @len: length of header to be pushed in front * @flags: Flags (unused for now) * Return: 0 on success or negative error + * + * int bpf_xdp_adjust_head(xdp_md, delta) + * Adjust the xdp_md.data by delta + * @xdp_md: pointer to xdp_md + * @delta: An positive/negative integer to be added to xdp_md.data + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -469,7 +475,8 @@ union bpf_attr { FN(csum_update),\ FN(set_hash_invalid), \ FN(ge
[PATCH net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs
When XDP prog is attached, it is currently limiting MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN) which is 1514 in x86. AFAICT, since mlx4 is doing one page per packet for XDP, we can at least raise the MTU limitation up to PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this patch is doing. It will be useful in the next patch which allows XDP program to extend the packet by adding new header(s). Signed-off-by: Martin KaFai Lau --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 28 +++- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 46 ++ 2 files changed, 44 insertions(+), 30 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 091b904262bc..5df0bbd88d67 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -51,6 +51,8 @@ #include "mlx4_en.h" #include "en_port.h" +#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN))) + int mlx4_en_setup_tc(struct net_device *dev, u8 up) { struct mlx4_en_priv *priv = netdev_priv(dev); @@ -2262,6 +2264,19 @@ void mlx4_en_destroy_netdev(struct net_device *dev) free_netdev(dev); } +static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (mtu > MLX4_EN_MAX_XDP_MTU) { + en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n", + mtu, MLX4_EN_MAX_XDP_MTU); + return false; + } + + return true; +} + static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) { struct mlx4_en_priv *priv = netdev_priv(dev); @@ -2271,11 +2286,10 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) en_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n", dev->mtu, new_mtu); - if (priv->tx_ring_num[TX_XDP] && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) { - en_err(priv, "MTU size:%d requires frags but XDP running\n", - new_mtu); - return -EOPNOTSUPP; - } + if (priv->tx_ring_num[TX_XDP] && + !mlx4_en_check_xdp_mtu(dev, 
new_mtu)) + return -ENOTSUPP; + dev->mtu = new_mtu; if (netif_running(dev)) { @@ -2723,10 +2737,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) return 0; } - if (priv->num_frags > 1) { - en_err(priv, "Cannot set XDP if MTU requires multiple frags\n"); + if (!mlx4_en_check_xdp_mtu(dev, dev->mtu)) return -EOPNOTSUPP; - } tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); if (!tmp) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 6562f78b07f4..23e9d04d1ef4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -1164,37 +1164,39 @@ static const int frag_sizes[] = { void mlx4_en_calc_rx_buf(struct net_device *dev) { - enum dma_data_direction dma_dir = PCI_DMA_FROMDEVICE; struct mlx4_en_priv *priv = netdev_priv(dev); int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu); - int order = MLX4_EN_ALLOC_PREFER_ORDER; - u32 align = SMP_CACHE_BYTES; - int buf_size = 0; int i = 0; /* bpf requires buffers to be set up as 1 packet per page. * This only works when num_frags == 1. */ if (priv->tx_ring_num[TX_XDP]) { - dma_dir = PCI_DMA_BIDIRECTIONAL; - /* This will gain efficient xdp frame recycling at the expense -* of more costly truesize accounting + priv->frag_info[0].order = 0; + priv->frag_info[0].frag_size = eff_mtu; + priv->frag_info[0].frag_prefix_size = 0; + /* This will gain efficient xdp frame recycling at the +* expense of more costly truesize accounting */ - align = PAGE_SIZE; - order = 0; - } - - while (buf_size < eff_mtu) { - priv->frag_info[i].order = order; - priv->frag_info[i].frag_size = - (eff_mtu > buf_size + frag_sizes[i]) ? 
- frag_sizes[i] : eff_mtu - buf_size; - priv->frag_info[i].frag_prefix_size = buf_size; - priv->frag_info[i].frag_stride = - ALIGN(priv->frag_info[i].frag_size, align); - priv->frag_info[i].dma_dir = dma_dir; - buf_size += priv->frag_info[i].frag_size; - i++; + priv->frag_info[0].frag_stride = PAGE_SIZE; + priv->frag_info[0].dma_dir = PCI_DMA_BIDIRECTIONAL; + i = 1; + } else { + int buf_size = 0; + + while (buf_s
Re: [PATCH v2 0/7] stmmac: dwmac-meson8b: configurable RGMII TX delay
On Mon, Nov 28, 2016 at 2:33 AM, David Miller wrote: > From: Martin Blumenstingl > Date: Fri, 25 Nov 2016 14:01:49 +0100 > >> Currently the dwmac-meson8b stmmac glue driver uses a hardcoded 1/4 >> cycle TX clock delay. This seems to work fine for many boards (for >> example Odroid-C2 or Amlogic's reference boards) but there are some >> others where TX traffic is simply broken. >> There are probably multiple reasons why it's working on some boards >> while it's broken on others: >> - some of Amlogic's reference boards are using a Micrel PHY >> - hardware circuit design >> - maybe more... > > The ARM arch file changes do not apply cleanly to net-next, you probably > want to merge them via the ARM tree instead of mine, and respin this series > to be without the .dts file changes. done, v3 contains only the net-next changes while the dts changes can be found here: [0] Regards, Martin [0] http://lists.infradead.org/pipermail/linux-amlogic/2016-December/001836.html
[PATCH net-next 4/4] bpf: xdp: Add XDP example for head adjustment
The XDP prog checks if the incoming packet matches any VIP:PORT combination in the BPF hashmap. If it is, it will encapsulate the packet with a IPv4/v6 header as instructed by the value of the BPF hashmap and then XDP_TX it out. The VIP:PORT -> IP-Encap-Info can be specified by the cmd args of the user prog. Signed-off-by: Martin KaFai Lau --- samples/bpf/Makefile | 4 + samples/bpf/bpf_helpers.h | 2 + samples/bpf/bpf_load.c| 94 ++ samples/bpf/bpf_load.h| 1 + samples/bpf/xdp1_user.c | 93 -- samples/bpf/xdp_tx_iptnl_common.h | 37 ++ samples/bpf/xdp_tx_iptnl_kern.c | 232 ++ samples/bpf/xdp_tx_iptnl_user.c | 253 ++ 8 files changed, 623 insertions(+), 93 deletions(-) create mode 100644 samples/bpf/xdp_tx_iptnl_common.h create mode 100644 samples/bpf/xdp_tx_iptnl_kern.c create mode 100644 samples/bpf/xdp_tx_iptnl_user.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index bfc2cb88a1f7..e4d6be8bd94b 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -32,6 +32,7 @@ hostprogs-y += trace_event hostprogs-y += sampleip hostprogs-y += tc_l2_redirect hostprogs-y += lwt_len_hist +hostprogs-y += xdp_tx_iptnl test_lru_dist-objs := test_lru_dist.o libbpf.o sock_example-objs := sock_example.o libbpf.o @@ -65,6 +66,7 @@ trace_event-objs := bpf_load.o libbpf.o trace_event_user.o sampleip-objs := bpf_load.o libbpf.o sampleip_user.o tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o lwt_len_hist-objs := bpf_load.o libbpf.o lwt_len_hist_user.o +xdp_tx_iptnl-objs := bpf_load.o libbpf.o xdp_tx_iptnl_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -97,6 +99,7 @@ always += test_current_task_under_cgroup_kern.o always += trace_event_kern.o always += sampleip_kern.o always += lwt_len_hist_kern.o +always += xdp_tx_iptnl_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ @@ -127,6 +130,7 @@ HOSTLOADLIBES_trace_event += -lelf HOSTLOADLIBES_sampleip += -lelf 
HOSTLOADLIBES_tc_l2_redirect += -l elf HOSTLOADLIBES_lwt_len_hist += -l elf +HOSTLOADLIBES_xdp_tx_iptnl += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index a246c6122629..8e9dca50b73a 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -57,6 +57,8 @@ static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = (void *) BPF_FUNC_skb_set_tunnel_opt; static unsigned long long (*bpf_get_prandom_u32)(void) = (void *) BPF_FUNC_get_prandom_u32; +static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = + (void *) BPF_FUNC_xdp_adjust_head; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 49b45ccbe153..e30b6de94f2e 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -12,6 +12,10 @@ #include #include #include +#include +#include +#include +#include #include #include #include @@ -450,3 +454,93 @@ struct ksym *ksym_search(long key) /* out of range. 
return _stext */ return &syms[0]; } + +int set_link_xdp_fd(int ifindex, int fd) +{ + struct sockaddr_nl sa; + int sock, seq = 0, len, ret = -1; + char buf[4096]; + struct nlattr *nla, *nla_xdp; + struct { + struct nlmsghdr nh; + struct ifinfomsg ifinfo; + char attrbuf[64]; + } req; + struct nlmsghdr *nh; + struct nlmsgerr *err; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + goto cleanup; + } + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.nh.nlmsg_pid = 0; + req.nh.nlmsg_seq = ++seq; + req.ifinfo.ifi_family = AF_UNSPEC; + req.ifinfo.ifi_index = ifindex; + nla = (struct nlattr *)(((char *)&req) + + NLMSG_ALIGN(req.nh.nlmsg_len)); + nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/; + + nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN); + nla_xdp->nla_type = 1/*IFLA_XDP_FD*/; + nla_xdp->nla_len = NLA_HD
[PATCH net-next 0/4]: Allow head adjustment in XDP prog
This series adds a helper to allow head adjustment in XDP prog. mlx4 driver has been modified to support this feature. An example is written to encapsulate a packet with an IPv4/v6 header and then XDP_TX it out. Thanks, --Martin
Re: [PATCH net-next] liquidio: 'imply' ptp instead of 'select'
On 12/02/2016 03:04 PM, Arnd Bergmann wrote: ptp now depends on the optional POSIX_TIMERS setting and fails to build if we select it without that: warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet direct dependencies (NET && POSIX_TIMERS) warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet direct dependencies (NET && POSIX_TIMERS) ERROR: "posix_clock_unregister" [drivers/ptp/ptp.ko] undefined! ERROR: "posix_clock_register" [drivers/ptp/ptp.ko] undefined! ERROR: "pps_unregister_source" [drivers/ptp/ptp.ko] undefined! ERROR: "pps_event" [drivers/ptp/ptp.ko] undefined! ERROR: "pps_register_source" [drivers/ptp/ptp.ko] undefined! It seems that two patches have collided here, the build failure is a result of the combination. Changing the new option to 'imply' as well fixes it. Fixes: 111fc64a237f ("liquidio CN23XX: VF registration") Fixes: d1cbfd771ce8 ("ptp_clock: Allow for it to be optional") Signed-off-by: Arnd Bergmann I didn't know about this new "imply" thing. This seems like a plausible fix, so... Acked-by: David Daney Thanks for fixing this up. --- drivers/net/ethernet/cavium/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/Kconfig b/drivers/net/ethernet/cavium/Kconfig index bbc8bd16cb97..dcbce6cac63e 100644 --- a/drivers/net/ethernet/cavium/Kconfig +++ b/drivers/net/ethernet/cavium/Kconfig @@ -77,7 +77,7 @@ config OCTEON_MGMT_ETHERNET config LIQUIDIO_VF tristate "Cavium LiquidIO VF support" depends on 64BIT && PCI_MSI - select PTP_1588_CLOCK + imply PTP_1588_CLOCK ---help--- This driver supports Cavium LiquidIO Intelligent Server Adapter based on CN23XX chips.
Re: bpf bounded loops. Was: [flamebait] xdp
On Fri, Dec 02, 2016 at 08:42:41PM +0100, Hannes Frederic Sowa wrote: > On Fri, Dec 2, 2016, at 20:25, Hannes Frederic Sowa wrote: > > On 02.12.2016 19:39, Alexei Starovoitov wrote: > > > On Thu, Dec 01, 2016 at 10:27:12PM +0100, Hannes Frederic Sowa wrote: > > >> like") and the problematic of parsing DNS packets in XDP due to string > > >> processing and looping inside eBPF. > > > > > > Hannes, > > > Not too long ago you proposed a very interesting idea to add > > > support for bounded loops without adding any new bpf instructions and > > > changing llvm (which was way better than my 'rep' like instructions > > > I was experimenting with). I thought systemtap guys also wanted bounded > > > loops and you were cooperating on the design, so I gave up on my work and > > > was expecting an imminent patch from you. I guess it sounds like you know > > > believe that bounded loops are impossible or I misunderstand your > > > statement ? > > > > Your argument was that it would need a new verifier as the current first > > pass checks that we indeed can lay out the basic blocks as a DAG which > > the second pass depends on. This would be violated. yes. today the main part of verifier depends on cfg check that confirms DAG property of the program. This was done as a simplification for the algorithm, so any programmer that understands C can understand the verifier code. It certainly was the case, since most of the people who hacked verifier had zero compiler background. Now I'm thinking to introduce proper compiler technologies to it. On one side it will make the bar to understand higher and on the other side it will cleanup the logic and reuse tens of years of data flow analysis theory and will make verifier more robust and mathematically solid. > > Because eBPF is available by non privileged users this would need a lot > > of effort to rewrite and verify (or indeed keep two verifiers in the > > kernel for priv and non-priv). 
The verifier itself is exposed to > > unprivileged users. I certainly hear your concerns that people unfamiliar with it are simply scared that more and more verification logic being added. So I don't mind freezing current verifier for unpriv and let proper data flow analysis to be done in root only component. > > Also, by design, if we keep the current limits, this would not give you > > more instructions to operate on compared to the flattened version of the > > program, it would merely reduce the numbers of optimizations in LLVM > > that let the verifier reject the program. I think we most likely will keep 4k insn limit (since there were no requests to increase it). The bounded loops will improve performance and reduce I-cache misses. > The only solution to protect the verifier, which I saw, would be to > limit it by time and space, thus making loading of eBPF programs > depending on how fast and hot (thermal throttling) one CPU thread is. the verifier already has time and space limits. See no reason to rely on physical cpu sensors. > Those are the complexity problems I am talking and concerned about. Do you have concerns when people implement encryption algorithm that you're unfamiliar with? Isn't it much bigger concern, since any bugs in the algorithm are directly exploitable and when encryption is actually used it's protecting sensitive data, whereas here the verifier protects kernel from crashing.
[PATCH net-next v3 2/2] net: stmmac: dwmac-meson8b: make the RGMII TX delay configurable
Prior to this patch we were using a hardcoded RGMII TX clock delay of 2ns (= 1/4 cycle of the 125MHz RGMII TX clock). This value works for many boards, but unfortunately not for all (due to the way the actual circuit is designed, sometimes because the TX delay is enabled in the PHY, etc.). Making the TX delay on the MAC side configurable allows us to support all possible hardware combinations. This allows fixing a compatibility issue on some boards, where the RTL8211F PHY is configured to generate the TX delay. We can now turn off the TX delay in the MAC, because otherwise we would be applying the delay twice (which results in non-working TX traffic). Signed-off-by: Martin Blumenstingl Tested-by: Neil Armstrong --- drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 21 +++-- 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c index 250e4ce..dad31b0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c @@ -35,10 +35,6 @@ #define PRG_ETH0_TXDLY_SHIFT 5 #define PRG_ETH0_TXDLY_MASKGENMASK(6, 5) -#define PRG_ETH0_TXDLY_OFF (0x0 << PRG_ETH0_TXDLY_SHIFT) -#define PRG_ETH0_TXDLY_QUARTER (0x1 << PRG_ETH0_TXDLY_SHIFT) -#define PRG_ETH0_TXDLY_HALF(0x2 << PRG_ETH0_TXDLY_SHIFT) -#define PRG_ETH0_TXDLY_THREE_QUARTERS (0x3 << PRG_ETH0_TXDLY_SHIFT) /* divider for the result of m250_sel */ #define PRG_ETH0_CLK_M250_DIV_SHIFT7 @@ -69,6 +65,8 @@ struct meson8b_dwmac { struct clk_divider m25_div; struct clk *m25_div_clk; + + u32 tx_delay_ns; }; static void meson8b_dwmac_mask_bits(struct meson8b_dwmac *dwmac, u32 reg, @@ -179,6 +177,7 @@ static int meson8b_init_prg_eth(struct meson8b_dwmac *dwmac) { int ret; unsigned long clk_rate; + u8 tx_dly_val; switch (dwmac->phy_mode) { case PHY_INTERFACE_MODE_RGMII: @@ -196,9 +195,13 @@ static int meson8b_init_prg_eth(struct meson8b_dwmac *dwmac) 
meson8b_dwmac_mask_bits(dwmac, PRG_ETH0, PRG_ETH0_INVERTED_RMII_CLK, 0); - /* TX clock delay - all known boards use a 1/4 cycle delay */ + /* TX clock delay in ns = "8ns / 4 * tx_dly_val" (where +* 8ns are exactly one cycle of the 125MHz RGMII TX clock): +* 0ns = 0x0, 2ns = 0x1, 4ns = 0x2, 6ns = 0x3 +*/ + tx_dly_val = dwmac->tx_delay_ns >> 1; meson8b_dwmac_mask_bits(dwmac, PRG_ETH0, PRG_ETH0_TXDLY_MASK, - PRG_ETH0_TXDLY_QUARTER); + tx_dly_val << PRG_ETH0_TXDLY_SHIFT); break; case PHY_INTERFACE_MODE_RMII: @@ -277,6 +280,12 @@ static int meson8b_dwmac_probe(struct platform_device *pdev) if (dwmac->phy_mode < 0) { dev_err(&pdev->dev, "missing phy-mode property\n"); return -EINVAL; + } else if (dwmac->phy_mode != PHY_INTERFACE_MODE_RMII) { + /* ignore errors as this is an optional property - by default +* we assume a TX delay of 0ns. +*/ + of_property_read_u32(pdev->dev.of_node, "amlogic,tx-delay-ns", +&dwmac->tx_delay_ns); } ret = meson8b_init_clk(dwmac); -- 2.10.2
[PATCH net-next v3 1/2] net: dt-bindings: add RGMII TX delay configuration to meson8b-dwmac
This allows configuring the RGMII TX clock delay. The RGMII clock is generated by underlying hardware of the Meson 8b / GXBB DWMAC glue. The configuration depends on the actual hardware (no delay may be needed due to the design of the actual circuit, the PHY might add this delay, etc.). Signed-off-by: Martin Blumenstingl Tested-by: Neil Armstrong --- Documentation/devicetree/bindings/net/meson-dwmac.txt | 14 ++ 1 file changed, 14 insertions(+) diff --git a/Documentation/devicetree/bindings/net/meson-dwmac.txt b/Documentation/devicetree/bindings/net/meson-dwmac.txt index 89e62dd..f8bc540 100644 --- a/Documentation/devicetree/bindings/net/meson-dwmac.txt +++ b/Documentation/devicetree/bindings/net/meson-dwmac.txt @@ -25,6 +25,20 @@ Required properties on Meson8b and newer: - "clkin0" - first parent clock of the internal mux - "clkin1" - second parent clock of the internal mux +Optional properties on Meson8b and newer: +- amlogic,tx-delay-ns: The internal RGMII TX clock delay (provided + by this driver) in nanoseconds. Allowed values + are: 0ns, 2ns, 4ns, 6ns. + This must be configured when the phy-mode is + "rgmii" (typically a value of 2ns is used in + this case). + When phy-mode is set to "rgmii-id" or + "rgmii-txid" the TX clock delay is already + provided by the PHY. In that case this + property should be set to 0ns (which disables + the TX clock delay in the MAC to prevent the + clock from going off because both PHY and MAC + are adding a delay). Example for Meson6: -- 2.10.2
[PATCH net-next v3 0/2] stmmac: dwmac-meson8b: configurable RGMII TX delay
Currently the dwmac-meson8b stmmac glue driver uses a hardcoded 1/4 cycle (= 2ns) TX clock delay. This seems to work fine for many boards (for example Odroid-C2 or Amlogic's reference boards) but there are some others where TX traffic is simply broken. There are probably multiple reasons why it's working on some boards while it's broken on others: - some of Amlogic's reference boards are using a Micrel PHY - hardware circuit design - maybe more... iperf3 results on my Mecool BB2 board (Meson GXM, RTL8211F PHY) with TX clock delay disabled on the MAC (as it's enabled in the PHY driver). TX throughput was virtually zero before: $ iperf3 -c 192.168.1.100 -R Connecting to host 192.168.1.100, port 5201 Reverse mode, remote host 192.168.1.100 is sending [ 4] local 192.168.1.206 port 52828 connected to 192.168.1.100 port 5201 [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 108 MBytes 901 Mbits/sec [ 4] 1.00-2.00 sec 94.2 MBytes 791 Mbits/sec [ 4] 2.00-3.00 sec 96.5 MBytes 810 Mbits/sec [ 4] 3.00-4.00 sec 96.2 MBytes 808 Mbits/sec [ 4] 4.00-5.00 sec 96.6 MBytes 810 Mbits/sec [ 4] 5.00-6.00 sec 96.5 MBytes 810 Mbits/sec [ 4] 6.00-7.00 sec 96.6 MBytes 810 Mbits/sec [ 4] 7.00-8.00 sec 96.5 MBytes 809 Mbits/sec [ 4] 8.00-9.00 sec 105 MBytes 884 Mbits/sec [ 4] 9.00-10.00 sec 111 MBytes 934 Mbits/sec - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Retr [ 4] 0.00-10.00 sec 1000 MBytes 839 Mbits/sec 0 sender [ 4] 0.00-10.00 sec 998 MBytes 837 Mbits/sec receiver iperf Done. 
$ iperf3 -c 192.168.1.100 Connecting to host 192.168.1.100, port 5201 [ 4] local 192.168.1.206 port 52832 connected to 192.168.1.100 port 5201 [ ID] Interval Transfer Bandwidth Retr Cwnd [ 4] 0.00-1.01 sec 99.5 MBytes 829 Mbits/sec 117 139 KBytes [ 4] 1.01-2.00 sec 105 MBytes 884 Mbits/sec 129 70.7 KBytes [ 4] 2.00-3.01 sec 107 MBytes 889 Mbits/sec 106 187 KBytes [ 4] 3.01-4.01 sec 105 MBytes 878 Mbits/sec 92 143 KBytes [ 4] 4.01-5.00 sec 105 MBytes 882 Mbits/sec 140 129 KBytes [ 4] 5.00-6.01 sec 106 MBytes 883 Mbits/sec 115 195 KBytes [ 4] 6.01-7.00 sec 102 MBytes 863 Mbits/sec 133 70.7 KBytes [ 4] 7.00-8.01 sec 106 MBytes 884 Mbits/sec 143 97.6 KBytes [ 4] 8.01-9.01 sec 104 MBytes 875 Mbits/sec 124 107 KBytes [ 4] 9.01-10.01 sec 105 MBytes 876 Mbits/sec 90 139 KBytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Retr [ 4] 0.00-10.01 sec 1.02 GBytes 874 Mbits/sec 1189 sender [ 4] 0.00-10.01 sec 1.02 GBytes 873 Mbits/sec receiver iperf Done. I get similar TX throughput on my Meson GXBB "MXQ Pro+" board when I disable the PHY's TX-delay and configure a 4ns TX-delay on the MAC. So changes to at least the RTL8211F PHY driver are needed to get it working properly in all situations. Changes since v2: - moved all .dts patches (3-7) to a separate series - removed the default 2ns TX delay when phy-mode RGMII is specified - (rebased against current net-next) Changes since v1: - renamed the devicetree property "amlogic,tx-delay" to "amlogic,tx-delay-ns", which makes the .dts easier to read as we can simply specify human-readable values instead of having "preprocessor defines and calculation in human brain". Thanks to Andrew Lunn for the suggestion! 
- improved documentation to indicate when the MAC TX-delay should be configured and how to use the PHY's TX-delay - changed the default TX-delay in the dwmac-meson8b driver from 2ns to 0ns when any of the rgmii-*id modes are used (the 2ns default value still applies for phy-mode "rgmii") - added patches to properly reset the PHY on Meson GXBB devices and to use a configuration similar to the one we use on Meson GXL devices (by passing a phy-handle to stmmac and defining the PHY in the mdio0 bus - patch 3-6) - add the "amlogic,tx-delay-ns" property to all boards which are using the RGMII PHY (patch 7) Martin Blumenstingl (2): net: dt-bindings: add RGMII TX delay configuration to meson8b-dwmac net: stmmac: dwmac-meson8b: make the RGMII TX delay configurable .../devicetree/bindings/net/meson-dwmac.txt | 14 ++ drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 21 +++-- 2 files changed, 29 insertions(+), 6 deletions(-) -- 2.10.2
Re: [PATCH 2/7] net: ethernet: ti: cpdma: fix desc re-queuing
On Fri, Dec 02, 2016 at 10:45:07AM -0600, Grygorii Strashko wrote: > > > On 12/02/2016 05:03 AM, Ivan Khoronzhuk wrote: > > On Thu, Dec 01, 2016 at 05:34:27PM -0600, Grygorii Strashko wrote: > >> The currently processing cpdma descriptor with EOQ flag set may > >> contain two values in Next Descriptor Pointer field: > >> - valid pointer: means CPDMA missed addition of new desc in queue; > > It shouldn't happen in normal circumstances, right? > > it might happen, because desc push compete with desc pop. > You can check stats values: > chan->stats.misqueued > chan->stats.requeue > under different types of net-loads. I've done this, of-course. By whole logic the misqueued counter has to cover all cases. But that's not true. > > TRM: > " > If the pNext pointer is initially NULL, and more packets need to be queued > for transmit, the software > application may alter this pointer to point to a newly appended descriptor. > The EMAC will use the new > pointer value and proceed to the next descriptor unless the pNext value has > already been read. In this > latter case, the transmitter will halt on the transmit channel in question, > and the software application may > restart it at that time. The software can detect this case by checking for an > end of queue (EOQ) condition > flag on the updated packet descriptor when it is returned by the EMAC. > " That's true. No issues in desc. In the code no any place to update next_desc except submit function. And this case is supposed to be caught here: For submit: cpdma_chan_submit() spin_lock_irqsave(&chan->lock); ... --->__cpdma_chan_submit() ... --> desc_write(prev, hw_next, desc_dma); // here next pointer is updated, it can be not in time ... 
--> mode = desc_read(prev, hw_mode); // pay attention, it's read after updating next pointer --> if ((mode & CPDMA_DESC_EOQ) && --> (chan->state == CPDMA_STATE_ACTIVE)) { // here checked if it was late update -> chan_write(chan, hdp, desc_dma); // here transmit is restarted, if needed For process it only caught the fact of late update, but it has to be caught in submit() already: __cpdma_chan_process() spin_lock_irqsave(&chan->lock); --> if (mode & CPDMA_DESC_EOQ) // here transmit is restarted, if needed -> chan_write(chan, hdp, desc_dma); // but w/o updating next pointer Seems there is no place where hw_next is updated w/o updating hdp :-| in case of late hw_next set. And that is strange. I know it happens, I've checked it before of-course. Then I thought, maybe there is some problem with write order, thus out of sync, nothing more. > > > > So, why it happens only for egress channels? And Does that mean > > there is some resynchronization between submit and process function, > > or this is h/w issue? > > no hw issues. this patch just removes one unnecessary I/O access No objections against patch. Anyway it's better then before. Just want to know the real reason why it happens, maybe there is smth else. > > > > >> - null: no more descriptors in queue. > >> In the later case, it's not required to write to HDP register, but now > >> CPDMA does it. > >> > >> Hence, add additional check for Next Descriptor Pointer != null in > >> cpdma_chan_process() function before writing in HDP register. 
> >> > >> Signed-off-by: Grygorii Strashko > >> --- > >> drivers/net/ethernet/ti/davinci_cpdma.c | 2 +- > >> 1 file changed, 1 insertion(+), 1 deletion(-) > >> > >> diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c > >> b/drivers/net/ethernet/ti/davinci_cpdma.c > >> index 0924014..379314f 100644 > >> --- a/drivers/net/ethernet/ti/davinci_cpdma.c > >> +++ b/drivers/net/ethernet/ti/davinci_cpdma.c > >> @@ -1152,7 +1152,7 @@ static int __cpdma_chan_process(struct cpdma_chan > >> *chan) > >>chan->count--; > >>chan->stats.good_dequeue++; > >> > >> - if (status & CPDMA_DESC_EOQ) { > >> + if ((status & CPDMA_DESC_EOQ) && chan->head) { > >>chan->stats.requeue++; > >>chan_write(chan, hdp, desc_phys(pool, chan->head)); > >>} > >> -- > >> 2.10.1 > >> > > -- > regards, > -grygorii
Re: [PATCH] iproute2: ss: escape all null bytes in abstract unix domain socket
On Fri, 2016-12-02 at 15:18 -0800, Stephen Hemminger wrote: > name[i] = '@'; > > > > ss.c: In function 'unix_show_sock': > > ss.c:3128:4: error: 'for' loop initial declarations are only allowed in C99 > > mode > > ss.c:3128:4: note: use option -std=c99 or -std=gnu99 to compile your code > > make[1]: *** [ss.o] Error 1 > > > > > > > > Thanks, fixed by patch from Simon Right, thanks !
Re: [PATCH net-next] liquidio: 'imply' ptp instead of 'select'
On Sat, 3 Dec 2016, Arnd Bergmann wrote: > ptp now depends on the optional POSIX_TIMERS setting and fails to build > if we select it without that: > > warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet > direct dependencies (NET && POSIX_TIMERS) > warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet > direct dependencies (NET && POSIX_TIMERS) > ERROR: "posix_clock_unregister" [drivers/ptp/ptp.ko] undefined! > ERROR: "posix_clock_register" [drivers/ptp/ptp.ko] undefined! > ERROR: "pps_unregister_source" [drivers/ptp/ptp.ko] undefined! > ERROR: "pps_event" [drivers/ptp/ptp.ko] undefined! > ERROR: "pps_register_source" [drivers/ptp/ptp.ko] undefined! > > It seems that two patches have collided here, the build failure > is a result of the combination. Changing the new option to 'imply' > as well fixes it. > > Fixes: 111fc64a237f ("liquidio CN23XX: VF registration") > Fixes: d1cbfd771ce8 ("ptp_clock: Allow for it to be optional") > Signed-off-by: Arnd Bergmann Acked-by: Nicolas Pitre > --- > drivers/net/ethernet/cavium/Kconfig | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/net/ethernet/cavium/Kconfig > b/drivers/net/ethernet/cavium/Kconfig > index bbc8bd16cb97..dcbce6cac63e 100644 > --- a/drivers/net/ethernet/cavium/Kconfig > +++ b/drivers/net/ethernet/cavium/Kconfig > @@ -77,7 +77,7 @@ config OCTEON_MGMT_ETHERNET > config LIQUIDIO_VF > tristate "Cavium LiquidIO VF support" > depends on 64BIT && PCI_MSI > - select PTP_1588_CLOCK > + imply PTP_1588_CLOCK > ---help--- > This driver supports Cavium LiquidIO Intelligent Server Adapter > based on CN23XX chips. > -- > 2.9.0 > >
Re: [PATCH] iproute2: ss: escape all null bytes in abstract unix domain socket
On Fri, 02 Dec 2016 10:59:56 -0800 Eric Dumazet wrote: > On Sat, 2016-11-12 at 10:17 +0300, Stephen Hemminger wrote: > > On Sat, 29 Oct 2016 22:20:19 +0300 > > Isaac Boukris wrote: > > > > > Abstract unix domain socket may embed null characters, > > > these should be translated to '@' when printed by ss the > > > same way the null prefix is currently being translated. > > > > > > Signed-off-by: Isaac Boukris > > > > Applied > > Probably not a good idea to have : > >for (int i = 0; i < len; i++) >if (name[i] == '\0') >name[i] = '@'; > > ss.c: In function 'unix_show_sock': > ss.c:3128:4: error: 'for' loop initial declarations are only allowed in C99 > mode > ss.c:3128:4: note: use option -std=c99 or -std=gnu99 to compile your code > make[1]: *** [ss.o] Error 1 > > > Thanks, fixed by patch from Simon
Re: iproute2 public git outdated?
On Thu, 1 Dec 2016 13:18:06 +0100 Phil Sutter wrote: > Hi, > > I am using iproute2's public git repo at this URL: > > git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git > > To my surprise, neither master nor net-next branches have received new > commits since end of October. Did the repo location change or was it > just not updated for a while? > > Thanks, Phil I was on the road, and moving between houses. Sorry for the extended delay.
Avoid deadlock situation due to use of xmit_lock
Hi, after stumbling over a potential deadlock situation in the altera driver (see http://marc.info/?l=linux-netdev&m=148054615230447&w=2), I checked all other ethernet drivers for the same issue and actually found it in 2 more, namely stmmac, and sxgbe. Please see the commit messages for a description of the problem. These 2 patches fix the affected drivers. Regards, Lino
[PATCH 1/2] net: ethernet: sxgbe: do not use xmit_lock in tx completion handler
The driver already uses its private lock for synchronization between the xmit function and the xmit completion handler, making the additional use of the xmit_lock unnecessary. Furthermore the driver does not set NETIF_F_LLTX resulting in xmit to be called with the xmit_lock held and then taking the private lock. On the other hand the xmit completion handler uses the reverse locking order, by first taking the private lock, and then the xmit_lock, which leads to the potential danger of a deadlock. Fix this issue by not taking the xmit_lock in the completion handler. By doing this also remove an unnecessary double check for a stopped tx queue. Signed-off-by: Lino Sanfilippo --- drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) Please note that this patch is only compile tested. diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c index 5dbe406..578cbec 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c @@ -782,14 +782,9 @@ static void sxgbe_tx_queue_clean(struct sxgbe_tx_queue *tqueue) /* wake up queue */ if (unlikely(netif_tx_queue_stopped(dev_txq) && sxgbe_tx_avail(tqueue, tx_rsize) > SXGBE_TX_THRESH(priv))) { - netif_tx_lock(priv->dev); - if (netif_tx_queue_stopped(dev_txq) && - sxgbe_tx_avail(tqueue, tx_rsize) > SXGBE_TX_THRESH(priv)) { - if (netif_msg_tx_done(priv)) - pr_debug("%s: restart transmit\n", __func__); - netif_tx_wake_queue(dev_txq); - } - netif_tx_unlock(priv->dev); + if (netif_msg_tx_done(priv)) + pr_debug("%s: restart transmit\n", __func__); + netif_tx_wake_queue(dev_txq); } spin_unlock(&tqueue->tx_lock); -- 1.9.1
[PATCH 2/2] net: ethernet: stmmac: do not use xmit_lock in tx completion handler
The driver already uses its private lock for synchronization between the xmit function and the xmit completion handler, making the additional use of the xmit_lock unnecessary. Furthermore the driver does not set NETIF_F_LLTX resulting in xmit to be called with the xmit_lock held and then taking the private lock. On the other hand the xmit completion handler uses the reverse locking order, by first taking the private lock, and then the xmit_lock, which leads to the potential danger of a deadlock. Fix this issue by not taking the xmit_lock in the completion handler. By doing this also remove an unnecessary double check for a stopped tx queue. Signed-off-by: Lino Sanfilippo --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) Please note that this patch is only compile tested. diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 48a4e84..8def423 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1380,14 +1380,9 @@ static void stmmac_tx_clean(struct stmmac_priv *priv) if (unlikely(netif_queue_stopped(priv->dev) && stmmac_tx_avail(priv) > STMMAC_TX_THRESH)) { - netif_tx_lock(priv->dev); - if (netif_queue_stopped(priv->dev) && - stmmac_tx_avail(priv) > STMMAC_TX_THRESH) { - netif_dbg(priv, tx_done, priv->dev, - "%s: restart transmit\n", __func__); - netif_wake_queue(priv->dev); - } - netif_tx_unlock(priv->dev); + netif_dbg(priv, tx_done, priv->dev, + "%s: restart transmit\n", __func__); + netif_wake_queue(priv->dev); } if ((priv->eee_enabled) && (!priv->tx_path_in_lpi_mode)) { -- 1.9.1
Re: [PATCH/RFC iproute2/net-next 1/3] tc: flower: update headers for TCA_FLOWER_KEY_ICMP*
On Fri, 2 Dec 2016 10:59:43 +0100 Simon Horman wrote: > These are proposed changes for net-next. > > Signed-off-by: Simon Horman > --- > include/linux/pkt_cls.h | 10 ++ > 1 file changed, 10 insertions(+) > > diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h > index a3d8a4f17d8e..fa435ea8ad21 100644 > --- a/include/linux/pkt_cls.h > +++ b/include/linux/pkt_cls.h > @@ -403,6 +403,16 @@ enum { > TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, /* be16 */ > TCA_FLOWER_KEY_ENC_UDP_DST_PORT,/* be16 */ > TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, /* be16 */ > + > + TCA_FLOWER_KEY_ICMPV4_CODE, /* u8 */ > + TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */ > + TCA_FLOWER_KEY_ICMPV4_TYPE, /* u8 */ > + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */ > + TCA_FLOWER_KEY_ICMPV6_CODE, /* u8 */ > + TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */ > + TCA_FLOWER_KEY_ICMPV6_TYPE, /* u8 */ > + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */ > + > __TCA_FLOWER_MAX, > }; > I picked this up by updating from kernel headers.
Re: [[PATCH iproute2/net-next v2] 2/4] tc: flower: document SCTP ip_proto
On Fri, 2 Dec 2016 09:45:19 +0100 Simon Horman wrote: > Add SCTP ip_proto to help text and man page. > > Signed-off-by: Simon Horman This doesn't apply cleanly to current net-next git. Probably some of the other man page changes caused the reject.
Re: [[PATCH iproute2/net-next v2] 1/4] tc: flower: remove references to eth_type in manpage
On Fri, 2 Dec 2016 09:45:18 +0100 Simon Horman wrote: > Remove references to eth_type and ether_type (spelling error) in > the tc flower manpage. > > Also correct formatting of boldface text with whitespace. > > Cc: Paul Blakey > Signed-off-by: Simon Horman Applied this one. Later ones still need rebase.
[PATCH net-next] liquidio: 'imply' ptp instead of 'select'
ptp now depends on the optional POSIX_TIMERS setting and fails to build if we select it without that: warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet direct dependencies (NET && POSIX_TIMERS) warning: (LIQUIDIO_VF && TI_CPTS) selects PTP_1588_CLOCK which has unmet direct dependencies (NET && POSIX_TIMERS) ERROR: "posix_clock_unregister" [drivers/ptp/ptp.ko] undefined! ERROR: "posix_clock_register" [drivers/ptp/ptp.ko] undefined! ERROR: "pps_unregister_source" [drivers/ptp/ptp.ko] undefined! ERROR: "pps_event" [drivers/ptp/ptp.ko] undefined! ERROR: "pps_register_source" [drivers/ptp/ptp.ko] undefined! It seems that two patches have collided here, the build failure is a result of the combination. Changing the new option to 'imply' as well fixes it. Fixes: 111fc64a237f ("liquidio CN23XX: VF registration") Fixes: d1cbfd771ce8 ("ptp_clock: Allow for it to be optional") Signed-off-by: Arnd Bergmann --- drivers/net/ethernet/cavium/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/Kconfig b/drivers/net/ethernet/cavium/Kconfig index bbc8bd16cb97..dcbce6cac63e 100644 --- a/drivers/net/ethernet/cavium/Kconfig +++ b/drivers/net/ethernet/cavium/Kconfig @@ -77,7 +77,7 @@ config OCTEON_MGMT_ETHERNET config LIQUIDIO_VF tristate "Cavium LiquidIO VF support" depends on 64BIT && PCI_MSI - select PTP_1588_CLOCK + imply PTP_1588_CLOCK ---help--- This driver supports Cavium LiquidIO Intelligent Server Adapter based on CN23XX chips. -- 2.9.0
[PATCH net-next] phy: add phy fixup unregister functions
From: Woojung Huh Add functions to unregister phy fixup for modules. phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask) Unregister phy fixup matches bus_id, phy_uid and phy_uid_mask from phy_fixup_list. Return 0 when find matched one and remove from the list. Return -ENODEV when fail to find it on the list. phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask) Unregister phy fixup from phy_fixup_list. Use it for fixup registered by phy_register_fixup_for_uid() Return 0 when find matched one and remove from the list. Return -ENODEV when fail to find it on the list. phy_unregister_fixup_for_id(const char *bus_id) Unregister phy fixup from phy_fixup_list. Use it for fixup registered by phy_register_fixup_for_id() Return 0 when find matched one and remove from the list. Return -ENODEV when fail to find it on the list. Signed-off-by: Woojung Huh --- Documentation/networking/phy.txt | 9 drivers/net/phy/phy_device.c | 47 include/linux/phy.h | 4 3 files changed, 60 insertions(+) diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt index e017d93..16f90d8 100644 --- a/Documentation/networking/phy.txt +++ b/Documentation/networking/phy.txt @@ -407,6 +407,15 @@ Board Fixups The stubs set one of the two matching criteria, and set the other one to match anything. + When phy_register_fixup() or *_for_uid()/*_for_id() is called at module, + unregister fixup and free allocate memory are required. + + Call one of following function before unloading module. 
+ + int phy_unregister_fixup(const char *phy_id, u32 phy_uid, u32 phy_uid_mask); + int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); + int phy_unregister_fixup_for_id(const char *phy_id); + Standards IEEE Standard 802.3: CSMA/CD Access Method and Physical Layer Specifications, Section Two: diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index aeaf1bc..32fa7c7 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -235,6 +235,53 @@ int phy_register_fixup_for_id(const char *bus_id, } EXPORT_SYMBOL(phy_register_fixup_for_id); +/** + * phy_unregister_fixup - remove a phy_fixup from the list + * @bus_id: A string matches fixup->bus_id (or PHY_ANY_ID) in phy_fixup_list + * @phy_uid: A phy id matches fixup->phy_id (or PHY_ANY_UID) in phy_fixup_list + * @phy_uid_mask: Applied to phy_uid and fixup->phy_uid before comparison + */ +int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask) +{ + struct list_head *pos, *n; + struct phy_fixup *fixup; + int ret; + + ret = -ENODEV; + + mutex_lock(&phy_fixup_lock); + list_for_each_safe(pos, n, &phy_fixup_list) { + fixup = list_entry(pos, struct phy_fixup, list); + + if ((!strcmp(fixup->bus_id, bus_id)) && + ((fixup->phy_uid & phy_uid_mask) == +(phy_uid & phy_uid_mask))) { + list_del(&fixup->list); + kfree(fixup); + ret = 0; + break; + } + } + mutex_unlock(&phy_fixup_lock); + + return ret; +} +EXPORT_SYMBOL(phy_unregister_fixup); + +/* Unregisters a fixup of any PHY with the UID in phy_uid */ +int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask) +{ + return phy_unregister_fixup(PHY_ANY_ID, phy_uid, phy_uid_mask); +} +EXPORT_SYMBOL(phy_unregister_fixup_for_uid); + +/* Unregisters a fixup of the PHY with id string bus_id */ +int phy_unregister_fixup_for_id(const char *bus_id) +{ + return phy_unregister_fixup(bus_id, PHY_ANY_UID, 0xffffffff); +} +EXPORT_SYMBOL(phy_unregister_fixup_for_id); + /* Returns 1 if fixup matches phydev in bus_id and phy_uid. 
* Fixups can be set to match any in one or more fields. */ diff --git a/include/linux/phy.h b/include/linux/phy.h index b53177f..745661d 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -859,6 +859,10 @@ int phy_register_fixup_for_id(const char *bus_id, int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask, int (*run)(struct phy_device *)); +int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask); +int phy_unregister_fixup_for_id(const char *bus_id); +int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); + int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data); -- 2.7.4
Re: [PATCH net-next v2 6/6] tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING
On Fri, Dec 2, 2016 at 2:30 PM, Paul Gortmaker wrote: > On Mon, Nov 28, 2016 at 2:07 AM, Yuchung Cheng wrote: >> From: Francis Yan >> >> This patch exports the sender chronograph stats via the socket >> SO_TIMESTAMPING channel. Currently we can instrument how long a >> particular application unit of data was queued in TCP by tracking >> SOF_TIMESTAMPING_TX_SOFTWARE and SOF_TIMESTAMPING_TX_SCHED. Having > > Seems a new linux-next failure leads back to here ; I did not run a > full bisect, since the variable seems confined to this commit: > > net/socket.c:701: error: 'SCM_TIMESTAMPING_OPT_STATS' undeclared > (first use in this function) > > http://kisskb.ellerman.id.au/kisskb/buildresult/12875981/ oops didn't patch avr32 arch. Does this fix work? > > Paul. > -- > >> these sender chronograph stats exported simultaneously along with >> these timestamps allow further breaking down the various sender >> limitation. For example, a video server can tell if a particular >> chunk of video on a connection takes a long time to deliver because >> TCP was experiencing small receive window. It is not possible to >> tell before this patch without packet traces. >> >> To prepare these stats, the user needs to set >> SOF_TIMESTAMPING_OPT_STATS and SOF_TIMESTAMPING_OPT_TSONLY flags >> while requesting other SOF_TIMESTAMPING TX timestamps. When the >> timestamps are available in the error queue, the stats are returned >> in a separate control message of type SCM_TIMESTAMPING_OPT_STATS, >> in a list of TLVs (struct nlattr) of types: TCP_NLA_BUSY_TIME, >> TCP_NLA_RWND_LIMITED, TCP_NLA_SNDBUF_LIMITED. Unit is microsecond. 
>> >> Signed-off-by: Francis Yan >> Signed-off-by: Yuchung Cheng >> Signed-off-by: Soheil Hassas Yeganeh >> --- >> ChangeLog since v1: >> - fix build break if CONFIG_INET is not defined >> >> Documentation/networking/timestamping.txt | 10 ++ >> arch/alpha/include/uapi/asm/socket.h | 2 ++ >> arch/frv/include/uapi/asm/socket.h| 2 ++ >> arch/ia64/include/uapi/asm/socket.h | 2 ++ >> arch/m32r/include/uapi/asm/socket.h | 2 ++ >> arch/mips/include/uapi/asm/socket.h | 2 ++ >> arch/mn10300/include/uapi/asm/socket.h| 2 ++ >> arch/parisc/include/uapi/asm/socket.h | 2 ++ >> arch/powerpc/include/uapi/asm/socket.h| 2 ++ >> arch/s390/include/uapi/asm/socket.h | 2 ++ >> arch/sparc/include/uapi/asm/socket.h | 2 ++ >> arch/xtensa/include/uapi/asm/socket.h | 2 ++ >> include/linux/tcp.h | 2 ++ >> include/uapi/asm-generic/socket.h | 2 ++ >> include/uapi/linux/net_tstamp.h | 3 ++- >> include/uapi/linux/tcp.h | 8 >> net/core/skbuff.c | 14 +++--- >> net/core/sock.c | 7 +++ >> net/ipv4/tcp.c| 20 >> net/socket.c | 7 ++- >> 20 files changed, 90 insertions(+), 5 deletions(-) >> >> diff --git a/Documentation/networking/timestamping.txt >> b/Documentation/networking/timestamping.txt >> index 671cccf..96f5069 100644 >> --- a/Documentation/networking/timestamping.txt >> +++ b/Documentation/networking/timestamping.txt >> @@ -182,6 +182,16 @@ SOF_TIMESTAMPING_OPT_TSONLY: >>the timestamp even if sysctl net.core.tstamp_allow_data is 0. >>This option disables SOF_TIMESTAMPING_OPT_CMSG. >> >> +SOF_TIMESTAMPING_OPT_STATS: >> + >> + Optional stats that are obtained along with the transmit timestamps. >> + It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the >> + transmit timestamp is available, the stats are available in a >> + separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a >> + list of TLVs (struct nlattr) of types. 
These stats allow the >> + application to associate various transport layer stats with >> + the transmit timestamps, such as how long a certain block of >> + data was limited by peer's receiver window. >> >> New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to >> disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate >> diff --git a/arch/alpha/include/uapi/asm/socket.h >> b/arch/alpha/include/uapi/asm/socket.h >> index 9e46d6e..afc901b 100644 >> --- a/arch/alpha/include/uapi/asm/socket.h >> +++ b/arch/alpha/include/uapi/asm/socket.h >> @@ -97,4 +97,6 @@ >> >> #define SO_CNX_ADVICE 53 >> >> +#define SCM_TIMESTAMPING_OPT_STATS 54 >> + >> #endif /* _UAPI_ASM_SOCKET_H */ >> diff --git a/arch/frv/include/uapi/asm/socket.h >> b/arch/frv/include/uapi/asm/socket.h >> index afbc98f0..81e0353 100644 >> --- a/arch/frv/include/uapi/asm/socket.h >> +++ b/arch/frv/include/uapi/asm/socket.h >> @@ -90,5 +90,7 @@ >> >> #define SO_CNX_ADVICE 53 >> >> +#define SCM_TIMESTAMPING_OPT_STATS 54 >> + >> #endif /* _ASM_SOCKET_H */ >> >> diff --git a/arch/ia64/include/uapi/asm/socket.h >> b/arch/ia64/include/uapi/asm/socket.h >> index 0
[PATCH 3/3] uapi: export nf_log.h
File is in uapi directory but not being copied on make install_headers Fixes commit 4ec9c8fbbc22 ("netfilter: nft_log: complete NFTA_LOG_FLAGS attr support"). Signed-off-by: Stephen Hemminger --- include/uapi/linux/netfilter/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild index cd26d7a..03f194a 100644 --- a/include/uapi/linux/netfilter/Kbuild +++ b/include/uapi/linux/netfilter/Kbuild @@ -5,6 +5,7 @@ header-y += nf_conntrack_ftp.h header-y += nf_conntrack_sctp.h header-y += nf_conntrack_tcp.h header-y += nf_conntrack_tuple_common.h +header-y += nf_log.h header-y += nf_tables.h header-y += nf_tables_compat.h header-y += nf_nat.h -- 2.10.2
[PATCH 2/3] uapi: export tc_skbmod.h
Fixes commit 735cffe5d800 ("net_sched: Introduce skbmod action") Not used by iproute2 but maybe in future. Signed-off-by: Stephen Hemminger --- include/uapi/linux/tc_act/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild index 9611c7b..721433e 100644 --- a/include/uapi/linux/tc_act/Kbuild +++ b/include/uapi/linux/tc_act/Kbuild @@ -12,3 +12,4 @@ header-y += tc_bpf.h header-y += tc_connmark.h header-y += tc_ife.h header-y += tc_tunnel_key.h +header-y += tc_sbkmod.h -- 2.10.2
[PATCH 1/3] uapi: export tc tunnel key file
Fixes commit 21609ae32aaf6c6fab0e ("net/sched: Introduce act_tunnel_key") The file is necessary for iproute2 headers but was not being copied by make install_headers Signed-off-by: Stephen Hemminger --- include/uapi/linux/tc_act/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild index e3969bd..9611c7b 100644 --- a/include/uapi/linux/tc_act/Kbuild +++ b/include/uapi/linux/tc_act/Kbuild @@ -11,3 +11,4 @@ header-y += tc_vlan.h header-y += tc_bpf.h header-y += tc_connmark.h header-y += tc_ife.h +header-y += tc_tunnel_key.h -- 2.10.2
[PATCH 0/3] UAPI export missing headers
Files not being exported by make install headers Stephen Hemminger (3): tc: export tunnel key file uapi: export tc_skbmod.h uapi: export nf_log.h include/uapi/linux/netfilter/Kbuild | 1 + include/uapi/linux/tc_act/Kbuild| 2 ++ 2 files changed, 3 insertions(+) -- 2.10.2
[PATCH v2 0/3] uapi: add kbuild for some files
Some files which are in uapi but not being copied by make headers_install Stephen Hemminger (3): tc: export tunnel key file uapi: export tc_skbmod.h uapi: export nf_log.h include/uapi/linux/netfilter/Kbuild | 1 + include/uapi/linux/tc_act/Kbuild| 2 ++ 2 files changed, 3 insertions(+) V2 - typo in s/sbkmod/skbmod/ -- 2.10.2
[PATCH 2/3] uapi: export tc_skbmod.h
Fixes commit 735cffe5d800 ("net_sched: Introduce skbmod action") Not used by iproute2 but maybe in future. Signed-off-by: Stephen Hemminger --- include/uapi/linux/tc_act/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild index 9611c7b..e3db740 100644 --- a/include/uapi/linux/tc_act/Kbuild +++ b/include/uapi/linux/tc_act/Kbuild @@ -12,3 +12,4 @@ header-y += tc_bpf.h header-y += tc_connmark.h header-y += tc_ife.h header-y += tc_tunnel_key.h +header-y += tc_skbmod.h -- 2.10.2
[PATCH 3/3] uapi: export nf_log.h
File is in uapi directory but not being copied on make install_headers Fixes commit 4ec9c8fbbc22 ("netfilter: nft_log: complete NFTA_LOG_FLAGS attr support"). Signed-off-by: Stephen Hemminger --- include/uapi/linux/netfilter/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild index cd26d7a..03f194a 100644 --- a/include/uapi/linux/netfilter/Kbuild +++ b/include/uapi/linux/netfilter/Kbuild @@ -5,6 +5,7 @@ header-y += nf_conntrack_ftp.h header-y += nf_conntrack_sctp.h header-y += nf_conntrack_tcp.h header-y += nf_conntrack_tuple_common.h +header-y += nf_log.h header-y += nf_tables.h header-y += nf_tables_compat.h header-y += nf_nat.h -- 2.10.2
[PATCH 1/3] uapi: export tc tunnel key file
Fixes commit 21609ae32aaf6c6fab0e ("net/sched: Introduce act_tunnel_key") The file is necessary for iproute2 headers but was not being copied by make install_headers Signed-off-by: Stephen Hemminger --- include/uapi/linux/tc_act/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild index e3969bd..9611c7b 100644 --- a/include/uapi/linux/tc_act/Kbuild +++ b/include/uapi/linux/tc_act/Kbuild @@ -11,3 +11,4 @@ header-y += tc_vlan.h header-y += tc_bpf.h header-y += tc_connmark.h header-y += tc_ife.h +header-y += tc_tunnel_key.h -- 2.10.2
[PATCH net v3] tcp: warn on bogus MSS and try to amend it
There have been some reports lately about TCP connection stalls caused by NIC drivers that aren't setting gso_size on aggregated packets on rx path. This causes TCP to assume that the MSS is actually the size of the aggregated packet, which is invalid. Although the proper fix is to be done at each driver, it's often hard and cumbersome for one to debug, come to such root cause and report/fix it. This patch amends this situation in two ways. First, it adds a warning on when this situation occurs, so it gives a hint to those trying to debug this. It also limits the maximum probed MSS to the advertised MSS, as it should never be any higher than that. The result is that the connection may not have the best performance ever but it shouldn't stall, and the admin will have a hint on what to look for. Tested with virtio by forcing gso_size to 0. v2: updated msg per David suggestion v3: use skb_iif to find the interface and also log its name, per Eric Dumazet suggestion. As the skb may be backlogged and the interface gone by then, we need to check if the number still has a meaning. Cc: Jonathan Maxwell Signed-off-by: Marcelo Ricardo Leitner --- net/ipv4/tcp_input.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a27b9c0e27c08b4e4aeaff3d0bfdf3ae561ba4d8..042a8a895e97d04afbdc377830537e8fd3b15d1e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -144,7 +144,21 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) */ len = skb_shinfo(skb)->gso_size ? 
: skb->len; if (len >= icsk->icsk_ack.rcv_mss) { - icsk->icsk_ack.rcv_mss = len; + static bool __once __read_mostly; + + icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, + tcp_sk(sk)->advmss); + if (icsk->icsk_ack.rcv_mss != len && !__once) { + struct net_device *dev; + + __once = true; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); + pr_warn_once("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", +dev ? dev->name : "Unknown driver"); + rcu_read_unlock(); + } } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. -- 2.9.3
Re: [PATCH net v2] tcp: warn on bogus MSS and try to amend it
On Fri, Dec 02, 2016 at 06:45:24AM -0800, Eric Dumazet wrote: > On Fri, 2016-12-02 at 08:55 -0200, Marcelo Ricardo Leitner wrote: > > There have been some reports lately about TCP connection stalls caused > > by NIC drivers that aren't setting gso_size on aggregated packets on rx > > path. This causes TCP to assume that the MSS is actually the size of the > > aggregated packet, which is invalid. > > > > Although the proper fix is to be done at each driver, it's often hard > > and cumbersome for one to debug, come to such root cause and report/fix > > it. > > > > This patch amends this situation in two ways. First, it adds a warning > > on when this situation occurs, so it gives a hint to those trying to > > debug this. It also limit the maximum probed MSS to the adverised MSS, > > as it should never be any higher than that. > > > > The result is that the connection may not have the best performance ever > > but it shouldn't stall, and the admin will have a hint on what to look > > for. > > > > Tested with virtio by forcing gso_size to 0. > > > > Cc: Jonathan Maxwell > > Signed-off-by: Marcelo Ricardo Leitner > > --- > > v2: Updated msg as suggested by David. > > > > net/ipv4/tcp_input.c | 5 - > > 1 file changed, 4 insertions(+), 1 deletion(-) > > > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > > index > > a27b9c0e27c08b4e4aeaff3d0bfdf3ae561ba4d8..fd619eb93749b6de56a41669248b337c051d9fe2 > > 100644 > > --- a/net/ipv4/tcp_input.c > > +++ b/net/ipv4/tcp_input.c > > @@ -144,7 +144,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const > > struct sk_buff *skb) > > */ > > len = skb_shinfo(skb)->gso_size ? 
: skb->len; > > if (len >= icsk->icsk_ack.rcv_mss) { > > - icsk->icsk_ack.rcv_mss = len; > > + icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, > > + tcp_sk(sk)->advmss); > > + if (icsk->icsk_ack.rcv_mss != len) > > + pr_warn_once("Driver has suspect GRO implementation, > > TCP performance may be compromised.\n"); > > } else { > > /* Otherwise, we make more careful check taking into account, > > * that SACKs block is variable. > > > skb->dev is indeed NULL, but it might be worth getting back the device > using skb->skb_iif maybe ? > Yes, then it's possible. But I have to add an extra check because it involves a search (iif -> net_device) and I can't wrap that inside pr_warn_once(). I hope it doesn't get too cluttered then. Posting v3 in a few.. Thanks
Re: [PATCH net-next v2 6/6] tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING
On Mon, Nov 28, 2016 at 2:07 AM, Yuchung Cheng wrote: > From: Francis Yan > > This patch exports the sender chronograph stats via the socket > SO_TIMESTAMPING channel. Currently we can instrument how long a > particular application unit of data was queued in TCP by tracking > SOF_TIMESTAMPING_TX_SOFTWARE and SOF_TIMESTAMPING_TX_SCHED. Having Seems a new linux-next failure leads back to here ; I did not run a full bisect, since the variable seems confined to this commit: net/socket.c:701: error: 'SCM_TIMESTAMPING_OPT_STATS' undeclared (first use in this function) http://kisskb.ellerman.id.au/kisskb/buildresult/12875981/ Paul. -- > these sender chronograph stats exported simultaneously along with > these timestamps allow further breaking down the various sender > limitation. For example, a video server can tell if a particular > chunk of video on a connection takes a long time to deliver because > TCP was experiencing small receive window. It is not possible to > tell before this patch without packet traces. > > To prepare these stats, the user needs to set > SOF_TIMESTAMPING_OPT_STATS and SOF_TIMESTAMPING_OPT_TSONLY flags > while requesting other SOF_TIMESTAMPING TX timestamps. When the > timestamps are available in the error queue, the stats are returned > in a separate control message of type SCM_TIMESTAMPING_OPT_STATS, > in a list of TLVs (struct nlattr) of types: TCP_NLA_BUSY_TIME, > TCP_NLA_RWND_LIMITED, TCP_NLA_SNDBUF_LIMITED. Unit is microsecond. 
> > Signed-off-by: Francis Yan > Signed-off-by: Yuchung Cheng > Signed-off-by: Soheil Hassas Yeganeh > --- > ChangeLog since v1: > - fix build break if CONFIG_INET is not defined > > Documentation/networking/timestamping.txt | 10 ++ > arch/alpha/include/uapi/asm/socket.h | 2 ++ > arch/frv/include/uapi/asm/socket.h| 2 ++ > arch/ia64/include/uapi/asm/socket.h | 2 ++ > arch/m32r/include/uapi/asm/socket.h | 2 ++ > arch/mips/include/uapi/asm/socket.h | 2 ++ > arch/mn10300/include/uapi/asm/socket.h| 2 ++ > arch/parisc/include/uapi/asm/socket.h | 2 ++ > arch/powerpc/include/uapi/asm/socket.h| 2 ++ > arch/s390/include/uapi/asm/socket.h | 2 ++ > arch/sparc/include/uapi/asm/socket.h | 2 ++ > arch/xtensa/include/uapi/asm/socket.h | 2 ++ > include/linux/tcp.h | 2 ++ > include/uapi/asm-generic/socket.h | 2 ++ > include/uapi/linux/net_tstamp.h | 3 ++- > include/uapi/linux/tcp.h | 8 > net/core/skbuff.c | 14 +++--- > net/core/sock.c | 7 +++ > net/ipv4/tcp.c| 20 > net/socket.c | 7 ++- > 20 files changed, 90 insertions(+), 5 deletions(-) > > diff --git a/Documentation/networking/timestamping.txt > b/Documentation/networking/timestamping.txt > index 671cccf..96f5069 100644 > --- a/Documentation/networking/timestamping.txt > +++ b/Documentation/networking/timestamping.txt > @@ -182,6 +182,16 @@ SOF_TIMESTAMPING_OPT_TSONLY: >the timestamp even if sysctl net.core.tstamp_allow_data is 0. >This option disables SOF_TIMESTAMPING_OPT_CMSG. > > +SOF_TIMESTAMPING_OPT_STATS: > + > + Optional stats that are obtained along with the transmit timestamps. > + It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the > + transmit timestamp is available, the stats are available in a > + separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a > + list of TLVs (struct nlattr) of types. These stats allow the > + application to associate various transport layer stats with > + the transmit timestamps, such as how long a certain block of > + data was limited by peer's receiver window. 
> > New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to > disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate > diff --git a/arch/alpha/include/uapi/asm/socket.h > b/arch/alpha/include/uapi/asm/socket.h > index 9e46d6e..afc901b 100644 > --- a/arch/alpha/include/uapi/asm/socket.h > +++ b/arch/alpha/include/uapi/asm/socket.h > @@ -97,4 +97,6 @@ > > #define SO_CNX_ADVICE 53 > > +#define SCM_TIMESTAMPING_OPT_STATS 54 > + > #endif /* _UAPI_ASM_SOCKET_H */ > diff --git a/arch/frv/include/uapi/asm/socket.h > b/arch/frv/include/uapi/asm/socket.h > index afbc98f0..81e0353 100644 > --- a/arch/frv/include/uapi/asm/socket.h > +++ b/arch/frv/include/uapi/asm/socket.h > @@ -90,5 +90,7 @@ > > #define SO_CNX_ADVICE 53 > > +#define SCM_TIMESTAMPING_OPT_STATS 54 > + > #endif /* _ASM_SOCKET_H */ > > diff --git a/arch/ia64/include/uapi/asm/socket.h > b/arch/ia64/include/uapi/asm/socket.h > index 0018fad..57feb0c 100644 > --- a/arch/ia64/include/uapi/asm/socket.h > +++ b/arch/ia64/include/uapi/asm/socket.h > @@ -99,4 +99,6 @@ > > #define SO_CNX_ADVICE 53 > > +#define SCM_TIMESTAMPING_OPT_STATS 54 > + > #en
Re: [PATCH iproute2/net-next] ss: initialise variables outside of for loop
On Fri, 2 Dec 2016 12:56:05 +0100 Simon Horman wrote: > Initialise for loops outside of for loops. GCC flags this as being > out of spec unless C99 or C11 mode is used. > > With this change the entire tree appears to compile cleanly with -Wall. > > $ gcc --version > gcc (Debian 4.9.2-10) 4.9.2 > ... > $ make > ... > ss.c: In function ‘unix_show_sock’: > ss.c:3128:4: error: ‘for’ loop initial declarations are only allowed in C99 > or C11 mode > ... > > Signed-off-by: Simon Horman Applied. Note, I used to have -Wall in Makefile but old GCC were broken and would give aliasing warnings.
Re: [PATCH iproute2 V5 0/3] tc: Support for ip tunnel metadata set/unset/classify
On Fri, 2 Dec 2016 13:25:12 +0200 Amir Vadai wrote: > Hi, > > This short series adds support for matching and setting metadata for ip tunnel > shared device using the TC system, introduced in kernel 4.9 [1]. > > Applied and tested on top of commit b6c7fc61faab ("ss: print new tcp_info > fields: busy, rwnd-limited, sndbuf-limited times") > > > Example usage: > > $ tc filter add dev vxlan0 protocol ip parent ffff: \ > flower \ > enc_src_ip 11.11.0.2 \ > enc_dst_ip 11.11.0.1 \ > enc_key_id 11 \ > dst_ip 11.11.11.1 \ > action mirred egress redirect dev vnet0 > > $ tc filter add dev net0 protocol ip parent ffff: \ > flower \ > ip_proto 1 \ > dst_ip 11.11.11.2 \ > action tunnel_key set \ > src_ip 11.11.0.1 \ > dst_ip 11.11.0.2 \ > id 11 \ > action mirred egress redirect dev vxlan0 > > [1] - d1ba24feb466 ("Merge branch 'act_tunnel_key'") > > Thanks, > Amir > > Changes from V4: > - Fix rebase conflicts for net-next > > Changes from V3: > - Fix bad wording in the man page about the use of the 'unset' operation > > Changes from V2: > - Use const where needed > - Don't lose return value > - Introduce rta_getattr_be16() and rta_getattr_be32() > > Changes from V1: > - Updated Patch 2/2 ("tc/act_tunnel: Introduce ip tunnel action") commit log > and the man page tc-tunnel_key to reflect the fact that 'unset' > operation is > not mandatory. > And describe when it might be needed. 
> - Rename the 'release' operation to 'unset' > > Amir Vadai (3): > libnetlink: Introduce rta_getattr_be*() > tc/cls_flower: Classify packet in ip tunnels > tc/act_tunnel: Introduce ip tunnel action > > Amir Vadai (3): > libnetlink: Introduce rta_getattr_be*() > tc/cls_flower: Classify packet in ip tunnels > tc/act_tunnel: Introduce ip tunnel action > > bridge/fdb.c | 4 +- > include/libnetlink.h | 9 ++ > include/linux/tc_act/tc_tunnel_key.h | 42 ++ > ip/iplink_geneve.c | 2 +- > ip/iplink_vxlan.c| 2 +- > man/man8/tc-flower.8 | 17 ++- > man/man8/tc-tunnel_key.8 | 112 +++ > tc/Makefile | 1 + > tc/f_flower.c| 84 +++- > tc/m_tunnel_key.c| 258 > +++ > 10 files changed, 522 insertions(+), 9 deletions(-) > create mode 100644 include/linux/tc_act/tc_tunnel_key.h > create mode 100644 man/man8/tc-tunnel_key.8 > create mode 100644 tc/m_tunnel_key.c > Series applied
Re: [PATCH net-next 3/4] tcp: tsq: add shortcut in tcp_tasklet_func()
On Fri, 2016-12-02 at 10:25 -0800, Eric Dumazet wrote: > Under high stress, I've seen tcp_tasklet_func() consuming > ~700 usec, handling ~150 tcp sockets. > > By setting TCP_TSQ_DEFERRED in tcp_wfree(), we give a chance > for other cpus/threads entering tcp_write_xmit() to grab it, > allowing tcp_tasklet_func() to skip sockets that already did > an xmit cycle. > > Signed-off-by: Eric Dumazet ... > @@ -884,7 +884,7 @@ void tcp_wfree(struct sk_buff *skb) > if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) > goto out; > > - nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED; > + nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | > TCP_TSQ_DEFERRED; Typo here... Should be : nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED; > nval = cmpxchg(&tp->tsq_flags, oval, nval); > if (nval != oval) > continue;
Re: [iproute PATCH v2 00/18] ss: Minor code review
On Fri, 2 Dec 2016 11:39:44 +0100 Phil Sutter wrote: > This is a series of misc changes to ss code which happened as fall-out > when working on a unified output formatter (still unfinished). > > Changes since v1: > - Rebased onto current upstream, resolved conflicts in patch 4 generated > by previously added SCTP socket support. > > Phil Sutter (18): > ss: Mark fall through in arg parsing switch() > ss: Drop empty lines in UDP output > ss: Add missing tab when printing UNIX details > ss: Use sockstat->type in all socket types > ss: introduce proc_ctx_print() > ss: Drop list traversal from unix_stats_print() > ss: Eliminate unix_use_proc() > ss: Turn generic_proc_open() wrappers into macros > ss: Make tmr_name local to tcp_timer_print() > ss: Make user_ent_hash_build_init local to user_ent_hash_build() > ss: Make some variables function-local > ss: Make slabstat_ids local to get_slabstat() > ss: Get rid of useless goto in handle_follow_request() > ss: Get rid of single-fielded struct snmpstat > ss: Make unix_state_map local to unix_show() > ss: Make sstate_name local to sock_state_print() > ss: Make sstate_namel local to scan_state() > ss: unix_show: No need to initialize members of calloc'ed structs > > misc/ss.c | 532 > ++ > 1 file changed, 224 insertions(+), 308 deletions(-) > Applied, thanks
Re: [PATCH iproute2 1/1] tc: updated man page to reflect handle-id use in filter GET command.
On Thu, 1 Dec 2016 15:20:44 -0500 Roman Mashak wrote: > Signed-off-by: Roman Mashak > --- > man/man8/tc.8 | 6 -- > 1 file changed, 4 insertions(+), 2 deletions(-) > > diff --git a/man/man8/tc.8 b/man/man8/tc.8 > index 8a47a2b..d957ffa 100644 > --- a/man/man8/tc.8 > +++ b/man/man8/tc.8 > @@ -32,7 +32,9 @@ class-id ] qdisc > DEV > .B [ parent > qdisc-id > -.B | root ] protocol > +.B | root ] [ handle > +handle-id ] > +.B protocol > protocol > .B prio > priority filtertype > @@ -577,7 +579,7 @@ it is created. > > .TP > get > -Displays a single filter given the interface, parent ID, priority, protocol > and handle ID. > +Displays a single filter given the interface, qdisc-id, priority, protocol > and handle-id. > > .TP > show The proper syntax for man page usage section is to put keywords in bold and any value that is variable in italic. I know this whole man page doesn't do this correctly. But that doesn't mean that new additions should continue with the mistake. Please revise and resubmit. Extra bonus points for fixing the other bits.
Re: [PATCHv2 net-next 4/4] net: dsa: mv88e6xxx: Refactor CPU and DSA port setup
Hi Andrew, Andrew Lunn writes: >> The port's EgressMode, FrameMode and EtherType are really tied together >> to compose the mode of the port. > > Setting the EtherType is somewhat separate. It is only needed on ports > using EDSA. And that can only happen on a CPU port. Humm, actually, i > set it when i should not. But putting this in a wrapper actually hides > this. Wrong. The datasheet says: > This Ether Type is used for many features depending upon the mode > of the port (as defined by the port’s EgressMode and FrameMode > bits – in Port Control, port offset 0x04). It says that in Normal Network mode, this register can be used to trap, mirror, etc. Also used in Provider and EDSA modes. That is why it would be better to wrap them together to ensure correct values when configuring a port's mode. > >> Could you add an helper in chip.c like: >> >> static int mv88e6xxx_set_port_mode(struct mv88e6xxx_chip *chip, int port, >>enum mv88e6xxx_frame_mode frame_mode, >>u16 egress_mode, bool egress_unknown, >>u16 ethertype) >> { >> int err; >> >> if (chip->info->ops->port_set_frame_mode) { >> err = chip->info->ops->port_set_frame_mode(chip, port, >> frame_mode); >> if (err) >> return err; >> } > > Ignoring that it is not implemented here is wrong. It must be > implemented, or the device is not going to work. It is a question of, > do we want an oops, or return an error code. Since that is done at setup time, returning an error is enough IMO to inform the DSA layer that something went wrong. Thanks, Vivien
[PATCH net-next] tools: hv: Enable network manager for bonding scripts on RHEL
From: Haiyang Zhang We found network manager is necessary on RHEL to make the synthetic NIC, VF NIC bonding operations handled automatically. So, enabling network manager here. Signed-off-by: Haiyang Zhang Reviewed-by: K. Y. Srinivasan --- tools/hv/bondvf.sh |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/hv/bondvf.sh b/tools/hv/bondvf.sh index 8e96023..4aa5369 100755 --- a/tools/hv/bondvf.sh +++ b/tools/hv/bondvf.sh @@ -74,8 +74,8 @@ function create_eth_cfg_redhat { echo DEVICE=$1 >>$fn echo TYPE=Ethernet >>$fn echo BOOTPROTO=none >>$fn + echo UUID=`uuidgen` >>$fn echo ONBOOT=yes >>$fn - echo NM_CONTROLLED=no >>$fn echo PEERDNS=yes >>$fn echo IPV6INIT=yes >>$fn echo MASTER=$2 >>$fn @@ -93,8 +93,8 @@ function create_bond_cfg_redhat { echo DEVICE=$1 >>$fn echo TYPE=Bond >>$fn echo BOOTPROTO=dhcp >>$fn + echo UUID=`uuidgen` >>$fn echo ONBOOT=yes >>$fn - echo NM_CONTROLLED=no >>$fn echo PEERDNS=yes >>$fn echo IPV6INIT=yes >>$fn echo BONDING_MASTER=yes >>$fn -- 1.7.4.1
[PATCH net-next v5] ipv6 addrconf: Implemented enhanced DAD (RFC7527)
Implemented RFC7527 Enhanced DAD. IPv6 duplicate address detection can fail if there is some temporary loopback of Ethernet frames. RFC7527 solves this by including a random nonce in the NS messages used for DAD, and if an NS is received with the same nonce it is assumed to be a looped back DAD probe and is ignored. RFC7527 is enabled by default. Can be disabled by setting both of conf/{all,interface}/enhanced_dad to zero. Signed-off-by: Erik Nordmark Signed-off-by: Bob Gilligan Reviewed-by: Hannes Frederic Sowa --- v2: renamed sysctl and made it default to true, plus minor code review fixes v3: respun with later net-next; fixed whitespace issues v4: fixed kbuild test robot for route.c; added Reviewed-by v5: using %pM for printk of nonce Documentation/networking/ip-sysctl.txt | 9 + include/linux/ipv6.h | 1 + include/net/if_inet6.h | 1 + include/net/ndisc.h| 5 - include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c| 22 +- net/ipv6/ndisc.c | 29 ++--- net/ipv6/route.c | 2 +- 8 files changed, 64 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 5af48dd..d9ef566 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1729,6 +1729,15 @@ drop_unsolicited_na - BOOLEAN By default this is turned off. +enhanced_dad - BOOLEAN + Include a nonce option in the IPv6 neighbor solicitation messages used for + duplicate address detection per RFC7527. A received DAD NS will only signal + a duplicate address if the nonce is different. This avoids any false + detection of duplicates due to loopback of the NS messages that we send. + The nonce option will be sent on an interface unless both of + conf/{all,interface}/enhanced_dad are set to FALSE. + Default: TRUE + icmp/*: ratelimit - INTEGER Limit the maximal rates for sending ICMPv6 packets. 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 3f95233..671d014 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -68,6 +68,7 @@ struct ipv6_devconf { #ifdef CONFIG_IPV6_SEG6_HMAC __s32 seg6_require_hmac; #endif + __u32 enhanced_dad; struct ctl_table_header *sysctl_header; }; diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index b0576cb..0fa4c32 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -55,6 +55,7 @@ struct inet6_ifaddr { __u8stable_privacy_retry; __u16 scope; + __u64 dad_nonce; unsigned long cstamp; /* created timestamp */ unsigned long tstamp; /* updated timestamp */ diff --git a/include/net/ndisc.h b/include/net/ndisc.h index be1fe228..d562a2f 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -31,6 +31,7 @@ enum { ND_OPT_PREFIX_INFO = 3, /* RFC2461 */ ND_OPT_REDIRECT_HDR = 4,/* RFC2461 */ ND_OPT_MTU = 5, /* RFC2461 */ + ND_OPT_NONCE = 14, /* RFC7527 */ __ND_OPT_ARRAY_MAX, ND_OPT_ROUTE_INFO = 24, /* RFC4191 */ ND_OPT_RDNSS = 25, /* RFC5006 */ @@ -121,6 +122,7 @@ struct ndisc_options { #define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] #define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] #define nd_opts_mtund_opt_array[ND_OPT_MTU] +#define nd_opts_nonce nd_opt_array[ND_OPT_NONCE] #define nd_802154_opts_src_lladdr nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR] #define nd_802154_opts_tgt_lladdr nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR] @@ -398,7 +400,8 @@ static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, cons int ndisc_rcv(struct sk_buff *skb); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr); + const struct in6_addr *daddr, const struct in6_addr *saddr, + u64 nonce); void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 53561be..eaf65dc 100644 
--- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -181,6 +181,7 @@ enum { DEVCONF_RTR_SOLICIT_MAX_INTERVAL, DEVCONF_SEG6_ENABLED, DEVCONF_SEG6_REQUIRE_HMAC, + DEVCONF_ENHANCED_DAD, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4c387dc..c1e124b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2
Re: [PATCHv2 net-next 2/4] net: dsa: mv88e6xxx: Monitor and Management tables
Hi Andrew, Andrew Lunn writes: > On Fri, Dec 02, 2016 at 02:32:39PM -0500, Vivien Didelot wrote: >> Hi Andrew, >> >> Andrew Lunn writes: >> >> > @@ -3184,6 +3186,8 @@ static const struct mv88e6xxx_ops mv88e6085_ops = { >> >.stats_get_sset_count = mv88e6095_stats_get_sset_count, >> >.stats_get_strings = mv88e6095_stats_get_strings, >> >.stats_get_stats = mv88e6095_stats_get_stats, >> > + .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, >> > + .g1_set_egress_port = mv88e6095_g1_set_egress_port, >> > }; >> >> I like the implementation in this version better. But please explain me >> why you are prefixing these operations with g1_? > > The prefix gives some basic grouping. port_ indicates it operates on a > port, and is likely to be found in port.c. stats_ indicates it > operates on statistics, ppu that is operates on the phy polling unit. Yes, port_* operations operate on ports. But the port.c file is there to implement the function of "Port Registers". "Port" can be confusing, but it refers to the SMI internal device at address 0xsomething. "port_", "ppu_", "stats_", in the mv88e6xxx_ops structure just give implicit namespaces for the **features**, not their location! > We are going to have some things which don't fall into a simple > category, like these two. But it would however be nice to group them, > so i picked which register bank they are in. These operations are > always in g1. It is a useful hint as to where to find the different > variants. Absolutely not! .set_egress_port = mv88e6095_g1_set_egress_port, ^ That is the useful hint! At the higher level of chip.c, we don't care about where is implemented the switch MAC setter. We just have to call the correctly defined .set_switch_mac routine. However if you do care to know, its _ops.set_switch_mac pointer will tell you (_g1 vs _g2 prefix). >> But let's imagine we can set the CPU port in some Global 2 registers. 
>> You are going to wrap this in chip.c with something like: >> >> int mv88e6xxx_set_cpu_port(struct mv88e6xxx_chip *chip, int port) >> { >> if (chip->info->ops->g2_set_cpu_port) >> return chip->info->ops->g2_set_cpu_port(chip, port); >> else if (chip->info->ops->g1_set_cpu_port) >> return chip->info->ops->g1_set_cpu_port(chip, port); >> else >> return -EOPNOTSUPP; >> } > > I answered in one of my other emails. Frames with reserved MAC > addresses can be forwarded to the CPU. For most devices, this is a g2 > operation. However, for 6390, it is a g1. In that case, my code does > not use a prefix. Not having a prefix, when all the others do, also > gives you information. It means the ops are spread around and you need > to make a bigger effort to go find them. Again, absolutely not. This is your interpretation of having a prefix or not. A chip has only one way to access a feature, not two. Since you seem to be focused on the Rsvd2CPU feature, here's an example with it: What's the point of writing this: /* Consider the given MAC as MGMT */ int mv88e6xxx_reserve_mac(struct mv88e6xxx_chip *chip, u8 *addr) { if (mac_is_0x(addr)) { if (chip->info->ops->g1_set_rsvd2cpu0) return chip->info->ops->g1_set_rsvd2cpu0(...); else if (chip->info->ops->g2_set_rsvd2cpu0) return chip->info->ops->g2_set_rsvd2cpu0(...); } else if (mac_is_2x(addr)) { if (chip->info->ops->g1_set_rsvd2cpu2) return chip->info->ops->g1_set_rsvd2cpu2(...); else if (chip->info->ops->g2_set_rsvd2cpu2) return chip->info->ops->g2_set_rsvd2cpu2(...); } return mv88e6xxx_atu_load(chip, addr, MGMT); } Compared to this: /* Consider the given MAC as MGMT */ int mv88e6xxx_reserve_mac(struct mv88e6xxx_chip *chip, u8 *addr) { if (mac_is_0x(addr)) { if (chip->info->ops->set_rsvd2cpu0) return chip->info->ops->set_rsvd2cpu0(...); } else if (mac_is_2x(addr)) { if (chip->info->ops->set_rsvd2cpu2) return chip->info->ops->set_rsvd2cpu2(...); } return mv88e6xxx_atu_load(chip, addr, MGMT); } Your higher level API (chip.c) doesn't 
need to know where is implemented a given feature. It just needs to know if it supports it or not. Thanks, Vivien
[PATCH] adm80211: add checks for dma mapping errors
The driver does not check whether mapping DMA memory succeeds. The patch adds the checks and failure handling. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov --- drivers/net/wireless/admtek/adm8211.c | 24 ++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/admtek/adm8211.c b/drivers/net/wireless/admtek/adm8211.c index 70ecd82d674d..2b4a3eb38dfa 100644 --- a/drivers/net/wireless/admtek/adm8211.c +++ b/drivers/net/wireless/admtek/adm8211.c @@ -413,6 +413,13 @@ static void adm8211_interrupt_rci(struct ieee80211_hw *dev) skb_tail_pointer(newskb), RX_PKT_SIZE, PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(priv->pdev, + priv->rx_buffers[entry].mapping)) { + priv->rx_buffers[entry].skb = NULL; + dev_kfree_skb(newskb); + skb = NULL; + /* TODO: update rx dropped stats */ + } } else { skb = NULL; /* TODO: update rx dropped stats */ @@ -1450,6 +1457,12 @@ static int adm8211_init_rings(struct ieee80211_hw *dev) skb_tail_pointer(rx_info->skb), RX_PKT_SIZE, PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(priv->pdev, rx_info->mapping)) { + dev_kfree_skb(rx_info->skb); + rx_info->skb = NULL; + break; + } + desc->buffer1 = cpu_to_le32(rx_info->mapping); desc->status = cpu_to_le32(RDES0_STATUS_OWN | RDES0_STATUS_SQL); } @@ -1613,7 +1626,7 @@ static void adm8211_calc_durations(int *dur, int *plcp, size_t payload_len, int } /* Transmit skb w/adm8211_tx_hdr (802.11 header created by hardware) */ -static void adm8211_tx_raw(struct ieee80211_hw *dev, struct sk_buff *skb, +static int adm8211_tx_raw(struct ieee80211_hw *dev, struct sk_buff *skb, u16 plcp_signal, size_t hdrlen) { @@ -1625,6 +1638,8 @@ static void adm8211_tx_raw(struct ieee80211_hw *dev, struct sk_buff *skb, mapping = pci_map_single(priv->pdev, skb->data, skb->len, PCI_DMA_TODEVICE); + if (pci_dma_mapping_error(priv->pdev, mapping)) + return -ENOMEM; spin_lock_irqsave(&priv->lock, flags); @@ -1657,6 +1672,8 @@ static void 
adm8211_tx_raw(struct ieee80211_hw *dev, struct sk_buff *skb, /* Trigger transmit poll */ ADM8211_CSR_WRITE(TDR, 0); + + return 0; } /* Put adm8211_tx_hdr on skb and transmit */ @@ -1710,7 +1727,10 @@ static void adm8211_tx(struct ieee80211_hw *dev, txhdr->retry_limit = info->control.rates[0].count; - adm8211_tx_raw(dev, skb, plcp_signal, hdrlen); + if (adm8211_tx_raw(dev, skb, plcp_signal, hdrlen)) { + /* Drop packet */ + ieee80211_free_txskb(dev, skb); + } } static int adm8211_alloc_rings(struct ieee80211_hw *dev) -- 2.7.4
Re: [PATCH next] dctcp: update cwnd on congestion event
Neal Cardwell wrote: > On Mon, Nov 14, 2016 at 10:42 AM, Florian Westphal wrote: > > > > draft-ietf-tcpm-dctcp-02 says: > > > > ... when the sender receives an indication of congestion > > (ECE), the sender SHOULD update cwnd as follows: > > > > cwnd = cwnd * (1 - DCTCP.Alpha / 2) > > > > So, lets do this and reduce cwnd more smoothly (and faster), as per > > current congestion estimate. > > AFAICT this is doing a multiplicative decrease of cwnd on every ACK > that has an ECE bit. > > If I am reading the code correctly, then I would have two concerns: > > 1) Has that been tested? That seems like an extremely dramatic > decrease in cwnd. For example, if the cwnd is 80, and there are 40 > ACKs, and half the ACKs are ECE marked, then my back-of-the-envelope > calculations seem to suggest that after just 11 ACKs the cwnd would be > down to a minimal value of 2: > > ack 1 cwnd=60 > ack 2 cwnd=45 > ack 3 cwnd=33 [..] You are assuming alpha = 0.5? Then, yes, looks correct. Since some of these acks will most likely also end an observation window acks might also cause change to alpha. > 2) That seems to contradict another passage in the draft (v 02 or 03). > Consider > https://tools.ietf.org/html/draft-ietf-tcpm-dctcp-03 > where it says > >Just as specified in [RFC3168], DCTCP does not react to congestion >indications more than once for every window of data. > > So the draft seems to advocate not reacting to congestion indications > more than once per window. Yet this patch reacts on every ECE-marked > ACK within a window. > > Am I reading something incorrectly? No, I will raise this on tcpm next monday (if you want you can of course do this yourself). Would be easy to make it so this cwnd update only happens once in each observation cycle, but it would be even better if this would get input from draft authors. Thanks Neal!
Re: [PATCHv2 net-next 2/4] net: dsa: mv88e6xxx: Monitor and Management tables
On Fri, Dec 02, 2016 at 02:32:39PM -0500, Vivien Didelot wrote: > Hi Andrew, > > Andrew Lunn writes: > > > @@ -3184,6 +3186,8 @@ static const struct mv88e6xxx_ops mv88e6085_ops = { > > .stats_get_sset_count = mv88e6095_stats_get_sset_count, > > .stats_get_strings = mv88e6095_stats_get_strings, > > .stats_get_stats = mv88e6095_stats_get_stats, > > + .g1_set_cpu_port = mv88e6095_g1_set_cpu_port, > > + .g1_set_egress_port = mv88e6095_g1_set_egress_port, > > }; > > I like the implementation in this version better. But please explain me > why you are prefixing these operations with g1_? The prefix gives some basic grouping. port_ indicates it operates on a port, and is likely to be found in port.c. stats_ indicates it operates on statistics, ppu_ that it operates on the PHY polling unit. We are going to have some things which don't fall into a simple category, like these two. It would, however, be nice to group them, so I picked which register bank they are in. These operations are always in g1. It is a useful hint as to where to find the different variants. > But let's imagine we can set the CPU port in some Global 2 registers. > You are going to wrap this in chip.c with something like: > > int mv88e6xxx_set_cpu_port(struct mv88e6xxx_chip *chip, int port) > { > if (chip->info->ops->g2_set_cpu_port) > return chip->info->ops->g2_set_cpu_port(chip, port); > else if (chip->info->ops->g1_set_cpu_port) > return chip->info->ops->g1_set_cpu_port(chip, port); > else > return -EOPNOTSUPP; > } I answered in one of my other emails. Frames with reserved MAC addresses can be forwarded to the CPU. For most devices, this is a g2 operation. However, for 6390, it is a g1. In that case, my code does not use a prefix. Not having a prefix, when all the others do, also gives you information. It means the ops are spread around and you need to make a bigger effort to go find them. Andrew
Re: [PATCHv2 net-next 4/4] net: dsa: mv88e6xxx: Refactor CPU and DSA port setup
> The port's EgressMode, FrameMode and EtherType are really tied together > to compose the mode of the port. Setting the EtherType is somewhat separate. It is only needed on ports using EDSA. And that can only happen on a CPU port. Humm, actually, i set it when i should not. But putting this in a wrapper actually hides this. > Could you add an helper in chip.c like: > > static int mv88e6xxx_set_port_mode(struct mv88e6xxx_chip *chip, int port, >enum mv88e6xxx_frame_mode frame_mode, >u16 egress_mode, bool egress_unknown, >u16 ethertype) > { > int err; > > if (chip->info->ops->port_set_frame_mode) { > err = chip->info->ops->port_set_frame_mode(chip, port, > frame_mode); > if (err) > return err; > } Ignoring that it is not implemented here is wrong. It must be implemented, or the device is not going to work. It is a question of, do we want an oops, or return an error code. New version coming. Andrew
Re: [PATCHv2 net-next 4/4] net: dsa: mv88e6xxx: Refactor CPU and DSA port setup
Hi Andrew, Andrew Lunn writes: > +static int mv88e6xxx_setup_port_dsa(struct mv88e6xxx_chip *chip, int port, > + int upstream_port) > +{ > + int err; > + > + err = chip->info->ops->port_set_frame_mode( > + chip, port, MV88E6XXX_FRAME_MODE_DSA); > + if (err) > + return err; > + > + err = chip->info->ops->port_set_egress_unknowns( > + chip, port, port == upstream_port); > + if (err) > + return err; > + > + if (chip->info->ops->port_set_ether_type) > + return chip->info->ops->port_set_ether_type( > + chip, port, ETH_P_EDSA); > + > + return 0; > +} > + > +static int mv88e6xxx_setup_port_cpu(struct mv88e6xxx_chip *chip, int port) > +{ > + int err; > + > + switch (chip->info->tag_protocol) { > + case DSA_TAG_PROTO_EDSA: > + err = chip->info->ops->port_set_frame_mode( > + chip, port, MV88E6XXX_FRAME_MODE_ETHERTYPE); > + if (err) > + return err; > + > + err = mv88e6xxx_port_set_egress_mode( > + chip, port, PORT_CONTROL_EGRESS_ADD_TAG); > + if (err) > + return err; > + > + if (chip->info->ops->port_set_ether_type) > + err = chip->info->ops->port_set_ether_type( > + chip, port, ETH_P_EDSA); > + break; > + > + case DSA_TAG_PROTO_DSA: > + err = chip->info->ops->port_set_frame_mode( > + chip, port, MV88E6XXX_FRAME_MODE_DSA); > + if (err) > + return err; > + > + err = mv88e6xxx_port_set_egress_mode( > + chip, port, PORT_CONTROL_EGRESS_UNMODIFIED); > + break; > + default: > + err = -EINVAL; > + } > + > + if (err) > + return err; > + > + return chip->info->ops->port_set_egress_unknowns(chip, port, true); > +} > + > +static int mv88e6xxx_setup_port_normal(struct mv88e6xxx_chip *chip, int port) > +{ > + int err; > + > + err = chip->info->ops->port_set_frame_mode( > + chip, port, MV88E6XXX_FRAME_MODE_NORMAL); > + if (err) > + return err; > + > + return chip->info->ops->port_set_egress_unknowns(chip, port, false); > +} The port's EgressMode, FrameMode and EtherType are really tied together to compose the mode of the port. 
Could you add an helper in chip.c like: static int mv88e6xxx_set_port_mode(struct mv88e6xxx_chip *chip, int port, enum mv88e6xxx_frame_mode frame_mode, u16 egress_mode, bool egress_unknown, u16 ethertype) { int err; if (chip->info->ops->port_set_frame_mode) { err = chip->info->ops->port_set_frame_mode(chip, port, frame_mode); if (err) return err; } err = mv88e6xxx_port_set_egress_mode(chip, port, egress_mode); if (err) return err; if (chip->info->ops->port_set_egress_unknown) { err = chip->info->ops->port_set_egress_unknown(chip, port, egress_unknown); if (err) return err; } if (chip->info->ops->port_set_ether_type) { err = chip->info->ops->port_set_ether_type(chip, port, ethertype); if (err) return err; } return 0; } So that we correctly check for ops before calling them, and make mv88e6xxx_setup_port_{dsa,cpu,normal} a bit more concise. > + > static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) > { > struct dsa_switch *ds = chip->ds; > @@ -2473,44 +2542,25 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip > *chip, int port) >* If this is the upstream port for this switch, enable >* forwarding of unknown unicasts and multicasts. >*/ > - reg = 0; > - if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) || > - mv88e6xxx_6165_family(chip) || mv88e6xxx_6097_family(chip) || > - mv88e6xxx_6095_family(chip) || mv88e6xxx_6065_family(chip) || > - mv88e6xxx_6185_family(chip) || mv88e6xxx_6320_family(chip)) > - reg = PORT_CONTROL_IGMP_MLD_SNOOP | > + reg = PORT_CONTROL_IGMP_MLD_SNOOP | > PORT_CONTROL_USE_TAG | PORT_CONTROL_USE_IP | > PORT_CONTROL_STATE_FORWARDING; > + err = mv88e6xxx_port_write(chip, port, PORT_CONTROL, reg); > + if (err) > + return err; > + > if (dsa_is_cpu_port(ds, port)) { > - if (chip->info->tag_protocol == DSA_TAG_PROTO_EDSA) > - reg |= PORT_CONTROL_FRAME_ETHER_TYPE_DSA | > - PORT_CONTROL_FORWARD_UNKNOWN_MC; > - else > - reg |= PORT_CONTROL_DS
Re: [PATCH net-next 0/4] tcp: tsq: performance series
On Fri, Dec 2, 2016 at 10:25 AM, Eric Dumazet wrote: > Under very high TX stress, CPU handling NIC TX completions can spend > considerable amount of cycles handling TSQ (TCP Small Queues) logic. > > This patch series avoids some atomic operations, but more important > patch is the 3rd one, allowing other cpus processing ACK packets and > calling tcp_write_xmit() to grab TCP_TSQ_DEFERRED so that > tcp_tasklet_func() can skip already processed sockets. > > This avoid lots of lock acquisitions and cache lines accesses, > particularly under load. > Please do not merge this version. I probably messed something up; I need to run more tests. Thanks.
Re: [PATCH next] dctcp: update cwnd on congestion event
On Mon, Nov 14, 2016 at 10:42 AM, Florian Westphal wrote: > > draft-ietf-tcpm-dctcp-02 says: > > ... when the sender receives an indication of congestion > (ECE), the sender SHOULD update cwnd as follows: > > cwnd = cwnd * (1 - DCTCP.Alpha / 2) > > So, lets do this and reduce cwnd more smoothly (and faster), as per > current congestion estimate. AFAICT this is doing a multiplicative decrease of cwnd on every ACK that has an ECE bit. If I am reading the code correctly, then I would have two concerns: 1) Has that been tested? That seems like an extremely dramatic decrease in cwnd. For example, if the cwnd is 80, and there are 40 ACKs, and half the ACKs are ECE marked, then my back-of-the-envelope calculations seem to suggest that after just 11 ACKs the cwnd would be down to a minimal value of 2: ack 1 cwnd=60 ack 2 cwnd=45 ack 3 cwnd=33 ack 4 cwnd=24 ack 5 cwnd=18 ack 6 cwnd=13 ack 7 cwnd=9 ack 8 cwnd=6 ack 9 cwnd=4 ack 10 cwnd=3 ack 11 cwnd=2 2) That seems to contradict another passage in the draft (v 02 or 03). Consider https://tools.ietf.org/html/draft-ietf-tcpm-dctcp-03 where it says Just as specified in [RFC3168], DCTCP does not react to congestion indications more than once for every window of data. So the draft seems to advocate not reacting to congestion indications more than once per window. Yet this patch reacts on every ECE-marked ACK within a window. Am I reading something incorrectly? cheers, neal
Re: [PATCHv2 net-next 3/4] net: dsa: mv88e6xxx: Move the tagging protocol into info
On Fri, Dec 02, 2016 at 02:41:08PM -0500, Vivien Didelot wrote: > Hi Andrew, > > Andrew Lunn writes: > > > @@ -3749,6 +3756,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] > > = { > > .global1_addr = 0x1b, > > .age_time_coeff = 15000, > > .g1_irqs = 9, > > + .tag_protocol = DSA_TAG_PROTO_EDSA, > > .flags = MV88E6XXX_FLAGS_FAMILY_6352, > > .ops = &mv88e6172_ops, > > }, > > Since some chips support several protocols, we will have to turn > tag_protocol into a bitmask and introduce something like: Why? We have made an implementation choice, this chip will be used in this way. There is no strong reason to use it the other way. There is a strong reason not to allow it to be configured, because it makes the driver more complex and the DSA layer more complex, and no other driver requires this complexity. KISS. Andrew
[net-next PATCH v4 6/6] virtio_net: xdp, add slowpath case for non contiguous buffers
virtio_net XDP support expects receive buffers to be contiguous. If this is not the case we enable a slowpath to allow connectivity to continue but at a significant performance overhead associated with linearizing data. To make users painfully aware that XDP is running in a degraded mode we throw an xdp buffer error. To linearize packets we allocate a page and copy the segments of the data, including the header, into it. After this the page can be handled by XDP code flow as normal. Then depending on the return code the page is either freed or sent to the XDP xmit path. There is no attempt to optimize this path. This case is being handled simply as a precaution in case some unknown backend were to generate packets in this form. To test this I had to hack qemu and force it to generate these packets. I do not expect this case to be generated by "real" backends. Signed-off-by: John Fastabend --- drivers/net/virtio_net.c | 77 +- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 137caba..13f463d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -456,6 +456,64 @@ static struct sk_buff *receive_big(struct net_device *dev, return NULL; } +/* The conditions to enable XDP should preclude the underlying device from + * sending packets across multiple buffers (num_buf > 1). However per spec + * it does not appear to be illegal to do so but rather just against convention. + * So in order to avoid making a system unresponsive the packets are pushed + * into a page and the XDP program is run. This will be extremely slow and we + * push a warning to the user to fix this as soon as possible. Fixing this may + * require resolving the underlying hardware to determine why multiple buffers + * are being received or simply loading the XDP program in the ingress stack + * after the skb is built because there is no advantage to running it here + * anymore. 
+ */ +static struct page *xdp_linearize_page(struct receive_queue *rq, + u16 num_buf, + struct page *p, + int offset, + unsigned int *len) +{ + struct page *page = alloc_page(GFP_ATOMIC); + unsigned int page_off = 0; + + if (!page) + return NULL; + + memcpy(page_address(page) + page_off, page_address(p) + offset, *len); + page_off += *len; + + while (--num_buf) { + unsigned int buflen; + unsigned long ctx; + void *buf; + int off; + + ctx = (unsigned long)virtqueue_get_buf(rq->vq, &buflen); + if (unlikely(!ctx)) + goto err_buf; + + /* guard against a misconfigured or uncooperative backend that +* is sending packet larger than the MTU. +*/ + if ((page_off + buflen) > PAGE_SIZE) + goto err_buf; + + buf = mergeable_ctx_to_buf_address(ctx); + p = virt_to_head_page(buf); + off = buf - page_address(p); + + memcpy(page_address(page) + page_off, + page_address(p) + off, buflen); + page_off += buflen; + } + + *len = page_off; + return page; +err_buf: + __free_pages(page, 0); + return NULL; +} + static struct sk_buff *receive_mergeable(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, @@ -476,6 +534,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (xdp_prog) { + struct page *xdp_page; u32 act; /* No known backend devices should send packets with @@ -485,7 +544,15 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, */ if (unlikely(num_buf > 1)) { bpf_warn_invalid_xdp_buffer(); - goto err_xdp; + + /* linearize data for XDP */ + xdp_page = xdp_linearize_page(rq, num_buf, + page, offset, &len); + if (!xdp_page) + goto err_xdp; + offset = 0; + } else { + xdp_page = page; } /* Transient failure which in theory could occur if @@ -496,15 +563,21 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags)) goto err_xdp; - act = do_xdp_prog
[net-next PATCH v4 5/6] virtio_net: add XDP_TX support
This adds support for the XDP_TX action to virtio_net. When an XDP program is run and returns the XDP_TX action the virtio_net XDP implementation will transmit the packet on a TX queue that aligns with the current CPU that the XDP packet was processed on. Before sending the packet the header is zeroed. Also XDP is expected to handle checksum correctly so no checksum offload support is provided. Signed-off-by: John Fastabend --- drivers/net/virtio_net.c | 63 -- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b67203e..137caba 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -330,12 +330,43 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, return skb; } +static void virtnet_xdp_xmit(struct virtnet_info *vi, +unsigned int qnum, struct xdp_buff *xdp) +{ + struct send_queue *sq = &vi->sq[qnum]; + struct virtio_net_hdr_mrg_rxbuf *hdr; + unsigned int num_sg, len; + void *xdp_sent; + int err; + + /* Free up any pending old buffers before queueing new ones. 
*/ + while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) { + struct page *page = virt_to_head_page(xdp_sent); + + put_page(page); + } + + /* Zero header and leave csum up to XDP layers */ + hdr = xdp->data; + memset(hdr, 0, vi->hdr_len); + + num_sg = 1; + sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data); + err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, + xdp->data, GFP_ATOMIC); + if (unlikely(err)) + put_page(virt_to_head_page(xdp->data)); + else + virtqueue_kick(sq->vq); +} + static u32 do_xdp_prog(struct virtnet_info *vi, struct bpf_prog *xdp_prog, struct page *page, int offset, int len) { int hdr_padded_len; struct xdp_buff xdp; + unsigned int qp; u32 act; u8 *buf; @@ -353,9 +384,15 @@ static u32 do_xdp_prog(struct virtnet_info *vi, switch (act) { case XDP_PASS: return XDP_PASS; + case XDP_TX: + qp = vi->curr_queue_pairs - + vi->xdp_queue_pairs + + smp_processor_id(); + xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4); + virtnet_xdp_xmit(vi, qp, &xdp); + return XDP_TX; default: bpf_warn_invalid_xdp_action(act); - case XDP_TX: case XDP_ABORTED: case XDP_DROP: return XDP_DROP; @@ -391,8 +428,16 @@ static struct sk_buff *receive_big(struct net_device *dev, if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags)) goto err_xdp; act = do_xdp_prog(vi, xdp_prog, page, 0, len); - if (act == XDP_DROP) + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + rcu_read_unlock(); + goto xdp_xmit; + case XDP_DROP: + default: goto err_xdp; + } } rcu_read_unlock(); @@ -407,6 +452,7 @@ static struct sk_buff *receive_big(struct net_device *dev, err: dev->stats.rx_dropped++; give_pages(rq, page); +xdp_xmit: return NULL; } @@ -425,6 +471,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, struct bpf_prog *xdp_prog; unsigned int truesize; + head_skb = NULL; + rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (xdp_prog) { @@ -449,8 +497,16 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, goto err_xdp; act = 
do_xdp_prog(vi, xdp_prog, page, offset, len); - if (act == XDP_DROP) + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + rcu_read_unlock(); + goto xdp_xmit; + case XDP_DROP: + default: goto err_xdp; + } } rcu_read_unlock(); @@ -528,6 +584,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, err_buf: dev->stats.rx_dropped++; dev_kfree_skb(head_skb); +xdp_xmit: return NULL; }
[net-next PATCH v4 4/6] virtio_net: add dedicated XDP transmit queues
XDP requires using isolated transmit queues to avoid interference with the normal networking stack (BQL, NETDEV_TX_BUSY, etc). This patch adds an XDP queue per cpu when an XDP program is loaded and does not expose the queues to the OS via the normal API call to netif_set_real_num_tx_queues(). This way the stack will never push an skb to these queues. However, the virtio/vhost/qemu implementation only allows for creating TX/RX queue pairs at this time, so creating only TX queues was not possible. And because the associated RX queues are being created I went ahead and exposed these to the stack and let the backend use them. This creates more RX queues visible to the network stack than TX queues, which is worth mentioning but does not cause any issues as far as I can tell. Signed-off-by: John Fastabend --- drivers/net/virtio_net.c | 30 -- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 80b1cfc..b67203e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -114,6 +114,9 @@ struct virtnet_info { /* # of queue pairs currently used by the driver */ u16 curr_queue_pairs; + /* # of XDP queue pairs currently used by the driver */ + u16 xdp_queue_pairs; + /* I like... big packets and I cannot lie! 
*/ bool big_packets; @@ -1552,7 +1555,8 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog) unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr); struct virtnet_info *vi = netdev_priv(dev); struct bpf_prog *old_prog; - int i; + u16 xdp_qp = 0, curr_qp; + int i, err; if ((dev->features & NETIF_F_LRO) && prog) { netdev_warn(dev, "can't set XDP while LRO is on, disable LRO first\n"); @@ -1569,12 +1573,34 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog) return -EINVAL; } + curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs; + if (prog) + xdp_qp = nr_cpu_ids; + + /* XDP requires extra queues for XDP_TX */ + if (curr_qp + xdp_qp > vi->max_queue_pairs) { + netdev_warn(dev, "request %i queues but max is %i\n", + curr_qp + xdp_qp, vi->max_queue_pairs); + return -ENOMEM; + } + + err = virtnet_set_queues(vi, curr_qp + xdp_qp); + if (err) { + dev_warn(&dev->dev, "XDP Device queue allocation failure.\n"); + return err; + } + if (prog) { prog = bpf_prog_add(prog, vi->max_queue_pairs - 1); - if (IS_ERR(prog)) + if (IS_ERR(prog)) { + virtnet_set_queues(vi, curr_qp); return PTR_ERR(prog); + } } + vi->xdp_queue_pairs = xdp_qp; + netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp); + for (i = 0; i < vi->max_queue_pairs; i++) { old_prog = rtnl_dereference(vi->rq[i].xdp_prog); rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
[net-next PATCH v4 3/6] virtio_net: Add XDP support
From: John Fastabend This adds XDP support to virtio_net. Some requirements must be met for XDP to be enabled depending on the mode. First it will only be supported with LRO disabled so that data is not pushed across multiple buffers. Second the MTU must be less than a page size to avoid having to handle XDP across multiple pages. If mergeable receive is enabled this patch only supports the case where header and data are in the same buf which we can check when a packet is received by looking at num_buf. If the num_buf is greater than 1 and a XDP program is loaded the packet is dropped and a warning is thrown. When any_header_sg is set this does not happen and both header and data is put in a single buffer as expected so we check this when XDP programs are loaded. Subsequent patches will process the packet in a degraded mode to ensure connectivity and correctness is not lost even if backend pushes packets into multiple buffers. If big packets mode is enabled and MTU/LRO conditions above are met then XDP is allowed. This patch was tested with qemu with vhost=on and vhost=off where mergeable and big_packet modes were forced via hard coding feature negotiation. Multiple buffers per packet was forced via a small test patch to vhost.c in the vhost=on qemu mode. Suggested-by: Shrijeet Mukherjee Signed-off-by: John Fastabend --- drivers/net/virtio_net.c | 175 +- 1 file changed, 170 insertions(+), 5 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d814e7cb..80b1cfc 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,8 @@ struct receive_queue { struct napi_struct napi; + struct bpf_prog __rcu *xdp_prog; + /* Chain pages by the private ptr. 
*/ struct page *pages; @@ -324,6 +327,38 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, return skb; } +static u32 do_xdp_prog(struct virtnet_info *vi, + struct bpf_prog *xdp_prog, + struct page *page, int offset, int len) +{ + int hdr_padded_len; + struct xdp_buff xdp; + u32 act; + u8 *buf; + + buf = page_address(page) + offset; + + if (vi->mergeable_rx_bufs) + hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf); + else + hdr_padded_len = sizeof(struct padded_vnet_hdr); + + xdp.data = buf + hdr_padded_len; + xdp.data_end = xdp.data + (len - vi->hdr_len); + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + switch (act) { + case XDP_PASS: + return XDP_PASS; + default: + bpf_warn_invalid_xdp_action(act); + case XDP_TX: + case XDP_ABORTED: + case XDP_DROP: + return XDP_DROP; + } +} + static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len) { struct sk_buff * skb = buf; @@ -340,14 +375,32 @@ static struct sk_buff *receive_big(struct net_device *dev, void *buf, unsigned int len) { + struct bpf_prog *xdp_prog; struct page *page = buf; - struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE); + struct sk_buff *skb; + rcu_read_lock(); + xdp_prog = rcu_dereference(rq->xdp_prog); + if (xdp_prog) { + struct virtio_net_hdr_mrg_rxbuf *hdr = buf; + u32 act; + + if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags)) + goto err_xdp; + act = do_xdp_prog(vi, xdp_prog, page, 0, len); + if (act == XDP_DROP) + goto err_xdp; + } + rcu_read_unlock(); + + skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE); if (unlikely(!skb)) goto err; return skb; +err_xdp: + rcu_read_unlock(); err: dev->stats.rx_dropped++; give_pages(rq, page); @@ -365,11 +418,42 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); struct page *page = virt_to_head_page(buf); int offset = buf - page_address(page); - unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx)); + struct 
sk_buff *head_skb, *curr_skb; + struct bpf_prog *xdp_prog; + unsigned int truesize; + + rcu_read_lock(); + xdp_prog = rcu_dereference(rq->xdp_prog); + if (xdp_prog) { + u32 act; + + /* No known backend devices should send packets with +* more than a single buffer when XDP conditions are +* met. However it is not strictly illegal so the case +* is handled as an exception and a warning is thrown. +
[net-next PATCH v4 2/6] net: xdp: add invalid buffer warning
This adds a warning for drivers to use when encountering an invalid buffer for XDP. For normal cases this should not happen, but having a standard warning is useful to catch behavior from the emulation layer in virtual/qemu setups that I may not have expected. Signed-off-by: John Fastabend --- include/linux/filter.h |1 + net/core/filter.c |6 ++ 2 files changed, 7 insertions(+) diff --git a/include/linux/filter.h b/include/linux/filter.h index 7f246a2..90dfc3c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -595,6 +595,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); void bpf_warn_invalid_xdp_action(u32 act); +void bpf_warn_invalid_xdp_buffer(void); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; diff --git a/net/core/filter.c b/net/core/filter.c index 698a262..7926dd0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2783,6 +2783,12 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +void bpf_warn_invalid_xdp_buffer(void) +{ + WARN_ONCE(1, "Illegal XDP buffer encountered, expect throughput degradation\n"); +} +EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_buffer); + static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf,
[net-next PATCH v4 1/6] net: virtio dynamically disable/enable LRO
This adds support for dynamically setting the LRO feature flag. The message to control guest features in the backend uses the CTRL_GUEST_OFFLOADS msg type. Signed-off-by: John Fastabend --- drivers/net/virtio_net.c | 45 - 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a21d93a..d814e7cb 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1419,6 +1419,41 @@ static void virtnet_init_settings(struct net_device *dev) .set_settings = virtnet_set_settings, }; +static int virtnet_set_features(struct net_device *netdev, + netdev_features_t features) +{ + struct virtnet_info *vi = netdev_priv(netdev); + struct virtio_device *vdev = vi->vdev; + struct scatterlist sg; + u64 offloads = 0; + + if (features & NETIF_F_LRO) + offloads |= (1 << VIRTIO_NET_F_GUEST_TSO4) | + (1 << VIRTIO_NET_F_GUEST_TSO6); + + if (features & NETIF_F_RXCSUM) + offloads |= (1 << VIRTIO_NET_F_GUEST_CSUM); + + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) { + sg_init_one(&sg, &offloads, sizeof(uint64_t)); + if (!virtnet_send_command(vi, + VIRTIO_NET_CTRL_GUEST_OFFLOADS, + VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, + &sg)) { + dev_warn(&netdev->dev, +"Failed to set guest offloads by virtnet command.\n"); + return -EINVAL; + } + } else if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) && + !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) { + dev_warn(&netdev->dev, +"No support for setting offloads pre version_1.\n"); + return -EINVAL; + } + + return 0; +} + static const struct net_device_ops virtnet_netdev = { .ndo_open= virtnet_open, .ndo_stop= virtnet_close, @@ -1435,6 +1470,7 @@ static void virtnet_init_settings(struct net_device *dev) #ifdef CONFIG_NET_RX_BUSY_POLL .ndo_busy_poll = virtnet_busy_poll, #endif + .ndo_set_features = virtnet_set_features, }; static void virtnet_config_changed_work(struct work_struct *work) @@ -1815,6 +1851,12 @@ static int virtnet_probe(struct virtio_device 
*vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM)) dev->features |= NETIF_F_RXCSUM; + if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) && + virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) { + dev->features |= NETIF_F_LRO; + dev->hw_features |= NETIF_F_LRO; + } + dev->vlan_features = dev->features; /* MTU range: 68 - 65535 */ @@ -2057,7 +2099,8 @@ static int virtnet_restore(struct virtio_device *vdev) VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \ VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \ VIRTIO_NET_F_CTRL_MAC_ADDR, \ - VIRTIO_NET_F_MTU + VIRTIO_NET_F_MTU, \ + VIRTIO_NET_F_CTRL_GUEST_OFFLOADS static unsigned int features[] = { VIRTNET_FEATURES,
[net-next PATCH v4 0/6] XDP for virtio_net
This implements XDP for virtio_net in the mergeable buffers and big_packet modes. I tested this with vhost_net running on qemu and did not see any issues. For testing num_buf > 1 I added a hack to the vhost driver to only use 100 bytes per buffer so that packets were pushed across multiple buffers. There are some restrictions for XDP to be enabled and work well (see patch 3 for more details). 1. LRO must be off 2. MTU must be less than PAGE_SIZE 3. queues must be available to dedicate to XDP 4. num_bufs received in mergeable buffers must be 1 5. big_packet mode must have all data on single page Please review; any comments/feedback are welcome as always. --- John Fastabend (6): net: virtio dynamically disable/enable LRO net: xdp: add invalid buffer warning virtio_net: Add XDP support virtio_net: add dedicated XDP transmit queues virtio_net: add XDP_TX support virtio_net: xdp, add slowpath case for non contiguous buffers drivers/net/virtio_net.c | 376 +- include/linux/filter.h |1 net/core/filter.c|6 + 3 files changed, 377 insertions(+), 6 deletions(-) -- Signature
Re: [PATCH] net: wireless: realtek: constify rate_control_ops structures
On 12/02/2016 03:50 AM, Bhumika Goyal wrote: The structures rate_control_ops are only passed as an argument to the functions ieee80211_rate_control_{register/unregister}. This argument is of type const, so rate_control_ops having this property can also be declared as const. Done using Coccinelle: @r1 disable optional_qualifier @ identifier i; position p; @@ static struct rate_control_ops i@p = {...}; @ok1@ identifier r1.i; position p; @@ ieee80211_rate_control_register(&i@p) @ok2@ identifier r1.i; position p; @@ ieee80211_rate_control_unregister(&i@p) @bad@ position p!={r1.p,ok1.p,ok2.p}; identifier r1.i; @@ i@p @depends on !bad disable optional_qualifier@ identifier r1.i; @@ static +const struct rate_control_ops i={...}; @depends on !bad disable optional_qualifier@ identifier r1.i; @@ +const struct rate_control_ops i; File size before: text data bss dec hex filename 1991 104 0 2095 82f wireless/realtek/rtlwifi/rc.o File size after: text data bss dec hex filename 2095 0 0 2095 82f wireless/realtek/rtlwifi/rc.o Signed-off-by: Bhumika Goyal --- drivers/net/wireless/realtek/rtlwifi/rc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rc.c b/drivers/net/wireless/realtek/rtlwifi/rc.c index ce8621a..107c13c 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rc.c +++ b/drivers/net/wireless/realtek/rtlwifi/rc.c @@ -284,7 +284,7 @@ static void rtl_rate_free_sta(void *rtlpriv, kfree(rate_priv); } -static struct rate_control_ops rtl_rate_ops = { +static const struct rate_control_ops rtl_rate_ops = { .name = "rtl_rc", .alloc = rtl_rate_alloc, .free = rtl_rate_free, The content of your patch is OK; however, your subject is not. By convention, "net: wireless: realtek:" is assumed. We do, however, include "rtlwifi:" to indicate which part of drivers/net/wireless/realtek/ is referenced. NACK Larry
[PATCH v3 05/13] net: ethernet: ti: cpts: fix registration order
The ptp clock is registered before the spinlock, which protects it, is initialized, and before timecounter and cyclecounter initialization in cpts_register(). So, ensure that the ptp clock is registered last, after everything else is done. Acked-by: Richard Cochran Signed-off-by: Grygorii Strashko --- drivers/net/ethernet/ti/cpts.c | 24 ++-- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 61198f1..3dda6d5 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -356,15 +356,8 @@ int cpts_register(struct device *dev, struct cpts *cpts, u32 mult, u32 shift) { int err, i; - unsigned long flags; cpts->info = cpts_info; - cpts->clock = ptp_clock_register(&cpts->info, dev); - if (IS_ERR(cpts->clock)) { - err = PTR_ERR(cpts->clock); - cpts->clock = NULL; - return err; - } spin_lock_init(&cpts->lock); cpts->cc.read = cpts_systim_read; @@ -382,15 +375,26 @@ int cpts_register(struct device *dev, struct cpts *cpts, cpts_write32(cpts, CPTS_EN, control); cpts_write32(cpts, TS_PEND_EN, int_enable); - spin_lock_irqsave(&cpts->lock, flags); timecounter_init(&cpts->tc, &cpts->cc, ktime_to_ns(ktime_get_real())); - spin_unlock_irqrestore(&cpts->lock, flags); INIT_DELAYED_WORK(&cpts->overflow_work, cpts_overflow_check); - schedule_delayed_work(&cpts->overflow_work, CPTS_OVERFLOW_PERIOD); + cpts->clock = ptp_clock_register(&cpts->info, dev); + if (IS_ERR(cpts->clock)) { + err = PTR_ERR(cpts->clock); + cpts->clock = NULL; + goto err_ptp; + } cpts->phc_index = ptp_clock_index(cpts->clock); + + schedule_delayed_work(&cpts->overflow_work, CPTS_OVERFLOW_PERIOD); + return 0; + +err_ptp: + if (cpts->refclk) + cpts_clk_release(cpts); + return err; } EXPORT_SYMBOL_GPL(cpts_register); -- 2.10.1
[PATCH v3 08/13] net: ethernet: ti: cpts: drop excessive writes to CTRL and INT_EN regs
The CPTS module and IRQs are always enabled when CPTS is registered, before starting the overflow check work, and disabled during deregistration, when the overflow check work has already been canceled. So, it is not necessary to (re)enable the CPTS module and IRQs in cpts_overflow_check(). Acked-by: Richard Cochran Signed-off-by: Grygorii Strashko --- drivers/net/ethernet/ti/cpts.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 8266459..a662c33 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -243,8 +243,6 @@ static void cpts_overflow_check(struct work_struct *work) struct timespec64 ts; struct cpts *cpts = container_of(work, struct cpts, overflow_work.work); - cpts_write32(cpts, CPTS_EN, control); - cpts_write32(cpts, TS_PEND_EN, int_enable); cpts_ptp_gettime(&cpts->info, &ts); pr_debug("cpts overflow check at %lld.%09lu\n", ts.tv_sec, ts.tv_nsec); schedule_delayed_work(&cpts->overflow_work, CPTS_OVERFLOW_PERIOD); -- 2.10.1
[PATCH v3 01/13] net: ethernet: ti: cpts: switch to readl/writel_relaxed()
Switch to the readl/writel_relaxed() APIs, because this is the recommended API and the CPTS IP is reused on Keystone 2 SoCs where LE/BE modes are supported. Acked-by: Richard Cochran Signed-off-by: Grygorii Strashko --- drivers/net/ethernet/ti/cpts.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 85a55b4..a42c449 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -33,8 +33,8 @@ #ifdef CONFIG_TI_CPTS -#define cpts_read32(c, r) __raw_readl(&c->reg->r) -#define cpts_write32(c, v, r) __raw_writel(v, &c->reg->r) +#define cpts_read32(c, r) readl_relaxed(&c->reg->r) +#define cpts_write32(c, v, r) writel_relaxed(v, &c->reg->r) static int event_expired(struct cpts_event *event) { -- 2.10.1
[PATCH v3 09/13] net: ethernet: ti: cpts: rework initialization/deinitialization
The current implementation of CPTS initialization and deinitialization (represented by cpts_register/unregister()) does too much static initialization from .ndo_open(), which is reasonable to do once at probe time instead, and also requires the caller to allocate memory for struct cpts, which is internal to the CPTS driver in general. This patch splits CPTS initialization and deinitialization into two parts: - static initialization cpts_create()/cpts_release(), which is expected to be executed when the parent driver is probed/removed; - dynamic part cpts_register/unregister(), which is expected to be executed when the network device is opened/closed. As a result, the current code of the CPTS parent driver - CPSW - will be simplified (and it will also simplify adding support for Keystone 2 devices in the future), plus more initialization errors will be caught earlier. In addition, this change allows cleaning up cpts.h for the case when CPTS is disabled. Signed-off-by: Grygorii Strashko --- drivers/net/ethernet/ti/cpsw.c | 24 +- drivers/net/ethernet/ti/cpts.c | 102 - drivers/net/ethernet/ti/cpts.h | 26 +-- 3 files changed, 95 insertions(+), 57 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index a6a93ad..6c28ef1 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1406,9 +1406,7 @@ static int cpsw_ndo_open(struct net_device *ndev) if (ret < 0) goto err_cleanup; - if (cpts_register(cpsw->dev, cpsw->cpts, - cpsw->data.cpts_clock_mult, - cpsw->data.cpts_clock_shift)) + if (cpts_register(cpsw->cpts)) dev_err(priv->dev, "error registering cpts device\n"); } @@ -2596,6 +2594,7 @@ static int cpsw_probe(struct platform_device *pdev) struct cpdma_params dma_params; struct cpsw_ale_params ale_params; void __iomem*ss_regs; + void __iomem*cpts_regs; struct resource *res, *ss_res; const struct of_device_id *of_id; struct gpio_descs *mode; @@ -2623,12 +2622,6 @@ static int cpsw_probe(struct platform_device *pdev) priv->dev = &ndev->dev; 
priv->msg_enable = netif_msg_init(debug_level, CPSW_DEBUG); cpsw->rx_packet_max = max(rx_packet_max, 128); - cpsw->cpts = devm_kzalloc(&pdev->dev, sizeof(struct cpts), GFP_KERNEL); - if (!cpsw->cpts) { - dev_err(&pdev->dev, "error allocating cpts\n"); - ret = -ENOMEM; - goto clean_ndev_ret; - } mode = devm_gpiod_get_array_optional(&pdev->dev, "mode", GPIOD_OUT_LOW); if (IS_ERR(mode)) { @@ -2716,7 +2709,7 @@ static int cpsw_probe(struct platform_device *pdev) switch (cpsw->version) { case CPSW_VERSION_1: cpsw->host_port_regs = ss_regs + CPSW1_HOST_PORT_OFFSET; - cpsw->cpts->reg = ss_regs + CPSW1_CPTS_OFFSET; + cpts_regs = ss_regs + CPSW1_CPTS_OFFSET; cpsw->hw_stats = ss_regs + CPSW1_HW_STATS; dma_params.dmaregs = ss_regs + CPSW1_CPDMA_OFFSET; dma_params.txhdp = ss_regs + CPSW1_STATERAM_OFFSET; @@ -2730,7 +2723,7 @@ static int cpsw_probe(struct platform_device *pdev) case CPSW_VERSION_3: case CPSW_VERSION_4: cpsw->host_port_regs = ss_regs + CPSW2_HOST_PORT_OFFSET; - cpsw->cpts->reg = ss_regs + CPSW2_CPTS_OFFSET; + cpts_regs = ss_regs + CPSW2_CPTS_OFFSET; cpsw->hw_stats = ss_regs + CPSW2_HW_STATS; dma_params.dmaregs = ss_regs + CPSW2_CPDMA_OFFSET; dma_params.txhdp = ss_regs + CPSW2_STATERAM_OFFSET; @@ -2796,6 +2789,14 @@ static int cpsw_probe(struct platform_device *pdev) goto clean_dma_ret; } + cpsw->cpts = cpts_create(cpsw->dev, cpts_regs, +cpsw->data.cpts_clock_mult, +cpsw->data.cpts_clock_shift); + if (IS_ERR(cpsw->cpts)) { + ret = PTR_ERR(cpsw->cpts); + goto clean_ale_ret; + } + ndev->irq = platform_get_irq(pdev, 1); if (ndev->irq < 0) { dev_err(priv->dev, "error getting irq resource\n"); @@ -2911,6 +2912,7 @@ static int cpsw_remove(struct platform_device *pdev) unregister_netdev(cpsw->slaves[1].ndev); unregister_netdev(ndev); + cpts_release(cpsw->cpts); cpsw_ale_destroy(cpsw->ale); cpdma_ctlr_destroy(cpsw->dma); cpsw_remove_dt(pdev); diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index a662c33..47831b2 100644 --- 
a/drivers/net/ethernet/ti/cpt
Re: [PATCH net-next 2/2] net/sched: cls_flower: Support matching on ICMP type and code
Hi Jiri, On Fri, Dec 02, 2016 at 08:17:13PM +0100, Simon Horman wrote: > On Fri, Dec 02, 2016 at 07:38:48PM +0100, Jiri Pirko wrote: > > Fri, Dec 02, 2016 at 07:05:51PM CET, simon.hor...@netronome.com wrote: > > >Support matching on ICMP type and code. ... > > This hunk looks like it should be squashed to the previous patch. > > I included it in this patch as it is where these helpers are used > for the first time. I can shuffle it into the first patch if you prefer; > I agree it does make sense to put all the dissector changes there. I moved things around as you suggested and posted v2.
[PATCH v3 12/13] net: ethernet: ti: cpts: calc mult and shift from refclk freq
The cyclecounter mult and shift values can be calculated based on the CPTS refclk frequency, and the timekeeping framework provides the required algorithms and APIs. Hence, calculate mult and shift based on the CPTS refclk frequency if both cpts_clock_shift and cpts_clock_mult properties are not provided in DT (the basis of the calculation algorithm is borrowed from __clocksource_update_freq_scale() commit 7d2f944a2b83 ("clocksource: Provide a generic mult/shift factor calculation")). After this change the cpts_clock_shift and cpts_clock_mult DT properties will become optional. Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Grygorii Strashko --- Documentation/devicetree/bindings/net/cpsw.txt | 8 ++-- drivers/net/ethernet/ti/cpts.c | 53 +++--- 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/Documentation/devicetree/bindings/net/cpsw.txt b/Documentation/devicetree/bindings/net/cpsw.txt index 5ad439f..ebda7c9 100644 --- a/Documentation/devicetree/bindings/net/cpsw.txt +++ b/Documentation/devicetree/bindings/net/cpsw.txt @@ -20,8 +20,6 @@ Required properties: - slaves : Specifies number for slaves - active_slave : Specifies the slave to use for time stamping, ethtool and SIOCGMIIPHY -- cpts_clock_mult : Numerator to convert input clock ticks into nanoseconds -- cpts_clock_shift : Denominator to convert input clock ticks into nanoseconds Optional properties: - ti,hwmods: Must be "cpgmac0" @@ -35,7 +33,11 @@ Optional properties: For example in dra72x-evm, pcf gpio has to be driven low so that cpsw slave 0 and phy data lines are connected via mux. - +- cpts_clock_mult : Numerator to convert input clock ticks into nanoseconds +- cpts_clock_shift : Denominator to convert input clock ticks into nanoseconds + Mult and shift will be calculated basing on CPTS + rftclk frequency if both cpts_clock_shift and + cpts_clock_mult properties are not provided. 
Slave Properties: Required properties: diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 5d5c46d..806241b 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -409,21 +409,60 @@ void cpts_unregister(struct cpts *cpts) } EXPORT_SYMBOL_GPL(cpts_unregister); +static void cpts_calc_mult_shift(struct cpts *cpts) +{ + u64 frac, maxsec, ns; + u32 freq, mult, shift; + + freq = clk_get_rate(cpts->refclk); + + /* Calc the maximum number of seconds which we can run before +* wrapping around. +*/ + maxsec = cpts->cc.mask; + do_div(maxsec, freq); + /* limit conversation rate to 10 sec as higher values will produce +* too small mult factors and so reduce the conversion accuracy +*/ + if (maxsec > 10) + maxsec = 10; + + if (cpts->cc_mult || cpts->cc.shift) + return; + + clocks_calc_mult_shift(&mult, &shift, freq, NSEC_PER_SEC, maxsec); + + cpts->cc_mult = mult; + cpts->cc.mult = mult; + cpts->cc.shift = shift; + + frac = 0; + ns = cyclecounter_cyc2ns(&cpts->cc, freq, cpts->cc.mask, &frac); + + dev_info(cpts->dev, +"CPTS: ref_clk_freq:%u calc_mult:%u calc_shift:%u error:%lld nsec/sec\n", +freq, cpts->cc_mult, cpts->cc.shift, (ns - NSEC_PER_SEC)); +} + static int cpts_of_parse(struct cpts *cpts, struct device_node *node) { int ret = -EINVAL; u32 prop; - if (of_property_read_u32(node, "cpts_clock_mult", &prop)) - goto of_error; /* save cc.mult original value as it can be modified * by cpts_ptp_adjfreq(). 
*/ - cpts->cc_mult = prop; + cpts->cc_mult = 0; + if (!of_property_read_u32(node, "cpts_clock_mult", &prop)) + cpts->cc_mult = prop; + + cpts->cc.shift = 0; + if (!of_property_read_u32(node, "cpts_clock_shift", &prop)) + cpts->cc.shift = prop; - if (of_property_read_u32(node, "cpts_clock_shift", &prop)) - goto of_error; - cpts->cc.shift = prop; + if ((cpts->cc_mult && !cpts->cc.shift) || + (!cpts->cc_mult && cpts->cc.shift)) + goto of_error; return 0; @@ -463,6 +502,8 @@ struct cpts *cpts_create(struct device *dev, void __iomem *regs, cpts->cc.mask = CLOCKSOURCE_MASK(32); cpts->info = cpts_info; + cpts_calc_mult_shift(cpts); + return cpts; } EXPORT_SYMBOL_GPL(cpts_create); -- 2.10.1