[PATCH -next v2] libbpf: remove redundant semi-colon

2021-04-01 Thread Yang Yingliang
Remove redundant semi-colon in infinalize_btf_ext().

Signed-off-by: Yang Yingliang 
---
v2:
  add commit log
---
 tools/lib/bpf/linker.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index 46b16cbdcda3..4e08bc07e635 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -1895,7 +1895,7 @@ static int finalize_btf_ext(struct bpf_linker *linker)
hdr->func_info_len = funcs_sz;
hdr->line_info_off = funcs_sz;
hdr->line_info_len = lines_sz;
-   hdr->core_relo_off = funcs_sz + lines_sz;;
+   hdr->core_relo_off = funcs_sz + lines_sz;
hdr->core_relo_len = core_relos_sz;
 
if (funcs_sz) {
-- 
2.25.1



[PATCH -next] libbpf: remove redundant semi-colon

2021-04-01 Thread Yang Yingliang
Signed-off-by: Yang Yingliang 
---
 tools/lib/bpf/linker.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index 46b16cbdcda3..4e08bc07e635 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -1895,7 +1895,7 @@ static int finalize_btf_ext(struct bpf_linker *linker)
hdr->func_info_len = funcs_sz;
hdr->line_info_off = funcs_sz;
hdr->line_info_len = lines_sz;
-   hdr->core_relo_off = funcs_sz + lines_sz;;
+   hdr->core_relo_off = funcs_sz + lines_sz;
hdr->core_relo_len = core_relos_sz;
 
if (funcs_sz) {
-- 
2.25.1



[PATCH -next] lan743x: remove redundant semi-colon

2021-04-01 Thread Yang Yingliang
Signed-off-by: Yang Yingliang 
---
 drivers/net/ethernet/microchip/lan743x_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/microchip/lan743x_main.c 
b/drivers/net/ethernet/microchip/lan743x_main.c
index 1c3e204d727c..e7ab5f3f73fd 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -3004,7 +3004,7 @@ static int lan743x_pm_suspend(struct device *dev)
lan743x_pm_set_wol(adapter);
 
/* Host sets PME_En, put D3hot */
-   return pci_prepare_to_sleep(pdev);;
+   return pci_prepare_to_sleep(pdev);
 }
 
 static int lan743x_pm_resume(struct device *dev)
-- 
2.25.1



[PATCH -next] net/tipc: fix missing destroy_workqueue() on error in tipc_crypto_start()

2021-03-31 Thread Yang Yingliang
Add the missing destroy_workqueue() before return from
tipc_crypto_start() in the error handling case.

Fixes: 1ef6f7c9390f ("tipc: add automatic session key exchange")
Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 net/tipc/crypto.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index 6f64acef73dc..76b8428c94a7 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -1492,6 +1492,8 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct 
net *net,
/* Allocate statistic structure */
c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC);
if (!c->stats) {
+   if (c->wq)
+   destroy_workqueue(c->wq);
kfree_sensitive(c);
return -ENOMEM;
}
-- 
2.25.1



[PATCH -next] net: mhi: remove pointless conditional before kfree_skb()

2021-03-30 Thread Yang Yingliang
It already has null pointer check in kfree_skb(),
remove pointless pointer check before kfree_skb().

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 drivers/net/mhi/net.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/mhi/net.c b/drivers/net/mhi/net.c
index f59960876083..8b44c674db1b 100644
--- a/drivers/net/mhi/net.c
+++ b/drivers/net/mhi/net.c
@@ -359,8 +359,7 @@ static void mhi_net_remove(struct mhi_device *mhi_dev)
 
mhi_unprepare_from_transfer(mhi_netdev->mdev);
 
-   if (mhi_netdev->skbagg_head)
-   kfree_skb(mhi_netdev->skbagg_head);
+   kfree_skb(mhi_netdev->skbagg_head);
 
free_netdev(mhi_netdev->ndev);
 }
-- 
2.25.1



[PATCH -next] netfilter: nftables: remove unnecessary spin_lock_init()

2021-03-29 Thread Yang Yingliang
The spinlock nf_tables_destroy_list_lock is initialized statically.
It is unnecessary to initialize by spin_lock_init().

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 net/netfilter/nf_tables_api.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index fc2526b8bd55..24eeb027a888 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -9193,7 +9193,6 @@ static int __init nf_tables_module_init(void)
 {
int err;
 
-   spin_lock_init(&nf_tables_destroy_list_lock);
err = register_pernet_subsys(&nf_tables_net_ops);
if (err < 0)
return err;
-- 
2.25.1



[PATCH -next] net: mdio: Correct function name mdio45_links_ok() in comment

2021-03-29 Thread Yang Yingliang
Fix the following make W=1 kernel build warning:

 drivers/net/mdio.c:95: warning: expecting prototype for mdio_link_ok(). 
Prototype was for mdio45_links_ok() instead

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 drivers/net/mdio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/mdio.c b/drivers/net/mdio.c
index 5e72cc55afbd..e08c90ac0c6e 100644
--- a/drivers/net/mdio.c
+++ b/drivers/net/mdio.c
@@ -83,7 +83,7 @@ int mdio_set_flag(const struct mdio_if_info *mdio,
 EXPORT_SYMBOL(mdio_set_flag);
 
 /**
- * mdio_link_ok - is link status up/OK
+ * mdio45_links_ok - is link status up/OK
  * @mdio: MDIO interface
  * @mmd_mask: Mask for MMDs to check
  *
-- 
2.25.1



[PATCH -next] net: bonding: Correct function name bond_change_active_slave() in comment

2021-03-29 Thread Yang Yingliang
Fix the following make W=1 kernel build warning:

 drivers/net/bonding/bond_main.c:982: warning: expecting prototype for 
change_active_interface(). Prototype was for bond_change_active_slave() instead

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 drivers/net/bonding/bond_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 74cbbb22470b..d5ca38aa8aa9 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -964,7 +964,7 @@ static bool bond_should_notify_peers(struct bonding *bond)
 }
 
 /**
- * change_active_interface - change the active slave into the specified one
+ * bond_change_active_slave - change the active slave into the specified one
  * @bond: our bonding struct
  * @new_active: the new slave to make the active one
  *
-- 
2.25.1



[PATCH -next] net: phy: Correct function name mdiobus_register_board_info() in comment

2021-03-29 Thread Yang Yingliang
Fix the following make W=1 kernel build warning:

 drivers/net/phy/mdio-boardinfo.c:63: warning: expecting prototype for 
mdio_register_board_info(). Prototype was for mdiobus_register_board_info() 
instead

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 drivers/net/phy/mdio-boardinfo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/mdio-boardinfo.c b/drivers/net/phy/mdio-boardinfo.c
index 033df435f76c..2de679a68115 100644
--- a/drivers/net/phy/mdio-boardinfo.c
+++ b/drivers/net/phy/mdio-boardinfo.c
@@ -50,7 +50,7 @@ void mdiobus_setup_mdiodev_from_board_info(struct mii_bus 
*bus,
 EXPORT_SYMBOL(mdiobus_setup_mdiodev_from_board_info);
 
 /**
- * mdio_register_board_info - register MDIO devices for a given board
+ * mdiobus_register_board_info - register MDIO devices for a given board
  * @info: array of devices descriptors
  * @n: number of descriptors provided
  * Context: can sleep
-- 
2.25.1



[PATCH -next] net: stmmac: fix missing unlock on error in stmmac_suspend()

2021-03-27 Thread Yang Yingliang
Add the missing unlock before return from stmmac_suspend()
in the error handling case.

Fixes: 5ec55823438e ("net: stmmac: add clocks management for gmac driver")
Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 170296820af0..973bdf11e59d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -5605,8 +5605,10 @@ int stmmac_suspend(struct device *dev)
/* Disable clock in case of PWM is off */
clk_disable_unprepare(priv->plat->clk_ptp_ref);
ret = pm_runtime_force_suspend(dev);
-   if (ret)
+   if (ret) {
+   mutex_unlock(&priv->lock);
return ret;
+   }
}
 
mutex_unlock(&priv->lock);
-- 
2.25.1



[PATCH -next 3/3] net: llc: Correct function name llc_pdu_set_pf_bit() in header

2021-03-26 Thread Yang Yingliang
Fix the following make W=1 kernel build warning:

 net/llc/llc_pdu.c:36: warning: expecting prototype for pdu_set_pf_bit(). 
Prototype was for llc_pdu_set_pf_bit() instead

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 net/llc/llc_pdu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/llc/llc_pdu.c b/net/llc/llc_pdu.c
index 792d195c8bae..63749dde542f 100644
--- a/net/llc/llc_pdu.c
+++ b/net/llc/llc_pdu.c
@@ -24,7 +24,7 @@ void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 pdu_type)
 }
 
 /**
- * pdu_set_pf_bit - sets poll/final bit in LLC header
+ * llc_pdu_set_pf_bit - sets poll/final bit in LLC header
  * @skb: Frame to set bit in
  * @bit_value: poll/final bit (0 or 1).
  *
-- 
2.25.1



[PATCH -next 1/3] net: llc: Correct some function names in header

2021-03-26 Thread Yang Yingliang
Fix the following make W=1 kernel build warning:

 net/llc/llc_c_ev.c:622: warning: expecting prototype for 
conn_ev_qlfy_last_frame_eq_1(). Prototype was for 
llc_conn_ev_qlfy_last_frame_eq_1() instead
 net/llc/llc_c_ev.c:636: warning: expecting prototype for 
conn_ev_qlfy_last_frame_eq_0(). Prototype was for 
llc_conn_ev_qlfy_last_frame_eq_0() instead

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 net/llc/llc_c_ev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
index 523fdd1cf781..d6627a80cb45 100644
--- a/net/llc/llc_c_ev.c
+++ b/net/llc/llc_c_ev.c
@@ -608,7 +608,7 @@ int llc_conn_ev_qlfy_p_flag_eq_1(struct sock *sk, struct 
sk_buff *skb)
 }
 
 /**
- * conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window
+ * llc_conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window
  * @sk: current connection structure.
  * @skb: current event.
  *
@@ -624,7 +624,7 @@ int llc_conn_ev_qlfy_last_frame_eq_1(struct sock *sk, 
struct sk_buff *skb)
 }
 
 /**
- * conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window
+ * llc_conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx 
window
  * @sk: current connection structure.
  * @skb: current event.
  *
-- 
2.25.1



[PATCH -next 0/3] net: llc: Correct some function names in header

2021-03-26 Thread Yang Yingliang
Fix some make W=1 kernel build warnings in net/llc/

Yang Yingliang (3):
  net: llc: Correct some function names in header
  net: llc: Correct function name llc_sap_action_unitdata_ind() in
header
  net: llc: Correct function name llc_pdu_set_pf_bit() in header

 net/llc/llc_c_ev.c | 4 ++--
 net/llc/llc_pdu.c  | 2 +-
 net/llc/llc_s_ac.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

-- 
2.25.1



[PATCH -next 2/3] net: llc: Correct function name llc_sap_action_unitdata_ind() in header

2021-03-26 Thread Yang Yingliang
Fix the following make W=1 kernel build warning:

  net/llc/llc_s_ac.c:38: warning: expecting prototype for 
llc_sap_action_unit_data_ind(). Prototype was for llc_sap_action_unitdata_ind() 
instead

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 net/llc/llc_s_ac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c
index 7ae4cc684d3a..b554f26c68ee 100644
--- a/net/llc/llc_s_ac.c
+++ b/net/llc/llc_s_ac.c
@@ -27,7 +27,7 @@
 
 
 /**
- * llc_sap_action_unit_data_ind - forward UI PDU to network layer
+ * llc_sap_action_unitdata_ind - forward UI PDU to network layer
  * @sap: SAP
  * @skb: the event to forward
  *
-- 
2.25.1



[PATCH net v3] net: fix memory leak in register_netdevice() on error path

2020-12-01 Thread Yang Yingliang
I got a memleak report when doing fault-inject test:

unreferenced object 0x88810ace9000 (size 1024):
  comm "ip", pid 4622, jiffies 4295457037 (age 43.378s)
  hex dump (first 32 bytes):
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
  backtrace:
[<008abe41>] __kmalloc+0x10f/0x210
[<5d3533a6>] veth_dev_init+0x140/0x310
[<88353c64>] register_netdevice+0x496/0x7a0
[<1324d322>] veth_newlink+0x40b/0x960
[<d0799866>] __rtnl_newlink+0xd8c/0x1360
[<d616040a>] rtnl_newlink+0x6b/0xa0
[<e0a1600d>] rtnetlink_rcv_msg+0x3cc/0x9e0
[<9eeff98b>] netlink_rcv_skb+0x130/0x3a0
[<500f8be1>] netlink_unicast+0x4da/0x700
[<666c03b3>] netlink_sendmsg+0x7fe/0xcb0
[<73b28103>] sock_sendmsg+0x143/0x180
[<ad746a30>] sys_sendmsg+0x677/0x810
[<87dd98e5>] ___sys_sendmsg+0x105/0x180
[<028dd365>] __sys_sendmsg+0xf0/0x1c0
[<a6bfbae6>] do_syscall_64+0x33/0x40
[<e00521b4>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

It seems ifb and loopback may also hit the leak, so I try to fix this in
register_netdevice().

In common case, priv_destructor() will be called in netdev_run_todo()
after calling ndo_uninit() in rollback_registered(), on other error
path in register_netdevice(), ndo_uninit() and priv_destructor() are
called before register_netdevice() return, but in this case,
priv_destructor() will never be called, then it causes memory leak,
so we should call priv_destructor() here.

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
v2 -> v3: In wireguard driver, priv_destructor() will call
free_netdev(), but it is assigned after register_netdevice(),
so it will not lead a double free, drop patch#1. Also I've
test wireguard device, it's no memory leak on this error path.
---
 net/core/dev.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 82dc6b48e45f..51b9cf1ff6a1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10003,6 +10003,16 @@ int register_netdevice(struct net_device *dev)
rcu_barrier();
 
dev->reg_state = NETREG_UNREGISTERED;
+   /* In common case, priv_destructor() will be
+* called in netdev_run_todo() after calling
+* ndo_uninit() in rollback_registered().
+* But in this case, priv_destructor() will
+* never be called, then it causes memory
+* leak, so we should call priv_destructor()
+* here.
+*/
+   if (dev->priv_destructor)
+   dev->priv_destructor(dev);
/* We should put the kobject that hold in
 * netdev_unregister_kobject(), otherwise
 * the net device cannot be freed when
-- 
2.25.1



Re: [PATCH net v2 1/2] wireguard: device: don't call free_netdev() in priv_destructor()

2020-12-01 Thread Yang Yingliang



On 2020/12/1 17:46, Jason A. Donenfeld wrote:

Hi Yang,

On Tue, Dec 1, 2020 at 10:31 AM Yang Yingliang  wrote:

After commit cf124db566e6 ("net: Fix inconsistent teardown and..."),
priv_destruct() doesn't call free_netdev() in driver, we use
dev->needs_free_netdev to indicate whether free_netdev() should be
called on release path.
This patch remove free_netdev() from priv_destructor() and set
dev->needs_free_netdev to true.

For now, nack.

I remember when cf124db566e6 came out and carefully looking at the
construction of device.c in WireGuard. priv_destructor is only
assigned after register_device, with the various error paths in
wg_newlink responsible for cleaning up other earlier failures, and
trying to move to needs_free_netdev would have introduced more
complexity in this particular case, if my memory serves. I do not
think there's a memory leak here, and I worry about too hastily
changing the state machine "just because".

In other words, could you point out how to generate a memory leak? If
you're correct, then we can start dissecting and refactoring this. But
off the bat, I'm not sure I'm exactly seeing whatever you're seeing.


Yes, I missed that priv_destructor is only assigned after 
register_netdevice(),


so, it will not lead a double free in my patch#2, so this patch can be 
dropped and


send v3.



Jason
.


[PATCH net v2 1/2] wireguard: device: don't call free_netdev() in priv_destructor()

2020-12-01 Thread Yang Yingliang
After commit cf124db566e6 ("net: Fix inconsistent teardown and..."),
priv_destruct() doesn't call free_netdev() in driver, we use
dev->needs_free_netdev to indicate whether free_netdev() should be
called on release path.
This patch remove free_netdev() from priv_destructor() and set
dev->needs_free_netdev to true.

Signed-off-by: Yang Yingliang 
---
 drivers/net/wireguard/device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c
index c9f65e96ccb0..578ac6097d7e 100644
--- a/drivers/net/wireguard/device.c
+++ b/drivers/net/wireguard/device.c
@@ -247,7 +247,6 @@ static void wg_destruct(struct net_device *dev)
mutex_unlock(&wg->device_update_lock);
 
pr_debug("%s: Interface destroyed\n", dev->name);
-   free_netdev(dev);
 }
 
 static const struct device_type device_type = { .name = KBUILD_MODNAME };
@@ -360,6 +359,7 @@ static int wg_newlink(struct net *src_net, struct 
net_device *dev,
 * register_netdevice doesn't call it for us if it fails.
 */
dev->priv_destructor = wg_destruct;
+   dev->needs_free_netdev = true;
 
pr_debug("%s: Interface created\n", dev->name);
return ret;
-- 
2.25.1



[PATCH net v2 2/2] net: fix memory leak in register_netdevice() on error path

2020-12-01 Thread Yang Yingliang
I got a memleak report when doing fault-inject test:

unreferenced object 0x88810ace9000 (size 1024):
  comm "ip", pid 4622, jiffies 4295457037 (age 43.378s)
  hex dump (first 32 bytes):
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
  backtrace:
[<008abe41>] __kmalloc+0x10f/0x210
[<5d3533a6>] veth_dev_init+0x140/0x310
[<88353c64>] register_netdevice+0x496/0x7a0
[<1324d322>] veth_newlink+0x40b/0x960
[<d0799866>] __rtnl_newlink+0xd8c/0x1360
[<d616040a>] rtnl_newlink+0x6b/0xa0
[<e0a1600d>] rtnetlink_rcv_msg+0x3cc/0x9e0
[<9eeff98b>] netlink_rcv_skb+0x130/0x3a0
[<500f8be1>] netlink_unicast+0x4da/0x700
[<666c03b3>] netlink_sendmsg+0x7fe/0xcb0
[<73b28103>] sock_sendmsg+0x143/0x180
[<ad746a30>] sys_sendmsg+0x677/0x810
[<87dd98e5>] ___sys_sendmsg+0x105/0x180
[<028dd365>] __sys_sendmsg+0xf0/0x1c0
[<a6bfbae6>] do_syscall_64+0x33/0x40
[<e00521b4>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

It seems ifb and loopback may also hit the leak, so I try to fix this in
register_netdevice().

In common case, priv_destructor() will be called in netdev_run_todo()
after calling ndo_uninit() in rollback_registered(), on other error
path in register_netdevice(), ndo_uninit() and priv_destructor() are
called before register_netdevice() return, but in this case,
priv_destructor() will never be called, then it causes memory leak,
so we should call priv_destructor() here.

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 net/core/dev.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 82dc6b48e45f..51b9cf1ff6a1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10003,6 +10003,16 @@ int register_netdevice(struct net_device *dev)
rcu_barrier();
 
dev->reg_state = NETREG_UNREGISTERED;
+   /* In common case, priv_destructor() will be
+* called in netdev_run_todo() after calling
+* ndo_uninit() in rollback_registered().
+* But in this case, priv_destructor() will
+* never be called, then it causes memory
+* leak, so we should call priv_destructor()
+* here.
+*/
+   if (dev->priv_destructor)
+   dev->priv_destructor(dev);
/* We should put the kobject that hold in
 * netdev_unregister_kobject(), otherwise
 * the net device cannot be freed when
-- 
2.25.1



Re: [PATCH net] net: fix memory leak in register_netdevice() on error path

2020-11-30 Thread Yang Yingliang



On 2020/11/29 21:56, Toshiaki Makita wrote:

On 2020/11/26 22:23, Yang Yingliang wrote:
...

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
  net/core/dev.c | 11 +++
  1 file changed, 11 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 82dc6b48e45f..907204395b64 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1,6 +1,17 @@ int register_netdevice(struct net_device *dev)
  ret = notifier_to_errno(ret);
  if (ret) {
  rollback_registered(dev);
+    /*
+ * In common case, priv_destructor() will be


As per netdev-faq, the comment style should be

/* foobar blah blah blah
 * another line of text
 */

rather than

/*
 * foobar blah blah blah
 * another line of text
 */


+ * called in netdev_run_todo() after calling
+ * ndo_uninit() in rollback_registered().
+ * But in this case, priv_destructor() will
+ * never be called, then it causes memory
+ * leak, so we should call priv_destructor()
+ * here.
+ */
+    if (dev->priv_destructor)
+    dev->priv_destructor(dev);


To be in line with netdev_run_todo(), I think priv_destructor() should be
called after "dev->reg_state = NETREG_UNREGISTERED".

OK,  I will send a v2 later.


Toshiaki Makita


  rcu_barrier();
    dev->reg_state = NETREG_UNREGISTERED;



Re: [PATCH net] net: fix memory leak in register_netdevice() on error path

2020-11-30 Thread Yang Yingliang



On 2020/11/30 12:39, Stephen Hemminger wrote:

On Thu, 26 Nov 2020 21:23:12 +0800
Yang Yingliang  wrote:


I got a memleak report when doing fault-inject test:

unreferenced object 0x88810ace9000 (size 1024):
   comm "ip", pid 4622, jiffies 4295457037 (age 43.378s)
   hex dump (first 32 bytes):
 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
   backtrace:
 [<008abe41>] __kmalloc+0x10f/0x210
 [<5d3533a6>] veth_dev_init+0x140/0x310
 [<88353c64>] register_netdevice+0x496/0x7a0
 [<1324d322>] veth_newlink+0x40b/0x960
 [<d0799866>] __rtnl_newlink+0xd8c/0x1360
 [<d616040a>] rtnl_newlink+0x6b/0xa0
 [<e0a1600d>] rtnetlink_rcv_msg+0x3cc/0x9e0
 [<9eeff98b>] netlink_rcv_skb+0x130/0x3a0
 [<500f8be1>] netlink_unicast+0x4da/0x700
 [<666c03b3>] netlink_sendmsg+0x7fe/0xcb0
 [<73b28103>] sock_sendmsg+0x143/0x180
 [<ad746a30>] sys_sendmsg+0x677/0x810
 [<87dd98e5>] ___sys_sendmsg+0x105/0x180
 [<028dd365>] __sys_sendmsg+0xf0/0x1c0
 [<a6bfbae6>] do_syscall_64+0x33/0x40
 [<e00521b4>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

It seems ifb and loopback may also hit the leak, so I try to fix this in
register_netdevice().

In common case, priv_destructor() will be called in netdev_run_todo()
after calling ndo_uninit() in rollback_registered(), on other error
path in register_netdevice(), ndo_uninit() and priv_destructor() are
called before register_netdevice() return, but in this case,
priv_destructor() will never be called, then it causes memory leak,
so we should call priv_destructor() here.

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
  net/core/dev.c | 11 +++
  1 file changed, 11 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 82dc6b48e45f..907204395b64 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1,6 +1,17 @@ int register_netdevice(struct net_device *dev)
ret = notifier_to_errno(ret);
if (ret) {
rollback_registered(dev);
+   /*
+* In common case, priv_destructor() will be
+* called in netdev_run_todo() after calling
+* ndo_uninit() in rollback_registered().
+* But in this case, priv_destructor() will
+* never be called, then it causes memory
+* leak, so we should call priv_destructor()
+* here.
+*/
+   if (dev->priv_destructor)
+   dev->priv_destructor(dev);

Are you sure this is safe?
Several devices have destructors that call free_netdev.
Up until now a common pattern for those devices was to call
free_netdev on error. After this change it would lead to double free.


After commit cf124db566e6 ("net: Fix inconsistent teardown and release 
of private netdev state."),


free_netdev() is not be called in priv_destructor().



.


[PATCH net] net: fix memory leak in register_netdevice() on error path

2020-11-26 Thread Yang Yingliang
I got a memleak report when doing fault-inject test:

unreferenced object 0x88810ace9000 (size 1024):
  comm "ip", pid 4622, jiffies 4295457037 (age 43.378s)
  hex dump (first 32 bytes):
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
  backtrace:
[<008abe41>] __kmalloc+0x10f/0x210
[<5d3533a6>] veth_dev_init+0x140/0x310
[<88353c64>] register_netdevice+0x496/0x7a0
[<1324d322>] veth_newlink+0x40b/0x960
[<d0799866>] __rtnl_newlink+0xd8c/0x1360
[<d616040a>] rtnl_newlink+0x6b/0xa0
[<e0a1600d>] rtnetlink_rcv_msg+0x3cc/0x9e0
[<9eeff98b>] netlink_rcv_skb+0x130/0x3a0
[<500f8be1>] netlink_unicast+0x4da/0x700
[<666c03b3>] netlink_sendmsg+0x7fe/0xcb0
[<73b28103>] sock_sendmsg+0x143/0x180
[<ad746a30>] sys_sendmsg+0x677/0x810
[<87dd98e5>] ___sys_sendmsg+0x105/0x180
[<028dd365>] __sys_sendmsg+0xf0/0x1c0
[<a6bfbae6>] do_syscall_64+0x33/0x40
[<e00521b4>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

It seems ifb and loopback may also hit the leak, so I try to fix this in
register_netdevice().

In common case, priv_destructor() will be called in netdev_run_todo()
after calling ndo_uninit() in rollback_registered(), on other error
path in register_netdevice(), ndo_uninit() and priv_destructor() are
called before register_netdevice() return, but in this case,
priv_destructor() will never be called, then it causes memory leak,
so we should call priv_destructor() here.

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 net/core/dev.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 82dc6b48e45f..907204395b64 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1,6 +1,17 @@ int register_netdevice(struct net_device *dev)
ret = notifier_to_errno(ret);
if (ret) {
rollback_registered(dev);
+   /*
+* In common case, priv_destructor() will be
+* called in netdev_run_todo() after calling
+* ndo_uninit() in rollback_registered().
+* But in this case, priv_destructor() will
+* never be called, then it causes memory
+* leak, so we should call priv_destructor()
+* here.
+*/
+   if (dev->priv_destructor)
+   dev->priv_destructor(dev);
rcu_barrier();
 
dev->reg_state = NETREG_UNREGISTERED;
-- 
2.25.1



[PATCH] veth: fix memleak in veth_newlink()

2020-11-20 Thread Yang Yingliang
I got a memleak report when doing fault-inject test:

unreferenced object 0x88810ace9000 (size 1024):
  comm "ip", pid 4622, jiffies 4295457037 (age 43.378s)
  hex dump (first 32 bytes):
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
  backtrace:
[<008abe41>] __kmalloc+0x10f/0x210
[<5d3533a6>] veth_dev_init+0x140/0x310
[<88353c64>] register_netdevice+0x496/0x7a0
[<1324d322>] veth_newlink+0x40b/0x960
[<d0799866>] __rtnl_newlink+0xd8c/0x1360
[<d616040a>] rtnl_newlink+0x6b/0xa0
[<e0a1600d>] rtnetlink_rcv_msg+0x3cc/0x9e0
[<9eeff98b>] netlink_rcv_skb+0x130/0x3a0
[<500f8be1>] netlink_unicast+0x4da/0x700
[<666c03b3>] netlink_sendmsg+0x7fe/0xcb0
[<73b28103>] sock_sendmsg+0x143/0x180
[<ad746a30>] sys_sendmsg+0x677/0x810
[<87dd98e5>] ___sys_sendmsg+0x105/0x180
[<028dd365>] __sys_sendmsg+0xf0/0x1c0
[<a6bfbae6>] do_syscall_64+0x33/0x40
[<e00521b4>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

If call_netdevice_notifiers() failed in register_netdevice(),
dev->priv_destructor() is not called, it will cause memleak.
Fix this by assigning ndo_uninit with veth_dev_free(), so
the memory can be freed in rollback_registered();

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 drivers/net/veth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 8c737668008a..537d9a60028a 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1217,6 +1217,7 @@ static int veth_xdp(struct net_device *dev, struct 
netdev_bpf *xdp)
 
 static const struct net_device_ops veth_netdev_ops = {
.ndo_init= veth_dev_init,
+   .ndo_uninit  = veth_dev_free,
.ndo_open= veth_open,
.ndo_stop= veth_close,
.ndo_start_xmit  = veth_xmit,
@@ -1260,7 +1261,6 @@ static void veth_setup(struct net_device *dev)
   NETIF_F_HW_VLAN_CTAG_RX |
   NETIF_F_HW_VLAN_STAG_RX);
dev->needs_free_netdev = true;
-   dev->priv_destructor = veth_dev_free;
dev->max_mtu = ETH_MAX_MTU;
 
dev->hw_features = VETH_FEATURES;
-- 
2.17.1



[PATCH net-next] netlink: add spaces around '&' in netlink_recv/sendmsg()

2020-09-16 Thread Yang Yingliang
It's hard to read the code without spaces around '&',
for better reading, add spaces around '&'.

Signed-off-by: Yang Yingliang 
---
 net/netlink/af_netlink.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index d2d1448274f5..57a6318ec940 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1853,7 +1853,7 @@ static int netlink_sendmsg(struct socket *sock, struct 
msghdr *msg, size_t len)
struct scm_cookie scm;
u32 netlink_skb_flags = 0;
 
-   if (msg->msg_flags&MSG_OOB)
+   if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
 
err = scm_send(sock, msg, &scm, true);
@@ -1916,7 +1916,7 @@ static int netlink_sendmsg(struct socket *sock, struct 
msghdr *msg, size_t len)
refcount_inc(&skb->users);
netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
}
-   err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT);
+   err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & 
MSG_DONTWAIT);
 
 out:
scm_destroy(&scm);
@@ -1929,12 +1929,12 @@ static int netlink_recvmsg(struct socket *sock, struct 
msghdr *msg, size_t len,
struct scm_cookie scm;
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
-   int noblock = flags&MSG_DONTWAIT;
+   int noblock = flags & MSG_DONTWAIT;
size_t copied;
struct sk_buff *skb, *data_skb;
int err, ret;
 
-   if (flags&MSG_OOB)
+   if (flags & MSG_OOB)
return -EOPNOTSUPP;
 
copied = 0;
-- 
2.25.1



[PATCH net-next] netlink: add spaces around '&' in netlink_recvmsg()

2020-09-07 Thread Yang Yingliang
Spaces preferred around '&'.

Signed-off-by: Yang Yingliang 
---
 net/netlink/af_netlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index d2d1448274f5..5a86bf4f80b1 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1929,12 +1929,12 @@ static int netlink_recvmsg(struct socket *sock, struct 
msghdr *msg, size_t len,
struct scm_cookie scm;
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
-   int noblock = flags&MSG_DONTWAIT;
+   int noblock = flags & MSG_DONTWAIT;
size_t copied;
struct sk_buff *skb, *data_skb;
int err, ret;
 
-   if (flags&MSG_OOB)
+   if (flags & MSG_OOB)
return -EOPNOTSUPP;
 
copied = 0;
-- 
2.25.1



[Question] Oops when using connector in linux-4.19

2020-09-05 Thread Yang Yingliang
Hi,

I got some crashes when using connector module in linux-4.19:

log1:

[10385482.776385] Unable to handle kernel paging request at virtual address 
0003004c
[10385482.777083] Mem abort info:
[10385482.777340]   ESR = 0x9604
[10385482.777578]   Exception class = DABT (current EL), IL = 32 bits
[10385482.778034]   SET = 0, FnV = 0
[10385482.778282]   EA = 0, S1PTW = 0
[10385482.778531] Data abort info:
[10385482.778851]   ISV = 0, ISS = 0x0004
[10385482.779162]   CM = 0, WnR = 0
[10385482.779419] user pgtable: 4k pages, 48-bit VAs, pgdp = c16a0f04
[10385482.779930] [0003004c] pgd=
[10385482.780318] Internal error: Oops: 9604 [#1] SMP
[10385482.780690] Process sudo (pid: 60096, stack limit = 0x66055412)
[10385482.781225] CPU: 0 PID: 60096 Comm: sudo Kdump: loaded Tainted: G 
  OE K   4.19.36 #1
[10385482.782098] Hardware name: OpenStack Foundation OpenStack Nova, BIOS 
0.0.0 02/06/2015
[10385482.782732] pstate: 6045 (nZCv daif +PAN -UAO)
[10385482.783192] pc : __kmalloc_node_track_caller+0x214/0x348
[10385482.783692] lr : __kmalloc_node_track_caller+0x6c/0x348
[10385482.784194] sp : 1802bbb0
[10385482.784515] x29: 1802bbb0 x28: 800b5329
[10385482.784999] x27:  x26: 003c
[10385482.785487] x25: 800ffc3a7800 x24: 800ffc3a7800
[10385482.785970] x23: 08864af0 x22: 
[10385482.786450] x21: 01c0 x20: 00410200
[10385482.786995] x19: 0003004c x18: 
[10385482.787486] x17:  x16: 
[10385482.787983] x15:  x14: 
[10385482.788478] x13:  x12: 
[10385482.788979] x11:  x10: 800fb38ac800
[10385482.789441] x9 :  x8 : 
[10385482.789947] x7 : 8011b2130380 x6 : 801eeb758500
[10385482.790456] x5 : 089bccdf x4 : 800fff8d2620
[10385482.790997] x3 : 08864af0 x2 : 800ff69ce000
[10385482.791509] x1 : 800ff69ce000 x0 : 
[10385482.792029] Call trace:
[10385482.792265]  __kmalloc_node_track_caller+0x214/0x348
[10385482.792726]  __kmalloc_reserve.isra.9+0x54/0xb0
[10385482.793158]  __alloc_skb+0x90/0x1b0
[10385482.793500]  cn_netlink_send_mult+0x148/0x268
[10385482.793909]  cn_netlink_send+0x40/0x50
[10385482.794261]  proc_id_connector+0x130/0x170
[10385482.794644]  commit_creds+0x10c/0x2c0
[10385482.795047]  __sys_setresuid+0x1f0/0x228
[10385482.795410]  __arm64_sys_setresuid+0x28/0x38
[10385482.795816]  el0_svc_common+0x78/0x130
[10385482.796174]  el0_svc_handler+0x38/0x78
[10385482.796537]  el0_svc+0x8/0xc


log2:

[22635000.696206] Unable to handle kernel paging request at virtual address 
0003004c
[22635000.704370] Mem abort info:
[22635000.707417]   ESR = 0x9604
[22635000.710723]   Exception class = DABT (current EL), IL = 32 bits
[22635000.716878]   SET = 0, FnV = 0
[22635000.720183]   EA = 0, S1PTW = 0
[22635000.723575] Data abort info:
[22635000.726709]   ISV = 0, ISS = 0x0004
[22635000.730792]   CM = 0, WnR = 0
[22635000.734011] user pgtable: 4k pages, 48-bit VAs, pgdp = cfe46aff
[22635000.740859] [0003004c] pgd=
[22635000.745980] Internal error: Oops: 9604 [#1] SMP
[22635000.751094] Process thread-pool-6 (pid: 44316, stack limit = 
0x378f8b35)
[22635000.758717] CPU: 49 PID: 44316 Comm: thread-pool-6 Kdump: loaded Tainted: 
G   OE K   4.19.36 #1
[22635000.771695] Hardware name: Huawei TaiShan 2280 V2/BC82AMDD, BIOS 1.05 
09/18/2019
[22635000.779314] pstate: 6049 (nZCv daif +PAN -UAO)
[22635000.784348] pc : __kmalloc_node_track_caller+0x214/0x348
[22635000.789894] lr : __kmalloc_node_track_caller+0x6c/0x348
[22635000.795353] sp : 259a3800
[22635000.798912] x29: 259a3800 x28: 
[22635000.804459] x27: 0022 x26: 0022
[22635000.810007] x25: 803f7f40f800 x24: 803f7f40f800
[22635000.81] x23: 08864af0 x22: 
[22635000.821103] x21: 01c0 x20: 006102c0
[22635000.826650] x19: 0003004c x18: 
[22635000.832198] x17:  x16: 
[22635000.837746] x15:  x14: 
[22635000.843293] x13:  x12: 
[22635000.848839] x11: 0001 x10: 
[22635000.854385] x9 : 0001 x8 : 
[22635000.859932] x7 : 0022 x6 : 805f0ef09a00
[22635000.865478] x5 : 0001d01b24b0 x4 : 805f7fa8c620
[22635000.871017] x3 : 08864af0 x2 : 805f76b88000
[22635000.876563] x1 : 805f76b88000 x0 : 
[22635000.882110] Call trace:
[22635000.884805]  __kmalloc_node_track_caller+0x214/0x348
[22635000.890009]  __kmalloc_reserve.isra.9+0x54/0xb0
[22635000.894778]  __alloc_skb+0x90/0x1b0
[22635000.898513]  __ip_app

[PATCH net] net: fix memleak in register_netdevice()

2020-06-15 Thread Yang Yingliang
I got a memleak report when doing some fuzz test:

unreferenced object 0x888112584000 (size 13599):
  comm "ip", pid 3048, jiffies 4294911734 (age 343.491s)
  hex dump (first 32 bytes):
74 61 70 30 00 00 00 00 00 00 00 00 00 00 00 00  tap0
00 ee d9 19 81 88 ff ff 00 00 00 00 00 00 00 00  
  backtrace:
[<2f60ba65>] __kmalloc_node+0x309/0x3a0
[<75b211ec>] kvmalloc_node+0x7f/0xc0
[<d3a97396>] alloc_netdev_mqs+0x76/0xfc0
[<609c3655>] __tun_chr_ioctl+0x1456/0x3d70
[<1127ca24>] ksys_ioctl+0xe5/0x130
[<b7d5e66a>] __x64_sys_ioctl+0x6f/0xb0
[<e1023498>] do_syscall_64+0x56/0xa0
[<9ec0eb12>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
unreferenced object 0x888111845cc0 (size 8):
  comm "ip", pid 3048, jiffies 4294911734 (age 343.491s)
  hex dump (first 8 bytes):
74 61 70 30 00 88 ff ff  tap0
  backtrace:
[<4c159777>] kstrdup+0x35/0x70
[<d8b496ad>] kstrdup_const+0x3d/0x50
[<494e884a>] kvasprintf_const+0xf1/0x180
[<97880a2b>] kobject_set_name_vargs+0x56/0x140
[<8fbdfc7b>] dev_set_name+0xab/0xe0
[<5b99e3b4>] netdev_register_kobject+0xc0/0x390
[<602704fe>] register_netdevice+0xb61/0x1250
[<2b7ca244>] __tun_chr_ioctl+0x1cd1/0x3d70
[<1127ca24>] ksys_ioctl+0xe5/0x130
[<b7d5e66a>] __x64_sys_ioctl+0x6f/0xb0
[<e1023498>] do_syscall_64+0x56/0xa0
[<9ec0eb12>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
unreferenced object 0x88811886d800 (size 512):
  comm "ip", pid 3048, jiffies 4294911734 (age 343.491s)
  hex dump (first 32 bytes):
00 00 00 00 ad 4e ad de ff ff ff ff 00 00 00 00  .N..
ff ff ff ff ff ff ff ff c0 66 3d a3 ff ff ff ff  .f=.
  backtrace:
[<50315800>] device_add+0x61e/0x1950
[<21008dfb>] netdev_register_kobject+0x17e/0x390
[<602704fe>] register_netdevice+0xb61/0x1250
[<2b7ca244>] __tun_chr_ioctl+0x1cd1/0x3d70
[<1127ca24>] ksys_ioctl+0xe5/0x130
[<b7d5e66a>] __x64_sys_ioctl+0x6f/0xb0
[<e1023498>] do_syscall_64+0x56/0xa0
[<9ec0eb12>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

If call_netdevice_notifiers() failed, then rollback_registered()
calls netdev_unregister_kobject() which holds the kobject. The
reference cannot be put because the netdev won't be add to todo
list, so it will leads a memleak, we need put the reference to
avoid memleak.

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 net/core/dev.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index a26d87073f71..a44871b39c6d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8760,6 +8760,13 @@ int register_netdevice(struct net_device *dev)
rcu_barrier();
 
dev->reg_state = NETREG_UNREGISTERED;
+   /* We should put the kobject that hold in
+* netdev_unregister_kobject(), otherwise
+* the net device cannot be freed when
+* driver calls free_netdev(), because the
+* kobject is being hold.
+*/
+   kobject_put(&dev->dev.kobj);
}
/*
 *  Prevent userspace races by waiting until the network
-- 
2.25.1



[PATCH net] devinet: fix memleak in inetdev_init()

2020-05-29 Thread Yang Yingliang
When devinet_sysctl_register() failed, the memory allocated
in neigh_parms_alloc() should be freed.

Fixes: 20e61da7ffcf ("ipv4: fail early when creating netdev named all or 
default")
Signed-off-by: Yang Yingliang 
---
 net/ipv4/devinet.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c0dd561..5267b6b 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -276,6 +276,7 @@ static struct in_device *inetdev_init(struct net_device 
*dev)
err = devinet_sysctl_register(in_dev);
if (err) {
in_dev->dead = 1;
+   neigh_parms_release(&arp_tbl, in_dev->arp_parms);
in_dev_put(in_dev);
in_dev = NULL;
goto out;
-- 
1.8.3



Re: cgroup pointed by sock is leaked on mode switch

2020-05-08 Thread Yang Yingliang



On 2020/5/6 15:51, Zefan Li wrote:

On 2020/5/6 10:16, Zefan Li wrote:

On 2020/5/6 9:50, Yang Yingliang wrotee:

+cc lize...@huawei.com

On 2020/5/6 0:06, Tejun Heo wrote:

Hello, Yang.

On Sat, May 02, 2020 at 06:27:21PM +0800, Yang Yingliang wrote:

I find the number nr_dying_descendants is increasing:
linux-dVpNUK:~ # find /sys/fs/cgroup/ -name cgroup.stat -exec grep
'^nr_dying_descendants [^0]'  {} +
/sys/fs/cgroup/unified/cgroup.stat:nr_dying_descendants 80
/sys/fs/cgroup/unified/system.slice/cgroup.stat:nr_dying_descendants 
1
/sys/fs/cgroup/unified/system.slice/system-hostos.slice/cgroup.stat:nr_dying_descendants 


1
/sys/fs/cgroup/unified/lxc/cgroup.stat:nr_dying_descendants 79
/sys/fs/cgroup/unified/lxc/5f1fdb8c54fa40c3e599613dab6e4815058b76ebada8a27bc1fe80c0d4801764/cgroup.stat:nr_dying_descendants 


78
/sys/fs/cgroup/unified/lxc/5f1fdb8c54fa40c3e599613dab6e4815058b76ebada8a27bc1fe80c0d4801764/system.slice/cgroup.stat:nr_dying_descendants 


78

Those numbers are nowhere close to causing oom issues. There are some
aspects of page and other cache draining which is being improved 
but unless
you're seeing numbers multiple orders of magnitude higher, this 
isn't the

source of your problem.

The situation is as same as the commit bd1060a1d671 ("sock, 
cgroup: add

sock->sk_cgroup") describes.
"On mode switch, cgroup references which are already being pointed 
to by

socks may be leaked."
I'm doubtful that you're hitting that issue. Mode switching means 
memcg
being switched between cgroup1 and cgroup2 hierarchies, which is 
unlikely to

be what's happening when you're launching docker containers.

The first step would be identifying where memory is going and 
finding out
whether memcg is actually being switched between cgroup1 and 2 - 
look at the

hierarchy number in /proc/cgroups, if that's switching between 0 and
someting not zero, it is switching.



I think there's a bug here which can lead to unlimited memory leak.
This should reproduce the bug:

    # mount -t cgroup -o netprio xxx /cgroup/netprio
    # mkdir /cgroup/netprio/xxx
    # echo PID > /cgroup/netprio/xxx/tasks
    /* this PID process starts to do some network thing and then 
exits */

    # rmdir /cgroup/netprio/xxx
    /* now this cgroup will never be freed */



Correction (still not tested):

    # mount -t cgroup2 none /cgroup/v2
    # mkdir /cgroup/v2/xxx
    # echo PID > /cgroup/v2/xxx/cgroup.procs
    /* this PID process starts to do some network thing */

    # mount -t cgroup -o netprio xxx /cgroup/netprio
    # mkdir /cgroup/netprio/xxx
    # echo PID > /cgroup/netprio/xxx/tasks
    ...
    /* the PID process exits */

    rmdir /cgroup/netprio/xxx
    rmdir /cgroup/v2/xxx
    /* now looks like this v2 cgroup will never be freed */


Look at the code:

static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd)
{
 ...
 sock_cgroup_set_prioidx(skcd, task_netprioidx(current));
}

static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data 
*skcd,

 u16 prioidx)
{
 ...
 if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
 return ;
 ...
 skcd_buf.prioidx = prioidx;
 WRITE_ONCE(skcd->val, skcd_buf.val);
}

task_netprioidx() will be the cgrp id of xxx which is not 1, but
sock_cgroup_prioidx(&skcd_buf) is 1 because it thought it's in v2 mode.
Now we have a memory leak.

I think the eastest fix is to do the mode switch here:

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index b905747..2397866 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -240,6 +240,8 @@ static void net_prio_attach(struct cgroup_taskset 
*tset)

 struct task_struct *p;
 struct cgroup_subsys_state *css;

+   cgroup_sk_alloc_disable();
+
 cgroup_taskset_for_each(p, css, tset) {
 void *v = (void *)(unsigned long)css->cgroup->id;



I've do some test with this change, here is the test result:


Without this change, nr_dying_descendants is increased and the cgroup is 
leaked:


linux-dVpNUK:~ # mount | grep cgroup
tmpfs on /sys/fs/cgroup type tmpfs (ro,nosuid,nodev,noexec,mode=755)
cgroup2 on /sys/fs/cgroup/unified type cgroup2 
(rw,nosuid,nodev,noexec,relatime,nsdelegate)
cgroup on /sys/fs/cgroup/systemd type cgroup 
(rw,nosuid,nodev,noexec,relatime,xattr,name=systemd)
cgroup on /sys/fs/cgroup/blkio type cgroup 
(rw,nosuid,nodev,noexec,relatime,blkio)
cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup 
(rw,nosuid,nodev,noexec,relatime,cpu,cpuacct)
cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup 
(rw,nosuid,nodev,noexec,relatime,net_cls,net_prio)
cgroup on /sys/fs/cgroup/freezer type cgroup 
(rw,nosuid,nodev,noexec,relatime,freezer)
cgroup on /sys/fs/cgroup/devices type cgroup 
(rw,nosuid,nodev,noexec,relatime,devices)
cgroup on /sys/fs/cgroup/memory type cgroup 
(rw,nosuid,nodev,noexec,

[PATCH net-next] ieee802154: 6lowpan: remove unnecessary comparison

2020-05-07 Thread Yang Yingliang
The type of dispatch is u8 which is always '<=' 0xff, so the
dispatch <= 0xff is always true, we can remove this comparison.

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 net/ieee802154/6lowpan/rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c
index ee17938..b34d050 100644
--- a/net/ieee802154/6lowpan/rx.c
+++ b/net/ieee802154/6lowpan/rx.c
@@ -240,7 +240,7 @@ static inline bool lowpan_is_reserved(u8 dispatch)
return ((dispatch >= 0x44 && dispatch <= 0x4F) ||
(dispatch >= 0x51 && dispatch <= 0x5F) ||
(dispatch >= 0xc8 && dispatch <= 0xdf) ||
-   (dispatch >= 0xe8 && dispatch <= 0xff));
+   dispatch >= 0xe8);
 }
 
 /* lowpan_rx_h_check checks on generic 6LoWPAN requirements
-- 
1.8.3



Re: cgroup pointed by sock is leaked on mode switch

2020-05-05 Thread Yang Yingliang

+cc lize...@huawei.com

On 2020/5/6 0:06, Tejun Heo wrote:

Hello, Yang.

On Sat, May 02, 2020 at 06:27:21PM +0800, Yang Yingliang wrote:

I find the number nr_dying_descendants is increasing:
linux-dVpNUK:~ # find /sys/fs/cgroup/ -name cgroup.stat -exec grep
'^nr_dying_descendants [^0]'  {} +
/sys/fs/cgroup/unified/cgroup.stat:nr_dying_descendants 80
/sys/fs/cgroup/unified/system.slice/cgroup.stat:nr_dying_descendants 1
/sys/fs/cgroup/unified/system.slice/system-hostos.slice/cgroup.stat:nr_dying_descendants
1
/sys/fs/cgroup/unified/lxc/cgroup.stat:nr_dying_descendants 79
/sys/fs/cgroup/unified/lxc/5f1fdb8c54fa40c3e599613dab6e4815058b76ebada8a27bc1fe80c0d4801764/cgroup.stat:nr_dying_descendants
78
/sys/fs/cgroup/unified/lxc/5f1fdb8c54fa40c3e599613dab6e4815058b76ebada8a27bc1fe80c0d4801764/system.slice/cgroup.stat:nr_dying_descendants
78

Those numbers are nowhere close to causing oom issues. There are some
aspects of page and other cache draining which is being improved but unless
you're seeing numbers multiple orders of magnitude higher, this isn't the
source of your problem.


The situation is as same as the commit bd1060a1d671 ("sock, cgroup: add
sock->sk_cgroup") describes.
"On mode switch, cgroup references which are already being pointed to by
socks may be leaked."

I'm doubtful that you're hitting that issue. Mode switching means memcg
being switched between cgroup1 and cgroup2 hierarchies, which is unlikely to
be what's happening when you're launching docker containers.

The first step would be identifying where memory is going and finding out
whether memcg is actually being switched between cgroup1 and 2 - look at the
hierarchy number in /proc/cgroups, if that's switching between 0 and
someting not zero, it is switching.

Thanks.





cgroup pointed by sock is leaked on mode switch

2020-05-02 Thread Yang Yingliang

Hi,

I got an oom panic because cgroup is leaked.

Here is the steps :
  - run a docker with --cap-add sys_admin parameter and the systemd 
process in the docker uses both cgroupv1 and cgroupv2

  - ssh/exit from host to docker repeately

I find the number nr_dying_descendants is increasing:
linux-dVpNUK:~ # find /sys/fs/cgroup/ -name cgroup.stat -exec grep 
'^nr_dying_descendants [^0]'  {} +

/sys/fs/cgroup/unified/cgroup.stat:nr_dying_descendants 80
/sys/fs/cgroup/unified/system.slice/cgroup.stat:nr_dying_descendants 1
/sys/fs/cgroup/unified/system.slice/system-hostos.slice/cgroup.stat:nr_dying_descendants 
1

/sys/fs/cgroup/unified/lxc/cgroup.stat:nr_dying_descendants 79
/sys/fs/cgroup/unified/lxc/5f1fdb8c54fa40c3e599613dab6e4815058b76ebada8a27bc1fe80c0d4801764/cgroup.stat:nr_dying_descendants 
78
/sys/fs/cgroup/unified/lxc/5f1fdb8c54fa40c3e599613dab6e4815058b76ebada8a27bc1fe80c0d4801764/system.slice/cgroup.stat:nr_dying_descendants 
78



The situation is as same as the commit bd1060a1d671 ("sock, cgroup: add 
sock->sk_cgroup") describes.
"On mode switch, cgroup references which are already being pointed to by 
socks may be leaked."


Do we have a fix for this leak now ?

Or how  about fix this by record the cgrp2 pointer, then put it when sk 
is freeing like this:


diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index d9bd671105e2..cbb1e76ea305 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -770,6 +770,7 @@ struct sock_cgroup_data {
 #endif
 u64    val;
 };
+    struct cgroup *cgrpv2;
 };

 /*
@@ -802,6 +803,7 @@ static inline void sock_cgroup_set_prioidx(struct 
sock_cgroup_data *skcd,

 return;

 if (!(skcd_buf.is_data & 1)) {
+    WRITE_ONCE(skcd->cgrpv2, skcd_buf.val);
 skcd_buf.val = 0;
 skcd_buf.is_data = 1;
 }
@@ -819,6 +821,7 @@ static inline void sock_cgroup_set_classid(struct 
sock_cgroup_data *skcd,

 return;

 if (!(skcd_buf.is_data & 1)) {
+    WRITE_ONCE(skcd->cgrpv2, skcd_buf.val);
 skcd_buf.val = 0;
 skcd_buf.is_data = 1;
 }
diff --git a/net/core/sock.c b/net/core/sock.c
index a0dda2bf9d7c..7c761ef2d32e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1520,6 +1520,10 @@ static void sk_prot_free(struct proto *prot, 
struct sock *sk)

 slab = prot->slab;

 cgroup_sk_free(&sk->sk_cgrp_data);
+    if (sk->sk_cgrp_data.cgrpv2) {
+    cgroup_put(sk->sk_cgrp_data.cgrpv2);
+    sk->sk_cgrp_data.cgrpv2 = NULL;
+    }
 mem_cgroup_sk_free(sk);
 security_sk_free(sk);
 if (slab != NULL)


Thanks,
Yang



[PATCH v4] tun: fix use-after-free when register netdev failed

2019-09-10 Thread Yang Yingliang
  tun_get()
tun_do_read()
  tun_ring_recv()
register_netdevice() <-- inject error
goto err_detach
tun_detach_all() <-- set RCV_SHUTDOWN
free_netdev() <-- called from
 err_free_dev path
  netdev_freemem() <-- free the memory
without check refcount
  (In this path, the refcount cannot prevent
   freeing the memory of dev, and the memory
   will be used by dev_put() called by
   tun_chr_read_iter() on CPUB.)
 (Break from 
tun_ring_recv(),
 because RCV_SHUTDOWN is 
set)
   tun_put()
 dev_put() <-- use the 
memory
   freed by 
netdev_freemem()

Put the publishing of tfile->tun after register_netdevice(),
so tun_get() won't get the tun pointer that freed by
err_detach path if register_netdevice() failed.

Fixes: eb0fb363f920 ("tuntap: attach queue 0 before registering netdevice")
Reported-by: Hulk Robot 
Suggested-by: Jason Wang 
Signed-off-by: Yang Yingliang 
---
 drivers/net/tun.c | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..aab0be40d443 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -787,7 +787,8 @@ static void tun_detach_all(struct net_device *dev)
 }
 
 static int tun_attach(struct tun_struct *tun, struct file *file,
- bool skip_filter, bool napi, bool napi_frags)
+ bool skip_filter, bool napi, bool napi_frags,
+ bool publish_tun)
 {
struct tun_file *tfile = file->private_data;
struct net_device *dev = tun->dev;
@@ -870,7 +871,8 @@ static int tun_attach(struct tun_struct *tun, struct file 
*file,
 * initialized tfile; otherwise we risk using half-initialized
 * object.
 */
-   rcu_assign_pointer(tfile->tun, tun);
+   if (publish_tun)
+   rcu_assign_pointer(tfile->tun, tun);
rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
tun->numqueues++;
tun_set_real_num_queues(tun);
@@ -2730,7 +2732,7 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
 
err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
 ifr->ifr_flags & IFF_NAPI,
-ifr->ifr_flags & IFF_NAPI_FRAGS);
+ifr->ifr_flags & IFF_NAPI_FRAGS, true);
if (err < 0)
return err;
 
@@ -2829,13 +2831,17 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
 
INIT_LIST_HEAD(&tun->disabled);
err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
-ifr->ifr_flags & IFF_NAPI_FRAGS);
+ifr->ifr_flags & IFF_NAPI_FRAGS, false);
if (err < 0)
goto err_free_flow;
 
err = register_netdevice(tun->dev);
if (err < 0)
goto err_detach;
+   /* free_netdev() won't check refcnt, to aovid race
+* with dev_put() we need publish tun after registration.
+*/
+   rcu_assign_pointer(tfile->tun, tun);
}
 
netif_carrier_on(tun->dev);
@@ -2978,7 +2984,7 @@ static int tun_set_queue(struct file *file, struct ifreq 
*ifr)
if (ret < 0)
goto unlock;
ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
-tun->flags & IFF_NAPI_FRAGS);
+tun->flags & IFF_NAPI_FRAGS, true);
} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
tun = rtnl_dereference(tfile->tun);
if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
-- 
2.17.1



Re: [PATCH v3] tun: fix use-after-free when register netdev failed

2019-09-09 Thread Yang Yingliang




On 2019/9/5 11:10, Jason Wang wrote:

On 2019/9/5 上午10:03, Yang Yingliang wrote:


On 2019/9/3 18:50, Jason Wang wrote:

- Original Message -

On 2019/9/3 14:06, Jason Wang wrote:

On 2019/9/3 下午1:42, Yang Yingliang wrote:

On 2019/9/3 11:03, Jason Wang wrote:

On 2019/9/3 上午9:45, Yang Yingliang wrote:

On 2019/9/2 13:32, Jason Wang wrote:

On 2019/8/23 下午5:36, Yang Yingliang wrote:

On 2019/8/23 11:05, Jason Wang wrote:

- Original Message -

On 2019/8/22 14:07, Yang Yingliang wrote:

On 2019/8/22 10:13, Jason Wang wrote:

On 2019/8/20 上午10:28, Jason Wang wrote:

On 2019/8/20 上午9:25, David Miller wrote:

From: Yang Yingliang 
Date: Mon, 19 Aug 2019 21:31:19 +0800


Call tun_attach() after register_netdevice() to make sure
tfile->tun
is not published until the netdevice is registered. So the
read/write
thread can not use the tun pointer that may freed by
free_netdev().
(The tun and dev pointer are allocated by
alloc_netdev_mqs(), they
can
be freed by netdev_freemem().)

register_netdevice() must always be the last operation in
the order of
network device setup.

At the point register_netdevice() is called, the device is
visible
globally
and therefore all of it's software state must be fully
initialized and
ready for us.

You're going to have to find another solution to these
problems.

The device is loosely coupled with sockets/queues. Each
side is
allowed to be go away without caring the other side. So
in this
case, there's a small window that network stack think the
device has
one queue but actually not, the code can then safely drop
them.
Maybe it's ok here with some comments?

Or if not, we can try to hold the device before tun_attach
and drop
it after register_netdevice().

Hi Yang:

I think maybe we can try to hold refcnt instead of playing
real num
queues here. Do you want to post a V4?

I think the refcnt can prevent freeing the memory in this
case.
When register_netdevice() failed, free_netdev() will be called
directly,
dev->pcpu_refcnt and dev are freed without checking refcnt of
dev.

How about using patch-v1 that using a flag to check whether the
device
registered successfully.


As I said, it lacks sufficient locks or barriers. To be clear, I
meant
something like (compile-test only):

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..e52678f9f049 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net,
struct file *file, struct ifreq *ifr)
 (ifr->ifr_flags & TUN_FEATURES);
INIT_LIST_HEAD(&tun->disabled);
+   dev_hold(dev);
   err = tun_attach(tun, file, false,
ifr->ifr_flags & IFF_NAPI,
ifr->ifr_flags &
IFF_NAPI_FRAGS);
   if (err < 0)
@@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net,
struct file *file, struct ifreq *ifr)
   err = register_netdevice(tun->dev);
   if (err < 0)
   goto err_detach;
+   dev_put(dev);
   }
 netif_carrier_on(tun->dev);
@@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net,
struct file *file, struct ifreq *ifr)
   return 0;
  err_detach:
+   dev_put(dev);
   tun_detach_all(dev);
   /* register_netdevice() already called
tun_free_netdev() */
   goto err_free_dev;
  err_free_flow:
+   dev_put(dev);
   tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
err_free_stat:

What's your thought?

The dev pointer are freed without checking the refcount in
free_netdev() called by err_free_dev

path, so I don't understand how the refcount protects this
pointer.


The refcount are guaranteed to be zero there, isn't it?

No, it's not.

err_free_dev:
  free_netdev(dev);

void free_netdev(struct net_device *dev)
{
...
  /* pcpu_refcnt can be freed without checking refcount */
  free_percpu(dev->pcpu_refcnt);
  dev->pcpu_refcnt = NULL;

  /*  Compatibility with error handling in drivers */
  if (dev->reg_state == NETREG_UNINITIALIZED) {
  /* dev can be freed without checking refcount */
  netdev_freemem(dev);
  return;
  }
...
}

Right, but what I meant is in my patch, when code reaches
free_netdev() the refcnt is zero. What did I miss?

Yes, but it can't fix the UAF problem.

Well, it looks to me that the dev_put() in tun_put() won't release the
device in this case.

The device is not released in tun_put().
This is how the UAF occurs:

   CPUA   CPUB
   tun_set_iff()
 alloc_netdev_mqs()
 tun_attach()
  
tun_chr_read_iter()

   

Re: [PATCH v3] tun: fix use-after-free when register netdev failed

2019-09-04 Thread Yang Yingliang




On 2019/9/3 18:50, Jason Wang wrote:


- Original Message -


On 2019/9/3 14:06, Jason Wang wrote:

On 2019/9/3 下午1:42, Yang Yingliang wrote:


On 2019/9/3 11:03, Jason Wang wrote:

On 2019/9/3 上午9:45, Yang Yingliang wrote:


On 2019/9/2 13:32, Jason Wang wrote:

On 2019/8/23 下午5:36, Yang Yingliang wrote:


On 2019/8/23 11:05, Jason Wang wrote:

- Original Message -

On 2019/8/22 14:07, Yang Yingliang wrote:

On 2019/8/22 10:13, Jason Wang wrote:

On 2019/8/20 上午10:28, Jason Wang wrote:

On 2019/8/20 上午9:25, David Miller wrote:

From: Yang Yingliang 
Date: Mon, 19 Aug 2019 21:31:19 +0800


Call tun_attach() after register_netdevice() to make sure
tfile->tun
is not published until the netdevice is registered. So the
read/write
thread can not use the tun pointer that may freed by
free_netdev().
(The tun and dev pointer are allocated by
alloc_netdev_mqs(), they
can
be freed by netdev_freemem().)

register_netdevice() must always be the last operation in
the order of
network device setup.

At the point register_netdevice() is called, the device is
visible
globally
and therefore all of it's software state must be fully
initialized and
ready for us.

You're going to have to find another solution to these
problems.

The device is loosely coupled with sockets/queues. Each side is
allowed to be go away without caring the other side. So in this
case, there's a small window that network stack think the
device has
one queue but actually not, the code can then safely drop them.
Maybe it's ok here with some comments?

Or if not, we can try to hold the device before tun_attach
and drop
it after register_netdevice().

Hi Yang:

I think maybe we can try to hold refcnt instead of playing
real num
queues here. Do you want to post a V4?

I think the refcnt can prevent freeing the memory in this case.
When register_netdevice() failed, free_netdev() will be called
directly,
dev->pcpu_refcnt and dev are freed without checking refcnt of
dev.

How about using patch-v1 that using a flag to check whether the
device
registered successfully.


As I said, it lacks sufficient locks or barriers. To be clear, I
meant
something like (compile-test only):

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..e52678f9f049 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net,
struct file *file, struct ifreq *ifr)
(ifr->ifr_flags & TUN_FEATURES);
INIT_LIST_HEAD(&tun->disabled);
+   dev_hold(dev);
  err = tun_attach(tun, file, false,
ifr->ifr_flags & IFF_NAPI,
   ifr->ifr_flags & IFF_NAPI_FRAGS);
  if (err < 0)
@@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net,
struct file *file, struct ifreq *ifr)
  err = register_netdevice(tun->dev);
  if (err < 0)
  goto err_detach;
+   dev_put(dev);
  }
netif_carrier_on(tun->dev);
@@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net,
struct file *file, struct ifreq *ifr)
  return 0;
 err_detach:
+   dev_put(dev);
  tun_detach_all(dev);
  /* register_netdevice() already called
tun_free_netdev() */
  goto err_free_dev;
 err_free_flow:
+   dev_put(dev);
  tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
   err_free_stat:

What's your thought?

The dev pointer are freed without checking the refcount in
free_netdev() called by err_free_dev

path, so I don't understand how the refcount protects this pointer.


The refcount are guaranteed to be zero there, isn't it?

No, it's not.

err_free_dev:
 free_netdev(dev);

void free_netdev(struct net_device *dev)
{
...
 /* pcpu_refcnt can be freed without checking refcount */
 free_percpu(dev->pcpu_refcnt);
 dev->pcpu_refcnt = NULL;

 /*  Compatibility with error handling in drivers */
 if (dev->reg_state == NETREG_UNINITIALIZED) {
 /* dev can be freed without checking refcount */
 netdev_freemem(dev);
 return;
 }
...
}


Right, but what I meant is in my patch, when code reaches
free_netdev() the refcnt is zero. What did I miss?

Yes, but it can't fix the UAF problem.


Well, it looks to me that the dev_put() in tun_put() won't release the
device in this case.

The device is not released in tun_put().
This is how the UAF occurs:

  CPUA   CPUB
  tun_set_iff()
alloc_netdev_mqs()
tun_attach()
  tun_chr_read_iter()
tun_get()
tun_do_read()
   

Re: [PATCH v3] tun: fix use-after-free when register netdev failed

2019-09-03 Thread Yang Yingliang




On 2019/9/3 14:06, Jason Wang wrote:


On 2019/9/3 下午1:42, Yang Yingliang wrote:



On 2019/9/3 11:03, Jason Wang wrote:


On 2019/9/3 上午9:45, Yang Yingliang wrote:



On 2019/9/2 13:32, Jason Wang wrote:


On 2019/8/23 下午5:36, Yang Yingliang wrote:



On 2019/8/23 11:05, Jason Wang wrote:

- Original Message -


On 2019/8/22 14:07, Yang Yingliang wrote:


On 2019/8/22 10:13, Jason Wang wrote:

On 2019/8/20 上午10:28, Jason Wang wrote:

On 2019/8/20 上午9:25, David Miller wrote:

From: Yang Yingliang 
Date: Mon, 19 Aug 2019 21:31:19 +0800

Call tun_attach() after register_netdevice() to make sure 
tfile->tun
is not published until the netdevice is registered. So the 
read/write
thread can not use the tun pointer that may freed by 
free_netdev().
(The tun and dev pointer are allocated by 
alloc_netdev_mqs(), they

can
be freed by netdev_freemem().)
register_netdevice() must always be the last operation in 
the order of

network device setup.

At the point register_netdevice() is called, the device is 
visible

globally
and therefore all of it's software state must be fully 
initialized and

ready for us.

You're going to have to find another solution to these 
problems.


The device is loosely coupled with sockets/queues. Each side is
allowed to be go away without caring the other side. So in this
case, there's a small window that network stack think the 
device has

one queue but actually not, the code can then safely drop them.
Maybe it's ok here with some comments?

Or if not, we can try to hold the device before tun_attach 
and drop

it after register_netdevice().


Hi Yang:

I think maybe we can try to hold refcnt instead of playing 
real num

queues here. Do you want to post a V4?

I think the refcnt can prevent freeing the memory in this case.
When register_netdevice() failed, free_netdev() will be called 
directly,
dev->pcpu_refcnt and dev are freed without checking refcnt of 
dev.
How about using patch-v1 that using a flag to check whether the 
device

registered successfully.

As I said, it lacks sufficient locks or barriers. To be clear, I 
meant

something like (compile-test only):

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..e52678f9f049 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, 
struct file *file, struct ifreq *ifr)

   (ifr->ifr_flags & TUN_FEATURES);
INIT_LIST_HEAD(&tun->disabled);
+   dev_hold(dev);
 err = tun_attach(tun, file, false, 
ifr->ifr_flags & IFF_NAPI,

  ifr->ifr_flags & IFF_NAPI_FRAGS);
 if (err < 0)
@@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, 
struct file *file, struct ifreq *ifr)

 err = register_netdevice(tun->dev);
 if (err < 0)
 goto err_detach;
+   dev_put(dev);
 }
   netif_carrier_on(tun->dev);
@@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net, 
struct file *file, struct ifreq *ifr)

 return 0;
err_detach:
+   dev_put(dev);
 tun_detach_all(dev);
 /* register_netdevice() already called 
tun_free_netdev() */

 goto err_free_dev;
err_free_flow:
+   dev_put(dev);
 tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
  err_free_stat:

What's your thought?


The dev pointer are freed without checking the refcount in 
free_netdev() called by err_free_dev


path, so I don't understand how the refcount protects this pointer.



The refcount are guaranteed to be zero there, isn't it?

No, it's not.

err_free_dev:
free_netdev(dev);

void free_netdev(struct net_device *dev)
{
...
/* pcpu_refcnt can be freed without checking refcount */
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;

/*  Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
/* dev can be freed without checking refcount */
netdev_freemem(dev);
return;
}
...
}



Right, but what I meant is in my patch, when code reaches 
free_netdev() the refcnt is zero. What did I miss?

Yes, but it can't fix the UAF problem.



Well, it looks to me that the dev_put() in tun_put() won't release the 
device in this case.


The device is not released in tun_put().
This is how the UAF occurs:

CPUA   CPUB
tun_set_iff()
  alloc_netdev_mqs()
  tun_attach()
tun_chr_read_iter()
  tun_get()
  tun_do_read()
tun_ring_recv()
 

Re: [PATCH v3] tun: fix use-after-free when register netdev failed

2019-09-02 Thread Yang Yingliang




On 2019/9/3 11:03, Jason Wang wrote:


On 2019/9/3 上午9:45, Yang Yingliang wrote:



On 2019/9/2 13:32, Jason Wang wrote:


On 2019/8/23 下午5:36, Yang Yingliang wrote:



On 2019/8/23 11:05, Jason Wang wrote:

- Original Message -


On 2019/8/22 14:07, Yang Yingliang wrote:


On 2019/8/22 10:13, Jason Wang wrote:

On 2019/8/20 上午10:28, Jason Wang wrote:

On 2019/8/20 上午9:25, David Miller wrote:

From: Yang Yingliang 
Date: Mon, 19 Aug 2019 21:31:19 +0800

Call tun_attach() after register_netdevice() to make sure 
tfile->tun
is not published until the netdevice is registered. So the 
read/write
thread can not use the tun pointer that may freed by 
free_netdev().
(The tun and dev pointer are allocated by 
alloc_netdev_mqs(), they

can
be freed by netdev_freemem().)
register_netdevice() must always be the last operation in the 
order of

network device setup.

At the point register_netdevice() is called, the device is 
visible

globally
and therefore all of it's software state must be fully 
initialized and

ready for us.

You're going to have to find another solution to these problems.


The device is loosely coupled with sockets/queues. Each side is
allowed to be go away without caring the other side. So in this
case, there's a small window that network stack think the 
device has

one queue but actually not, the code can then safely drop them.
Maybe it's ok here with some comments?

Or if not, we can try to hold the device before tun_attach and 
drop

it after register_netdevice().


Hi Yang:

I think maybe we can try to hold refcnt instead of playing real 
num

queues here. Do you want to post a V4?

I think the refcnt can prevent freeing the memory in this case.
When register_netdevice() failed, free_netdev() will be called 
directly,

dev->pcpu_refcnt and dev are freed without checking refcnt of dev.
How about using patch-v1 that using a flag to check whether the 
device

registered successfully.

As I said, it lacks sufficient locks or barriers. To be clear, I 
meant

something like (compile-test only):

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..e52678f9f049 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, 
struct file *file, struct ifreq *ifr)

   (ifr->ifr_flags & TUN_FEATURES);
   INIT_LIST_HEAD(&tun->disabled);
+   dev_hold(dev);
 err = tun_attach(tun, file, false, ifr->ifr_flags 
& IFF_NAPI,

  ifr->ifr_flags & IFF_NAPI_FRAGS);
 if (err < 0)
@@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, 
struct file *file, struct ifreq *ifr)

 err = register_netdevice(tun->dev);
 if (err < 0)
 goto err_detach;
+   dev_put(dev);
 }
   netif_carrier_on(tun->dev);
@@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net, 
struct file *file, struct ifreq *ifr)

 return 0;
err_detach:
+   dev_put(dev);
 tun_detach_all(dev);
 /* register_netdevice() already called tun_free_netdev() */
 goto err_free_dev;
err_free_flow:
+   dev_put(dev);
 tun_flow_uninit(tun);
 security_tun_dev_free_security(tun->security);
  err_free_stat:

What's your thought?


The dev pointer are freed without checking the refcount in 
free_netdev() called by err_free_dev


path, so I don't understand how the refcount protects this pointer.



The refcount are guaranteed to be zero there, isn't it?

No, it's not.

err_free_dev:
free_netdev(dev);

void free_netdev(struct net_device *dev)
{
...
/* pcpu_refcnt can be freed without checking refcount */
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;

/*  Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
/* dev can be freed without checking refcount */
netdev_freemem(dev);
return;
}
...
}



Right, but what I meant is in my patch, when code reaches 
free_netdev() the refcnt is zero. What did I miss?

Yes, but it can't fix the UAF problem.


Thanks






Thanks



Thanks,
Yang



Thanks

.






.






.






Re: [PATCH v3] tun: fix use-after-free when register netdev failed

2019-09-02 Thread Yang Yingliang




On 2019/9/2 13:32, Jason Wang wrote:


On 2019/8/23 下午5:36, Yang Yingliang wrote:



On 2019/8/23 11:05, Jason Wang wrote:

- Original Message -


On 2019/8/22 14:07, Yang Yingliang wrote:


On 2019/8/22 10:13, Jason Wang wrote:

On 2019/8/20 上午10:28, Jason Wang wrote:

On 2019/8/20 上午9:25, David Miller wrote:

From: Yang Yingliang 
Date: Mon, 19 Aug 2019 21:31:19 +0800

Call tun_attach() after register_netdevice() to make sure 
tfile->tun
is not published until the netdevice is registered. So the 
read/write
thread can not use the tun pointer that may freed by 
free_netdev().
(The tun and dev pointer are allocated by alloc_netdev_mqs(), 
they

can
be freed by netdev_freemem().)
register_netdevice() must always be the last operation in the 
order of

network device setup.

At the point register_netdevice() is called, the device is visible
globally
and therefore all of it's software state must be fully 
initialized and

ready for us.

You're going to have to find another solution to these problems.


The device is loosely coupled with sockets/queues. Each side is
allowed to be go away without caring the other side. So in this
case, there's a small window that network stack think the device 
has

one queue but actually not, the code can then safely drop them.
Maybe it's ok here with some comments?

Or if not, we can try to hold the device before tun_attach and drop
it after register_netdevice().


Hi Yang:

I think maybe we can try to hold refcnt instead of playing real num
queues here. Do you want to post a V4?

I think the refcnt can prevent freeing the memory in this case.
When register_netdevice() failed, free_netdev() will be called 
directly,

dev->pcpu_refcnt and dev are freed without checking refcnt of dev.

How about using patch-v1 that using a flag to check whether the device
registered successfully.


As I said, it lacks sufficient locks or barriers. To be clear, I meant
something like (compile-test only):

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..e52678f9f049 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, struct 
file *file, struct ifreq *ifr)

   (ifr->ifr_flags & TUN_FEATURES);
   INIT_LIST_HEAD(&tun->disabled);
+   dev_hold(dev);
 err = tun_attach(tun, file, false, ifr->ifr_flags & 
IFF_NAPI,

  ifr->ifr_flags & IFF_NAPI_FRAGS);
 if (err < 0)
@@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, struct 
file *file, struct ifreq *ifr)

 err = register_netdevice(tun->dev);
 if (err < 0)
 goto err_detach;
+   dev_put(dev);
 }
   netif_carrier_on(tun->dev);
@@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net, 
struct file *file, struct ifreq *ifr)

 return 0;
err_detach:
+   dev_put(dev);
 tun_detach_all(dev);
 /* register_netdevice() already called tun_free_netdev() */
 goto err_free_dev;
err_free_flow:
+   dev_put(dev);
 tun_flow_uninit(tun);
 security_tun_dev_free_security(tun->security);
  err_free_stat:

What's your thought?


The dev pointer are freed without checking the refcount in 
free_netdev() called by err_free_dev


path, so I don't understand how the refcount protects this pointer.



The refcount are guaranteed to be zero there, isn't it?

No, it's not.

err_free_dev:
free_netdev(dev);

void free_netdev(struct net_device *dev)
{
...
/* pcpu_refcnt can be freed without checking refcount */
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;

/*  Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
/* dev can be freed without checking refcount */
netdev_freemem(dev);
return;
}
...
}



Thanks



Thanks,
Yang



Thanks

.






.






Re: [PATCH v3] tun: fix use-after-free when register netdev failed

2019-08-23 Thread Yang Yingliang




On 2019/8/23 11:05, Jason Wang wrote:

- Original Message -


On 2019/8/22 14:07, Yang Yingliang wrote:


On 2019/8/22 10:13, Jason Wang wrote:

On 2019/8/20 上午10:28, Jason Wang wrote:

On 2019/8/20 上午9:25, David Miller wrote:

From: Yang Yingliang 
Date: Mon, 19 Aug 2019 21:31:19 +0800


Call tun_attach() after register_netdevice() to make sure tfile->tun
is not published until the netdevice is registered. So the read/write
thread can not use the tun pointer that may freed by free_netdev().
(The tun and dev pointer are allocated by alloc_netdev_mqs(), they
can
be freed by netdev_freemem().)

register_netdevice() must always be the last operation in the order of
network device setup.

At the point register_netdevice() is called, the device is visible
globally
and therefore all of it's software state must be fully initialized and
ready for us.

You're going to have to find another solution to these problems.


The device is loosely coupled with sockets/queues. Each side is
allowed to be go away without caring the other side. So in this
case, there's a small window that network stack think the device has
one queue but actually not, the code can then safely drop them.
Maybe it's ok here with some comments?

Or if not, we can try to hold the device before tun_attach and drop
it after register_netdevice().


Hi Yang:

I think maybe we can try to hold refcnt instead of playing real num
queues here. Do you want to post a V4?

I think the refcnt can prevent freeing the memory in this case.
When register_netdevice() failed, free_netdev() will be called directly,
dev->pcpu_refcnt and dev are freed without checking refcnt of dev.

How about using patch-v1 that using a flag to check whether the device
registered successfully.


As I said, it lacks sufficient locks or barriers. To be clear, I meant
something like (compile-test only):

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..e52678f9f049 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
   (ifr->ifr_flags & TUN_FEATURES);
  
 INIT_LIST_HEAD(&tun->disabled);

+   dev_hold(dev);
 err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
  ifr->ifr_flags & IFF_NAPI_FRAGS);
 if (err < 0)
@@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
 err = register_netdevice(tun->dev);
 if (err < 0)
 goto err_detach;
+   dev_put(dev);
 }
  
 netif_carrier_on(tun->dev);

@@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
 return 0;
  
  err_detach:

+   dev_put(dev);
 tun_detach_all(dev);
 /* register_netdevice() already called tun_free_netdev() */
 goto err_free_dev;
  
  err_free_flow:

+   dev_put(dev);
 tun_flow_uninit(tun);
 security_tun_dev_free_security(tun->security);
  err_free_stat:

What's your thought?


The dev pointer are freed without checking the refcount in free_netdev() called 
by err_free_dev

path, so I don't understand how the refcount protects this pointer.

Thanks,
Yang



Thanks

.






Re: [PATCH v3] tun: fix use-after-free when register netdev failed

2019-08-22 Thread Yang Yingliang




On 2019/8/22 14:07, Yang Yingliang wrote:



On 2019/8/22 10:13, Jason Wang wrote:


On 2019/8/20 上午10:28, Jason Wang wrote:


On 2019/8/20 上午9:25, David Miller wrote:

From: Yang Yingliang 
Date: Mon, 19 Aug 2019 21:31:19 +0800


Call tun_attach() after register_netdevice() to make sure tfile->tun
is not published until the netdevice is registered. So the read/write
thread can not use the tun pointer that may freed by free_netdev().
(The tun and dev pointer are allocated by alloc_netdev_mqs(), they 
can

be freed by netdev_freemem().)

register_netdevice() must always be the last operation in the order of
network device setup.

At the point register_netdevice() is called, the device is visible 
globally

and therefore all of it's software state must be fully initialized and
ready for us.

You're going to have to find another solution to these problems.



The device is loosely coupled with sockets/queues. Each side is 
allowed to be go away without caring the other side. So in this 
case, there's a small window that network stack think the device has 
one queue but actually not, the code can then safely drop them. 
Maybe it's ok here with some comments?


Or if not, we can try to hold the device before tun_attach and drop 
it after register_netdevice().



Hi Yang:

I think maybe we can try to hold refcnt instead of playing real num 
queues here. Do you want to post a V4?

I think the refcnt can prevent freeing the memory in this case.
When register_netdevice() failed, free_netdev() will be called directly,
dev->pcpu_refcnt and dev are freed without checking refcnt of dev.
How about using patch-v1 that using a flag to check whether the device 
registered successfully.






Thanks




Thanks



.





.






Re: [PATCH v3] tun: fix use-after-free when register netdev failed

2019-08-21 Thread Yang Yingliang




On 2019/8/22 10:13, Jason Wang wrote:


On 2019/8/20 上午10:28, Jason Wang wrote:


On 2019/8/20 上午9:25, David Miller wrote:

From: Yang Yingliang 
Date: Mon, 19 Aug 2019 21:31:19 +0800


Call tun_attach() after register_netdevice() to make sure tfile->tun
is not published until the netdevice is registered. So the read/write
thread can not use the tun pointer that may freed by free_netdev().
(The tun and dev pointer are allocated by alloc_netdev_mqs(), they can
be freed by netdev_freemem().)

register_netdevice() must always be the last operation in the order of
network device setup.

At the point register_netdevice() is called, the device is visible 
globally

and therefore all of it's software state must be fully initialized and
ready for us.

You're going to have to find another solution to these problems.



The device is loosely coupled with sockets/queues. Each side is 
allowed to be go away without caring the other side. So in this case, 
there's a small window that network stack think the device has one 
queue but actually not, the code can then safely drop them. Maybe 
it's ok here with some comments?


Or if not, we can try to hold the device before tun_attach and drop 
it after register_netdevice().



Hi Yang:

I think maybe we can try to hold refcnt instead of playing real num 
queues here. Do you want to post a V4?

I think the refcnt can prevent freeing the memory in this case.
When register_netdevice() failed, free_netdev() will be called directly,
dev->pcpu_refcnt and dev are freed without checking refcnt of dev.



Thanks




Thanks



.






[PATCH v3] tun: fix use-after-free when register netdev failed

2019-08-19 Thread Yang Yingliang
inject error
  tun_detach_all()
synchronize_net()
  tun_do_read()
tun_ring_recv()
  schedule()
  free_netdev()
netdev_freemem()
  tun_put()
dev_put() <-- UAF

Call netif_set_real_num_t/rx_queues() before register_netdevice().

Call tun_attach() after register_netdevice() to make sure tfile->tun
is not published until the netdevice is registered. So the read/write
thread can not use the tun pointer that may freed by free_netdev().
(The tun and dev pointer are allocated by alloc_netdev_mqs(), they can
be freed by netdev_freemem().)

---
Changes in v3:
 - call netif_set_real_num_t/rx_queues() before register_netdevice()

Changes in v2:
 - add a param in tun_set_real_num_queues()
 - move tun_set_real_num_queues() out of tun_attach()
 - call tun_set_real_num_queues() before register_netdevice()
 - call tun_attach() after register_netdevice()
---

Fixes: eb0fb363f920 ("tuntap: attach queue 0 before registering netdevice")
Reported-by: Hulk Robot 
Suggested-by: Jason Wang 
Signed-off-by: Yang Yingliang 
---
 drivers/net/tun.c | 25 +
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..07d1e945385a 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2828,14 +2828,19 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
  (ifr->ifr_flags & TUN_FEATURES);
 
INIT_LIST_HEAD(&tun->disabled);
-   err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
-ifr->ifr_flags & IFF_NAPI_FRAGS);
-   if (err < 0)
-   goto err_free_flow;
+
+   netif_set_real_num_tx_queues(tun->dev, 1);
+   netif_set_real_num_rx_queues(tun->dev, 1);
 
err = register_netdevice(tun->dev);
if (err < 0)
-   goto err_detach;
+   /* register_netdevice() already called 
tun_free_netdev() */
+   goto err_free_dev;
+
+   err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
+ifr->ifr_flags & IFF_NAPI_FRAGS);
+   if (err < 0)
+   goto err_unregister;
}
 
netif_carrier_on(tun->dev);
@@ -2851,14 +2856,10 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
strcpy(ifr->ifr_name, tun->dev->name);
return 0;
 
-err_detach:
-   tun_detach_all(dev);
-   /* register_netdevice() already called tun_free_netdev() */
-   goto err_free_dev;
+err_unregister:
+   unregister_netdevice(dev);
+   return err;
 
-err_free_flow:
-   tun_flow_uninit(tun);
-   security_tun_dev_free_security(tun->security);
 err_free_stat:
free_percpu(tun->pcpu_stats);
 err_free_dev:
-- 
2.17.1



Re: [PATCH v2] tun: fix use-after-free when register netdev failed

2019-08-19 Thread Yang Yingliang




On 2019/8/19 11:17, Jason Wang wrote:

On 2019/8/16 下午7:00, Yang Yingliang wrote:

[...]
  
  		INIT_LIST_HEAD(&tun->disabled);

-   err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
-ifr->ifr_flags & IFF_NAPI_FRAGS);
-   if (err < 0)
-   goto err_free_flow;
+
+   tun_set_real_num_queues(tun, tun->numqueues + 1);


This looks tricky, why not simply call netif_set_real_num_tx/rx_queues()
here?

OK, I will do some test, then send a v3 patch.


Thanks,
Yang



Thanks


  
  		err = register_netdevice(tun->dev);

if (err < 0)
-   goto err_detach;
+   /* register_netdevice() already called 
tun_free_netdev() */
+   goto err_free_dev;
+
+   err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
+ifr->ifr_flags & IFF_NAPI_FRAGS);
+   if (err < 0)
+   goto err_unregister;
}
  
  	netif_carrier_on(tun->dev);

@@ -2851,14 +2857,10 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
strcpy(ifr->ifr_name, tun->dev->name);
return 0;
  
-err_detach:

-   tun_detach_all(dev);
-   /* register_netdevice() already called tun_free_netdev() */
-   goto err_free_dev;
+err_unregister:
+   unregister_netdevice(dev);
+   return err;
  
-err_free_flow:

-   tun_flow_uninit(tun);
-   security_tun_dev_free_security(tun->security);
  err_free_stat:
free_percpu(tun->pcpu_stats);
  err_free_dev:
@@ -2979,6 +2981,8 @@ static int tun_set_queue(struct file *file, struct ifreq 
*ifr)
goto unlock;
ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
 tun->flags & IFF_NAPI_FRAGS);
+   if (!ret)
+   tun_set_real_num_queues(tun, tun->numqueues);
} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
tun = rtnl_dereference(tfile->tun);
if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)

.






[PATCH v2] tun: fix use-after-free when register netdev failed

2019-08-16 Thread Yang Yingliang
inject error
  tun_detach_all()
synchronize_net()
  tun_do_read()
tun_ring_recv()
  schedule()
  free_netdev()
netdev_freemem()
  tun_put()
dev_put() <-- UAF

Move tun_set_real_num_queues() out of tun_attach() and call it
before register_netdevice() in tun_set_iff().

Call tun_attach() after register_netdevice() to make sure tfile->tun
is not published until the netdevice is registered. So the read/write
thread can not use the tun pointer that may freed by free_netdev().
(The tun and dev pointer are allocated by alloc_netdev_mqs(), they can
be freed by netdev_freemem().)

---
Changes in v2:
 - add a param in tun_set_real_num_queues()
 - move tun_set_real_num_queues() out of tun_attach()
 - call tun_set_real_num_queues() before register_netdevice()
 - call tun_attach() after register_netdevice()
---

Cc: Yang Yingliang 
Fixes: eb0fb363f920 ("tuntap: attach queue 0 before registering netdevice")
Reported-by: Hulk Robot 
Suggested-by: Jason Wang 
Signed-off-by: Yang Yingliang 
---
 drivers/net/tun.c | 38 +-
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..a19f864c5f8d 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -626,10 +626,11 @@ static inline bool tun_not_capable(struct tun_struct *tun)
!ns_capable(net->user_ns, CAP_NET_ADMIN);
 }
 
-static void tun_set_real_num_queues(struct tun_struct *tun)
+static void tun_set_real_num_queues(struct tun_struct *tun,
+   unsigned int numqueues)
 {
-   netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
-   netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
+   netif_set_real_num_tx_queues(tun->dev, numqueues);
+   netif_set_real_num_rx_queues(tun->dev, numqueues);
 }
 
 static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
@@ -708,7 +709,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
tun_flow_delete_by_queue(tun, tun->numqueues + 1);
/* Drop read queue */
tun_queue_purge(tfile);
-   tun_set_real_num_queues(tun);
+   tun_set_real_num_queues(tun, tun->numqueues);
} else if (tfile->detached && clean) {
tun = tun_enable_queue(tfile);
sock_put(&tfile->sk);
@@ -873,7 +874,6 @@ static int tun_attach(struct tun_struct *tun, struct file 
*file,
rcu_assign_pointer(tfile->tun, tun);
rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
tun->numqueues++;
-   tun_set_real_num_queues(tun);
 out:
return err;
 }
@@ -2734,6 +2734,8 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
if (err < 0)
return err;
 
+   tun_set_real_num_queues(tun, tun->numqueues);
+
if (tun->flags & IFF_MULTI_QUEUE &&
(tun->numqueues + tun->numdisabled > 1)) {
/* One or more queue has already been attached, no need
@@ -2828,14 +2830,18 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
  (ifr->ifr_flags & TUN_FEATURES);
 
INIT_LIST_HEAD(&tun->disabled);
-   err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
-ifr->ifr_flags & IFF_NAPI_FRAGS);
-   if (err < 0)
-   goto err_free_flow;
+
+   tun_set_real_num_queues(tun, tun->numqueues + 1);
 
err = register_netdevice(tun->dev);
if (err < 0)
-   goto err_detach;
+   /* register_netdevice() already called 
tun_free_netdev() */
+   goto err_free_dev;
+
+   err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
+ifr->ifr_flags & IFF_NAPI_FRAGS);
+   if (err < 0)
+   goto err_unregister;
}
 
netif_carrier_on(tun->dev);
@@ -2851,14 +2857,10 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
strcpy(ifr->ifr_name, tun->dev->name);
return 0;
 
-err_detach:
-   tun_detach_all(dev);
-   /* register_netdevice() already called tun_free_netdev() */
-   goto err_free_dev;
+err_unregister:
+   unregister_netdevice(dev);
+   return err;
 
-err_free_flow:
-   tun_flow_uninit(tun);
-   security_tun_dev

Re: [PATCH] tun: fix use-after-free when register netdev failed

2019-08-15 Thread Yang Yingliang




On 2019/8/15 17:21, Jason Wang wrote:


On 2019/8/15 下午4:18, Yang Yingliang wrote:

I got a UAF repport in tun driver when doing fuzzy test:

[  466.269490] 
==
[  466.271792] BUG: KASAN: use-after-free in 
tun_chr_read_iter+0x2ca/0x2d0
[  466.271806] Read of size 8 at addr 888372139250 by task 
tun-test/2699

[  466.271810]
[  466.271824] CPU: 1 PID: 2699 Comm: tun-test Not tainted 
5.3.0-rc1-1-g5a9433db2614-dirty #427
[  466.271833] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014

[  466.271838] Call Trace:
[  466.271858]  dump_stack+0xca/0x13e
[  466.271871]  ? tun_chr_read_iter+0x2ca/0x2d0
[  466.271890]  print_address_description+0x79/0x440
[  466.271906]  ? vprintk_func+0x5e/0xf0
[  466.271920]  ? tun_chr_read_iter+0x2ca/0x2d0
[  466.271935]  __kasan_report+0x15c/0x1df
[  466.271958]  ? tun_chr_read_iter+0x2ca/0x2d0
[  466.271976]  kasan_report+0xe/0x20
[  466.271987]  tun_chr_read_iter+0x2ca/0x2d0
[  466.272013]  do_iter_readv_writev+0x4b7/0x740
[  466.272032]  ? default_llseek+0x2d0/0x2d0
[  466.272072]  do_iter_read+0x1c5/0x5e0
[  466.272110]  vfs_readv+0x108/0x180
[  466.299007]  ? compat_rw_copy_check_uvector+0x440/0x440
[  466.299020]  ? fsnotify+0x888/0xd50
[  466.299040]  ? __fsnotify_parent+0xd0/0x350
[  466.299064]  ? fsnotify_first_mark+0x1e0/0x1e0
[  466.304548]  ? vfs_write+0x264/0x510
[  466.304569]  ? ksys_write+0x101/0x210
[  466.304591]  ? do_preadv+0x116/0x1a0
[  466.304609]  do_preadv+0x116/0x1a0
[  466.309829]  do_syscall_64+0xc8/0x600
[  466.309849]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  466.309861] RIP: 0033:0x4560f9
[  466.309875] Code: 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 
00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 
08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 
89 01 48
[  466.309889] RSP: 002b:7a5166e8 EFLAGS: 0206 ORIG_RAX: 
0127
[  466.322992] RAX: ffda RBX: 00400460 RCX: 
004560f9
[  466.322999] RDX: 0003 RSI: 28c0 RDI: 
0003
[  466.323007] RBP: 7a516700 R08: 0004 R09: 

[  466.323014] R10:  R11: 0206 R12: 
0040cb10
[  466.323021] R13:  R14: 006d7018 R15: 


[  466.323057]
[  466.323064] Allocated by task 2605:
[  466.335165]  save_stack+0x19/0x80
[  466.336240]  __kasan_kmalloc.constprop.8+0xa0/0xd0
[  466.337755]  kmem_cache_alloc+0xe8/0x320
[  466.339050]  getname_flags+0xca/0x560
[  466.340229]  user_path_at_empty+0x2c/0x50
[  466.341508]  vfs_statx+0xe6/0x190
[  466.342619]  __do_sys_newstat+0x81/0x100
[  466.343908]  do_syscall_64+0xc8/0x600
[  466.345303]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  466.347034]
[  466.347517] Freed by task 2605:
[  466.348471]  save_stack+0x19/0x80
[  466.349476]  __kasan_slab_free+0x12e/0x180
[  466.350726]  kmem_cache_free+0xc8/0x430
[  466.351874]  putname+0xe2/0x120
[  466.352921]  filename_lookup+0x257/0x3e0
[  466.354319]  vfs_statx+0xe6/0x190
[  466.355498]  __do_sys_newstat+0x81/0x100
[  466.356889]  do_syscall_64+0xc8/0x600
[  466.358037]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  466.359567]
[  466.360050] The buggy address belongs to the object at 
888372139100

[  466.360050]  which belongs to the cache names_cache of size 4096
[  466.363735] The buggy address is located 336 bytes inside of
[  466.363735]  4096-byte region [888372139100, 88837213a100)
[  466.367179] The buggy address belongs to the page:
[  466.368604] page:ea000dc84e00 refcount:1 mapcount:0 
mapping:8883df1b4f00 index:0x0 compound_mapcount: 0

[  466.371582] flags: 0x2f80010200(slab|head)
[  466.372910] raw: 002f80010200 dead0100 
dead0122 8883df1b4f00
[  466.375209] raw:  00070007 
0001 

[  466.38] page dumped because: kasan: bad access detected
[  466.379730]
[  466.380288] Memory state around the buggy address:
[  466.381844]  888372139100: fb fb fb fb fb fb fb fb fb fb fb fb 
fb fb fb fb
[  466.384009]  888372139180: fb fb fb fb fb fb fb fb fb fb fb fb 
fb fb fb fb
[  466.386131] >888372139200: fb fb fb fb fb fb fb fb fb fb fb fb 
fb fb fb fb

[  466.388257] ^
[  466.390234]  888372139280: fb fb fb fb fb fb fb fb fb fb fb fb 
fb fb fb fb
[  466.392512]  888372139300: fb fb fb fb fb fb fb fb fb fb fb fb 
fb fb fb fb
[  466.394667] 
==


tun_chr_read_iter() accessed the memory which freed by free_netdev()
called by tun_set_iff():

CPUACPUB
 tun_set_iff()
   alloc_netdev_mqs()
   tun_attach()
tun_chr_read_iter()
  tun_get()
   register_netdevice()
   tun_d

Re: [PATCH] tun: fix use-after-free when register netdev failed

2019-08-15 Thread Yang Yingliang




On 2019/8/15 17:35, Eric Dumazet wrote:


On 8/15/19 10:18 AM, Yang Yingliang wrote:

I got a UAF repport in tun driver when doing fuzzy test:


[  466.368604] page:ea000dc84e00 refcount:1 mapcount:0 
mapping:8883df1b4f00 index:0x0 compound_mapcount: 0
[  466.371582] flags: 0x2f80010200(slab|head)
[  466.372910] raw: 002f80010200 dead0100 dead0122 
8883df1b4f00
[  466.375209] raw:  00070007 0001 

[  466.38] page dumped because: kasan: bad access detected
[  466.379730]
[  466.380288] Memory state around the buggy address:
[  466.381844]  888372139100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb
[  466.384009]  888372139180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb
[  466.386131] >888372139200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb
[  466.388257]  ^
[  466.390234]  888372139280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb
[  466.392512]  888372139300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb
[  466.394667] 
==

tun_chr_read_iter() accessed the memory which freed by free_netdev()
called by tun_set_iff():

CPUACPUB
 tun_set_iff()
   alloc_netdev_mqs()
   tun_attach()
tun_chr_read_iter()
  tun_get()
   register_netdevice()
   tun_detach_all()
 synchronize_net()
  tun_do_read()
tun_ring_recv()
  schedule()
   free_netdev()
  tun_put() <-- UAF

UAF on what exactly ? The dev_hold() should prevent the free_netdev().


register_netdevice() is failed, so the dev is freed directly in free_netdev
().




Set a new bit in tun->flag if register_netdevice() successed,
without this bit, tun_get() returns NULL to avoid using a
freed tun pointer.

Fixes: eb0fb363f920 ("tuntap: attach queue 0 before registering netdevice")
Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
  drivers/net/tun.c | 10 --
  1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..cbd60c276c40 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -115,6 +115,7 @@ do {
\
  /* High bits in flags field are unused. */
  #define TUN_VNET_LE 0x8000
  #define TUN_VNET_BE 0x4000
+#define TUN_DEV_REGISTERED 0x2000
  
  #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \

  IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
@@ -719,8 +720,10 @@ static void __tun_detach(struct tun_file *tfile, bool 
clean)
netif_carrier_off(tun->dev);
  
  			if (!(tun->flags & IFF_PERSIST) &&

-   tun->dev->reg_state == NETREG_REGISTERED)
+   tun->dev->reg_state == NETREG_REGISTERED) {
unregister_netdevice(tun->dev);
+   tun->flags &= ~TUN_DEV_REGISTERED;

Isn't this done too late ?


+   }
}
if (tun)
xdp_rxq_info_unreg(&tfile->xdp_rxq);
@@ -884,8 +887,10 @@ static struct tun_struct *tun_get(struct tun_file *tfile)
  
  	rcu_read_lock();

tun = rcu_dereference(tfile->tun);
-   if (tun)
+   if (tun && (tun->flags & TUN_DEV_REGISTERED))
dev_hold(tun->dev);
+   else
+   tun = NULL;
rcu_read_unlock();
  
  	return tun;

@@ -2836,6 +2841,7 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
err = register_netdevice(tun->dev);
if (err < 0)
goto err_detach;
+   tun->flags |= TUN_DEV_REGISTERED;
}
  
  	netif_carrier_on(tun->dev);




So tun_get() will return NULL as long as  tun_set_iff() (TUNSETIFF ioctl()) has 
not yet been called ?

This could break some applications, since tun_get() is used from poll() and 
other syscalls.

I will try Wang's sugguestion later, if it's OK, I will drop this patch.


.






[PATCH] tun: fix use-after-free when register netdev failed

2019-08-15 Thread Yang Yingliang
onize_net()
  tun_do_read()
tun_ring_recv()
  schedule()
  free_netdev()
  tun_put() <-- UAF

Set a new bit in tun->flag if register_netdevice() successed,
without this bit, tun_get() returns NULL to avoid using a
freed tun pointer.

Fixes: eb0fb363f920 ("tuntap: attach queue 0 before registering netdevice")
Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 drivers/net/tun.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..cbd60c276c40 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -115,6 +115,7 @@ do {
\
 /* High bits in flags field are unused. */
 #define TUN_VNET_LE 0x8000
 #define TUN_VNET_BE 0x4000
+#define TUN_DEV_REGISTERED 0x2000
 
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
  IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
@@ -719,8 +720,10 @@ static void __tun_detach(struct tun_file *tfile, bool 
clean)
netif_carrier_off(tun->dev);
 
if (!(tun->flags & IFF_PERSIST) &&
-   tun->dev->reg_state == NETREG_REGISTERED)
+   tun->dev->reg_state == NETREG_REGISTERED) {
unregister_netdevice(tun->dev);
+   tun->flags &= ~TUN_DEV_REGISTERED;
+   }
}
if (tun)
xdp_rxq_info_unreg(&tfile->xdp_rxq);
@@ -884,8 +887,10 @@ static struct tun_struct *tun_get(struct tun_file *tfile)
 
rcu_read_lock();
tun = rcu_dereference(tfile->tun);
-   if (tun)
+   if (tun && (tun->flags & TUN_DEV_REGISTERED))
dev_hold(tun->dev);
+   else
+   tun = NULL;
rcu_read_unlock();
 
return tun;
@@ -2836,6 +2841,7 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
err = register_netdevice(tun->dev);
if (err < 0)
goto err_detach;
+   tun->flags |= TUN_DEV_REGISTERED;
}
 
netif_carrier_on(tun->dev);
-- 
2.17.1



Re: [PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk

2016-04-12 Thread Yang Yingliang



On 2016/4/12 10:59, Yang Yingliang wrote:



On 2016/4/11 20:13, Eric Dumazet wrote:

On Mon, 2016-04-11 at 19:57 +0800, Yang Yingliang wrote:


On 2016/4/8 22:44, Eric Dumazet wrote:

On Fri, 2016-04-08 at 19:18 +0800, Yang Yingliang wrote:


I expand  tcp_adv_win_scale and tcp_rmem. It has no effect.


Try :

echo -2 >/proc/sys/net/ipv4/tcp_adv_win_scale

And restart your flows.


cat /proc/sys/net/ipv4/tcp_rmem
10240 2097152 10485760


What about leaving the default values ?

I tried, it did not work.



$ cat /proc/sys/net/ipv4/tcp_rmem
4096873806291456



echo 102400 20971520 104857600 > /proc/sys/net/ipv4/tcp_rmem
echo -2 >/proc/sys/net/ipv4/tcp_adv_win_scale

It seems has not effect.



I have no idea what you did on the sender side to allow it to send more
than 1.5 MB then.


We are doing performance test. The sender send 256KB per-block with 128
threads to one socket. And the receiver uses 10Gb NIC to handle the
data on ARM64. The data flow is driver->ip layer->tcp layer->iscsi.

I added some debug messages and found handling backlog packets in
__release_sock() cost about 11ms at most. This can cause backlog queue
overflow. The sk_data_ready is re-assigned, it may cost time in our
program. I will check it out.


I traced the cost cycles of handling backlog packets in
__release_sock().
16.97 ms to handling about 12MB backlog packets, of which 13.66ms to do
sk_data_ready.
The speed of handling packets in TCP is 5.65Gb/s which is smaller than
the NIC's bandwidth. So the packets will be dropped.

If the cost of sk_data_read cannot be reduced, do we have other choice
exclude dropping packets ?






Re: [PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk

2016-04-11 Thread Yang Yingliang



On 2016/4/11 20:13, Eric Dumazet wrote:

On Mon, 2016-04-11 at 19:57 +0800, Yang Yingliang wrote:


On 2016/4/8 22:44, Eric Dumazet wrote:

On Fri, 2016-04-08 at 19:18 +0800, Yang Yingliang wrote:


I expand  tcp_adv_win_scale and tcp_rmem. It has no effect.


Try :

echo -2 >/proc/sys/net/ipv4/tcp_adv_win_scale

And restart your flows.


cat /proc/sys/net/ipv4/tcp_rmem
10240 2097152 10485760


What about leaving the default values ?

I tried, it did not work.



$ cat /proc/sys/net/ipv4/tcp_rmem
409687380   6291456



echo 102400 20971520 104857600 > /proc/sys/net/ipv4/tcp_rmem
echo -2 >/proc/sys/net/ipv4/tcp_adv_win_scale

It seems has not effect.



I have no idea what you did on the sender side to allow it to send more
than 1.5 MB then.


We are doing performance test. The sender send 256KB per-block with 128
threads to one socket. And the receiver uses 10Gb NIC to handle the
data on ARM64. The data flow is driver->ip layer->tcp layer->iscsi.

I added some debug messages and found handling backlog packets in 
__release_sock() cost about 11ms at most. This can cause backlog queue

overflow. The sk_data_ready is re-assigned, it may cost time in our
program. I will check it out.




Re: [PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk

2016-04-11 Thread Yang Yingliang



On 2016/4/9 1:04, Eric Dumazet wrote:

On Fri, 2016-04-08 at 12:53 -0400, David Miller wrote:

From: Eric Dumazet 
Date: Fri, 08 Apr 2016 07:44:25 -0700


On Fri, 2016-04-08 at 19:18 +0800, Yang Yingliang wrote:


I expand  tcp_adv_win_scale and tcp_rmem. It has no effect.


Try :

echo -2 >/proc/sys/net/ipv4/tcp_adv_win_scale

And restart your flows.


I'm honestly beginning to suspect a bug in their driver and how they
handle skb->truesize.

Yang, until you show us the driver you are using and how is handles
receive packets, we are largely in the dark about a major component
of this issue and that is entirely unfair to us.


Apparently their skb->truesize and skb->len combinations are correct.

I suspect an issue with rcvbuf autouning on a bidirectional tcp traffic.
We mostly focus on unidirectional flows, but they seem to use a mixed
case.

Also, fact that sendmsg() locks the socket for the duration of the call
is problematic : I suspect their issues would mostly disappear by using
smaller chunk sizes (ie 64KB per sendmsg() instead of 256KB).

It's less packets dropping with using 64KB chunk.



We also could add resched points in sendmsg() (processing backlog if it
gets too hot), but I fear this would slow down the fast path.









Re: [PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk

2016-04-11 Thread Yang Yingliang



On 2016/4/8 22:44, Eric Dumazet wrote:

On Fri, 2016-04-08 at 19:18 +0800, Yang Yingliang wrote:


I expand  tcp_adv_win_scale and tcp_rmem. It has no effect.


Try :

echo -2 >/proc/sys/net/ipv4/tcp_adv_win_scale

And restart your flows.


cat /proc/sys/net/ipv4/tcp_rmem
10240 2097152 10485760

echo 102400 20971520 104857600 > /proc/sys/net/ipv4/tcp_rmem
echo -2 >/proc/sys/net/ipv4/tcp_adv_win_scale

It seems has not effect.



Re: [PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk

2016-04-08 Thread Yang Yingliang



On 2016/4/7 22:51, Eric Dumazet wrote:

On Thu, 2016-04-07 at 03:21 -0700, Eric Dumazet wrote:


Please do not send patches before really understanding the issue you
have.

Having a backlog of 12506206 bytes is ridiculous. Dropping packets is
absolutely fine if this ever happens.

Something is really wrong on your host, or the sender simply does not
comply with TCP protocol (not caring of receiver window at all)

Since you added a trace of truesize, please also trace skb->len



[2016-04-08 18:33:39][ 9748.726948] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:0, truesize:31992, len:17540
[2016-04-08 18:33:39][ 9748.726964] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:29326, truesize:18662, 
len:10240
[2016-04-08 18:33:39][ 9748.726986] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:0, truesize:39990, len:21920
[2016-04-08 18:33:39][ 9748.727028] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:0, truesize:58652, len:32140
[2016-04-08 18:33:39][ 9748.727068] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:0, truesize:58652, len:32140
[2016-04-08 18:33:39][ 9748.727082] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:21328, truesize:5332, len:2940
[2016-04-08 18:33:39][ 9748.727310] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:0, truesize:53320, len:29220
[2016-04-08 18:33:39][ 9748.727326] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:26660, truesize:7998, len:4400
[2016-04-08 18:33:39][ 9748.727352] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:47988, truesize:58652, 
len:32140
[2016-04-08 18:33:39][ 9748.727389] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:0, truesize:39990, len:21920
[2016-04-08 18:33:39][ 9748.727409] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:58652, truesize:18662, 
len:10240


If I expand buffer 5 times((sndbuf+rcvbuf)*5). There are only 5M data in 
backlog at most.


[2016-04-08 18:33:39][ 9748.43] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5435954 rmem_alloc:0, truesize:55986, len:30680
[2016-04-08 18:33:39][ 9748.62] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5457282 rmem_alloc:58652, truesize:21328, 
len:11700
[2016-04-08 18:33:39][ 9748.777804] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5515934 rmem_alloc:55986, truesize:58652, 
len:32140
[2016-04-08 18:33:39][ 9748.777818] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5537262 rmem_alloc:0, truesize:21328, len:11700
[2016-04-08 18:33:39][ 9748.777839] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5574586 rmem_alloc:0, truesize:37324, len:20460
[2016-04-08 18:33:39][ 9748.777854] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5601246 rmem_alloc:58652, truesize:26660, 
len:14620
[2016-04-08 18:33:39][ 9748.777881] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5659898 rmem_alloc:21328, truesize:58652, 
len:32140
[2016-04-08 18:33:39][ 9748.777894] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5675894 rmem_alloc:37324, truesize:15996, len:8780
[2016-04-08 18:33:39][ 9748.778047] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:58652 rmem_alloc:0, truesize:58652, len:32140
[2016-04-08 18:33:39][ 9748.778075] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:117304 rmem_alloc:0, truesize:58652, len:32140
[2016-04-08 18:33:39][ 9748.778084] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:122636 rmem_alloc:0, truesize:5332, len:2940
[2016-04-08 18:33:39][ 9748.778109] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:175956 rmem_alloc:0, truesize:53320, len:29220
[2016-04-08 18:33:39][ 9748.778156] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:234608 rmem_alloc:0, truesize:58652, len:32140
[2016-04-08 18:33:39][ 9748.778178] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:282596 rmem_alloc:58652, truesize:47988, len:26300


BTW, have you played with /proc/sys/net/ipv4/tcp_adv_win_scale ?



I expand  tcp_adv_win_scale and tcp_rmem. It has no effect.









Re: [PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk

2016-04-06 Thread Yang Yingliang



On 2016/3/30 21:47, Eric Dumazet wrote:

On Wed, 2016-03-30 at 13:56 +0800, Yang Yingliang wrote:


Sorry, I made a mistake. I am very sure my kernel has these two patches.
And I can get some dropping of the packets in 10Gb eth.

# netstat -s | grep -i backlog
  TCPBacklogDrop: 4135
# netstat -s | grep -i backlog
  TCPBacklogDrop: 4167


Sender will retransmit and the receiver backlog will lilely be emptied
before the packets arrive again.

Are you sure these are TCP drops ?

Yes.



Which 10Gb NIC is it ? (ethtool -i eth0)

The NIC driver is not upstream. And my system is arm64.



What is the max size of sendmsg() chunks are generated by your apps ?

256KB



Are they forcing small SO_RCVBUF or SO_SNDBUF ?

I am not sure.
I add some debug message in kernel:
[2016-04-06 10:56:55][ 1365.477140] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12402232 rmem_alloc:0 truesize:53320
[2016-04-06 10:56:55][ 1365.477170] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12460884 rmem_alloc:55986 truesize:58652
[2016-04-06 10:56:55][ 1365.477192] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12506206 rmem_alloc:0 truesize:45322
[2016-04-06 10:56:55][ 1365.477226] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12519536 rmem_alloc:7998 truesize:13330
[2016-04-06 10:56:55][ 1365.477254] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12575522 rmem_alloc:0 truesize:55986
[2016-04-06 10:56:55][ 1365.477282] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12634174 rmem_alloc:0 truesize:58652
[2016-04-06 10:56:55][ 1365.477301] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12634174 rmem_alloc:26660 truesize:31992
[2016-04-06 10:56:55][ 1365.477321] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12634174 rmem_alloc:58652 truesize:26660
[2016-04-06 10:56:55][ 1365.477341] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12634174 rmem_alloc:58652 truesize:42656
[2016-04-06 10:56:55][ 1365.477384] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12634174 rmem_alloc:0 truesize:58652
[2016-04-06 10:56:55][ 1365.477403] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12634174 rmem_alloc:0 truesize:34658




What percentage of drops do you have ?

netstat -s | grep -i TCPBacklogDrop increases 20-40 per second.
It's about 1.2% (117724(TCPBacklogDrop)/214502873(InSegs of cat 
/proc/net/snmp)).




Here (at Google), we have less than one backlog drop per billion
packets, on host facing the public Internet.

If a TCP sender sends a burst of tiny packets because it is misbehaving,
you absolutely will drop packets, especially if applications use
sendmsg() with very big lengths and big SO_SNDBUF.

Trying to not drop these hostile packets as you did is simply opening
your host to DOS attacks.

Eventually, we should even drop earlier in TCP stack (before taking
socket lock).



How about expand the buffer like:

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6d204f3..da1bc16 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -281,6 +281,7 @@ extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
 extern int sysctl_tcp_invalid_ratelimit;
+extern int sysctl_tcp_backlog_buf_multi;

 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index f0e8297..9511410 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -631,6 +631,13 @@ static struct ctl_table ipv4_table[] = {
.mode   = 0644,
.proc_handler   = proc_dointvec
},
+   {
+   .procname   = "tcp_backlog_buf_multi",
+   .data   = &sysctl_tcp_backlog_buf_multi,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec,
+   },
 #ifdef CONFIG_NETLABEL
{
.procname   = "cipso_cache_enable",
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 87463c8..337ad55 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -101,6 +101,8 @@ int sysctl_tcp_thin_dupack __read_mostly;
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
+int sysctl_tcp_backlog_buf_multi __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_backlog_buf_multi);

 #define FLAG_DATA  0x01 /* Incoming frame contained data.  
*/
 #define FLAG_WIN_UPDATE0x02 /* Incoming ACK was a window 
update.   */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 13b92d5..39272f3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1635,7 +1635,8 @@ process:
   

Re: [PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk

2016-04-06 Thread Yang Yingliang



On 2016/3/30 20:56, Sergei Shtylyov wrote:

Hello.

On 3/30/2016 8:16 AM, Yang Yingliang wrote:


When task A hold the sk owned in tcp_sendmsg, if lots of packets
arrive and the packets will be added to backlog queue. The packets
will be handled in release_sock called from tcp_sendmsg. When the
sk_backlog is removed from sk, the length will not decrease until
all the packets in backlog queue are handled. This may leads to the
new packets be dropped because the lenth is too big. So set the
lenth to 0 immediately after it's detached from sk.


Length?


Signed-off-by: Yang Yingliang 

[...]

MBR, Sergei



Yes. It's a typo.

Thanks
Yang



Re: [PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk

2016-03-29 Thread Yang Yingliang



On 2016/3/30 13:34, Eric Dumazet wrote:

On Tue, 2016-03-29 at 22:25 -0700, Eric Dumazet wrote:

On Wed, 2016-03-30 at 13:16 +0800, Yang Yingliang wrote:

When task A hold the sk owned in tcp_sendmsg, if lots of packets
arrive and the packets will be added to backlog queue. The packets
will be handled in release_sock called from tcp_sendmsg. When the
sk_backlog is removed from sk, the length will not decrease until
all the packets in backlog queue are handled. This may leads to the
new packets be dropped because the lenth is too big. So set the
lenth to 0 immediately after it's detached from sk.

Signed-off-by: Yang Yingliang 
---
  net/core/sock.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/net/core/sock.c b/net/core/sock.c
index 47fc8bb..108be05 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1933,6 +1933,7 @@ static void __release_sock(struct sock *sk)

do {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
+   sk->sk_backlog.len = 0;
bh_unlock_sock(sk);

do {


Certainly not.

Have you really missed the comment ?

https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=8eae939f1400326b06d0c9afe53d2a484a326871


I do not believe the case you describe can happen, unless a misbehaving
driver cooks fat skb (with skb->truesize being far more bigger than
skb->len)



And also make sure you backported
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=da882c1f2ecadb0ed582628ec1585e36b137c0f0


Sorry, I made a mistake. I am very sure my kernel has these two patches.
And I can get some dropping of the packets in 10Gb eth.

# netstat -s | grep -i backlog
TCPBacklogDrop: 4135
# netstat -s | grep -i backlog
TCPBacklogDrop: 4167













Re: [PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk

2016-03-29 Thread Yang Yingliang



On 2016/3/30 13:25, Eric Dumazet wrote:

On Wed, 2016-03-30 at 13:16 +0800, Yang Yingliang wrote:

When task A hold the sk owned in tcp_sendmsg, if lots of packets
arrive and the packets will be added to backlog queue. The packets
will be handled in release_sock called from tcp_sendmsg. When the
sk_backlog is removed from sk, the length will not decrease until
all the packets in backlog queue are handled. This may leads to the
new packets be dropped because the lenth is too big. So set the
lenth to 0 immediately after it's detached from sk.

Signed-off-by: Yang Yingliang 
---
  net/core/sock.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/net/core/sock.c b/net/core/sock.c
index 47fc8bb..108be05 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1933,6 +1933,7 @@ static void __release_sock(struct sock *sk)

do {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
+   sk->sk_backlog.len = 0;
bh_unlock_sock(sk);

do {


Certainly not.

Have you really missed the comment ?

https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=8eae939f1400326b06d0c9afe53d2a484a326871


My kernel is 4.1 LTS, it seems don't have this patch. I will try this 
patch later.


Thanks
Yang



I do not believe the case you describe can happen, unless a misbehaving
driver cooks fat skb (with skb->truesize being far more bigger than
skb->len)











[PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk

2016-03-29 Thread Yang Yingliang
When task A hold the sk owned in tcp_sendmsg, if lots of packets
arrive and the packets will be added to backlog queue. The packets
will be handled in release_sock called from tcp_sendmsg. When the
sk_backlog is removed from sk, the length will not decrease until
all the packets in backlog queue are handled. This may leads to the
new packets be dropped because the lenth is too big. So set the
lenth to 0 immediately after it's detached from sk.

Signed-off-by: Yang Yingliang 
---
 net/core/sock.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/sock.c b/net/core/sock.c
index 47fc8bb..108be05 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1933,6 +1933,7 @@ static void __release_sock(struct sock *sk)
 
do {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
+   sk->sk_backlog.len = 0;
bh_unlock_sock(sk);
 
do {
-- 
2.5.0




A net device refcnt leak problem

2016-02-18 Thread Yang Yingliang

Hi,


I got a tap device refcnt leak message when I was detaching the device
and it's deadloop for waiting the usage count decrease to 0.

The log is:
unregister_netdevice: waiting for pae_tap0 to become free. Usage count = 1


Unfortunately, it happened only once unit now, I cannot reproduce this.
My kernel version is 3.10 LTS.

The attachment is value of struct net_device, tun_struct and
tun_file while the leak is occurred.

Does anyone have any thoughts or if there is a patch could fix this ?


Thanks,
Yang
struct net_device {
  name = "pae_tap0\000\000\000\000\000\000\000", 
  name_hlist = {
next = 0x0, 
pprev = 0xdead00200200
  }, 
  ifalias = 0x0, 
  mem_end = 0, 
  mem_start = 0, 
  base_addr = 0, 
  irq = 0, 
  state = 6, 
  dev_list = {
next = 0x88061b96c050, 
prev = 0xdead00200200
  }, 
  napi_list = {
next = 0x88060bf2c060, 
prev = 0x88060bf2c060
  }, 
  unreg_list = {
next = 0x88060bf2c070, 
prev = 0x88060bf2c070
  }, 
  upper_dev_list = {
next = 0x88060bf2c080, 
prev = 0x88060bf2c080
  }, 
  features = 8589953089, 
  hw_features = 8591722569, 
  wanted_features = 8591722569, 
  vlan_features = 1769577, 
  hw_enc_features = 1, 
  mpls_features = 1, 
  ifindex = 13, 
  iflink = 13, 
  stats = {
rx_packets = 0, 
tx_packets = 7, 
rx_bytes = 0, 
tx_bytes = 570, 
rx_errors = 0, 
tx_errors = 0, 
rx_dropped = 0, 
tx_dropped = 0, 
multicast = 0, 
collisions = 0, 
rx_length_errors = 0, 
rx_over_errors = 0, 
rx_crc_errors = 0, 
rx_frame_errors = 0, 
rx_fifo_errors = 0, 
rx_missed_errors = 0, 
tx_aborted_errors = 0, 
tx_carrier_errors = 0, 
tx_fifo_errors = 0, 
tx_heartbeat_errors = 0, 
tx_window_errors = 0, 
rx_compressed = 0, 
tx_compressed = 0
  }, 
  rx_dropped = {
counter = 0
  }, 
  wireless_handlers = 0x0, 
  wireless_data = 0x0, 
  netdev_ops = 0xa04004c0 , 
  ethtool_ops = 0xa0400940 , 
  header_ops = 0x816b5480 , 
  flags = 4098, 
  priv_flags = 1049600, 
  gflags = 0, 
  padded = 0, 
  operstate = 2 '\002', 
  link_mode = 0 '\000', 
  if_port = 0 '\000', 
  dma = 0 '\000', 
  mtu = 1500, 
  type = 1, 
  hard_header_len = 14, 
  needed_headroom = 0, 
  needed_tailroom = 0, 
  perm_addr = 
"\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",
 
  addr_assign_type = 1 '\001', 
  addr_len = 6 '\006', 
  neigh_priv_len = 0, 
  dev_id = 0, 
  dev_port = 0, 
  addr_list_lock = {
{
  rlock = {
raw_lock = {
  {
head_tail = 1441814, 
tickets = {
  head = 22, 
  tail = 22
}
  }
}
  }
}
  }, 
  uc = {
list = {
  next = 0x88060bf2c1f8, 
  prev = 0x88060bf2c1f8
}, 
count = 0
  }, 
  mc = {
list = {
  next = 0x88060bf2c210, 
  prev = 0x88060bf2c210
}, 
count = 0
  }, 
  dev_addrs = {
list = {
  next = 0x880619636a20, 
  prev = 0x880619636a20
}, 
count = 1
  }, 
  queues_kset = 0x8806196366c0, 
  uc_promisc = false, 
  promiscuity = 0, 
  allmulti = 0, 
  vlan_info = 0x0, 
  atalk_ptr = 0x0, 
  ip_ptr = 0x0, 
  dn_ptr = 0x0, 
  ip6_ptr = 0x0, 
  ax25_ptr = 0x0, 
  ieee80211_ptr = 0x0, 
  last_rx = 0, 
  dev_addr = 0x880619636a30 "\342\374\207[\342d", 
  _rx = 0x88061b8a1380, 
  num_rx_queues = 1, 
  real_num_rx_queues = 1, 
  rx_handler = 0x0, 
  rx_handler_data = 0x0, 
  ingress_queue = 0x0, 
  broadcast = 
"\377\377\377\377\377\377\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",
 
  _tx = 0x88060de3f200, 
  num_tx_queues = 1, 
  real_num_tx_queues = 1, 
  qdisc = 0x819e7e40 , 
  tx_queue_len = 500, 
  tx_global_lock = {
{
  rlock = {
raw_lock = {
  {
head_tail = 131074, 
tickets = {
  head = 2, 
  tail = 2
}
  }
}
  }
}
  }, 
  xps_maps = 0x0, 
  rx_cpu_rmap = 0x0, 
  trans_start = 4294707515, 
  watchdog_timeo = 0, 
  watchdog_timer = {
entry = {
  next = 0x0, 
  prev = 0x0
}, 
expires = 0, 
base = 0x88061c4fc000, 
function = 0x815212b0 , 
data = 18446612158284480512, 
slack = -1, 
start_pid = -1, 
start_site = 0x0, 
start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
  }, 
  pcpu_refcnt = 0x60f9d98029a4, 
  todo_list = {
next = 0xdead00100100, 
prev = 0xdead00200200
  }, 
  index_hlist = {
next = 0x0, 
pprev = 0xdead00200200
  }, 
  link_watch_list = {
next = 0x88060bf2c3c0, 
prev = 0x88060bf2c3c0
  }, 
  reg_state = NETREG_UNREGISTERED, 
  dismantle = true, 
  rtnl_link_state = RTNL_LINK_INITIALIZED, 
  destructor = 0xa03fd220 , 
  npi