[PATCH v4] can: mcp251xfd: replace sizeof(u32) with val_bytes in regmap
The sizeof(u32) is hardcoded. It's better to use the config value in regmap. It increases the size of target object, but it's flexible when new mcp chip need other val_bytes. Signed-off-by: Su Yanjun --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index f07e8b737d31..3dde52669343 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1308,6 +1308,7 @@ mcp251xfd_tef_obj_read(const struct mcp251xfd_priv *priv, const u8 offset, const u8 len) { const struct mcp251xfd_tx_ring *tx_ring = priv->tx; + int val_bytes = regmap_get_val_bytes(priv->map_rx); if (IS_ENABLED(CONFIG_CAN_MCP251XFD_SANITY) && (offset > tx_ring->obj_num || @@ -1322,7 +1323,7 @@ mcp251xfd_tef_obj_read(const struct mcp251xfd_priv *priv, return regmap_bulk_read(priv->map_rx, mcp251xfd_get_tef_obj_addr(offset), hw_tef_obj, - sizeof(*hw_tef_obj) / sizeof(u32) * len); + sizeof(*hw_tef_obj) / val_bytes * len); } static int mcp251xfd_handle_tefif(struct mcp251xfd_priv *priv) @@ -1511,11 +1512,12 @@ mcp251xfd_rx_obj_read(const struct mcp251xfd_priv *priv, const u8 offset, const u8 len) { int err; + int val_bytes = regmap_get_val_bytes(priv->map_rx); err = regmap_bulk_read(priv->map_rx, mcp251xfd_get_rx_obj_addr(ring, offset), hw_rx_obj, - len * ring->obj_size / sizeof(u32)); + len * ring->obj_size / val_bytes); return err; } @@ -2139,6 +2141,7 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id) struct mcp251xfd_priv *priv = dev_id; irqreturn_t handled = IRQ_NONE; int err; + int val_bytes = regmap_get_val_bytes(priv->map_reg); if (priv->rx_int) do { @@ -2162,7 +2165,7 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id) err = regmap_bulk_read(priv->map_reg, MCP251XFD_REG_INT, >regs_status, sizeof(priv->regs_status) / - sizeof(u32)); + val_bytes); if (err) goto out_fail; -- 2.25.1
[PATCH v1] can: mcp251xfd: Add some sysfs debug interfaces for registers r/w
When i debug mcp2518fd, some method to track registers is needed. This easy debug interface will be ok. For example, read a register at 0xe00: echo 0xe00 > can_get_reg cat can_get_reg write a register at 0xe00: echo 0xe00,0x60 > can_set_reg Signed-off-by: Su Yanjun --- .../net/can/spi/mcp251xfd/mcp251xfd-core.c| 132 ++ 1 file changed, 132 insertions(+) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index ab8aad0a7594..d65abe5505d5 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -27,6 +27,131 @@ #define DEVICE_NAME "mcp251xfd" +/* Add sysfs debug interface for easy to debug + * + * For example, + * + * - read a register + * echo 0xe00 > can_get_reg + * cat can_get_reg + * + * - write a register + * echo 0xe00,0x1 > can_set_reg + * + */ +static int reg_offset; + +static int __get_param(const char *buf, char *off, char *val) +{ + int len; + + if (!buf || !off || !val) + return -EINVAL; + + len = 0; + while (*buf != ',') { + *off++ = *buf++; + len++; + + if (len >= 16) + return -EINVAL; + } + + buf++; + + *off = '\0'; + + len = 0; + while (*buf) { + *val++ = *buf++; + len++; + + if (len >= 16) + return -EINVAL; + } + + *val = '\0'; + + return 0; +} + +static ssize_t can_get_reg_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int err; + u32 val; + struct mcp251xfd_priv *priv; + + priv = dev_get_drvdata(dev); + + err = regmap_read(priv->map_reg, reg_offset, ); + if (err) + return 0; + + return sprintf(buf, "reg = 0x%08x, val = 0x%08x\n", reg_offset, val); +} + +static ssize_t can_get_reg_store(struct device *dev, +struct device_attribute *attr, const char *buf, size_t len) +{ + u32 off; + + reg_offset = 0; + + if (kstrtouint(buf, 0, ) || (off % 4)) + return -EINVAL; + + reg_offset = off; + + return len; +} + +static ssize_t can_set_reg_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return 0; +} + +static ssize_t can_set_reg_store(struct device *dev, +struct device_attribute *attr, const char *buf, size_t len) +{ + struct mcp251xfd_priv *priv; + u32 off, val; + int err; + + char s1[16]; + char s2[16]; + + if (__get_param(buf, s1, s2)) + return -EINVAL; + + if (kstrtouint(s1, 0, ) || (off % 4)) + return -EINVAL; + + if (kstrtouint(s2, 0, )) + return -EINVAL; + + err = regmap_write(priv->map_reg, off, val); + if (err) + return -EINVAL; + + return len; +} + +static DEVICE_ATTR_RW(can_get_reg); +static DEVICE_ATTR_RW(can_set_reg); + +static struct attribute *can_attributes[] = { + _attr_can_get_reg.attr, + _attr_can_set_reg.attr, + NULL +}; + +static const struct attribute_group can_group = { + .attrs = can_attributes, + NULL +}; + static const struct mcp251xfd_devtype_data mcp251xfd_devtype_data_mcp2517fd = { .quirks = MCP251XFD_QUIRK_MAB_NO_WARN | MCP251XFD_QUIRK_CRC_REG | MCP251XFD_QUIRK_CRC_RX | MCP251XFD_QUIRK_CRC_TX | @@ -2944,6 +3069,12 @@ static int mcp251xfd_probe(struct spi_device *spi) if (err) goto out_free_candev; + err = sysfs_create_group(>dev.kobj, _group); + if (err) { + netdev_err(priv->ndev, "Create can group fail.\n"); + goto out_free_candev; + } + err = can_rx_offload_add_manual(ndev, >offload, MCP251XFD_NAPI_WEIGHT); if (err) @@ -2972,6 +3103,7 @@ static int mcp251xfd_remove(struct spi_device *spi) mcp251xfd_unregister(priv); spi->max_speed_hz = priv->spi_max_speed_hz_orig; free_candev(ndev); + sysfs_remove_group(>dev.kobj, _group); return 0; } -- 2.25.1
[PATCH v3] can: mcp251xfd: replace sizeof(u32) with val_bytes in regmap
The sizeof(u32) is hardcoded. It's better to use the config value in regmap. It increases the size of target object, but it's flexible when new mcp chip need other val_bytes. Signed-off-by: Su Yanjun --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index f07e8b737d31..3dde52669343 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1308,6 +1308,7 @@ mcp251xfd_tef_obj_read(const struct mcp251xfd_priv *priv, const u8 offset, const u8 len) { const struct mcp251xfd_tx_ring *tx_ring = priv->tx; + int val_bytes = regmap_get_val_bytes(priv->map_reg); if (IS_ENABLED(CONFIG_CAN_MCP251XFD_SANITY) && (offset > tx_ring->obj_num || @@ -1322,7 +1323,7 @@ mcp251xfd_tef_obj_read(const struct mcp251xfd_priv *priv, return regmap_bulk_read(priv->map_rx, mcp251xfd_get_tef_obj_addr(offset), hw_tef_obj, - sizeof(*hw_tef_obj) / sizeof(u32) * len); + sizeof(*hw_tef_obj) / val_bytes * len); } static int mcp251xfd_handle_tefif(struct mcp251xfd_priv *priv) @@ -1511,11 +1512,12 @@ mcp251xfd_rx_obj_read(const struct mcp251xfd_priv *priv, const u8 offset, const u8 len) { int err; + int val_bytes = regmap_get_val_bytes(priv->map_reg); err = regmap_bulk_read(priv->map_rx, mcp251xfd_get_rx_obj_addr(ring, offset), hw_rx_obj, - len * ring->obj_size / sizeof(u32)); + len * ring->obj_size / val_bytes); return err; } @@ -2139,6 +2141,7 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id) struct mcp251xfd_priv *priv = dev_id; irqreturn_t handled = IRQ_NONE; int err; + int val_bytes = regmap_get_val_bytes(priv->map_reg); if (priv->rx_int) do { @@ -2162,7 +2165,7 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id) err = regmap_bulk_read(priv->map_reg, MCP251XFD_REG_INT, >regs_status, sizeof(priv->regs_status) / - sizeof(u32)); + val_bytes); if (err) goto out_fail; -- 2.25.1
[PATCH v1] can: mcp251xfd: use regmap_bulk_write for compatibility
Recently i use mcp2518fd on 4.x kernel which multiple write is not backported, regmap_raw_write will cause old kernel crash because the tx buffer in driver is smaller then 2K. Use regmap_bulk_write instead for compatibility. Signed-off-by: Su Yanjun --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 3dde52669343..ab8aad0a7594 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -932,6 +932,7 @@ static int mcp251xfd_chip_ecc_init(struct mcp251xfd_priv *priv) void *ram; u32 val = 0; int err; + int val_bytes = regmap_get_val_bytes(priv->map_reg); ecc->ecc_stat = 0; @@ -947,8 +948,8 @@ static int mcp251xfd_chip_ecc_init(struct mcp251xfd_priv *priv) if (!ram) return -ENOMEM; - err = regmap_raw_write(priv->map_reg, MCP251XFD_RAM_START, ram, - MCP251XFD_RAM_SIZE); + err = regmap_bulk_write(priv->map_reg, MCP251XFD_RAM_START, ram, + MCP251XFD_RAM_SIZE / val_bytes); kfree(ram); return err; -- 2.25.1
[PATCH v2] can: mcp251xfd: replace sizeof(u32) with val_bytes in regmap
No functional effect. Signed-off-by: Su Yanjun --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 15 --- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index f07e8b737d31..b15bfd50b863 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -181,6 +181,12 @@ static int mcp251xfd_clks_and_vdd_disable(const struct mcp251xfd_priv *priv) return 0; } +static inline int +mcp251xfd_get_val_bytes(const struct mcp251xfd_priv *priv) +{ + return regmap_get_val_bytes(priv->map_reg); +} + static inline u8 mcp251xfd_cmd_prepare_write_reg(const struct mcp251xfd_priv *priv, union mcp251xfd_write_reg_buf *write_reg_buf, @@ -1308,6 +1314,7 @@ mcp251xfd_tef_obj_read(const struct mcp251xfd_priv *priv, const u8 offset, const u8 len) { const struct mcp251xfd_tx_ring *tx_ring = priv->tx; + int val_bytes = mcp251xfd_get_val_bytes(priv); if (IS_ENABLED(CONFIG_CAN_MCP251XFD_SANITY) && (offset > tx_ring->obj_num || @@ -1322,7 +1329,7 @@ mcp251xfd_tef_obj_read(const struct mcp251xfd_priv *priv, return regmap_bulk_read(priv->map_rx, mcp251xfd_get_tef_obj_addr(offset), hw_tef_obj, - sizeof(*hw_tef_obj) / sizeof(u32) * len); + sizeof(*hw_tef_obj) / val_bytes * len); } static int mcp251xfd_handle_tefif(struct mcp251xfd_priv *priv) @@ -1511,11 +1518,12 @@ mcp251xfd_rx_obj_read(const struct mcp251xfd_priv *priv, const u8 offset, const u8 len) { int err; + int val_bytes = mcp251xfd_get_val_bytes(priv); err = regmap_bulk_read(priv->map_rx, mcp251xfd_get_rx_obj_addr(ring, offset), hw_rx_obj, - len * ring->obj_size / sizeof(u32)); + len * ring->obj_size / val_bytes); return err; } @@ -2139,6 +2147,7 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id) struct mcp251xfd_priv *priv = dev_id; irqreturn_t handled = IRQ_NONE; int err; + int val_bytes = mcp251xfd_get_val_bytes(priv); if (priv->rx_int) do { @@ -2162,7 +2171,7 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id) err = regmap_bulk_read(priv->map_reg, MCP251XFD_REG_INT, >regs_status, sizeof(priv->regs_status) / - sizeof(u32)); + val_bytes); if (err) goto out_fail; -- 2.25.1
[PATCH v1] can: mcp251xfd: replace sizeof(u32) with val_bytes in regmap
No functional effect. Signed-off-by: Su Yanjun --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 15 --- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index f07e8b737d31..cc48ccee4694 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -181,6 +181,12 @@ static int mcp251xfd_clks_and_vdd_disable(const struct mcp251xfd_priv *priv) return 0; } +static inline int +mcp251xfd_get_val_bytes(const struct mcp251xfd_priv *priv) +{ + return priv->map_reg->format.val_bytes; +} + static inline u8 mcp251xfd_cmd_prepare_write_reg(const struct mcp251xfd_priv *priv, union mcp251xfd_write_reg_buf *write_reg_buf, @@ -1308,6 +1314,7 @@ mcp251xfd_tef_obj_read(const struct mcp251xfd_priv *priv, const u8 offset, const u8 len) { const struct mcp251xfd_tx_ring *tx_ring = priv->tx; + int val_bytes = mcp251xfd_get_val_bytes(priv); if (IS_ENABLED(CONFIG_CAN_MCP251XFD_SANITY) && (offset > tx_ring->obj_num || @@ -1322,7 +1329,7 @@ mcp251xfd_tef_obj_read(const struct mcp251xfd_priv *priv, return regmap_bulk_read(priv->map_rx, mcp251xfd_get_tef_obj_addr(offset), hw_tef_obj, - sizeof(*hw_tef_obj) / sizeof(u32) * len); + sizeof(*hw_tef_obj) / val_bytes * len); } static int mcp251xfd_handle_tefif(struct mcp251xfd_priv *priv) @@ -1511,11 +1518,12 @@ mcp251xfd_rx_obj_read(const struct mcp251xfd_priv *priv, const u8 offset, const u8 len) { int err; + int val_bytes = mcp251xfd_get_val_bytes(priv); err = regmap_bulk_read(priv->map_rx, mcp251xfd_get_rx_obj_addr(ring, offset), hw_rx_obj, - len * ring->obj_size / sizeof(u32)); + len * ring->obj_size / val_bytes); return err; } @@ -2139,6 +2147,7 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id) struct mcp251xfd_priv *priv = dev_id; irqreturn_t handled = IRQ_NONE; int err; + int val_bytes = mcp251xfd_get_val_bytes(priv); if (priv->rx_int) do { @@ -2162,7 +2171,7 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id) err = regmap_bulk_read(priv->map_reg, MCP251XFD_REG_INT, >regs_status, sizeof(priv->regs_status) / - sizeof(u32)); + val_bytes); if (err) goto out_fail; -- 2.25.1
About patch NFS: Fix O_DIRECT accounting of number of bytes read/written
Hi trond, Because My mail system cant receive nfs mail list’s mails, I reply your patch here. I have some question for the patch. >No. Basic O_DIRECT does not guarantee atomicity of requests, which is >why we do not have generic locking at the VFS level when reading and >writing. The only guarantee being offered is that O_DIRECT and buffered >writes do not collide. Do you mean other fs also cant guarantee atomicity of O_DIRECT request or just nfs? >IOW: I think the basic premise for this test is just broken (as I >commented in the patch series I sent) because it is assuming a >behaviour that is simply not guaranteed. So the generic/465 of xfstests can’t apply to nfs for now, am I right? >BTW: note that buffered writes have the same property. They are ordered >when being written into the page cache, meaning that reads on the same >client will see no holes, however if you try to read from another >client, then you will see the same behaviour, with temporary holes >magically appearing in the file. As you say, nfs buffered write also has the hole problem with multiple r/w on different clients. I want to know if the problem exists in other local fs such as xfs,ext4? Thanks in advance.
Re: [PATCH] NFS: Fix O_DIRECT read problem when another write is going on
在 2019/10/1 2:06, Trond Myklebust 写道: Hi Su, On Mon, 2019-09-30 at 17:11 +0800, Su Yanjun wrote: In xfstests generic/465 tests failed. Because O_DIRECT r/w use async rpc calls, when r/w rpc calls are running concurrently we may read partial data which is wrong. For example as follows. user buffer /\ || rpc0 rpc1 When rpc0 runs it encounters eof so return 0, then another writes something. When rpc1 runs it returns some data. The total data buffer contains wrong data. In this patch we check eof mark for each direct request. If encounters eof then set eof mark in the request, when we meet it again report -EAGAIN error. In nfs_direct_complete we convert -EAGAIN as if read nothing. When the reader issue another read it will read ok. Signed-off-by: Su Yanjun --- fs/nfs/direct.c | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 222d711..7f737a3 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -93,6 +93,7 @@ struct nfs_direct_req { bytes_left, /* bytes left to be sent */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ + int eof;/* eof mark in the req */ /* commit state */ struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ @@ -380,6 +381,12 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) { struct inode *inode = dreq->inode; + /* read partial data just as read nothing */ + if (dreq->error == -EAGAIN) { + dreq->count = 0; + dreq->error = 0; + } + inode_dio_end(inode); if (dreq->iocb) { @@ -413,8 +420,13 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) if (hdr->good_bytes != 0) nfs_direct_good_bytes(dreq, hdr); - if (test_bit(NFS_IOHDR_EOF, >flags)) + if (dreq->eof) + dreq->error = -EAGAIN; + + if (test_bit(NFS_IOHDR_EOF, >flags)) { dreq->error = 0; + dreq->eof = 1; + } spin_unlock(>lock); Thanks for looking into this issue. I agree with your analysis of what is going wrong in generic/465. However, I think the problem is greater than just EOF. I think we also need to look at the generic error handling, and ensure that it handles a truncated RPC call in the middle of a series of calls correctly. Please see the two patches I sent you just now and check if they fix the problem for you. The patchset you sent works for generic/465. Thanks a lot
[PATCH] NFS: Fix O_DIRECT read problem when another write is going on
In xfstests generic/465 tests failed. Because O_DIRECT r/w use async rpc calls, when r/w rpc calls are running concurrently we may read partial data which is wrong. For example as follows. user buffer /\ ||| rpc0 rpc1 When rpc0 runs it encounters eof so return 0, then another writes something. When rpc1 runs it returns some data. The total data buffer contains wrong data. In this patch we check eof mark for each direct request. If encounters eof then set eof mark in the request, when we meet it again report -EAGAIN error. In nfs_direct_complete we convert -EAGAIN as if read nothing. When the reader issue another read it will read ok. Signed-off-by: Su Yanjun --- fs/nfs/direct.c | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 222d711..7f737a3 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -93,6 +93,7 @@ struct nfs_direct_req { bytes_left, /* bytes left to be sent */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ + int eof;/* eof mark in the req */ /* commit state */ struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ @@ -380,6 +381,12 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) { struct inode *inode = dreq->inode; + /* read partial data just as read nothing */ + if (dreq->error == -EAGAIN) { + dreq->count = 0; + dreq->error = 0; + } + inode_dio_end(inode); if (dreq->iocb) { @@ -413,8 +420,13 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) if (hdr->good_bytes != 0) nfs_direct_good_bytes(dreq, hdr); - if (test_bit(NFS_IOHDR_EOF, >flags)) + if (dreq->eof) + dreq->error = -EAGAIN; + + if (test_bit(NFS_IOHDR_EOF, >flags)) { dreq->error = 0; + dreq->eof = 1; + } spin_unlock(>lock); -- 2.7.4
[PATCH net] ipv4: fix ifa_flags reuse problem in using ifconfig tool
When NetworkManager has already set ipv4 address then uses ifconfig set another ipv4 address. It will use previous ifa_flags that will cause device route not be inserted. As NetworkManager has already support IFA_F_NOPREFIXROUTE flag [1], but ifconfig will reuse the ifa_flags. It's weird especially some old scripts or program [2] still use ifconfig. [1] https://gitlab.freedesktop.org/NetworkManager/NetworkManager/ commit/fec80e7473ad16979af75ed299d68103e7aa3fe9 [2] LTP or TAHI Signed-off-by: Su Yanjun --- net/ipv4/devinet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a4b5bd4..56ca339 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1159,6 +1159,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; ifa->ifa_scope = 0; + ifa->ifa_flags = 0; } ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; -- 2.7.4
[tip:perf/urgent] perf/x86: Fix typo in comment
Commit-ID: 77d760328ee015cf89460c52bfd5a6b0a09b7472 Gitweb: https://git.kernel.org/tip/77d760328ee015cf89460c52bfd5a6b0a09b7472 Author: Su Yanjun AuthorDate: Fri, 16 Aug 2019 16:43:21 +0800 Committer: Ingo Molnar CommitDate: Mon, 19 Aug 2019 11:50:24 +0200 perf/x86: Fix typo in comment No functional change. Signed-off-by: Su Yanjun Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1565945001-4413-1-git-send-email-suyj.f...@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 81b005e4c7d9..325959d19d9a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1236,7 +1236,7 @@ void x86_pmu_enable_event(struct perf_event *event) * Add a single event to the PMU. * * The event is added to the group of enabled events - * but only if it can be scehduled with existing events. + * but only if it can be scheduled with existing events. */ static int x86_pmu_add(struct perf_event *event, int flags) {
[PATCH] perf/x86: Fix typo in core.c
No functional change related. Signed-off-by: Su Yanjun --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 81b005e..325959d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1236,7 +1236,7 @@ void x86_pmu_enable_event(struct perf_event *event) * Add a single event to the PMU. * * The event is added to the group of enabled events - * but only if it can be scehduled with existing events. + * but only if it can be scheduled with existing events. */ static int x86_pmu_add(struct perf_event *event, int flags) { -- 2.7.4
[PATCH net v3] net: ipv6: Fix a bug in ndisc_send_ns when netdev only has a global address
When the egress interface does not have a link local address, it can not communicate with other hosts. In RFC4861, 7.2.2 says "If the source address of the packet prompting the solicitation is the same as one of the addresses assigned to the outgoing interface, that address SHOULD be placed in the IP Source Address of the outgoing solicitation. Otherwise, any one of the addresses assigned to the interface should be used." In this patch we try get a global address if we get ll address failed. Signed-off-by: Su Yanjun --- Changes since V2: - Let banned_flags under the scope of its use. --- include/net/addrconf.h | 2 ++ net/ipv6/addrconf.c| 34 ++ net/ipv6/ndisc.c | 10 +++--- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 269ec27..eae1167 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -107,6 +107,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); +int ipv6_get_addr(struct net_device *dev, struct in6_addr *addr, + u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); bool inet_rcv_saddr_any(const struct sock *sk); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4ae17a9..9e537bd 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1873,6 +1873,40 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, return err; } +int __ipv6_get_addr(struct inet6_dev *idev, struct in6_addr *addr, + u32 banned_flags) +{ + struct inet6_ifaddr *ifp; + int err = -EADDRNOTAVAIL; + + list_for_each_entry(ifp, >addr_list, if_list) { + if (ifp->scope == 0 && + !(ifp->flags & banned_flags)) { + *addr = ifp->addr; + err = 0; + break; + } + } + return err; +} + +int ipv6_get_addr(struct net_device *dev, struct in6_addr *addr, + u32 banned_flags) +{ + struct inet6_dev *idev; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(>lock); + err = __ipv6_get_addr(idev, addr, banned_flags); + read_unlock_bh(>lock); + } + rcu_read_unlock(); + return err; +} + static int ipv6_count_addresses(const struct inet6_dev *idev) { const struct inet6_ifaddr *ifp; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 659ecf4e..fa58c6e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -592,9 +592,13 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, struct nd_msg *msg; if (!saddr) { - if (ipv6_get_lladdr(dev, _buf, - (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))) - return; + u32 banned_flags = IFA_F_TENTATIVE | IFA_F_OPTIMISTIC; + + if (ipv6_get_lladdr(dev, _buf, banned_flags)) { + /* try global address */ + if (ipv6_get_addr(dev, _buf, banned_flags)) + return; + } saddr = _buf; } -- 2.7.4
[PATCH net v2] net: ipv6: Fix a bug in ndisc_send_ns when netdev only has a global address
When the egress interface does not have a link local address, it can not communicate with other hosts. In RFC4861, 7.2.2 says "If the source address of the packet prompting the solicitation is the same as one of the addresses assigned to the outgoing interface, that address SHOULD be placed in the IP Source Address of the outgoing solicitation. Otherwise, any one of the addresses assigned to the interface should be used." In this patch we try get a global address if we get ll address failed. Signed-off-by: Su Yanjun --- Changes since V1: - Change patch description and code optimization at David Ahern's suggestion --- include/net/addrconf.h | 2 ++ net/ipv6/addrconf.c| 34 ++ net/ipv6/ndisc.c | 9 ++--- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index becdad5..ce1561e 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -107,6 +107,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); +int ipv6_get_addr(struct net_device *dev, struct in6_addr *addr, + u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); bool inet_rcv_saddr_any(const struct sock *sk); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 521e320..9467457 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1870,6 +1870,40 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, return err; } +int __ipv6_get_addr(struct inet6_dev *idev, struct in6_addr *addr, + u32 banned_flags) +{ + struct inet6_ifaddr *ifp; + int err = -EADDRNOTAVAIL; + + list_for_each_entry(ifp, >addr_list, if_list) { + if (ifp->scope == 0 && + !(ifp->flags & banned_flags)) { + *addr = ifp->addr; + err = 0; + break; + } + } + return err; +} + +int ipv6_get_addr(struct net_device *dev, struct in6_addr *addr, + u32 banned_flags) +{ + struct inet6_dev *idev; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(>lock); + err = __ipv6_get_addr(idev, addr, banned_flags); + read_unlock_bh(>lock); + } + rcu_read_unlock(); + return err; +} + static int ipv6_count_addresses(const struct inet6_dev *idev) { const struct inet6_ifaddr *ifp; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 083cc1c..156c027 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -603,11 +603,14 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, int inc_opt = dev->addr_len; int optlen = 0; struct nd_msg *msg; + u32 banned_flags = IFA_F_TENTATIVE | IFA_F_OPTIMISTIC; if (!saddr) { - if (ipv6_get_lladdr(dev, _buf, - (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))) - return; + if (ipv6_get_lladdr(dev, _buf, banned_flags)) { + /* try global address */ + if (ipv6_get_addr(dev, _buf, banned_flags)) + return; + } saddr = _buf; } -- 2.7.4
[PATCH net] net: ipv6: Fix a bug in ndisc_send_ns when netdev only has a global address
When we send mpls packets and the interface only has a manual global ipv6 address, then the two hosts cant communicate. I find that in ndisc_send_ns it only tries to get a ll address. In my case, the executive path is as below. ip6_output ->ip6_finish_output ->lwtunnel_xmit ->mpls_xmit ->neigh_resolve_output ->neigh_probe ->ndisc_solicit ->ndisc_send_ns In RFC4861, 7.2.2 says "If the source address of the packet prompting the solicitation is the same as one of the addresses assigned to the outgoing interface, that address SHOULD be placed in the IP Source Address of the outgoing solicitation. Otherwise, any one of the addresses assigned to the interface should be used." In this patch we try get a global address if we get ll address failed. Signed-off-by: Su Yanjun --- include/net/addrconf.h | 4 net/ipv6/addrconf.c| 34 ++ net/ipv6/ndisc.c | 8 ++-- 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index becdad5..006db8e 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -107,6 +107,10 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); +int __ipv6_get_addr(struct inet6_dev *idev, struct in6_addr *addr, + u32 banned_flags); +int ipv6_get_addr(struct net_device *dev, struct in6_addr *addr, + u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); bool inet_rcv_saddr_any(const struct sock *sk); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 521e320..4c0a43f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1870,6 +1870,40 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, return err; } +int __ipv6_get_addr(struct inet6_dev *idev, struct in6_addr *addr, + u32 banned_flags) +{ + struct inet6_ifaddr *ifp; + int err = -EADDRNOTAVAIL; + + list_for_each_entry_reverse(ifp, >addr_list, if_list) { + if (ifp->scope == 0 && + !(ifp->flags & banned_flags)) { + *addr = ifp->addr; + err = 0; + break; + } + } + return err; +} + +int ipv6_get_addr(struct net_device *dev, struct in6_addr *addr, + u32 banned_flags) +{ + struct inet6_dev *idev; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(>lock); + err = __ipv6_get_addr(idev, addr, banned_flags); + read_unlock_bh(>lock); + } + rcu_read_unlock(); + return err; +} + static int ipv6_count_addresses(const struct inet6_dev *idev) { const struct inet6_ifaddr *ifp; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 083cc1c..18ac2fb 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -606,8 +606,12 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, if (!saddr) { if (ipv6_get_lladdr(dev, _buf, - (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))) - return; + (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC))) { + /* try global address */ + if (ipv6_get_addr(dev, _buf, + (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC))) + return; + } saddr = _buf; } -- 2.7.4
[PATCH] udp: Fix typo in net/ipv4/udp.c
Signed-off-by: Su Yanjun --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c21862b..d88821c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2170,7 +2170,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, /* Initialize UDP checksum. If exited with zero value (success), * CHECKSUM_UNNECESSARY means, that no more checks are required. - * Otherwise, csum completion requires chacksumming packet body, + * Otherwise, csum completion requires checksumming packet body, * including udp header and folding it to skb->csum. */ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, -- 2.7.4
[PATCH] udp: Fix typo in udpv4/p.c
Signed-off-by: Su Yanjun --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c21862b..d88821c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2170,7 +2170,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, /* Initialize UDP checksum. If exited with zero value (success), * CHECKSUM_UNNECESSARY means, that no more checks are required. - * Otherwise, csum completion requires chacksumming packet body, + * Otherwise, csum completion requires checksumming packet body, * including udp header and folding it to skb->csum. */ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, -- 2.7.4
Re: [Problem]testOpenUpgradeLock test failed in nfsv4.0 in 5.2.0-rc7
Ang ping? 在 2019/7/3 9:34, Su Yanjun 写道: Hi Frank We tested the pynfs of NFSv4.0 on the latest version of the kernel (5.2.0-rc7). I encountered a problem while testing st_lock.testOpenUpgradeLock. The problem is now as follows: ** LOCK24 st_lock.testOpenUpgradeLock : FAILURE OP_LOCK should return NFS4_OK, instead got NFS4ERR_BAD_SEQID ** Is this normal? The case is as follows: Def testOpenUpgradeLock(t, env): """Try open, lock, open, downgrade, close FLAGS: all lock CODE: LOCK24 """ c= env.c1 C.init_connection() Os = open_sequence(c, t.code, lockowner="lockowner_LOCK24") Os.open(OPEN4_SHARE_ACCESS_READ) Os.lock(READ_LT) Os.open(OPEN4_SHARE_ACCESS_WRITE) Os.unlock() Os.downgrade(OPEN4_SHARE_ACCESS_WRITE) Os.lock(WRITE_LT) Os.close() After investigation, there was an error in unlock->lock. When unlocking, the lockowner of the file was not released, causing an error when locking again. Will nfs4.0 support 1) open-> 2) lock-> 3) unlock-> 4) lock this function?
[Problem]testOpenUpgradeLock test failed in nfsv4.0 in 5.2.0-rc7
Hi Frank We tested the pynfs of NFSv4.0 on the latest version of the kernel (5.2.0-rc7). I encountered a problem while testing st_lock.testOpenUpgradeLock. The problem is now as follows: ** LOCK24 st_lock.testOpenUpgradeLock : FAILURE OP_LOCK should return NFS4_OK, instead got NFS4ERR_BAD_SEQID ** Is this normal? The case is as follows: Def testOpenUpgradeLock(t, env): """Try open, lock, open, downgrade, close FLAGS: all lock CODE: LOCK24 """ c= env.c1 C.init_connection() Os = open_sequence(c, t.code, lockowner="lockowner_LOCK24") Os.open(OPEN4_SHARE_ACCESS_READ) Os.lock(READ_LT) Os.open(OPEN4_SHARE_ACCESS_WRITE) Os.unlock() Os.downgrade(OPEN4_SHARE_ACCESS_WRITE) Os.lock(WRITE_LT) Os.close() After investigation, there was an error in unlock->lock. When unlocking, the lockowner of the file was not released, causing an error when locking again. Will nfs4.0 support 1) open-> 2) lock-> 3) unlock-> 4) lock this function?
Re: [PATCH] sctp: Add rcu lock to protect dst entry in sctp_transport_route
在 2019/6/12 21:13, Neil Horman 写道: On Tue, Jun 11, 2019 at 10:33:17AM +0800, Su Yanjun wrote: 在 2019/6/10 19:12, Neil Horman 写道: On Mon, Jun 10, 2019 at 11:20:00AM +0800, Su Yanjun wrote: syzbot found a crash in rt_cache_valid. Problem is that when more threads release dst in sctp_transport_route, the route cache can be freed. As follows, p1: sctp_transport_route dst_release get_dst p2: sctp_transport_route dst_release get_dst ... If enough threads calling dst_release will cause dst->refcnt==0 then rcu softirq will reclaim the dst entry,get_dst then use the freed memory. This patch adds rcu lock to protect the dst_entry here. Fixes: 6e91b578bf3f("sctp: re-use sctp_transport_pmtu in sctp_transport_route") Signed-off-by: Su Yanjun Reported-by: syzbot+a9e23ea2aa21044c2...@syzkaller.appspotmail.com --- net/sctp/transport.c | 5 + 1 file changed, 5 insertions(+) diff --git a/net/sctp/transport.c b/net/sctp/transport.c index ad158d3..5ad7e20 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -308,8 +308,13 @@ void sctp_transport_route(struct sctp_transport *transport, struct sctp_association *asoc = transport->asoc; struct sctp_af *af = transport->af_specific; + /* When dst entry is being released, route cache may be referred +* again. Add rcu lock here to protect dst entry. +*/ + rcu_read_lock(); sctp_transport_dst_release(transport); af->get_dst(transport, saddr, >fl, sctp_opt2sk(opt)); + rcu_read_unlock(); What is the exact error that syzbot reported? This doesn't seem like it fixes BUG: KASAN: slab-out-of-bounds in rt_cache_valid+0x158/0x190 net/ipv4/route.c:1556 Read of size 2 at addr 8880654f3ac7 by task syz-executor.0/26603 CPU: 0 PID: 26603 Comm: syz-executor.0 Not tainted 5.2.0-rc2+ #9 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317 kasan_report+0x12/0x20 mm/kasan/common.c:614 __asan_report_load2_noabort+0x14/0x20 mm/kasan/generic_report.c:130 rt_cache_valid+0x158/0x190 net/ipv4/route.c:1556 __mkroute_output net/ipv4/route.c:2332 [inline] ip_route_output_key_hash_rcu+0x819/0x2d50 net/ipv4/route.c:2564 ip_route_output_key_hash+0x1ef/0x360 net/ipv4/route.c:2393 __ip_route_output_key include/net/route.h:125 [inline] ip_route_output_flow+0x28/0xc0 net/ipv4/route.c:2651 ip_route_output_key include/net/route.h:135 [inline] sctp_v4_get_dst+0x467/0x1260 net/sctp/protocol.c:435 sctp_transport_route+0x12d/0x360 net/sctp/transport.c:297 sctp_assoc_add_peer+0x53e/0xfc0 net/sctp/associola.c:663 sctp_process_param net/sctp/sm_make_chunk.c:2531 [inline] sctp_process_init+0x2491/0x2b10 net/sctp/sm_make_chunk.c:2344 sctp_cmd_process_init net/sctp/sm_sideeffect.c:667 [inline] sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1369 [inline] sctp_side_effects net/sctp/sm_sideeffect.c:1179 [inline] sctp_do_sm+0x3a30/0x50e0 net/sctp/sm_sideeffect.c:1150 sctp_assoc_bh_rcv+0x343/0x660 net/sctp/associola.c:1059 sctp_inq_push+0x1e4/0x280 net/sctp/inqueue.c:80 sctp_backlog_rcv+0x196/0xbe0 net/sctp/input.c:339 sk_backlog_rcv include/net/sock.h:945 [inline] __release_sock+0x129/0x390 net/core/sock.c:2412 release_sock+0x59/0x1c0 net/core/sock.c:2928 sctp_wait_for_connect+0x316/0x540 net/sctp/socket.c:9039 __sctp_connect+0xab2/0xcd0 net/sctp/socket.c:1226 __sctp_setsockopt_connectx+0x133/0x1a0 net/sctp/socket.c:1334 sctp_setsockopt_connectx_old net/sctp/socket.c:1350 [inline] sctp_setsockopt net/sctp/socket.c:4644 [inline] sctp_setsockopt+0x22c0/0x6d10 net/sctp/socket.c:4608 compat_sock_common_setsockopt+0x106/0x140 net/core/sock.c:3137 __compat_sys_setsockopt+0x185/0x380 net/compat.c:383 __do_compat_sys_setsockopt net/compat.c:396 [inline] __se_compat_sys_setsockopt net/compat.c:393 [inline] __ia32_compat_sys_setsockopt+0xbd/0x150 net/compat.c:393 do_syscall_32_irqs_on arch/x86/entry/common.c:337 [inline] do_fast_syscall_32+0x27b/0xd7d arch/x86/entry/common.c:408 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 RIP: 0023:0xf7ff5849 Code: 85 d2 74 02 89 0a 5b 5d c3 8b 04 24 c3 8b 14 24 c3 8b 3c 24 c3 90 90 90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 002b:f5df10cc EFLAGS: 0296 ORIG_RAX: 016e RAX: ffda RBX: 0007 RCX: 0084 RDX: 006b RSI: 2055bfe4 RDI: 001c RBP: R08: R09: R10: R11: R12: R13: R14: R15:
[PATCH] sctp: Add rcu lock to protect dst entry in sctp_transport_route
syzbot found a crash in rt_cache_valid. Problem is that when more threads release dst in sctp_transport_route, the route cache can be freed. As follows, p1: sctp_transport_route dst_release get_dst p2: sctp_transport_route dst_release get_dst ... If enough threads calling dst_release will cause dst->refcnt==0 then rcu softirq will reclaim the dst entry,get_dst then use the freed memory. This patch adds rcu lock to protect the dst_entry here. Fixes: 6e91b578bf3f("sctp: re-use sctp_transport_pmtu in sctp_transport_route") Signed-off-by: Su Yanjun Reported-by: syzbot+a9e23ea2aa21044c2...@syzkaller.appspotmail.com --- net/sctp/transport.c | 5 + 1 file changed, 5 insertions(+) diff --git a/net/sctp/transport.c b/net/sctp/transport.c index ad158d3..5ad7e20 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -308,8 +308,13 @@ void sctp_transport_route(struct sctp_transport *transport, struct sctp_association *asoc = transport->asoc; struct sctp_af *af = transport->af_specific; + /* When dst entry is being released, route cache may be referred +* again. Add rcu lock here to protect dst entry. +*/ + rcu_read_lock(); sctp_transport_dst_release(transport); af->get_dst(transport, saddr, >fl, sctp_opt2sk(opt)); + rcu_read_unlock(); if (saddr) memcpy(>saddr, saddr, sizeof(union sctp_addr)); -- 2.7.4
[PATCH v3] net: xfrm: Add '_rcu' tag for rcu protected pointer in netns_xfrm
For rcu protected pointers, we'd better add '__rcu' for them. Once added '__rcu' tag for rcu protected pointer, the sparse tool reports warnings. net/xfrm/xfrm_user.c:1198:39: sparse:expected struct sock *sk net/xfrm/xfrm_user.c:1198:39: sparse:got struct sock [noderef] *nlsk [...] So introduce a new wrapper function of nlmsg_unicast to handle type conversions. This patch also fixes a direct access of a rcu protected socket. Fixes: be33690d8fcf("[XFRM]: Fix aevent related crash") Signed-off-by: Su Yanjun --- Changes from v2: - add 'Fixes' tag and some description include/net/netns/xfrm.h | 2 +- net/xfrm/xfrm_user.c | 30 +++--- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 59f45b1..d2a36fb 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -57,7 +57,7 @@ struct netns_xfrm { struct list_headinexact_bins; - struct sock *nlsk; + struct sock __rcu *nlsk; struct sock *nlsk_stash; u32 sysctl_aevent_etime; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d832783..e8f23e4 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1071,6 +1071,22 @@ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb, return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC); } +/* A similar wrapper like xfrm_nlmsg_multicast checking that nlsk is still + * available. + */ +static inline int xfrm_nlmsg_unicast(struct net *net, struct sk_buff *skb, +u32 pid) +{ + struct sock *nlsk = rcu_dereference(net->xfrm.nlsk); + + if (!nlsk) { + kfree_skb(skb); + return -EPIPE; + } + + return nlmsg_unicast(nlsk, skb, pid); +} + static inline unsigned int xfrm_spdinfo_msgsize(void) { return NLMSG_ALIGN(4) @@ -1195,7 +1211,7 @@ static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_spdinfo(r_skb, net, sportid, seq, *flags); BUG_ON(err < 0); - return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); + return xfrm_nlmsg_unicast(net, r_skb, sportid); } static inline unsigned int xfrm_sadinfo_msgsize(void) @@ -1254,7 +1270,7 @@ static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_sadinfo(r_skb, net, sportid, seq, *flags); BUG_ON(err < 0); - return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); + return xfrm_nlmsg_unicast(net, r_skb, sportid); } static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1274,7 +1290,7 @@ static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); } else { - err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); + err = xfrm_nlmsg_unicast(net, resp_skb, NETLINK_CB(skb).portid); } xfrm_state_put(x); out_noput: @@ -1337,7 +1353,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); + err = xfrm_nlmsg_unicast(net, resp_skb, NETLINK_CB(skb).portid); out: xfrm_state_put(x); @@ -1903,8 +1919,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); } else { - err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, - NETLINK_CB(skb).portid); + err = xfrm_nlmsg_unicast(net, resp_skb, +NETLINK_CB(skb).portid); } } else { xfrm_audit_policy_delete(xp, err ? 0 : 1, true); @@ -2062,7 +2078,7 @@ static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_aevent(r_skb, x, ); BUG_ON(err < 0); - err = nlmsg_unicast(net->xfrm.nlsk, r_skb, NETLINK_CB(skb).portid); + err = xfrm_nlmsg_unicast(net, r_skb, NETLINK_CB(skb).portid); spin_unlock_bh(>lock); xfrm_state_put(x); return err; -- 2.7.4
Re: [PATCH] net: xfrm: Fix potential oops in xfrm_user_rcv_msg and array out of bounds
On 2019/3/5 15:31, Steffen Klassert wrote: On Tue, Mar 05, 2019 at 03:08:49PM +0800, Su Yanjun wrote: On 2019/3/5 14:49, Herbert Xu wrote: On Sun, Mar 03, 2019 at 10:47:39PM -0500, Su Yanjun wrote: When i review xfrm_user.c code, i found some potentical bug in it. In xfrm_user_rcvmsg if type parameter from user space is set to XFRM_MSG_MAX or XFRM_MSG_NEWSADINFO or XFRM_MSG_NEWSPDINFO. It will cause xfrm_user_rcv_msg referring to null entry in xfrm_dispatch array. Signed-off-by: Su Yanjun --- net/xfrm/xfrm_user.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index a131f9f..d832783 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2630,11 +2630,13 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, return -EOPNOTSUPP; type = nlh->nlmsg_type; - if (type > XFRM_MSG_MAX) + if (type >= XFRM_MSG_MAX) return -EINVAL; Your patch is wrong. Please check the definition of XFRM_MSG_MAX. I see, thanks for your reply. type -= XFRM_MSG_BASE; link = _dispatch[type]; + if (!link) + return -EOPNOTSUPP; Here **link** may refer to null entry for special types such as XFRM_MSG_MAX or XFRM_MSG_NEWSADINFO or XFRM_MSG_NEWSPDINFO Am i miss something? 'link' is always a valid pointer into that array. Thanks Su
[PATCH v2] net: xfrm: Add '_rcu' tag for rcu protected pointer in netns_xfrm
For rcu protected pointers, we'd better add '__rcu' for them. Once added '__rcu' tag for rcu protected pointer, the sparse tool reports warnings. net/xfrm/xfrm_user.c:1198:39: sparse:expected struct sock *sk net/xfrm/xfrm_user.c:1198:39: sparse:got struct sock [noderef] *nlsk [...] So introduce a new wrapper function of nlmsg_unicast to handle type conversions. No functional change. Signed-off-by: Su Yanjun --- Changes from v1: - fix spelling mistakes and the sparse tool' warnings include/net/netns/xfrm.h | 2 +- net/xfrm/xfrm_user.c | 30 +++--- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 59f45b1..d2a36fb 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -57,7 +57,7 @@ struct netns_xfrm { struct list_headinexact_bins; - struct sock *nlsk; + struct sock __rcu *nlsk; struct sock *nlsk_stash; u32 sysctl_aevent_etime; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d832783..e8f23e4 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1071,6 +1071,22 @@ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb, return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC); } +/* A similar wrapper like xfrm_nlmsg_multicast checking that nlsk is still + * available. + */ +static inline int xfrm_nlmsg_unicast(struct net *net, struct sk_buff *skb, +u32 pid) +{ + struct sock *nlsk = rcu_dereference(net->xfrm.nlsk); + + if (!nlsk) { + kfree_skb(skb); + return -EPIPE; + } + + return nlmsg_unicast(nlsk, skb, pid); +} + static inline unsigned int xfrm_spdinfo_msgsize(void) { return NLMSG_ALIGN(4) @@ -1195,7 +1211,7 @@ static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_spdinfo(r_skb, net, sportid, seq, *flags); BUG_ON(err < 0); - return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); + return xfrm_nlmsg_unicast(net, r_skb, sportid); } static inline unsigned int xfrm_sadinfo_msgsize(void) @@ -1254,7 +1270,7 @@ static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_sadinfo(r_skb, net, sportid, seq, *flags); BUG_ON(err < 0); - return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); + return xfrm_nlmsg_unicast(net, r_skb, sportid); } static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1274,7 +1290,7 @@ static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); } else { - err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); + err = xfrm_nlmsg_unicast(net, resp_skb, NETLINK_CB(skb).portid); } xfrm_state_put(x); out_noput: @@ -1337,7 +1353,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); + err = xfrm_nlmsg_unicast(net, resp_skb, NETLINK_CB(skb).portid); out: xfrm_state_put(x); @@ -1903,8 +1919,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); } else { - err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, - NETLINK_CB(skb).portid); + err = xfrm_nlmsg_unicast(net, resp_skb, +NETLINK_CB(skb).portid); } } else { xfrm_audit_policy_delete(xp, err ? 0 : 1, true); @@ -2062,7 +2078,7 @@ static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_aevent(r_skb, x, ); BUG_ON(err < 0); - err = nlmsg_unicast(net->xfrm.nlsk, r_skb, NETLINK_CB(skb).portid); + err = xfrm_nlmsg_unicast(net, r_skb, NETLINK_CB(skb).portid); spin_unlock_bh(>lock); xfrm_state_put(x); return err; -- 2.7.4
[PATCH] net: xfrm: Add '_rcu' tag for rcu protected pointer in netns_xfrm
For rcu protected pointers, we'd bettter add '__rcu' for them. No functional change. Signed-off-by: Su Yanjun --- include/net/netns/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 59f45b1..d2a36fb 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -57,7 +57,7 @@ struct netns_xfrm { struct list_headinexact_bins; - struct sock *nlsk; + struct sock __rcu *nlsk; struct sock *nlsk_stash; u32 sysctl_aevent_etime; -- 2.7.4
[PATCH] net: xfrm: Fix potential oops in xfrm_user_rcv_msg and array out of bounds
When i review xfrm_user.c code, i found some potentical bug in it. In xfrm_user_rcvmsg if type parameter from user space is set to XFRM_MSG_MAX or XFRM_MSG_NEWSADINFO or XFRM_MSG_NEWSPDINFO. It will cause xfrm_user_rcv_msg referring to null entry in xfrm_dispatch array. Signed-off-by: Su Yanjun --- net/xfrm/xfrm_user.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index a131f9f..d832783 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2630,11 +2630,13 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, return -EOPNOTSUPP; type = nlh->nlmsg_type; - if (type > XFRM_MSG_MAX) + if (type >= XFRM_MSG_MAX) return -EINVAL; type -= XFRM_MSG_BASE; link = _dispatch[type]; + if (!link) + return -EOPNOTSUPP; /* All operations require privileges, even GET */ if (!netlink_net_capable(skb, CAP_NET_ADMIN)) -- 2.7.4
Re: [PATCH] netfilter: nf_ct_helper: Fix possible panic when nf_conntrack_helper_unregister is used in an unloadable module
On 2019/3/1 16:43, Sergei Shtylyov wrote: Hello! On 01.03.2019 8:56, Su Yanjun wrote: From: Su Yanjun Because nf_conntrack_helper_unregister maybe used in an unloadable module, it uses 'synchronize_rcu' which may cause kernel panic. According to the artical: Article? I got it. RCU and Unloadable Modules https://lwn.net/Articles/217484/ When we have a heavy rcu callback load, then some of the callbacks might be deferred in order to allow other processing to proceed. sychnorize_rcu does not wait rcu callback complete and module may be unloaded before callback done. This patch uses rcu_barrier instead of synchronize_rcu will prevent this ^ that/which missed? Yes. situation. Signed-off-by: Su Yanjun [...] MBR, Sergei Thanks. Su
[PATCH] netfilter: nf_ct_helper: Fix possible panic when nf_conntrack_helper_unregister is used in an unloadable module
From: Su Yanjun Because nf_conntrack_helper_unregister maybe used in an unloadable module, it uses 'synchronize_rcu' which may cause kernel panic. According to the artical: RCU and Unloadable Modules https://lwn.net/Articles/217484/ When we have a heavy rcu callback load, then some of the callbacks might be deferred in order to allow other processing to proceed. sychnorize_rcu does not wait rcu callback complete and module may be unloaded before callback done. This patch uses rcu_barrier instead of synchronize_rcu will prevent this situation. Signed-off-by: Su Yanjun --- net/netfilter/nf_conntrack_helper.c | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 274baf1..0ee9378 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -397,8 +397,15 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) /* Make sure every nothing is still using the helper unless its a * connection in the hash. +* +* 'synchronize_rcu' may have problem when rcu callback function +* is used in unloadable modules. Use rcu_barrier instead, so that +* it will wait until rcu callback completes before modules are +* unloaded. +* More detail about rcu_barrier please see: +* https://lwn.net/Articles/217484/ */ - synchronize_rcu(); + rcu_barrier(); nf_ct_expect_iterate_destroy(expect_iter_me, NULL); nf_ct_iterate_destroy(unhelp, me); @@ -406,7 +413,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) /* Maybe someone has gotten the helper already when unhelp above. * So need to wait it. */ - synchronize_rcu(); + rcu_barrier(); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); -- 2.7.4
Re: [PATCH] xfs: correct statx's result_mask value
On 1/8/2019 1:07 PM, Darrick J. Wong wrote: On Tue, Jan 08, 2019 at 12:58:43PM +0800, Su Yanjun wrote: On 1/8/2019 2:04 AM, Eric Sandeen wrote: On 1/7/19 11:52 AM, Darrick J. Wong wrote: On Mon, Jan 07, 2019 at 04:53:10AM -0500, Su Yanjun wrote: For statx syscall, xfs return the wrong result_mask. Signed-off-by: Su Yanjun --- fs/xfs/xfs_iops.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f48ffd7..3811457 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -521,6 +521,9 @@ xfs_vn_getattr( stat->btime.tv_nsec = ip->i_d.di_crtime.t_nsec; } } + + /* Only return mask that we care */ + stat->result_mask &= request_mask; Why not just: stat->result_mask = STATX_BASIC_STATS; at the top of the function? I don't see the need to mask off result_mask at all, since we could some day elect to return more than what's in request_mask... When we run xfstests with nfs, the generic/423 case runs failed. So i review the nfs' nfs_getattr code it does validate the request_mask. Then i review the xfs' getattr code, it has no such check. Whatever request_mask is set, the stat's result_mask always the 0x7ff. Yes, statx can return more data than what userspace callers ask for: Maybe it has Unclear semantics about statx's result_mask. "A filesystem may also fill in fields that the caller didn't ask for if it has values for them available and the information is available at no extra cost. If this happens, the corresponding bits will be set in stx_mask." --D I get it, then the testcase generic/423 may need update in xfstests. Thanks for your reply. ...waitaminute, are you seeing garbage in the result_mask that's returned to userspace? I also noticed the vfs stat functions declare "struct kstat stat;" without explicitly zeroing the structure fields, which means (I think) that we can leak stack information if the kernel isn't built with the stackleak plugin? No such problem. A clear problem statement and reproducer steps would be hugely useful here. -Eric Thanks, Su
Re: [PATCH] xfs: correct statx's result_mask value
On 1/8/2019 2:04 AM, Eric Sandeen wrote: On 1/7/19 11:52 AM, Darrick J. Wong wrote: On Mon, Jan 07, 2019 at 04:53:10AM -0500, Su Yanjun wrote: For statx syscall, xfs return the wrong result_mask. Signed-off-by: Su Yanjun --- fs/xfs/xfs_iops.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f48ffd7..3811457 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -521,6 +521,9 @@ xfs_vn_getattr( stat->btime.tv_nsec = ip->i_d.di_crtime.t_nsec; } } + + /* Only return mask that we care */ + stat->result_mask &= request_mask; Why not just: stat->result_mask = STATX_BASIC_STATS; at the top of the function? I don't see the need to mask off result_mask at all, since we could some day elect to return more than what's in request_mask... When we run xfstests with nfs, the generic/423 case runs failed. So i review the nfs' nfs_getattr code it does validate the request_mask. Then i review the xfs' getattr code, it has no such check. Whatever request_mask is set, the stat's result_mask always the 0x7ff. Maybe it has Unclear semantics about statx's result_mask. ...waitaminute, are you seeing garbage in the result_mask that's returned to userspace? I also noticed the vfs stat functions declare "struct kstat stat;" without explicitly zeroing the structure fields, which means (I think) that we can leak stack information if the kernel isn't built with the stackleak plugin? No such problem. A clear problem statement and reproducer steps would be hugely useful here. -Eric Thanks, Su
[PATCH] xfs: correct statx's result_mask value
For statx syscall, xfs return the wrong result_mask. Signed-off-by: Su Yanjun --- fs/xfs/xfs_iops.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f48ffd7..3811457 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -521,6 +521,9 @@ xfs_vn_getattr( stat->btime.tv_nsec = ip->i_d.di_crtime.t_nsec; } } + + /* Only return mask that we care */ + stat->result_mask &= request_mask; if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; -- 2.7.4
[PATCH v2] vti4: Fix a ipip packet processing bug in 'IPCOMP' virtual tunnel
Recently we run a network test over ipcomp virtual tunnel.We find that if a ipv4 packet needs fragment, then the peer can't receive it. We deep into the code and find that when packet need fragment the smaller fragment will be encapsulated by ipip not ipcomp. So when the ipip packet goes into xfrm, it's skb->dev is not properly set. The ipv4 reassembly code always set skb'dev to the last fragment's dev. After ipv4 defrag processing, when the kernel rp_filter parameter is set, the skb will be drop by -EXDEV error. This patch adds compatible support for the ipip process in ipcomp virtual tunnel. Signed-off-by: Su Yanjun --- Based on Linus' master branch. Changes from v1: - use a separate handler for ipip packet processing in ipcomp virtual tunnel net/ipv4/ip_vti.c | 50 ++ 1 file changed, 50 insertions(+) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index de31b30..d155a51 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -74,6 +74,33 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, return 0; } +static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi, +int encap_type) +{ + struct ip_tunnel *tunnel; + const struct iphdr *iph = ip_hdr(skb); + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + if (tunnel) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + + skb->dev = tunnel->dev; + + return xfrm_input(skb, nexthdr, spi, encap_type); + } + + return -EINVAL; +drop: + kfree_skb(skb); + return 0; +} + static int vti_rcv(struct sk_buff *skb) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; @@ -82,6 +109,14 @@ static int vti_rcv(struct sk_buff *skb) return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); } +static int vti_rcv_ipip(struct sk_buff *skb) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0); +} + static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -429,6 +464,12 @@ static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .priority = 100, }; +static struct xfrm_tunnel ipip_handler __read_mostly = { + .handler= vti_rcv_ipip, + .err_handler= vti4_err, + .priority = 0, +}; + static int __net_init vti_init_net(struct net *net) { int err; @@ -597,6 +638,13 @@ static int __init vti_init(void) if (err < 0) goto xfrm_proto_comp_failed; + msg = "ipip tunnel"; + err = xfrm4_tunnel_register(_handler, AF_INET); + if (err < 0) { + pr_info("%s: cant't register tunnel\n",__func__); + goto xfrm_tunnel_failed; + } + msg = "netlink interface"; err = rtnl_link_register(_link_ops); if (err < 0) @@ -606,6 +654,8 @@ static int __init vti_init(void) rtnl_link_failed: xfrm4_protocol_deregister(_ipcomp4_protocol, IPPROTO_COMP); +xfrm_tunnel_failed: + xfrm4_tunnel_deregister(_handler, AF_INET); xfrm_proto_comp_failed: xfrm4_protocol_deregister(_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: -- 2.7.4
Re: [PATCH] vti4: Fix a ipip packet processing bug in 'IPCOMP' virtual tunnel
On 1/4/2019 3:43 PM, Steffen Klassert wrote: On Thu, Jan 03, 2019 at 07:48:41AM -0500, Su Yanjun wrote: Recently we run a network test over ipcomp virtual tunnel.We find that if a ipv4 packet needs fragment, then the peer can't receive it. We deep into the code and find that when packet need fragment the smaller fragment will be encapsulated by ipip not ipcomp. So when the ipip packet goes into xfrm, it's skb->dev is not properly set. The ipv4 reassembly code always set skb'dev to the last fragment's dev. After ipv4 defrag processing, when the kernel rp_filter parameter is set, the skb will be drop by -EXDEV error. Why not just leaving rp_filter disabled or in 'loose mode' if you use ipcomp? In my option rp_filter should not affect the ip_vti functionality. The root cause is the origin tunnel code doesn't update skb->dev. vti only cares about ipcomp, esp, ah packets. This patch adds compatible support for the ipip process in ipcomp virtual tunnel. Signed-off-by: Su Yanjun --- net/ipv4/ip_vti.c | 25 - 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index de31b30..63de2f6 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -65,6 +65,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + if (iph->protocol == IPPROTO_IPIP) + skb->dev = tunnel->dev; + return xfrm_input(skb, nexthdr, spi, encap_type); } @@ -76,10 +79,15 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, static int vti_rcv(struct sk_buff *skb) { + __be32 spi = 0; + XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + if (ip_hdr(skb)->protocol == IPPROTO_IPIP) + spi = ip_hdr(skb)->saddr; - return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); + return vti_input(skb, ip_hdr(skb)->protocol, spi, 0); You use the src address as spi, how is this supposed to work? This code derives from xfrm4_tunnel and i just want the vti can handle ipip packet as xfrm4 tunnel does. Thanks, Su
[PATCH] vti4: Fix a ipip packet processing bug in 'IPCOMP' virtual tunnel
Recently we run a network test over ipcomp virtual tunnel.We find that if a ipv4 packet needs fragment, then the peer can't receive it. We deep into the code and find that when packet need fragment the smaller fragment will be encapsulated by ipip not ipcomp. So when the ipip packet goes into xfrm, it's skb->dev is not properly set. The ipv4 reassembly code always set skb'dev to the last fragment's dev. After ipv4 defrag processing, when the kernel rp_filter parameter is set, the skb will be drop by -EXDEV error. This patch adds compatible support for the ipip process in ipcomp virtual tunnel. Signed-off-by: Su Yanjun --- net/ipv4/ip_vti.c | 25 - 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index de31b30..63de2f6 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -65,6 +65,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + if (iph->protocol == IPPROTO_IPIP) + skb->dev = tunnel->dev; + return xfrm_input(skb, nexthdr, spi, encap_type); } @@ -76,10 +79,15 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, static int vti_rcv(struct sk_buff *skb) { + __be32 spi = 0; + XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + if (ip_hdr(skb)->protocol == IPPROTO_IPIP) + spi = ip_hdr(skb)->saddr; - return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); + return vti_input(skb, ip_hdr(skb)->protocol, spi, 0); } static int vti_rcv_cb(struct sk_buff *skb, int err) @@ -429,6 +437,12 @@ static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .priority = 100, }; +static struct xfrm_tunnel ipip_handler __read_mostly = { + .handler= vti_rcv, + .err_handler= vti4_err, + .priority = 0, +}; + static int __net_init vti_init_net(struct net *net) { int err; @@ -597,6 +611,13 @@ static int __init vti_init(void) if (err < 0) goto xfrm_proto_comp_failed; + msg = "ipip tunnel"; + err = xfrm4_tunnel_register(_handler, AF_INET); + if (err < 0) { + pr_info("%s: cant't register tunnel\n",__func__); + goto xfrm_tunnel_failed; + } + msg = "netlink interface"; err = rtnl_link_register(_link_ops); if (err < 0) @@ -606,6 +627,8 @@ static int __init vti_init(void) rtnl_link_failed: xfrm4_protocol_deregister(_ipcomp4_protocol, IPPROTO_COMP); +xfrm_tunnel_failed: + xfrm4_tunnel_deregister(_handler, AF_INET); xfrm_proto_comp_failed: xfrm4_protocol_deregister(_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: -- 2.7.4
[PATCH] ipv6: fix typo in net/ipv6/reassembly.c
Signed-off-by: Su Yanjun --- net/ipv6/reassembly.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index a5bb59e..36a3d8d 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -210,7 +210,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, if (next && next->ip_defrag_offset < end) goto discard_fq; - /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + /* Note : skb->ip_defrag_offset and skb->sk share the same location */ dev = skb->dev; if (dev) fq->iif = dev->ifindex; -- 2.7.4