[V2][PATCH] vt: keyboard, fix uninitialized variables warning
drivers/tty/vt/keyboard.c: In function 'vt_do_kdgkb_ioctl': drivers/tty/vt/keyboard.c: warning: 'ret' may be used uninitialized in this function [-Wmaybe-uninitialized] return ret; ^~~ drivers/tty/vt/keyboard.c: warning: 'kbs' may be used uninitialized in this function [-Wmaybe-uninitialized] kfree(kbs); ^~~~~~ Signed-off-by: Li Wang --- drivers/tty/vt/keyboard.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index 7763862..62f1ecb 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -2090,6 +2090,8 @@ int vt_do_kdgkb_ioctl(int cmd, struct kbsentry __user *user_kdgkb, int perm) ret = 0; break; + default: + return -EINVAL; } kfree(kbs); -- 2.7.4
[PATCH] vt: keyboard, fix uninitialized variables warning
drivers/tty/vt/keyboard.c: In function 'vt_do_kdgkb_ioctl': drivers/tty/vt/keyboard.c: warning: 'ret' may be used uninitialized in this function [-Wmaybe-uninitialized] return ret; ^~~ kernel-source/drivers/tty/vt/keyboard.c: warning: 'kbs' may be used uninitialized in this function [-Wmaybe-uninitialized] kfree(kbs); ^~~~~~ Signed-off-by: Li Wang --- drivers/tty/vt/keyboard.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index 7763862..3e73d55 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -2049,8 +2049,8 @@ int vt_do_kdgkb_ioctl(int cmd, struct kbsentry __user *user_kdgkb, int perm) { unsigned char kb_func; unsigned long flags; - char *kbs; - int ret; + char *kbs = NULL; + int ret = -EINVAL; if (get_user(kb_func, &user_kdgkb->kb_func)) return -EFAULT; -- 2.7.4
[PATCH] vhost: reduce stack usage in log_used
Fix the warning: [-Werror=-Wframe-larger-than=] drivers/vhost/vhost.c: In function log_used: drivers/vhost/vhost.c:1906:1: warning: the frame size of 1040 bytes is larger than 1024 bytes Signed-off-by: Li Wang --- drivers/vhost/vhost.c | 2 +- drivers/vhost/vhost.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index b45519c..31837a5 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1884,7 +1884,7 @@ static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len) static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len) { - struct iovec iov[64]; + struct iovec *iov = vq->log_iov; int i, ret; if (!vq->iotlb) diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 9032d3c..5fe4b47 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -123,6 +123,7 @@ struct vhost_virtqueue { /* Log write descriptors */ void __user *log_base; struct vhost_log *log; + struct iovec log_iov[64]; /* Ring endianness. Defaults to legacy native endianness. * Set to true when starting a modern virtio device. */ -- 2.7.4
[PATCH] vhost: reduce stack usage in log_used
Fix the warning: [-Werror=-Wframe-larger-than=] drivers/vhost/vhost.c: In function log_used: drivers/vhost/vhost.c:1906:1: warning: the frame size of 1040 bytes is larger than 1024 bytes Signed-off-by: Li Wang --- drivers/vhost/vhost.c | 14 ++ 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index b45519c..41769de 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1884,25 +1884,31 @@ static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len) static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len) { - struct iovec iov[64]; + struct iovec *iov; int i, ret; if (!vq->iotlb) return log_write(vq->log_base, vq->log_addr + used_offset, len); + iov = kcalloc(64, sizeof(*iov), GFP_KERNEL); + if (!iov) + return -ENOMEM; + ret = translate_desc(vq, (uintptr_t)vq->used + used_offset, len, iov, 64, VHOST_ACCESS_WO); if (ret < 0) - return ret; + goto out; for (i = 0; i < ret; i++) { ret = log_write_hva(vq, (uintptr_t)iov[i].iov_base, iov[i].iov_len); if (ret) - return ret; + goto out; } - return 0; +out: + kfree(iov); + return ret; } int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, -- 2.7.4
[5.3.0-rc4 Bug] WARNING: CPU: 17 PID: 25085 at lib/list_debug.c:47 __list_del_entry_valid+0x4e/0x90
003fba9d2000 CR4: 000406e0 [ 119.912293] Call Trace: [ 119.924344] lpfc_sli4_queue_destroy+0x11a/0x390 [lpfc] [ 119.949270] lpfc_pci_remove_one+0x7d6/0x970 [lpfc] [ 119.976858] pci_device_shutdown+0x34/0x60 [ 119.996353] device_shutdown+0x160/0x1c0 [ 120.015045] kernel_restart+0xe/0x30 [ 120.033515] __do_sys_reboot+0x1cf/0x210 [ 120.054274] ? __fput+0x168/0x250 [ 120.070250] ? syscall_trace_enter+0x198/0x2c0 [ 120.091719] ? __audit_syscall_exit+0x249/0x2a0 [ 120.115046] do_syscall_64+0x59/0x1e0 [ 120.135104] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 120.158848] RIP: 0033:0x7f7f6f48c427 [ 120.175870] Code: 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 89 fa be 69 19 12 28 bf ad de e1 fe b8 a9 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 01 c3 48 8b 15 31 9a 2c 00 f7 d8 64 89 02 b8 [ 120.279588] RSP: 002b:7fffe3e36288 EFLAGS: 0246 ORIG_RAX: 00a9 [ 120.321551] RAX: ffda RBX: RCX: 7f7f6f48c427 [ 120.361854] RDX: 01234567 RSI: 28121969 RDI: fee1dead [ 120.398536] RBP: 7fffe3e362d0 R08: 0002 R09: [ 120.436702] R10: 004b R11: 0246 R12: 0001 [ 120.470997] R13: fffe R14: 0006 R15: [ 120.508557] Modules linked in: sunrpc amd64_edac_mod edac_mce_amd kvm_amd ccp kvm irqbypass ipmi_ssif crct10dif_pclmul crc32_pclmul sp5100_tco ipmi_si joydev ghash_clmulni_intel pcspkr i2c_piix4 hpwdt sg fam15h_power ipmi_devintf k10temp hpilo ipmi_msghandler acpi_power_meter xfs libcrc32c radeon i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops lpfc sd_mod ttm ahci nvmet_fc nvmet libahci ata_generic drm nvme_fc libata nvme_fabrics netxen_nic hpsa nvme_core crc32c_intel scsi_transport_fc serio_raw scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod -- Regards, Li Wang
Re: ltp/read_all_sys (read_all -d /sys -q -r 10) cause system panic with kernel-4.18.0-rc1
0b8 On Tue, Jun 19, 2018 at 6:41 PM, Li Wang wrote: > Hi, > > I'm hitting this panic when running ltp/read_all_sys on kernel-v4.18-rc1. > > Test env: > FUJITSU PRIMERGY RX200 S6 GS01 > Intel(R) Xeon(R) CPU E5620 @ 2.40GHz > 16384 MB memory, 598 GB disk space > > > [ 5915.705844] BUG: unable to handle kernel NULL pointer dereference > at 00b8 > [ 5915.714587] PGD 80042bcf7067 P4D 80042bcf7067 PUD 423f4e067 PMD 0 > [ 5915.722254] Oops: [#1] SMP PTI > [ 5915.726147] CPU: 6 PID: 18535 Comm: read_all Tainted: P > IOE 4.18.0-rc1 #1 > [ 5915.734980] Hardware name: FUJITSU > PRIMERGY RX200 S6 /D3031, BIOS 6.00 Rev. 1.10.3031 > 01/20/2012 > [ 5915.749654] RIP: 0010:qla_dfs_tgt_counters_show+0x92/0x2a0 [qla2xxx] > [ 5915.756733] Code: b6 86 22 01 00 00 66 85 c0 74 63 83 e8 01 4c 8b > 9e b8 00 00 00 31 f6 0f b7 c0 48 8d 3c c5 08 00 00 00 49 8b 04 33 48 > 83 c6 08 <48> 03 90 b8 00 00 00 48 03 88 c0 00 00 00 4c 03 80 c8 00 00 > 00 4c > [ 5915.777816] RSP: 0018:af04109e3d60 EFLAGS: 00010202 > [ 5915.783645] RAX: RBX: RCX: > > [ 5915.791606] RDX: RSI: 0008 RDI: > 0040 > [ 5915.799568] RBP: R08: R09: > > [ 5915.807529] R10: 956823a74798 R11: 956824a29000 R12: > > [ 5915.815489] R13: R14: 9567badfc280 R15: > > [ 5915.823451] FS: 7f27336a1740() GS:95683fd8() > knlGS: > [ 5915.832479] CS: 0010 DS: ES: CR0: 80050033 > [ 5915.838890] CR2: 00b8 CR3: 00042960a003 CR4: > 000206e0 > [ 5915.846850] Call Trace: > [ 5915.849583] ? __kmalloc_node+0x195/0x280 > [ 5915.854056] ? 
seq_read+0x33e/0x3f0 > [ 5915.857946] seq_read+0x120/0x3f0 > [ 5915.861643] full_proxy_read+0x50/0x70 > [ 5915.865827] __vfs_read+0x36/0x190 > [ 5915.869622] vfs_read+0x87/0x130 > [ 5915.873223] ksys_read+0x52/0xc0 > [ 5915.876823] do_syscall_64+0x5b/0x180 > [ 5915.880910] entry_SYSCALL_64_after_hwframe+0x44/0xa9 > [ 5915.886547] RIP: 0033:0x7f2733280790 > [ 5915.890532] Code: 73 01 c3 48 8b 0d 18 88 20 00 f7 d8 64 89 01 48 > 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 cc 20 00 00 75 10 b8 00 00 00 > 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 1e fc ff ff 48 89 > 04 24 > [ 5915.911617] RSP: 002b:7ffef181c738 EFLAGS: 0246 ORIG_RAX: > > [ 5915.920064] RAX: ffda RBX: 0006 RCX: > 7f2733280790 > [ 5915.928025] RDX: 03ff RSI: 7ffef181cbf0 RDI: > 0006 > [ 5915.935986] RBP: 0b7b R08: R09: > 7ffef181c690 > [ 5915.943949] R10: R11: 0246 R12: > 7f2733688000 > [ 5915.951909] R13: 7ffef181cbf0 R14: 0028 R15: > 0030 > [ 5915.959871] Modules linked in: dummy veth binfmt_misc sctp overlay > tun fuse vfat fat btrfs xor zstd_decompress zstd_compress xxhash > raid6_pq ext4 mbcache jbd2 loop sunrpc intel_powerclamp coretemp > kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul > ghash_clmulni_intel pcbc ipmi_ssif aesni_intel crypto_simd iTCO_wdt > ipmi_si cryptd iTCO_vendor_support glue_helper gpio_ich ipmi_devintf > sg acpi_power_meter ipmi_msghandler i2c_i801 pcspkr lpc_ich > i7core_edac acpi_cpufreq ip_tables xfs libcrc32c sd_mod sr_mod cdrom > mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops > ttm ata_generic pata_acpi qla2xxx drm igb ata_piix nvme_fc mptsas > libata nvme_fabrics scsi_transport_sas dca crc32c_intel mptscsih > i2c_algo_bit nvme_core i2c_core mptbase scsi_transport_fc dm_mirror > dm_region_hash dm_log > [ 5916.038566] dm_mod [last unloaded: ltp_insmod01] > [ 5916.043814] CR2: 00b8 > [ 5916.047513] BUG: unable to handle kernel NULL pointer dereference > at 00b8 > [ 5916.047537] ---[ end trace 1dddacfb06305174 ]--- > > > -- > Regards, > Li 
Wang -- Regards, Li Wang
Re: [PATCH v2] zswap: re-check zswap_is_full after do zswap_shrink
On 30 May 2018 at 20:53, Dan Streetman wrote: > On Wed, May 30, 2018 at 6:39 AM, Li Wang wrote: >> The '/sys/../zswap/stored_pages:' keep raising in zswap test with >> "zswap.max_pool_percent=0" parameter. But theoretically, it should >> not compress or store pages any more since there is no space in >> compressed pool. >> >> Reproduce steps: >> 1. Boot kernel with "zswap.enabled=1" >> 2. Set the max_pool_percent to 0 >> # echo 0 > /sys/module/zswap/parameters/max_pool_percent >> 3. Do memory stress test to see if some pages have been compressed >> # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s >> 4. Watching the 'stored_pages' number increasing or not >> >> The root cause is: >> When zswap_max_pool_percent is setting to 0 via kernel parameter, the >> zswap_is_full() will always return true to do zswap_shrink(). But if >> the shinking is able to reclain a page successful, then proceeds to >> compress/store another page, so the value of stored_pages will keep >> changing. >> >> To solve the issue, this patch adds zswap_is_full() check again after >> zswap_shrink() to make sure it's now under the max_pool_percent, and >> not to compress/store if reach its limitaion. >> >> Signed-off-by: Li Wang > > Acked-by: Dan Streetman ping~ Any possible to merge this in kernel-4.18-rcX? My zswap test always fails on the upstream kernel. -- Regards, Li Wang Email: wangli.a...@gmail.com
ltp/read_all_sys (read_all -d /sys -q -r 10) cause system panic with kernel-4.18.0-rc1
Hi, I'm hitting this panic when running ltp/read_all_sys on kernel-v4.18-rc1. Test env: FUJITSU PRIMERGY RX200 S6 GS01 Intel(R) Xeon(R) CPU E5620 @ 2.40GHz 16384 MB memory, 598 GB disk space [ 5915.705844] BUG: unable to handle kernel NULL pointer dereference at 00b8 [ 5915.714587] PGD 80042bcf7067 P4D 80042bcf7067 PUD 423f4e067 PMD 0 [ 5915.722254] Oops: [#1] SMP PTI [ 5915.726147] CPU: 6 PID: 18535 Comm: read_all Tainted: P IOE 4.18.0-rc1 #1 [ 5915.734980] Hardware name: FUJITSU PRIMERGY RX200 S6 /D3031, BIOS 6.00 Rev. 1.10.3031 01/20/2012 [ 5915.749654] RIP: 0010:qla_dfs_tgt_counters_show+0x92/0x2a0 [qla2xxx] [ 5915.756733] Code: b6 86 22 01 00 00 66 85 c0 74 63 83 e8 01 4c 8b 9e b8 00 00 00 31 f6 0f b7 c0 48 8d 3c c5 08 00 00 00 49 8b 04 33 48 83 c6 08 <48> 03 90 b8 00 00 00 48 03 88 c0 00 00 00 4c 03 80 c8 00 00 00 4c [ 5915.777816] RSP: 0018:af04109e3d60 EFLAGS: 00010202 [ 5915.783645] RAX: RBX: RCX: [ 5915.791606] RDX: RSI: 0008 RDI: 0040 [ 5915.799568] RBP: R08: R09: [ 5915.807529] R10: 956823a74798 R11: 956824a29000 R12: [ 5915.815489] R13: R14: 9567badfc280 R15: [ 5915.823451] FS: 7f27336a1740() GS:95683fd8() knlGS: [ 5915.832479] CS: 0010 DS: ES: CR0: 80050033 [ 5915.838890] CR2: 00b8 CR3: 00042960a003 CR4: 000206e0 [ 5915.846850] Call Trace: [ 5915.849583] ? __kmalloc_node+0x195/0x280 [ 5915.854056] ? 
seq_read+0x33e/0x3f0 [ 5915.857946] seq_read+0x120/0x3f0 [ 5915.861643] full_proxy_read+0x50/0x70 [ 5915.865827] __vfs_read+0x36/0x190 [ 5915.869622] vfs_read+0x87/0x130 [ 5915.873223] ksys_read+0x52/0xc0 [ 5915.876823] do_syscall_64+0x5b/0x180 [ 5915.880910] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 5915.886547] RIP: 0033:0x7f2733280790 [ 5915.890532] Code: 73 01 c3 48 8b 0d 18 88 20 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 cc 20 00 00 75 10 b8 00 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 1e fc ff ff 48 89 04 24 [ 5915.911617] RSP: 002b:7ffef181c738 EFLAGS: 0246 ORIG_RAX: [ 5915.920064] RAX: ffda RBX: 0006 RCX: 7f2733280790 [ 5915.928025] RDX: 03ff RSI: 7ffef181cbf0 RDI: 0006 [ 5915.935986] RBP: 0b7b R08: R09: 7ffef181c690 [ 5915.943949] R10: R11: 0246 R12: 7f2733688000 [ 5915.951909] R13: 7ffef181cbf0 R14: 0028 R15: 0030 [ 5915.959871] Modules linked in: dummy veth binfmt_misc sctp overlay tun fuse vfat fat btrfs xor zstd_decompress zstd_compress xxhash raid6_pq ext4 mbcache jbd2 loop sunrpc intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc ipmi_ssif aesni_intel crypto_simd iTCO_wdt ipmi_si cryptd iTCO_vendor_support glue_helper gpio_ich ipmi_devintf sg acpi_power_meter ipmi_msghandler i2c_i801 pcspkr lpc_ich i7core_edac acpi_cpufreq ip_tables xfs libcrc32c sd_mod sr_mod cdrom mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm ata_generic pata_acpi qla2xxx drm igb ata_piix nvme_fc mptsas libata nvme_fabrics scsi_transport_sas dca crc32c_intel mptscsih i2c_algo_bit nvme_core i2c_core mptbase scsi_transport_fc dm_mirror dm_region_hash dm_log [ 5916.038566] dm_mod [last unloaded: ltp_insmod01] [ 5916.043814] CR2: 00b8 [ 5916.047513] BUG: unable to handle kernel NULL pointer dereference at 00b8 [ 5916.047537] ---[ end trace 1dddacfb06305174 ]--- -- Regards, Li Wang
[PATCH v2] zswap: re-check zswap_is_full after do zswap_shrink
The '/sys/../zswap/stored_pages:' keeps rising in zswap test with "zswap.max_pool_percent=0" parameter. But theoretically, it should not compress or store pages any more since there is no space in compressed pool. Reproduce steps: 1. Boot kernel with "zswap.enabled=1" 2. Set the max_pool_percent to 0 # echo 0 > /sys/module/zswap/parameters/max_pool_percent 3. Do memory stress test to see if some pages have been compressed # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s 4. Watching the 'stored_pages' number increasing or not The root cause is: When zswap_max_pool_percent is set to 0 via kernel parameter, the zswap_is_full() will always return true to do zswap_shrink(). But if the shrinking is able to reclaim a page successfully, then it proceeds to compress/store another page, so the value of stored_pages will keep changing. To solve the issue, this patch adds zswap_is_full() check again after zswap_shrink() to make sure it's now under the max_pool_percent, and not to compress/store if it reaches its limitation. Signed-off-by: Li Wang Cc: Seth Jennings Cc: Dan Streetman Cc: Huang Ying Cc: Yu Zhao --- mm/zswap.c | 9 + 1 file changed, 9 insertions(+) diff --git a/mm/zswap.c b/mm/zswap.c index 61a5c41..fd320c3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1026,6 +1026,15 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, ret = -ENOMEM; goto reject; } + + /* A second zswap_is_full() check after +* zswap_shrink() to make sure it's now +* under the max_pool_percent +*/ + if (zswap_is_full()) { + ret = -ENOMEM; + goto reject; + } } /* allocate entry */ -- 2.9.5
[PATCH RFC] zswap: reject to compress/store page if zswap_max_pool_percent is 0
The '/sys/../zswap/stored_pages:' keeps rising in zswap test with "zswap.max_pool_percent=0" parameter. But theoretically, it should not compress or store pages any more since there is no space for compressed pool. Reproduce steps: 1. Boot kernel with "zswap.enabled=1 zswap.max_pool_percent=17" 2. Set the max_pool_percent to 0 # echo 0 > /sys/module/zswap/parameters/max_pool_percent Confirm this parameter works fine # cat /sys/kernel/debug/zswap/pool_total_size 0 3. Do memory stress test to see if some pages have been compressed # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s Watching the 'stored_pages' numbers increasing or not The root cause is: When the zswap_max_pool_percent is set to 0 via kernel parameter, the zswap_is_full() will always return true to shrink the pool size by zswap_shrink(). If the pool size has been shrunk a little successfully, zswap will do compress/store pages again. Then we get fails on that as above. Signed-off-by: Li Wang Cc: Seth Jennings Cc: Dan Streetman Cc: Huang Ying Cc: Yu Zhao --- mm/zswap.c | 5 + 1 file changed, 5 insertions(+) diff --git a/mm/zswap.c b/mm/zswap.c index 61a5c41..2b537bb 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1007,6 +1007,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, u8 *src, *dst; struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; + if (!zswap_max_pool_percent) { + ret = -ENOMEM; + goto reject; + } + /* THP isn't supported */ if (PageTransHuge(page)) { ret = -EINVAL; -- 2.9.5
linux/drivers/cpuidle: cpuidle_enter_state() issue
Hi Kernel-developers, The flowing call trace was catch from kernel-v4.15, could anyone help to analysis the cpuidle problem? or, if you need any more detail info pls let me know. Test Env: IBM KVM Guest on ibm-p8-kvm-03 POWER8E (raw), altivec supported 9216 MB memory, 107 GB disk space 8< [15002.722413] swapper/15: page allocation failure: order:0, mode:0x1080020(GFP_ATOMIC), nodemask=(null) [15002.853793] swapper/15 cpuset=/ mems_allowed=0 [15002.853932] CPU: 15 PID: 0 Comm: swapper/15 Not tainted 4.15.0 #1 [15002.854019] Call Trace: [15002.854129] [c0023ff77650] [c0940b50] .dump_stack+0xac/0xfc (unreliable) [15002.854285] [c0023ff776e0] [c026c678] .warn_alloc+0xe8/0x180 [15002.854376] [c0023ff777a0] [c026d50c] .__alloc_pages_nodemask+0xd6c/0xf90 [15002.854490] [c0023ff77980] [c02e9cc0] .alloc_pages_current+0x90/0x120 [15002.854624] [c0023ff77a10] [c07990cc] .skb_page_frag_refill+0x8c/0x120 [15002.854746] [c0023ff77aa0] [d3a561a8] .try_fill_recv+0x368/0x620 [virtio_net] [15003.422855] [c0023ff77ba0] [d3a568ec] .virtnet_poll+0x25c/0x380 [virtio_net] [15003.423864] [c0023ff77c70] [c07c18d0] .net_rx_action+0x330/0x4a0 [15003.424024] [c0023ff77d90] [c0960d50] .__do_softirq+0x150/0x3a8 [15003.424197] [c0023ff77e90] [c00ff608] .irq_exit+0x198/0x1b0 [15003.424342] [c0023ff77f10] [c0015504] .__do_irq+0x94/0x1f0 [15003.424485] [c0023ff77f90] [c0026d5c] .call_do_irq+0x14/0x24 [15003.424627] [c0023bc63820] [c00156ec] .do_IRQ+0x8c/0x100 [15003.424776] [c0023bc638c0] [c0008b34] hardware_interrupt_common+0x114/0x120 [15003.424963] --- interrupt: 501 at .snooze_loop+0xa4/0x1c0 LR = .snooze_loop+0x60/0x1c0 [15003.425164] [c0023bc63bb0] [c0023bc63c50] 0xc0023bc63c50 (unreliable) [15003.425346] [c0023bc63c30] [c075104c] .cpuidle_enter_state+0xac/0x390 [15003.425534] [c0023bc63ce0] [c0157adc] .call_cpuidle+0x3c/0x70 [15003.425669] [c0023bc63d50] [c0157e90] .do_idle+0x2a0/0x300 [15003.425815] [c0023bc63e20] [c01580ac] .cpu_startup_entry+0x2c/0x40 [15003.425995] [c0023bc63ea0] 
[c0045790] .start_secondary+0x4d0/0x520 [15003.426170] [c0023bc63f90] [c000aa70] start_secondary_prolog+0x10/0x14 -8<--- Any response will be appreciated! -- Regards, Li Wang Email: wangli.a...@gmail.com
Re: [PATCH] s390/mm: return -ENOMEM in arch_get_unmapped_area[_topdown]
On Thu, Oct 26, 2017 at 6:16 PM, Martin Schwidefsky wrote: > On Thu, 26 Oct 2017 17:47:39 +0800 > Li Wang wrote: > >> On Thu, Oct 26, 2017 at 5:26 PM, Martin Schwidefsky >> wrote: >> > On Thu, 26 Oct 2017 15:36:10 +0800 >> > Li Wang wrote: >> > >> > The code in mmap.c checks for the per-task limit, 31-bit vs 64-bit. >> > pgalloc.c checks for the maximum allowed address and does not care >> > about the task. >> > >> >> Fixes: 8ab867cb0806 (s390/mm: fix BUG_ON in crst_table_upgrade) >> >> Signed-off-by: Li Wang >> > >> > I don't think this patch fixes anything. >> >> At least there is a logic error i think, after apply the patch >> "s390/mm: fix BUG_ON in crst_table_upgrade", >> it makes no sense to compare "if (end >= TASK_SIZE_MAX) return >> -ENOMEM" in crst_table_upgrade() function. >> >> isn't it? > > Be careful with TASK_SIZE vs. TASK_SIZE_MAX. They return different > values for 31-bit compat tasks. what do you think this reproducer now failed(mmap into high region succeeded) on the latest kernel? should we enlarge the HIGH_ADDR to -PAGE_SIZE? #include #include #include #include #include #include #include #define HIGH_ADDR (void *)(1L << 53) int main(void) { void *addr; long map_sz = getpagesize(); int fd = open("testfile", O_RDWR | O_CREAT, 0666); /* Attempt to mmap into highmem addr, should get ENOMEM */ addr = mmap(HIGH_ADDR, map_sz, PROT_READ, MAP_SHARED | MAP_FIXED, fd, 0); if (addr != MAP_FAILED) { printf("FAIL: mmap into high region succeeded unexpectedly\n"); munmap(addr, map_sz); close(fd); } if (errno != ENOMEM) { printf("FAIL: mmap into high region failed unexpectedly - expect errno=ENOMEM, got\n"); } else { printf("PASS: mmap into high region failed as expected\n"); } return 0; } > > If the addr parameter is correctly aligned then the if condition in > crst_table_upgrade is superfluous as TASK_SIZE_MAX is now -PAGE_SIZE > with the introduction of 5 level page tables. 
It is important for older > kernels with only 4 level page tables with a TASK_SIZE_MAX of 2**53. > > On the other hand if addr is ever a value between -PAGE_SIZE and -1 > we would end up with an endless loop. That makes the if condition a > safe-guard and I would like to keep it. > > -- > blue skies, >Martin. > > "Reality continues to ruin my life." - Calvin. > -- Li Wang liw...@redhat.com
Re: [PATCH] s390/mm: return -ENOMEM in arch_get_unmapped_area[_topdown]
On Thu, Oct 26, 2017 at 5:26 PM, Martin Schwidefsky wrote: > On Thu, 26 Oct 2017 15:36:10 +0800 > Li Wang wrote: > >> That would be very hard to get -ENOMEM returned in crst_table_upgrade() >> because the condition(addr + len <= TASK_SIZE) makes all 'end' value >> is smaller/equal than 'TASK_SIZE_TASK'. So let's move it to the upper >> layer. > > I have a hard time understanding what scenario you describe. There is no > 'TASK_SIZE_TASK', only TASK_SIZE, TASK_SIZE_OF and TASK_SIZE_MAX. Sorry for the typo, I was thinking about to write TASK_SIZE_MAX. > > The code in mmap.c checks for the per-task limit, 31-bit vs 64-bit. > pgalloc.c checks for the maximum allowed address and does not care > about the task. > >> Fixes: 8ab867cb0806 (s390/mm: fix BUG_ON in crst_table_upgrade) >> Signed-off-by: Li Wang > > I don't think this patch fixes anything. At least there is a logic error i think, after apply the patch "s390/mm: fix BUG_ON in crst_table_upgrade", it makes no sense to compare "if (end >= TASK_SIZE_MAX) return -ENOMEM" in crst_table_upgrade() function. isn't it? Thanks for reviewing quick. -- Li Wang liw...@redhat.com
[PATCH] s390/mm: return -ENOMEM in arch_get_unmapped_area[_topdown]
That would be very hard to get -ENOMEM returned in crst_table_upgrade() because the condition(addr + len <= TASK_SIZE) makes all 'end' value is smaller/equal than 'TASK_SIZE_TASK'. So let's move it to the upper layer. Fixes: 8ab867cb0806 (s390/mm: fix BUG_ON in crst_table_upgrade) Signed-off-by: Li Wang --- arch/s390/mm/mmap.c| 6 ++ arch/s390/mm/pgalloc.c | 3 +-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 5bea139..8ddb13a 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -119,6 +119,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, return addr; check_asce_limit: + if (addr + len >= TASK_SIZE_MAX) + return -ENOMEM; + if (addr + len > current->mm->context.asce_limit && addr + len <= TASK_SIZE) { rc = crst_table_upgrade(mm, addr + len); @@ -184,6 +187,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, } check_asce_limit: + if (addr + len >= TASK_SIZE_MAX) + return -ENOMEM; + if (addr + len > current->mm->context.asce_limit && addr + len <= TASK_SIZE) { rc = crst_table_upgrade(mm, addr + len); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 05f1f27..5e4b887 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -84,8 +84,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE); - if (end >= TASK_SIZE_MAX) - return -ENOMEM; + rc = 0; notify = 0; while (mm->context.asce_limit < end) { -- 2.9.3
[BUG] Unable to handle kernel paging request for unaligned access at address 0xc0000001c52c53df
Hi, ltp/access04 always panic the latest mainstream kernel-4.12-rc4 on ppc64le. From the calltrace I guess the reason is probably that the tests mount ext2 file system using ext4 driver. A simple way to reproduce: # dd of=wangli if=/dev/zero count=1024 bs=1024 # mkfs -t ext2 wangli # mount -t ext4 wangli /mnt/ Are there any new changes in ext4 (on kernel-4.12-rc4) recently? [ 318.557844] EXT4-fs (loop0): mounting ext2 file system using the ext4 subsystem [ 318.558104] Unable to handle kernel paging request for unaligned access at address 0xc001c52c53df [ 318.558109] Faulting instruction address: 0xc0918b28 [ 318.558114] Oops: Kernel access of bad area, sig: 7 [#1] [ 318.558117] SMP NR_CPUS=2048 [ 318.558117] NUMA [ 318.558120] pSeries [ 318.558124] Modules linked in: ext4 jbd2 mbcache loop rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache sg pseries_rng nfsd auth_rpcgss nfs_acl lockd ghash_generic gf128mul xts vmx_crypto grace sunrpc ip_tables xfs libcrc32c sd_mod ibmvscsi ibmveth scsi_transport_srp dm_mirror dm_region_hash dm_log dm_mod [ 318.558152] CPU: 2 PID: 40748 Comm: access04 Not tainted 4.12.0-rc4 #1 [ 318.558155] task: c003889fb200 task.stack: c003ac134000 [ 318.558158] NIP: c0918b28 LR: c011c5d4 CTR: c0130900 [ 318.558162] REGS: c003ac137420 TRAP: 0600 Not tainted (4.12.0-rc4) [ 318.558164] MSR: 80010280b033 [ 318.558171] CR: 28028842 XER: [ 318.558174] CFAR: c011c5d0 DAR: c001c52c53df DSISR: SOFTE: 0 [ 318.558174] GPR00: c011c5d4 c003ac1376a0 c1049000 c001c52c53df [ 318.558174] GPR04: c004788657f0 0001 [ 318.558174] GPR08: 000477be 8002 [ 318.558174] GPR12: c0130900 cfac1500 c004648b6800 [ 318.558174] GPR16: c00408ad0400 00040001 [ 318.558174] GPR20: 0001 4000 c0cc5780 [ 318.558174] GPR24: 0001c45ffc5f c0cc5780 c001c52c53df [ 318.558174] GPR28: c9d06034 0004 0800 c001c52c53df [ 318.558222] NIP [c0918b28] _raw_spin_lock+0x28/0xc0 [ 318.558226] LR [c011c5d4] try_to_wake_up+0x1f4/0x5b0 [ 318.558229] Call Trace: [ 318.558231] [c003ac1376a0] [c9d06034] 
0xc9d06034 (unreliable) [ 318.558236] [c003ac1376d0] [c011c5d4] try_to_wake_up+0x1f4/0x5b0 [ 318.558241] [c003ac137750] [c0102828] create_worker+0x148/0x250 [ 318.558245] [c003ac1377f0] [c01059dc] alloc_unbound_pwq+0x3bc/0x4c0 [ 318.558249] [c003ac137850] [c010601c] apply_wqattrs_prepare+0x2ac/0x320 [ 318.558253] [c003ac1378c0] [c01060cc] apply_workqueue_attrs_locked+0x3c/0xa0 [ 318.558257] [c003ac1378f0] [c010662c] apply_workqueue_attrs+0x4c/0x80 [ 318.558261] [c003ac137930] [c01081cc] __alloc_workqueue_key+0x16c/0x4e0 [ 318.558280] [c003ac1379f0] [d8455ca0] ext4_fill_super+0x1c70/0x3390 [ext4] [ 318.558286] [c003ac137b30] [c0316bdc] mount_bdev+0x21c/0x250 [ 318.558298] [c003ac137bd0] [d844db20] ext4_mount+0x20/0x40 [ext4] [ 318.558303] [c003ac137bf0] [c0318184] mount_fs+0x74/0x210 [ 318.558307] [c003ac137ca0] [c033fd18] vfs_kern_mount+0x68/0x1d0 [ 318.558310] [c003ac137d10] [c0344a28] do_mount+0x278/0xef0 [ 318.558314] [c003ac137de0] [c0345ac4] SyS_mount+0x94/0x100 [ 318.558319] [c003ac137e30] [c000af84] system_call+0x38/0xe0 [ 318.558322] Instruction dump: [ 318.558324] 990d02bc 4bc8 3c4c0073 38420500 7c0802a6 fbe1fff8 7c7f1b78 f8010010 [ 318.558329] f821ffd1 3940 994d02bc 814d0008 <7d201829> 2c09 40c20010 7d40192d [ 318.558336] ---[ end trace a2b72248c6bfebea ]--- More info of test environment -- # uname -rm 4.12.0-rc4 ppc64le # lscpu Architecture: ppc64le Byte Order:Little Endian CPU(s):16 On-line CPU(s) list: 0-15 Thread(s) per core:8 Core(s) per socket:1 Socket(s): 2 NUMA node(s): 2 Model: 2.1 (pvr 004b 0201) Model name:POWER8 (architected), altivec supported Hypervisor vendor: pHyp Virtualization type: para L1d cache: 64K L1i cache: 32K NUMA node0 CPU(s): 0-15 NUMA node1 CPU(s): -- Li Wang liw...@redhat.com
Re: [PATCH v2] vfs: fix put_compat_statfs64() does not handle errors
sorry, ping for comments~ On 15 November 2016 at 17:19, Li Wang wrote: > put_compat_statfs64() does NOT return -1 and setting errno to EOVERFLOW > when some variables(like: f_bsize) overflowed in the returned struct. > > The reason is that the ubuf->f_blocks is __u64 type, it couldn't be > 4bits as the judgement in put_comat_statfs64(). Here correct the > __u32 variables(in struct compat_statfs64) for comparison. > > reproducer: > step1. mount hugetlbfs with two different pagesize on ppc64 arch. > > $ hugeadm --pool-pages-max 16M:0 > $ hugeadm --create-mount > $ mount | grep -i hugetlbfs > none on /var/lib/hugetlbfs/pagesize-16MB type hugetlbfs > (rw,relatime,seclabel,pagesize=16777216) > none on /var/lib/hugetlbfs/pagesize-16GB type hugetlbfs > (rw,relatime,seclabel,pagesize=17179869184) > > step2. compile & run this C program. > > $ cat statfs64_test.c > > #define _LARGEFILE64_SOURCE > #include > #include > #include > > int main() > { > struct statfs64 sb; > int err; > > err = syscall(SYS_statfs64, "/var/lib/hugetlbfs/pagesize-16GB", > sizeof(sb), &sb); > if (err) > return -1; > > printf("sizeof f_bsize = %d, f_bsize=%ld\n", sizeof(sb.f_bsize), > sb.f_bsize); > > return 0; > } > > $ gcc -m32 statfs64_test.c > $ ./a.out > sizeof f_bsize = 4, f_bsize=0 > > Signed-off-by: Li Wang > --- > fs/compat.c | 6 +++--- > 1 file changed, 3 insertions(+), 3 deletions(-) > > diff --git a/fs/compat.c b/fs/compat.c > index bd064a2..543b48c 100644 > --- a/fs/compat.c > +++ b/fs/compat.c > @@ -253,9 +253,9 @@ static int put_compat_statfs(struct compat_statfs __user > *ubuf, struct kstatfs * > > static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct > kstatfs *kbuf) > { > - if (sizeof ubuf->f_blocks == 4) { > - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | > -kbuf->f_bsize | kbuf->f_frsize) & 0xULL) > + if (sizeof(ubuf->f_bsize) == 4) { > + if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | > + kbuf->f_frsize | kbuf->f_flags) & 0xULL) > return 
-EOVERFLOW; > /* f_files and f_ffree may be -1; it's okay > * to stuff that into 32 bits */ > -- > 1.8.3.1 > -- Regards, Li Wang Email: wangli.a...@gmail.com
[PATCH v2] vfs: fix put_compat_statfs64() does not handle errors
put_compat_statfs64() does NOT return -1 and set errno to EOVERFLOW when some variables(like: f_bsize) overflowed in the returned struct. The reason is that the ubuf->f_blocks is of __u64 type, so it couldn't be 4 bytes as the judgement in put_compat_statfs64(). Here correct the __u32 variables(in struct compat_statfs64) for comparison. reproducer: step1. mount hugetlbfs with two different pagesize on ppc64 arch. $ hugeadm --pool-pages-max 16M:0 $ hugeadm --create-mount $ mount | grep -i hugetlbfs none on /var/lib/hugetlbfs/pagesize-16MB type hugetlbfs (rw,relatime,seclabel,pagesize=16777216) none on /var/lib/hugetlbfs/pagesize-16GB type hugetlbfs (rw,relatime,seclabel,pagesize=17179869184) step2. compile & run this C program. $ cat statfs64_test.c #define _LARGEFILE64_SOURCE #include #include #include int main() { struct statfs64 sb; int err; err = syscall(SYS_statfs64, "/var/lib/hugetlbfs/pagesize-16GB", sizeof(sb), &sb); if (err) return -1; printf("sizeof f_bsize = %d, f_bsize=%ld\n", sizeof(sb.f_bsize), sb.f_bsize); return 0; } $ gcc -m32 statfs64_test.c $ ./a.out sizeof f_bsize = 4, f_bsize=0 Signed-off-by: Li Wang --- fs/compat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/compat.c b/fs/compat.c index bd064a2..543b48c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -253,9 +253,9 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { - if (sizeof ubuf->f_blocks == 4) { - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | -kbuf->f_bsize | kbuf->f_frsize) & 0xULL) + if (sizeof(ubuf->f_bsize) == 4) { + if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | +kbuf->f_frsize | kbuf->f_flags) & 0xULL) return -EOVERFLOW; /* f_files and f_ffree may be -1; it's okay * to stuff that into 32 bits */ -- 1.8.3.1
Re: [PATCH] vfs: fix statfs64() does not handle errors
On Mon, Nov 07, 2016 at 11:03:11AM -0700, Andreas Dilger wrote: > On Nov 7, 2016, at 3:21 AM, Li Wang wrote: > > > > statfs64() does NOT return -1 and setting errno to EOVERFLOW when some > > variables(like: f_bsize) overflowed in the returned struct. > > > > reproducer: > > step1. mount hugetlbfs with two different pagesize on ppc64 arch. > > > > $ hugeadm --pool-pages-max 16M:0 > > $ hugeadm --create-mount > > $ mount | grep -i hugetlbfs > > none on /var/lib/hugetlbfs/pagesize-16MB type hugetlbfs > > (rw,relatime,seclabel,pagesize=16777216) > > none on /var/lib/hugetlbfs/pagesize-16GB type hugetlbfs > > (rw,relatime,seclabel,pagesize=17179869184) > > > > step2. compile & run this C program. > > > > $ cat statfs64_test.c > > > > #define _LARGEFILE64_SOURCE > > #include > > #include > > > > int main() > > { > > struct statfs64 sb; > > int err; > > > > err = statfs64("/var/lib/hugetlbfs/pagesize-16GB", &sb); > > if (err) > > return -1; > > > > printf("sizeof f_bsize = %d, f_bsize=%ld\n", sizeof(sb.f_bsize), > > sb.f_bsize); > > > > return 0; > > } > > > > $ gcc -m32 statfs64_test.c > > $ ./a.out > > sizeof f_bsize = 4, f_bsize=0 > > > > Signed-off-by: Li Wang > > --- > > > > Notes: > >This is my first patch to kernel fs part, I'm not sure if > >this one useful, but just want someone have a look. > > > >thanks~ > > > > fs/statfs.c | 17 + > > 1 file changed, 17 insertions(+) > > > > diff --git a/fs/statfs.c b/fs/statfs.c > > index 083dc0a..849dde95 100644 > > --- a/fs/statfs.c > > +++ b/fs/statfs.c > > @@ -151,6 +151,23 @@ static int do_statfs64(struct kstatfs *st, struct > > statfs64 __user *p) > > if (sizeof(buf) == sizeof(*st)) > > memcpy(&buf, st, sizeof(*st)); > > else { > > + if (sizeof buf.f_bsize == 4) { > > Linux CodingStyle says this should be used like sizeof(buf.f_bsize). agree. > > > + if ((st->f_blocks | st->f_bfree | st->f_bavail | > > +st->f_bsize | st->f_frsize) & > > + 0xULL) > > + return -EOVERFLOW; > > I'm not sure I agree with this check. 
Sure, if sizeof(buf.f_bsize) == 4 > then the large st->f_bsize will overflow this field, and that is valid. After thinking over, I feel that my fix in this patch is not right. The reproducer.c running on ppc64 arch was build in 32bit, but it does not call SYS_statfs64 in kernel. It calls compat_sys_statfs64 indeed. # cat reproducer.c #define _LARGEFILE64_SOURCE #include #include #include int main() { struct statfs64 sb; int err; err = syscall(SYS_statfs64, "/var/lib/hugetlbfs/pagesize-16GB", sizeof(sb), &sb); if (err) return -1; printf("sizeof f_bsize = %d, f_bsize=%ld\n", sizeof(sb.f_bsize), sb.f_bsize); return 0; } # gcc reproducer.c -m32 # stap -e 'probe kernel.function("compat_sys_statfs64") {printf ("%s", $$parms);}' -vvv & # ./a.out sizeof f_bsize = 4, f_bsize=0 # pathname=0x16c4 sz=0x58 buf=0xff8a20b0 Guess the fix should be like: diff --git a/fs/compat.c b/fs/compat.c index bd064a2..3d923fd 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -253,7 +253,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { - if (sizeof ubuf->f_blocks == 4) { + if (sizeof ubuf->f_bsize == 4) { if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | kbuf->f_bsize | kbuf->f_frsize) & 0xULL) return -EOVERFLOW; I will test it and send a new patch. Regards, Li Wang > > However, that doesn't mean that large values for f_blocks, f_bfree, f_bavail > should return an error. I assume you are concerned that the product of the > large f_bsize and one of those values would overflow a 64-bit bytes value, > but that is for userspace to worry about, since the values in the individual > fields themselves are OK. > > We're already over 100PiB Lustre filesystems (2^57 bytes) today, and I > don't want statfs() failing prematurely because userspace feels the need > to multiply out the blocks and blocksize into bytes, instead of shifting > th
[PATCH] vfs: fix statfs64() not handling errors
statfs64() does NOT return -1 and setting errno to EOVERFLOW when some variables(like: f_bsize) overflowed in the returned struct. reproducer: step1. mount hugetlbfs with two different pagesize on ppc64 arch. $ hugeadm --pool-pages-max 16M:0 $ hugeadm --create-mount $ mount | grep -i hugetlbfs none on /var/lib/hugetlbfs/pagesize-16MB type hugetlbfs (rw,relatime,seclabel,pagesize=16777216) none on /var/lib/hugetlbfs/pagesize-16GB type hugetlbfs (rw,relatime,seclabel,pagesize=17179869184) step2. compile & run this C program. $ cat statfs64_test.c #define _LARGEFILE64_SOURCE #include #include int main() { struct statfs64 sb; int err; err = statfs64("/var/lib/hugetlbfs/pagesize-16GB", &sb); if (err) return -1; printf("sizeof f_bsize = %d, f_bsize=%ld\n", sizeof(sb.f_bsize), sb.f_bsize); return 0; } $ gcc -m32 statfs64_test.c $ ./a.out sizeof f_bsize = 4, f_bsize=0 Signed-off-by: Li Wang --- Notes: This is my first patch to kernel fs part, I'm not sure if this one useful, but just want someone have a look. thanks~ fs/statfs.c | 17 + 1 file changed, 17 insertions(+) diff --git a/fs/statfs.c b/fs/statfs.c index 083dc0a..849dde95 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -151,6 +151,23 @@ static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p) if (sizeof(buf) == sizeof(*st)) memcpy(&buf, st, sizeof(*st)); else { + if (sizeof buf.f_bsize == 4) { + if ((st->f_blocks | st->f_bfree | st->f_bavail | +st->f_bsize | st->f_frsize) & + 0xULL) + return -EOVERFLOW; + /* +* f_files and f_ffree may be -1; it's okay to stuff +* that into 32 bits +*/ + if (st->f_files != -1 && + (st->f_files & 0xULL)) + return -EOVERFLOW; + if (st->f_ffree != -1 && + (st->f_ffree & 0xULL)) + return -EOVERFLOW; + } + buf.f_type = st->f_type; buf.f_bsize = st->f_bsize; buf.f_blocks = st->f_blocks; -- 1.8.3.1
mtd: put flash block erasing into the wait queue if there is any thread in the queue
Flash erasing may block the writing operation. Make the erasing operation sleep when another thread is in the wait queue. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] mtd: put flash block erasing into the wait queue if there is any thread in the queue
When erases many flash blocks, it maybe stop flash writing operation: = erase thread: for(;;) { do_erase_oneblock() { mutex_lock(&chip->mutex); chip->state = FL_ERASING; mutex_unlock(&chip->mutex); msleep(); <--- erase wait mutex_lock(&chip->mutex); chip->state = FL_READY; mutex_unlock(&chip->mutex); <--- finish one block erasing } } write thread: retry: mutex_lock(&cfi->chips[chipnum].mutex); if (cfi->chips[chipnum].state != FL_READY) { set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&cfi->chips[chipnum].wq, &wait); mutex_unlock(&cfi->chips[chipnum].mutex); schedule(); <--- write wait remove_wait_queue(&cfi->chips[chipnum].wq, &wait); goto retry; = Only when finishes one block erasing, writing operation just has chance to run. But, if writing operation is put into wait queue(write wait), the mutex_unlock (finish one block erasing) can not wake up writing operation. So, if many blocks need erase, writing operation has no chance to run. it will cause the following backtrace: = INFO: task sh:727 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. sh D 0fe76ad0 0 727 711 0x Call Trace: [df0cdc40] [0002] 0x2 (unreliable) [df0cdd00] [c0008974] __switch_to+0x64/0xd8 [df0cdd10] [c043f2e4] schedule+0x218/0x408 [df0cdd60] [c04401f4] __mutex_lock_slowpath+0xd0/0x174 [df0cdda0] [c044087c] mutex_lock+0x5c/0x60 [df0cddc0] [c00ff18c] do_truncate+0x60/0xa8 [df0cde10] [c010d1d0] do_last+0x5a0/0x6d0 [df0cde40] [c010f778] do_filp_open+0x1d4/0x5e8 [df0cdf20] [c00fe0d0] do_sys_open+0x64/0x19c [df0cdf40] [c0010d04] ret_from_syscall+0x0/0x4 --- Exception: c01 at 0xfe76ad0 LR = 0xffd3ae8 ... 
sh D 0fe77068 0 607 590 0x Call Trace: [dbca98e0] [c009ad4c] rcu_process_callbacks+0x38/0x4c (unreliable) [dbca99a0] [c0008974] __switch_to+0x64/0xd8 [dbca99b0] [c043f2e4] schedule+0x218/0x408 [dbca9a00] [c034bfa4] cfi_amdstd_write_words+0x364/0x480 [dbca9a80] [c034c9b4] cfi_amdstd_write_buffers+0x8f4/0xca8 [dbca9b10] [c03437ac] part_write+0xb0/0xe4 [dbca9b20] [c02051f8] jffs2_flash_direct_writev+0xdc/0x140 [dbca9b70] [c02079ac] jffs2_flash_writev+0x38c/0x4fc [dbca9bc0] [c01fc6ac] jffs2_write_dnode+0x140/0x5bc [dbca9c40] [c01fd0dc] jffs2_write_inode_range+0x288/0x514 [dbca9cd0] [c01f5ed4] jffs2_write_end+0x190/0x37c [dbca9d10] [c00bf2f0] generic_file_buffered_write+0x100/0x26c [dbca9da0] [c00c1828] __generic_file_aio_write+0x2c0/0x4fc [dbca9e10] [c00c1ad4] generic_file_aio_write+0x70/0xf0 [dbca9e40] [c0100398] do_sync_write+0xac/0x120 [dbca9ee0] [c0101088] vfs_write+0xb4/0x184 [dbca9f00] [c01012cc] sys_write+0x50/0x10c [dbca9f40] [c0010d04] ret_from_syscall+0x0/0x4 --- Exception: c01 at 0xfe77068 LR = 0xffd3c8c ... flash_erase R running 0 869 32566 0x Call Trace: [dbc6dae0] [c0017ac0] kunmap_atomic+0x14/0x3c (unreliable) [dbc6dba0] [c0008974] __switch_to+0x64/0xd8 [dbc6dbb0] [c043f2e4] schedule+0x218/0x408 [dbc6dc00] [c043fbe4] schedule_timeout+0x170/0x2cc [dbc6dc50] [c00531f0] msleep+0x1c/0x34 [dbc6dc60] [c034d538] do_erase_oneblock+0x7d0/0x944 [dbc6dcd0] [c0349dfc] cfi_varsize_frob+0x1a8/0x2cc [dbc6dd20] [c034e4d4] cfi_amdstd_erase_varsize+0x30/0x60 [dbc6dd30] [c0343abc] part_erase+0x80/0x104 [dbc6dd40] [c0345c80] mtd_ioctl+0x3e0/0xc3c [dbc6de80] [c0111050] vfs_ioctl+0xcc/0xe4 [dbc6dea0] [c011122c] do_vfs_ioctl+0x80/0x770 [dbc6df10] [c01119b0] sys_ioctl+0x94/0x108 [dbc6df40] [c0010d04] ret_from_syscall+0x0/0x4 --- Exception: c01 at 0xff586a0 LR = 0xff58608 = So, if there is any thread in wait queue, puts erasing operation into queue. It makes writing operation have chance to run. 
Signed-off-by: Li Wang --- drivers/mtd/chips/cfi_cmdset_0002.c | 13 + 1 file changed, 13 insertions(+) diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index 5a4bfe3..53f5774 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -2400,6 +2400,19 @@ static int __xipram do_erase_oneblock(struct map_info *map, struct flchip *chip, chip->state = FL_READY; DISABLE_VPP(map); put_chip(map, chip, adr); + if (waitqueue_active(&chip->wq)) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&chip->wq, &wait); + mutex_unlock(&chip->mutex); + /* +* If the other thread in queue misses to wake up erasing in +* 3ms, erasing will wake up itself. The way makes erasing not +* to hang up by the error of the other thread in queue. +*/ + schedule_timeout(msecs_to_jiffies(3)); + remove_wait_queue(&chip->wq, &wait); + re
Re: [PATCH 2/3] Add shrink_pagecache_parent
Hi, On 01/03/2014 07:55 AM, Andrew Morton wrote: On Mon, 30 Dec 2013 21:45:17 +0800 Li Wang wrote: Analogous to shrink_dcache_parent except that it collects inodes. It is not very appropriate to be put in dcache.c, but d_walk can only be invoked from here. Please cc Dave Chinner on future revisions. He be da man. The overall intent of the patchset seems reasonable and I agree that it can't be efficiently done from userspace with the current kernel API. We *could* do it from userspace by providing facilities for userspace to query the VFS caches: "is this pathname in the dentry cache" and "is this inode in the inode cache". Even we have these available, i am afraid it will still introduce non-negligible overhead due to frequent system calls for a directory walking operation, especially under massive small file situations. --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1318,6 +1318,42 @@ void shrink_dcache_parent(struct dentry *parent) } EXPORT_SYMBOL(shrink_dcache_parent); +static enum d_walk_ret gather_inode(void *data, struct dentry *dentry) +{ + struct list_head *list = data; + struct inode *inode = dentry->d_inode; + + if ((inode == NULL) || ((!inode_owner_or_capable(inode)) && + (!capable(CAP_SYS_ADMIN + goto out; + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || It's unclear what rationale lies behind this particular group of tests. + (inode->i_mapping->nrpages == 0) || + (!list_empty(&inode->i_lru))) { arg, the "Inode locking rules" at the top of fs/inode.c needs a refresh, I suspect. It is too vague. Formally, inode->i_lru is protected by i_sb->s_inode_lru->node[nid].lock, not by ->i_lock. I guess you can just do a list_lru_add() and that will atomically add the inode to your local list_lru if ->i_lru wasn't being used for anything else. I *think* that your use of i_lock works OK, because code which fiddles with i_lru and s_inode_lru also takes i_lock. However we need to decide which is the preferred and official lock. 
ie: what is the design here?? However... most inodes will be on an LRU list, won't they? Doesn't this reuse of i_lru mean that many inodes will fail to be processed? If so, we might need to add a new list_head to the inode, which will be problematic. As far as I know, fix me if i am wrong, only when inode has zero reference count, it will be put into superblock lru list. For most situations, there is at least a dentry refers to it, so it will not be on any lru list. Aside: inode_lru_isolate() fiddles directly with inode->i_lru without taking i_sb->s_inode_lru->node[nid].lock. Why doesn't this make a concurrent s_inode_lru walker go oops?? Should we be using list_lru_del() in there? (which should have been called list_lru_del_init(), sigh). It seems inode_lru_isolate() only called by prune_icache_sb() as a callback function. Before calling it, the caller has hold the lock. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/3] Fadvise: Directory level page cache cleaning support
Do we really need clean dcache/icache at the current stage? That will introduce more code work, so far, iput() will put those unreferenced inodes into superblock lru list. To free the inodes inside a specific directory, it seems we do not have a handy API to use, and need modify iput() to recognize our situation, and collect those inodes into our list rather than superblock lru list. Maybe we stay at current stage now, since it is simple and could gain the major benefits, leave the dcache/icache cleaning to do in the future? On 2013/12/31 5:33, Dave Hansen wrote: On 12/30/2013 11:40 AM, Andreas Dilger wrote: On Dec 30, 2013, at 12:18, Dave Hansen wrote: Why is this necessary to do in the kernel? Why not leave it to userspace to walk the filesystem(s)? I would suspect that trying to do it in userspace would be quite bad. It would require traversing the whole directory tree to issue cache flushed for each subdirectory, but it doesn't know when to stop traversal. That would mean the "cache flush" would turn into "cache pollute" and cause a lot of disk IO for subdirectories not in cache to begin with. That makes sense for dentries at least and is a pretty good reason. Probably good enough to to include some text in the patch description. ;) Perhaps: "We need this interface because we have no way of determining what is in the dcache from userspace, and we do not want userspace to pollute the dcache going and looking for page cache to evict." One other thing that bothers me: POSIX_FADV_DONTNEED on a directory seems like it should do something with the _directory_. It should undo the kernel's caching that happens as a result of readdir(). Should this also be trying to drop the dentry/inode entries like "echo 2 .../drop_caches" does? -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/3] Fadvise: Directory level page cache cleaning support
VFS relies on LRU-like page cache eviction algorithm to reclaim cache space, such general and simple algorithm is good regarding its application independence, and is working for normal situations. However, sometimes it does not help much for those applications which are performance sensitive or under heavy loads. Since LRU may incorrectly evict going-to-be referenced pages out, resulting in severe performance degradation due to cache thrashing. Applications have the most knowledge about the things they are doing, they can always do better if they are given a chance. This motivates to endow the applications more abilities to manipulate the page cache. Currently, Linux support file system wide cache cleaing by virtue of proc interface 'drop-caches', but it is very coarse granularity and was originally proposed for debugging. The other is to do file-level page cache cleaning through 'fadvise', however, this is sometimes less flexible and not easy to use especially in directory wide operations or under massive small-file situations. This patch extends 'fadvise' to support directory level page cache cleaning. The call to posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED) with 'fd' referring to a directory will recursively reclaim page cache entries of files inside 'fd'. For secruity concern, those inodes which the caller does not own appropriate permissions will not be manipulated. It is easy to demonstrate the advantages of directory level page cache cleaning. We use a machine with a Pentium(R) Dual-Core CPU E5800 @ 3.20GHz, and with 2GB memory. Two directories named '1' and '3' are created, with each containing X (360 - 460) files, and each file with a size of 2MB. The test scripts are as follows, The test scripts (without cache cleaning) #!/bin/bash cp -r 1 2 sync cp -r 3 4 sync time grep "data" 1/* The time on 'grep "data" 1/*' is measured with/without cache cleaning, under different file counts. 
With cache cleaning, we clean all cache entries of files in '2' before doing 'cp -r 3 4' by using pretty much the following two statements, fd = open("2", O_DIRECTORY, 0644); posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); The results are as follows (in seconds), X: Number of files inside each directory X Without Cleaning With Cleaning 360 2.3851.361 380 3.1591.466 400 3.9721.558 420 4.8231.548 440 5.7981.702 460 6.8882.197 The page cache is not large enough to buffer all the four directories, so 'cp -r 3 4' will result in some entries of '1' to be evicted (due to LRU). When re-accessing '1', some entries need be reloaded from disk, which is time-consuming. In this case, cleaning '2' before 'cp -r 3 4' enjoys a good speedup. Li Wang (3): VFS: Add the declaration of shrink_pagecache_parent Add shrink_pagecache_parent Fadvise: Add the ability for directory level page cache cleaning fs/dcache.c| 36 include/linux/dcache.h |1 + mm/fadvise.c |4 3 files changed, 41 insertions(+) -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/3] Fadvise: Add the ability for directory level page cache cleaning
Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- mm/fadvise.c |4 1 file changed, 4 insertions(+) diff --git a/mm/fadvise.c b/mm/fadvise.c index 3bcfd81..644d32d 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -113,6 +113,10 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: + if (S_ISDIR(file_inode(f.file)->i_mode)) { + shrink_pagecache_parent(f.file->f_dentry); + goto out; + } if (!bdi_write_congested(mapping->backing_dev_info)) __filemap_fdatawrite_range(mapping, offset, endbyte, WB_SYNC_NONE); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/3] VFS: Add the declaration of shrink_pagecache_parent
Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- include/linux/dcache.h |1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bf72e9a..6262171 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -249,6 +249,7 @@ extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); +extern void shrink_pagecache_parent(struct dentry *); extern void shrink_dcache_for_umount(struct super_block *); extern int d_invalidate(struct dentry *); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/3] Add shrink_pagecache_parent
Analogous to shrink_dcache_parent except that it collects inodes. It is not very appropriate to be put in dcache.c, but d_walk can only be invoked from here. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- fs/dcache.c | 36 1 file changed, 36 insertions(+) diff --git a/fs/dcache.c b/fs/dcache.c index 6055d61..0fc0f80 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1318,6 +1318,42 @@ void shrink_dcache_parent(struct dentry *parent) } EXPORT_SYMBOL(shrink_dcache_parent); +static enum d_walk_ret gather_inode(void *data, struct dentry *dentry) +{ + struct list_head *list = data; + struct inode *inode = dentry->d_inode; + + if ((inode == NULL) || ((!inode_owner_or_capable(inode)) && + (!capable(CAP_SYS_ADMIN + goto out; + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0) || + (!list_empty(&inode->i_lru))) { + goto out_unlock; + } + __iget(inode); + list_add_tail(&inode->i_lru, list); +out_unlock: + spin_unlock(&inode->i_lock); +out: + return D_WALK_CONTINUE; +} + +void shrink_pagecache_parent(struct dentry *parent) +{ + LIST_HEAD(list); + struct inode *inode, *next; + + d_walk(parent, &list, gather_inode, NULL); + list_for_each_entry_safe(inode, next, &list, i_lru) { + list_del_init(&inode->i_lru); + invalidate_mapping_pages(inode->i_mapping, 0, -1); + iput(inode); + } +} +EXPORT_SYMBOL(shrink_pagecache_parent); + static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/3] Ceph fscache: Fix kernel panic due to a race
Hi Milosz, As far as I know, logically, currently fscache does not play as write cache for Ceph, except that there is a call to ceph_readpage_to_fscache() in ceph_writepage(), but that is nothing related to our test case. According to our observation, our test case never goes through ceph_writepage(), instead, it goes through ceph_writepages(). So in other words, I donot think this is related to caching in write path. May I try to explain the panic in more detail, (1) dd if=/dev/zero of=cephfs/foo bs=8 count=512 (2) echo 3 > /proc/sys/vm/drop_caches (3) dd if=cephfs/foo of=/dev/null bs=8 count=1024 For statement (1), it is frequently appending a file, so ceph_aio_write() frequently updates the inode->i_size, however, these updates did not immediately reflected to object->store_limit_l. For statement (3), when we start reading the second page at [4096, 8192), ceph find that the page does not be cached in fscache, then it decides to write this page into fscache, during this process in cachefiles_write_page(), it found that object->store_limit_l < 4096 (page->index << 12), it causes panic. Does it make sense? Cheers, Li Wang On 2013/12/27 6:51, Milosz Tanski wrote: Li, I looked at the patchset am I correct that this only happens when we enable caching in the write path? - Milosz On Thu, Dec 26, 2013 at 9:29 AM, Li Wang wrote: From: Yunchuan Wen The following scripts could easily panic the kernel, #!/bin/bash mount -t ceph -o fsc MONADDR:/ cephfs rm -rf cephfs/foo dd if=/dev/zero of=cephfs/foo bs=8 count=512 echo 3 > /proc/sys/vm/drop_caches dd if=cephfs/foo of=/dev/null bs=8 count=1024 This is due to when writing a page into fscache, the code will assert that the write position does not exceed the object->store_limit_l, which is supposed to be equal to inode->i_size. 
However, for current implementation, after file writing, the object->store_limit_l is not synchronized with new inode->i_size immediately, which introduces a race that if writing a new page into fscache, will reach the ASSERT that write position has exceeded the object->store_limit_l, and cause kernel panic. This patch fixes it. Yunchuan Wen (3): Ceph fscache: Add an interface to synchronize object store limit Ceph fscache: Update object store limit after writing Ceph fscache: Wait for completion of object initialization fs/ceph/cache.c |1 + fs/ceph/cache.h | 10 ++ fs/ceph/file.c |3 +++ 3 files changed, 14 insertions(+) -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/3] Ceph fscache: Add an interface to synchronize object store limit
From: Yunchuan Wen Add an interface to explicitly synchronize object->store_limit[_l] with inode->i_size Signed-off-by: Yunchuan Wen Signed-off-by: Min Chen Signed-off-by: Li Wang --- fs/ceph/cache.h | 10 ++ 1 file changed, 10 insertions(+) diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index ba94940..262106b 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page); void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); void ceph_queue_revalidate(struct inode *inode); +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + fscache_attr_changed(ci->fscache); +} + static inline void ceph_fscache_invalidate(struct inode *inode) { fscache_invalidate(ceph_inode(inode)->fscache); @@ -127,6 +133,10 @@ static inline void ceph_readpage_to_fscache(struct inode *inode, { } +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ +} + static inline void ceph_fscache_invalidate(struct inode *inode) { } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/3] Ceph fscache: Update object store limit after file writing
From: Yunchuan Wen Synchronize object->store_limit[_l] with new inode->i_size after file writing. Signed-off-by: Yunchuan Wen Signed-off-by: Min Chen Signed-off-by: Li Wang --- fs/ceph/file.c |3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3de8982..b6df7ab 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -786,6 +786,7 @@ retry_snap: goto retry_snap; } } else { + loff_t old_size = inode->i_size; /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -796,6 +797,8 @@ retry_snap: written = generic_file_buffered_write(iocb, iov, nr_segs, pos, &iocb->ki_pos, count, 0); + if (inode->i_size > old_size) + ceph_fscache_update_objectsize(inode); mutex_unlock(&inode->i_mutex); } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/3] Ceph fscache: Fix kernel panic due to a race
From: Yunchuan Wen The following scripts could easily panic the kernel, #!/bin/bash mount -t ceph -o fsc MONADDR:/ cephfs rm -rf cephfs/foo dd if=/dev/zero of=cephfs/foo bs=8 count=512 echo 3 > /proc/sys/vm/drop_caches dd if=cephfs/foo of=/dev/null bs=8 count=1024 This is due to when writing a page into fscache, the code will assert that the write position does not exceed the object->store_limit_l, which is supposed to be equal to inode->i_size. However, for current implementation, after file writing, the object->store_limit_l is not synchronized with new inode->i_size immediately, which introduces a race that if writing a new page into fscache, will reach the ASSERT that write position has exceeded the object->store_limit_l, and cause kernel panic. This patch fixes it. Yunchuan Wen (3): Ceph fscache: Add an interface to synchronize object store limit Ceph fscache: Update object store limit after writing Ceph fscache: Wait for completion of object initialization fs/ceph/cache.c |1 + fs/ceph/cache.h | 10 ++ fs/ceph/file.c |3 +++ 3 files changed, 14 insertions(+) -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/3] Ceph fscache: Wait for completion of object initialization
From: Yunchuan Wen The object store limit needs to be updated after writing, and this can be done provided the corresponding object has already been initialized. Current object initialization is done asynchrously, which introduce a race if a file is opened, then immediately followed by a writing, the initialization may have not completed, the code will reach the ASSERT in fscache_submit_exclusive_op() to cause kernel bug. Signed-off-by: Yunchuan Wen Signed-off-by: Min Chen Signed-off-by: Li Wang --- fs/ceph/cache.c |1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 8c44fdd..834f9f3 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -205,6 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, ci->fscache = fscache_acquire_cookie(fsc->fscache, &ceph_fscache_inode_object_def, ci, true); + fscache_check_consistency(ci->fscache); done: mutex_unlock(&inode->i_mutex); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] ceph fscache: Introduce a routine for uncaching a single page with no data from fscache
Signed-off-by: Li Wang --- fs/ceph/cache.h | 13 + 1 file changed, 13 insertions(+) diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index ba94940..da95f61 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -67,6 +67,14 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) return fscache_maybe_release_page(ci->fscache, page, gfp); } +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) + __fscache_uncache_page(ci->fscache, page); +} + static inline void ceph_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { @@ -145,6 +153,11 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) return 1; } +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ +} + static inline void ceph_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/2] ceph fscache: uncaching single no data page when error
Currently, if a new page is allocated into fscache in readpage(), but no data is read into it due to an error encountered while reading from the OSDs, the slot in fscache is not uncached.
[PATCH 2/2] ceph fscache: Uncaching no data page from fscache in readpage()
Currently, if one new page allocated into fscache in readpage(), however, with no data read into due to error encountered during reading from OSDs, the slot in fscache is not uncached. This patch fixes this. Signed-off-by: Li Wang --- fs/ceph/addr.c |1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ec3ba43..0cc9749 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -209,6 +209,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = 0; if (err < 0) { SetPageError(page); + ceph_fscache_readpage_cancel(inode, page); goto out; } else { if (err < PAGE_CACHE_SIZE) { -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/5] VFS: Directory level cache cleaning
Both 'drop_caches' and 'vfs_cache_pressure' do coarse granularity control. Sometimes these do not help much for those performance sensitive applications. General and simple algorithms are good regarding its application independence and working for normal situations. However, since applications have the most knowledge about the things they are doing, they can always do better if they are given a chance. I think that is why compiler have directives, such as __inline__,__align__, cpu cache provides __prefetch__ etc. Similarly, I think we had better endow the applications more abilities to manipulate the metadata/page cache. This is potentially beneficial to avoid performance degradation due to cache thrashing. 'drop_caches' may not be the expected way to go, since its intention is for debugging. 'fadvise' is originally proposed at this purpose, I think we may start with making 'fadvise' could handle directory level page cache cleaning. On 2013/12/18 6:05, Dave Chinner wrote: On Mon, Dec 16, 2013 at 07:00:04AM -0800, Li Wang wrote: Currently, Linux only support file system wide VFS cache (dentry cache and page cache) cleaning through '/proc/sys/vm/drop_caches'. Sometimes this is less flexible. The applications may know exactly whether the metadata and data will be referenced or not in future, a desirable mechanism is to enable applications to reclaim the memory of unused cache entries at a finer granularity - directory level. This enables applications to keep hot metadata and data (to be referenced in the future) in the cache, and kick unused out to avoid cache thrashing. Another advantage is it is more flexible for debugging. This patch extend the 'drop_caches' interface to support directory level cache cleaning and has a complete backward compatibility. '{1,2,3}' keeps the same semantics as before. Besides, "{1,2,3}:DIRECTORY_PATH_NAME" is allowed to recursively clean the caches under DIRECTORY_PATH_NAME. 
For example, 'echo 1:/home/foo/jpg > /proc/sys/vm/drop_caches' will clean the page caches of the files inside 'home/foo/jpg'. It is easy to demonstrate the advantage of directory level cache cleaning. We use a virtual machine configured with an Intel(R) Xeon(R) 8-core CPU E5506 @ 2.13GHz, and with 1GB memory. Three directories named '1', '2' and '3' are created, with each containing 18 – 28 files. The test program opens all files in a directory and then tries the next directory. The order for accessing the directories is '1', '2', '3', '1'. The time on accessing '1' on the second time is measured with/without cache cleaning, under different file counts. With cache cleaning, we clean all cache entries of files in '2' before accessing the files in '3'. The results are as follows (in seconds), This sounds like a highly contrived test case. There is no reason why dentry cache access time would change going from 180k to 280k files in 3 directories unless you're right at the memory pressure balance point in terms of cache sizing. Note: by default, VFS will move those unreferenced inodes into a global LRU list rather than freeing them, for this experiment, we modified iput() to force to free inode as well, this behavior and related codes are left for further discussion, thus not reflected in this patch) Number of files: 18 20 22 24 26 Without cleaning: 2.165 6.977 10.032 11.571 13.443 With cleaning: 1.949 1.906 2.336 2.918 3.651 When the number of files is 18 in each directory, the metadata cache is large enough to buffer all entries of three directories, so re-accessing '1' will hit in the cache, regardless of whether '2' cleaned up or not. As the number of files increases, the cache can now only buffer two+ directories. Accessing '3' will result in some entries of '1' to be evicted (due to LRU). When re-accessing '1', some entries need be reloaded from disk, which is time-consuming. 
Ok, so exactly as I thought - your example working set is slightly larger than what the cache holds. Hence what you are describing is a cache reclaim threshold effect: something you can avoid with /proc/sys/vm/vfs_cache_pressure. Cheers, Dave. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/5] VFS: Directory level cache cleaning
This extension is just an add-on extension.
Re: [PATCH 0/5] VFS: Directory level cache cleaning
If we do wanna equip fadvise() with directory level page cache cleaning, this could be solved by invoking (inode_permission() || capable(CAP_SYS_ADMIN)) before manipulating the page cache of that inode. We think the current extension to 'drop_caches' has a complete back compatibility, the old semantics keep unchanged, and with add-on features to do finer granularity cache cleaning should be also desirable. On 2013/12/17 11:58, Matthew Wilcox wrote: On Tue, Dec 17, 2013 at 11:08:16AM +0800, Li Wang wrote: As far as we know, fadvise(DONTNEED) does not support metadata cache cleaning. We think that is desirable under massive small files situations. Another thing is that do people accept the behavior of feeding a directory fd to fadvise will recusively clean all page caches of files inside that directory? I think there's a really good permissions-related question here. If that's an acceptable interface, should one have to be CAP_SYS_ADMIN to issue the request? What if some of the files below this directory are not owned by the user issuing the request? On 2013/12/17 1:45, Cong Wang wrote: On Mon, Dec 16, 2013 at 7:00 AM, Li Wang wrote: This patch extend the 'drop_caches' interface to support directory level cache cleaning and has a complete backward compatibility. '{1,2,3}' keeps the same semantics as before. Besides, "{1,2,3}:DIRECTORY_PATH_NAME" is allowed to recursively clean the caches under DIRECTORY_PATH_NAME. For example, 'echo 1:/home/foo/jpg > /proc/sys/vm/drop_caches' will clean the page caches of the files inside 'home/foo/jpg'. This interface is ugly... And we already have a file-level drop cache, that is, fadvise(DONTNEED). Can you extend it if it can't handle a directory fd? 
-- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/5] VFS: Directory level cache cleaning
As far as we know, fadvise(DONTNEED) does not support metadata cache cleaning. We think that is desirable under massive small files situations. Another thing is that do people accept the behavior of feeding a directory fd to fadvise will recusively clean all page caches of files inside that directory? On 2013/12/17 1:45, Cong Wang wrote: On Mon, Dec 16, 2013 at 7:00 AM, Li Wang wrote: This patch extend the 'drop_caches' interface to support directory level cache cleaning and has a complete backward compatibility. '{1,2,3}' keeps the same semantics as before. Besides, "{1,2,3}:DIRECTORY_PATH_NAME" is allowed to recursively clean the caches under DIRECTORY_PATH_NAME. For example, 'echo 1:/home/foo/jpg > /proc/sys/vm/drop_caches' will clean the page caches of the files inside 'home/foo/jpg'. This interface is ugly... And we already have a file-level drop cache, that is, fadvise(DONTNEED). Can you extend it if it can't handle a directory fd? -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 5/5] VFS: Extend drop_caches sysctl handler to allow directory level cache cleaning
Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- fs/drop_caches.c | 45 + 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 9fd702f..ab31393 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -8,10 +8,11 @@ #include #include #include +#include #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ -int sysctl_drop_caches; +char sysctl_drop_caches[PATH_MAX]; static void drop_pagecache_sb(struct super_block *sb, void *unused) { @@ -54,15 +55,43 @@ int drop_caches_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int ret; + int command; + struct path path; + struct path root; - ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (ret) - return ret; - if (write) { - if (sysctl_drop_caches & 1) + ret = proc_dostring(table, write, buffer, length, ppos); + if (ret || !write) + goto out; + ret = -EINVAL; + command = sysctl_drop_caches[0] - '0'; + if (command < 1 || command > 3) + goto out; + if (sysctl_drop_caches[1] == '\0') { + if (command & 1) iterate_supers(drop_pagecache_sb, NULL); - if (sysctl_drop_caches & 2) + if (command & 2) drop_slab(); + ret = 0; + goto out; } - return 0; + if (sysctl_drop_caches[1] != ':' || sysctl_drop_caches[2] == '\0') + goto out; + if (sysctl_drop_caches[2] == '/') + get_fs_root(current->fs, &root); + else + get_fs_pwd(current->fs, &root); + ret = vfs_path_lookup(root.dentry, root.mnt, + &sysctl_drop_caches[2], 0, &path); + path_put(&root); + if (ret) + goto out; + if (command & 1) + shrink_pagecache_parent(path.dentry); + if (command & 2) + shrink_dcache_parent(path.dentry); + path_put(&path); +out: + if (ret) + memset(sysctl_drop_caches, 0, PATH_MAX); + return ret; } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please 
read the FAQ at http://www.tux.org/lkml/
[PATCH 3/5] VFS: Add the declaration of shrink_pagecache_parent
Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- include/linux/dcache.h |1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 57e87e7..ce11098 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -247,6 +247,7 @@ extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); +extern void shrink_pagecache_parent(struct dentry *); extern void shrink_dcache_for_umount(struct super_block *); extern int d_invalidate(struct dentry *); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/5] VFS: Convert drop_caches to accept string
Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- kernel/sysctl.c |6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34a6047..2f2d8ab 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1255,12 +1255,10 @@ static struct ctl_table vm_table[] = { }, { .procname = "drop_caches", - .data = &sysctl_drop_caches, - .maxlen = sizeof(int), + .data = sysctl_drop_caches, + .maxlen = PATH_MAX, .mode = 0644, .proc_handler = drop_caches_sysctl_handler, - .extra1 = &one, - .extra2 = &three, }, #ifdef CONFIG_COMPACTION { -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/5] VFS: Convert sysctl_drop_caches to string
Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- include/linux/mm.h |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1cedd00..5e3cc5b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -17,6 +17,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1920,7 +1921,7 @@ int in_gate_area_no_mm(unsigned long addr); #endif /* __HAVE_ARCH_GATE_AREA */ #ifdef CONFIG_SYSCTL -extern int sysctl_drop_caches; +extern char sysctl_drop_caches[PATH_MAX]; int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); #endif -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 4/5] VFS: Add shrink_pagecache_parent
Analogous to shrink_dcache_parent except that it collects inodes. It is not very appropriate to be put in dcache.c, but d_walk can only be invoked from here. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- fs/dcache.c | 35 +++ 1 file changed, 35 insertions(+) diff --git a/fs/dcache.c b/fs/dcache.c index 4bdb300..bcbfd0d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1318,6 +1318,41 @@ void shrink_dcache_parent(struct dentry *parent) } EXPORT_SYMBOL(shrink_dcache_parent); +static enum d_walk_ret gather_inode(void *data, struct dentry *dentry) +{ + struct list_head *list = data; + struct inode *inode = dentry->d_inode; + + if (inode == NULL) + goto out; + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0) || + (!list_empty(&inode->i_lru))) { + goto out_unlock; + } + __iget(inode); + list_add_tail(&inode->i_lru, list); +out_unlock: + spin_unlock(&inode->i_lock); +out: + return D_WALK_CONTINUE; +} + +void shrink_pagecache_parent(struct dentry *parent) +{ + LIST_HEAD(list); + struct inode *inode, *next; + + d_walk(parent, &list, gather_inode, NULL); + list_for_each_entry_safe(inode, next, &list, i_lru) { + list_del_init(&inode->i_lru); + invalidate_mapping_pages(inode->i_mapping, 0, -1); + iput(inode); + } +} +EXPORT_SYMBOL(shrink_pagecache_parent); + static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/5] VFS: Directory level cache cleaning
Currently, Linux only support file system wide VFS cache (dentry cache and page cache) cleaning through '/proc/sys/vm/drop_caches'. Sometimes this is less flexible. The applications may know exactly whether the metadata and data will be referenced or not in future, a desirable mechanism is to enable applications to reclaim the memory of unused cache entries at a finer granularity - directory level. This enables applications to keep hot metadata and data (to be referenced in the future) in the cache, and kick unused out to avoid cache thrashing. Another advantage is it is more flexible for debugging. This patch extend the 'drop_caches' interface to support directory level cache cleaning and has a complete backward compatibility. '{1,2,3}' keeps the same semantics as before. Besides, "{1,2,3}:DIRECTORY_PATH_NAME" is allowed to recursively clean the caches under DIRECTORY_PATH_NAME. For example, 'echo 1:/home/foo/jpg > /proc/sys/vm/drop_caches' will clean the page caches of the files inside 'home/foo/jpg'. It is easy to demonstrate the advantage of directory level cache cleaning. We use a virtual machine configured with an Intel(R) Xeon(R) 8-core CPU E5506 @ 2.13GHz, and with 1GB memory. Three directories named '1', '2' and '3' are created, with each containing 18 – 28 files. The test program opens all files in a directory and then tries the next directory. The order for accessing the directories is '1', '2', '3', '1'. The time on accessing '1' on the second time is measured with/without cache cleaning, under different file counts. With cache cleaning, we clean all cache entries of files in '2' before accessing the files in '3'. 
The results are as follows (in seconds), Note: by default, VFS will move those unreferenced inodes into a global LRU list rather than freeing them, for this experiment, we modified iput() to force to free inode as well, this behavior and related codes are left for further discussion, thus not reflected in this patch) Number of files: 18 20 22 24 26 Without cleaning: 2.165 6.977 10.032 11.571 13.443 With cleaning: 1.949 1.906 2.336 2.918 3.651 When the number of files is 18 in each directory, the metadata cache is large enough to buffer all entries of three directories, so re-accessing '1' will hit in the cache, regardless of whether '2' cleaned up or not. As the number of files increases, the cache can now only buffer two+ directories. Accessing '3' will result in some entries of '1' to be evicted (due to LRU). When re-accessing '1', some entries need be reloaded from disk, which is time-consuming. In this case, cleaning '2' before accessing '3' enjoys a good speedup, a maximum 4.29X performance improvements is achieved. The advantage of directory level page cache cleaning should be easier to be demonstrated. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen Li Wang (5): VFS: Convert drop_caches to accept string VFS: Convert sysctl_drop_caches to string VFS: Add the declaration of shrink_pagecache_parent VFS: Add shrink_pagecache_parent VFS: Extend drop_caches sysctl handler to allow directory level cache cleaning fs/dcache.c| 35 +++ fs/drop_caches.c | 45 + include/linux/dcache.h |1 + include/linux/mm.h |3 ++- kernel/sysctl.c|6 ++ 5 files changed, 77 insertions(+), 13 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] ceph: Clean up if error occurred in finish_read()
Clean up if error occurred rather than going through normal process Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- fs/ceph/addr.c |3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1e561c0..97845b4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -252,6 +252,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; + if (rc < 0) + goto unlock; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 0 : bytes; @@ -262,6 +264,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) flush_dcache_page(page); SetPageUptodate(page); ceph_readpage_to_fscache(inode, page); +unlock: unlock_page(page); page_cache_release(page); bytes -= PAGE_CACHE_SIZE; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/2] ceph: Add clean up if invalid osd reply received
Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen Li Wang (2): ceph: Clean up if error occurred in finish_read() ceph: Add necessary clean up if invalid reply received in handle_reply() fs/ceph/addr.c|3 +++ net/ceph/osd_client.c |7 +++ 2 files changed, 10 insertions(+) -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] ceph: Add necessary clean up if invalid reply received in handle_reply()
Wake up possible waiters, invoke the callback if any, and unregister the request
Re: [PATCH] Ceph: Avoid data inconsistency due to d-cache aliasing in readpage()
Hi Yan, zero_user_segment() has invoked flush_dcache_page() for us, we donnot wanna flush d-cache twice. Cheers, Li Wang On 11/13/2013 09:19 PM, Yan, Zheng wrote: On Wed, Nov 13, 2013 at 3:22 PM, Li Wang wrote: If the length of data to be read in readpage() is exactly PAGE_CACHE_SIZE, the original code does not flush d-cache for data consistency after finishing reading. This patches fixes this. Signed-off-by: Li Wang --- fs/ceph/addr.c |8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6df8bd4..7ba 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -210,9 +210,13 @@ static int readpage_nounlock(struct file *filp, struct page *page) if (err < 0) { SetPageError(page); goto out; - } else if (err < PAGE_CACHE_SIZE) { + } else { + if (err < PAGE_CACHE_SIZE) { /* zero fill remainder of page */ - zero_user_segment(page, err, PAGE_CACHE_SIZE); + zero_user_segment(page, err, PAGE_CACHE_SIZE); + } else { + flush_dcache_page(page); + } this doesn't make sense for me. why not call flush_dcache_page unconditionally? Regards Yan, Zheng } SetPageUptodate(page); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] Ceph: Avoid data inconsistency due to d-cache aliasing in readpage()
If the length of data to be read in readpage() is exactly PAGE_CACHE_SIZE, the original code does not flush d-cache for data consistency after finishing reading. This patches fixes this. Signed-off-by: Li Wang --- fs/ceph/addr.c |8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6df8bd4..7ba 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -210,9 +210,13 @@ static int readpage_nounlock(struct file *filp, struct page *page) if (err < 0) { SetPageError(page); goto out; - } else if (err < PAGE_CACHE_SIZE) { + } else { + if (err < PAGE_CACHE_SIZE) { /* zero fill remainder of page */ - zero_user_segment(page, err, PAGE_CACHE_SIZE); + zero_user_segment(page, err, PAGE_CACHE_SIZE); + } else { + flush_dcache_page(page); + } } SetPageUptodate(page); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 7/7] Cifs: Uncaching no-data page in readpage()
Currently, if one page allocated into fscache in readpage(), however, with no-data read, it is not uncached. This patch fixes this. Signed-off-by: Li Wang --- fs/cifs/file.c |4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7f2..153bc58 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3406,8 +3406,10 @@ static int cifs_readpage_worker(struct file *file, struct page *page, rc = cifs_read(file, read_data, PAGE_CACHE_SIZE, poffset); - if (rc < 0) + if (rc < 0) { + cifs_fscache_readpage_cancel(file_inode(file), page); goto io_error; + } else cifs_dbg(FYI, "Bytes read %d\n", rc); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 4/7] Ceph: Uncaching no-data page in readpage()
Currently, if one page allocated into fscache in readpage(), however, with no-data read, it is not uncached. This patch fixes this. Signed-off-by: Li Wang --- fs/ceph/addr.c |1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6df8bd4..be5f4b6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -209,6 +209,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = 0; if (err < 0) { SetPageError(page); + ceph_fscache_readpage_cancel(inode, page); goto out; } else if (err < PAGE_CACHE_SIZE) { /* zero fill remainder of page */ -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/7] Ceph: Introduce routine for uncaching single no-data page
Introduce a routine for uncaching single no-data page, typically in readpage(). Signed-off-by: Li Wang --- fs/ceph/cache.h | 13 + 1 file changed, 13 insertions(+) diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index ba94940..eb0ec76 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -67,6 +67,14 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) return fscache_maybe_release_page(ci->fscache, page, gfp); } + +static inline void cpeh_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_readpage_cancel(ci->fscache, page); +} + static inline void ceph_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { @@ -145,6 +153,11 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) return 1; } +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ +} + static inline void ceph_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 6/7] Cifs: Implement uncaching single no-data page
Implement the routine for uncaching single no-data page, typically in readpage(). Signed-off-by: Li Wang --- fs/cifs/fscache.c |7 +++ 1 file changed, 7 insertions(+) diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 8d4b7bc..168f184 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -223,6 +223,13 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) fscache_uncache_page(CIFS_I(inode)->fscache, page); } +void __cifs_fscache_readpage_cancel(struct inode *inode, struct page *page) +{ +cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n", + __func__, CIFS_I(inode)->fscache, inode); +fscache_readpage_cancel(CIFS_I(inode)->fscache, page); +} + void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n", -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/7] Fscache: Introduce new API fscache_readpage_cancel()
Introduce a new API fscache_readpage_cancel() for uncaching one single no-data page from fscache. Signed-off-by: Li Wang --- include/linux/fscache.h | 11 +++ 1 file changed, 11 insertions(+) diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 115bb81..f1ed21f 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -245,6 +245,8 @@ extern bool __fscache_maybe_release_page(struct fscache_cookie *, struct page *, gfp_t); extern void __fscache_uncache_all_inode_pages(struct fscache_cookie *, struct inode *); +extern void __fscache_readpage_cancel(struct fscache_cookie *cookie, + struct page *page); extern void __fscache_readpages_cancel(struct fscache_cookie *cookie, struct list_head *pages); extern void __fscache_disable_cookie(struct fscache_cookie *, bool); @@ -633,6 +635,15 @@ int fscache_alloc_page(struct fscache_cookie *cookie, return -ENOBUFS; } +static inline +void fscache_readpage_cancel(struct fscache_cookie *cookie, +struct page *page) +{ + if (fscache_cookie_valid(cookie)) + __fscache_readpage_cancel(cookie, page); +} + + /** * fscache_readpages_cancel - Cancel read/alloc on pages * @cookie: The cookie representing the inode's cache object. -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/7] Fscache: Implement uncaching single no-data page
Similar to the routine for multiple pages except that it takes page * as input rather than list head *. Signed-off-by: Li Wang --- fs/fscache/page.c |8 1 file changed, 8 insertions(+) diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 7f5c658..0c69f72 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -721,6 +721,14 @@ nobufs: } EXPORT_SYMBOL(__fscache_alloc_page); +void __fscache_readpage_cancel(struct fscache_cookie *cookie, + struct page *page) +{ + if (PageFsCache(page)) + __fscache_uncache_page(cookie, page); +} +EXPORT_SYMBOL(__fscache_readpage_cancel); + /* * Unmark pages allocate in the readahead code path (via: * fscache_readpages_or_alloc) after delegating to the base filesystem -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/7] Cifs and Ceph: Uncache single no-data page in readpage()
Currently, the page allocated into fscache in readpage() for Cifs and Ceph is not uncached if no data is read due to an io error. This patch fixes this. fscache_readpages_cancel() is for this kind of job but takes a list_head * as input, so a new routine taking a page * as input is introduced. Li Wang (7): Fscache: Introduce new API fscache_readpage_cancel() Fscache: Implement uncaching single no-data page Ceph: Introduce routine for uncaching single no-data page Ceph: Uncaching no-data page in readpage() Cifs: Introduce routine for uncaching single no-data page Cifs: Implement uncaching single no-data page Cifs: Uncaching no-data page in readpage() fs/ceph/addr.c |1 + fs/ceph/cache.h | 13 + fs/cifs/file.c |4 +++- fs/cifs/fscache.c |7 +++ fs/cifs/fscache.h | 13 + fs/fscache/page.c |8 include/linux/fscache.h | 11 +++ 7 files changed, 56 insertions(+), 1 deletion(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 5/7] Cifs: Introduce routine for uncaching single no-data page
Introduce a routine for uncaching single no-data page, typically in readpage(). Signed-off-by: Li Wang --- fs/cifs/fscache.h | 13 + 1 file changed, 13 insertions(+) diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 24794b6..c712f42 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -54,6 +54,7 @@ extern int __cifs_readpages_from_fscache(struct inode *, struct address_space *, struct list_head *, unsigned *); +extern void __cifs_fscache_readpage_cancel(struct inode *, struct page *); extern void __cifs_fscache_readpages_cancel(struct inode *, struct list_head *); extern void __cifs_readpage_to_fscache(struct inode *, struct page *); @@ -92,6 +93,13 @@ static inline void cifs_readpage_to_fscache(struct inode *inode, __cifs_readpage_to_fscache(inode, page); } +static inline void cifs_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ + if (CIFS_I(inode)->fscache) + return __cifs_fscache_readpage_cancel(inode, page); +} + static inline void cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { @@ -139,6 +147,11 @@ static inline int cifs_readpages_from_fscache(struct inode *inode, static inline void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {} +static inline void cifs_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ +} + static inline void cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] Ceph: allocate non-zero page to fscache in readpage()
ceph_osdc_readpages() returns the number of bytes read; currently, the code only allocates a full-zero page into fscache. This patch fixes this. Signed-off-by: Li Wang --- fs/ceph/addr.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6df8bd4..1e561c0 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -216,7 +216,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) } SetPageUptodate(page); - if (err == 0) + if (err >= 0) ceph_readpage_to_fscache(inode, page); out: -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH] ceph: Write through cache support based on fscache
Hi Milosz, Thanks for your comments. We think SSD and fscache based write cache is definitely useful for Ceph, since to some extent, write amplification slow down the write performance of Ceph. Lustre has already introduced SSD based write cache. SSD can be treated as an outer big cache for page cache. It can reduce the requirement of network and OSD bandwidth. Write back cache is more performance useful, but more complicated to implement to meet the consistence and other correctness semantic demands of Ceph and POSIX, such as sync(). Write through cache is much simpler, which will not bother too much. So our goal is to implement both, we plan to submit it as a blueprint at the incoming CDS. It would be great if you could help review and give comments on our codes during the development. Again, thanks very much. Cheers, Li Wang On 11/02/2013 12:51 AM, Milosz Tanski wrote: Li, I think it would be fantastic to see a write cache. In many workloads you ended up writing out a file and then turning around and reading it right back in on the same node. There's a few things that I would like to see. First, an mount option to turn on/off write through caching. There are some workloads / user hardware configurations that will not benefit from this (it might be a net negative). Also, I think it's nice to have a fallback to disable it it's miss behaving. Second, for correctness I think you should only do write-through caching if you have an exclusive cap on the file. Currently as the code is written it only reads from fscache if the file is open in read only mode and has the cache cap. This would also have to change. Thanks, - Milosz P.S: Sorry for the second message Li, I fail at email and forgot to reply-all. On Fri, Nov 1, 2013 at 9:49 AM, Li Wang wrote: Currently, fscache only plays as read cache for ceph, this patch enables it plays as the write through cache as well. 
A small trick to be discussed: if the writing to OSD finishes before the writing to fscache, the fscache writing is cancelled to avoid slow down the writepages() process. Signed-off-by: Min Chen Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- fs/ceph/addr.c | 10 +++--- fs/ceph/cache.c | 29 + fs/ceph/cache.h | 13 + 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6df8bd4..2465c49 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -506,7 +506,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); - ceph_readpage_to_fscache(inode, page); + ceph_writepage_to_fscache(inode, page); set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), @@ -634,6 +634,7 @@ static void writepages_finish(struct ceph_osd_request *req, if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) generic_error_remove_page(inode->i_mapping, page); + ceph_maybe_release_fscache_page(inode, page); unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); @@ -746,7 +747,7 @@ retry: while (!done && index <= end) { int num_ops = do_sync ? 
2 : 1; - unsigned i; + unsigned i, j; int first; pgoff_t next; int pvec_pages, locked_pages; @@ -894,7 +895,6 @@ get_more_pages: if (!locked_pages) goto release_pvec_pages; if (i) { - int j; BUG_ON(!locked_pages || first < 0); if (pvec_pages && i == pvec_pages && @@ -924,6 +924,10 @@ get_more_pages: osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, !!pool, false); + for (j = 0; j < locked_pages; j++) { + struct page *page = pages[j]; + ceph_writepage_to_fscache(inode, page); + } pages = NULL; /* request message now owns the pages array */ pool = NULL; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 6bfe65e..6f928c4 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -320,6 +320,24 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page) fscache_uncache_page(ci->fscache, page); } +void ceph_writepage_to_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return; + + if (!PageFsCa
[RFC PATCH] ceph: Write through cache support based on fscache
Currently, fscache only plays as read cache for ceph, this patch enables it plays as the write through cache as well. A small trick to be discussed: if the writing to OSD finishes before the writing to fscache, the fscache writing is cancelled to avoid slow down the writepages() process. Signed-off-by: Min Chen Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- fs/ceph/addr.c | 10 +++--- fs/ceph/cache.c | 29 + fs/ceph/cache.h | 13 + 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6df8bd4..2465c49 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -506,7 +506,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); - ceph_readpage_to_fscache(inode, page); + ceph_writepage_to_fscache(inode, page); set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), @@ -634,6 +634,7 @@ static void writepages_finish(struct ceph_osd_request *req, if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) generic_error_remove_page(inode->i_mapping, page); + ceph_maybe_release_fscache_page(inode, page); unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); @@ -746,7 +747,7 @@ retry: while (!done && index <= end) { int num_ops = do_sync ? 
2 : 1; - unsigned i; + unsigned i, j; int first; pgoff_t next; int pvec_pages, locked_pages; @@ -894,7 +895,6 @@ get_more_pages: if (!locked_pages) goto release_pvec_pages; if (i) { - int j; BUG_ON(!locked_pages || first < 0); if (pvec_pages && i == pvec_pages && @@ -924,6 +924,10 @@ get_more_pages: osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, !!pool, false); + for (j = 0; j < locked_pages; j++) { + struct page *page = pages[j]; + ceph_writepage_to_fscache(inode, page); + } pages = NULL; /* request message now owns the pages array */ pool = NULL; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 6bfe65e..6f928c4 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -320,6 +320,24 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page) fscache_uncache_page(ci->fscache, page); } +void ceph_writepage_to_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return; + + if (!PageFsCache(page)) { + if (fscache_alloc_page(ci->fscache, page, GFP_KERNEL)) + return; + } + + if (fscache_write_page(ci->fscache, page, GFP_KERNEL)) + fscache_uncache_page(ci->fscache, page); +} + + void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -328,6 +346,17 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) fscache_uncache_page(ci->fscache, page); } +void ceph_maybe_release_fscache_page(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (PageFsCache(page)) { + if (!fscache_check_page_write(ci->fscache, page)) + fscache_maybe_release_page(ci->fscache, + page, GFP_KERNEL); + } +} + void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { if (fsc->revalidate_wq) diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index ba94940..aa02b7a 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -45,7 +45,9 @@ int 
ceph_readpages_from_fscache(struct inode *inode, struct list_head *pages, unsigned *nr_pages); void ceph_readpage_to_fscache(struct inode *inode, struct page *page); +void ceph_writepage_to_fscache(struct inode *inode, struct page *page); void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); +void ceph_maybe_release_fscache_page(struct inode *inode, struct page *page); void ceph_queue_revalidate(struct inode *inode); static inline void ceph_fscache_invalidate(struct inode *inode) @@ -127,6 +129,11 @@ static inline void ceph_readpage_to_fscache(struct i
[PATCH] ceph: Update the pages in fscache in writepages() path
Currently, the pages in fscache only are updated in writepage() path, add the process in writepages(). Signed-off-by: Min Chen Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- fs/ceph/addr.c |8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6df8bd4..cc57911 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -746,7 +746,7 @@ retry: while (!done && index <= end) { int num_ops = do_sync ? 2 : 1; - unsigned i; + unsigned i, j; int first; pgoff_t next; int pvec_pages, locked_pages; @@ -894,7 +894,6 @@ get_more_pages: if (!locked_pages) goto release_pvec_pages; if (i) { - int j; BUG_ON(!locked_pages || first < 0); if (pvec_pages && i == pvec_pages && @@ -924,7 +923,10 @@ get_more_pages: osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, !!pool, false); - + for(j = 0; j < locked_pages; j++) { + struct page *page = pages[j]; + ceph_readpage_to_fscache(inode, page); + } pages = NULL; /* request message now owns the pages array */ pool = NULL; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v5] Ceph: Punch hole support for kernel client
This patch implements fallocate and punch hole support for Ceph kernel client. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- Against v3: Passed the fsx test from xfstests. Truncate rather than delete the first object. Thanks go to Sage and Zheng for the explanation. Silence the OSD ENOENT complaints. --- fs/ceph/file.c| 196 + net/ceph/osd_client.c | 11 ++- 2 files changed, 205 insertions(+), 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 2ddf061..e2bcd5c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -871,6 +872,200 @@ out: return offset; } +static inline void ceph_zero_partial_page( + struct inode *inode, loff_t offset, unsigned size) +{ + struct page *page; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + wait_on_page_writeback(page); + zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size); + unlock_page(page); + page_cache_release(page); + } +} + +static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, + loff_t length) +{ + loff_t nearly = round_up(offset, PAGE_CACHE_SIZE); + if (offset < nearly) { + loff_t size = nearly - offset; + if (length < size) + size = length; + ceph_zero_partial_page(inode, offset, size); + offset += size; + length -= size; + } + if (length >= PAGE_CACHE_SIZE) { + loff_t size = round_down(length, PAGE_CACHE_SIZE); + truncate_pagecache_range(inode, offset, offset + size - 1); + offset += size; + length -= size; + } + if (length) + ceph_zero_partial_page(inode, offset, length); +} + +static int ceph_zero_partial_object(struct inode *inode, + loff_t offset, loff_t *length) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + int ret = 0; + loff_t zero = 0; + int op; + + if (!length) { + op = offset ? 
CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; + length = &zero; + } else { + op = CEPH_OSD_OP_ZERO; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), + offset, length, + 1, op, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + NULL, 0, 0, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, + &inode->i_mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) { + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret == -ENOENT) + ret = 0; + } + ceph_osdc_put_request(req); + +out: + return ret; +} + +static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) +{ + int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + __s32 stripe_unit = ceph_file_layout_su(ci->i_layout); + __s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + __s32 object_size = ceph_file_layout_object_size(ci->i_layout); + loff_t object_set_size = (loff_t)object_size * stripe_count; + + loff_t nearly = (offset + object_set_size - 1) + / object_set_size * object_set_size; + while (length && offset < nearly) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + while (length >= object_set_size) { + int i; + loff_t pos = offset; + for (i = 0; i < stripe_count; ++i) { + ret = ceph_zero_partial_object(inode, pos, NULL); + if (ret < 0) + return ret; + pos += stripe_unit; + } + offset += object_set_size; + length -= object_set_size;
[PATCH v4] Ceph: Punch hole support for kernel client
This patch implements fallocate and punch hole support for Ceph kernel client. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- Passed the fsx test from xfstests. Truncate rather than delete the first object. Thanks go to Sage and Zheng for the explanation. --- fs/ceph/file.c| 193 + net/ceph/osd_client.c | 11 ++- 2 files changed, 202 insertions(+), 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 2ddf061..04201fb 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -871,6 +872,197 @@ out: return offset; } +static inline void ceph_zero_partial_page( + struct inode *inode, loff_t offset, unsigned size) +{ + struct page *page; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + wait_on_page_writeback(page); + zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size); + unlock_page(page); + page_cache_release(page); + } +} + +static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, + loff_t length) +{ + loff_t nearly = round_up(offset, PAGE_CACHE_SIZE); + if (offset < nearly) { + loff_t size = nearly - offset; + if (length < size) + size = length; + ceph_zero_partial_page(inode, offset, size); + offset += size; + length -= size; + } + if (length >= PAGE_CACHE_SIZE) { + loff_t size = round_down(length, PAGE_CACHE_SIZE); + truncate_pagecache_range(inode, offset, offset + size - 1); + offset += size; + length -= size; + } + if (length) + ceph_zero_partial_page(inode, offset, length); +} + +static int ceph_zero_partial_object(struct inode *inode, + loff_t offset, loff_t *length) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + int ret = 0; + loff_t zero = 0; + int op; + + if (!length) { + op = offset ? 
CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; + length = &zero; + } else { + op = CEPH_OSD_OP_ZERO; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), + offset, length, + 1, op, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + NULL, 0, 0, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, + &inode->i_mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_put_request(req); + +out: + return ret; +} + +static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) +{ + int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + __s32 stripe_unit = ceph_file_layout_su(ci->i_layout); + __s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + __s32 object_size = ceph_file_layout_object_size(ci->i_layout); + loff_t object_set_size = (loff_t)object_size * stripe_count; + + loff_t nearly = (offset + object_set_size - 1) + / object_set_size * object_set_size; + while (length && offset < nearly) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + while (length >= object_set_size) { + int i; + loff_t pos = offset; + for (i = 0; i < stripe_count; ++i) { + ret = ceph_zero_partial_object(inode, pos, NULL); + if (ret < 0) + return ret; + pos += stripe_unit; + } + offset += object_set_size; + length -= object_set_size; + } + while (length) { + loff_t size = length; + ret = ceph_zero_partial
[PATCH] x86: remove redundant local_irq_enable() after cpuidle_idle_call()
When cpuidle_idle_call() returns 0, it indicates that the Linux system is using an idle framework driver. In that case, the local irq has already been enabled inside cpuidle_idle_call(), so there is no need to enable the local irq again when 0 is returned. The removed code was introduced by commit 97a5b81fa4d3a11dcdf224befc577f2e0abadc0b ("x86: Fix idle consolidation fallout"). The defect fixed there occurred when no idle framework driver was used and amd_e400_idle() was called directly; the problem was that amd_e400_idle() did not enable the irq. Signed-off-by: Li Wang --- arch/x86/kernel/process.c |2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 83369e5..cb55ee4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -300,8 +300,6 @@ void arch_cpu_idle(void) { if (cpuidle_idle_call()) x86_idle(); - else - local_irq_enable(); } /* -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3] Ceph: Punch hole support for kernel client
This patch implements fallocate and punch hole support for Ceph kernel client. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- Passed the fsx test from xfstests. --- fs/ceph/file.c| 191 + net/ceph/osd_client.c |8 ++- 2 files changed, 197 insertions(+), 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 656e169..6e56824 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -882,6 +883,195 @@ out: return offset; } +static inline void ceph_zero_partial_page( + struct inode *inode, loff_t offset, unsigned size) +{ + struct page *page; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + wait_on_page_writeback(page); + zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size); + unlock_page(page); + page_cache_release(page); + } +} + +static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, + loff_t length) +{ + loff_t nearly = round_up(offset, PAGE_CACHE_SIZE); + if (offset < nearly) { + loff_t size = nearly - offset; + if (length < size) + size = length; + ceph_zero_partial_page(inode, offset, size); + offset += size; + length -= size; + } + if (length >= PAGE_CACHE_SIZE) { + loff_t size = round_down(length, PAGE_CACHE_SIZE); + truncate_pagecache_range(inode, offset, offset + size - 1); + offset += size; + length -= size; + } + if (length) + ceph_zero_partial_page(inode, offset, length); +} + +static int ceph_zero_partial_object(struct inode *inode, + loff_t offset, loff_t *length) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + int ret = 0; + loff_t zero = 0; + int op = CEPH_OSD_OP_ZERO; + + if (!length) { + op = CEPH_OSD_OP_DELETE; + length = &zero; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), + offset, length, + 1, op, + 
CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + NULL, 0, 0, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, + &inode->i_mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_put_request(req); + +out: + return ret; +} + +static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) +{ + int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + __s32 stripe_unit = ceph_file_layout_su(ci->i_layout); + __s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + __s32 object_size = ceph_file_layout_object_size(ci->i_layout); + loff_t object_set_size = (loff_t)object_size * stripe_count; + + loff_t nearly = (offset + object_set_size - 1) + / object_set_size * object_set_size; + while (length && offset < nearly) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + while (length >= object_set_size) { + int i; + loff_t pos = offset; + for (i = 0; i < stripe_count; ++i) { + ret = ceph_zero_partial_object(inode, pos, NULL); + if (ret < 0) + return ret; + pos += stripe_unit; + } + offset += object_set_size; + length -= object_set_size; + } + while (length) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } +
[RFC] Ceph: Kernel client part of inline data support
This patch implements the kernel client part of inline data support, the algorithm is described below. This is a preliminarly implementation based on Linux kernel 3.8.3. State: CEPH_INLINE_MIGRATION: The file size has exceeded the threshold of inline, but MDS has the newest inline data CEPH_INLINE_DISABLED: The file is not inlined, and MDS does not have the inline data Client: Open, lookup, getattr, handle_cap_grant etc, MDS send inline data together with inode metadata to client Read side: if (hold CEPH_CAP_FILE_CACHE capability) // ceph_readpage()/ceph_readpages() if (state < CEPH_INLINE_MIGRATION) copy inline data from inode buffer into page cache else if (state == CEPH_INLINE_MIGRATION) read the data from the OSD replace the head of the first page with the inline data from inode buffer else // ceph_sync_read() if (state != CEPH_INLINE_DISABLED) send GETATTR message to MDS to fetch inline data into inode buffer copy the inline data from inode buffer to user buffer directly if (state == CEPH_INLINE_MIGRATION and pos+len>CEPH_INLINE_SIZE) continue to read the remaning data from OSD to user buffer Write side: if (hold CEPH_CAP_FILE_CACHE capability) if (state < CEPH_INLINE_MIGRATION) // ceph_write_end() if (pos < CEPH_INLINE_SIZE) if (pos + len > CEPH_INLINE_SIZE) let state = CEPH_INLINE_DISABLED else let state = CEPH_INLINE_MIGRATION else if (state == CEPH_INLINE_MIGRATION) if (pos < CEPH_INLINE_SIZE) let state = CEPH_INLINE_DISABLED; if (state < CEPH_INLINE_MIGRATION) // ceph_writepage/ceph_writepages_start() copy data from page cache into inode buffer mark cap and inode dirty to send inode buffer to MDS else do the normal write to OSD else // ceph_sync_write() if (state != CEPH_INLINE_DISABLED) if (pos < CEPH_INLINE_SIZE) copy the written data fit into [pos, min(pos+len, CEPH_INLINE_SIZE)) from user buffer directly to inode buffer let dirty_data_only=true, record the write pos as well as length // leave MDS to merge mark cap and inode dirty to send (maybe part 
of) written data to MDS if (pos + len >= CEPH_INLINE_SIZE) let state = CEPH_INLINE_MIGRATION write the remaining data to OSD else do the normal write to OSD Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- fs/ceph/addr.c | 186 ++ fs/ceph/caps.c | 61 -- fs/ceph/file.c | 90 +++- fs/ceph/inode.c | 19 - fs/ceph/mds_client.c | 14 ++-- fs/ceph/mds_client.h |2 + fs/ceph/super.h | 14 include/linux/ceph/ceph_fs.h |4 + net/ceph/messenger.c |2 +- 9 files changed, 342 insertions(+), 50 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 064d1a6..033396c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -204,6 +204,18 @@ static int readpage_nounlock(struct file *filp, struct page *page) dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); + + if (ci->i_inline_data.version < CEPH_INLINE_MIGRATION && ci->i_inline_data.length) { + void *virt = kmap(page); + memcpy(virt, ci->i_inline_data.data, ci->i_inline_data.length); + kunmap(page); + zero_user_segment(page, ci->i_inline_data.length, PAGE_CACHE_SIZE); + flush_dcache_page(page); + SetPageUptodate(page); + err = 0; + goto out; + } + err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, (u64) page_offset(page), &len, ci->i_truncate_seq, ci->i_truncate_size, @@ -217,6 +229,13 @@ static int readpage_nounlock(struct file *filp, struct page *page) /* zero fill remainder of page */ zero_user_segment(page, err, PAGE_CACHE_SIZE); } + + if (ci->i_inline_data.version == CEPH_INLINE_MIGRATION && ci->i_inline_data.length) { + void *virt = kmap(page); + memcpy(virt, ci->i_inline_data.data, ci->i_inline_data.length); + kunmap(page); + flush_dcache_page(page); + } SetPageUptodate(page); out: @@ -252,6 +271,15 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { struct page *page = req->r_pages[i]; + struct ceph_inode_info *ci = ceph_inode(inode); + if (ci->i_inline_data.version == 
CEPH_INLINE_MIGRATION && page->index == 0) { + if (ci->i_inline_data.lengt
[PATCH v2] Ceph: Punch hole support
This patch implements punch hole (fallocate) support for Ceph. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- fs/ceph/file.c| 313 + net/ceph/osd_client.c |8 +- 2 files changed, 319 insertions(+), 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 656e169..578e5fd 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -882,6 +883,317 @@ out: return offset; } +static inline void ceph_zero_partial_page(struct inode *inode, pgoff_t index, unsigned start, unsigned size) +{ + struct page *page; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + zero_user(page, start, size); + unlock_page(page); + page_cache_release(page); + } +} + +static void ceph_truncate_and_zero_page_cache(struct inode *inode, loff_t offset, loff_t length) +{ + loff_t first_page; + loff_t last_page; + loff_t zero_len; + + first_page =((offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; + last_page = ((offset + length) >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; + if (last_page > first_page) { + truncate_pagecache_range(inode, first_page, last_page - 1); + } + if (first_page > last_page) { + ceph_zero_partial_page(inode, offset >> PAGE_CACHE_SHIFT, offset & (PAGE_CACHE_SIZE - 1), length); + return; + } + /* +* zero out the partial page that contains +* the start of the hole +*/ + zero_len = first_page - offset; + if (zero_len > 0) { + ceph_zero_partial_page(inode, offset >> PAGE_CACHE_SHIFT, offset & (PAGE_CACHE_SIZE -1), zero_len); + } + /* +* zero out the partial page that contains +* the end of the hole +*/ + zero_len = offset + length - last_page; + if (zero_len > 0) { + ceph_zero_partial_page(inode, (offset + length) >> PAGE_CACHE_SHIFT, 0, zero_len); + } + /* +* If i_size is contained in the last page, we need to +* zero the partial page after i_size +*/ + if (inode->i_size >> PAGE_CACHE_SHIFT == (offset + length) >> PAGE_CACHE_SHIFT && 
inode->i_size % PAGE_CACHE_SIZE != 0) { + zero_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + if (zero_len > 0) { + ceph_zero_partial_page(inode, inode->i_size >> PAGE_CACHE_SHIFT, inode->i_size & (PAGE_CACHE_SIZE -1), zero_len); + } + } +} + +static inline __u32 ceph_calculate_shift(__s64 size) +{ + int shift; + + if (size <= 0) + return -1; + if (size == 1) + return 0; + for (shift = 0; ;shift++) { + if (2 << shift == size) + break; + } + shift++; + + return shift; +} + +static int ceph_delete_object(struct inode *inode, u64 offset, u64 *length) +{ + struct ceph_inode_info *ci = ceph_inode(inode); +struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + int ret = 0; + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, +ceph_vino(inode), offset, length, 1, +CEPH_OSD_OP_DELETE, CEPH_OSD_FLAG_ONDISK, +NULL, +ci->i_truncate_seq, ci->i_truncate_size, +false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + +ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); +if (!ret) { +ret = ceph_osdc_wait_request(&fsc->client->osdc, req); +} + ceph_osdc_put_request(req); + + out: + return ret; +} + +static int ceph_zero_partial_object(struct inode *inode, loff_t offset, loff_t *length) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + int ret = 0; + + if (length <= 0) + goto out; + + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, +ceph_vino(inode), offset, length, 1, +CEPH_OSD_OP_ZERO, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, +NULL, +ci->i_truncate_seq, ci->i_truncate_size, +
[PATCH 1/2] Punch hole support against 3.8-rc3
This patch implements punch hole (fallocate) support against Linux kernel 3.8-rc3. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- fs/ceph/file.c| 248 + net/ceph/osd_client.c | 17 +++- 2 files changed, 260 insertions(+), 5 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e51558f..7fb9c6d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -848,6 +849,252 @@ out: return offset; } +static inline void ceph_zero_partial_page(struct inode *inode, pgoff_t index, unsigned start, unsigned size) +{ + struct page *page; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + zero_user(page, start, size); + unlock_page(page); + page_cache_release(page); + } +} + +static void ceph_truncate_and_zero_page_cache(struct inode *inode, loff_t offset, loff_t length) +{ + loff_t first_page; + loff_t last_page; + loff_t zero_len; + + first_page =((offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; + last_page = ((offset + length) >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; + if (last_page > first_page) { + truncate_pagecache_range(inode, first_page, last_page - 1); + } + if (first_page > last_page) { + ceph_zero_partial_page(inode, offset >> PAGE_CACHE_SHIFT, offset & (PAGE_CACHE_SIZE - 1), length); + return; + } + /* +* zero out the partial page that contains +* the start of the hole +*/ + zero_len = first_page - offset; + if (zero_len > 0) { + ceph_zero_partial_page(inode, offset >> PAGE_CACHE_SHIFT, offset & (PAGE_CACHE_SIZE -1), zero_len); + } + /* +* zero out the partial page that contains +* the end of the hole +*/ + zero_len = offset + length - last_page; + if (zero_len > 0) { + ceph_zero_partial_page(inode, (offset + length) >> PAGE_CACHE_SHIFT, 0, zero_len); + } + /* +* If i_size is contained in the last page, we need to +* zero the partial page after i_size +*/ + if (inode->i_size >> PAGE_CACHE_SHIFT == (offset + length) >> 
PAGE_CACHE_SHIFT && inode->i_size % PAGE_CACHE_SIZE != 0) { + zero_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + if (zero_len > 0) { + ceph_zero_partial_page(inode, inode->i_size >> PAGE_CACHE_SHIFT, inode->i_size & (PAGE_CACHE_SIZE -1), zero_len); + } + } +} + +static int ceph_delete_object_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + struct ceph_inode_info *ci = ceph_inode(inode); +struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + u64 length = ceph_file_layout_object_size(ci->i_layout); + loff_t offset; + int ret = 0; + + if (lstart > lend || length <= 0) + goto out; + for (offset = lstart; offset <= lend; offset += length) { + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, +ceph_vino(inode), offset, &length, +CEPH_OSD_OP_DELETE, CEPH_OSD_FLAG_ONDISK, +NULL, +0, +ci->i_truncate_seq, ci->i_truncate_size, +NULL, false, 1, 0); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) { + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + } + ceph_osdc_put_request(req); + /* object deleted */ + if (ret == -ENOENT) + ret = 0; + } + + out: + return ret; +} + +static int ceph_zero_partial_object(struct file *file, loff_t offset, loff_t length) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + struct timespec mtime = CURRENT_TIME; + int want, got = 0, ret = 0; + + if (length <= 0) + goto out; + + +
[PATCH 0/2] Kernel file system client support for punch hole
This patch implements punch hole (fallocate) support for the Ceph kernel file system client. We prepared two patches based on different kernel versions, one against kernel 3.8-rc3, the other against the latest 3.10-rc5. This is because, unfortunately, we failed to set up a workable Ceph system with the client based on the latest code from the Linux kernel git tree; for the server side, we tried both the latest code from the Ceph git tree and the latest v0.61.3 release. The client will easily hang there without any response, unless rebooting the machine. We managed to set up a Ceph system with the client based on Linux kernel 3.8-rc3 and the server based on Ceph v0.61.3, so the patch against v3.8-rc3 has been under preliminary tests. However, the one against v3.10-rc5 has not. Comments are appreciated. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] Punch hole support against 3.10-rc5
This patch implements punch hole (fallocate) support against Linux kernel 3.10-rc5. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen --- fs/ceph/file.c| 245 + net/ceph/osd_client.c |8 +- 2 files changed, 251 insertions(+), 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 656e169..e092b69 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -882,6 +883,249 @@ out: return offset; } +static inline void ceph_zero_partial_page(struct inode *inode, pgoff_t index, unsigned start, unsigned size) +{ + struct page *page; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + zero_user(page, start, size); + unlock_page(page); + page_cache_release(page); + } +} + +static void ceph_truncate_and_zero_page_cache(struct inode *inode, loff_t offset, loff_t length) +{ + loff_t first_page; + loff_t last_page; + loff_t zero_len; + + first_page =((offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; + last_page = ((offset + length) >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; + if (last_page > first_page) { + truncate_pagecache_range(inode, first_page, last_page - 1); + } + if (first_page > last_page) { + ceph_zero_partial_page(inode, offset >> PAGE_CACHE_SHIFT, offset & (PAGE_CACHE_SIZE - 1), length); + return; + } + /* +* zero out the partial page that contains +* the start of the hole +*/ + zero_len = first_page - offset; + if (zero_len > 0) { + ceph_zero_partial_page(inode, offset >> PAGE_CACHE_SHIFT, offset & (PAGE_CACHE_SIZE -1), zero_len); + } + /* +* zero out the partial page that contains +* the end of the hole +*/ + zero_len = offset + length - last_page; + if (zero_len > 0) { + ceph_zero_partial_page(inode, (offset + length) >> PAGE_CACHE_SHIFT, 0, zero_len); + } + /* +* If i_size is contained in the last page, we need to +* zero the partial page after i_size +*/ + if (inode->i_size >> PAGE_CACHE_SHIFT == (offset + length) >> 
PAGE_CACHE_SHIFT && inode->i_size % PAGE_CACHE_SIZE != 0) { + zero_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + if (zero_len > 0) { + ceph_zero_partial_page(inode, inode->i_size >> PAGE_CACHE_SHIFT, inode->i_size & (PAGE_CACHE_SIZE -1), zero_len); + } + } +} + +static int ceph_delete_object_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + struct ceph_inode_info *ci = ceph_inode(inode); +struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + u64 length = ceph_file_layout_object_size(ci->i_layout); + loff_t offset; + int ret = 0; + + if (lstart > lend || length <= 0) + goto out; + for (offset = lstart; offset <= lend; offset += length) { + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, +ceph_vino(inode), offset, &length, +1, CEPH_OSD_OP_DELETE, CEPH_OSD_FLAG_ONDISK, +NULL, +ci->i_truncate_seq, ci->i_truncate_size, +false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) { + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + } + ceph_osdc_put_request(req); + /* object deleted */ + if (ret == -ENOENT) + ret = 0; + } + + out: + return ret; +} + +static int ceph_zero_partial_object(struct file *file, loff_t offset, loff_t length) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + int want, got = 0, ret = 0; + + if (length <= 0) + goto out; + + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_F
[PATCH v4] ext4: Avoid unnecessarily writing back dirty pages before hole punching
For hole punching, currently ext4 will synchronously write back the dirty pages fit into the hole, since the data on the disk responding to those pages are to be deleted, it is benefical to directly release those pages, no matter they are dirty or not, except the ordered case. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen Cc: Dmitry Monakhov Reviewed-by: Zheng Liu Reviewed-by: Jan Kara --- Hi Jan, Did you mean this? It seems you donot like the jbd2_journal_begin_ordered_discard:), However, what do you think of calling jbd2_journal_begin_ordered_punch_hole() from jbd2_journal_begin_ordered_truncate()? In my option, the two guys stand at the same level. Nevertheless, it is up to your choice. --- fs/ext4/inode.c | 27 --- fs/jbd2/journal.c |2 +- fs/jbd2/transaction.c | 29 ++--- include/linux/jbd2.h | 33 +++-- 4 files changed, 54 insertions(+), 37 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d6382b8..844d1b8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3569,6 +3569,16 @@ int ext4_can_truncate(struct inode *inode) return 0; } +static inline int ext4_begin_ordered_punch_hole(struct inode *inode, + loff_t start, loff_t length) +{ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_punch_hole(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + start, start+length-1); +} + /* * ext4_punch_hole: punches a hole in a file by releaseing the blocks * associated with the given offset and length @@ -3602,17 +3612,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length); - /* -* Write out all dirty pages to avoid race conditions -* Then release them. 
-*/ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + length - 1); - if (ret) - return ret; - } - mutex_lock(&inode->i_mutex); /* It's not possible punch hole on append only file */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { @@ -3644,6 +3643,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) first_page_offset = first_page << PAGE_CACHE_SHIFT; last_page_offset = last_page << PAGE_CACHE_SHIFT; + if (ext4_should_order_data(inode)) { + ret = ext4_begin_ordered_punch_hole(inode, offset, length); + if (ret) + return ret; + } + /* Now release the pages */ if (last_page_offset > first_page_offset) { truncate_pagecache_range(inode, first_page_offset, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9545757..7af4e4f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -97,7 +97,7 @@ EXPORT_SYMBOL(jbd2_journal_force_commit); EXPORT_SYMBOL(jbd2_journal_file_inode); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); -EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_punch_hole); EXPORT_SYMBOL(jbd2_inode_cache); static void __journal_abort_soft (journal_t *journal, int errno); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 10f524c..262b1c3 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2305,29 +2305,10 @@ done: return 0; } -/* - * File truncate and transaction commit interact with each other in a - * non-trivial way. If a transaction writing data block A is - * committing, we cannot discard the data by truncate until we have - * written them. Otherwise if we crashed after the transaction with - * write has committed but before the transaction with truncate has - * committed, we could see stale data in block A. This function is a - * helper to solve this problem. 
It starts writeout of the truncated - * part in case it is in the committing transaction. - * - * Filesystem code must call this function when inode is journaled in - * ordered mode before truncation happens and after the inode has been - * placed on orphan list with the new inode size. The second condition - * avoids the race that someone writes new data and we start - * committing the transaction after this function has been called but - * before a transaction for truncate is started (and furthermore it - * allows us to optimize the case where the addition to orphan list - * happens in the same transaction as write --- we don't have to write - * any data in such case). - */ -int jbd2_journal_
[PATCH] ext4: Avoid unnecessarily writing back dirty pages before hole punching
For hole punching, currently ext4 will synchronously write back the dirty pages that fit into the hole. Since the data on the disk corresponding to those pages are to be deleted, it is beneficial to directly release those pages, no matter whether they are dirty or not, except in the ordered case. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen Cc: Dmitry Monakhov Cc: Jan Kara --- Hi Zheng and Jan, Thanks for your comments. For data=ordered vs. data=writeback, my understanding is that they both journal metadata, so metadata won't be corrupted in either case. And they both do not journal data, so data may be lost in either case. So it is basically the same in the overwriting situation, that is, data may not be fully updated. The difference lies in that for an appending write, with data=writeback, the commit of metadata is done asynchronously with the write of data, so it may happen that the file size is increased with data incompletely written, which leaves partly uninitialized data, as pointed out by Jan, and that results in security issues. For data=ordered, metadata is committed after data are written, with slightly? lower performance, so a reader won't read out uninitialized data. We introduce the internal function jbd2_journal_begin_ordered_discard() because it will be called by both jbd2_journal_begin_ordered_punch_hole() and jbd2_journal_begin_ordered_truncate(), and we want to leave the function prototype of jbd2_journal_begin_ordered_truncate() unchanged, and it has fewer arguments than the punch hole counterpart. The other way is to implement them independently without the internal begin_ordered_discard() function; however, in that case, the two functions will suffer from sharing a big and almost identical body, which is not elegant. We have taken other suggestions from Jan. 
--- fs/ext4/inode.c | 27 --- fs/jbd2/journal.c |2 +- fs/jbd2/transaction.c | 29 ++--- include/linux/jbd2.h | 41 +++-- 4 files changed, 62 insertions(+), 37 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d6382b8..6b0251e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3569,6 +3569,16 @@ int ext4_can_truncate(struct inode *inode) return 0; } +static inline int ext4_begin_ordered_punch_hole(struct inode *inode, + loff_t start, loff_t length) +{ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_punch_hole(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + start, length); +} + /* * ext4_punch_hole: punches a hole in a file by releaseing the blocks * associated with the given offset and length @@ -3602,17 +3612,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length); - /* -* Write out all dirty pages to avoid race conditions -* Then release them. -*/ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + length - 1); - if (ret) - return ret; - } - mutex_lock(&inode->i_mutex); /* It's not possible punch hole on append only file */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { @@ -3644,6 +3643,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) first_page_offset = first_page << PAGE_CACHE_SHIFT; last_page_offset = last_page << PAGE_CACHE_SHIFT; + if (ext4_should_order_data(inode)) { + ret = ext4_begin_ordered_punch_hole(inode, offset, length); + if (ret) + return ret; + } + /* Now release the pages */ if (last_page_offset > first_page_offset) { truncate_pagecache_range(inode, first_page_offset, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9545757..166ca5d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -97,7 +97,7 @@ EXPORT_SYMBOL(jbd2_journal_force_commit); EXPORT_SYMBOL(jbd2_journal_file_inode); 
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); -EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_discard); EXPORT_SYMBOL(jbd2_inode_cache); static void __journal_abort_soft (journal_t *journal, int errno); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 10f524c..2d7a3bf 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2305,29 +2305,10 @@ done: return 0; } -/* - * File truncate and transaction commit interact with each other in a - * non-trivial way. If a
[PATCH v2] ext4: Avoid unnecessarily writing back dirty pages before hole punching
For hole punching, currently ext4 will synchronously write back the dirty pages that fit into the hole. Since the data on the disk corresponding to those pages are to be deleted, it is beneficial to directly release those pages, no matter whether they are dirty or not, except in the ordered case. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen Reviewed-by: Zheng Liu Cc: Dmitry Monakhov --- Hi Zheng, Thanks for your comments. This is the revised version with the operation of writing back moved down after the inode mutex is held. But there is one thing I want to confirm: whether the inode mutex could prevent the mmap() writer? I did not take a careful look at the mmap() code; the straightforward thinking is that an mmap() write will directly dirty the pages without going through the VFS generic_file_write() path. BTW, I have one other question to confirm regarding the ext4 journal mode: what is the advantage of data=ordered journal mode compared to data=writeback? For an overwriting write, it still may lead to inconsistency between data and metadata, that is, data is new and metadata is old. So its standpoint is that it beats data=writeback for appending writes? 
--- fs/ext4/inode.c | 27 +- fs/jbd2/journal.c |1 + fs/jbd2/transaction.c | 61 +++-- include/linux/jbd2.h |3 +++ 4 files changed, 59 insertions(+), 33 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d6382b8..568b0bd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3569,6 +3569,16 @@ int ext4_can_truncate(struct inode *inode) return 0; } +static inline int ext4_begin_ordered_fallocate(struct inode *inode, + loff_t start, loff_t length) +{ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_fallocate(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + start, length); +} + /* * ext4_punch_hole: punches a hole in a file by releaseing the blocks * associated with the given offset and length @@ -3602,17 +3612,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length); - /* -* Write out all dirty pages to avoid race conditions -* Then release them. -*/ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + length - 1); - if (ret) - return ret; - } - mutex_lock(&inode->i_mutex); /* It's not possible punch hole on append only file */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { @@ -3644,6 +3643,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) first_page_offset = first_page << PAGE_CACHE_SHIFT; last_page_offset = last_page << PAGE_CACHE_SHIFT; + if (ext4_should_order_data(inode)) { + ret = ext4_begin_ordered_fallocate(inode, offset, length); + if (ret) + return ret; + } + /* Now release the pages */ if (last_page_offset > first_page_offset) { truncate_pagecache_range(inode, first_page_offset, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9545757..ccc483a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -98,6 +98,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 
EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_fallocate); EXPORT_SYMBOL(jbd2_inode_cache); static void __journal_abort_soft (journal_t *journal, int errno); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 10f524c..035c064 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2305,6 +2305,36 @@ done: return 0; } + +static int jbd2_journal_begin_ordered_discard(journal_t *journal, + struct jbd2_inode *jinode, + loff_t start, loff_t end) +{ + transaction_t *inode_trans, *commit_trans; + int ret = 0; + + /* This is a quick check to avoid locking if not necessary */ + if (!jinode->i_transaction) + goto out; + /* Locks are here just to force reading of recent values, it is +* enough that the transaction was not committing before we started +* a transaction adding the inode to orphan list */ + read_lock(&journal->j_state_lock); + commit_trans = journal->j_committing_transaction; + read_unlock(&journal->j_state_lock); +
[PATCH] eCryptfs: Avoid unnecessary disk read and data decryption during writing
ecryptfs_write_begin grabs a page from page cache for writing. If the page contains invalid data, or data older than the counterpart on the disk, eCryptfs will read out the corresponing data from the disk into the page, decrypt them, then perform writing. However, for this page, if the length of the data to be written into is equal to page size, that means the whole page of data will be overwritten, in which case, it does not matter whatever the data were before, it is beneficial to perform writing directly rather than bothering to read and decrypt first. With this optimization, according to our test on a machine with Intel Core 2 Duo processor, iozone 'write' operation on an existing file with write size being multiple of page size will enjoy a steady 3x speedup. Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen Reviewed-by: Tyler Hicks --- fs/ecryptfs/mmap.c | 12 ++-- 1 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index bd1d57f..564a1fa 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -338,7 +338,8 @@ static int ecryptfs_write_begin(struct file *file, if (prev_page_end_size >= i_size_read(page->mapping->host)) { zero_user(page, 0, PAGE_CACHE_SIZE); - } else { + SetPageUptodate(page); + } else if (len < PAGE_CACHE_SIZE) { rc = ecryptfs_decrypt_page(page); if (rc) { printk(KERN_ERR "%s: Error decrypting " @@ -348,8 +349,8 @@ static int ecryptfs_write_begin(struct file *file, ClearPageUptodate(page); goto out; } + SetPageUptodate(page); } - SetPageUptodate(page); } } /* If creating a page or more of holes, zero them out via truncate. 
@@ -499,6 +500,13 @@ static int ecryptfs_write_end(struct file *file, } goto out; } + if (!PageUptodate(page)) { + if (copied < PAGE_CACHE_SIZE) { + rc = 0; + goto out; + } + SetPageUptodate(page); + } /* Fills in zeros if 'to' goes beyond inode size */ rc = fill_zeros_to_end_of_page(page, to); if (rc) { -- 1.7.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC] VFS: File System Mount Wide O_DIRECT Support
For a file system created on a file-backed loop device, there will be two levels of page cache present, which typically doubles the memory consumption. In many cases, it is beneficial to turn on the O_DIRECT option while performing the upper file system file IO, to bypass the upper page cache, which not only reduces half of the memory consumption, but also improves the performance due to a shorter copy path. For example, the following iozone REREAD test with O_DIRECT turned on over the one without enjoys 10x speedup due to redundant cache elimination, consequently, avoiding page cache thrashing on a 2GB memory machine running a 3.2.9 kernel. losetup /dev/loop0 dummy // dummy is an ext4 file with a size of 1.1GB mkfs -t ext2 /dev/loop0 mount /dev/loop0 /dsk cd /dsk iozone -t 1 -s 1G -r 4M -i 0 -+n -w // produce a 1GB test file iozone -t 1 -s 1G -r 4M -i 1 -w // REREAD test without O_DIRECT echo 1 > /proc/sys/vm/drop_caches // cleanup the page cache iozone -t 1 -s 1G -r 4M -i 1 -w -I // REREAD test with O_DIRECT This feature is also expected to be useful for virtualization situations: the file systems inside the guest operating system will use much less guest memory, which potentially results in less host memory use. Especially, it may be more useful if multiple guests are running based on the same disk image file. 
The idea is simple, leave the desicion for the file system user to enable file system mount wide O_DIRECT support with a new mount option, for example, losetup /dev/loop0 dummy mount /dev/loop0 -o MS_DIRECT /dsk Below is the preliminary patch, --- fs/open.c |5 + fs/super.c |2 ++ include/linux/fs.h |1 + 3 files changed, 8 insertions(+), 0 deletions(-) diff --git a/fs/open.c b/fs/open.c index e1f2cdb..dacac30 100644 --- a/fs/open.c +++ b/fs/open.c @@ -958,6 +958,11 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) } else { fsnotify_open(f); fd_install(fd, f); + if (f->f_vfsmnt->mnt_sb && f->f_vfsmnt->mnt_sb->s_flags & MS_DIRECT) { + if (S_ISREG(f->f_dentry->d_inode->i_mode)) { + if (!f->f_mapping->a_ops || ((!f->f_mapping->a_ops->direct_IO) && (!f->f_mapping->a_ops->get_xip_mem))) + f->f_flags |= O_DIRECT; + } } } putname(tmp); diff --git a/fs/super.c b/fs/super.c index 0902cfa..ab5c4a5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1147,6 +1147,8 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) WARN_ON(!sb->s_bdi); WARN_ON(sb->s_bdi == &default_backing_dev_info); sb->s_flags |= MS_BORN; + if (flags & MS_DIRECT) + sb->s_flags |= MS_DIRECT; error = security_sb_kern_mount(sb, flags, secdata); if (error) diff --git a/include/linux/fs.h b/include/linux/fs.h index aa11047..127cc85 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -225,6 +225,7 @@ struct inodes_stat_t { #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_DIRECT (1<<27) #define MS_NOSEC (1<<28) #define MS_BORN(1<<29) #define MS_ACTIVE (1<<30) -- 1.7.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at 
http://www.tux.org/lkml/