[Patch] tmpfs fixes against 2.4.6-pre(2)
Hi Linus, This is the second part of my patches. Writing out of a mapping of a tmpfs file into the same file can deadlock. This has been running in the -ac series for some while. Please apply Christoph diff -uNr 6-pre8-fix1/include/linux/shmem_fs.h 6-pre8-fix2/include/linux/shmem_fs.h --- 6-pre8-fix1/include/linux/shmem_fs.h Sun Apr 29 20:33:00 2001 +++ 6-pre8-fix2/include/linux/shmem_fs.h Tue Jul 3 09:28:13 2001 @@ -19,6 +19,7 @@ struct shmem_inode_info { spinlock_t lock; + struct semaphore sem; unsigned long max_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ swp_entry_t **i_indirect; /* doubly indirect blocks */ diff -uNr 6-pre8-fix1/mm/shmem.c 6-pre8-fix2/mm/shmem.c --- 6-pre8-fix1/mm/shmem.c Tue Jul 3 08:55:20 2001 +++ 6-pre8-fix2/mm/shmem.c Tue Jul 3 10:09:26 2001 @@ -162,6 +162,7 @@ swp_entry_t **base, **ptr, **last; struct shmem_inode_info * info = &inode->u.shmem_i; + down(&info->sem); inode->i_ctime = inode->i_mtime = CURRENT_TIME; spin_lock (&info->lock); index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -205,6 +206,7 @@ info->swapped -= freed; shmem_recalc_inode(inode); spin_unlock (&info->lock); + up(&info->sem); } static void shmem_delete_inode(struct inode * inode) @@ -289,15 +291,12 @@ * still need to guard against racing with shm_writepage(), which might * be trying to move the page to the swap cache as we run. 
*/ -static struct page * shmem_getpage_locked(struct inode * inode, unsigned long idx) +static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode +* inode, unsigned long idx) { struct address_space * mapping = inode->i_mapping; - struct shmem_inode_info *info; struct page * page; swp_entry_t *entry; - info = &inode->u.shmem_i; - repeat: page = find_lock_page(mapping, idx); if (page) @@ -402,6 +401,7 @@ static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr) { + struct shmem_inode_info *info; struct address_space * mapping = inode->i_mapping; int error; @@ -416,27 +416,28 @@ page_cache_release(*ptr); } - down (&inode->i_sem); - /* retest we may have slept */ + info = &inode->u.shmem_i; + down (&info->sem); + /* retest we may have slept */ + + *ptr = ERR_PTR(-EFAULT); if (inode->i_size < (loff_t) idx * PAGE_CACHE_SIZE) - goto sigbus; - *ptr = shmem_getpage_locked(inode, idx); + goto failed; + + *ptr = shmem_getpage_locked(&inode->u.shmem_i, inode, idx); if (IS_ERR (*ptr)) goto failed; + UnlockPage(*ptr); - up (&inode->i_sem); + up (&info->sem); return 0; failed: - up (&inode->i_sem); + up (&info->sem); error = PTR_ERR(*ptr); - *ptr = NOPAGE_OOM; - if (error != -EFBIG) - *ptr = NOPAGE_SIGBUS; - return error; -sigbus: - up (&inode->i_sem); *ptr = NOPAGE_SIGBUS; - return -EFAULT; + if (error == -ENOMEM) + *ptr = NOPAGE_OOM; + return error; } struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int no_share) @@ -509,6 +510,7 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev) { struct inode * inode; + struct shmem_inode_info *info; spin_lock (&sb->u.shmem_sb.stat_lock); if (!sb->u.shmem_sb.free_inodes) { @@ -528,7 +530,9 @@ inode->i_rdev = NODEV; inode->i_mapping->a_ops = &shmem_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - spin_lock_init (&inode->u.shmem_i.lock); + info = &inode->u.shmem_i; + spin_lock_init (&info->lock); + sema_init (&info->sem, 1); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -558,6 +562,7 
@@ shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) { struct inode*inode = file->f_dentry->d_inode; + struct shmem_inode_info *info; unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; loff_t pos; struct page *page; @@ -633,7 +638,11 @@ __get_user(dummy, buf+bytes-1); } - page = shmem_getpage_locked(inode, index); + info = &inode->u.shmem_i; + down (&info->sem); + page = shmem_getpage_locked(info, inode, index); + up (&info->sem); + status = PTR_ERR(page); if (IS_ERR(page)) break; @@ -644,7 +653,6 @@ } kaddr = kmap(page); -// can this do a truncated write? cr status =
[Patch] tmpfs fixes against 2.4.6-pre
Hi Linus, I split up my previous patch into two. Hopefully this is more acceptable for you or will trigger some comments. This is the first part: 1) shmem_remount_fs garbles parameters which are not supplied 2) shmem_truncate should check the maximum size else we get ugly oopses 3) shmem_file_setup should give an error if the size is too big. So the application will fail early. I also cleaned up the error handling a bit. 4) We should recalculate the inode on page allocation. Else we get really weird sizes on sparse files. Please apply Christoph diff -uNr 6-pre8/mm/shmem.c 6-pre8-fix1/mm/shmem.c --- 6-pre8/mm/shmem.c Tue Jun 12 09:49:28 2001 +++ 6-pre8-fix1/mm/shmem.c Tue Jul 3 08:55:20 2001 @@ -3,7 +3,8 @@ * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. - * 2000 Christoph Rohland + * 2000-2001 Christoph Rohland + * 2000-2001 SAP AG * * This file is released under the GPL. */ @@ -33,7 +34,7 @@ #define TMPFS_MAGIC 0x01021994 #define ENTRIES_PER_PAGE (PAGE_SIZE/sizeof(unsigned long)) -#define NR_SINGLE (ENTRIES_PER_PAGE + SHMEM_NR_DIRECT) +#define SHMEM_MAX_BLOCKS (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) static struct super_operations shmem_ops; static struct address_space_operations shmem_aops; @@ -193,7 +194,14 @@ } out: - info->max_index = index; + /* +* We have no chance to give an error, so we limit it to max +* size here and the application will fail later +*/ + if (index > SHMEM_MAX_BLOCKS) + info->max_index = SHMEM_MAX_BLOCKS; + else + info->max_index = index; info->swapped -= freed; shmem_recalc_inode(inode); spin_unlock (&info->lock); @@ -314,6 +322,7 @@ return page; } + shmem_recalc_inode(inode); if (entry->val) { unsigned long flags; @@ -1027,6 +1036,8 @@ unsigned long max_inodes, inodes; struct shmem_sb_info *info = &sb->u.shmem_sb; + max_blocks = info->max_blocks; + max_inodes = info->max_inodes; if (shmem_parse_options (data, NULL, &max_blocks, &max_inodes)) return -EINVAL; @@ -1074,7 +1085,7 @@ sb->u.shmem_sb.free_blocks = blocks; 
sb->u.shmem_sb.max_inodes = inodes; sb->u.shmem_sb.free_inodes = inodes; - sb->s_maxbytes = (unsigned long long)(SHMEM_NR_DIRECT + (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)) << PAGE_CACHE_SHIFT; + sb->s_maxbytes = (unsigned long long)SHMEM_MAX_BLOCKS << PAGE_CACHE_SHIFT; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = TMPFS_MAGIC; @@ -1282,9 +1293,11 @@ struct qstr this; int vm_enough_memory(long pages); - error = -ENOMEM; + if (size > (unsigned long long) SHMEM_MAX_BLOCKS << PAGE_CACHE_SHIFT) + return ERR_PTR(-EINVAL); + if (!vm_enough_memory((size) >> PAGE_SHIFT)) - goto out; + return ERR_PTR(-ENOMEM); this.name = name; this.len = strlen(name); @@ -1292,7 +1305,7 @@ root = tmpfs_fs_type.kern_mnt->mnt_root; dentry = d_alloc(root, &this); if (!dentry) - goto out; + return ERR_PTR(-ENOMEM); error = -ENFILE; file = get_empty_filp(); @@ -1318,7 +1331,6 @@ put_filp(file); put_dentry: dput (dentry); -out: return ERR_PTR(error); } /* - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] tmpfs fixes against 2.4.6-pre
Hi Linus, I split up my previous patch into two. Hopefully this is more acceptable for you or will trigger some comments. This is the first part: 1) shmem_remount_fs garbles parameters which are not supplied 2) shmem_truncate should check the maximum size else we get ugly oopses 3) shmem_file_setup should give an error if the size is too big. So the application will fail early. I also cleaned up the error handling a bit. 4) We should recalculate the inode on page allocation. Else we get really weird sizes on sparse files. Please apply Christoph diff -uNr 6-pre8/mm/shmem.c 6-pre8-fix1/mm/shmem.c --- 6-pre8/mm/shmem.c Tue Jun 12 09:49:28 2001 +++ 6-pre8-fix1/mm/shmem.c Tue Jul 3 08:55:20 2001 @@ -3,7 +3,8 @@ * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. - * 2000 Christoph Rohland + * 2000-2001 Christoph Rohland + * 2000-2001 SAP AG * * This file is released under the GPL. */ @@ -33,7 +34,7 @@ #define TMPFS_MAGIC0x01021994 #define ENTRIES_PER_PAGE (PAGE_SIZE/sizeof(unsigned long)) -#define NR_SINGLE (ENTRIES_PER_PAGE + SHMEM_NR_DIRECT) +#define SHMEM_MAX_BLOCKS (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) static struct super_operations shmem_ops; static struct address_space_operations shmem_aops; @@ -193,7 +194,14 @@ } out: - info-max_index = index; + /* +* We have no chance to give an error, so we limit it to max +* size here and the application will fail later +*/ + if (index SHMEM_MAX_BLOCKS) + info-max_index = SHMEM_MAX_BLOCKS; + else + info-max_index = index; info-swapped -= freed; shmem_recalc_inode(inode); spin_unlock (info-lock); @@ -314,6 +322,7 @@ return page; } + shmem_recalc_inode(inode); if (entry-val) { unsigned long flags; @@ -1027,6 +1036,8 @@ unsigned long max_inodes, inodes; struct shmem_sb_info *info = sb-u.shmem_sb; + max_blocks = info-max_blocks; + max_inodes = info-max_inodes; if (shmem_parse_options (data, NULL, max_blocks, max_inodes)) return -EINVAL; @@ -1074,7 +1085,7 @@ sb-u.shmem_sb.free_blocks = blocks; 
sb-u.shmem_sb.max_inodes = inodes; sb-u.shmem_sb.free_inodes = inodes; - sb-s_maxbytes = (unsigned long long)(SHMEM_NR_DIRECT + (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)) PAGE_CACHE_SHIFT; + sb-s_maxbytes = (unsigned long long)SHMEM_MAX_BLOCKS PAGE_CACHE_SHIFT; sb-s_blocksize = PAGE_CACHE_SIZE; sb-s_blocksize_bits = PAGE_CACHE_SHIFT; sb-s_magic = TMPFS_MAGIC; @@ -1282,9 +1293,11 @@ struct qstr this; int vm_enough_memory(long pages); - error = -ENOMEM; + if (size (unsigned long long) SHMEM_MAX_BLOCKS PAGE_CACHE_SHIFT) + return ERR_PTR(-EINVAL); + if (!vm_enough_memory((size) PAGE_SHIFT)) - goto out; + return ERR_PTR(-ENOMEM); this.name = name; this.len = strlen(name); @@ -1292,7 +1305,7 @@ root = tmpfs_fs_type.kern_mnt-mnt_root; dentry = d_alloc(root, this); if (!dentry) - goto out; + return ERR_PTR(-ENOMEM); error = -ENFILE; file = get_empty_filp(); @@ -1318,7 +1331,6 @@ put_filp(file); put_dentry: dput (dentry); -out: return ERR_PTR(error); } /* - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] tmpfs fixes against 2.4.6-pre(2)
Hi Linus, This is the second part of my patches. Writing out of a mapping of a tmpfs file into the same file can deadlock. This is running in the -ac series since some while. Please apply Christoph diff -uNr 6-pre8-fix1/include/linux/shmem_fs.h 6-pre8-fix2/include/linux/shmem_fs.h --- 6-pre8-fix1/include/linux/shmem_fs.hSun Apr 29 20:33:00 2001 +++ 6-pre8-fix2/include/linux/shmem_fs.hTue Jul 3 09:28:13 2001 @@ -19,6 +19,7 @@ struct shmem_inode_info { spinlock_t lock; + struct semaphore sem; unsigned long max_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ swp_entry_t **i_indirect; /* doubly indirect blocks */ diff -uNr 6-pre8-fix1/mm/shmem.c 6-pre8-fix2/mm/shmem.c --- 6-pre8-fix1/mm/shmem.c Tue Jul 3 08:55:20 2001 +++ 6-pre8-fix2/mm/shmem.c Tue Jul 3 10:09:26 2001 @@ -162,6 +162,7 @@ swp_entry_t **base, **ptr, **last; struct shmem_inode_info * info = inode-u.shmem_i; + down(info-sem); inode-i_ctime = inode-i_mtime = CURRENT_TIME; spin_lock (info-lock); index = (inode-i_size + PAGE_CACHE_SIZE - 1) PAGE_CACHE_SHIFT; @@ -205,6 +206,7 @@ info-swapped -= freed; shmem_recalc_inode(inode); spin_unlock (info-lock); + up(info-sem); } static void shmem_delete_inode(struct inode * inode) @@ -289,15 +291,12 @@ * still need to guard against racing with shm_writepage(), which might * be trying to move the page to the swap cache as we run. 
*/ -static struct page * shmem_getpage_locked(struct inode * inode, unsigned long idx) +static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode +* inode, unsigned long idx) { struct address_space * mapping = inode-i_mapping; - struct shmem_inode_info *info; struct page * page; swp_entry_t *entry; - info = inode-u.shmem_i; - repeat: page = find_lock_page(mapping, idx); if (page) @@ -402,6 +401,7 @@ static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr) { + struct shmem_inode_info *info; struct address_space * mapping = inode-i_mapping; int error; @@ -416,27 +416,28 @@ page_cache_release(*ptr); } - down (inode-i_sem); - /* retest we may have slept */ + info = inode-u.shmem_i; + down (info-sem); + /* retest we may have slept */ + + *ptr = ERR_PTR(-EFAULT); if (inode-i_size (loff_t) idx * PAGE_CACHE_SIZE) - goto sigbus; - *ptr = shmem_getpage_locked(inode, idx); + goto failed; + + *ptr = shmem_getpage_locked(inode-u.shmem_i, inode, idx); if (IS_ERR (*ptr)) goto failed; + UnlockPage(*ptr); - up (inode-i_sem); + up (info-sem); return 0; failed: - up (inode-i_sem); + up (info-sem); error = PTR_ERR(*ptr); - *ptr = NOPAGE_OOM; - if (error != -EFBIG) - *ptr = NOPAGE_SIGBUS; - return error; -sigbus: - up (inode-i_sem); *ptr = NOPAGE_SIGBUS; - return -EFAULT; + if (error == -ENOMEM) + *ptr = NOPAGE_OOM; + return error; } struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int no_share) @@ -509,6 +510,7 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev) { struct inode * inode; + struct shmem_inode_info *info; spin_lock (sb-u.shmem_sb.stat_lock); if (!sb-u.shmem_sb.free_inodes) { @@ -528,7 +530,9 @@ inode-i_rdev = NODEV; inode-i_mapping-a_ops = shmem_aops; inode-i_atime = inode-i_mtime = inode-i_ctime = CURRENT_TIME; - spin_lock_init (inode-u.shmem_i.lock); + info = inode-u.shmem_i; + spin_lock_init (info-lock); + sema_init (info-sem, 1); switch (mode S_IFMT) { 
default: init_special_inode(inode, mode, dev); @@ -558,6 +562,7 @@ shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) { struct inode*inode = file-f_dentry-d_inode; + struct shmem_inode_info *info; unsigned long limit = current-rlim[RLIMIT_FSIZE].rlim_cur; loff_t pos; struct page *page; @@ -633,7 +638,11 @@ __get_user(dummy, buf+bytes-1); } - page = shmem_getpage_locked(inode, index); + info = inode-u.shmem_i; + down (info-sem); + page = shmem_getpage_locked(info, inode, index); + up (info-sem); + status = PTR_ERR(page); if (IS_ERR(page)) break; @@ -644,7 +653,6 @@ } kaddr = kmap(page);
[Patch] tmpfs/ramfs accounting
Hi Alan, here is the patch you backed out for -ac22. I slightly changed the approach: I do not rely on removepage to calculate the fs size any more since the special-casing was ugly and PG_marker was dropped. But I use removepage for the shmem_nrpages calculation. Please apply Christoph diff -uNr 5-ac22/fs/ramfs/inode.c 5-ac22-fix/fs/ramfs/inode.c --- 5-ac22/fs/ramfs/inode.c Mon Jul 2 09:13:18 2001 +++ 5-ac22-fix/fs/ramfs/inode.c Mon Jul 2 09:55:52 2001 @@ -289,7 +289,7 @@ return 0; } -static void ramfs_truncatepage(struct page *page) +static void ramfs_removepage(struct page *page) { struct inode *inode = (struct inode *)page->mapping->host; @@ -659,7 +659,7 @@ writepage: ramfs_writepage, prepare_write: ramfs_prepare_write, commit_write: ramfs_commit_write, - truncatepage: ramfs_truncatepage, + removepage: ramfs_removepage, }; static struct file_operations ramfs_file_operations = { diff -uNr 5-ac22/include/linux/fs.h 5-ac22-fix/include/linux/fs.h --- 5-ac22/include/linux/fs.h Mon Jul 2 09:35:39 2001 +++ 5-ac22-fix/include/linux/fs.h Mon Jul 2 10:32:04 2001 @@ -375,7 +375,7 @@ int (*sync_page)(struct page *); int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); int (*commit_write)(struct file *, struct page *, unsigned, unsigned); - void (*truncatepage)(struct page *); /* called from truncate_complete_page */ + void (*removepage)(struct page *); /* called when page gets removed from the +inode */ /* Unfortunately this kludge is needed for FIBMAP. 
Don't use it */ int (*bmap)(struct address_space *, long); }; diff -uNr 5-ac22/mm/filemap.c 5-ac22-fix/mm/filemap.c --- 5-ac22/mm/filemap.c Mon Jul 2 09:13:29 2001 +++ 5-ac22-fix/mm/filemap.c Mon Jul 2 10:22:52 2001 @@ -87,6 +87,9 @@ { struct address_space * mapping = page->mapping; + if (mapping->a_ops->removepage) + mapping->a_ops->removepage(page); + mapping->nrpages--; list_del(&page->list); page->mapping = NULL; @@ -211,9 +214,6 @@ if (!page->buffers || block_flushpage(page, 0)) lru_cache_del(page); - if (page->mapping->a_ops->truncatepage) - page->mapping->a_ops->truncatepage(page); - /* * We remove the page from the page cache _after_ we have * destroyed all buffer-cache references to it. Otherwise some diff -uNr 5-ac22/mm/shmem.c 5-ac22-fix/mm/shmem.c --- 5-ac22/mm/shmem.c Mon Jul 2 09:13:29 2001 +++ 5-ac22-fix/mm/shmem.c Mon Jul 2 10:54:55 2001 @@ -34,6 +34,7 @@ #define TMPFS_MAGIC 0x01021994 #define ENTRIES_PER_PAGE (PAGE_SIZE/sizeof(unsigned long)) +#define SHMEM_MAX_BLOCKS (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) #define SHMEM_SB(sb) (&sb->u.shmem_sb) @@ -51,6 +52,11 @@ #define BLOCKS_PER_PAGE (PAGE_SIZE/512) +static void shmem_removepage(struct page *page) +{ + atomic_dec(&shmem_nrpages); +} + /* * shmem_recalc_inode - recalculate the size of an inode * @@ -69,11 +75,9 @@ * (inode->i_mapping->nrpages + info->swapped) * * It has to be called with the spinlock held. - * - * The swap parameter is a performance hack for truncate. 
*/ -static void shmem_recalc_inode(struct inode * inode, unsigned long swap) +static void shmem_recalc_inode(struct inode * inode) { unsigned long freed; @@ -85,7 +89,6 @@ spin_lock (&sbinfo->stat_lock); sbinfo->free_blocks += freed; spin_unlock (&sbinfo->stat_lock); - atomic_sub(freed-swap, &shmem_nrpages); } } @@ -202,7 +205,7 @@ out: info->max_index = index; info->swapped -= freed; - shmem_recalc_inode(inode, freed); + shmem_recalc_inode(inode); spin_unlock (&info->lock); up(&info->sem); } @@ -257,7 +260,7 @@ entry = shmem_swp_entry(info, page->index); if (IS_ERR(entry)) /* this had been allocted on page allocation */ BUG(); - shmem_recalc_inode(page->mapping->host, 0); + shmem_recalc_inode(page->mapping->host); error = -EAGAIN; if (entry->val) BUG(); @@ -265,7 +268,6 @@ *entry = swap; error = 0; /* Remove the page from the page cache */ - atomic_dec(&shmem_nrpages); lru_cache_del(page); remove_inode_page(page); @@ -1086,6 +1088,8 @@ unsigned long max_inodes, inodes; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + max_blocks = sbinfo->max_blocks; + max_inodes = sbinfo->max_inodes; if (shmem_parse_options (data, NULL, &max_blocks, &max_inodes)) return -EINVAL; @@ -1134,7 +1138,7 @@ sbinfo->free_blocks = blocks; sbinfo->max_inodes = inodes; sbinfo->free_inodes = inodes; - sb->s_maxbytes = (unsigned long long)(SHMEM_NR_DIRECT +
[Patch] tmpfs/ramfs accounting
Hi Alan, here is the patch you backed out for -ac22. I slightly changed the approach: I do not rely on removepage to calculate the fs size any more since the special-casing was ugly and PG_marker was dropped. But I use removepage for the shmem_nrpages calculation. Please apply Christoph diff -uNr 5-ac22/fs/ramfs/inode.c 5-ac22-fix/fs/ramfs/inode.c --- 5-ac22/fs/ramfs/inode.c Mon Jul 2 09:13:18 2001 +++ 5-ac22-fix/fs/ramfs/inode.c Mon Jul 2 09:55:52 2001 @@ -289,7 +289,7 @@ return 0; } -static void ramfs_truncatepage(struct page *page) +static void ramfs_removepage(struct page *page) { struct inode *inode = (struct inode *)page-mapping-host; @@ -659,7 +659,7 @@ writepage: ramfs_writepage, prepare_write: ramfs_prepare_write, commit_write: ramfs_commit_write, - truncatepage: ramfs_truncatepage, + removepage: ramfs_removepage, }; static struct file_operations ramfs_file_operations = { diff -uNr 5-ac22/include/linux/fs.h 5-ac22-fix/include/linux/fs.h --- 5-ac22/include/linux/fs.h Mon Jul 2 09:35:39 2001 +++ 5-ac22-fix/include/linux/fs.h Mon Jul 2 10:32:04 2001 @@ -375,7 +375,7 @@ int (*sync_page)(struct page *); int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); int (*commit_write)(struct file *, struct page *, unsigned, unsigned); - void (*truncatepage)(struct page *); /* called from truncate_complete_page */ + void (*removepage)(struct page *); /* called when page gets removed from the +inode */ /* Unfortunately this kludge is needed for FIBMAP. 
Don't use it */ int (*bmap)(struct address_space *, long); }; diff -uNr 5-ac22/mm/filemap.c 5-ac22-fix/mm/filemap.c --- 5-ac22/mm/filemap.c Mon Jul 2 09:13:29 2001 +++ 5-ac22-fix/mm/filemap.c Mon Jul 2 10:22:52 2001 @@ -87,6 +87,9 @@ { struct address_space * mapping = page-mapping; + if (mapping-a_ops-removepage) + mapping-a_ops-removepage(page); + mapping-nrpages--; list_del(page-list); page-mapping = NULL; @@ -211,9 +214,6 @@ if (!page-buffers || block_flushpage(page, 0)) lru_cache_del(page); - if (page-mapping-a_ops-truncatepage) - page-mapping-a_ops-truncatepage(page); - /* * We remove the page from the page cache _after_ we have * destroyed all buffer-cache references to it. Otherwise some diff -uNr 5-ac22/mm/shmem.c 5-ac22-fix/mm/shmem.c --- 5-ac22/mm/shmem.c Mon Jul 2 09:13:29 2001 +++ 5-ac22-fix/mm/shmem.c Mon Jul 2 10:54:55 2001 @@ -34,6 +34,7 @@ #define TMPFS_MAGIC0x01021994 #define ENTRIES_PER_PAGE (PAGE_SIZE/sizeof(unsigned long)) +#define SHMEM_MAX_BLOCKS (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) #define SHMEM_SB(sb) (sb-u.shmem_sb) @@ -51,6 +52,11 @@ #define BLOCKS_PER_PAGE (PAGE_SIZE/512) +static void shmem_removepage(struct page *page) +{ + atomic_dec(shmem_nrpages); +} + /* * shmem_recalc_inode - recalculate the size of an inode * @@ -69,11 +75,9 @@ * (inode-i_mapping-nrpages + info-swapped) * * It has to be called with the spinlock held. - * - * The swap parameter is a performance hack for truncate. 
*/ -static void shmem_recalc_inode(struct inode * inode, unsigned long swap) +static void shmem_recalc_inode(struct inode * inode) { unsigned long freed; @@ -85,7 +89,6 @@ spin_lock (sbinfo-stat_lock); sbinfo-free_blocks += freed; spin_unlock (sbinfo-stat_lock); - atomic_sub(freed-swap, shmem_nrpages); } } @@ -202,7 +205,7 @@ out: info-max_index = index; info-swapped -= freed; - shmem_recalc_inode(inode, freed); + shmem_recalc_inode(inode); spin_unlock (info-lock); up(info-sem); } @@ -257,7 +260,7 @@ entry = shmem_swp_entry(info, page-index); if (IS_ERR(entry)) /* this had been allocted on page allocation */ BUG(); - shmem_recalc_inode(page-mapping-host, 0); + shmem_recalc_inode(page-mapping-host); error = -EAGAIN; if (entry-val) BUG(); @@ -265,7 +268,6 @@ *entry = swap; error = 0; /* Remove the page from the page cache */ - atomic_dec(shmem_nrpages); lru_cache_del(page); remove_inode_page(page); @@ -1086,6 +1088,8 @@ unsigned long max_inodes, inodes; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + max_blocks = sbinfo-max_blocks; + max_inodes = sbinfo-max_inodes; if (shmem_parse_options (data, NULL, max_blocks, max_inodes)) return -EINVAL; @@ -1134,7 +1138,7 @@ sbinfo-free_blocks = blocks; sbinfo-max_inodes = inodes; sbinfo-free_inodes = inodes; - sb-s_maxbytes = (unsigned long long)(SHMEM_NR_DIRECT +
Re: Shared memory quantity not being reflected by /proc/meminfo
Hi Allan, On Sun, 24 Jun 2001, Allan Duncan wrote: > OK, it's fine by me if the "shared" under 2.2.x is not the same, > however in that case the field should not appear at all in meminfo, > rather than the current zero value, which leads lesser kernel > hackers like me up the garden path. This would probably break a lot of user space apps. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Shared memory quantity not being reflected by /proc/meminfo
Hi Allan, On Sun, 24 Jun 2001, Allan Duncan wrote: > OK, it's fine by me if the "shared" under 2.2.x is not the same, > however in that case the field should not appear at all in meminfo, > rather than the current zero value, which leads lesser kernel > hackers like me up the garden path. This would probably break a lot of user space apps. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Shared memory quantity not being reflected by /proc/meminfo
Hi Albert, On Sat, 23 Jun 2001, Albert D. Cahalan wrote: > You misunderstood what 2.2.xx kernels were reporting. > The "shared" memory in /proc/meminfo refers to something > completely unrelated to SysV shared memory. This is no > longer calculated because the computation was too costly. But the load of misinterpretations and the missing value led me to export the number of shmem pages in later -ac kernels exactly in this field. I know it is a change of semantics and because of this both Alan and I asked for comments on whether this change is appreciated. I am still waiting for responses though. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Shared memory quantity not being reflected by /proc/meminfo
Hi Albert, On Sat, 23 Jun 2001, Albert D. Cahalan wrote: > You misunderstood what 2.2.xx kernels were reporting. > The "shared" memory in /proc/meminfo refers to something > completely unrelated to SysV shared memory. This is no > longer calculated because the computation was too costly. But the load of misinterpretations and the missing value led me to export the number of shmem pages in later -ac kernels exactly in this field. I know it is a change of semantics and because of this both Alan and I asked for comments on whether this change is appreciated. I am still waiting for responses though. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] tmpfs fixes against 2.4.6-pre
Hi Linus, the appended patch fixes several tmpfs problems: 1) writing out of a mapping of a tmpfs file into the same file can deadlock 2) shmem_remount_fs garbles parameters which are not supplied 3) shmem_file_setup should check the maximum size Please apply Christoph diff -uNr 6-pre5/include/linux/shmem_fs.h 6-pre5-fix/include/linux/shmem_fs.h --- 6-pre5/include/linux/shmem_fs.h Sun Apr 29 20:33:00 2001 +++ 6-pre5-fix/include/linux/shmem_fs.h Thu Jun 21 15:52:25 2001 @@ -19,6 +19,7 @@ struct shmem_inode_info { spinlock_t lock; + struct semaphore sem; unsigned long max_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ swp_entry_t **i_indirect; /* doubly indirect blocks */ diff -uNr 6-pre5/mm/shmem.c 6-pre5-fix/mm/shmem.c --- 6-pre5/mm/shmem.c Tue Jun 12 09:49:28 2001 +++ 6-pre5-fix/mm/shmem.c Thu Jun 21 15:52:26 2001 @@ -3,7 +3,8 @@ * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. - * 2000 Christoph Rohland + * 2000-2001 Christoph Rohland + * 2000-2001 SAP AG * * This file is released under the GPL. 
*/ @@ -33,7 +34,7 @@ #define TMPFS_MAGIC 0x01021994 #define ENTRIES_PER_PAGE (PAGE_SIZE/sizeof(unsigned long)) -#define NR_SINGLE (ENTRIES_PER_PAGE + SHMEM_NR_DIRECT) +#define SHMEM_MAX_BLOCKS (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) static struct super_operations shmem_ops; static struct address_space_operations shmem_aops; @@ -161,6 +162,7 @@ swp_entry_t **base, **ptr, **last; struct shmem_inode_info * info = &inode->u.shmem_i; + down(&info->sem); inode->i_ctime = inode->i_mtime = CURRENT_TIME; spin_lock (&info->lock); index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -193,10 +195,14 @@ } out: - info->max_index = index; + if (index <= SHMEM_MAX_BLOCKS) + info->max_index = index; + else + info->max_index = SHMEM_MAX_BLOCKS + 1; info->swapped -= freed; shmem_recalc_inode(inode); spin_unlock (&info->lock); + up(&info->sem); } static void shmem_delete_inode(struct inode * inode) @@ -281,15 +287,12 @@ * still need to guard against racing with shm_writepage(), which might * be trying to move the page to the swap cache as we run. 
*/ -static struct page * shmem_getpage_locked(struct inode * inode, unsigned long idx) +static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode +* inode, unsigned long idx) { struct address_space * mapping = inode->i_mapping; - struct shmem_inode_info *info; struct page * page; swp_entry_t *entry; - info = &inode->u.shmem_i; - repeat: page = find_lock_page(mapping, idx); if (page) @@ -393,6 +396,7 @@ static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr) { + struct shmem_inode_info *info; struct address_space * mapping = inode->i_mapping; int error; @@ -407,27 +411,28 @@ page_cache_release(*ptr); } - down (&inode->i_sem); - /* retest we may have slept */ + info = &inode->u.shmem_i; + down (&info->sem); + /* retest we may have slept */ + + *ptr = ERR_PTR(-EFAULT); if (inode->i_size < (loff_t) idx * PAGE_CACHE_SIZE) - goto sigbus; - *ptr = shmem_getpage_locked(inode, idx); + goto failed; + + *ptr = shmem_getpage_locked(&inode->u.shmem_i, inode, idx); if (IS_ERR (*ptr)) goto failed; + UnlockPage(*ptr); - up (&inode->i_sem); + up (&info->sem); return 0; failed: - up (&inode->i_sem); + up (&info->sem); error = PTR_ERR(*ptr); - *ptr = NOPAGE_OOM; - if (error != -EFBIG) - *ptr = NOPAGE_SIGBUS; - return error; -sigbus: - up (&inode->i_sem); *ptr = NOPAGE_SIGBUS; - return -EFAULT; + if (error == -ENOMEM) + *ptr = NOPAGE_OOM; + return error; } struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int no_share) @@ -500,6 +505,7 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev) { struct inode * inode; + struct shmem_inode_info *info; spin_lock (&sb->u.shmem_sb.stat_lock); if (!sb->u.shmem_sb.free_inodes) { @@ -519,7 +525,9 @@ inode->i_rdev = NODEV; inode->i_mapping->a_ops = &shmem_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - spin_lock_init (&inode->u.shmem_i.lock); + info = &inode->u.shmem_i; + spin_lock_init (&info->lock); + sema_init (&info->sem, 1); switch (mode & S_IFMT) {
Re: Linux 2.4.5-ac16
Hi Alan, On Tue, 19 Jun 2001, Alan Cox wrote: > 2.4.5-ac16 > o Drop the shmem/removepage changes to see if they(me) > are causing the instabilities in ac15 Any conclusions on that? Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] tmpfs fixes against 2.4.6-pre
Hi Linus, the appended patch fixes several tmpfs problems: 1) writing out of a mapping of a tmpfs file into the same file can deadlock 2) shmem_remount_fs garbles parameters which are not supplied 3) shmem_file_setup should check the maximum size Please apply Christoph diff -uNr 6-pre5/include/linux/shmem_fs.h 6-pre5-fix/include/linux/shmem_fs.h --- 6-pre5/include/linux/shmem_fs.h Sun Apr 29 20:33:00 2001 +++ 6-pre5-fix/include/linux/shmem_fs.h Thu Jun 21 15:52:25 2001 @@ -19,6 +19,7 @@ struct shmem_inode_info { spinlock_t lock; + struct semaphore sem; unsigned long max_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ swp_entry_t **i_indirect; /* doubly indirect blocks */ diff -uNr 6-pre5/mm/shmem.c 6-pre5-fix/mm/shmem.c --- 6-pre5/mm/shmem.c Tue Jun 12 09:49:28 2001 +++ 6-pre5-fix/mm/shmem.c Thu Jun 21 15:52:26 2001 @@ -3,7 +3,8 @@ * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. - * 2000 Christoph Rohland + * 2000-2001 Christoph Rohland + * 2000-2001 SAP AG * * This file is released under the GPL. 
*/ @@ -33,7 +34,7 @@ #define TMPFS_MAGIC0x01021994 #define ENTRIES_PER_PAGE (PAGE_SIZE/sizeof(unsigned long)) -#define NR_SINGLE (ENTRIES_PER_PAGE + SHMEM_NR_DIRECT) +#define SHMEM_MAX_BLOCKS (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) static struct super_operations shmem_ops; static struct address_space_operations shmem_aops; @@ -161,6 +162,7 @@ swp_entry_t **base, **ptr, **last; struct shmem_inode_info * info = inode-u.shmem_i; + down(info-sem); inode-i_ctime = inode-i_mtime = CURRENT_TIME; spin_lock (info-lock); index = (inode-i_size + PAGE_CACHE_SIZE - 1) PAGE_CACHE_SHIFT; @@ -193,10 +195,14 @@ } out: - info-max_index = index; + if (index = SHMEM_MAX_BLOCKS) + info-max_index = index; + else + info-max_index = SHMEM_MAX_BLOCKS + 1; info-swapped -= freed; shmem_recalc_inode(inode); spin_unlock (info-lock); + up(info-sem); } static void shmem_delete_inode(struct inode * inode) @@ -281,15 +287,12 @@ * still need to guard against racing with shm_writepage(), which might * be trying to move the page to the swap cache as we run. 
*/ -static struct page * shmem_getpage_locked(struct inode * inode, unsigned long idx) +static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode +* inode, unsigned long idx) { struct address_space * mapping = inode-i_mapping; - struct shmem_inode_info *info; struct page * page; swp_entry_t *entry; - info = inode-u.shmem_i; - repeat: page = find_lock_page(mapping, idx); if (page) @@ -393,6 +396,7 @@ static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr) { + struct shmem_inode_info *info; struct address_space * mapping = inode-i_mapping; int error; @@ -407,27 +411,28 @@ page_cache_release(*ptr); } - down (inode-i_sem); - /* retest we may have slept */ + info = inode-u.shmem_i; + down (info-sem); + /* retest we may have slept */ + + *ptr = ERR_PTR(-EFAULT); if (inode-i_size (loff_t) idx * PAGE_CACHE_SIZE) - goto sigbus; - *ptr = shmem_getpage_locked(inode, idx); + goto failed; + + *ptr = shmem_getpage_locked(inode-u.shmem_i, inode, idx); if (IS_ERR (*ptr)) goto failed; + UnlockPage(*ptr); - up (inode-i_sem); + up (info-sem); return 0; failed: - up (inode-i_sem); + up (info-sem); error = PTR_ERR(*ptr); - *ptr = NOPAGE_OOM; - if (error != -EFBIG) - *ptr = NOPAGE_SIGBUS; - return error; -sigbus: - up (inode-i_sem); *ptr = NOPAGE_SIGBUS; - return -EFAULT; + if (error == -ENOMEM) + *ptr = NOPAGE_OOM; + return error; } struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int no_share) @@ -500,6 +505,7 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev) { struct inode * inode; + struct shmem_inode_info *info; spin_lock (sb-u.shmem_sb.stat_lock); if (!sb-u.shmem_sb.free_inodes) { @@ -519,7 +525,9 @@ inode-i_rdev = NODEV; inode-i_mapping-a_ops = shmem_aops; inode-i_atime = inode-i_mtime = inode-i_ctime = CURRENT_TIME; - spin_lock_init (inode-u.shmem_i.lock); + info = inode-u.shmem_i; + spin_lock_init (info-lock); + sema_init (info-sem, 1); switch (mode S_IFMT) { 
default: init_special_inode
Re: Linux 2.4.5-ac14
Hi Dieter, On Fri, 15 Jun 2001, Dieter Nützel wrote: > I see 4.29 GB under shm with your latest try. > something wrong? Yes, this is nasty. The appended patch fixes that. (I am not really happy to need the PG_marker flag for writepage.) The patch also fixes two other problems: - shmem_file_setup has to check the given size. Else we can corrupt kernel memory on 64bit machines. (Thanks to Oliver Paukstadt for detecting this) - shmem_remount_fs does not initialize the parameters and thus corrupts the sizes (detected by Joris van Rantwijk) Alan, please apply. Greetings Christoph diff -uNr 5-ac14/include/linux/mm.h 5-ac14-fix/include/linux/mm.h --- 5-ac14/include/linux/mm.h Fri Jun 15 10:37:21 2001 +++ 5-ac14-fix/include/linux/mm.h Fri Jun 15 11:24:06 2001 @@ -357,6 +357,7 @@ #define PageMarker(page) test_bit(PG_marker, &(page)->flags) #define SetPageMarker(page)set_bit(PG_marker, &(page)->flags) +#define ClearPageMarker(page) clear_bit(PG_marker, &(page)->flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) diff -uNr 5-ac14/mm/shmem.c 5-ac14-fix/mm/shmem.c --- 5-ac14/mm/shmem.c Fri Jun 15 10:09:21 2001 +++ 5-ac14-fix/mm/shmem.c Fri Jun 15 11:37:44 2001 @@ -34,6 +34,7 @@ #define TMPFS_MAGIC0x01021994 #define ENTRIES_PER_PAGE (PAGE_SIZE/sizeof(unsigned long)) +#define SHMEM_MAX_BLOCKS (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) #define SHMEM_SB(sb) (>u.shmem_sb) @@ -56,10 +57,12 @@ struct inode *inode = (struct inode *)page->mapping->host; struct shmem_sb_info * sbinfo = SHMEM_SB(inode->i_sb); - inode->i_blocks -= BLOCKS_PER_PAGE; - spin_lock (>stat_lock); - sbinfo->free_blocks++; - spin_unlock (>stat_lock); + if (!PageMarker(page)) { + inode->i_blocks -= BLOCKS_PER_PAGE; + spin_lock (>stat_lock); + sbinfo->free_blocks++; + spin_unlock (>stat_lock); + } atomic_dec(_nrpages); } @@ -241,9 +244,10 @@ *entry = swap; error = 0; /* Remove the page from the page cache */ - atomic_dec(_nrpages); lru_cache_del(page); + 
SetPageMarker(page); remove_inode_page(page); + ClearPageMarker(page); /* Add it to the swap cache */ add_to_swap_cache(page, swap); @@ -1062,6 +1066,8 @@ unsigned long max_inodes, inodes; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + max_blocks = sbinfo->max_blocks; + max_inodes = sbinfo->max_inodes; if (shmem_parse_options (data, NULL, _blocks, _inodes)) return -EINVAL; @@ -1110,7 +1116,7 @@ sbinfo->free_blocks = blocks; sbinfo->max_inodes = inodes; sbinfo->free_inodes = inodes; - sb->s_maxbytes = (unsigned long long)(SHMEM_NR_DIRECT + (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)) << PAGE_CACHE_SHIFT; + sb->s_maxbytes = (unsigned long long) SHMEM_MAX_BLOCKS << PAGE_CACHE_SHIFT; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = TMPFS_MAGIC; @@ -1311,9 +1317,11 @@ struct qstr this; int vm_enough_memory(long pages); - error = -ENOMEM; + if (size > (unsigned long long) SHMEM_MAX_BLOCKS << PAGE_CACHE_SHIFT) + return ERR_PTR(-EINVAL); + if (!vm_enough_memory((size) >> PAGE_SHIFT)) - goto out; + return ERR_PTR(-ENOMEM); this.name = name; this.len = strlen(name); @@ -1321,7 +1329,7 @@ root = tmpfs_fs_type.kern_mnt->mnt_root; dentry = d_alloc(root, ); if (!dentry) - goto out; + return ERR_PTR(-ENOMEM); error = -ENFILE; file = get_empty_filp(); @@ -1347,7 +1355,6 @@ put_filp(file); put_dentry: dput (dentry); -out: return ERR_PTR(error); } /* - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Linux 2.4.5-ac14
Hi Dieter, On Fri, 15 Jun 2001, Dieter Nützel wrote: I see 4.29 GB under shm with your latest try. something wrong? Yes, this is nasty. The appended patch fixes that. (I am not really happy to need the PG_marker flag for writepage.) The patch also fixes two other problems: - shmem_file_setup has to check the given size. Else we can corrupt kernel memory on 64bit machines. (Thanks to Oliver Paukstadt for detecting this) - shmem_remount_fs does not initialize the parameters and thus corrupts the sizes (detected by Joris van Rantwijk) Alan, please apply. Greetings Christoph diff -uNr 5-ac14/include/linux/mm.h 5-ac14-fix/include/linux/mm.h --- 5-ac14/include/linux/mm.h Fri Jun 15 10:37:21 2001 +++ 5-ac14-fix/include/linux/mm.h Fri Jun 15 11:24:06 2001 @@ -357,6 +357,7 @@ #define PageMarker(page) test_bit(PG_marker, (page)-flags) #define SetPageMarker(page)set_bit(PG_marker, (page)-flags) +#define ClearPageMarker(page) clear_bit(PG_marker, (page)-flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) test_bit(PG_highmem, (page)-flags) diff -uNr 5-ac14/mm/shmem.c 5-ac14-fix/mm/shmem.c --- 5-ac14/mm/shmem.c Fri Jun 15 10:09:21 2001 +++ 5-ac14-fix/mm/shmem.c Fri Jun 15 11:37:44 2001 @@ -34,6 +34,7 @@ #define TMPFS_MAGIC0x01021994 #define ENTRIES_PER_PAGE (PAGE_SIZE/sizeof(unsigned long)) +#define SHMEM_MAX_BLOCKS (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) #define SHMEM_SB(sb) (sb-u.shmem_sb) @@ -56,10 +57,12 @@ struct inode *inode = (struct inode *)page-mapping-host; struct shmem_sb_info * sbinfo = SHMEM_SB(inode-i_sb); - inode-i_blocks -= BLOCKS_PER_PAGE; - spin_lock (sbinfo-stat_lock); - sbinfo-free_blocks++; - spin_unlock (sbinfo-stat_lock); + if (!PageMarker(page)) { + inode-i_blocks -= BLOCKS_PER_PAGE; + spin_lock (sbinfo-stat_lock); + sbinfo-free_blocks++; + spin_unlock (sbinfo-stat_lock); + } atomic_dec(shmem_nrpages); } @@ -241,9 +244,10 @@ *entry = swap; error = 0; /* Remove the page from the page cache */ - atomic_dec(shmem_nrpages); 
lru_cache_del(page); + SetPageMarker(page); remove_inode_page(page); + ClearPageMarker(page); /* Add it to the swap cache */ add_to_swap_cache(page, swap); @@ -1062,6 +1066,8 @@ unsigned long max_inodes, inodes; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + max_blocks = sbinfo-max_blocks; + max_inodes = sbinfo-max_inodes; if (shmem_parse_options (data, NULL, max_blocks, max_inodes)) return -EINVAL; @@ -1110,7 +1116,7 @@ sbinfo-free_blocks = blocks; sbinfo-max_inodes = inodes; sbinfo-free_inodes = inodes; - sb-s_maxbytes = (unsigned long long)(SHMEM_NR_DIRECT + (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)) PAGE_CACHE_SHIFT; + sb-s_maxbytes = (unsigned long long) SHMEM_MAX_BLOCKS PAGE_CACHE_SHIFT; sb-s_blocksize = PAGE_CACHE_SIZE; sb-s_blocksize_bits = PAGE_CACHE_SHIFT; sb-s_magic = TMPFS_MAGIC; @@ -1311,9 +1317,11 @@ struct qstr this; int vm_enough_memory(long pages); - error = -ENOMEM; + if (size (unsigned long long) SHMEM_MAX_BLOCKS PAGE_CACHE_SHIFT) + return ERR_PTR(-EINVAL); + if (!vm_enough_memory((size) PAGE_SHIFT)) - goto out; + return ERR_PTR(-ENOMEM); this.name = name; this.len = strlen(name); @@ -1321,7 +1329,7 @@ root = tmpfs_fs_type.kern_mnt-mnt_root; dentry = d_alloc(root, this); if (!dentry) - goto out; + return ERR_PTR(-ENOMEM); error = -ENFILE; file = get_empty_filp(); @@ -1347,7 +1355,6 @@ put_filp(file); put_dentry: dput (dentry); -out: return ERR_PTR(error); } /* - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] 2.4.5-ac13 ramfs and tmpfs accounting
Hi Alan, ramfs accounting does not get notified when a clean page gets dropped from the inode. Also tmpfs should use the new function to do accurate accounting. Else the cached field in -ac will get spurious negative values. The following patch fixes both. Greetings Christoph diff -uNr 5-ac13/fs/ramfs/inode.c 5-ac13-a/fs/ramfs/inode.c --- 5-ac13/fs/ramfs/inode.c Tue Jun 12 09:51:39 2001 +++ 5-ac13-a/fs/ramfs/inode.c Wed Jun 13 09:54:22 2001 @@ -289,7 +289,7 @@ return 0; } -static void ramfs_truncatepage(struct page *page) +static void ramfs_removepage(struct page *page) { struct inode *inode = (struct inode *)page->mapping->host; @@ -659,7 +659,7 @@ writepage: ramfs_writepage, prepare_write: ramfs_prepare_write, commit_write: ramfs_commit_write, - truncatepage: ramfs_truncatepage, + removepage: ramfs_removepage, }; static struct file_operations ramfs_file_operations = { diff -uNr 5-ac13/include/linux/fs.h 5-ac13-a/include/linux/fs.h --- 5-ac13/include/linux/fs.h Tue Jun 12 17:34:25 2001 +++ 5-ac13-a/include/linux/fs.h Wed Jun 13 10:23:48 2001 @@ -368,7 +368,7 @@ int (*sync_page)(struct page *); int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); int (*commit_write)(struct file *, struct page *, unsigned, unsigned); - void (*truncatepage)(struct page *); /* called from truncate_complete_page */ + void (*removepage)(struct page *); /* called when page gets removed from the +inode */ /* Unfortunately this kludge is needed for FIBMAP. 
Don't use it */ int (*bmap)(struct address_space *, long); }; diff -uNr 5-ac13/mm/filemap.c 5-ac13-a/mm/filemap.c --- 5-ac13/mm/filemap.c Tue Jun 12 09:51:45 2001 +++ 5-ac13-a/mm/filemap.c Wed Jun 13 09:56:43 2001 @@ -82,6 +82,9 @@ { struct address_space * mapping = page->mapping; + if (mapping->a_ops->removepage) + mapping->a_ops->removepage(page); + mapping->nrpages--; list_del(>list); page->mapping = NULL; @@ -206,9 +209,6 @@ if (!page->buffers || block_flushpage(page, 0)) lru_cache_del(page); - if (page->mapping->a_ops->truncatepage) - page->mapping->a_ops->truncatepage(page); - /* * We remove the page from the page cache _after_ we have * destroyed all buffer-cache references to it. Otherwise some diff -uNr 5-ac13/mm/shmem.c 5-ac13-a/mm/shmem.c --- 5-ac13/mm/shmem.c Tue Jun 12 09:51:45 2001 +++ 5-ac13-a/mm/shmem.c Wed Jun 13 09:56:20 2001 @@ -51,42 +51,16 @@ #define BLOCKS_PER_PAGE (PAGE_SIZE/512) -/* - * shmem_recalc_inode - recalculate the size of an inode - * - * @inode: inode to recalc - * @swap: additional swap pages freed externally - * - * We have to calculate the free blocks since the mm can drop pages - * behind our back - * - * But we know that normally - * inodes->i_blocks/BLOCKS_PER_PAGE == - * inode->i_mapping->nrpages + info->swapped - * - * So the mm freed - * inodes->i_blocks/BLOCKS_PER_PAGE - - * (inode->i_mapping->nrpages + info->swapped) - * - * It has to be called with the spinlock held. - * - * The swap parameter is a performance hack for truncate. 
- */ - -static void shmem_recalc_inode(struct inode * inode, unsigned long swap) +static void shmem_removepage(struct page *page) { - unsigned long freed; + struct inode *inode = (struct inode *)page->mapping->host; + struct shmem_sb_info * sbinfo = SHMEM_SB(inode->i_sb); - freed = (inode->i_blocks/BLOCKS_PER_PAGE) - - (inode->i_mapping->nrpages + SHMEM_I(inode)->swapped); - if (freed){ - struct shmem_sb_info * sbinfo = SHMEM_SB(inode->i_sb); - inode->i_blocks -= freed*BLOCKS_PER_PAGE; - spin_lock (>stat_lock); - sbinfo->free_blocks += freed; - spin_unlock (>stat_lock); - atomic_sub(freed-swap, _nrpages); - } + inode->i_blocks -= BLOCKS_PER_PAGE; + spin_lock (>stat_lock); + sbinfo->free_blocks++; + spin_unlock (>stat_lock); + atomic_dec(_nrpages); } static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index) @@ -166,6 +140,7 @@ unsigned long freed = 0; swp_entry_t **base, **ptr, **last; struct shmem_inode_info * info = SHMEM_I(inode); + struct shmem_sb_info * sbinfo = SHMEM_SB(inode->i_sb); down(>sem); inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -202,7 +177,9 @@ out: info->max_index = index; info->swapped -= freed; - shmem_recalc_inode(inode, freed); + spin_lock(>stat_lock); + sbinfo->free_blocks += freed; + spin_unlock(>stat_lock); spin_unlock (>lock); up(>sem); } @@ -257,7 +234,6 @@ entry = shmem_swp_entry(info,
Re: DoS using tmpfs
Hi Pavel, On Fri, 8 Jun 2001, Pavel Roskin wrote: > Hello! > > It appears that a system with tmpfs mounted with the default (!!!) > parameters can be used by ordinary users to make the system > non-functional. ... > 1) tmpfs, as opposed to ramfs doesn't limit the usage by >default. It's not a good default for a filesystem designed for >temporary files. Yes, use the size parameter. And no, ramfs has no resource limits in the stock kernel at all. In -ac it limits to half the size of the physical RAM unconditionally. But that's not useful for tmpfs since this uses swap also. So it is the admin's task to add a size parameter. I would love to add a size parameter in percent of virtual memory but this would need some changes in the swapon/off coding. > 2) Not delivering SIGINT to processes is probably not the best >behavior if the memory if low. However, one could argue that some >processes would use even more resources if they get control with >SIGINT. > > 3) All swap in the system was exhausted and yet tmpfs didn't return >ENOSPC to "dd". That the kernel locks up is IMHO a mm fault. tmpfs allocates its pages with GFP_USER and will return an error if this fails. Apparently it never fails but locks up. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: DoS using tmpfs
Hi Pavel, On Fri, 8 Jun 2001, Pavel Roskin wrote: Hello! It appears that a system with tmpfs mounted with the default (!!!) parameters can be used by ordinary users to make the system non-functional. ... 1) tmpfs, as opposed to ramfs doesn't limit the usage by default. It's not a good default for a filesystem designed for temporary files. Yes, use the size parameter. And no, ramfs has no resource limits in the stock kernel at all. In -ac it limits to half the size of the physical RAM unconditionally. But that's not useful for tmpfs since this uses swap also. So it is the admin's task to add a size parameter. I would love to add a size parameter in percent of virtual memory but this would need some changes in the swapon/off coding. 2) Not delivering SIGINT to processes is probably not the best behavior if the memory if low. However, one could argue that some processes would use even more resources if they get control with SIGINT. 3) All swap in the system was exhausted and yet tmpfs didn't return ENOSPC to dd. That the kernel locks up is IMHO a mm fault. tmpfs allocates its pages with GFP_USER and will return an error if this fails. Apparently it never fails but locks up. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] 2.4.5-ac13 ramfs and tmpfs accounting
Hi Alan, ramfs accounting does not get notified when a clean page gets dropped from the inode. Also tmpfs should use the new function to do accurate accounting. Else the cached field in -ac will get spurious negative values. The following patch fixes both. Greetings Christoph diff -uNr 5-ac13/fs/ramfs/inode.c 5-ac13-a/fs/ramfs/inode.c --- 5-ac13/fs/ramfs/inode.c Tue Jun 12 09:51:39 2001 +++ 5-ac13-a/fs/ramfs/inode.c Wed Jun 13 09:54:22 2001 @@ -289,7 +289,7 @@ return 0; } -static void ramfs_truncatepage(struct page *page) +static void ramfs_removepage(struct page *page) { struct inode *inode = (struct inode *)page-mapping-host; @@ -659,7 +659,7 @@ writepage: ramfs_writepage, prepare_write: ramfs_prepare_write, commit_write: ramfs_commit_write, - truncatepage: ramfs_truncatepage, + removepage: ramfs_removepage, }; static struct file_operations ramfs_file_operations = { diff -uNr 5-ac13/include/linux/fs.h 5-ac13-a/include/linux/fs.h --- 5-ac13/include/linux/fs.h Tue Jun 12 17:34:25 2001 +++ 5-ac13-a/include/linux/fs.h Wed Jun 13 10:23:48 2001 @@ -368,7 +368,7 @@ int (*sync_page)(struct page *); int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); int (*commit_write)(struct file *, struct page *, unsigned, unsigned); - void (*truncatepage)(struct page *); /* called from truncate_complete_page */ + void (*removepage)(struct page *); /* called when page gets removed from the +inode */ /* Unfortunately this kludge is needed for FIBMAP. 
Don't use it */ int (*bmap)(struct address_space *, long); }; diff -uNr 5-ac13/mm/filemap.c 5-ac13-a/mm/filemap.c --- 5-ac13/mm/filemap.c Tue Jun 12 09:51:45 2001 +++ 5-ac13-a/mm/filemap.c Wed Jun 13 09:56:43 2001 @@ -82,6 +82,9 @@ { struct address_space * mapping = page-mapping; + if (mapping-a_ops-removepage) + mapping-a_ops-removepage(page); + mapping-nrpages--; list_del(page-list); page-mapping = NULL; @@ -206,9 +209,6 @@ if (!page-buffers || block_flushpage(page, 0)) lru_cache_del(page); - if (page-mapping-a_ops-truncatepage) - page-mapping-a_ops-truncatepage(page); - /* * We remove the page from the page cache _after_ we have * destroyed all buffer-cache references to it. Otherwise some diff -uNr 5-ac13/mm/shmem.c 5-ac13-a/mm/shmem.c --- 5-ac13/mm/shmem.c Tue Jun 12 09:51:45 2001 +++ 5-ac13-a/mm/shmem.c Wed Jun 13 09:56:20 2001 @@ -51,42 +51,16 @@ #define BLOCKS_PER_PAGE (PAGE_SIZE/512) -/* - * shmem_recalc_inode - recalculate the size of an inode - * - * @inode: inode to recalc - * @swap: additional swap pages freed externally - * - * We have to calculate the free blocks since the mm can drop pages - * behind our back - * - * But we know that normally - * inodes-i_blocks/BLOCKS_PER_PAGE == - * inode-i_mapping-nrpages + info-swapped - * - * So the mm freed - * inodes-i_blocks/BLOCKS_PER_PAGE - - * (inode-i_mapping-nrpages + info-swapped) - * - * It has to be called with the spinlock held. - * - * The swap parameter is a performance hack for truncate. 
- */ - -static void shmem_recalc_inode(struct inode * inode, unsigned long swap) +static void shmem_removepage(struct page *page) { - unsigned long freed; + struct inode *inode = (struct inode *)page-mapping-host; + struct shmem_sb_info * sbinfo = SHMEM_SB(inode-i_sb); - freed = (inode-i_blocks/BLOCKS_PER_PAGE) - - (inode-i_mapping-nrpages + SHMEM_I(inode)-swapped); - if (freed){ - struct shmem_sb_info * sbinfo = SHMEM_SB(inode-i_sb); - inode-i_blocks -= freed*BLOCKS_PER_PAGE; - spin_lock (sbinfo-stat_lock); - sbinfo-free_blocks += freed; - spin_unlock (sbinfo-stat_lock); - atomic_sub(freed-swap, shmem_nrpages); - } + inode-i_blocks -= BLOCKS_PER_PAGE; + spin_lock (sbinfo-stat_lock); + sbinfo-free_blocks++; + spin_unlock (sbinfo-stat_lock); + atomic_dec(shmem_nrpages); } static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index) @@ -166,6 +140,7 @@ unsigned long freed = 0; swp_entry_t **base, **ptr, **last; struct shmem_inode_info * info = SHMEM_I(inode); + struct shmem_sb_info * sbinfo = SHMEM_SB(inode-i_sb); down(info-sem); inode-i_ctime = inode-i_mtime = CURRENT_TIME; @@ -202,7 +177,9 @@ out: info-max_index = index; info-swapped -= freed; - shmem_recalc_inode(inode, freed); + spin_lock(sbinfo-stat_lock); + sbinfo-free_blocks += freed; + spin_unlock(sbinfo-stat_lock); spin_unlock (info-lock); up(info-sem); } @@ -257,7 +234,6 @@ entry =
Re: unused shared memory is written into core dump - bug or feature?
Hi Peter, On Tue, 12 Jun 2001, Peter Niemayer wrote: > I just noticed that when I attach some SYSV shared memory segments > to my process and then that process dies from a SIGSEGV that _all_ > the shared memory is dumped into the core file, even if it was never > used and therefore didn't show up in any of the memory statistics. Fixed in recent kernel versions (2.2 and 2.4). It will create sparse files and not touch the unused address space. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: unused shared memory is written into core dump - bug or feature?
Hi Peter, On Tue, 12 Jun 2001, Peter Niemayer wrote: I just noticed that when I attach some SYSV shared memory segments to my process and then that process dies from a SIGSEGV that _all_ the shared memory is dumped into the core file, even if it was never used and therefore didn't show up in any of the memory statistics. Fixed in recent kernel versions (2.2 and 2.4). It will create sparse files and not touch the unused address space. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: tmpfs + sendfile bug ?
Hi Linus, On Mon, 21 May 2001, Linus Torvalds wrote: > In article <[EMAIL PROTECTED]>, Christoph Rohland > <[EMAIL PROTECTED]> wrote: >> >>tmpfs does not provide the necessary functions for sendfile and lo: >>readpage, prepare_write and commitwrite. >> >>And I do not see a way how to provide readpage in tmpfs :-( > > Why not just do it the same way ramfs does? > > If you don't have any backing store, you know that the page is > empty. If you _do_ have backing store, a readpage() won't be > called. Ergo: AFAIU readpage is fine as long as there is no backing store. But if the page is in the swap cache, the lookup of the page in the page cache will fail; generic_file_read, loop, sendfile will allocate a page and call readpage with that. Now readpage has to copy the swap cache page over to this page :-( IMHO Copying on swapin is really not worth the additional functionality. Did I miss something? Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: tmpfs + sendfile bug ?
Hi Linus, On Mon, 21 May 2001, Linus Torvalds wrote: In article [EMAIL PROTECTED], Christoph Rohland [EMAIL PROTECTED] wrote: tmpfs does not provide the necessary functions for sendfile and lo: readpage, prepare_write and commitwrite. And I do not see a way how to provide readpage in tmpfs :-( Why not just do it the same way ramfs does? If you don't have any backing store, you know that the page is empty. If you _do_ have backing store, a readpage() won't be called. Ergo: AFAIU readpage is fine as long as there is no backing store. But if the page is in the swap cache, the lookup of the page in the page cache will fail; generic_file_read, loop, sendfile will allocate a page and call readpage with that. Now readpage has to copy the swap cache page over to this page :-( IMHO Copying on swapin is really not worth the additional functionality. Did I miss something? Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: tmpfs + sendfile bug ?
Hi Pierre, On Mon, 21 May 2001, Pierre Etchemaite wrote: > I just found a problem GETting a file stored in tmpfs using proftpd; > I always get a "426 Transfer aborted. Data connection closed." > > That could be a bug with tmpfs and sendfile in 2.4.5-pre4 : > > [...] > read(8, "%PDF-1.4\r%\342\343\317\323\r\n870 0 obj\r<< \r/L"..., 8192) = 8192 > shmat(11, 0x4cfe65, 0x3)= 0xb4d4 > sendfile(11, 8, [0], 5045861) = -1 EINVAL (Invalid argument) > [...] > > Any idea ? That's probably the same reason why tmpfs and loopback do not work together: tmpfs does not provide the necessary functions for sendfile and lo: readpage, prepare_write and commitwrite. And I do not see a way how to provide readpage in tmpfs :-( Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: tmpfs + sendfile bug ?
Hi Pierre, On Mon, 21 May 2001, Pierre Etchemaite wrote: I just found a problem GETting a file stored in tmpfs using proftpd; I always get a 426 Transfer aborted. Data connection closed. That could be a bug with tmpfs and sendfile in 2.4.5-pre4 : [...] read(8, %PDF-1.4\r%\342\343\317\323\r\n870 0 obj\r \r/L..., 8192) = 8192 shmat(11, 0x4cfe65, 0x3)= 0xb4d4 sendfile(11, 8, [0], 5045861) = -1 EINVAL (Invalid argument) [...] Any idea ? That's probably the same reason why tmpfs and loopback do not work together: tmpfs does not provide the necessary functions for sendfile and lo: readpage, prepare_write and commitwrite. And I do not see a way how to provide readpage in tmpfs :-( Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] tmpfs accounting cleanup for -ac series
Hi Alan, While looking at the -ac version of ramfs I noticed that there is a new address operation introduced which I can use to cleanup shmem. This patch throws away some magic recalculation and makes the accounting of shmem accurate. It also encapsulates all accesses to the superblock_info into a macro. The patch is on top of my previous ones. Greetings Christoph diff -uNr 4-ac9/fs/proc/proc_misc.c c/fs/proc/proc_misc.c --- 4-ac9/fs/proc/proc_misc.c Thu May 17 13:17:37 2001 +++ c/fs/proc/proc_misc.c Thu May 17 13:11:30 2001 @@ -140,17 +140,9 @@ { struct sysinfo i; int len; - unsigned int cached, shmem; + unsigned int cached; - /* -* There may be some inconsistency because shmem_nrpages -* update is delayed to page_cache_size -* We make sure the cached value does not get below zero -*/ - cached = atomic_read(_cache_size); - shmem = atomic_read(_nrpages); - if (shmem < cached) - cached -= shmem; + cached = atomic_read(_cache_size) - atomic_read(_nrpages); /* * display in kilobytes. diff -uNr 4-ac9/mm/mmap.c c/mm/mmap.c --- 4-ac9/mm/mmap.c Thu May 17 13:17:37 2001 +++ c/mm/mmap.c Thu May 17 10:54:22 2001 @@ -56,24 +56,14 @@ */ long free; - unsigned long cached, shmem; - - /* -* There may be some inconsistency because shmem_nrpages -* update is delayed to the page_cache_size -* We make sure the cached value does not get below zero -*/ - cached = atomic_read(_cache_size); - shmem = atomic_read(_nrpages); - if (cached > shmem) - cached -= shmem; /* Sometimes we want to use more memory than we have. 
*/ if (sysctl_overcommit_memory) return 1; free = atomic_read(_pages); - free += cached; + free += atomic_read(_cache_size) ; + free -= atomic_read(_nrpages); free += nr_free_pages(); free += nr_swap_pages; diff -uNr 4-ac9/mm/shmem.c c/mm/shmem.c --- 4-ac9/mm/shmem.cThu May 17 13:17:37 2001 +++ c/mm/shmem.cThu May 17 10:54:03 2001 @@ -35,6 +35,8 @@ #define ENTRIES_PER_PAGE (PAGE_SIZE/sizeof(unsigned long)) +#define SHMEM_SB(sb) (>u.shmem_sb) + static struct super_operations shmem_ops; static struct address_space_operations shmem_aops; static struct file_operations shmem_file_operations; @@ -50,44 +52,6 @@ #define BLOCKS_PER_PAGE (PAGE_SIZE/512) /* - * shmem_recalc_inode - recalculate the size of an inode - * - * @inode: inode to recalc - * @swap: additional swap pages freed externally - * - * We have to calculate the free blocks since the mm can drop pages - * behind our back - * - * But we know that normally - * inodes->i_blocks/BLOCKS_PER_PAGE == - * inode->i_mapping->nrpages + info->swapped - * - * So the mm freed - * inodes->i_blocks/BLOCKS_PER_PAGE - - * (inode->i_mapping->nrpages + info->swapped) - * - * It has to be called with the spinlock held. - * - * The swap parameter is a performance hack for truncate. 
- */ - -static void shmem_recalc_inode(struct inode * inode, unsigned long swap) -{ - unsigned long freed; - - freed = (inode->i_blocks/BLOCKS_PER_PAGE) - - (inode->i_mapping->nrpages + SHMEM_I(inode)->swapped); - if (freed){ - struct shmem_sb_info * info = >i_sb->u.shmem_sb; - inode->i_blocks -= freed*BLOCKS_PER_PAGE; - spin_lock (>stat_lock); - info->free_blocks += freed; - spin_unlock (>stat_lock); - atomic_sub(freed-swap, _nrpages); - } -} - -/* * shmem_swp_entry - find the swap vector position in the info structure * * @info: info structure for the inode @@ -318,6 +282,7 @@ unsigned long index; unsigned long freed = 0; struct shmem_inode_info * info = SHMEM_I(inode); + struct shmem_sb_info * sbinfo = SHMEM_SB(inode->i_sb); down(>sem); inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -328,14 +293,28 @@ freed += shmem_truncate_indirect(info, index); info->swapped -= freed; - shmem_recalc_inode(inode, freed); + spin_lock(>stat_lock); + sbinfo->free_blocks += freed; + spin_unlock(>stat_lock); spin_unlock (>lock); up(>sem); } +static void shmem_truncatepage(struct page *page) +{ + struct inode *inode = (struct inode *)page->mapping->host; + struct shmem_sb_info * sbinfo = SHMEM_SB(inode->i_sb); + + inode->i_blocks -= BLOCKS_PER_PAGE; + spin_lock (>stat_lock); + sbinfo->free_blocks++; + spin_unlock (>stat_lock); + atomic_dec(_nrpages); +} + static void shmem_delete_inode(struct inode * inode) { - struct shmem_sb_info *info = >i_sb->u.shmem_sb; + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); inode->i_size =
[Patch] ramfs accounting in -ac broken
Hi Alan, The ramfs accounting is broken for shared mmaps. It simply does not recognize the pages allocated by writing into a shared mapping but takes them into account when freed. The attached patch should fix that. Greetings Christoph --- 4-ac9/fs/ramfs/inode.c Thu May 17 16:51:57 2001 +++ u4ac9/fs/ramfs/inode.c Thu May 17 14:47:48 2001 @@ -163,9 +163,6 @@ struct ramfs_sb_info *rsb = RAMFS_SB(inode->i_sb); int ret = 1; - if (PageDirty(page)) /* It's already been allocated */ - return 1; - lock_rsb(rsb); if ( (rsb->free_pages > 0) && @@ -185,8 +182,7 @@ { struct ramfs_sb_info *rsb = RAMFS_SB(inode->i_sb); - if (! PageDirty(page)) /* The page was never allocated - this can happen if it was only read */ + if (! Page_Uptodate(page)) return; lock_rsb(rsb); @@ -241,6 +237,8 @@ static int ramfs_readpage(struct file *file, struct page * page) { if (!Page_Uptodate(page)) { + if (!ramfs_alloc_page(file->f_dentry->d_inode, page)) + return -ENOSPC; memset(kmap(page), 0, PAGE_CACHE_SIZE); kunmap(page); flush_dcache_page(page); @@ -266,11 +264,12 @@ struct inode *inode = (struct inode *)page->mapping->host; void *addr; - if (! ramfs_alloc_page(inode, page)) - return -ENOSPC; - addr = (void *) kmap(page); if (!Page_Uptodate(page)) { + if (! ramfs_alloc_page(inode, page)) { + kunmap(page); + return -ENOSPC; + } memset(addr, 0, PAGE_CACHE_SIZE); flush_dcache_page(page); SetPageUptodate(page); - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] rootfs (part 1)
Hi Alan, On Thu, 17 May 2001, Alan Cox wrote: > I think you have a major tool problem. > > bash-2.04$ size mm/shmem.o > text data bss dec hex filename > 7422 572 0 7994 1f3a mm/shmem.o > bash-2.04$ size fs/ramfs/ramfs.o > text data bss dec hex filename > 3185 368 0 3553 de1 fs/ramfs/ramfs.o > > Never trust ls -l size for binaries, its very very unrelated. > > So ramfs is 3553 bytes, shmem.o in total is 8K on current -ac. But you cannot disable shmem.o totally. That's my whole point in the discussion. Why add something that is mostly included in the kernel already? You have to compare shmem with tmpfs against shmem w/o it: text data bss dec hex filename 3398 376 0 3774 ebe fs/ramfs/ramfs.o 5150 484 0 5634 1602 mm/shmem.o 9174 636 0 9810 2652 mm/shmem.o+tmpfs So tmpfs is 400 Bytes bigger than ramfs. If you add the correct timestamp handling the difference will go down further. And we gain functionality, don't we? Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] rootfs (part 1)
Hi Alan, On Thu, 17 May 2001, Alan Cox wrote: I think you have a major tool problem. bash-2.04$ size mm/shmem.o text data bss dec hex filename 7422 572 079941f3a mm/shmem.o bash-2.04$ size fs/ramfs/ramfs.o text data bss dec hex filename 3185 368 03553 de1 fs/ramfs/ramfs.o Never trust ls -l size for binaries, its very very unrelated. So ramfs is 3553 bytes, shmem.o in total is 8K on current -ac. But you cannot disable shmem.o totally. That's my whole point in the discussion. Why add something what is mostly included in the kernel already? You have to compare shmem with tmpfs against shmem w/o it: textdata bss dec hex filename 3398 376 03774 ebe fs/ramfs/ramfs.o 5150 484 056341602 mm/shmem.o 9174 636 098102652 mm/shmem.o+tmpfs So tmpfs is 400 Bytes bigger than ramfs. If you add the correct timestamp handling the difference will go down further. And we gain functionality, don't we? Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] ramfs accounting in -ac broken
Hi Alan, The ramfs accounting is broken for shared mmaps. It simply does not recognize the pages allocated by writing into a shared mapping but takes them into account when freed. The attached patch should fix that. Greetings Christoph --- 4-ac9/fs/ramfs/inode.c Thu May 17 16:51:57 2001 +++ u4ac9/fs/ramfs/inode.c Thu May 17 14:47:48 2001 @@ -163,9 +163,6 @@ struct ramfs_sb_info *rsb = RAMFS_SB(inode-i_sb); int ret = 1; - if (PageDirty(page)) /* It's already been allocated */ - return 1; - lock_rsb(rsb); if ( (rsb-free_pages 0) @@ -185,8 +182,7 @@ { struct ramfs_sb_info *rsb = RAMFS_SB(inode-i_sb); - if (! PageDirty(page)) /* The page was never allocated - this can happen if it was only read */ + if (! Page_Uptodate(page)) return; lock_rsb(rsb); @@ -241,6 +237,8 @@ static int ramfs_readpage(struct file *file, struct page * page) { if (!Page_Uptodate(page)) { + if (!ramfs_alloc_page(file-f_dentry-d_inode, page)) + return -ENOSPC; memset(kmap(page), 0, PAGE_CACHE_SIZE); kunmap(page); flush_dcache_page(page); @@ -266,11 +264,12 @@ struct inode *inode = (struct inode *)page-mapping-host; void *addr; - if (! ramfs_alloc_page(inode, page)) - return -ENOSPC; - addr = (void *) kmap(page); if (!Page_Uptodate(page)) { + if (! ramfs_alloc_page(inode, page)) { + kunmap(page); + return -ENOSPC; + } memset(addr, 0, PAGE_CACHE_SIZE); flush_dcache_page(page); SetPageUptodate(page); - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] tmpfs accounting cleanup for -ac series
Hi Alan, While looking at the -ac version of ramfs I noticed that there is a new address operation introduced which I can use to cleanup shmem. This patch throws away some magic recalculation and makes the accounting of shmem accurate. It also encapsulates all accesses to the superblock_info into a macro. The patch is on top of my previous ones. Greetings Christoph diff -uNr 4-ac9/fs/proc/proc_misc.c c/fs/proc/proc_misc.c --- 4-ac9/fs/proc/proc_misc.c Thu May 17 13:17:37 2001 +++ c/fs/proc/proc_misc.c Thu May 17 13:11:30 2001 @@ -140,17 +140,9 @@ { struct sysinfo i; int len; - unsigned int cached, shmem; + unsigned int cached; - /* -* There may be some inconsistency because shmem_nrpages -* update is delayed to page_cache_size -* We make sure the cached value does not get below zero -*/ - cached = atomic_read(page_cache_size); - shmem = atomic_read(shmem_nrpages); - if (shmem cached) - cached -= shmem; + cached = atomic_read(page_cache_size) - atomic_read(shmem_nrpages); /* * display in kilobytes. diff -uNr 4-ac9/mm/mmap.c c/mm/mmap.c --- 4-ac9/mm/mmap.c Thu May 17 13:17:37 2001 +++ c/mm/mmap.c Thu May 17 10:54:22 2001 @@ -56,24 +56,14 @@ */ long free; - unsigned long cached, shmem; - - /* -* There may be some inconsistency because shmem_nrpages -* update is delayed to the page_cache_size -* We make sure the cached value does not get below zero -*/ - cached = atomic_read(page_cache_size); - shmem = atomic_read(shmem_nrpages); - if (cached shmem) - cached -= shmem; /* Sometimes we want to use more memory than we have. 
*/ if (sysctl_overcommit_memory) return 1; free = atomic_read(buffermem_pages); - free += cached; + free += atomic_read(page_cache_size) ; + free -= atomic_read(shmem_nrpages); free += nr_free_pages(); free += nr_swap_pages; diff -uNr 4-ac9/mm/shmem.c c/mm/shmem.c --- 4-ac9/mm/shmem.cThu May 17 13:17:37 2001 +++ c/mm/shmem.cThu May 17 10:54:03 2001 @@ -35,6 +35,8 @@ #define ENTRIES_PER_PAGE (PAGE_SIZE/sizeof(unsigned long)) +#define SHMEM_SB(sb) (sb-u.shmem_sb) + static struct super_operations shmem_ops; static struct address_space_operations shmem_aops; static struct file_operations shmem_file_operations; @@ -50,44 +52,6 @@ #define BLOCKS_PER_PAGE (PAGE_SIZE/512) /* - * shmem_recalc_inode - recalculate the size of an inode - * - * @inode: inode to recalc - * @swap: additional swap pages freed externally - * - * We have to calculate the free blocks since the mm can drop pages - * behind our back - * - * But we know that normally - * inodes-i_blocks/BLOCKS_PER_PAGE == - * inode-i_mapping-nrpages + info-swapped - * - * So the mm freed - * inodes-i_blocks/BLOCKS_PER_PAGE - - * (inode-i_mapping-nrpages + info-swapped) - * - * It has to be called with the spinlock held. - * - * The swap parameter is a performance hack for truncate. 
- */ - -static void shmem_recalc_inode(struct inode * inode, unsigned long swap) -{ - unsigned long freed; - - freed = (inode-i_blocks/BLOCKS_PER_PAGE) - - (inode-i_mapping-nrpages + SHMEM_I(inode)-swapped); - if (freed){ - struct shmem_sb_info * info = inode-i_sb-u.shmem_sb; - inode-i_blocks -= freed*BLOCKS_PER_PAGE; - spin_lock (info-stat_lock); - info-free_blocks += freed; - spin_unlock (info-stat_lock); - atomic_sub(freed-swap, shmem_nrpages); - } -} - -/* * shmem_swp_entry - find the swap vector position in the info structure * * @info: info structure for the inode @@ -318,6 +282,7 @@ unsigned long index; unsigned long freed = 0; struct shmem_inode_info * info = SHMEM_I(inode); + struct shmem_sb_info * sbinfo = SHMEM_SB(inode-i_sb); down(info-sem); inode-i_ctime = inode-i_mtime = CURRENT_TIME; @@ -328,14 +293,28 @@ freed += shmem_truncate_indirect(info, index); info-swapped -= freed; - shmem_recalc_inode(inode, freed); + spin_lock(sbinfo-stat_lock); + sbinfo-free_blocks += freed; + spin_unlock(sbinfo-stat_lock); spin_unlock (info-lock); up(info-sem); } +static void shmem_truncatepage(struct page *page) +{ + struct inode *inode = (struct inode *)page-mapping-host; + struct shmem_sb_info * sbinfo = SHMEM_SB(inode-i_sb); + + inode-i_blocks -= BLOCKS_PER_PAGE; + spin_lock (sbinfo-stat_lock); + sbinfo-free_blocks++; + spin_unlock (sbinfo-stat_lock); + atomic_dec(shmem_nrpages); +} + static void shmem_delete_inode(struct inode * inode) { - struct shmem_sb_info *info = inode-i_sb-u.shmem_sb; +
Re: [PATCH] rootfs (part 1)
Hi Alexander, On Wed, 16 May 2001, Alexander Viro wrote: > Because what I need is an absolute minimum. Heck, I don't even use > regular files (in the full variant of patch, that is). They might > become useful, but I can live with mkdir() and mknod(). So what about adding shmem_mknod and shmem_mkdir to the core shmem.c part? They are now under CONFIG_TMPFS but are only ~20 lines of code. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] rootfs (part 1)
Hi Linus, On Wed, 16 May 2001, Linus Torvalds wrote: > > On 16 May 2001, Christoph Rohland wrote: >> >> cr:/speicher/src/u4ac9 $ ls -l mm/shmem.o* >> -rw-r--r-- 1 cr users 154652 Mai 16 19:27 mm/shmem.o-tmpfs >> -rw-r--r-- 1 cr users 180764 Mai 16 19:24 mm/shmem.o+tmpfs >> cr:/speicher/src/u4ac9 $ ls -l fs/ramfs/ramfs.o >> -rw-r--r-- 1 cr users 141452 Mai 16 19:27 fs/ramfs/ramfs.o >> >> So CONFIG_TMPFS adds 26k and ramfs 140k. > > What the hell are you doing? Compiling with debugging or something? Yep, sorry, that was uml with debugging info. > The ramfs inode.o file (the only file that ramfs contains) has 376 > bytes of data and 1612 bytes of code. BYTES. The whole final object > file with all the relocation information is > > -rw-r--r-- 1 torvalds eng 5734 May 16 10:58 ramfs.o > > but out of that 5.5kB, only 2kB are actually linked into the kernel > and are used to _run_. -rw-r--r-- 1 root root 8656 May 16 20:27 fs/ramfs/ramfs.o -rw-r--r-- 1 root root 11688 May 16 20:24 mm/shmem.o-tmpfs -rw-r--r-- 1 root root 18592 May 16 20:20 mm/shmem.o+tmpfs That's an -ac kernel, so ramfs does accounting and is a little bigger than yours. So the read/write support in tmpfs is about the same size as ramfs. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] rootfs (part 1)
Hi Linus, On Wed, 16 May 2001, Linus Torvalds wrote: > Looks ok, but it also feels like 2.5.x stuff to me. > > Also, there's the question of whether to make ramfs just built-in, > or make _tmpfs_ built in - ramfs is certainly simpler, but tmpfs > does the same things and you need that one for shared mappings etc. > > Comments? cr:/speicher/src/u4ac9 $ ls -l mm/shmem.o* -rw-r--r--1 cr users 154652 Mai 16 19:27 mm/shmem.o-tmpfs -rw-r--r--1 cr users 180764 Mai 16 19:24 mm/shmem.o+tmpfs cr:/speicher/src/u4ac9 $ ls -l fs/ramfs/ramfs.o -rw-r--r--1 cr users 141452 Mai 16 19:27 fs/ramfs/ramfs.o So CONFIG_TMPFS adds 26k and ramfs 140k. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] rootfs (part 1)
Hi Al, On Wed, 16 May 2001, Alexander Viro wrote: > One point that might be better done differently - since we > need ramfs for boot I've just made fs/Config.in declare CONFIG_RAMFS > as define_bool CONFIG_RAMFS y. If ramfs grows (e.g. gets resource > limits patches from -ac) we might be better off doing a minimal > variant permanently in kernel (calling it rootfs) and making > ramfs use rootfs methods. It's completely separate issue, so I've > done it the simplest way for the time being. Why do you use ramfs? Most of it is duplicated in tmpfs and ramfs is a minimal _example_ fs. There was some agreement that this should stay so. Look into mm/shmem.c and see how little is added by CONFIG_TMPFS and how much is duplicated from ramfs. If we really think the added swap vector per file in tmpfs is a major overhead, we should add the nonswapping functions there. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] rootfs (part 1)
Hi Al, On Wed, 16 May 2001, Alexander Viro wrote: One point that might be better done differently - since we need ramfs for boot I've just made fs/Config.in declare CONFIG_RAMFS as define_bool CONFIG_RAMFS y. If ramfs grows (e.g. gets resource limits patches from -ac) we might be better off doing a minimal variant permanently in kernel (calling it rootfs) and making ramfs use rootfs methods. It's completely separate issue, so I've done it the simplest way for the time being. Why do you use ramfs? Most of it is duplicated in tmpfs and ramfs is a minimal _example_ fs. There was some agreement that this should stay so. Look into mm/shmem.c and look how little is added by CONFIG_TMPFS and how much is duplicated from ramfs If we really think the added swap vector per file in tmpfs is a major overhead we should add the nonswapping functions there. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] rootfs (part 1)
Hi Linus, On Wed, 16 May 2001, Linus Torvalds wrote: Looks ok, but it also feels like 2.5.x stuff to me. Also, there's the question of whether to make ramfs just built-in, or make _tmpfs_ built in - ramfs is certainly simpler, but tmpfs does the same things and you need that one for shared mappings etc. Comments? cr:/speicher/src/u4ac9 $ ls -l mm/shmem.o* -rw-r--r--1 cr users 154652 Mai 16 19:27 mm/shmem.o-tmpfs -rw-r--r--1 cr users 180764 Mai 16 19:24 mm/shmem.o+tmpfs cr:/speicher/src/u4ac9 $ ls -l fs/ramfs/ramfs.o -rw-r--r--1 cr users 141452 Mai 16 19:27 fs/ramfs/ramfs.o So CONFIG_TMPFS adds 26k and ramfs 140k. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] rootfs (part 1)
Hi Linus, On Wed, 16 May 2001, Linus Torvalds wrote: On 16 May 2001, Christoph Rohland wrote: cr:/speicher/src/u4ac9 $ ls -l mm/shmem.o* -rw-r--r--1 cr users 154652 Mai 16 19:27 mm/shmem.o-tmpfs -rw-r--r--1 cr users 180764 Mai 16 19:24 mm/shmem.o+tmpfs cr:/speicher/src/u4ac9 $ ls -l fs/ramfs/ramfs.o -rw-r--r--1 cr users 141452 Mai 16 19:27 fs/ramfs/ramfs.o So CONFIG_TMPFS adds 26k and ramfs 140k. What the hell are you doing? Compiling with debugging or something? Yep, sorry that was uml with debugging info. The ramfs inode.o file (the only file that ramfs contains) has 376 bytes of data and 1612 bytes of code. BYTES. The whole final object file with all the relocation information is -rw-r--r-- 1 torvalds eng 5734 May 16 10:58 ramfs.o but out of that 5.5kB, only 2kB are actually linked into the kernel and are used to _run_. -rw-r--r--1 root root 8656 May 16 20:27 fs/ramfs/ramfs.o -rw-r--r--1 root root11688 May 16 20:24 mm/shmem.o-tmpfs -rw-r--r--1 root root18592 May 16 20:20 mm/shmem.o+tmpfs That's an -ac kernel, so ramfs does accounting and is a little bigger than yours. So the read/write support in tmpfs is about the same size as ramfs. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] rootfs (part 1)
Hi Alexander, On Wed, 16 May 2001, Alexander Viro wrote: Because what I need is an absolute minimum. Heck, I don't even use regular files (in the full variant of patch, that is). They might become useful, but I can live with mkdir() and mknod(). So what about adding shmem_mknod and shmem_mkdir to the core shmem.c part? They are now under CONFIG_TMPFS but are only ~20 lines of code. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] allow tmpfs bigger than 1GB on s390x
Hi Martin, Here is the patch which implements triple indirect blocks in tmpfs. For the rest of the word: This is needed since s390x is a 64 Bit platform with pagesize of 4k :-( It is on top of my other tmpfs fixes which you can find at ftp://ftp.sap.com/pub/linuxlab/people/cr Greetings Christoph diff -uNr 4-mSsas/include/linux/shmem_fs.h 4-mSsasb/include/linux/shmem_fs.h --- 4-mSsas/include/linux/shmem_fs.hMon May 14 08:49:42 2001 +++ 4-mSsasb/include/linux/shmem_fs.h Mon May 14 09:05:39 2001 @@ -22,9 +22,9 @@ struct shmem_inode_info { spinlock_t lock; struct semaphoresem; - unsigned long max_index; + unsigned long next_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ - swp_entry_t **i_indirect; /* doubly indirect blocks */ + void **i_indirect; /* indirect blocks */ unsigned long swapped; int locked; /* into memory */ struct list_headlist; diff -uNr 4-mSsas/mm/shmem.c 4-mSsasb/mm/shmem.c --- 4-mSsas/mm/shmem.c Mon May 14 08:49:42 2001 +++ 4-mSsasb/mm/shmem.c Tue May 15 09:12:00 2001 @@ -34,7 +34,6 @@ #define TMPFS_MAGIC0x01021994 #define ENTRIES_PER_PAGE (PAGE_SIZE/sizeof(unsigned long)) -#define NR_SINGLE (ENTRIES_PER_PAGE + SHMEM_NR_DIRECT) static struct super_operations shmem_ops; static struct address_space_operations shmem_aops; @@ -65,7 +64,7 @@ * * So the mm freed * inodes->i_blocks/BLOCKS_PER_PAGE - - * (inode->i_mapping->nrpages + info->swapped) + * (inode->i_mapping->nrpages + info->swapped) * * It has to be called with the spinlock held. * @@ -88,9 +87,53 @@ } } -static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index) +/* + * shmem_swp_entry - find the swap vector position in the info structure + * + * @info: info structure for the inode + * @index: index of the page to find + * @page: optional page to add to the structure. Has to be preset to + * all zeros + * + * If there is no space allocated yet it will return -ENOMEM when + * page == 0 else it will use the page for the needed block. 
+ * + * returns -EFBIG if the index is too big. + * + * + * The swap vector is organized the following way: + * + * There are SHMEM_NR_DIRECT entries directly stored in the + * shmem_inode_info structure. So small files do not need an addional + * allocation. + * + * For pages with index > SHMEM_NR_DIRECT there is the pointer + * i_indirect which points to a page which holds in the first half + * doubly indirect blocks, in the second half triple indirect blocks: + * + * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the + * following layout (for SHMEM_NR_DIRECT == 16): + * + * i_indirect -> dir --> 16-19 + * | +-> 20-23 + * | + * +-->dir2 --> 24-27 + * |+-> 28-31 + * |+-> 32-35 + * |+-> 36-39 + * | + * +-->dir3 --> 40-43 + *+-> 44-47 + *+-> 48-51 + *+-> 52-55 + */ + +#define SHMEM_MAX_BLOCKS (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE * +ENTRIES_PER_PAGE/2*(ENTRIES_PER_PAGE+1)) + +static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long +index, unsigned long page) { unsigned long offset; + void **dir; if (index < SHMEM_NR_DIRECT) return info->i_direct+index; @@ -99,23 +142,66 @@ offset = index % ENTRIES_PER_PAGE; index /= ENTRIES_PER_PAGE; - if (index >= ENTRIES_PER_PAGE) - return ERR_PTR(-EFBIG); - if (!info->i_indirect) { - info->i_indirect = (swp_entry_t **) get_zeroed_page(GFP_USER); - if (!info->i_indirect) + info->i_indirect = (void *) page; + return ERR_PTR(-ENOMEM); + } + + dir = info->i_indirect + index; + if (index >= ENTRIES_PER_PAGE/2) { + index -= ENTRIES_PER_PAGE/2; + dir = info->i_indirect + ENTRIES_PER_PAGE/2 + + index/ENTRIES_PER_PAGE; + index %= ENTRIES_PER_PAGE; + + if(!*dir) { + *dir = (void *) page; + /* We return since we will need another page + in the next step */ return ERR_PTR(-ENOMEM); + } + dir = ((void **)*dir) + index; } - if(!(info->i_indirect[index])) { - info->i_indirect[index] = (swp_entry_t *) get_zeroed_page(GFP_USER); - if (!info->i_indirect[index]) + if (!*dir) { + if (!page) return 
ERR_PTR(-ENOMEM); + *dir = (void *)page; } - - return
Assorted tmpfs fixes
struct sysinfo i; int len; + unsigned int cached, shmem; + + /* +* There may be some inconsistency because shmem_nrpages +* update is delayed to page_cache_size +* We make sure the cached value does not get below zero +*/ + cached = atomic_read(_cache_size); + shmem = atomic_read(_nrpages); + if (shmem < cached) + cached -= shmem; /* * display in kilobytes. @@ -153,8 +164,8 @@ "Swap: %8lu %8lu %8lu\n", B(i.totalram), B(i.totalram-i.freeram), B(i.freeram), B(i.sharedram), B(i.bufferram), -B(atomic_read(_cache_size)), B(i.totalswap), -B(i.totalswap-i.freeswap), B(i.freeswap)); + B(cached), B(i.totalswap), + B(i.totalswap-i.freeswap), B(i.freeswap)); /* * Tagged format, for easy grepping and expansion. * The above will go away eventually, once the tools @@ -180,7 +191,7 @@ K(i.freeram), K(i.sharedram), K(i.bufferram), -K(atomic_read(_cache_size)), + K(cached), K(nr_active_pages), K(nr_inactive_dirty_pages), K(nr_inactive_clean_pages()), diff -uNr 2.4.4-mSsu/include/linux/shmem_fs.h 2.4.4-mSsua/include/linux/shmem_fs.h --- 2.4.4-mSsu/include/linux/shmem_fs.h Wed May 2 18:36:05 2001 +++ 2.4.4-mSsua/include/linux/shmem_fs.hMon May 7 12:52:00 2001 @@ -17,6 +17,8 @@ unsigned long val; } swp_entry_t; +extern atomic_t shmem_nrpages; + struct shmem_inode_info { spinlock_t lock; struct semaphoresem; diff -uNr 2.4.4-mSsu/mm/mmap.c 2.4.4-mSsua/mm/mmap.c --- 2.4.4-mSsu/mm/mmap.cSun Apr 29 20:33:01 2001 +++ 2.4.4-mSsua/mm/mmap.c Mon May 7 13:42:03 2001 @@ -55,13 +55,24 @@ */ long free; - + unsigned long cached, shmem; + + /* +* There may be some inconsistency because shmem_nrpages +* update is delayed to the page_cache_size +* We make sure the cached value does not get below zero +*/ + cached = atomic_read(_cache_size); + shmem = atomic_read(_nrpages); + if (cached > shmem) + cached -= shmem; + /* Sometimes we want to use more memory than we have. 
*/ if (sysctl_overcommit_memory) return 1; free = atomic_read(_pages); - free += atomic_read(_cache_size); + free += cached; free += nr_free_pages(); free += nr_swap_pages; diff -uNr 2.4.4-mSsu/mm/shmem.c 2.4.4-mSsua/mm/shmem.c --- 2.4.4-mSsu/mm/shmem.c Fri May 4 21:37:34 2001 +++ 2.4.4-mSsua/mm/shmem.c Mon May 7 11:13:27 2001 @@ -3,7 +3,8 @@ * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. - * 2000 Christoph Rohland + * 2000-2001 Christoph Rohland + * 2000-2001 SAP AG * * This file is released under the GPL. */ @@ -45,6 +46,7 @@ LIST_HEAD (shmem_inodes); static spinlock_t shmem_ilock = SPIN_LOCK_UNLOCKED; +atomic_t shmem_nrpages = ATOMIC_INIT(0); #define BLOCKS_PER_PAGE (PAGE_SIZE/512) @@ -52,6 +54,7 @@ * shmem_recalc_inode - recalculate the size of an inode * * @inode: inode to recalc + * @swap: additional swap pages freed externally * * We have to calculate the free blocks since the mm can drop pages * behind our back @@ -62,12 +65,14 @@ * * So the mm freed * inodes->i_blocks/BLOCKS_PER_PAGE - - * (inode->i_mapping->nrpages + info->swapped) + * (inode->i_mapping->nrpages + info->swapped) * * It has to be called with the spinlock held. + * + * The swap parameter is a performance hack for truncate. 
*/ -static void shmem_recalc_inode(struct inode * inode) +static void shmem_recalc_inode(struct inode * inode, unsigned long swap) { unsigned long freed; @@ -79,6 +84,7 @@ spin_lock (>stat_lock); info->free_blocks += freed; spin_unlock (>stat_lock); + atomic_sub(freed-swap, _nrpages); } } @@ -195,7 +201,7 @@ out: info->max_index = index; info->swapped -= freed; - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, freed); spin_unlock (>lock); up(>sem); } @@ -250,14 +256,15 @@ entry = shmem_swp_entry(info, page->index); if (IS_ERR(entry)) /* this had been allocted on page allocation */ BUG(); - shmem_recalc_inode(page->mapping->host); + shmem_recalc_inode(page->mapping->host, 0); error = -EAGAIN; if (entry->val) BUG(); *entry = swap; error = 0; - /* Remove the from the page cache */ + /* Remove the page from the page
Assorted tmpfs fixes
, shmem; + + /* +* There may be some inconsistency because shmem_nrpages +* update is delayed to page_cache_size +* We make sure the cached value does not get below zero +*/ + cached = atomic_read(page_cache_size); + shmem = atomic_read(shmem_nrpages); + if (shmem cached) + cached -= shmem; /* * display in kilobytes. @@ -153,8 +164,8 @@ Swap: %8lu %8lu %8lu\n, B(i.totalram), B(i.totalram-i.freeram), B(i.freeram), B(i.sharedram), B(i.bufferram), -B(atomic_read(page_cache_size)), B(i.totalswap), -B(i.totalswap-i.freeswap), B(i.freeswap)); + B(cached), B(i.totalswap), + B(i.totalswap-i.freeswap), B(i.freeswap)); /* * Tagged format, for easy grepping and expansion. * The above will go away eventually, once the tools @@ -180,7 +191,7 @@ K(i.freeram), K(i.sharedram), K(i.bufferram), -K(atomic_read(page_cache_size)), + K(cached), K(nr_active_pages), K(nr_inactive_dirty_pages), K(nr_inactive_clean_pages()), diff -uNr 2.4.4-mSsu/include/linux/shmem_fs.h 2.4.4-mSsua/include/linux/shmem_fs.h --- 2.4.4-mSsu/include/linux/shmem_fs.h Wed May 2 18:36:05 2001 +++ 2.4.4-mSsua/include/linux/shmem_fs.hMon May 7 12:52:00 2001 @@ -17,6 +17,8 @@ unsigned long val; } swp_entry_t; +extern atomic_t shmem_nrpages; + struct shmem_inode_info { spinlock_t lock; struct semaphoresem; diff -uNr 2.4.4-mSsu/mm/mmap.c 2.4.4-mSsua/mm/mmap.c --- 2.4.4-mSsu/mm/mmap.cSun Apr 29 20:33:01 2001 +++ 2.4.4-mSsua/mm/mmap.c Mon May 7 13:42:03 2001 @@ -55,13 +55,24 @@ */ long free; - + unsigned long cached, shmem; + + /* +* There may be some inconsistency because shmem_nrpages +* update is delayed to the page_cache_size +* We make sure the cached value does not get below zero +*/ + cached = atomic_read(page_cache_size); + shmem = atomic_read(shmem_nrpages); + if (cached shmem) + cached -= shmem; + /* Sometimes we want to use more memory than we have. 
*/ if (sysctl_overcommit_memory) return 1; free = atomic_read(buffermem_pages); - free += atomic_read(page_cache_size); + free += cached; free += nr_free_pages(); free += nr_swap_pages; diff -uNr 2.4.4-mSsu/mm/shmem.c 2.4.4-mSsua/mm/shmem.c --- 2.4.4-mSsu/mm/shmem.c Fri May 4 21:37:34 2001 +++ 2.4.4-mSsua/mm/shmem.c Mon May 7 11:13:27 2001 @@ -3,7 +3,8 @@ * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. - * 2000 Christoph Rohland + * 2000-2001 Christoph Rohland + * 2000-2001 SAP AG * * This file is released under the GPL. */ @@ -45,6 +46,7 @@ LIST_HEAD (shmem_inodes); static spinlock_t shmem_ilock = SPIN_LOCK_UNLOCKED; +atomic_t shmem_nrpages = ATOMIC_INIT(0); #define BLOCKS_PER_PAGE (PAGE_SIZE/512) @@ -52,6 +54,7 @@ * shmem_recalc_inode - recalculate the size of an inode * * @inode: inode to recalc + * @swap: additional swap pages freed externally * * We have to calculate the free blocks since the mm can drop pages * behind our back @@ -62,12 +65,14 @@ * * So the mm freed * inodes-i_blocks/BLOCKS_PER_PAGE - - * (inode-i_mapping-nrpages + info-swapped) + * (inode-i_mapping-nrpages + info-swapped) * * It has to be called with the spinlock held. + * + * The swap parameter is a performance hack for truncate. 
*/ -static void shmem_recalc_inode(struct inode * inode) +static void shmem_recalc_inode(struct inode * inode, unsigned long swap) { unsigned long freed; @@ -79,6 +84,7 @@ spin_lock (info-stat_lock); info-free_blocks += freed; spin_unlock (info-stat_lock); + atomic_sub(freed-swap, shmem_nrpages); } } @@ -195,7 +201,7 @@ out: info-max_index = index; info-swapped -= freed; - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, freed); spin_unlock (info-lock); up(info-sem); } @@ -250,14 +256,15 @@ entry = shmem_swp_entry(info, page-index); if (IS_ERR(entry)) /* this had been allocted on page allocation */ BUG(); - shmem_recalc_inode(page-mapping-host); + shmem_recalc_inode(page-mapping-host, 0); error = -EAGAIN; if (entry-val) BUG(); *entry = swap; error = 0; - /* Remove the from the page cache */ + /* Remove the page from the page cache */ + atomic_dec(shmem_nrpages); lru_cache_del(page
[Patch] allow tmpfs bigger than 1GB on s390x
Hi Martin, Here is the patch which implements triple indirect blocks in tmpfs. For the rest of the word: This is needed since s390x is a 64 Bit platform with pagesize of 4k :-( It is on top of my other tmpfs fixes which you can find at ftp://ftp.sap.com/pub/linuxlab/people/cr Greetings Christoph diff -uNr 4-mSsas/include/linux/shmem_fs.h 4-mSsasb/include/linux/shmem_fs.h --- 4-mSsas/include/linux/shmem_fs.hMon May 14 08:49:42 2001 +++ 4-mSsasb/include/linux/shmem_fs.h Mon May 14 09:05:39 2001 @@ -22,9 +22,9 @@ struct shmem_inode_info { spinlock_t lock; struct semaphoresem; - unsigned long max_index; + unsigned long next_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ - swp_entry_t **i_indirect; /* doubly indirect blocks */ + void **i_indirect; /* indirect blocks */ unsigned long swapped; int locked; /* into memory */ struct list_headlist; diff -uNr 4-mSsas/mm/shmem.c 4-mSsasb/mm/shmem.c --- 4-mSsas/mm/shmem.c Mon May 14 08:49:42 2001 +++ 4-mSsasb/mm/shmem.c Tue May 15 09:12:00 2001 @@ -34,7 +34,6 @@ #define TMPFS_MAGIC0x01021994 #define ENTRIES_PER_PAGE (PAGE_SIZE/sizeof(unsigned long)) -#define NR_SINGLE (ENTRIES_PER_PAGE + SHMEM_NR_DIRECT) static struct super_operations shmem_ops; static struct address_space_operations shmem_aops; @@ -65,7 +64,7 @@ * * So the mm freed * inodes-i_blocks/BLOCKS_PER_PAGE - - * (inode-i_mapping-nrpages + info-swapped) + * (inode-i_mapping-nrpages + info-swapped) * * It has to be called with the spinlock held. * @@ -88,9 +87,53 @@ } } -static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index) +/* + * shmem_swp_entry - find the swap vector position in the info structure + * + * @info: info structure for the inode + * @index: index of the page to find + * @page: optional page to add to the structure. Has to be preset to + * all zeros + * + * If there is no space allocated yet it will return -ENOMEM when + * page == 0 else it will use the page for the needed block. 
+ * + * returns -EFBIG if the index is too big. + * + * + * The swap vector is organized the following way: + * + * There are SHMEM_NR_DIRECT entries directly stored in the + * shmem_inode_info structure. So small files do not need an addional + * allocation. + * + * For pages with index SHMEM_NR_DIRECT there is the pointer + * i_indirect which points to a page which holds in the first half + * doubly indirect blocks, in the second half triple indirect blocks: + * + * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the + * following layout (for SHMEM_NR_DIRECT == 16): + * + * i_indirect - dir -- 16-19 + * | +- 20-23 + * | + * +--dir2 -- 24-27 + * |+- 28-31 + * |+- 32-35 + * |+- 36-39 + * | + * +--dir3 -- 40-43 + *+- 44-47 + *+- 48-51 + *+- 52-55 + */ + +#define SHMEM_MAX_BLOCKS (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE * +ENTRIES_PER_PAGE/2*(ENTRIES_PER_PAGE+1)) + +static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long +index, unsigned long page) { unsigned long offset; + void **dir; if (index SHMEM_NR_DIRECT) return info-i_direct+index; @@ -99,23 +142,66 @@ offset = index % ENTRIES_PER_PAGE; index /= ENTRIES_PER_PAGE; - if (index = ENTRIES_PER_PAGE) - return ERR_PTR(-EFBIG); - if (!info-i_indirect) { - info-i_indirect = (swp_entry_t **) get_zeroed_page(GFP_USER); - if (!info-i_indirect) + info-i_indirect = (void *) page; + return ERR_PTR(-ENOMEM); + } + + dir = info-i_indirect + index; + if (index = ENTRIES_PER_PAGE/2) { + index -= ENTRIES_PER_PAGE/2; + dir = info-i_indirect + ENTRIES_PER_PAGE/2 + + index/ENTRIES_PER_PAGE; + index %= ENTRIES_PER_PAGE; + + if(!*dir) { + *dir = (void *) page; + /* We return since we will need another page + in the next step */ return ERR_PTR(-ENOMEM); + } + dir = ((void **)*dir) + index; } - if(!(info-i_indirect[index])) { - info-i_indirect[index] = (swp_entry_t *) get_zeroed_page(GFP_USER); - if (!info-i_indirect[index]) + if (!*dir) { + if (!page) return ERR_PTR(-ENOMEM); + *dir = (void *)page; } - 
- return info-i_indirect[index]+offset; +
Re: 2.4.4 kernel freeze for unknown reason
Hi Mike, On Sat, 12 May 2001, Mike Galbraith wrote: > Why do I not see this behavior with a heavy swap throughput test > load? It seems decidedly odd to me that swapspace should remain > allocated on other folks lightly loaded boxen given that my heavily > loaded box does release swapspace quite regularly. What am I > missing? Are you using a database or something else that mostly uses shared mem/tmpfs? This does reclaim swap space on swap in. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.4.4 kernel freeze for unknown reason
Hi Mike, On Sat, 12 May 2001, Mike Galbraith wrote: > Why do I not see this behavior with a heavy swap throughput test > load? It seems decidedly odd to me that swapspace should remain > allocated on other folks lightly loaded boxen given that my heavily > loaded box does release swapspace quite regularly. What am I > missing? Are you using a database or something else that mostly uses shared mem/tmpfs? This does reclaim swap space on swap in. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] Do not account shmem pages to the page cache
Hi, The appended patch does its own accounting of shmem pages and adjusts the page cache size to take these into account. So now again you will see shmem pages as used in top/vmstat etc. This confused a lot of people. There is an uncertainty in the calculations since the vm may drop pages behind shmem's back, so the number of shmem pages may be estimated too high. This especially happens on truncate because first the page cache is reduced and later the shmem readjusts its count. To prevent negative cache sizes the adjustment is only done if shmem_nrpages < page_cache_size. The latter part of the patch (all the init.c files) also exports the shmem page number to the shared memory field in meminfo. This means a change in semantics of this field but apparently a lot of people interpret this field exactly this way and it was not used any more. The patches are on top of my encapsulation patch. Greetings Christoph diff -uNr 2.4.4-mSsu/fs/proc/proc_misc.c 2.4.4-mSsua/fs/proc/proc_misc.c --- 2.4.4-mSsu/fs/proc/proc_misc.c Sun Apr 29 20:32:52 2001 +++ 2.4.4-mSsua/fs/proc/proc_misc.c Mon May 7 13:38:53 2001 @@ -140,6 +140,17 @@ { struct sysinfo i; int len; + unsigned int cached, shmem; + + /* +* There may be some inconsistency because shmem_nrpages +* update is delayed to page_cache_size +* We make sure the cached value does not get below zero +*/ + cached = atomic_read(&page_cache_size); + shmem = atomic_read(&shmem_nrpages); + if (shmem < cached) + cached -= shmem; /* * display in kilobytes. @@ -153,8 +164,8 @@ "Swap: %8lu %8lu %8lu\n", B(i.totalram), B(i.totalram-i.freeram), B(i.freeram), B(i.sharedram), B(i.bufferram), -B(atomic_read(&page_cache_size)), B(i.totalswap), -B(i.totalswap-i.freeswap), B(i.freeswap)); + B(cached), B(i.totalswap), + B(i.totalswap-i.freeswap), B(i.freeswap)); /* * Tagged format, for easy grepping and expansion. 
* The above will go away eventually, once the tools @@ -180,7 +191,7 @@ K(i.freeram), K(i.sharedram), K(i.bufferram), -K(atomic_read(&page_cache_size)), + K(cached), K(nr_active_pages), K(nr_inactive_dirty_pages), K(nr_inactive_clean_pages()), diff -uNr 2.4.4-mSsu/include/linux/shmem_fs.h 2.4.4-mSsua/include/linux/shmem_fs.h --- 2.4.4-mSsu/include/linux/shmem_fs.h Wed May 2 18:36:05 2001 +++ 2.4.4-mSsua/include/linux/shmem_fs.h Mon May 7 12:52:00 2001 @@ -17,6 +17,8 @@ unsigned long val; } swp_entry_t; +extern atomic_t shmem_nrpages; + struct shmem_inode_info { spinlock_t lock; struct semaphore sem; diff -uNr 2.4.4-mSsu/mm/mmap.c 2.4.4-mSsua/mm/mmap.c --- 2.4.4-mSsu/mm/mmap.c Sun Apr 29 20:33:01 2001 +++ 2.4.4-mSsua/mm/mmap.c Mon May 7 13:42:03 2001 @@ -55,13 +55,24 @@ */ long free; - + unsigned long cached, shmem; + + /* +* There may be some inconsistency because shmem_nrpages +* update is delayed to the page_cache_size +* We make sure the cached value does not get below zero +*/ + cached = atomic_read(&page_cache_size); + shmem = atomic_read(&shmem_nrpages); + if (cached > shmem) + cached -= shmem; + /* Sometimes we want to use more memory than we have. */ if (sysctl_overcommit_memory) return 1; free = atomic_read(&buffermem_pages); - free += atomic_read(&page_cache_size); + free += cached; free += nr_free_pages(); free += nr_swap_pages; diff -uNr 2.4.4-mSsu/mm/shmem.c 2.4.4-mSsua/mm/shmem.c --- 2.4.4-mSsu/mm/shmem.c Fri May 4 21:37:34 2001 +++ 2.4.4-mSsua/mm/shmem.c Mon May 7 11:13:27 2001 @@ -3,7 +3,8 @@ * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. - * 2000 Christoph Rohland + * 2000-2001 Christoph Rohland + * 2000-2001 SAP AG * * This file is released under the GPL. 
*/ @@ -45,6 +46,7 @@ LIST_HEAD (shmem_inodes); static spinlock_t shmem_ilock = SPIN_LOCK_UNLOCKED; +atomic_t shmem_nrpages = ATOMIC_INIT(0); #define BLOCKS_PER_PAGE (PAGE_SIZE/512) @@ -52,6 +54,7 @@ * shmem_recalc_inode - recalculate the size of an inode * * @inode: inode to recalc + * @swap: additional swap pages freed externally * * We have to calculate the free blocks since the mm can drop pages * behind our back @@ -62,12 +65,14 @@ * * So the mm freed * inodes->i_blocks/BLOCKS_PER_PAGE - - * (inode->i_mapping->nrpages + info->swapped) + * (inode->i_mapping->nrpages + info->swapped) * * It has to be called with the s
[Patch] Do not account shmem pages to the page cache
Hi, The appended patch does its own accounting of shmem pages and adjusts the page cache size to take these into account. So now again you will see shmem pages as used in top/vmstat etc. This confused a lot of people. There is an uncertainty in the calculations since the vm may drop pages behind shmem's back, so the number of shmem pages may be estimated too high. This especially happens on truncate because first the page cache is reduced and later the shmem readjusts its count. To prevent negative cache sizes the adjustment is only done if shmem_nrpages < page_cache_size. The latter part of the patch (all the init.c files) also exports the shmem page number to the shared memory field in meminfo. This means a change in semantics of this field but apparently a lot of people interpret this field exactly this way and it was not used any more. The patches are on top of my encapsulation patch. Greetings Christoph diff -uNr 2.4.4-mSsu/fs/proc/proc_misc.c 2.4.4-mSsua/fs/proc/proc_misc.c --- 2.4.4-mSsu/fs/proc/proc_misc.c Sun Apr 29 20:32:52 2001 +++ 2.4.4-mSsua/fs/proc/proc_misc.c Mon May 7 13:38:53 2001 @@ -140,6 +140,17 @@ { struct sysinfo i; int len; + unsigned int cached, shmem; + + /* +* There may be some inconsistency because shmem_nrpages +* update is delayed to page_cache_size +* We make sure the cached value does not get below zero +*/ + cached = atomic_read(&page_cache_size); + shmem = atomic_read(&shmem_nrpages); + if (shmem < cached) + cached -= shmem; /* * display in kilobytes. @@ -153,8 +164,8 @@ "Swap: %8lu %8lu %8lu\n", B(i.totalram), B(i.totalram-i.freeram), B(i.freeram), B(i.sharedram), B(i.bufferram), -B(atomic_read(&page_cache_size)), B(i.totalswap), -B(i.totalswap-i.freeswap), B(i.freeswap)); + B(cached), B(i.totalswap), + B(i.totalswap-i.freeswap), B(i.freeswap)); /* * Tagged format, for easy grepping and expansion. 
* The above will go away eventually, once the tools @@ -180,7 +191,7 @@ K(i.freeram), K(i.sharedram), K(i.bufferram), -K(atomic_read(&page_cache_size)), + K(cached), K(nr_active_pages), K(nr_inactive_dirty_pages), K(nr_inactive_clean_pages()), diff -uNr 2.4.4-mSsu/include/linux/shmem_fs.h 2.4.4-mSsua/include/linux/shmem_fs.h --- 2.4.4-mSsu/include/linux/shmem_fs.h Wed May 2 18:36:05 2001 +++ 2.4.4-mSsua/include/linux/shmem_fs.h Mon May 7 12:52:00 2001 @@ -17,6 +17,8 @@ unsigned long val; } swp_entry_t; +extern atomic_t shmem_nrpages; + struct shmem_inode_info { spinlock_t lock; struct semaphore sem; diff -uNr 2.4.4-mSsu/mm/mmap.c 2.4.4-mSsua/mm/mmap.c --- 2.4.4-mSsu/mm/mmap.c Sun Apr 29 20:33:01 2001 +++ 2.4.4-mSsua/mm/mmap.c Mon May 7 13:42:03 2001 @@ -55,13 +55,24 @@ */ long free; - + unsigned long cached, shmem; + + /* +* There may be some inconsistency because shmem_nrpages +* update is delayed to the page_cache_size +* We make sure the cached value does not get below zero +*/ + cached = atomic_read(&page_cache_size); + shmem = atomic_read(&shmem_nrpages); + if (cached > shmem) + cached -= shmem; + /* Sometimes we want to use more memory than we have. */ if (sysctl_overcommit_memory) return 1; free = atomic_read(&buffermem_pages); - free += atomic_read(&page_cache_size); + free += cached; free += nr_free_pages(); free += nr_swap_pages; diff -uNr 2.4.4-mSsu/mm/shmem.c 2.4.4-mSsua/mm/shmem.c --- 2.4.4-mSsu/mm/shmem.c Fri May 4 21:37:34 2001 +++ 2.4.4-mSsua/mm/shmem.c Mon May 7 11:13:27 2001 @@ -3,7 +3,8 @@ * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. - * 2000 Christoph Rohland + * 2000-2001 Christoph Rohland + * 2000-2001 SAP AG * * This file is released under the GPL. 
*/ @@ -45,6 +46,7 @@ LIST_HEAD (shmem_inodes); static spinlock_t shmem_ilock = SPIN_LOCK_UNLOCKED; +atomic_t shmem_nrpages = ATOMIC_INIT(0); #define BLOCKS_PER_PAGE (PAGE_SIZE/512) @@ -52,6 +54,7 @@ * shmem_recalc_inode - recalculate the size of an inode * * @inode: inode to recalc + * @swap: additional swap pages freed externally * * We have to calculate the free blocks since the mm can drop pages * behind our back @@ -62,12 +65,14 @@ * * So the mm freed * inodes-i_blocks/BLOCKS_PER_PAGE - - * (inode-i_mapping-nrpages + info-swapped) + * (inode-i_mapping-nrpages + info-swapped) * * It has to be called with the spinlock held
[Resend] Collection of tmpfs patches
Hi, There is some confusion about my latest tmpfs fixes. There are three patches, which are cumulative against 2.4.4: 1) deadlock fix for write out of mmap regions (AFAIK this is integrated in the -ac kernels) 2) encapsulate access to shmem_inode_info 3) do inline symlinks. I attach all these patches to this mail in case somebody missed one. Greetings Christoph diff -uNr 2.4.4/include/linux/shmem_fs.h c/include/linux/shmem_fs.h --- 2.4.4/include/linux/shmem_fs.h Sun Apr 29 20:33:00 2001 +++ c/include/linux/shmem_fs.h Sun Apr 29 22:43:56 2001 @@ -19,6 +19,7 @@ struct shmem_inode_info { spinlock_t lock; + struct semaphore sem; unsigned long max_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ swp_entry_t **i_indirect; /* doubly indirect blocks */ diff -uNr 2.4.4/mm/shmem.c c/mm/shmem.c --- 2.4.4/mm/shmem.c Mon Apr 30 09:45:39 2001 +++ c/mm/shmem.c Tue May 1 15:15:38 2001 @@ -161,6 +161,7 @@ swp_entry_t **base, **ptr, **last; struct shmem_inode_info * info = &inode->u.shmem_i; + down(&info->sem); inode->i_ctime = inode->i_mtime = CURRENT_TIME; spin_lock (&info->lock); index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -197,6 +198,7 @@ info->swapped -= freed; shmem_recalc_inode(inode); spin_unlock (&info->lock); + up(&info->sem); } static void shmem_delete_inode(struct inode * inode) @@ -281,15 +283,12 @@ * still need to guard against racing with shm_writepage(), which might * be trying to move the page to the swap cache as we run. 
*/ -static struct page * shmem_getpage_locked(struct inode * inode, unsigned long idx) +static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode +* inode, unsigned long idx) { struct address_space * mapping = inode->i_mapping; - struct shmem_inode_info *info; struct page * page; swp_entry_t *entry; - info = >u.shmem_i; - repeat: page = find_lock_page(mapping, idx); if (page) @@ -393,6 +392,7 @@ static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr) { + struct shmem_inode_info *info; struct address_space * mapping = inode->i_mapping; int error; @@ -407,27 +407,28 @@ page_cache_release(*ptr); } - down (>i_sem); - /* retest we may have slept */ + info = >u.shmem_i; + down (>sem); + /* retest we may have slept */ + + *ptr = ERR_PTR(-EFAULT); if (inode->i_size < (loff_t) idx * PAGE_CACHE_SIZE) - goto sigbus; - *ptr = shmem_getpage_locked(inode, idx); + goto failed; + + *ptr = shmem_getpage_locked(>u.shmem_i, inode, idx); if (IS_ERR (*ptr)) goto failed; + UnlockPage(*ptr); - up (>i_sem); + up (>sem); return 0; failed: - up (>i_sem); + up (>sem); error = PTR_ERR(*ptr); - *ptr = NOPAGE_OOM; - if (error != -EFBIG) - *ptr = NOPAGE_SIGBUS; - return error; -sigbus: - up (>i_sem); *ptr = NOPAGE_SIGBUS; - return -EFAULT; + if (error == -ENOMEM) + *ptr = NOPAGE_OOM; + return error; } struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int no_share) @@ -500,6 +501,7 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev) { struct inode * inode; + struct shmem_inode_info *info; spin_lock (>u.shmem_sb.stat_lock); if (!sb->u.shmem_sb.free_inodes) { @@ -519,7 +521,9 @@ inode->i_rdev = to_kdev_t(dev); inode->i_mapping->a_ops = _aops; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - spin_lock_init (>u.shmem_i.lock); + info = >u.shmem_i; + spin_lock_init (>lock); + sema_init (>sem, 1); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -549,6 
+553,7 @@ shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) { struct inode*inode = file->f_dentry->d_inode; + struct shmem_inode_info *info; unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; loff_t pos; struct page *page; @@ -624,7 +629,11 @@ __get_user(dummy, buf+bytes-1); } - page = shmem_getpage_locked(inode, index); + info = >u.shmem_i; + down (>sem); + page = shmem_getpage_locked(info, inode, index); + up (>sem); + status = PTR_ERR(page); if (IS_ERR(page)) break; @@ -635,7 +644,6 @@ }
[Resend] Collection of tmpfs patches
Hi, There is some confusion about my latest tmpfs fixes. There are three patches, which are cumulative against 2.4.4: 1) deadlock fix for write out of mmap regions (AFAIK this is integrated in the -ac kernels) 2) encapsulate access to shmem_inode_info 3) do inline symlinks. I attach all these patches to this mail in case somebody missed one. Greetings Christoph diff -uNr 2.4.4/include/linux/shmem_fs.h c/include/linux/shmem_fs.h --- 2.4.4/include/linux/shmem_fs.h Sun Apr 29 20:33:00 2001 +++ c/include/linux/shmem_fs.h Sun Apr 29 22:43:56 2001 @@ -19,6 +19,7 @@ struct shmem_inode_info { spinlock_t lock; + struct semaphore sem; unsigned long max_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ swp_entry_t **i_indirect; /* doubly indirect blocks */ diff -uNr 2.4.4/mm/shmem.c c/mm/shmem.c --- 2.4.4/mm/shmem.c Mon Apr 30 09:45:39 2001 +++ c/mm/shmem.c Tue May 1 15:15:38 2001 @@ -161,6 +161,7 @@ swp_entry_t **base, **ptr, **last; struct shmem_inode_info * info = &inode->u.shmem_i; + down(&info->sem); inode->i_ctime = inode->i_mtime = CURRENT_TIME; spin_lock (&info->lock); index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -197,6 +198,7 @@ info->swapped -= freed; shmem_recalc_inode(inode); spin_unlock (&info->lock); + up(&info->sem); } static void shmem_delete_inode(struct inode * inode) @@ -281,15 +283,12 @@ * still need to guard against racing with shm_writepage(), which might * be trying to move the page to the swap cache as we run. 
*/ -static struct page * shmem_getpage_locked(struct inode * inode, unsigned long idx) +static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode +* inode, unsigned long idx) { struct address_space * mapping = inode-i_mapping; - struct shmem_inode_info *info; struct page * page; swp_entry_t *entry; - info = inode-u.shmem_i; - repeat: page = find_lock_page(mapping, idx); if (page) @@ -393,6 +392,7 @@ static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr) { + struct shmem_inode_info *info; struct address_space * mapping = inode-i_mapping; int error; @@ -407,27 +407,28 @@ page_cache_release(*ptr); } - down (inode-i_sem); - /* retest we may have slept */ + info = inode-u.shmem_i; + down (info-sem); + /* retest we may have slept */ + + *ptr = ERR_PTR(-EFAULT); if (inode-i_size (loff_t) idx * PAGE_CACHE_SIZE) - goto sigbus; - *ptr = shmem_getpage_locked(inode, idx); + goto failed; + + *ptr = shmem_getpage_locked(inode-u.shmem_i, inode, idx); if (IS_ERR (*ptr)) goto failed; + UnlockPage(*ptr); - up (inode-i_sem); + up (info-sem); return 0; failed: - up (inode-i_sem); + up (info-sem); error = PTR_ERR(*ptr); - *ptr = NOPAGE_OOM; - if (error != -EFBIG) - *ptr = NOPAGE_SIGBUS; - return error; -sigbus: - up (inode-i_sem); *ptr = NOPAGE_SIGBUS; - return -EFAULT; + if (error == -ENOMEM) + *ptr = NOPAGE_OOM; + return error; } struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int no_share) @@ -500,6 +501,7 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev) { struct inode * inode; + struct shmem_inode_info *info; spin_lock (sb-u.shmem_sb.stat_lock); if (!sb-u.shmem_sb.free_inodes) { @@ -519,7 +521,9 @@ inode-i_rdev = to_kdev_t(dev); inode-i_mapping-a_ops = shmem_aops; inode-i_atime = inode-i_mtime = inode-i_ctime = CURRENT_TIME; - spin_lock_init (inode-u.shmem_i.lock); + info = inode-u.shmem_i; + spin_lock_init (info-lock); + sema_init (info-sem, 1); switch (mode S_IFMT) 
{ default: init_special_inode(inode, mode, dev); @@ -549,6 +553,7 @@ shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) { struct inode*inode = file-f_dentry-d_inode; + struct shmem_inode_info *info; unsigned long limit = current-rlim[RLIMIT_FSIZE].rlim_cur; loff_t pos; struct page *page; @@ -624,7 +629,11 @@ __get_user(dummy, buf+bytes-1); } - page = shmem_getpage_locked(inode, index); + info = inode-u.shmem_i; + down (info-sem); + page = shmem_getpage_locked(info, inode, index); + up (info-sem); + status = PTR_ERR(page); if
[Patch] inline symlinks for tmpfs
Hi David, On Tue, 24 Apr 2001, David L. Parsley wrote: >> OK I will do that for tmpfs soon. And I will do the symlink >> inlining with that patch. OK, here comes the patch for the symlink inlining. It is on top of my previous patch to encapsulate access to the private inode info. Greetings Christoph diff -uNr 2.4.4-mmap_write-SHMEM_I/mm/shmem.c 2.4.4-mmap_write-SHMEM_I-symlink/mm/shmem.c --- 2.4.4-mmap_write-SHMEM_I/mm/shmem.c Fri May 4 21:32:22 2001 +++ 2.4.4-mmap_write-SHMEM_I-symlink/mm/shmem.c Fri May 4 21:37:34 2001 @@ -41,7 +41,6 @@ static struct inode_operations shmem_inode_operations; static struct file_operations shmem_dir_operations; static struct inode_operations shmem_dir_inode_operations; -static struct inode_operations shmem_symlink_inode_operations; static struct vm_operations_struct shmem_vm_ops; LIST_HEAD (shmem_inodes); @@ -205,11 +204,13 @@ { struct shmem_sb_info *info = >i_sb->u.shmem_sb; - spin_lock (_ilock); - list_del (_I(inode)->list); - spin_unlock (_ilock); inode->i_size = 0; - shmem_truncate (inode); + if (inode->i_op->truncate == shmem_truncate){ + spin_lock (_ilock); + list_del (_I(inode)->list); + spin_unlock (_ilock); + shmem_truncate(inode); + } spin_lock (>stat_lock); info->free_inodes++; spin_unlock (>stat_lock); @@ -532,6 +533,9 @@ case S_IFREG: inode->i_op = _inode_operations; inode->i_fop = _file_operations; + spin_lock (_ilock); + list_add (_I(inode)->list, _inodes); + spin_unlock (_ilock); break; case S_IFDIR: inode->i_nlink++; @@ -539,17 +543,17 @@ inode->i_fop = _dir_operations; break; case S_IFLNK: - inode->i_op = _symlink_inode_operations; break; } - spin_lock (_ilock); - list_add (_I(inode)->list, _inodes); - spin_unlock (_ilock); } return inode; } #ifdef CONFIG_TMPFS + +static struct inode_operations shmem_symlink_inode_operations; +static struct inode_operations shmem_symlink_inline_operations; + static ssize_t shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) { @@ -930,33 +934,54 @@ struct 
inode *inode; struct page *page; char *kaddr; + struct shmem_inode_info * info; error = shmem_mknod(dir, dentry, S_IFLNK | S_IRWXUGO, 0); if (error) return error; - len = strlen(symname); + len = strlen(symname) + 1; if (len > PAGE_SIZE) return -ENAMETOOLONG; - + inode = dentry->d_inode; - down(>i_sem); - page = shmem_getpage_locked(SHMEM_I(inode), inode, 0); - if (IS_ERR(page)) - goto fail; - kaddr = kmap(page); - memcpy(kaddr, symname, len); - kunmap(page); + info = SHMEM_I(inode); inode->i_size = len; - SetPageDirty(page); - UnlockPage(page); - page_cache_release(page); - up(>i_sem); + if (len <= sizeof(struct shmem_inode_info)) { + /* do it inline */ + memcpy(info, symname, len); + inode->i_op = _symlink_inline_operations; + } else { + spin_lock (_ilock); + list_add (>list, _inodes); + spin_unlock (_ilock); + down(>i_sem); + page = shmem_getpage_locked(info, inode, 0); + if (IS_ERR(page)) { + up(>i_sem); + return PTR_ERR(page); + } + kaddr = kmap(page); + memcpy(kaddr, symname, len); + kunmap(page); + SetPageDirty(page); + UnlockPage(page); + page_cache_release(page); + up(>i_sem); + inode->i_op = _symlink_inode_operations; + } dir->i_ctime = dir->i_mtime = CURRENT_TIME; return 0; -fail: - up(>i_sem); - return PTR_ERR(page); +} + +static int shmem_readlink_inline(struct dentry *dentry, char *buffer, int buflen) +{ + return vfs_readlink(dentry,buffer,buflen, (const char +*)SHMEM_I(dentry->d_inode)); +} + +static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) +{ + return vfs_follow_link(nd, (const char *)SHMEM_I(dentry->d_inode)); } static int shmem_readlink(struct dentry *dentry, char *buffer, int buflen) @@ -986,6 +1011,17 @@ return res; } +static struct inode_operations shmem_symlink_inline_operations = { + readlink: shmem_readlink_inline, + follow_link:shmem_follow_link_inline,
[Patch] inline symlinks for tmpfs
Hi David, On Tue, 24 Apr 2001, David L. Parsley wrote: OK I will do that for tmpfs soon. And I will do the symlink inlining with that patch. OK, here comes the patch for the symlink inlining. It is on top of my previous patch to encapsulate access to the private inode info. Greetings Christoph diff -uNr 2.4.4-mmap_write-SHMEM_I/mm/shmem.c 2.4.4-mmap_write-SHMEM_I-symlink/mm/shmem.c --- 2.4.4-mmap_write-SHMEM_I/mm/shmem.c Fri May 4 21:32:22 2001 +++ 2.4.4-mmap_write-SHMEM_I-symlink/mm/shmem.c Fri May 4 21:37:34 2001 @@ -41,7 +41,6 @@ static struct inode_operations shmem_inode_operations; static struct file_operations shmem_dir_operations; static struct inode_operations shmem_dir_inode_operations; -static struct inode_operations shmem_symlink_inode_operations; static struct vm_operations_struct shmem_vm_ops; LIST_HEAD (shmem_inodes); @@ -205,11 +204,13 @@ { struct shmem_sb_info *info = inode-i_sb-u.shmem_sb; - spin_lock (shmem_ilock); - list_del (SHMEM_I(inode)-list); - spin_unlock (shmem_ilock); inode-i_size = 0; - shmem_truncate (inode); + if (inode-i_op-truncate == shmem_truncate){ + spin_lock (shmem_ilock); + list_del (SHMEM_I(inode)-list); + spin_unlock (shmem_ilock); + shmem_truncate(inode); + } spin_lock (info-stat_lock); info-free_inodes++; spin_unlock (info-stat_lock); @@ -532,6 +533,9 @@ case S_IFREG: inode-i_op = shmem_inode_operations; inode-i_fop = shmem_file_operations; + spin_lock (shmem_ilock); + list_add (SHMEM_I(inode)-list, shmem_inodes); + spin_unlock (shmem_ilock); break; case S_IFDIR: inode-i_nlink++; @@ -539,17 +543,17 @@ inode-i_fop = shmem_dir_operations; break; case S_IFLNK: - inode-i_op = shmem_symlink_inode_operations; break; } - spin_lock (shmem_ilock); - list_add (SHMEM_I(inode)-list, shmem_inodes); - spin_unlock (shmem_ilock); } return inode; } #ifdef CONFIG_TMPFS + +static struct inode_operations shmem_symlink_inode_operations; +static struct inode_operations shmem_symlink_inline_operations; + static ssize_t shmem_file_write(struct 
file *file,const char *buf,size_t count,loff_t *ppos) { @@ -930,33 +934,54 @@ struct inode *inode; struct page *page; char *kaddr; + struct shmem_inode_info * info; error = shmem_mknod(dir, dentry, S_IFLNK | S_IRWXUGO, 0); if (error) return error; - len = strlen(symname); + len = strlen(symname) + 1; if (len PAGE_SIZE) return -ENAMETOOLONG; - + inode = dentry-d_inode; - down(inode-i_sem); - page = shmem_getpage_locked(SHMEM_I(inode), inode, 0); - if (IS_ERR(page)) - goto fail; - kaddr = kmap(page); - memcpy(kaddr, symname, len); - kunmap(page); + info = SHMEM_I(inode); inode-i_size = len; - SetPageDirty(page); - UnlockPage(page); - page_cache_release(page); - up(inode-i_sem); + if (len = sizeof(struct shmem_inode_info)) { + /* do it inline */ + memcpy(info, symname, len); + inode-i_op = shmem_symlink_inline_operations; + } else { + spin_lock (shmem_ilock); + list_add (info-list, shmem_inodes); + spin_unlock (shmem_ilock); + down(inode-i_sem); + page = shmem_getpage_locked(info, inode, 0); + if (IS_ERR(page)) { + up(inode-i_sem); + return PTR_ERR(page); + } + kaddr = kmap(page); + memcpy(kaddr, symname, len); + kunmap(page); + SetPageDirty(page); + UnlockPage(page); + page_cache_release(page); + up(inode-i_sem); + inode-i_op = shmem_symlink_inode_operations; + } dir-i_ctime = dir-i_mtime = CURRENT_TIME; return 0; -fail: - up(inode-i_sem); - return PTR_ERR(page); +} + +static int shmem_readlink_inline(struct dentry *dentry, char *buffer, int buflen) +{ + return vfs_readlink(dentry,buffer,buflen, (const char +*)SHMEM_I(dentry-d_inode)); +} + +static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) +{ + return vfs_follow_link(nd, (const char *)SHMEM_I(dentry-d_inode)); } static int shmem_readlink(struct dentry *dentry, char *buffer, int buflen) @@ -986,6 +1011,17 @@ return res; } +static struct
[Patch] encapsulate shmem access to shmem_inode_info
Hi, On 24 Apr 2001, Christoph Rohland wrote: > Hi Al, > > On Tue, 24 Apr 2001, Alexander Viro wrote: >> So yes, IMO having such patches available _is_ a good thing. And in >> 2.5 we definitely want them in the tree. If encapsulation part gets >> there during 2.4 and separate allocation is available for all of >> them it will be easier to do without PITA in process. > > OK I will do that for tmpfs soon. And I will do the symlink inlining > with that patch. Here comes the patch to encapsulate all accesses to struct shmem_inode_info into a macro. It is now trivial to allocate the private part independently from the inode. Greetings Christoph P.S: The symlink inlining will come in a separate patch diff -uNr 2.4.4-mmap_write/include/linux/shmem_fs.h 2.4.4-mmap_write-SHMEM_I/include/linux/shmem_fs.h --- 2.4.4-mmap_write/include/linux/shmem_fs.h Tue May 1 20:02:00 2001 +++ 2.4.4-mmap_write-SHMEM_I/include/linux/shmem_fs.h Tue May 1 20:06:10 2001 @@ -18,14 +18,15 @@ } swp_entry_t; struct shmem_inode_info { - spinlock_t lock; - struct semaphore sem; - unsigned long max_index; - swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ - swp_entry_t **i_indirect; /* doubly indirect blocks */ - unsigned long swapped; - int locked; /* into memory */ + spinlock_t lock; + struct semaphoresem; + unsigned long max_index; + swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ + swp_entry_t **i_indirect; /* doubly indirect blocks */ + unsigned long swapped; + int locked; /* into memory */ struct list_headlist; + struct inode *inode; }; struct shmem_sb_info { @@ -35,5 +36,7 @@ unsigned long free_inodes; /* How many are left for allocation */ spinlock_tstat_lock; }; + +#define SHMEM_I(inode) (>u.shmem_i) #endif diff -uNr 2.4.4-mmap_write/ipc/shm.c 2.4.4-mmap_write-SHMEM_I/ipc/shm.c --- 2.4.4-mmap_write/ipc/shm.c Wed Apr 11 12:36:47 2001 +++ 2.4.4-mmap_write-SHMEM_I/ipc/shm.c Tue May 1 20:06:10 2001 @@ -348,6 +348,7 @@ static void shm_get_stat (unsigned long *rss, 
unsigned long *swp) { + struct shmem_inode_info *info; int i; *rss = 0; @@ -361,10 +362,11 @@ if(shp == NULL) continue; inode = shp->shm_file->f_dentry->d_inode; - spin_lock (>u.shmem_i.lock); + info = SHMEM_I(inode); + spin_lock (>lock); *rss += inode->i_mapping->nrpages; - *swp += inode->u.shmem_i.swapped; - spin_unlock (>u.shmem_i.lock); + *swp += info->swapped; + spin_unlock (>lock); } } diff -uNr 2.4.4-mmap_write/mm/shmem.c 2.4.4-mmap_write-SHMEM_I/mm/shmem.c --- 2.4.4-mmap_write/mm/shmem.c Tue May 1 20:02:00 2001 +++ 2.4.4-mmap_write-SHMEM_I/mm/shmem.c Wed May 2 16:46:00 2001 @@ -73,7 +73,7 @@ unsigned long freed; freed = (inode->i_blocks/BLOCKS_PER_PAGE) - - (inode->i_mapping->nrpages + inode->u.shmem_i.swapped); + (inode->i_mapping->nrpages + SHMEM_I(inode)->swapped); if (freed){ struct shmem_sb_info * info = >i_sb->u.shmem_sb; inode->i_blocks -= freed*BLOCKS_PER_PAGE; @@ -159,7 +159,7 @@ unsigned long index, start; unsigned long freed = 0; swp_entry_t **base, **ptr, **last; - struct shmem_inode_info * info = >u.shmem_i; + struct shmem_inode_info * info = SHMEM_I(inode); down(>sem); inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -206,7 +206,7 @@ struct shmem_sb_info *info = >i_sb->u.shmem_sb; spin_lock (_ilock); - list_del (>u.shmem_i.list); + list_del (_I(inode)->list); spin_unlock (_ilock); inode->i_size = 0; shmem_truncate (inode); @@ -239,7 +239,7 @@ goto out; inode = page->mapping->host; - info = >u.shmem_i; + info = SHMEM_I(inode); swap = __get_swap_page(2); error = -ENOMEM; if (!swap.val) @@ -407,7 +407,7 @@ page_cache_release(*ptr); } - info = >u.shmem_i; + info = SHMEM_I(inode); down (>sem); /* retest we may have slept */ @@ -415,7 +415,7 @@ if (inode->i_size < (loff_t) idx * PAGE_CACHE_SIZE) goto failed; - *ptr = shmem_getpage_locked(>u.shmem_i, inode, idx); + *ptr = shmem_getpage_locked(info, inode, idx); if (IS_ERR (*ptr)) goto failed; @@ -462,7 +462,7 @@ void shmem_lo
Re: tmpfs doesn't update free memory stats?
Hi Jacek, On Fri, 4 May 2001, Jacek Kopecky wrote: > I'm not in the list, please cc your replies to me. > After upgrading to 2.4.4 I started using tmpfs for /tmp and I > noticed a strange behavior: > > dd if=/dev/zero of=blah bs=1024 count=102400 > # increased my used swap space by approx. 100MiB (correct) > rm blah > # did not decrease it back > > Multiple retries showed what looked like a random behavior of > the used swap stats. Is this a correct behavior? Should the swap > stats be dismissed as 'unreliable'? I expected that when creating > a 100MiB file in memory it should increase the swap (or memory) > usage by cca 100MiB and that removing a file from tmpfs means > freeing the memory. It will be adjusted under memory pressure. At this time there is no way to release swap cached pages without the potential of deadlocks. This is not nice but the only short term solution and should not affect anything besides stats. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: tmpfs doesn't update free memory stats?
Hi Jacek, On Fri, 4 May 2001, Jacek Kopecky wrote: I'm not in the list, please cc your replies to me. After upgrading to 2.4.4 I started using tmpfs for /tmp and I noticed a strange behavior: dd if=/dev/zero of=blah bs=1024 count=102400 # increased my used swap space by approx. 100MiB (correct) rm blah # did not decrease it back Multiple retries showed what looked like a random behavior of the used swap stats. Is this a correct behavior? Should the swap stats be dismissed as 'unreliable'? I expected that when creating a 100MiB file in memory it should increase the swap (or memory) usage by cca 100MiB and that removing a file from tmpfs means freeing the memory. It will be adjusted under memory pressure. At this time there is no way to release swap cached pages without the potential of deadlocks. This is not nice but the only short term solution and should not affect anything besides stats. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] encapsulate shmem access to shmem_inode_info
Hi, On 24 Apr 2001, Christoph Rohland wrote: Hi Al, On Tue, 24 Apr 2001, Alexander Viro wrote: So yes, IMO having such patches available _is_ a good thing. And in 2.5 we definitely want them in the tree. If encapsulation part gets there during 2.4 and separate allocation is available for all of them it will be easier to do without PITA in process. OK I will do that for tmpfs soon. And I will do the symlink inlining with that patch. Here comes the patch to encapsulate all accesses to struct shmem_inode_info into a macro. It is now trivial to allocate the private part independently from the inode. Greetings Christoph P.S: The symlink inlining will come in a separate patch diff -uNr 2.4.4-mmap_write/include/linux/shmem_fs.h 2.4.4-mmap_write-SHMEM_I/include/linux/shmem_fs.h --- 2.4.4-mmap_write/include/linux/shmem_fs.h Tue May 1 20:02:00 2001 +++ 2.4.4-mmap_write-SHMEM_I/include/linux/shmem_fs.h Tue May 1 20:06:10 2001 @@ -18,14 +18,15 @@ } swp_entry_t; struct shmem_inode_info { - spinlock_t lock; - struct semaphore sem; - unsigned long max_index; - swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ - swp_entry_t **i_indirect; /* doubly indirect blocks */ - unsigned long swapped; - int locked; /* into memory */ + spinlock_t lock; + struct semaphore sem; + unsigned long max_index; + swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ + swp_entry_t **i_indirect; /* doubly indirect blocks */ + unsigned long swapped; + int locked; /* into memory */ struct list_head list; + struct inode *inode; }; struct shmem_sb_info { @@ -35,5 +36,7 @@ unsigned long free_inodes; /* How many are left for allocation */ spinlock_t stat_lock; }; + +#define SHMEM_I(inode) (&inode->u.shmem_i) #endif diff -uNr 2.4.4-mmap_write/ipc/shm.c 2.4.4-mmap_write-SHMEM_I/ipc/shm.c --- 2.4.4-mmap_write/ipc/shm.c Wed Apr 11 12:36:47 2001 +++ 2.4.4-mmap_write-SHMEM_I/ipc/shm.c Tue May 1 20:06:10 2001 @@ -348,6 +348,7 @@ static void shm_get_stat (unsigned long *rss, unsigned long *swp) 
{ + struct shmem_inode_info *info; int i; *rss = 0; @@ -361,10 +362,11 @@ if(shp == NULL) continue; inode = shp-shm_file-f_dentry-d_inode; - spin_lock (inode-u.shmem_i.lock); + info = SHMEM_I(inode); + spin_lock (info-lock); *rss += inode-i_mapping-nrpages; - *swp += inode-u.shmem_i.swapped; - spin_unlock (inode-u.shmem_i.lock); + *swp += info-swapped; + spin_unlock (info-lock); } } diff -uNr 2.4.4-mmap_write/mm/shmem.c 2.4.4-mmap_write-SHMEM_I/mm/shmem.c --- 2.4.4-mmap_write/mm/shmem.c Tue May 1 20:02:00 2001 +++ 2.4.4-mmap_write-SHMEM_I/mm/shmem.c Wed May 2 16:46:00 2001 @@ -73,7 +73,7 @@ unsigned long freed; freed = (inode-i_blocks/BLOCKS_PER_PAGE) - - (inode-i_mapping-nrpages + inode-u.shmem_i.swapped); + (inode-i_mapping-nrpages + SHMEM_I(inode)-swapped); if (freed){ struct shmem_sb_info * info = inode-i_sb-u.shmem_sb; inode-i_blocks -= freed*BLOCKS_PER_PAGE; @@ -159,7 +159,7 @@ unsigned long index, start; unsigned long freed = 0; swp_entry_t **base, **ptr, **last; - struct shmem_inode_info * info = inode-u.shmem_i; + struct shmem_inode_info * info = SHMEM_I(inode); down(info-sem); inode-i_ctime = inode-i_mtime = CURRENT_TIME; @@ -206,7 +206,7 @@ struct shmem_sb_info *info = inode-i_sb-u.shmem_sb; spin_lock (shmem_ilock); - list_del (inode-u.shmem_i.list); + list_del (SHMEM_I(inode)-list); spin_unlock (shmem_ilock); inode-i_size = 0; shmem_truncate (inode); @@ -239,7 +239,7 @@ goto out; inode = page-mapping-host; - info = inode-u.shmem_i; + info = SHMEM_I(inode); swap = __get_swap_page(2); error = -ENOMEM; if (!swap.val) @@ -407,7 +407,7 @@ page_cache_release(*ptr); } - info = inode-u.shmem_i; + info = SHMEM_I(inode); down (info-sem); /* retest we may have slept */ @@ -415,7 +415,7 @@ if (inode-i_size (loff_t) idx * PAGE_CACHE_SIZE) goto failed; - *ptr = shmem_getpage_locked(inode-u.shmem_i, inode, idx); + *ptr = shmem_getpage_locked(info, inode, idx); if (IS_ERR (*ptr)) goto failed; @@ -462,7 +462,7 @@ void shmem_lock(struct file * file, int lock) { struct 
inode * inode = file-f_dentry-d_inode; - struct shmem_inode_info
Re: [Patch] deadlock on write in tmpfs
Hi Stephen, On Tue, 1 May 2001, Stephen C. Tweedie wrote: > If the locking is for a completely different reason, then a > different semaphore is quite appropriate. In this case you're > trying to lock the shm internal info structures, which is quite > different from the sort of inode locking which the VFS tries to do > itself, so the new semaphore appears quite clean --- and definitely > needed. It's not the addition to the inode semaphore I do care about, but the addition to the spin lock which protects also the shmem internals. But you are probably right: It only protects the on-the-fly pages between page cache and swap cache. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Patch] deadlock on write in tmpfs
Hi Stephen, On Tue, 1 May 2001, Stephen C. Tweedie wrote: If the locking is for a completely different reason, then a different semaphore is quite appropriate. In this case you're trying to lock the shm internal info structures, which is quite different from the sort of inode locking which the VFS tries to do itself, so the new semaphore appears quite clean --- and definitely needed. It's not the addition to the inode semaphore I do care about, but the addition to the spin lock which protects also the shmem internals. But you are probably right: It only protects the on-the-fly pages between page cache and swap cache. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] deadlock on write in tmpfs
Hi Linus and Stephen, tmpfs deadlocks when writing into a file from a mapping of the same file. The problem is the following: - shmem_file_write may call shmem_nopage and calls shmem_getpage_locked later, - shmem_nopage calls shmem_getpage_locked - shmem_getpage_locked may call shmem_writepage on page allocation - shmem_file_write holds the inode semaphore - shmem_getpage_locked prevents races against shmem_writepage with the shmem spinlock - shmem_getpage_locked needs serialization against itself and shmem_truncate The last was done with the inode semaphore, which deadlocks with shmem_file_write. So I see two choices: 1) Do not serialise the whole of shmem_getpage_locked but protect critical paths with the spinlock and do retries after sleeps 2) Add another semaphore to serialize shmem_getpage_locked and shmem_truncate I tried some time to get 1) done but the retry logic became way too complicated. So the attached patch implements 2). I still think it's ugly to add another semaphore, but it works. 
Greetings Christoph diff -uNr 2.4.4/include/linux/shmem_fs.h c/include/linux/shmem_fs.h --- 2.4.4/include/linux/shmem_fs.h Sun Apr 29 20:33:00 2001 +++ c/include/linux/shmem_fs.h Sun Apr 29 22:43:56 2001 @@ -19,6 +19,7 @@ struct shmem_inode_info { spinlock_t lock; + struct semaphore sem; unsigned long max_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ swp_entry_t **i_indirect; /* doubly indirect blocks */ diff -uNr 2.4.4/mm/shmem.c c/mm/shmem.c --- 2.4.4/mm/shmem.cMon Apr 30 09:45:39 2001 +++ c/mm/shmem.cTue May 1 15:15:38 2001 @@ -161,6 +161,7 @@ swp_entry_t **base, **ptr, **last; struct shmem_inode_info * info = >u.shmem_i; + down(>sem); inode->i_ctime = inode->i_mtime = CURRENT_TIME; spin_lock (>lock); index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -197,6 +198,7 @@ info->swapped -= freed; shmem_recalc_inode(inode); spin_unlock (>lock); + up(>sem); } static void shmem_delete_inode(struct inode * inode) @@ -281,15 +283,12 @@ * still need to guard against racing with shm_writepage(), which might * be trying to move the page to the swap cache as we run. 
*/ -static struct page * shmem_getpage_locked(struct inode * inode, unsigned long idx) +static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode +* inode, unsigned long idx) { struct address_space * mapping = inode->i_mapping; - struct shmem_inode_info *info; struct page * page; swp_entry_t *entry; - info = >u.shmem_i; - repeat: page = find_lock_page(mapping, idx); if (page) @@ -393,6 +392,7 @@ static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr) { + struct shmem_inode_info *info; struct address_space * mapping = inode->i_mapping; int error; @@ -407,27 +407,28 @@ page_cache_release(*ptr); } - down (>i_sem); - /* retest we may have slept */ + info = >u.shmem_i; + down (>sem); + /* retest we may have slept */ + + *ptr = ERR_PTR(-EFAULT); if (inode->i_size < (loff_t) idx * PAGE_CACHE_SIZE) - goto sigbus; - *ptr = shmem_getpage_locked(inode, idx); + goto failed; + + *ptr = shmem_getpage_locked(>u.shmem_i, inode, idx); if (IS_ERR (*ptr)) goto failed; + UnlockPage(*ptr); - up (>i_sem); + up (>sem); return 0; failed: - up (>i_sem); + up (>sem); error = PTR_ERR(*ptr); - *ptr = NOPAGE_OOM; - if (error != -EFBIG) - *ptr = NOPAGE_SIGBUS; - return error; -sigbus: - up (>i_sem); *ptr = NOPAGE_SIGBUS; - return -EFAULT; + if (error == -ENOMEM) + *ptr = NOPAGE_OOM; + return error; } struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int no_share) @@ -500,6 +501,7 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev) { struct inode * inode; + struct shmem_inode_info *info; spin_lock (>u.shmem_sb.stat_lock); if (!sb->u.shmem_sb.free_inodes) { @@ -519,7 +521,9 @@ inode->i_rdev = to_kdev_t(dev); inode->i_mapping->a_ops = _aops; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - spin_lock_init (>u.shmem_i.lock); + info = >u.shmem_i; + spin_lock_init (>lock); + sema_init (>sem, 1); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -549,6 
+553,7 @@ shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) { struct inode
Re: 2.4 and 2GB swap partition limit
Hi Alan, On Mon, 30 Apr 2001, Alan Cox wrote: >> paging in just released 2.4.4, but in previous kernel, a page that >> was paged-out, reserves its place in swap even if it is paged-in >> again, so once you have paged-out all your ram at least once, you >> can't get any more memory, even if swap is 'empty'. > > This is a bug in the 2.4 VM, nothing more or less. It and the > horrible bounce buffer bugs are forcing large machines to remain on > 2.2. So it has to get fixed Yes, it is a bug, and thanks for stating this so clearly. But a lot of the big servers can go to 2.4 because SYSV shm/shm fs/tmpfs will reclaim the swap entries on swapin. So big databases and application servers which rely on shm are not affected. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.4 and 2GB swap partition limit
Hi Alan, On Mon, 30 Apr 2001, Alan Cox wrote: paging in just released 2.4.4, but in previous kernel, a page that was paged-out, reserves its place in swap even if it is paged-in again, so once you have paged-out all your ram at least once, you can't get any more memory, even if swap is 'empty'. This is a bug in the 2.4 VM, nothing more or less. It and the horrible bounce buffer bugs are forcing large machines to remain on 2.2. So it has to get fixed Yes, it is a bug, and thanks for stating this so clearly. But a lot of the big servers can go to 2.4 because SYSV shm/shm fs/tmpfs will reclaim the swap entries on swapin. So big databases and application servers which rely on shm are not affected. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] deadlock on write in tmpfs
Hi Linus and Stephen, tmpfs deadlocks when writing into a file from a mapping of the same file. The problem is the following: - shmem_file_write may call shmem_nopage and calls shmem_getpage_locked later, - shmem_nopage calls shmem_getpage_locked - shmem_getpage_locked may call shmem_writepage on page allocation - shmem_file_write holds the inode semaphore - shmem_getpage_locked prevents races against shmem_writepage with the shmem spinlock - shmem_getpage_locked needs serialization against itself and shmem_truncate The last was done with the inode semaphore, which deadlocks with shmem_file_write. So I see two choices: 1) Do not serialise the whole of shmem_getpage_locked but protect critical paths with the spinlock and do retries after sleeps 2) Add another semaphore to serialize shmem_getpage_locked and shmem_truncate I tried some time to get 1) done but the retry logic became way too complicated. So the attached patch implements 2). I still think it's ugly to add another semaphore, but it works. 
Greetings Christoph diff -uNr 2.4.4/include/linux/shmem_fs.h c/include/linux/shmem_fs.h --- 2.4.4/include/linux/shmem_fs.h Sun Apr 29 20:33:00 2001 +++ c/include/linux/shmem_fs.h Sun Apr 29 22:43:56 2001 @@ -19,6 +19,7 @@ struct shmem_inode_info { spinlock_t lock; + struct semaphore sem; unsigned long max_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ swp_entry_t **i_indirect; /* doubly indirect blocks */ diff -uNr 2.4.4/mm/shmem.c c/mm/shmem.c --- 2.4.4/mm/shmem.cMon Apr 30 09:45:39 2001 +++ c/mm/shmem.cTue May 1 15:15:38 2001 @@ -161,6 +161,7 @@ swp_entry_t **base, **ptr, **last; struct shmem_inode_info * info = inode-u.shmem_i; + down(info-sem); inode-i_ctime = inode-i_mtime = CURRENT_TIME; spin_lock (info-lock); index = (inode-i_size + PAGE_CACHE_SIZE - 1) PAGE_CACHE_SHIFT; @@ -197,6 +198,7 @@ info-swapped -= freed; shmem_recalc_inode(inode); spin_unlock (info-lock); + up(info-sem); } static void shmem_delete_inode(struct inode * inode) @@ -281,15 +283,12 @@ * still need to guard against racing with shm_writepage(), which might * be trying to move the page to the swap cache as we run. 
*/ -static struct page * shmem_getpage_locked(struct inode * inode, unsigned long idx) +static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode +* inode, unsigned long idx) { struct address_space * mapping = inode-i_mapping; - struct shmem_inode_info *info; struct page * page; swp_entry_t *entry; - info = inode-u.shmem_i; - repeat: page = find_lock_page(mapping, idx); if (page) @@ -393,6 +392,7 @@ static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr) { + struct shmem_inode_info *info; struct address_space * mapping = inode-i_mapping; int error; @@ -407,27 +407,28 @@ page_cache_release(*ptr); } - down (inode-i_sem); - /* retest we may have slept */ + info = inode-u.shmem_i; + down (info-sem); + /* retest we may have slept */ + + *ptr = ERR_PTR(-EFAULT); if (inode-i_size (loff_t) idx * PAGE_CACHE_SIZE) - goto sigbus; - *ptr = shmem_getpage_locked(inode, idx); + goto failed; + + *ptr = shmem_getpage_locked(inode-u.shmem_i, inode, idx); if (IS_ERR (*ptr)) goto failed; + UnlockPage(*ptr); - up (inode-i_sem); + up (info-sem); return 0; failed: - up (inode-i_sem); + up (info-sem); error = PTR_ERR(*ptr); - *ptr = NOPAGE_OOM; - if (error != -EFBIG) - *ptr = NOPAGE_SIGBUS; - return error; -sigbus: - up (inode-i_sem); *ptr = NOPAGE_SIGBUS; - return -EFAULT; + if (error == -ENOMEM) + *ptr = NOPAGE_OOM; + return error; } struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int no_share) @@ -500,6 +501,7 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev) { struct inode * inode; + struct shmem_inode_info *info; spin_lock (sb-u.shmem_sb.stat_lock); if (!sb-u.shmem_sb.free_inodes) { @@ -519,7 +521,9 @@ inode-i_rdev = to_kdev_t(dev); inode-i_mapping-a_ops = shmem_aops; inode-i_atime = inode-i_mtime = inode-i_ctime = CURRENT_TIME; - spin_lock_init (inode-u.shmem_i.lock); + info = inode-u.shmem_i; + spin_lock_init (info-lock); + sema_init (info-sem, 1); switch (mode S_IFMT) 
{ default: init_special_inode(inode, mode, dev); @@ -549,6 +553,7 @@ shmem_file_write(struct
Re: ramdisk/tmpfs/ramfs/memfs ?
Hi Padraig, On Fri, 27 Apr 2001, Padraig Brady wrote: > I don't have swap so don't need tmpfs, but could probably > use it anyway without a backing store? Yes, it does not need backing store. > Anyway why was ramfs created if tmpfs existed, unless tmpfs requires > backing store? They both seem to have been written around the same > time? - shm fs was written as a specialized fs to implement POSIX shared memory based on SYSV shm. - ramfs was introduced shortly after shm fs and was meant as a programming example for a minimal virtual filesystem. - Later shm fs was redone to use the same methods like ramfs but still was only useable for shared memory. - After the release of 2.4.0, I extended shm fs to support read/write and thus be tmpfs and since then it can replace ramfs. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: ramdisk/tmpfs/ramfs/memfs ?
On Fri, 27 Apr 2001, [EMAIL PROTECTED] wrote: >> > tmpfs is basically ramfs with limits. >> > >> >> ... and swappable. >> >> -hpa > > Hmmm and what's shmfs? Predecessor of tmpfs? Yes. > I even can't remember which one I use for /tmp ;-) You can mount tmpfs also with type "shm" for compatibility. Type "shm" will be marked as obsolete in 2.5 Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: ramdisk/tmpfs/ramfs/memfs ?
Hi Padraig, On Thu, 26 Apr 2001, Padraig Brady wrote: > 2. Is tmpfs basically swap and /tmp together in a ramdisk? >The advantage being you need to reserve less RAM for both >together than separately? tmpfs is ramfs+swap+limits. It is not using ramdisks and is not related to them. > 3. If I've no backing store (harddisk?) is there any advantage >of using tmpfs instead of ramfs? Also does tmpfs need a >backing store? Probably yes, since you spare a little bit of kernel memory. Most of tmpfs is unconditionally in the kernel for shared mappings. So the actual CONFIG_TMPFS only adds some small functions to the kernel to export this to user space. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: ramdisk/tmpfs/ramfs/memfs ?
Hi Padraig, On Thu, 26 Apr 2001, Padraig Brady wrote: 2. Is tmpfs basically swap and /tmp together in a ramdisk? The advantage being you need to reserve less RAM for both together than separately? tmpfs is ramfs+swap+limits. It is not using ramdisks and is not related to them. 3. If I've no backing store (harddisk?) is there any advantage of using tmpfs instead of ramfs? Also does tmpfs need a backing store? Probably yes, since you spare a little bit of kernel memory. Most of tmpfs is unconditionally in the kernel for shared mappings. So the actual CONFIG_TMPFS only adds some small functions to the kernel to export this to user space. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: ramdisk/tmpfs/ramfs/memfs ?
On Fri, 27 Apr 2001, [EMAIL PROTECTED] wrote: tmpfs is basically ramfs with limits. ... and swappable. -hpa Hmmm and what's shmfs? Predecessor of tmpfs? Yes. I even can't remember which one I use for /tmp ;-) You can mount tmpfs also with type shm for compatibility. Type shm will be marked as obsolete in 2.5 Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: ramdisk/tmpfs/ramfs/memfs ?
Hi Padraig, On Fri, 27 Apr 2001, Padraig Brady wrote: I don't have swap so don't need tmpfs, but could probably use it anyway without a backing store? Yes, it does not need backing store. Anyway why was ramfs created if tmpfs existed, unless tmpfs requires backing store? They both seem to have been written around the same time? - shm fs was written as a specialized fs to implement POSIX shared memory based on SYSV shm. - ramfs was introduced shortly after shm fs and was meant as a programming example for a minimal virtual filesystem. - Later shm fs was redone to use the same methods like ramfs but still was only useable for shared memory. - After the release of 2.4.0, I extended shm fs to support read/write and thus be tmpfs and since then it can replace ramfs. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: hundreds of mount --bind mountpoints?
Hi Andreas, On Tue, 24 Apr 2001, Andreas Dilger wrote: > On the other hand, sockets and shmem are both relatively large... shmem is only large because the union is large. I introduced the direct swap array of size SHMEM_NR_DIRECT simply to take advantage of the union. We can decrease SHMEM_NR_DIRECT very easily. I am thinking about 1 or 5 which would mean that we allocate an indirect block for files bigger than 4k or 20k respectively. The shmem_inode_info would then be 8 or 12 words. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: shm_open doesn't work (fix maybe).
Hi, On Tue, 24 Apr 2001, Jakub Jelinek wrote: > On Tue, Apr 24, 2001 at 11:46:20AM -0500, Tom Brusehaver (N-Sysdyne > Corporation) wrote: >> >> I have been chasing all around trying to find out why >> shm_open always returns ENOSYS. It is implemented >> in glibc-2.2.2, and seems the 2.4.3 kernel knows about >> shmfs. >> >> It seems the file linux/mm/shmem.c has: >> #define SHMEM_MAGIC 0x01021994 >> >> And the glibc-2.2.2/sysdeps/unix/sysv/linux/linux_fsinfo.h has: >> #define SHMFS_SUPER_MAGIC 0x02011994 >> >> Well, which is correct? > > Update your glibc, 2.2.3pre* matches 2.4.x kernel: > > 2001-03-03 Ulrich Drepper <[EMAIL PROTECTED]> > > * sysdeps/unix/sysv/linux/linux_fsinfo.h (SHMFS_SUPER_MAGIC): > Update for real 2.4 kernels. Yes, and I apologize to Ulrich that the changed number slipped through to the official kernel. My fault. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: shm_open doesn't work (fix maybe).
Hi, On Tue, 24 Apr 2001, Jakub Jelinek wrote: On Tue, Apr 24, 2001 at 11:46:20AM -0500, Tom Brusehaver (N-Sysdyne Corporation) wrote: I have been chasing all around trying to find out why shm_open always returns ENOSYS. It is implemented in glibc-2.2.2, and seems the 2.4.3 kernel knows about shmfs. It seems the file linux/mm/shmem.c has: #define SHMEM_MAGIC 0x01021994 And the glibc-2.2.2/sysdeps/unix/sysv/linux/linux_fsinfo.h has: #define SHMFS_SUPER_MAGIC 0x02011994 Well, which is correct? Update your glibc, 2.2.3pre* matches 2.4.x kernel: 2001-03-03 Ulrich Drepper [EMAIL PROTECTED] * sysdeps/unix/sysv/linux/linux_fsinfo.h (SHMFS_SUPER_MAGIC): Update for real 2.4 kernels. Yes, and I apologize to Ulrich that the changed number slipped through to the official kernel. My fault. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: hundreds of mount --bind mountpoints?
Hi Andreas, On Tue, 24 Apr 2001, Andreas Dilger wrote: On the other hand, sockets and shmem are both relatively large... shmem is only large because the union is large. I introduced the direct swap array of size SHMEM_NR_DIRECT simply to take advantage of the union. We can decrease SHMEM_NR_DIRECT very easily. I am thinking about 1 or 5 which would mean that we allocate an indirect block for files bigger than 4k or 20k respectively. The shmem_inode_info would then be 8 or 12 words. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: hundreds of mount --bind mountpoints?
Hi Al, On Tue, 24 Apr 2001, Alexander Viro wrote: > So yes, IMO having such patches available _is_ a good thing. And in > 2.5 we definitely want them in the tree. If encapsulation part gets > there during 2.4 and separate allocation is available for all of > them it will be easier to do without PITA in process. OK I will do that for tmpfs soon. And I will do the symlink inlining with that patch. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: hundreds of mount --bind mountpoints?
Hi Al, On Tue, 24 Apr 2001, Alexander Viro wrote: >> Half an hour? If it takes more than about 5 minutes for JFFS2 I'd >> be very surprised. > > What's stopping you? > You _are_ JFFS maintainer, aren't you? So is this the start to change all filesystems in 2.4? I am not sure we should do that. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: hundreds of mount --bind mountpoints?
Hi Alexander, On Mon, 23 Apr 2001, Alexander Viro wrote: >> I like it. ext2fs does the same, so there should be no VFS >> hassles involved. Al? > > We should get ext2 and friends to move the sucker _out_ of struct > inode. As it is, sizeof(struct inode) is way too large. This is 2.5 > stuff, but it really has to be done. More filesystems adding stuff > into the union is a Bad Thing(tm). If you want to allocate space - > allocate it yourself; ->clear_inode() is the right place for freeing > it. Yes, I agree that the union is way too large and I did not plan to extend it but simply use the size it has. if (strlen(path) < sizeof(inode->u)) inline the symlink; else put it into the page cache; So if somebody really cleans up the private inode structures it will not trigger that often any more and we perhaps have to rethink the idea. But also if we use struct shmem_inode_info which is 92 bytes right now we would inline all symlinks on my machine. If we reduced its size to 32 (which could be easily done) we would still inline 6642 out of 9317 symlinks on my machine. That's not bad. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: hundreds of mount --bind mountpoints?
Hi Alexander, On Mon, 23 Apr 2001, Alexander Viro wrote: I like it. ext2fs does the same, so there should be no VFS hassles involved. Al? We should get ext2 and friends to move the sucker _out_ of struct inode. As it is, sizeof(struct inode) is way too large. This is 2.5 stuff, but it really has to be done. More filesystems adding stuff into the union is a Bad Thing(tm). If you want to allocates space - allocate if yourself; ->clear_inode() is the right place for freeing it. Yes, I agree that the union is way too large and I did not plan to extend it but simply use the size it has. if (strlen(path) < sizeof(inode->u)) inline the symlink; else put it into the page cache; So if somebody really cleans up the private inode structures it will not trigger that often any more and we perhaps have to rethink the idea. But also if we use struct shmem_inode_info which is 92 bytes right now we would inline all symlinks on my machine. If we reduced its size to 32 (which could be easily done) we would still inline 6642 out of 9317 symlinks on my machine. That's not bad. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: hundreds of mount --bind mountpoints?
Hi Al, On Tue, 24 Apr 2001, Alexander Viro wrote: Half an hour? If it takes more than about 5 minutes for JFFS2 I'd be very surprised. <tone polite> What's stopping you? </tone> You _are_ JFFS maintainer, aren't you? So is this the start to change all filesystems in 2.4? I am not sure we should do that. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: hundreds of mount --bind mountpoints?
Hi Al, On Tue, 24 Apr 2001, Alexander Viro wrote: So yes, IMO having such patches available _is_ a good thing. And in 2.5 we definitely want them in the tree. If encapsulation part gets there during 2.4 and separate allocation is available for all of them it will be easier to do without PITA in process. OK I will do that for tmpfs soon. And I will do the symlink inlining with that patch. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: hundreds of mount --bind mountpoints?
Hi Ingo, On Mon, 23 Apr 2001, Ingo Oeser wrote: > On Mon, Apr 23, 2001 at 01:43:27PM +0200, Christoph Rohland wrote: >> On Sun, 22 Apr 2001, David L. Parsley wrote: >> > attach packages inside it. Since symlinks in a tmpfs filesystem >> > cost 4k each (ouch!), I'm considering using mount --bind for >> > everything. >> >> What about fixing tmpfs instead? > > The question is: How? If you do it like ramfs, you cannot swap > these symlinks and this is effectively a mlock(symlink) operation > allowed for normal users. -> BAD! How about storing it into the inode structure if it fits into the fs-private union? If it is too big we allocate the page as we do it now. The union has 192 bytes. This should be sufficient for most cases. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: hundreds of mount --bind mountpoints?
Hi David, On Sun, 22 Apr 2001, David L. Parsley wrote: > I'm still working on a packaging system for diskless > (quasi-embedded) devices. The root filesystem is all tmpfs, and I > attach packages inside it. Since symlinks in a tmpfs filesystem > cost 4k each (ouch!), I'm considering using mount --bind for > everything. What about fixing tmpfs instead? Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: hundreds of mount --bind mountpoints?
Hi David, On Sun, 22 Apr 2001, David L. Parsley wrote: I'm still working on a packaging system for diskless (quasi-embedded) devices. The root filesystem is all tmpfs, and I attach packages inside it. Since symlinks in a tmpfs filesystem cost 4k each (ouch!), I'm considering using mount --bind for everything. What about fixing tmpfs instead? Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: hundreds of mount --bind mountpoints?
Hi Ingo, On Mon, 23 Apr 2001, Ingo Oeser wrote: On Mon, Apr 23, 2001 at 01:43:27PM +0200, Christoph Rohland wrote: On Sun, 22 Apr 2001, David L. Parsley wrote: attach packages inside it. Since symlinks in a tmpfs filesystem cost 4k each (ouch!), I'm considering using mount --bind for everything. What about fixing tmpfs instead? The question is: How? If you do it like ramfs, you cannot swap these symlinks and this is effectively a mlock(symlink) operation allowed for normal users. -> BAD! How about storing it into the inode structure if it fits into the fs-private union? If it is too big we allocate the page as we do it now. The union has 192 bytes. This should be sufficient for most cases. Greetings Christoph - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [NEED TESTERS] remove swapin_readahead Re: shmem_getpage_locked() / swapin_readahead() race in 2.4.4-pre3
Hi Stephen, On Tue, 17 Apr 2001, Stephen C. Tweedie wrote: > I don't see the problem. shmem_getpage_locked appears to back off > correctly if it encounters a swap-cached page already existing if > swapin_readahead has installed the page first, at least with the > code in 2.4.3-ac5. But the swap count can be increased by anybody without having the page lock. So the check triggers and is bogus. See my old patch. > There *does* appear to be a race, but it's swapin_readahead racing > with shmem_writepage. That code does not search for an existing > entry in the swap cache when it decides to move a shmem page to > swap, so we can install the page twice and end up doing a lookup on > the wrong physical page if there is swap readahead going on. I cannot follow you here. How can we have a swap cache entry if there is no swap entry. . . . Oh stop you mean swapin_readahead does swap in some totally bogus page into the swap cache after we did __get_swap_page? I never thought about that! > To fix that, shmem_writepage needs to do a swap cache lookup and > lock before installing the new page --- it should probably just copy > the new page into the old one if it finds one already there. OK I will look into that. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [NEED TESTERS] remove swapin_readahead Re: shmem_getpage_locked() / swapin_readahead() race in 2.4.4-pre3
Hi Stephen, On Tue, 17 Apr 2001, Stephen C. Tweedie wrote: I don't see the problem. shmem_getpage_locked appears to back off correctly if it encounters a swap-cached page already existing if swapin_readahead has installed the page first, at least with the code in 2.4.3-ac5. But the swap count can be increased by anybody without having the page lock. So the check triggers and is bogus. See my old patch. There *does* appear to be a race, but it's swapin_readahead racing with shmem_writepage. That code does not search for an existing entry in the swap cache when it decides to move a shmem page to swap, so we can install the page twice and end up doing a lookup on the wrong physical page if there is swap readahead going on. I cannot follow you here. How can we have a swap cache entry if there is no swap entry. . . . Oh stop you mean swapin_readahead does swap in some totally bogus page into the swap cache after we did __get_swap_page? I never thought about that! To fix that, shmem_writepage needs to do a swap cache lookup and lock before installing the new page --- it should probably just copy the new page into the old one if it finds one already there. OK I will look into that. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: shmem_getpage_locked() / swapin_readahead() race in 2.4.4-pre3
Hi, On Sat, 14 Apr 2001, Marcelo Tosatti wrote: > There is a nasty race between shmem_getpage_locked() and > swapin_readahead() with the new shmem code (introduced in 2.4.3-ac3 > and merged in the main tree in 2.4.4-pre3): > > shmem_getpage_locked() finds a page in the swapcache and moves it to > the pagecache as an shmem page, freeing the swapcache and the swap > map entry for this page. (which causes a BUG() in mm/shmem.c:353 > since the swap map entry is being used) > > In the meanwhile, swapin_readahead() is allocating a page and adding > it to the swapcache. Oh, I was just chasing this also. > I don't see any clean fix for this one. I think the actual check for swap_count is not necessary: If swapin_readahead allocates a new swap_cache page for the entry, that's not a real bug. On memory pressure this page will be reclaimed. Actually we have to make shmem much more unfriendly to the swap cache to make it correct: I think we have to drop the whole drop swap cache pages on truncate logic since it uses lookup_swap_cache and delete_from_swap_cache which both lock the page, while holding a spinlock :-( The appended patch implements both changes and relies on the page stealer to shrink the swap cache. It also integrates fixes which Marcelo did send earlier. Greetings Christoph --- 2.4.4-pre3/mm/shmem.c Sat Apr 14 11:12:54 2001 +++ u2.4.3/mm/shmem.c Sun Apr 15 13:45:58 2001 @@ -123,10 +123,19 @@ entry = *ptr; *ptr = (swp_entry_t){0}; freed++; +#if 0 +/* +* This does not work since it may sleep while holding +* a spinlock +* +* We rely on the page stealer to free up the +* allocated swap space later +*/ if ((page = lookup_swap_cache(entry)) != NULL) { delete_from_swap_cache(page); page_cache_release(page); } +#endif swap_free (entry); } return freed; @@ -236,8 +245,10 @@ /* Only move to the swap cache if there are no other users of * the page. 
*/ - if (atomic_read(&page->count) > 2) - goto out; + if (atomic_read(&page->count) > 2){ + set_page_dirty(page); + goto out; + } inode = page->mapping->host; info = &inode->u.shmem_i; @@ -348,9 +359,6 @@ if (TryLockPage(page)) goto wait_retry; - if (swap_count(page) > 2) - BUG(); - swap_free(*entry); *entry = (swp_entry_t) {0}; delete_from_swap_cache_nolock(page); @@ -432,6 +440,7 @@ *ptr = NOPAGE_SIGBUS; return error; sigbus: + up (&inode->i_sem); *ptr = NOPAGE_SIGBUS; return -EFAULT; }
Re: shmem_getpage_locked() / swapin_readahead() race in 2.4.4-pre3
Hi, On Sat, 14 Apr 2001, Marcelo Tosatti wrote: There is a nasty race between shmem_getpage_locked() and swapin_readahead() with the new shmem code (introduced in 2.4.3-ac3 and merged in the main tree in 2.4.4-pre3): shmem_getpage_locked() finds a page in the swapcache and moves it to the pagecache as an shmem page, freeing the swapcache and the swap map entry for this page. (which causes a BUG() in mm/shmem.c:353 since the swap map entry is being used) In the meanwhile, swapin_readahead() is allocating a page and adding it to the swapcache. Oh, I was just chasing this also. I don't see any clean fix for this one. I think the actual check for swap_count is not necessary: If swapin_readahead allocates a new swap_cache page for the entry, that's not a real bug. On memory pressure this page will be reclaimed. Actually we have to make shmem much more unfriendly to the swap cache to make it correct: I think we have to drop the whole drop swap cache pages on truncate logic since it uses lookup_swap_cache and delete_from_swap_cache which both lock the page, while holding a spinlock :-( The appended patch implements both changes and relies on the page stealer to shrink the swap cache. It also integrates fixes which Marcelo did send earlier. Greetings Christoph --- 2.4.4-pre3/mm/shmem.c Sat Apr 14 11:12:54 2001 +++ u2.4.3/mm/shmem.c Sun Apr 15 13:45:58 2001 @@ -123,10 +123,19 @@ entry = *ptr; *ptr = (swp_entry_t){0}; freed++; +#if 0 +/* +* This does not work since it may sleep while holding +* a spinlock +* +* We rely on the page stealer to free up the +* allocated swap space later +*/ if ((page = lookup_swap_cache(entry)) != NULL) { delete_from_swap_cache(page); page_cache_release(page); } +#endif swap_free (entry); } return freed; @@ -236,8 +245,10 @@ /* Only move to the swap cache if there are no other users of * the page. 
*/ - if (atomic_read(&page->count) > 2) - goto out; + if (atomic_read(&page->count) > 2){ + set_page_dirty(page); + goto out; + } inode = page->mapping->host; info = &inode->u.shmem_i; @@ -348,9 +359,6 @@ if (TryLockPage(page)) goto wait_retry; - if (swap_count(page) > 2) - BUG(); - swap_free(*entry); *entry = (swp_entry_t) {0}; delete_from_swap_cache_nolock(page); @@ -432,6 +440,7 @@ *ptr = NOPAGE_SIGBUS; return error; sigbus: + up (&inode->i_sem); *ptr = NOPAGE_SIGBUS; return -EFAULT; }
Re: 2.4.3-ac2 -- How do I determine if shm is being used?
Hi Miles, On Sat, 07 Apr 2001, Miles Lane wrote: > I have mounted: > > none on /var/shm type shm (rw) Not necessary any more. > tmpfs on /dev/shm type tmpfs (rw) Also not necessary, but recommended for POSIX shm. BTW it will not work with Linus' kernel. Type "shm" is supported by both versions. > X Error of failed request: BadValue (integer parameter out of range > for operation) > Major opcode of failed request: 146 (MIT-SHM) > Minor opcode of failed request: 3 (X_ShmPutImage) > Value in failed request: 0x161 > Serial number of failed request: 35107 > Current serial number in output stream: 35111 Unfortunately this does not tell what it wanted to do. > I'd like to check to make sure that shm is actually accessible > to my programs. Is there any easy way to do this? ipcs should be your friend. Especially 'ipcs -lm'. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.4.3-ac2 -- How do I determine if shm is being used?
Hi Miles, On Sat, 07 Apr 2001, Miles Lane wrote: I have mounted: none on /var/shm type shm (rw) Not necessary any more. tmpfs on /dev/shm type tmpfs (rw) Also not necessary, but recommended for POSIX shm. BTW it will not work with Linus' kernel. Type "shm" is supported by both versions. X Error of failed request: BadValue (integer parameter out of range for operation) Major opcode of failed request: 146 (MIT-SHM) Minor opcode of failed request: 3 (X_ShmPutImage) Value in failed request: 0x161 Serial number of failed request: 35107 Current serial number in output stream: 35111 Unfortunately this does not tell what it wanted to do. I'd like to check to make sure that shm is actually accessible to my programs. Is there any easy way to do this? ipcs should be your friend. Especially 'ipcs -lm'. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: tmpfs in 2.4.3 and AC
On Fri, 30 Mar 2001, [EMAIL PROTECTED] wrote: > tmpfs (or shmfs or whatever name you like) is still different in > official series (2.4.3) and in ac series. Its a kick in the ass for > multiboot, as offcial 2.4.3 does not recognise 'tmpfs' in fstab: > > shmfs /dev/shmtmpfs ... Use type shm. It works in both versions. > Any reason, or is because it has been forgotten ? Alan picked up the tmpfs extensions. Linus didn't. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: tmpfs in 2.4.3 and AC
On Fri, 30 Mar 2001, [EMAIL PROTECTED] wrote: tmpfs (or shmfs or whatever name you like) is still different in official series (2.4.3) and in ac series. Its a kick in the ass for multiboot, as offcial 2.4.3 does not recognise 'tmpfs' in fstab: shmfs /dev/shmtmpfs ... Use type shm. It works in both versions. Any reason, or is because it has been forgotten ? Alan picked up the tmpfs extensions. Linus didn't. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: tmpfs: a way to get your system down
Hi Alex, On Sat, 24 Mar 2001, Alex Riesen wrote: > just hit by tmpfs on 2.4.2-ac20 > > mount -t tmpfs mnt > dd if=/dev/zero mnt/tmpfile > > resulted in hardly slowed system and lockup, > and not in "No space left on device", as expected. Use mount option "size". The default is unlimited... Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: tmpfs: a way to get your system down
Hi Alex, On Sat, 24 Mar 2001, Alex Riesen wrote: just hit by tmpfs on 2.4.2-ac20 mount -t tmpfs mnt dd if=/dev/zero mnt/tmpfile resulted in hardly slowed system and lockup, and not in "No space left on device", as expected. Use mount option "size". The default is unlimited... Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.4 and 2GB swap partition limit
Hi Matt, On Sun, 4 Mar 2001, Matt Domsch wrote: > My concern is that if there continues to be a 2GB swap > partition/file size limitation, and you can have (as currently > #defined) 8 swap partitions, you're limited to 16GB swap, which then > follows a max of 8GB RAM. We'd like to sell servers with 32GB or > 64GB RAM to customers who request such for their applications. Such > customers generally have no problem purchasing additional disks to > be used for swap, likely on a hardware RAID controller. I did think about that too and I also think the 2GB limit is not appropriate for the big servers. But I do not believe that you need so much swap on these machines. If you drive a 32 GB machine so heavily into swap it is more busy finding the pages to swap than doing anything really interesting. (At least that's my experience) BTW often these big servers run databases and application servers which have most of their memory in shared memory. Shared memory does free the swap entries on swapin. (I thought about changing that but as long as we have no garbage collection for idle swap entries I will not do it) On any loaded server you have to check the swap space requirements regularly and adjust to your needs. But to setup more than let's say 8GB swap is a waste of resource IMHO. > We've also seen (anecdotal evidence here) cases where a kernel > panics, which we believe may have to do with having 0 < swap < 2x > RAM. We're investigating further. That would be a kernel bug which should be fixed. The kernel should handle oom/oos. >> Actually the deal is: either use enough swap (about 2x RAM) or use >> none at all. > > If swap space isn't required in all cases, great! We'll encourage > the use of swap files as needed, rather than swap partitions. But, > if instead you *require* swap = 2x RAM, then the 2GB swap size > limitation must go. No it is not strictly required. 
But still the 2GB limit is annoying and together with the arch-independent maximum number of swap partitions/files it is pretty dumb. So I would propose to first make a small patch to make MAX_SWAPFILES arch-dependent and bigger. (x86 would allow a much higher MAX_SWAPFILES) For 2.5 we could perhaps think about a new swapfile layout which allows bigger partitions. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.4 and 2GB swap partition limit
Hi Matt, On Sun, 4 Mar 2001, Matt Domsch wrote: My concern is that if there continues to be a 2GB swap partition/file size limitation, and you can have (as currently #defined) 8 swap partitions, you're limited to 16GB swap, which then follows a max of 8GB RAM. We'd like to sell servers with 32GB or 64GB RAM to customers who request such for their applications. Such customers generally have no problem purchasing additional disks to be used for swap, likely on a hardware RAID controller. I did think about that too and I also think the 2GB limit is not appropriate for the big servers. But I do not believe that you need so much swap on these machines. If you drive a 32 GB machine so heavily into swap it is more busy finding the pages to swap than doing anything really interesting. (At least that's my experience) BTW often these big servers run databases and application servers which have most of their memory in shared memory. Shared memory does free the swap entries on swapin. (I thought about changing that but as long as we have no garbage collection for idle swap entries I will not do it) On any loaded server you have to check the swap space requirements regularly and adjust to your needs. But to setup more than let's say 8GB swap is a waste of resource IMHO. We've also seen (anecdotal evidence here) cases where a kernel panics, which we believe may have to do with having 0 < swap < 2x RAM. We're investigating further. That would be a kernel bug which should be fixed. The kernel should handle oom/oos. Actually the deal is: either use enough swap (about 2x RAM) or use none at all. If swap space isn't required in all cases, great! We'll encourage the use of swap files as needed, rather than swap partitions. But, if instead you *require* swap = 2x RAM, then the 2GB swap size limitation must go. No it is not strictly required. But still the 2GB limit is annoying and together with the arch-independent maximum number of swap partitions/files it is pretty dumb. 
So I would propose to first make a small patch to make MAX_SWAPFILES arch-dependent and bigger. (x86 would allow a much higher MAX_SWAPFILES) For 2.5 we could perhaps think about a new swapfile layout which allows bigger partitions. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Kernel is unstable
Hi Linus, On 1 Mar 2001, Linus Torvalds wrote: > Note how do_brk() does the merging itself (see the comment "Can we > just expand an old anonymous mapping?"), and that it's basically > free when done that way, with no worries about locking etc. The same > could be done fairly trivially in mmap too, but I never saw any real > usage patterns that made it look all that worthwhile (*). Handling > the mmap case the same way do_brk() does it would fix the behaviour > of this pathological example too.. Oh there is at least one application, which does trigger the merging quite often: SAP R/3. We have a big memory area which is handled in 1M blocks which get mmaped/munmapped/mprotected all the time. This now leads to a really big avl tree which before has been much smaller. I am not sure that the merging is a gain since it in itself is a overhead and we work on fixed blocks. I simply wanted to point out that there are applications out there which trigger it. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Kernel is unstable
Hi Linus, On 1 Mar 2001, Linus Torvalds wrote: Note how do_brk() does the merging itself (see the comment "Can we just expand an old anonymous mapping?"), and that it's basically free when done that way, with no worries about locking etc. The same could be done fairly trivially in mmap too, but I never saw any real usage patterns that made it look all that worthwhile (*). Handling the mmap case the same way do_brk() does it would fix the behaviour of this pathological example too.. Oh there is at least one application, which does trigger the merging quite often: SAP R/3. We have a big memory area which is handled in 1M blocks which get mmaped/munmapped/mprotected all the time. This now leads to a really big avl tree which before has been much smaller. I am not sure that the merging is a gain since it in itself is a overhead and we work on fixed blocks. I simply wanted to point out that there are applications out there which trigger it. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] make file times work in tmpfs
Hi Alan, here is a patch that makes the different file timestamps work on tmpfs. Greetings Christoph --- mac10/mm/shmem.c.orig Wed Feb 14 14:39:46 2001 +++ mac10/mm/shmem.c Wed Feb 14 15:30:09 2001 @@ -160,6 +160,7 @@ swp_entry_t **base, **ptr, **last; struct shmem_inode_info * info = &inode->u.shmem_i; + inode->i_ctime = inode->i_mtime = CURRENT_TIME; spin_lock (&info->lock); index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (index > info->max_index) @@ -734,6 +735,7 @@ struct inode * inode = shmem_get_inode(dir->i_sb, mode, dev); int error = -ENOSPC; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (inode) { d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ @@ -767,6 +769,7 @@ if (S_ISDIR(inode->i_mode)) return -EPERM; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inode->i_nlink++; atomic_inc(&inode->i_count);/* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ @@ -809,7 +812,9 @@ static int shmem_unlink(struct inode * dir, struct dentry *dentry) { - dentry->d_inode->i_nlink--; + struct inode *inode = dentry->d_inode; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_nlink--; dput(dentry); /* Undo the count from "create" - this does all the work */ return 0; } @@ -836,10 +841,12 @@ if (shmem_empty(new_dentry)) { struct inode *inode = new_dentry->d_inode; if (inode) { + inode->i_ctime = CURRENT_TIME; inode->i_nlink--; dput(new_dentry); } error = 0; + old_dentry->d_inode->i_ctime = old_dir->i_ctime = old_dir->i_mtime = +CURRENT_TIME; } return error; } @@ -873,6 +880,7 @@ UnlockPage(page); page_cache_release(page); up(&inode->i_sem); + dir->i_ctime = dir->i_mtime = CURRENT_TIME; return 0; fail: up(&inode->i_sem); - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] make file times work in tmpfs
Hi Alan, here is a patch that makes the different file timestamps work on tmpfs. Greetings Christoph --- mac10/mm/shmem.c.orig Wed Feb 14 14:39:46 2001 +++ mac10/mm/shmem.c Wed Feb 14 15:30:09 2001 @@ -160,6 +160,7 @@ swp_entry_t **base, **ptr, **last; struct shmem_inode_info * info = &inode->u.shmem_i; + inode->i_ctime = inode->i_mtime = CURRENT_TIME; spin_lock (&info->lock); index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (index > info->max_index) @@ -734,6 +735,7 @@ struct inode * inode = shmem_get_inode(dir->i_sb, mode, dev); int error = -ENOSPC; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (inode) { d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ @@ -767,6 +769,7 @@ if (S_ISDIR(inode->i_mode)) return -EPERM; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inode->i_nlink++; atomic_inc(&inode->i_count);/* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ @@ -809,7 +812,9 @@ static int shmem_unlink(struct inode * dir, struct dentry *dentry) { - dentry->d_inode->i_nlink--; + struct inode *inode = dentry->d_inode; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_nlink--; dput(dentry); /* Undo the count from "create" - this does all the work */ return 0; } @@ -836,10 +841,12 @@ if (shmem_empty(new_dentry)) { struct inode *inode = new_dentry->d_inode; if (inode) { + inode->i_ctime = CURRENT_TIME; inode->i_nlink--; dput(new_dentry); } error = 0; + old_dentry->d_inode->i_ctime = old_dir->i_ctime = old_dir->i_mtime = +CURRENT_TIME; } return error; } @@ -873,6 +880,7 @@ UnlockPage(page); page_cache_release(page); up(&inode->i_sem); + dir->i_ctime = dir->i_mtime = CURRENT_TIME; return 0; fail: up(&inode->i_sem); - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] correct tmpfs link count for directories
Hi Alan, The attached patch makes tmpfs behave more like other fs's. Apparently perl expects this. Greetings Christoph diff -uNr 2.4.1-ac10/mm/shmem.c 2.4.1-ac10-nlink/mm/shmem.c --- 2.4.1-ac10/mm/shmem.c Mon Feb 12 15:01:47 2001 +++ 2.4.1-ac10-nlink/mm/shmem.c Tue Feb 13 13:48:49 2001 @@ -465,6 +465,7 @@ inode->i_fop = _file_operations; break; case S_IFDIR: + inode->i_nlink++; inode->i_op = _dir_inode_operations; inode->i_fop = _dir_operations; break; @@ -743,7 +744,12 @@ static int shmem_mkdir(struct inode * dir, struct dentry * dentry, int mode) { - return shmem_mknod(dir, dentry, mode | S_IFDIR, 0); + int error; + + if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) + return error; + dir->i_nlink++; + return 0; } static int shmem_create(struct inode *dir, struct dentry *dentry, int mode) @@ -801,25 +807,21 @@ return 1; } -/* - * This works for both directories and regular files. - * (non-directories will always have empty subdirs) - */ static int shmem_unlink(struct inode * dir, struct dentry *dentry) { - int retval = -ENOTEMPTY; + dentry->d_inode->i_nlink--; + dput(dentry); /* Undo the count from "create" - this does all the work */ + return 0; +} - if (shmem_empty(dentry)) { - struct inode *inode = dentry->d_inode; +static int shmem_rmdir(struct inode * dir, struct dentry *dentry) +{ + if (!shmem_empty(dentry)) + return -ENOTEMPTY; - inode->i_nlink--; - dput(dentry); /* Undo the count from "create" - this does all the work */ - retval = 0; - } - return retval; + dir->i_nlink--; + return shmem_unlink(dir, dentry); } - -#define shmem_rmdir shmem_unlink /* * The VFS layer already does all the dentry stuff for rename, - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Linux 2.2.19pre10
Hi Alan, On Tue, 13 Feb 2001, Alan Cox wrote: >> Yes, I understand that. But I never got any note that my fix is >> broken and I still do not understand what's the concern. > > Unless Im misreading the code the segment you poke at has > potentially been freed before it is written too. Oh yes I was blind, shame on me. Here comes a fixed version. Greetings Christoph --- 2.2.19-pre10/ipc/shm.c.orig Tue Feb 13 14:35:25 2001 +++ 2.2.19-pre10/ipc/shm.c Tue Feb 13 14:34:49 2001 @@ -337,6 +337,8 @@ if (current->euid == shp->u.shm_perm.uid || current->euid == shp->u.shm_perm.cuid || capable(CAP_SYS_ADMIN)) { + /* Do not find it any more */ + shp->u.shm_perm.key = IPC_PRIVATE; shp->u.shm_perm.mode |= SHM_DEST; if (shp->u.shm_nattch <= 0) killseg (id); - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Linux 2.2.19pre10
Hi Alan, On Tue, 13 Feb 2001, Alan Cox wrote: >> No, I do not think that it's minor. We had to bring down running >> application servers to be able to start another one, because the >> new one couldn't create or attach the systemwide os-monitoring >> segment and thus refused to start. That's very bad behaviour. > > Well I'll take corrected fixes, but Im not going to hold up a release for it Yes, I understand that. But I never got any note that my fix is broken and I still do not understand what's the concern. We are holding the BKL while doing this. And if shm_close does not get called with it we should probably acquire it. Greetings Christoph - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/