Ingo Molnar a écrit :

no, i just wanted to make a demonstration that one can be pretty nasty in on-lkml replies while being technically correct :-) I think you went a bit overboard in your replies to Davide. Lets move this back into constructive channels, ok? :)

In any case, here is one preliminary patch to show what I had in mind.

I only had time to compile it (its very late here), not even boot tested, so dont try it !

[PATCH] reduce max latency of get_unused_fd().

Goal is to scan at most 4096 bytes (or 32768 bits) in the open_fds bitmap.

Processes that have many file descriptors might have to scan 128 KB of ram to find a zero bit. Thats about 100 us on modern machines.

This patch introduces an array of counters. Each counter gives the number of 'one' bits in a 4 KB section of the open_fds bitmap.

I chose to statically allocate this array of counters, being very small (64 bytes), so a dynamic allocation would only add complexity.

As a result, max latency is 4 us (same latency on x86 when vm gives you a new page)

Signed-off-by: Eric Dumazet <[EMAIL PROTECTED]>
---
 fs/fcntl.c                |   18 +++++++++++++++---
 fs/file.c                 |    6 +++---
 fs/open.c                 |   13 +++++++++++++
 include/linux/file.h      |   11 +++++++++++
 include/linux/fs.h        |    2 --
 include/linux/init_task.h |    1 +
 kernel/fork.c             |    1 +
 7 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/include/linux/file.h b/include/linux/file.h
index a59001e..ec6b120 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -36,6 +36,16 @@ struct fdtable {
 };
 
 /*
+ * To avoid big latencies in get_unused_fd(),
+ * we maintain counters of "one" bits in bitmap pages
+ * we define a 'page' here to contain 32768 bits,
+ * so that each counter is an unsigned short
+ * with MAX_NR_OPENS = 2^20, we get 32 counters : 64 bytes
+ */
+#define FDSBITS 32768
+#define MAX_NR_OPEN (1024*1024)        /* Absolute upper limit on fd num */
+
+/*
  * Open file table structure
  */
 struct files_struct {
@@ -50,6 +60,7 @@ struct files_struct {
    */
        spinlock_t file_lock ____cacheline_aligned_in_smp;
        int next_fd;
+       unsigned short fds_counter[(MAX_NR_OPEN + (FDSBITS - 1)) / FDSBITS];
        struct embedded_fd_set close_on_exec_init;
        struct embedded_fd_set open_fds_init;
        struct file * fd_array[NR_OPEN_DEFAULT];
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 8e382a5..5257ba6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -61,6 +61,7 @@ static int locate_fd(struct files_struct
        unsigned int start;
        int error;
        struct fdtable *fdt;
+       unsigned int page_nr;
 
        error = -EINVAL;
        if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
@@ -77,11 +78,19 @@ repeat:
                start = files->next_fd;
 
        newfd = start;
-       if (start < fdt->max_fds)
+
+       error = -EMFILE;
+       if (start < fdt->max_fds) {
+               page_nr = start / FDSBITS;
+               while (files->fds_counter[page_nr] == FDSBITS) {
+                       page_nr++;
+                       start = page_nr * FDSBITS;
+                       if (start >= fdt->max_fds)
+                               goto out;
+               }
                newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
                                           fdt->max_fds, start);
-       
-       error = -EMFILE;
+       }       
        if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
                goto out;
 
@@ -122,6 +131,7 @@ static int dupfd(struct file *file, unsi
                /* locate_fd() may have expanded fdtable, load the ptr */
                fdt = files_fdtable(files);
                FD_SET(fd, fdt->open_fds);
+               files->fds_counter[fd / FDSBITS]++;
                FD_CLR(fd, fdt->close_on_exec);
                spin_unlock(&files->file_lock);
                fd_install(fd, file);
@@ -171,7 +181,9 @@ asmlinkage long sys_dup2(unsigned int ol
 
        rcu_assign_pointer(fdt->fd[newfd], file);
        FD_SET(newfd, fdt->open_fds);
+       files->fds_counter[newfd / FDSBITS]++;
        FD_CLR(newfd, fdt->close_on_exec);
+
        spin_unlock(&files->file_lock);
 
        if (tofree)
diff --git a/fs/file.c b/fs/file.c
index c5575de..7dbd9c5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,8 +147,8 @@ static struct fdtable * alloc_fdtable(un
        nr /= (1024 / sizeof(struct file *));
        nr = roundup_pow_of_two(nr + 1);
        nr *= (1024 / sizeof(struct file *));
-       if (nr > NR_OPEN)
-               nr = NR_OPEN;
+       if (nr > MAX_NR_OPEN)
+               nr = MAX_NR_OPEN;
 
        fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
        if (!fdt)
@@ -233,7 +233,7 @@ int expand_files(struct files_struct *fi
        if (nr < fdt->max_fds)
                return 0;
        /* Can we expand? */
-       if (nr >= NR_OPEN)
+       if (nr >= MAX_NR_OPEN)
                return -EMFILE;
 
        /* All good, so we try */
diff --git a/fs/open.c b/fs/open.c
index 0d515d1..340e69b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -860,12 +860,23 @@ int get_unused_fd(void)
        struct files_struct * files = current->files;
        int fd, error;
        struct fdtable *fdt;
+       unsigned int page_nr;
 
        error = -EMFILE;
        spin_lock(&files->file_lock);
 
 repeat:
        fdt = files_fdtable(files);
+       page_nr = files->next_fd / FDSBITS;
+       /*
+        * We can avoid testing big chunks of memory if all bit are set
+        */
+       while (files->fds_counter[page_nr] == FDSBITS) {
+               page_nr++;
+               files->next_fd = page_nr * FDSBITS;
+               if (files->next_fd >= fdt->max_fds)
+                       break;
+       }
        fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds,
                                files->next_fd);
 
@@ -891,6 +902,7 @@ repeat:
        }
 
        FD_SET(fd, fdt->open_fds);
+       files->fds_counter[fd / FDSBITS]++;
        FD_CLR(fd, fdt->close_on_exec);
        files->next_fd = fd + 1;
 #if 1
@@ -913,6 +925,7 @@ static void __put_unused_fd(struct files
 {
        struct fdtable *fdt = files_fdtable(files);
        __FD_CLR(fd, fdt->open_fds);
+       files->fds_counter[fd / FDSBITS]--;
        if (fd < files->next_fd)
                files->next_fd = fd;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ae77c..9db2799 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -20,8 +20,6 @@ #include <linux/ioctl.h>
  */
 
 /* Fixed constants first: */
-#undef NR_OPEN
-#define NR_OPEN (1024*1024)    /* Absolute upper limit on fd num */
 #define INR_OPEN 1024          /* Initial setting for nfile rlimits */
 
 #define BLOCK_SIZE_BITS 10
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 276ccaa..bd24190 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -26,6 +26,7 @@ #define INIT_FILES \
        .fdtab          = INIT_FDTABLE,                 \
        .file_lock      = __SPIN_LOCK_UNLOCKED(init_task.file_lock), \
        .next_fd        = 0,                            \
+       .fds_counter    = {0},                          \
        .close_on_exec_init = { { 0, } },               \
        .open_fds_init  = { { 0, } },                   \
        .fd_array       = { NULL, }                     \
diff --git a/kernel/fork.c b/kernel/fork.c
index 73ad5cd..f4341c5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -641,6 +641,7 @@ static struct files_struct *alloc_files(
 
        spin_lock_init(&newf->file_lock);
        newf->next_fd = 0;
+       memset(newf->fds_counter, 0, sizeof(newf->fds_counter));
        fdt = &newf->fdtab;
        fdt->max_fds = NR_OPEN_DEFAULT;
        fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;

Reply via email to