[Moblin Dev] sreadahead - rotating media speedup ...

Michael Meeks Wed, 19 Aug 2009 02:56:29 -0700

Hi guys,

        I was playing with sreadahead on a rotating disk, and I realised that
there were a number of things that could perhaps be better for an
HDD-based system.


        First - readahead is usually not the enemy on a rotating disk, secondly
- seeks cost - so sorting the I/O by some hint as to block position (as
given by FIBMAP) can be helpful.

        Also - data collection / timeouts need to be longer on rotating media -
so I added a random factor of two in there :-)

        The speed win (for me) between the original sreadahead and this version
was of the order of 5+ seconds off our original boot time of 30 seconds.

        Then of course, I couldn't resist a few cleanups as well. We'll be
shipping something based on this - and I notice there are several other
fixes in svn that haven't been released too [ it'd be great to get a new
release ].

        Does this live better in bugzilla ? and if so, which one ? :-) and/or
may I have commit access to sreadahead svn (can we please move to git !)
[ I can dump Greg's forward ported kernel tracer patch there too I
guess ].

        Thanks,

                Michael.

--- boilerplate waiver ---
The attached / included patches are submitted under the terms here:
http://bugzilla.openedhand.com/waiver.html applied to whatsoever
product to which they apply, rather than to clutter:

Index: sreadahead.c
===================================================================
--- sreadahead.c        (revision 38)
+++ sreadahead.c        (working copy)
@@ -25,9 +25,10 @@
 #include <sys/types.h>
 #include <sys/syscall.h>
 #include <sys/mount.h>
+#include <sys/ioctl.h>
 #include <sys/signal.h>
-#include <fcntl.h>
 #include <errno.h>
+#include <linux/fs.h>
 
 #include <getopt.h>
 
@@ -47,9 +48,13 @@
 #  warning "Architecture does not support ioprio modification"
 #endif
 #define IOPRIO_WHO_PROCESS 1
+#define IOPRIO_CLASS_RT 1
+#define IOPRIO_CLASS_BE 2
 #define IOPRIO_CLASS_IDLE 3
 #define IOPRIO_CLASS_SHIFT 13
 #define IOPRIO_IDLE_LOWEST (7 | (IOPRIO_CLASS_IDLE << IOPRIO_CLASS_SHIFT))
+#define IOPRIO_BE_HIGHEST  (0 | (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT))
+#define IOPRIO_RT_HIGHEST  (0 | (IOPRIO_CLASS_RT << IOPRIO_CLASS_SHIFT))
 
 #define PACK_PATH      "/var/lib/sreadahead"
 #define DEBUGFS_MNT    "/var/lib/sreadahead/debugfs"
@@ -58,8 +63,9 @@
 #define MAXR 40000     /* trace file can be long */
 #define MAXFL 128
 #define MAXRECS 6      /* reduce nr of fragments to this amount */
+#define MAXTHREADS 16   /* max. number of read threads we can use */
 
-#define DEFAULT_MAX_TIME 15 /* should be enough for every OS to boot */
+#define DEFAULT_MAX_TIME 20 /* should be enough for every OS to boot */
 
 /*
  * By default, the kernel reads ahead for 128kb. This throws off our
@@ -99,6 +105,7 @@
        struct ra_struct        *next;
        struct ra_struct        *prev;
        int                     number;
+       unsigned long           block_order_hint;
 };
 
 static struct ra_struct *ra[MAXR];
@@ -113,14 +120,12 @@
 static unsigned int cursor = 0;
 
 static int debug = 0;
+static int is_ssd = 0;
 
-
-static void readahead_set_len(int size)
+static int sysfs_unmount = 0;
+static void enter_sysfs (void)
 {
        int unmount;
-       int i = 0;
-       char ractl[100];
-       /* changes readahead size to "size" for local block devices */
 
        unmount = chdir("/sys/block");
        if (unmount != 0) {
@@ -129,9 +134,47 @@
                        /* non-fatal */
                        return;
                }
+               sysfs_unmount = 1;
                chdir("/sys/block");
+       } else
+               sysfs_unmount = 0;
+}
+
+static void exit_sysfs (void)
+{
+       chdir("/");
+       if (sysfs_unmount != 0)
+               umount("/sys");
+}
+
+
+static int is_sda_ssd (void)
+{
+       FILE *file;
+       int is_ssd = 0;
+
+       enter_sysfs();
+
+       file = fopen ("sda/queue/rotational", "r");
+       if (file) {
+               char buffer[64];
+               is_ssd = !atoi (fgets (buffer, 64, file));
+               fclose (file);
        }
 
+       exit_sysfs();
+
+       return is_ssd;
+}
+
+static void readahead_set_len(int size)
+{
+       int i = 0;
+       char ractl[100];
+       /* changes readahead size to "size" for local block devices */
+
+       enter_sysfs();
+
        sprintf(ractl, "sda/queue/read_ahead_kb");
        while (i <= 3) {
                /* check first 4 sata discs */
@@ -144,10 +187,7 @@
                i++;
        }
 
-       chdir("/");
-
-       if (unmount != 0)
-               umount("/sys");
+       exit_sysfs();
 }
 
 static void readahead_one(int index)
@@ -187,6 +227,8 @@
        return NULL;
 }
 
+/* sort to help remove duplicates, we retain the original
+   order in the next/prev linked list */
 static void sort_ra_by_name(void)
 {
        int delta = 1;
@@ -343,6 +385,11 @@
                rcount++;
        }
 
+       if (there) {
+               r->block_order_hint = 0; /* first block */
+               ioctl (fd, FIBMAP, &r->block_order_hint);
+       }
+
        free(mincorebuf);
        munmap(mmapptr, statbuf.st_size);
        fclose(file);
@@ -362,15 +409,17 @@
                                fcount++;
                        }
                        rdsize += (tlen <= 0 ? 1024 : tlen);
-                       printf("%s: %d fragment(s), %dkb, %3.1f%%\n",
+                       printf("%s: %d fragment(s), %dkb, %3.1f%% - block 
%ld\n",
                               r->filename, rcount,
                               (tlen <= 1024 ? 1024 : tlen) / 1024,
-                              100.0 * there / (there + notthere));
+                              100.0 * there / (there + notthere),
+                              r->block_order_hint);
                }
 
                memcpy(r->data, record, sizeof(r->data));
                return 1;
        }
+       return 0;
 }
 
 static void get_ra_blocks(void)
@@ -389,6 +438,50 @@
        }
 }
 
+static void write_ra (FILE *file, struct ra_struct *r)
+{
+       if (debug)
+               printf ("write_ra '%s' (0x%lx)\n", r->filename, 
r->block_order_hint);
+       fwrite(r->filename, MAXFL, 1, file);
+       fwrite(r->data, sizeof(r->data), 1, file);
+       rdcount++;
+}
+
+/* split the list of files into chunks - runs of 256 files
+   or so. Inside this chunk, sort by block hint - hopefully
+   this substantially improves read linearity on non-SSDs */
+static void write_sorted_in_chunks_by_block(FILE *file, struct ra_struct *list)
+{
+#define CHUNK_SIZE 256 /* deeply mystical chunk size */
+       while (list) {
+               int i, max = 0;
+               int delta = 1;
+               struct ra_struct *sort_array[CHUNK_SIZE];
+
+               /* copy a chunk across */
+               for (; list && max < CHUNK_SIZE; list = list->next)
+                       sort_array[max++] = list;
+
+               /* sort by first block */
+               while (delta > 0) {
+                       delta = 0;
+                       for (i = 0; i < max - 1; i++) {
+                               if (sort_array[i]->block_order_hint > 
sort_array[i+1]->block_order_hint) {
+                                       struct ra_struct *tmp;
+                                       tmp = sort_array[i];
+                                       sort_array[i] = sort_array[i+1];
+                                       sort_array[i+1] = tmp;
+                                       delta++;
+                               }
+                       }
+               }
+
+               /* write out */
+               for (i = 0; i < max - 1; i++)
+                       write_ra (file, sort_array[i]);
+       }
+}
+
 static void trace_start(void)
 {
        int ret;
@@ -578,13 +671,13 @@
                exit(EXIT_FAILURE);
        }
 
-       r = first_ra;
-       while (r) {
-               fwrite(r->filename, MAXFL, 1, file);
-               fwrite(r->data, sizeof(r->data), 1, file);
-               r = r->next;
-               rdcount++;
+       if (!is_ssd)
+               write_sorted_in_chunks_by_block (file, first_ra);
+       else {
+               for (r = first_ra; r; r = r->next)
+                       write_ra (file, r);
        }
+
        fclose(file);
        if (debug) {
                times(&stop_time);
@@ -619,8 +712,9 @@
 int main(int argc, char **argv)
 {
        FILE *file;
+       int i, max_threads;
        int pid = 0;
-       pthread_t one, two, three, four;
+       pthread_t threads[MAXTHREADS];
        int max_time = DEFAULT_MAX_TIME;
 
        while (1) {
@@ -655,6 +749,10 @@
                }
        }
 
+       is_ssd = is_sda_ssd ();
+       if (!is_ssd)
+               max_time *= 2;
+
        file = fopen(PACK_FILE, "r");
        if (!file) {
                /* enable tracing open calls before we fork! */
@@ -684,26 +782,25 @@
        fclose(file);
 
 #ifdef HAVE_IO_PRIO
-       if (syscall(__NR_ioprio_set, IOPRIO_WHO_PROCESS, pid,
-                   IOPRIO_IDLE_LOWEST) == -1)
-               perror("Can not set IO priority to idle class");
+               if (syscall(__NR_ioprio_set, IOPRIO_WHO_PROCESS, pid,
+                           IOPRIO_IDLE_LOWEST) == -1)
+                       perror("Can not set IO priority to idle class");
 #endif
 
-       readahead_set_len(RA_SMALL);
+       if (is_ssd)
+               readahead_set_len(RA_SMALL);
+       max_threads = 4;
 
        daemon(0,0);
 
-       pthread_create(&one, NULL, one_thread, NULL);
-       pthread_create(&two, NULL, one_thread, NULL);
-       pthread_create(&three, NULL, one_thread, NULL);
-       pthread_create(&four, NULL, one_thread, NULL);
+       for (i = 0; i < max_threads; i++)
+               pthread_create(&threads[i], NULL, one_thread, NULL);
 
-       pthread_join(one, NULL);
-       pthread_join(two, NULL);
-       pthread_join(three, NULL);
-       pthread_join(four, NULL);
+       for (i = 0; i < max_threads; i++)
+               pthread_join(threads[i], NULL);
 
-       readahead_set_len(RA_NORMAL);
+       if (is_ssd)
+               readahead_set_len(RA_NORMAL);
 
        return EXIT_SUCCESS;
 }

-- 
 [email protected]  <><, Pseudo Engineer, itinerant idiot


_______________________________________________
Moblin dev Mailing List
[email protected]

To manage or unsubscribe from this mailing list visit:
http://lists.moblin.org/listinfo/dev or your user account on http://moblin.org 
once logged in.

For more information on the Moblin Developer Mailing lists visit:
http://moblin.org/community/mailing-lists

[Moblin Dev] sreadahead - rotating media speedup ...

Reply via email to