Hi guys,
I was playing with sreadahead on a rotating disk, and I realised that
there were a number of things that could perhaps be better for an
HDD-based system.
First - readahead is usually not the enemy on a rotating disk; secondly
- seeks cost - so sorting the I/O by some hint as to block position (as
given by FIBMAP) can be helpful.
Also - data collection / timeouts need to be longer on rotating media -
so I added a random factor of two in there :-)
The speed win (for me) between the original sreadahead and this version
was of the order of 5+ seconds off our original boot time of 30 seconds.
Then of course, I couldn't resist a few cleanups as well. We'll be
shipping something based on this - and I notice there are several other
fixes in svn that haven't been released too [ it'd be great to get a new
release ].
Does this live better in bugzilla ? and if so, which one ? :-) and/or
may I have commit access to sreadahead svn (can we please move to git !)
[ I can dump Greg's forward ported kernel tracer patch there too I
guess ].
Thanks,
Michael.
--- boilerplate waiver ---
The attached / included patches are submitted under the terms here:
http://bugzilla.openedhand.com/waiver.html applied to whatsoever
product to which they apply, rather than to clutter:
Index: sreadahead.c
===================================================================
--- sreadahead.c (revision 38)
+++ sreadahead.c (working copy)
@@ -25,9 +25,10 @@
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/mount.h>
+#include <sys/ioctl.h>
#include <sys/signal.h>
-#include <fcntl.h>
#include <errno.h>
+#include <linux/fs.h>
#include <getopt.h>
@@ -47,9 +48,13 @@
# warning "Architecture does not support ioprio modification"
#endif
#define IOPRIO_WHO_PROCESS 1
+#define IOPRIO_CLASS_RT 1
+#define IOPRIO_CLASS_BE 2
#define IOPRIO_CLASS_IDLE 3
#define IOPRIO_CLASS_SHIFT 13
#define IOPRIO_IDLE_LOWEST (7 | (IOPRIO_CLASS_IDLE << IOPRIO_CLASS_SHIFT))
+#define IOPRIO_BE_HIGHEST (0 | (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT))
+#define IOPRIO_RT_HIGHEST (0 | (IOPRIO_CLASS_RT << IOPRIO_CLASS_SHIFT))
#define PACK_PATH "/var/lib/sreadahead"
#define DEBUGFS_MNT "/var/lib/sreadahead/debugfs"
@@ -58,8 +63,9 @@
#define MAXR 40000 /* trace file can be long */
#define MAXFL 128
#define MAXRECS 6 /* reduce nr of fragments to this amount */
+#define MAXTHREADS 16 /* max. number of read threads we can use */
-#define DEFAULT_MAX_TIME 15 /* should be enough for every OS to boot */
+#define DEFAULT_MAX_TIME 20 /* should be enough for every OS to boot */
/*
* By default, the kernel reads ahead for 128kb. This throws off our
@@ -99,6 +105,7 @@
struct ra_struct *next;
struct ra_struct *prev;
int number;
+ unsigned long block_order_hint;
};
static struct ra_struct *ra[MAXR];
@@ -113,14 +120,12 @@
static unsigned int cursor = 0;
static int debug = 0;
+static int is_ssd = 0;
-
-static void readahead_set_len(int size)
+static int sysfs_unmount = 0;
+static void enter_sysfs (void)
{
int unmount;
- int i = 0;
- char ractl[100];
- /* changes readahead size to "size" for local block devices */
unmount = chdir("/sys/block");
if (unmount != 0) {
@@ -129,9 +134,47 @@
/* non-fatal */
return;
}
+ sysfs_unmount = 1;
chdir("/sys/block");
+ } else
+ sysfs_unmount = 0;
+}
+
+/*
+ * Leave sysfs again: change back to "/" and, when sysfs_unmount is set,
+ * unmount /sys.
+ */
+static void exit_sysfs (void)
+{
+	chdir("/");
+
+	if (sysfs_unmount)
+		umount("/sys");
+}
+
+
+/*
+ * Report whether sda is a non-rotating device, based on the value read
+ * from "sda/queue/rotational" (opened relative to the current directory
+ * after enter_sysfs()).
+ *
+ * Returns 1 when the attribute parses as 0, and 0 otherwise - including
+ * when the attribute cannot be opened or read, so an unknown disk is
+ * treated as rotating media.
+ */
+static int is_sda_ssd (void)
+{
+	FILE *file;
+	int ssd = 0; /* distinct name: do not shadow the file-scope is_ssd */
+
+	enter_sysfs();
+
+	file = fopen ("sda/queue/rotational", "r");
+	if (file) {
+		char buffer[64];
+
+		/* fgets() returns NULL on EOF/error; never hand that to atoi() */
+		if (fgets (buffer, sizeof(buffer), file))
+			ssd = !atoi (buffer);
+		fclose (file);
+	}
+	exit_sysfs();
+
+	return ssd;
+}
+
+static void readahead_set_len(int size)
+{
+ int i = 0;
+ char ractl[100];
+ /* changes readahead size to "size" for local block devices */
+
+ enter_sysfs();
+
sprintf(ractl, "sda/queue/read_ahead_kb");
while (i <= 3) {
/* check first 4 sata discs */
@@ -144,10 +187,7 @@
i++;
}
- chdir("/");
-
- if (unmount != 0)
- umount("/sys");
+ exit_sysfs();
}
static void readahead_one(int index)
@@ -187,6 +227,8 @@
return NULL;
}
+/* sort to help remove duplicates, we retain the original
+ order in the next/prev linked list */
static void sort_ra_by_name(void)
{
int delta = 1;
@@ -343,6 +385,11 @@
rcount++;
}
+ if (there) {
+ r->block_order_hint = 0; /* first block */
+ ioctl (fd, FIBMAP, &r->block_order_hint);
+ }
+
free(mincorebuf);
munmap(mmapptr, statbuf.st_size);
fclose(file);
@@ -362,15 +409,17 @@
fcount++;
}
rdsize += (tlen <= 0 ? 1024 : tlen);
- printf("%s: %d fragment(s), %dkb, %3.1f%%\n",
+ printf("%s: %d fragment(s), %dkb, %3.1f%% - block %ld\n",
r->filename, rcount,
(tlen <= 1024 ? 1024 : tlen) / 1024,
- 100.0 * there / (there + notthere));
+ 100.0 * there / (there + notthere),
+ r->block_order_hint);
}
memcpy(r->data, record, sizeof(r->data));
return 1;
}
+ return 0;
}
static void get_ra_blocks(void)
@@ -389,6 +438,50 @@
}
}
+/*
+ * Append one record to the pack file: MAXFL bytes taken from
+ * r->filename, followed by the whole r->data array. rdcount is
+ * incremented once per record that was written completely.
+ */
+static void write_ra (FILE *file, struct ra_struct *r)
+{
+	if (debug)
+		printf ("write_ra '%s' (0x%lx)\n", r->filename, r->block_order_hint);
+
+	/* a short write leaves a truncated record behind; report it
+	   instead of silently counting the record as written */
+	if (fwrite(r->filename, MAXFL, 1, file) != 1 ||
+	    fwrite(r->data, sizeof(r->data), 1, file) != 1) {
+		perror("Can not write pack file record");
+		return;
+	}
+	rdcount++;
+}
+
+/* split the list of files into chunks - runs of 256 files
+ or so. Inside this chunk, sort by block hint - hopefully
+ this substantially improves read linearity on non-SSDs */
+/*
+ * Write the list out in runs of up to CHUNK_SIZE records, following the
+ * ->next links. Each run is sorted by ascending block_order_hint before
+ * it is written - hopefully this substantially improves read linearity
+ * on non-SSDs - while the runs themselves stay in list order.
+ */
+static void write_sorted_in_chunks_by_block(FILE *file, struct ra_struct *list)
+{
+#define CHUNK_SIZE 256 /* deeply mystical chunk size */
+	while (list) {
+		int i, max = 0;
+		int delta = 1;
+		struct ra_struct *sort_array[CHUNK_SIZE];
+
+		/* copy a chunk across */
+		for (; list && max < CHUNK_SIZE; list = list->next)
+			sort_array[max++] = list;
+
+		/* bubble sort by first block; entries with equal hints
+		   keep their list order because only '>' triggers a swap */
+		while (delta > 0) {
+			delta = 0;
+			for (i = 0; i < max - 1; i++) {
+				if (sort_array[i]->block_order_hint > sort_array[i+1]->block_order_hint) {
+					struct ra_struct *tmp;
+					tmp = sort_array[i];
+					sort_array[i] = sort_array[i+1];
+					sort_array[i+1] = tmp;
+					delta++;
+				}
+			}
+		}
+
+		/* write out every entry of the chunk; stopping at max - 1
+		   would drop the last record of each chunk */
+		for (i = 0; i < max; i++)
+			write_ra (file, sort_array[i]);
+	}
+}
+
static void trace_start(void)
{
int ret;
@@ -578,13 +671,13 @@
exit(EXIT_FAILURE);
}
- r = first_ra;
- while (r) {
- fwrite(r->filename, MAXFL, 1, file);
- fwrite(r->data, sizeof(r->data), 1, file);
- r = r->next;
- rdcount++;
+ if (!is_ssd)
+ write_sorted_in_chunks_by_block (file, first_ra);
+ else {
+ for (r = first_ra; r; r = r->next)
+ write_ra (file, r);
}
+
fclose(file);
if (debug) {
times(&stop_time);
@@ -619,8 +712,9 @@
int main(int argc, char **argv)
{
FILE *file;
+ int i, max_threads;
int pid = 0;
- pthread_t one, two, three, four;
+ pthread_t threads[MAXTHREADS];
int max_time = DEFAULT_MAX_TIME;
while (1) {
@@ -655,6 +749,10 @@
}
}
+ is_ssd = is_sda_ssd ();
+ if (!is_ssd)
+ max_time *= 2;
+
file = fopen(PACK_FILE, "r");
if (!file) {
/* enable tracing open calls before we fork! */
@@ -684,26 +782,25 @@
fclose(file);
#ifdef HAVE_IO_PRIO
- if (syscall(__NR_ioprio_set, IOPRIO_WHO_PROCESS, pid,
- IOPRIO_IDLE_LOWEST) == -1)
- perror("Can not set IO priority to idle class");
+ if (syscall(__NR_ioprio_set, IOPRIO_WHO_PROCESS, pid,
+ IOPRIO_IDLE_LOWEST) == -1)
+ perror("Can not set IO priority to idle class");
#endif
- readahead_set_len(RA_SMALL);
+ if (is_ssd)
+ readahead_set_len(RA_SMALL);
+ max_threads = 4;
daemon(0,0);
- pthread_create(&one, NULL, one_thread, NULL);
- pthread_create(&two, NULL, one_thread, NULL);
- pthread_create(&three, NULL, one_thread, NULL);
- pthread_create(&four, NULL, one_thread, NULL);
+ for (i = 0; i < max_threads; i++)
+ pthread_create(&threads[i], NULL, one_thread, NULL);
- pthread_join(one, NULL);
- pthread_join(two, NULL);
- pthread_join(three, NULL);
- pthread_join(four, NULL);
+ for (i = 0; i < max_threads; i++)
+ pthread_join(threads[i], NULL);
- readahead_set_len(RA_NORMAL);
+ if (is_ssd)
+ readahead_set_len(RA_NORMAL);
return EXIT_SUCCESS;
}
--
[email protected] <><, Pseudo Engineer, itinerant idiot
_______________________________________________
Moblin dev Mailing List
[email protected]
To manage or unsubscribe from this mailing list visit:
http://lists.moblin.org/listinfo/dev or your user account on http://moblin.org
once logged in.
For more information on the Moblin Developer Mailing lists visit:
http://moblin.org/community/mailing-lists