Hi
After discussion on -users about concurrent writes to disk volumes by
the sd, which raised concerns about fragmentation and impact of
concurrency on overall write performance, I thought I'd do some testing
to look into it.
I've just put together a simple tool that uses a specified number of
threads to write dummy data to files (one per thread) on a volume. It
optionally uses posix_fallocate(...) to pre-allocate space in
user-specified chunks up to the total target file size.
I don't do a lot of work in C, and it probably shows; I mostly work with
C++, Python and Java. It's also a utility and benchmark, not a production
program.
The tool should be useful for testing out the effects of using
posix_fallocate(...) to pre-allocate space in volumes on various file
systems, with various levels of write concurrency, where performance is
limited by storage write speeds. Fragmentation, write throughput, and
overall write time are of interest.
The usage summary from the executable should provide very basic
guidance on its use. It builds from a simple Makefile on any Linux (and
probably BSD) system. Typical invocation might be:
./palloc 5 1G 900M y
( five threads each write 1GB dummy volumes, posix_fallocate()ing 900MB
chunks ) or
./palloc 3 100M 1M n
( three threads each write 100MB files, not using posix_fallocate. The
chunksize parameter is required but has no effect if posix_fallocate is
n. I should make that prettier, but can't be bothered. )
e2fsprogs is required for file fragmentation measurement on ext* file
systems. It expects the e2fsprogs filefrag utility to be in
/usr/sbin/filefrag. If not found, no fragmentation measurement will be
done. Fragmentation measurement isn't supported for non-ext* file systems.
( Actually, it seems to work for xfs as well, as it provides the same
ioctl as ext3 for extent examination. Handy. )
I've already found some interesting things, albeit with only brief and
preliminary testing. The first is that periodically calling
posix_fallocate(...) to pre-allocate anything short of very large chunks
of the volume in advance seems to be counter-productive. It's
slower on ext4 and ext3, for one thing. On xfs it also dramatically
*increases* fragmentation. I assume that's because posix_fallocate
forces immediate allocation of the data, overriding xfs's delayed
allocation logic, which otherwise works astonishingly well.
However, if the expected size of the volume is known in advance, calling
posix_fallocate() to preallocate that space seems to be a significant
performance win at least on ext4 and xfs, and drastically reduces
fragmentation.
Unsurprisingly, it looks like it'd be necessary for the sd to have a
decent idea of how big the backup volume will need to be when it starts
creating/appending it in order to be able to help the file system make
better decisions.
I'll play with it some more to do some proper tests tomorrow, time
permitting. I thought it'd be of interest to have the tool in the mean
time, though. Please send complaints/abuse/improvements/bugs/cries of
horror my way.
--
Craig Ringer
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/time.h>
#include "util.h"
/* Name this program was invoked as (argv[0]); shown in the usage text. */
char *progname;

/* Emit the one-line command synopsis for the fallocate timing tool on stderr. */
void usage()
{
    fprintf(stderr, "Usage: %s filename size\n", progname);
}
/*
 * Time a single posix_fallocate() call.
 *
 * Arguments: filename size
 *   filename  file to create (or truncate) and pre-allocate
 *   size      number of bytes to allocate, parsed by getsize()
 *
 * Exit status: 0 on success, 1 on bad arguments, 2 if the file cannot be
 * opened, 3 if posix_fallocate() fails.
 */
int main(int argc, char* argv[]) {
    progname = argv[0];
    if (argc != 3) {
        usage();
        return 1;
    }

    const char *fn = argv[1];
    const off_t sz = getsize(argv[2]);
    /* Reject negative sizes as well as zero; the message below promises both. */
    if (sz <= 0) {
        usage();
        fprintf(stderr, "size was zero, negative, empty or not a number.\n");
        return 1;
    }

    FILE *f = fopen(fn, "w");
    if (f == NULL) {
        fprintf(stderr, "Unable to open %s: %s\n", fn, strerror(errno));
        return 2;
    }
    const int fd = fileno(f);

    /* Purely advisory, so a failure here is not treated as an error. */
    posix_fadvise(fd, 0, sz, POSIX_FADV_SEQUENTIAL);

    struct timeval start_tv, end_tv;
    gettimeofday(&start_tv, NULL);
    /* posix_fallocate() returns the error number directly; it does not set errno. */
    const int ret = posix_fallocate(fd, 0, sz);
    gettimeofday(&end_tv, NULL);

    /* Nothing was written through stdio, so the close result carries no data-loss information. */
    fclose(f);

    if (ret) {
        fprintf(stderr, "posix_fallocate() failure: %s\n", strerror(ret));
        return 3;
    }

    const struct timeval tv_diff = tv_subtract(end_tv, start_tv);
    /* Cast explicitly: time_t / suseconds_t are not guaranteed to be long. */
    fprintf(stderr, "posix_fallocate() call took %li.%06li seconds\n",
            (long) tv_diff.tv_sec, (long) tv_diff.tv_usec);
    return 0;
}
# Build both tools:
#   falloc - times a single posix_fallocate() call (falloctest.c)
#   palloc - multi-threaded writer / fragmentation benchmark (parallel_alloc.c)
# Recipe lines must start with a TAB; long commands are continued with a
# trailing backslash so that make passes them to the shell as one command.
.PHONY: ALL
ALL: falloc palloc

falloc: falloctest.c util.c util.h
	gcc -std=c99 -o falloc -g3 -D_POSIX_C_SOURCE=200809L -D_ISOC99_SOURCE \
		-D_FILE_OFFSET_BITS=64 util.c falloctest.c

palloc: parallel_alloc.c util.c util.h
	gcc -std=c99 -o palloc -g3 -D_POSIX_C_SOURCE=200809L -D_ISOC99_SOURCE \
		-D_FILE_OFFSET_BITS=64 -D_BSD_SOURCE util.c parallel_alloc.c -lpthread
/*
* use multiple threads to write dummy data in parallel,
* optionally posix_fallocate(...)ing it first.
*
* This program tests performance and fragmentation effects
* of using various fallocate chunk sizes.
*/
#include "util.h"
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
/* Size in bytes of the per-thread buffer that strerror_r() messages are formatted into */
const int errbuf_size = 512;
/* Size in bytes of the dummy data block; each write() call transfers at most this much */
const int dummyblock_size = 4096;
/* Program name from argv[0]; set in main() and printed by usage() */
const char * progname;
/* Some dummy data to write to a file; allocated and filled in run_test() and passed to write() by every thread */
void * dummyblock;
/* Set to nonzero if any thread enters error state; the other threads check it before each chunk and stop.
 * NOTE(review): this is a plain int shared between threads with no synchronisation - confirm that is acceptable. */
int thread_error = 0;
/* Path to e2fsprogs `filefrag' utility; setup_frag() resets it to NULL when the tool is not usable */
const char * filefrag_path = "/usr/sbin/filefrag";
/* If true, don't unlink at exit (set by the -u command line option) */
int no_unlink_on_exit = 0;
/*
 * Everything one worker thread needs to know, plus the results it reports.
 * run_test() fills in the first group before starting the thread; the thread
 * itself writes the second group while it runs.
 */
struct thread_info {
    /* ---- Inputs: initialised during thread setup ---- */

    /* pthread object for this worker */
    pthread_t thread;
    /* application-level thread id */
    int threadid;
    /* total number of worker threads in the run */
    int nthreads;
    /* number of bytes this thread should write to its file */
    off_t filesize;
    /* size of the chunks the file is written (and optionally pre-allocated) in */
    off_t chunksize;
    /* nonzero if the thread should call posix_fallocate() for each chunk */
    int use_fallocate;
    /* name of the file this thread creates */
    char filename[8];

    /* ---- Outputs: modified by the thread ---- */

    /* per-thread scratch buffer for error strings; invalid after thread exit */
    char * errbuf;
    /* wall clock time elapsed during the thread's run */
    struct timeval tv_elapsed;
    /* accumulated time spent in posix_fallocate() */
    struct timeval tv_falloc_time;
    /* accumulated time spent in write() */
    struct timeval tv_write_time;
    /* number of fragments the output file was in according to filefrag */
    int nfragments;
};
/**
 * Write `reqsize' bytes to `f' at the current fd offset. The actual data
 * written is the shared dummy block, repeated as often as needed.
 *
 * Partial writes are continued and writes interrupted by a signal are retried,
 * so a short write only becomes an error once write() itself fails or makes
 * no progress. On success the elapsed time is added to ti->tv_write_time.
 *
 * Returns 0 on success, 1 on error (a message is printed to stderr).
 */
int writechunk(struct thread_info * ti, int f, off_t reqsize) {
    struct timeval tv_write_start, tv_write_end;
    gettimeofday(&tv_write_start, NULL);

    off_t chunk_remaining = reqsize;
    while (chunk_remaining > 0) {
        /* Write whichever of dummyblock_size and chunk_remaining is least */
        const size_t req_size = (size_t) min(chunk_remaining, dummyblock_size);

        /* write() returns ssize_t; keeping it signed is what lets -1 be detected. */
        const ssize_t written = write(f, dummyblock, req_size);
        if (written < 0) {
            if (errno == EINTR)
                continue;
            strerror_r(errno, ti->errbuf, errbuf_size);
            fprintf(stderr, "Thread %i: write of %zu bytes failed: %s\n",
                    ti->threadid, req_size, ti->errbuf);
            return 1;
        }
        if (written == 0) {
            /* No progress and no errno to report; bail out rather than spin forever. */
            fprintf(stderr, "Thread %i wrote only 0 of %zu byte chunk\n",
                    ti->threadid, req_size);
            return 1;
        }
        chunk_remaining -= written;
    }

    gettimeofday(&tv_write_end, NULL);
    ti->tv_write_time = tv_add(ti->tv_write_time, tv_subtract(tv_write_end, tv_write_start));
    return 0;
}
/**
 * Use posix_fallocate to request the FS allocate `reqsize' space at `offset' on `f'.
 *
 * The time spent in the call is added to ti->tv_falloc_time whether or not the
 * call succeeds.
 *
 * Returns 0 on success, 1 on error (a message is printed to stderr).
 */
int allocchunk(struct thread_info * ti, int f, off_t offset, off_t reqsize) {
    struct timeval tv_falloc_start, tv_falloc_end;

    gettimeofday(&tv_falloc_start, NULL);
    /* posix_fallocate() returns the error number directly rather than setting errno. */
    const int ret = posix_fallocate(f, offset, reqsize);
    gettimeofday(&tv_falloc_end, NULL);

    ti->tv_falloc_time = tv_add(ti->tv_falloc_time, tv_subtract(tv_falloc_end, tv_falloc_start));

    if (ret == 0)
        return 0;

    strerror_r(ret, ti->errbuf, errbuf_size);
    /* off_t is not necessarily long long, so cast to match the %lli conversions. */
    fprintf(stderr, "posix_fallocate(%i, %lli, %lli) failed with %s\n",
            f, (long long) offset, (long long) reqsize, ti->errbuf);
    return 1;
}
/*
 * Write one chunk of `reqsize' bytes to `f'.
 *
 * When the thread is configured to use posix_fallocate, the range starting at
 * `offset' is allocated first. `offset' is only used for that allocation; the
 * data itself goes to the current position of the file descriptor.
 *
 * Returns 0 on success, 1 on error.
 */
int writebytes(struct thread_info * ti, int f, off_t offset, off_t reqsize) {
    const int prealloc_failed =
        ti->use_fallocate && allocchunk(ti, f, offset, reqsize);

    if (prealloc_failed) {
        return 1;
    }
    return writechunk(ti, f, reqsize) ? 1 : 0;
}
/*
 * Body of one worker thread: create ti->filename and fill it with
 * ti->filesize bytes of dummy data, ti->chunksize bytes at a time.
 *
 * Sets the global thread_error flag on failure, and stops early when another
 * thread has already set it. Always records the elapsed wall clock time in
 * ti->tv_elapsed and clears ti->errbuf before returning.
 *
 * Returns 0 if the whole file was written, 1 otherwise.
 */
int thread_main( struct thread_info * const ti ) {
    char errbuf[errbuf_size];
    int status = 0;

    ti->errbuf = &errbuf[0];
    struct timeval tv_start, tv_end;
    gettimeofday(&tv_start, NULL);
    tv_zero(&ti->tv_falloc_time);
    tv_zero(&ti->tv_write_time);

    const int f = open( &ti->filename[0], O_WRONLY | O_CREAT | O_TRUNC, 0600 );
    if ( f == -1 ) {
        strerror_r(errno, ti->errbuf, errbuf_size);
        fprintf(stderr, "Thread %i: Opening file %s failed with %s\n", ti->threadid, &ti->filename[0], ti->errbuf);
        thread_error = 1;
        status = 1;
    } else {
        /* Only advise once we know the descriptor is valid; the call is advisory so its result is ignored. */
        posix_fadvise(f, 0, ti->filesize, POSIX_FADV_DONTNEED);

        off_t current_offset = 0;
        while ( current_offset < ti->filesize ) {
            if (thread_error) {
                /* Another thread has set the error flag. Terminate. */
                status = 1;
                break;
            }
            /* Keep the chunk size in off_t: an int would truncate chunks of 2GB and above. */
            const off_t reqsize = min(ti->filesize - current_offset, ti->chunksize);
            if (writebytes(ti, f, current_offset, reqsize)) {
                /* Write failed. Flag error and terminate. */
                thread_error = 1;
                status = 1;
                break;
            }
            current_offset += reqsize;
        }

        /* close() can report write errors that were deferred by the kernel. */
        if (close(f) != 0) {
            strerror_r(errno, ti->errbuf, errbuf_size);
            fprintf(stderr, "Thread %i: Closing file %s failed with %s\n", ti->threadid, &ti->filename[0], ti->errbuf);
            thread_error = 1;
            status = 1;
        }
    }

    gettimeofday(&tv_end, NULL);
    ti->tv_elapsed = tv_subtract(tv_end, tv_start);
    /* errbuf lives on this stack frame; don't leave a dangling pointer behind. */
    ti->errbuf = NULL;
    return status;
}
/*
 * pthread entry point: unpack the thread_info pointer, run the worker and
 * hand its integer status back as the thread's exit value.
 */
void * thread_run( void * args ) {
    struct thread_info * const ti = (struct thread_info*) args;
    const int status = thread_main(ti);
    return (void*) status;
}
/*
 * Remove the output file of every thread in `threads'.
 *
 * A file that does not exist is not an error. Stops at the first file that
 * cannot be removed for any other reason.
 *
 * Returns 0 on success, 1 if an unlink failed (a message is printed to stderr).
 */
int unlink_files(struct thread_info * threads, int nthreads) {
    int idx = 0;

    while (idx < nthreads) {
        const char * const path = &threads[idx].filename[0];

        if (unlink(path) != 0 && errno != ENOENT) {
            fprintf(stderr, "Unable to unlink file %s: %s\n", path, strerror(errno));
            return 1;
        }
        idx++;
    }
    return 0;
}
/*
 * Ask the filefrag utility how many extents `filename' occupies.
 *
 * filefrag is run through popen(); its output is expected to start with
 * "filename: " followed by the extent count.
 *
 * Returns the extent count, or -1 on error.
 */
int count_fragments(const char * filename) {
    const int buf_size = 512;
    char buf[buf_size];
    int nextents = -1;

    /* Build the shell command; stderr of filefrag is discarded */
    snprintf(buf, buf_size, "\"%s\" \"%s\" 2>/dev/null", filefrag_path, filename);

    FILE * const fragpipe = popen(buf, "r");
    if (!fragpipe) {
        fprintf(stderr, "error executing %s %s: %s\n", filefrag_path, filename, strerror(errno));
        return -1;
    }

    /* Skip "filename: ": the filename length plus two (colon and space), plus one for the null terminator */
    if (fgets(buf, strlen(filename) + 3, fragpipe) == NULL) {
        fprintf(stderr, "Unexpected input from %s %s\n", filefrag_path, filename);
        nextents = -1;
    } else if (fscanf(fragpipe, "%d", &nextents) != 1) {
        /* The next token should have been the number of extents */
        fprintf(stderr, "Unable to read extent count from %s %s\n", filefrag_path, filename);
        nextents = -1;
    }

    pclose(fragpipe);
    return nextents;
}
void print_test(struct thread_info * threads, int nthreads, struct timeval wall_elapsed) {
if (filefrag_path) {
int totalfrags = 0;
for (int i = 0; i < nthreads; i++) {
int nfrags = count_fragments(&threads[i].filename[0]);
if (nfrags <= 0) {
fprintf(stderr, "Failed to get fragment count for %s, skipping fragment reporting\n", &threads[i].filename[0]);
totalfrags = 0;
break;
}
fprintf(stdout, " Thread %i fragments: %i\n", i, nfrags);
totalfrags += nfrags;
}
if (totalfrags > 0) {
fprintf(stdout, "Total fragments: %i (avg %.2f per file)\n", totalfrags, (double)totalfrags/(double)nthreads);
}
}
struct timeval total_elapsed, total_write_time, total_falloc_time;
tv_zero(&total_elapsed);
tv_zero(&total_write_time);
tv_zero(&total_falloc_time);
fprintf(stdout, "Timings, all excluding unlink() time:\n");
for ( int i = 0; i < nthreads; i++ ) {
struct thread_info * ti = &threads[i];
fprintf(stdout, " Thread %i: Ran %li.%06lis, %li.%06lis in write, %li.%06lis in fallocate\n", i,
ti->tv_elapsed.tv_sec, ti->tv_elapsed.tv_usec,
ti->tv_write_time.tv_sec, ti->tv_write_time.tv_usec,
ti->tv_falloc_time.tv_sec, ti->tv_falloc_time.tv_usec);
total_elapsed = tv_add(total_elapsed, ti->tv_elapsed);
total_write_time = tv_add(total_write_time, ti->tv_write_time);
total_falloc_time = tv_add(total_falloc_time, ti->tv_falloc_time);
}
fprintf(stdout, "THREAD TOTAL: Ran %li.%06lis, %li.%06lis in write, %li.%06lis in fallocate\n",
total_elapsed.tv_sec, total_elapsed.tv_usec,
total_write_time.tv_sec, total_write_time.tv_usec,
total_falloc_time.tv_sec, total_falloc_time.tv_usec);
fprintf(stdout, "WALL TIME: %li.%06lis\n", wall_elapsed.tv_sec, wall_elapsed.tv_usec);
}
/*
* run_test does the grunt work of setting up the threads, firing them off,
* waiting for them to finish and summing up their results
*/
int run_test(int nthreads, off_t filesize, off_t chunksize, int use_fallocate) {
/* Create a block of dummy data that'll be written to files */
dummyblock = malloc(dummyblock_size);
memset(dummyblock, 'X', dummyblock_size);
struct thread_info threads[nthreads];
pthread_attr_t threadattr;
pthread_attr_init(&threadattr);
/* Init the thread info array */
for ( int i = 0; i < nthreads; i++ ) {
threads[i].nthreads = nthreads;
threads[i].filesize = filesize;
threads[i].chunksize = chunksize;
threads[i].use_fallocate = use_fallocate;
threads[i].threadid = i;
snprintf(&threads[i].filename[0], 7, "t%05i", i);
threads[i].filename[7] = '\0';
}
/* Cleanup any crap left over from prior runs */
if (unlink_files(&threads[0], nthreads)) {
return 1;
}
/* Start the clock */
struct timeval tv_start;
gettimeofday(&tv_start, NULL);
/* Spawn threads */
for ( int i = 0; i < nthreads; i++ ) {
pthread_create(&threads[i].thread, &threadattr, &thread_run, &threads[i]);
}
/* Wait for all threads to exit */
for ( int i = 0; i < nthreads; i++ ) {
pthread_join(threads[i].thread, NULL);
}
struct timeval tv_end;
gettimeofday(&tv_end, NULL);
/* Report on runtimes, frag counts, etc */
struct timeval wall_elapsed = tv_subtract(tv_end, tv_start);
print_test(&threads[0], nthreads, wall_elapsed);
/* tidy up */
if (!no_unlink_on_exit) {
if (unlink_files(&threads[0], nthreads)) {
return 1;
}
}
return 0;
}
/* Print the command synopsis and a description of every argument to stderr. */
void usage() {
    /* Help text that follows the synopsis line, one entry per output line */
    static const char * const help_lines[] = {
        " -u don't unlink files before exit\n",
        " nthreads Number of threads to use\n",
        " filesize Size of file each thread should create\n",
        " chunksize Size of posix_fallocate()'d chunks, if in use\n",
        " ( currently no effect if posix_fallocate = n)\n",
        " use_fallocate [y/n] use posix_fallocate()?\n",
        "\n",
        " filesize and chunksize may be specified with scale suffix K, M or G.\n\n",
    };
    const int nlines = (int) (sizeof help_lines / sizeof help_lines[0]);

    fprintf(stderr, "Usage: %s [-u] nthreads filesize chunksize use_fallocate\n", progname);
    for (int line = 0; line < nlines; line++) {
        fputs(help_lines[line], stderr);
    }
}
/*
 * Test for filefrag utility.
 *
 * Checks that the program named by filefrag_path is readable and executable.
 * If it is not, a notice is printed and filefrag_path is set to NULL, which
 * disables fragmentation measurement.
 */
void setup_frag() {
    if (filefrag_path == NULL) {
        /* Measurement already disabled; nothing to probe */
        return;
    }
    /* Probe the configured path itself so the check and the later popen() cannot disagree */
    if (access(filefrag_path, R_OK|X_OK)) {
        fprintf(stderr, "%s from e2fsprogs not found or inaccessible: %s\n", filefrag_path, strerror(errno));
        fprintf(stderr, "Fragmentation will not be measured.\n");
        filefrag_path = NULL;
    }
}
/*
 * the main function will process arguments and invoke run_test
 *
 * Command line: [-u] nthreads filesize chunksize use_fallocate
 * (see usage() for the meaning of each argument).
 *
 * Returns 1 on any argument error, otherwise the result of run_test().
 */
int main(int argc, char * argv[]) {
    /* Output files are named "t%05i" in an 8 byte buffer, so ids above this would not fit uniquely */
    enum { max_threads = 99999 };

    progname = argv[0];
    argv++;
    argc--;
    if (argc < 4) {
        usage();
        return 1;
    }
    /* Everything before the last four arguments must be an option */
    while (argc > 4) {
        /* Process option */
        if (strcmp(argv[0], "-u") == 0) {
            /* Supress unlinking of files */
            no_unlink_on_exit = 1;
        } else {
            /* Unrecognised arg */
            fprintf(stderr, "Unknown argument: %s\n", argv[0]);
            usage();
            return 1;
        }
        argc--;
        argv++;
    }

    /* Parse into a long first so out-of-range values are caught instead of silently truncated to int */
    char * nthreads_end = NULL;
    const long nthreads_arg = strtol(argv[0], &nthreads_end, 10);
    const off_t filesize = getsize(argv[1]);
    const off_t chunksize = getsize(argv[2]);

    int use_fallocate;
    if (argv[3][0] == 'n' || argv[3][0] == 'N') {
        use_fallocate = 0;
    } else if (argv[3][0] == 'y' || argv[3][0] == 'Y') {
        use_fallocate = 1;
    } else {
        fprintf(stderr, "use_fallocate must be 'y' or 'n'\n");
        usage();
        return 1;
    }

    if (nthreads_end == argv[0] || nthreads_end[0] != '\0' || nthreads_arg <= 0) {
        fprintf(stderr, "nthreads not a number or <= 0. Must have one or more threads.\n");
        usage();
        return 1;
    }
    if (nthreads_arg > max_threads) {
        fprintf(stderr, "nthreads too large. At most %i threads are supported.\n", (int) max_threads);
        usage();
        return 1;
    }
    const int nthreads = (int) nthreads_arg;

    /* <= 0 rather than == 0: a negative size is just as invalid as zero */
    if (filesize <= 0) {
        fprintf(stderr, "Filesize was zero or invalid\n");
        usage();
        return 1;
    }
    if (chunksize <= 0) {
        fprintf(stderr, "Chunksize was zero or invalid\n");
        usage();
        return 1;
    }
    if (chunksize > filesize) {
        fprintf(stderr, "Chunksize > filesize not allowed\n");
        usage();
        return 1;
    }

    setup_frag();

    /* off_t is not necessarily long long, so cast to match the %lli conversions */
    fprintf(stderr, "Starting run with %i threads each writing %lli bytes in %lli byte chunks %susing posix_fallocate\n",
            nthreads, (long long) filesize, (long long) chunksize, use_fallocate ? "" : "NOT ");
    return run_test(nthreads, filesize, chunksize, use_fallocate);
}
#include "util.h"
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
/* Compute `time1' minus `time2'.
   The result is clamped: when `time1' is not later than `time2' the
   returned interval is zero rather than negative.
   from tv_util.c by Alex Measday */
struct timeval tv_subtract(struct timeval time1, struct timeval time2) {
    struct timeval diff;
    const int time1_is_later =
        (time1.tv_sec > time2.tv_sec) ||
        (time1.tv_sec == time2.tv_sec && time1.tv_usec > time2.tv_usec);

    if (!time1_is_later) {
        /* TIME1 <= TIME2: nothing left after subtraction */
        diff.tv_sec = 0;
        diff.tv_usec = 0;
        return diff;
    }

    diff.tv_sec = time1.tv_sec - time2.tv_sec;
    if (time1.tv_usec < time2.tv_usec) {
        /* Borrow a second to keep the microsecond part non-negative */
        diff.tv_sec -= 1;
        diff.tv_usec = time1.tv_usec + 1000000L - time2.tv_usec;
    } else {
        diff.tv_usec = time1.tv_usec - time2.tv_usec;
    }
    return diff;
}
/* Return the sum of `time1' and `time2', carrying at most one second
   over from the microsecond field.
   from tv_util.c by Alex Measday */
struct timeval tv_add(struct timeval time1, struct timeval time2) {
    struct timeval sum = {
        .tv_sec = time1.tv_sec + time2.tv_sec,
        .tv_usec = time1.tv_usec + time2.tv_usec,
    };

    if (sum.tv_usec >= 1000000L) {
        /* Carry into the seconds field */
        sum.tv_usec -= 1000000L;
        sum.tv_sec += 1;
    }
    return sum;
}
/* Reset `*time' to a zero-length interval (0 seconds, 0 microseconds). */
void tv_zero(struct timeval * time) {
    *time = (struct timeval){ .tv_sec = 0, .tv_usec = 0 };
}
/*
 * Parse a size such as "4096", "900M" or "1g" into a byte count.
 *
 * The string is a positive decimal number optionally followed by exactly one
 * scale suffix: K, M or G (either case), meaning 1024, 1024^2 or 1024^3.
 *
 * Returns the size in bytes, or 0 if the string is NULL, empty, not a number,
 * zero, negative, carries an unknown suffix or trailing characters, or does
 * not fit in off_t. Callers treat a 0 return as "invalid".
 */
off_t getsize(const char * sizestr) {
    if (sizestr == NULL) {
        return 0;
    }

    /* strtoll rather than strtol: off_t may be 64 bits wide where long is only 32 */
    char * endptr = NULL;
    errno = 0;
    long long value = strtoll(sizestr, &endptr, 10);
    if (endptr == sizestr || errno == ERANGE || value <= 0) {
        // No digits read, out of range, or not a positive size.
        return 0;
    }

    long long scale = 1;
    switch (endptr[0]) {
        case '\0':
            // The input was an unadorned number.
            break;
        case 'G':
        case 'g':
            scale = 1024LL * 1024LL * 1024LL;
            endptr++;
            break;
        case 'M':
        case 'm':
            scale = 1024LL * 1024LL;
            endptr++;
            break;
        case 'K':
        case 'k':
            scale = 1024LL;
            endptr++;
            break;
        default:
            // Anything else is invalid
            return 0;
    }
    if (endptr[0] != '\0') {
        // Trailing characters after the suffix, e.g. "10Mx"
        return 0;
    }

    /* Check before multiplying: signed overflow is undefined behaviour */
    if (value > LLONG_MAX / scale) {
        return 0;
    }
    value *= scale;

    /* Make sure the result survives the conversion to off_t */
    const off_t sz = (off_t) value;
    if ((long long) sz != value) {
        return 0;
    }
    return sz;
}
/* Shared helpers for the fallocate test tools: timeval arithmetic, size parsing and min/max. */
#ifndef UTIL_H
#define UTIL_H

#include <sys/types.h>
#include <sys/time.h>

/* Return time1 - time2, or a zero interval when time1 is not later than time2. */
struct timeval tv_subtract(struct timeval time1, struct timeval time2);

/* Return time1 + time2. */
struct timeval tv_add(struct timeval time1, struct timeval time2);

/* Set both fields of *time to zero. */
void tv_zero(struct timeval * time);

/* Parse a decimal size with an optional K, M or G suffix (powers of 1024).
 * Returns the size in bytes, or 0 if sizestr cannot be parsed. */
off_t getsize(const char * sizestr);

/* ARGH no template functions in C, use ugly gcc hack safe macros.
 * These rely on the GNU statement-expression and __typeof__ extensions so
 * that each argument is evaluated exactly once. */
#define max(a,b) \
    ({ __typeof__ (a) _a = (a); \
       __typeof__ (b) _b = (b); \
       _a > _b ? _a : _b; })
#define min(a,b) \
    ({ __typeof__ (a) _a = (a); \
       __typeof__ (b) _b = (b); \
       _a < _b ? _a : _b; })

#endif /* UTIL_H */
------------------------------------------------------------------------------
Download Intel® Parallel Studio Eval
Try the new software tools for yourself. Speed compiling, find bugs
proactively, and fine-tune applications for parallel performance.
See why Intel Parallel Studio got high marks during beta.
http://p.sf.net/sfu/intel-sw-dev
_______________________________________________
Bacula-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/bacula-devel