[To admin: this message was posted earlier via Google Groups. Needless to say, it stalled waiting for approval; please ignore that one. Thanks.]
Hi, Yesterday in #pgsql, I was talking with neilc about determining the random_page_cost (RPC) value in a more concrete way. So I created a program that compares exhaustive (all blocks are eventually read) random reads with sequential reads. The full source is attached. I tested the db files residing on a software RAID-1 composed of 2 IDE 7200rpm drives on Linux 2.6.12. What I discovered is that the documentation: <quote> random_page_cost (floating point) Sets the planner's estimate of the cost of a nonsequentially fetched disk page. This is measured as a multiple of the cost of a sequential page fetch. A higher value makes it more likely a sequential scan will be used, a lower value makes it more likely an index scan will be used. The default is four. </quote> is not precise enough. Which pages? Those that belong to the database file, or sequential pages on the media? On databases smaller than 500MB (size calculated from du <dbase_dir>), I got a ratio (random over sequential time) of 4.5:1. A 3.0GB database has a ratio of 10:1. On a 3GB contiguous file, the ratio is about 4:1. If, in fact, the pages meant in the quotation are pages occupied by the database files, then does that mean the RPC config should be changed over time to reflect the varying ratio (which I guess is due to file fragmentation)? If that's the case, isn't the RPC config actually a per-database config rather than a per-cluster config? Thanks, YS (gnome)
/*
 * Determine the random_page_cost; don't do O_DIRECT since pgsql also
 * doesn't do O_DIRECT.
 *
 * Every regular file below <path> is read twice: once block by block in a
 * shuffled order and once front to back. The ratio of the two total elapsed
 * times is printed as "RPC".
 */

/* Ask for a 64-bit off_t so files larger than 2 GB can be opened and seeked
 * on platforms where off_t would otherwise be 32 bits wide. */
#ifndef _FILE_OFFSET_BITS
#define _FILE_OFFSET_BITS 64
#endif

#include <errno.h>
#include <limits.h>
#include <fcntl.h>
#include <ftw.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

enum {
    DEFAULT_BLOCK_SIZE = 8192,
    /* Progress is refreshed each time this many more bytes have been read. */
    SEQUENTIAL_PROGRESS_STEP = 100 * 1024 * 1024,
    RANDOM_PROGRESS_STEP = 1024 * 1024,
    /* Directory-descriptor budget handed to ftw(). */
    FTW_MAX_OPEN_DIRS = 1000
};

static int VERBOSE = 0;

/*
 * Return the current wall-clock time in seconds (with microsecond
 * resolution). Exits the program if gettimeofday() fails.
 */
double curr_epoch()
{
    struct timeval tv;

    if (gettimeofday(&tv, NULL) != 0) {
        perror("gettimeofday");
        exit(1);
    }
    /* Convert in floating point: multiplying tv_sec by 1e6 in integer
     * arithmetic overflows where time_t/long is 32 bits wide. */
    return (double) tv.tv_sec + (double) tv.tv_usec / (1000.0 * 1000.0);
}

/* Print "done" as a percentage of "total" on the current terminal line. */
static void print_progress(off_t done, off_t total)
{
    double percent = 100.0;

    if (total > 0) {
        percent = 100.0 * (double) done / (double) total;
    }
    printf("\r%0.2f%%", percent);
    fflush(stdout);
}

/*
 * Read fd from offset 0 until end of file in block_size chunks.
 *
 * Returns the elapsed wall-clock time in seconds, or -1 after printing a
 * diagnostic if lseek() or read() fails.
 */
double strategy_sequential_read(int fd, off_t length, int block_size, char *buffer)
{
    off_t total_read = 0;
    off_t next_report = SEQUENTIAL_PROGRESS_STEP;
    double start_time;

    if (lseek(fd, 0, SEEK_SET) != 0) {
        perror("lseek");
        return -1;
    }

    start_time = curr_epoch();
    for (;;) {
        ssize_t read_code = read(fd, buffer, (size_t) block_size);

        if (read_code == 0) {
            break;              /* end of file */
        }
        if (read_code == -1) {
            perror("read");
            return -1;
        }
        total_read += read_code;
        if (VERBOSE && total_read >= next_report) {
            print_progress(total_read, length);
            next_report += SEQUENTIAL_PROGRESS_STEP;
        }
    }
    return curr_epoch() - start_time;
}

/* inclusive min, exclusive max */
int rand_between(int min, int max)
{
    /* Scale by the width of the range, not by max itself, so that a
     * non-zero min still yields values inside [min, max). */
    double span = (double) max - (double) min;

    return min + (int) (span * rand() / (RAND_MAX + 1.0));
}

/*
 * Return a malloc()ed array holding the integers 0..length-1 in shuffled
 * order, or NULL (after printing a diagnostic) on failure. The caller owns
 * the array and must free() it.
 *
 * max file size that can be serviced is INT_MAX * block_size
 */
int *random_shuffling(int length)
{
    int *array;
    int i;

    if (length < 0) {
        errno = EINVAL;
        perror("random_shuffling");
        return NULL;
    }

    /* Always request at least one element: malloc(0) may legitimately
     * return NULL, which would be mistaken for an allocation failure. */
    array = malloc((size_t) (length > 0 ? length : 1) * sizeof *array);
    if (array == NULL) {
        perror("malloc");
        return NULL;
    }

    for (i = 0; i < length; i++) {
        array[i] = i;
    }
    /* Fisher-Yates: j is drawn from [0, i] inclusive. Drawing from [0, i)
     * would only ever produce cyclic permutations. */
    for (i = length - 1; i >= 1; i--) {
        int j = rand_between(0, i + 1);
        int tmp = array[i];

        array[i] = array[j];
        array[j] = tmp;
    }
    return array;
}

/*
 * Read every block_size-sized block of fd exactly once, in shuffled order.
 *
 * Returns the elapsed wall-clock time in seconds, or -1 after printing a
 * diagnostic on failure.
 */
double strategy_random_read(int fd, off_t length, int block_size, char *buffer)
{
    int *read_sequence = NULL;
    double elapsed_time = -1;
    double start_time;
    off_t block_total;
    off_t total_read = 0;
    off_t next_report = RANDOM_PROGRESS_STEP;
    int block_count;
    int i;

    if (block_size <= 0 || length < 0) {
        fprintf(stderr, "Invalid block size or file length\n");
        return -1;
    }

    /* Round up so a trailing partial block is included, but do not add a
     * block when length is an exact multiple of block_size: that block
     * would sit at end of file and always read zero bytes. */
    block_total = length / block_size + (length % block_size != 0 ? 1 : 0);
    if (block_total > INT_MAX) {
        fprintf(stderr,
                "Cannot do random read on file larger than %lld bytes. Please increase block_size.\n",
                (long long) INT_MAX * block_size);
        return -1;
    }
    block_count = (int) block_total;

    if ((read_sequence = random_shuffling(block_count)) == NULL) {
        fprintf(stderr, "Cannot shuffle read order\n");
        return -1;
    }

    start_time = curr_epoch();
    for (i = 0; i < block_count; i++) {
        /* Widen before multiplying; int * int overflows past 2 GB. */
        off_t offset = (off_t) read_sequence[i] * block_size;
        ssize_t read_code;

        if (lseek(fd, offset, SEEK_SET) != offset) {
            perror("lseek");
            goto cleanup;
        }
        read_code = read(fd, buffer, (size_t) block_size);
        if (read_code == -1) {
            perror("read");
            goto cleanup;
        }
        /* A zero-byte read means this block now lies past end of file
         * (the file shrank). Skip it; the remaining blocks are still
         * valid, so the pass must not stop here. */
        total_read += read_code;
        if (VERBOSE && total_read >= next_report) {
            print_progress(total_read, length);
            next_report += RANDOM_PROGRESS_STEP;
        }
    }
    elapsed_time = curr_epoch() - start_time;

cleanup:
    free(read_sequence);
    return elapsed_time;
}

/*
 * Open path read-only, determine its length and run read_strategy over it
 * with a freshly allocated block_size-byte buffer.
 *
 * Returns what read_strategy returned (elapsed seconds), or -1 if the file
 * could not be prepared or the strategy failed.
 */
double time_read_file(const char *path, int block_size,
                      double (*read_strategy)(int fd, off_t length, int block_size, char *buffer))
{
    int fd = -1;
    char *buffer = NULL;
    off_t length;
    char *reason = NULL;
    double elapsed = -1;

    if (block_size <= 0) {
        fprintf(stderr, "Block size must be positive\n");
        return -1;
    }
    if (!(buffer = malloc((size_t) block_size))) {
        reason = "malloc";
        goto error;
    }
    if ((fd = open(path, O_RDONLY)) == -1) {
        reason = "open";
        goto error;
    }
    if ((length = lseek(fd, 0, SEEK_END)) == (off_t) -1) {
        reason = "lseek";
        goto error;
    }
    if ((elapsed = read_strategy(fd, length, block_size, buffer)) == -1) {
        fprintf(stderr, "Unable to complete reading test\n");
    }
    goto cleanup;

error:
    perror(reason);
cleanup:
    if (fd != -1) {
        close(fd);
    }
    free(buffer);
    return elapsed;
}

/*
 * Sequentially read flush_file_path so that its contents displace whatever
 * the previous pass left in the OS cache. Returns 0 on success, 1 on failure.
 */
int clear_buffer(char *flush_file_path)
{
    double elapsed;

    printf("Clearing buffer...\n");
    elapsed = time_read_file(flush_file_path, DEFAULT_BLOCK_SIZE, strategy_sequential_read);
    if (elapsed < 0) {
        return 1;
    }
    printf("Cleared\n");
    return 0;
}

/* Settings and running total shared between main() and the ftw() callback. */
static struct {
    char *path;
    int block_size;
    char *read_strategy_name;
    double (*read_strategy)(int fd, off_t length, int block_size, char *buffer);
    double total_elapsed_time;
} run_env;

/*
 * ftw() callback: time one regular file with the active strategy and add the
 * result to run_env.total_elapsed_time. A non-zero return stops the walk.
 */
int test_read_file(const char *path, const struct stat *sb, int flag)
{
    double elapsed_time;

    /* sb is only meaningful when stat() succeeded; ftw() reports that case
     * for non-directories as FTW_F. */
    if (flag != FTW_F || !S_ISREG(sb->st_mode)) {
        return 0;
    }

    if (VERBOSE) {
        printf("Reading: %s\n", path);
    }
    elapsed_time = time_read_file(path, run_env.block_size, run_env.read_strategy);
    if (elapsed_time < 0) {
        return 1;
    }
    run_env.total_elapsed_time += elapsed_time;
    return 0;
}

/*
 * Walk run_env.path with the active strategy and print the total elapsed
 * time, which is left in run_env.total_elapsed_time.
 */
void test_read_directory()
{
    int walk_status;

    printf("Using %s strategy\n", run_env.read_strategy_name);
    run_env.total_elapsed_time = 0;

    walk_status = ftw(run_env.path, test_read_file, FTW_MAX_OPEN_DIRS);
    if (walk_status == -1) {
        perror("ftw");
    } else if (walk_status != 0) {
        fprintf(stderr, "Directory walk aborted; elapsed time is incomplete\n");
    }
    printf("Elapsed time: %lf seconds\n", run_env.total_elapsed_time);
}

/* Print the command-line help to stderr. */
static void print_usage(const char *program)
{
    fprintf(stderr, "Usage: %s [-b block_size_in_bytes] [-c cache_clear_file_path] [-v] <path>\n", program);
    fprintf(stderr, "Block_size_in_bytes defaults to 8192 bytes\n");
    fprintf(stderr, "To minimise the effect of caching between runs, cache_clear_file will be read\n");
    fprintf(stderr, "Please create a cache_clear file that is about the same size as your RAM\n");
}

/* Parse text as a positive int; returns 0 and stores it in *out, or -1. */
static int parse_block_size(const char *text, int *out)
{
    char *end = NULL;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE || value <= 0 || value > INT_MAX) {
        return -1;
    }
    *out = (int) value;
    return 0;
}

int main(int argc, char **argv)
{
    double seq_time = 0;
    double ran_time = 0;
    char *flush_file_path = NULL;
    int i;

    run_env.path = NULL;
    run_env.block_size = DEFAULT_BLOCK_SIZE;

    for (i = 1; i < argc; i++) {
        if (strcmp("-b", argv[i]) == 0) {
            /* The option value must exist and be a positive integer. */
            if (++i >= argc || parse_block_size(argv[i], &run_env.block_size) != 0) {
                fprintf(stderr, "Option -b requires a positive block size in bytes\n");
                print_usage(argv[0]);
                exit(1);
            }
        } else if (strcmp("-c", argv[i]) == 0) {
            if (++i >= argc) {
                fprintf(stderr, "Option -c requires a file path\n");
                print_usage(argv[0]);
                exit(1);
            }
            flush_file_path = argv[i];
        } else if (strcmp("-v", argv[i]) == 0) {
            VERBOSE = 1;
        } else {
            run_env.path = argv[i];
        }
    }

    if (run_env.path == NULL) {
        print_usage(argv[0]);
        exit(1);
    }

    printf("Path: %s\nBlock size: %d bytes\nFlush file path: %s\nVERBOSE: %s\n",
           run_env.path, run_env.block_size,
           flush_file_path ? flush_file_path : "NONE",
           VERBOSE ? "true" : "false");

    if (flush_file_path) {
        if (clear_buffer(flush_file_path) != 0) {
            fprintf(stderr, "Unable to flush buffer\n");
            exit(1);
        }
    }
    run_env.read_strategy_name = "RANDOM";
    run_env.read_strategy = strategy_random_read;
    test_read_directory();
    ran_time = run_env.total_elapsed_time;

    if (flush_file_path) {
        if (clear_buffer(flush_file_path) != 0) {
            fprintf(stderr, "Unable to flush buffer\n");
            exit(1);
        }
    }
    run_env.read_strategy_name = "SEQUENTIAL";
    run_env.read_strategy = strategy_sequential_read;
    test_read_directory();
    seq_time = run_env.total_elapsed_time;

    /* Without a positive sequential time the ratio is meaningless. */
    if (seq_time <= 0) {
        fprintf(stderr, "Sequential read time is zero; cannot compute RPC\n");
        return 1;
    }
    printf("RPC=%lf\n", ran_time / seq_time);
    return 0;
}
---------------------------(end of broadcast)--------------------------- TIP 5: don't forget to increase your free space map settings