[To admin: this message was posted earlier via google group. needless
to say, it was stalled waiting for approval, please ignore that
one. Thanks.]

Hi,

Yesterday in #pgsql, I was talking with neilc about determining the
random_page_cost (RPC) value in a more concrete way. So I created a
program that compares exhaustive (all blocks are eventually read)
random reads with sequential reads. The full source is attached.

I tested the db files residing on a software RAID-1 composed of 2 IDE
7200rpm drives on linux 2.6.12.

What I discovered is: 

<quote> 
random_page_cost (floating point)

    Sets the planner's estimate of the cost of a nonsequentially
    fetched disk page. This is measured as a multiple of the cost of a
    sequential page fetch. A higher value makes it more likely a
    sequential scan will be used, a lower value makes it more likely
    an index scan will be used. The default is four. 
</quote>

is not precise enough. Which pages? Those that belong to the dbase
file or sequential pages on the media?

On dbases smaller than 500M (size calculated with du <dbase_dir>), I
got a ratio (random over sequential time) of 4.5:1. A 3.0GB dbase has
a ratio of 10:1. On a 3GB contiguous file, the ratio is about 4:1.

If, in fact, the pages meant in the quotation are pages occupied by
the dbase files, then does that mean the RPC config should be changed
over time to reflect the varying ratio (which I guess is due to file
fragmentation)? If that's the case, isn't RPC config actually a
per-database config rather than a per-cluster config?

Thanks,
YS (gnome)

/* Determine random_page_cost empirically.
   O_DIRECT is not used, since pgsql doesn't use O_DIRECT either. */

#include <errno.h>
#include <limits.h>
#include <fcntl.h>
#include <ftw.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

/* Non-zero enables per-file and progress output; main() sets it to 1 for -v. */
static int VERBOSE = 0;


/*
 * Return the current wall-clock time in seconds since the Unix epoch,
 * including the microsecond fraction reported by gettimeofday().
 *
 * The seconds and microseconds are combined in floating point. Doing the
 * multiplication tv_sec * 1000000 in integer arithmetic would overflow on
 * platforms where tv_sec is a 32-bit type.
 *
 * If gettimeofday() fails the error is printed and the process exits with
 * status 1, so a returned value is always a valid timestamp.
 */
double
curr_epoch(void)
{
  struct timeval tv;

  if (gettimeofday(&tv, NULL) != 0) {
    perror("gettimeofday");
    exit(1);
  }
  return (double) tv.tv_sec + (double) tv.tv_usec / (1000.0 * 1000.0);
}




/*
 * Time a front-to-back read of an already opened file.
 *
 * fd         - descriptor open for reading; it is rewound to offset 0 first.
 * length     - file size in bytes; used only to compute the progress
 *              percentage shown in verbose mode.
 * block_size - number of bytes requested from each read() call.
 * buffer     - scratch space able to hold block_size bytes.
 *
 * Returns the elapsed wall-clock time in seconds, or -1 after reporting the
 * failing call with perror().
 */
double
strategy_sequential_read(int fd, off_t length, int block_size, char *buffer)
{
  const off_t report_step = 100 * 1024 * 1024;  /* progress every 100 MiB */
  off_t next_report = report_step;
  off_t total_read = 0;
  double start_time;

  /* Any result other than 0 is a failure, including the (off_t)-1 error value. */
  if (lseek(fd, 0, SEEK_SET) != 0) {
    perror("lseek");
    return -1;
  }

  start_time = curr_epoch();
  for (;;) {
    ssize_t read_code = read(fd, buffer, block_size);

    if (read_code == 0) {
      break;                      /* end of file */
    }
    if (read_code == -1) {
      perror("read");
      return -1;
    }
    total_read += read_code;

    /* Report only when a new 100 MiB boundary has been crossed, not on
       every block, and express the position as a real percentage. */
    if (VERBOSE && length > 0 && total_read >= next_report) {
      printf("\r%0.2lf%%", 100.0 * total_read / length);
      fflush(stdout);             /* no newline, so force the line out */
      next_report = total_read + report_step;
    }
  }
  return curr_epoch() - start_time;
}



/*
 * Return a pseudo-random integer in [min, max): min inclusive, max exclusive.
 *
 * rand() is scaled by the width of the interval (max - min). Scaling by max
 * alone is only correct when min is 0 and otherwise produces values at or
 * above max. If the interval is empty (max <= min), min is returned.
 */
int
rand_between(int min, int max)
{
  double span = (double) max - (double) min;

  if (span <= 0) {
    return min;
  }
  /* RAND_MAX + 1.0 keeps the quotient strictly below 1, so max is excluded. */
  return min + (int) (span * rand() / (RAND_MAX + 1.0));
}


/*
 * Build an array holding each of 0 .. length-1 exactly once, in shuffled
 * order.
 *
 * This is a Fisher-Yates shuffle: position i is swapped with an index drawn
 * from [0, i], i itself included. Drawing from [0, i) instead forces every
 * element to move and can only produce cyclic permutations (a length of 2
 * would always come back as {1, 0}).
 *
 * Returns a malloc()ed array that the caller must free(), or NULL after
 * printing a diagnostic if length is negative or the allocation fails.
 * Because block indices are ints, the largest file a caller can cover is
 * INT_MAX * block_size bytes.
 */
int *
random_shuffling(int length)
{
  int *array;
  size_t count;
  int i;

  if (length < 0) {
    errno = EINVAL;
    perror("random_shuffling");
    return NULL;
  }

  /* Ask for at least one element so that an empty sequence still yields a
     valid pointer rather than an implementation-defined malloc(0) result. */
  count = (length > 0) ? (size_t) length : 1;
  array = malloc(count * sizeof *array);
  if (array == NULL) {
    perror("malloc");
    return NULL;
  }

  for (i = 0; i < length; i++) {
    array[i] = i;
  }
  for (i = length - 1; i >= 1; i--) {
    int j = rand_between(0, i + 1);   /* upper bound is exclusive, so i is reachable */
    int tmp = array[i];

    array[i] = array[j];
    array[j] = tmp;
  }
  return array;
}


/*
 * Time reading every block of an already opened file exactly once, in a
 * shuffled order.
 *
 * fd         - descriptor open for reading.
 * length     - file size in bytes; determines how many blocks are visited.
 * block_size - bytes per block; must be positive.
 * buffer     - scratch space able to hold block_size bytes.
 *
 * Returns the elapsed wall-clock time in seconds (0 for an empty file), or
 * -1 after printing a diagnostic.
 */
double
strategy_random_read(int fd, off_t length, int block_size, char *buffer)
{
  const off_t report_step = 1024 * 1024;   /* progress every 1 MiB */
  off_t next_report = report_step;
  off_t total_read = 0;
  off_t blocks;
  int block_count;
  int *read_sequence = NULL;
  double elapsed_time = -1;
  double start_time;
  int i;

  if (block_size <= 0) {
    fprintf(stderr, "Invalid block size: %d\n", block_size);
    return -1;
  }

  /* Round up so a trailing partial block is covered, but do not add a block
     when length is an exact multiple of block_size: that block would start
     at end-of-file and read nothing. */
  blocks = length / block_size + (length % block_size != 0 ? 1 : 0);
  if (blocks > INT_MAX) {
    fprintf(stderr, "Cannot do random read on file larger than %lld bytes. Please increase block_size.\n",
            (long long) INT_MAX * block_size);
    return -1;
  }
  block_count = (int) blocks;
  if (block_count == 0) {
    return 0.0;                            /* empty file: nothing to time */
  }

  read_sequence = random_shuffling(block_count);
  if (read_sequence == NULL) {
    fprintf(stderr, "Cannot shuffle read order\n");
    return -1;
  }

  start_time = curr_epoch();
  for (i = 0; i < block_count; i++) {
    /* Widen before multiplying; int * int overflows past 2 GiB. */
    off_t offset = (off_t) read_sequence[i] * block_size;
    ssize_t read_code;

    if (lseek(fd, offset, SEEK_SET) != offset) {
      perror("lseek");
      goto cleanup;
    }
    read_code = read(fd, buffer, block_size);
    if (read_code == -1) {
      perror("read");
      goto cleanup;
    }
    /* A zero-length read means this block is at or past end-of-file (for
       example the file shrank); keep going so the remaining blocks in the
       shuffled order are still read. */
    total_read += read_code;

    if (VERBOSE && length > 0 && total_read >= next_report) {
      printf("%lf%%\r", 100.0 * total_read / length);
      fflush(stdout);                      /* no newline, so force the line out */
      next_report = total_read + report_step;
    }
  }
  elapsed_time = curr_epoch() - start_time;

 cleanup:
  free(read_sequence);
  return elapsed_time;
}


/*
 * Open the file at path, determine its size, and pass it to read_strategy
 * for timing.
 *
 * path          - file to open read-only.
 * block_size    - size of the scratch buffer allocated here and forwarded
 *                 to the strategy.
 * read_strategy - callback that performs the reads and returns its timing.
 *
 * Returns whatever read_strategy returns. Returns -1 if the buffer cannot be
 * allocated, the file cannot be opened, or its size cannot be determined;
 * in those cases the failing call is reported with perror(). The descriptor
 * and the buffer are always released before returning.
 */
double
time_read_file(const char *path, int block_size, double (*read_strategy)(int fd, off_t length, int block_size, char *buffer))
{
  double result = -1;
  const char *failed_call = NULL;
  char *scratch;
  int handle = -1;
  off_t file_size;

  /* Each step runs only if the previous one succeeded. */
  scratch = malloc(block_size);
  if (scratch == NULL) {
    failed_call = "malloc";
  } else if ((handle = open(path, O_RDONLY)) == -1) {
    failed_call = "open";
  } else if ((file_size = lseek(handle, 0, SEEK_END)) == (off_t)-1) {
    failed_call = "lseek";
  } else {
    result = read_strategy(handle, file_size, block_size, scratch);
    if (result == -1) {
      fprintf(stderr, "Unable to complete reading test\n");
    }
  }

  /* Report before releasing anything so errno still belongs to the failure. */
  if (failed_call != NULL) {
    perror(failed_call);
  }
  if (handle != -1) {
    close(handle);
  }
  free(scratch);
  return result;
}

/*
 * Read the file at flush_file_path sequentially in 8192-byte blocks. main()
 * does this between test runs to reduce the effect of caching.
 *
 * Returns 0 when the read completed and 1 when it failed.
 */
int
clear_buffer(char *flush_file_path)
{
  printf("Clearing buffer...\n");
  if (time_read_file(flush_file_path, 8192, strategy_sequential_read) < 0) {
    return 1;
  }
  printf("Cleared\n");
  return 0;
}


/*
 * Settings and running total for one test pass. File-scope state is used
 * because the ftw() callback test_read_file() receives no user argument.
 */
static struct {
  char *path;                 /* root of the tree to walk; set from the command line in main() */
  int block_size;             /* bytes per read; main() defaults it to 8192 */
  char *read_strategy_name;   /* label printed by test_read_directory() */
  /* read routine handed to time_read_file() for every regular file */
  double (*read_strategy)(int fd, off_t length, int block_size, char *buffer);
  double total_elapsed_time;  /* seconds accumulated by test_read_file() during the current walk */
} run_env;

/*
 * ftw() callback: time one regular file with the strategy selected in
 * run_env and add the result to run_env.total_elapsed_time.
 *
 * path - pathname of the visited object.
 * sb   - stat information for it; undefined when flag is FTW_NS.
 * flag - object classification supplied by ftw().
 *
 * Returns 0 to continue the walk. Returns 1 to stop it when timing a file
 * fails. Objects that are not regular files, or that could not be stat()ed,
 * are skipped.
 */
int
test_read_file(const char *path, const struct stat *sb, int flag)
{
  double elapsed_time;

  /* FTW_NS means stat() failed, so *sb must not be inspected. */
  if (flag == FTW_NS || !S_ISREG(sb->st_mode)) {
    return 0;
  }

  if (VERBOSE) {
    printf("Reading: %s\n", path);
  }
  elapsed_time = time_read_file(path, run_env.block_size, run_env.read_strategy);
  if (elapsed_time < 0) {
    return 1;
  }
  run_env.total_elapsed_time += elapsed_time;
  return 0;
}

/*
 * Walk the tree rooted at run_env.path, timing every regular file with the
 * currently selected strategy, and print the accumulated time.
 *
 * The total is left in run_env.total_elapsed_time. If the walk fails or is
 * stopped early by the callback, a warning is written to stderr because the
 * total then covers only part of the tree.
 */
void
test_read_directory(void)
{
  int walk_status;

  printf("Using %s strategy\n", run_env.read_strategy_name);
  run_env.total_elapsed_time = 0;

  walk_status = ftw(run_env.path, test_read_file, 1000);
  if (walk_status == -1) {
    perror("ftw");
  } else if (walk_status != 0) {
    /* Non-zero, non-error result: the callback asked to stop the walk. */
    fprintf(stderr, "Directory walk stopped early; elapsed time covers only part of %s\n",
            run_env.path);
  }

  printf("Elapsed time: %lf seconds\n", run_env.total_elapsed_time);
}

/* Print the command-line synopsis to stderr. */
static void
print_usage(const char *program)
{
  fprintf(stderr, "Usage: %s [-b block_size_in_bytes] [-c cache_clear_file_path] [-v] <path>\n", program);
  fprintf(stderr, "Block_size_in_bytes defaults to 8192 bytes\n");
  fprintf(stderr, "To minimise the effect of caching between runs, cache_clear_file will be read\n");
  fprintf(stderr, "Please create a cache_clear file that is about the same size as your RAM\n");
}

/* Read the cache-clearing file if one was given; exit with status 1 on failure. */
static void
flush_cache_or_exit(char *flush_file_path)
{
  if (flush_file_path != NULL && clear_buffer(flush_file_path) != 0) {
    fprintf(stderr, "Unable to flush buffer\n");
    exit(1);
  }
}

/*
 * Entry point: parse the options, time a random-order pass and then a
 * sequential pass over every regular file under <path>, and print the ratio
 * of the two times as RPC.
 *
 * Options:
 *   -b N     block size in bytes (positive integer, default 8192)
 *   -c FILE  file read before each pass to reduce cache effects
 *   -v       verbose output
 *
 * Returns 0 on success. Exits with status 1 on a usage error or failed
 * cache flush, and returns 1 if the sequential pass took no measurable time.
 */
int
main(int argc, char **argv)
{
  double seq_time = 0;
  double ran_time = 0;
  char *flush_file_path = NULL;
  int i;

  run_env.path = NULL;
  run_env.block_size = 8192;

  for (i = 1; i < argc; i++) {
    if (strcmp("-b", argv[i]) == 0) {
      char *end = NULL;
      long value;

      /* The option needs a value; argv[argc] is NULL and must not be parsed. */
      if (++i >= argc) {
        fprintf(stderr, "Option -b requires a block size\n");
        print_usage(argv[0]);
        exit(1);
      }
      errno = 0;
      value = strtol(argv[i], &end, 10);
      if (errno != 0 || end == argv[i] || *end != '\0' || value <= 0 || value > INT_MAX) {
        fprintf(stderr, "Invalid block size: %s\n", argv[i]);
        exit(1);
      }
      run_env.block_size = (int) value;
    } else if (strcmp("-c", argv[i]) == 0) {
      if (++i >= argc) {
        fprintf(stderr, "Option -c requires a file path\n");
        print_usage(argv[0]);
        exit(1);
      }
      flush_file_path = argv[i];
    } else if (strcmp("-v", argv[i]) == 0) {
      VERBOSE = 1;
    } else {
      run_env.path = argv[i];
    }
  }

  if (run_env.path == NULL) {
    print_usage(argv[0]);
    exit(1);
  }

  printf("Path: %s\nBlock size: %d bytes\nFlush file path: %s\nVERBOSE: %s\n",
         run_env.path, run_env.block_size,
         flush_file_path ? flush_file_path : "NONE", VERBOSE ? "true" : "false");

  flush_cache_or_exit(flush_file_path);
  run_env.read_strategy_name = "RANDOM";
  run_env.read_strategy = strategy_random_read;
  test_read_directory();
  ran_time = run_env.total_elapsed_time;

  flush_cache_or_exit(flush_file_path);
  run_env.read_strategy_name = "SEQUENTIAL";
  run_env.read_strategy = strategy_sequential_read;
  test_read_directory();
  seq_time = run_env.total_elapsed_time;

  /* Without a positive sequential time the ratio is meaningless (inf or NaN). */
  if (seq_time <= 0) {
    fprintf(stderr, "Sequential read time is zero; cannot compute RPC\n");
    return 1;
  }
  printf("RPC=%lf\n", ran_time / seq_time);
  return 0;
}
---------------------------(end of broadcast)---------------------------
TIP 5: don't forget to increase your free space map settings

Reply via email to