On Mon, Nov 12, 2012 at 12:09:08PM -0500, Bruce Momjian wrote: > OK, I have had some time to think about this. What the current code > does is, for each database, get a directory listing to know about any > vm, fsm, and >1gig extents that exist in the directory. It caches the > directory listing and does full array scans looking for matches. If the > tablespace changes, it creates a new directory cache and throws away the > old one. This code certainly needs improvement! > > I can think of two solutions. The first would be to scan the database > directory, and any tablespaces used by the database, sort it, then allow > binary search of the directory listing looking for file prefixes that > match the current relation. > > The second approach would be to simply try to copy the fsm, vm, and > extent files, and ignore any ENOEXIST errors. This allows code > simplification. The downside is that it doesn't pull all files with > matching prefixes --- it requires pg_upgrade to _know_ what suffixes > might exist in that directory. Second, it assumes there can be no > number gaps in the file extent numbering (is that safe?). > > I need recommendations on which direction to persue; this would only be > for 9.3.
I went with the second idea, patch attached. Here are the times: ---------- 9.2 ---------- ------------ 9.3 -------- -- normal -- -- bin-up -- -- normal -- -- bin-up -- pg_upgrade dump rest dump rest dump rest dump rest git patch 1 0.12 0.06 0.12 0.06 0.11 0.07 0.11 0.07 11.11 11.02 1000 7.22 2.40 4.74 2.78 2.20 2.43 4.04 2.86 19.60 19.25 2000 5.67 5.10 8.82 5.57 4.50 4.97 8.07 5.69 30.55 26.67 4000 13.34 11.13 25.16 12.52 8.95 11.24 16.75 12.16 60.70 52.31 8000 29.12 25.98 59.60 28.08 16.68 24.02 30.63 27.08 123.05 102.78 16000 87.36 53.16 189.38 62.72 31.38 55.37 61.55 62.66 365.71 286.00 You can see a significant speedup with those loops removed. The 16k case is improved, but still not linear. The 16k dump/restore scale looks fine, so it must be something in pg_upgrade, or in the kernel. -- Bruce Momjian <br...@momjian.us> http://momjian.us EnterpriseDB http://enterprisedb.com + It's impossible for everything to be true. +
diff --git a/contrib/pg_upgrade/relfilenode.c b/contrib/pg_upgrade/relfilenode.c new file mode 100644 index 33a867f..0a49a23 *** a/contrib/pg_upgrade/relfilenode.c --- b/contrib/pg_upgrade/relfilenode.c *************** *** 17,25 **** static void transfer_single_new_db(pageCnvCtx *pageConverter, FileNameMap *maps, int size); ! static void transfer_relfile(pageCnvCtx *pageConverter, ! const char *fromfile, const char *tofile, ! const char *nspname, const char *relname); /* --- 17,24 ---- static void transfer_single_new_db(pageCnvCtx *pageConverter, FileNameMap *maps, int size); ! static int transfer_relfile(pageCnvCtx *pageConverter, FileNameMap *map, ! const char *suffix); /* *************** static void *** 131,185 **** transfer_single_new_db(pageCnvCtx *pageConverter, FileNameMap *maps, int size) { - char old_dir[MAXPGPATH]; - char file_pattern[MAXPGPATH]; - char **namelist = NULL; - int numFiles = 0; int mapnum; ! int fileno; ! bool vm_crashsafe_change = false; ! ! old_dir[0] = '\0'; ! /* Do not copy non-crashsafe vm files for binaries that assume crashsafety */ if (old_cluster.controldata.cat_ver < VISIBILITY_MAP_CRASHSAFE_CAT_VER && new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER) ! vm_crashsafe_change = true; for (mapnum = 0; mapnum < size; mapnum++) { ! char old_file[MAXPGPATH]; ! char new_file[MAXPGPATH]; ! ! /* Changed tablespaces? Need a new directory scan? */ ! if (strcmp(maps[mapnum].old_dir, old_dir) != 0) ! { ! if (numFiles > 0) ! { ! for (fileno = 0; fileno < numFiles; fileno++) ! pg_free(namelist[fileno]); ! pg_free(namelist); ! } ! ! snprintf(old_dir, sizeof(old_dir), "%s", maps[mapnum].old_dir); ! numFiles = load_directory(old_dir, &namelist); ! } ! ! /* Copying files might take some time, so give feedback. */ ! ! snprintf(old_file, sizeof(old_file), "%s/%u", maps[mapnum].old_dir, ! maps[mapnum].old_relfilenode); ! snprintf(new_file, sizeof(new_file), "%s/%u", maps[mapnum].new_dir, ! maps[mapnum].new_relfilenode); ! pg_log(PG_REPORT, OVERWRITE_MESSAGE, old_file); ! ! /* ! * Copy/link the relation's primary file (segment 0 of main fork) ! * to the new cluster ! */ ! unlink(new_file); ! transfer_relfile(pageConverter, old_file, new_file, ! maps[mapnum].nspname, maps[mapnum].relname); /* fsm/vm files added in PG 8.4 */ if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804) --- 130,148 ---- transfer_single_new_db(pageCnvCtx *pageConverter, FileNameMap *maps, int size) { int mapnum; ! bool vm_crashsafe_match = true; ! int segno; ! /* Do not copy non-crashsafe vm files for binaries that assume crashsafety */ if (old_cluster.controldata.cat_ver < VISIBILITY_MAP_CRASHSAFE_CAT_VER && new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER) ! vm_crashsafe_match = false; for (mapnum = 0; mapnum < size; mapnum++) { ! /* transfer primary file */ ! transfer_relfile(pageConverter, &maps[mapnum], ""); /* fsm/vm files added in PG 8.4 */ if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804) *************** transfer_single_new_db(pageCnvCtx *pageC *** 187,253 **** /* * Copy/link any fsm and vm files, if they exist */ ! snprintf(file_pattern, sizeof(file_pattern), "%u_", ! maps[mapnum].old_relfilenode); ! ! for (fileno = 0; fileno < numFiles; fileno++) ! { ! char *vm_offset = strstr(namelist[fileno], "_vm"); ! bool is_vm_file = false; ! ! /* Is a visibility map file? (name ends with _vm) */ ! if (vm_offset && strlen(vm_offset) == strlen("_vm")) ! is_vm_file = true; ! ! if (strncmp(namelist[fileno], file_pattern, ! strlen(file_pattern)) == 0 && ! (!is_vm_file || !vm_crashsafe_change)) ! { ! snprintf(old_file, sizeof(old_file), "%s/%s", maps[mapnum].old_dir, ! namelist[fileno]); ! snprintf(new_file, sizeof(new_file), "%s/%u%s", maps[mapnum].new_dir, ! maps[mapnum].new_relfilenode, strchr(namelist[fileno], '_')); ! ! unlink(new_file); ! transfer_relfile(pageConverter, old_file, new_file, ! maps[mapnum].nspname, maps[mapnum].relname); ! } ! } } /* * Now copy/link any related segments as well. Remember, PG breaks * large files into 1GB segments, the first segment has no extension, * subsequent segments are named relfilenode.1, relfilenode.2, ! * relfilenode.3, ... 'fsm' and 'vm' files use underscores so are not * copied. */ ! snprintf(file_pattern, sizeof(file_pattern), "%u.", ! maps[mapnum].old_relfilenode); ! ! for (fileno = 0; fileno < numFiles; fileno++) ! { ! if (strncmp(namelist[fileno], file_pattern, ! strlen(file_pattern)) == 0) ! { ! snprintf(old_file, sizeof(old_file), "%s/%s", maps[mapnum].old_dir, ! namelist[fileno]); ! snprintf(new_file, sizeof(new_file), "%s/%u%s", maps[mapnum].new_dir, ! maps[mapnum].new_relfilenode, strchr(namelist[fileno], '.')); ! unlink(new_file); ! transfer_relfile(pageConverter, old_file, new_file, ! maps[mapnum].nspname, maps[mapnum].relname); ! } } } - - if (numFiles > 0) - { - for (fileno = 0; fileno < numFiles; fileno++) - pg_free(namelist[fileno]); - pg_free(namelist); - } } --- 150,176 ---- /* * Copy/link any fsm and vm files, if they exist */ ! transfer_relfile(pageConverter, &maps[mapnum], "_fsm"); ! if (vm_crashsafe_match) ! transfer_relfile(pageConverter, &maps[mapnum], "_vm"); } /* * Now copy/link any related segments as well. Remember, PG breaks * large files into 1GB segments, the first segment has no extension, * subsequent segments are named relfilenode.1, relfilenode.2, ! * relfilenode.3. * copied. */ ! for (segno = 1;; segno++) ! { ! char suffix[65]; ! snprintf(suffix, sizeof(suffix), ".%d", segno); ! if (transfer_relfile(pageConverter, &maps[mapnum], suffix) != 0) ! break; } } } *************** transfer_single_new_db(pageCnvCtx *pageC *** 256,266 **** * * Copy or link file from old cluster to new one. */ ! static void ! transfer_relfile(pageCnvCtx *pageConverter, const char *old_file, ! const char *new_file, const char *nspname, const char *relname) { const char *msg; if ((user_opts.transfer_mode == TRANSFER_MODE_LINK) && (pageConverter != NULL)) pg_log(PG_FATAL, "This upgrade requires page-by-page conversion, " --- 179,213 ---- * * Copy or link file from old cluster to new one. */ ! static int ! transfer_relfile(pageCnvCtx *pageConverter, FileNameMap *map, ! const char *suffix) { const char *msg; + char old_file[MAXPGPATH]; + char new_file[MAXPGPATH]; + int fd; + + snprintf(old_file, sizeof(old_file), "%s/%u%s", map->old_dir, + map->old_relfilenode, suffix); + snprintf(new_file, sizeof(new_file), "%s/%u%s", map->new_dir, + map->new_relfilenode, suffix); + + /* file does not exist? */ + if ((fd = open(old_file, O_RDONLY)) == -1) + { + if (errno == ENOENT) + return -1; + else + pg_log(PG_FATAL, "non-existant file error while copying relation \"%s.%s\" (\"%s\" to \"%s\")\n", + map->nspname, map->relname, old_file, new_file); + } + close(fd); + + unlink(new_file); + + /* Copying files might take some time, so give feedback. */ + pg_log(PG_REPORT, OVERWRITE_MESSAGE, old_file); if ((user_opts.transfer_mode == TRANSFER_MODE_LINK) && (pageConverter != NULL)) pg_log(PG_FATAL, "This upgrade requires page-by-page conversion, " *************** transfer_relfile(pageCnvCtx *pageConvert *** 272,278 **** if ((msg = copyAndUpdateFile(pageConverter, old_file, new_file, true)) != NULL) pg_log(PG_FATAL, "error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", ! nspname, relname, old_file, new_file, msg); } else { --- 219,225 ---- if ((msg = copyAndUpdateFile(pageConverter, old_file, new_file, true)) != NULL) pg_log(PG_FATAL, "error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", ! map->nspname, map->relname, old_file, new_file, msg); } else { *************** transfer_relfile(pageCnvCtx *pageConvert *** 281,287 **** if ((msg = linkAndUpdateFile(pageConverter, old_file, new_file)) != NULL) pg_log(PG_FATAL, "error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", ! nspname, relname, old_file, new_file, msg); } ! return; } --- 228,234 ---- if ((msg = linkAndUpdateFile(pageConverter, old_file, new_file)) != NULL) pg_log(PG_FATAL, "error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", ! map->nspname, map->relname, old_file, new_file, msg); } ! return 0; }
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers