*** a/src/backend/storage/page/bufpage.c
--- b/src/backend/storage/page/bufpage.c
***************
*** 944,980 **** PageSetChecksumInplace(Page page, BlockNumber blkno)
   * Note that if the checksum validation fails we cannot tell the difference
   * between a transposed block and failure from direct on-block corruption,
   * though that is better than just ignoring transposed blocks altogether.
   */
  static uint16
  PageCalcChecksum16(Page page, BlockNumber blkno)
  {
! 	pg_crc32    		crc;
! 	PageHeader	p = (PageHeader) page;
  
  	/* only calculate the checksum for properly-initialized pages */
  	Assert(!PageIsNew(page));
  
! 	INIT_CRC32(crc);
  
! 	/*
! 	 * Initialize the checksum calculation with the block number. This helps
! 	 * catch corruption from whole blocks being transposed with other whole
! 	 * blocks.
! 	 */
! 	COMP_CRC32(crc, &blkno, sizeof(blkno));
  
! 	/*
! 	 * Now add in the LSN, which is always the first field on the page.
! 	 */
! 	COMP_CRC32(crc, page, sizeof(p->pd_lsn));
  
! 	/*
! 	 * Now add the rest of the page, skipping the pd_checksum field.
! 	 */
! 	COMP_CRC32(crc, page + sizeof(p->pd_lsn) + sizeof(p->pd_checksum),
! 				  BLCKSZ - sizeof(p->pd_lsn) - sizeof(p->pd_checksum));
  
! 	FIN_CRC32(crc);
  
! 	return (uint16) crc;
  }
--- 944,1211 ----
   * Note that if the checksum validation fails we cannot tell the difference
   * between a transposed block and failure from direct on-block corruption,
   * though that is better than just ignoring transposed blocks altogether.
+  *
+  * The checksum algorithm is designed to be parallelizable on vector-capable
+  * CPUs. The checksum is calculated in three phases. The first phase
+  * aggregates 64 16-bit sums with the evolution function:
+  *
+  *     partial_sum(-1,i) = 0
+  *     partial_sum(n,i) = partial_sum(n-1,i) * prime1 + ptr16Page[i+64*n]
+  *
+  * The second phase aggregates the partial sums together using a similar
+  * evolution function:
+  *
+  *     parallel_sum(-1) = 0
+  *     parallel_sum(i) = parallel_sum(i-1) * prime2 + partial_sum(i)
+  *
+  * The third phase mixes together the parallel sum and the block number,
+  * then narrows the output range with a modulo and adds one to avoid zero
+  * values. The final checksum is
+  * calculated according to the formula:
+  *
+  *     checksum = (parallel_sum * prime1 + blkno * prime2) mod trunc + 1
+  *
+  * The values of the primes were chosen empirically; the exact value of
+  * prime 1 does not matter much, but prime 2 needs to be large to ensure
+  * fast mixing.
   */
+ 
+ #define N_SUMS 64
+ #define CSUM_PRIME1 0x49
+ #define CSUM_PRIME2 0x986b
+ #define CSUM_TRUNC 65521
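+ 
+ /*
+  * Illustrative sketch only, not part of the build: a direct scalar
+  * transcription of the three phases described above. It ignores the special
+  * treatment of the pd_checksum field, which the real implementations below
+  * zero out before summing. The helper name and the guard macro
+  * PG_CHECKSUM_REFERENCE_SKETCH are hypothetical.
+  */
+ #ifdef PG_CHECKSUM_REFERENCE_SKETCH
+ static uint16
+ checksum_reference(const uint16 *ptr16Page, BlockNumber blkno)
+ {
+ 	uint16		partial_sum[N_SUMS] = {0};
+ 	uint16		parallel_sum = 0;
+ 	int			n,
+ 				i;
+ 
+ 	/* phase 1: partial_sum(n,i) = partial_sum(n-1,i)*prime1 + ptr16Page[i+64*n] */
+ 	for (n = 0; n < BLCKSZ / sizeof(uint16) / N_SUMS; n++)
+ 		for (i = 0; i < N_SUMS; i++)
+ 			partial_sum[i] = partial_sum[i] * CSUM_PRIME1 + ptr16Page[i + N_SUMS * n];
+ 
+ 	/* phase 2: parallel_sum(i) = parallel_sum(i-1)*prime2 + partial_sum(i) */
+ 	for (i = 0; i < N_SUMS; i++)
+ 		parallel_sum = parallel_sum * CSUM_PRIME2 + partial_sum[i];
+ 
+ 	/* phase 3: mix in the block number and add one to avoid zero values */
+ 	return (uint16) (((uint32) parallel_sum * CSUM_PRIME1 +
+ 					  blkno * CSUM_PRIME2) % CSUM_TRUNC + 1);
+ }
+ #endif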
+ 
+ #if defined(__GNUC__) || defined(__INTEL_COMPILER)
+ #if defined(__x86_64__)
+ /*
+  * For x86-64 we use vectorized assembly code to speed up the algorithm. The
+  * sums are calculated in parallel using vectors of eight 16-bit values. The
+  * inner loop is fully unrolled, and the sums are held in vector registers to
+  * pipeline multiplication latency and eliminate load-store overhead. The
+  * aggregation phase reorganizes the computation, first multiplying each value
+  * by its corresponding power of prime 2 and then adding up the vector
+  * registers in a tree configuration. Only SSE2 instructions are used, so we
+  * don't need to check for processor capabilities.
+  */
+ #define HAS_PLATFORM_CHECKSUM
+ 
+ /*
+  * Initialize helper vectors. The array contains four 8x16bit vectors:
+  *  1. Prime 1 broadcast to a full vector
+  *  2. Prime 2 powers from 7..0
+  *  3. Prime 2 powers from 39..32
+  *  4. Prime 2 power 8 broadcast to a full vector
+  * Aligned to 64 bytes because we want the whole array to be on a single
+  * cache line.
+  */
+ #define CSUM_MUL(a,b) ((uint16) ((uint64)(a) * (uint64)(b)))
+ #define CSUM_PRIME2_POW2 CSUM_MUL(CSUM_PRIME2, CSUM_PRIME2)
+ #define CSUM_PRIME2_POW3 CSUM_MUL(CSUM_PRIME2_POW2, CSUM_PRIME2)
+ #define CSUM_PRIME2_POW4 CSUM_MUL(CSUM_PRIME2_POW2, CSUM_PRIME2_POW2)
+ #define CSUM_PRIME2_POW5 CSUM_MUL(CSUM_PRIME2_POW4, CSUM_PRIME2)
+ #define CSUM_PRIME2_POW6 CSUM_MUL(CSUM_PRIME2_POW4, CSUM_PRIME2_POW2)
+ #define CSUM_PRIME2_POW7 CSUM_MUL(CSUM_PRIME2_POW4, CSUM_PRIME2_POW3)
+ #define CSUM_PRIME2_POW8 CSUM_MUL(CSUM_PRIME2_POW4, CSUM_PRIME2_POW4)
+ #define CSUM_PRIME2_POW32 CSUM_MUL(CSUM_PRIME2_POW8, \
+ 	                      CSUM_MUL(CSUM_PRIME2_POW8, \
+ 	                      CSUM_MUL(CSUM_PRIME2_POW8, \
+ 	                               CSUM_PRIME2_POW8)))
+ 
+ static uint16 primeVectors[32] __attribute__ ((aligned (64))) =
+ {
+ 	CSUM_PRIME1, CSUM_PRIME1, CSUM_PRIME1, CSUM_PRIME1,
+ 	CSUM_PRIME1, CSUM_PRIME1, CSUM_PRIME1, CSUM_PRIME1,
+ 
+ 	CSUM_PRIME2_POW7, CSUM_PRIME2_POW6, CSUM_PRIME2_POW5, CSUM_PRIME2_POW4,
+ 	CSUM_PRIME2_POW3, CSUM_PRIME2_POW2, CSUM_PRIME2,      1,
+ 
+ 	CSUM_MUL(CSUM_PRIME2_POW7, CSUM_PRIME2_POW32),
+ 	CSUM_MUL(CSUM_PRIME2_POW6, CSUM_PRIME2_POW32),
+ 	CSUM_MUL(CSUM_PRIME2_POW5, CSUM_PRIME2_POW32),
+ 	CSUM_MUL(CSUM_PRIME2_POW4, CSUM_PRIME2_POW32),
+ 	CSUM_MUL(CSUM_PRIME2_POW3, CSUM_PRIME2_POW32),
+ 	CSUM_MUL(CSUM_PRIME2_POW2, CSUM_PRIME2_POW32),
+ 	CSUM_MUL(CSUM_PRIME2,      CSUM_PRIME2_POW32),
+ 	CSUM_PRIME2_POW32,
+ 
+ 	CSUM_PRIME2_POW8, CSUM_PRIME2_POW8, CSUM_PRIME2_POW8, CSUM_PRIME2_POW8,
+ 	CSUM_PRIME2_POW8, CSUM_PRIME2_POW8, CSUM_PRIME2_POW8, CSUM_PRIME2_POW8
+ };
+ 
  static uint16
  PageCalcChecksum16(Page page, BlockNumber blkno)
  {
! 	/* Parallel sum is 32-bit because we can't copy out only 16 bits from xmm0 */
! 	uint32 parallel_sum;
! 	uint16 checksum;
  
  	/* only calculate the checksum for properly-initialized pages */
  	Assert(!PageIsNew(page));
+ 	/* assembly code assumes that the checksum is at offset 8 */
+ 	Assert(offsetof(PageHeaderData, pd_checksum) == 8);
+ 	/* assembly code assumes we aggregate 64 sums in parallel */
+ 	Assert(N_SUMS == 64);
  
! 	__asm__ __volatile__(
! 		/* rdx is the iteration step; we aggregate 128 bytes per iteration */
! 		"	mov    $0x80, %%rdx				\n"
! 		/* rcx is the offset on the page */
! 		"	xor    %%rcx, %%rcx				\n"
  
! 		/*
! 		 * Registers xmm0..7 keep the intermediate parallel checksums. We
! 		 * initialize them with data from the page, zeroing out the checksum.
! 		 */
! 		"	movdqu (%1,%%rcx,1), %%xmm0		\n"
! 		"	pinsrw $0x4, %%ecx, %%xmm0		\n"
! 		"	movdqu 0x10(%1,%%rcx,1), %%xmm1	\n"
! 		"	movdqu 0x20(%1,%%rcx,1), %%xmm2	\n"
! 		"	movdqu 0x30(%1,%%rcx,1), %%xmm3	\n"
! 		"	movdqu 0x40(%1,%%rcx,1), %%xmm4	\n"
! 		"	movdqu 0x50(%1,%%rcx,1), %%xmm5	\n"
! 		"	movdqu 0x60(%1,%%rcx,1), %%xmm6	\n"
! 		"	movdqu 0x70(%1,%%rcx,1), %%xmm7	\n"
! 		/*
! 		 * Update the offset value. We use 32-bit registers here for a shorter
! 		 * instruction so the setup code length aligns with 16 bytes and the
! 		 * loop alignment below doesn't cause too much space overhead.
! 		 */
! 		"	mov    %%edx, %%ecx				\n"
  
! 		/* xmm9 contains prime 1 broadcasted to all positions */
! 		"	movdqa (%2), %%xmm9				\n"
  
! 		/*
! 		 * Main loop: calculate the partial sums in parallel. Each iteration
! 		 * multiplies the state by prime 1 and adds in 128 bytes from the
! 		 * page.
! 		 */
  
! 		"1:									\n"
! 		".align 16							\n"
! 		"	movdqu  (%1,%%rcx,1), %%xmm8	\n"
! 		"	pmullw  %%xmm9, %%xmm0			\n"
! 		"	paddw   %%xmm8, %%xmm0			\n"
! 		"	movdqu  0x10(%1,%%rcx,1), %%xmm8\n"
! 		"	pmullw  %%xmm9, %%xmm1			\n"
! 		"	paddw   %%xmm8, %%xmm1			\n"
! 		"	movdqu  0x20(%1,%%rcx,1), %%xmm8\n"
! 		"	pmullw  %%xmm9, %%xmm2			\n"
! 		"	paddw   %%xmm8, %%xmm2			\n"
! 		"	movdqu  0x30(%1,%%rcx,1), %%xmm8\n"
! 		"	pmullw  %%xmm9, %%xmm3			\n"
! 		"	paddw   %%xmm8, %%xmm3			\n"
! 		"	movdqu  0x40(%1,%%rcx,1), %%xmm8\n"
! 		"	pmullw  %%xmm9, %%xmm4			\n"
! 		"	paddw   %%xmm8, %%xmm4			\n"
! 		"	movdqu  0x50(%1,%%rcx,1), %%xmm8\n"
! 		"	pmullw  %%xmm9, %%xmm5			\n"
! 		"	paddw   %%xmm8, %%xmm5			\n"
! 		"	movdqu  0x60(%1,%%rcx,1), %%xmm8\n"
! 		"	pmullw  %%xmm9, %%xmm6			\n"
! 		"	paddw   %%xmm8, %%xmm6			\n"
! 		"	movdqu  0x70(%1,%%rcx,1), %%xmm8\n"
! 		"	pmullw  %%xmm9, %%xmm7			\n"
! 		"	paddw   %%xmm8, %%xmm7			\n"
! 
! 		/* update offset and check if we have hit page size already */
! 		"	add     %%rdx, %%rcx			\n"
! 		"	cmp %3, %%ecx					\n"
! 		"	jnz 1b							\n"
  
! 		/*
! 		 * Aggregation phase. We store prime 2 to the powers of 7..0 in xmm10,
! 		 * to the powers of 39..32 in xmm11 and to the power of 8 in xmm8. We
! 		 * change the order of operations so that we first multiply each
! 		 * partial checksum with the power that it has in the final value
! 		 * (powers go from 63..0) and then add them together. This code is
! 		 * structured to minimize dependency graph depth. The critical chain
! 		 * has 4 multiplies and 5 adds. The final value ends up in xmm0.
! 		 */
! 		"	movdqa 0x10(%2), %%xmm10		\n"
! 		"	movdqa 0x20(%2), %%xmm11		\n"
! 		"	movdqa 0x30(%2), %%xmm8			\n"
! 
! 		"	pmullw %%xmm10, %%xmm7			\n"
! 		"	pmullw %%xmm8, %%xmm10			\n"
! 		"	pmullw %%xmm10, %%xmm6			\n"
! 		"	paddw  %%xmm7, %%xmm6			\n"
! 		"	pmullw %%xmm8, %%xmm10			\n"
! 		"	pmullw %%xmm10, %%xmm5			\n"
! 		"	pmullw %%xmm8, %%xmm10			\n"
! 		"	pmullw %%xmm10, %%xmm4			\n"
! 		"	paddw %%xmm5, %%xmm4			\n"
! 		"	pmullw %%xmm11, %%xmm3			\n"
! 		"	pmullw %%xmm8, %%xmm11			\n"
! 		"	pmullw %%xmm11, %%xmm2			\n"
! 		"	paddw %%xmm3, %%xmm2			\n"
! 		"	pmullw %%xmm8, %%xmm11			\n"
! 		"	pmullw %%xmm11, %%xmm1			\n"
! 		"	pmullw %%xmm8, %%xmm11			\n"
! 		"	pmullw %%xmm11, %%xmm0			\n"
! 		"	paddw %%xmm1, %%xmm0			\n"
! 		"	paddw %%xmm6, %%xmm4			\n"
! 		"	paddw %%xmm2, %%xmm0			\n"
! 		"	paddw %%xmm4, %%xmm0			\n"
! 		"	movdqa %%xmm0, %%xmm1			\n"
! 		"	psrldq $0x8, %%xmm1				\n"
! 		"	paddw %%xmm1, %%xmm0			\n"
! 		"	movdqa %%xmm0, %%xmm1			\n"
! 		"	psrldq $0x4, %%xmm1				\n"
! 		"	paddw %%xmm1, %%xmm0			\n"
! 		"	movdqa %%xmm0, %%xmm1			\n"
! 		"	psrldq $0x2, %%xmm1				\n"
! 		"	paddw %%xmm1, %%xmm0			\n"
! 
! 		/* store the parallel sum in the output register */
! 		"	movd %%xmm0, %0					\n"
! 
! :		"=r"(parallel_sum)
! :		"r"(page), "r"(primeVectors), "r"(BLCKSZ)
! :		"rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4",
! 		"xmm5","xmm6","xmm7","xmm8", "xmm9","xmm10","xmm11");
! 
! 	/* mask out only the resulting 16-bit sum */
! 	parallel_sum &= 0xFFFF;
! 
! 	/* mix in the block number and squeeze the range to avoid zero values */
! 	checksum = ((parallel_sum * CSUM_PRIME1 +
! 				 blkno * CSUM_PRIME2) % CSUM_TRUNC) + 1;
! 
! 	return checksum;
  }
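+ 
+ /*
+  * Illustrative sketch only, not part of the build: the inner loop of the
+  * assembly above could be written with SSE2 intrinsics from emmintrin.h
+  * along the following lines (the aggregation phase is omitted). The helper
+  * name and the guard macro PG_CHECKSUM_SSE2_SKETCH are hypothetical; the
+  * assembly remains the actual implementation.
+  */
+ #ifdef PG_CHECKSUM_SSE2_SKETCH
+ #include <emmintrin.h>
+ 
+ static void
+ checksum_loop_sse2(const char *page, __m128i sums[8])
+ {
+ 	__m128i		prime1 = _mm_set1_epi16((int16) CSUM_PRIME1);
+ 	int			i,
+ 				j;
+ 
+ 	/* each iteration folds another 128 bytes into the 64 parallel sums */
+ 	for (i = 1; i < BLCKSZ / 128; i++)
+ 	{
+ 		for (j = 0; j < 8; j++)
+ 		{
+ 			__m128i		in;
+ 
+ 			in = _mm_loadu_si128((const __m128i *) (page + i * 128 + j * 16));
+ 			/* sums[j] = sums[j] * prime1 + in, lane-wise on 16-bit values */
+ 			sums[j] = _mm_add_epi16(_mm_mullo_epi16(sums[j], prime1), in);
+ 		}
+ 	}
+ }
+ #endif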
+ #endif /* __x86_64__ */
+ #endif	/* defined(__GNUC__) || defined(__INTEL_COMPILER) */
+ 
+ #ifndef HAS_PLATFORM_CHECKSUM
+ /*
+  * Generic implementation of the checksum algorithm. The code is structured
+  * so that vectorizing compilers can recognize the aggregation pattern. For
+  * gcc, -funroll-loops and -ftree-vectorize will cause the main loop to be
+  * vectorized.
+  */
+ static uint16
+ PageCalcChecksum16(Page page, BlockNumber blkno)
+ {
+ 	uint16 sums[N_SUMS];
+ 	uint16 (*pageArr)[N_SUMS] = (uint16 (*)[N_SUMS]) page;
+ 	uint16 parallel_sum = 0;
+ 	uint16 checksum;
+ 	int i, j;
+ 
+ 	/* only calculate the checksum for properly-initialized pages */
+ 	Assert(!PageIsNew(page));
+ 
+ 	/* initialize sums from the first chunk, zeroing out the checksum field */
+ 	for (j = 0; j < N_SUMS; j++)
+ 		sums[j] = (j == offsetof(PageHeaderData, pd_checksum) / sizeof(uint16)) ?
+ 			0 : pageArr[0][j];
+ 
+ 	/* main loop: multiply-add the remaining chunks into the partial sums */
+ 	for (i = 1; i < BLCKSZ / sizeof(uint16) / N_SUMS; i++)
+ 		for (j = 0; j < N_SUMS; j++)
+ 			sums[j] = sums[j] * CSUM_PRIME1 + pageArr[i][j];
+ 
+ 	/* fold the partial sums together */
+ 	for (i = 0; i < N_SUMS; i++)
+ 		parallel_sum = parallel_sum * CSUM_PRIME2 + sums[i];
+ 
+ 	/* mix in the block number and squeeze the range to avoid zero values */
+ 	checksum = (((uint32) parallel_sum * CSUM_PRIME1 +
+ 				 blkno * CSUM_PRIME2) % CSUM_TRUNC) + 1;
+ 	return checksum;
+ }
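+ 
+ /*
+  * One illustrative way to confirm that the compiler really vectorized the
+  * loops above is to inspect the generated assembly, e.g. with gcc:
+  *
+  *     gcc -O2 -funroll-loops -ftree-vectorize -S bufpage.c
+  *
+  * and look for packed pmullw/paddw instructions in PageCalcChecksum16.
+  */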
+ 
+ #endif /* !HAS_PLATFORM_CHECKSUM */
