If the problem is not inlining, then why does the same code in C++ does not suffer from this performance limitation (when compiled with Visual C++ 2008)?

#include <stdio.h>
#include <assert.h>
#include <stdlib.h>

template <class T>
T min(T a, T b) { return a < b ? a : b; }
template <class T>
T max(T a, T b) { return a > b ? a : b; }

inline void *operator new(size_t, void *ptr) { return ptr; }
inline void operator delete(void *, void *) {}

template <class T>
class Array
{
public:

 Array() {}
 Array(int length) { resize(length); }
 Array(const Array<T> &a) { append(a); }

 void clear() { base.clear(); }

 void resize(int sz)
 {
   assert(sz >= 0);
   if(sz == 0) return clear();
   if(!base.array) reserve(sz, sz);
   *base.lengthPtr() = sz;
 }

 void reserve(int capacity, int length)
 {
   assert(length <= capacity);
   if(capacity == 0) return clear();
   ArrayBase old;
   if(base.array)
   {
     if(base.capacity() >= capacity) return;
     old.array = base.array;
     base.array = 0;
   }
   base.array = (char*)(new char[capacity*sizeof(T)+8]);
   *base.lengthPtr() = length;
   *base.capacityPtr() = capacity;
   for(int i = 0; i < capacity; i++) new(base.data()+i)T;
   if(old.array) copyData(old);
 }

 int length() const { return base.length(); }
 int capacity() const { return base.capacity(); }

 bool empty() const { return base.array == 0; }

 T &at(int i)
 {
  assert(!empty());
   assert(i >= 0 && i < length());
   return base.data()[i];
 }

 const T &at(int i) const
 {
   assert(!empty());
   assert(i >= 0 && i < length());
   return base.data()[i];
 }

 T &operator [](int i) { return at(i); }
 const T &operator [](int i) const { return at(i); }

 void operator = (const Array &array)
 {
   copy(array);
 }

 void copy(const Array &array)
 {
   if(array.empty()) return clear();
   int len = array.length();
   reserve(len, len);
   for(int i = 0; i < len; i++) at(i) = array[i];
 }

 T &front() { return at(0); }
 const T &front() const { return at(0); }
 T &back() { return at(length()-1); }
 const T &back() const { return at(length()-1); }

 T &append()
 {
   int sz = length();
   int sp = capacity();
   if(sp > sz) (*base.lengthPtr())++;
   else
   {
     sz++;
     sp = max(max(2, sp+sp/2), sz);
     reserve(sp, sz);
   }
   return back();
 }

 void append(const Array<T> &a)
 {
   int prevLen = length();
   resize(prevLen + a.length());
   for(int i = 0; i < a.length(); i++) at(i+prevLen) = a[i];
 }

private:

 class ArrayBase
 {
 public:

   ArrayBase() { array = 0; }
   ~ArrayBase() { clear(); }
   void clear() { if(array) delete array; }

   int length() const { return array ? *lengthPtr() : 0; }
   int capacity() const { return array ? *capacityPtr() : 0; }
   int* lengthPtr() const { return (int*)array; }
   int* capacityPtr() const { return (int*)(array+4); }
   T* data() const { return (T*)(array+8); }

   char* array;
 };

 ArrayBase base;

 void copyData(ArrayBase &array)
 {
   int copyLen = min(length(), array.length());
   for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
 }
};

template <class T>
struct Less
{
 bool operator() (T a, T b) { return a < b; }
};

template <class T>
void swap(T &a, T &b)
{
     T temp = a;
     a = b;
     b = temp;
}

template <class T, class L>
void insertionSort1(Array<T> &a, int low, int high, L less)
{
 for(int i = low; i <= high; i++)
 {
   int min = i;
   for(int j = i + 1; j <= high; j++)
     if(less(a[j], a[min])) min = j;
   swap(a[i], a[min]);
 }
}

template<class T, class L>
int partition1(Array<T> &a, int p, int r, L less)
{
 T x = a[r];
 int j = p - 1;
 for (int i = p; i < r; i++)
 {
   if (less(x, a[i])) continue;
   swap(a[i], a[++j]);
 }
 a[r] = a[j + 1];
 a[j + 1] = x;
 return j + 1;
}

template<class T, class L>
void quickSort1(Array<T> &a, int p, int r, L less)
{
 if(p + 7 > r) return insertionSort1(a, p, r, less);
 if (p < r)
 {
   int q = partition1(a, p, r, less);
   quickSort1(a, p, q - 1, less);
   quickSort1(a, q + 1, r, less);
 }
}

template <class T, class L>
void sort1(Array<T> &a) { quickSort1(a, 0, a.length()-1, Less<T>()); }

template <class T, class L>
void insertionSort2(T *a, int low, int high, L less)
{
 for(int i = low; i <= high; i++)
 {
   int min = i;
   for(int j = i + 1; j <= high; j++)
     if(less(a[j], a[min])) min = j;
   swap(a[i], a[min]);
 }
}

template<class T, class L>
int partition2(T *a, int p, int r, L less)
{
 T x = a[r];
 int j = p - 1;
 for (int i = p; i < r; i++)
 {
   if (less(x, a[i])) continue;
   swap(a[i], a[++j]);
 }
 a[r] = a[j + 1];
 a[j + 1] = x;
 return j + 1;
}

template<class T, class L>
void quickSort2(T *a, int p, int r, L less)
{
 if(p + 7 > r) return insertionSort2(a, p, r, less);
 if (p < r)
 {
   int q = partition2(a, p, r, less);
   quickSort2(a, p, q - 1, less);
   quickSort2(a, q + 1, r, less);
 }
}

template <class T, class L>
void sort2(Array<T> &a) { quickSort2(&a[0], 0, a.length()-1, Less<T>()); }

typedef unsigned long long uint64;

uint64 getCycle() {
   __asm { RDTSC }
}

Array<double> vals;

uint64 bench1()
{
     uint64 startTime = getCycle();
 Array<double> v;
 for(int i = 0; i < 100; i++)
 {
   v = vals;
   sort1<double, Less<double> >(v);
 }
     return getCycle()-startTime;
}

uint64 bench2()
{
     uint64 startTime = getCycle();
 Array<double> v;
 for(int i = 0; i < 100; i++)
 {
   v = vals;
   sort2<double, Less<double> >(v);
 }
     return getCycle()-startTime;
}

void main()
{
 vals.resize(1000);
 for(int i = 0; i < 1000; i++) vals[i] = (rand() % 1000000) / 1000.0;

 uint64 time1 = 0;
 uint64 time2 = 0;

 for(int i = 0; i < 100; i++)
 {
   time1 += bench1();
   time2 += bench2();
 }
 printf("Sorting with Array: %f\n", time1/100000.0);
 printf("Sorting with pointers: %f\n", time2/100000.0);
 printf("%f percent faster\n", 100.0 * (time1-time2) / time1);
}


Reply via email to