changeset 8700:314be237cd5b

sorting optimizations
author Jaroslav Hajek <highegg@gmail.com>
date Mon, 09 Feb 2009 13:05:35 +0100
parents 6e764b7317bd
children 1652e39b934e
files liboctave/Array-C.cc liboctave/Array-d.cc liboctave/Array-f.cc liboctave/Array-fC.cc liboctave/Array-i.cc liboctave/Array-s.cc liboctave/Array.cc liboctave/Array.h liboctave/ChangeLog liboctave/oct-sort.cc liboctave/oct-sort.h src/ChangeLog src/TEMPLATE-INST/Array-tc.cc
diffstat 13 files changed, 908 insertions(+), 1001 deletions(-) [+]
line wrap: on
line diff
--- a/liboctave/Array-C.cc	Mon Feb 09 01:56:06 2009 -0500
+++ b/liboctave/Array-C.cc	Mon Feb 09 13:05:35 2009 +0100
@@ -31,6 +31,7 @@
 
 #include "Array.h"
 #include "Array.cc"
+#include "oct-sort.cc"
 
 static double
 xabs (const Complex& x)
@@ -38,27 +39,9 @@
   return (xisinf (x.real ()) || xisinf (x.imag ())) ? octave_Inf : abs (x);
 }
 
-static bool
-operator < (const Complex& a, const Complex& b)
-{
-  return (xisnan (b) || (xabs (a) < xabs (b))
-	  || ((xabs (a) == xabs (b)) && (arg (a) < arg (b))));
-}
-
-static bool
-operator > (const Complex& a, const Complex& b)
-{
-  return (xisnan (a) || (xabs (a) > xabs (b))
-	  || ((xabs (a) == xabs (b)) && (arg (a) > arg (b))));
-}
-
-// This file must be included after the < and > operators are
-// defined to avoid errors with the Intel C++ compiler.
-#include "oct-sort.cc"
-
 template <>
 bool
-ascending_compare (Complex a, Complex b)
+octave_sort<Complex>::ascending_compare (Complex a, Complex b)
 {
   return (xisnan (b) || (xabs (a) < xabs (b))
 	  || ((xabs (a) == xabs (b)) && (arg (a) < arg (b))));
@@ -66,32 +49,12 @@
 
 template <>
 bool
-ascending_compare (vec_index<Complex> *a, vec_index<Complex> *b)
-{
-  return (xisnan (b->vec)
-	  || (xabs (a->vec) < xabs (b->vec))
-	  || ((xabs (a->vec) == xabs (b->vec))
-	      && (arg (a->vec) < arg (b->vec))));
-}
-
-template <>
-bool
-descending_compare (Complex a, Complex b)
+octave_sort<Complex>::descending_compare (Complex a, Complex b)
 {
   return (xisnan (a) || (xabs (a) > xabs (b))
 	  || ((xabs (a) == xabs (b)) && (arg (a) > arg (b))));
 }
 
-template <>
-bool
-descending_compare (vec_index<Complex> *a, vec_index<Complex> *b)
-{
-  return (xisnan (a->vec)
-	  || (xabs (a->vec) > xabs (b->vec))
-	  || ((xabs (a->vec) == xabs (b->vec))
-	      && (arg (a->vec) > arg (b->vec))));
-}
-
 INSTANTIATE_ARRAY_SORT (Complex);
 
 INSTANTIATE_ARRAY_AND_ASSIGN (Complex, OCTAVE_API);
--- a/liboctave/Array-d.cc	Mon Feb 09 01:56:06 2009 -0500
+++ b/liboctave/Array-d.cc	Mon Feb 09 13:05:35 2009 +0100
@@ -27,368 +27,23 @@
 
 // Instantiate Arrays of double values.
 
+#include "lo-ieee.h"
 #include "Array.h"
 #include "Array.cc"
-#include "oct-sort.cc"
 #include "oct-locbuf.h"
 
-#if defined (HAVE_IEEE754_DATA_FORMAT)
-
-static inline uint64_t
-FloatFlip (uint64_t f)
-{
-  uint64_t mask
-    = -static_cast<int64_t>(f >> 63) | 0x8000000000000000ULL;
-
-  return f ^ mask;
-}
-
-static inline uint64_t
-IFloatFlip (uint64_t f)
-{
-  uint64_t mask = ((f >> 63) - 1) | 0x8000000000000000ULL;
-
-  return f ^ mask;
-}
-
-template <>
-bool
-ascending_compare (double a, double b)
-{
-  return (xisnan (b) || (a < b));
-}
-
-template <>
-bool
-ascending_compare (const double *a, const double *b)
-{
-  return (xisnan (*b) || (*a < *b));
-}
-
-template <>
-bool
-descending_compare (double a, double b)
-{
-  return (xisnan (a) || (a > b));
-}
-
-template <>
-bool
-descending_compare (const double *a, const double *b)
-{
-  return (xisnan (*b) || (*a > *b));
-}
-
-INSTANTIATE_ARRAY_SORT (uint64_t);
+#define INLINE_ASCENDING_SORT
+#define INLINE_DESCENDING_SORT
+#include "oct-sort.cc"
 
 template <>
-Array<double>
-Array<double>::sort (octave_idx_type dim, sortmode mode) const
+inline bool _sort_isnan (double x)
 {
-  Array<double> m (dims ());
-
-  dim_vector dv = m.dims ();
-
-  if (m.length () < 1)
-    return m;
-
-  octave_idx_type ns = dv(dim);
-  octave_idx_type iter = dv.numel () / ns;
-  octave_idx_type stride = 1;
-  for (int i = 0; i < dim; i++)
-    stride *= dv(i);
-
-  double *v = m.fortran_vec ();
-  const double *ov = data ();
-
-  uint64_t *p = reinterpret_cast<uint64_t *> (v);
-  const uint64_t *op = reinterpret_cast<const uint64_t *> (ov);
-
-  octave_sort<uint64_t> lsort;
-
-  if (mode == ASCENDING)
-    lsort.set_compare (ascending_compare);
-  else if (mode == DESCENDING)
-    lsort.set_compare (descending_compare);
-  else
-    abort ();
-
-  if (stride == 1)
-    {
-      for (octave_idx_type j = 0; j < iter; j++)
-	{
-	  // Flip the data in the vector so that int compares on
-	  // IEEE754 give the correct ordering.
-
-	  for (octave_idx_type i = 0; i < ns; i++)
-	    p[i] = FloatFlip (op[i]);
-	      
-	  lsort.sort (p, ns);
-
-	  // Flip the data out of the vector so that int compares
-	  // on IEEE754 give the correct ordering.
-
-	  for (octave_idx_type i = 0; i < ns; i++)
-	    p[i] = IFloatFlip (p[i]);
-
-	  // There are two representations of NaN.  One will be
-	  // sorted to the beginning of the vector and the other
-	  // to the end.  If it will be sorted incorrectly, fix
-	  // things up.
-
-	  if (lo_ieee_signbit (octave_NaN))
-	    {
-	      if (mode == ASCENDING)
-		{
-		  octave_idx_type i = 0;
-		  double *vtmp = reinterpret_cast<double *> (p);
-		  while (xisnan (vtmp[i++]) && i < ns)
-		    /* do nothing */;
-		  for (octave_idx_type l = 0; l < ns - i + 1; l++)
-		    vtmp[l] = vtmp[l+i-1];
-		  for (octave_idx_type l = ns - i + 1; l < ns; l++)
-		    vtmp[l] = octave_NaN;
-		}
-	      else
-		{
-		  octave_idx_type i = ns;
-		  double *vtmp = reinterpret_cast<double *> (p);
-		  while (xisnan (vtmp[--i]) && i > 0)
-		    /* do nothing */;
-		  for (octave_idx_type l = i; l >= 0; l--)
-		    vtmp[l-i+ns-1] = vtmp[l];
-		  for (octave_idx_type l = 0; l < ns - i - 1; l++)
-		    vtmp[l] = octave_NaN;
-		}
-	    }
-
-	  p += ns;
-          op += ns;
-	}
-    }
-  else
-    {
-      OCTAVE_LOCAL_BUFFER (uint64_t, vi, ns);
-
-      for (octave_idx_type j = 0; j < iter; j++)
-	{
-	  octave_idx_type offset = j;
-	  octave_idx_type offset2 = 0;
-	  while (offset >= stride)
-	    {
-	      offset -= stride;
-	      offset2++;
-	    }
-	  offset += offset2 * stride * ns;
-
-	  // Flip the data in the vector so that int compares on
-	  // IEEE754 give the correct ordering.
-
-	  for (octave_idx_type i = 0; i < ns; i++)
-	    vi[i] = FloatFlip (op[i*stride + offset]);
-
-	  lsort.sort (vi, ns);
-
-	  // Flip the data out of the vector so that int compares
-	  // on IEEE754 give the correct ordering.
-
-	  for (octave_idx_type i = 0; i < ns; i++)
-	    p[i*stride + offset] = IFloatFlip (vi[i]);
-	      
-	  // There are two representations of NaN. One will be
-	  // sorted to the beginning of the vector and the other
-	  // to the end. If it will be sorted to the beginning,
-	  // fix things up.
-
-	  if (lo_ieee_signbit (octave_NaN))
-	    {
-	      if (mode == ASCENDING)
-		{
-		   octave_idx_type i = 0;
-		  while (xisnan (v[i++*stride + offset]) && i < ns)
-		    /* do nothing */;
-		  for (octave_idx_type l = 0; l < ns - i + 1; l++)
-		    v[l*stride + offset] = v[(l+i-1)*stride + offset];
-		  for (octave_idx_type l = ns - i + 1; l < ns; l++)
-		    v[l*stride + offset] = octave_NaN;
-		}
-	      else
-		{
-		   octave_idx_type i = ns;
-		  while (xisnan (v[--i*stride + offset]) && i > 0)
-		    /* do nothing */;
-		  for (octave_idx_type l = i; l >= 0; l--)
-		    v[(l-i+ns-1)*stride + offset] = v[l*stride + offset];
-		  for (octave_idx_type l = 0; l < ns - i - 1; l++)
-		    v[l*stride + offset] = octave_NaN;
-		}
-	    }
-	}
-    }
-
-  return m;
-}
-
-template <>
-Array<double>
-Array<double>::sort (Array<octave_idx_type> &sidx, octave_idx_type dim, 
-		     sortmode mode) const
-{
-  Array<double> m (dims ());
-
-  dim_vector dv = m.dims ();
-
-  if (m.length () < 1)
-    {
-      sidx = Array<octave_idx_type> (dv);
-      return m;
-    }
-
-  octave_idx_type ns = dv(dim);
-  octave_idx_type iter = dv.numel () / ns;
-  octave_idx_type stride = 1;
-  for (int i = 0; i < dim; i++)
-    stride *= dv(i);
-
-  double *v = m.fortran_vec ();
-  const double *ov = data ();
-
-  uint64_t *p = reinterpret_cast<uint64_t *> (v);
-  const uint64_t *op = reinterpret_cast<const uint64_t *> (ov);
-
-  octave_sort<const uint64_t *> indexed_sort;
-
-  if (mode == ASCENDING)
-    indexed_sort.set_compare (ascending_compare);
-  else if (mode == DESCENDING)
-    indexed_sort.set_compare (descending_compare);
-  else
-    abort ();
-
-  OCTAVE_LOCAL_BUFFER (const uint64_t *, vi, ns);
-  OCTAVE_LOCAL_BUFFER (uint64_t, vix, ns);
-  
-  sidx = Array<octave_idx_type> (dv);
-      
-  for (octave_idx_type j = 0; j < iter; j++)
-    {
-      octave_idx_type offset = j;
-      octave_idx_type offset2 = 0;
-      while (offset >= stride)
-	{
-	  offset -= stride;
-	  offset2++;
-	}
-      offset += offset2 * stride * ns;
-
-      // Flip the data in the vector so that int compares on
-      // IEEE754 give the correct ordering.
-
-      for (octave_idx_type i = 0; i < ns; i++)
-	{
-	  vix[i] = FloatFlip (op[i*stride + offset]);
-	  vi[i] = vix + i;
-	}
-
-      indexed_sort.sort (vi, ns);
-
-      // Flip the data out of the vector so that int compares on
-      // IEEE754 give the correct ordering
-
-      for (octave_idx_type i = 0; i < ns; i++)
-	{
-	  p[i*stride + offset] = IFloatFlip (*vi[i]);
-	  sidx(i*stride + offset) = vi[i] - vix;
-	}
-
-      // There are two representations of NaN.  One will be sorted
-      // to the beginning of the vector and the other to the end.
-      // If it will be sorted to the beginning, fix things up.
-
-      if (lo_ieee_signbit (octave_NaN))
-	{
-	  if (mode == ASCENDING)
-	    {
-	      octave_idx_type i = 0;
-	      while (xisnan (v[i++*stride+offset]) && i < ns)
-		/* do nothing */;
-	      OCTAVE_LOCAL_BUFFER (double, itmp, i - 1);
-	      for (octave_idx_type l = 0; l < i -1; l++)
-		itmp[l] = sidx(l*stride + offset);
-	      for (octave_idx_type l = 0; l < ns - i + 1; l++)
-		{
-		  v[l*stride + offset] = v[(l+i-1)*stride + offset];
-		  sidx(l*stride + offset) = sidx((l+i-1)*stride + offset);
-		}
-	      for (octave_idx_type k = 0, l = ns - i + 1; l < ns; l++, k++)
-		{
-		  v[l*stride + offset] = octave_NaN;
-		  sidx(l*stride + offset) = 
-		    static_cast<octave_idx_type>(itmp[k]);
-		}
-	    }
-	  else 
-	    {
-	      octave_idx_type i = ns;
-	      while (xisnan (v[--i*stride+offset]) && i > 0)
-		/* do nothing */;
-	      OCTAVE_LOCAL_BUFFER (double, itmp, ns - i - 1);
-	      for (octave_idx_type l = 0; l < ns - i -1; l++)
-		itmp[l] = sidx((l+i+1)*stride + offset);
-	      for (octave_idx_type l = i; l >= 0; l--)
-		{
-		  v[(l-i+ns-1)*stride + offset] = v[l*stride + offset];
-		  sidx((l-i+ns-1)*stride + offset) = sidx(l*stride + offset);
-		}
-	      for (octave_idx_type k = 0, l = 0; l < ns - i - 1; l++, k++)
-		{
-		  v[l*stride + offset] = octave_NaN;
-		  sidx(l*stride + offset) = 
-		    static_cast<octave_idx_type>(itmp[k]);
-		}
-	    }
-	}
-    }
-
-  return m;
-}
-
-#else
-
-template <>
-bool
-ascending_compare (double a, double b)
-{
-  return (xisnan (b) || (a < b));
-}
-
-template <>
-bool
-ascending_compare (const double *a, 
-		   const double *b)
-{
-  return (xisnan (*b) || (*a < *b));
-}
-
-template <>
-bool
-descending_compare (double a, double b)
-{
-  return (xisnan (a) || (a > b));
-}
-
-template <>
-bool
-descending_compare (const double *a, 
-		    const double *b)
-{
-  return (xisnan (*b) || (*a > *b));
+  return lo_ieee_isnan (x);
 }
 
 INSTANTIATE_ARRAY_SORT (double);
 
-#endif
-
 INSTANTIATE_ARRAY_AND_ASSIGN (double, OCTAVE_API);
 
 INSTANTIATE_ARRAY_ASSIGN (double, int, OCTAVE_API)
--- a/liboctave/Array-f.cc	Mon Feb 09 01:56:06 2009 -0500
+++ b/liboctave/Array-f.cc	Mon Feb 09 13:05:35 2009 +0100
@@ -27,367 +27,22 @@
 
 // Instantiate Arrays of float values.
 
+#include "lo-ieee.h"
 #include "Array.h"
 #include "Array.cc"
-#include "oct-sort.cc"
 #include "oct-locbuf.h"
 
-#if defined (HAVE_IEEE754_DATA_FORMAT)
-
-static inline uint32_t
-FloatFlip (uint32_t f)
-{
-  uint32_t mask
-    = -static_cast<int32_t>(f >> 31) | 0x80000000UL;
-
-  return f ^ mask;
-}
-
-static inline uint32_t
-IFloatFlip (uint32_t f)
-{
-  uint32_t mask = ((f >> 31) - 1) | 0x80000000UL;
-
-  return f ^ mask;
-}
-
-template <>
-bool
-ascending_compare (float a, float b)
-{
-  return (xisnan (b) || (a < b));
-}
-
-template <>
-bool
-ascending_compare (const float *a, const float *b)
-{
-  return (xisnan (*b) || (*a < *b));
-}
-
-template <>
-bool
-descending_compare (float a, float b)
-{
-  return (xisnan (a) || (a > b));
-}
-
-template <>
-bool
-descending_compare (const float *a, const float *b)
-{
-  return (xisnan (*b) || (*a > *b));
-}
-
-INSTANTIATE_ARRAY_SORT (uint32_t);
+#define INLINE_ASCENDING_SORT
+#define INLINE_DESCENDING_SORT
+#include "oct-sort.cc"
 
 template <>
-Array<float>
-Array<float>::sort (octave_idx_type dim, sortmode mode) const
+inline bool _sort_isnan (float x)
 {
-  Array<float> m (dims ());
-
-  dim_vector dv = m.dims ();
-
-  if (m.length () < 1)
-    return m;
-
-  octave_idx_type ns = dv(dim);
-  octave_idx_type iter = dv.numel () / ns;
-  octave_idx_type stride = 1;
-  for (int i = 0; i < dim; i++)
-    stride *= dv(i);
-
-  float *v = m.fortran_vec ();
-  const float *ov = data ();
-
-  uint32_t *p = reinterpret_cast<uint32_t *> (v);
-  const uint32_t *op = reinterpret_cast<const uint32_t *> (ov);
-
-  octave_sort<uint32_t> lsort;
-
-  if (mode == ASCENDING)
-    lsort.set_compare (ascending_compare);
-  else if (mode == DESCENDING)
-    lsort.set_compare (descending_compare);
-  else
-    abort ();
-
-  if (stride == 1)
-    {
-      for (octave_idx_type j = 0; j < iter; j++)
-	{
-	  // Flip the data in the vector so that int compares on
-	  // IEEE754 give the correct ordering.
-
-	  for (octave_idx_type i = 0; i < ns; i++)
-	    p[i] = FloatFlip (op[i]);
-	      
-	  lsort.sort (p, ns);
-
-	  // Flip the data out of the vector so that int compares
-	  // on IEEE754 give the correct ordering.
-
-	  for (octave_idx_type i = 0; i < ns; i++)
-	    p[i] = IFloatFlip (p[i]);
-
-	  // There are two representations of NaN.  One will be
-	  // sorted to the beginning of the vector and the other
-	  // to the end.  If it will be sorted incorrectly, fix
-	  // things up.
-
-	  if (lo_ieee_signbit (octave_Float_NaN))
-	    {
-	      if (mode == ASCENDING)
-		{
-		  octave_idx_type i = 0;
-		  float *vtmp = reinterpret_cast<float *> (p);
-		  while (xisnan (vtmp[i++]) && i < ns)
-		    /* do nothing */;
-		  for (octave_idx_type l = 0; l < ns - i + 1; l++)
-		    vtmp[l] = vtmp[l+i-1];
-		  for (octave_idx_type l = ns - i + 1; l < ns; l++)
-		    vtmp[l] = octave_Float_NaN;
-		}
-	      else
-		{
-		  octave_idx_type i = ns;
-		  float *vtmp = reinterpret_cast<float *> (p);
-		  while (xisnan (vtmp[--i]) && i > 0)
-		    /* do nothing */;
-		  for (octave_idx_type l = i; l >= 0; l--)
-		    vtmp[l-i+ns-1] = vtmp[l];
-		  for (octave_idx_type l = 0; l < ns - i - 1; l++)
-		    vtmp[l] = octave_Float_NaN;
-		}
-	    }
-
-	  p += ns;
-          op += ns;
-	}
-    }
-  else
-    {
-      OCTAVE_LOCAL_BUFFER (uint32_t, vi, ns);
-
-      for (octave_idx_type j = 0; j < iter; j++)
-	{
-	  octave_idx_type offset = j;
-	  octave_idx_type offset2 = 0;
-	  while (offset >= stride)
-	    {
-	      offset -= stride;
-	      offset2++;
-	    }
-	  offset += offset2 * stride * ns;
-
-	  // Flip the data in the vector so that int compares on
-	  // IEEE754 give the correct ordering.
-
-	  for (octave_idx_type i = 0; i < ns; i++)
-	    vi[i] = FloatFlip (op[i*stride + offset]);
-
-	  lsort.sort (vi, ns);
-
-	  // Flip the data out of the vector so that int compares
-	  // on IEEE754 give the correct ordering.
-
-	  for (octave_idx_type i = 0; i < ns; i++)
-	    p[i*stride + offset] = IFloatFlip (vi[i]);
-	      
-	  // There are two representations of NaN. One will be
-	  // sorted to the beginning of the vector and the other
-	  // to the end. If it will be sorted to the beginning,
-	  // fix things up.
-
-	  if (lo_ieee_signbit (octave_Float_NaN))
-	    {
-	      if (mode == ASCENDING)
-		{
-		   octave_idx_type i = 0;
-		  while (xisnan (v[i++*stride + offset]) && i < ns)
-		    /* do nothing */;
-		  for (octave_idx_type l = 0; l < ns - i + 1; l++)
-		    v[l*stride + offset] = v[(l+i-1)*stride + offset];
-		  for (octave_idx_type l = ns - i + 1; l < ns; l++)
-		    v[l*stride + offset] = octave_Float_NaN;
-		}
-	      else
-		{
-		   octave_idx_type i = ns;
-		  while (xisnan (v[--i*stride + offset]) && i > 0)
-		    /* do nothing */;
-		  for (octave_idx_type l = i; l >= 0; l--)
-		    v[(l-i+ns-1)*stride + offset] = v[l*stride + offset];
-		  for (octave_idx_type l = 0; l < ns - i - 1; l++)
-		    v[l*stride + offset] = octave_Float_NaN;
-		}
-	    }
-	}
-    }
-
-  return m;
+  return lo_ieee_isnan (x);
 }
 
-template <>
-Array<float>
-Array<float>::sort (Array<octave_idx_type> &sidx, octave_idx_type dim, 
-		     sortmode mode) const
-{
-  Array<float> m (dims ());
-
-  dim_vector dv = m.dims ();
-
-  if (m.length () < 1)
-    {
-      sidx = Array<octave_idx_type> (dv);
-      return m;
-    }
-
-  octave_idx_type ns = dv(dim);
-  octave_idx_type iter = dv.numel () / ns;
-  octave_idx_type stride = 1;
-  for (int i = 0; i < dim; i++)
-    stride *= dv(i);
-
-  float *v = m.fortran_vec ();
-  const float *ov = data ();
-
-  uint32_t *p = reinterpret_cast<uint32_t *> (v);
-  const uint32_t *op = reinterpret_cast<const uint32_t *> (ov);
-
-  octave_sort<const uint32_t *> indexed_sort;
-
-  if (mode == ASCENDING)
-    indexed_sort.set_compare (ascending_compare);
-  else if (mode == DESCENDING)
-    indexed_sort.set_compare (descending_compare);
-  else
-    abort ();
-
-  OCTAVE_LOCAL_BUFFER (const uint32_t *, vi, ns);
-  OCTAVE_LOCAL_BUFFER (uint32_t, vix, ns);
-  
-  sidx = Array<octave_idx_type> (dv);
-      
-  for (octave_idx_type j = 0; j < iter; j++)
-    {
-      octave_idx_type offset = j;
-      octave_idx_type offset2 = 0;
-      while (offset >= stride)
-	{
-	  offset -= stride;
-	  offset2++;
-	}
-      offset += offset2 * stride * ns;
-
-      // Flip the data in the vector so that int compares on
-      // IEEE754 give the correct ordering.
-
-      for (octave_idx_type i = 0; i < ns; i++)
-	{
-	  vix[i] = FloatFlip (op[i*stride + offset]);
-	  vi[i] = vix + i;
-	}
-
-      indexed_sort.sort (vi, ns);
-
-      // Flip the data out of the vector so that int compares on
-      // IEEE754 give the correct ordering
-
-      for (octave_idx_type i = 0; i < ns; i++)
-	{
-	  p[i*stride + offset] = IFloatFlip (*vi[i]);
-	  sidx(i*stride + offset) = vi[i] - vix;
-	}
-
-      // There are two representations of NaN.  One will be sorted
-      // to the beginning of the vector and the other to the end.
-      // If it will be sorted to the beginning, fix things up.
-
-      if (lo_ieee_signbit (octave_Float_NaN))
-	{
-	  if (mode == ASCENDING)
-	    {
-	      octave_idx_type i = 0;
-	      while (xisnan (v[i++*stride+offset]) && i < ns)
-		/* do nothing */;
-	      OCTAVE_LOCAL_BUFFER (float, itmp, i - 1);
-	      for (octave_idx_type l = 0; l < i -1; l++)
-		itmp[l] = sidx(l*stride + offset);
-	      for (octave_idx_type l = 0; l < ns - i + 1; l++)
-		{
-		  v[l*stride + offset] = v[(l+i-1)*stride + offset];
-		  sidx(l*stride + offset) = sidx((l+i-1)*stride + offset);
-		}
-	      for (octave_idx_type k = 0, l = ns - i + 1; l < ns; l++, k++)
-		{
-		  v[l*stride + offset] = octave_Float_NaN;
-		  sidx(l*stride + offset) = 
-		    static_cast<octave_idx_type>(itmp[k]);
-		}
-	    }
-	  else 
-	    {
-	      octave_idx_type i = ns;
-	      while (xisnan (v[--i*stride+offset]) && i > 0)
-		/* do nothing */;
-	      OCTAVE_LOCAL_BUFFER (float, itmp, ns - i - 1);
-	      for (octave_idx_type l = 0; l < ns - i -1; l++)
-		itmp[l] = sidx((l+i+1)*stride + offset);
-	      for (octave_idx_type l = i; l >= 0; l--)
-		{
-		  v[(l-i+ns-1)*stride + offset] = v[l*stride + offset];
-		  sidx((l-i+ns-1)*stride + offset) = sidx(l*stride + offset);
-		}
-	      for (octave_idx_type k = 0, l = 0; l < ns - i - 1; l++, k++)
-		{
-		  v[l*stride + offset] = octave_Float_NaN;
-		  sidx(l*stride + offset) = 
-		    static_cast<octave_idx_type>(itmp[k]);
-		}
-	    }
-	}
-    }
-
-  return m;
-}
-
-#else
-
-template <>
-bool
-ascending_compare (float a, float b)
-{
-  return (xisnan (b) || (a < b));
-}
-
-template <>
-bool
-ascending_compare (const float *a, 
-		   const float *b)
-{
-  return (xisnan (*b) || (*a < *b));
-}
-
-template <>
-bool
-descending_compare (float a, float b)
-{
-  return (xisnan (a) || (a > b));
-}
-
-template <>
-bool
-descending_compare (const float *a, 
-		    const float *b)
-{
-  return (xisnan (*b) || (*a > *b));
-}
-
-INSTANTIATE_ARRAY_SORT (float);
-
-#endif
+INSTANTIATE_ARRAY_SORT (double);
 
 INSTANTIATE_ARRAY_AND_ASSIGN (float, OCTAVE_API);
 
--- a/liboctave/Array-fC.cc	Mon Feb 09 01:56:06 2009 -0500
+++ b/liboctave/Array-fC.cc	Mon Feb 09 13:05:35 2009 +0100
@@ -31,6 +31,7 @@
 
 #include "Array.h"
 #include "Array.cc"
+#include "oct-sort.cc"
 
 static float
 xabs (const FloatComplex& x)
@@ -38,27 +39,9 @@
   return (xisinf (x.real ()) || xisinf (x.imag ())) ? octave_Float_Inf : abs (x);
 }
 
-static bool
-operator < (const FloatComplex& a, const FloatComplex& b)
-{
-  return (xisnan (b) || (xabs (a) < xabs (b))
-	  || ((xabs (a) == xabs (b)) && (arg (a) < arg (b))));
-}
-
-static bool
-operator > (const FloatComplex& a, const FloatComplex& b)
-{
-  return (xisnan (a) || (xabs (a) > xabs (b))
-	  || ((xabs (a) == xabs (b)) && (arg (a) > arg (b))));
-}
-
-// This file must be included after the < and > operators are
-// defined to avoid errors with the Intel C++ compiler.
-#include "oct-sort.cc"
-
 template <>
 bool
-ascending_compare (FloatComplex a, FloatComplex b)
+octave_sort<FloatComplex>::ascending_compare (FloatComplex a, FloatComplex b)
 {
   return (xisnan (b) || (xabs (a) < xabs (b))
 	  || ((xabs (a) == xabs (b)) && (arg (a) < arg (b))));
@@ -66,32 +49,12 @@
 
 template <>
 bool
-ascending_compare (vec_index<FloatComplex> *a, vec_index<FloatComplex> *b)
-{
-  return (xisnan (b->vec)
-	  || (xabs (a->vec) < xabs (b->vec))
-	  || ((xabs (a->vec) == xabs (b->vec))
-	      && (arg (a->vec) < arg (b->vec))));
-}
-
-template <>
-bool
-descending_compare (FloatComplex a, FloatComplex b)
+octave_sort<FloatComplex>::descending_compare (FloatComplex a, FloatComplex b)
 {
   return (xisnan (a) || (xabs (a) > xabs (b))
 	  || ((xabs (a) == xabs (b)) && (arg (a) > arg (b))));
 }
 
-template <>
-bool
-descending_compare (vec_index<FloatComplex> *a, vec_index<FloatComplex> *b)
-{
-  return (xisnan (a->vec)
-	  || (xabs (a->vec) > xabs (b->vec))
-	  || ((xabs (a->vec) == xabs (b->vec))
-	      && (arg (a->vec) > arg (b->vec))));
-}
-
 INSTANTIATE_ARRAY_SORT (FloatComplex);
 
 INSTANTIATE_ARRAY_AND_ASSIGN (FloatComplex, OCTAVE_API);
--- a/liboctave/Array-i.cc	Mon Feb 09 01:56:06 2009 -0500
+++ b/liboctave/Array-i.cc	Mon Feb 09 13:05:35 2009 +0100
@@ -31,6 +31,9 @@
 
 #include "Array.h"
 #include "Array.cc"
+
+#define INLINE_ASCENDING_SORT
+#define INLINE_DESCENDING_SORT
 #include "oct-sort.cc"
 
 INSTANTIATE_ARRAY_SORT (int);
--- a/liboctave/Array-s.cc	Mon Feb 09 01:56:06 2009 -0500
+++ b/liboctave/Array-s.cc	Mon Feb 09 13:05:35 2009 +0100
@@ -29,6 +29,9 @@
 
 #include "Array.h"
 #include "Array.cc"
+
+#define INLINE_ASCENDING_SORT
+#define INLINE_DESCENDING_SORT
 #include "oct-sort.cc"
 
 INSTANTIATE_ARRAY_SORT (short);
--- a/liboctave/Array.cc	Mon Feb 09 01:56:06 2009 -0500
+++ b/liboctave/Array.cc	Mon Feb 09 13:05:35 2009 +0100
@@ -1975,32 +1975,12 @@
     dimensions = new_dims;
 }
 
+// Non-real types don't have NaNs.
 template <class T>
-bool 
-ascending_compare (T a, T b)
-{
-  return (a < b);
-}
-
-template <class T>
-bool 
-descending_compare (T a, T b)
+inline
+bool _sort_isnan (T)
 {
-  return (a > b);
-}
-
-template <class T>
-bool 
-ascending_compare (const T *a, const T *b)
-{
-  return (*a < *b);
-}
-
-template <class T>
-bool 
-descending_compare (const T *a, const T *b)
-{
-  return (*a > *b);
+  return false;
 }
 
 template <class T>
@@ -2027,9 +2007,9 @@
   octave_sort<T> lsort;
   
   if (mode == ASCENDING) 
-    lsort.set_compare (ascending_compare);
+    lsort.set_compare (octave_sort<T>::ascending_compare);
   else if (mode == DESCENDING)
-    lsort.set_compare (descending_compare);
+    lsort.set_compare (octave_sort<T>::descending_compare);
   else
     abort ();
 
@@ -2037,15 +2017,36 @@
     {
       for (octave_idx_type j = 0; j < iter; j++)
 	{
-          std::copy (ov, ov + ns, v);
-	  lsort.sort (v, ns);
+          // copy and partition out NaNs. 
+          // FIXME: impact on integer types noticeable?
+          octave_idx_type kl = 0, ku = ns;
+          for (octave_idx_type i = 0; i < ns; i++)
+            {
+              T tmp = ov[i];
+              if (_sort_isnan (tmp))
+                v[--ku] = tmp;
+              else
+                v[kl++] = tmp;
+            }
+
+          // sort.
+	  lsort.sort (v, kl);
+
+          if (ku < ns)
+            {
+              // NaNs are in reverse order
+              std::reverse (v + ku, v + ns);
+              if (mode == DESCENDING)
+                std::rotate (v, v + ku, v + ns);
+            }
+
 	  v += ns;
           ov += ns;
 	}
     }
   else
     {
-      OCTAVE_LOCAL_BUFFER (T, pvi, ns);
+      OCTAVE_LOCAL_BUFFER (T, buf, ns);
 
       for (octave_idx_type j = 0; j < iter; j++) 
 	{
@@ -2060,13 +2061,32 @@
 
 	  offset += offset2 * stride * ns;
 	  
-	  for (octave_idx_type i = 0; i < ns; i++)
-	    pvi[i] = ov[i*stride + offset];
+          // gather and partition out NaNs. 
+          // FIXME: impact on integer types noticeable?
+          octave_idx_type kl = 0, ku = ns;
+          for (octave_idx_type i = 0; i < ns; i++)
+            {
+              T tmp = ov[i*stride + offset];
+              if (_sort_isnan (tmp))
+                buf[--ku] = tmp;
+              else
+                buf[kl++] = tmp;
+            }
 
-	  lsort.sort (pvi, ns);
-	      
+          // sort.
+	  lsort.sort (buf, kl);
+
+          if (ku < ns)
+            {
+              // NaNs are in reverse order
+              std::reverse (buf + ku, buf + ns);
+              if (mode == DESCENDING)
+                std::rotate (buf, buf + ku, buf + ns);
+            }
+
+          // scatter.
 	  for (octave_idx_type i = 0; i < ns; i++)
-	    v[i*stride + offset] = pvi[i];
+	    v[i*stride + offset] = buf[i];
 	}
     }
 
@@ -2098,44 +2118,68 @@
   T *v = m.fortran_vec ();
   const T *ov = data ();
 
-  octave_sort<const T *> indexed_sort;
+  octave_sort<T> lsort;
 
+  sidx = Array<octave_idx_type> (dv);
+  octave_idx_type *vi = sidx.fortran_vec ();
+  
   if (mode == ASCENDING) 
-    indexed_sort.set_compare (ascending_compare);
+    lsort.set_compare (octave_sort<T>::ascending_compare);
   else if (mode == DESCENDING)
-    indexed_sort.set_compare (descending_compare);
+    lsort.set_compare (octave_sort<T>::descending_compare);
   else
     abort ();
 
-  OCTAVE_LOCAL_BUFFER (const T *, vi, ns);
-
-  sidx = Array<octave_idx_type> (dv);
-      
   if (stride == 1)
     {
       for (octave_idx_type j = 0; j < iter; j++)
 	{
-	   octave_idx_type offset = j * ns;
-
-	  for (octave_idx_type i = 0; i < ns; i++)
-            vi[i] = ov + i;
-
-	  indexed_sort.sort (vi, ns);
+          // copy and partition out NaNs. 
+          // FIXME: impact on integer types noticeable?
+          octave_idx_type kl = 0, ku = ns;
+          for (octave_idx_type i = 0; i < ns; i++)
+            {
+              T tmp = ov[i];
+              if (_sort_isnan (tmp))
+                {
+                  --ku;
+                  v[ku] = tmp;
+                  vi[ku] = i;
+                }
+              else
+                {
+                  v[kl] = tmp;
+                  vi[kl] = i;
+                  kl++;
+                }
+            }
 
-	  for (octave_idx_type i = 0; i < ns; i++)
-	    {
-	      v[i] = *vi[i];
-	      sidx(i + offset) = vi[i] - ov;
-	    }
+          // sort.
+	  lsort.sort (v, vi, kl);
+
+          if (ku < ns)
+            {
+              // NaNs are in reverse order
+              std::reverse (v + ku, v + ns);
+              std::reverse (vi + ku, vi + ns);
+              if (mode == DESCENDING)
+                {
+                  std::rotate (v, v + ku, v + ns);
+                  std::rotate (vi, vi + ku, vi + ns);
+                }
+            }
+
 	  v += ns;
+          vi += ns;
           ov += ns;
 	}
     }
   else
     {
-      OCTAVE_LOCAL_BUFFER (T, vix, ns);
+      OCTAVE_LOCAL_BUFFER (T, buf, ns);
+      OCTAVE_LOCAL_BUFFER (octave_idx_type, bufi, ns);
 
-      for (octave_idx_type j = 0; j < iter; j++)
+      for (octave_idx_type j = 0; j < iter; j++) 
 	{
 	  octave_idx_type offset = j;
 	  octave_idx_type offset2 = 0;
@@ -2147,45 +2191,53 @@
 	    }
 
 	  offset += offset2 * stride * ns;
-	      
-	  for (octave_idx_type i = 0; i < ns; i++)
-	    {
-	      vix[i] = ov[i*stride + offset];
-	      vi[i] = vix + i;
-	    }
+	  
+          // gather and partition out NaNs. 
+          // FIXME: impact on integer types noticeable?
+          octave_idx_type kl = 0, ku = ns;
+          for (octave_idx_type i = 0; i < ns; i++)
+            {
+              T tmp = ov[i*stride + offset];
+              if (_sort_isnan (tmp))
+                {
+                  --ku;
+                  buf[ku] = tmp;
+                  bufi[ku] = i;
+                }
+              else
+                {
+                  buf[kl] = tmp;
+                  bufi[kl] = i;
+                  kl++;
+                }
+            }
 
-	  indexed_sort.sort (vi, ns);
-	      
+          // sort.
+	  lsort.sort (buf, bufi, kl);
+
+          if (ku < ns)
+            {
+              // NaNs are in reverse order
+              std::reverse (buf + ku, buf + ns);
+              std::reverse (bufi + ku, bufi + ns);
+              if (mode == DESCENDING)
+                {
+                  std::rotate (buf, buf + ku, buf + ns);
+                  std::rotate (bufi, bufi + ku, bufi + ns);
+                }
+            }
+
+          // scatter.
 	  for (octave_idx_type i = 0; i < ns; i++)
-	    {
-	      v[i*stride+offset] = *vi[i];
-	      sidx(i*stride+offset) = vi[i] - vix;
-	    }
+	    v[i*stride + offset] = buf[i];
+	  for (octave_idx_type i = 0; i < ns; i++)
+	    vi[i*stride + offset] = bufi[i];
 	}
     }
 
   return m;
 }
 
-#if defined (HAVE_IEEE754_DATA_FORMAT)
-
-template <>
-bool ascending_compare (double, double);
-template <>
-bool ascending_compare (const double*, const double*);
-template <>
-bool descending_compare (double, double);
-template <>
-bool descending_compare (const double*, const double*);
-
-template <>
-Array<double> Array<double>::sort (octave_idx_type dim, sortmode mode) const;
-template <>
-Array<double> Array<double>::sort (Array<octave_idx_type> &sidx,
-				   octave_idx_type dim, sortmode mode) const;
-
-#endif
-
 template <class T>
 Array<T>
 Array<T>::diag (octave_idx_type k) const
--- a/liboctave/Array.h	Mon Feb 09 01:56:06 2009 -0500
+++ b/liboctave/Array.h	Mon Feb 09 13:05:35 2009 +0100
@@ -594,16 +594,8 @@
 
 #define INSTANTIATE_ARRAY_SORT(T) \
   template class octave_sort<T>; \
-  template class octave_sort<const T*>;
 
 #define NO_INSTANTIATE_ARRAY_SORT(T) \
-  template class vec_index<T>; \
-  template <> bool ascending_compare (T, T) { return true; } \
-  template <> bool ascending_compare (const T *, const T *) \
-    { return true; } \
-  template <> bool descending_compare (T, T) { return true; } \
-  template <> bool descending_compare (const T *, const T *) \
-    { return true; } \
   template <> Array<T> Array<T>::sort \
     (octave_idx_type, sortmode) const { return *this; } \
   template <> Array<T> Array<T>::sort (Array<octave_idx_type> &sidx, \
--- a/liboctave/ChangeLog	Mon Feb 09 01:56:06 2009 -0500
+++ b/liboctave/ChangeLog	Mon Feb 09 13:05:35 2009 +0100
@@ -1,3 +1,13 @@
+2009-02-09  Jaroslav Hajek  <highegg@gmail.com>
+
+	* oct-sort.cc (octave_sort<T>): Rewrite for optimizations. Allow
+	inlined comparison functor and by-the-way indexed sorting.
+	* oct-sort.h (octave_sort<T>): Update interface.
+	* Array.cc (Array<T>::sort): Reflect changes. Use copy & partition
+	mechanism.
+	* Array-d.cc, Array-f.cc, Array-C.cc, Array-fC.cc, Array-s.cc,
+	Array-i.cc: Reflect changes.
+
 2009-02-05  John W. Eaton  <jwe@octave.org>
 
 	* file-stat.cc (base_file_stat::is_sock):
--- a/liboctave/oct-sort.cc	Mon Feb 09 01:56:06 2009 -0500
+++ b/liboctave/oct-sort.cc	Mon Feb 09 13:05:35 2009 +0100
@@ -1,6 +1,6 @@
 /*
 Copyright (C) 2003, 2004, 2005, 2006, 2007 David Bateman
-Copyright (C) 2008 Jaroslav Hajek
+Copyright (C) 2008, 2009 Jaroslav Hajek
 
 This file is part of Octave.
 
@@ -33,6 +33,10 @@
   memmove by std::copy is possible if the destination starts before the source.
   If not, std::copy_backward needs to be used.
   
+* templatize comparison operator in most methods, provide possible dispatch
+
+* duplicate methods to avoid by-the-way indexed sorting
+
 
 The Python license is
 
@@ -98,12 +102,8 @@
 #include "quit.h"
 #include "oct-sort.h"
 
-#ifndef IFLT
-#define IFLT(a,b)  if (compare ? compare ((a), (b)) : ((a) < (b)))
-#endif
-
 template <class T>
-octave_sort<T>::octave_sort (void) : compare (0)
+octave_sort<T>::octave_sort (void) : compare (ascending_compare)
 { 
   merge_init ();
   merge_getmem (1024);
@@ -116,38 +116,20 @@
   merge_getmem (1024);
 }
   
-/* Reverse a slice of a list in place, from lo up to (exclusive) hi. */
 template <class T>
+template <class Comp>
 void
-octave_sort<T>::reverse_slice (T *lo, T *hi)
+octave_sort<T>::binarysort (T *data, octave_idx_type nel, 
+                            octave_idx_type start, Comp comp)
 {
-  --hi;
-  while (lo < hi) 
-    {
-      T t = *lo;
-      *lo = *hi;
-      *hi = t;
-      ++lo;
-      --hi;
-    }
-}
-
-template <class T>
-void
-octave_sort<T>::binarysort (T *lo, T *hi, T *start)
-{
-  T *l, *p, *r;
-  T pivot;
-
-  if (lo == start)
+  if (start == 0)
     ++start;
 
-  for (; start < hi; ++start) 
+  for (; start < nel; ++start) 
     {
       /* set l to where *start belongs */
-      l = lo;
-      r = start;
-      pivot = *r;
+      octave_idx_type l = 0, r = start;
+      T pivot = data[start];
       /* Invariants:
        * pivot >= all in [lo, l).
        * pivot  < all in [r, start).
@@ -155,8 +137,8 @@
        */
       do 
 	{
-	  p = l + ((r - l) >> 1);
-	  IFLT (pivot, *p)
+	  octave_idx_type p = l + ((r - l) >> 1);
+	  if (comp (pivot, data[p]))
 	    r = p;
 	  else
 	    l = p+1;
@@ -169,9 +151,58 @@
 	 Slide over to make room.
 	 Caution: using memmove is much slower under MSVC 5;
 	 we're not usually moving many slots. */
-      for (p = start; p > l; --p)
-	*p = *(p-1);
-      *l = pivot;
+      // NOTE: using swap and going upwards appears to be faster.
+      for (octave_idx_type p = l; p < start; p++)
+        std::swap (pivot, data[p]);
+      data[start] = pivot;
+    }
+
+  return;
+}
+
+template <class T>
+template <class Comp>
+void
+octave_sort<T>::binarysort (T *data, octave_idx_type *idx, octave_idx_type nel, 
+                            octave_idx_type start, Comp comp)
+{
+  if (start == 0)
+    ++start;
+
+  for (; start < nel; ++start) 
+    {
+      /* set l to where *start belongs */
+      octave_idx_type l = 0, r = start;
+      T pivot = data[start];
+      /* Invariants:
+       * pivot >= all in [lo, l).
+       * pivot  < all in [r, start).
+       * The second is vacuously true at the start.
+       */
+      do 
+	{
+	  octave_idx_type p = l + ((r - l) >> 1);
+	  if (comp (pivot, data[p]))
+	    r = p;
+	  else
+	    l = p+1;
+	} 
+      while (l < r);
+      /* The invariants still hold, so pivot >= all in [lo, l) and
+	 pivot < all in [l, start), so pivot belongs at l.  Note
+	 that if there are elements equal to pivot, l points to the
+	 first slot after them -- that's why this sort is stable.
+	 Slide over to make room.
+	 Caution: using memmove is much slower under MSVC 5;
+	 we're not usually moving many slots. */
+      // NOTE: using swap and going upwards appears to be faster.
+      for (octave_idx_type p = l; p < start; p++)
+        std::swap (pivot, data[p]);
+      data[start] = pivot;
+      octave_idx_type ipivot = idx[start];
+      for (octave_idx_type p = l; p < start; p++)
+        std::swap (ipivot, idx[p]);
+      idx[start] = ipivot;
     }
 
   return;
@@ -196,10 +227,12 @@
 Returns -1 in case of error.
 */
 template <class T>
+template <class Comp>
 octave_idx_type
-octave_sort<T>::count_run (T *lo, T *hi, bool& descending)
+octave_sort<T>::count_run (T *lo, octave_idx_type nel, bool& descending, Comp comp)
 {
   octave_idx_type n;
+  T *hi = lo + nel;
 
   descending = false;
   ++lo;
@@ -208,12 +241,12 @@
 
   n = 2;
 
-  IFLT (*lo, *(lo-1))
+  if (comp (*lo, *(lo-1)))
     {
       descending = true;
       for (lo = lo+1; lo < hi; ++lo, ++n) 
 	{
-	  IFLT (*lo, *(lo-1))
+	  if (comp (*lo, *(lo-1)))
 	    ;
 	  else
 	    break;
@@ -223,7 +256,7 @@
     {
       for (lo = lo+1; lo < hi; ++lo, ++n) 
 	{
-	  IFLT (*lo, *(lo-1))
+	  if (comp (*lo, *(lo-1)))
 	    break;
 	}
     }
@@ -253,8 +286,10 @@
 Returns -1 on error.  See listsort.txt for info on the method.
 */
 template <class T>
+template <class Comp>
 octave_idx_type
-octave_sort<T>::gallop_left (T key, T *a, octave_idx_type n, octave_idx_type hint)
+octave_sort<T>::gallop_left (T key, T *a, octave_idx_type n, octave_idx_type hint,
+                             Comp comp)
 {
   octave_idx_type ofs;
   octave_idx_type lastofs;
@@ -263,7 +298,7 @@
   a += hint;
   lastofs = 0;
   ofs = 1;
-  IFLT (*a, key)
+  if (comp (*a, key))
     {
       /* a[hint] < key -- gallop right, until
        * a[hint + lastofs] < key <= a[hint + ofs]
@@ -271,7 +306,7 @@
       const octave_idx_type maxofs = n - hint;	/* &a[n-1] is highest */
       while (ofs < maxofs) 
 	{
-	  IFLT (a[ofs], key)
+	  if (comp (a[ofs], key))
 	    {
 	      lastofs = ofs;
 	      ofs = (ofs << 1) + 1;
@@ -295,7 +330,7 @@
       const octave_idx_type maxofs = hint + 1;	/* &a[0] is lowest */
       while (ofs < maxofs) 
 	{
-	  IFLT (*(a-ofs), key)
+	  if (comp (*(a-ofs), key))
 	    break;
 	  /* key <= a[hint - ofs] */
 	  lastofs = ofs;
@@ -321,7 +356,7 @@
     {
       octave_idx_type m = lastofs + ((ofs - lastofs) >> 1);
 
-      IFLT (a[m], key)
+      if (comp (a[m], key))
 	lastofs = m+1;	/* a[m] < key */
       else
 	ofs = m;	/* key <= a[m] */
@@ -345,8 +380,10 @@
 written as one routine with yet another "left or right?" flag.
 */
 template <class T>
+template <class Comp>
 octave_idx_type
-octave_sort<T>::gallop_right (T key, T *a, octave_idx_type n, octave_idx_type hint)
+octave_sort<T>::gallop_right (T key, T *a, octave_idx_type n, octave_idx_type hint,
+                              Comp comp)
 {
   octave_idx_type ofs;
   octave_idx_type lastofs;
@@ -355,7 +392,7 @@
   a += hint;
   lastofs = 0;
   ofs = 1;
-  IFLT (key, *a)
+  if (comp (key, *a))
     {
       /* key < a[hint] -- gallop left, until
        * a[hint - ofs] <= key < a[hint - lastofs]
@@ -363,7 +400,7 @@
       const octave_idx_type maxofs = hint + 1;	/* &a[0] is lowest */
       while (ofs < maxofs) 
 	{
-	  IFLT (key, *(a-ofs))
+	  if (comp (key, *(a-ofs)))
 	    {
 	      lastofs = ofs;
 	      ofs = (ofs << 1) + 1;
@@ -388,7 +425,7 @@
       const octave_idx_type maxofs = n - hint;	/* &a[n-1] is highest */
       while (ofs < maxofs) 
 	{
-	  IFLT (key, a[ofs])
+	  if (comp (key, a[ofs]))
 	    break;
 	  /* a[hint + ofs] <= key */
 	  lastofs = ofs;
@@ -413,7 +450,7 @@
     {
       octave_idx_type m = lastofs + ((ofs - lastofs) >> 1);
 
-      IFLT (key, a[m])
+      if (comp (key, a[m]))
 	ofs = m;	/* key < a[m] */
       else
 	lastofs = m+1;	/* a[m] <= key */
@@ -428,6 +465,7 @@
 octave_sort<T>::merge_init (void)
 {
   ms.a = 0;
+  ms.ia = 0;
   ms.alloced = 0;
   ms.n = 0;
   ms.min_gallop = MIN_GALLOP;
@@ -442,6 +480,7 @@
 octave_sort<T>::merge_freemem (void)
 {
   delete [] ms.a;
+  delete [] ms.ia;
   ms.alloced = 0;
   ms.a = 0;
 }
@@ -509,7 +548,29 @@
   return -1;
 }
 
-#define MERGE_GETMEM(NEED) ((NEED) <= ms.alloced ? 0 : merge_getmem (NEED))
+template <class T>
+int
+octave_sort<T>::merge_getmemi (octave_idx_type need)
+{
+  if (need <= ms.alloced && ms.a && ms.ia)
+    return 0;
+
+  need = roundupsize (need); 
+  /* Don't realloc!  That can cost cycles to copy the old data, but
+   * we don't care what's in the block.
+   */
+  merge_freemem ();
+  ms.a = new T[need];
+  ms.ia = new octave_idx_type[need];
+  if (ms.a && ms.ia)
+    {
+      ms.alloced = need;
+      return 0;
+    }
+  merge_freemem ();	/* reset to sane state */
+
+  return -1;
+}
 
 /* Merge the na elements starting at pa with the nb elements starting at pb
  * in a stable way, in-place.  na and nb must be > 0, and pa + na == pb.
@@ -518,15 +579,18 @@
  * Return 0 if successful, -1 if error.
  */
 template <class T>
+template <class Comp>
 int
-octave_sort<T>::merge_lo (T *pa, octave_idx_type na, T *pb, octave_idx_type nb)
+octave_sort<T>::merge_lo (T *pa, octave_idx_type na, 
+                          T *pb, octave_idx_type nb,
+                          Comp comp)
 {
   octave_idx_type k;
   T *dest;
   int result = -1;	/* guilty until proved innocent */
   octave_idx_type min_gallop = ms.min_gallop;
 
-  if (MERGE_GETMEM (na) < 0)
+  if (merge_getmem (na) < 0)
     return -1;
   std::copy (pa, pa + na, ms.a);
   dest = pa;
@@ -550,7 +614,10 @@
       for (;;)
 	{
 
-	  IFLT (*pb, *pa)
+          // FIXME: these loops are candidates for further optimizations.
+          // Rather than testing everything in each cycle, it may be more
+          // efficient to do it in hunks. 
+	  if (comp (*pb, *pa))
 	    {
 	      *dest++ = *pb++;
 	      ++bcount;
@@ -584,14 +651,13 @@
 	{
 	  min_gallop -= min_gallop > 1;
 	  ms.min_gallop = min_gallop;
-	  k = gallop_right (*pb, pa, na, 0);
+	  k = gallop_right (*pb, pa, na, 0, comp);
 	  acount = k;
 	  if (k)
 	    {
 	      if (k < 0)
 		goto Fail;
-              std::copy (pa, pa + k, dest);
-	      dest += k;
+              dest = std::copy (pa, pa + k, dest);
 	      pa += k;
 	      na -= k;
 	      if (na == 1)
@@ -608,14 +674,13 @@
 	  if (nb == 0)
 	    goto Succeed;
 
-	  k = gallop_left (*pa, pb, nb, 0);
+	  k = gallop_left (*pa, pb, nb, 0, comp);
 	  bcount = k;
 	  if (k)
 	    {
 	      if (k < 0)
 		goto Fail;
-              std::copy (pb, pb + k, dest);
-	      dest += k;
+              dest = std::copy (pb, pb + k, dest);
 	      pb += k;
 	      nb -= k;
 	      if (nb == 0)
@@ -648,6 +713,147 @@
   return 0;
 }
 
+template <class T>
+template <class Comp>
+int
+octave_sort<T>::merge_lo (T *pa, octave_idx_type *ipa, octave_idx_type na, 
+                          T *pb, octave_idx_type *ipb, octave_idx_type nb,
+                          Comp comp)
+{
+  octave_idx_type k;
+  T *dest;
+  octave_idx_type *idest;
+  int result = -1;	/* guilty until proved innocent */
+  octave_idx_type min_gallop = ms.min_gallop;
+
+  if (merge_getmemi (na) < 0)
+    return -1;
+  std::copy (pa, pa + na, ms.a);
+  std::copy (ipa, ipa + na, ms.ia);
+  dest = pa; idest = ipa;
+  pa = ms.a; ipa = ms.ia;
+
+  *dest++ = *pb++; *idest++ = *ipb++;
+  --nb;
+  if (nb == 0)
+    goto Succeed;
+  if (na == 1)
+    goto CopyB;
+
+  for (;;)
+    {
+      octave_idx_type acount = 0;	/* # of times A won in a row */
+      octave_idx_type bcount = 0;	/* # of times B won in a row */
+
+      /* Do the straightforward thing until (if ever) one run
+       * appears to win consistently.
+       */
+      for (;;)
+	{
+
+	  if (comp (*pb, *pa))
+	    {
+	      *dest++ = *pb++; *idest++ = *ipb++;
+	      ++bcount;
+	      acount = 0;
+	      --nb;
+	      if (nb == 0)
+		goto Succeed;
+	      if (bcount >= min_gallop)
+		break;
+	    }
+	  else
+	    {
+	      *dest++ = *pa++; *idest++ = *ipa++;
+	      ++acount;
+	      bcount = 0;
+	      --na;
+	      if (na == 1)
+		goto CopyB;
+	      if (acount >= min_gallop)
+		break;
+	    }
+	}
+
+      /* One run is winning so consistently that galloping may
+       * be a huge win.  So try that, and continue galloping until
+       * (if ever) neither run appears to be winning consistently
+       * anymore.
+       */
+      ++min_gallop;
+      do
+	{
+	  min_gallop -= min_gallop > 1;
+	  ms.min_gallop = min_gallop;
+	  k = gallop_right (*pb, pa, na, 0, comp);
+	  acount = k;
+	  if (k)
+	    {
+	      if (k < 0)
+		goto Fail;
+              dest = std::copy (pa, pa + k, dest);
+              idest = std::copy (ipa, ipa + k, idest);
+	      pa += k; ipa += k;
+	      na -= k;
+	      if (na == 1)
+		goto CopyB;
+	      /* na==0 is impossible now if the comparison
+	       * function is consistent, but we can't assume
+	       * that it is.
+	       */
+	      if (na == 0)
+		goto Succeed;
+	    }
+	  *dest++ = *pb++; *idest++ = *ipb++;
+	  --nb;
+	  if (nb == 0)
+	    goto Succeed;
+
+	  k = gallop_left (*pa, pb, nb, 0, comp);
+	  bcount = k;
+	  if (k)
+	    {
+	      if (k < 0)
+		goto Fail;
+              dest = std::copy (pb, pb + k, dest);
+              idest = std::copy (ipb, ipb + k, idest);
+	      pb += k; ipb += k;
+	      nb -= k;
+	      if (nb == 0)
+		goto Succeed;
+	    }
+	  *dest++ = *pa++; *idest++ = *ipa++;
+	  --na;
+	  if (na == 1)
+	    goto CopyB;
+	}
+      while (acount >= MIN_GALLOP || bcount >= MIN_GALLOP);
+
+      ++min_gallop;	/* penalize it for leaving galloping mode */
+      ms.min_gallop = min_gallop;
+    }
+
+ Succeed:
+  result = 0;
+
+ Fail:
+  if (na)
+    {
+      std::copy (pa, pa + na, dest);
+      std::copy (ipa, ipa + na, idest);
+    }
+  return result;
+
+ CopyB:
+  /* The last element of pa belongs at the end of the merge. */
+  std::copy (pb, pb + nb, dest);
+  std::copy (ipb, ipb + nb, idest);
+  dest[nb] = *pa;
+  idest[nb] = *ipa;
+
+  return 0;
+}
+
 /* Merge the na elements starting at pa with the nb elements starting at pb
  * in a stable way, in-place.  na and nb must be > 0, and pa + na == pb.
  * Must also have that *pb < *pa, that pa[na-1] belongs at the end of the
@@ -655,17 +861,19 @@
  * Return 0 if successful, -1 if error.
  */
 template <class T>
+template <class Comp>
 int
-octave_sort<T>::merge_hi (T *pa, octave_idx_type na, T *pb, octave_idx_type nb)
+octave_sort<T>::merge_hi (T *pa, octave_idx_type na, 
+                          T *pb, octave_idx_type nb,
+                          Comp comp)
 {
   octave_idx_type k;
   T *dest;
   int result = -1;	/* guilty until proved innocent */
-  T *basea;
-  T *baseb;
+  T *basea, *baseb;
   octave_idx_type min_gallop = ms.min_gallop;
 
-  if (MERGE_GETMEM (nb) < 0)
+  if (merge_getmem (nb) < 0)
     return -1;
   dest = pb + nb - 1;
   std::copy (pb, pb + nb, ms.a);
@@ -691,7 +899,7 @@
        */
       for (;;) 
 	{
-	  IFLT (*pb, *pa)
+	  if (comp (*pb, *pa))
 	    {
 	      *dest-- = *pa--;
 	      ++acount;
@@ -725,16 +933,15 @@
 	{
 	  min_gallop -= min_gallop > 1;
 	  ms.min_gallop = min_gallop;
-	  k = gallop_right (*pb, basea, na, na-1);
+	  k = gallop_right (*pb, basea, na, na-1, comp);
 	  if (k < 0)
 	    goto Fail;
 	  k = na - k;
 	  acount = k;
 	  if (k) 
 	    {
-	      dest -= k;
+              dest = std::copy_backward (pa+1 - k, pa+1, dest+1) - 1;
 	      pa -= k;
-              std::copy_backward (pa+1, pa+1 + k, dest+1 + k);
 	      na -= k;
 	      if (na == 0)
 		goto Succeed;
@@ -744,7 +951,7 @@
 	  if (nb == 1)
 	    goto CopyA;
 
-	  k = gallop_left (*pa, baseb, nb, nb-1);
+	  k = gallop_left (*pa, baseb, nb, nb-1, comp);
 	  if (k < 0)
 	    goto Fail;
 	  k = nb - k;
@@ -783,28 +990,176 @@
 
 CopyA:
   /* The first element of pb belongs at the front of the merge. */
-  dest -= na;
+  dest = std::copy_backward (pa+1 - na, pa+1, dest+1) - 1;
   pa -= na;
-  std::copy_backward (pa+1, pa+1 + na, dest+1 + na);
   *dest = *pb;
 
   return 0;
 }
 
+template <class T>
+template <class Comp>
+int
+octave_sort<T>::merge_hi (T *pa, octave_idx_type *ipa, octave_idx_type na, 
+                          T *pb, octave_idx_type *ipb, octave_idx_type nb,
+                          Comp comp)
+{
+  octave_idx_type k;
+  T *dest;
+  octave_idx_type *idest;
+  int result = -1;	/* guilty until proved innocent */
+  T *basea, *baseb;
+  octave_idx_type *ibasea, *ibaseb;
+  octave_idx_type min_gallop = ms.min_gallop;
+
+  if (merge_getmemi (nb) < 0)
+    return -1;
+  dest = pb + nb - 1;
+  idest = ipb + nb - 1;
+  std::copy (pb, pb + nb, ms.a);
+  std::copy (ipb, ipb + nb, ms.ia);
+  basea = pa; ibasea = ipa;
+  baseb = ms.a; ibaseb = ms.ia;
+  pb = ms.a + nb - 1; ipb = ms.ia + nb - 1;
+  pa += na - 1; ipa += na - 1;
+
+  *dest-- = *pa--; *idest-- = *ipa--;
+  --na;
+  if (na == 0)
+    goto Succeed;
+  if (nb == 1)
+    goto CopyA;
+
+  for (;;) 
+    {
+      octave_idx_type acount = 0;	/* # of times A won in a row */
+      octave_idx_type bcount = 0;	/* # of times B won in a row */
+
+      /* Do the straightforward thing until (if ever) one run
+       * appears to win consistently.
+       */
+      for (;;) 
+	{
+	  if (comp (*pb, *pa))
+	    {
+	      *dest-- = *pa--; *idest-- = *ipa--;
+	      ++acount;
+	      bcount = 0;
+	      --na;
+	      if (na == 0)
+		goto Succeed;
+	      if (acount >= min_gallop)
+		break;
+	    }
+	  else 
+	    {
+	      *dest-- = *pb--; *idest-- = *ipb--;
+	      ++bcount;
+	      acount = 0;
+	      --nb;
+	      if (nb == 1)
+		goto CopyA;
+	      if (bcount >= min_gallop)
+		break;
+	    }
+	}
+
+      /* One run is winning so consistently that galloping may
+       * be a huge win.  So try that, and continue galloping until
+       * (if ever) neither run appears to be winning consistently
+       * anymore.
+       */
+      ++min_gallop;
+      do 
+	{
+	  min_gallop -= min_gallop > 1;
+	  ms.min_gallop = min_gallop;
+	  k = gallop_right (*pb, basea, na, na-1, comp);
+	  if (k < 0)
+	    goto Fail;
+	  k = na - k;
+	  acount = k;
+	  if (k) 
+	    {
+              dest = std::copy_backward (pa+1 - k, pa+1, dest+1) - 1;
+              idest = std::copy_backward (ipa+1 - k, ipa+1, idest+1) - 1;
+	      pa -= k; ipa -= k;
+	      na -= k;
+	      if (na == 0)
+		goto Succeed;
+	    }
+	  *dest-- = *pb--; *idest-- = *ipb--;
+	  --nb;
+	  if (nb == 1)
+	    goto CopyA;
+
+	  k = gallop_left (*pa, baseb, nb, nb-1, comp);
+	  if (k < 0)
+	    goto Fail;
+	  k = nb - k;
+	  bcount = k;
+	  if (k) 
+	    {
+	      dest -= k; idest -= k;
+	      pb -= k; ipb -= k;
+              std::copy (pb+1, pb+1 + k, dest+1);
+              std::copy (ipb+1, ipb+1 + k, idest+1);
+	      nb -= k;
+	      if (nb == 1)
+		goto CopyA;
+	      /* nb==0 is impossible now if the comparison
+	       * function is consistent, but we can't assume
+	       * that it is.
+	       */
+	      if (nb == 0)
+		goto Succeed;
+	    }
+	  *dest-- = *pa--; *idest-- = *ipa--;
+	  --na;
+	  if (na == 0)
+	    goto Succeed;
+	} while (acount >= MIN_GALLOP || bcount >= MIN_GALLOP);
+      ++min_gallop;	/* penalize it for leaving galloping mode */
+      ms.min_gallop = min_gallop;
+    }
+
+Succeed:
+  result = 0;
+
+Fail:
+  if (nb)
+    {
+      std::copy (baseb, baseb + nb, dest-(nb-1));
+      std::copy (ibaseb, ibaseb + nb, idest-(nb-1));
+    }
+  return result;
+
+CopyA:
+  /* The first element of pb belongs at the front of the merge. */
+  dest = std::copy_backward (pa+1 - na, pa+1, dest+1) - 1;
+  idest = std::copy_backward (ipa+1 - na, ipa+1, idest+1) - 1;
+  pa -= na; ipa -= na;
+  *dest = *pb; *idest = *ipb;
+
+  return 0;
+}
+
 /* Merge the two runs at stack indices i and i+1.
  * Returns 0 on success, -1 on error.
  */
 template <class T>
+template <class Comp>
 int
-octave_sort<T>::merge_at (octave_idx_type i)
+octave_sort<T>::merge_at (octave_idx_type i, T *data,
+                          Comp comp)
 {
   T *pa, *pb;
   octave_idx_type na, nb;
   octave_idx_type k;
 
-  pa = ms.pending[i].base;
+  pa = data + ms.pending[i].base;
   na = ms.pending[i].len;
-  pb = ms.pending[i+1].base;
+  pb = data + ms.pending[i+1].base;
   nb = ms.pending[i+1].len;
 
   /* Record the length of the combined runs; if i is the 3rd-last
@@ -819,7 +1174,7 @@
   /* Where does b start in a?  Elements in a before that can be
    * ignored (already in place).
    */
-  k = gallop_right (*pb, pa, na, 0);
+  k = gallop_right (*pb, pa, na, 0, comp);
   if (k < 0)
     return -1;
   pa += k;
@@ -830,7 +1185,7 @@
   /* Where does a end in b?  Elements in b after that can be
    * ignored (already in place).
    */
-  nb = gallop_left (pa[na-1], pb, nb, nb-1);
+  nb = gallop_left (pa[na-1], pb, nb, nb-1, comp);
   if (nb <= 0)
     return nb;
 
@@ -838,9 +1193,63 @@
    * min(na, nb) elements.
    */
   if (na <= nb)
-    return merge_lo (pa, na, pb, nb);
+    return merge_lo (pa, na, pb, nb, comp);
   else
-    return merge_hi (pa, na, pb, nb);
+    return merge_hi (pa, na, pb, nb, comp);
+}
+
+template <class T>
+template <class Comp>
+int
+octave_sort<T>::merge_at (octave_idx_type i, T *data, octave_idx_type *idx,
+                          Comp comp)
+{
+  T *pa, *pb;
+  octave_idx_type *ipa, *ipb;
+  octave_idx_type na, nb;
+  octave_idx_type k;
+
+  pa = data + ms.pending[i].base;
+  ipa = idx + ms.pending[i].base;
+  na = ms.pending[i].len;
+  pb = data + ms.pending[i+1].base;
+  ipb = idx + ms.pending[i+1].base;
+  nb = ms.pending[i+1].len;
+
+  /* Record the length of the combined runs; if i is the 3rd-last
+   * run now, also slide over the last run (which isn't involved
+   * in this merge).  The current run i+1 goes away in any case.
+   */
+  ms.pending[i].len = na + nb;
+  if (i == ms.n - 3)
+    ms.pending[i+1] = ms.pending[i+2];
+  --ms.n;
+
+  /* Where does b start in a?  Elements in a before that can be
+   * ignored (already in place).
+   */
+  k = gallop_right (*pb, pa, na, 0, comp);
+  if (k < 0)
+    return -1;
+  pa += k; ipa += k;
+  na -= k;
+  if (na == 0)
+    return 0;
+
+  /* Where does a end in b?  Elements in b after that can be
+   * ignored (already in place).
+   */
+  nb = gallop_left (pa[na-1], pb, nb, nb-1, comp);
+  if (nb <= 0)
+    return nb;
+
+  /* Merge what remains of the runs, using a temp array with
+   * min(na, nb) elements.
+   */
+  if (na <= nb)
+    return merge_lo (pa, ipa, na, pb, ipb, nb, comp);
+  else
+    return merge_hi (pa, ipa, na, pb, ipb, nb, comp);
 }
 
 /* Examine the stack of runs waiting to be merged, merging adjacent runs
@@ -854,8 +1263,9 @@
  * Returns 0 on success, -1 on error.
  */
 template <class T>
+template <class Comp>
 int
-octave_sort<T>::merge_collapse (void)
+octave_sort<T>::merge_collapse (T *data, Comp comp)
 {
   struct s_slice *p = ms.pending;
 
@@ -866,12 +1276,41 @@
 	{
 	  if (p[n-1].len < p[n+1].len)
 	    --n;
-	  if (merge_at (n) < 0)
+	  if (merge_at (n, data, comp) < 0)
 	    return -1;
 	}
       else if (p[n].len <= p[n+1].len) 
 	{
-	  if (merge_at (n) < 0)
+	  if (merge_at (n, data, comp) < 0)
+	    return -1;
+	}
+      else
+	break;
+    }
+
+  return 0;
+}
+
+template <class T>
+template <class Comp>
+int
+octave_sort<T>::merge_collapse (T *data, octave_idx_type *idx, Comp comp)
+{
+  struct s_slice *p = ms.pending;
+
+  while (ms.n > 1) 
+    {
+      octave_idx_type n = ms.n - 2;
+      if (n > 0 && p[n-1].len <= p[n].len + p[n+1].len) 
+	{
+	  if (p[n-1].len < p[n+1].len)
+	    --n;
+	  if (merge_at (n, data, idx, comp) < 0)
+	    return -1;
+	}
+      else if (p[n].len <= p[n+1].len) 
+	{
+	  if (merge_at (n, data, idx, comp) < 0)
 	    return -1;
 	}
       else
@@ -887,8 +1326,9 @@
  * Returns 0 on success, -1 on error.
  */
 template <class T>
+template <class Comp>
 int
-octave_sort<T>::merge_force_collapse (void)
+octave_sort<T>::merge_force_collapse (T *data, Comp comp)
 {
   struct s_slice *p = ms.pending;
 
@@ -897,7 +1337,26 @@
       octave_idx_type n = ms.n - 2;
       if (n > 0 && p[n-1].len < p[n+1].len)
 	--n;
-      if (merge_at (n) < 0)
+      if (merge_at (n, data, comp) < 0)
+	return -1;
+    }
+
+  return 0;
+}
+
+template <class T>
+template <class Comp>
+int
+octave_sort<T>::merge_force_collapse (T *data, octave_idx_type *idx, Comp comp)
+{
+  struct s_slice *p = ms.pending;
+
+  while (ms.n > 1) 
+    {
+      octave_idx_type n = ms.n - 2;
+      if (n > 0 && p[n-1].len < p[n+1].len)
+	--n;
+      if (merge_at (n, data, idx, comp) < 0)
 	return -1;
     }
 
@@ -930,18 +1389,75 @@
 }
 
 template <class T>
+template <class Comp>
 void
-octave_sort<T>::sort (T *v, octave_idx_type elements)
+octave_sort<T>::sort (T *data, octave_idx_type nel, Comp comp)
 {
   /* Re-initialize the Mergestate as this might be the second time called */
   ms.n = 0;
   ms.min_gallop = MIN_GALLOP;
 
-  if (elements > 1)
+  if (nel > 1)
     {
-      octave_idx_type nremaining = elements; 
-      T *lo = v;
-      T *hi = v + elements;
+      octave_idx_type nremaining = nel; 
+      octave_idx_type lo = 0;
+
+      /* March over the array once, left to right, finding natural runs,
+       * and extending short natural runs to minrun elements.
+       */
+      octave_idx_type minrun = merge_compute_minrun (nremaining);
+      do 
+	{
+	  bool descending;
+	  octave_idx_type n;
+
+	  /* Identify next run. */
+	  n = count_run (data + lo, nremaining, descending, comp);
+	  if (n < 0)
+	    goto fail;
+	  if (descending)
+            std::reverse (data + lo, data + lo + n);
+	  /* If short, extend to min(minrun, nremaining). */
+	  if (n < minrun) 
+	    {
+	      const octave_idx_type force = nremaining <= minrun ? nremaining : minrun;
+	      binarysort (data + lo, force, n, comp);
+	      n = force;
+	    }
+	  /* Push run onto pending-runs stack, and maybe merge. */
+	  assert (ms.n < MAX_MERGE_PENDING);
+	  ms.pending[ms.n].base = lo;
+	  ms.pending[ms.n].len = n;
+	  ++ms.n;
+	  if (merge_collapse (data, comp) < 0)
+	    goto fail;
+	  /* Advance to find next run. */
+	  lo += n;
+	  nremaining -= n;
+	}
+      while (nremaining);
+
+      merge_force_collapse (data, comp);
+    }
+
+fail:
+  return;
+}
+
+template <class T>
+template <class Comp>
+void
+octave_sort<T>::sort (T *data, octave_idx_type *idx, octave_idx_type nel, 
+                      Comp comp)
+{
+  /* Re-initialize the Mergestate as this might be the second time called */
+  ms.n = 0;
+  ms.min_gallop = MIN_GALLOP;
+
+  if (nel > 1)
+    {
+      octave_idx_type nremaining = nel; 
+      octave_idx_type lo = 0;
 
       /* March over the array once, left to right, finding natural runs,
        * and extending short natural runs to minrun elements.
@@ -953,16 +1469,19 @@
 	  octave_idx_type n;
 
 	  /* Identify next run. */
-	  n = count_run (lo, hi, descending);
+	  n = count_run (data + lo, nremaining, descending, comp);
 	  if (n < 0)
 	    goto fail;
 	  if (descending)
-	    reverse_slice (lo, lo + n);
+            {
+              std::reverse (data + lo, data + lo + n);
+              std::reverse (idx + lo, idx + lo + n);
+            }
 	  /* If short, extend to min(minrun, nremaining). */
 	  if (n < minrun) 
 	    {
 	      const octave_idx_type force = nremaining <= minrun ? nremaining : minrun;
-	      binarysort (lo, lo + force, lo + n);
+	      binarysort (data + lo, idx + lo, force, n, comp);
 	      n = force;
 	    }
 	  /* Push run onto pending-runs stack, and maybe merge. */
@@ -970,7 +1489,7 @@
 	  ms.pending[ms.n].base = lo;
 	  ms.pending[ms.n].len = n;
 	  ++ms.n;
-	  if (merge_collapse () < 0)
+	  if (merge_collapse (data, idx, comp) < 0)
 	    goto fail;
 	  /* Advance to find next run. */
 	  lo += n;
@@ -978,13 +1497,63 @@
 	}
       while (nremaining);
 
-      merge_force_collapse ();
+      merge_force_collapse (data, idx, comp);
     }
 
 fail:
   return;
 }
 
+template <class T>
+void
+octave_sort<T>::sort (T *data, octave_idx_type nel)
+{
+#ifdef INLINE_ASCENDING_SORT
+  if (compare == ascending_compare)
+    sort (data, nel, std::less<T> ());
+  else
+#endif
+#ifdef INLINE_DESCENDING_SORT    
+    if (compare == descending_compare)
+    sort (data, nel, std::greater<T> ());
+  else
+#endif
+    if (compare)
+      sort (data, nel, compare);
+}
+
+template <class T>
+void
+octave_sort<T>::sort (T *data, octave_idx_type *idx, octave_idx_type nel)
+{
+#ifdef INLINE_ASCENDING_SORT
+  if (compare == ascending_compare)
+    sort (data, idx, nel, std::less<T> ());
+  else
+#endif
+#ifdef INLINE_DESCENDING_SORT    
+    if (compare == descending_compare)
+    sort (data, idx, nel, std::greater<T> ());
+  else
+#endif
+    if (compare)
+      sort (data, idx, nel, compare);
+}
+
+template <class T>
+bool 
+octave_sort<T>::ascending_compare (T x, T y)
+{
+  return x < y;
+}
+
+template <class T>
+bool 
+octave_sort<T>::descending_compare (T x, T y)
+{
+  return x > y;
+}
+
 /*
 ;;; Local Variables: ***
 ;;; mode: C++ ***
--- a/liboctave/oct-sort.h	Mon Feb 09 01:56:06 2009 -0500
+++ b/liboctave/oct-sort.h	Mon Feb 09 13:05:35 2009 +0100
@@ -113,7 +113,12 @@
 
   void set_compare (bool (*comp) (T, T)) { compare = comp; }
 
-  void sort (T *v, octave_idx_type elements);
+  void sort (T *data, octave_idx_type nel);
+
+  void sort (T *data, octave_idx_type *idx, octave_idx_type nel);
+
+  static bool ascending_compare (T, T);
+  static bool descending_compare (T, T);
 
 private:
 
@@ -127,8 +132,7 @@
   
   struct s_slice 
   {
-    T *base;
-    octave_idx_type len;
+    octave_idx_type base, len;
   };
   
   struct MergeState 
@@ -142,6 +146,7 @@
     // 'a' is temp storage to help with merges.  It contains room for
     // alloced entries.
     T *a;               // may point to temparray below
+    octave_idx_type *ia;
     octave_idx_type alloced;
     
     // A stack of n pending runs yet to be merged.  Run #i starts at
@@ -160,15 +165,25 @@
   
   MergeState ms;
   
-  void reverse_slice (T *lo, T *hi);
-  
-  void binarysort (T *lo, T *hi, T *start);
+    
+  template <class Comp>
+  void binarysort (T *data, octave_idx_type nel, 
+              octave_idx_type start, Comp comp);
+    
+  template <class Comp>
+  void binarysort (T *data, octave_idx_type *idx, octave_idx_type nel, 
+              octave_idx_type start, Comp comp);
     
-  octave_idx_type count_run (T *lo, T *hi, bool& descending);
+  template <class Comp>
+  octave_idx_type count_run (T *lo, octave_idx_type n, bool& descending, Comp comp);
 
-  octave_idx_type gallop_left (T key, T *a, octave_idx_type n, octave_idx_type hint);
+  template <class Comp>
+  octave_idx_type gallop_left (T key, T *a, octave_idx_type n, octave_idx_type hint,
+                               Comp comp);
 
-  octave_idx_type gallop_right (T key, T *a, octave_idx_type n, octave_idx_type hint);
+  template <class Comp>
+  octave_idx_type gallop_right (T key, T *a, octave_idx_type n, octave_idx_type hint,
+                                Comp comp);
 
   void merge_init (void);
 
@@ -176,17 +191,56 @@
 
   int merge_getmem (octave_idx_type need);
 
-  int merge_lo (T *pa, octave_idx_type na, T *pb, octave_idx_type nb);
+  int merge_getmemi (octave_idx_type need);
+
+  template <class Comp>
+  int merge_lo (T *pa, octave_idx_type na, 
+                T *pb, octave_idx_type nb,
+                Comp comp);
 
-  int merge_hi (T *pa, octave_idx_type na, T *pb, octave_idx_type nb);
+  template <class Comp>
+  int merge_lo (T *pa, octave_idx_type *ipa, octave_idx_type na, 
+                T *pb, octave_idx_type *ipb, octave_idx_type nb,
+                Comp comp);
+
+  template <class Comp>
+  int merge_hi (T *pa, octave_idx_type na, 
+                T *pb, octave_idx_type nb,
+                Comp comp);
 
-  int merge_at (octave_idx_type i);
+  template <class Comp>
+  int merge_hi (T *pa, octave_idx_type *ipa, octave_idx_type na, 
+                T *pb, octave_idx_type *ipb, octave_idx_type nb,
+                Comp comp);
+
+  template <class Comp>
+  int merge_at (octave_idx_type i, T *data,
+                Comp comp);
 
-  int merge_collapse (void);
+  template <class Comp>
+  int merge_at (octave_idx_type i, T *data, octave_idx_type *idx,
+                Comp comp);
+
+  template <class Comp>
+  int merge_collapse (T *data, Comp comp);
 
-  int merge_force_collapse (void);
+  template <class Comp>
+  int merge_collapse (T *data, octave_idx_type *idx, Comp comp);
+
+  template <class Comp>
+  int merge_force_collapse (T *data, Comp comp);
+
+  template <class Comp>
+  int merge_force_collapse (T *data, octave_idx_type *idx, Comp comp);
 
   octave_idx_type merge_compute_minrun (octave_idx_type n);
+
+  template <class Comp>
+  void sort (T *data, octave_idx_type nel, Comp comp);
+
+  template <class Comp>
+  void sort (T *data, octave_idx_type *idx, octave_idx_type nel, Comp comp);
+
 };
 
 template <class T>
--- a/src/ChangeLog	Mon Feb 09 01:56:06 2009 -0500
+++ b/src/ChangeLog	Mon Feb 09 13:05:35 2009 +0100
@@ -1,3 +1,7 @@
+2009-02-09  Jaroslav Hajek  <highegg@gmail.com>
+
+	* TEMPLATE-INST/Array-tc.cc: Reflect changes in octave_sort.
+
 2009-02-08  John W. Eaton  <jwe@octave.org>
 
 	* octave.cc (initialize_pathsearch): Delete.
--- a/src/TEMPLATE-INST/Array-tc.cc	Mon Feb 09 01:56:06 2009 -0500
+++ b/src/TEMPLATE-INST/Array-tc.cc	Mon Feb 09 13:05:35 2009 +0100
@@ -40,38 +40,22 @@
 
 #include "oct-obj.h"
 
-#define IFLT(a, b) if (compare ? compare ((a), (b)) : true)
-
 #include "oct-sort.cc"
 
 template <>
 bool
-ascending_compare (octave_value a, octave_value b)
+octave_sort<octave_value>::ascending_compare (octave_value a, octave_value b)
 {
   return (a.string_value () < b.string_value ());
 }
 
 template <>
 bool
-ascending_compare (const octave_value *a, const octave_value *b)
-{
-  return (a->string_value () < b->string_value ());
-}
-
-template <>
-bool
-descending_compare (octave_value a, octave_value b)
+octave_sort<octave_value>::descending_compare (octave_value a, octave_value b)
 {
   return (a.string_value () > b.string_value ());
 }
 
-template <>
-bool
-descending_compare (const octave_value *a, const octave_value *b)
-{
-  return (a->string_value () > b->string_value ());
-}
-
 INSTANTIATE_ARRAY_SORT (octave_value);
 
 template class OCTINTERP_API Array<octave_value>;