changeset 9012:9f5e095555fc

smarter algorithm for permute
author Jaroslav Hajek <highegg@gmail.com>
date Wed, 25 Mar 2009 11:40:06 +0100
parents dd5725531732
children 3b1908b58662
files liboctave/Array.cc liboctave/ChangeLog
diffstat 2 files changed, 68 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/liboctave/Array.cc	Tue Mar 24 19:41:56 2009 -0400
+++ b/liboctave/Array.cc	Wed Mar 25 11:40:06 2009 +0100
@@ -461,6 +461,7 @@
 class rec_permute_helper
 {
   octave_idx_type *dim, *stride;
+  bool use_blk;
   int top;
 
 public:
@@ -472,37 +473,78 @@
       dim = new octave_idx_type [2*n];
       // A hack to avoid double allocation
       stride = dim + n;
-      top = 0;
 
       // Get cumulative dimensions.
       OCTAVE_LOCAL_BUFFER (octave_idx_type, cdim, n+1);
       cdim[0] = 1;
       for (int i = 1; i < n+1; i++) cdim[i] = cdim[i-1] * dv(i-1);
 
-      int k = 0;
-      // Reduce leading identity
-      for (; k < n && perm(k) == k; k++) ;
-      if (k > 0)
+      // Setup the permuted strides.
+      for (int k = 0; k < n; k++)
         {
-          dim[0] = cdim[k];
-          stride[0] = 1;
+          int kk = perm(k);
+          dim[k] = dv(kk);
+          stride[k] = cdim[kk];
         }
-      else
-        top = -1;
-
-      // Setup the permuted strides.
-      for (; k < n; k++)
+
+      // Reduce contiguous runs.
+      top = 0;
+      for (int k = 1; k < n; k++)
         {
-          ++top;
-          int kk = perm(k);
-          dim[top] = dv(kk);
-          stride[top] = cdim[kk];
+          if (stride[k] == stride[top]*dim[top])
+            dim[top] *= dim[k];
+          else
+            {
+              top++;
+              dim[top] = dim[k];
+              stride[top] = stride[k];
+            }
         }
 
+      // Determine whether we can use block transposes.
+      use_blk = top >= 1 && stride[1] == 1 && stride[0] == dim[1];
+
     }
 
   ~rec_permute_helper (void) { delete [] dim; }
 
+  // Helper method for fast blocked transpose.
+  template <class T>
+  T *blk_trans (const T *src, T *dest, octave_idx_type nr, octave_idx_type nc) const
+    {
+      static const octave_idx_type m = 8;
+      OCTAVE_LOCAL_BUFFER (T, blk, m*m);
+      for (octave_idx_type kr = 0; kr < nr; kr += m)
+        for (octave_idx_type kc = 0; kc < nc; kc += m)
+          {
+            octave_idx_type lr = std::min (m, nr - kr);
+            octave_idx_type lc = std::min (m, nc - kc);
+            if (lr == m && lc == m)
+              {
+                const T *ss = src + kc * nr + kr;
+                for (octave_idx_type j = 0; j < m; j++)
+                  for (octave_idx_type i = 0; i < m; i++)
+                    blk[j*m+i] = ss[j*nr + i];
+                T *dd = dest + kr * nc + kc;
+                for (octave_idx_type j = 0; j < m; j++)
+                  for (octave_idx_type i = 0; i < m; i++)
+                    dd[j*nc+i] = blk[i*m+j];
+              }
+            else
+              {
+                const T *ss = src + kc * nr + kr;
+                for (octave_idx_type j = 0; j < lc; j++)
+                  for (octave_idx_type i = 0; i < lr; i++)
+                    blk[j*m+i] = ss[j*nr + i];
+                T *dd = dest + kr * nc + kc;
+                for (octave_idx_type j = 0; j < lr; j++)
+                  for (octave_idx_type i = 0; i < lc; i++)
+                    dd[j*nc+i] = blk[i*m+j];
+              }
+          }
+
+      return dest + nr*nc;
+    }
 private:
 
   // Recursive N-d generalized transpose
@@ -521,8 +563,9 @@
 
               dest += len;
             }
-
         }
+      else if (use_blk && lev == 1)
+        dest = blk_trans (src, dest, dim[1], dim[0]);
       else
         {
           octave_idx_type step = stride[lev], len = dim[lev];
--- a/liboctave/ChangeLog	Tue Mar 24 19:41:56 2009 -0400
+++ b/liboctave/ChangeLog	Wed Mar 25 11:40:06 2009 +0100
@@ -1,3 +1,11 @@
+2009-03-25  Jaroslav Hajek  <highegg@gmail.com>
+
+	* Array.cc (rec_permute_helper::use_blk): New field.
+	(rec_permute_helper::blk_trans): New method.
+	(rec_permute_helper::rec_permute_helper): Use smart reductions,
+	detect possibility of using blocked transpose.
+	(rec_permute_helper::do_permute): Use blocked transpose if possible.
+
 2009-03-23  Jaroslav Hajek  <highegg@gmail.com>
 
 	* idx-vector.cc (convert_index(double,...)): Simplify.