changeset 8758:83c9d60c3c47

implement short-circuiting row-reduction any/all algorithm
author Jaroslav Hajek <highegg@gmail.com>
date Mon, 16 Feb 2009 13:53:09 +0100
parents 79576d40acb6
children c32a08dccae6
files liboctave/ChangeLog liboctave/mx-inlines.cc
diffstat 2 files changed, 38 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/liboctave/ChangeLog	Mon Feb 16 12:45:15 2009 +0100
+++ b/liboctave/ChangeLog	Mon Feb 16 13:53:09 2009 +0100
@@ -1,3 +1,8 @@
+2009-02-16  Jaroslav Hajek  <highegg@gmail.com>
+	
+	* mx-inlines.cc (OP_ROW_SHORT_CIRCUIT): New macro.
+	(mx_inline_any, mx_inline_all): Override row-reduction case.
+
 2009-02-16  Jaroslav Hajek  <highegg@gmail.com>
 	
 	* mx-inlines.cc (OP_RED_FCNN): Use explicit type qualification.
--- a/liboctave/mx-inlines.cc	Mon Feb 16 12:45:15 2009 +0100
+++ b/liboctave/mx-inlines.cc	Mon Feb 16 13:53:09 2009 +0100
@@ -30,6 +30,7 @@
 #include "quit.h"
 
 #include "oct-cmplx.h"
+#include "oct-locbuf.h"
 
 template <class R, class S>
 inline void
@@ -317,16 +318,6 @@
 #define OP_RED_ANYC(ac, el) if (xis_true (el)) { ac = true; break; } else continue
 #define OP_RED_ALLC(ac, el) if (xis_false (el)) { ac = false; break; } else continue
 
-// Row any/all reductions are a tradeoff - we traverse the array by
-// columns to gain cache coherence, but sacrifice short-circuiting for that.
-// For certain logical arrays, this could mean a significant loss.
-// A more sophisticated implementation could introduce a buffer of active
-// row indices to achieve both. Right now, I don't see the operation as
-// important enough.
-
-#define OP_RED_ANYR(ac, el) if (xis_true (el)) ac = true
-#define OP_RED_ALLR(ac, el) if (xis_false (el)) ac = false
-
 #define OP_RED_FCN(F, TSRC, TRES, OP, ZERO) \
 template <class T> \
 inline TRES \
@@ -367,8 +358,38 @@
 OP_RED_FCN2 (mx_inline_prod, T, T, OP_RED_PROD, 1)
 OP_RED_FCN2 (mx_inline_sumsq, T, T, OP_RED_SUMSQ, 0)
 OP_RED_FCN2 (mx_inline_sumsq, std::complex<T>, T, OP_RED_SUMSQC, 0)
-OP_RED_FCN2 (mx_inline_any, T, bool, OP_RED_ANYR, false)
-OP_RED_FCN2 (mx_inline_all, T, bool, OP_RED_ALLR, true)
+
+// Using the general code for any/all would sacrifice short-circuiting.
+// OTOH, going by rows would sacrifice cache-coherence. The following algorithm
+// will achieve both, at the cost of a temporary octave_idx_type array.
+
+#define OP_ROW_SHORT_CIRCUIT(F, PRED, ZERO) \
+template <class T> \
+inline void \
+F (const T* v, bool *r, octave_idx_type m, octave_idx_type n) \
+{ \
+  /* FIXME: it may be sub-optimal to allocate the buffer here. */ \
+  OCTAVE_LOCAL_BUFFER (octave_idx_type, iact, m); \
+  for (octave_idx_type i = 0; i < m; i++) iact[i] = i; \
+  octave_idx_type nact = m; \
+  for (octave_idx_type j = 0; j < n; j++) \
+    { \
+      octave_idx_type k = 0; \
+      for (octave_idx_type i = 0; i < nact; i++) \
+        { \
+          octave_idx_type ia = iact[i]; \
+          if (! PRED (v[ia])) \
+            iact[k++] = ia; \
+        } \
+      nact = k; \
+      v += m; \
+    } \
+  for (octave_idx_type i = 0; i < m; i++) r[i] = ! ZERO; \
+  for (octave_idx_type i = 0; i < nact; i++) r[iact[i]] = ZERO; \
+}
+
+OP_ROW_SHORT_CIRCUIT (mx_inline_any, xis_true, false)
+OP_ROW_SHORT_CIRCUIT (mx_inline_all, xis_false, true)
 
 #define OP_RED_FCNN(F, TSRC, TRES) \
 template <class T> \