changeset 9876:21d81d06b221

cache-aligned loop for rowwise dot
author Jaroslav Hajek <highegg@gmail.com>
date Thu, 26 Nov 2009 21:06:21 +0100
parents 47f36dd27203
children cac3b4e5035b
files libcruft/ChangeLog libcruft/blas-xtra/cdotc3.f libcruft/blas-xtra/ddot3.f libcruft/blas-xtra/sdot3.f libcruft/blas-xtra/zdotc3.f src/ChangeLog src/DLD-FUNCTIONS/dot.cc
diffstat 7 files changed, 42 insertions(+), 42 deletions(-) [+]
line wrap: on
line diff
--- a/libcruft/ChangeLog	Thu Nov 26 13:17:39 2009 +0100
+++ b/libcruft/ChangeLog	Thu Nov 26 21:06:21 2009 +0100
@@ -1,3 +1,10 @@
+2009-11-26  Jaroslav Hajek  <highegg@gmail.com>
+
+	* blas-xtra/sdot3.f: Use nested cache-aligned loop for general case.
+	* blas-xtra/ddot3.f: Ditto.
+	* blas-xtra/cdotc3.f: Ditto.
+	* blas-xtra/zdotc3.f: Ditto.
+
 2009-11-26  Jaroslav Hajek  <highegg@gmail.com>
 
 	* blas-xtra/sdot3.f: New source.
--- a/libcruft/blas-xtra/cdotc3.f	Thu Nov 26 13:17:39 2009 +0100
+++ b/libcruft/blas-xtra/cdotc3.f	Thu Nov 26 21:06:21 2009 +0100
@@ -29,8 +29,6 @@
       complex a(m,k,n),b(m,k,n)
       complex c(m,n)
 
-      integer kk
-      parameter (kk = 64)
       complex cdotc
       external cdotc
 
@@ -38,22 +36,21 @@
       if (m <= 0 .or. n <= 0) return
 
       if (m == 1) then
-c the column-major case
+c the column-major case.
         do j = 1,n
           c(1,j) = cdotc(k,a(1,1,j),1,b(1,1,j),1)
         end do
       else
-c here the product is row-wise, but we'd still like to use BLAS's dot
-c for its usually better accuracy. let's do a compromise and split the
-c middle dimension.
+c We prefer performance here, because that's what we generally
+c do by default in reduction functions. Besides, the accuracy
+c of xDOT is questionable. Hence, do a cache-aligned nested loop.
         do j = 1,n
-          l = mod(k,kk)
           do i = 1,m
-            c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m)
+            c(i,j) = 0e0
           end do
-          do l = l+1,k,kk
+          do l = 1,k
             do i = 1,m
-              c(i,j) = c(i,j) + cdotc(kk,a(i,l,j),m,b(i,l,j),m)
+              c(i,j) = c(i,j) + conjg(a(i,l,j))*b(i,l,j)
             end do
           end do
         end do
--- a/libcruft/blas-xtra/ddot3.f	Thu Nov 26 13:17:39 2009 +0100
+++ b/libcruft/blas-xtra/ddot3.f	Thu Nov 26 21:06:21 2009 +0100
@@ -29,8 +29,6 @@
       double precision a(m,k,n),b(m,k,n)
       double precision c(m,n)
 
-      integer kk
-      parameter (kk = 64)
       double precision ddot
       external ddot
 
@@ -39,22 +37,21 @@
       if (m <= 0 .or. n <= 0) return
 
       if (m == 1) then
-c the column-major case
+c the column-major case.
         do j = 1,n
           c(1,j) = ddot(k,a(1,1,j),1,b(1,1,j),1)
         end do
       else
-c here the product is row-wise, but we'd still like to use BLAS's dot
-c for its usually better accuracy. let's do a compromise and split the
-c middle dimension.
+c We prefer performance here, because that's what we generally
+c do by default in reduction functions. Besides, the accuracy
+c of xDOT is questionable. Hence, do a cache-aligned nested loop.
         do j = 1,n
-          l = mod(k,kk)
           do i = 1,m
-            c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m)
+            c(i,j) = 0d0
           end do
-          do l = l+1,k,kk
+          do l = 1,k
             do i = 1,m
-              c(i,j) = c(i,j) + ddot(kk,a(i,l,j),m,b(i,l,j),m)
+              c(i,j) = c(i,j) + a(i,l,j)*b(i,l,j)
             end do
           end do
         end do
--- a/libcruft/blas-xtra/sdot3.f	Thu Nov 26 13:17:39 2009 +0100
+++ b/libcruft/blas-xtra/sdot3.f	Thu Nov 26 21:06:21 2009 +0100
@@ -29,8 +29,6 @@
       real a(m,k,n),b(m,k,n)
       real c(m,n)
 
-      integer kk
-      parameter (kk = 64)
       real sdot
       external sdot
 
@@ -38,22 +36,21 @@
       if (m <= 0 .or. n <= 0) return
 
       if (m == 1) then
-c the column-major case
+c the column-major case.
         do j = 1,n
           c(1,j) = sdot(k,a(1,1,j),1,b(1,1,j),1)
         end do
       else
-c here the product is row-wise, but we'd still like to use BLAS's dot
-c for its usually better accuracy. let's do a compromise and split the
-c middle dimension.
+c We prefer performance here, because that's what we generally
+c do by default in reduction functions. Besides, the accuracy
+c of xDOT is questionable. Hence, do a cache-aligned nested loop.
         do j = 1,n
-          l = mod(k,kk)
           do i = 1,m
-            c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m)
+            c(i,j) = 0d0
           end do
-          do l = l+1,k,kk
+          do l = 1,k
             do i = 1,m
-              c(i,j) = c(i,j) + sdot(kk,a(i,l,j),m,b(i,l,j),m)
+              c(i,j) = c(i,j) + a(i,l,j)*b(i,l,j)
             end do
           end do
         end do
--- a/libcruft/blas-xtra/zdotc3.f	Thu Nov 26 13:17:39 2009 +0100
+++ b/libcruft/blas-xtra/zdotc3.f	Thu Nov 26 21:06:21 2009 +0100
@@ -29,8 +29,6 @@
       double complex a(m,k,n),b(m,k,n)
       double complex c(m,n)
 
-      integer kk
-      parameter (kk = 64)
       double complex zdotc
       external zdotc
 
@@ -38,22 +36,21 @@
       if (m <= 0 .or. n <= 0) return
 
       if (m == 1) then
-c the column-major case
+c the column-major case.
         do j = 1,n
           c(1,j) = zdotc(k,a(1,1,j),1,b(1,1,j),1)
         end do
       else
-c here the product is row-wise, but we'd still like to use BLAS's dot
-c for its usually better accuracy. let's do a compromise and split the
-c middle dimension.
+c We prefer performance here, because that's what we generally
+c do by default in reduction functions. Besides, the accuracy
+c of xDOT is questionable. Hence, do a cache-aligned nested loop.
         do j = 1,n
-          l = mod(k,kk)
           do i = 1,m
-            c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m)
+            c(i,j) = 0d0
           end do
-          do l = l+1,k,kk
+          do l = 1,k
             do i = 1,m
-              c(i,j) = c(i,j) + zdotc(kk,a(i,l,j),m,b(i,l,j),m)
+              c(i,j) = c(i,j) + conjg(a(i,l,j))*b(i,l,j)
             end do
           end do
         end do
--- a/src/ChangeLog	Thu Nov 26 13:17:39 2009 +0100
+++ b/src/ChangeLog	Thu Nov 26 21:06:21 2009 +0100
@@ -1,3 +1,7 @@
+2009-11-26  Jaroslav Hajek  <highegg@gmail.com>
+
+	* DLD-FUNCTIONS/dot.cc (Fdot): Update docs.
+
 2009-11-26  Jaroslav Hajek  <highegg@gmail.com>
 
 	* DLD-FUNCTIONS/dot.cc: New source.
--- a/src/DLD-FUNCTIONS/dot.cc	Thu Nov 26 13:17:39 2009 +0100
+++ b/src/DLD-FUNCTIONS/dot.cc	Thu Nov 26 21:06:21 2009 +0100
@@ -111,8 +111,9 @@
 given, calculate the dot products along this dimension.\n\
 \n\
 This is equivalent to doing @code{sum (conj (@var{X}) .* @var{Y}, @var{dim})},\n\
-but avoids forming a temporary array and uses the BLAS xDOT functions,\n\
-usually resulting in increased accuracy of the computation.\n\
+but avoids forming a temporary array and is faster.\n\
+When @var{X} and @var{Y} are column vectors, the result is equivalent to\n\
+@code{ @var{X}'*@var{Y} }.\n\
 @end deftypefn")
 {
   octave_value retval;