view liboctave/oct-convn.cc @ 10385:56116dceb1e0

add omitted source from the last change
author Jaroslav Hajek <highegg@gmail.com>
date Tue, 02 Mar 2010 10:59:05 +0100
parents
children 5af0b4bb384d
line wrap: on
line source

/*

Copyright (C) 2010 VZLU Prague

This file is part of Octave.

Octave is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

Octave is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with Octave; see the file COPYING.  If not, see
<http://www.gnu.org/licenses/>.

*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <iostream>
#include <algorithm>
#include "oct-convn.h"
#include "oct-locbuf.h"


// FIXME: Only the axpy-form accumulation is used. This is natural for outer
// convolution as it requires no boundary treatment.
// This typically requires one more memory store per operation, but as the
// memory access pattern is equivalent (switching ro and rw parts), I wouldn't
// expect a significant difference. cf. for instance sum(), where row-wise sum
// (axpy-based) is no slower than column-wise (dot-based).
// It would be nice, however, if the inner convolution was computed directly by
// dot-based accumulation.

// FIXME: Specifying the kernel as outer product should probably get special
// treatment.

// All kernels smaller than 7x5 get specialized code.
#define MAX_KERNEL_SIZE_M 7
#define MAX_KERNEL_SIZE_N 5

template <class T, class R, int mb, int nb>
static void 
convolve_2d_kernel_axpy (const T *a, octave_idx_type ma, octave_idx_type na,
                         const R *b, T *c, octave_idx_type ldc)
{
  for (octave_idx_type ja = 0; ja < na; ja++)
    for (octave_idx_type ia = 0; ia < ma; ia++)
      {
        T aij = a[ma*ja + ia];
        // The following double loop is a candidate for complete unrolling.
        for (int jb = 0; jb < nb; jb++)
          for (int ib = 0; ib < mb; ib++)
            c[(ja+jb)*ldc + (ia+ib)] += aij * b[mb*jb+ib];
      }
}

// Kernel dispatcher.
template <class T, class R>
static void
convolve_2d_axpy (const T *a, octave_idx_type ma, octave_idx_type na,
                  const R *b, octave_idx_type mb, octave_idx_type nb,
                  T *c, octave_idx_type ldc)
{
  // Table of kernels.
  static void (*table[MAX_KERNEL_SIZE_M][MAX_KERNEL_SIZE_N]) 
    (const T *, octave_idx_type, octave_idx_type, const R *, T *, octave_idx_type)
    = {
        // This must be repeated MAX_KERNEL_SIZE-times. I do not see a way to
        // automate this.
#define STORE_KERNEL_POINTER(M,N) \
        convolve_2d_kernel_axpy<T,R,M,N>
#define STORE_KERNEL_ROW(M) \
          { \
            STORE_KERNEL_POINTER(M,1), \
            STORE_KERNEL_POINTER(M,2), \
            STORE_KERNEL_POINTER(M,3), \
            STORE_KERNEL_POINTER(M,4), \
            STORE_KERNEL_POINTER(M,5), \
          }

        STORE_KERNEL_ROW(1),
        STORE_KERNEL_ROW(2),
        STORE_KERNEL_ROW(3),
        STORE_KERNEL_ROW(4),
        STORE_KERNEL_ROW(5),
        STORE_KERNEL_ROW(6),
        STORE_KERNEL_ROW(7),
    };

  if (mb != 0 && nb != 0)
    (*table[mb-1][nb-1]) (a, ma, na, b, c, ldc);
}

// 2d convolution with a matrix kernel.
template <class T, class R>
static void 
convolve_2d (const T *a, octave_idx_type ma, octave_idx_type na,
             const R *b, octave_idx_type mb, octave_idx_type nb,
             T *c)
{
  octave_idx_type ldc = ma + mb - 1;
  if (mb <= MAX_KERNEL_SIZE_M && nb <= MAX_KERNEL_SIZE_N)
    {
      // Call kernel directly on b.
      convolve_2d_axpy (a, ma, na, b, mb, nb, c, ldc);
    }
  else
    {
      // Split b to blocks.
      OCTAVE_LOCAL_BUFFER (R, b1, MAX_KERNEL_SIZE_M*MAX_KERNEL_SIZE_N);
      for (octave_idx_type jb = 0; jb < nb; jb += MAX_KERNEL_SIZE_N)
        {
          octave_idx_type nb1 = std::min (nb - jb, MAX_KERNEL_SIZE_N);
          for (octave_idx_type ib = 0; ib < mb; ib += MAX_KERNEL_SIZE_M)
            {
              octave_idx_type mb1 = std::min (mb - ib, MAX_KERNEL_SIZE_M);

              // Copy block to buffer.
              R *bf = b1;
              for (octave_idx_type j = jb; j < jb+nb1; j++)
                for (octave_idx_type i = ib; i < ib+mb1; i++)
                  *bf++ = b[j*mb + i];

              // Call kernel.
              convolve_2d_axpy (a, ma, na, b1, mb1, nb1, c + ldc*jb + ib, ldc);
            }
        }
    }

}

template <class T, class R>
void convolve_nd (const T *a, const dim_vector& ad, const dim_vector& acd,
                  const R *b, const dim_vector& bd, const dim_vector& bcd,
                  T *c, const dim_vector& ccd, int nd)
{
  if (nd == 2)
    convolve_2d<T, R> (a, ad(0), ad(1), b, bd(0), bd(1), c);
  else
    {
      octave_idx_type ma = acd(nd-2), na = ad(nd-1), mb = bcd(nd-2), nb = bd(nd-1);
      octave_idx_type ldc = ccd(nd-2);
      for (octave_idx_type jb = 0; jb < nb; jb++)
        {
          for (octave_idx_type ja = 0; ja < na; ja++)
            convolve_nd<T, R> (a + ma*ja, ad, acd, b + mb*jb, bd, bcd, 
                               c + ldc*(ja+jb), ccd, nd-1);
        }
    }
}

// Arbitrary convolutor. 
// The 2nd array is assumed to be the smaller one.
template <class T, class R>
static MArray<T>
convolve (const MArray<T>& a, const MArray<R>& b,
          convn_type ct)
{
  if (a.is_empty () || b.is_empty ())
    return MArray<T> ();

  int nd = std::max (a.ndims (), b.ndims ());
  const dim_vector adims = a.dims ().redim (nd), bdims = b.dims ().redim (nd);
  dim_vector cdims = dim_vector::alloc (nd);

  for (int i = 0; i < nd; i++)
    cdims(i) = std::max (adims(i) + bdims(i) - 1, 0);

  MArray<T> c (cdims, T());

  convolve_nd<T, R> (a.fortran_vec (), adims, adims.cumulative (),
                     b.fortran_vec (), bdims, bdims.cumulative (),
                     c.fortran_vec (), cdims.cumulative (), nd);

  // Pick the relevant part.
  Array<idx_vector> sidx (nd, 1);

  switch (ct)
    {
    case convn_valid:
        {
          for (int i = 0; i < nd; i++)
            sidx(i) = idx_vector (bdims(i)-1, adims(i));
          c = c.index (sidx);
          break;
        }
    case convn_same:
        {
          for (int i = 0; i < nd; i++)
            sidx(i) = idx_vector::make_range ((bdims(i)-1)/2, 1, adims(i));
          c = c.index (sidx);
          break;
        }
    default:
      break;
    }

  return c;
}

#define CONV_DEFS(TPREF, RPREF) \
TPREF ## NDArray \
convn (const TPREF ## NDArray& a, const RPREF ## NDArray& b, convn_type ct) \
{ \
  return convolve (a, b, ct); \
} \
TPREF ## Matrix \
convn (const TPREF ## Matrix& a, const RPREF ## Matrix& b, convn_type ct) \
{ \
  return convolve (a, b, ct); \
} \
TPREF ## Matrix \
convn (const TPREF ## Matrix& a, const RPREF ## ColumnVector& c, \
       const RPREF ## RowVector& r, convn_type ct) \
{ \
  return convolve (a, c * r, ct); \
}

CONV_DEFS ( , )
CONV_DEFS (Complex, )
CONV_DEFS (Complex, Complex)
CONV_DEFS (Float, Float)
CONV_DEFS (FloatComplex, Float)
CONV_DEFS (FloatComplex, FloatComplex)