5798
|
1 ## Copyright (C) 2000-2006 Paul Kienzle |
5589
|
2 ## |
|
3 ## This program is free software; you can redistribute it and/or modify |
|
4 ## it under the terms of the GNU General Public License as published by |
|
5 ## the Free Software Foundation; either version 2 of the License, or |
|
6 ## (at your option) any later version. |
|
7 ## |
|
8 ## This program is distributed in the hope that it will be useful, |
|
9 ## but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
11 ## GNU General Public License for more details. |
|
12 ## |
|
13 ## You should have received a copy of the GNU General Public License |
|
14 ## along with this program; if not, write to the Free Software |
|
15 ## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
|
16 ## 02110-1301 USA |
|
17 |
|
18 ## -*- texinfo -*- |
5798
|
19 ## @deftypefn {Function File} {} speed (@var{f}, @var{init}, @var{max_n}, @var{f2}, @var{tol}) |
|
20 ## @deftypefnx {Function File} {[@var{order}, @var{n}, @var{T_f}, @var{T_f2}] =} speed (@dots{}) |
5589
|
21 ## |
|
22 ## Determine the execution time of an expression for various @var{n}. |
|
23 ## The @var{n} are log-spaced from 1 to @var{max_n}. For each @var{n}, |
|
24 ## an initialization expression is computed to create whatever data |
5798
|
25 ## are needed for the test. If a second expression is given, the |
|
26 ## execution times of the two expressions will be compared. Called |
|
27 ## without output arguments the results are presented graphically. |
5589
|
28 ## |
|
29 ## @table @code |
|
30 ## @item @var{f} |
|
31 ## The expression to evaluate. |
|
32 ## |
|
33 ## @item @var{max_n} |
5798
|
34 ## The maximum test length to run. Default value is 100. Alternatively, |
|
35 ## use @code{[min_n,max_n]} or for complete control, @code{[n1,n2,@dots{},nk]}. |
5589
|
36 ## |
|
37 ## @item @var{init} |
|
38 ## Initialization expression for function argument values. Use @var{k} |
|
39 ## for the test number and @var{n} for the size of the test. This should |
|
40 ## compute values for all variables listed in args. Note that init will |
|
41 ## be evaluated first for k=0, so things which are constant throughout |
|
42 ## the test can be computed then. The default value is @code{@var{x} = |
|
43 ## randn (@var{n}, 1);}. |
|
44 ## |
|
45 ## @item @var{f2} |
|
46 ## An alternative expression to evaluate, so the speed of the two |
|
47 ## can be compared. Default is @code{[]}. |
|
48 ## |
|
49 ## @item @var{tol} |
|
50 ## If @var{tol} is @code{Inf}, then no comparison will be made between the |
|
51 ## results of expression @var{f} and expression @var{f2}. Otherwise, |
|
52 ## expression @var{f} should produce a value @var{v} and expression @var{f2} |
|
53 ## should produce a value @var{v2}, and these shall be compared using |
6429
|
54 ## @code{assert(@var{v},@var{v2},@var{tol})}. If @var{tol} is positive, |
|
55 ## the tolerance is assumed to be absolute. If @var{tol} is negative, |
|
56 ## the tolerance is assumed to be relative. The default is @code{eps}. |
5589
|
57 ## |
5798
|
58 ## @item @var{order} |
|
59 ## The time complexity of the expression @code{O(a n^p)}. This |
|
60 ## is a structure with fields @code{a} and @code{p}. |
5589
|
61 ## |
5798
|
62 ## @item @var{n} |
|
63 ## The values @var{n} for which the expression was calculated and the |
|
64 ## the execution time was greater than zero. |
5589
|
65 ## |
5798
|
66 ## @item @var{T_f} |
|
67 ## The nonzero execution times recorded for the expression @var{f} in seconds. |
|
68 ## |
|
69 ## @item @var{T_f2} |
|
70 ## The nonzero execution times recorded for the expression @var{f2} in seconds. |
|
71 ## If it is needed, the mean time ratio is just @code{mean(T_f./T_f2)}. |
|
72 ## |
5589
|
73 ## @end table |
|
74 ## |
5798
|
75 ## The slope of the execution time graph shows the approximate |
|
76 ## power of the asymptotic running time @code{O(n^p)}. This |
|
77 ## power is plotted for the region over which it is approximated |
|
78 ## (the latter half of the graph). The estimated power is not |
|
79 ## very accurate, but should be sufficient to determine the |
|
80 ## general order of your algorithm. It should indicate if for |
|
81 ## example your implementation is unexpectedly @code{O(n^2)} |
|
82 ## rather than @code{O(n)} because it extends a vector each |
|
83 ## time through the loop rather than preallocating one which is |
|
84 ## big enough. For example, in the current version of Octave, |
|
85 ## the following is not the expected @code{O(n)}: |
5589
|
86 ## |
5798
|
87 ## @example |
|
88 ## speed("for i=1:n,y@{i@}=x(i); end", "", [1000,10000]) |
|
89 ## @end example |
|
90 ## |
|
91 ## but it is if you preallocate the cell array @code{y}: |
5589
|
92 ## |
|
93 ## @example |
5798
|
94 ## speed("for i=1:n,y@{i@}=x(i);end", ... |
|
95 ## "x=rand(n,1);y=cell(size(x));", [1000,10000]) |
|
96 ## @end example |
|
97 ## |
|
98 ## An attempt is made to approximate the cost of the individual |
|
99 ## operations, but it is wildly inaccurate. You can improve the |
|
100 ## stability somewhat by doing more work for each @code{n}. For |
|
101 ## example: |
|
102 ## |
|
103 ## @example |
|
104 ## speed("airy(x)", "x=rand(n,10)", [10000,100000]) |
5589
|
105 ## @end example |
|
106 ## |
5798
|
107 ## When comparing a new and original expression, the line on the |
|
108 ## speedup ratio graph should be larger than 1 if the new expression |
|
109 ## is faster. Better algorithms have a shallow slope. Generally, |
|
110 ## vectorizing an algorithm will not change the slope of the execution |
|
111 ## time graph, but it will shift it relative to the original. For |
|
112 ## example: |
|
113 ## |
|
114 ## @example |
|
115 ## speed("v=sum(x)", "", [10000,100000], ... |
|
116 ## "v=0;for i=1:length(x),v+=x(i);end") |
|
117 ## @end example |
|
118 ## |
5589
|
119 ## A more complex example, if you had an original version of @code{xcorr} |
|
120 ## using for loops and another version using an FFT, you could compare the |
|
121 ## run speed for various lags as follows, or for a fixed lag with varying |
|
122 ## vector lengths as follows: |
|
123 ## |
|
124 ## @example |
|
125 ## speed("v=xcorr(x,n)", "x=rand(128,1);", 100, ... |
6429
|
126 ## "v2=xcorr_orig(x,n)", -100*eps) |
5589
|
127 ## speed("v=xcorr(x,15)", "x=rand(20+n,1);", 100, ... |
6429
|
128 ## "v2=xcorr_orig(x,n)", -100*eps) |
5589
|
129 ## @end example |
|
130 ## |
|
131 ## Assuming one of the two versions is in @var{xcorr_orig}, this would |
|
132 ## compare their speed and their output values. Note that the |
|
133 ## FFT version is not exact, so we specify an acceptable tolerance on |
|
134 ## the comparison @code{100*eps}, and the errors should be computed |
|
135 ## relatively, as @code{abs((@var{x} - @var{y})./@var{y})} rather than |
|
136 ## absolutely as @code{abs(@var{x} - @var{y})}. |
|
137 ## |
|
138 ## Type @code{example('speed')} to see some real examples. Note for |
|
139 ## obscure reasons, you can't run examples 1 and 2 directly using |
|
140 ## @code{demo('speed')}. Instead use, @code{eval(example('speed',1))} |
|
141 ## and @code{eval(example('speed',2))}. |
|
142 ## @end deftypefn |
|
143 |
|
144 ## TODO: consider two dimensional speedup surfaces for functions like kron. |
5798
|
function [__order, __test_n, __tnew, __torig] ...
      = speed (__f1, __init, __max_n, __f2, __tol)

  ## Measure execution time of expression __f1 for a range of problem
  ## sizes n, optionally comparing against expression __f2.
  ## In:  __f1    expression string to time (evaluated with eval)
  ##      __init  initialization expression; may use k (test index) and
  ##              n (test size); default "x = randn(n, 1);"
  ##      __max_n scalar max n, [min_n, max_n], or explicit list of n
  ##      __f2    alternative expression to compare against; default []
  ##      __tol   tolerance for assert(v1, v2, tol); Inf disables the
  ##              comparison; default eps
  ## Out: __order struct with fields a, p approximating time = a*n^p
  ##      __test_n, __tnew, __torig: sizes and nonzero times recorded.
  ## With no output arguments the results are plotted instead.
  ## All locals use a "__" prefix so that eval'ed user expressions are
  ## unlikely to clobber them.

  ## The function takes at most 5 arguments, so the upper bound is 5
  ## (the original "nargin > 6" could never trigger).
  if (nargin < 1 || nargin > 5)
    print_usage ();
  endif

  if (nargin < 2 || isempty (__init))
    __init = "x = randn(n, 1);";
  endif

  if (nargin < 3 || isempty (__max_n))
    __max_n = 100;
  endif

  if (nargin < 4)
    __f2 = [];
  endif

  if (nargin < 5 || isempty (__tol))
    __tol = eps;
  endif

  __numtests = 15;

  ## Let user specify range of n.
  if (isscalar (__max_n))
    __min_n = 1;
    assert (__max_n > __min_n);
    __test_n = logspace (0, log10 (__max_n), __numtests);
  elseif (length (__max_n) == 2)
    __min_n = __max_n(1);
    __max_n = __max_n(2);
    assert (__min_n >= 1);
    __test_n = logspace (log10 (__min_n), log10 (__max_n), __numtests);
  else
    ## Complete control: an explicit list of test sizes.
    __test_n = __max_n;
  endif
  __test_n = unique (round (__test_n));  # Force n to be an integer
  assert (__test_n >= 1);

  __torig = __tnew = zeros (size (__test_n));

  disp (strcat ("testing ", __f1, "\ninit: ", __init));

  ## Make sure the functions are freshly loaded by evaluating them at
  ## test_n(1); first have to initialize the args though.
  n = 1;
  k = 0;
  eval (strcat (__init, ";"));
  if (! isempty (__f2))
    eval (strcat (__f2, ";"));
  endif
  eval (strcat (__f1, ";"));

  ## Run the tests.
  for k = 1:length (__test_n)
    n = __test_n(k);
    eval (strcat (__init, ";"));

    printf ("n%i=%i ", k, n);
    fflush (stdout);
    eval (strcat ("__t=time();", __f1, "; __v1=ans; __t = time()-__t;"));
    if (__t < 0.25)
      ## Timer resolution is too coarse for fast expressions;
      ## take the best of three runs.
      eval (strcat ("__t2=time();", __f1, "; __t2 = time()-__t2;"));
      eval (strcat ("__t3=time();", __f1, "; __t3 = time()-__t3;"));
      __t = min ([__t, __t2, __t3]);
    endif
    __tnew(k) = __t;

    if (! isempty (__f2))
      eval (strcat ("__t=time();", __f2, "; __v2=ans; __t = time()-__t;"));
      if (__t < 0.25)
        eval (strcat ("__t2=time();", __f2, "; __t2 = time()-__t2;"));
        eval (strcat ("__t3=time();", __f2, "; __t3 = time()-__t3;"));
        ## BUG FIX: the original computed __t2 and __t3 here but never
        ## folded them into __t, unlike the __f1 branch above, so the
        ## best-of-three retiming was silently discarded for __f2.
        __t = min ([__t, __t2, __t3]);
      endif
      __torig(k) = __t;
      if (! isinf (__tol))
        ## Negative __tol means relative tolerance (assert convention).
        assert (__v1, __v2, __tol);
      endif
    endif
  endfor

  ## Drop times of zero (below timer resolution).
  if (! isempty (__f2))
    zidx = (__tnew < 100*eps | __torig < 100*eps);
    __test_n(zidx) = [];
    __tnew(zidx) = [];
    __torig(zidx) = [];
  else
    zidx = (__tnew < 100*eps);
    __test_n(zidx) = [];
    __tnew(zidx) = [];
  endif

  ## Approximate time complexity O(a n^p) from the latter half of the
  ## data (asymptotic regime) and return it if requested.
  tailidx = ceil (length (__test_n)/2):length (__test_n);
  p = polyfit (log (__test_n(tailidx)), log (__tnew(tailidx)), 1);
  if (nargout > 0)
    __order.p = p(1);
    __order.a = exp (p(2));
  endif

  ## Plot the data if no output is requested.
  doplot = (nargout == 0);

  if (doplot)
    figure;
  endif

  if (doplot && ! isempty (__f2))
    ## Left pane: speedup ratios in both directions.  Semicolons in the
    ## expressions would terminate the legend string, so replace them.
    subplot (1, 2, 1);
    semilogx (__test_n, __torig./__tnew,
              strcat ("-*r;", strrep (__f1, ";", "."), "/",
                      strrep (__f2, ";", "."), ";"),
              __test_n, __tnew./__torig,
              strcat ("-*g;", strrep (__f2, ";", "."), "/",
                      strrep (__f1, ";", "."), ";"));
    xlabel ("test length");
    title (__f1);
    ylabel ("speedup ratio");

    ## Right pane: raw best execution times in milliseconds.
    subplot (1, 2, 2);
    loglog (__test_n, __tnew*1000,
            strcat ("*-g;", strrep (__f1, ";", "."), ";"),
            __test_n, __torig*1000,
            strcat ("*-r;", strrep (__f2, ";", "."), ";"));

    xlabel ("test length");
    ylabel ("best execution time (ms)");
    title (strcat ("init: ", __init));

    ratio = mean (__torig ./ __tnew);
    printf ("\n\nMean runtime ratio = %.3g for '%s' vs '%s'\n",
            ratio, __f2, __f1);

  elseif (doplot)

    loglog (__test_n, __tnew*1000, "*-g;execution time;");
    xlabel ("test length");
    ylabel ("best execution time (ms)");
    title (strcat (__f1, " init: ", __init));

  endif

  if (doplot)

    ## Plot time complexity approximation (using milliseconds).
    order = sprintf ("O(n^%g)", round (10*p(1))/10);
    v = polyval (p, log (__test_n(tailidx)));

    loglog (__test_n(tailidx), exp(v)*1000, sprintf ("b;%s;", order));

    ## Get base time to 1 digit of accuracy.
    dt = exp (p(2));
    dt = floor (dt/10^floor (log10 (dt)))*10^floor (log10 (dt));
    if (log10 (dt) >= -0.5)
      time = sprintf ("%g s", dt);
    elseif (log10 (dt) >= -3.5)
      time = sprintf ("%g ms", dt*1e3);
    elseif (log10 (dt) >= -6.5)
      time = sprintf ("%g us", dt*1e6);
    else
      time = sprintf ("%g ns", dt*1e9);
    endif

    ## Display nicely formatted complexity.
    printf ("\nFor %s:\n", __f1);
    printf ("  asymptotic power: %s\n", order);
    printf ("  approximate time per operation: %s\n", time);

  endif

endfunction
|
319 |
|
%!demo if 1
%! ## Demo 1: growing a vector element-by-element vs. preallocating it.
%! ## Both builders produce the same vector; only allocation differs.
%! function x = build_orig(n)
%! ## extend the target vector on the fly
%! for i=0:n-1, x([1:10]+i*10) = 1:10; endfor
%! endfunction
%! function x = build(n)
%! ## preallocate the target vector
%! x = zeros(1, n*10);
%! try
%! ## prefer_column_vectors may not exist in this Octave version
%! if (prefer_column_vectors), x = x.'; endif
%! catch
%! end
%! for i=0:n-1, x([1:10]+i*10) = 1:10; endfor
%! endfunction
%!
%! disp("-----------------------");
%! type build_orig;
%! disp("-----------------------");
%! type build;
%! disp("-----------------------");
%!
%! disp("Preallocated vector test.\nThis takes a little while...");
%! speed('build(n)', '', 1000, 'build_orig(n)');
%! clear build build_orig
%! disp("Note how much faster it is to pre-allocate a vector.");
%! disp("Notice the peak speedup ratio.");
%! endif
|
347 |
|
%!demo if 1
%! ## Demo 2: fully vectorized construction vs. the for-loop builder.
%! function x = build_orig(n)
%! for i=0:n-1, x([1:10]+i*10) = 1:10; endfor
%! endfunction
%! function x = build(n)
%! ## replicate the column [1:10]' n times, then flatten — no loop at all
%! idx = [1:10]';
%! x = idx(:,ones(1,n));
%! x = reshape(x, 1, n*10);
%! try
%! ## prefer_column_vectors may not exist in this Octave version
%! if (prefer_column_vectors), x = x.'; endif
%! catch
%! end
%! endfunction
%!
%! disp("-----------------------");
%! type build_orig;
%! disp("-----------------------");
%! type build;
%! disp("-----------------------");
%!
%! disp("Vectorized test. This takes a little while...");
%! speed('build(n)', '', 1000, 'build_orig(n)');
%! clear build build_orig
%! disp("-----------------------");
%! disp("This time, the for loop is done away with entirely.");
%! ## BUG FIX: message said "then in example 1"; correct word is "than".
%! disp("Notice how much bigger the speedup is than in example 1.");
%! endif