364 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			364 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* Karatsuba convolution
 | |
|  *
 | |
|  *  Copyright (C) 1999 Ralph Loader <suckfish@ihug.co.nz>
 | |
|  *
 | |
|  * This library is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU Library General Public
 | |
|  * License as published by the Free Software Foundation; either
 | |
|  * version 2 of the License, or (at your option) any later version.
 | |
|  *
 | |
|  * This library is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
|  * Library General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU Library General Public
 | |
|  * License along with this library; if not, write to the
 | |
|  * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 | |
|  * Boston, MA 02110-1301, USA.
 | |
|  *
 | |
|  *
 | |
|  * Note: 7th December 2004: This file used to be licensed under the GPL,
 | |
|  *       but we got permission from Ralp Loader to relicense it to LGPL.
 | |
|  *
 | |
|  *  $Id$
 | |
|  *
 | |
|  */
 | |
| 
 | |
| /* The algorithm is based on the following.  For the convolution of a pair
 | |
|  * of pairs, (a,b) * (c,d) = (0, a.c, a.d+b.c, b.d), we can reduce the four
 | |
|  * multiplications to three, by the formulae a.d+b.c = (a+b).(c+d) - a.c -
 | |
|  * b.d.  A similar relation enables us to compute a 2n by 2n convolution
 | |
|  * using 3 n by n convolutions, and thus a 2^n by 2^n convolution using 3^n
 | |
|  * multiplications (as opposed to the 4^n that the quadratic algorithm
 | |
|  * takes. */
 | |
| 
 | |
| /* For large n, this is slower than the O(n log n) that the FFT method
 | |
|  * takes, but we avoid using complex numbers, and we only have to compute
 | |
|  * one convolution, as opposed to 3 FFTs.  We have good locality-of-
 | |
|  * reference as well, which will help on CPUs with tiny caches.  */
 | |
| 
 | |
| /* E.g., for a 512 x 512 convolution, the FFT method takes 55 * 512 = 28160
 | |
|  * (real) multiplications, as opposed to 3^9 = 19683 for the Karatsuba
 | |
|  * algorithm.  We actually want 257 outputs of a 256 x 512 convolution;
 | |
|  * that doesn't appear to give an easy advantage for the FFT algorithm, but
 | |
|  * for the Karatsuba algorithm, it's easy to use two 256 x 256
 | |
|  * convolutions, taking 2 x 3^8 = 12312 multiplications.  [This difference
 | |
|  * is that the FFT method "wraps" the arrays, doing a 2^n x 2^n -> 2^n,
 | |
|  * while the Karatsuba algorithm pads with zeros, doing 2^n x 2^n -> 2.2^n
 | |
|  * - 1]. */
 | |
| 
 | |
| /* There's a big lie above, actually... for a 4x4 convolution, it's quicker
 | |
|  * to do it using 16 multiplications than the more complex Karatsuba
 | |
|  * algorithm...  So the recursion bottoms out at 4x4s.  This increases the
 | |
|  * number of multiplications by a factor of 16/9, but reduces the overheads
 | |
|  * dramatically. */
 | |
| 
 | |
| /* The convolution algorithm is implemented as a stack machine.  We have a
 | |
|  * stack of commands, each in one of the forms "do a 2^n x 2^n
 | |
|  * convolution", or "combine these three length 2^n outputs into one
 | |
|  * 2^{n+1} output." */
 | |
| 
 | |
| #ifdef HAVE_CONFIG_H
 | |
| #include "config.h"
 | |
| #endif
 | |
| 
 | |
| #include <stdlib.h>
 | |
| #include "convolve.h"
 | |
| 
 | |
| typedef union stack_entry_s
 | |
| {
 | |
|   struct
 | |
|   {
 | |
|     const double *left, *right;
 | |
|     double *out;
 | |
|   }
 | |
|   v;
 | |
|   struct
 | |
|   {
 | |
|     double *main, *null;
 | |
|   }
 | |
|   b;
 | |
| 
 | |
| }
 | |
| stack_entry;
 | |
| 
 | |
| struct _struct_convolve_state
 | |
| {
 | |
|   int depth, small, big, stack_size;
 | |
|   double *left;
 | |
|   double *right;
 | |
|   double *scratch;
 | |
|   stack_entry *stack;
 | |
| };
 | |
| 
 | |
| /*
 | |
|  * Initialisation routine - sets up tables and space to work in.
 | |
|  * Returns a pointer to internal state, to be used when performing calls.
 | |
|  * On error, returns NULL.
 | |
|  * The pointer should be freed when it is finished with, by convolve_close().
 | |
|  */
 | |
| convolve_state *
 | |
| convolve_init (int depth)
 | |
| {
 | |
|   convolve_state *state;
 | |
| 
 | |
|   state = malloc (sizeof (convolve_state));
 | |
|   state->depth = depth;
 | |
|   state->small = (1 << depth);
 | |
|   state->big = (2 << depth);
 | |
|   state->stack_size = depth * 3;
 | |
|   state->left = calloc (state->big, sizeof (double));
 | |
|   state->right = calloc (state->small * 3, sizeof (double));
 | |
|   state->scratch = calloc (state->small * 3, sizeof (double));
 | |
|   state->stack = calloc (state->stack_size + 1, sizeof (stack_entry));
 | |
|   return state;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Free the state allocated with convolve_init().
 | |
|  */
 | |
| void
 | |
| convolve_close (convolve_state * state)
 | |
| {
 | |
|   free (state->left);
 | |
|   free (state->right);
 | |
|   free (state->scratch);
 | |
|   free (state->stack);
 | |
|   free (state);
 | |
| }
 | |
| 
 | |
| static void
 | |
| convolve_4 (double *out, const double *left, const double *right)
 | |
| /* This does a 4x4 -> 7 convolution.  For what it's worth, the slightly odd
 | |
|  * ordering gives about a 1% speed up on my Pentium II. */
 | |
| {
 | |
|   double l0, l1, l2, l3, r0, r1, r2, r3;
 | |
|   double a;
 | |
| 
 | |
|   l0 = left[0];
 | |
|   r0 = right[0];
 | |
|   a = l0 * r0;
 | |
|   l1 = left[1];
 | |
|   r1 = right[1];
 | |
|   out[0] = a;
 | |
|   a = (l0 * r1) + (l1 * r0);
 | |
|   l2 = left[2];
 | |
|   r2 = right[2];
 | |
|   out[1] = a;
 | |
|   a = (l0 * r2) + (l1 * r1) + (l2 * r0);
 | |
|   l3 = left[3];
 | |
|   r3 = right[3];
 | |
|   out[2] = a;
 | |
| 
 | |
|   out[3] = (l0 * r3) + (l1 * r2) + (l2 * r1) + (l3 * r0);
 | |
|   out[4] = (l1 * r3) + (l2 * r2) + (l3 * r1);
 | |
|   out[5] = (l2 * r3) + (l3 * r2);
 | |
|   out[6] = l3 * r3;
 | |
| }
 | |
| 
 | |
| static void
 | |
| convolve_run (stack_entry * top, unsigned size, double *scratch)
 | |
| /* Interpret a stack of commands.  The stack starts with two entries; the
 | |
|  * convolution to do, and an illegal entry used to mark the stack top.  The
 | |
|  * size is the number of entries in each input, and must be a power of 2,
 | |
|  * and at least 8.  It is OK to have out equal to left and/or right.
 | |
|  * scratch must have length 3*size.  The number of stack entries needed is
 | |
|  * 3n-4 where size=2^n. */
 | |
| {
 | |
|   do {
 | |
|     const double *left;
 | |
|     const double *right;
 | |
|     double *out;
 | |
| 
 | |
|     /* When we get here, the stack top is always a convolve,
 | |
|      * with size > 4.  So we will split it.  We repeatedly split
 | |
|      * the top entry until we get to size = 4. */
 | |
| 
 | |
|     left = top->v.left;
 | |
|     right = top->v.right;
 | |
|     out = top->v.out;
 | |
|     top++;
 | |
| 
 | |
|     do {
 | |
|       double *s_left, *s_right;
 | |
|       int i;
 | |
| 
 | |
|       /* Halve the size. */
 | |
|       size >>= 1;
 | |
| 
 | |
|       /* Allocate the scratch areas. */
 | |
|       s_left = scratch + size * 3;
 | |
|       /* s_right is a length 2*size buffer also used for
 | |
|        * intermediate output. */
 | |
|       s_right = scratch + size * 4;
 | |
| 
 | |
|       /* Create the intermediate factors. */
 | |
|       for (i = 0; i < size; i++) {
 | |
|         double l = left[i] + left[i + size];
 | |
|         double r = right[i] + right[i + size];
 | |
| 
 | |
|         s_left[i + size] = r;
 | |
|         s_left[i] = l;
 | |
|       }
 | |
| 
 | |
|       /* Push the combine entry onto the stack. */
 | |
|       top -= 3;
 | |
|       top[2].b.main = out;
 | |
|       top[2].b.null = NULL;
 | |
| 
 | |
|       /* Push the low entry onto the stack.  This must be
 | |
|        * the last of the three sub-convolutions, because
 | |
|        * it may overwrite the arguments. */
 | |
|       top[1].v.left = left;
 | |
|       top[1].v.right = right;
 | |
|       top[1].v.out = out;
 | |
| 
 | |
|       /* Push the mid entry onto the stack. */
 | |
|       top[0].v.left = s_left;
 | |
|       top[0].v.right = s_right;
 | |
|       top[0].v.out = s_right;
 | |
| 
 | |
|       /* Leave the high entry in variables. */
 | |
|       left += size;
 | |
|       right += size;
 | |
|       out += size * 2;
 | |
| 
 | |
|     } while (size > 4);
 | |
| 
 | |
|     /* When we get here, the stack top is a group of 3
 | |
|      * convolves, with size = 4, followed by some combines.  */
 | |
|     convolve_4 (out, left, right);
 | |
|     convolve_4 (top[0].v.out, top[0].v.left, top[0].v.right);
 | |
|     convolve_4 (top[1].v.out, top[1].v.left, top[1].v.right);
 | |
|     top += 2;
 | |
| 
 | |
|     /* Now process combines. */
 | |
|     do {
 | |
|       /* b.main is the output buffer, mid is the middle
 | |
|        * part which needs to be adjusted in place, and
 | |
|        * then folded back into the output.  We do this in
 | |
|        * a slightly strange way, so as to avoid having
 | |
|        * two loops. */
 | |
|       double *out = top->b.main;
 | |
|       double *mid = scratch + size * 4;
 | |
|       unsigned int i;
 | |
| 
 | |
|       top++;
 | |
|       out[size * 2 - 1] = 0;
 | |
|       for (i = 0; i < size - 1; i++) {
 | |
|         double lo;
 | |
|         double hi;
 | |
| 
 | |
|         lo = mid[0] - (out[0] + out[2 * size]) + out[size];
 | |
|         hi = mid[size] - (out[size] + out[3 * size]) + out[2 * size];
 | |
|         out[size] = lo;
 | |
|         out[2 * size] = hi;
 | |
|         out++;
 | |
|         mid++;
 | |
|       }
 | |
|       size <<= 1;
 | |
|     } while (top->b.null == NULL);
 | |
|   } while (top->b.main != NULL);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * convolve_match:
 | |
|  * @lastchoice: an array of size SMALL.
 | |
|  * @input: an array of size BIG (2*SMALL)
 | |
|  * @state: a (non-NULL) pointer returned by convolve_init.
 | |
|  *
 | |
|  * We find the contiguous SMALL-size sub-array of input that best matches
 | |
|  * lastchoice. A measure of how good a sub-array is compared with the lastchoice
 | |
|  * is given by the sum of the products of each pair of entries.  We maximise
 | |
|  * that, by taking an appropriate convolution, and then finding the maximum
 | |
|  * entry in the convolutions.
 | |
|  *
 | |
|  * Return: the position of the best match
 | |
|  */
 | |
| int
 | |
| convolve_match (const int *lastchoice, const short *input,
 | |
|     convolve_state * state)
 | |
| {
 | |
|   double avg = 0;
 | |
|   double best;
 | |
|   int p = 0;
 | |
|   int i;
 | |
|   double *left = state->left;
 | |
|   double *right = state->right;
 | |
|   double *scratch = state->scratch;
 | |
|   stack_entry *top = state->stack + (state->stack_size - 1);
 | |
| 
 | |
|   for (i = 0; i < state->big; i++)
 | |
|     left[i] = input[i];
 | |
| 
 | |
|   for (i = 0; i < state->small; i++) {
 | |
|     double a = lastchoice[(state->small - 1) - i];
 | |
| 
 | |
|     right[i] = a;
 | |
|     avg += a;
 | |
|   }
 | |
| 
 | |
|   /* We adjust the smaller of the two input arrays to have average
 | |
|    * value 0.  This makes the eventual result insensitive to both
 | |
|    * constant offsets and positive multipliers of the inputs. */
 | |
|   avg /= state->small;
 | |
|   for (i = 0; i < state->small; i++)
 | |
|     right[i] -= avg;
 | |
|   /* End-of-stack marker. */
 | |
|   top[1].b.null = scratch;
 | |
|   top[1].b.main = NULL;
 | |
|   /* The low (small x small) part, of which we want the high outputs. */
 | |
|   top->v.left = left;
 | |
|   top->v.right = right;
 | |
|   top->v.out = right + state->small;
 | |
|   convolve_run (top, state->small, scratch);
 | |
| 
 | |
|   /* The high (small x small) part, of which we want the low outputs. */
 | |
|   top->v.left = left + state->small;
 | |
|   top->v.right = right;
 | |
|   top->v.out = right;
 | |
|   convolve_run (top, state->small, scratch);
 | |
| 
 | |
|   /* Now find the best position amoungs this.  Apart from the first
 | |
|    * and last, the required convolution outputs are formed by adding
 | |
|    * outputs from the two convolutions above. */
 | |
|   best = right[state->big - 1];
 | |
|   right[state->big + state->small - 1] = 0;
 | |
|   p = -1;
 | |
|   for (i = 0; i < state->small; i++) {
 | |
|     double a = right[i] + right[i + state->big];
 | |
| 
 | |
|     if (a > best) {
 | |
|       best = a;
 | |
|       p = i;
 | |
|     }
 | |
|   }
 | |
|   p++;
 | |
| 
 | |
| #if 0
 | |
|   {
 | |
|     /* This is some debugging code... */
 | |
|     best = 0;
 | |
|     for (i = 0; i < state->small; i++)
 | |
|       best += ((double) input[i + p]) * ((double) lastchoice[i] - avg);
 | |
| 
 | |
|     for (i = 0; i <= state->small; i++) {
 | |
|       double tot = 0;
 | |
|       unsigned int j;
 | |
| 
 | |
|       for (j = 0; j < state->small; j++)
 | |
|         tot += ((double) input[i + j]) * ((double) lastchoice[j] - avg);
 | |
|       if (tot > best)
 | |
|         printf ("(%i)", i);
 | |
|       if (tot != left[i + (state->small - 1)])
 | |
|         printf ("!");
 | |
|     }
 | |
| 
 | |
|     printf ("%i\n", p);
 | |
|   }
 | |
| #endif
 | |
| 
 | |
|   return p;
 | |
| }
 |