# Lossy compression algorithms very often make use of DCT or DFT calculations,
# or variations of these calculations. This file is intended to be a short
# reference about classical DCT and DFT algorithms.


from random import random
from math import pi, sin, cos, sqrt
from cmath import exp

def exp_j (alpha):
    return exp (alpha * 1j)

def conjugate (c):
    c = c + 0j
    return c.real - 1j * c.imag

def vector (N):
    return [0j] * N


# Let us start with the canonical definition of the unscaled DFT algorithm :
# (I can not draw sigmas in a text file so I'll use python code instead)  :)

def W (k, N):
    return exp_j ((-2*pi*k)/N)

def unscaled_DFT (N, input, output):
    for o in range(N):		# o is output index
	output[o] = 0
	for i in range(N):
	    output[o] = output[o] + input[i] * W (i*o, N)

# This algorithm takes complex input and output. There are N*N complex
# multiplications and N*(N-1) complex additions.


# Of course this algorithm is an extremely naive implementation and there are
# some ways to use the trigonometric properties of the coefficients to find
# some decompositions that can accelerate the calculation by several orders
# of magnitude... This is a well known and studied problem. One of the
# available explanations of this process is at this URL :
# www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fft.html


# Let's start with the radix-2 decimation-in-time algorithm :

def unscaled_DFT_radix2_time (N, input, output):
    even_input = vector(N/2)
    odd_input = vector(N/2)
    even_output = vector(N/2)
    odd_output = vector(N/2)

    for i in range(N/2):
	even_input[i] = input[2*i]
	odd_input[i] = input[2*i+1]

    unscaled_DFT (N/2, even_input, even_output)
    unscaled_DFT (N/2, odd_input, odd_output)

    for i in range(N/2):
	odd_output[i] = odd_output[i] * W (i, N)

    for i in range(N/2):
	output[i] = even_output[i] + odd_output[i]
	output[i+N/2] = even_output[i] - odd_output[i]

# This algorithm takes complex input and output.

# We divide the DFT calculation into 2 DFT calculations of size N/2
# We then do N/2 complex multiplies followed by N complex additions.
# (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
# multiplies... we will skip 1 for i=0 and 1 for i=N/4. Also for i=N/8 and for
# i=3*N/8 the W(i,N) values can be special-cased to implement the complex
# multiplication using only 2 real additions and 2 real multiplies)

# Also note that all the basic stages of this DFT algorithm are easily
# reversible, so we can calculate the IDFT with the same complexity.


# A variant of this is the radix-2 decimation-in-frequency algorithm :

def unscaled_DFT_radix2_freq (N, input, output):
    even_input = vector(N/2)
    odd_input = vector(N/2)
    even_output = vector(N/2)
    odd_output = vector(N/2)

    for i in range(N/2):
	even_input[i] = input[i] + input[i+N/2]
	odd_input[i] = input[i] - input[i+N/2]

    for i in range(N/2):
	odd_input[i] = odd_input[i] * W (i, N)

    unscaled_DFT (N/2, even_input, even_output)
    unscaled_DFT (N/2, odd_input, odd_output)

    for i in range(N/2):
	output[2*i] = even_output[i]
	output[2*i+1] = odd_output[i]

# Note that the decimation-in-time and the decimation-in-frequency variants
# have exactly the same complexity, they only do the operations in a different
# order.

# Actually, if you look at the decimation-in-time variant of the DFT, and
# reverse it to calculate an IDFT, you get something that is extremely close
# to the decimation-in-frequency DFT algorithm...


# The radix-4 algorithms are slightly more efficient : they take into account
# the fact that with complex numbers, multiplications by j and -j are also
# "free"... i.e. when you code them using real numbers, they translate into
# a few data moves but no real operation.

# Let's start with the radix-4 decimation-in-time algorithm :

def unscaled_DFT_radix4_time (N, input, output):
    input_0 = vector(N/4)
    input_1 = vector(N/4)
    input_2 = vector(N/4)
    input_3 = vector(N/4)
    output_0 = vector(N/4)
    output_1 = vector(N/4)
    output_2 = vector(N/4)
    output_3 = vector(N/4)
    tmp_0 = vector(N/4)
    tmp_1 = vector(N/4)
    tmp_2 = vector(N/4)
    tmp_3 = vector(N/4)

    for i in range(N/4):
	input_0[i] = input[4*i]
	input_1[i] = input[4*i+1]
	input_2[i] = input[4*i+2]
	input_3[i] = input[4*i+3]

    unscaled_DFT (N/4, input_0, output_0)
    unscaled_DFT (N/4, input_1, output_1)
    unscaled_DFT (N/4, input_2, output_2)
    unscaled_DFT (N/4, input_3, output_3)

    for i in range(N/4):
	output_1[i] = output_1[i] * W (i, N)
	output_2[i] = output_2[i] * W (2*i, N)
	output_3[i] = output_3[i] * W (3*i, N)

    for i in range(N/4):
	tmp_0[i] = output_0[i] + output_2[i]
	tmp_1[i] = output_0[i] - output_2[i]
	tmp_2[i] = output_1[i] + output_3[i]
	tmp_3[i] = output_1[i] - output_3[i]

    for i in range(N/4):
	output[i] = tmp_0[i] + tmp_2[i]
	output[i+N/4] = tmp_1[i] - 1j * tmp_3[i]
	output[i+N/2] = tmp_0[i] - tmp_2[i]
	output[i+3*N/4] = tmp_1[i] + 1j * tmp_3[i]

# This algorithm takes complex input and output.

# We divide the DFT calculation into 4 DFT calculations of size N/4
# We then do 3*N/4 complex multiplies followed by 2*N complex additions.
# (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
# multiplies... we will skip 3 for i=0 and 1 for i=N/8. Also for i=N/8
# the remaining W(i,N) and W(3*i,N) multiplies can be implemented using only
# 2 real additions and 2 real multiplies. For i=N/16 and i=3*N/16 we can also
# optimise the W(2*i/N) multiply this way.

# If we wanted to do the same decomposition with one radix-2 decomposition
# of size N and 2 radix-2 decompositions of size N/2, the total cost would be
# N complex multiplies and 2*N complex additions. Thus we see that the
# decomposition of one DFT calculation of size N into 4 calculations of size
# N/4 using the radix-4 algorithm instead of the radix-2 algorithm saved N/4
# complex multiplies...


# The radix-4 decimation-in-frequency algorithm is similar :

def unscaled_DFT_radix4_freq (N, input, output):
    input_0 = vector(N/4)
    input_1 = vector(N/4)
    input_2 = vector(N/4)
    input_3 = vector(N/4)
    output_0 = vector(N/4)
    output_1 = vector(N/4)
    output_2 = vector(N/4)
    output_3 = vector(N/4)
    tmp_0 = vector(N/4)
    tmp_1 = vector(N/4)
    tmp_2 = vector(N/4)
    tmp_3 = vector(N/4)

    for i in range(N/4):
	tmp_0[i] = input[i] + input[i+N/2]
	tmp_1[i] = input[i+N/4] + input[i+3*N/4]
	tmp_2[i] = input[i] - input[i+N/2]
	tmp_3[i] = input[i+N/4] - input[i+3*N/4]

    for i in range(N/4):
	input_0[i] = tmp_0[i] + tmp_1[i]
	input_1[i] = tmp_2[i] - 1j * tmp_3[i]
	input_2[i] = tmp_0[i] - tmp_1[i]
	input_3[i] = tmp_2[i] + 1j * tmp_3[i]

    for i in range(N/4):
	input_1[i] = input_1[i] * W (i, N)
	input_2[i] = input_2[i] * W (2*i, N)
	input_3[i] = input_3[i] * W (3*i, N)

    unscaled_DFT (N/4, input_0, output_0)
    unscaled_DFT (N/4, input_1, output_1)
    unscaled_DFT (N/4, input_2, output_2)
    unscaled_DFT (N/4, input_3, output_3)

    for i in range(N/4):
	output[4*i] = output_0[i]
	output[4*i+1] = output_1[i]
	output[4*i+2] = output_2[i]
	output[4*i+3] = output_3[i]

# Once again the complexity is exactly the same as for the radix-4
# decimation-in-time DFT algorithm, only the order of the operations is
# different.


# Now, let us reorder the radix-4 algorithms in a different way :

#def unscaled_DFT_radix4_time (N, input, output):
#   input_0 = vector(N/4)
#   input_1 = vector(N/4)
#   input_2 = vector(N/4)
#   input_3 = vector(N/4)
#   output_0 = vector(N/4)
#   output_1 = vector(N/4)
#   output_2 = vector(N/4)
#   output_3 = vector(N/4)
#   tmp_0 = vector(N/4)
#   tmp_1 = vector(N/4)
#   tmp_2 = vector(N/4)
#   tmp_3 = vector(N/4)
#
#   for i in range(N/4):
#	input_0[i] = input[4*i]
#	input_2[i] = input[4*i+2]
#
#   unscaled_DFT (N/4, input_0, output_0)
#   unscaled_DFT (N/4, input_2, output_2)
#
#   for i in range(N/4):
#	output_2[i] = output_2[i] * W (2*i, N)
#
#   for i in range(N/4):
#	tmp_0[i] = output_0[i] + output_2[i]
#	tmp_1[i] = output_0[i] - output_2[i]
#
#   for i in range(N/4):
#	input_1[i] = input[4*i+1]
#	input_3[i] = input[4*i+3]
#
#   unscaled_DFT (N/4, input_1, output_1)
#   unscaled_DFT (N/4, input_3, output_3)
#
#   for i in range(N/4):
#	output_1[i] = output_1[i] * W (i, N)
#	output_3[i] = output_3[i] * W (3*i, N)
#
#   for i in range(N/4):
#	tmp_2[i] = output_1[i] + output_3[i]
#	tmp_3[i] = output_1[i] - output_3[i]
#
#   for i in range(N/4):
#	output[i] = tmp_0[i] + tmp_2[i]
#	output[i+N/4] = tmp_1[i] - 1j * tmp_3[i]
#	output[i+N/2] = tmp_0[i] - tmp_2[i]
#	output[i+3*N/4] = tmp_1[i] + 1j * tmp_3[i]

# We didn't do anything here, only reorder the operations. But now, look at the
# first part of this function, up to the calculations of tmp0 and tmp1 : this
# is extremely similar to the radix-2 decimation-in-time algorithm ! or more
# precisely, it IS the radix-2 decimation-in-time algorithm, with size N/2,
# applied on a vector representing the even input coefficients, and giving
# an output vector that is the concatenation of tmp0 and tmp1.
# This is important to notice, because this means we can now choose to
# calculate tmp0 and tmp1 using any DFT algorithm that we want, and we know
# that some of them are more efficient than radix-2...

# This leads us directly to the split-radix decimation-in-time algorithm :

def unscaled_DFT_split_radix_time (N, input, output):
    even_input = vector(N/2)
    input_1 = vector(N/4)
    input_3 = vector(N/4)
    even_output = vector(N/2)
    output_1 = vector(N/4)
    output_3 = vector(N/4)
    tmp_0 = vector(N/4)
    tmp_1 = vector(N/4)

    for i in range(N/2):
	even_input[i] = input[2*i]

    for i in range(N/4):
	input_1[i] = input[4*i+1]
	input_3[i] = input[4*i+3]

    unscaled_DFT (N/2, even_input, even_output)
    unscaled_DFT (N/4, input_1, output_1)
    unscaled_DFT (N/4, input_3, output_3)

    for i in range(N/4):
	output_1[i] = output_1[i] * W (i, N)
	output_3[i] = output_3[i] * W (3*i, N)

    for i in range(N/4):
	tmp_0[i] = output_1[i] + output_3[i]
	tmp_1[i] = output_1[i] - output_3[i]

    for i in range(N/4):
	output[i] = even_output[i] + tmp_0[i]
	output[i+N/4] = even_output[i+N/4] - 1j * tmp_1[i]
	output[i+N/2] = even_output[i] - tmp_0[i]
	output[i+3*N/4] = even_output[i+N/4] + 1j * tmp_1[i]

# This function performs one DFT of size N/2 and two of size N/4, followed by
# N/2 complex multiplies and 3*N/2 complex additions.
# (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
# multiplies... we will skip 2 for i=0. Also for i=N/8 the W(i,N) and W(3*i,N)
# multiplies can be implemented using only 2 real additions and 2 real
# multiplies)


# We can similarly define the split-radix decimation-in-frequency DFT :

def unscaled_DFT_split_radix_freq (N, input, output):
    even_input = vector(N/2)
    input_1 = vector(N/4)
    input_3 = vector(N/4)
    even_output = vector(N/2)
    output_1 = vector(N/4)
    output_3 = vector(N/4)
    tmp_0 = vector(N/4)
    tmp_1 = vector(N/4)

    for i in range(N/2):
	even_input[i] = input[i] + input[i+N/2]

    for i in range(N/4):
	tmp_0[i] = input[i] - input[i+N/2]
	tmp_1[i] = input[i+N/4] - input[i+3*N/4]

    for i in range(N/4):
	input_1[i] = tmp_0[i] - 1j * tmp_1[i]
	input_3[i] = tmp_0[i] + 1j * tmp_1[i]

    for i in range(N/4):
	input_1[i] = input_1[i] * W (i, N)
	input_3[i] = input_3[i] * W (3*i, N)

    unscaled_DFT (N/2, even_input, even_output)
    unscaled_DFT (N/4, input_1, output_1)
    unscaled_DFT (N/4, input_3, output_3)

    for i in range(N/2):
	output[2*i] = even_output[i]

    for i in range(N/4):
	output[4*i+1] = output_1[i]
	output[4*i+3] = output_3[i]

# The complexity is again the same as for the decimation-in-time variant.


# Now let us summarize our various algorithms for DFT decomposition :

# radix-2 : DFT(N) -> 2*DFT(N/2) using N/2 multiplies and N additions
# radix-4 : DFT(N) -> 4*DFT(N/2) using 3*N/4 multiplies and 2*N additions
# split-radix : DFT(N) -> DFT(N/2) + 2*DFT(N/4) using N/2 muls and 3*N/2 adds

# (we are always speaking of complex multiplies and complex additions... a
# complex addition is implemented with 2 real additions, and a complex
# multiply is implemented with either 2 adds and 4 muls or 3 adds and 3 muls,
# so we will keep a separate count of these)

# If we want to take into account the special values of W(i,N), we can remove
# a few complex multiplies. Supposing N>=16 we can remove :
# radix-2 : remove 2 complex multiplies, simplify 2 others
# radix-4 : remove 4 complex multiplies, simplify 4 others
# split-radix : remove 2 complex multiplies, simplify 2 others

# This gives the following table for the complexity of a complex DFT :
#	N	real additions	real multiplies	complex multiplies
#       1               0               0               0
#       2               4               0               0
#       4              16               0               0
#       8              52               4               0
#      16             136               8               4
#      32             340              20              16
#      64             808              40              52
#     128            1876              84             144
#     256            4264             168             372
#     512            9556             340             912
#    1024           21160             680            2164
#    2048           46420            1364            5008
#    4096          101032            2728           11380
#    8192          218452            5460           25488
#   16384          469672           10920           56436
#   32768         1004884           21844          123792
#   65536         2140840           43688          269428

# If we chose to implement complex multiplies with 3 real muls + 3 real adds,
# then these results are consistent with the table at the end of the
# www.cmlab.csie.ntu.edu.tw DFT tutorial that I mentioned earlier.


# Now another important case for the DFT is the one where the inputs are
# real numbers instead of complex ones. We have to find ways to optimize for
# this important case.

# If the DFT inputs are real-valued, then the DFT outputs have nice properties
# too : output[0] and output[N/2] will be real numbers, and output[N-i] will
# be the conjugate of output[i] for i in 0...N/2-1

# Likewise, if the DFT inputs are purely imaginary numbers, then the DFT
# outputs will have special properties too : output[0] and output[N/2] will be
# purely imaginary, and output[N-i] will be the opposite of the conjugate of
# output[i] for i in 0...N/2-1

# We can use these properties to calculate two real-valued DFT at once :

def two_real_unscaled_DFT (N, input1, input2, output1, output2):
    input = vector(N)
    output = vector(N)

    for i in range(N):
	input[i] = input1[i] + 1j * input2[i]

    unscaled_DFT (N, input, output)

    output1[0] = output[0].real + 0j
    output2[0] = output[0].imag + 0j

    for i in range(N/2)[1:]:
	output1[i] = 0.5 * (output[i] + conjugate(output[N-i]))
	output2[i] = -0.5j * (output[i] - conjugate(output[N-i]))

	output1[N-i] = conjugate(output1[i])
	output2[N-i] = conjugate(output2[i])

    output1[N/2] = output[N/2].real + 0j
    output2[N/2] = output[N/2].imag + 0j

# This routine does a total of N-2 complex additions and N-2 complex
# multiplies by 0.5

# This routine can also be inverted to calculate the IDFT of two vectors at
# once if we know that the outputs will be real-valued.


# If we have only one real-valued DFT calculation to do, we can still cut this
# calculation in several parts using one of the decimate-in-time methods
# (so that the different parts are still real-valued)

# As with complex DFT calculations, the best method is to use a split radix.
# There are a lot of symetries in the DFT outputs that we can exploit to
# reduce the number of operations...

def real_unscaled_DFT_split_radix_time_1 (N, input, output):
    even_input = vector(N/2)
    even_output = vector(N/2)
    input_1 = vector(N/4)
    output_1 = vector(N/4)
    input_3 = vector(N/4)
    output_3 = vector(N/4)
    tmp_0 = vector(N/4)
    tmp_1 = vector(N/4)

    for i in range(N/2):
	even_input[i] = input[2*i]

    for i in range(N/4):
	input_1[i] = input[4*i+1]
	input_3[i] = input[4*i+3]

    unscaled_DFT (N/2, even_input, even_output)
    # this is again a real DFT !
    # we will only use even_output[i] for i in 0 ... N/4 included. we know that
    # even_output[N/2-i] is the conjugate of even_output[i]... also we know
    # that even_output[0] and even_output[N/4] are purely real.

    unscaled_DFT (N/4, input_1, output_1)
    unscaled_DFT (N/4, input_3, output_3)
    # these are real DFTs too... with symetries in the outputs... once again

    tmp_0[0] = output_1[0] + output_3[0]	# real numbers
    tmp_1[0] = output_1[0] - output_3[0]	# real numbers

    tmp__0 = (output_1[N/8] + output_3[N/8]) * sqrt(0.5)	# real numbers
    tmp__1 = (output_1[N/8] - output_3[N/8]) * sqrt(0.5)	# real numbers
    tmp_0[N/8] = tmp__1 - 1j * tmp__0		# real + 1j * real
    tmp_1[N/8] = tmp__0 - 1j * tmp__1		# real + 1j * real

    for i in range(N/8)[1:]:
	output_1[i] = output_1[i] * W (i, N)
	output_3[i] = output_3[i] * W (3*i, N)

	tmp_0[i] = output_1[i] + output_3[i]
	tmp_1[i] = output_1[i] - output_3[i]

	tmp_0[N/4-i] = -1j * conjugate(tmp_1[i])
	tmp_1[N/4-i] = -1j * conjugate(tmp_0[i])

    output[0] = even_output[0] + tmp_0[0]		# real numbers
    output[N/4] = even_output[N/4] - 1j * tmp_1[0]	# real + 1j * real
    output[N/2] = even_output[0] - tmp_0[0]		# real numbers
    output[3*N/4] = even_output[N/4] + 1j * tmp_1[0]	# real + 1j * real

    for i in range(N/4)[1:]:
	output[i] = even_output[i] + tmp_0[i]
	output[i+N/4] = conjugate(even_output[N/4-i]) - 1j * tmp_1[i]

	output[N-i] = conjugate(output[i])
	output[3*N/4-i] = conjugate(output[i+N/4])

# This function performs one real DFT of size N/2 and two real DFT of size
# N/4, followed by 6 real additions, 2 real multiplies, 3*N/4-4 complex
# additions and N/4-2 complex multiplies.


# We can also try to combine the two real DFT of size N/4 into a single complex
# DFT :

def real_unscaled_DFT_split_radix_time_2 (N, input, output):
    even_input = vector(N/2)
    even_output = vector(N/2)
    odd_input = vector(N/4)
    odd_output = vector(N/4)
    tmp_0 = vector(N/4)
    tmp_1 = vector(N/4)

    for i in range(N/2):
	even_input[i] = input[2*i]

    for i in range(N/4):
	odd_input[i] = input[4*i+1] + 1j * input[4*i+3]

    unscaled_DFT (N/2, even_input, even_output)
    # this is again a real DFT !
    # we will only use even_output[i] for i in 0 ... N/4 included. we know that
    # even_output[N/2-i] is the conjugate of even_output[i]... also we know
    # that even_output[0] and even_output[N/4] are purely real.

    unscaled_DFT (N/4, odd_input, odd_output)
    # but this one is a complex DFT so no special properties here

    output_1 = odd_output[0].real
    output_3 = odd_output[0].imag
    tmp_0[0] = output_1 + output_3	# real numbers
    tmp_1[0] = output_1 - output_3	# real numbers

    output_1 = odd_output[N/8].real
    output_3 = odd_output[N/8].imag
    tmp__0 = (output_1 + output_3) * sqrt(0.5)	# real numbers
    tmp__1 = (output_1 - output_3) * sqrt(0.5)	# real numbers
    tmp_0[N/8] = tmp__1 - 1j * tmp__0		# real + 1j * real
    tmp_1[N/8] = tmp__0 - 1j * tmp__1		# real + 1j * real

    for i in range(N/8)[1:]:
	output_1 = odd_output[i] + conjugate(odd_output[N/4-i])
	output_3 = odd_output[i] - conjugate(odd_output[N/4-i])

	output_1 = output_1 * 0.5 * W (i, N)
	output_3 = output_3 * -0.5j * W (3*i, N)

	tmp_0[i] = output_1 + output_3
	tmp_1[i] = output_1 - output_3

	tmp_0[N/4-i] = -1j * conjugate(tmp_1[i])
	tmp_1[N/4-i] = -1j * conjugate(tmp_0[i])

    output[0] = even_output[0] + tmp_0[0]		# real numbers
    output[N/4] = even_output[N/4] - 1j * tmp_1[0]	# real + 1j * real
    output[N/2] = even_output[0] - tmp_0[0]		# real numbers
    output[3*N/4] = even_output[N/4] + 1j * tmp_1[0]	# real + 1j * real

    for i in range(N/4)[1:]:
	output[i] = even_output[i] + tmp_0[i]
	output[i+N/4] = conjugate(even_output[N/4-i]) - 1j * tmp_1[i]

	output[N-i] = conjugate(output[i])
	output[3*N/4-i] = conjugate(output[i+N/4])

# This function performs one real DFT of size N/2 and one complex DFT of size
# N/4, followed by 6 real additions, 2 real multiplies, N-6 complex additions
# and N/4-2 complex multiplies.

# After comparing the performance, it turns out that for real-valued DFT, the
# version of the algorithm that subdivides the calculation into one real
# DFT of size N/2 and two real DFT of size N/4 is the most efficient one.
# The other version gives exactly the same number of multiplies and a few more
# real additions.


# Now we can also try the decimate-in-frequency method for a real-valued DFT.
# Using the split-radix algorithm, and by taking into account the symetries of
# the outputs :

def real_unscaled_DFT_split_radix_freq (N, input, output):
    even_input = vector(N/2)
    input_1 = vector(N/4)
    even_output = vector(N/2)
    output_1 = vector(N/4)
    tmp_0 = vector(N/4)
    tmp_1 = vector(N/4)

    for i in range(N/2):
	even_input[i] = input[i] + input[i+N/2]

    for i in range(N/4):
	tmp_0[i] = input[i] - input[i+N/2]
	tmp_1[i] = input[i+N/4] - input[i+3*N/4]

    for i in range(N/4):
	input_1[i] = tmp_0[i] - 1j * tmp_1[i]

    for i in range(N/4):
	input_1[i] = input_1[i] * W (i, N)

    unscaled_DFT (N/2, even_input, even_output)
    # This is still a real-valued DFT

    unscaled_DFT (N/4, input_1, output_1)
    # But that one is a complex-valued DFT

    for i in range(N/2):
	output[2*i] = even_output[i]

    for i in range(N/4):
	output[4*i+1] = output_1[i]
	output[N-1-4*i] = conjugate(output_1[i])

# I think this implementation is much more elegant than the decimate-in-time
# version ! It looks very much like the complex-valued version, all we had to
# do was remove one of the complex-valued internal DFT calls because we could
# deduce the outputs by using the symetries of the problem.

# As for performance, we did N real additions, N/4 complex multiplies (a bit
# less actually, because W(0,N) = 1 and W(N/8,N) is a "simple" multiply), then
# one real DFT of size N/2 and one complex DFT of size N/4.

# It turns out that even if the methods are so different, the number of
# operations is exactly the same as for the best of the two decimation-in-time
# methods that we tried.


# This gives us the following performance for real-valued DFT :
#	N	real additions	real multiplies	complex multiplies
#       2               2               0               0
#       4               6               0               0
#       8              20               2               0
#      16              54               4               2
#      32             140              10               8
#      64             342              20              26
#     128             812              42              72
#     256            1878              84             186
#     512            4268             170             456
#    1024            9558             340            1082
#    2048           21164             682            2504
#    4096           46422            1364            5690
#    8192          101036            2730           12744
#   16384          218454            5460           28218
#   32768          469676           10922           61896
#   65536         1004886           21844          134714


# As an example, this is an implementation of the real-valued DFT8 :

def DFT8 (input, output):
    even_0 = input[0] + input[4]
    even_1 = input[1] + input[5]
    even_2 = input[2] + input[6]
    even_3 = input[3] + input[7]

    tmp_0 = even_0 + even_2
    tmp_1 = even_0 - even_2
    tmp_2 = even_1 + even_3
    tmp_3 = even_1 - even_3

    output[0] = tmp_0 + tmp_2
    output[2] = tmp_1 - 1j * tmp_3
    output[4] = tmp_0 - tmp_2

    odd_0_r = input[0] - input[4]
    odd_0_i = input[2] - input[6]

    tmp_0 = input[1] - input[5]
    tmp_1 = input[3] - input[7]
    odd_1_r = (tmp_0 - tmp_1) * sqrt(0.5)
    odd_1_i = (tmp_0 + tmp_1) * sqrt(0.5)

    output[1] = (odd_0_r + odd_1_r) - 1j * (odd_0_i + odd_1_i)
    output[5] = (odd_0_r - odd_1_r) - 1j * (odd_0_i - odd_1_i)

    output[3] = conjugate(output[5])
    output[6] = conjugate(output[2])
    output[7] = conjugate(output[1])


# Also a basic implementation of the real-valued DFT4 :

def DFT4 (input, output):
    tmp_0 = input[0] + input[2]
    tmp_1 = input[0] - input[2]
    tmp_2 = input[1] + input[3]
    tmp_3 = input[1] - input[3]

    output[0] = tmp_0 + tmp_2
    output[1] = tmp_1 - 1j * tmp_3
    output[2] = tmp_0 - tmp_2
    output[3] = tmp_1 + 1j * tmp_3


# A similar idea might be used to calculate only the real part of the output
# of a complex DFT : we take an DFT algorithm for real inputs and complex
# outputs and we simply reverse it. The resulting algorithm will only work
# with inputs that satisfy the conjugation rule (input[i] is the conjugate of
# input[N-i]) so we can do a first pass to modify the input so that it follows
# this rule. An example implementation is as follows (adapted from the
# unscaled_DFT_split_radix_time algorithm) :

def complex2real_unscaled_DFT_split_radix_time (N, input, output):
    even_input = vector(N/2)
    input_1 = vector(N/4)
    even_output = vector(N/2)
    output_1 = vector(N/4)

    for i in range(N/2):
	even_input[i] = input[2*i]

    for i in range(N/4):
	input_1[i] = input[4*i+1] + conjugate(input[N-1-4*i])

    unscaled_DFT (N/2, even_input, even_output)
    unscaled_DFT (N/4, input_1, output_1)

    for i in range(N/4):
	output_1[i] = output_1[i] * W (i, N)

    for i in range(N/4):
	output[i] = even_output[i] + output_1[i].real
	output[i+N/4] = even_output[i+N/4] + output_1[i].imag
	output[i+N/2] = even_output[i] - output_1[i].real
	output[i+3*N/4] = even_output[i+N/4] - output_1[i].imag

# This algorithm does N/4 complex additions, N/4-1 complex multiplies
# (including one "simple" multiply for i=N/8), N real additions, one
# "complex-to-real" DFT of size N/2, and one complex DFT of size N/4.
# Also, in the complex DFT of size N/4, we do not care about the imaginary
# part of output_1[0], which in practice allows us to save one real addition.

# This gives us the following performance for complex DFT with real outputs :
#	N	real additions	real multiplies	complex multiplies
#       1               0               0               0
#       2               2               0               0
#       4               8               0               0
#       8              25               2               0
#      16              66               4               2
#      32             167              10               8
#      64             400              20              26
#     128             933              42              72
#     256            2126              84             186
#     512            4771             170             456
#    1024           10572             340            1082
#    2048           23201             682            2504
#    4096           50506            1364            5690
#    8192          109215            2730           12744
#   16384          234824            5460           28218
#   32768          502429           10922           61896
#   65536         1070406           21844          134714


# Now let's talk about the DCT algorithm. The canonical definition for it is
# as follows :

def C (k, N):
    return cos ((k*pi)/(2*N))

def unscaled_DCT (N, input, output):
    for o in range(N):		# o is output index
	output[o] = 0
	for i in range(N):	# i is input index
	    output[o] = output[o] + input[i] * C ((2*i+1)*o, N)

# This trivial algorithm uses N*N multiplications and N*(N-1) additions.


# One possible decomposition on this calculus is to use the fact that C (i, N)
# and C (2*N-i, N) are opposed. This can lead to this decomposition :

#def unscaled_DCT (N, input, output):
#   even_input = vector (N)
#   odd_input = vector (N)
#   even_output = vector (N)
#   odd_output = vector (N)
#
#   for i in range(N/2):
#	even_input[i] = input[i] + input[N-1-i]
#	odd_input[i] = input[i] - input[N-1-i]
#
#   unscaled_DCT (N, even_input, even_output)
#   unscaled_DCT (N, odd_input, odd_output)
#
#   for i in range(N/2):
#	output[2*i] = even_output[2*i]
#	output[2*i+1] = odd_output[2*i+1]

# Now the even part can easily be calculated : by looking at the C(k,N)
# formula, we see that the even part is actually an unscaled DCT of size N/2.
# The odd part looks like a DCT of size N/2, but the coefficients are
# actually C ((2*i+1)*(2*o+1), 2*N) instead of C ((2*i+1)*o, N).

# We use a trigonometric relation here :
# 2 * C ((a+b)/2, N) * C ((a-b)/2, N) = C (a, N) + C (b, N)
# Thus with a = (2*i+1)*o and b = (2*i+1)*(o+1) :
# 2 * C((2*i+1)*(2*o+1),2N) * C(2*i+1,2N) = C((2*i+1)*o,N) + C((2*i+1)*(o+1),N)

# This leads us to the Lee DCT algorithm :

def unscaled_DCT_Lee (N, input, output):
    even_input = vector(N/2)
    odd_input = vector(N/2)
    even_output = vector(N/2)
    odd_output = vector(N/2)

    for i in range(N/2):
	even_input[i] = input[i] + input[N-1-i]
	odd_input[i] = input[i] - input[N-1-i]

    for i in range(N/2):
	odd_input[i] = odd_input[i] * (0.5 / C (2*i+1, N))

    unscaled_DCT (N/2, even_input, even_output)
    unscaled_DCT (N/2, odd_input, odd_output)

    for i in range(N/2-1):
	odd_output[i] = odd_output[i] + odd_output[i+1]

    for i in range(N/2):
	output[2*i] = even_output[i]
	output[2*i+1] = odd_output[i];

# Notes about this algorithm :

# The algorithm can be easily inverted to calculate the IDCT instead :
# each of the basic stages are separately inversible...

# This function does N adds, then N/2 muls, then 2 recursive calls with
# size N/2, then N/2-1 adds again. If we apply it recursively, the total
# number of operations will be N*log2(N)/2 multiplies and N*(3*log2(N)/2-1) + 1
# additions. So this is much faster than the canonical algorithm.

# Some of the multiplication coefficients 0.5/cos(...) can get quite large.
# This means that a small error in the input will give a large error on the
# output... For a DCT of size N the biggest coefficient will be at i=N/2-1
# and it will be slightly more than N/pi which can be large for large N's.

# In the IDCT however, the multiplication coefficients for the reverse
# transformation are of the form 2*cos(...) so they can not get big and there
# is no accuracy problem.

# You can find another description of this algorithm at
# http://www.intel.com/drg/mmx/appnotes/ap533.htm


# Another idea is to observe that the DCT calculation can be made to look like
# the DFT calculation : C (k, N) is the real part of W (k, 4*N) or W (-k, 4*N).
# We can use this idea translate the DCT algorithm into a call to the DFT
# algorithm :

def unscaled_DCT_DFT (N, input, output):
    DFT_input = vector (4*N)
    DFT_output = vector (4*N)

    for i in range(N):
	DFT_input[2*i+1] = input[i]
	#DFT_input[4*N-2*i-1] = input[i]	# We could use this instead

    unscaled_DFT (4*N, DFT_input, DFT_output)

    for i in range(N):
	output[i] = DFT_output[i].real


# We can then use our knowledge of the DFT calculation to optimize for this
# particular case. For example using the radix-2 decimation-in-time method :

#def unscaled_DCT_DFT (N, input, output):
#   DFT_input = vector (2*N)
#   DFT_output = vector (2*N)
#
#   for i in range(N):
#	DFT_input[i] = input[i]
#	#DFT_input[2*N-1-i] = input[i]	# We could use this instead
#
#   unscaled_DFT (2*N, DFT_input, DFT_output)
#
#   for i in range(N):
#       DFT_output[i] = DFT_output[i] * W (i, 4*N)
#
#   for i in range(N):
#	output[i] = DFT_output[i].real

# This leads us to the AAN implementation of the DCT algorithm : if we set
# both DFT_input[i] and DFT_input[2*N-1-i] to input[i], then the imaginary
# parts of W(2*i+1) and W(-2*i-1) will compensate, and output_DFT[i] will
# already be a real after the multiplication by W(i,4*N). Which means that
# before the multiplication, it is the product of a real number and W(-i,4*N).
# This leads to the following code, called the AAN algorithm :

def unscaled_DCT_AAN (N, input, output):
    DFT_input = vector (2*N)
    DFT_output = vector (2*N)

    for i in range(N):
	DFT_input[i] = input[i]
	DFT_input[2*N-1-i] = input[i]

    symetrical_unscaled_DFT (2*N, DFT_input, DFT_output)

    for i in range(N):
	output[i] = DFT_output[i].real * (0.5 / C (i, N))

# Notes about the AAN algorithm :

# The cost of this function is N real multiplies and a DFT of size 2*N. The
# DFT to calculate has special properties : the inputs are real and symmetric.
# Also, we only need to calculate the real parts of the N first DFT outputs.
# We can try to take advantage of all that.

# We can invert this algorithm to calculate the IDCT. The final multiply
# stage is trivially invertible. The DFT stage is invertible too, but we have
# to take into account the special properties of this particular DFT for that.

# Once again we have to take care of numerical precision for the DFT : the
# output coefficients can get large, so that a small error in the input will
# give a large error on the output... For a DCT of size N the biggest
# coefficient will be at i=N/2-1 and it will be slightly more than N/pi

# You can find another description of this algorithm at this url :
# www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fastdct.html
# (It is the same server where we already found a description of the fast DFT)


# To optimize the DFT calculation, we can take a lot of specific things into
# account : the input is real and symetric, and we only care about the real
# part of the output. Also, we only care about the N first output coefficients,
# but that one does not save operations actually, because the other
# coefficients are the conjugates of the ones we look anyway.

# One useful way to use the symmetry of the input is to use the radix-2
# decimation-in-frequency algorithm. We can write a version of
# unscaled_DFT_radix2_freq for the case where the input is symmetrical :
# we have removed a few additions in the first stages because even_input
# is symmetrical and odd_input is antisymetrical. Also, we have modified the
# odd_input vector so that the second half of it is set to zero and the real
# part of the DFT output is not modified. After that modification, the second
# part of the odd_input was null so we used the radix-2 decimation-in-frequency
# again on the odd DFT. Also odd_output is symmetrical because input is real...

def symetrical_unscaled_DFT (N, input, output):
    even_input = vector(N/2)
    odd_tmp = vector(N/2)
    odd_input = vector(N/2)
    even_output = vector(N/2)
    odd_output = vector(N/2)

    for i in range(N/4):
	even_input[N/2-i-1] = even_input[i] = input[i] + input[N/2-1-i]

    for i in range(N/4):
	odd_tmp[i] = input[i] - input[N/2-1-i]

    odd_input[0] = odd_tmp[0]
    for i in range(N/4)[1:]:
	odd_input[i] = (odd_tmp[i] + odd_tmp[i-1]) * W (i, N)

    unscaled_DFT (N/2, even_input, even_output)
    # symmetrical real inputs, real outputs

    unscaled_DFT (N/4, odd_input, odd_output)
    # complex inputs, real outputs

    for i in range(N/2):
	output[2*i] = even_output[i]

    for i in range(N/4):
	output[N-1-4*i] = output[4*i+1] = odd_output[i]

# This procedure takes 3*N/4-1 real additions and N/2-3 real multiplies,
# followed by another symmetrical real DFT of size N/2 and a "complex to real"
# DFT of size N/4.

# We thus get the following performance results :
#	N	real additions	real multiplies	complex multiplies
#       1               0               0               0
#       2               0               0               0
#       4               2               0               0
#       8               9               1               0
#      16              28               6               0
#      32              76              21               0
#      64             189              54               2
#     128             451             125              10
#     256            1042             270              36
#     512            2358             565             108
#    1024            5251            1158             294
#    2048           11557            2349             750
#    4096           25200            4734            1832
#    8192           54544            9509            4336
#   16384          117337           19062           10026
#   32768          251127           38173           22770
#   65536          535102           76398           50988


# We thus get a better performance with the AAN DCT algorithm than with the
# Lee DCT algorithm : we can do a DCT of size 32 with 189 additions, 54+32 real
# multiplies, and 2 complex multiplies. The Lee algorithm would have used 209
# additions and 80 multiplies. With the AAN algorithm, we also have the
# advantage that a big number of the multiplies are actually grouped at the
# output stage of the algorithm, so if we want to do a DCT followed by a
# quantization stage, we will be able to group the multiply of the output with
# the multiply of the quantization stage, thus saving 32 more operations. In
# the mpeg audio layer 1 or 2 processing, we can also group the multiply of the
# output with the multiply of the convolution stage...

# Another source code for the AAN algorithm (implemented on 8 points, and
# without all of the explanations) can be found at this URL :
# http://developer.intel.com/drg/pentiumII/appnotes/aan_org.c . This
# implementation uses 28 adds and 6+8 muls instead of 29 adds and 5+8 muls -
# the difference is that in the symetrical_unscaled_DFT procedure, they noticed
# how odd_input[i] and odd_input[N/4-i] will be combined at the start of the
# complex-to-real DFT and they took advantage of this to convert 2 real adds
# and 4 real muls into one complex multiply.


# TODO : write about multi-dimentional DCT


# TEST CODE

def dump (vector):
    str = ""
    for i in range(len(vector)):
	if i:
	    str = str + ", "
	vector[i] = vector[i] + 0j
	realstr = "%+.4f" % vector[i].real
	imagstr = "%+.4fj" % vector[i].imag
	if (realstr == "-0.0000"):
	    realstr = "+0.0000"
	if (imagstr == "-0.0000j"):
	    imagstr = "+0.0000j"
	str = str + realstr #+ imagstr
    return "[%s]" % str

def test(N):
    input = vector(N)
    output = vector(N)
    verify = vector(N)

    for i in range(N):
	input[i] = random() + 1j * random()

    unscaled_DFT (N, input, output)
    unscaled_DFT (N, input, verify)

    if (dump(output) != dump(verify)):
	print dump(output)
	print dump(verify)

#test (64)


# PERFORMANCE ANALYSIS CODE

def display (table):
    N = 1
    print "#\tN\treal additions\treal multiplies\tcomplex multiplies"
    while table.has_key(N):
	print "#%8d%16d%16d%16d" % (N, table[N][0], table[N][1], table[N][2])
	N = 2*N
    print

best_complex_DFT = {}

def complex_DFT (max_N):
    best_complex_DFT[1] = (0,0,0)
    best_complex_DFT[2] = (4,0,0)
    best_complex_DFT[4] = (16,0,0)
    N = 8
    while (N<=max_N):
	# best method = split radix
	best2 = best_complex_DFT[N/2]
	best4 = best_complex_DFT[N/4]
	best_complex_DFT[N] = (best2[0] + 2*best4[0] + 3*N + 4,
			       best2[1] + 2*best4[1] + 4,
			       best2[2] + 2*best4[2] + N/2 - 4)
	N = 2*N

best_real_DFT = {}

def real_DFT (max_N):
    best_real_DFT[1] = (0,0,0)
    best_real_DFT[2] = (2,0,0)
    best_real_DFT[4] = (6,0,0)
    N = 8
    while (N<=max_N):
	# best method = split radix decimate-in-frequency
	best2 = best_real_DFT[N/2]
	best4 = best_complex_DFT[N/4]
	best_real_DFT[N] = (best2[0] + best4[0] + N + 2,
			    best2[1] + best4[1] + 2,
			    best2[2] + best4[2] + N/4 - 2)
	N = 2*N

best_complex2real_DFT = {}

def complex2real_DFT (max_N):
    best_complex2real_DFT[1] = (0,0,0)
    best_complex2real_DFT[2] = (2,0,0)
    best_complex2real_DFT[4] = (8,0,0)
    N = 8
    while (N<=max_N):
	best2 = best_complex2real_DFT[N/2]
	best4 = best_complex_DFT[N/4]
	best_complex2real_DFT[N] = (best2[0] + best4[0] + 3*N/2 + 1,
				    best2[1] + best4[1] + 2,
				    best2[2] + best4[2] + N/4 - 2)
	N = 2*N

best_real_symetric_DFT = {}

def real_symetric_DFT (max_N):
    best_real_symetric_DFT[1] = (0,0,0)
    best_real_symetric_DFT[2] = (0,0,0)
    best_real_symetric_DFT[4] = (2,0,0)
    N = 8
    while (N<=max_N):
	best2 = best_real_symetric_DFT[N/2]
	best4 = best_complex2real_DFT[N/4]
	best_real_symetric_DFT[N] = (best2[0] + best4[0] + 3*N/4 - 1,
				     best2[1] + best4[1] + N/2 - 3,
				     best2[2] + best4[2])
	N = 2*N

complex_DFT (65536)
real_DFT (65536)
complex2real_DFT (65536)
real_symetric_DFT (65536)


print "complex DFT"
display (best_complex_DFT)

print "real DFT"
display (best_real_DFT)

print "complex2real DFT"
display (best_complex2real_DFT)

print "real symetric DFT"
display (best_real_symetric_DFT)