+#include "unicode.compatibility.h"
+
+// Portable countof: element count of a true array (not a pointer); the macro argument is parenthesized so any array-valued expression expands safely
+#if defined( _WIN32 )
+ #define countOf _countof
+#else
+ #define countOf( arr ) ( sizeof( arr ) / sizeof( ( arr )[ 0 ] ) )
+#endif
+
+/**
+ * \brief OpenCL related initialization
+ * Create Context, Device list
+ * Load CL file, compile, link CL source
+ * Build program and kernel objects
+ */
+std::vector< cl_device_id > initializeCL( cl_device_type deviceType,
+ cl_uint deviceGpuList,
+ cl_context& context,
+ bool printclInfo );
+
+/**
+ * \brief OpenCL memory buffer creation
+ */
+int createOpenCLMemoryBuffer(
+ cl_context& context,
+ const size_t bufferSizeBytes,
+ const cl_uint numBuffers,
+ cl_mem buffer[],
+ cl_mem_flags accessibility
+ );
+
+/**
+ * \brief OpenCL command queue creation
+ * Create Command Queue
+ * Create OpenCL memory buffer objects
+ */
+void createOpenCLCommandQueue( cl_context& context,
+ cl_uint commandQueueFlags,
+ cl_command_queue& commandQueue,
+ std::vector< cl_device_id > devices,
+ const size_t bufferSizeBytesIn,
+ const cl_uint numBuffersIn,
+ cl_mem clMemBufferIn[],
+ const size_t bufferSizeBytesOut,
+ const cl_uint numBuffersOut,
+ cl_mem clMemBufferOut[] );
+
+/**
+ * \brief release OpenCL memory buffer
+ */
+int releaseOpenCLMemBuffer( const cl_uint numBuffers, cl_mem buffer[] );
+
+std::string prettyPrintclFFTStatus( const cl_int& status ); // used by OpenCL_V_Throw to render the status inside its error message
+
+// Validates an OpenCL status code: CL_SUCCESS is returned to the caller unchanged, while any other
+// value is formatted as "OPENCL_V_THROWERROR< status > (line): msg", echoed on std::cout, and thrown.
+// Note: std::runtime_error does not take unicode strings as input, so only strings supported
+inline cl_int OpenCL_V_Throw ( cl_int res, const std::string& msg, size_t lineno )
+{
+	// Success is the common case; hand the status straight back.
+	if( res == CL_SUCCESS )
+	{
+		return res;
+	}
+
+	// Failure: build the diagnostic text piece by piece.
+	std::stringstream report;
+	report << "OPENCL_V_THROWERROR< "
+		<< prettyPrintclFFTStatus( res )
+		<< " > ("
+		<< lineno
+		<< "): "
+		<< msg;
+
+	// The same text is printed and carried by the exception.
+	const std::string errorText( report.str( ) );
+	std::cout << errorText << std::endl;
+
+	throw std::runtime_error( errorText );
+}
+#define OPENCL_V_THROW(_status,_message) OpenCL_V_Throw (_status, _message, __LINE__)
+
+/**
+ * \brief Release OpenCL resources (Context, Memory etc.)
+ */
+int cleanupCL( cl_context* context, cl_command_queue* commandQueue, const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[], cl_event* outEvent );
+
+#endif
diff --git a/src/client/stdafx.cpp b/src/client/stdafx.cpp
new file mode 100644
index 00000000..2587b2c1
--- /dev/null
+++ b/src/client/stdafx.cpp
@@ -0,0 +1,25 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// stdafx.cpp : source file that includes just the standard includes
+// clFFT.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+
+#include "stdafx.h"
+
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/src/include/clAmdFft.h b/src/include/clAmdFft.h
new file mode 100644
index 00000000..848d0760
--- /dev/null
+++ b/src/include/clAmdFft.h
@@ -0,0 +1,535 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*! @file clAmdFft.h
+ * @note clAmdFft.h is a deprecated header file.
+ * This header is provided to help projects that were written with the older clAmdFft codebase, to help them
+ * port to the new API at their own schedule. It will not be maintained or updated, and will be removed after
+ * a reasonable amount of time has passed. All new code should be written against clFFT.h.
+ * Older projects should migrate to the new header at their earliest convenience.
+ */
+
+#pragma once
+#if !defined( CLAMDFFT_DOTH )
+#define CLAMDFFT_DOTH
+
+#include "clFFT.h"
+
+/* The following header defines a fixed version number as this header is deprecated and won't be updated */
+#include "clAmdFft.version.h"
+
+/* In general, you can not use namespaces for strict C compliance, so we prefix our public accessible names
+ * with the string clAmdFft
+ */
+
+/* All functions will return pre-defined error codes, and will NOT throw exceptions to the caller
+ */
+
+/*! @brief clAmdFft error codes definition, incorporating OpenCL error definitions
+ *
+ * This enumeration is a superset of the OpenCL error codes. For example, CL_OUT_OF_HOST_MEMORY,
+ * which is defined in cl.h, is aliased as CLFFT_OUT_OF_HOST_MEMORY. The set of basic OpenCL
+ * error codes is extended to add extra values specific to the clAmdFft package.
+ */
+typedef enum clfftStatus_ clAmdFftStatus;
+
+/*! @brief The dimension of the input and output buffers that will be fed into all FFT transforms */
+typedef enum clfftDim_ clAmdFftDim;
+
+/*! @brief These are the expected layouts of the buffers */
+typedef enum clfftLayout_ clAmdFftLayout;
+
+/*! @brief This is the expected precision of each FFT.
+ */
+typedef enum clfftPrecision_ clAmdFftPrecision;
+
+/*! @brief The expected direction of each FFT, i.e. towards the time or the frequency domain */
+typedef enum clfftDirection_ clAmdFftDirection;
+
+/*! @brief Whether the input buffers are overwritten with the results (in-place) or not (out-of-place) */
+typedef enum clfftResultLocation_ clAmdFftResultLocation;
+
+/*! @brief This determines whether the result is returned in original order. It is valid only for
+dimensions greater than 1. */
+typedef enum clfftResultTransposed_ clAmdFftResultTransposed;
+
+/*! @brief Data structure that can be passed to clAmdFftSetup() to control the behavior of the FFT runtime
+ * @details This structure contains values that can be initialized before instantiation of the FFT runtime
+ * with ::clAmdFftSetup(). To initialize this structure, pass a pointer to a user struct to ::clAmdFftInitSetupData( ),
+ * which will clear the structure and set the version member variables to the current values.
+ */
+typedef struct clfftSetupData_ clAmdFftSetupData;
+
+/*! @brief An abstract handle to the object that represents the state of the FFT(s); alias of clfftPlanHandle */
+typedef clfftPlanHandle clAmdFftPlanHandle;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ /*! @brief Initialize a clAmdFftSetupData struct for the client
+ * @details clAmdFftSetupData is passed to clAmdFftSetup to control behavior of the FFT runtime
+ * @param[out] setupData Data structure is cleared, initialized with version information and default values
+ * @return Result of clfftInitSetupData(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftInitSetupData( clAmdFftSetupData* setupData )
+ {
+ return clfftInitSetupData( setupData );
+ }
+
+ /*! @brief Initialize internal FFT resources.
+ * @details AMD's FFT implementation caches kernels, programs and buffers for its internal use.
+ * @param[in] setupData Data structure that can be passed into the setup routine to control FFT generation behavior
+ * and debug functionality
+ * @return Result of clfftSetup(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftSetup( const clAmdFftSetupData* setupData )
+ {
+ return clfftSetup( setupData );
+ }
+
+ /*! @brief Release all internal resources.
+ * @details Call when client is done with this FFT library, allowing the library to destroy all resources it has cached
+ * @return Result of clfftTeardown(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftTeardown( )
+ {
+ return clfftTeardown( );
+ }
+
+ /*! @brief Query the FFT library for version information
+ * @details Return the major, minor and patch version numbers associated with this FFT library
+ * @param[out] major Major functionality change
+ * @param[out] minor Minor functionality change
+ * @param[out] patch Bug fixes, documentation changes, no new features introduced
+ * @return Result of clfftGetVersion(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftGetVersion( cl_uint* major, cl_uint* minor, cl_uint* patch )
+ {
+ return clfftGetVersion( major, minor, patch );
+ }
+
+ /*! @brief Create a plan object initialized entirely with default values.
+ * @details A plan is a repository of state for calculating FFT's. Allows the runtime to pre-calculate kernels, programs
+ * and buffers and associate them with buffers of specified dimensions.
+ * @param[out] plHandle Handle to the newly created plan
+ * @param[in] context Client is responsible for providing an OpenCL context for the plan
+ * @param[in] dim The dimensionality of the FFT transform; describes how many elements are in the array
+ * @param[in] clLengths An array of lengths, of size 'dim'. Each value describes the length of additional dimensions
+ * @return Result of clfftCreateDefaultPlan(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftCreateDefaultPlan( clAmdFftPlanHandle* plHandle, cl_context context, const clAmdFftDim dim,
+ const size_t* clLengths )
+ {
+ return clfftCreateDefaultPlan( plHandle, context, dim, clLengths );
+ }
+
+ /*! @brief Create a copy of an existing plan.
+ * @details This API allows a client to create a new plan based upon an existing plan. This is a convenience function
+ * provided for quickly creating plans that are similar, but may differ slightly.
+ * @param[out] out_plHandle Handle to the newly created plan that is based on in_plHandle
+ * @param[in] new_context Client is responsible for providing a new context for the new plan
+ * @param[in] in_plHandle Handle to a plan to be copied, previously created
+ * @return Result of clfftCopyPlan(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftCopyPlan( clAmdFftPlanHandle* out_plHandle, cl_context new_context, clAmdFftPlanHandle in_plHandle )
+ {
+ return clfftCopyPlan( out_plHandle, new_context, in_plHandle );
+ }
+
+ /*! @brief Prepare the plan for execution.
+ * @details After all plan parameters are set, the client has the option of 'baking' the plan, which tells the runtime that
+ * no more changes to the plan's parameters are expected, and the OpenCL kernels should be compiled. This optional function
+ * allows the client application to perform this function when the application is being initialized instead of on the first
+ * execution.
+ * At this point, the clAmdFft runtime will apply all implemented optimizations, possibly including
+ * running kernel experiments on the devices in the plan context.
+ * Users should assume that this function will take a long time to execute. If a plan is not baked before being executed,
+ * users should assume that the first call to clAmdFftEnqueueTransform will take a long time to execute.
+ * If any significant parameter of a plan is changed after the plan is baked (by a subsequent call to one of
+ * the clAmdFftSetPlan____ functions), that will not be considered an error. Instead, the plan will revert back to
+ * the unbaked state, discarding the benefits of the baking operation.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] numQueues Number of command queues in commQueueFFT; 0 is a valid value, in which case client does not want
+ * the runtime to run load experiments and only pre-calculate state information
+ * @param[in] commQueueFFT An array of cl_command_queues created by the client; the command queues must be a proper subset of
+ * the devices included in the plan context
+ * @param[in] pfn_notify A function pointer to a notification routine. The notification routine is a callback function that
+ * an application can register and which will be called when the program executable has been built (successfully or unsuccessfully).
+ * Currently, this parameter MUST be NULL or nullptr.
+ * @param[in] user_data Passed as an argument when pfn_notify is called.
+ * Currently, this parameter MUST be NULL or nullptr.
+ * @return Result of clfftBakePlan(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftBakePlan( clAmdFftPlanHandle plHandle, cl_uint numQueues, cl_command_queue* commQueueFFT,
+ void (CL_CALLBACK *pfn_notify)(clAmdFftPlanHandle plHandle, void *user_data), void* user_data )
+ {
+ return clfftBakePlan( plHandle, numQueues, commQueueFFT, pfn_notify, user_data );
+ }
+
+ /*! @brief Release the resources of a plan.
+ * @details A plan may include kernels, programs and buffers associated with it that consume memory. When a plan
+ * is not needed anymore, the client should release the plan.
+ * @param[in,out] plHandle Handle to a plan previously created
+ * @return Result of clfftDestroyPlan(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftDestroyPlan( clAmdFftPlanHandle* plHandle )
+ {
+ return clfftDestroyPlan( plHandle );
+ }
+
+ /*! @brief Retrieve the OpenCL context of a previously created plan.
+ * @details User should pass a pointer to a cl_context variable, which will be changed to point to a
+ * context set in the specified plan.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] context Pointer to user allocated cl_context, which will point to context set in plan
+ * @return Result of clfftGetPlanContext(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftGetPlanContext( const clAmdFftPlanHandle plHandle, cl_context* context )
+ {
+ return clfftGetPlanContext( plHandle, context );
+ }
+
+ /*! @brief Retrieve the floating point precision of the FFT data
+ * @details User should pass a pointer to a clAmdFftPrecision variable, which will be set to the
+ * precision of the FFT complex data in the plan.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] precision Pointer to user clAmdFftPrecision enum
+ * @return Result of clfftGetPlanPrecision(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftGetPlanPrecision( const clAmdFftPlanHandle plHandle, clAmdFftPrecision* precision )
+ {
+ return clfftGetPlanPrecision( plHandle, precision );
+ }
+
+ /*! @brief Set the floating point precision of the FFT data
+ * @details Set the plan property which will be the precision of the FFT complex data in the plan.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] precision clAmdFftPrecision enum value, passed by value
+ * @return Result of clfftSetPlanPrecision(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftSetPlanPrecision( clAmdFftPlanHandle plHandle, clAmdFftPrecision precision )
+ {
+ return clfftSetPlanPrecision( plHandle, precision );
+ }
+
+ /*! @brief Retrieve the scaling factor that should be applied to the FFT data
+ * @details User should pass a pointer to a cl_float variable, which will be set to the
+ * floating point scaling factor that will be multiplied across the FFT data.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dir Which direction does the scaling factor apply to
+ * @param[out] scale Pointer to user cl_float variable
+ * @return Result of clfftGetPlanScale(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftGetPlanScale( const clAmdFftPlanHandle plHandle, clAmdFftDirection dir, cl_float* scale )
+ {
+ return clfftGetPlanScale( plHandle, dir, scale );
+ }
+
+ /*! @brief Set the scaling factor that should be applied to the FFT data
+ * @details Set the plan property which will be the floating point scaling factor that will be
+ * multiplied across the FFT data.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dir Which direction does the scaling factor apply to
+ * @param[in] scale Scaling factor, passed by value as a cl_float
+ * @return Result of clfftSetPlanScale(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftSetPlanScale( clAmdFftPlanHandle plHandle, clAmdFftDirection dir, cl_float scale )
+ {
+ return clfftSetPlanScale( plHandle, dir, scale );
+ }
+
+ /*! @brief Retrieve the number of discrete arrays that this plan can handle concurrently
+ * @details User should pass a pointer to a size_t variable, which will be set to the
+ * number of discrete arrays (1D or 2D) that will be batched together for this plan
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] batchSize How many discrete number of FFT's are to be performed
+ * @return Result of clfftGetPlanBatchSize(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftGetPlanBatchSize( const clAmdFftPlanHandle plHandle, size_t* batchSize )
+ {
+ return clfftGetPlanBatchSize( plHandle, batchSize );
+ }
+
+ /*! @brief Set the number of discrete arrays that this plan can handle concurrently
+ * @details Set the plan property which will be set to the number of discrete arrays (1D or 2D)
+ * that will be batched together for this plan
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] batchSize How many discrete number of FFT's are to be performed
+ * @return Result of clfftSetPlanBatchSize(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftSetPlanBatchSize( clAmdFftPlanHandle plHandle, size_t batchSize )
+ {
+ return clfftSetPlanBatchSize( plHandle, batchSize );
+ }
+
+ /*! @brief Retrieve the dimensionality of FFT's to be transformed in the plan
+ * @details Queries a plan object and retrieves the dimensionality that the plan is set for. A size is returned to
+ * help the client allocate the proper storage to hold the dimensions in a further call to clAmdFftGetPlanLength
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] dim The dimensionality of the FFT's to be transformed
+ * @param[out] size Value used to allocate an array to hold the FFT dimensions.
+ * @return Result of clfftGetPlanDim(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftGetPlanDim( const clAmdFftPlanHandle plHandle, clAmdFftDim* dim, cl_uint* size )
+ {
+ return clfftGetPlanDim( plHandle, dim, size );
+ }
+
+ /*! @brief Set the dimensionality of FFT's to be transformed by the plan
+ * @details Set the dimensionality of FFT's to be transformed by the plan
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimensionality of the FFT's to be transformed
+ * @return Result of clfftSetPlanDim(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftSetPlanDim( clAmdFftPlanHandle plHandle, const clAmdFftDim dim )
+ {
+ return clfftSetPlanDim( plHandle, dim );
+ }
+
+ /*! @brief Retrieve the length of each dimension of the FFT
+ * @details User should pass a pointer to a size_t array, which will be set to the
+ * length of each discrete dimension of the FFT
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimension of the length parameters; describes how many elements are in the array
+ * @param[out] clLengths An array of lengths, of size 'dim'. Each array value describes the length of each dimension
+ * @return Result of clfftGetPlanLength(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftGetPlanLength( const clAmdFftPlanHandle plHandle, const clAmdFftDim dim, size_t* clLengths )
+ {
+ return clfftGetPlanLength( plHandle, dim, clLengths );
+ }
+
+ /*! @brief Set the length of each dimension of the FFT
+ * @details Set the plan property which will be the length of each discrete dimension of the FFT
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimension of the length parameters; describes how many elements are in the array
+ * @param[in] clLengths An array of lengths, of size 'dim'. Each value describes the length of additional dimensions
+ * @return Result of clfftSetPlanLength(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftSetPlanLength( clAmdFftPlanHandle plHandle, const clAmdFftDim dim, const size_t* clLengths )
+ {
+ return clfftSetPlanLength( plHandle, dim, clLengths );
+ }
+
+ /*! @brief Retrieve the distance between consecutive elements for input buffers in a dimension.
+ * @details Depending on how the dimension is set in the plan (for 2D or 3D FFT's), strideY or strideZ can be safely
+ * ignored
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimension of the stride parameters; describes how many elements are in the array
+ * @param[out] clStrides An array of strides, of size 'dim'.
+ * @return Result of clfftGetPlanInStride(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftGetPlanInStride( const clAmdFftPlanHandle plHandle, const clAmdFftDim dim, size_t* clStrides )
+ {
+ return clfftGetPlanInStride( plHandle, dim, clStrides );
+ }
+
+ /*! @brief Set the distance between consecutive elements for input buffers in a dimension.
+ * @details Set the plan properties which will be the distance between elements in a given dimension
+ * (units are in terms of clAmdFftPrecision)
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimension of the stride parameters; describes how many elements are in the array
+ * @param[in] clStrides An array of strides, of size 'dim'. Usually strideX=1 so that successive elements in the first dimension are stored contiguously.
+ * Typically strideY=LenX, strideZ=LenX*LenY such that successive elements in the second and third dimensions are stored in packed format.
+ * See @ref DistanceStridesandPitches for details.
+ * @return Result of clfftSetPlanInStride(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftSetPlanInStride( clAmdFftPlanHandle plHandle, const clAmdFftDim dim, size_t* clStrides )
+ {
+ return clfftSetPlanInStride( plHandle, dim, clStrides );
+ }
+
+ /*! @brief Retrieve the distance between consecutive elements for output buffers in a dimension.
+ * @details Depending on how the dimension is set in the plan (for 2D or 3D FFT's), strideY or strideZ can be safely
+ * ignored
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimension of the stride parameters; describes how many elements are in the array
+ * @param[out] clStrides An array of strides, of size 'dim'.
+ * @return Result of clfftGetPlanOutStride(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftGetPlanOutStride( const clAmdFftPlanHandle plHandle, const clAmdFftDim dim, size_t* clStrides )
+ {
+ return clfftGetPlanOutStride( plHandle, dim, clStrides );
+ }
+
+ /*! @brief Set the distance between consecutive elements for output buffers in a dimension.
+ * @details Set the plan properties which will be the distance between elements in a given dimension
+ * (units are in terms of clAmdFftPrecision)
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimension of the stride parameters; describes how many elements are in the array
+ * @param[in] clStrides An array of strides, of size 'dim'. Usually strideX=1 so that successive elements in the first dimension are stored contiguously.
+ * Typically strideY=LenX, strideZ=LenX*LenY such that successive elements in the second and third dimensions are stored in packed format.
+ * @sa clAmdFftSetPlanInStride
+ * @return Result of clfftSetPlanOutStride(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftSetPlanOutStride( clAmdFftPlanHandle plHandle, const clAmdFftDim dim, size_t* clStrides )
+ {
+ return clfftSetPlanOutStride( plHandle, dim, clStrides );
+ }
+
+ /*! @brief Retrieve the distance between Array objects
+ * @details Pitch is the distance between each discrete array object in an FFT array. This is only used
+ * for 'array' dimensions in clAmdFftDim; see clAmdFftSetPlanDim (units are in terms of clAmdFftPrecision)
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] iDist The distance between the beginning elements of the discrete array objects in memory on input.
+ * For contiguous arrays in memory, iDist=(strideX*strideY*strideZ)
+ * @param[out] oDist The distance between the beginning elements of the discrete array objects in memory on output.
+ * For contiguous arrays in memory, oDist=(strideX*strideY*strideZ)
+ * @return Result of clfftGetPlanDistance(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftGetPlanDistance( const clAmdFftPlanHandle plHandle, size_t* iDist, size_t* oDist )
+ {
+ return clfftGetPlanDistance( plHandle, iDist, oDist );
+ }
+
+ /*! @brief Set the distance between Array objects
+ * @details Pitch is the distance between each discrete array object in an FFT array. This is only used
+ * for 'array' dimensions in clAmdFftDim; see clAmdFftSetPlanDim (units are in terms of clAmdFftPrecision)
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] iDist The distance between the beginning elements of the discrete array objects in memory on input.
+ * For contiguous arrays in memory, iDist=(strideX*strideY*strideZ)
+ * @param[in] oDist The distance between the beginning elements of the discrete array objects in memory on output.
+ * For contiguous arrays in memory, oDist=(strideX*strideY*strideZ)
+ * @return Result of clfftSetPlanDistance(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftSetPlanDistance( clAmdFftPlanHandle plHandle, size_t iDist, size_t oDist )
+ {
+ return clfftSetPlanDistance( plHandle, iDist, oDist );
+ }
+
+ /*! @brief Retrieve the expected layout of the input and output buffers
+ * @details Output buffers can be filled with either hermitian or complex numbers. Complex numbers can be stored
+ * in various layouts; this informs the FFT engine what layout to produce on output
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] iLayout Indicates how the input buffers are laid out in memory
+ * @param[out] oLayout Indicates how the output buffers are laid out in memory
+ * @return Result of clfftGetLayout(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftGetLayout( const clAmdFftPlanHandle plHandle, clAmdFftLayout* iLayout, clAmdFftLayout* oLayout )
+ {
+ return clfftGetLayout( plHandle, iLayout, oLayout );
+ }
+
+ /*! @brief Set the expected layout of the input and output buffers
+ * @details Output buffers can be filled with either hermitian or complex numbers. Complex numbers can be stored
+ * in various layouts; this informs the FFT engine what layout to produce on output
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] iLayout Indicates how the input buffers are laid out in memory
+ * @param[in] oLayout Indicates how the output buffers are laid out in memory
+ * @return Result of clfftSetLayout(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftSetLayout( clAmdFftPlanHandle plHandle, clAmdFftLayout iLayout, clAmdFftLayout oLayout )
+ {
+ return clfftSetLayout( plHandle, iLayout, oLayout );
+ }
+
+ /*! @brief Retrieve whether the input buffers are going to be overwritten with results
+ * @details If the setting is to do an in-place transform, the input buffers are overwritten with the results of the
+ * transform. If the setting is for out-of-place transforms, the engine knows to look for separate output buffers
+ * on the Enqueue call.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] placeness Receives the plan's setting: clobber the input buffers or expect output buffers for results
+ * @return Result of clfftGetResultLocation(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftGetResultLocation( const clAmdFftPlanHandle plHandle, clAmdFftResultLocation* placeness )
+ {
+ return clfftGetResultLocation( plHandle, placeness );
+ }
+
+ /*! @brief Set whether the input buffers are going to be overwritten with results
+ * @details If the setting is to do an in-place transform, the input buffers are overwritten with the results of the
+ * transform. If the setting is for out-of-place transforms, the engine knows to look for separate output buffers
+ * on the Enqueue call.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] placeness Tells the FFT engine to clobber the input buffers or to expect output buffers for results
+ * @return Result of clfftSetResultLocation(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftSetResultLocation( clAmdFftPlanHandle plHandle, clAmdFftResultLocation placeness )
+ {
+ return clfftSetResultLocation( plHandle, placeness );
+ }
+
+ /*! @brief Retrieve the final transpose setting of a multi-dimensional FFT
+ * @details A multi-dimensional FFT typically transposes the data several times during calculation. If the client
+ * does not care about the final transpose to put data back in proper dimension, the final transpose can be skipped
+ * for possible speed improvements
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] transposed Parameter specifies whether the final transpose can be skipped
+ * @return Result of clfftGetPlanTransposeResult(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftGetPlanTransposeResult( const clAmdFftPlanHandle plHandle, clAmdFftResultTransposed * transposed )
+ {
+ return clfftGetPlanTransposeResult( plHandle, transposed );
+ }
+
+ /*! @brief Set the final transpose setting of a multi-dimensional FFT
+ * @details A multi-dimensional FFT typically transposes the data several times during calculation. If the client
+ * does not care about the final transpose to put data back in proper dimension, the final transpose can be skipped
+ * for possible speed improvements
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] transposed Parameter specifies whether the final transpose can be skipped
+ * @return Result of clfftSetPlanTransposeResult(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftSetPlanTransposeResult( clAmdFftPlanHandle plHandle, clAmdFftResultTransposed transposed )
+ {
+ return clfftSetPlanTransposeResult( plHandle, transposed );
+ }
+
+ /*! @brief Get buffer size (in bytes), which may be needed internally for an intermediate buffer
+ * @details Very large FFT transforms may need multiple passes, and the operation would need a temporary buffer to hold
+ * intermediate results. This function is only valid after the plan is baked, otherwise an invalid operation error
+ * is returned. If buffersize returns as 0, the runtime needs no temporary buffer.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] buffersize Size in bytes for intermediate buffer
+ * @return Result of clfftGetTmpBufSize(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes */
+ __inline clAmdFftStatus clAmdFftGetTmpBufSize( const clAmdFftPlanHandle plHandle, size_t* buffersize )
+ {
+ return clfftGetTmpBufSize( plHandle, buffersize );
+ }
+
+ /*! @brief Enqueue an FFT transform operation, and return immediately (non-blocking)
+ * @details This transform API is the function that actually computes the FFT transform. It is non-blocking as it
+ * only enqueues the OpenCL kernels for execution. The synchronization step has to be managed by the user.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dir Forwards or backwards transform
+ * @param[in] numQueuesAndEvents Number of command queues in commQueues; number of expected events to be returned in outEvents
+ * @param[in] commQueues An array of cl_command_queues created by the client; the command queues must be a proper subset of
+ * the devices included in the plan context
+ * @param[in] numWaitEvents Specify the number of elements in the eventWaitList array
+ * @param[in] waitEvents Events that this transform should wait to complete before executing on the device
+ * @param[out] outEvents The runtime fills this array with events corresponding 1 to 1 with the input command queues passed
+ * in commQueues. This parameter can be NULL or nullptr, in which case client is not interested in receiving notifications
+ * when transforms are finished, otherwise if not NULL the client is responsible for allocating this array, with at least
+ * as many elements as specified in numQueuesAndEvents.
+ * @param[in] inputBuffers An array of cl_mem objects that contain data for processing by the FFT runtime. If the transform
+ * is in place, the FFT results will overwrite the input buffers
+ * @param[out] outputBuffers An array of cl_mem objects that will store the results of out of place transforms. If the transform
+ * is in place, this parameter may be NULL or nullptr. It is completely ignored
+ * @param[in] tmpBuffer A cl_mem object that is reserved as a temporary buffer for FFT processing. If tmpBuffer is NULL or nullptr,
+ * and the runtime needs temporary storage, an internal temporary buffer will be created on the fly managed by the runtime.
+ * @return Result of clfftEnqueueTransform(), forwarded unchanged; enum describing error condition, superset of OpenCL error codes
+ */
+ __inline clAmdFftStatus clAmdFftEnqueueTransform(
+ clAmdFftPlanHandle plHandle,
+ clAmdFftDirection dir,
+ cl_uint numQueuesAndEvents,
+ cl_command_queue* commQueues,
+ cl_uint numWaitEvents,
+ const cl_event* waitEvents,
+ cl_event* outEvents,
+ cl_mem* inputBuffers,
+ cl_mem* outputBuffers,
+ cl_mem tmpBuffer
+ )
+ {
+ return clfftEnqueueTransform( plHandle, dir, numQueuesAndEvents, commQueues, numWaitEvents, waitEvents, outEvents,
+ inputBuffers, outputBuffers, tmpBuffer );
+ }
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/clAmdFft.version.h b/src/include/clAmdFft.version.h
new file mode 100644
index 00000000..ec9ef542
--- /dev/null
+++ b/src/include/clAmdFft.version.h
@@ -0,0 +1,29 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*! @file clAmdFft.version.h
+ * @note clAmdFft.version.h is a deprecated header file.
+ * This header is provided to help projects that were written with the older clAmdFft codebase, to help them
+ * port to the new API at their own schedule. It will not be maintained or updated, and will be removed after
+ * a reasonable amount of time has passed. All new code should be written against clFFT.h.
+ * Older projects should migrate to the new header at their earliest convenience.
+ */
+
+/* the configured version and settings for clFFT
+ */
+#define clAmdFftVersionMajor 2
+#define clAmdFftVersionMinor 0
+#define clAmdFftVersionPatch 0
diff --git a/src/include/clFFT.h b/src/include/clFFT.h
new file mode 100644
index 00000000..f75ded30
--- /dev/null
+++ b/src/include/clFFT.h
@@ -0,0 +1,580 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*! @file clFFT.h
+ * clFFT.h defines all of the public interfaces and types that are meant to be used by clFFT clients
+ * This is the one public header file that should be consumed by clFFT clients. It is written to adhere to native "C"
+ * interfaces to make clFFT library as portable as possible; it should be callable from C, C++, .NET and Fortran,
+ * either with the proper linking or using wrapper classes.
+ *
+ */
+
+#pragma once
+#if !defined( CLFFT_H )
+#define CLFFT_H
+
+#if defined(__APPLE__) || defined(__MACOSX)
+ #include <OpenCL/cl.h>
+#else
+ #include <CL/cl.h>
+#endif
+
+#include "version.h"
+
+/*! This preprocessor definition is the standard way of making exporting APIs
+ * from a DLL simpler. All files within this DLL are compiled with the CLFFT_EXPORTS
+ * symbol defined on the command line. This symbol should not be defined on any project
+ * that uses this DLL. This way any other project whose source files include this file see
+ * clfft functions as being imported from a DLL, whereas this DLL sees symbols
+ * defined with this macro as being exported.
+ */
+#if defined( _WIN32 )
+ #if !defined( __cplusplus )
+ #define inline __inline
+ #endif
+
+ #if defined( CLFFT_EXPORTS )
+ #define CLFFTAPI __declspec( dllexport )
+ #else
+ #define CLFFTAPI __declspec( dllimport )
+ #endif
+#else
+ #define CLFFTAPI
+#endif
+
+/* In general, you can not use namespaces for strict C compliance, so we prefix our public accessible names
+ * with the string clfft
+ */
+
+/* All functions will return pre-defined error codes, and will NOT throw exceptions to the caller
+ */
+
+/*! @brief clfft error codes definition, incorporating OpenCL error definitions
+ *
+ * This enumeration is a superset of the OpenCL error codes. For example, CL_OUT_OF_HOST_MEMORY,
+ * which is defined in cl.h is aliased as CLFFT_OUT_OF_HOST_MEMORY. The set of basic OpenCL
+ * error codes is extended to add extra values specific to the clfft package.
+ */
+enum clfftStatus_
+{
+ CLFFT_INVALID_GLOBAL_WORK_SIZE = CL_INVALID_GLOBAL_WORK_SIZE,
+ CLFFT_INVALID_MIP_LEVEL = CL_INVALID_MIP_LEVEL,
+ CLFFT_INVALID_BUFFER_SIZE = CL_INVALID_BUFFER_SIZE,
+ CLFFT_INVALID_GL_OBJECT = CL_INVALID_GL_OBJECT,
+ CLFFT_INVALID_OPERATION = CL_INVALID_OPERATION,
+ CLFFT_INVALID_EVENT = CL_INVALID_EVENT,
+ CLFFT_INVALID_EVENT_WAIT_LIST = CL_INVALID_EVENT_WAIT_LIST,
+ CLFFT_INVALID_GLOBAL_OFFSET = CL_INVALID_GLOBAL_OFFSET,
+ CLFFT_INVALID_WORK_ITEM_SIZE = CL_INVALID_WORK_ITEM_SIZE,
+ CLFFT_INVALID_WORK_GROUP_SIZE = CL_INVALID_WORK_GROUP_SIZE,
+ CLFFT_INVALID_WORK_DIMENSION = CL_INVALID_WORK_DIMENSION,
+ CLFFT_INVALID_KERNEL_ARGS = CL_INVALID_KERNEL_ARGS,
+ CLFFT_INVALID_ARG_SIZE = CL_INVALID_ARG_SIZE,
+ CLFFT_INVALID_ARG_VALUE = CL_INVALID_ARG_VALUE,
+ CLFFT_INVALID_ARG_INDEX = CL_INVALID_ARG_INDEX,
+ CLFFT_INVALID_KERNEL = CL_INVALID_KERNEL,
+ CLFFT_INVALID_KERNEL_DEFINITION = CL_INVALID_KERNEL_DEFINITION,
+ CLFFT_INVALID_KERNEL_NAME = CL_INVALID_KERNEL_NAME,
+ CLFFT_INVALID_PROGRAM_EXECUTABLE = CL_INVALID_PROGRAM_EXECUTABLE,
+ CLFFT_INVALID_PROGRAM = CL_INVALID_PROGRAM,
+ CLFFT_INVALID_BUILD_OPTIONS = CL_INVALID_BUILD_OPTIONS,
+ CLFFT_INVALID_BINARY = CL_INVALID_BINARY,
+ CLFFT_INVALID_SAMPLER = CL_INVALID_SAMPLER,
+ CLFFT_INVALID_IMAGE_SIZE = CL_INVALID_IMAGE_SIZE,
+ CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR,
+ CLFFT_INVALID_MEM_OBJECT = CL_INVALID_MEM_OBJECT,
+ CLFFT_INVALID_HOST_PTR = CL_INVALID_HOST_PTR,
+ CLFFT_INVALID_COMMAND_QUEUE = CL_INVALID_COMMAND_QUEUE,
+ CLFFT_INVALID_QUEUE_PROPERTIES = CL_INVALID_QUEUE_PROPERTIES,
+ CLFFT_INVALID_CONTEXT = CL_INVALID_CONTEXT,
+ CLFFT_INVALID_DEVICE = CL_INVALID_DEVICE,
+ CLFFT_INVALID_PLATFORM = CL_INVALID_PLATFORM,
+ CLFFT_INVALID_DEVICE_TYPE = CL_INVALID_DEVICE_TYPE,
+ CLFFT_INVALID_VALUE = CL_INVALID_VALUE,
+ CLFFT_MAP_FAILURE = CL_MAP_FAILURE,
+ CLFFT_BUILD_PROGRAM_FAILURE = CL_BUILD_PROGRAM_FAILURE,
+ CLFFT_IMAGE_FORMAT_NOT_SUPPORTED = CL_IMAGE_FORMAT_NOT_SUPPORTED,
+ CLFFT_IMAGE_FORMAT_MISMATCH = CL_IMAGE_FORMAT_MISMATCH,
+ CLFFT_MEM_COPY_OVERLAP = CL_MEM_COPY_OVERLAP,
+ CLFFT_PROFILING_INFO_NOT_AVAILABLE = CL_PROFILING_INFO_NOT_AVAILABLE,
+ CLFFT_OUT_OF_HOST_MEMORY = CL_OUT_OF_HOST_MEMORY,
+ CLFFT_OUT_OF_RESOURCES = CL_OUT_OF_RESOURCES,
+ CLFFT_MEM_OBJECT_ALLOCATION_FAILURE = CL_MEM_OBJECT_ALLOCATION_FAILURE,
+ CLFFT_COMPILER_NOT_AVAILABLE = CL_COMPILER_NOT_AVAILABLE,
+ CLFFT_DEVICE_NOT_AVAILABLE = CL_DEVICE_NOT_AVAILABLE,
+ CLFFT_DEVICE_NOT_FOUND = CL_DEVICE_NOT_FOUND,
+ CLFFT_SUCCESS = CL_SUCCESS,
+ //-------------------------- Extended status codes for clfft (numbered upward from 4*1024) -----------
+ CLFFT_BUGCHECK = 4*1024, /*!< Bugcheck. First extended code; the ones below follow sequentially. */
+ CLFFT_NOTIMPLEMENTED, /*!< Functionality is not implemented yet. */
+ CLFFT_TRANSPOSED_NOTIMPLEMENTED, /*!< Transposed functionality is not implemented for this transformation. */
+ CLFFT_FILE_NOT_FOUND, /*!< Tried to open an existing file on the host system, but failed. */
+ CLFFT_FILE_CREATE_FAILURE, /*!< Tried to create a file on the host system, but failed. */
+ CLFFT_VERSION_MISMATCH, /*!< Version conflict between client and library. */
+ CLFFT_INVALID_PLAN, /*!< Requested plan could not be found. */
+ CLFFT_DEVICE_NO_DOUBLE, /*!< Double precision not supported on this device. */
+ CLFFT_ENDSTATUS /*!< End marker: always the last enumerator, one past CLFFT_DEVICE_NO_DOUBLE. */
+};
+typedef enum clfftStatus_ clfftStatus;
+
+/*! @brief The dimension of the input and output buffers that will be fed into all FFT transforms */
+typedef enum clfftDim_
+{
+ CLFFT_1D = 1, /*!< 1 Dimensional FFT transform (default). */
+ CLFFT_2D, /*!< 2 Dimensional FFT transform. */
+ CLFFT_3D, /*!< 3 Dimensional FFT transform. */
+ ENDDIMENSION /*!< End marker: always the last enumerator, one past CLFFT_3D. */
+} clfftDim;
+
+/*! @brief These are the expected layouts of the buffers */
+typedef enum clfftLayout_
+{
+ CLFFT_COMPLEX_INTERLEAVED = 1, /*!< An array of complex numbers, with real and imaginary components together (default). */
+ CLFFT_COMPLEX_PLANAR, /*!< Arrays of real components and arrays of imaginary components that have been separated out. */
+ CLFFT_HERMITIAN_INTERLEAVED, /*!< Compressed form of complex numbers; complex-conjugates not stored, real and imaginary components in same array. */
+ CLFFT_HERMITIAN_PLANAR, /*!< Compressed form of complex numbers; complex-conjugates not stored, real and imaginary components in separate arrays. */
+ CLFFT_REAL, /*!< An array of real numbers, with no corresponding imaginary components. */
+ ENDLAYOUT /*!< End marker: always the last enumerator, one past CLFFT_REAL. */
+} clfftLayout;
+
+/*! @brief This is the expected precision of each FFT.
+ */
+typedef enum clfftPrecision_
+{
+ CLFFT_SINGLE = 1, /*!< An array of complex numbers, with real and imaginary components as floats (default). */
+ CLFFT_DOUBLE, /*!< An array of complex numbers, with real and imaginary components as doubles. */
+ CLFFT_SINGLE_FAST, /*!< Faster implementation preferred. */
+ CLFFT_DOUBLE_FAST, /*!< Faster implementation preferred. */
+ ENDPRECISION /*!< End marker: always the last enumerator, one past CLFFT_DOUBLE_FAST. */
+} clfftPrecision;
+
+/*! @brief What is the expected direction of each FFT, time or the frequency domains */
+typedef enum clfftDirection_
+{
+ CLFFT_FORWARD = -1, /*!< FFT transform from the time to the frequency domain. */
+ CLFFT_BACKWARD = 1, /*!< FFT transform from the frequency to the time domain. */
+ CLFFT_MINUS = -1, /*!< Alias for the forward transform. */
+ CLFFT_PLUS = 1, /*!< Alias for the backward transform. */
+ ENDDIRECTION /*!< End marker: always the last enumerator; evaluates to 2 (CLFFT_PLUS + 1), not a count of directions. */
+} clfftDirection;
+
+/*! @brief Are the input buffers overwritten with the results */
+typedef enum clfftResultLocation_
+{
+ CLFFT_INPLACE = 1, /*!< The input and output buffers are the same (default). */
+ CLFFT_OUTOFPLACE, /*!< Separate input and output buffers. */
+ ENDPLACE /*!< End marker: always the last enumerator of clfftResultLocation, one past CLFFT_OUTOFPLACE. */
+} clfftResultLocation;
+
+/*! @brief This determines whether the result is returned in original order. It is valid only for
+ * dimensions greater than 1. */
+typedef enum clfftResultTransposed_ {
+ CLFFT_NOTRANSPOSE = 1, /*!< The results are returned in the original preserved order (default). */
+ CLFFT_TRANSPOSED, /*!< The result is transposed where transpose kernel is supported (possibly faster). */
+ ENDTRANSPOSED /*!< End marker: always the last enumerator, one past CLFFT_TRANSPOSED. */
+} clfftResultTransposed;
+
+/*! BitMasks to be used with clfftSetupData.debugFlags */
+#define CLFFT_DUMP_PROGRAMS 0x1
+
+/*! @brief Data structure that can be passed to clfftSetup() to control the behavior of the FFT runtime
+ * @details This structure contains values that can be initialized before instantiation of the FFT runtime
+ * with ::clfftSetup(). To initialize this structure, pass a pointer to a user struct to ::clfftInitSetupData( ),
+ * which will clear the structure and set the version member variables to the current values.
+ */
+struct clfftSetupData_
+{
+ cl_uint major; /*!< Major version number of the project; signifies major API changes. */
+ cl_uint minor; /*!< Minor version number of the project; minor API changes that could break backwards compatibility. */
+ cl_uint patch; /*!< Patch version number of the project; Always incrementing number, signifies change over time. */
+
+ /*! Bitwise flags that control the behavior of library debug logic. */
+ cl_ulong debugFlags; /*!< This should be set to zero, except when debugging the clfft library.
+ * debugFlags can be set to CLFFT_DUMP_PROGRAMS, in which case the dynamically generated OpenCL kernels will
+ * be written to text files in the current working directory. These files will have a *.cl suffix.
+ */
+};
+typedef struct clfftSetupData_ clfftSetupData;
+
+/*! @brief An abstract handle to the object that represents the state of the FFT(s) */
+typedef size_t clfftPlanHandle;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ /*! @brief Initialize a clfftSetupData struct for the client
+ * @details clfftSetupData is passed to clfftSetup to control behavior of the FFT runtime
+ * @param[out] setupData Every member is overwritten: version fields from the clfftVersion* macros, debugFlags = 0
+ * @return CLFFT_SUCCESS, or CLFFT_INVALID_VALUE when setupData is NULL (nothing is written in that case)
+ */
+ __inline clfftStatus clfftInitSetupData( clfftSetupData* setupData )
+ {
+ if( !setupData ) return CLFFT_INVALID_VALUE; /* report a NULL argument as an error code instead of dereferencing it */
+ setupData->major = clfftVersionMajor;
+ setupData->minor = clfftVersionMinor;
+ setupData->patch = clfftVersionPatch;
+ setupData->debugFlags = 0;
+ return CLFFT_SUCCESS;
+ }
+
+ /*! @brief Initialize internal FFT resources.
+ * @details AMD's FFT implementation caches kernels, programs and buffers for its internal use.
+ * @param[in] setupData Data structure that can be passed into the setup routine to control FFT generation behavior
+ * and debug functionality
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftSetup( const clfftSetupData* setupData );
+
+ /*! @brief Release all internal resources.
+ * @details Call when client is done with this FFT library, allowing the library to destroy all resources it has cached
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftTeardown( );
+
+ /*! @brief Query the FFT library for version information
+ * @details Return the major, minor and patch version numbers associated with this FFT library
+ * @param[out] major Major functionality change
+ * @param[out] minor Minor functionality change
+ * @param[out] patch Bug fixes, documentation changes, no new features introduced
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftGetVersion( cl_uint* major, cl_uint* minor, cl_uint* patch );
+
+ /*! @brief Create a plan object initialized entirely with default values.
+ * @details A plan is a repository of state for calculating FFT's. Allows the runtime to pre-calculate kernels, programs
+ * and buffers and associate them with buffers of specified dimensions.
+ * @param[out] plHandle Handle to the newly created plan
+ * @param[in] context Client is responsible for providing an OpenCL context for the plan
+ * @param[in] dim The dimensionality of the FFT transform; describes how many elements are in the array
+ * @param[in] clLengths An array of lengths, of size 'dim'. Each value describes the length of additional dimensions
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context context, const clfftDim dim,
+ const size_t* clLengths );
+
+ /*! @brief Create a copy of an existing plan.
+ * @details This API allows a client to create a new plan based upon an existing plan. This is a convenience function
+ * provided for quickly creating plans that are similar, but may differ slightly.
+ * @param[out] out_plHandle Handle to the newly created plan that is based on in_plHandle
+ * @param[in] new_context Client is responsible for providing a new context for the new plan
+ * @param[in] in_plHandle Handle to a plan to be copied, previously created
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftCopyPlan( clfftPlanHandle* out_plHandle, cl_context new_context, clfftPlanHandle in_plHandle );
+
+ /*! @brief Prepare the plan for execution.
+ * @details After all plan parameters are set, the client has the option of 'baking' the plan, which tells the runtime that
+ * no more changes to the plan's parameters are expected, and the OpenCL kernels should be compiled. This optional function
+ * allows the client application to perform this function when the application is being initialized instead of on the first
+ * execution.
+ * At this point, the clfft runtime will apply all implemented optimizations, possibly including
+ * running kernel experiments on the devices in the plan context.
+ * <p> Users should assume that this function will take a long time to execute. If a plan is not baked before being executed,
+ * users should assume that the first call to clfftEnqueueTransform will take a long time to execute.
+ * <p> If any significant parameter of a plan is changed after the plan is baked (by a subsequent call to one of
+ * the clfftSetPlan____ functions), that will not be considered an error. Instead, the plan will revert back to
+ * the unbaked state, discarding the benefits of the baking operation.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] numQueues Number of command queues in commQueueFFT; 0 is a valid value, in which case client does not want
+ * the runtime to run load experiments and only pre-calculate state information
+ * @param[in] commQueueFFT An array of cl_command_queues created by the client; the command queues must be a proper subset of
+ * the devices included in the plan context
+ * @param[in] pfn_notify A function pointer to a notification routine. The notification routine is a callback function that
+ * an application can register and which will be called when the program executable has been built (successfully or unsuccessfully).
+ * Currently, this parameter MUST be NULL or nullptr.
+ * @param[in] user_data Passed as an argument when pfn_notify is called.
+ * Currently, this parameter MUST be NULL or nullptr.
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_command_queue* commQueueFFT,
+ void (CL_CALLBACK *pfn_notify)(clfftPlanHandle plHandle, void *user_data), void* user_data );
+
+ /*! @brief Release the resources of a plan.
+ * @details A plan may include kernels, programs and buffers associated with it that consume memory. When a plan
+ * is not needed anymore, the client should release the plan.
+ * @param[in,out] plHandle Handle to a plan previously created
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftDestroyPlan( clfftPlanHandle* plHandle );
+
+ /*! @brief Retrieve the OpenCL context of a previously created plan.
+ * @details User should pass a reference to an cl_context variable, which will be changed to point to a
+ * context set in the specified plan.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] context Reference to user allocated cl_context, which will point to context set in plan
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftGetPlanContext( const clfftPlanHandle plHandle, cl_context* context );
+
+ /*! @brief Retrieve the floating point precision of the FFT data
+ * @details User should pass a reference to an clfftPrecision variable, which will be set to the
+ * precision of the FFT complex data in the plan.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] precision Reference to user clfftPrecision enum
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftGetPlanPrecision( const clfftPlanHandle plHandle, clfftPrecision* precision );
+
+ /*! @brief Set the floating point precision of the FFT data
+ * @details Set the plan property which will be the precision of the FFT complex data in the plan.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] precision Precision value to set on the plan
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftSetPlanPrecision( clfftPlanHandle plHandle, clfftPrecision precision );
+
+ /*! @brief Retrieve the scaling factor that should be applied to the FFT data
+ * @details User should pass a reference to an cl_float variable, which will be set to the
+ * floating point scaling factor that will be multiplied across the FFT data.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dir Which direction does the scaling factor apply to
+ * @param[out] scale Reference to user cl_float variable
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftGetPlanScale( const clfftPlanHandle plHandle, clfftDirection dir, cl_float* scale );
+
+ /*! @brief Set the scaling factor that should be applied to the FFT data
+ * @details Set the plan property which will be the floating point scaling factor that will be
+ * multiplied across the FFT data.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dir Which direction does the scaling factor apply to
+ * @param[in] scale Scaling factor to set on the plan for the given direction
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftSetPlanScale( clfftPlanHandle plHandle, clfftDirection dir, cl_float scale );
+
+ /*! @brief Retrieve the number of discrete arrays that this plan can handle concurrently
+ * @details User should pass a reference to a size_t variable, which will be set to the
+ * number of discrete arrays (1D or 2D) that will be batched together for this plan
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] batchSize How many discrete number of FFT's are to be performed
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftGetPlanBatchSize( const clfftPlanHandle plHandle, size_t* batchSize );
+
+ /*! @brief Set the number of discrete arrays that this plan can handle concurrently
+ * @details Set the plan property which will be set to the number of discrete arrays (1D or 2D)
+ * that will be batched together for this plan
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] batchSize How many discrete number of FFT's are to be performed
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftSetPlanBatchSize( clfftPlanHandle plHandle, size_t batchSize );
+
+ /*! @brief Retrieve the dimensionality of FFT's to be transformed in the plan
+ * @details Queries a plan object and retrieves the dimensionality that the plan is set for. A size is returned to
+ * help the client allocate the proper storage to hold the dimensions in a further call to clfftGetPlanLength
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] dim The dimensionality of the FFT's to be transformed
+ * @param[out] size Value used to allocate an array to hold the FFT dimensions.
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftGetPlanDim( const clfftPlanHandle plHandle, clfftDim* dim, cl_uint* size );
+
+ /*! @brief Set the dimensionality of FFT's to be transformed by the plan
+ * @details Set the dimensionality of FFT's to be transformed by the plan
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimensionality of the FFT's to be transformed
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftSetPlanDim( clfftPlanHandle plHandle, const clfftDim dim );
+
+ /*! @brief Retrieve the length of each dimension of the FFT
+ * @details User should pass a reference to a size_t array, which will be set to the
+ * length of each discrete dimension of the FFT
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimension of the length parameters; describes how many elements are in the array
+ * @param[out] clLengths An array of lengths, of size 'dim'. Each array value describes the length of each dimension
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftGetPlanLength( const clfftPlanHandle plHandle, const clfftDim dim, size_t* clLengths );
+
+ /*! @brief Set the length of each dimension of the FFT
+ * @details Set the plan property which will be the length of each discrete dimension of the FFT
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimension of the length parameters; describes how many elements are in the array
+ * @param[in] clLengths An array of lengths, of size 'dim'. Each value describes the length of additional dimensions
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftSetPlanLength( clfftPlanHandle plHandle, const clfftDim dim, const size_t* clLengths );
+
+ /*! @brief Retrieve the distance between consecutive elements for input buffers in a dimension.
+ * @details Depending on how the dimension is set in the plan (for 2D or 3D FFT's), strideY or strideZ can be safely
+ * ignored
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimension of the stride parameters; describes how many elements are in the array
+ * @param[out] clStrides An array of strides, of size 'dim'.
+ */
+ CLFFTAPI clfftStatus clfftGetPlanInStride( const clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides );
+
+ /*! @brief Set the distance between consecutive elements for input buffers in a dimension.
+ * @details Set the plan properties which will be the distance between elements in a given dimension
+ * (units are in terms of clfftPrecision)
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimension of the stride parameters; describes how many elements are in the array
+ * @param[in] clStrides An array of strides, of size 'dim'. Usually strideX=1 so that successive elements in the first dimension are stored contiguously.
+ * Typically strideY=LenX, strideZ=LenX*LenY such that successive elements in the second and third dimensions are stored in packed format.
+ * See @ref DistanceStridesandPitches for details.
+ */
+ CLFFTAPI clfftStatus clfftSetPlanInStride( clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides );
+
+ /*! @brief Retrieve the distance between consecutive elements for output buffers in a dimension.
+ * @details Depending on how the dimension is set in the plan (for 2D or 3D FFT's), strideY or strideZ can be safely
+ * ignored
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimension of the stride parameters; describes how many elements are in the array
+ * @param[out] clStrides An array of strides, of size 'dim'.
+ */
+ CLFFTAPI clfftStatus clfftGetPlanOutStride( const clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides );
+
+ /*! @brief Set the distance between consecutive elements for output buffers in a dimension.
+ * @details Set the plan properties which will be the distance between elements in a given dimension
+ * (units are in terms of clfftPrecision)
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dim The dimension of the stride parameters; describes how many elements are in the array
+ * @param[in] clStrides An array of strides, of size 'dim'. Usually strideX=1 so that successive elements in the first dimension are stored contiguously.
+ * Typically strideY=LenX, strideZ=LenX*LenY such that successive elements in the second and third dimensions are stored in packed format.
+ * @sa clfftSetPlanInStride
+ */
+ CLFFTAPI clfftStatus clfftSetPlanOutStride( clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides );
+
+ /*! @brief Retrieve the distance between Array objects
+ * @details Pitch is the distance between each discrete array object in an FFT array. This is only used
+ * for 'array' dimensions in clfftDim; see clfftSetPlanDim (units are in terms of clfftPrecision)
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] iDist The distance between the beginning elements of the discrete array objects in memory on input.
+ * For contiguous arrays in memory, iDist=(strideX*strideY*strideZ)
+ * @param[out] oDist The distance between the beginning elements of the discrete array objects in memory on output.
+ * For contiguous arrays in memory, oDist=(strideX*strideY*strideZ)
+ */
+ CLFFTAPI clfftStatus clfftGetPlanDistance( const clfftPlanHandle plHandle, size_t* iDist, size_t* oDist );
+
+ /*! @brief Set the distance between Array objects
+ * @details Pitch is the distance between each discrete array object in an FFT array. This is only used
+ * for 'array' dimensions in clfftDim; see clfftSetPlanDim (units are in terms of clfftPrecision)
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] iDist The distance between the beginning elements of the discrete array objects in memory on input.
+ * For contiguous arrays in memory, iDist=(strideX*strideY*strideZ)
+ * @param[in] oDist The distance between the beginning elements of the discrete array objects in memory on output.
+ * For contiguous arrays in memory, oDist=(strideX*strideY*strideZ)
+ */
+ CLFFTAPI clfftStatus clfftSetPlanDistance( clfftPlanHandle plHandle, size_t iDist, size_t oDist );
+
+ /*! @brief Retrieve the expected layout of the input and output buffers
+ * @details Output buffers can be filled with either hermitian or complex numbers. Complex numbers can be stored
+ * in various layouts; this informs the FFT engine what layout to produce on output
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] iLayout Indicates how the input buffers are laid out in memory
+ * @param[out] oLayout Indicates how the output buffers are laid out in memory
+ */
+ CLFFTAPI clfftStatus clfftGetLayout( const clfftPlanHandle plHandle, clfftLayout* iLayout, clfftLayout* oLayout );
+
+ /*! @brief Set the expected layout of the input and output buffers
+ * @details Output buffers can be filled with either hermitian or complex numbers. Complex numbers can be stored
+ * in various layouts; this informs the FFT engine what layout to produce on output
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] iLayout Indicates how the input buffers are laid out in memory
+ * @param[in] oLayout Indicates how the output buffers are laid out in memory
+ */
+ CLFFTAPI clfftStatus clfftSetLayout( clfftPlanHandle plHandle, clfftLayout iLayout, clfftLayout oLayout );
+
+ /*! @brief Retrieve whether the input buffers are going to be overwritten with results
+ * @details If the setting is to do an in-place transform, the input buffers are overwritten with the results of the
+ * transform. If the setting is for out-of-place transforms, the engine knows to look for separate output buffers
+ * on the Enqueue call.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] placeness Tells the FFT engine to clobber the input buffers or to expect output buffers for results
+ */
+ CLFFTAPI clfftStatus clfftGetResultLocation( const clfftPlanHandle plHandle, clfftResultLocation* placeness );
+
+ /*! @brief Set whether the input buffers are going to be overwritten with results
+ * @details If the setting is to do an in-place transform, the input buffers are overwritten with the results of the
+ * transform. If the setting is for out-of-place transforms, the engine knows to look for separate output buffers
+ * on the Enqueue call.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] placeness Tells the FFT engine to clobber the input buffers or to expect output buffers for results
+ */
+ CLFFTAPI clfftStatus clfftSetResultLocation( clfftPlanHandle plHandle, clfftResultLocation placeness );
+
+ /*! @brief Retrieve the final transpose setting of a multi-dimensional FFT
+ * @details A multi-dimensional FFT typically transposes the data several times during calculation. If the client
+ * does not care about the final transpose to put data back in proper dimension, the final transpose can be skipped
+ * for possible speed improvements
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] transposed Parameter specifies whether the final transpose can be skipped
+ */
+ CLFFTAPI clfftStatus clfftGetPlanTransposeResult( const clfftPlanHandle plHandle, clfftResultTransposed * transposed );
+
+ /*! @brief Set the final transpose setting of a multi-dimensional FFT
+ * @details A multi-dimensional FFT typically transposes the data several times during calculation. If the client
+ * does not care about the final transpose to put data back in proper dimension, the final transpose can be skipped
+ * for possible speed improvements
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] transposed Parameter specifies whether the final transpose can be skipped
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftSetPlanTransposeResult( clfftPlanHandle plHandle, clfftResultTransposed transposed );
+
+
+ /*! @brief Get buffer size (in bytes), which may be needed internally for an intermediate buffer
+ * @details Very large FFT transforms may need multiple passes, and the operation would need a temporary buffer to hold
+ * intermediate results. This function is only valid after the plan is baked, otherwise an invalid operation error
+ * is returned. If buffersize returns as 0, the runtime needs no temporary buffer.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[out] buffersize Size in bytes for intermediate buffer
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftGetTmpBufSize( const clfftPlanHandle plHandle, size_t* buffersize );
+
+ /*! @brief Enqueue an FFT transform operation, and return immediately (non-blocking)
+ * @details This transform API is the function that actually computes the FFT transform. It is non-blocking as it
+ * only enqueues the OpenCL kernels for execution. The synchronization step has to be managed by the user.
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] dir Forwards or backwards transform
+ * @param[in] numQueuesAndEvents Number of command queues in commQueues; number of expected events to be returned in outEvents
+ * @param[in] commQueues An array of cl_command_queues created by the client; the command queues must be a proper subset of
+ * the devices included in the plan context
+ * @param[in] numWaitEvents Specify the number of elements in the waitEvents array
+ * @param[in] waitEvents Events that this transform should wait to complete before executing on the device
+ * @param[out] outEvents The runtime fills this array with events corresponding 1 to 1 with the input command queues passed
+ * in commQueues. This parameter can be NULL or nullptr, in which case client is not interested in receiving notifications
+ * when transforms are finished, otherwise if not NULL the client is responsible for allocating this array, with at least
+ * as many elements as specified in numQueuesAndEvents.
+ * @param[in] inputBuffers An array of cl_mem objects that contain data for processing by the FFT runtime. If the transform
+ * is in place, the FFT results will overwrite the input buffers
+ * @param[out] outputBuffers An array of cl_mem objects that will store the results of out of place transforms. If the transform
+ * is in place, this parameter may be NULL or nullptr; in that case it is completely ignored
+ * @param[in] tmpBuffer A cl_mem object that is reserved as a temporary buffer for FFT processing. If tmpBuffer is NULL or nullptr,
+ * and the runtime needs temporary storage, an internal temporary buffer will be created on the fly managed by the runtime.
+ * @return Enum describing error condition; superset of OpenCL error codes
+ */
+ CLFFTAPI clfftStatus clfftEnqueueTransform(
+ clfftPlanHandle plHandle,
+ clfftDirection dir,
+ cl_uint numQueuesAndEvents,
+ cl_command_queue* commQueues,
+ cl_uint numWaitEvents,
+ const cl_event* waitEvents,
+ cl_event* outEvents,
+ cl_mem* inputBuffers,
+ cl_mem* outputBuffers,
+ cl_mem tmpBuffer
+ );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/convenienceFunctions.h b/src/include/convenienceFunctions.h
new file mode 100644
index 00000000..e32bd3fc
--- /dev/null
+++ b/src/include/convenienceFunctions.h
@@ -0,0 +1,28 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*****************************************************/
+// Returns the leading bytes of the object representation of 'a' packed into an unsigned int.
+// At most sizeof( unsigned int ) bytes are taken; when T is smaller, the remaining bytes of the
+// result are zero.
+// The bytes are copied through unsigned char, which may alias any object. Dereferencing a casted
+// unsigned int* instead breaks the strict aliasing rule and reads past 'a' when T is smaller
+// than unsigned int.
+template< typename T >
+unsigned int float_as_hex( T a ) {
+    unsigned int bits = 0;
+    const unsigned char* src = reinterpret_cast< const unsigned char* >( &a );
+    unsigned char* dst = reinterpret_cast< unsigned char* >( &bits );
+    const unsigned int count = static_cast< unsigned int >( sizeof( T ) < sizeof( bits ) ? sizeof( T ) : sizeof( bits ) );
+
+    for( unsigned int i = 0; i < count; ++i )
+        dst[ i ] = src[ i ];
+
+    return bits;
+}
+
+/*****************************************************/
+// Builds a value of type T whose leading bytes are the object representation of 'a'.
+// At most sizeof( unsigned int ) bytes are written; when T is larger, the remaining bytes keep the
+// value-initialised (zero for arithmetic types) contents.
+// The bytes are copied through unsigned char, which may alias any object. Dereferencing a casted
+// T* instead breaks the strict aliasing rule and reads past 'a' when T is larger than unsigned int.
+template< typename T >
+T hex_as_float( unsigned int a ) {
+    T value = T( );
+    const unsigned char* src = reinterpret_cast< const unsigned char* >( &a );
+    unsigned char* dst = reinterpret_cast< unsigned char* >( &value );
+    const unsigned int count = static_cast< unsigned int >( sizeof( T ) < sizeof( a ) ? sizeof( T ) : sizeof( a ) );
+
+    for( unsigned int i = 0; i < count; ++i )
+        dst[ i ] = src[ i ];
+
+    return value;
+}
\ No newline at end of file
diff --git a/src/include/sharedLibrary.h b/src/include/sharedLibrary.h
new file mode 100644
index 00000000..9f34b3a1
--- /dev/null
+++ b/src/include/sharedLibrary.h
@@ -0,0 +1,90 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#ifndef _SHAREDLIBRARY_H_
+#define _SHAREDLIBRARY_H_
+#include
+
+// _WIN32 is defined for both 32 & 64 bit environments
+#if defined( _WIN32 )
+ #define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers
+ // Windows Header Files:
+ #include
+#else
+ #include
+#endif
+
+// Loads the shared library 'libraryName' and returns its module handle, or NULL when it cannot be loaded.
+// linuxPrefix - text prepended to the file name on non-Windows platforms; not used on Windows
+// libraryName - library name without extension; ".dll" (Windows) or ".so" (elsewhere) is appended
+// quiet       - when false, a load failure is reported on std::cerr
+inline void* LoadSharedLibrary( std::string linuxPrefix, std::string libraryName, bool quiet )
+{
+#if defined( _WIN32 )
+    (void)linuxPrefix;
+    libraryName += ".dll";
+
+    // HMODULE is actually the load address; function returns NULL if it cannot find the shared library.
+    // The last argument is a DWORD set of flags, so pass 0 rather than NULL
+    HMODULE fileHandle = ::LoadLibraryExA( libraryName.c_str( ), NULL, 0 );
+    if( !quiet && !fileHandle )
+    {
+        std::cerr << "Could not load " << libraryName << " (error " << ::GetLastError( ) << ")" << std::endl;
+    }
+#else
+    // dlopen takes a narrow string, so the name is built with std::string regardless of _UNICODE
+    const std::string linuxName = linuxPrefix + libraryName + ".so";
+    void* fileHandle = ::dlopen( linuxName.c_str( ), RTLD_NOW );
+    if( !quiet && !fileHandle )
+    {
+        // dlerror may return NULL; never stream a NULL char pointer
+        const char* reason = ::dlerror( );
+        std::cerr << ( reason ? reason : "dlopen failed" ) << std::endl;
+    }
+#endif
+
+    return fileHandle;
+}
+
+// Unloads the module referenced by libHandle and always resets libHandle to NULL afterwards.
+// Returns nonzero when the module was unloaded; returns zero when unloading failed or when
+// libHandle was already NULL.
+inline int FreeSharedLibrary( void*& libHandle )
+{
+    int freed = 0;
+
+    if( libHandle != 0 )
+    {
+#if defined( _WIN32 )
+        freed = ::FreeLibrary( reinterpret_cast< HMODULE >( libHandle ) );
+#else
+        // dlclose reports success with 0, which is the opposite of this function's convention
+        freed = ( ::dlclose( libHandle ) == 0 );
+#endif
+    }
+
+    libHandle = NULL;
+    return freed;
+}
+
+// This takes a shared module handle returned from LoadSharedLibrary, and a text string of a symbol
+// to load from the module, and returns a pointer to that symbol. If the symbol is not found, NULL
+// is returned. If the module handle is NULL, NULL is returned.
+inline void* LoadFunctionAddr( void* libHandle, std::string funcName )
+{
+    if( libHandle == NULL )
+        return NULL;
+
+#if defined( _WIN32 )
+    HMODULE fileHandle = reinterpret_cast< HMODULE >( libHandle );
+
+    // GetProcAddress returns FARPROC, a function pointer type; C++ has no implicit conversion from a
+    // function pointer to void*, so the conversion has to be spelled out
+    void* pFunc = reinterpret_cast< void* >( ::GetProcAddress( fileHandle, funcName.c_str( ) ) );
+#else
+    void* pFunc = ::dlsym( libHandle, funcName.c_str( ) );
+#endif
+
+    return pFunc;
+}
+
+#endif // _SHAREDLIBRARY_H_
diff --git a/src/include/stdafx.h b/src/include/stdafx.h
new file mode 100644
index 00000000..5a8077bf
--- /dev/null
+++ b/src/include/stdafx.h
@@ -0,0 +1,49 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// stdafx.h : include file for standard system include files,
+// or project specific include files that are used frequently, but
+// are changed infrequently
+//
+
+#pragma once
+
+#define _CRT_SECURE_NO_WARNINGS
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// _WIN32 is defined for both 32 & 64 bit environments
+#if defined( _WIN32 )
+ #include
+ #include "targetver.h"
+
+ #define NOMINMAX
+ #define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers
+ // Windows Header Files:
+ #include
+#endif
diff --git a/src/include/targetver.h b/src/include/targetver.h
new file mode 100644
index 00000000..7c05692e
--- /dev/null
+++ b/src/include/targetver.h
@@ -0,0 +1,25 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+
+// Including SDKDDKVer.h defines the highest available Windows platform.
+
+// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
+// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
+
+#include
diff --git a/src/include/unicode.compatibility.h b/src/include/unicode.compatibility.h
new file mode 100644
index 00000000..56a365f9
--- /dev/null
+++ b/src/include/unicode.compatibility.h
@@ -0,0 +1,59 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( amd_unicode_h )
+#define amd_unicode_h
+
+// Typedefs to support unicode and ANSI compilation.
+// When _UNICODE is defined the t* names map to the wide character standard types and streams,
+// otherwise to their narrow counterparts.
+// NOTE(review): this header uses std:: string and stream types without including the standard
+// headers itself; presumably the includer (e.g. a precompiled header) provides them - confirm.
+#if defined( _UNICODE )
+ typedef std::wstring tstring;
+ typedef std::wstringstream tstringstream;
+ typedef std::wifstream tifstream;
+ typedef std::wofstream tofstream;
+ typedef std::wfstream tfstream;
+ // 'static' gives every translation unit that includes this header its own reference object
+ static std::wostream& tout = std::wcout;
+ static std::wostream& terr = std::wcerr;
+#else
+ typedef std::string tstring;
+ typedef std::stringstream tstringstream;
+ typedef std::ifstream tifstream;
+ typedef std::ofstream tofstream;
+ typedef std::fstream tfstream;
+ // 'static' gives every translation unit that includes this header its own reference object
+ static std::ostream& tout = std::cout;
+ static std::ostream& terr = std::cerr;
+#endif
+
+// These macros help linux cope with the conventions of windows tchar.h file.
+// On Windows the platform headers are included; elsewhere, and only for GCC compatible compilers,
+// TCHAR, _TCHAR, _tmain and _T are emulated here.
+#if defined( _WIN32 )
+ #include
+ #include
+#else
+ #if defined( __GNUC__ )
+ // Character types are always narrow here, so that _tmain can map straight onto main
+ typedef char TCHAR;
+ typedef char _TCHAR;
+ #define _tmain main
+
+ // NOTE(review): this tests UNICODE while the typedefs earlier in this header test _UNICODE, and a
+ // wide literal produced by _T does not match the narrow TCHAR above - confirm this is intended
+ #if defined( UNICODE )
+ #define _T(x) L ## x
+ #else
+ #define _T(x) x
+ #endif
+ #endif
+#endif
+
+#endif
diff --git a/src/include/version.h.in b/src/include/version.h.in
new file mode 100644
index 00000000..343cd7e2
--- /dev/null
+++ b/src/include/version.h.in
@@ -0,0 +1,22 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/* The configured version and settings for clFFT.
+ * The @...@ tokens below are placeholders in this .h.in template; presumably the build system
+ * substitutes the real version numbers when it generates version.h - confirm against the build scripts.
+ */
+#define clfftVersionMajor @CLFFT_VERSION_MAJOR@
+#define clfftVersionMinor @CLFFT_VERSION_MINOR@
+#define clfftVersionPatch @CLFFT_VERSION_PATCH@
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
new file mode 100644
index 00000000..ea7637bf
--- /dev/null
+++ b/src/library/CMakeLists.txt
@@ -0,0 +1,102 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+# Sources that are compiled on every platform
+set( clFFT.Source
+    transform.cpp
+    accessors.cpp
+    plan.cpp
+    repo.cpp
+    generator.stockham.cpp
+    generator.transpose.cpp
+    generator.copy.cpp
+    lifetime.cpp
+    stdafx.cpp
+    )
+
+# dllmain is only needed for Windows builds
+if( MSVC )
+    list( APPEND clFFT.Source dllmain.cpp )
+endif( )
+
+# Private and public headers that belong to the library
+set( clFFT.Headers
+    private.h
+    repo.h
+    plan.h
+    lock.h
+    mainpage.h
+    generator.h
+    generator.stockham.h
+    generator.transpose.h
+    ../include/stdafx.h
+    ../include/unicode.compatibility.h
+    ../include/targetver.h
+    ../include/clAmdFft.h
+    ../include/clFFT.h
+    )
+
+# Everything handed to add_library: sources first, then headers
+set( clFFT.Files ${clFFT.Source} ${clFFT.Headers} )
+
+# For a rainy day, add pre-compiled header support
+#if( MSVC )
+# if (USE_MSVC_PCH)
+
+# set_source_files_properties(LungAnalysisPCH.cxx
+# PROPERTIES
+# COMPILE_FLAGS "/YcLungAnalysisPCH.h"
+# )
+# foreach( src_file ${UPMC_LA_SRCS} )
+# set_source_files_properties(
+# ${src_file}
+# PROPERTIES
+# COMPILE_FLAGS "/YuLungAnalysisPCH.h"
+# )
+# endforeach( src_file ${UPMC_LA_SRCS} )
+
+# list(APPEND UPMC_LA_SRCS LungAnalysisPCH.cxx)
+# list(APPEND UPMC_LA_HDRS LungAnalysisPCH.h)
+
+# endif(USE_MSVC_PCH)
+#endif (MSVC)
+
+# add_definitions( ${Boost_LIB_DIAGNOSTIC_DEFINITIONS} )
+# Define CLFFT_EXPORTS while compiling the library sources
+add_definitions( "/DCLFFT_EXPORTS" )
+
+# Include standard OpenCL headers, the build tree include directory and the public clFFT headers
+include_directories( ${OPENCL_INCLUDE_DIRS} ${PROJECT_BINARY_DIR}/include ../include )
+
+# Build clFFT as a shared library from the source and header lists, linked against OpenCL
+add_library( clFFT SHARED ${clFFT.Files} )
+target_link_libraries( clFFT ${OPENCL_LIBRARIES} )
+
+# Stamp the library with the project version and place runtime artifacts in the staging directory
+set_target_properties( clFFT PROPERTIES VERSION ${CLFFT_VERSION} )
+set_target_properties( clFFT PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
+
+if( UNIX )
+ # Right now, linux has problems compiling dynamic_cast, but the flag below doesn't help
+ # set_target_properties( clFFT PROPERTIES COMPILE_FLAGS "-frtti" )
+endif( )
+
+# CPack configuration; include the library into the package.
+# BUILD64 selects the bin64/lib64 destinations, anything else the bin32/lib32 destinations.
+if( BUILD64 )
+    set( clFFT.InstallBits 64 )
+else( )
+    set( clFFT.InstallBits 32 )
+endif( )
+
+install( TARGETS clFFT
+    RUNTIME DESTINATION bin${clFFT.InstallBits}
+    LIBRARY DESTINATION lib${clFFT.InstallBits}
+    ARCHIVE DESTINATION lib${clFFT.InstallBits}/import
+    )
+
+# The helper variable is only needed for the install rule above
+unset( clFFT.InstallBits )
diff --git a/src/library/ReadMe.txt b/src/library/ReadMe.txt
new file mode 100644
index 00000000..72470754
--- /dev/null
+++ b/src/library/ReadMe.txt
@@ -0,0 +1,56 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+========================================================================
+ CONSOLE APPLICATION : AMD.clFFT Project Overview
+========================================================================
+
+AppWizard has created this AMD.clFFT application for you.
+
+This file contains a summary of what you will find in each of the files that
+make up your AMD.clFFT application.
+
+
+AMD.clFFT.vcxproj
+ This is the main project file for VC++ projects generated using an Application Wizard.
+ It contains information about the version of Visual C++ that generated the file, and
+ information about the platforms, configurations, and project features selected with the
+ Application Wizard.
+
+AMD.clFFT.vcxproj.filters
+ This is the filters file for VC++ projects generated using an Application Wizard.
+ It contains information about the association between the files in your project
+ and the filters. This association is used in the IDE to show grouping of files with
+ similar extensions under a specific node (e.g., ".cpp" files are associated with the
+ "Source Files" filter).
+
+AMD.clFFT.cpp
+ This is the main application source file.
+
+/////////////////////////////////////////////////////////////////////////////
+Other standard files:
+
+StdAfx.h, StdAfx.cpp
+ These files are used to build a precompiled header (PCH) file
+ named AMD.clFFT.pch and a precompiled types file named StdAfx.obj.
+
+/////////////////////////////////////////////////////////////////////////////
+Other notes:
+
+AppWizard uses "TODO:" comments to indicate parts of the source code you
+should add to or customize.
+
+/////////////////////////////////////////////////////////////////////////////
diff --git a/src/library/accessors.cpp b/src/library/accessors.cpp
new file mode 100644
index 00000000..8d6ce65f
--- /dev/null
+++ b/src/library/accessors.cpp
@@ -0,0 +1,826 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// clfft.accessors.cpp : Defines all the getters/setters for the Plan
+//
+
+#include "stdafx.h"
+#include "private.h"
+#include "repo.h"
+
+using std::vector;
+
+// Retrieves the batch size stored in the plan identified by plHandle.
+// plHandle  - handle used to look the plan up in the FFTRepo
+// batchsize - [out] receives the plan's batchsize; must not be NULL
+// Returns CLFFT_SUCCESS, or CLFFT_INVALID_HOST_PTR when batchsize is NULL. A failed plan lookup
+// is handled by the OPENCL_V macro (defined elsewhere).
+clfftStatus clfftGetPlanBatchSize( const clfftPlanHandle plHandle, size_t* batchsize )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftGetPlanBatchSize" ) );
+
+    // Reject a NULL output pointer the same way clfftGetPlanLength does, instead of dereferencing it
+    if( batchsize == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+
+    *batchsize = fftPlan->batchsize;
+    return CLFFT_SUCCESS;
+}
+
+// Stores a new batch size in the plan identified by plHandle and marks the plan as not baked.
+// Returns CLFFT_SUCCESS once the plan has been updated; a failed plan lookup is handled by the
+// OPENCL_V macro (defined elsewhere).
+clfftStatus clfftSetPlanBatchSize( clfftPlanHandle plHandle, size_t batchsize )
+{
+    FFTPlan* plan = NULL;
+    lockRAII* lock = NULL;
+    FFTRepo& repo = FFTRepo::getInstance( );
+
+    OPENCL_V( repo.getPlan( plHandle, plan, lock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock guard( *lock, _T( "clfftSetPlanBatchSize" ) );
+
+    plan->batchsize = batchsize;
+
+    // The plan state changed, so pre-calculated contents can no longer be trusted
+    plan->baked = false;
+
+    return CLFFT_SUCCESS;
+}
+
+// Retrieves the OpenCL context stored in the plan identified by plHandle.
+// plHandle - handle used to look the plan up in the FFTRepo
+// context  - [out] receives the plan's context; must not be NULL
+// Returns CLFFT_SUCCESS, or CLFFT_INVALID_HOST_PTR when context is NULL. A failed plan lookup
+// is handled by the OPENCL_V macro (defined elsewhere).
+clfftStatus clfftGetPlanContext( const clfftPlanHandle plHandle, cl_context* context )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftGetPlanContext" ) );
+
+    // Reject a NULL output pointer the same way clfftGetPlanLength does, instead of dereferencing it
+    if( context == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+
+    *context = fftPlan->context;
+    return CLFFT_SUCCESS;
+}
+
+// Retrieves the precision stored in the plan identified by plHandle.
+// plHandle  - handle used to look the plan up in the FFTRepo
+// precision - [out] receives the plan's precision; must not be NULL
+// Returns CLFFT_SUCCESS, or CLFFT_INVALID_HOST_PTR when precision is NULL. A failed plan lookup
+// is handled by the OPENCL_V macro (defined elsewhere).
+clfftStatus clfftGetPlanPrecision( const clfftPlanHandle plHandle, clfftPrecision* precision )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftGetPlanPrecision" ) );
+
+    // Reject a NULL output pointer the same way clfftGetPlanLength does, instead of dereferencing it
+    if( precision == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+
+    *precision = fftPlan->precision;
+
+    return CLFFT_SUCCESS;
+}
+
+// This is a helper function to query a device for its caps and check whether a certain user supplied cap is present.
+// The cap has to appear as a complete name in the space separated CL_DEVICE_EXTENSIONS string.
+// Returns CLFFT_SUCCESS if the cap is present on every device in the devices vector, and
+// CLFFT_DEVICE_NO_DOUBLE as soon as one device does not list it. Errors from ::clGetDeviceInfo are
+// handled by the OPENCL_V macro (defined elsewhere).
+clfftStatus checkDevExt( std::string cap, std::vector< cl_device_id >& devices )
+{
+    for( size_t d = 0; d < devices.size( ); ++d )
+    {
+        size_t deviceExtSize = 0;
+        OPENCL_V( ::clGetDeviceInfo( devices[ d ], CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
+            "Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
+
+        // One spare element keeps the buffer non-empty (so &buffer[ 0 ] is valid) and NUL terminated
+        std::vector< char > szDeviceExt( deviceExtSize + 1, '\0' );
+        OPENCL_V( ::clGetDeviceInfo( devices[ d ], CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
+            "Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
+
+        const std::string strDeviceExt( &szDeviceExt[ 0 ] );
+
+        // Only accept matches that span a whole extension name, so that a cap which is merely a
+        // substring of a longer name does not count. An empty cap keeps matching trivially.
+        bool capPresent = cap.empty( );
+        for( std::string::size_type pos = strDeviceExt.find( cap );
+            !capPresent && pos != std::string::npos;
+            pos = strDeviceExt.find( cap, pos + 1 ) )
+        {
+            const std::string::size_type end = pos + cap.size( );
+            const bool startsName = ( pos == 0 ) || ( strDeviceExt[ pos - 1 ] == ' ' );
+            const bool endsName = ( end == strDeviceExt.size( ) ) || ( strDeviceExt[ end ] == ' ' );
+            capPresent = startsName && endsName;
+        }
+
+        if( !capPresent )
+            return CLFFT_DEVICE_NO_DOUBLE;
+    }
+
+    return CLFFT_SUCCESS;
+}
+// Sets the precision of the plan identified by plHandle and marks the plan as not baked.
+// Returns CLFFT_INVALID_ARG_VALUE for values at or beyond ENDPRECISION, CLFFT_NOTIMPLEMENTED for the
+// *_FAST precisions, the status of the cl_khr_fp64 check when double precision is requested but
+// neither fp64 extension check succeeds, and CLFFT_SUCCESS otherwise.
+clfftStatus clfftSetPlanPrecision( clfftPlanHandle plHandle, clfftPrecision precision )
+{
+    FFTPlan* plan = NULL;
+    lockRAII* lock = NULL;
+    FFTRepo& repo = FFTRepo::getInstance( );
+
+    OPENCL_V( repo.getPlan( plHandle, plan, lock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock guard( *lock, _T( "clfftSetPlanPrecision" ) );
+
+    if( precision >= ENDPRECISION )
+        return CLFFT_INVALID_ARG_VALUE;
+
+    // We do not support CLFFT_*_FAST currently
+    switch( precision )
+    {
+    case CLFFT_SINGLE_FAST:
+    case CLFFT_DOUBLE_FAST:
+        return CLFFT_NOTIMPLEMENTED;
+    default:
+        break;
+    }
+
+    // Double precision needs device support: try AMD's extension first, then fall back to the
+    // Khronos extension and report its status if that is missing as well
+    const bool wantsDouble = ( precision == CLFFT_DOUBLE ) || ( precision == CLFFT_DOUBLE_FAST );
+    if( wantsDouble && checkDevExt( "cl_amd_fp64", plan->devices ) != CLFFT_SUCCESS )
+    {
+        const clfftStatus khrStatus = checkDevExt( "cl_khr_fp64", plan->devices );
+        if( khrStatus != CLFFT_SUCCESS )
+            return khrStatus;
+    }
+
+    // The plan state changed, so pre-calculated contents can no longer be trusted
+    plan->baked = false;
+    plan->precision = precision;
+
+    return CLFFT_SUCCESS;
+}
+
+// Retrieves the scale factor the plan identified by plHandle applies for the given direction.
+// plHandle - handle used to look the plan up in the FFTRepo
+// dir      - CLFFT_FORWARD / CLFFT_MINUS select the forward scale, any other valid value the backward scale
+// scale    - [out] receives the selected scale converted to cl_float; must not be NULL
+// Returns CLFFT_SUCCESS, CLFFT_INVALID_HOST_PTR when scale is NULL, or CLFFT_INVALID_ARG_VALUE when
+// dir is at or beyond ENDDIRECTION. A failed plan lookup is handled by the OPENCL_V macro (defined elsewhere).
+clfftStatus clfftGetPlanScale( const clfftPlanHandle plHandle, clfftDirection dir, cl_float* scale )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftGetPlanScale" ) );
+
+    // Reject a NULL output pointer the same way clfftGetPlanLength does, instead of dereferencing it
+    if( scale == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+
+    if( dir >= ENDDIRECTION )
+        return CLFFT_INVALID_ARG_VALUE;
+
+    if( dir == CLFFT_FORWARD || dir == CLFFT_MINUS )
+        *scale = (cl_float)(fftPlan->forwardScale);
+    else
+        *scale = (cl_float)(fftPlan->backwardScale);
+
+    return CLFFT_SUCCESS;
+}
+
+// Stores a scale factor for one transform direction in the plan identified by plHandle and marks
+// the plan as not baked. CLFFT_FORWARD / CLFFT_MINUS update the forward scale, any other valid
+// direction updates the backward scale.
+// Returns CLFFT_INVALID_ARG_VALUE when dir is at or beyond ENDDIRECTION, CLFFT_SUCCESS otherwise.
+clfftStatus clfftSetPlanScale( clfftPlanHandle plHandle, clfftDirection dir, cl_float scale )
+{
+    FFTPlan* plan = NULL;
+    lockRAII* lock = NULL;
+    FFTRepo& repo = FFTRepo::getInstance( );
+
+    OPENCL_V( repo.getPlan( plHandle, plan, lock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock guard( *lock, _T( "clfftSetPlanScale" ) );
+
+    if( dir >= ENDDIRECTION )
+        return CLFFT_INVALID_ARG_VALUE;
+
+    // The plan state is about to change, so pre-calculated contents can no longer be trusted
+    plan->baked = false;
+
+    if( dir != CLFFT_FORWARD && dir != CLFFT_MINUS )
+        plan->backwardScale = scale;
+    else
+        plan->forwardScale = scale;
+
+    return CLFFT_SUCCESS;
+}
+
+// Retrieves the dimension of the plan identified by plHandle.
+// plHandle - handle used to look the plan up in the FFTRepo
+// dim      - [out] receives the plan's dim value; must not be NULL
+// size     - [out] receives 1, 2 or 3 for CLFFT_1D, CLFFT_2D or CLFFT_3D; must not be NULL
+// Returns CLFFT_SUCCESS, CLFFT_INVALID_HOST_PTR when dim or size is NULL, or CLFFT_NOTIMPLEMENTED
+// (with *dim already written) when the plan holds any other dim value.
+clfftStatus clfftGetPlanDim( const clfftPlanHandle plHandle, clfftDim* dim, cl_uint* size )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftGetPlanDim" ) );
+
+    // Reject NULL output pointers the same way clfftGetPlanLength does, instead of dereferencing them
+    if( dim == NULL || size == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+
+    *dim = fftPlan->dim;
+
+    switch( fftPlan->dim )
+    {
+    case CLFFT_1D:
+        *size = 1;
+        break;
+    case CLFFT_2D:
+        *size = 2;
+        break;
+    case CLFFT_3D:
+        *size = 3;
+        break;
+    default:
+        return CLFFT_NOTIMPLEMENTED;
+    }
+
+    return CLFFT_SUCCESS;
+}
+
+// Sets the dimension of the plan identified by plHandle and marks the plan as not baked.
+// The length, inStride and outStride vectors of the plan are resized to 1, 2 or 3 entries for
+// CLFFT_1D, CLFFT_2D or CLFFT_3D so that their sizes stay consistent with the dimension.
+// Returns CLFFT_NOTIMPLEMENTED (plan untouched) for any other dim value, CLFFT_SUCCESS otherwise.
+clfftStatus clfftSetPlanDim( clfftPlanHandle plHandle, const clfftDim dim )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    // The label used to read "clfftGetPlanDim", a copy/paste slip from the getter
+    scopedLock sLock( *planLock, _T( "clfftSetPlanDim" ) );
+
+    size_t numDims = 0;
+    switch( dim )
+    {
+    case CLFFT_1D:
+        numDims = 1;
+        break;
+    case CLFFT_2D:
+        numDims = 2;
+        break;
+    case CLFFT_3D:
+        numDims = 3;
+        break;
+    default:
+        return CLFFT_NOTIMPLEMENTED;
+    }
+
+    // We resize the vectors in the plan to keep their sizes consistent with the value of the dimension
+    fftPlan->length.resize( numDims );
+    fftPlan->inStride.resize( numDims );
+    fftPlan->outStride.resize( numDims );
+
+    // If we modify the state of the plan, we assume that we can't trust any pre-calculated contents anymore
+    fftPlan->baked = false;
+    fftPlan->dim = dim;
+
+    return CLFFT_SUCCESS;
+}
+
+// Copies the lengths stored in the plan identified by plHandle into clLengths.
+// dim selects how many entries are copied: DimX for CLFFT_1D, DimX and DimY for CLFFT_2D,
+// DimX, DimY and DimZ for CLFFT_3D.
+// Returns CLFFT_INVALID_HOST_PTR when clLengths is NULL, CLFFT_INVALID_ARG_INDEX when the plan
+// stores no lengths or fewer than dim asks for, CLFFT_NOTIMPLEMENTED for any other dim value,
+// and CLFFT_SUCCESS otherwise.
+clfftStatus clfftGetPlanLength( const clfftPlanHandle plHandle, const clfftDim dim, size_t* clLengths )
+{
+    FFTPlan* plan = NULL;
+    lockRAII* lock = NULL;
+    FFTRepo& repo = FFTRepo::getInstance( );
+
+    OPENCL_V( repo.getPlan( plHandle, plan, lock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock guard( *lock, _T( "clfftGetPlanLength" ) );
+
+    if( clLengths == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+
+    // An empty length vector is rejected before dim is even looked at
+    if( plan->length.empty( ) )
+        return CLFFT_INVALID_ARG_INDEX;
+
+    size_t wanted = 0;
+    switch( dim )
+    {
+    case CLFFT_1D:
+        wanted = 1;
+        break;
+    case CLFFT_2D:
+        wanted = 2;
+        break;
+    case CLFFT_3D:
+        wanted = 3;
+        break;
+    default:
+        return CLFFT_NOTIMPLEMENTED;
+    }
+
+    if( plan->length.size( ) < wanted )
+        return CLFFT_INVALID_ARG_INDEX;
+
+    // Copy axis by axis, in X, Y, Z order
+    const size_t axes[ ] = { DimX, DimY, DimZ };
+    for( size_t i = 0; i < wanted; ++i )
+        clLengths[ axes[ i ] ] = plan->length[ axes[ i ] ];
+
+    return CLFFT_SUCCESS;
+}
+
+// Replaces the lengths of the plan identified by plHandle, sets the plan's dim and marks the plan as not baked.
+// dim selects how many entries of clLengths are read: DimX for CLFFT_1D, DimX and DimY for CLFFT_2D,
+// DimX, DimY and DimZ for CLFFT_3D.
+// Returns CLFFT_INVALID_HOST_PTR when clLengths is NULL, CLFFT_INVALID_ARG_VALUE when any requested
+// length is 0, CLFFT_NOTIMPLEMENTED when dim is not one of the three values above or when
+// IsASupportedLength rejects a length, and CLFFT_SUCCESS otherwise.
+// All arguments are validated before the plan is modified, so a rejected call leaves the plan as it was
+// (previously the stored lengths were cleared first and stayed empty after a failure).
+clfftStatus clfftSetPlanLength( clfftPlanHandle plHandle, const clfftDim dim, const size_t* clLengths )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftSetPlanLength" ) );
+
+    if( clLengths == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+
+    size_t numDims = 0;
+    switch( dim )
+    {
+    case CLFFT_1D:
+        numDims = 1;
+        break;
+    case CLFFT_2D:
+        numDims = 2;
+        break;
+    case CLFFT_3D:
+        numDims = 3;
+        break;
+    default:
+        return CLFFT_NOTIMPLEMENTED;
+    }
+
+    const size_t axes[ ] = { DimX, DimY, DimZ };
+
+    // Minimum length size is 1; every axis is checked for this before any axis is checked for support,
+    // which keeps the precedence of the two error codes unchanged
+    for( size_t i = 0; i < numDims; ++i )
+    {
+        if( clLengths[ axes[ i ] ] == 0 )
+            return CLFFT_INVALID_ARG_VALUE;
+    }
+
+    for( size_t i = 0; i < numDims; ++i )
+    {
+        if( !IsASupportedLength( clLengths[ axes[ i ] ] ) )
+            return CLFFT_NOTIMPLEMENTED;
+    }
+
+    // Simplest to clear any previous contents, because it's valid for user to shrink dimension
+    fftPlan->length.clear( );
+    for( size_t i = 0; i < numDims; ++i )
+        fftPlan->length.push_back( clLengths[ axes[ i ] ] );
+
+    fftPlan->dim = dim;
+
+    // If we modify the state of the plan, we assume that we can't trust any pre-calculated contents anymore
+    fftPlan->baked = false;
+
+    return CLFFT_SUCCESS;
+}
+
+// Copies the input strides stored in the plan identified by plHandle into clStrides.
+// dim selects how many entries are copied: DimX for CLFFT_1D, DimX and DimY for CLFFT_2D,
+// DimX, DimY and DimZ for CLFFT_3D.
+// Returns CLFFT_INVALID_HOST_PTR when clStrides is NULL, CLFFT_NOTIMPLEMENTED for any other dim value,
+// CLFFT_INVALID_ARG_INDEX when the plan stores fewer strides than dim asks for, and CLFFT_SUCCESS otherwise.
+clfftStatus clfftGetPlanInStride( const clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides )
+{
+    FFTPlan* plan = NULL;
+    lockRAII* lock = NULL;
+    FFTRepo& repo = FFTRepo::getInstance( );
+
+    OPENCL_V( repo.getPlan( plHandle, plan, lock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock guard( *lock, _T( "clfftGetPlanInStride" ) );
+
+    if( clStrides == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+
+    size_t wanted = 0;
+    switch( dim )
+    {
+    case CLFFT_1D:
+        wanted = 1;
+        break;
+    case CLFFT_2D:
+        wanted = 2;
+        break;
+    case CLFFT_3D:
+        wanted = 3;
+        break;
+    default:
+        return CLFFT_NOTIMPLEMENTED;
+    }
+
+    if( plan->inStride.size( ) < wanted )
+        return CLFFT_INVALID_ARG_INDEX;
+
+    // Copy axis by axis, in X, Y, Z order
+    const size_t axes[ ] = { DimX, DimY, DimZ };
+    for( size_t i = 0; i < wanted; ++i )
+        clStrides[ axes[ i ] ] = plan->inStride[ axes[ i ] ];
+
+    return CLFFT_SUCCESS;
+}
+
+/*
+ * \brief Replace the input strides stored in a plan.
+ *
+ * \param plHandle   Handle of the plan to modify; resolved through FFTRepo::getPlan.
+ * \param dim        Number of strides supplied (CLFFT_1D, CLFFT_2D or CLFFT_3D).
+ * \param clStrides  Array holding at least that many strides; it is only read.
+ *
+ * \return CLFFT_SUCCESS on success, CLFFT_INVALID_HOST_PTR when clStrides is NULL,
+ *         CLFFT_NOTIMPLEMENTED for any other dim value.
+ *
+ * The dimension is validated before the plan is touched, so a rejected call leaves
+ * the previously stored strides (and the baked flag) exactly as they were.
+ */
+clfftStatus clfftSetPlanInStride( clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftSetPlanInStride" ) );
+
+    if( clStrides == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+
+    // Reject unsupported dimensions before mutating the plan, so an error cannot wipe the stored strides
+    if( dim != CLFFT_1D && dim != CLFFT_2D && dim != CLFFT_3D )
+        return CLFFT_NOTIMPLEMENTED;
+
+    // Simplest to clear any previous contents, because it's valid for user to shrink dimension
+    fftPlan->inStride.clear( );
+    fftPlan->inStride.push_back( clStrides[ DimX ] );
+
+    if( dim == CLFFT_2D || dim == CLFFT_3D )
+        fftPlan->inStride.push_back( clStrides[ DimY ] );
+
+    if( dim == CLFFT_3D )
+        fftPlan->inStride.push_back( clStrides[ DimZ ] );
+
+    // If we modify the state of the plan, we assume that we can't trust any pre-calculated contents anymore
+    fftPlan->baked = false;
+
+    return CLFFT_SUCCESS;
+}
+
+/*
+ * \brief Copy the output strides currently stored in a plan into a caller-supplied array.
+ *
+ * \param plHandle   Handle of the plan to query; resolved through FFTRepo::getPlan.
+ * \param dim        Number of strides requested (CLFFT_1D, CLFFT_2D or CLFFT_3D).
+ * \param clStrides  Output array; receives one stride per requested dimension.
+ *
+ * \return CLFFT_SUCCESS when the strides were copied,
+ *         CLFFT_INVALID_HOST_PTR when clStrides is NULL,
+ *         CLFFT_NOTIMPLEMENTED when dim is not one of the three values above,
+ *         CLFFT_INVALID_ARG_INDEX when the plan stores fewer strides than dim asks for.
+ */
+clfftStatus clfftGetPlanOutStride( const clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides )
+{
+    FFTRepo& repo = FFTRepo::getInstance( );
+    FFTPlan* plan = NULL;
+    lockRAII* lock = NULL;
+
+    OPENCL_V( repo.getPlan( plHandle, plan, lock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *lock, _T( "clfftGetPlanOutStride" ) );
+
+    if( NULL == clStrides )
+        return CLFFT_INVALID_HOST_PTR;
+
+    // Translate the requested dimensionality into the number of strides to hand back
+    size_t wanted = 0;
+    switch( dim )
+    {
+    case CLFFT_1D:
+        wanted = 1;
+        break;
+    case CLFFT_2D:
+        wanted = 2;
+        break;
+    case CLFFT_3D:
+        wanted = 3;
+        break;
+    default:
+        return CLFFT_NOTIMPLEMENTED;
+    }
+
+    // The plan must already hold at least that many strides
+    if( plan->outStride.size( ) < wanted )
+        return CLFFT_INVALID_ARG_INDEX;
+
+    clStrides[ DimX ] = plan->outStride[ DimX ];
+    if( wanted > 1 )
+        clStrides[ DimY ] = plan->outStride[ DimY ];
+    if( wanted > 2 )
+        clStrides[ DimZ ] = plan->outStride[ DimZ ];
+    return CLFFT_SUCCESS;
+}
+
+/*
+ * \brief Replace the output strides stored in a plan.
+ *
+ * \param plHandle   Handle of the plan to modify; resolved through FFTRepo::getPlan.
+ * \param dim        Number of strides supplied (CLFFT_1D, CLFFT_2D or CLFFT_3D).
+ * \param clStrides  Array holding at least that many strides; it is only read.
+ *
+ * \return CLFFT_SUCCESS on success, CLFFT_INVALID_HOST_PTR when clStrides is NULL,
+ *         CLFFT_NOTIMPLEMENTED for any other dim value.
+ *
+ * The dimension is validated before the plan is touched, so a rejected call leaves
+ * the previously stored strides (and the baked flag) exactly as they were.
+ */
+clfftStatus clfftSetPlanOutStride( clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftSetPlanOutStride" ) );
+
+    if( clStrides == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+
+    // Reject unsupported dimensions before mutating the plan, so an error cannot wipe the stored strides
+    if( dim != CLFFT_1D && dim != CLFFT_2D && dim != CLFFT_3D )
+        return CLFFT_NOTIMPLEMENTED;
+
+    // Simplest to clear any previous contents, because it's valid for user to shrink dimension
+    fftPlan->outStride.clear( );
+    fftPlan->outStride.push_back( clStrides[ DimX ] );
+
+    if( dim == CLFFT_2D || dim == CLFFT_3D )
+        fftPlan->outStride.push_back( clStrides[ DimY ] );
+
+    if( dim == CLFFT_3D )
+        fftPlan->outStride.push_back( clStrides[ DimZ ] );
+
+    // If we modify the state of the plan, we assume that we can't trust any pre-calculated contents anymore
+    fftPlan->baked = false;
+
+    return CLFFT_SUCCESS;
+}
+
+// Copies the plan's iDist/oDist into *iDist/*oDist; returns CLFFT_INVALID_HOST_PTR if either pointer is NULL.
+clfftStatus clfftGetPlanDistance( const clfftPlanHandle plHandle, size_t* iDist, size_t* oDist )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftGetPlanDistance" ) );
+    if( iDist == NULL || oDist == NULL )    // same guard the stride getters apply to their output array
+        return CLFFT_INVALID_HOST_PTR;
+    *iDist = fftPlan->iDist;
+    *oDist = fftPlan->oDist;
+    return CLFFT_SUCCESS;
+}
+
+// Stores new input/output distances in the plan and clears its baked flag.
+clfftStatus clfftSetPlanDistance( clfftPlanHandle plHandle, size_t iDist, size_t oDist )
+{
+    FFTPlan* plan = NULL;
+    lockRAII* lock = NULL;
+    FFTRepo& repo = FFTRepo::getInstance( );
+
+    OPENCL_V( repo.getPlan( plHandle, plan, lock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *lock, _T( "clfftSetPlanDistance" ) );
+
+    plan->iDist = iDist;
+    plan->oDist = oDist;
+    // Any change to the plan means its pre-calculated contents can no longer be trusted
+    plan->baked = false;
+    return CLFFT_SUCCESS;
+}
+
+// Copies the plan's input/output layouts into *iLayout/*oLayout; CLFFT_INVALID_HOST_PTR if either pointer is NULL.
+clfftStatus clfftGetLayout( const clfftPlanHandle plHandle, clfftLayout* iLayout, clfftLayout* oLayout )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftGetLayout" ) );
+    if( iLayout == NULL || oLayout == NULL )    // same guard the stride getters apply to their output array
+        return CLFFT_INVALID_HOST_PTR;
+    *iLayout = fftPlan->inputLayout;
+    *oLayout = fftPlan->outputLayout;
+    return CLFFT_SUCCESS;
+}
+
+/*
+ * \brief Select the input and output data layouts of a plan.
+ *
+ * Only three pairings are accepted:
+ *   complex   (interleaved or planar) input  ->  complex   (interleaved or planar) output
+ *   hermitian (interleaved or planar) input  ->  real output
+ *   real input                               ->  hermitian (interleaved or planar) output
+ *
+ * \param plHandle  Handle of the plan to modify; resolved through FFTRepo::getPlan.
+ * \param iLayout   Requested input layout.
+ * \param oLayout   Requested output layout.
+ *
+ * \return CLFFT_SUCCESS when the pair was stored,
+ *         CLFFT_INVALID_ARG_VALUE when either value is >= ENDLAYOUT,
+ *         CLFFT_NOTIMPLEMENTED for every other pairing.
+ *
+ * A rejected call returns before the plan is modified.
+ */
+clfftStatus clfftSetLayout( clfftPlanHandle plHandle, clfftLayout iLayout, clfftLayout oLayout )
+{
+    FFTRepo& repo = FFTRepo::getInstance( );
+    FFTPlan* plan = NULL;
+    lockRAII* lock = NULL;
+
+    OPENCL_V( repo.getPlan( plHandle, plan, lock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *lock, _T( "clfftSetLayout" ) );
+
+    // Basic error checking on parameter
+    if( ( iLayout >= ENDLAYOUT ) || ( oLayout >= ENDLAYOUT ) )
+        return CLFFT_INVALID_ARG_VALUE;
+
+    // Classify the requested output once; an unrecognised value matches none of these
+    const bool outComplex   = ( oLayout == CLFFT_COMPLEX_INTERLEAVED ) || ( oLayout == CLFFT_COMPLEX_PLANAR );
+    const bool outHermitian = ( oLayout == CLFFT_HERMITIAN_INTERLEAVED ) || ( oLayout == CLFFT_HERMITIAN_PLANAR );
+    const bool outReal      = ( oLayout == CLFFT_REAL );
+
+    // We currently only support a subset of formats: the input family dictates the single
+    // output family it may be paired with.
+    bool pairSupported = false;
+    switch( iLayout )
+    {
+    case CLFFT_COMPLEX_INTERLEAVED:
+    case CLFFT_COMPLEX_PLANAR:
+        pairSupported = outComplex;
+        break;
+
+    case CLFFT_HERMITIAN_INTERLEAVED:
+    case CLFFT_HERMITIAN_PLANAR:
+        pairSupported = outReal;
+        break;
+
+    case CLFFT_REAL:
+        pairSupported = outHermitian;
+        break;
+
+    default:
+        // unknown input layout: pairSupported stays false
+        break;
+    }
+
+    if( !pairSupported )
+        return CLFFT_NOTIMPLEMENTED;
+
+    // If we modify the state of the plan, we assume that we can't trust any pre-calculated contents anymore
+    plan->inputLayout = iLayout;
+    plan->outputLayout = oLayout;
+    plan->baked = false;
+
+    return CLFFT_SUCCESS;
+}
+
+// Copies the plan's placeness setting into *placeness; returns CLFFT_INVALID_HOST_PTR if placeness is NULL.
+clfftStatus clfftGetResultLocation( const clfftPlanHandle plHandle, clfftResultLocation* placeness )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftGetResultLocation" ) );
+    if( placeness == NULL )    // same guard the stride getters apply to their output array
+        return CLFFT_INVALID_HOST_PTR;
+    *placeness = fftPlan->placeness;
+    return CLFFT_SUCCESS;
+}
+
+// Records the requested result location (placeness) in the plan.
+// Values at or beyond ENDPLACE are rejected with CLFFT_INVALID_ARG_VALUE and leave the plan untouched.
+clfftStatus clfftSetResultLocation( clfftPlanHandle plHandle, clfftResultLocation placeness )
+{
+    FFTPlan* plan = NULL;
+    lockRAII* lock = NULL;
+    FFTRepo& repo = FFTRepo::getInstance( );
+
+    OPENCL_V( repo.getPlan( plHandle, plan, lock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *lock, _T( "clfftSetResultLocation" ) );
+
+    if( placeness < ENDPLACE )
+    {
+        plan->placeness = placeness;
+        plan->baked = false;    // pre-calculated contents can no longer be trusted
+        return CLFFT_SUCCESS;
+    }
+    return CLFFT_INVALID_ARG_VALUE;
+}
+
+
+// Copies the plan's transposed setting into *transposed; returns CLFFT_INVALID_HOST_PTR if transposed is NULL.
+clfftStatus clfftGetPlanTransposeResult( const clfftPlanHandle plHandle, clfftResultTransposed * transposed )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftGetPlanTransposeResult" ) );    // label names this function, like its siblings
+    if( transposed == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+    *transposed = fftPlan->transposed;
+    return CLFFT_SUCCESS;
+}
+
+// Stores the requested transposed-result setting in the plan.
+// Values at or beyond ENDTRANSPOSED are rejected with CLFFT_INVALID_ARG_VALUE and leave the plan untouched.
+clfftStatus clfftSetPlanTransposeResult( clfftPlanHandle plHandle, clfftResultTransposed transposed )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    // The lock label names this function, as every sibling does (it was copy-pasted as "clfftSetResultLocation")
+    scopedLock sLock( *planLock, _T( "clfftSetPlanTransposeResult" ) );
+
+    if( transposed >= ENDTRANSPOSED )
+        return CLFFT_INVALID_ARG_VALUE;
+
+    // If we modify the state of the plan, we assume that we can't trust any pre-calculated contents anymore
+    fftPlan->baked = false;
+    fftPlan->transposed = transposed;
+    return CLFFT_SUCCESS;
+}
+
+// Copies the plan's tmpBufSize into *buffersize; only available once the plan has been baked.
+// Returns CLFFT_INVALID_HOST_PTR for a NULL buffersize and CLFFT_INVALID_OPERATION for an unbaked plan.
+clfftStatus clfftGetTmpBufSize( const clfftPlanHandle plHandle, size_t* buffersize )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftGetTmpBufSize" ) );    // was copy-pasted as "clfftGetPlanBatchSize"
+
+    if( buffersize == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+    if( fftPlan->baked != true )
+        return CLFFT_INVALID_OPERATION;
+    *buffersize = fftPlan->tmpBufSize;
+    return CLFFT_SUCCESS;
+}
+
+// Overwrites five internal tuning fields of a plan from a caller-supplied blob.
+// data must point to an object laid out exactly like InternalData below; NULL yields CLFFT_INVALID_HOST_PTR.
+clfftStatus clfftSetInternal( clfftPlanHandle plHandle, void* data )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    FFTPlan* fftPlan = NULL;
+    lockRAII* planLock = NULL;
+    OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    scopedLock sLock( *planLock, _T( "clfftSetInternal" ) );    // was copy-pasted as "clfftSetResultLocation"
+    if( data == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+    struct InternalData {
+        size_t large1D_Xfactor;
+        size_t cacheSize;
+        bool bLdsComplex;
+        bool ldsPadding;
+        unsigned uLdsFraction;
+    };
+    const InternalData* mydata = static_cast< const InternalData* >( data );
+
+    fftPlan->large1D_Xfactor = mydata->large1D_Xfactor;
+    fftPlan->cacheSize = mydata->cacheSize;
+    fftPlan->bLdsComplex = mydata->bLdsComplex;
+    fftPlan->ldsPadding = mydata->ldsPadding;
+    fftPlan->uLdsFraction = mydata->uLdsFraction;
+    return CLFFT_SUCCESS;
+}
+
+// Copies the plan's envelope.limit_LocalMemSize into *local_mem_size; NULL yields CLFFT_INVALID_HOST_PTR.
+clfftStatus clfftLocalMemSize( const clfftPlanHandle plHandle, cl_ulong* local_mem_size )
+{
+    FFTRepo& repo = FFTRepo::getInstance( );
+    FFTPlan* plan = NULL;
+    lockRAII* lock = NULL;
+    OPENCL_V( repo.getPlan( plHandle, plan, lock ), _T( "repo.getPlan failed" ) );
+    scopedLock sLock( *lock, _T( "clfftLocalMemSize" ) );
+    if( local_mem_size == NULL ) return CLFFT_INVALID_HOST_PTR;
+    *local_mem_size = plan->envelope.limit_LocalMemSize;
+    return CLFFT_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/library/dllmain.cpp b/src/library/dllmain.cpp
new file mode 100644
index 00000000..5d651328
--- /dev/null
+++ b/src/library/dllmain.cpp
@@ -0,0 +1,36 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// dllmain.cpp : Defines the entry point for the DLL application.
+#include "stdafx.h"
+
+// DLL entry point. No per-process or per-thread work is done here:
+// every notification reason is accepted and TRUE is returned.
+BOOL APIENTRY DllMain( HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved )
+{
+    switch( ul_reason_for_call )
+    {
+        case DLL_PROCESS_ATTACH:    // all reasons share the same empty handling
+        case DLL_THREAD_ATTACH:
+        case DLL_THREAD_DETACH:
+        case DLL_PROCESS_DETACH:
+        default:
+            break;
+    }
+    return TRUE;
+}
+
diff --git a/src/library/generator.copy.cpp b/src/library/generator.copy.cpp
new file mode 100644
index 00000000..e839ed8a
--- /dev/null
+++ b/src/library/generator.copy.cpp
@@ -0,0 +1,474 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include "stdafx.h"
+#include
+#include
+#include "generator.stockham.h"
+
+using namespace StockhamGenerator;
+
+namespace CopyGenerator
+{
+    // Copy kernel: generates OpenCL source that copies data between hermitian and complex layouts
+    template <Precision PR>
+    class CopyKernel
+    {
+        size_t N;       // params.fft_N[0]
+        size_t Nt;      // 1 + N/2, the number of work items (input elements read) per row
+        const FFTKernelGenKeyParams params;
+        bool h2c, c2h;  // input layout is hermitian / output layout is hermitian
+        // Returns OpenCL code assigning to 'off' the element offset of row 'batch' (input or output strides)
+        inline std::string OffsetCalc(const std::string &off, bool input = true)
+        {
+            std::string str;
+
+            const size_t *pStride = input ? params.fft_inStride : params.fft_outStride;
+
+            std::string batch = "batch";
+            // 'batch' is a flat row index; it is split by the lengths fft_N[1..3] into one index per stride
+            switch(params.fft_DataDim)
+            {
+            case 5:
+                {
+                    str += "\t{\n\tuint ocalc1 = ";
+                    str += batch; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2] * params.fft_N[3]);
+                    str += ";\n";
+
+                    str += "\tuint ocalc0 = ";
+                    str += "ocalc1"; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2]);
+                    str += ";\n";
+
+                    str += "\t"; str += off; str += " = ";
+                    str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2] * params.fft_N[3]);
+                    str += ")*"; str += SztToStr(pStride[4]); str += " + ";
+
+                    str += "(ocalc1"; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2]); str += ")*";
+                    str += SztToStr(pStride[3]); str += " + ";
+
+                    str += "(ocalc0"; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
+                    str += SztToStr(pStride[2]); str += " + ";
+                    str += "(ocalc0"; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
+                    str += SztToStr(pStride[1]); str += ";\n";
+
+                    str += "\t}\n";
+                }
+                break;
+            case 4:
+                {
+                    str += "\t{\n\tuint ocalc0 = ";
+                    str += batch; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2]);
+                    str += ";\n";
+
+                    str += "\t"; str += off; str += " = ";
+                    str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2]); str += ")*";
+                    str += SztToStr(pStride[3]); str += " + ";
+
+                    str += "(ocalc0"; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
+                    str += SztToStr(pStride[2]); str += " + ";
+                    str += "(ocalc0"; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
+                    str += SztToStr(pStride[1]); str += ";\n";
+
+                    str += "\t}\n";
+                }
+                break;
+            case 3:
+                {
+                    str += "\t"; str += off; str += " = ";
+                    str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
+                    str += SztToStr(pStride[2]); str += " + ";
+                    str += "("; str += batch; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
+                    str += SztToStr(pStride[1]); str += ";\n";
+                }
+                break;
+            case 2:
+                {
+                    str += "\t"; str += off; str += " = ";
+                    str += batch; str += "*"; str += SztToStr(pStride[1]); str += ";\n";
+                }
+                break;
+            default:
+                assert(false);
+            }
+
+            return str;
+        }
+
+    public:
+        CopyKernel( const FFTKernelGenKeyParams &paramsVal) :
+            params(paramsVal)
+
+        {
+            N = params.fft_N[0];
+            Nt = 1 + N/2;
+
+            h2c = (params.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) ||
+                  (params.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED);
+            c2h = (params.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) ||
+                  (params.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED);
+
+            // We only do out-of-place copies at this point
+            assert(params.fft_placeness == CLFFT_OUTOFPLACE);
+        }
+        // Appends the complete OpenCL source of the copy kernel to str
+        void GenerateKernel(std::string &str)
+        {
+            std::string rType = RegBaseType<PR>(1);
+            std::string r2Type = RegBaseType<PR>(2);
+
+            bool inIlvd; // Input is interleaved format
+            bool outIlvd; // Output is interleaved format
+            inIlvd = (params.fft_inputLayout == CLFFT_COMPLEX_INTERLEAVED) ||
+                     (params.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED);
+            outIlvd = (params.fft_outputLayout == CLFFT_COMPLEX_INTERLEAVED) ||
+                      (params.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED);
+
+
+
+            // Pragma
+            str += ClPragma<PR>();
+
+            // Kernel signature: entry-point name first, then the input and output buffers
+
+            // Copy kernel begin
+            str += "__kernel void ";
+
+            // Function name
+            if(h2c) str += "copy_h2c";
+            else str += "copy_c2h";
+
+            str += "(";
+
+            if(inIlvd)
+            {
+                str += "__global const "; str += r2Type; str += " * restrict gbIn, ";
+            }
+            else
+            {
+                str += "__global const "; str += rType; str += " * restrict gbInRe, ";
+                str += "__global const "; str += rType; str += " * restrict gbInIm, ";
+            }
+
+            if(outIlvd)
+            {
+                str += "__global "; str += r2Type; str += " * restrict gbOut)\n";
+            }
+            else
+            {
+                str += "__global "; str += rType; str += " * restrict gbOutRe, ";
+                str += "__global "; str += rType; str += " * restrict gbOutIm)\n";
+            }
+
+
+            str += "{\n";
+
+            // Initialize
+            str += "\tuint me = get_global_id(0);\n\t";
+
+            // Declare memory pointers
+            str += "\n\t";
+            str += "uint iOffset;\n\t";
+            str += "uint oOffset;\n\t";
+
+            // input
+            if(inIlvd)
+            {
+                str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
+            }
+            else
+            {
+                str += "__global "; str += rType; str += " *lwbInRe;\n\t";
+                str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+            }
+
+            // output
+            if(outIlvd)
+            {
+                str += "__global "; str += r2Type; str += " *lwbOut;\n";
+                if(h2c)
+                {
+                    str += "\t";
+                    str += "__global "; str += r2Type; str += " *lwbOut2;\n\n";
+                }
+            }
+            else
+            {
+                str += "__global "; str += rType; str += " *lwbOutRe;\n\t";
+                str += "__global "; str += rType; str += " *lwbOutIm;\n";
+                if(h2c)
+                {
+                    str += "\t";
+                    str += "__global "; str += rType; str += " *lwbOutRe2;\n\t";
+                    str += "__global "; str += rType; str += " *lwbOutIm2;\n\n";
+                }
+            }
+
+
+
+            // Setup registers
+            str += "\t"; str += r2Type; str += " R;\n\n";
+
+            // Setup variables
+            str += "\tuint batch, mel, mel2;\n\t";
+            str += "batch = me/"; str += SztToStr(Nt); str += ";\n\t";
+            str += "mel = me%"; str += SztToStr(Nt); str += ";\n\t";
+            str += "mel2 = ("; str += SztToStr(N); str += " - mel)%"; str += SztToStr(N); str += ";\n\n";
+
+            // mel2 is the mirrored index (N - mel) mod N; h2c writes the negated-imaginary copy there
+            // Setup memory pointers
+            str += OffsetCalc("iOffset", true);
+            str += OffsetCalc("oOffset", false);
+
+            // offset strings
+            std::string inF, inF2, outF, outF2;
+            inF = "(mel*"; inF += SztToStr(params.fft_inStride[0]); inF += ")";
+            inF2 = "(mel2*"; inF2 += SztToStr(params.fft_inStride[0]); inF2 += ")";
+            outF = "(mel*"; outF += SztToStr(params.fft_outStride[0]); outF += ")";
+            outF2 = "(mel2*"; outF2 += SztToStr(params.fft_outStride[0]); outF2 += ")";
+
+            str += "\n\t";
+
+            // inputs
+            if(inIlvd)
+            {
+                str += "lwbIn = gbIn + iOffset + "; str += inF; str += ";\n\t";
+            }
+            else
+            {
+                str += "lwbInRe = gbInRe + iOffset + "; str += inF; str += ";\n\t";
+                str += "lwbInIm = gbInIm + iOffset + "; str += inF; str += ";\n\t";
+            }
+
+            // outputs
+            if(outIlvd)
+            {
+                str += "lwbOut = gbOut + oOffset + "; str += outF; str += ";\n";
+                if(h2c)
+                {
+                    str += "\t";
+                    str += "lwbOut2 = gbOut + oOffset + "; str += outF2; str += ";\n";
+                }
+            }
+            else
+            {
+                str += "lwbOutRe = gbOutRe + oOffset + "; str += outF; str += ";\n\t";
+                str += "lwbOutIm = gbOutIm + oOffset + "; str += outF; str += ";\n";
+                if(h2c)
+                {
+                    str += "\t";
+                    str += "lwbOutRe2 = gbOutRe + oOffset + "; str += outF2; str += ";\n\t";
+                    str += "lwbOutIm2 = gbOutIm + oOffset + "; str += outF2; str += ";\n";
+                }
+            }
+
+            str += "\n\t";
+
+            // Do the copy
+            if(c2h)
+            {
+                if(inIlvd)
+                {
+                    str += "R = lwbIn[0];\n\t";
+                }
+                else
+                {
+                    str += "R.x = lwbInRe[0];\n\t";
+                    str += "R.y = lwbInIm[0];\n\t";
+                }
+
+                if(outIlvd)
+                {
+                    str += "lwbOut[0] = R;\n\n";
+                }
+                else
+                {
+                    str += "lwbOutRe[0] = R.x;\n\t";
+                    str += "lwbOutIm[0] = R.y;\n\t";
+                }
+            }
+            else
+            {
+                if(inIlvd)
+                {
+                    str += "R = lwbIn[0];\n\t";
+                }
+                else
+                {
+                    str += "R.x = lwbInRe[0];\n\t";
+                    str += "R.y = lwbInIm[0];\n\t";
+                }
+
+                if(outIlvd)
+                {
+                    str += "lwbOut[0] = R;\n\t";
+                    str += "R.y = -R.y;\n\t";
+                    str += "lwbOut2[0] = R;\n\n";
+                }
+                else
+                {
+                    str += "lwbOutRe[0] = R.x;\n\t";
+                    str += "lwbOutIm[0] = R.y;\n\t";
+                    str += "R.y = -R.y;\n\t";
+                    str += "lwbOutRe2[0] = R.x;\n\t";
+                    str += "lwbOutIm2[0] = R.y;\n\n";
+                }
+            }
+
+            str += "}\n";
+        }
+    };
+};
+
+
+// Specialization for the Copy generator: fills 'params' with the plan state the copy kernel generator reads.
+template<>
+clfftStatus FFTPlan::GetKernelGenKeyPvt<Copy> (FFTKernelGenKeyParams & params) const
+{
+    // Query the devices in this context for their local memory sizes
+    // How we generate a kernel depends on the *minimum* LDS size for all devices.
+    //
+    const FFTEnvelope * pEnvelope = NULL;
+    OPENCL_V(const_cast<FFTPlan*>(this)->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
+    BUG_CHECK (NULL != pEnvelope);
+
+    ::memset( &params, 0, sizeof( params ) );
+    params.fft_precision = this->precision;
+    params.fft_placeness = this->placeness;
+    params.fft_inputLayout = this->inputLayout;
+    params.fft_MaxWorkGroupSize = this->envelope.limit_WorkGroupSize;
+
+    ARG_CHECK (this->inStride.size() == this->outStride.size());
+
+    params.fft_outputLayout = this->outputLayout;
+    // iDist/oDist are appended as one extra stride, which is why fft_DataDim is the stride count plus one
+    switch (this->inStride.size()) {
+        // 1-D array is a 2-D data structure.
+        // 1-D unit is a special case of 1-D array.
+    case 1:
+        ARG_CHECK(this->length .size() > 0);
+        ARG_CHECK(this->outStride.size() > 0);
+        params.fft_DataDim = 2;
+        params.fft_N[0] = this->length[0];
+        params.fft_inStride[0] = this->inStride[0];
+        params.fft_inStride[1] = this->iDist;
+        params.fft_outStride[0] = this->outStride[0];
+        params.fft_outStride[1] = this->oDist;
+        break;
+
+        // 2-D array is a 3-D data structure
+        // 2-D unit is a special case of 2-D array.
+    case 2:
+        ARG_CHECK(this->length .size() > 1);
+        ARG_CHECK(this->outStride.size() > 1);
+        params.fft_DataDim = 3;
+        params.fft_N[0] = this->length[0];
+        params.fft_N[1] = this->length[1];
+        params.fft_inStride[0] = this->inStride[0];
+        params.fft_inStride[1] = this->inStride[1];
+        params.fft_inStride[2] = this->iDist;
+        params.fft_outStride[0] = this->outStride[0];
+        params.fft_outStride[1] = this->outStride[1];
+        params.fft_outStride[2] = this->oDist;
+        break;
+
+        // 3-D array is a 4-D data structure
+        // 3-D unit is a special case of 3-D array.
+    case 3:
+        ARG_CHECK(this->length .size() > 2);
+        ARG_CHECK(this->outStride.size() > 2);
+        params.fft_DataDim = 4;
+        params.fft_N[0] = this->length[0];
+        params.fft_N[1] = this->length[1];
+        params.fft_N[2] = this->length[2];
+        params.fft_inStride[0] = this->inStride[0];
+        params.fft_inStride[1] = this->inStride[1];
+        params.fft_inStride[2] = this->inStride[2];
+        params.fft_inStride[3] = this->iDist;
+        params.fft_outStride[0] = this->outStride[0];
+        params.fft_outStride[1] = this->outStride[1];
+        params.fft_outStride[2] = this->outStride[2];
+        params.fft_outStride[3] = this->oDist;
+        break;
+
+    default:
+        ARG_CHECK (false);
+    }
+
+    params.fft_fwdScale = this->forwardScale;
+    params.fft_backScale = this->backwardScale;
+
+    return CLFFT_SUCCESS;
+}
+
+// Specialization for the Copy generator: appends the 1-D global and local work sizes of the copy kernel.
+template<>
+clfftStatus FFTPlan::GetWorkSizesPvt<Copy> (std::vector<size_t> & globalWS, std::vector<size_t> & localWS) const
+{
+    FFTKernelGenKeyParams fftParams;
+    OPENCL_V( this->GetKernelGenKeyPvt<Copy>( fftParams ), _T("GetKernelGenKey() failed!") );
+    // Global size = batchsize * fft_N[2] * fft_N[1] * (1 + fft_N[0]/2); cases 4 -> 3 -> 2 fall through on purpose
+    size_t count = this->batchsize;
+    switch(fftParams.fft_DataDim)
+    {
+    case 5: assert(false);
+    case 4: count *= fftParams.fft_N[2];    // fall through
+    case 3: count *= fftParams.fft_N[1];    // fall through
+    case 2: count *= (1 + fftParams.fft_N[0]/2); break;
+    case 1: assert(false);
+    }
+
+    globalWS.push_back( count );
+    localWS.push_back( 64 );
+    return CLFFT_SUCCESS;
+}
+
+template<>
+clfftStatus FFTPlan::GetMax1DLengthPvt<Copy> (size_t * longest) const
+{
+    return FFTPlan::GetMax1DLengthPvt<Stockham>(longest);    // the Copy generator forwards to the Stockham specialization
+}
+
+using namespace CopyGenerator;
+
+// Specialization for the Copy generator: builds the copy kernel source and stores it, with its entry points, in fftRepo.
+template<>
+clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo ) const
+{
+    FFTKernelGenKeyParams params;
+    OPENCL_V( this->GetKernelGenKeyPvt<Copy> (params), _T("GetKernelGenKey() failed!") );
+
+    std::string programCode;
+    Precision pr = (params.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
+    switch(pr)
+    {
+    case P_SINGLE:
+        {
+            CopyKernel<P_SINGLE> kernel(params);
+            kernel.GenerateKernel(programCode);
+        } break;
+    case P_DOUBLE:
+        {
+            CopyKernel<P_DOUBLE> kernel(params);
+            kernel.GenerateKernel(programCode);
+        } break;
+    }
+
+    OPENCL_V( fftRepo.setProgramCode( Copy, params, programCode ), _T( "fftRepo.setclString() failed!" ) );
+    OPENCL_V( fftRepo.setProgramEntryPoints( Copy, params, "copy_c2h", "copy_h2c" ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+
+    return CLFFT_SUCCESS;
+}
diff --git a/src/library/generator.h b/src/library/generator.h
new file mode 100644
index 00000000..2aac9836
--- /dev/null
+++ b/src/library/generator.h
@@ -0,0 +1,31 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( AMD_CLFFT_generator_H )
+#define AMD_CLFFT_generator_H
+
+// Enum to help provide descriptive names to array indices, when indexing into our various vectors
+enum clfftGenerators
+{
+ Stockham, // Using the Stockham autosort frameworks
+ Transpose, // NOTE(review): presumably selects the transpose kernel generator; its use is not visible here - confirm
+ Copy, // Selects the copy kernel generator (generator.copy.cpp passes it to FFTRepo when storing its program)
+ ENDGENERATORS ///< This value will always be last, and marks the length of clfftGenerators
+};
+
+#endif
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
new file mode 100644
index 00000000..8a6f5a60
--- /dev/null
+++ b/src/library/generator.stockham.cpp
@@ -0,0 +1,3250 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include "stdafx.h"
+#include
+#include "generator.stockham.h"
+#include
+
+// FFT Stockham Autosort Method
+//
+// Each pass does one digit reverse in essence. Hence by the time all passes are done, complete
+// digit reversal is done and output FFT is in correct order. Intermediate FFTs are stored in natural order,
+// which is not the case with basic Cooley-Tukey algorithm. Natural order in intermediate data makes it
+// convenient for stitching together passes with different radices.
+//
+// Basic FFT algorithm:
+//
+// Pass loop
+// {
+// Outer loop
+// {
+// Inner loop
+// {
+// }
+// }
+// }
+//
+// The sweeps of the outer and inner loop resemble matrix indexing, this matrix changes shape with every pass as noted below
+//
+// FFT pass diagram (radix 2)
+//
+// k k+R k
+// * * * * * * * * * * * * * * * * * * * * * * * *
+// * | | * * | *
+// * | | * * | *
+// * | | * LS --> * | *
+// * | | * * | *
+// * | | * * | *
+// * * * * * * * * * * * * * * * * * | *
+// RS * | * L
+// * | *
+// * | *
+// * | *
+// * | *
+// * | *
+// * | *
+// * | *
+// * * * * * * * *
+// R
+//
+//
+// With every pass, the matrix doubles in height and halves in length
+//
+//
+// N = 2^T = Length of FFT
+// q = pass loop index
+// k = outer loop index = (0 ... R-1)
+// j = inner loop index = (0 ... LS-1)
+//
+// The table shows how the values change as we go through the passes
+//
+// q | LS | R | L | RS
+// ___|______|______|_____|___
+// 0 | 1 | N/2 | 2 | N
+// 1 | 2 | N/4 | 4 | N/2
+// 2 | 4 | N/8 | 8 | N/4
+// . | . | . | . | .
+// T-1 | N/2 | 1 | N | 2
+//
+//
+// Data Read Order
+// Radix 2: k*LS + j, (k+R)*LS + j
+// Radix 3: k*LS + j, (k+R)*LS + j, (k+2R)*LS + j
+// Radix 4: k*LS + j, (k+R)*LS + j, (k+2R)*LS + j, (k+3R)*LS + j
+// Radix 5: k*LS + j, (k+R)*LS + j, (k+2R)*LS + j, (k+3R)*LS + j, (k+4R)*LS + j
+//
+// Data Write Order
+// Radix 2: k*L + j, k*L + j + LS
+// Radix 3: k*L + j, k*L + j + LS, k*L + j + 2*LS
+// Radix 4: k*L + j, k*L + j + LS, k*L + j + 2*LS, k*L + j + 3*LS
+// Radix 5: k*L + j, k*L + j + LS, k*L + j + 2*LS, k*L + j + 3*LS, k*L + j + 4*LS
+//
+
+namespace StockhamGenerator
+{
+ // Experimental Start =========================================
+ // Kernel Generator Parameterization ==========================
+
+ // Uncomment this directive to activate parameter reads from file
+//#define PARMETERS_TO_BE_READ
+
+ // Kernel-generator tuning parameters that ReadParameterFile() fills in from the parameter file
+ struct ParamRead
+ {
+ size_t workGroupSize; // number parsed from the "WorkGroupSize:" line
+ size_t numTransformsPerWg; // number parsed from the "TransformsPerWorkGroup:" line
+ std::vector<size_t> radices; // every number found on the "Radices:" line, in file order
+ bool halfLds; // NOTE(review): presumably driven by the "LdsUse:" line - the code that sets it is not visible here
+ };
+
+ // File format
+
+ // WorkGroupSize:
+ // TransformsPerWorkGroup:
+ // Radices:
+ // LdsUse:
+
+ void ReadParameterFile(ParamRead &readParam) // Parses the text file "parameters.txt" line by line into readParam
+ {
+ const char *fileName = "parameters.txt";
+ std::ifstream file(fileName);
+
+ if(!file.is_open())
+ {
+ std::cout << "File: " << fileName << " could not be opened, exiting ...." << std::endl;
+ exit(-1); // a missing file terminates the whole process, not just this call
+ }
+
+ std::string strWgs = "WorkGroupSize:";
+ std::string strNtw = "TransformsPerWorkGroup:";
+ std::string strRad = "Radices:";
+ std::string strLds = "LdsUse:"; // NOTE(review): no branch in the visible loop below looks for this key
+ std::string numbers = "0123456789";
+
+ std::string line;
+ while(std::getline(file, line)) // each line is tested against the keys in order; the first key found handles the line
+ {
+
+ size_t pos;
+
+ pos = line.find(strWgs);
+ if(pos != std::string::npos)
+ {
+ line.erase(pos, strWgs.length()); // remove the key text before searching for digits
+ size_t numStart = line.find_first_of(numbers);
+ size_t numEnd = line.find_first_not_of(numbers, numStart);
+ std::string val = line.substr(numStart, numEnd-numStart); // NOTE(review): if the line has no digit, numStart is npos and substr throws std::out_of_range
+ readParam.workGroupSize = strtol(val.c_str(), NULL, 10);
+ continue;
+ }
+
+ pos = line.find(strNtw);
+ if(pos != std::string::npos)
+ {
+ line.erase(pos, strNtw.length()); // remove the key text before searching for digits
+ size_t numStart = line.find_first_of(numbers);
+ size_t numEnd = line.find_first_not_of(numbers, numStart);
+ std::string val = line.substr(numStart, numEnd-numStart); // NOTE(review): same npos hazard as the WorkGroupSize branch
+ readParam.numTransformsPerWg = strtol(val.c_str(), NULL, 10);
+ continue;
+ }
+
+ pos = line.find(strRad);
+ if(pos != std::string::npos)
+ {
+ line.erase(pos, strRad.length());
+ while(std::string::npos != line.find_first_of(numbers)) // repeat while any digit is left on the line
+ {
+ size_t numStart = line.find_first_of(numbers);
+ size_t numEnd = line.find_first_not_of(numbers, numStart);
+ std::string val = line.substr(numStart, numEnd-numStart);
+ readParam.radices.push_back(strtol(val.c_str(), NULL, 10)); // values are appended; radices is not cleared first
+ line.erase(0, numEnd); // consume through the number just read; numEnd == npos empties the line and ends the loop
+ }
+ continue;
+ }
+ }
+
+ //std::cout << std::endl;
+ //std::cout << "File Parameters" << std::endl;
+ //std::cout << strWgs << " " << readParam.workGroupSize << std::endl;
+ //std::cout << strNtw << " " << readParam.numTransformsPerWg << std::endl;
+ //std::cout << strRad << " "; for(size_t i=0; i
+ class KernelCoreSpecs // Table of preset kernel specs (work-group size, transforms per work group, radices) looked up by FFT length
+ {
+ struct SpecRecord
+ {
+ size_t length;
+ size_t workGroupSize;
+ size_t numTransforms;
+ size_t numPasses;
+ size_t radices[12]; // Setting upper limit of number of passes to 12
+ };
+
+ typedef typename std::map SpecTable; // NOTE(review): the map's template arguments are missing from this text; lookups below use the length as key
+ SpecTable specTable;
+
+ public:
+ KernelCoreSpecs()
+ {
+ switch(PR) // NOTE(review): PR is not declared in the visible text - presumably a precision template parameter; confirm
+ {
+ case P_SINGLE:
+ {
+ SpecRecord specRecord[] = {
+
+ RADIX_TABLE_COMMON
+
+ // Length, WorkGroupSize, NumTransforms, NumPasses, Radices (the non-zero radices in each row multiply to Length)
+ { 4096, 256, 1, 4, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 128, 1, 4, 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 128, 64, 4, 3, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 8, 64, 32, 2, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+
+ };
+
+ size_t tableLength = sizeof(specRecord)/sizeof(specRecord[0]);
+ for(size_t i=0; isecond.radices;
+ numPasses = it->second.numPasses;
+ }
+ }
+
+ void GetWGSAndNT(size_t length, size_t &workGroupSize, size_t &numTransforms) const // Reports the preset work-group size and transform count for length
+ {
+ workGroupSize = 0; // both outputs stay 0 when length has no table entry
+ numTransforms = 0;
+
+ typename SpecTable::const_iterator it = specTable.find(length);
+ if(it != specTable.end())
+ {
+ workGroupSize = it->second.workGroupSize;
+ numTransforms = it->second.numTransforms;
+ }
+ }
+ };
+
+
+
+ // Given the length of 1d fft, this function determines the appropriate work group size
+ // and the number of transforms per work group; results are returned through workGroupSize and numTrans
+ // TODO for optimizations - experiment with different possibilities for work group sizes and num transforms for improving performance
+ void DetermineSizes(const size_t &MAX_WGS, const size_t &length, size_t &workGroupSize, size_t &numTrans)
+ {
+ assert(MAX_WGS >= 64); // several branches below hard-code a work group size of 64
+
+ if(length == 1) // special case
+ {
+ workGroupSize = 64;
+ numTrans = 64;
+ return;
+ }
+
+ size_t baseRadix[] = {5,3,2}; // list only supported primes
+ size_t baseRadixSize = sizeof(baseRadix)/sizeof(baseRadix[0]);
+
+ size_t l = length;
+ std::map primeFactors;
+ std::map primeFactorsExpanded; // NOTE(review): judging by the comparisons below, entry [p] holds the p-power part of length; the code that fills it is not visible
+ for(size_t r=0; r= 1024) { workGroupSize = (MAX_WGS >= 256) ? 256 : MAX_WGS; numTrans = 1; }
+ //else if (length == 512) { workGroupSize = (MAX_WGS >= 128) ? 128 : MAX_WGS; numTrans = 1; }
+ else if (length == 512) { workGroupSize = 64; numTrans = 1; }
+ else if (length >= 16) { workGroupSize = 64; numTrans = 256/length; }
+ else { workGroupSize = 64; numTrans = 128/length; }
+ }
+ else if (primeFactorsExpanded[3] == length) // Length is pure power of 3
+ {
+ workGroupSize = (MAX_WGS >= 256) ? 243 : 27; // 243 = 3^5, 27 = 3^3
+ if(length >= 3*workGroupSize) numTrans = 1;
+ else numTrans = (3*workGroupSize)/length;
+ }
+ else if (primeFactorsExpanded[5] == length) // Length is pure power of 5
+ {
+ workGroupSize = (MAX_WGS >= 128) ? 125 : 25; // 125 = 5^3, 25 = 5^2
+ if(length >= 5*workGroupSize) numTrans = 1;
+ else numTrans = (5*workGroupSize)/length;
+ }
+ else
+ {
+ size_t leastNumPerWI; // least number of elements in one work item
+ size_t maxWorkGroupSize; // maximum work group size desired
+
+ if (primeFactorsExpanded[2] * primeFactorsExpanded[3] == length) // Length is mix of 2&3 only
+ {
+ if(!(length%12)) { leastNumPerWI = 12; maxWorkGroupSize = (MAX_WGS >= 128) ? 128 : MAX_WGS; }
+ else { leastNumPerWI = 6; maxWorkGroupSize = (MAX_WGS >= 256) ? 256 : MAX_WGS; }
+ }
+ else if (primeFactorsExpanded[2] * primeFactorsExpanded[5] == length) // Length is mix of 2&5 only
+ {
+ if(!(length%20)) { leastNumPerWI = 20; maxWorkGroupSize = 64; }
+ else { leastNumPerWI = 10; maxWorkGroupSize = (MAX_WGS >= 128) ? 128 : MAX_WGS; }
+ }
+ else if (primeFactorsExpanded[3] * primeFactorsExpanded[5] == length) // Length is mix of 3&5 only
+ {
+ leastNumPerWI = 15;
+ maxWorkGroupSize = 64;
+ }
+ else
+ {
+ leastNumPerWI = 30; // remaining case: taken when none of the single-prime or two-prime tests above matched
+ maxWorkGroupSize = 64;
+ }
+
+
+ // Make sure the work group size does not exceed MAX_WGS
+ // for large problems sizes, this means doing more work per work-item
+ size_t lnpi;
+ size_t ft = 1;
+ while(1) // try multiples of leastNumPerWI until one divides length and needs at most MAX_WGS work-items
+ {
+ lnpi = leastNumPerWI * ft++;
+ if(length%lnpi) continue;
+
+ if( (length/lnpi) <= MAX_WGS )
+ {
+ leastNumPerWI = lnpi;
+ break;
+ }
+ }
+
+ numTrans = 1;
+ size_t n=1;
+ while( ((n*length)/leastNumPerWI) <= maxWorkGroupSize ) // keep the largest n whose work-items still fit; numTrans stays 1 if even n = 1 does not fit
+ {
+ numTrans = n;
+ n++;
+ }
+
+ workGroupSize = (numTrans*length)/leastNumPerWI;
+ assert(workGroupSize <= MAX_WGS);
+ }
+ }
+
+ // Twiddle factors table: computes cosine/sine values and appends them to a string as generated source text
+ class TwiddleTable
+ {
+ size_t N; // length
+ double *wc, *ws; // cosine, sine arrays
+
+ public:
+ TwiddleTable(size_t length) : N(length)
+ {
+ // Allocate memory for the tables
+ // We compute twiddle factors in double precision for both P_SINGLE and P_DOUBLE
+ wc = new double[N];
+ ws = new double[N];
+ }
+
+ ~TwiddleTable()
+ {
+ // Free; NOTE(review): no copy constructor/assignment is declared in the visible class, so copying an instance would delete these arrays twice
+ delete[] wc;
+ delete[] ws;
+ }
+
+ template
+ void GenerateTwiddleTable(const std::vector &radices, std::string &twStr) // appends the stringized table to twStr
+ {
+ const double TWO_PI = -6.283185307179586476925286766559; // note the sign: this constant is -2*pi
+
+ // Make sure the product of the radices equals N
+ size_t sz = 1;
+ for(std::vector::const_iterator i = radices.begin();
+ i != radices.end(); i++)
+ {
+ sz *= (*i);
+ }
+ assert(sz == N);
+
+ // Generate the table
+ size_t L = 1; // running product of the radices processed so far
+ size_t nt = 0;
+ for(std::vector::const_iterator i = radices.begin();
+ i != radices.end(); i++)
+ {
+ size_t radix = *i;
+
+ L *= radix;
+
+ // Twiddle factors
+ for(size_t k=0; k<(L/radix); k++)
+ {
+ double theta = TWO_PI * ((double)k)/((double)L);
+
+ for(size_t j=1; j();
+
+ // Stringize the table
+ std::stringstream ss;
+ for(size_t i = 0; i < (N-1); i++) // N-1 entries are emitted
+ {
+ ss << "("; ss << RegBaseType(2); ss << ")(";
+
+ char cv[64], sv[64];
+ sprintf(cv, "%036.34lf", wc[i]); // fixed notation, 34 digits after the decimal point, zero-padded to width 36
+ sprintf(sv, "%036.34lf", ws[i]);
+ ss << cv; ss << sfx; ss << ", ";
+ ss << sv; ss << sfx; ss << "),\n";
+ }
+ twStr += ss.str();
+ }
+ };
+
+
+ // Twiddle factors table for large N
+ // used in 3-step algorithm; emits a Y x X constant table plus a lookup function as generated source text
+ class TwiddleTableLarge
+ {
+ size_t N; // length
+ size_t X, Y; // table dimensions: X entries per row, Y rows
+ size_t tableSize;
+ double *wc, *ws; // cosine, sine arrays
+
+ public:
+ TwiddleTableLarge(size_t length) : N(length)
+ {
+ X = size_t(1) << ARBITRARY::TWIDDLE_DEE; // X = 2^TWIDDLE_DEE
+ Y = DivRoundingUp (CeilPo2(N), ARBITRARY::TWIDDLE_DEE); // NOTE(review): presumably the number of TWIDDLE_DEE-bit digits needed to index N values - both helpers are defined elsewhere
+ tableSize = X * Y;
+
+ // Allocate memory for the tables
+ wc = new double[tableSize];
+ ws = new double[tableSize];
+ }
+
+ ~TwiddleTableLarge()
+ {
+ // Free; NOTE(review): no copy constructor/assignment is declared in the visible class, so copying an instance would delete these arrays twice
+ delete[] wc;
+ delete[] ws;
+ }
+
+ template
+ void GenerateTwiddleTable(std::string &twStr) // appends the table definition and the lookup function to twStr
+ {
+ const double TWO_PI = -6.283185307179586476925286766559; // note the sign: this constant is -2*pi
+
+ // Generate the table
+ size_t nt = 0;
+ double phi = TWO_PI / double (N);
+ for (size_t iY = 0; iY < Y; ++iY)
+ {
+ size_t i = size_t(1) << (iY * ARBITRARY::TWIDDLE_DEE); // row iY steps the angle in units of 2^(iY*TWIDDLE_DEE)
+ for (size_t iX = 0; iX < X; ++iX)
+ {
+ size_t j = i * iX;
+
+ double c = cos(phi * (double)j);
+ double s = sin(phi * (double)j);
+
+ //if (fabs(c) < 1.0E-12) c = 0.0;
+ //if (fabs(s) < 1.0E-12) s = 0.0;
+
+ wc[nt] = c; // stored row-major: index nt = iY*X + iX
+ ws[nt++] = s;
+ }
+ }
+
+ std::string sfx = FloatSuffix();
+
+ // Stringize the table
+ std::stringstream ss;
+ nt = 0;
+
+ ss << "\n __constant ";
+ ss << RegBaseType(2);
+ ss << " " << TwTableLargeName();
+ ss << "[" << Y << "][" << X << "] = {\n";
+ for (size_t iY = 0; iY < Y; ++iY)
+ {
+ ss << "{ ";
+ for (size_t iX = 0; iX < X; ++iX)
+ {
+ char cv[64], sv[64];
+ sprintf(cv, "%036.34lf", wc[nt]); // fixed notation, 34 digits after the decimal point, zero-padded to width 36
+ sprintf(sv, "%036.34lf", ws[nt++]);
+ ss << "("; ss << RegBaseType(2); ss << ")(";
+ ss << cv; ss << sfx; ss << ", ";
+ ss << sv; ss << sfx; ss << ")";
+ ss << ", ";
+ }
+ ss << " },\n";
+ }
+ ss << "};\n\n";
+
+
+ // Twiddle calc function: the generated code combines one table entry per TWIDDLE_DEE-bit digit of u
+ ss << "__attribute__((always_inline)) ";
+ ss << RegBaseType(2);
+ ss << "\n" << TwTableLargeFunc() << "(uint u)\n{\n";
+
+ ss << "\t" "uint j = u & " << unsigned(X-1) << ";\n"; // X is a power of two, so X-1 masks the low TWIDDLE_DEE bits
+ ss << "\t" ; ss << RegBaseType(2); ss << " result = ";
+ ss << TwTableLargeName();
+ ss << "[0][j];\n";
+
+ for (size_t iY = 1; iY < Y; ++iY) // one generated shift/mask/multiply step per remaining table row
+ {
+ std::string phasor = TwTableLargeName();
+ phasor += "[";
+ phasor += SztToStr(iY);
+ phasor += "][j]";
+
+ stringpair product = ComplexMul((RegBaseType(2)).c_str(), "result", phasor.c_str()); // NOTE(review): presumably the two halves of a complex-multiply expression - ComplexMul is defined elsewhere
+
+ ss << "\t" "u >>= " << unsigned (ARBITRARY::TWIDDLE_DEE) << ";\n";
+ ss << "\t" "j = u & " << unsigned(X-1) << ";\n";
+ ss << "\t" "result = " << product.first << "\n";
+ ss << "\t" "\t" << product.second <<";\n";
+ }
+ ss << "\t" "return result;\n}\n\n";
+
+ twStr += ss.str();
+ }
+ };
+
+ // A pass inside an FFT kernel
+ template
+ class Pass
+ {
+ size_t position; // Position in the kernel
+
+ size_t algL; // 'L' value from fft algorithm
+ size_t algLS; // 'LS' value
+ size_t algR; // 'R' value
+
+ size_t length; // Length of FFT
+ size_t radix; // Base radix
+ size_t cnPerWI; // Complex numbers per work-item
+
+ size_t workGroupSize; // size of the workgroup = (length / cnPerWI)
+ // this number is essentially number of work-items needed to compute 1 transform
+ // this number will be different from the kernel class workGroupSize if there
+ // are multiple transforms per workgroup
+
+ size_t numButterfly; // Number of basic FFT butterflies = (cnPerWI / radix)
+ size_t numB1, numB2, numB4; // number of different types of butterflies
+
+ bool r2c; // real to complex transform
+ bool c2r; // complex to real transform
+ bool rcFull;
+ bool rcSimple;
+
+ bool enableGrouping;
+ bool linearRegs;
+ Pass *nextPass;
+
+ inline void RegBase(size_t regC, std::string &str) const // Appends "B" followed by regC as text to str
+ {
+ str += "B";
+ str += SztToStr(regC); // GeneratePass() reaches this via DeclareRegs() with regC = 1, 2 or 4
+ }
+
+ inline void RegBaseAndCount(size_t num, std::string &str) const // Appends "C" followed by num as text to str
+ {
+ str += "C";
+ str += SztToStr(num);
+ }
+
+ inline void RegBaseAndCountAndPos(const std::string &RealImag, size_t radPos, std::string &str) const // Appends RealImag followed by radPos as text to str
+ {
+ str += RealImag; // caller-supplied tag, appended verbatim
+ str += SztToStr(radPos);
+ }
+
+ void RegIndex(size_t regC, size_t num, const std::string &RealImag, size_t radPos, std::string &str) const // Appends a full register name to str
+ {
+ RegBase(regC, str); // "B" + regC
+ RegBaseAndCount(num, str); // "C" + num
+ RegBaseAndCountAndPos(RealImag, radPos, str); // RealImag + radPos, giving "B<regC>C<num><RealImag><radPos>" overall
+ }
+
+ void DeclareRegs(const std::string ®Type, size_t regC, size_t numB, std::string &passStr) const
+ {
+ std::string regBase;
+ RegBase(regC, regBase);
+
+ if(linearRegs)
+ {
+ assert(regC == 1);
+ assert(numB == numButterfly);
+ }
+
+ for(size_t i=0; i(2);
+
+ for(size_t i=0; i(2);
+ std::string rType = RegBaseType(1);
+
+ size_t butterflyIndex = numPrev;
+
+ std::string regBase;
+ RegBase(regC, regBase);
+
+ // special write back to global memory with float4 grouping, writing 2 complex numbers at once
+ if( numB && (numB%2 == 0) && (regC == 1) && (stride == 1) && (numButterfly%2 == 0) && (algLS%2 == 0) && (flag == SR_WRITE) &&
+ (nextPass == NULL) && interleaved && (component == SR_COMP_BOTH) && linearRegs && enableGrouping )
+ {
+ assert((numButterfly * workGroupSize) == algLS);
+ assert(bufferRe.compare(bufferIm) == 0); // Make sure Real & Imag buffer strings are same for interleaved data
+
+ passStr += "\n\t";
+ passStr += "__global "; passStr += RegBaseType(4);
+ passStr += " *buff4g = "; passStr += bufferRe; passStr += ";\n\t"; // Assuming 'outOffset' is 0, so not adding it here
+
+ for(size_t r=0; r(4); passStr += ")(";
+ passStr += regIndexA; passStr += ".x, ";
+ passStr += regIndexA; passStr += ".y, ";
+ passStr += regIndexB; passStr += ".x, ";
+ passStr += regIndexB; passStr += ".y) ";
+ if(scale != 1.0f) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix(); }
+ passStr += ";";
+
+ butterflyIndex++;
+ }
+ }
+
+ return;
+ }
+
+ for(size_t i=0; i algLS )
+ {
+ passStr += "(("; passStr += SztToStr(numButterfly);
+ passStr += "*me + "; passStr += SztToStr(butterflyIndex); passStr += ")/";
+ passStr += SztToStr(algLS); passStr += ")*"; passStr += SztToStr(algL); passStr += " + (";
+ passStr += SztToStr(numButterfly); passStr += "*me + "; passStr += SztToStr(butterflyIndex);
+ passStr += ")%"; passStr += SztToStr(algLS); passStr += " + ";
+ }
+ else
+ {
+ passStr += SztToStr(numButterfly); passStr += "*me + "; passStr += SztToStr(butterflyIndex);
+ passStr += " + ";
+ }
+
+ passStr += SztToStr(r*algLS); passStr += " )*"; passStr += SztToStr(stride); passStr += "]";
+ passStr += tail; passStr += " = "; passStr += regIndex;
+ if(scale != 1.0f) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix(); }
+ passStr += ";";
+
+ // Since we write real & imag at once, we break the loop
+ if(interleaved && (component == SR_COMP_BOTH) && linearRegs)
+ break;
+ }
+ }
+
+ butterflyIndex++;
+ }
+
+ }
+ }
+
+ assert(butterflyIndex <= numButterfly);
+ }
+
+
+ // Special SweepRegs function to carry out some R-C/C-R specific operations
+ void SweepRegsRC( size_t flag, bool fwd, bool interleaved, size_t stride, size_t component,
+ double scale, bool setZero, bool batch2, bool oddt,
+ const std::string &bufferRe, const std::string &bufferIm, const std::string &offset,
+ std::string &passStr) const
+ {
+ assert( (flag == SR_READ ) ||
+ (flag == SR_WRITE) );
+
+
+ // component: 0 - real, 1 - imaginary, 2 - both
+ size_t cStart, cEnd;
+ switch(component)
+ {
+ case SR_COMP_REAL: cStart = 0; cEnd = 1; break;
+ case SR_COMP_IMAG: cStart = 1; cEnd = 2; break;
+ case SR_COMP_BOTH: cStart = 0; cEnd = 2; break;
+ default: assert(false);
+ }
+
+ std::string rType = RegBaseType(1);
+
+ assert(r2c || c2r);
+ assert(linearRegs);
+ bool singlePass = ((position == 0) && (nextPass == NULL));
+
+ size_t numCR = numButterfly * radix;
+ if(!(numCR%2)) assert(!oddt);
+
+ size_t rStart = 0;
+ size_t rEnd = numCR;
+
+ bool oddp = ((numCR%2) && (numCR > 1) && !setZero);
+ if(oddp)
+ {
+ if(oddt) { rStart = numCR-1; rEnd = numCR+1; }
+ else { rStart = 0; rEnd = numCR-1; }
+ }
+
+ if(!oddp) assert(!oddt);
+
+ for(size_t r=rStart; r(); }
+
+ if(fwd)
+ {
+ std::string idxStr, idxStrRev;
+ idxStr += SztToStr(length/(2*workGroupSize)); idxStr += "*me +"; idxStr += oddpadd; idxStr += SztToStr(lid);
+ idxStrRev += SztToStr(length); idxStrRev += " - ("; idxStrRev += idxStr; idxStrRev += " )";
+
+ std::string val1Str, val2Str;
+
+ val1Str += "\n\t";
+ val1Str += buffer; val1Str += "["; val1Str += offset; val1Str += " + ( ";
+ val1Str += idxStr; val1Str += " )*"; val1Str += SztToStr(stride); val1Str += "]";
+ val1Str += tail; val1Str += " = ";
+
+ val2Str += "\n\t";
+ val2Str += buffer; val2Str += "["; val2Str += offset; val2Str += " + ( ";
+ val2Str += idxStrRev; val2Str += " )*"; val2Str += SztToStr(stride); val2Str += "]";
+ val2Str += tail; val2Str += " = ";
+
+ std::string real1, imag1, real2, imag2;
+
+ real1 += "("; real1 += regIndex; real1 += ".x + "; real1 += regIndexPair; real1 += ".x)*0.5";
+ imag1 += "("; imag1 += regIndex; imag1 += ".y - "; imag1 += regIndexPair; imag1 += ".y)*0.5";
+ real2 += "("; real2 += regIndex; real2 += ".y + "; real2 += regIndexPair; real2 += ".y)*0.5";
+ imag2 += "(-"; imag2 += regIndex; imag2 += ".x + "; imag2 += regIndexPair; imag2 += ".x)*0.5";
+
+ if(interleaved && (component == SR_COMP_BOTH))
+ {
+ val1Str += "("; val1Str += RegBaseType(2); val1Str += ")( ";
+ val2Str += "("; val2Str += RegBaseType(2); val2Str += ")( ";
+
+ if(!batch2) { val1Str += real1; val1Str += ", "; val1Str += "+"; val1Str += imag1;
+ val2Str += real1; val2Str += ", "; val2Str += "-"; val2Str += imag1; }
+ else { val1Str += real2; val1Str += ", "; val1Str += "+"; val1Str += imag2;
+ val2Str += real2; val2Str += ", "; val2Str += "-"; val2Str += imag2; }
+
+ val1Str += " )";
+ val2Str += " )";
+ }
+ else
+ {
+ val1Str += " (";
+ val2Str += " (";
+ if(c == 0)
+ {
+ if(!batch2) { val1Str += real1;
+ val2Str += real1; }
+ else { val1Str += real2;
+ val2Str += real2; }
+ }
+ else
+ {
+ if(!batch2) { val1Str += "+"; val1Str += imag1;
+ val2Str += "-"; val2Str += imag1; }
+ else { val1Str += "+"; val1Str += imag2;
+ val2Str += "-"; val2Str += imag2; }
+ }
+ val1Str += " )";
+ val2Str += " )";
+ }
+
+ val1Str += sclStr;
+ val2Str += sclStr;
+
+ passStr += val1Str; passStr += ";";
+ if(rcFull) { passStr += val2Str; passStr += ";"; }
+ }
+ else
+ {
+ std::string idxStr, idxStrRev;
+ idxStr += SztToStr(bid); idxStr += "*me +"; idxStr += oddpadd; idxStr += SztToStr(lid);
+ idxStrRev += SztToStr(length); idxStrRev += " - ("; idxStrRev += idxStr; idxStrRev += " )";
+
+ passStr += "\n\t";
+ passStr += buffer; passStr += "["; passStr += offset; passStr += " + ( ";
+
+ if(!batch2) passStr += idxStr;
+ else passStr += idxStrRev;
+
+ passStr += " )*"; passStr += SztToStr(stride); passStr += "]";
+ passStr += tail; passStr += " = ";
+
+ passStr += "( ";
+ if(c == 0)
+ {
+ regIndex += ".x"; regIndexPair += ".x";
+
+ if(!batch2) { passStr += regIndex; passStr += " - "; passStr += regIndexPair; }
+ else { passStr += regIndex; passStr += " + "; passStr += regIndexPair; }
+ }
+ else
+ {
+ regIndex += ".y"; regIndexPair += ".y";
+
+ if(!batch2) { passStr += regIndex; passStr += " + "; passStr += regIndexPair; }
+ else { passStr += " - "; passStr += regIndex; passStr += " + "; passStr += regIndexPair; }
+ }
+ passStr += " )";
+ passStr += sclStr;
+ passStr += ";";
+ }
+
+
+
+ // Since we write real & imag at once, we break the loop
+ if(interleaved && (component == SR_COMP_BOTH))
+ break;
+ }
+ }
+ }
+ }
+
+ }
+
+
+ void CallButterfly(const std::string &bflyName, size_t regC, size_t numB, std::string &passStr) const
+ {
+ std::string regBase;
+ RegBase(regC, regBase);
+
+ for(size_t i=0; i *np) { nextPass = np; }
+ void SetGrouping(bool grp) { enableGrouping = grp; }
+
+ void GeneratePass( bool fwd, std::string &passStr, bool fft_3StepTwiddle,
+ bool inInterleaved, bool outInterleaved,
+ bool inReal, bool outReal,
+ size_t inStride, size_t outStride, double scale,
+ bool gIn = false, bool gOut = false) const
+ {
+ const std::string bufferInRe = (inReal || inInterleaved) ? "bufIn" : "bufInRe";
+ const std::string bufferInIm = (inReal || inInterleaved) ? "bufIn" : "bufInIm";
+ const std::string bufferOutRe = (outReal || outInterleaved) ? "bufOut" : "bufOutRe";
+ const std::string bufferOutIm = (outReal || outInterleaved) ? "bufOut" : "bufOutIm";
+
+ const std::string bufferInRe2 = (inReal || inInterleaved) ? "bufIn2" : "bufInRe2";
+ const std::string bufferInIm2 = (inReal || inInterleaved) ? "bufIn2" : "bufInIm2";
+ const std::string bufferOutRe2 = (outReal || outInterleaved) ? "bufOut2" : "bufOutRe2";
+ const std::string bufferOutIm2 = (outReal || outInterleaved) ? "bufOut2" : "bufOutIm2";
+
+ // for real transforms we use only B1 butteflies (regC = 1)
+ if(r2c || c2r)
+ {
+ assert(numB1 == numButterfly);
+ assert(linearRegs);
+ }
+
+ // Check if it is single pass transform
+ bool singlePass = ((position == 0) && (nextPass == NULL));
+ if(singlePass) assert(numButterfly == 1); // for single pass transforms, there can be only 1 butterfly per transform
+ if(singlePass) assert(workGroupSize == 1);
+
+ // Register types
+ std::string regB1Type = RegBaseType(1);
+ std::string regB2Type = RegBaseType(2);
+ std::string regB4Type = RegBaseType(4);
+
+ //Function attribute
+ passStr += "__attribute__((always_inline)) void\n";
+
+ //Function name
+ passStr += PassName(position, fwd);
+
+ // Function arguments
+ passStr += "(";
+ passStr += "uint rw, uint b, uint me, uint inOffset, uint outOffset, ";
+
+ // For now, interleaved support is there for only global buffers
+ // TODO : add support for LDS interleaved
+ if(inInterleaved) assert(gIn);
+ if(outInterleaved) assert(gOut);
+
+ if(r2c || c2r)
+ {
+ if(gIn)
+ {
+ if(inInterleaved)
+ {
+ passStr += "__global "; passStr += regB2Type; passStr += " *"; passStr += bufferInRe; passStr += ", ";
+ if(!rcSimple) { passStr += "__global "; passStr += regB2Type; passStr += " *"; passStr += bufferInRe2; passStr += ", "; }
+ }
+ else if(inReal)
+ {
+ passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferInRe; passStr += ", ";
+ if(!rcSimple) { passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferInRe2; passStr += ", "; }
+ }
+ else
+ {
+ passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferInRe; passStr += ", ";
+ if(!rcSimple) { passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferInRe2; passStr += ", "; }
+ passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferInIm; passStr += ", ";
+ if(!rcSimple) { passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferInIm2; passStr += ", "; }
+ }
+ }
+ else
+ {
+ passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferInRe; passStr += ", ";
+ passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferInIm; passStr += ", ";
+ }
+
+ if(gOut)
+ {
+ if(outInterleaved)
+ {
+ passStr += "__global "; passStr += regB2Type; passStr += " *"; passStr += bufferOutRe;
+ if(!rcSimple) { passStr += ", "; passStr += "__global "; passStr += regB2Type; passStr += " *"; passStr += bufferOutRe2; }
+ }
+ else if(outReal)
+ {
+ passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferOutRe;
+ if(!rcSimple) { passStr += ", "; passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferOutRe2; }
+ }
+ else
+ {
+ passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferOutRe; passStr += ", ";
+ if(!rcSimple) { passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferOutRe2; passStr += ", "; }
+ passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferOutIm;
+ if(!rcSimple) { passStr += ", "; passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferOutIm2; }
+ }
+ }
+ else
+ {
+ passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferOutRe; passStr += ", ";
+ passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferOutIm;
+ }
+ }
+ else
+ {
+ if(gIn)
+ {
+ if(inInterleaved)
+ {
+ passStr += "__global "; passStr += regB2Type; passStr += " *"; passStr += bufferInRe; passStr += ", ";
+ }
+ else
+ {
+ passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferInRe; passStr += ", ";
+ passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferInIm; passStr += ", ";
+ }
+ }
+ else
+ {
+ passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferInRe; passStr += ", ";
+ passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferInIm; passStr += ", ";
+ }
+
+
+ if(gOut)
+ {
+ if(outInterleaved)
+ {
+ passStr += "__global "; passStr += regB2Type; passStr += " *"; passStr += bufferOutRe;
+ }
+ else
+ {
+ passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferOutRe; passStr += ", ";
+ passStr += "__global "; passStr += regB1Type; passStr += " *"; passStr += bufferOutIm;
+ }
+ }
+ else
+ {
+ passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferOutRe; passStr += ", ";
+ passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferOutIm;
+ }
+ }
+
+ // Register arguments
+ if(linearRegs)
+ {
+ passStr += ", "; passStr += IterRegArgs();
+ }
+ passStr += ")\n{\n";
+
+ // Register Declarations
+ if(!linearRegs)
+ {
+ DeclareRegs(regB1Type, 1, numB1, passStr);
+ DeclareRegs(regB2Type, 2, numB2, passStr);
+ DeclareRegs(regB4Type, 4, numB4, passStr);
+ }
+
+ // odd cnPerWI processing
+ bool oddp = false;
+ oddp = ((cnPerWI%2) && (length > 1) && (!singlePass));
+
+ // additional register for odd
+ if( !rcSimple && oddp && ((r2c && (nextPass == NULL)) || (c2r && (position == 0))) )
+ {
+ passStr += "\n\t";
+ passStr += "uint brv = 0;\n\t";
+ passStr += "\n\t";
+ passStr += regB2Type; passStr += " R"; passStr += SztToStr(cnPerWI); passStr += "[1];\n\t";
+ passStr += "(*R"; passStr += SztToStr(cnPerWI); passStr += ").x = 0; ";
+ passStr += "(*R"; passStr += SztToStr(cnPerWI); passStr += ").y = 0;\n";
+ }
+
+ // Special private memory for c-r 1 pass transforms
+ if( !rcSimple && (c2r && (position == 0)) && singlePass )
+ {
+ assert(radix == length);
+
+ passStr += "\n\t";
+ passStr += regB1Type;
+ passStr += " mpvt["; passStr += SztToStr(length); passStr += "];\n";
+ }
+
+ passStr += "\n";
+
+ // Read into registers
+ if(r2c)
+ {
+ if(position == 0)
+ {
+ passStr += "\n\tif(rw)\n\t{";
+ SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
+ passStr += "\n\t}\n";
+
+ if(rcSimple)
+ {
+ passStr += "\n";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, true, true, false, bufferInRe2, bufferInIm2, "inOffset", passStr);
+ passStr += "\n";
+ }
+ else
+ {
+ passStr += "\n\tif(rw > 1)\n\t{";
+ SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, bufferInRe2, bufferInIm2, "inOffset", 1, numB1, 0, passStr);
+ passStr += "\n\t}\n";
+
+ passStr += "\telse\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, true, true, false, bufferInRe2, bufferInIm2, "inOffset", passStr);
+ passStr += "\n\t}\n";
+ }
+ }
+ }
+ else if(c2r && !rcSimple)
+ {
+ if(position == 0)
+ {
+ std::string processBufRe = bufferOutRe;
+ std::string processBufIm = bufferOutIm;
+ std::string processBufOffset = "outOffset";
+ size_t processBufStride = outStride;
+
+ if(singlePass)
+ {
+ processBufRe = "mpvt";
+ processBufIm = "mpvt";
+ processBufOffset = "0";
+ processBufStride = 1;
+ }
+
+ passStr += "\n\tif(rw && !me)\n\t{\n\t";
+ passStr += processBufRe; passStr += "["; passStr += processBufOffset; passStr += "] = ";
+ passStr += bufferInRe; passStr+= "[inOffset]";
+ if(inInterleaved) passStr += ".x;\n\t}"; else passStr += ";\n\t}";
+
+ if(length > 1)
+ {
+ passStr += "\n\n\tif(rw)\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, false, false, bufferInRe, bufferInRe, "inOffset", passStr);
+ passStr += "\n\t}\n";
+
+ passStr += "\n\tif(rw > 1)\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, true, false, bufferInIm2, bufferInIm2, "inOffset", passStr);
+ passStr += "\n\t}\n\telse\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, true, true, false, bufferInIm2, bufferInIm2, "inOffset", passStr);
+ passStr += "\n\t}\n";
+
+ if(oddp)
+ {
+ passStr += "\n\tif(rw && (me%2))\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, false, true, bufferInRe, bufferInRe, "inOffset", passStr);
+ passStr += "\n\t}";
+ passStr += "\n\tif((rw > 1) && (me%2))\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, true, true, bufferInIm2, bufferInIm2, "inOffset", passStr);
+ passStr += "\n\t}\n";
+ }
+
+
+ SweepRegsRC(SR_WRITE, fwd, outInterleaved, processBufStride, SR_COMP_REAL, 1.0f, false, true, false, processBufRe, processBufIm, processBufOffset, passStr);
+ if(oddp)
+ {
+ passStr += "\n\tif(me%2)\n\t{";
+ SweepRegsRC(SR_WRITE, fwd, outInterleaved, processBufStride, SR_COMP_REAL, 1.0f, false, true, true, processBufRe, processBufIm, processBufOffset, passStr);
+ passStr += "\n\t}\n";
+ }
+ SweepRegsRC(SR_WRITE, fwd, outInterleaved, processBufStride, SR_COMP_REAL, 1.0f, false, false, false, processBufRe, processBufIm, processBufOffset, passStr);
+ if(oddp)
+ {
+ passStr += "\n\tif(me%2)\n\t{";
+ SweepRegsRC(SR_WRITE, fwd, outInterleaved, processBufStride, SR_COMP_REAL, 1.0f, false, false, true, processBufRe, processBufIm, processBufOffset, passStr);
+ passStr += "\n\t}\n";
+ }
+ }
+
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ SweepRegs(SR_READ, fwd, outInterleaved, processBufStride, SR_COMP_REAL, 1.0f, processBufRe, processBufIm, processBufOffset, 1, numB1, 0, passStr);
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+
+
+ passStr += "\n\tif((rw > 1) && !me)\n\t{\n\t";
+ passStr += processBufIm; passStr += "["; passStr += processBufOffset; passStr += "] = ";
+ passStr += bufferInRe2; passStr+= "[inOffset]";
+ if(inInterleaved) passStr += ".x;\n\t}"; else passStr += ";\n\t}";
+ passStr += "\n\tif((rw == 1) && !me)\n\t{\n\t"; passStr += processBufIm; passStr += "["; passStr += processBufOffset; passStr += "] = 0;\n\t}";
+
+
+ if(length > 1)
+ {
+ passStr += "\n\n\tif(rw)\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, false, false, bufferInIm, bufferInIm, "inOffset", passStr);
+ passStr += "\n\t}\n";
+
+ passStr += "\n\tif(rw > 1)\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, true, false, bufferInRe2, bufferInRe2, "inOffset", passStr);
+ passStr += "\n\t}\n\telse\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, true, true, false, bufferInRe2, bufferInRe2, "inOffset", passStr);
+ passStr += "\n\t}\n";
+
+ if(oddp)
+ {
+ passStr += "\n\tif(rw && (me%2))\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, false, true, bufferInIm, bufferInIm, "inOffset", passStr);
+ passStr += "\n\t}";
+ passStr += "\n\tif((rw > 1) && (me%2))\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, true, true, bufferInRe2, bufferInRe2, "inOffset", passStr);
+ passStr += "\n\t}\n";
+ }
+
+ SweepRegsRC(SR_WRITE, fwd, outInterleaved, processBufStride, SR_COMP_IMAG, 1.0f, false, true, false, processBufRe, processBufIm, processBufOffset, passStr);
+ if(oddp)
+ {
+ passStr += "\n\tif(me%2)\n\t{";
+ SweepRegsRC(SR_WRITE, fwd, outInterleaved, processBufStride, SR_COMP_IMAG, 1.0f, false, true, true, processBufRe, processBufIm, processBufOffset, passStr);
+ passStr += "\n\t}\n";
+ }
+ SweepRegsRC(SR_WRITE, fwd, outInterleaved, processBufStride, SR_COMP_IMAG, 1.0f, false, false, false, processBufRe, processBufIm, processBufOffset, passStr);
+ if(oddp)
+ {
+ passStr += "\n\tif(me%2)\n\t{";
+ SweepRegsRC(SR_WRITE, fwd, outInterleaved, processBufStride, SR_COMP_IMAG, 1.0f, false, false, true, processBufRe, processBufIm, processBufOffset, passStr);
+ passStr += "\n\t}\n";
+ }
+ }
+
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ SweepRegs(SR_READ, fwd, outInterleaved, processBufStride, SR_COMP_IMAG, 1.0f, processBufRe, processBufIm, processBufOffset, 1, numB1, 0, passStr);
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ }
+ }
+ else
+ {
+ if( (!linearRegs) || (linearRegs && (position == 0)) )
+ {
+ passStr += "\n\tif(rw)\n\t{";
+ SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "inOffset", 2, numB2, numB1, passStr);
+ SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "inOffset", 4, numB4, 2*numB2 + numB1, passStr);
+ passStr += "\n\t}\n";
+ }
+ }
+
+
+ passStr += "\n";
+
+ // Twiddle multiply
+ if( (position > 0) && (radix > 1) )
+ {
+ SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+ SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 2, numB2, numB1, passStr);
+ SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 4, numB4, 2*numB2 + numB1, passStr);
+ }
+
+ // Butterfly calls
+ if(radix > 1)
+ {
+ if(numB1) CallButterfly(ButterflyName(radix, 1, fwd), 1, numB1, passStr);
+ if(numB2) CallButterfly(ButterflyName(radix, 2, fwd), 2, numB2, passStr);
+ if(numB4) CallButterfly(ButterflyName(radix, 4, fwd), 4, numB4, passStr);
+ }
+
+ passStr += "\n";
+
+ if( (position != 0) && (!linearRegs) && (nextPass != NULL) )
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+
+ passStr += "\n";
+
+ // 3-step twiddle multiplies
+ if(fft_3StepTwiddle)
+ {
+ assert(nextPass == NULL);
+ if(linearRegs)
+ {
+ SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+ }
+ else
+ {
+ SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+ SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 2, numB2, numB1, passStr);
+ SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 4, numB4, 2*numB2 + numB1, passStr);
+ }
+ }
+
+ // Write back from registers
+ if(linearRegs)
+ {
+ // In this case, we have to write and then read back again for the next pass, since we are
+ // using only half the lds. Halving the lds usage comes at the cost of a higher number of barriers.
+
+ if(nextPass == NULL) // last pass
+ {
+ if(r2c && !rcSimple)
+ {
+ if(!singlePass)
+ {
+ SweepRegs(SR_WRITE, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, false, false, bufferInRe, bufferInIm, "inOffset", passStr);
+ if(oddp)
+ {
+ passStr += "\n\tif(me%2)\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, false, true, bufferInRe, bufferInIm, "inOffset", passStr);
+ passStr += "\n\t}\n";
+ }
+
+ passStr += "\n\tif(rw && !me)\n\t{\n\t";
+ if(outInterleaved)
+ {
+ passStr += bufferOutRe; passStr+= "[outOffset].x = "; passStr += bufferInRe; passStr += "[inOffset]";
+ if(scale != 1.0) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix(); } passStr += ";\n\t";
+ passStr += bufferOutIm; passStr+= "[outOffset].y = "; passStr += "0;\n\t}";
+ }
+ else
+ {
+ passStr += bufferOutRe; passStr+= "[outOffset] = "; passStr += bufferInRe; passStr += "[inOffset]";
+ if(scale != 1.0) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix(); } passStr += ";\n\t";
+ passStr += bufferOutIm; passStr+= "[outOffset] = "; passStr += "0;\n\t}";
+ }
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+
+
+ SweepRegs(SR_WRITE, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, false, false, bufferInRe, bufferInIm, "inOffset", passStr);
+ if(oddp)
+ {
+ passStr += "\n\tif(me%2)\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, false, true, bufferInRe, bufferInIm, "inOffset", passStr);
+ passStr += "\n\t}\n";
+ }
+
+ passStr += "\n\tif((rw > 1) && !me)\n\t{\n\t";
+ if(outInterleaved)
+ {
+ passStr += bufferOutRe2; passStr+= "[outOffset].x = "; passStr += bufferInIm; passStr += "[inOffset]";
+ if(scale != 1.0) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix(); } passStr += ";\n\t";
+ passStr += bufferOutIm2; passStr+= "[outOffset].y = "; passStr += "0;\n\t}";
+ }
+ else
+ {
+ passStr += bufferOutRe2; passStr+= "[outOffset] = "; passStr += bufferInIm; passStr += "[inOffset]";
+ if(scale != 1.0) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix(); } passStr += ";\n\t";
+ passStr += bufferOutIm2; passStr+= "[outOffset] = "; passStr += "0;\n\t}";
+ }
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ }
+
+
+ passStr += "\n\n\tif(rw)\n\t{";
+ SweepRegsRC(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, false, false, bufferOutRe, bufferOutIm, "outOffset", passStr);
+ passStr += "\n\t}\n";
+ if(oddp)
+ {
+ passStr += "\n\n\tbrv = ((rw != 0) & (me%2 == 1));\n\t";
+ passStr += "if(brv)\n\t{";
+ SweepRegsRC(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, false, true, bufferOutRe, bufferOutIm, "outOffset", passStr);
+ passStr += "\n\t}\n";
+ }
+
+ passStr += "\n\n\tif(rw > 1)\n\t{";
+ SweepRegsRC(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, true, false, bufferOutRe2, bufferOutIm2, "outOffset", passStr);
+ passStr += "\n\t}\n";
+ if(oddp)
+ {
+ passStr += "\n\n\tbrv = ((rw > 1) & (me%2 == 1));\n\t";
+ passStr += "if(brv)\n\t{";
+ SweepRegsRC(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, true, true, bufferOutRe2, bufferOutIm2, "outOffset", passStr);
+ passStr += "\n\t}\n";
+ }
+
+ }
+ else if(c2r)
+ {
+ passStr += "\n\tif(rw)\n\t{";
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+ passStr += "\n\t}\n";
+
+ if(!rcSimple)
+ {
+ passStr += "\n\tif(rw > 1)\n\t{";
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, bufferOutRe2, bufferOutIm2, "outOffset", 1, numB1, 0, passStr);
+ passStr += "\n\t}\n";
+ }
+ }
+ else
+ {
+ passStr += "\n\tif(rw)\n\t{";
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+ passStr += "\n\t}\n";
+ }
+ }
+ else
+ {
+ passStr += "\n\tif(rw)\n\t{";
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+ passStr += "\n\t}\n";
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ passStr += "\n\tif(rw)\n\t{";
+ nextPass->SweepRegs(SR_READ, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, bufferOutRe, bufferOutIm, "outOffset", 1, nextPass->GetNumB1(), 0, passStr);
+ passStr += "\n\t}\n";
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ passStr += "\n\tif(rw)\n\t{";
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+ passStr += "\n\t}\n";
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ passStr += "\n\tif(rw)\n\t{";
+ nextPass->SweepRegs(SR_READ, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, bufferOutRe, bufferOutIm, "outOffset", 1, nextPass->GetNumB1(), 0, passStr);
+ passStr += "\n\t}\n";
+ passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ }
+ }
+ else
+ {
+ passStr += "\n\tif(rw)\n\t{";
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, bufferOutRe, bufferOutIm, "outOffset", 2, numB2, numB1, passStr);
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, bufferOutRe, bufferOutIm, "outOffset", 4, numB4, 2*numB2 + numB1, passStr);
+ passStr += "\n\t}\n";
+ }
+
+
+ passStr += "\n}\n\n";
+ }
+ };
+
+ // FFT kernel
+ template
+ class Kernel
+ {
+ size_t length; // Length of FFT
+ size_t workGroupSize; // Work group size
+ size_t cnPerWI; // complex numbers per work-item
+
+ size_t numTrans; // Number of transforms per work-group
+ size_t workGroupSizePerTrans; // Work group subdivision per transform
+ size_t numPasses; // Number of FFT passes
+ std::vector radices; // Base radix at each pass
+ std::vector > passes; // Array of pass objects
+
+ bool halfLds; // LDS used to store one component (either real or imaginary) at a time
+ // for passing intermediate data between the passes, if this is set
+ // then each pass-function should accept same set of registers
+
+ // Future optimization ideas
+ // bool limitRegs; // TODO: Incrementally write to LDS, thereby reusing the same set of registers for more than one butterfly
+ // bool combineReadTwMul; // TODO: Combine reading into registers and Twiddle multiply
+
+ bool r2c2r; // real to complex or complex to real transform
+ bool r2c, c2r;
+ bool rcFull;
+ bool rcSimple;
+
+ const FFTKernelGenKeyParams params; // key params
+
+
+ inline std::string IterRegs(const std::string &pfx, bool initComma = true)
+ {
+ std::string str = "";
+
+ if(halfLds)
+ {
+ if(initComma) str += ", ";
+
+ for(size_t i=0; i kcs;
+ kcs.GetRadices(length, nPasses, pRadices);
+ if((params.fft_MaxWorkGroupSize >= 256) && (pRadices != NULL))
+ {
+ for(size_t i=0; i(i, length, rad, cnPerWI, L, LS, R, linearRegs, r2c, c2r, rcFull, rcSimple));
+
+ LS *= rad;
+ }
+ assert(R == 1); // this has to be true for correct radix composition of the length
+ numPasses = nPasses;
+ }
+ else
+ {
+ // Possible radices
+ size_t cRad[] = {10,8,6,5,4,3,2,1}; // Must be in descending order
+ size_t cRadSize = (sizeof(cRad)/sizeof(cRad[0]));
+
+ while(true)
+ {
+ size_t rad;
+
+ assert(cRadSize >= 1);
+ for(size_t r=0; r cnPerWI) || (cnPerWI%rad))
+ continue;
+
+ if(!(R % rad))
+ break;
+ }
+
+ assert((cnPerWI%rad) == 0);
+
+ L = LS * rad;
+ R /= rad;
+
+ radices.push_back(rad);
+ passes.push_back(Pass(pid, length, rad, cnPerWI, L, LS, R, linearRegs, r2c, c2r, rcFull, rcSimple));
+
+ pid++;
+ LS *= rad;
+
+ assert(R >= 1);
+ if(R == 1)
+ break;
+ }
+ numPasses = pid;
+ }
+
+ assert(numPasses == passes.size());
+ assert(numPasses == radices.size());
+
+#ifdef PARMETERS_TO_BE_READ
+
+ ParamRead pr;
+ ReadParameterFile(pr);
+
+ radices.clear();
+ passes.clear();
+
+ radices = pr.radices;
+ numPasses = radices.size();
+
+ LS = 1;
+ R = length;
+ for(size_t i=0; i(i, length, rad, cnPerWI, L, LS, R, linearRegs));
+
+ LS *= rad;
+ }
+ assert(R == 1);
+#endif
+
+ // Grouping read/writes ok?
+ bool grp = IsGroupedReadWritePossible();
+ for(size_t i=0; i < numPasses; i++)
+ passes[i].SetGrouping(grp);
+
+ // Store the next pass-object pointers
+ if(numPasses > 1)
+ for(size_t i=0; i < (numPasses - 1); i++)
+ passes[i].SetNextPass(&passes[i+1]);
+
+ }
+
+ void GenerateKernel(std::string &str) // Appends the generated OpenCL source (pragma, twiddle tables, macros, butterflies, pass functions, __kernel entry points) to 'str'
+ {
+ std::string twType = RegBaseType(2); // type name used to declare the twiddle table entries
+ std::string rType = RegBaseType(1); // type name used for planar/real buffers and for the LDS array
+ std::string r2Type = RegBaseType(2); // type name used for interleaved buffers
+
+ bool inInterleaved; // Input is interleaved format
+ bool outInterleaved; // Output is interleaved format
+ inInterleaved = ( (params.fft_inputLayout == CLFFT_COMPLEX_INTERLEAVED) ||
+ (params.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+ outInterleaved = ( (params.fft_outputLayout == CLFFT_COMPLEX_INTERLEAVED) ||
+ (params.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+
+ bool inReal; // Input is real format
+ bool outReal; // Output is real format
+ inReal = (params.fft_inputLayout == CLFFT_REAL) ? true : false;
+ outReal = (params.fft_outputLayout == CLFFT_REAL) ? true : false;
+
+ size_t large1D = params.fft_N[0] * params.fft_N[1]; // product of the first two lengths; only consumed by the 3-step twiddle table below
+
+ // Pragma
+ str += ClPragma();
+
+ // Twiddle table: emitted as a __constant array of (length-1) entries, skipped when length == 1
+ if(length > 1)
+ {
+ TwiddleTable twTable(length);
+
+ str += "\n__constant ";
+ str += twType; str += " ";
+ str += TwTableName();
+ str += "["; str += SztToStr(length-1); str += "] = {\n";
+ twTable.GenerateTwiddleTable(radices, str);
+ str += "};\n\n";
+ }
+ str += "\n";
+
+ // twiddle factors for 1d-large 3-step algorithm
+ if(params.fft_3StepTwiddle)
+ {
+ TwiddleTableLarge twLarge(large1D);
+ twLarge.GenerateTwiddleTable(str);
+ }
+
+ std::string sfx = FloatSuffix(); // literal suffix appended to every numeric macro below
+
+ // Vector type
+ str += "#define fvect2 "; str += RegBaseType(2); str += "\n\n";
+
+ // Numeric constants, emitted as macros (presumably consumed by the generated butterflies - confirm)
+ str += "#define C8Q 0.70710678118654752440084436210485"; str += sfx; str += "\n";
+
+ str += "#define C5QA 0.30901699437494742410229341718282"; str += sfx; str += "\n";
+ str += "#define C5QB 0.95105651629515357211643933337938"; str += sfx; str += "\n";
+ str += "#define C5QC 0.50000000000000000000000000000000"; str += sfx; str += "\n";
+ str += "#define C5QD 0.58778525229247312916870595463907"; str += sfx; str += "\n";
+ str += "#define C5QE 0.80901699437494742410229341718282"; str += sfx; str += "\n";
+
+ str += "#define C3QA 0.50000000000000000000000000000000"; str += sfx; str += "\n";
+ str += "#define C3QB 0.86602540378443864676372317075294"; str += sfx; str += "\n";
+ str += "\n";
+
+ bool cReg = halfLds ? true : false; // forwarded to every Butterfly constructed below
+
+ // Generate butterflies for all unique radices (sorted, de-duplicated copy of 'radices')
+ std::list uradices;
+ for(std::vector::const_iterator r = radices.begin(); r != radices.end(); r++)
+ uradices.push_back(*r);
+
+ uradices.sort();
+ uradices.unique();
+
+ typename std::vector< Pass >::const_iterator p;
+ if(length > 1)
+ {
+ for(std::list::const_iterator r = uradices.begin(); r != uradices.end(); r++)
+ {
+ size_t rad = *r;
+ p = passes.begin();
+ while(p->GetRadix() != rad) p++; // first pass with this radix; assumes every entry of 'radices' has a matching pass, otherwise this runs past end()
+
+ for(size_t d=0; d<2; d++) // d == 0: forward, d == 1: backward
+ {
+ bool fwd = d ? false : true;
+
+ if(p->GetNumB1()) { Butterfly bfly(rad, 1, fwd, cReg); bfly.GenerateButterfly(str); str += "\n"; }
+ if(p->GetNumB2()) { Butterfly bfly(rad, 2, fwd, cReg); bfly.GenerateButterfly(str); str += "\n"; }
+ if(p->GetNumB4()) { Butterfly bfly(rad, 4, fwd, cReg); bfly.GenerateButterfly(str); str += "\n"; }
+ }
+ }
+ }
+
+ // Generate passes: both directions for complex transforms, a single direction (fwd == r2c) for real transforms
+ for(size_t d=0; d<2; d++)
+ {
+ bool fwd;
+
+ if(r2c2r)
+ {
+ fwd = r2c;
+ }
+ else
+ {
+ fwd = d ? false : true;
+ }
+
+ double scale = fwd ? params.fft_fwdScale : params.fft_backScale;
+ bool tw3Step = false;
+
+ for(p = passes.begin(); p != passes.end(); p++)
+ {
+ double s = 1.0; // the values declared here are the defaults for interior passes; the first/last pass override them below
+ size_t ins = 1, outs = 1;
+ bool gIn = false, gOut = false;
+ bool inIlvd = false, outIlvd = false;
+ bool inRl = false, outRl = false;
+ if(p == passes.begin()) { inIlvd = inInterleaved; inRl = inReal; gIn = true; ins = params.fft_inStride[0]; }
+ if((p+1) == passes.end()) { outIlvd = outInterleaved; outRl = outReal; gOut = true; outs = params.fft_outStride[0]; s = scale; tw3Step = params.fft_3StepTwiddle; }
+
+ p->GeneratePass(fwd, str, tw3Step, inIlvd, outIlvd, inRl, outRl, ins, outs, s, gIn, gOut);
+ }
+
+ // if real transform we do only 1 direction
+ if(r2c2r)
+ break;
+ }
+
+ // TODO : address this kludge (cb_t is the element type of the 'cb' __constant kernel argument declared below)
+ str += " typedef union { uint u; int i; } cb_t;\n\n";
+
+ for(size_t d=0; d<2; d++) // one __kernel per direction; real transforms emit only one (see the break at the end of this loop)
+ {
+ bool fwd;
+
+ if(r2c2r)
+ {
+ fwd = inReal ? true : false; // real input layout selects the fft_fwd kernel, otherwise fft_back
+ }
+ else
+ {
+ fwd = d ? false : true;
+ }
+
+ // FFT kernel begin
+ // Function attribute
+ str += "__kernel __attribute__((reqd_work_group_size (";
+ str += SztToStr(workGroupSize); str += ",1,1)))\nvoid ";
+
+ // Function name
+ if(fwd) str += "fft_fwd";
+ else str += "fft_back";
+ str += "(";
+
+ // TODO : address this kludge
+ str += "__constant cb_t *cb __attribute__((max_constant_size(32))), ";
+
+ // Buffer parameters (depend on placeness, layout and real/complex transform)
+ if(params.fft_placeness == CLFFT_INPLACE)
+ {
+ if(r2c2r)
+ {
+ if(outInterleaved)
+ {
+ str += "__global "; str += r2Type; str += " * restrict gb)\n";
+ }
+ else
+ {
+ str += "__global "; str += rType; str += " * restrict gb)\n";
+ }
+ }
+ else
+ {
+ assert(inInterleaved == outInterleaved);
+ assert(params.fft_inStride[1] == params.fft_outStride[1]);
+ assert(params.fft_inStride[0] == params.fft_outStride[0]);
+
+ if(inInterleaved)
+ {
+ str += "__global "; str += r2Type; str += " * restrict gb)\n";
+ }
+ else
+ {
+ str += "__global "; str += rType; str += " * restrict gbRe, ";
+ str += "__global "; str += rType; str += " * restrict gbIm)\n";
+ }
+ }
+ }
+ else
+ {
+ if(r2c2r)
+ {
+ if(inInterleaved)
+ {
+ str += "__global "; str += r2Type; str += " * restrict gbIn, ";
+ }
+ else if(inReal)
+ {
+ str += "__global "; str += rType; str += " * restrict gbIn, ";
+ }
+ else
+ {
+ str += "__global const "; str += rType; str += " * restrict gbInRe, ";
+ str += "__global const "; str += rType; str += " * restrict gbInIm, ";
+ }
+
+ if(outInterleaved)
+ {
+ str += "__global "; str += r2Type; str += " * restrict gbOut)\n";
+ }
+ else if(outReal)
+ {
+ str += "__global "; str += rType; str += " * restrict gbOut)\n";
+ }
+ else
+ { // NOTE(review): these output parameters are emitted as const, yet the non-const lwbOutRe/lwbOutIm pointers are later derived from them - confirm this is intended
+ str += "__global const "; str += rType; str += " * restrict gbOutRe, ";
+ str += "__global const "; str += rType; str += " * restrict gbOutIm)\n";
+ }
+ }
+ else
+ {
+ if(inInterleaved)
+ {
+ str += "__global const "; str += r2Type; str += " * restrict gbIn, ";
+ }
+ else
+ {
+ str += "__global const "; str += rType; str += " * restrict gbInRe, ";
+ str += "__global const "; str += rType; str += " * restrict gbInIm, ";
+ }
+
+ if(outInterleaved)
+ {
+ str += "__global "; str += r2Type; str += " * restrict gbOut)\n";
+ }
+ else
+ {
+ str += "__global "; str += rType; str += " * restrict gbOutRe, ";
+ str += "__global "; str += rType; str += " * restrict gbOutIm)\n";
+ }
+ }
+ }
+
+ str += "{\n";
+
+ // Initialize
+ str += "\t";
+ str += "uint me = get_local_id(0);\n\t";
+ str += "uint batch = get_group_id(0);";
+ str += "\n";
+
+ // Allocate LDS (declared only for multi-pass kernels); halfLds uses half the element count
+ size_t ldsSize = halfLds ? length*numTrans : 2*length*numTrans;
+ if(numPasses > 1)
+ {
+ str += "\n\t";
+ str += "__local "; str += rType; str += " lds[";
+ str += SztToStr(ldsSize); str += "];\n";
+ }
+
+ // Declare memory pointers (the '...2' variants are declared only when !rcSimple)
+ str += "\n\t";
+ if(r2c2r)
+ {
+ str += "uint iOffset;\n\t";
+ str += "uint oOffset;\n\n\t";
+ if(!rcSimple)
+ {
+ str += "uint iOffset2;\n\t";
+ str += "uint oOffset2;\n\n\t";
+ }
+
+ if(inInterleaved)
+ {
+ if(!rcSimple) { str += "__global "; str += r2Type; str += " *lwbIn2;\n\t"; }
+ str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
+ }
+ else if(inReal)
+ {
+ if(!rcSimple) { str += "__global "; str += rType; str += " *lwbIn2;\n\t"; }
+ str += "__global "; str += rType; str += " *lwbIn;\n\t";
+
+ }
+ else
+ {
+ if(!rcSimple) { str += "__global "; str += rType; str += " *lwbInRe2;\n\t"; }
+ if(!rcSimple) { str += "__global "; str += rType; str += " *lwbInIm2;\n\t"; }
+ str += "__global "; str += rType; str += " *lwbInRe;\n\t";
+ str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+ }
+
+ if(outInterleaved)
+ {
+ if(!rcSimple) { str += "__global "; str += r2Type; str += " *lwbOut2;\n\t"; }
+ str += "__global "; str += r2Type; str += " *lwbOut;\n\n";
+
+ }
+ else if(outReal)
+ {
+ if(!rcSimple) { str += "__global "; str += rType; str += " *lwbOut2;\n\t"; }
+ str += "__global "; str += rType; str += " *lwbOut;\n\n";
+
+ }
+ else
+ {
+ if(!rcSimple) { str += "__global "; str += rType; str += " *lwbOutRe2;\n\t"; }
+ if(!rcSimple) { str += "__global "; str += rType; str += " *lwbOutIm2;\n\t"; }
+ str += "__global "; str += rType; str += " *lwbOutRe;\n\t";
+ str += "__global "; str += rType; str += " *lwbOutIm;\n\n";
+
+ }
+ }
+ else
+ {
+ if(params.fft_placeness == CLFFT_INPLACE)
+ {
+ str += "uint ioOffset;\n\t";
+
+ if(inInterleaved)
+ {
+ str += "__global "; str += r2Type; str += " *lwb;\n\n";
+ }
+ else
+ {
+ str += "__global "; str += rType; str += " *lwbRe;\n\t";
+ str += "__global "; str += rType; str += " *lwbIm;\n\n";
+ }
+ }
+ else
+ {
+ str += "uint iOffset;\n\t";
+ str += "uint oOffset;\n\t";
+
+ if(inInterleaved)
+ {
+ str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
+ }
+ else
+ {
+ str += "__global "; str += rType; str += " *lwbInRe;\n\t";
+ str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+ }
+
+ if(outInterleaved)
+ {
+ str += "__global "; str += r2Type; str += " *lwbOut;\n\n";
+ }
+ else
+ {
+ str += "__global "; str += rType; str += " *lwbOutRe;\n\t";
+ str += "__global "; str += rType; str += " *lwbOutIm;\n\n";
+ }
+ }
+ }
+
+ // Setup registers if needed (only in halfLds mode; names come from IterRegs)
+ if(halfLds)
+ {
+ str += "\t"; str += RegBaseType(2);
+ str += " "; str += IterRegs("", false);
+ str += ";\n\n";
+ }
+
+ // Calculate total transform count: expression string fft_N[1] * ... * fft_N[fft_DataDim-2] * cb[fft_DataDim-2].u
+ std::string totalBatch = "(";
+ size_t i = 0;
+ while(i < (params.fft_DataDim - 2))
+ {
+ totalBatch += SztToStr(params.fft_N[i+1]); totalBatch += " * ";
+ i++;
+ }
+ totalBatch += "cb["; totalBatch += SztToStr(i); totalBatch += "].u)";
+
+ // Conditional read-write ('rw') for arbitrary batch number
+ if(r2c2r && !rcSimple)
+ { // 'this' = totalBatch minus the 2*numTrans transforms per preceding work-group; rw = 'this' minus 2 per preceding work-item group, or 0 when out of range
+ str += "\tuint this = "; str += totalBatch; str += " - batch*";
+ str += SztToStr(2*numTrans); str += ";\n";
+ str += "\tuint rw = (me < ((this+1)/2)*"; str += SztToStr(workGroupSizePerTrans);
+ str += ") ? (this - 2*(me/"; str += SztToStr(workGroupSizePerTrans); str += ")) : 0;\n\n";
+ }
+ else
+ {
+ if(numTrans > 1) // with a single transform per work-group no 'rw' variable is emitted; the literal 1 is passed to the passes instead
+ {
+ str += "\tuint rw = (me < ("; str += totalBatch;
+ str += " - batch*"; str += SztToStr(numTrans); str += ")*";
+ str += SztToStr(workGroupSizePerTrans); str += ") ? 1 : 0;\n\n";
+ }
+ }
+
+ // Transform index for 3-step twiddles ('b' is the transform index modulo fft_N[1], or 0 when 3-step twiddles are off)
+ if(params.fft_3StepTwiddle)
+ {
+ if(numTrans == 1)
+ {
+ str += "\tuint b = batch%";
+ }
+ else
+ {
+ str += "\tuint b = (batch*"; str += SztToStr(numTrans); str += " + (me/";
+ str += SztToStr(workGroupSizePerTrans); str += "))%";
+ }
+
+ str += SztToStr(params.fft_N[1]); str += ";\n\n";
+ }
+ else
+ {
+ str += "\tuint b = 0;\n\n";
+ }
+
+ // Setup memory pointers
+ if(r2c2r)
+ {
+ str += OffsetCalc("iOffset", true);
+ str += OffsetCalc("oOffset", false);
+ if(!rcSimple) { str += OffsetCalc("iOffset2", true, true); }
+ if(!rcSimple) { str += OffsetCalc("oOffset2", false, true); }
+
+ str += "\n\t";
+ if(params.fft_placeness == CLFFT_INPLACE)
+ { // NOTE(review): this branch always assigns lwbIn/lwbOut, which the declarations above do not emit for planar layouts - presumably planar in-place real transforms never reach here; confirm
+ if(inInterleaved)
+ {
+ if(!rcSimple) { str += "lwbIn2 = (__global "; str += r2Type; str += " *)gb + iOffset2;\n\t"; }
+ str += "lwbIn = (__global "; str += r2Type; str += " *)gb + iOffset;\n\t";
+ }
+ else
+ {
+ if(!rcSimple) { str += "lwbIn2 = (__global "; str += rType; str += " *)gb + iOffset2;\n\t"; }
+ str += "lwbIn = (__global "; str += rType; str += " *)gb + iOffset;\n\t";
+
+ }
+
+ if(!rcSimple) { str += "lwbOut2 = gb + oOffset2;\n\t"; }
+ str += "lwbOut = gb + oOffset;\n\n";
+
+ }
+ else
+ {
+ if(inInterleaved || inReal)
+ {
+ if(!rcSimple) { str += "lwbIn2 = gbIn + iOffset2;\n\t"; }
+ str += "lwbIn = gbIn + iOffset;\n\t";
+ }
+ else
+ {
+ if(!rcSimple) { str += "lwbInRe2 = gbInRe + iOffset2;\n\t"; }
+ if(!rcSimple) { str += "lwbInIm2 = gbInIm + iOffset2;\n\t"; }
+ str += "lwbInRe = gbInRe + iOffset;\n\t";
+ str += "lwbInIm = gbInIm + iOffset;\n\t";
+
+ }
+
+ if(outInterleaved || outReal)
+ {
+ if(!rcSimple) { str += "lwbOut2 = gbOut + oOffset2;\n\t"; }
+ str += "lwbOut = gbOut + oOffset;\n\n";
+ }
+ else
+ {
+ if(!rcSimple) { str += "lwbOutRe2 = gbOutRe + oOffset2;\n\t"; }
+ if(!rcSimple) { str += "lwbOutIm2 = gbOutIm + oOffset2;\n\t"; }
+ str += "lwbOutRe = gbOutRe + oOffset;\n\t";
+ str += "lwbOutIm = gbOutIm + oOffset;\n\n";
+ }
+ }
+ }
+ else
+ {
+ if(params.fft_placeness == CLFFT_INPLACE)
+ {
+ str += OffsetCalc("ioOffset", true);
+
+ str += "\t";
+ if(inInterleaved)
+ {
+ str += "lwb = gb + ioOffset;\n\n";
+ }
+ else
+ {
+ str += "lwbRe = gbRe + ioOffset;\n\t";
+ str += "lwbIm = gbIm + ioOffset;\n\n";
+ }
+ }
+ else
+ {
+ str += OffsetCalc("iOffset", true);
+ str += OffsetCalc("oOffset", false);
+
+ str += "\t";
+ if(inInterleaved)
+ {
+ str += "lwbIn = gbIn + iOffset;\n\t";
+ }
+ else
+ {
+ str += "lwbInRe = gbInRe + iOffset;\n\t";
+ str += "lwbInIm = gbInIm + iOffset;\n\t";
+ }
+
+ if(outInterleaved)
+ {
+ str += "lwbOut = gbOut + oOffset;\n\n";
+ }
+ else
+ {
+ str += "lwbOutRe = gbOutRe + oOffset;\n\t";
+ str += "lwbOutIm = gbOutIm + oOffset;\n\n";
+ }
+ }
+ }
+
+ // Set rw and 'me' per transform
+ // rw string also contains 'b'
+ std::string rw, me; // leading argument text shared by every pass call emitted below
+
+ if(r2c2r && !rcSimple) rw = "rw, b, ";
+ else rw = (numTrans > 1) ? "rw, b, " : "1, b, ";
+
+ if(numTrans > 1) { me += "me%"; me += SztToStr(workGroupSizePerTrans); me += ", "; }
+ else { me += "me, "; }
+
+ // Buffer strings: argument text naming the global input/output pointers declared above
+ std::string inBuf, outBuf;
+ if(r2c2r)
+ {
+ if(rcSimple)
+ {
+ if(inInterleaved || inReal) inBuf = "lwbIn, ";
+ else inBuf = "lwbInRe, lwbInIm, ";
+ if(outInterleaved || outReal) outBuf = "lwbOut";
+ else outBuf = "lwbOutRe, lwbOutIm";
+ }
+ else
+ {
+ if(inInterleaved || inReal) inBuf = "lwbIn, lwbIn2, ";
+ else inBuf = "lwbInRe, lwbInRe2, lwbInIm, lwbInIm2, ";
+ if(outInterleaved || outReal) outBuf = "lwbOut, lwbOut2";
+ else outBuf = "lwbOutRe, lwbOutRe2, lwbOutIm, lwbOutIm2";
+ }
+ }
+ else
+ {
+ if(params.fft_placeness == CLFFT_INPLACE)
+ {
+ if(inInterleaved) { inBuf = "lwb, "; outBuf = "lwb"; }
+ else { inBuf = "lwbRe, lwbIm, "; outBuf = "lwbRe, lwbIm"; }
+ }
+ else
+ {
+ if(inInterleaved) inBuf = "lwbIn, ";
+ else inBuf = "lwbInRe, lwbInIm, ";
+ if(outInterleaved) outBuf = "lwbOut";
+ else outBuf = "lwbOutRe, lwbOutIm";
+ }
+ }
+
+ // Call passes: a single pass goes global -> global; otherwise first pass global -> lds, interior lds -> lds, last lds -> global
+ if(numPasses == 1)
+ {
+ str += "\t";
+ str += PassName(0, fwd);
+ str += "("; str += rw; str += me;
+ str += "0, 0, ";
+ str += inBuf; str += outBuf;
+ str += IterRegs("&");
+ str += ");\n";
+ }
+ else
+ {
+ for(typename std::vector >::const_iterator p = passes.begin(); p != passes.end(); p++)
+ {
+ str += "\t";
+ str += PassName(p->GetPosition(), fwd);
+ str += "(";
+
+ std::string ldsOff; // offset of this work-item's transform inside 'lds' ("0" when numTrans == 1)
+ if(numTrans > 1)
+ {
+ ldsOff += "(me/"; ldsOff += SztToStr(workGroupSizePerTrans);
+ ldsOff += ")*"; ldsOff += SztToStr(length);
+ }
+ else
+ {
+ ldsOff += "0";
+ }
+
+ std::string ldsArgs; // the two LDS pointer arguments: the same pointer twice in halfLds mode, otherwise two regions length*numTrans apart
+ if(halfLds) { ldsArgs += "lds, lds"; }
+ else { ldsArgs += "lds, lds + "; ldsArgs += SztToStr(length*numTrans); }
+
+ str += rw; str += me;
+ if(p == passes.begin()) // beginning pass
+ {
+ str += "0, ";
+ str += ldsOff;
+ str += ", ";
+ str += inBuf;
+ str += ldsArgs; str += IterRegs("&"); str += ");\n";
+ if(!halfLds) str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n"; // no barrier is emitted here in halfLds mode
+ }
+ else if((p+1) == passes.end()) // ending pass
+ {
+ str += ldsOff;
+ str += ", ";
+ str += "0, ";
+ str += ldsArgs; str += ", ";
+ str += outBuf;
+ str += IterRegs("&"); str += ");\n";
+ }
+ else // intermediate pass
+ {
+ str += ldsOff;
+ str += ", ";
+ str += ldsOff;
+ str += ", ";
+ str += ldsArgs; str += ", ";
+ str += ldsArgs; str += IterRegs("&"); str += ");\n";
+ if(!halfLds) str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ }
+ }
+ }
+
+ str += "}\n\n";
+
+ if(r2c2r) // real transforms have a single direction, so only one kernel is emitted
+ break;
+ }
+ }
+ };
+};
+
+using namespace StockhamGenerator;
+
+template<>
+clfftStatus FFTPlan::GetKernelGenKeyPvt (FFTKernelGenKeyParams & params) const
+{
+
+ // Query the devices in this context for their local memory sizes
+ // How we generate a kernel depends on the *minimum* LDS size for all devices.
+ //
+ const FFTEnvelope * pEnvelope = NULL;
+ OPENCL_V(const_cast(this)->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
+ BUG_CHECK (NULL != pEnvelope);
+
+ ::memset( ¶ms, 0, sizeof( params ) );
+ params.fft_precision = this->precision;
+ params.fft_placeness = this->placeness;
+ params.fft_inputLayout = this->inputLayout;
+ params.fft_MaxWorkGroupSize = this->envelope.limit_WorkGroupSize;
+
+ ARG_CHECK (this->inStride.size() == this->outStride.size())
+
+ bool real_transform = ((this->inputLayout == CLFFT_REAL) || (this->outputLayout == CLFFT_REAL));
+
+ if ( (CLFFT_INPLACE == this->placeness) && (!real_transform) ) {
+ // If this is an in-place transform the
+ // input and output layout, dimensions and strides
+ // *MUST* be the same.
+ //
+ ARG_CHECK (this->inputLayout == this->outputLayout)
+ params.fft_outputLayout = this->inputLayout;
+ for (size_t u = this->inStride.size(); u-- > 0; ) {
+ ARG_CHECK (this->inStride[u] == this->outStride[u]);
+ }
+ } else {
+ params.fft_outputLayout = this->outputLayout;
+ }
+
+ switch (this->inStride.size()) {
+ // 1-D array is a 2-D data structure.
+ // 1-D unit is a special case of 1-D array.
+ case 1:
+ ARG_CHECK(this->length .size() > 0);
+ ARG_CHECK(this->outStride.size() > 0);
+ params.fft_DataDim = 2;
+ params.fft_N[0] = this->length[0];
+ params.fft_inStride[0] = this->inStride[0];
+ params.fft_inStride[1] = this->iDist;
+ params.fft_outStride[0] = this->outStride[0];
+ params.fft_outStride[1] = this->oDist;
+ break;
+
+ // 2-D array is a 3-D data structure
+ // 2-D unit is a special case of 2-D array.
+ case 2:
+ ARG_CHECK(this->length .size() > 1);
+ ARG_CHECK(this->outStride.size() > 1);
+ params.fft_DataDim = 3;
+ params.fft_N[0] = this->length[0];
+ params.fft_N[1] = this->length[1];
+ params.fft_inStride[0] = this->inStride[0];
+ params.fft_inStride[1] = this->inStride[1];
+ params.fft_inStride[2] = this->iDist;
+ params.fft_outStride[0] = this->outStride[0];
+ params.fft_outStride[1] = this->outStride[1];
+ params.fft_outStride[2] = this->oDist;
+ break;
+
+ // 3-D array is a 4-D data structure
+ // 3-D unit is a special case of 3-D array.
+ case 3:
+ ARG_CHECK(this->length .size() > 2);
+ ARG_CHECK(this->outStride.size() > 2);
+ params.fft_DataDim = 4;
+ params.fft_N[0] = this->length[0];
+ params.fft_N[1] = this->length[1];
+ params.fft_N[2] = this->length[2];
+ params.fft_inStride[0] = this->inStride[0];
+ params.fft_inStride[1] = this->inStride[1];
+ params.fft_inStride[2] = this->inStride[2];
+ params.fft_inStride[3] = this->iDist;
+ params.fft_outStride[0] = this->outStride[0];
+ params.fft_outStride[1] = this->outStride[1];
+ params.fft_outStride[2] = this->outStride[2];
+ params.fft_outStride[3] = this->oDist;
+ break;
+
+ // 5-D data structure
+ // This can occur when a large dimension is split into two for
+ // the "3-step" algorithm.
+ //
+ case 4:
+ ARG_CHECK(this->length .size() > 3);
+ ARG_CHECK(this->outStride.size() > 3);
+ params.fft_DataDim = 5;
+ params.fft_N[0] = this->length[0];
+ params.fft_N[1] = this->length[1];
+ params.fft_N[2] = this->length[2];
+ params.fft_N[3] = this->length[3];
+ params.fft_inStride[0] = this->inStride[0];
+ params.fft_inStride[1] = this->inStride[1];
+ params.fft_inStride[2] = this->inStride[2];
+ params.fft_inStride[3] = this->inStride[3];
+ params.fft_inStride[4] = this->iDist;
+ params.fft_outStride[0] = this->outStride[0];
+ params.fft_outStride[1] = this->outStride[1];
+ params.fft_outStride[2] = this->outStride[2];
+ params.fft_outStride[3] = this->outStride[3];
+ params.fft_outStride[4] = this->oDist;
+ break;
+ default:
+ ARG_CHECK (false);
+ }
+
+ // TODO: we could simplify the address calculations in the kernel
+ // when the input data is contiguous.
+ // For example, a 3-D data structure with
+ // lengths: [*, 64, *]
+ // strides: [*, 1024, 65536]
+ // could be reduced to a 2-D data structure.
+
+ params.fft_LdsComplex = this->bLdsComplex;
+
+ params.fft_RCsimple = this->RCsimple;
+
+ size_t wgs, nt;
+#ifdef PARMETERS_TO_BE_READ
+ ParamRead pr;
+ ReadParameterFile(pr);
+ wgs = pr.workGroupSize;
+ nt = pr.numTransformsPerWg;
+#else
+ size_t t_wgs, t_nt;
+ Precision pr = (params.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
+ switch(pr)
+ {
+ case P_SINGLE:
+ {
+ KernelCoreSpecs kcs;
+ kcs.GetWGSAndNT(params.fft_N[0], t_wgs, t_nt);
+ } break;
+ case P_DOUBLE:
+ {
+ KernelCoreSpecs kcs;
+ kcs.GetWGSAndNT(params.fft_N[0], t_wgs, t_nt);
+ } break;
+ }
+
+ if((t_wgs != 0) && (t_nt != 0) && (this->envelope.limit_WorkGroupSize >= 256))
+ {
+ wgs = t_wgs;
+ nt = t_nt;
+ }
+ else
+ DetermineSizes(this->envelope.limit_WorkGroupSize, params.fft_N[0], wgs, nt);
+#endif
+
+ assert((nt * params.fft_N[0]) >= wgs);
+ assert((nt * params.fft_N[0])%wgs == 0);
+
+ params.fft_R = (nt * params.fft_N[0])/wgs;
+ params.fft_SIMD = wgs;
+
+
+ params.fft_MaxRadix = params.fft_R;
+ params.fft_UseFMA = true;
+
+ if (this->large1D != 0) {
+ ARG_CHECK (params.fft_N[0] != 0)
+ ARG_CHECK ((this->large1D % params.fft_N[0]) == 0)
+ params.fft_3StepTwiddle = true;
+ params.fft_N[1] = this->large1D / params.fft_N[0];
+ }
+
+ params.fft_fwdScale = this->forwardScale;
+ params.fft_backScale = this->backwardScale;
+
+ return CLFFT_SUCCESS;
+}
+
+template<>
+clfftStatus FFTPlan::GetWorkSizesPvt<Stockham> (std::vector<size_t> & globalWS, std::vector<size_t> & localWS) const
+{
+    // Computes the 1-D launch geometry for this plan: appends the global work-item
+    // count to globalWS and the work-group size (fft_SIMD) to localWS.
+    unsigned long long count = 1;   // elements over all dimensions and batches
+    for (unsigned u = 0; u < length.size(); ++u) {
+        count *= std::max<size_t> (1, this->length[ u ]);   // a zero length contributes a factor of 1
+    }
+    count *= this->batchsize;
+
+    // Translate the user plan into the structure that we use to map plans to clPrograms
+    FFTKernelGenKeyParams fftParams;
+    // The generator tag must be spelled out: it cannot be deduced from the argument.
+    OPENCL_V( this->GetKernelGenKeyPvt<Stockham>( fftParams ), _T("GetKernelGenKey() failed!") );
+
+    count = DivRoundingUp (count, static_cast<unsigned long long>( fftParams.fft_R ));    // count of WorkItems
+    count = DivRoundingUp (count, static_cast<unsigned long long>( fftParams.fft_SIMD )); // count of WorkGroups
+
+    // for real transforms we only need half the work groups since we do twice the work in 1 work group
+    if( !(fftParams.fft_RCsimple) && ((fftParams.fft_inputLayout == CLFFT_REAL) || (fftParams.fft_outputLayout == CLFFT_REAL)) )
+        count = DivRoundingUp (count, static_cast<unsigned long long>( 2 ));
+
+    count = std::max<unsigned long long> (count, 1) * fftParams.fft_SIMD;
+    // .. count of WorkItems, rounded up to next multiple of fft_SIMD.
+
+    // 1 dimension work group size
+    globalWS.push_back( static_cast< size_t >( count ) );
+
+    localWS.push_back( fftParams.fft_SIMD );
+
+    return CLFFT_SUCCESS;
+}
+
+template<>
+clfftStatus FFTPlan::GetMax1DLengthPvt<Stockham> (size_t * longest) const
+{
+    // Stores in *longest the largest power of two that does not exceed
+    // limit_LocalMemSize / ElementSize() for this plan's envelope.
+    //
+    // TODO The caller has already acquired the lock on *this
+    // However, we shouldn't depend on it.
+
+    // Query the devices in this context for their local memory sizes
+    // How large a kernel we can generate depends on the *minimum* LDS
+    // size for all devices.
+    const FFTEnvelope * pEnvelope = NULL;
+    OPENCL_V(this->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
+    BUG_CHECK (NULL != pEnvelope);
+
+    ARG_CHECK (NULL != longest)
+    const size_t LdsperElement = this->ElementSize();
+    // Round the element capacity down to a power of two.
+    *longest = FloorPo2 (pEnvelope->limit_LocalMemSize / LdsperElement);
+    return CLFFT_SUCCESS;
+}
+
+template<>
+clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo ) const
+{
+    // Generates the Stockham kernel source for this plan and stores it in fftRepo,
+    // together with the entry point names "fft_fwd" and "fft_back".
+    FFTKernelGenKeyParams params;
+    OPENCL_V( this->GetKernelGenKeyPvt<Stockham> (params), _T("GetKernelGenKey() failed!") );
+
+    std::string programCode;
+    // Kernel is instantiated per precision, so dispatch on the runtime value here.
+    Precision pr = (params.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
+    switch(pr)
+    {
+    case P_SINGLE:
+        {
+            Kernel<P_SINGLE> kernel(params);
+            kernel.GenerateKernel(programCode);
+        } break;
+    case P_DOUBLE:
+        {
+            Kernel<P_DOUBLE> kernel(params);
+            kernel.GenerateKernel(programCode);
+        } break;
+    }
+#ifdef KERNEL_INTERJECT
+    ReadKernelFromFile(programCode);
+#endif
+    OPENCL_V( fftRepo.setProgramCode( Stockham, params, programCode ), _T( "fftRepo.setclString() failed!" ) );
+    OPENCL_V( fftRepo.setProgramEntryPoints( Stockham, params, "fft_fwd", "fft_back" ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+    return CLFFT_SUCCESS;
+}
diff --git a/src/library/generator.stockham.h b/src/library/generator.stockham.h
new file mode 100644
index 00000000..bba7d640
--- /dev/null
+++ b/src/library/generator.stockham.h
@@ -0,0 +1,1401 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( AMD_CLFFT_generator_stockham_H )
+#define AMD_CLFFT_generator_stockham_H
+#include "private.h"
+#include "repo.h"
+#include "plan.h"
+
+typedef union { // one storage cell viewed as an OpenCL float, unsigned int or signed int
+ cl_float f; // floating-point view
+ cl_uint u; // unsigned integer view
+ cl_int i; // signed integer view
+} cb_t;
+
+namespace StockhamGenerator
+{
+ // Floating-point precision that the generator helpers below are specialized on.
+ enum Precision
+ {
+ P_SINGLE, // float / float2 / float4 register types, "f" suffix on constants
+ P_DOUBLE, // double / double2 / double4 register types; ClPragma emits an fp64 extension pragma
+ };
+
+ // Returns 1 for P_SINGLE and 2 for P_DOUBLE; any other value asserts and yields 1.
+ template <Precision PR>
+ inline size_t PrecisionWidth()
+ {
+     switch(PR) {
+     case P_SINGLE: return 1;
+     case P_DOUBLE: return 2;
+     default: assert(false); return 1;
+     }
+ }
+
+ // OpenCL preamble for precision PR: empty for P_SINGLE, an fp64-enabling pragma block for P_DOUBLE.
+ template <Precision PR>
+ inline std::string ClPragma()
+ {
+     switch(PR) {
+     case P_SINGLE: return "";
+     case P_DOUBLE: return "\n#ifdef cl_khr_fp64\n"
+                           "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+                           "#else\n"
+                           "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
+                           "#endif\n\n";
+     default: assert(false); return "";
+     }
+ }
+
+ // Decimal text of an unsigned size value, e.g. 42 -> "42".
+ inline std::string SztToStr(size_t i)
+ {
+     std::ostringstream digits;
+     digits << i;
+     return digits.str();
+ }
+
+ inline std::string FloatToStr(double f) {   // scientific notation, 16 digits after the decimal point
+     std::ostringstream text;
+     text.setf(std::ios_base::scientific, std::ios_base::floatfield);
+     text.precision(16);
+     text << f;
+     return text.str();
+ }
+
+ // Find the smallest power of 2 that is >= n; return its power of 2 factor
+ // e.g., CeilPo2 (7) returns 3 : (2^3 >= 7)
+ // If that power does not fit in size_t, the bit width of size_t is returned.
+ inline size_t CeilPo2 (size_t n)
+ {
+     size_t v = 1, t = 0;
+     while((v != 0) && (v < n))   // v wraps to 0 when the shift overflows; stop instead of looping forever
+     {
+         v <<= 1;
+         t++;
+     }
+     return t;
+ }
+
+ inline size_t FloorPo2 (size_t n)
+ // Largest power of 2 that does not exceed n (0 maps to 0).
+ // e.g., FloorPo2 (7) returns 4.
+ // *** TODO use x86 BSR instruction, using compiler intrinsics.
+ {
+     // Clear the lowest set bit until only the highest one remains.
+     for (size_t rest = n & (n-1); rest != 0; rest = n & (n-1))
+         n = rest;
+     return n;
+ }
+
+ typedef std::pair<std::string, std::string> stringpair;
+
+ // Emits OpenCL source text for the complex product a * b, or a * conj(b)
+ // when forward is false. a and b are expressions with .x (real) and .y
+ // (imaginary) components; type is the vector type used for the cast.
+ // The two halves are meant to be concatenated: .first is
+ // "(type) ((<real part>)," and .second is "(<imaginary part>))".
+ inline stringpair ComplexMul(const char *type, const char * a, const char * b, bool forward = true)
+ {
+     const std::string lhs(a), rhs(b);
+     stringpair result;
+
+     // Real part, preceded by the cast to the vector type.
+     result.first  = "(";
+     result.first += type;
+     result.first += ") ((";
+     result.first += lhs + ".x * " + rhs + (forward ? ".x - " : ".x + ");
+     result.first += lhs + ".y * " + rhs + ".y),";
+
+     // Imaginary part, closing the literal opened above.
+     result.second  = "(";
+     result.second += lhs + ".y * " + rhs + (forward ? ".x + " : ".x - ");
+     result.second += lhs + ".x * " + rhs + ".y))";
+
+     return result;
+ }
+
+ // Register data base types: the OpenCL type name for `count` scalars of
+ // precision PR, e.g. RegBaseType<P_DOUBLE>(2) is "double2". Counts other
+ // than 1, 2 and 4 assert and yield "".
+ template <Precision PR>
+ inline std::string RegBaseType(size_t count)
+ {
+     switch(PR)
+     {
+     case P_SINGLE:
+         switch(count) {
+         case 1: return "float";
+         case 2: return "float2";
+         case 4: return "float4";
+         default: assert(false); return "";
+         }
+         break;
+     case P_DOUBLE:
+         switch(count) {
+         case 1: return "double";
+         case 2: return "double2";
+         case 4: return "double4";
+         default: assert(false); return "";
+         }
+         break;
+     default:
+         assert(false); return "";
+     }
+ }
+
+ // Suffix appended to floating-point constants: "f" for P_SINGLE, empty
+ // for P_DOUBLE (and, after asserting, for any other value).
+ template <Precision PR>
+ inline std::string FloatSuffix()
+ {
+     std::string sfx;
+     switch(PR)
+     {
+     case P_SINGLE: sfx = "f"; break;
+     case P_DOUBLE: sfx = ""; break;
+     default: assert(false);
+     }
+     return sfx;
+ }
+
+ inline std::string ButterflyName(size_t radix, size_t count, bool fwd)
+ {
+     // Name of a generated butterfly function: direction, radix, then count,
+     // e.g. (4, 2, true) gives "FwdRad4B2".
+     std::string name(fwd ? "Fwd" : "Inv");
+     name += "Rad" + SztToStr(radix);
+     name += "B" + SztToStr(count);
+     return name;
+ }
+
+ inline std::string PassName(size_t pos, bool fwd)
+ {
+     // Direction prefix, "Pass", then the position, e.g. (3, false) gives "InvPass3".
+     const char * const direction = fwd ? "Fwd" : "Inv";
+     std::string name(direction);
+     name += "Pass" + SztToStr(pos);
+     return name;
+ }
+
+ inline std::string TwTableName()
+ {   // Fixed identifier "twiddles"; presumably names the twiddle table in generated code - confirm at call sites.
+     return std::string("twiddles");
+ }
+
+ inline std::string TwTableLargeName()
+ {   // Fixed identifier "twiddle_dee"; presumably names the large twiddle table - confirm at call sites.
+     return std::string("twiddle_dee");
+ }
+
+ inline std::string TwTableLargeFunc()
+ {   // Fixed identifier "TW3step"; presumably names the large-twiddle helper function - confirm at call sites.
+     return std::string("TW3step");
+ }
+
+ // FFT butterfly
+ template
+ class Butterfly
+ {
+ size_t radix; // Base radix
+ size_t count; // Number of basic butterflies, valid values: 1,2,4
+ bool fwd; // FFT direction
+ bool cReg; // registers are complex numbers, .x (real), .y(imag)
+
+ size_t BitReverse (size_t n, size_t N) const   // reverses the low log2(N) bits of n, for a power-of-two N
+ {
+     return (N >= 2) ? (((n & 1) ? (N >> 1) : 0) | BitReverse (n >> 1, N >> 1)) : n;
+ }
+
+ void GenerateButterflyStr(std::string &bflyStr) const
+ {
+ std::string regType = cReg ? RegBaseType(2) : RegBaseType(count);
+
+ // Function attribute
+ bflyStr += "__attribute__((always_inline)) void \n";
+
+ // Function name
+ bflyStr += ButterflyName(radix, count, fwd);
+
+ // Function Arguments
+ bflyStr += "(";
+ for(size_t i=0;;i++)
+ {
+ if(cReg)
+ {
+ bflyStr += regType; bflyStr += " *R";
+ if(radix & (radix-1)) bflyStr += SztToStr(i);
+ else bflyStr += SztToStr(BitReverse(i,radix));
+ }
+ else
+ {
+ bflyStr += regType; bflyStr += " *R"; bflyStr += SztToStr(i); bflyStr += ", "; // real arguments
+ bflyStr += regType; bflyStr += " *I"; bflyStr += SztToStr(i); // imaginary arguments
+ }
+
+ if(i == radix-1)
+ {
+ bflyStr += ")";
+ break;
+ }
+ else
+ {
+ bflyStr += ", ";
+ }
+ }
+
+ bflyStr += "\n{\n\n";
+
+
+ // Temporary variables
+ // Allocate temporary variables if we are not using complex registers (cReg = 0) or if cReg is true, then
+ // allocate temporary variables only for non power-of-2 radices
+ if( (radix & (radix-1)) || (!cReg) )
+ {
+ bflyStr += "\t";
+ if(cReg)
+ bflyStr += RegBaseType(1);
+ else
+ bflyStr += regType;
+
+ for(size_t i=0;;i++)
+ {
+ bflyStr += " TR"; bflyStr += SztToStr(i); bflyStr += ","; // real arguments
+ bflyStr += " TI"; bflyStr += SztToStr(i); // imaginary arguments
+
+ if(i == radix-1)
+ {
+ bflyStr += ";";
+ break;
+ }
+ else
+ {
+ bflyStr += ",";
+ }
+ }
+ }
+ else
+ {
+ bflyStr += "\t";
+ bflyStr += RegBaseType(2);
+ bflyStr += " T;";
+ }
+
+
+ bflyStr += "\n\n\t";
+
+ // Butterfly for different radices
+ switch(radix)
+ {
+ case 2:
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "(*R1) = (*R0) - (*R1);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R1);\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = (*R0) + (*R1);\n\t"
+ "TI0 = (*I0) + (*I1);\n\t"
+ "TR1 = (*R0) - (*R1);\n\t"
+ "TI1 = (*I0) - (*I1);\n\t";
+ }
+
+ } break;
+ case 3:
+ {
+ if(fwd)
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "TR0 = (*R0).x + (*R1).x + (*R2).x;\n\t"
+ "TR1 = ((*R0).x - C3QA*((*R1).x + (*R2).x)) + C3QB*((*R1).y - (*R2).y);\n\t"
+ "TR2 = ((*R0).x - C3QA*((*R1).x + (*R2).x)) - C3QB*((*R1).y - (*R2).y);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = (*R0).y + (*R1).y + (*R2).y;\n\t"
+ "TI1 = ((*R0).y - C3QA*((*R1).y + (*R2).y)) - C3QB*((*R1).x - (*R2).x);\n\t"
+ "TI2 = ((*R0).y - C3QA*((*R1).y + (*R2).y)) + C3QB*((*R1).x - (*R2).x);\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = *R0 + *R1 + *R2;\n\t"
+ "TR1 = (*R0 - C3QA*(*R1 + *R2)) + C3QB*(*I1 - *I2);\n\t"
+ "TR2 = (*R0 - C3QA*(*R1 + *R2)) - C3QB*(*I1 - *I2);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = *I0 + *I1 + *I2;\n\t"
+ "TI1 = (*I0 - C3QA*(*I1 + *I2)) - C3QB*(*R1 - *R2);\n\t"
+ "TI2 = (*I0 - C3QA*(*I1 + *I2)) + C3QB*(*R1 - *R2);\n\t";
+ }
+ }
+ else
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "TR0 = (*R0).x + (*R1).x + (*R2).x;\n\t"
+ "TR1 = ((*R0).x - C3QA*((*R1).x + (*R2).x)) - C3QB*((*R1).y - (*R2).y);\n\t"
+ "TR2 = ((*R0).x - C3QA*((*R1).x + (*R2).x)) + C3QB*((*R1).y - (*R2).y);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = (*R0).y + (*R1).y + (*R2).y;\n\t"
+ "TI1 = ((*R0).y - C3QA*((*R1).y + (*R2).y)) + C3QB*((*R1).x - (*R2).x);\n\t"
+ "TI2 = ((*R0).y - C3QA*((*R1).y + (*R2).y)) - C3QB*((*R1).x - (*R2).x);\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = *R0 + *R1 + *R2;\n\t"
+ "TR1 = (*R0 - C3QA*(*R1 + *R2)) - C3QB*(*I1 - *I2);\n\t"
+ "TR2 = (*R0 - C3QA*(*R1 + *R2)) + C3QB*(*I1 - *I2);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = *I0 + *I1 + *I2;\n\t"
+ "TI1 = (*I0 - C3QA*(*I1 + *I2)) + C3QB*(*R1 - *R2);\n\t"
+ "TI2 = (*I0 - C3QA*(*I1 + *I2)) - C3QB*(*R1 - *R2);\n\t";
+ }
+ }
+ } break;
+ case 4:
+ {
+ if(fwd)
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "(*R1) = (*R0) - (*R1);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R1);\n\t"
+ "(*R3) = (*R2) - (*R3);\n\t"
+ "(*R2) = 2.0f * (*R2) - (*R3);\n\t"
+ "\n\t"
+ "(*R2) = (*R0) - (*R2);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R2);\n\t"
+ "(*R3) = (*R1) + (fvect2)(-(*R3).y, (*R3).x);\n\t"
+ "(*R1) = 2.0f * (*R1) - (*R3);\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = (*R0) + (*R2) + (*R1) + (*R3);\n\t"
+ "TR1 = (*R0) - (*R2) + (*I1) - (*I3);\n\t"
+ "TR2 = (*R0) + (*R2) - (*R1) - (*R3);\n\t"
+ "TR3 = (*R0) - (*R2) - (*I1) + (*I3);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = (*I0) + (*I2) + (*I1) + (*I3);\n\t"
+ "TI1 = (*I0) - (*I2) - (*R1) + (*R3);\n\t"
+ "TI2 = (*I0) + (*I2) - (*I1) - (*I3);\n\t"
+ "TI3 = (*I0) - (*I2) + (*R1) - (*R3);\n\t";
+ }
+ }
+ else
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "(*R1) = (*R0) - (*R1);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R1);\n\t"
+ "(*R3) = (*R2) - (*R3);\n\t"
+ "(*R2) = 2.0f * (*R2) - (*R3);\n\t"
+ "\n\t"
+ "(*R2) = (*R0) - (*R2);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R2);\n\t"
+ "(*R3) = (*R1) + (fvect2)((*R3).y, -(*R3).x);\n\t"
+ "(*R1) = 2.0f * (*R1) - (*R3);\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = (*R0) + (*R2) + (*R1) + (*R3);\n\t"
+ "TR1 = (*R0) - (*R2) - (*I1) + (*I3);\n\t"
+ "TR2 = (*R0) + (*R2) - (*R1) - (*R3);\n\t"
+ "TR3 = (*R0) - (*R2) + (*I1) - (*I3);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = (*I0) + (*I2) + (*I1) + (*I3);\n\t"
+ "TI1 = (*I0) - (*I2) + (*R1) - (*R3);\n\t"
+ "TI2 = (*I0) + (*I2) - (*I1) - (*I3);\n\t"
+ "TI3 = (*I0) - (*I2) - (*R1) + (*R3);\n\t";
+ }
+ }
+ } break;
+ case 5:
+ {
+ if(fwd)
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "TR0 = (*R0).x + (*R1).x + (*R2).x + (*R3).x + (*R4).x;\n\t"
+ "TR1 = ((*R0).x - C5QC*((*R2).x + (*R3).x)) + C5QB*((*R1).y - (*R4).y) + C5QD*((*R2).y - (*R3).y) + C5QA*(((*R1).x - (*R2).x) + ((*R4).x - (*R3).x));\n\t"
+ "TR4 = ((*R0).x - C5QC*((*R2).x + (*R3).x)) - C5QB*((*R1).y - (*R4).y) - C5QD*((*R2).y - (*R3).y) + C5QA*(((*R1).x - (*R2).x) + ((*R4).x - (*R3).x));\n\t"
+ "TR2 = ((*R0).x - C5QC*((*R1).x + (*R4).x)) - C5QB*((*R2).y - (*R3).y) + C5QD*((*R1).y - (*R4).y) + C5QA*(((*R2).x - (*R1).x) + ((*R3).x - (*R4).x));\n\t"
+ "TR3 = ((*R0).x - C5QC*((*R1).x + (*R4).x)) + C5QB*((*R2).y - (*R3).y) - C5QD*((*R1).y - (*R4).y) + C5QA*(((*R2).x - (*R1).x) + ((*R3).x - (*R4).x));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = (*R0).y + (*R1).y + (*R2).y + (*R3).y + (*R4).y;\n\t"
+ "TI1 = ((*R0).y - C5QC*((*R2).y + (*R3).y)) - C5QB*((*R1).x - (*R4).x) - C5QD*((*R2).x - (*R3).x) + C5QA*(((*R1).y - (*R2).y) + ((*R4).y - (*R3).y));\n\t"
+ "TI4 = ((*R0).y - C5QC*((*R2).y + (*R3).y)) + C5QB*((*R1).x - (*R4).x) + C5QD*((*R2).x - (*R3).x) + C5QA*(((*R1).y - (*R2).y) + ((*R4).y - (*R3).y));\n\t"
+ "TI2 = ((*R0).y - C5QC*((*R1).y + (*R4).y)) + C5QB*((*R2).x - (*R3).x) - C5QD*((*R1).x - (*R4).x) + C5QA*(((*R2).y - (*R1).y) + ((*R3).y - (*R4).y));\n\t"
+ "TI3 = ((*R0).y - C5QC*((*R1).y + (*R4).y)) - C5QB*((*R2).x - (*R3).x) + C5QD*((*R1).x - (*R4).x) + C5QA*(((*R2).y - (*R1).y) + ((*R3).y - (*R4).y));\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = *R0 + *R1 + *R2 + *R3 + *R4;\n\t"
+ "TR1 = (*R0 - C5QC*(*R2 + *R3)) + C5QB*(*I1 - *I4) + C5QD*(*I2 - *I3) + C5QA*((*R1 - *R2) + (*R4 - *R3));\n\t"
+ "TR4 = (*R0 - C5QC*(*R2 + *R3)) - C5QB*(*I1 - *I4) - C5QD*(*I2 - *I3) + C5QA*((*R1 - *R2) + (*R4 - *R3));\n\t"
+ "TR2 = (*R0 - C5QC*(*R1 + *R4)) - C5QB*(*I2 - *I3) + C5QD*(*I1 - *I4) + C5QA*((*R2 - *R1) + (*R3 - *R4));\n\t"
+ "TR3 = (*R0 - C5QC*(*R1 + *R4)) + C5QB*(*I2 - *I3) - C5QD*(*I1 - *I4) + C5QA*((*R2 - *R1) + (*R3 - *R4));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = *I0 + *I1 + *I2 + *I3 + *I4;\n\t"
+ "TI1 = (*I0 - C5QC*(*I2 + *I3)) - C5QB*(*R1 - *R4) - C5QD*(*R2 - *R3) + C5QA*((*I1 - *I2) + (*I4 - *I3));\n\t"
+ "TI4 = (*I0 - C5QC*(*I2 + *I3)) + C5QB*(*R1 - *R4) + C5QD*(*R2 - *R3) + C5QA*((*I1 - *I2) + (*I4 - *I3));\n\t"
+ "TI2 = (*I0 - C5QC*(*I1 + *I4)) + C5QB*(*R2 - *R3) - C5QD*(*R1 - *R4) + C5QA*((*I2 - *I1) + (*I3 - *I4));\n\t"
+ "TI3 = (*I0 - C5QC*(*I1 + *I4)) - C5QB*(*R2 - *R3) + C5QD*(*R1 - *R4) + C5QA*((*I2 - *I1) + (*I3 - *I4));\n\t";
+ }
+ }
+ else
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "TR0 = (*R0).x + (*R1).x + (*R2).x + (*R3).x + (*R4).x;\n\t"
+ "TR1 = ((*R0).x - C5QC*((*R2).x + (*R3).x)) - C5QB*((*R1).y - (*R4).y) - C5QD*((*R2).y - (*R3).y) + C5QA*(((*R1).x - (*R2).x) + ((*R4).x - (*R3).x));\n\t"
+ "TR4 = ((*R0).x - C5QC*((*R2).x + (*R3).x)) + C5QB*((*R1).y - (*R4).y) + C5QD*((*R2).y - (*R3).y) + C5QA*(((*R1).x - (*R2).x) + ((*R4).x - (*R3).x));\n\t"
+ "TR2 = ((*R0).x - C5QC*((*R1).x + (*R4).x)) + C5QB*((*R2).y - (*R3).y) - C5QD*((*R1).y - (*R4).y) + C5QA*(((*R2).x - (*R1).x) + ((*R3).x - (*R4).x));\n\t"
+ "TR3 = ((*R0).x - C5QC*((*R1).x + (*R4).x)) - C5QB*((*R2).y - (*R3).y) + C5QD*((*R1).y - (*R4).y) + C5QA*(((*R2).x - (*R1).x) + ((*R3).x - (*R4).x));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = (*R0).y + (*R1).y + (*R2).y + (*R3).y + (*R4).y;\n\t"
+ "TI1 = ((*R0).y - C5QC*((*R2).y + (*R3).y)) + C5QB*((*R1).x - (*R4).x) + C5QD*((*R2).x - (*R3).x) + C5QA*(((*R1).y - (*R2).y) + ((*R4).y - (*R3).y));\n\t"
+ "TI4 = ((*R0).y - C5QC*((*R2).y + (*R3).y)) - C5QB*((*R1).x - (*R4).x) - C5QD*((*R2).x - (*R3).x) + C5QA*(((*R1).y - (*R2).y) + ((*R4).y - (*R3).y));\n\t"
+ "TI2 = ((*R0).y - C5QC*((*R1).y + (*R4).y)) - C5QB*((*R2).x - (*R3).x) + C5QD*((*R1).x - (*R4).x) + C5QA*(((*R2).y - (*R1).y) + ((*R3).y - (*R4).y));\n\t"
+ "TI3 = ((*R0).y - C5QC*((*R1).y + (*R4).y)) + C5QB*((*R2).x - (*R3).x) - C5QD*((*R1).x - (*R4).x) + C5QA*(((*R2).y - (*R1).y) + ((*R3).y - (*R4).y));\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = *R0 + *R1 + *R2 + *R3 + *R4;\n\t"
+ "TR1 = (*R0 - C5QC*(*R2 + *R3)) - C5QB*(*I1 - *I4) - C5QD*(*I2 - *I3) + C5QA*((*R1 - *R2) + (*R4 - *R3));\n\t"
+ "TR4 = (*R0 - C5QC*(*R2 + *R3)) + C5QB*(*I1 - *I4) + C5QD*(*I2 - *I3) + C5QA*((*R1 - *R2) + (*R4 - *R3));\n\t"
+ "TR2 = (*R0 - C5QC*(*R1 + *R4)) + C5QB*(*I2 - *I3) - C5QD*(*I1 - *I4) + C5QA*((*R2 - *R1) + (*R3 - *R4));\n\t"
+ "TR3 = (*R0 - C5QC*(*R1 + *R4)) - C5QB*(*I2 - *I3) + C5QD*(*I1 - *I4) + C5QA*((*R2 - *R1) + (*R3 - *R4));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = *I0 + *I1 + *I2 + *I3 + *I4;\n\t"
+ "TI1 = (*I0 - C5QC*(*I2 + *I3)) + C5QB*(*R1 - *R4) + C5QD*(*R2 - *R3) + C5QA*((*I1 - *I2) + (*I4 - *I3));\n\t"
+ "TI4 = (*I0 - C5QC*(*I2 + *I3)) - C5QB*(*R1 - *R4) - C5QD*(*R2 - *R3) + C5QA*((*I1 - *I2) + (*I4 - *I3));\n\t"
+ "TI2 = (*I0 - C5QC*(*I1 + *I4)) - C5QB*(*R2 - *R3) + C5QD*(*R1 - *R4) + C5QA*((*I2 - *I1) + (*I3 - *I4));\n\t"
+ "TI3 = (*I0 - C5QC*(*I1 + *I4)) + C5QB*(*R2 - *R3) - C5QD*(*R1 - *R4) + C5QA*((*I2 - *I1) + (*I3 - *I4));\n\t";
+ }
+ }
+ } break;
+ case 6:
+ {
+ if(fwd)
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "TR0 = (*R0).x + (*R2).x + (*R4).x;\n\t"
+ "TR2 = ((*R0).x - C3QA*((*R2).x + (*R4).x)) + C3QB*((*R2).y - (*R4).y);\n\t"
+ "TR4 = ((*R0).x - C3QA*((*R2).x + (*R4).x)) - C3QB*((*R2).y - (*R4).y);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = (*R0).y + (*R2).y + (*R4).y;\n\t"
+ "TI2 = ((*R0).y - C3QA*((*R2).y + (*R4).y)) - C3QB*((*R2).x - (*R4).x);\n\t"
+ "TI4 = ((*R0).y - C3QA*((*R2).y + (*R4).y)) + C3QB*((*R2).x - (*R4).x);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TR1 = (*R1).x + (*R3).x + (*R5).x;\n\t"
+ "TR3 = ((*R1).x - C3QA*((*R3).x + (*R5).x)) + C3QB*((*R3).y - (*R5).y);\n\t"
+ "TR5 = ((*R1).x - C3QA*((*R3).x + (*R5).x)) - C3QB*((*R3).y - (*R5).y);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI1 = (*R1).y + (*R3).y + (*R5).y;\n\t"
+ "TI3 = ((*R1).y - C3QA*((*R3).y + (*R5).y)) - C3QB*((*R3).x - (*R5).x);\n\t"
+ "TI5 = ((*R1).y - C3QA*((*R3).y + (*R5).y)) + C3QB*((*R3).x - (*R5).x);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R0).x = TR0 + TR1;\n\t"
+ "(*R1).x = TR2 + ( C3QA*TR3 + C3QB*TI3);\n\t"
+ "(*R2).x = TR4 + (-C3QA*TR5 + C3QB*TI5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R0).y = TI0 + TI1;\n\t"
+ "(*R1).y = TI2 + (-C3QB*TR3 + C3QA*TI3);\n\t"
+ "(*R2).y = TI4 + (-C3QB*TR5 - C3QA*TI5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R3).x = TR0 - TR1;\n\t"
+ "(*R4).x = TR2 - ( C3QA*TR3 + C3QB*TI3);\n\t"
+ "(*R5).x = TR4 - (-C3QA*TR5 + C3QB*TI5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R3).y = TI0 - TI1;\n\t"
+ "(*R4).y = TI2 - (-C3QB*TR3 + C3QA*TI3);\n\t"
+ "(*R5).y = TI4 - (-C3QB*TR5 - C3QA*TI5);\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = *R0 + *R2 + *R4;\n\t"
+ "TR2 = (*R0 - C3QA*(*R2 + *R4)) + C3QB*(*I2 - *I4);\n\t"
+ "TR4 = (*R0 - C3QA*(*R2 + *R4)) - C3QB*(*I2 - *I4);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = *I0 + *I2 + *I4;\n\t"
+ "TI2 = (*I0 - C3QA*(*I2 + *I4)) - C3QB*(*R2 - *R4);\n\t"
+ "TI4 = (*I0 - C3QA*(*I2 + *I4)) + C3QB*(*R2 - *R4);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TR1 = *R1 + *R3 + *R5;\n\t"
+ "TR3 = (*R1 - C3QA*(*R3 + *R5)) + C3QB*(*I3 - *I5);\n\t"
+ "TR5 = (*R1 - C3QA*(*R3 + *R5)) - C3QB*(*I3 - *I5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI1 = *I1 + *I3 + *I5;\n\t"
+ "TI3 = (*I1 - C3QA*(*I3 + *I5)) - C3QB*(*R3 - *R5);\n\t"
+ "TI5 = (*I1 - C3QA*(*I3 + *I5)) + C3QB*(*R3 - *R5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R0) = TR0 + TR1;\n\t"
+ "(*R1) = TR2 + ( C3QA*TR3 + C3QB*TI3);\n\t"
+ "(*R2) = TR4 + (-C3QA*TR5 + C3QB*TI5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*I0) = TI0 + TI1;\n\t"
+ "(*I1) = TI2 + (-C3QB*TR3 + C3QA*TI3);\n\t"
+ "(*I2) = TI4 + (-C3QB*TR5 - C3QA*TI5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R3) = TR0 - TR1;\n\t"
+ "(*R4) = TR2 - ( C3QA*TR3 + C3QB*TI3);\n\t"
+ "(*R5) = TR4 - (-C3QA*TR5 + C3QB*TI5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*I3) = TI0 - TI1;\n\t"
+ "(*I4) = TI2 - (-C3QB*TR3 + C3QA*TI3);\n\t"
+ "(*I5) = TI4 - (-C3QB*TR5 - C3QA*TI5);\n\t";
+ }
+ }
+ else
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "TR0 = (*R0).x + (*R2).x + (*R4).x;\n\t"
+ "TR2 = ((*R0).x - C3QA*((*R2).x + (*R4).x)) - C3QB*((*R2).y - (*R4).y);\n\t"
+ "TR4 = ((*R0).x - C3QA*((*R2).x + (*R4).x)) + C3QB*((*R2).y - (*R4).y);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = (*R0).y + (*R2).y + (*R4).y;\n\t"
+ "TI2 = ((*R0).y - C3QA*((*R2).y + (*R4).y)) + C3QB*((*R2).x - (*R4).x);\n\t"
+ "TI4 = ((*R0).y - C3QA*((*R2).y + (*R4).y)) - C3QB*((*R2).x - (*R4).x);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TR1 = (*R1).x + (*R3).x + (*R5).x;\n\t"
+ "TR3 = ((*R1).x - C3QA*((*R3).x + (*R5).x)) - C3QB*((*R3).y - (*R5).y);\n\t"
+ "TR5 = ((*R1).x - C3QA*((*R3).x + (*R5).x)) + C3QB*((*R3).y - (*R5).y);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI1 = (*R1).y + (*R3).y + (*R5).y;\n\t"
+ "TI3 = ((*R1).y - C3QA*((*R3).y + (*R5).y)) + C3QB*((*R3).x - (*R5).x);\n\t"
+ "TI5 = ((*R1).y - C3QA*((*R3).y + (*R5).y)) - C3QB*((*R3).x - (*R5).x);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R0).x = TR0 + TR1;\n\t"
+ "(*R1).x = TR2 + ( C3QA*TR3 - C3QB*TI3);\n\t"
+ "(*R2).x = TR4 + (-C3QA*TR5 - C3QB*TI5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R0).y = TI0 + TI1;\n\t"
+ "(*R1).y = TI2 + ( C3QB*TR3 + C3QA*TI3);\n\t"
+ "(*R2).y = TI4 + ( C3QB*TR5 - C3QA*TI5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R3).x = TR0 - TR1;\n\t"
+ "(*R4).x = TR2 - ( C3QA*TR3 - C3QB*TI3);\n\t"
+ "(*R5).x = TR4 - (-C3QA*TR5 - C3QB*TI5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R3).y = TI0 - TI1;\n\t"
+ "(*R4).y = TI2 - ( C3QB*TR3 + C3QA*TI3);\n\t"
+ "(*R5).y = TI4 - ( C3QB*TR5 - C3QA*TI5);\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = *R0 + *R2 + *R4;\n\t"
+ "TR2 = (*R0 - C3QA*(*R2 + *R4)) - C3QB*(*I2 - *I4);\n\t"
+ "TR4 = (*R0 - C3QA*(*R2 + *R4)) + C3QB*(*I2 - *I4);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = *I0 + *I2 + *I4;\n\t"
+ "TI2 = (*I0 - C3QA*(*I2 + *I4)) + C3QB*(*R2 - *R4);\n\t"
+ "TI4 = (*I0 - C3QA*(*I2 + *I4)) - C3QB*(*R2 - *R4);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TR1 = *R1 + *R3 + *R5;\n\t"
+ "TR3 = (*R1 - C3QA*(*R3 + *R5)) - C3QB*(*I3 - *I5);\n\t"
+ "TR5 = (*R1 - C3QA*(*R3 + *R5)) + C3QB*(*I3 - *I5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI1 = *I1 + *I3 + *I5;\n\t"
+ "TI3 = (*I1 - C3QA*(*I3 + *I5)) + C3QB*(*R3 - *R5);\n\t"
+ "TI5 = (*I1 - C3QA*(*I3 + *I5)) - C3QB*(*R3 - *R5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R0) = TR0 + TR1;\n\t"
+ "(*R1) = TR2 + ( C3QA*TR3 - C3QB*TI3);\n\t"
+ "(*R2) = TR4 + (-C3QA*TR5 - C3QB*TI5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*I0) = TI0 + TI1;\n\t"
+ "(*I1) = TI2 + ( C3QB*TR3 + C3QA*TI3);\n\t"
+ "(*I2) = TI4 + ( C3QB*TR5 - C3QA*TI5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R3) = TR0 - TR1;\n\t"
+ "(*R4) = TR2 - ( C3QA*TR3 - C3QB*TI3);\n\t"
+ "(*R5) = TR4 - (-C3QA*TR5 - C3QB*TI5);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*I3) = TI0 - TI1;\n\t"
+ "(*I4) = TI2 - ( C3QB*TR3 + C3QA*TI3);\n\t"
+ "(*I5) = TI4 - ( C3QB*TR5 - C3QA*TI5);\n\t";
+ }
+ }
+ } break;
+ case 8:
+ {
+ if(fwd)
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "(*R1) = (*R0) - (*R1);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R1);\n\t"
+ "(*R3) = (*R2) - (*R3);\n\t"
+ "(*R2) = 2.0f * (*R2) - (*R3);\n\t"
+ "(*R5) = (*R4) - (*R5);\n\t"
+ "(*R4) = 2.0f * (*R4) - (*R5);\n\t"
+ "(*R7) = (*R6) - (*R7);\n\t"
+ "(*R6) = 2.0f * (*R6) - (*R7);\n\t"
+ "\n\t"
+ "(*R2) = (*R0) - (*R2);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R2);\n\t"
+ "(*R3) = (*R1) + (fvect2)(-(*R3).y, (*R3).x);\n\t"
+ "(*R1) = 2.0f * (*R1) - (*R3);\n\t"
+ "(*R6) = (*R4) - (*R6);\n\t"
+ "(*R4) = 2.0f * (*R4) - (*R6);\n\t"
+ "(*R7) = (*R5) + (fvect2)(-(*R7).y, (*R7).x);\n\t"
+ "(*R5) = 2.0f * (*R5) - (*R7);\n\t"
+ "\n\t"
+ "(*R4) = (*R0) - (*R4);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R4);\n\t"
+ "(*R5) = ((*R1) - C8Q * (*R5)) - C8Q * (fvect2)((*R5).y, -(*R5).x);\n\t"
+ "(*R1) = 2.0f * (*R1) - (*R5);\n\t"
+ "(*R6) = (*R2) + (fvect2)(-(*R6).y, (*R6).x);\n\t"
+ "(*R2) = 2.0f * (*R2) - (*R6);\n\t"
+ "(*R7) = ((*R3) + C8Q * (*R7)) - C8Q * (fvect2)((*R7).y, -(*R7).x);\n\t"
+ "(*R3) = 2.0f * (*R3) - (*R7);\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = (*R0) + (*R4) + (*R2) + (*R6) + (*R1) + (*R3) + (*R5) + (*R7) ;\n\t"
+ "TR1 = (*R0) - (*R4) + (*I2) - (*I6) + C8Q*(*R1) + C8Q*(*I1) - C8Q*(*R3) + C8Q*(*I3) - C8Q*(*R5) - C8Q*(*I5) + C8Q*(*R7) - C8Q*(*I7);\n\t"
+ "TR2 = (*R0) + (*R4) - (*R2) - (*R6) + (*I1) - (*I3) + (*I5) - (*I7);\n\t"
+ "TR3 = (*R0) - (*R4) - (*I2) + (*I6) - C8Q*(*R1) + C8Q*(*I1) + C8Q*(*R3) + C8Q*(*I3) + C8Q*(*R5) - C8Q*(*I5) - C8Q*(*R7) - C8Q*(*I7);\n\t"
+ "TR4 = (*R0) + (*R4) + (*R2) + (*R6) - (*R1) - (*R3) - (*R5) - (*R7) ;\n\t"
+ "TR5 = (*R0) - (*R4) + (*I2) - (*I6) - C8Q*(*R1) - C8Q*(*I1) + C8Q*(*R3) - C8Q*(*I3) + C8Q*(*R5) + C8Q*(*I5) - C8Q*(*R7) + C8Q*(*I7);\n\t"
+ "TR6 = (*R0) + (*R4) - (*R2) - (*R6) - (*I1) + (*I3) - (*I5) + (*I7);\n\t"
+ "TR7 = (*R0) - (*R4) - (*I2) + (*I6) + C8Q*(*R1) - C8Q*(*I1) - C8Q*(*R3) - C8Q*(*I3) - C8Q*(*R5) + C8Q*(*I5) + C8Q*(*R7) + C8Q*(*I7);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = (*I0) + (*I4) + (*I2) + (*I6) + (*I1) + (*I3) + (*I5) + (*I7);\n\t"
+ "TI1 = (*I0) - (*I4) - (*R2) + (*R6) - C8Q*(*R1) + C8Q*(*I1) - C8Q*(*R3) - C8Q*(*I3) + C8Q*(*R5) - C8Q*(*I5) + C8Q*(*R7) + C8Q*(*I7);\n\t"
+ "TI2 = (*I0) + (*I4) - (*I2) - (*I6) - (*R1) + (*R3) - (*R5) + (*R7) ;\n\t"
+ "TI3 = (*I0) - (*I4) + (*R2) - (*R6) - C8Q*(*R1) - C8Q*(*I1) - C8Q*(*R3) + C8Q*(*I3) + C8Q*(*R5) + C8Q*(*I5) + C8Q*(*R7) - C8Q*(*I7);\n\t"
+ "TI4 = (*I0) + (*I4) + (*I2) + (*I6) - (*I1) - (*I3) - (*I5) - (*I7);\n\t"
+ "TI5 = (*I0) - (*I4) - (*R2) + (*R6) + C8Q*(*R1) - C8Q*(*I1) + C8Q*(*R3) + C8Q*(*I3) - C8Q*(*R5) + C8Q*(*I5) - C8Q*(*R7) - C8Q*(*I7);\n\t"
+ "TI6 = (*I0) + (*I4) - (*I2) - (*I6) + (*R1) - (*R3) + (*R5) - (*R7) ;\n\t"
+ "TI7 = (*I0) - (*I4) + (*R2) - (*R6) + C8Q*(*R1) + C8Q*(*I1) + C8Q*(*R3) - C8Q*(*I3) - C8Q*(*R5) - C8Q*(*I5) - C8Q*(*R7) + C8Q*(*I7);\n\t";
+ }
+ }
+ else
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "(*R1) = (*R0) - (*R1);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R1);\n\t"
+ "(*R3) = (*R2) - (*R3);\n\t"
+ "(*R2) = 2.0f * (*R2) - (*R3);\n\t"
+ "(*R5) = (*R4) - (*R5);\n\t"
+ "(*R4) = 2.0f * (*R4) - (*R5);\n\t"
+ "(*R7) = (*R6) - (*R7);\n\t"
+ "(*R6) = 2.0f * (*R6) - (*R7);\n\t"
+ "\n\t"
+ "(*R2) = (*R0) - (*R2);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R2);\n\t"
+ "(*R3) = (*R1) + (fvect2)((*R3).y, -(*R3).x);\n\t"
+ "(*R1) = 2.0f * (*R1) - (*R3);\n\t"
+ "(*R6) = (*R4) - (*R6);\n\t"
+ "(*R4) = 2.0f * (*R4) - (*R6);\n\t"
+ "(*R7) = (*R5) + (fvect2)((*R7).y, -(*R7).x);\n\t"
+ "(*R5) = 2.0f * (*R5) - (*R7);\n\t"
+ "\n\t"
+ "(*R4) = (*R0) - (*R4);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R4);\n\t"
+ "(*R5) = ((*R1) - C8Q * (*R5)) + C8Q * (fvect2)((*R5).y, -(*R5).x);\n\t"
+ "(*R1) = 2.0f * (*R1) - (*R5);\n\t"
+ "(*R6) = (*R2) + (fvect2)((*R6).y, -(*R6).x);\n\t"
+ "(*R2) = 2.0f * (*R2) - (*R6);\n\t"
+ "(*R7) = ((*R3) + C8Q * (*R7)) + C8Q * (fvect2)((*R7).y, -(*R7).x);\n\t"
+ "(*R3) = 2.0f * (*R3) - (*R7);\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = (*R0) + (*R4) + (*R2) + (*R6) + (*R1) + (*R3) + (*R5) + (*R7) ;\n\t"
+ "TR1 = (*R0) - (*R4) - (*I2) + (*I6) + C8Q*(*R1) - C8Q*(*I1) - C8Q*(*R3) - C8Q*(*I3) - C8Q*(*R5) + C8Q*(*I5) + C8Q*(*R7) + C8Q*(*I7);\n\t"
+ "TR2 = (*R0) + (*R4) - (*R2) - (*R6) - (*I1) + (*I3) - (*I5) + (*I7);\n\t"
+ "TR3 = (*R0) - (*R4) + (*I2) - (*I6) - C8Q*(*R1) - C8Q*(*I1) + C8Q*(*R3) - C8Q*(*I3) + C8Q*(*R5) + C8Q*(*I5) - C8Q*(*R7) + C8Q*(*I7);\n\t"
+ "TR4 = (*R0) + (*R4) + (*R2) + (*R6) - (*R1) - (*R3) - (*R5) - (*R7) ;\n\t"
+ "TR5 = (*R0) - (*R4) - (*I2) + (*I6) - C8Q*(*R1) + C8Q*(*I1) + C8Q*(*R3) + C8Q*(*I3) + C8Q*(*R5) - C8Q*(*I5) - C8Q*(*R7) - C8Q*(*I7);\n\t"
+ "TR6 = (*R0) + (*R4) - (*R2) - (*R6) + (*I1) - (*I3) + (*I5) - (*I7);\n\t"
+ "TR7 = (*R0) - (*R4) + (*I2) - (*I6) + C8Q*(*R1) + C8Q*(*I1) - C8Q*(*R3) + C8Q*(*I3) - C8Q*(*R5) - C8Q*(*I5) + C8Q*(*R7) - C8Q*(*I7);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = (*I0) + (*I4) + (*I2) + (*I6) + (*I1) + (*I3) + (*I5) + (*I7);\n\t"
+ "TI1 = (*I0) - (*I4) + (*R2) - (*R6) + C8Q*(*R1) + C8Q*(*I1) + C8Q*(*R3) - C8Q*(*I3) - C8Q*(*R5) - C8Q*(*I5) - C8Q*(*R7) + C8Q*(*I7);\n\t"
+ "TI2 = (*I0) + (*I4) - (*I2) - (*I6) + (*R1) - (*R3) + (*R5) - (*R7) ;\n\t"
+ "TI3 = (*I0) - (*I4) - (*R2) + (*R6) + C8Q*(*R1) - C8Q*(*I1) + C8Q*(*R3) + C8Q*(*I3) - C8Q*(*R5) + C8Q*(*I5) - C8Q*(*R7) - C8Q*(*I7);\n\t"
+ "TI4 = (*I0) + (*I4) + (*I2) + (*I6) - (*I1) - (*I3) - (*I5) - (*I7);\n\t"
+ "TI5 = (*I0) - (*I4) + (*R2) - (*R6) - C8Q*(*R1) - C8Q*(*I1) - C8Q*(*R3) + C8Q*(*I3) + C8Q*(*R5) + C8Q*(*I5) + C8Q*(*R7) - C8Q*(*I7);\n\t"
+ "TI6 = (*I0) + (*I4) - (*I2) - (*I6) - (*R1) + (*R3) - (*R5) + (*R7) ;\n\t"
+ "TI7 = (*I0) - (*I4) - (*R2) + (*R6) - C8Q*(*R1) + C8Q*(*I1) - C8Q*(*R3) - C8Q*(*I3) + C8Q*(*R5) - C8Q*(*I5) + C8Q*(*R7) + C8Q*(*I7);\n\t";
+ }
+ }
+ } break;
+ case 10:
+ {
+ if(fwd)
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "TR0 = (*R0).x + (*R2).x + (*R4).x + (*R6).x + (*R8).x;\n\t"
+ "TR2 = ((*R0).x - C5QC*((*R4).x + (*R6).x)) + C5QB*((*R2).y - (*R8).y) + C5QD*((*R4).y - (*R6).y) + C5QA*(((*R2).x - (*R4).x) + ((*R8).x - (*R6).x));\n\t"
+ "TR8 = ((*R0).x - C5QC*((*R4).x + (*R6).x)) - C5QB*((*R2).y - (*R8).y) - C5QD*((*R4).y - (*R6).y) + C5QA*(((*R2).x - (*R4).x) + ((*R8).x - (*R6).x));\n\t"
+ "TR4 = ((*R0).x - C5QC*((*R2).x + (*R8).x)) - C5QB*((*R4).y - (*R6).y) + C5QD*((*R2).y - (*R8).y) + C5QA*(((*R4).x - (*R2).x) + ((*R6).x - (*R8).x));\n\t"
+ "TR6 = ((*R0).x - C5QC*((*R2).x + (*R8).x)) + C5QB*((*R4).y - (*R6).y) - C5QD*((*R2).y - (*R8).y) + C5QA*(((*R4).x - (*R2).x) + ((*R6).x - (*R8).x));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = (*R0).y + (*R2).y + (*R4).y + (*R6).y + (*R8).y;\n\t"
+ "TI2 = ((*R0).y - C5QC*((*R4).y + (*R6).y)) - C5QB*((*R2).x - (*R8).x) - C5QD*((*R4).x - (*R6).x) + C5QA*(((*R2).y - (*R4).y) + ((*R8).y - (*R6).y));\n\t"
+ "TI8 = ((*R0).y - C5QC*((*R4).y + (*R6).y)) + C5QB*((*R2).x - (*R8).x) + C5QD*((*R4).x - (*R6).x) + C5QA*(((*R2).y - (*R4).y) + ((*R8).y - (*R6).y));\n\t"
+ "TI4 = ((*R0).y - C5QC*((*R2).y + (*R8).y)) + C5QB*((*R4).x - (*R6).x) - C5QD*((*R2).x - (*R8).x) + C5QA*(((*R4).y - (*R2).y) + ((*R6).y - (*R8).y));\n\t"
+ "TI6 = ((*R0).y - C5QC*((*R2).y + (*R8).y)) - C5QB*((*R4).x - (*R6).x) + C5QD*((*R2).x - (*R8).x) + C5QA*(((*R4).y - (*R2).y) + ((*R6).y - (*R8).y));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TR1 = (*R1).x + (*R3).x + (*R5).x + (*R7).x + (*R9).x;\n\t"
+ "TR3 = ((*R1).x - C5QC*((*R5).x + (*R7).x)) + C5QB*((*R3).y - (*R9).y) + C5QD*((*R5).y - (*R7).y) + C5QA*(((*R3).x - (*R5).x) + ((*R9).x - (*R7).x));\n\t"
+ "TR9 = ((*R1).x - C5QC*((*R5).x + (*R7).x)) - C5QB*((*R3).y - (*R9).y) - C5QD*((*R5).y - (*R7).y) + C5QA*(((*R3).x - (*R5).x) + ((*R9).x - (*R7).x));\n\t"
+ "TR5 = ((*R1).x - C5QC*((*R3).x + (*R9).x)) - C5QB*((*R5).y - (*R7).y) + C5QD*((*R3).y - (*R9).y) + C5QA*(((*R5).x - (*R3).x) + ((*R7).x - (*R9).x));\n\t"
+ "TR7 = ((*R1).x - C5QC*((*R3).x + (*R9).x)) + C5QB*((*R5).y - (*R7).y) - C5QD*((*R3).y - (*R9).y) + C5QA*(((*R5).x - (*R3).x) + ((*R7).x - (*R9).x));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI1 = (*R1).y + (*R3).y + (*R5).y + (*R7).y + (*R9).y;\n\t"
+ "TI3 = ((*R1).y - C5QC*((*R5).y + (*R7).y)) - C5QB*((*R3).x - (*R9).x) - C5QD*((*R5).x - (*R7).x) + C5QA*(((*R3).y - (*R5).y) + ((*R9).y - (*R7).y));\n\t"
+ "TI9 = ((*R1).y - C5QC*((*R5).y + (*R7).y)) + C5QB*((*R3).x - (*R9).x) + C5QD*((*R5).x - (*R7).x) + C5QA*(((*R3).y - (*R5).y) + ((*R9).y - (*R7).y));\n\t"
+ "TI5 = ((*R1).y - C5QC*((*R3).y + (*R9).y)) + C5QB*((*R5).x - (*R7).x) - C5QD*((*R3).x - (*R9).x) + C5QA*(((*R5).y - (*R3).y) + ((*R7).y - (*R9).y));\n\t"
+ "TI7 = ((*R1).y - C5QC*((*R3).y + (*R9).y)) - C5QB*((*R5).x - (*R7).x) + C5QD*((*R3).x - (*R9).x) + C5QA*(((*R5).y - (*R3).y) + ((*R7).y - (*R9).y));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R0).x = TR0 + TR1;\n\t"
+ "(*R1).x = TR2 + ( C5QE*TR3 + C5QD*TI3);\n\t"
+ "(*R2).x = TR4 + ( C5QA*TR5 + C5QB*TI5);\n\t"
+ "(*R3).x = TR6 + (-C5QA*TR7 + C5QB*TI7);\n\t"
+ "(*R4).x = TR8 + (-C5QE*TR9 + C5QD*TI9);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R0).y = TI0 + TI1;\n\t"
+ "(*R1).y = TI2 + (-C5QD*TR3 + C5QE*TI3);\n\t"
+ "(*R2).y = TI4 + (-C5QB*TR5 + C5QA*TI5);\n\t"
+ "(*R3).y = TI6 + (-C5QB*TR7 - C5QA*TI7);\n\t"
+ "(*R4).y = TI8 + (-C5QD*TR9 - C5QE*TI9);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R5).x = TR0 - TR1;\n\t"
+ "(*R6).x = TR2 - ( C5QE*TR3 + C5QD*TI3);\n\t"
+ "(*R7).x = TR4 - ( C5QA*TR5 + C5QB*TI5);\n\t"
+ "(*R8).x = TR6 - (-C5QA*TR7 + C5QB*TI7);\n\t"
+ "(*R9).x = TR8 - (-C5QE*TR9 + C5QD*TI9);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R5).y = TI0 - TI1;\n\t"
+ "(*R6).y = TI2 - (-C5QD*TR3 + C5QE*TI3);\n\t"
+ "(*R7).y = TI4 - (-C5QB*TR5 + C5QA*TI5);\n\t"
+ "(*R8).y = TI6 - (-C5QB*TR7 - C5QA*TI7);\n\t"
+ "(*R9).y = TI8 - (-C5QD*TR9 - C5QE*TI9);\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = *R0 + *R2 + *R4 + *R6 + *R8;\n\t"
+ "TR2 = (*R0 - C5QC*(*R4 + *R6)) + C5QB*(*I2 - *I8) + C5QD*(*I4 - *I6) + C5QA*((*R2 - *R4) + (*R8 - *R6));\n\t"
+ "TR8 = (*R0 - C5QC*(*R4 + *R6)) - C5QB*(*I2 - *I8) - C5QD*(*I4 - *I6) + C5QA*((*R2 - *R4) + (*R8 - *R6));\n\t"
+ "TR4 = (*R0 - C5QC*(*R2 + *R8)) - C5QB*(*I4 - *I6) + C5QD*(*I2 - *I8) + C5QA*((*R4 - *R2) + (*R6 - *R8));\n\t"
+ "TR6 = (*R0 - C5QC*(*R2 + *R8)) + C5QB*(*I4 - *I6) - C5QD*(*I2 - *I8) + C5QA*((*R4 - *R2) + (*R6 - *R8));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = *I0 + *I2 + *I4 + *I6 + *I8;\n\t"
+ "TI2 = (*I0 - C5QC*(*I4 + *I6)) - C5QB*(*R2 - *R8) - C5QD*(*R4 - *R6) + C5QA*((*I2 - *I4) + (*I8 - *I6));\n\t"
+ "TI8 = (*I0 - C5QC*(*I4 + *I6)) + C5QB*(*R2 - *R8) + C5QD*(*R4 - *R6) + C5QA*((*I2 - *I4) + (*I8 - *I6));\n\t"
+ "TI4 = (*I0 - C5QC*(*I2 + *I8)) + C5QB*(*R4 - *R6) - C5QD*(*R2 - *R8) + C5QA*((*I4 - *I2) + (*I6 - *I8));\n\t"
+ "TI6 = (*I0 - C5QC*(*I2 + *I8)) - C5QB*(*R4 - *R6) + C5QD*(*R2 - *R8) + C5QA*((*I4 - *I2) + (*I6 - *I8));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TR1 = *R1 + *R3 + *R5 + *R7 + *R9;\n\t"
+ "TR3 = (*R1 - C5QC*(*R5 + *R7)) + C5QB*(*I3 - *I9) + C5QD*(*I5 - *I7) + C5QA*((*R3 - *R5) + (*R9 - *R7));\n\t"
+ "TR9 = (*R1 - C5QC*(*R5 + *R7)) - C5QB*(*I3 - *I9) - C5QD*(*I5 - *I7) + C5QA*((*R3 - *R5) + (*R9 - *R7));\n\t"
+ "TR5 = (*R1 - C5QC*(*R3 + *R9)) - C5QB*(*I5 - *I7) + C5QD*(*I3 - *I9) + C5QA*((*R5 - *R3) + (*R7 - *R9));\n\t"
+ "TR7 = (*R1 - C5QC*(*R3 + *R9)) + C5QB*(*I5 - *I7) - C5QD*(*I3 - *I9) + C5QA*((*R5 - *R3) + (*R7 - *R9));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI1 = *I1 + *I3 + *I5 + *I7 + *I9;\n\t"
+ "TI3 = (*I1 - C5QC*(*I5 + *I7)) - C5QB*(*R3 - *R9) - C5QD*(*R5 - *R7) + C5QA*((*I3 - *I5) + (*I9 - *I7));\n\t"
+ "TI9 = (*I1 - C5QC*(*I5 + *I7)) + C5QB*(*R3 - *R9) + C5QD*(*R5 - *R7) + C5QA*((*I3 - *I5) + (*I9 - *I7));\n\t"
+ "TI5 = (*I1 - C5QC*(*I3 + *I9)) + C5QB*(*R5 - *R7) - C5QD*(*R3 - *R9) + C5QA*((*I5 - *I3) + (*I7 - *I9));\n\t"
+ "TI7 = (*I1 - C5QC*(*I3 + *I9)) - C5QB*(*R5 - *R7) + C5QD*(*R3 - *R9) + C5QA*((*I5 - *I3) + (*I7 - *I9));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R0) = TR0 + TR1;\n\t"
+ "(*R1) = TR2 + ( C5QE*TR3 + C5QD*TI3);\n\t"
+ "(*R2) = TR4 + ( C5QA*TR5 + C5QB*TI5);\n\t"
+ "(*R3) = TR6 + (-C5QA*TR7 + C5QB*TI7);\n\t"
+ "(*R4) = TR8 + (-C5QE*TR9 + C5QD*TI9);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*I0) = TI0 + TI1;\n\t"
+ "(*I1) = TI2 + (-C5QD*TR3 + C5QE*TI3);\n\t"
+ "(*I2) = TI4 + (-C5QB*TR5 + C5QA*TI5);\n\t"
+ "(*I3) = TI6 + (-C5QB*TR7 - C5QA*TI7);\n\t"
+ "(*I4) = TI8 + (-C5QD*TR9 - C5QE*TI9);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R5) = TR0 - TR1;\n\t"
+ "(*R6) = TR2 - ( C5QE*TR3 + C5QD*TI3);\n\t"
+ "(*R7) = TR4 - ( C5QA*TR5 + C5QB*TI5);\n\t"
+ "(*R8) = TR6 - (-C5QA*TR7 + C5QB*TI7);\n\t"
+ "(*R9) = TR8 - (-C5QE*TR9 + C5QD*TI9);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*I5) = TI0 - TI1;\n\t"
+ "(*I6) = TI2 - (-C5QD*TR3 + C5QE*TI3);\n\t"
+ "(*I7) = TI4 - (-C5QB*TR5 + C5QA*TI5);\n\t"
+ "(*I8) = TI6 - (-C5QB*TR7 - C5QA*TI7);\n\t"
+ "(*I9) = TI8 - (-C5QD*TR9 - C5QE*TI9);\n\t";
+ }
+ }
+ else
+ {
+ if(cReg)
+ {
+ bflyStr +=
+ "TR0 = (*R0).x + (*R2).x + (*R4).x + (*R6).x + (*R8).x;\n\t"
+ "TR2 = ((*R0).x - C5QC*((*R4).x + (*R6).x)) - C5QB*((*R2).y - (*R8).y) - C5QD*((*R4).y - (*R6).y) + C5QA*(((*R2).x - (*R4).x) + ((*R8).x - (*R6).x));\n\t"
+ "TR8 = ((*R0).x - C5QC*((*R4).x + (*R6).x)) + C5QB*((*R2).y - (*R8).y) + C5QD*((*R4).y - (*R6).y) + C5QA*(((*R2).x - (*R4).x) + ((*R8).x - (*R6).x));\n\t"
+ "TR4 = ((*R0).x - C5QC*((*R2).x + (*R8).x)) + C5QB*((*R4).y - (*R6).y) - C5QD*((*R2).y - (*R8).y) + C5QA*(((*R4).x - (*R2).x) + ((*R6).x - (*R8).x));\n\t"
+ "TR6 = ((*R0).x - C5QC*((*R2).x + (*R8).x)) - C5QB*((*R4).y - (*R6).y) + C5QD*((*R2).y - (*R8).y) + C5QA*(((*R4).x - (*R2).x) + ((*R6).x - (*R8).x));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = (*R0).y + (*R2).y + (*R4).y + (*R6).y + (*R8).y;\n\t"
+ "TI2 = ((*R0).y - C5QC*((*R4).y + (*R6).y)) + C5QB*((*R2).x - (*R8).x) + C5QD*((*R4).x - (*R6).x) + C5QA*(((*R2).y - (*R4).y) + ((*R8).y - (*R6).y));\n\t"
+ "TI8 = ((*R0).y - C5QC*((*R4).y + (*R6).y)) - C5QB*((*R2).x - (*R8).x) - C5QD*((*R4).x - (*R6).x) + C5QA*(((*R2).y - (*R4).y) + ((*R8).y - (*R6).y));\n\t"
+ "TI4 = ((*R0).y - C5QC*((*R2).y + (*R8).y)) - C5QB*((*R4).x - (*R6).x) + C5QD*((*R2).x - (*R8).x) + C5QA*(((*R4).y - (*R2).y) + ((*R6).y - (*R8).y));\n\t"
+ "TI6 = ((*R0).y - C5QC*((*R2).y + (*R8).y)) + C5QB*((*R4).x - (*R6).x) - C5QD*((*R2).x - (*R8).x) + C5QA*(((*R4).y - (*R2).y) + ((*R6).y - (*R8).y));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TR1 = (*R1).x + (*R3).x + (*R5).x + (*R7).x + (*R9).x;\n\t"
+ "TR3 = ((*R1).x - C5QC*((*R5).x + (*R7).x)) - C5QB*((*R3).y - (*R9).y) - C5QD*((*R5).y - (*R7).y) + C5QA*(((*R3).x - (*R5).x) + ((*R9).x - (*R7).x));\n\t"
+ "TR9 = ((*R1).x - C5QC*((*R5).x + (*R7).x)) + C5QB*((*R3).y - (*R9).y) + C5QD*((*R5).y - (*R7).y) + C5QA*(((*R3).x - (*R5).x) + ((*R9).x - (*R7).x));\n\t"
+ "TR5 = ((*R1).x - C5QC*((*R3).x + (*R9).x)) + C5QB*((*R5).y - (*R7).y) - C5QD*((*R3).y - (*R9).y) + C5QA*(((*R5).x - (*R3).x) + ((*R7).x - (*R9).x));\n\t"
+ "TR7 = ((*R1).x - C5QC*((*R3).x + (*R9).x)) - C5QB*((*R5).y - (*R7).y) + C5QD*((*R3).y - (*R9).y) + C5QA*(((*R5).x - (*R3).x) + ((*R7).x - (*R9).x));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI1 = (*R1).y + (*R3).y + (*R5).y + (*R7).y + (*R9).y;\n\t"
+ "TI3 = ((*R1).y - C5QC*((*R5).y + (*R7).y)) + C5QB*((*R3).x - (*R9).x) + C5QD*((*R5).x - (*R7).x) + C5QA*(((*R3).y - (*R5).y) + ((*R9).y - (*R7).y));\n\t"
+ "TI9 = ((*R1).y - C5QC*((*R5).y + (*R7).y)) - C5QB*((*R3).x - (*R9).x) - C5QD*((*R5).x - (*R7).x) + C5QA*(((*R3).y - (*R5).y) + ((*R9).y - (*R7).y));\n\t"
+ "TI5 = ((*R1).y - C5QC*((*R3).y + (*R9).y)) - C5QB*((*R5).x - (*R7).x) + C5QD*((*R3).x - (*R9).x) + C5QA*(((*R5).y - (*R3).y) + ((*R7).y - (*R9).y));\n\t"
+ "TI7 = ((*R1).y - C5QC*((*R3).y + (*R9).y)) + C5QB*((*R5).x - (*R7).x) - C5QD*((*R3).x - (*R9).x) + C5QA*(((*R5).y - (*R3).y) + ((*R7).y - (*R9).y));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R0).x = TR0 + TR1;\n\t"
+ "(*R1).x = TR2 + ( C5QE*TR3 - C5QD*TI3);\n\t"
+ "(*R2).x = TR4 + ( C5QA*TR5 - C5QB*TI5);\n\t"
+ "(*R3).x = TR6 + (-C5QA*TR7 - C5QB*TI7);\n\t"
+ "(*R4).x = TR8 + (-C5QE*TR9 - C5QD*TI9);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R0).y = TI0 + TI1;\n\t"
+ "(*R1).y = TI2 + ( C5QD*TR3 + C5QE*TI3);\n\t"
+ "(*R2).y = TI4 + ( C5QB*TR5 + C5QA*TI5);\n\t"
+ "(*R3).y = TI6 + ( C5QB*TR7 - C5QA*TI7);\n\t"
+ "(*R4).y = TI8 + ( C5QD*TR9 - C5QE*TI9);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R5).x = TR0 - TR1;\n\t"
+ "(*R6).x = TR2 - ( C5QE*TR3 - C5QD*TI3);\n\t"
+ "(*R7).x = TR4 - ( C5QA*TR5 - C5QB*TI5);\n\t"
+ "(*R8).x = TR6 - (-C5QA*TR7 - C5QB*TI7);\n\t"
+ "(*R9).x = TR8 - (-C5QE*TR9 - C5QD*TI9);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R5).y = TI0 - TI1;\n\t"
+ "(*R6).y = TI2 - ( C5QD*TR3 + C5QE*TI3);\n\t"
+ "(*R7).y = TI4 - ( C5QB*TR5 + C5QA*TI5);\n\t"
+ "(*R8).y = TI6 - ( C5QB*TR7 - C5QA*TI7);\n\t"
+ "(*R9).y = TI8 - ( C5QD*TR9 - C5QE*TI9);\n\t";
+ }
+ else
+ {
+ bflyStr +=
+ "TR0 = *R0 + *R2 + *R4 + *R6 + *R8;\n\t"
+ "TR2 = (*R0 - C5QC*(*R4 + *R6)) - C5QB*(*I2 - *I8) - C5QD*(*I4 - *I6) + C5QA*((*R2 - *R4) + (*R8 - *R6));\n\t"
+ "TR8 = (*R0 - C5QC*(*R4 + *R6)) + C5QB*(*I2 - *I8) + C5QD*(*I4 - *I6) + C5QA*((*R2 - *R4) + (*R8 - *R6));\n\t"
+ "TR4 = (*R0 - C5QC*(*R2 + *R8)) + C5QB*(*I4 - *I6) - C5QD*(*I2 - *I8) + C5QA*((*R4 - *R2) + (*R6 - *R8));\n\t"
+ "TR6 = (*R0 - C5QC*(*R2 + *R8)) - C5QB*(*I4 - *I6) + C5QD*(*I2 - *I8) + C5QA*((*R4 - *R2) + (*R6 - *R8));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI0 = *I0 + *I2 + *I4 + *I6 + *I8;\n\t"
+ "TI2 = (*I0 - C5QC*(*I4 + *I6)) + C5QB*(*R2 - *R8) + C5QD*(*R4 - *R6) + C5QA*((*I2 - *I4) + (*I8 - *I6));\n\t"
+ "TI8 = (*I0 - C5QC*(*I4 + *I6)) - C5QB*(*R2 - *R8) - C5QD*(*R4 - *R6) + C5QA*((*I2 - *I4) + (*I8 - *I6));\n\t"
+ "TI4 = (*I0 - C5QC*(*I2 + *I8)) - C5QB*(*R4 - *R6) + C5QD*(*R2 - *R8) + C5QA*((*I4 - *I2) + (*I6 - *I8));\n\t"
+ "TI6 = (*I0 - C5QC*(*I2 + *I8)) + C5QB*(*R4 - *R6) - C5QD*(*R2 - *R8) + C5QA*((*I4 - *I2) + (*I6 - *I8));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TR1 = *R1 + *R3 + *R5 + *R7 + *R9;\n\t"
+ "TR3 = (*R1 - C5QC*(*R5 + *R7)) - C5QB*(*I3 - *I9) - C5QD*(*I5 - *I7) + C5QA*((*R3 - *R5) + (*R9 - *R7));\n\t"
+ "TR9 = (*R1 - C5QC*(*R5 + *R7)) + C5QB*(*I3 - *I9) + C5QD*(*I5 - *I7) + C5QA*((*R3 - *R5) + (*R9 - *R7));\n\t"
+ "TR5 = (*R1 - C5QC*(*R3 + *R9)) + C5QB*(*I5 - *I7) - C5QD*(*I3 - *I9) + C5QA*((*R5 - *R3) + (*R7 - *R9));\n\t"
+ "TR7 = (*R1 - C5QC*(*R3 + *R9)) - C5QB*(*I5 - *I7) + C5QD*(*I3 - *I9) + C5QA*((*R5 - *R3) + (*R7 - *R9));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "TI1 = *I1 + *I3 + *I5 + *I7 + *I9;\n\t"
+ "TI3 = (*I1 - C5QC*(*I5 + *I7)) + C5QB*(*R3 - *R9) + C5QD*(*R5 - *R7) + C5QA*((*I3 - *I5) + (*I9 - *I7));\n\t"
+ "TI9 = (*I1 - C5QC*(*I5 + *I7)) - C5QB*(*R3 - *R9) - C5QD*(*R5 - *R7) + C5QA*((*I3 - *I5) + (*I9 - *I7));\n\t"
+ "TI5 = (*I1 - C5QC*(*I3 + *I9)) - C5QB*(*R5 - *R7) + C5QD*(*R3 - *R9) + C5QA*((*I5 - *I3) + (*I7 - *I9));\n\t"
+ "TI7 = (*I1 - C5QC*(*I3 + *I9)) + C5QB*(*R5 - *R7) - C5QD*(*R3 - *R9) + C5QA*((*I5 - *I3) + (*I7 - *I9));\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R0) = TR0 + TR1;\n\t"
+ "(*R1) = TR2 + ( C5QE*TR3 - C5QD*TI3);\n\t"
+ "(*R2) = TR4 + ( C5QA*TR5 - C5QB*TI5);\n\t"
+ "(*R3) = TR6 + (-C5QA*TR7 - C5QB*TI7);\n\t"
+ "(*R4) = TR8 + (-C5QE*TR9 - C5QD*TI9);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*I0) = TI0 + TI1;\n\t"
+ "(*I1) = TI2 + ( C5QD*TR3 + C5QE*TI3);\n\t"
+ "(*I2) = TI4 + ( C5QB*TR5 + C5QA*TI5);\n\t"
+ "(*I3) = TI6 + ( C5QB*TR7 - C5QA*TI7);\n\t"
+ "(*I4) = TI8 + ( C5QD*TR9 - C5QE*TI9);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*R5) = TR0 - TR1;\n\t"
+ "(*R6) = TR2 - ( C5QE*TR3 - C5QD*TI3);\n\t"
+ "(*R7) = TR4 - ( C5QA*TR5 - C5QB*TI5);\n\t"
+ "(*R8) = TR6 - (-C5QA*TR7 - C5QB*TI7);\n\t"
+ "(*R9) = TR8 - (-C5QE*TR9 - C5QD*TI9);\n\t";
+
+ bflyStr += "\n\t";
+
+ bflyStr +=
+ "(*I5) = TI0 - TI1;\n\t"
+ "(*I6) = TI2 - ( C5QD*TR3 + C5QE*TI3);\n\t"
+ "(*I7) = TI4 - ( C5QB*TR5 + C5QA*TI5);\n\t"
+ "(*I8) = TI6 - ( C5QB*TR7 - C5QA*TI7);\n\t"
+ "(*I9) = TI8 - ( C5QD*TR9 - C5QE*TI9);\n\t";
+ }
+ }
+ } break;
+ case 16:
+ {
+ if(fwd)
+ {
+ if(cReg)
+ {
+ bflyStr +=
+
+ "(*R1) = (*R0) - (*R1);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R1);\n\t"
+ "(*R3) = (*R2) - (*R3);\n\t"
+ "(*R2) = 2.0f * (*R2) - (*R3);\n\t"
+ "(*R5) = (*R4) - (*R5);\n\t"
+ "(*R4) = 2.0f * (*R4) - (*R5);\n\t"
+ "(*R7) = (*R6) - (*R7);\n\t"
+ "(*R6) = 2.0f * (*R6) - (*R7);\n\t"
+ "(*R9) = (*R8) - (*R9);\n\t"
+ "(*R8) = 2.0f * (*R8) - (*R9);\n\t"
+ "(*R11) = (*R10) - (*R11);\n\t"
+ "(*R10) = 2.0f * (*R10) - (*R11);\n\t"
+ "(*R13) = (*R12) - (*R13);\n\t"
+ "(*R12) = 2.0f * (*R12) - (*R13);\n\t"
+ "(*R15) = (*R14) - (*R15);\n\t"
+ "(*R14) = 2.0f * (*R14) - (*R15);\n\t"
+ "\n\t"
+ "(*R2) = (*R0) - (*R2);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R2);\n\t"
+ "(*R3) = (*R1) + (fvect2)(-(*R3).y, (*R3).x);\n\t"
+ "(*R1) = 2.0f * (*R1) - (*R3);\n\t"
+ "(*R6) = (*R4) - (*R6);\n\t"
+ "(*R4) = 2.0f * (*R4) - (*R6);\n\t"
+ "(*R7) = (*R5) + (fvect2)(-(*R7).y, (*R7).x);\n\t"
+ "(*R5) = 2.0f * (*R5) - (*R7);\n\t"
+ "(*R10) = (*R8) - (*R10);\n\t"
+ "(*R8) = 2.0f * (*R8) - (*R10);\n\t"
+ "(*R11) = (*R9) + (fvect2)(-(*R11).y, (*R11).x);\n\t"
+ "(*R9) = 2.0f * (*R9) - (*R11);\n\t"
+ "(*R14) = (*R12) - (*R14);\n\t"
+ "(*R12) = 2.0f * (*R12) - (*R14);\n\t"
+ "(*R15) = (*R13) + (fvect2)(-(*R15).y, (*R15).x);\n\t"
+ "(*R13) = 2.0f * (*R13) - (*R15);\n\t"
+ "\n\t"
+ "(*R4) = (*R0) - (*R4);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R4);\n\t"
+ "(*R5) = ((*R1) - C8Q * (*R5)) - C8Q * (fvect2)((*R5).y, -(*R5).x);\n\t"
+ "(*R1) = 2.0f * (*R1) - (*R5);\n\t"
+ "(*R6) = (*R2) + (fvect2)(-(*R6).y, (*R6).x);\n\t"
+ "(*R2) = 2.0f * (*R2) - (*R6);\n\t"
+ "(*R7) = ((*R3) + C8Q * (*R7)) - C8Q * (fvect2)((*R7).y, -(*R7).x);\n\t"
+ "(*R3) = 2.0f * (*R3) - (*R7);\n\t"
+ "(*R12) = (*R8) - (*R12);\n\t"
+ "(*R8) = 2.0f * (*R8) - (*R12);\n\t"
+ "(*R13) = ((*R9) - C8Q * (*R13)) - C8Q * (fvect2)((*R13).y, -(*R13).x);\n\t"
+ "(*R9) = 2.0f * (*R9) - (*R13);\n\t"
+ "(*R14) = (*R10) + (fvect2)(-(*R14).y, (*R14).x);\n\t"
+ "(*R10) = 2.0f * (*R10) - (*R14);\n\t"
+ "(*R15) = ((*R11) + C8Q * (*R15)) - C8Q * (fvect2)((*R15).y, -(*R15).x);\n\t"
+ "(*R11) = 2.0f * (*R11) - (*R15);\n\t"
+ "\n\t"
+ "(*R8) = (*R0) - (*R8);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R8);\n\t"
+ "(*R9) = ((*R1) - 0.92387953251128675612818318939679 * (*R9)) - 0.3826834323650897717284599840304 * (fvect2)((*R9).y, -(*R9).x);\n\t"
+ "(*R1) = 2.0f * (*R1) - (*R9);\n\t"
+ "(*R10) = ((*R2) - C8Q * (*R10)) - C8Q * (fvect2)((*R10).y, -(*R10).x);\n\t"
+ "(*R2) = 2.0f * (*R2) - (*R10);\n\t"
+ "(*R11) = ((*R3) - 0.3826834323650897717284599840304 * (*R11)) - 0.92387953251128675612818318939679 * (fvect2)((*R11).y, -(*R11).x);\n\t"
+ "(*R3) = 2.0f * (*R3) - (*R11);\n\t"
+ "(*R12) = (*R4) + (fvect2)(-(*R12).y, (*R12).x);\n\t"
+ "(*R4) = 2.0f * (*R4) - (*R12);\n\t"
+ "(*R13) = ((*R5) + 0.3826834323650897717284599840304 * (*R13)) - 0.92387953251128675612818318939679 * (fvect2)((*R13).y, -(*R13).x);\n\t"
+ "(*R5) = 2.0f * (*R5) - (*R13);\n\t"
+ "(*R14) = ((*R6) + C8Q * (*R14)) - C8Q * (fvect2)((*R14).y, -(*R14).x);\n\t"
+ "(*R6) = 2.0f * (*R6) - (*R14);\n\t"
+ "(*R15) = ((*R7) + 0.92387953251128675612818318939679 * (*R15)) - 0.3826834323650897717284599840304 * (fvect2)((*R15).y, -(*R15).x);\n\t"
+ "(*R7) = 2.0f * (*R7) - (*R15);\n\t";
+
+ }
+ else
+ assert(false);
+ }
+ else
+ {
+ if(cReg)
+ {
+ bflyStr +=
+
+ "(*R1) = (*R0) - (*R1);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R1);\n\t"
+ "(*R3) = (*R2) - (*R3);\n\t"
+ "(*R2) = 2.0f * (*R2) - (*R3);\n\t"
+ "(*R5) = (*R4) - (*R5);\n\t"
+ "(*R4) = 2.0f * (*R4) - (*R5);\n\t"
+ "(*R7) = (*R6) - (*R7);\n\t"
+ "(*R6) = 2.0f * (*R6) - (*R7);\n\t"
+ "(*R9) = (*R8) - (*R9);\n\t"
+ "(*R8) = 2.0f * (*R8) - (*R9);\n\t"
+ "(*R11) = (*R10) - (*R11);\n\t"
+ "(*R10) = 2.0f * (*R10) - (*R11);\n\t"
+ "(*R13) = (*R12) - (*R13);\n\t"
+ "(*R12) = 2.0f * (*R12) - (*R13);\n\t"
+ "(*R15) = (*R14) - (*R15);\n\t"
+ "(*R14) = 2.0f * (*R14) - (*R15);\n\t"
+ "\n\t"
+ "(*R2) = (*R0) - (*R2);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R2);\n\t"
+ "(*R3) = (*R1) + (fvect2)((*R3).y, -(*R3).x);\n\t"
+ "(*R1) = 2.0f * (*R1) - (*R3);\n\t"
+ "(*R6) = (*R4) - (*R6);\n\t"
+ "(*R4) = 2.0f * (*R4) - (*R6);\n\t"
+ "(*R7) = (*R5) + (fvect2)((*R7).y, -(*R7).x);\n\t"
+ "(*R5) = 2.0f * (*R5) - (*R7);\n\t"
+ "(*R10) = (*R8) - (*R10);\n\t"
+ "(*R8) = 2.0f * (*R8) - (*R10);\n\t"
+ "(*R11) = (*R9) + (fvect2)((*R11).y, -(*R11).x);\n\t"
+ "(*R9) = 2.0f * (*R9) - (*R11);\n\t"
+ "(*R14) = (*R12) - (*R14);\n\t"
+ "(*R12) = 2.0f * (*R12) - (*R14);\n\t"
+ "(*R15) = (*R13) + (fvect2)((*R15).y, -(*R15).x);\n\t"
+ "(*R13) = 2.0f * (*R13) - (*R15);\n\t"
+ "\n\t"
+ "(*R4) = (*R0) - (*R4);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R4);\n\t"
+ "(*R5) = ((*R1) - C8Q * (*R5)) + C8Q * (fvect2)((*R5).y, -(*R5).x);\n\t"
+ "(*R1) = 2.0f * (*R1) - (*R5);\n\t"
+ "(*R6) = (*R2) + (fvect2)((*R6).y, -(*R6).x);\n\t"
+ "(*R2) = 2.0f * (*R2) - (*R6);\n\t"
+ "(*R7) = ((*R3) + C8Q * (*R7)) + C8Q * (fvect2)((*R7).y, -(*R7).x);\n\t"
+ "(*R3) = 2.0f * (*R3) - (*R7);\n\t"
+ "(*R12) = (*R8) - (*R12);\n\t"
+ "(*R8) = 2.0f * (*R8) - (*R12);\n\t"
+ "(*R13) = ((*R9) - C8Q * (*R13)) + C8Q * (fvect2)((*R13).y, -(*R13).x);\n\t"
+ "(*R9) = 2.0f * (*R9) - (*R13);\n\t"
+ "(*R14) = (*R10) + (fvect2)((*R14).y, -(*R14).x);\n\t"
+ "(*R10) = 2.0f * (*R10) - (*R14);\n\t"
+ "(*R15) = ((*R11) + C8Q * (*R15)) + C8Q * (fvect2)((*R15).y, -(*R15).x);\n\t"
+ "(*R11) = 2.0f * (*R11) - (*R15);\n\t"
+ "\n\t"
+ "(*R8) = (*R0) - (*R8);\n\t"
+ "(*R0) = 2.0f * (*R0) - (*R8);\n\t"
+ "(*R9) = ((*R1) - 0.92387953251128675612818318939679 * (*R9)) + 0.3826834323650897717284599840304 * (fvect2)((*R9).y, -(*R9).x);\n\t"
+ "(*R1) = 2.0f * (*R1) - (*R9);\n\t"
+ "(*R10) = ((*R2) - C8Q * (*R10)) + C8Q * (fvect2)((*R10).y, -(*R10).x);\n\t"
+ "(*R2) = 2.0f * (*R2) - (*R10);\n\t"
+ "(*R11) = ((*R3) - 0.3826834323650897717284599840304 * (*R11)) + 0.92387953251128675612818318939679 * (fvect2)((*R11).y, -(*R11).x);\n\t"
+ "(*R3) = 2.0f * (*R3) - (*R11);\n\t"
+ "(*R12) = (*R4) + (fvect2)((*R12).y, -(*R12).x);\n\t"
+ "(*R4) = 2.0f * (*R4) - (*R12);\n\t"
+ "(*R13) = ((*R5) + 0.3826834323650897717284599840304 * (*R13)) + 0.92387953251128675612818318939679 * (fvect2)((*R13).y, -(*R13).x);\n\t"
+ "(*R5) = 2.0f * (*R5) - (*R13);\n\t"
+ "(*R14) = ((*R6) + C8Q * (*R14)) + C8Q * (fvect2)((*R14).y, -(*R14).x);\n\t"
+ "(*R6) = 2.0f * (*R6) - (*R14);\n\t"
+ "(*R15) = ((*R7) + 0.92387953251128675612818318939679 * (*R15)) + 0.3826834323650897717284599840304 * (fvect2)((*R15).y, -(*R15).x);\n\t"
+ "(*R7) = 2.0f * (*R7) - (*R15);\n\t";
+
+ }
+ else
+ assert(false);
+ }
+ } break;
+ default:
+ assert(false);
+ }
+
+ bflyStr += "\n\t";
+
+ // Assign results
+ if( (radix & (radix-1)) || (!cReg) )
+ {
+ if( (radix != 10) && (radix != 6) )
+ {
+ for(size_t i=0; i 0)
+ GenerateButterflyStr(bflyStr);
+ }
+ };
+
+};
+
+#endif
+
diff --git a/src/library/generator.transpose.cpp b/src/library/generator.transpose.cpp
new file mode 100644
index 00000000..3b12504f
--- /dev/null
+++ b/src/library/generator.transpose.cpp
@@ -0,0 +1,837 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// generator.transpose.cpp : Dynamic run-time generator of OpenCL transpose kernels
+//
+
+// TODO: generalize the kernel to work with any size
+
+#include "stdafx.h"
+#include
+#include "generator.transpose.h"
+
+#define QUOTEMARK(x) #x // Stringize x: the argument's tokens are turned into a string literal
+
+#define PLANNAR_READ(z0, z1, z2, z3, gp) /* Appends four kernel-source statements to ss, loading z0..z3 from gp[k*HSTRIDE/4*8], k=0..3; expects ss and INDENT2 at the expansion site */ \
+ ss << INDENT2 << QUOTEMARK(z0 = gp[0*HSTRIDE/4*8];\n) \
+ << INDENT2 << QUOTEMARK(z1 = gp[1*HSTRIDE/4*8];\n) \
+ << INDENT2 << QUOTEMARK(z2 = gp[2*HSTRIDE/4*8];\n) \
+ << INDENT2 << QUOTEMARK(z3 = gp[3*HSTRIDE/4*8];\n); // HSTRIDE stays as text here; the generator emits a matching #define into the kernel source
+
+#define INTERLEAVED_READ(z00, z01, z10, z11, z20, z21, z30, z31) /* Appends eight kernel-source loads to ss: for k=0..3, zk0 = gp[k*HSTRIDE/4*16] and zk1 = the element right after it; expects ss and INDENT2 at the expansion site */ \
+ ss << INDENT2 << QUOTEMARK(z00 = gp[0*HSTRIDE/4*16];\n) \
+ << INDENT2 << QUOTEMARK(z01 = gp[0*HSTRIDE/4*16 + 1];\n) \
+ << INDENT2 << QUOTEMARK(z10 = gp[1*HSTRIDE/4*16];\n) \
+ << INDENT2 << QUOTEMARK(z11 = gp[1*HSTRIDE/4*16 + 1];\n) \
+ << INDENT2 << QUOTEMARK(z20 = gp[2*HSTRIDE/4*16];\n) \
+ << INDENT2 << QUOTEMARK(z21 = gp[2*HSTRIDE/4*16 + 1];\n) \
+ << INDENT2 << QUOTEMARK(z30 = gp[3*HSTRIDE/4*16];\n) \
+ << INDENT2 << QUOTEMARK(z31 = gp[3*HSTRIDE/4*16 + 1];\n); // unlike PLANNAR_READ, the pointer name gp is fixed text, not a macro parameter
+
+#define PLANNAR_WRITE(z0, z1, z2, z3, gp) /* Appends four kernel-source statements to ss, storing z0..z3 to gp[k*VSTRIDE/4*8], k=0..3; expects ss and INDENT2 at the expansion site */ \
+ ss << INDENT2 << QUOTEMARK(gp[0*VSTRIDE/4*8] = z0;\n) \
+ << INDENT2 << QUOTEMARK(gp[1*VSTRIDE/4*8] = z1;\n) \
+ << INDENT2 << QUOTEMARK(gp[2*VSTRIDE/4*8] = z2;\n) \
+ << INDENT2 << QUOTEMARK(gp[3*VSTRIDE/4*8] = z3;\n); // VSTRIDE stays as text here; the generator emits a matching #define into the kernel source
+
+#define INTERLEAVED_WRITE(z00, z01, z10, z11, z20, z21, z30, z31) /* Appends eight kernel-source stores to ss: for k=0..3, gp[k*VSTRIDE/4*16] = zk0 and the element right after it = zk1; expects ss and INDENT2 at the expansion site */ \
+ ss << INDENT2 << QUOTEMARK(gp[0*VSTRIDE/4*16] = z00;\n) \
+ << INDENT2 << QUOTEMARK(gp[0*VSTRIDE/4*16+1] = z01;\n) \
+ << INDENT2 << QUOTEMARK(gp[1*VSTRIDE/4*16] = z10;\n) \
+ << INDENT2 << QUOTEMARK(gp[1*VSTRIDE/4*16+1] = z11;\n) \
+ << INDENT2 << QUOTEMARK(gp[2*VSTRIDE/4*16] = z20;\n) \
+ << INDENT2 << QUOTEMARK(gp[2*VSTRIDE/4*16+1] = z21;\n) \
+ << INDENT2 << QUOTEMARK(gp[3*VSTRIDE/4*16] = z30;\n) \
+ << INDENT2 << QUOTEMARK(gp[3*VSTRIDE/4*16+1] = z31;\n); // unlike PLANNAR_WRITE, the pointer name gp is fixed text, not a macro parameter
+
+#define WRITE_TO_LDS(lp, jump, z0, z1, z2, z3, part) /* Appends kernel-source statements to ss that store component(s) .part of z0..z3 to lp[k*jump], k=0..3, then advance lp; expects ss and INDENT2 at the expansion site */ \
+ ss << INDENT2 << QUOTEMARK(lp[0*jump] = z0.part;\n) \
+ << INDENT2 << QUOTEMARK(lp[1*jump] = z1.part;\n) \
+ << INDENT2 << QUOTEMARK(lp[2*jump] = z2.part;\n) \
+ << INDENT2 << QUOTEMARK(lp[3*jump] = z3.part;\n) \
+ << INDENT2 << QUOTEMARK(lp += jump*4;\n) \
+ << "\n"; // blank line separating this group from the next in the generated source
+
+typedef enum inputoutputflag_ // Input/output layout pairing that GenerateTransposeKernel selects from the plan's input and output layouts
+{
+ PLANNAR_PLANNAR = 1, // planar input, planar output (numbering starts at 1)
+ PLANNAR_INTERLEAVED, // planar input, interleaved output
+ INTERLEAVED_PLANNAR, // interleaved input, planar output
+ INTERLEAVED_INTERLEAVED, // interleaved input, interleaved output
+ ENDTRANSIO // last enumerator; presumably an end marker -- not referenced in the code visible here
+} transio;
+
+static clfftStatus GenerateTransposeKernel (FFTKernelGenKeyParams & params,
+ std::string & kernel)
+{
+ kernel.reserve (8000);
+ std::stringstream ss (std::stringstream::out);
+
+ const char * szIn0 = "";
+ const char * szIn1 = "";
+ const char * szOut0 = "";
+ const char * szOut1 = "";
+ const char * typeIn = "";
+ const char * typeOut = "";
+ const char * INDENT2 = "";
+ const char * INDENT = " ";
+ const char * datatype="";
+ const char * datatype2="";
+ bool xyflag = (params.fft_N[0] == params.fft_N[1]) ? false : true;
+ transio iotype;
+
+
+ if (params.fft_precision == CLFFT_SINGLE)
+ {
+ datatype = "float";
+ datatype2 = "float2";
+ }
+ else
+ {
+ datatype = "double";
+ datatype2 = "double2";
+ ss << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n\n";
+ }
+
+ size_t hstride = params.fft_N[0];
+ size_t vstride = params.fft_N[1];
+
+ ss << "#define HSTRIDE " << hstride << "\n";
+ ss << "#define VSTRIDE " << vstride << "\n";
+ if (xyflag)
+ {
+ ss << "#define DIMX " << hstride/32 << "\n";
+ ss << "#define DIMY " << vstride/32 << "\n";
+ ss << "#define DIM ((DIMX > DIMY) ? DIMX : DIMY) \n\n";
+ INDENT2 = " ";
+ }
+ else
+ {
+ ss << "#define DIM " << vstride/32 << "\n\n";
+ INDENT2 = " ";
+ }
+
+ // Generate the kernel entry point and parameter list
+ //
+ ss << "__attribute__((reqd_work_group_size(" << unsigned(params.fft_SIMD) << ",1,1)))\n"
+ << "__kernel void\n"
+ << "fft_trans" << "(";
+
+ if (xyflag && params.fft_placeness == CLFFT_INPLACE) return CLFFT_INVALID_ARG_VALUE;
+
+ switch (params.fft_inputLayout) {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ typeIn = datatype2;
+ if (params.fft_placeness == CLFFT_INPLACE)
+ {
+ szIn0 = szOut0 = "gcomplx";
+ typeOut = datatype2;
+ ss << "__global " << typeIn << " * restrict " << szIn0;
+ iotype = INTERLEAVED_INTERLEAVED;
+ }
+ else
+ {
+ szIn0 = "gcomplxIn";
+ ss << "__global " << typeIn << " * restrict " << szIn0;
+
+ if (params.fft_outputLayout == CLFFT_COMPLEX_PLANAR)
+ {
+ szOut0 = "grealOut";
+ szOut1 = "gimagOut";
+ typeOut = datatype;
+ ss <<", __global " << typeOut <<" * restrict " << szOut0
+ <<", __global " << typeOut <<" * restrict " << szOut1;
+ iotype = INTERLEAVED_PLANNAR;
+ }
+ else
+ {
+ szOut0 = "gcomplxOut";
+ typeOut = datatype2;
+ ss <<", __global " << typeOut <<" * restrict " << szOut0;
+ iotype = INTERLEAVED_INTERLEAVED;
+ }
+ }
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+ typeIn = datatype;
+ if (params.fft_placeness == CLFFT_INPLACE)
+ {
+ szIn0 = szOut0 = "greal";
+ szIn1 = szOut1 = "gimag";
+ typeOut = datatype;
+ ss << "__global " << typeIn << " * restrict " << szIn0 <<", __global " << typeIn <<" * restrict " << szIn1;
+ iotype = PLANNAR_PLANNAR;
+ }
+ else
+ {
+ szIn0 = "greadIn";
+ szIn1 = "gimagIn";
+ ss <<"__global " << typeIn << " * restrict " << szIn0 <<", __global " << typeIn <<" * restrict " << szIn1;
+ if (params.fft_outputLayout == CLFFT_COMPLEX_PLANAR)
+ {
+ szOut0 = "grealOut";
+ szOut1 = "gimagOut";
+ typeOut = datatype;
+ ss << ", __global " << typeOut <<" * restrict " << szOut0 <<", __global " << typeOut <<" * restrict " << szOut1;
+ iotype = PLANNAR_PLANNAR;
+ }
+ else
+ {
+ szOut0 = "gcomplxOut";
+ typeOut = datatype2;
+ ss << ", __global " << typeOut <<" * restrict " << szOut0;
+ iotype = PLANNAR_INTERLEAVED;
+ }
+ }
+ break;
+ default:
+ return CLFFT_NOTIMPLEMENTED;
+ }
+ ss << ")\n{\n";
+
+ // Support planar and interleaved formats
+ switch (iotype)
+ {
+ case PLANNAR_INTERLEAVED:
+ ss << INDENT << "__local " << typeIn << " ldsa[2048];\n"
+ << INDENT << "__local " << typeIn << " ldsb[2048];\n";
+ break;
+ case INTERLEAVED_PLANNAR:
+ case PLANNAR_PLANNAR:
+ case INTERLEAVED_INTERLEAVED:
+ ss << INDENT << "__local " << typeIn << " ldsa[1024];\n"
+ << INDENT << "__local " << typeIn << " ldsb[1024];\n";
+ break;
+ default:
+ return CLFFT_NOTIMPLEMENTED;
+ }
+
+ ss << INDENT << "uint gid = get_global_id(0);\n"
+ << INDENT << "uint me = gid & 0x3fU;\n"
+ << INDENT << "uint k = (gid >> 6) % ";
+
+ // add batch support
+ size_t batchnum = (vstride > hstride) ? ((vstride/32) * (vstride/32 +1) /2)
+ : ((hstride/32) * (hstride/32 +1) /2);
+ ss << batchnum
+ << ";\n"
+ << "\n";
+
+ ss << INDENT << "// Compute location of blocks\n"
+ << INDENT << "int l = DIM+0.5f - native_sqrt((DIM+0.5f)*(DIM+0.5f) - 2.0f * (float)as_int(k));\n"
+ << INDENT << "int kl = ((DIM*2+1 - l) * l) >> 1;\n"
+ << INDENT << "uint j = k - kl;\n"
+ << INDENT << "uint i = l + j;\n"
+ << "\n";
+
+ ss << INDENT << "uint goa, gob;\n"
+ << INDENT << "uint go = ((me & 0x7U) << 2) + ((gid>>6)/" << batchnum << ") * VSTRIDE * HSTRIDE;\n"
+ << INDENT << "__global " << datatype << "4 *gp;\n"
+ << INDENT << "__local " << datatype << "4 *lp4;\n"
+ << INDENT << "uint lo = ((me & 0x7U) << 7) + (me >> 3);\n"
+ << INDENT << "uint lot = (me<<2); \n";
+
+ switch (iotype)
+ {
+ case PLANNAR_PLANNAR:
+ ss << INDENT << datatype <<"4 z0, z1, z2, z3;\n\n"
+ << INDENT << "__local " << typeIn <<" *lp;\n";
+ break;
+ case PLANNAR_INTERLEAVED:
+ ss << INDENT << "__global " << datatype << "4 *gpi;\n"
+ << INDENT << "__local " << typeIn <<" *lp;\n"
+ << INDENT << datatype << "4 z00, z01, z10, z11, z20, z21, z30, z31;\n\n";
+ break;
+ case INTERLEAVED_PLANNAR:
+ ss << INDENT << "__local " << typeOut <<" *lp;\n"
+ << INDENT << datatype << "4 z00, z01, z10, z11, z20, z21, z30, z31;\n\n";
+ break;
+ case INTERLEAVED_INTERLEAVED:
+ ss << INDENT << "__local " << typeIn <<" *lp;\n"
+ << INDENT << datatype << "4 z00, z01, z10, z11, z20, z21, z30, z31;\n\n";
+ break;
+ }
+
+ if (xyflag)
+ {
+ ss << INDENT << "if ( i < DIMX && j < DIMY) \n"
+ << INDENT << "{\n";
+ }
+
+ ss << INDENT2 << "// Array offsets\n"
+ << INDENT2 << "goa = go + (i << 5) + j * (HSTRIDE*32) + (me >> 3)*HSTRIDE;\n"
+ <<"\n";
+
+ ss << INDENT2 << "// Load A block\n"
+ << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szIn0 <<" + goa);\n";
+
+ switch (iotype)
+ {
+ case PLANNAR_PLANNAR:
+ PLANNAR_READ(z0, z1, z2, z3, gp);
+ break;
+ case PLANNAR_INTERLEAVED:
+ PLANNAR_READ(z00, z10, z20, z30, gp);
+ ss << INDENT2 << "gpi = (__global " << datatype << "4 *)(" << szIn1 <<" + goa);\n";
+ PLANNAR_READ(z01, z11, z21, z31, gpi);
+ break;
+ default:
+ INTERLEAVED_READ(z00, z01, z10, z11, z20, z21, z30, z31);
+ break;
+ }
+ ss << "\n";
+
+ ss << INDENT2 << "// Save into LDS\n";
+ switch (iotype)
+ {
+ case PLANNAR_PLANNAR:
+ ss << INDENT2 << "lp = ldsa + lo;\n";
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, x);
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, y);
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, z);
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, w);
+ break;
+ case PLANNAR_INTERLEAVED:
+ ss << INDENT2 << "lp = ldsa + lo*2;\n";
+ WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, x);
+ WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, y);
+ WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, z);
+ WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, w);
+ ss << INDENT2 << "lp = ldsa + lo*2 + 1;\n";
+ WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, x);
+ WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, y);
+ WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, z);
+ WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, w);
+ break;
+ case INTERLEAVED_INTERLEAVED:
+ ss << INDENT2 << "lp = ldsa + lo;\n";
+ WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, xy);
+ WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, zw);
+ WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, xy);
+ WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, zw);
+ break;
+ case INTERLEAVED_PLANNAR:
+ ss << INDENT2 << "lp = (__local " << typeOut << "*)ldsa + lo;\n";
+ WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, x);
+ WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, z);
+ WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, x);
+ WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, z);
+ //next write to lp = ldsa+lo+1024
+ ss << INDENT2 << "lp += (1024-32*4);\n";
+ WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, y);
+ WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, w);
+ WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, y);
+ WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, w);
+ break;
+ }
+
+ ss << INDENT;
+ if (xyflag) ss << "} ";
+ ss <<"//End load A block\n\n";
+
+ if (xyflag)
+ ss << INDENT << "if (i < DIMY && j < DIMX) \n"
+ << INDENT << "{\n";
+
+ ss << INDENT2 << "// Load B block\n"
+ << INDENT2 << "gob = go + (j << 5) + i * (HSTRIDE*32) + (me >> 3)*HSTRIDE;\n"
+ << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szIn0 << " + gob);\n";
+
+ switch (iotype)
+ {
+ case PLANNAR_PLANNAR:
+ PLANNAR_READ(z0, z1, z2, z3, gp);
+ break;
+ case PLANNAR_INTERLEAVED:
+ PLANNAR_READ(z00, z10, z20, z30, gp);
+ ss << INDENT2 << "gpi = (__global " << datatype << "4 *)(" << szIn1 <<" + gob);\n";
+ PLANNAR_READ(z01, z11, z21, z31, gpi);
+ break;
+ default:
+ INTERLEAVED_READ(z00, z01, z10, z11, z20, z21, z30, z31);
+ break;
+ }
+ ss << "\n";
+
+ ss << INDENT2 << "// Save into LDS\n";
+ switch (iotype)
+ {
+ case PLANNAR_PLANNAR:
+ ss << INDENT2 << "lp = ldsb + lo;\n";
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, x);
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, y);
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, z);
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, w);
+ break;
+ case PLANNAR_INTERLEAVED:
+ ss << INDENT2 << "lp = ldsb + lo*2;\n";
+ WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, x);
+ WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, y);
+ WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, z);
+ WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, w);
+ ss << INDENT2 << "lp = ldsb + lo*2 + 1;\n";
+ WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, x);
+ WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, y);
+ WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, z);
+ WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, w);
+ break;
+ case INTERLEAVED_INTERLEAVED:
+ ss << INDENT2 << "lp = ldsb + lo;\n";
+ WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, xy);
+ WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, zw);
+ WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, xy);
+ WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, zw);
+ break;
+ case INTERLEAVED_PLANNAR:
+ ss << INDENT2 << "lp = (__local " << typeOut << "*) ldsb + lo;\n";
+ WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, x);
+ WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, z);
+ WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, x);
+ WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, z);
+ //next write to lp = ldsa+lo+1024
+ ss << INDENT2 << "lp += (1024-32*4);\n";
+ WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, y);
+ WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, w);
+ WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, y);
+ WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, w);
+ break;
+ }
+
+ ss << INDENT;
+ if (xyflag) ss<< "} ";
+ ss << "// End load B block\n\n";
+
+ ss << INDENT << "barrier(CLK_LOCAL_MEM_FENCE);\n"
+ << "\n";
+
+ if (xyflag) ss << INDENT << "if (i < DIMY && j < DIMX)\n"
+ << INDENT << "{\n";
+
+ ss << INDENT2 << "// write A block\n";
+
+ ss << INDENT2 << "goa = go + (i << 5) + j * (VSTRIDE*32) + (me >> 3)*VSTRIDE;\n"
+ << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut0 << " + goa);\n";
+
+ switch (iotype)
+ {
+ case PLANNAR_PLANNAR:
+ ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsb + lot);\n\n";
+ ss << INDENT2 << "z0 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z1 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z2 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z3 = lp4[0];\n\n";
+
+ PLANNAR_WRITE(z0, z1, z2, z3, gp);
+ break;
+ case INTERLEAVED_PLANNAR:
+ ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)((__local " << typeOut << "*)ldsb + lot);\n\n";
+ ss << INDENT2 << "z00 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z10 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z20 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z30 = lp4[0];\n\n";
+
+ PLANNAR_WRITE(z00, z10, z20, z30, gp);
+
+ ss << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut1 << " + goa);\n";
+ ss << INDENT2 << "lp4 += (256 - 32*6);\n";
+ ss << INDENT2 << "z01 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z11 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z21 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z31 = lp4[0];\n\n";
+
+ PLANNAR_WRITE(z01, z11, z21, z31, gp);
+ break;
+
+ case PLANNAR_INTERLEAVED:
+ case INTERLEAVED_INTERLEAVED:
+ if (iotype == PLANNAR_INTERLEAVED)
+ ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsb + lot*2);\n";
+ else
+ ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsb + lot);\n";
+
+ ss << INDENT2 << "z00 = lp4[0];\n"
+ << INDENT2 << "z01 = lp4[1];\n"
+ << INDENT2 << "lp4 += 32*4;\n"
+ << "\n";
+
+ ss << INDENT2 << "z10 = lp4[0];\n"
+ << INDENT2 << "z11 = lp4[1];\n"
+ << INDENT2 << "lp4 += 32*4;\n"
+ << "\n";
+
+ ss << INDENT2 << "z20 = lp4[0];\n"
+ << INDENT2 << "z21 = lp4[1];\n"
+ << INDENT2 << "lp4 += 32*4;\n"
+ << "\n";
+
+ ss << INDENT2 << "z30 = lp4[0];\n"
+ << INDENT2 << "z31 = lp4[1];\n\n";
+
+ INTERLEAVED_WRITE(z00, z01, z10, z11, z20, z21, z30, z31);
+ break;
+ }
+ ss << "\n";
+
+ ss << INDENT;
+ if (xyflag) ss << "} ";
+ ss << "// End write A block;\n\n";
+
+ if (xyflag) ss << INDENT << "if (i < DIMX && j < DIMY)\n"
+ << INDENT << "{\n";
+
+ ss << INDENT2 << "// write B block\n\n";
+ ss << INDENT2 << "gob = go + (j << 5) + i * (VSTRIDE*32) + (me >> 3)*VSTRIDE;\n"
+ << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut0 << " + gob);\n";
+
+ switch (iotype)
+ {
+ case PLANNAR_PLANNAR:
+ ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsa + lot);\n\n";
+ ss << INDENT2 << "z0 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z1 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z2 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z3 = lp4[0];\n\n";
+
+ PLANNAR_WRITE(z0, z1, z2, z3, gp);
+ break;
+ case INTERLEAVED_PLANNAR:
+ ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)((__local " << typeOut << "*)ldsa + lot);\n\n";
+ ss << INDENT2 << "z00 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z10 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z20 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z30 = lp4[0];\n\n";
+
+ PLANNAR_WRITE(z00, z10, z20, z30, gp);
+
+ ss << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut1 << " + gob);\n";
+ ss << INDENT2 << "lp4 += (256 - 32*6);\n";
+ ss << INDENT2 << "z01 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z11 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z21 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z31 = lp4[0];\n\n";
+
+ PLANNAR_WRITE(z01, z11, z21, z31, gp);
+ break;
+
+ case PLANNAR_INTERLEAVED:
+ case INTERLEAVED_INTERLEAVED:
+ if (iotype == PLANNAR_INTERLEAVED)
+ ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsa + lot*2);\n\n";
+ else
+ ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsa + lot);\n\n";
+
+ ss << INDENT2 << "z00 = lp4[0];\n"
+ << INDENT2 << "z01 = lp4[1];\n"
+ << INDENT2 << "lp4 += 32*4;\n"
+ << "\n";
+
+ ss << INDENT2 << "z10 = lp4[0];\n"
+ << INDENT2 << "z11 = lp4[1];\n"
+ << INDENT2 << "lp4 += 32*4;\n"
+ << "\n";
+
+ ss << INDENT2 << "z20 = lp4[0];\n"
+ << INDENT2 << "z21 = lp4[1];\n"
+ << INDENT2 << "lp4 += 32*4;\n"
+ << "\n";
+
+ ss << INDENT2 << "z30 = lp4[0];\n"
+ << INDENT2 << "z31 = lp4[1];\n\n";
+
+ INTERLEAVED_WRITE(z00, z01, z10, z11, z20, z21, z30, z31);
+ break;
+ }
+ ss << "\n";
+
+ ss << INDENT;
+ if(xyflag) ss << "} ";
+ ss << "// End write B block;\n\n";
+
+ if (iotype == PLANNAR_PLANNAR)
+ {
+ ss << INDENT << "// Identical handling for imaginary data\n"
+ << INDENT << "barrier(CLK_LOCAL_MEM_FENCE);\n"
+ << "\n";
+
+ if (xyflag) ss << INDENT << "if (i < DIMX && j < DIMY)\n"
+ << INDENT << "{\n";
+
+ ss << INDENT2 << "//load A block\n"
+ << INDENT2 << "goa = go + (i << 5) + j * (HSTRIDE*32) + (me >> 3)*HSTRIDE;\n"
+ << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szIn1 << " + goa);\n"
+ << "\n";
+
+ PLANNAR_READ(z0, z1, z2, z3, gp);
+
+ ss << INDENT2 << "lp = ldsa + lo;\n"
+ << "\n";
+
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, x);
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, y);
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, z);
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, w);
+
+ ss << INDENT;
+ if (xyflag) ss << "} ";
+ ss << "//end load A block\n\n";
+
+ if (xyflag) ss << INDENT << "if (i < DIMY && j < DIMX)\n"
+ << INDENT << "{\n";
+
+ ss << INDENT2 << "//load B block\n"
+ << INDENT2 << "gob = go + (j << 5) + i * (HSTRIDE*32) + (me >> 3)*HSTRIDE;\n"
+ << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szIn1 <<" + gob);\n"
+ << "\n";
+
+ PLANNAR_READ(z0, z1, z2, z3, gp);
+
+ ss << INDENT2 << "lp = ldsb + lo;\n"
+ << "\n";
+
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, x);
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, y);
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, z);
+ WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, w);
+
+ ss << INDENT;
+ if (xyflag) ss << "} ";
+ ss << "// end load B block\n\n";
+
+ ss << INDENT << "barrier(CLK_LOCAL_MEM_FENCE);\n"
+ << "\n";
+
+ if (xyflag) ss << INDENT << "if (i < DIMY && j < DIMX)\n"
+ << INDENT << "{\n";
+
+ ss << INDENT2 << "//Write A block\n"
+ << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsb + lot);\n"
+ << "\n";
+
+ ss << INDENT2 << "z0 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z1 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z2 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z3 = lp4[0];\n"
+ << "\n";
+
+ ss << INDENT2 << "goa = go + (i << 5) + j * (VSTRIDE*32) + (me >> 3)*VSTRIDE;\n"
+ << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut1 << " + goa);\n"
+ << "\n";
+
+ PLANNAR_WRITE(z0, z1, z2, z3, gp);
+
+ ss << INDENT;
+ if (xyflag) ss << "} ";
+ ss << "// end write A block\n\n";
+
+ if (xyflag) ss << INDENT << "if (i < DIMX && j < DIMY)\n"
+ << INDENT << "{\n";
+
+ ss << INDENT2 << "//write B block\n"
+ << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsa + lot);\n"
+ << "\n";
+
+ ss << INDENT2 << "z0 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z1 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z2 = lp4[0];\n"
+ << INDENT2 << "lp4 += 32*2;\n"
+ << "\n";
+
+ ss << INDENT2 << "z3 = lp4[0];\n"
+ << "\n";
+
+ ss << INDENT2 << "gob = go + (j << 5) + i * (VSTRIDE*32) + (me >> 3)*VSTRIDE;\n"
+ << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut1 << " + gob);\n";
+ PLANNAR_WRITE(z0, z1, z2, z3, gp);
+
+ ss << INDENT;
+ if (xyflag) ss << "} ";
+ ss << "// end write B block\n";
+ }
+
+ ss << "}\n\n";
+ kernel = ss.str();
+ return CLFFT_SUCCESS;
+}
+
+// Transpose specialization: fills 'params', the structure used to map plans to clPrograms, from this
+// plan's precision, placeness, layouts, lengths and strides. Only plans with exactly two strides pass
+// the checks below; OPENCL_V / ARG_CHECK presumably return early with an error status on failure.
+template<>
+clfftStatus FFTPlan::GetKernelGenKeyPvt< Transpose > (FFTKernelGenKeyParams & params) const
+{
+	// Query the envelope (original intent: size kernels by the minimum LDS); it is only NULL-checked here
+	const FFTEnvelope * pEnvelope = NULL;
+	OPENCL_V(const_cast< FFTPlan* >(this)->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
+	BUG_CHECK (NULL != pEnvelope);
+
+	// Zero the whole key first, then fill in the fields that matter for the transpose kernel
+	::memset( &params, 0, sizeof( params ) );
+	params.fft_precision = this->precision;
+	params.fft_placeness = this->placeness;
+	params.fft_inputLayout = this->inputLayout;
+
+	ARG_CHECK (this->inStride.size() == this->outStride.size())
+
+	if (CLFFT_INPLACE == this->placeness) {
+		// If this is an in-place transform the
+		// input and output layout, dimensions and strides
+		// *MUST* be the same.
+		ARG_CHECK (this->inputLayout == this->outputLayout)
+		params.fft_outputLayout = this->inputLayout;
+		for (size_t u = this->inStride.size(); u-- > 0; ) {
+			ARG_CHECK (this->inStride[u] == this->outStride[u]);
+		}
+	} else {
+		params.fft_outputLayout = this->outputLayout;
+	}
+
+	// We only support 2D transpose
+	switch (this->inStride.size()) {
+	// 2-D array is a 3-D data structure
+	// 2-D unit is a special case of 2-D array.
+	case 2:
+		ARG_CHECK(this->length .size() > 1);
+		ARG_CHECK(this->outStride.size() > 1);
+		params.fft_DataDim = 3;
+		params.fft_N[0] = this->length[0];
+		params.fft_N[1] = this->length[1];
+		params.fft_inStride[0] = this->inStride[0];
+		params.fft_inStride[1] = this->inStride[1];
+		params.fft_inStride[2] = this->iDist;
+		params.fft_outStride[0] = this->outStride[0];
+		params.fft_outStride[1] = this->outStride[1];
+		params.fft_outStride[2] = this->oDist;
+		break;
+	default:
+		ARG_CHECK (false);
+	}
+
+	// TODO: work group size setup
+	params.fft_R = 32; // divide the element into 32x32 blocks
+	params.fft_SIMD = 64; //work group size
+
+	return CLFFT_SUCCESS;
+}
+
+// Transpose specialization: appends one global and one local work size for this plan to the vectors.
+template<>
+clfftStatus FFTPlan::GetWorkSizesPvt< Transpose > (std::vector< size_t > & globalWS, std::vector< size_t > & localWS) const
+{
+	// Translate the user plan into the structure that we use to map plans to clPrograms
+	FFTKernelGenKeyParams fftParams;
+	OPENCL_V( this->GetKernelGenKeyPvt< Transpose >( fftParams ), _T("GetKernelGenKey() failed!") );
+
+	// Number of fft_R-sized blocks along each of the two lengths, rounded up; keep the larger one
+	const unsigned long long count0 = DivRoundingUp (this->length[0], fftParams.fft_R);
+	const unsigned long long count1 = DivRoundingUp (this->length[1], fftParams.fft_R);
+	unsigned long long count = (count0>count1) ? count0 : count1;
+
+	// n*(n+1)/2 groups - presumably one per block on or above the diagonal of the n x n block grid
+	count = (count * (count+1)) /2;
+	count *= fftParams.fft_SIMD;	// fft_SIMD is the work group size
+	count *= this->batchsize;
+	globalWS.push_back( static_cast< size_t >( count ) );
+	localWS.push_back( fftParams.fft_SIMD );
+	return CLFFT_SUCCESS;
+}
+
+
+// OpenCL does not take unicode strings as input, so the generated program is kept as an ASCII std::string
+// Generates the transpose program for this plan and stores it, with its entry points, in fftRepo
+template<>
+clfftStatus FFTPlan::GenerateKernelPvt< Transpose > ( FFTRepo& fftRepo ) const
+{
+	FFTKernelGenKeyParams params;
+	OPENCL_V( this->GetKernelGenKeyPvt< Transpose > (params), _T("GetKernelGenKey() failed!") );
+
+	std::string programCode;
+	OPENCL_V( GenerateTransposeKernel( params, programCode ), _T( "GenerateTransposeKernel() failed!" ) );
+
+	// The same entry point name, "fft_trans", is registered for both entry point arguments
+	OPENCL_V( fftRepo.setProgramCode( Transpose, params, programCode ), _T( "fftRepo.setclString() failed!" ) );
+	OPENCL_V( fftRepo.setProgramEntryPoints( Transpose, params, "fft_trans", "fft_trans" ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+	return CLFFT_SUCCESS;
+}
diff --git a/src/library/generator.transpose.h b/src/library/generator.transpose.h
new file mode 100644
index 00000000..b08e3d3a
--- /dev/null
+++ b/src/library/generator.transpose.h
@@ -0,0 +1,29 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+////////////////////////////////////////////
+// Copyright (C) 2011 Advanced Micro Devices, Inc. All Rights Reserved.
+////////////////////////////////////////////
+
+#pragma once
+#if !defined( AMD_CLFFT_generator_transpose_H )
+#define AMD_CLFFT_generator_transpose_H
+#include "private.h"
+#include "repo.h"
+#include "plan.h"
+
+#endif
+
diff --git a/src/library/lifetime.cpp b/src/library/lifetime.cpp
new file mode 100644
index 00000000..7548e9a9
--- /dev/null
+++ b/src/library/lifetime.cpp
@@ -0,0 +1,90 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// clfft.lifetime.cpp : Functions that control the lifetime of the FFT library and their supporting functions
+//
+
+#include "stdafx.h"
+#include "private.h"
+#include "repo.h"
+#include "../include/sharedLibrary.h"
+#include "../statTimer/statisticalTimer.extern.h"
+
+// Library entry point that lets AMD's FFT implementation allocate its internal resources.
+// sData may be NULL; when it is not, a copy is stored in the FFTRepo singleton.
+clfftStatus clfftSetup( const clfftSetupData* sData )
+{
+	// Constructing static data is not thread safe, so the first instantiation is
+	// protected by a lock. It is taken here instead of inside FFTRepo::getInstance
+	// to minimize lock overhead; it is only necessary on first creation.
+	scopedLock sLock( FFTRepo::lockRepo, _T( "FFTRepo::getInstance" ) );
+
+	// The very first call allocates the singleton; thereafter the object always exists
+	FFTRepo& fftRepo = FFTRepo::getInstance( );
+
+	// Look for the timer module and load it when it is present
+	fftRepo.timerHandle = LoadSharedLibrary( "lib", "StatTimer", true );
+
+	// Only a successfully loaded module can provide the timer factory function
+	PFGETSTATTIMER timerFactory = NULL;
+	if( fftRepo.timerHandle )
+	{
+		timerFactory = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( fftRepo.timerHandle, "getStatTimer" ) );
+	}
+
+	// Create our timer object through the factory, if the factory could be resolved
+	if( timerFactory )
+	{
+		fftRepo.pStatTimer = reinterpret_cast< GpuStatTimer* > ( timerFactory( CLFFT_GPU ) );
+	}
+
+	// Versioning checks commented out until necessary
+	//// If the major version number between the client and library do not match, return mismatch
+	//if( sData->major > clfftVersionMajor )
+	//	return CLFFT_VERSION_MISMATCH;
+	//// If the minor version number between the client and library do not match, return mismatch
+	//if( sData->minor > clfftVersionMinor )
+	//	return CLFFT_VERSION_MISMATCH;
+	//// We ignore patch version number for version validation
+
+	// A client without setupData leaves the stored setup data untouched
+	if( sData != NULL )
+	{
+		fftRepo.setupData = *sData;
+	}
+	return CLFFT_SUCCESS;
+}
+
+// Counterpart of clfftSetup: lets AMD's FFT implementation destroy its internal resources
+clfftStatus clfftTeardown( )
+{
+	// Call releaseResources() on the repo first, then free the timer library handle it holds
+	FFTRepo& repo = FFTRepo::getInstance( );
+	repo.releaseResources( );
+	FreeSharedLibrary( repo.timerHandle );
+
+	return CLFFT_SUCCESS;
+}
+
+// Reports the library version; a NULL output pointer is skipped instead of being dereferenced
+clfftStatus clfftGetVersion( cl_uint* major, cl_uint* minor, cl_uint* patch )
+{
+	if( major != NULL ) *major = clfftVersionMajor;
+	if( minor != NULL ) *minor = clfftVersionMinor;
+	if( patch != NULL ) *patch = clfftVersionPatch;
+	return CLFFT_SUCCESS;
+}
diff --git a/src/library/lock.h b/src/library/lock.h
new file mode 100644
index 00000000..49c95aca
--- /dev/null
+++ b/src/library/lock.h
@@ -0,0 +1,248 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( CLFFT_lock_H )
+#define CLFFT_lock_H
+
+#if defined( _WIN32 )
+	#include <windows.h>
+#else
+	#include <pthread.h>
+#endif
+
+#include "private.h"
+
+#if defined( _WIN32 )
+
+// lockRAII provides an abstraction for the concept of a mutex; every mutex operation is exposed
+// through a generic method.
+// On windows the mutex is a CRITICAL_SECTION, as this is the fastest intraprocess mutex available.
+// The template argument 'debugPrint' turns the trace output on; when it is false the branches
+// are constant and the compiler optimizes the print statements out
+template< bool debugPrint >
+class lockRAII
+{
+	CRITICAL_SECTION cs;
+	tstring csName;
+	tstringstream tstream;
+
+	// Work shared by every constructor: hex formatting for the trace stream, then create the section
+	void initialize( )
+	{
+		tstream << std::hex << std::showbase;
+		::InitializeCriticalSection( &cs );
+	}
+
+	// Appends the OwningThread, LockCount and RecursionCount fields of the section to the trace stream
+	void appendState( )
+	{
+		tstream << _T( "\tOwningThread( " ) << cs.OwningThread << _T( " )" ) << std::endl;
+		tstream << _T( "\tLockcount( " ) << cs.LockCount << _T( " )" ) << std::endl;
+		tstream << _T( "\tRecursionCount( " ) << cs.RecursionCount << _T( " )" ) << std::endl;
+	}
+
+	// A copy of a lock object makes no sense, which is why this constructor is private
+	lockRAII( const lockRAII& rhs ): csName( rhs.csName ) { initialize( ); }
+
+	public:
+		// Unnamed and named construction; both create a fresh critical section
+		lockRAII( ) { initialize( ); }
+
+		lockRAII( const tstring& name ): csName( name ) { initialize( ); }
+
+		// Deletes the critical section created by the constructor
+		~lockRAII( )
+		{
+			::DeleteCriticalSection( &cs );
+		}
+
+		// The name is what the trace messages print between the parentheses
+		tstring& getName( ) { return csName; }
+
+		// Replaces the name used by later trace messages
+		void setName( const tstring& name ) { csName = name; }
+
+		// Enters the critical section; traces the attempt and the resulting state when debugPrint is set
+		void enter( )
+		{
+			if( debugPrint )
+			{
+				tstream.str( _T( "" ) );
+				tstream << _T( "Attempting CRITICAL_SECTION( " ) << csName << _T( " )" ) << std::endl;
+				tout << tstream.str( );
+			}
+
+			::EnterCriticalSection( &cs );
+
+			if( debugPrint )
+			{
+				tstream.str( _T( "" ) );
+				tstream << _T( "Acquired CRITICAL_SECTION( " ) << csName << _T( " )" ) << std::endl;
+				appendState( );
+				tout << tstream.str( );
+			}
+		}
+
+		// Leaves the critical section; the state is traced before the section is left
+		void leave( )
+		{
+			if( debugPrint )
+			{
+				tstream.str( _T( "" ) );
+				tstream << _T( "Releasing CRITICAL_SECTION( " ) << csName << _T( " )" ) << std::endl;
+				appendState( );
+				tstream << std::endl;
+				tout << tstream.str( );
+			}
+
+			::LeaveCriticalSection( &cs );
+		}
+};
+
+#else
+// lockRAII provides an abstraction for the concept of a mutex; it wraps all mutex functions in generic methods
+// On non-windows platforms the mutex is a pthread mutex created with PTHREAD_MUTEX_RECURSIVE
+// The template argument 'debugPrint' activates debugging information, but if not active the compiler optimizes
+// the print statements out
+template< bool debugPrint >
+class lockRAII
+{
+	pthread_mutex_t mutex;
+	pthread_mutexattr_t mAttr;
+	tstring mutexName;
+	tstringstream tstream;
+
+	// Work shared by every constructor: hex formatting for the trace stream, then create the mutex
+	void initialize( )
+	{
+		tstream << std::hex << std::showbase;
+		pthread_mutexattr_init( &mAttr );
+		pthread_mutexattr_settype( &mAttr, PTHREAD_MUTEX_RECURSIVE );
+		pthread_mutex_init( &mutex, &mAttr );
+	}
+
+	// Does not make sense to create a copy of a lock object; private method. A copy still creates its
+	// own mutex (like the CRITICAL_SECTION variant), so the destructor never sees an uninitialized one
+	lockRAII( const lockRAII& rhs ): mutexName( rhs.mutexName )
+	{
+		initialize( );
+	}
+
+	public:
+		lockRAII( )
+		{
+			initialize( );
+		}
+
+		lockRAII( const tstring& name ): mutexName( name )
+		{
+			initialize( );
+		}
+
+		~lockRAII( )
+		{
+			pthread_mutex_destroy( &mutex );
+			pthread_mutexattr_destroy( &mAttr );
+		}
+
+		tstring& getName( )
+		{
+			return mutexName;
+		}
+
+		void setName( const tstring& name )
+		{
+			mutexName = name;
+		}
+
+		// Locks the mutex; traces the attempt and the acquisition when debugPrint is set
+		void enter( )
+		{
+			if( debugPrint )
+			{
+				tstream.str( _T( "" ) );
+				tstream << _T( "Attempting pthread_mutex_t( " ) << mutexName << _T( " )" ) << std::endl;
+				tout << tstream.str( );
+			}
+
+			::pthread_mutex_lock( &mutex );
+
+			if( debugPrint )
+			{
+				tstream.str( _T( "" ) );
+				tstream << _T( "Acquired pthread_mutex_t( " ) << mutexName << _T( " )" ) << std::endl;
+				tout << tstream.str( );
+			}
+		}
+
+		// Unlocks the mutex; traces the release first when debugPrint is set
+		void leave( )
+		{
+			if( debugPrint )
+			{
+				tstream.str( _T( "" ) );
+				tstream << _T( "Releasing pthread_mutex_t( " ) << mutexName << _T( " )" ) << std::endl;
+				tout << tstream.str( );
+			}
+
+			::pthread_mutex_unlock( &mutex );
+		}
+};
+#endif
+
+// Guard object that makes sure a lockRAII is entered and left in pairs: the constructor
+// enters the lock and the destructor leaves it.
+// The template logic traces these actions; if the template parameter is false,
+// the branch is constant and the compiler will optimize the branch out
+template< bool debugPrint >
+class scopedLock
+{
+	lockRAII< debugPrint >* sLock;
+	tstring sLockName;
+	tstringstream tstream;
+
+	// Prints "<prefix><lock name> )" followed by an empty line to tout
+	void trace( const tstring& prefix )
+	{
+		tstream.str( _T( "" ) );
+		tstream << prefix << sLockName << _T( " )" ) << std::endl << std::endl;
+		tout << tstream.str( );
+	}
+
+	public:
+		scopedLock( lockRAII< debugPrint >& lock, const tstring& name ): sLock( &lock ), sLockName( name )
+		{
+			if( debugPrint )
+				trace( _T( "Entering scopedLock( " ) );
+
+			sLock->enter( );
+		}
+
+		~scopedLock( )
+		{
+			sLock->leave( );
+			if( debugPrint )
+				trace( _T( "Left scopedLock( " ) );
+		}
+};
+
+// Convenience macros: from here on the plain names select the debugPrint == false instantiations (no debugging print statements)
+#define lockRAII lockRAII< false >
+#define scopedLock scopedLock< false >
+
+#endif // CLFFT_lock_H
diff --git a/src/library/mainpage.h b/src/library/mainpage.h
new file mode 100644
index 00000000..326ad7a1
--- /dev/null
+++ b/src/library/mainpage.h
@@ -0,0 +1,556 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*! @file mainpage.h
+
+This file contains all documentation, no code, in the form of comment text. Its purpose is to provide
+chapter 1 of the documentation we produce with doxygen. This includes the title page, installation instructions,
+and prose on the nature of FFTs and their use in our library.
+
+@mainpage OpenCL Fast Fourier Transforms (FFTs)
+
+The clFFT library is an OpenCL library implementation of discrete Fast Fourier Transforms. It:
+@li Provides a fast and accurate platform for calculating discrete FFTs.
+@li Works on CPU or GPU backends.
+@li Supports in-place or out-of-place transforms.
+@li Supports 1D, 2D, and 3D transforms with a batch size that can be greater than 1.
+@li Supports planar (real and complex components in separate arrays) and interleaved (real and complex
+components as a pair contiguous in memory) formats.
+@li Supports dimension lengths that can be any mix of powers of 2, 3, and 5.
+@li Supports single and double precision floating point formats.
+
+@section InstallFFT Installation of clFFT library
+
+@subsection DownBinaries Downloadable Binaries
+AMD provides clFFT library pre-compiled packages for recent versions of Microsoft Windows operating systems
+and several flavors of Linux.
+
+The downloadable binary packages are freely available from AMD at
+http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-math-libraries/
+
+Once the appropriate package for the respective OS has finished downloading,
+uncompress the package using the native tools available on the platform in a
+directory of the user's choice. Everything needed to build a program using
+clFFT is included in the directory tree, including documentation, header files,
+binary library components, and sample programs for programming illustration.
+
+@subsubsection CMakeDependancy CMake
+After the clFFT package is uncompressed on the user's hard drive, a samples directory exists with source code,
+but no Visual Studio project files, Unix makefiles, or other native build system exist. Instead, it contains a
+\c CMakeLists.txt file. clFFT uses CMake as its build system, and other build files, such as Visual Studio projects,
+nmake makefiles, or Unix makefiles, are generated by the CMake build system, during configuration. CMake is freely
+available for download from: http://www.cmake.org/
+
+@note CMake generates the native OS build files, so any changes made to the native build files are overwritten the
+next time CMake is run.
+
+CMake is written to pull compiler information from environment variables, and to look in default install
+directories for tools. Once installed, a popular interface to control the process of creating native build
+files is CMake-gui. When the GUI is launched, two text boxes appear at the top of the dialog: a path to
+source and a separate path to generate binaries. For the \c browse source... box, find the path to where you
+unzipped clFFT, and select the root \c samples directory that contains the CMakeLists.txt; for clFFT,
+this should be \c clFFT/samples. For \c browse \c build..., select an appropriate directory where the build
+environment generates build files; a convenient location is a sibling directory to the source. This makes
+it easy to wipe all the binaries and start a fresh build. For instance, for a debug configuration of NMake,
+an example directory could be \c clFFT/bin/NMakeDebug. This is where the generated makefile, native build
+files, and intermediate object files are built. These generated files are kept separate from the source;
+this is referred to as 'out-of-source' builds, and is very similar in concept to what 'autotools' does for Linux.
+To build using NMake, simply type NMake in the build directory containing the makefile. To build using
+Visual Studio, generate the solution and project files into a directory such as \c clFFT/bin/vs10, find the
+generated \c .sln file, and open the solution.
+
+The first time the \c configure button near the bottom of the screen is clicked, it causes CMake to prompt for
+what type of native build files to make. Various properties appear in red in the \c properties box. Red indicates
+that the value has changed since last time \c configure was clicked. (The first time configure is clicked,
+everything is red.) CMake tries to configure itself automatically to the client's system by looking at the system's
+environment variables and by searching through default install locations for project dependencies. Take a moment to
+verify the settings and paths that are displayed on the configuration screen; if any changes must be made, you can
+provide correct paths or adjust settings by typing directly into the CMake configuration screen. Click the
+\c configure button a second time to 'bake' those settings and serialize them to disk.
+
+Options relevant to the clFFT project include:
+
+@li \c 'AMDAPPSDKROOT': Location of the Stream SDK installation. This value is already populated if CMake
+could determine the location by looking at the environment variables. If not, the user must provide a path to
+the root installation of the Stream SDK here.
+
+@li \c 'BOOST_ROOT': Location of the Boost SDK installation. This value is already populated if CMake could
+determine the location by looking at the environment variables or default install locations. If not, the user must
+provide a path to the root installation of Boost here. This dependency is only relevant to the sample
+client; the FFT library does not depend on Boost.
+
+@li \c 'CMAKE_BUILD_TYPE': Defines the build type (default is debug). For Visual Studio projects, this does
+not appear (modifiable in IDE); for makefile-based builds, this is set in CMake.
+
+@li \c 'CMAKE_INSTALL_PREFIX': The path to install all binaries and headers generated from the build. This is
+used when the user types \c make \c install or builds the INSTALL project in Visual Studio. All generated binaries and
+headers are copied into the path prefixed with \c CMAKE_INSTALL_PREFIX.
+
+The Visual Studio projects are self explanatory, but a few other projects are autogenerated; these might be unfamiliar.
+
+@li \c 'ALL_BUILD': A project that is empty of files, but since it depends on all user projects, it provides a
+convenient way to rebuild everything.
+
+@li \c 'ZERO_CHECK': A CMake-specific project that checks to see if the generated solution and project files are in sync
+with the \c CMakeLists.txt file. If these files are modified, the solutions and projects are now out-of-sync, and this
+project prompts the user to regenerate their environment.
+
+@note If the user chooses to build on Windows with a NMake based build, it is important to launch CMake from within a
+Visual Studio Command Prompt (20xx). This is because CMake must be able to parse environment variables to properly
+initialize NMake. This is not necessary if a Visual Studio solution is generated, because solution files contain their
+own environmental setup.
+
+@subsubsection BoostDependancy Boost
+clFFT includes one sample project that has source dependencies on Boost: the sample client project. Boost is
+freely available from: http://www.boost.org/.
+
+The command-line clFFT sample client links with the \c program_options library, which provides functionality for
+parsing command-line parameters and \c .ini files in a cross-platform manner. Once Boost is downloaded and
+extracted on the hard drive, the \c program_options library must be compiled. The Boost build system
+uses the BJam builder (a project for a CMake-based Boost build is available for separate download). This is
+available for download from the Boost website, or the user can build BJam; Boost includes the source to BJam
+in its distribution, and the user can execute \c bootstrap.bat (located in the root boost directory) to build it.
+
+After BJam is either built or installed, an example BJam command-line is given below for building a 64-bit
+\c program_options binary, for both static and dynamic linking:
+@code
+bjam --with-program_options address-model=64 link=static,shared stage
+@endcode
+
+The last step to make boost readily available and usable by CMake and the native compiler is to add an environment
+variable to the system called \c BOOST_ROOT. In Windows, right click on the computer icon and go to
+@code
+'Properties|Advanced system settings|Advanced|Environment Variables...'
+@endcode
+Remember to relaunch any new processes that are open, in order to inherit the new environment variable. On Linux,
+consider modifying the \c .bashrc file (or shell equivalent) to export a new environment variable every time you log in.
+
+If you are on a Linux system and have used a package manager to install Boost, you may have to confirm where the Boost
+\c include and \c library files have been placed. For example, after installing Boost with the Ubuntu Synaptic Package
+Manager, the Boost \c include files are in \c /usr/include/boost, and the library files are in either \c /usr/lib or \c /usr/lib64.
+The \c CMakeLists.txt file in this project defaults the \c BOOST_ROOT value to \c /usr on Linux; so, if the system is set up
+similarly, no further action is necessary. If the system is set up differently, you may have to set the \c BOOST_ROOT
+environmental variable accordingly.
+
+@note Note that CMake does not recognize version numbers at the end of the library filename; so, if the package
+manager only created a \c libboost_module_name.so.x.xx.x file (where x.xx.x is the version of Boost),
+the user may need to manually create a soft link called \c libboost_module_name.so to the versioned
+\c libboost_module_name.so.x.xx.x. See the clFFT binary artifacts in the install directory for an example.
+
+@section IntroFFT Introduction to clFFT
+
+The FFT is an implementation of the Discrete Fourier Transform (DFT) that makes use of symmetries in the DFT
+definition to reduce the mathematical intensity required from O(\f$N^2\f$) to O(\f$ N \log N\f$) when the
+sequence length, \c N, is the product of small prime factors. Currently, there is no standard API for FFT
+routines. Hardware vendors usually provide a set of high-performance FFTs optimized for their systems:
+no two vendors employ the same interfaces for their FFT routines. clFFT provides a set of FFT routines that
+are optimized for AMD graphics processors, and that are also functional across CPU and other compute devices.
+
+@subsection SupportRadix Supported Radices
+clFFT supports sizes whose only prime factors are 2, 3, and 5. This means that the vector lengths that can be
+configured through a plan can be any product of powers of two, three, and five; examples include \f$2^7, 2^1*3^1, 3^2*5^4, 2^2*3^3*5^5\f$,
+up to the limit that the device can support.
+
+@subsection SizeLimit Transform Size Limits
+Currently, there is an upper bound on the transform size the library supports. This
+limit is \f$2^{24}\f$ for single precision and \f$2^{22}\f$ for double precision. This means that the
+product of transform lengths must not exceed these values. As an example, a
+1D single-precision FFT of size 1024 is valid since 1024 \f$<= 2^{24}\f$. Similarly, a 2D
+double-precision FFT of size 1024x1024 is also valid, since 1024*1024 \f$<= 2^{22}\f$.
+But, a 2D single-precision FFT of size 4096x8192 is not valid because
+4096*8192 \f$> 2^{24}\f$.
+
+@subsection EnumDim Dimensionality
+clFFT currently supports FFTs of up to three dimensions, given by the enum \c clfftDim. This enum
+is a required parameter into \c clfftCreateDefaultPlan() to create an initial plan; there is no default for
+this parameter. Depending on the dimensionality that the client requests, clFFT uses the formulations
+shown below to compute the DFT.
+
+The definition of a 1D complex DFT used by clFFT is given by:
+\f[
+{\tilde{x}}_j = {{1}\over{scale}}\sum_{k=0}^{n-1}x_k\exp\left({\pm i}{{2\pi jk}\over{n}}\right)\hbox{ for } j=0,1,\ldots,n-1
+\f]
+where \f$x_k\f$ are the complex data to be transformed, \f$\tilde{x}_j\f$ are the transformed data, and the sign
+of \f$\pm\f$ determines the direction of the transform: \f$-\f$ for forward and \f$+\f$ for backward. Note that
+the user must provide the scaling factor. Typically, the scale is set to 1 for forward transforms, and
+\f${{1}\over{N}}\f$ for backwards transforms.
+
+The definition of a complex 2D DFT used by clFFT is given by:
+\f[
+{\tilde{x}}_{jk} = {{1}\over{scale}}\sum_{q=0}^{m-1}\sum_{r=0}^{n-1}x_{rq}\exp\left({\pm i} {{2\pi jr}\over{n}}\right)\exp\left({\pm i}{{2\pi kq}\over{m}}\right)
+\f]
+for \f$j=0,1,\ldots,n-1\hbox{ and } k=0,1,\ldots,m-1\f$, where \f$x_{rq}\f$ are the complex data to be transformed,
+\f$\tilde{x}_{jk}\f$ are the transformed data, and the sign of \f$\pm\f$ determines the direction of the
+transform. Typically, the scale is set to 1 for forwards transforms and \f${{1}\over{M \cdot N}}\f$ for backwards transforms.
+
+The definition of a complex 3D DFT used by clFFT is given by:
+\f[
+\tilde{x}_{jkl} = {{1}\over{scale}}\sum_{s=0}^{p-1}\sum_{q=0}^{m-1}\sum_{r=0}^{n-1}
+x_{rqs}\exp\left({\pm i} {{2\pi jr}\over{n}}\right)\exp\left({\pm i}{{2\pi kq}\over{m}}\right)\exp\left({\pm i}{{2\pi ls}\over{p}}\right)
+\f]
+for \f$j=0,1,\ldots,n-1\hbox{ and } k=0,1,\ldots,m-1\hbox{ and } l=0,1,\ldots,p-1\f$, where \f$x_{rqs}\f$ are the complex data
+to be transformed, \f$\tilde{x}_{jkl}\f$ are the transformed data, and the sign of \f$\pm\f$ determines the direction of the
+transform. Typically, the scale is set to 1 for forwards transforms and \f${{1}\over{M \cdot N \cdot P}}\f$ for backwards transforms.
+
+@subsection InitLibrary Setup and Teardown of clFFT
+clFFT is initialized by a call to \c clfftSetup(), which must be called before any other API exported
+from clFFT. This allows the library to create resources used to manage the plans that are created and
+destroyed by the user. This API also takes a structure \c clfftInitSetupData that is initialized by the
+client to control the behavior of the library. The corresponding \c clfftTeardown() method must be called
+by the client when it is done using the library. This instructs clFFT to release all resources, including
+any acquired references to any OpenCL objects that may have been allocated or passed to it through the
+API.
+
+@subsection ThreadSafety Thread safety
+The clFFT API is designed to be thread-safe. It is safe to create plans from multiple threads, and to
+destroy those plans in separate threads. Multiple threads can call \c clfftEnqueueTransform() to place work
+into a command queue at the same time. clFFT does not provide a single-threaded version of the library.
+It is expected that the overhead of the synchronization mechanisms that make clFFT thread-safe is minor.
+
+Currently, multi-device operation must be managed by the user. OpenCL contexts can be created that are
+associated with multiple devices, but clFFT only uses a single device from that context to transform
+the data. Multi-device operation can be managed by the user by creating multiple contexts, where each
+context contains a different device, and the user is responsible for scheduling and partitioning the work
+across multiple devices and contexts.
+
+@subsection MajorFormat Row Major formats
+clFFT expects all multi-dimensional input passed to it to be in row-major format. This is compatible
+with C-based languages. However, clFFT is very flexible in the input and output data organization it
+accepts by allowing the user to specify a stride for each dimension. This feature can be used to process
+data in column major arrays, and other non-contiguous data formats. See \ref clfftSetPlanInStride and
+\ref clfftSetPlanOutStride.
+
+@subsection Object OpenCL object creation
+OpenCL objects, such as contexts, \c cl_mem buffers, and command queues, are the responsibility of the
+user application to allocate and manage. All of the clFFT interfaces that must interact with OpenCL
+objects take those objects as references through the API. Specifically, the plan creation function
+@ref clfftCreateDefaultPlan() takes an OpenCL context as a parameter reference, increments the reference
+count on that object, and keeps the object alive until the corresponding plan has been destroyed through
+a call to @ref clfftDestroyPlan().
+
+@subsection FlushQueue Flushing of command queues
+The clFFT API operates asynchronously, and with the exception of thread safety locking with multiple
+threads, all APIs return immediately. Specifically, the @ref clfftEnqueueTransform() API does not
+explicitly flush the command queues that are passed by reference to it; it pushes the transform work onto the
+command queues and returns the modified queues to the client. The client is free to issue its own blocking
+logic, using OpenCL synchronization mechanisms, or push further work onto the queue to continue processing.
+
+@section clFFTPlans clFFT Plans
+
+A plan is the collection of (almost) all of the parameters needed to specify an FFT computation.
+This includes:
+
+- What OpenCL context executes the transform?
+
+- Is this a 1D, 2D or 3D transform?
+
+- What are the lengths or extents of the data in each dimension?
+
+- How many datasets are being transformed?
+
+- What is the data precision?
+
+- Should a scaling factor be applied to the transformed data?
+
+- Does the output transformed data replace the original input data in the same buffer (or
+  buffers), or is the output data written to a different buffer (or buffers)?
+
+- How is the input data stored in its data buffers?
+
+- How is the output data stored in its data buffers?
+
+
+The plan does not include:
+
+- The OpenCL handles to the input and output data buffers.
+
+- The OpenCL handle to a temporary scratch buffer (if needed).
+
+- Whether to execute a forward or reverse transform.
+
+These are specified when the plan is executed.
+
+@subsection Default Default Plan Values
+
+When a new plan is created by calling @ref clfftCreateDefaultPlan, its parameters are initialized as
+follows:
+
+
+- Dimensions: as provided by the caller.
+
+- Lengths: as provided by the caller.
+
+- Batch size: 1.
+
+- Precision: \c CLFFT_SINGLE.
+
+- Scaling factors:
+
+  - For the forward transform, the default is 1.0, or no scale factor is applied.
+
+  - For the reverse transform, the default is 1.0 / P, where P is the product of the FFT lengths.
+
+- Location: \c CLFFT_INPLACE.
+
+- Input layout: \c CLFFT_COMPLEX_INTERLEAVED.
+
+- Input strides: the strides of a multidimensional array of the lengths specified, where the data is
+  compactly stored using the row-major convention.
+
+- Output layout: \c CLFFT_COMPLEX_INTERLEAVED.
+
+- Output strides: same as input strides.
+
+
+Writing client programs that depend on these initial values is not recommended.
+
+@subsection EnumLayout Supported Memory Layouts
+There are two main families of Discrete Fourier Transform (DFT):
+
+- Routines for the transformation of complex data. clFFT supports two layouts to store complex numbers:
+  a 'planar' format, where the real and imaginary components are kept in separate arrays:
+
+  - Buffer1: \c RRRRR
+
+  - Buffer2: \c IIIII
+
+  and an interleaved format, where the real and imaginary components are stored as contiguous pairs:
+
+  - Buffer1: \c RIRIRIRIRIRI
+
+- Routines for the transformation of real to complex data and vice versa; clFFT provides enums to define
+  these formats. For transforms involving real data, there are two possibilities:
+
+  - Real data being subject to forward FFT transform that results in complex
+    data.
+
+  - Complex data being subject to backward FFT transform that results in
+    real data. See the Section "FFTs of Real Data".
+
+
+
+@subsubsection DistanceStridesandPitches Strides and Distances
+For one-dimensional data, if clStrides[0] = strideX = 1, successive elements in the first dimension are stored contiguously
+in memory. If strideX is an integral value greater than 1, gaps in memory exist between each element of
+the vectors.
+
+For multi-dimensional data, if clStrides[1] = strideY = LenX for 2 dimensional data and clStrides[2] = strideZ
+= LenX*LenY for 3 dimensional data, no gaps exist in memory between each element, and all vectors are
+stored tightly packed in memory. Here, LenX, LenY, and LenZ denote the transform lengths clLengths[0],
+clLengths[1], and clLengths[2], respectively, which are used to set up the plan.
+
+By specifying non-default strides, it is possible to process either
+row-major or column-major arrays. Data can be extracted from arrays of structures. Almost any regular
+data storage pattern can be accommodated.
+
+Distance is the amount of memory that exists between corresponding elements
+in an FFT primitive in a batch. Distance is measured in the units of the FFT
+primitive; complex data measures in complex units, and real data measures in
+real units. Stride between tightly packed elements is 1 in either case. Typically,
+one can measure the distance between any two elements in a batch primitive,
+be it 1D, 2D, or 3D data. For tightly packed data, the distance between FFT
+primitives is the size of the FFT primitive, such that dist=LenX for 1D data,
+dist=LenX*LenY for 2D data, and dist=LenX*LenY*LenZ for 3D data. It is
+possible to set the distance of a plan to be less than the size of the FFT vector;
+most often 1 for this case. When computing a batch of 1D FFT vectors, if
+distance == 1, and strideX == length( vector ), a transposed output is produced
+for a batch of 1D vectors. It is left to the user to verify that the distance and
+strides are valid (not intersecting); if not valid, undefined results can occur.
+
+A simple example is to perform a 1D length 4096 transform on each row of an array of 1024 rows x 4096 columns of
+values stored in a column-major array, such as a FORTRAN program might provide. (This would be equivalent
+to a C or C++ program that had an array of 4096 rows x 1024 columns stored in a row-major manner, and
+you wanted to perform a 1-D length 4096 transform on each column.) In this case, specify the strides
+[1024, 1].
+
+For a more complex example, suppose an input buffer contains a raster grid of 1024 x 1024 monochrome pixel
+values, and you want to compute a 2D FFT for each 64 x 64 subtile of the grid. Specifying strides
+allows you to treat each horizontal band of 1024 x 64 pixels as an array of 16 64 x 64 matrices,
+and process an entire band with a single call to @ref clfftEnqueueTransform. (Specifying strides is not
+quite flexible enough to transform the entire grid of this example with a single kernel execution.)
+It is possible to create a Plan to compute arrays of 64 x 64 2D FFTs, then specify three strides:
+[1, 1024, 64]. The first stride, 1, indicates that the rows of each matrix are stored consecutively;
+the second stride, 1024, gives the distance between rows, and the third stride, 64, defines the
+distance from matrix to matrix. Then call @ref clfftEnqueueTransform 16 times: once for each
+horizontal band of pixels.
+
+@subsection EnumPrecision Supported Precisions in clFFT
+Both \c CLFFT_SINGLE and \c CLFFT_DOUBLE precisions are supported by the library
+for all supported radices. With both of these enums the host computer's math
+functions are used to produce tables of sines and cosines for use by the OpenCL
+kernel.
+
+Both \c CLFFT_SINGLE_FAST and \c CLFFT_DOUBLE_FAST are meant to generate faster
+kernels with reduced accuracy, but are disabled in the current build.
+
+See @ref clfftPrecision, @ref clfftSetPlanPrecision, and @ref clfftGetPlanPrecision.
+
+@subsection FftDirection clfftDirection
+The direction of the transform is not baked into the plan; the same plan can be used to specify both forward
+and backward transforms. Instead, @ref clfftDirection is passed as a parameter into @ref clfftEnqueueTransform.
+
+@subsection EnumResultLocation In-Place and Out-of-Place
+The clFFT API supports both in-place and out-of-place transforms. With in-place
+transforms, only input buffers are provided to the @ref clfftEnqueueTransform() API,
+and the resulting data is written in the same buffers, overwriting the input data.
+With out-of-place transforms, distinct output buffers are provided to the
+@ref clfftEnqueueTransform() API, and the input data is preserved.
+In-place transforms require that the \c cl_mem objects the client
+creates have both \c read and \c write permissions. This is given in the nature of the
+in-place algorithm. Out-of-place transforms require that the destination buffers
+have \c read and \c write permissions, but input buffers can still be created with
+read-only permissions. This is a clFFT requirement because internally the
+algorithms may go back and forth between the destination buffers and internally
+allocated temp buffers. For out-of-place transforms, clFFT never writes back
+to the input buffers.
+
+@subsection clFFTEff Batches
+The efficiency of clFFT is improved by utilizing transforms in batches. Sending
+as much data as possible in a single transform call leverages the parallel
+compute capabilities of OpenCL devices (and GPU devices in particular), and
+minimizes the penalty of transfer overhead. It's best to think of an OpenCL device
+as a high-throughput, high-latency device. Using a networking analogy as an
+example, it's similar to having a massively high-bandwidth pipe with very high
+ping response times. If the client is ready to send data to the device for compute,
+it should be sent in as few API calls as possible. This can be done by batching.
+clFFT plans have a parameter to describe the number of transforms being
+batched: @ref clfftSetPlanBatchSize(), and to describe how those batches are
+laid out and spaced in memory: @ref clfftSetPlanDistance(). 1D, 2D, or 3D
+transforms can be batched.
+
+@section Outline Using clFFT on a Client Application
+
+To perform FFT calculations using clFFT, the client program must:
+
+ - Initialize the library by calling @ref clfftSetup.
+ - For each distinct type of FFT needed:
+
+ - Create an FFT Plan object. This usually is done by calling the factory function @ref clfftCreateDefaultPlan.
+ Some of the most fundamental parameters are specified at this time, and others assume default values. The OpenCL
+ context must be provided when the plan is created; it cannot be changed. Another way is to call @ref clfftCopyPlan.
+ In either case, the function returns an opaque handle to the Plan object.
+ - Complete the specification of all of the Plan parameters by calling the various parameter-setting functions,
+ \c clfftSet_____.
+ - Optionally, "bake" or finalize the plan, calling @ref clfftBakePlan. This signals to the library the end
+ of the specification phase, and causes it to generate and compile the exact OpenCL kernels needed to perform the
+ specified FFT on the OpenCL device provided.
+
+ At this point, all performance-enhancing optimizations are applied, possibly including executing benchmark kernels
+ on the OpenCL device context in order to maximize runtime performance.
+
+ Although this step is optional, most users probably want to include it so that they can control when this work is
+ done. Usually, this time consuming step is done when the application is initialized. If the user does not call
+ @ref clfftBakePlan, this work is done during the first call to @ref clfftEnqueueTransform.
+
+
+
+ - The OpenCL FFT kernels now are ready to execute as many times as needed.
+
+ - Call @ref clfftEnqueueTransform. At this point, specify whether you want to execute a forward or reverse
+ transform; also, provide the OpenCL \c cl_mem handles for the input buffer(s), output buffer(s)--unless you want
+ the transformed data to overwrite the input buffers, and (optionally) scratch buffer.
+
+ @ref clfftEnqueueTransform performs one or more calls to the OpenCL function clEnqueueNDRangeKernel.
+ Like clEnqueueNDRangeKernel, @ref clfftEnqueueTransform is a non-blocking call. The commands to
+ execute the FFT compute kernel(s) are added to the OpenCL context queue to be executed asynchronously.
+ An OpenCL event handle is returned to the caller. If multiple NDRangeKernel operations are queued,
+ the final event handle is returned.
+
+ - The application now can add additional OpenCL tasks to the OpenCL context's queue. For example, if the
+ next step in the application's process is to apply a filter to the transformed data, the application would generate
+ that clEnqueueNDRangeKernel, specifying the transform's output buffer(s) as the input to the filter kernel,
+ and providing the transform's event handle to ensure proper synchronization.
+ - If the application must access the transformed data directly, it must call one of the OpenCL functions
+ for synchronizing the host computer's execution with the OpenCL device (for example: clFinish()).
+
+ - Terminate the library by calling @ref clfftTeardown.
+
+
+@section RealFFT FFTs of Real Data
+
+When real data is subject to DFT transformation, the resulting complex output
+follows a special property. About half of the output is redundant because they are
+complex conjugates of the other half. This is called the Hermitian redundancy.
+So, for space and performance considerations, it is only necessary to store the
+non-redundant part of the data. Most FFT libraries use this property to offer
+specific storage layouts for FFTs involving real data. clFFT provides 3
+enumerated types to deal with real data FFTs:
+
+
+ - \c CLFFT_REAL
+
+ - \c CLFFT_HERMITIAN_INTERLEAVED
+
+ - \c CLFFT_HERMITIAN_PLANAR
+
+
+The first enum specifies that the data is purely real. This can be used to feed
+real input or get back real output. The second and third enums specify layouts
+for storing FFT output. They are similar to the corresponding full complex enums
+in the way they store real and imaginary components. The difference is that they
+store only about half of the complex output. Client applications can do just a
+forward transform and analyze the output. Or they can do some processing of
+the output and do a backward transform to get back real data. This is illustrated
+in the following figure.
+
+@image html realfft_fwdinv.jpg "Forward and Backward Transform Processes"
+
+Let us consider a 1D real FFT of length N. The full output looks as shown in
+the following figure.
+
+@image html realfft_1dlen.jpg "1D Real FFT of Length N"
+
+Here, C* denotes the complex conjugate of C. Since the values at indices greater
+than N/2 can be deduced from the first half of the array, clFFT stores data
+only up to the index N/2. This means that the output contains only 1 + N/2
+complex elements, where the division N/2 is rounded down. Examples for even
+and odd lengths are given below.
+
+Example for N = 8 is shown in the following figure.
+
+@image html realfft_ex_n8.jpg "Example for N = 8"
+
+Example for N = 7 is shown in the following figure.
+
+@image html realfft_ex_n7.jpg "Example for N = 7"
+
+
+For length 8, only (1 + 8/2) = 5 of the output complex numbers are stored, with
+the index ranging from 0 through 4. Similarly for length 7, only (1 + 7/2) = 4 of
+the output complex numbers are stored, with the index ranging from 0 through 3.
+
+For 2D and 3D FFTs, the FFT length along the least dimension is used to
+compute the (1 + N/2) value. This is because the FFT along the least dimension
+is what is computed first and is logically a real-to-hermitian transform. The FFTs
+along other dimensions are computed afterwards; they are simply 'complex-to-complex'
+transforms. For example, assuming clLengths[2] is used to set up a 2D
+real FFT, let N1 = clLengths[1], and N0 = clLengths[0]. The output FFT has
+N1*(1 + N0/2) complex elements. Similarly, for a 3D FFT with clLengths[3] and
+N2 = clLengths[2], N1 = clLengths[1], and N0 = clLengths[0], the output has
+N2*N1*(1 + N0/2) complex elements.
+
+@subsection RealModes Supported Modes
+
+Out-of-place transforms:
+
+
+ - \c CLFFT_REAL to \c CLFFT_HERMITIAN_INTERLEAVED
+
+ - \c CLFFT_REAL to \c CLFFT_HERMITIAN_PLANAR
+
+ - \c CLFFT_HERMITIAN_INTERLEAVED to \c CLFFT_REAL
+
+ - \c CLFFT_HERMITIAN_PLANAR to \c CLFFT_REAL
+
+
+In-place transforms:
+
+
+ - \c CLFFT_REAL to \c CLFFT_HERMITIAN_INTERLEAVED
+
+ - \c CLFFT_HERMITIAN_INTERLEAVED to \c CLFFT_REAL
+
+
+
+@subsection RealExamples Examples
+
+The following pages provide figures and examples to explain in detail the real
+FFT features of this library.
+
+@image html realfft_expl_01.jpg "1D FFT - Real to Hermitian"
+@image html realfft_expl_02.jpg "1D FFT - Real to Hermitian, Example 1"
+@image html realfft_expl_03.jpg "1D FFT - Real to Hermitian, Example 2"
+@image html realfft_expl_04.jpg "1D FFT - Real to Hermitian, Example 3"
+@image html realfft_expl_05.jpg "1D FFT - Hermitian to Real"
+@image html realfft_expl_06.jpg "1D FFT - Hermitian to Real, Example"
+@image html realfft_expl_07.jpg "2D FFT - Real to Hermitian In Place"
+@image html realfft_expl_08.jpg "2D FFT - Real to Hermitian, Example"
+
+ */
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
new file mode 100644
index 00000000..ec87b2d4
--- /dev/null
+++ b/src/library/plan.cpp
@@ -0,0 +1,3302 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+////////////////////////////////////////////
+
+// clfft.plan.cpp : Implements creation of default FFT plans, kernel compilation and plan baking.
+//
+
+#include "stdafx.h"
+#include
+#include "private.h"
+#include "repo.h"
+#include "plan.h"
+#include "generator.stockham.h"
+#include "../include/convenienceFunctions.h"
+
+using std::vector;
+
+// Marker strings; judging by their names they delimit sections of a serialized plan file.
+// NOTE(review): no use of these markers is visible in this part of the file -- confirm against the code that
+// reads/writes them. The 'Ł' characters may be an encoding artifact, but the literals are runtime data and
+// must stay byte-identical to whatever already-written files contain.
+const std::string beginning_of_binary( "<[Ł_beginning_of_binary_Ł]>" );
+const std::string end_of_binary( "<[Ł_I_may_be_a_sorry_case,_but_I_don't_write_jokes_in_base_13_Ł]>" );
+const std::string end_of_file( "<[Ł_You're_off_the_edge_of_the_map,_mate._Here_there_be_monsters_Ł]>" );
+
+// Ordering predicate that lets FFTKernelGenKeyParams structs act as keys inside of a std::map.
+// Two keys are ordered by a raw byte-wise memcmp over the whole struct.
+// NOTE(review): a byte-wise compare also looks at any padding bytes; presumably the keys are
+// zero-filled before they are populated -- confirm where the struct is filled in.
+bool operator<( const FFTKernelGenKeyParams& lhs, const FFTKernelGenKeyParams& rhs)
+{
+    return ::memcmp( &lhs, &rhs, sizeof( FFTKernelGenKeyParams ) ) < 0;
+}
+
+// Create a plan populated with default settings for a 1D, 2D or 3D transform.
+//
+// plHandle   [out] receives the handle of the newly created plan
+// context    OpenCL context the plan is bound to; the plan retains it and caches its device list
+// dim        CLFFT_1D, CLFFT_2D or CLFFT_3D; any other value yields CLFFT_NOTIMPLEMENTED
+// clLengths  array holding one length per dimension in use (X first, then Y, then Z)
+//
+// Returns CLFFT_INVALID_HOST_PTR for a NULL pointer argument, CLFFT_INVALID_ARG_VALUE for a zero length,
+// CLFFT_NOTIMPLEMENTED for an unsupported length or dimension and CLFFT_SUCCESS when the plan is set up.
+// Failures of repo and OpenCL calls are handled by the OPENCL_V macro.
+//
+// Defaults: in-place, single precision, complex-interleaved input and output, batch size 1, Stockham
+// generator, forward scale 1, backward scale 1/(product of lengths), packed strides and distances.
+clfftStatus clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context context, const clfftDim dim,
+    const size_t* clLengths )
+{
+    if( clLengths == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+
+    // plHandle is dereferenced further down to name the plan lock, so it has to be valid as well
+    if( plHandle == NULL )
+        return CLFFT_INVALID_HOST_PTR;
+
+    size_t numDims = 0;
+    switch( dim )
+    {
+    case CLFFT_1D: numDims = 1; break;
+    case CLFFT_2D: numDims = 2; break;
+    case CLFFT_3D: numDims = 3; break;
+    default:
+        return CLFFT_NOTIMPLEMENTED;
+    }
+
+    // Lengths are gathered in X, Y, Z order; dimensions that are not in use keep a length of 1
+    const size_t axis[ 3 ] = { DimX, DimY, DimZ };
+    size_t lens[ 3 ] = { 1, 1, 1 };
+
+    // Minimum length size is 1. Every dimension in use is checked for zero before any support check,
+    // so a zero length always wins over an unsupported one.
+    for( size_t i = 0; i < numDims; ++i )
+    {
+        if( clLengths[ axis[ i ] ] == 0 )
+            return CLFFT_INVALID_ARG_VALUE;
+    }
+
+    for( size_t i = 0; i < numDims; ++i )
+    {
+        if( !IsASupportedLength( clLengths[ axis[ i ] ] ) )
+            return CLFFT_NOTIMPLEMENTED;
+
+        lens[ i ] = clLengths[ axis[ i ] ];
+    }
+
+    FFTPlan* fftPlan = NULL;
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+    OPENCL_V( fftRepo.createPlan( plHandle, fftPlan ), _T( "fftRepo.createPlan failed" ) );
+
+    fftPlan->baked = false;
+    fftPlan->dim = dim;
+    fftPlan->placeness = CLFFT_INPLACE;
+    fftPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+    fftPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+    fftPlan->precision = CLFFT_SINGLE;
+    fftPlan->context = context;
+    fftPlan->forwardScale = 1.0;
+    // Multiply as doubles so that the product cannot wrap around in size_t arithmetic
+    fftPlan->backwardScale = 1.0 / ( static_cast< double >( lens[ 0 ] ) *
+                                     static_cast< double >( lens[ 1 ] ) *
+                                     static_cast< double >( lens[ 2 ] ) );
+    fftPlan->batchsize = 1;
+
+    fftPlan->gen = Stockham; //default setting
+
+    OPENCL_V(fftPlan->SetEnvelope(), _T("SetEnvelope failed"));
+
+    // The plan keeps its own reference to the context; a failed retain is reported instead of ignored
+    OPENCL_V( clRetainContext( fftPlan->context ), _T( "clRetainContext failed" ) );
+
+    /////////////////////////////////////////////////////////////////
+    // Detect OpenCL devices
+    /////////////////////////////////////////////////////////////////
+    // First, get the size of device list data
+    size_t deviceListSize;
+    OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize ),
+        "Getting device array size ( ::clGetContextInfo() )" );
+
+    // Allocate memory for the devices
+    fftPlan->devices.resize( deviceListSize / sizeof( cl_device_id ) );
+
+    /* Now, get the device list data */
+    OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, &fftPlan->devices[ 0 ], NULL ),
+        "Getting device array ( ::clGetContextInfo() )" );
+
+    // Need to devise a way to generate better names
+    tstringstream tstream;
+    tstream << _T( "plan_" ) << *plHandle;
+
+    lockRAII* planLock = NULL;
+    OPENCL_V( fftRepo.getPlan( *plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+    planLock->setName( tstream.str( ) );
+
+    // Set the lengths and default strides/pitches depending on the dim that the user passes to us.
+    // The data is assumed packed: the stride of a dimension is the product of all lower lengths,
+    // and the distance between batches is the product of all lengths.
+    size_t packedStride = 1;
+    for( size_t i = 0; i < numDims; ++i )
+    {
+        fftPlan->length.push_back( lens[ i ] );
+        fftPlan->inStride.push_back( packedStride );
+        fftPlan->outStride.push_back( packedStride );
+        packedStride *= lens[ i ];
+    }
+
+    fftPlan->iDist = packedStride;
+    fftPlan->oDist = packedStride;
+
+    return CLFFT_SUCCESS;
+}
+
+// Write the kernel source that this plan uses out to a file.
+// The (relative) file name is "clfft.kernel." + generator name + plan handle + ".cl".
+//
+// plHandle   plan handle, used only as part of the file name
+// gen        generator that produced the source; selects the name component and the repo lookup
+// fftParams  key under which the source is stored in the repo
+//
+// Returns CLFFT_FILE_CREATE_FAILURE when the file cannot be opened and CLFFT_SUCCESS once the source
+// has been streamed out; a failing fftRepo.getProgramCode() is handled by the OPENCL_V macro.
+clfftStatus WriteKernel( const clfftPlanHandle plHandle, const clfftGenerators gen, const FFTKernelGenKeyParams& fftParams )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+
+    // Fetch the source first, so that a failed lookup does not leave an empty file behind
+    std::string kernel;
+    OPENCL_V( fftRepo.getProgramCode( gen, fftParams, kernel ), _T( "fftRepo.getProgramCode failed." ) );
+
+    // Logic to define a sensible filename
+    const std::string kernelPrefix( "clfft.kernel." );
+    std::string generatorName;
+
+    switch( gen )
+    {
+    case Stockham: generatorName = "Stockham"; break;
+    case Transpose: generatorName = "Transpose"; break;
+    // Copy kernels are compiled (and dumped) as well, so they need a name component too
+    case Copy: generatorName = "Copy"; break;
+    }
+
+    std::stringstream kernelPath;
+    kernelPath << kernelPrefix << generatorName << plHandle << ".cl";
+
+    // Logic to write string contents out to file
+    tofstreamRAII< std::ofstream, std::string > kernelFile( kernelPath.str( ) );
+    if( !kernelFile.get( ) )
+    {
+        std::cerr << "Failed to open kernel file for writing: " << kernelPath.str( ) << std::endl;
+        return CLFFT_FILE_CREATE_FAILURE;
+    }
+
+    kernelFile.get( ) << kernel << std::endl;
+
+    return CLFFT_SUCCESS;
+}
+
+// **************** TODO TODO TODO ***********************
+// Making CompileKernels function take in command queue parameter so we can build for 1 particular device only;
+// this may not be desirable for persistent plans, where we may have to compile for all devices in the context;
+// make changes appropriately before enabling persistent plans and then remove this comment
+
+// Compile the kernels that this plan uses, and store the program and kernels in the repo.
+//
+// commQueueFFT  command queue; the program is built only for the device this queue belongs to
+// plHandle      plan handle, used only to name the dump file when CLFFT_DUMP_PROGRAMS is set
+// gen           generator that produced the kernel source
+// fftPlan       plan supplying the kernel key and the OpenCL context
+//
+// Nothing is compiled when the repo already holds a program for this generator/key pair.
+// On a build failure the build log (when available) is printed to std::cerr, the program object is
+// released and the build status is handed to OPENCL_V.
+clfftStatus CompileKernels( const cl_command_queue commQueueFFT, const clfftPlanHandle plHandle, const clfftGenerators gen, FFTPlan* fftPlan )
+{
+    cl_int status = 0;
+
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+
+    // create a cl program executable for the device associated with command queue
+    // Get the device; the query is checked so that q_device is never used uninitialized
+    cl_device_id q_device = NULL;
+    OPENCL_V( clGetCommandQueueInfo( commQueueFFT, CL_QUEUE_DEVICE, sizeof( cl_device_id ), &q_device, NULL ),
+        _T( "clGetCommandQueueInfo failed" ) );
+
+    FFTKernelGenKeyParams fftParams;
+    OPENCL_V( fftPlan->GetKernelGenKey( fftParams ), _T("GetKernelGenKey() failed!") );
+
+    cl_program program;
+    if( fftRepo.getclProgram( gen, fftParams, program ) == CLFFT_INVALID_PROGRAM )
+    {
+        // If the user wishes us to write the kernels out to disk, we do so
+        if( fftRepo.setupData.debugFlags & CLFFT_DUMP_PROGRAMS )
+        {
+            OPENCL_V( WriteKernel( plHandle, gen, fftParams ), _T( "WriteKernel failed." ) );
+        }
+
+        std::string programCode;
+        OPENCL_V( fftRepo.getProgramCode( gen, fftParams, programCode ), _T( "fftRepo.getProgramCode failed." ) );
+
+        const char* source = programCode.c_str();
+        program = clCreateProgramWithSource( fftPlan->context, 1, &source, NULL, &status );
+        OPENCL_V( status, _T( "clCreateProgramWithSource failed." ) );
+
+        // create a cl program executable for the device associated with command queue
+
+#if defined(DEBUGGING)
+        status = clBuildProgram( program, 1, &q_device, "-g -cl-opt-disable", NULL, NULL); // good for debugging kernels
+
+// if you have trouble creating symbols that GDB can pick up to set a breakpoint after kernels are loaded into memory
+// this can be used to stop execution to allow you to set a breakpoint in a kernel after kernel symbols are in memory.
+#ifdef DEBUG_BREAK_GDB
+        __debugbreak();
+#endif
+#else
+        status = clBuildProgram( program, 1, &q_device, NULL, NULL, NULL);
+#endif
+        if( status != CL_SUCCESS )
+        {
+            if( status == CL_BUILD_PROGRAM_FAILURE )
+            {
+                // Printing the log is best effort: a failed log query must not mask the build error
+                size_t buildLogSize = 0;
+                if( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ) == CL_SUCCESS
+                    && buildLogSize > 0 )
+                {
+                    // One spare zeroed element keeps the buffer terminated when it is streamed as a C string
+                    vector< char > buildLog( buildLogSize + 1, '\0' );
+
+                    if( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ) == CL_SUCCESS )
+                    {
+                        std::cerr << "\n\t\t\tBUILD LOG\n";
+                        std::cerr << "************************************************\n";
+                        std::cerr << &buildLog[ 0 ] << std::endl;
+                        std::cerr << "************************************************\n";
+                    }
+                }
+            }
+
+            // The program was never handed to the repo, so it has to be released here
+            clReleaseProgram( program );
+            OPENCL_V( status, _T( "clBuildProgram failed" ) );
+        }
+
+        fftRepo.setclProgram( gen, fftParams, program );
+
+        // For real transforms we compile either forward or backward kernel
+        bool r2c_transform = (fftParams.fft_inputLayout == CLFFT_REAL);
+        bool c2r_transform = (fftParams.fft_outputLayout == CLFFT_REAL);
+        bool real_transform = (gen == Copy) ? true : (r2c_transform || c2r_transform);
+        bool h2c = (gen == Copy) && ((fftParams.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) || (fftParams.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED));
+        bool c2h = (gen == Copy) && ((fftParams.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) || (fftParams.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED));
+
+        // get a kernel object handle for a kernel with the given name
+        cl_kernel kernel;
+        if( (!real_transform) || r2c_transform || c2h )
+        {
+            if( fftRepo.getclKernel( program, CLFFT_FORWARD, kernel ) == CLFFT_INVALID_KERNEL )
+            {
+                std::string entryPoint;
+                OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_FORWARD, entryPoint ), _T( "fftRepo.getProgramEntryPoint failed." ) );
+
+                kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
+                OPENCL_V( status, _T( "clCreateKernel failed" ) );
+
+                fftRepo.setclKernel( program, CLFFT_FORWARD, kernel );
+            }
+        }
+
+        if( (!real_transform) || c2r_transform || h2c )
+        {
+            if( fftRepo.getclKernel( program, CLFFT_BACKWARD, kernel ) == CLFFT_INVALID_KERNEL )
+            {
+                std::string entryPoint;
+                OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_BACKWARD, entryPoint ), _T( "fftRepo.getProgramEntryPoint failed." ) );
+
+                kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
+                OPENCL_V( status, _T( "clCreateKernel failed" ) );
+
+                fftRepo.setclKernel( program, CLFFT_BACKWARD, kernel );
+            }
+        }
+    }
+
+//TODO caching kernel binaries for later reload
+#if 0
+    // figure out number of devices and the sizes of the binary for each device.
+    OPENCL_V( clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, sizeof(fftPlan->number_of_devices), &(fftPlan->number_of_devices), NULL ), _T("CompileKernels(): error getting number of devices") );
+
+
+    // get the sizes of the different binaries
+    fftPlan->ResetBinarySizes();
+    OPENCL_V( clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * fftPlan->number_of_devices, fftPlan->binary_sizes.get(), NULL ), _T("CompileKernels(): error getting binary sizes") );
+
+    // we need a list of naked pointers to all of the binaries for OpenCL
+    std::unique_ptr naked_binary_pointers( new char*[fftPlan->number_of_devices] );
+
+    // make space for all of the generated binaries
+    for( int i = 0; i < fftPlan->number_of_devices; i++ )
+    {
+        // this is our permanent storage place for the binaries
+        fftPlan->binaries.push_back( std::unique_ptr(new char[fftPlan->binary_sizes[i]] ) );
+        // and we need this second copy of it for OpenCL
+        naked_binary_pointers[i] = fftPlan->binaries[i].get();
+    }
+
+    // copy all of the generated binaries over
+    OPENCL_V( clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(char*) * fftPlan->number_of_devices, naked_binary_pointers.get(), NULL ), _T("CompileKernels(): error getting program binaries") );
+#endif
+
+    return CLFFT_SUCCESS;
+}
+
+//TODO caching kernel binaries for later reload
+#if 0
+// Create a program from the kernel binaries stored in the plan, build it, and register the program and its
+// forward/backward kernels with the repo. (Currently disabled by the surrounding #if 0.)
+// Returns CLFFT_SUCCESS when the plan has no devices or when the repo already holds a program for this key.
+// NOTE(review): plHandle is only referenced by the commented-out WriteKernel call below.
+clfftStatus LoadCompiledKernels( const clfftPlanHandle plHandle, const clfftGenerators gen, FFTPlan* plan )
+{
+ // if there are no devices, there are not any kernels to load
+ if( plan->number_of_devices == 0 )
+ return CLFFT_SUCCESS;
+
+ FFTRepo& repo = FFTRepo::getInstance( );
+
+ FFTKernelGenKeyParams fftParams;
+ OPENCL_V( plan->GetKernelGenKey( fftParams ), _T("GetKernelGenKey() failed!") );
+
+ cl_program program;
+ if( repo.getclProgram( gen, fftParams, program ) == CLFFT_INVALID_PROGRAM )
+ {
+ //if( repo.setupData.debugFlags & CLFFT_DUMP_PROGRAMS )
+ //{
+ // OPENCL_V( WriteKernel( plHandle, gen, fftParams ), _T( "WriteKernel failed." ) );
+ // //TODO there's no source to spit out, but we should consider giving the user a helpful message
+ // // such as "there's no source to output -- kernel binaries loaded from file"
+ //}
+
+ // NOTE(review): the template arguments of the unique_ptr declarations and of the reinterpret_cast below
+ // appear to be missing; they need to be restored before this code can be re-enabled.
+ std::unique_ptr binary_status( new cl_int[plan->number_of_devices] );
+ cl_int error_code;
+
+ std::unique_ptr binaries( new const unsigned char*[plan->number_of_devices] );
+ for( int i = 0; i < plan->number_of_devices; i++ )
+ {
+ // NOTE(review): every slot is pointed at plan->binaries[0]; confirm whether binaries[i] was intended
+ binaries[i] = reinterpret_cast(plan->binaries[0].get());
+ }
+
+ if( plan->number_of_devices > 0 )
+ {
+ // NOTE(review): neither error_code nor binary_status is inspected after this call
+ program = clCreateProgramWithBinary( plan->context,
+ (cl_uint)plan->number_of_devices, &plan->devices[0], &plan->binary_sizes[0], &binaries[0],
+ binary_status.get(), &error_code);
+
+ cl_int status = 0;
+ // create a cl program executable; note that the call passes a device count of 1, i.e. only plan->devices[0]
+ status = clBuildProgram( program, 1, &plan->devices[0], NULL, NULL, NULL);
+
+ if( status != CL_SUCCESS )
+ {
+ if( status == CL_BUILD_PROGRAM_FAILURE )
+ {
+ // Query the log size first, then fetch and print the log for plan->devices[0]
+ size_t buildLogSize = 0;
+ OPENCL_V( clGetProgramBuildInfo( program, plan->devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
+ _T( "clGetProgramBuildInfo failed" ) );
+
+ vector< char > buildLog( buildLogSize );
+ ::memset( &buildLog[ 0 ], 0x0, buildLogSize );
+
+ OPENCL_V( clGetProgramBuildInfo( program, plan->devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
+ _T( "clGetProgramBuildInfo failed" ) );
+
+ std::cerr << " \n\t\t\tBUILD LOG\n";
+ std::cerr << " ************************************************\n";
+ std::cerr << &buildLog[ 0 ] << std::endl;
+ std::cerr << " ************************************************\n";
+ }
+
+ OPENCL_V( status, _T( "clBuildProgram failed" ) );
+ }
+
+ repo.setclProgram( gen, fftParams, program );
+
+ // get a kernel object handle for a kernel with the given name
+ // (the entry point names are hard-coded here: "fft_fwd" and "fft_back")
+ cl_kernel kernel;
+ if( repo.getclKernel( program, CLFFT_FORWARD, kernel ) == CLFFT_INVALID_KERNEL )
+ {
+ kernel = clCreateKernel( program, "fft_fwd", &status );
+ OPENCL_V( status, _T( "clCreateKernel failed" ) );
+
+ repo.setclKernel( program, CLFFT_FORWARD, kernel );
+ }
+
+ if( repo.getclKernel( program, CLFFT_BACKWARD, kernel ) == CLFFT_INVALID_KERNEL )
+ {
+ kernel = clCreateKernel( program, "fft_back", &status );
+ OPENCL_V( status, _T( "clCreateKernel failed" ) );
+
+ repo.setclKernel( program, CLFFT_BACKWARD, kernel );
+ }
+
+ // Record the same hard-coded entry point names in the repo.
+ // NOTE(review): this registers them under Stockham rather than the 'gen' argument, and the status of
+ // this second GetKernelGenKey() call is not checked -- confirm both are intended.
+ FFTKernelGenKeyParams params;
+ plan->GetKernelGenKey( params );
+ OPENCL_V( repo.setProgramEntryPoints( Stockham, params, "fft_fwd", "fft_back" ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+ }
+ }
+
+ return CLFFT_SUCCESS;
+}
+#endif
+
+clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_command_queue* commQueueFFT,
+ void (CL_CALLBACK *pfn_notify)( clfftPlanHandle plHandle, void *user_data ), void* user_data )
+{
+ // We do not currently support multi-GPU transforms
+ if( numQueues > 1 )
+ return CLFFT_NOTIMPLEMENTED;
+
+ // Notification mechanism is not set up yet; BakePlan can be called recursively to decompose higher dimension FFT's into
+ // arrays of 1d transforms, and this must be implemented to make only a single callback to the user.
+ if( pfn_notify != NULL )
+ return CLFFT_NOTIMPLEMENTED;
+
+ if( user_data != NULL )
+ return CLFFT_NOTIMPLEMENTED;
+
+ FFTRepo& fftRepo = FFTRepo::getInstance( );
+ FFTPlan* fftPlan = NULL;
+ lockRAII* planLock = NULL;
+
+ OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+ scopedLock sLock( *planLock, _T( "clfftBakePlan" ) );
+
+ // if we have already baked the plan and nothing has changed since, we're done here
+ if( fftPlan->baked == true )
+ {
+ return CLFFT_SUCCESS;
+ }
+
+ //find product of lengths
+ size_t pLength = 1;
+ switch(fftPlan->dim)
+ {
+ case CLFFT_3D: pLength *= fftPlan->length[DimZ];
+ case CLFFT_2D: pLength *= fftPlan->length[DimY];
+ case CLFFT_1D: pLength *= fftPlan->length[DimX];
+ }
+
+ // upper bounds on transfrom lengths - address this in the next release
+ size_t SP_MAX_LEN = 1 << 24;
+ size_t DP_MAX_LEN = 1 << 22;
+ if((fftPlan->precision == CLFFT_SINGLE) && (pLength > SP_MAX_LEN)) return CLFFT_NOTIMPLEMENTED;
+ if((fftPlan->precision == CLFFT_DOUBLE) && (pLength > DP_MAX_LEN)) return CLFFT_NOTIMPLEMENTED;
+
+
+ // release buffers, as these will be created only in EnqueueTransform
+ if( NULL != fftPlan->intBuffer ) { OPENCL_V( clReleaseMemObject( fftPlan->intBuffer ), _T( "Failed to release internal temporary buffer" ) ); fftPlan->intBuffer = NULL; }
+ if( NULL != fftPlan->intBufferRC ) { OPENCL_V( clReleaseMemObject( fftPlan->intBufferRC ), _T( "Failed to release internal temporary buffer" ) ); fftPlan->intBufferRC = NULL; }
+ if( NULL != fftPlan->intBufferC2R ) { OPENCL_V( clReleaseMemObject( fftPlan->intBufferC2R ), _T( "Failed to release internal temporary buffer" ) ); fftPlan->intBufferC2R = NULL; }
+
+
+ if(fftPlan->dim == fftPlan->length.size() && fftPlan->gen != Transpose && fftPlan->gen != Copy) // confirm it is top-level plan (user plan)
+ {
+ if(fftPlan->placeness == CLFFT_INPLACE)
+ {
+ if( (fftPlan->inputLayout == CLFFT_HERMITIAN_PLANAR) || (fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR) )
+ return CLFFT_INVALID_PLAN;
+ }
+
+ // Make sure strides & distance are same for C-C transforms
+ if(fftPlan->placeness == CLFFT_INPLACE)
+ {
+ if( (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL) )
+ {
+ // check strides
+ for(size_t i=0; idim; i++)
+ if(fftPlan->inStride[i] != fftPlan->outStride[i])
+ return CLFFT_INVALID_PLAN;
+
+ // check distance
+ if(fftPlan->iDist != fftPlan->oDist)
+ return CLFFT_INVALID_PLAN;
+ }
+ }
+ }
+
+ if(fftPlan->gen == Copy)
+ {
+ OPENCL_V( fftPlan->GenerateKernel( fftRepo ), _T( "GenerateKernel() failed" ) );
+ OPENCL_V( CompileKernels( *commQueueFFT, plHandle, fftPlan->gen, fftPlan ), _T( "CompileKernels() failed" ) );
+ fftPlan->baked = true;
+ return CLFFT_SUCCESS;
+ }
+
+//TODO caching kernel binaries for later reload
+#if 0
+ if( fftPlan->readFromFile == true )
+ {
+ OPENCL_V( LoadCompiledKernels( plHandle, fftPlan->gen, fftPlan ), _T( "LoadCompiledKernels() failed" ) );
+
+ // all of the plan compressing and subplan making should be done already,
+ // but we still need to make constant buffers
+ OPENCL_V( fftPlan->AllocateBuffers(), _T("AllocateBuffers() failed"));
+ fftPlan->ConstructAndEnqueueConstantBuffers( commQueueFFT );
+
+ if( fftPlan->planX )
+ {
+ OPENCL_V( clfftBakePlan( fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), "clfftBakePlan failed (planX)" );
+ }
+
+ if( fftPlan->planY )
+ {
+ OPENCL_V( clfftBakePlan( fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), "clfftBakePlan failed (planY)" );
+ }
+
+ if( fftPlan->planZ )
+ {
+ OPENCL_V( clfftBakePlan( fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), "clfftBakePlan failed (planZ)" );
+ }
+
+ fftPlan->baked = true;
+ return CLFFT_SUCCESS;
+ }
+#endif
+
+ bool rc = (fftPlan->inputLayout == CLFFT_REAL) || (fftPlan->outputLayout == CLFFT_REAL);
+
+ // Compress the plan by discarding length '1' dimensions
+ // decision to pick generator
+ if(fftPlan->dim == fftPlan->length.size() && fftPlan->gen != Transpose && !rc) // confirm it is top-level plan (user plan)
+ {
+ size_t dmnsn = fftPlan->dim;
+ bool pow2flag = true;
+
+ // switch case flows with no 'break' statements
+ switch(fftPlan->dim)
+ {
+ case CLFFT_3D:
+
+ if(fftPlan->length[DimZ] == 1)
+ {
+ dmnsn -= 1;
+ fftPlan-> inStride.erase(fftPlan-> inStride.begin() + 2);
+ fftPlan->outStride.erase(fftPlan->outStride.begin() + 2);
+ fftPlan-> length.erase(fftPlan-> length.begin() + 2);
+ }
+ else
+ {
+ if( !IsPo2(fftPlan->length[DimZ])) pow2flag=false;
+ }
+ case CLFFT_2D:
+
+ if(fftPlan->length[DimY] == 1)
+ {
+ dmnsn -= 1;
+ fftPlan-> inStride.erase(fftPlan-> inStride.begin() + 1);
+ fftPlan->outStride.erase(fftPlan->outStride.begin() + 1);
+ fftPlan-> length.erase(fftPlan-> length.begin() + 1);
+ }
+ else
+ {
+ if( !IsPo2(fftPlan->length[DimY])) pow2flag=false;
+ }
+
+ case CLFFT_1D:
+
+ if( (fftPlan->length[DimX] == 1) && (dmnsn > 1) )
+ {
+ dmnsn -= 1;
+ fftPlan-> inStride.erase(fftPlan-> inStride.begin());
+ fftPlan->outStride.erase(fftPlan->outStride.begin());
+ fftPlan-> length.erase(fftPlan-> length.begin());
+ }
+ else
+ {
+ if( !IsPo2(fftPlan->length[DimX])) pow2flag=false;
+ }
+ }
+
+ fftPlan->dim = (clfftDim)dmnsn;
+ }
+
+ // first time check transposed
+ if (fftPlan->transposed != CLFFT_NOTRANSPOSE && fftPlan->dim != CLFFT_2D &&
+ fftPlan->dim == fftPlan->length.size())
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+
+ // The largest vector we can transform in a single pass
+ // depends on the GPU caps -- especially the amount of LDS
+ // available
+ //
+ size_t Large1DThreshold = 0;
+
+ //First time check or see if LDS paramters are set-up.
+ if (fftPlan->uLdsFraction == 0)
+ {
+ switch( fftPlan->dim )
+ {
+ case CLFFT_1D:
+ {
+ if (fftPlan->length[0] < 32768 || fftPlan->length[0] > 1048576)
+ fftPlan->uLdsFraction = 8;
+ else
+ fftPlan->uLdsFraction = 4;
+
+ if (fftPlan->length[0] < 1024 )
+ fftPlan->bLdsComplex = true;
+ else
+ fftPlan->bLdsComplex = false;
+ }
+ break;
+ case CLFFT_2D:
+ {
+ fftPlan->uLdsFraction = 4;
+ fftPlan->bLdsComplex = false;
+ }
+ break;
+ case CLFFT_3D:
+ {
+ //for case 128*128*128 and 1024*128*128, fraction = 8 is faster.
+ fftPlan->uLdsFraction = 4;
+ fftPlan->bLdsComplex = false;
+ }
+ break;
+ }
+ }
+ OPENCL_V(fftPlan->GetMax1DLength (&Large1DThreshold), _T("GetMax1DLength failed"));
+ BUG_CHECK (Large1DThreshold > 1);
+
+ // Verify that the data passed to us is packed
+ switch( fftPlan->dim )
+ {
+ case CLFFT_1D:
+ {
+ if ( fftPlan->length[0] > Large1DThreshold )
+ {
+ size_t clLengths[] = { 1, 1, 0 };
+ size_t in_1d, in_x, count;
+
+ BUG_CHECK (IsPo2 (Large1DThreshold))
+ //ARG_CHECK (IsPo2 (fftPlan->length[0]))
+
+ // see whether large1D_Xfactor are fixed or not
+ if (fftPlan->large1D_Xfactor == 0 )
+ {
+ if( IsPo2(fftPlan->length[0]) )
+ {
+ in_1d = BitScanF (Large1DThreshold); // this is log2(LARGE1D_THRESHOLD)
+ in_x = BitScanF (fftPlan->length[0]); // this is log2(length)
+ BUG_CHECK (in_1d > 0)
+ count = in_x/in_1d;
+ if (count*in_1d < in_x)
+ {
+ count++;
+ in_1d = in_x / count;
+ if (in_1d * count < in_x) in_1d++;
+ }
+ clLengths[1] = (size_t)1 << in_1d;
+
+ }
+ else
+ {
+ // This array must be kept sorted in the ascending order
+ size_t supported[] = { 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 27, 30, 32, 36, 40,
+ 45, 48, 50, 54, 60, 64, 72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135,
+ 144, 150, 160, 162, 180, 192, 200, 216, 225, 240, 243, 250, 256, 270, 288,
+ 300, 320, 324, 360, 375, 384, 400, 405, 432, 450, 480, 486, 500, 512, 540,
+ 576, 600, 625, 640, 648, 675, 720, 729, 750, 768, 800, 810, 864, 900, 960,
+ 972, 1000, 1024, 1080, 1125, 1152, 1200, 1215, 1250, 1280, 1296, 1350, 1440,
+ 1458, 1500, 1536, 1600, 1620, 1728, 1800, 1875, 1920, 1944, 2000, 2025, 2048,
+ 2160, 2187, 2250, 2304, 2400, 2430, 2500, 2560, 2592, 2700, 2880, 2916, 3000,
+ 3072, 3125, 3200, 3240, 3375, 3456, 3600, 3645, 3750, 3840, 3888, 4000, 4050, 4096 };
+
+ size_t lenSupported = sizeof(supported)/sizeof(supported[0]);
+ size_t maxFactoredLength = (supported[lenSupported-1] < Large1DThreshold) ? supported[lenSupported-1] : Large1DThreshold;
+
+ size_t halfPowerLength = (size_t)1 << ( (StockhamGenerator::CeilPo2(fftPlan->length[0]) + 1) / 2 );
+ size_t factoredLengthStart = (halfPowerLength < maxFactoredLength) ? halfPowerLength : maxFactoredLength;
+
+ size_t indexStart = 0;
+ while(supported[indexStart] < factoredLengthStart) indexStart++;
+
+ for(size_t i = indexStart; i >= 1; i--)
+ {
+ if( fftPlan->length[0] % supported[i] == 0 )
+ {
+ clLengths[1] = supported[i];
+ break;
+ }
+ }
+ }
+
+ clLengths[0] = fftPlan->length[0]/clLengths[1];
+ }
+ else
+ {
+ //large1D_Xfactor will not pass to the second level of call
+ clLengths[0] = fftPlan->large1D_Xfactor;
+ clLengths[1] = fftPlan->length[0]/clLengths[0];
+ ARG_CHECK (fftPlan->length[0] == clLengths[0] * clLengths[1]);
+ }
+
+ while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
+ {
+ if (!IsPo2(fftPlan->length[0])) break;
+ //if (fftPlan->precision != CLFFT_SINGLE) break;
+ //TBD, only one dimension?
+ if (fftPlan->length.size() > 1) break;
+ if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break;
+ //This length is good for using transpose
+ if (fftPlan->length[0] < 131072) break;
+
+ //first version not support huge1D, TBD
+ if (clLengths[0] > Large1DThreshold) break;
+ ARG_CHECK(clLengths[0]>=32 && clLengths[1]>=32);
+
+ if (fftPlan->tmpBufSize==0 )
+ {
+ fftPlan->tmpBufSize = clLengths[0] * clLengths[1] *
+ fftPlan->batchsize * fftPlan->ElementSize();
+ }
+
+ //Transpose
+ //Input --> tmp buffer
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
+ _T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
+
+ FFTPlan* trans1Plan = NULL;
+ lockRAII* trans1Lock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
+
+ trans1Plan->placeness = CLFFT_OUTOFPLACE;
+ trans1Plan->precision = fftPlan->precision;
+ trans1Plan->tmpBufSize = 0;
+ trans1Plan->batchsize = fftPlan->batchsize;
+ trans1Plan->envelope = fftPlan->envelope;
+ trans1Plan->inputLayout = fftPlan->inputLayout;
+ trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ trans1Plan->inStride[0] = fftPlan->inStride[0];
+ trans1Plan->inStride[1] = clLengths[0];
+ trans1Plan->outStride[0] = 1;
+ trans1Plan->outStride[1] = clLengths[1];
+ trans1Plan->iDist = fftPlan->iDist;
+ trans1Plan->oDist = fftPlan->length[0];
+ trans1Plan->gen = Transpose;
+ trans1Plan->transflag = true;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan large1d trans1 plan failed" ) );
+
+ //Row transform
+ //tmp->output
+ //size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
+ _T( "CreateDefaultPlan Large1d column failed" ) );
+
+ FFTPlan* row1Plan = NULL;
+ lockRAII* row1Lock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, row1Plan, row1Lock ), _T( "fftRepo.getPlan failed" ) );
+
+ row1Plan->placeness = CLFFT_OUTOFPLACE;
+ row1Plan->precision = fftPlan->precision;
+ row1Plan->forwardScale = 1.0f;
+ row1Plan->backwardScale = 1.0f;
+ row1Plan->tmpBufSize = 0;
+ row1Plan->batchsize = fftPlan->batchsize;
+ row1Plan->bLdsComplex = fftPlan->bLdsComplex;
+ row1Plan->uLdsFraction = fftPlan->uLdsFraction;
+ row1Plan->ldsPadding = fftPlan->ldsPadding;
+ row1Plan->gen = fftPlan->gen;
+ row1Plan->envelope = fftPlan->envelope;
+
+ //Pass large1D flag to confirm we need multiply twiddle factor
+ row1Plan->large1D = fftPlan->length[0];
+
+ row1Plan->length.push_back(clLengths[0]);
+ row1Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ row1Plan->outputLayout = fftPlan->outputLayout;
+ row1Plan->inStride[0] = 1;
+ row1Plan->outStride[0] = fftPlan->outStride[0];
+ row1Plan->iDist = fftPlan->length[0];
+ row1Plan->oDist = fftPlan->oDist;
+ row1Plan->inStride.push_back(clLengths[1]);
+ row1Plan->outStride.push_back(clLengths[1]);
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan large1d first row plan failed" ) );
+
+ //Transpose 2
+ //Output --> tmp buffer
+ clLengths[2] = clLengths[0];
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTY, fftPlan->context, CLFFT_2D, &clLengths[1] ),
+ _T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
+
+ FFTPlan* trans2Plan = NULL;
+ lockRAII* trans2Lock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
+
+ trans2Plan->placeness = CLFFT_OUTOFPLACE;
+ trans2Plan->precision = fftPlan->precision;
+ trans2Plan->tmpBufSize = 0;
+ trans2Plan->batchsize = fftPlan->batchsize;
+ trans2Plan->envelope = fftPlan->envelope;
+ trans2Plan->inputLayout = fftPlan->outputLayout;
+ trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ trans2Plan->inStride[0] = fftPlan->outStride[0];
+ trans2Plan->inStride[1] = clLengths[1];
+ trans2Plan->outStride[0] = 1;
+ trans2Plan->outStride[1] = clLengths[0];
+ trans2Plan->iDist = fftPlan->oDist;
+ trans2Plan->oDist = fftPlan->length[0];
+ trans2Plan->gen = Transpose;
+ trans2Plan->transflag = true;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan large1d trans2 plan failed" ) );
+
+ //Row transform 2
+ //tmp->tmp
+ //size clLengths[0], batch clLengths[1]
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
+ _T( "CreateDefaultPlan Large1d column failed" ) );
+
+ FFTPlan* row2Plan = NULL;
+ lockRAII* row2Lock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planY, row2Plan, row2Lock ), _T( "fftRepo.getPlan failed" ) );
+
+ row2Plan->placeness = CLFFT_INPLACE;
+ row2Plan->precision = fftPlan->precision;
+ row2Plan->forwardScale = fftPlan->forwardScale;
+ row2Plan->backwardScale = fftPlan->backwardScale;
+ row2Plan->tmpBufSize = 0;
+ row2Plan->batchsize = fftPlan->batchsize;
+ row2Plan->bLdsComplex = fftPlan->bLdsComplex;
+ row2Plan->uLdsFraction = fftPlan->uLdsFraction;
+ row2Plan->ldsPadding = fftPlan->ldsPadding;
+ row2Plan->gen = fftPlan->gen;
+ row2Plan->envelope = fftPlan->envelope;
+
+ //No twiddle factor is needed.
+ row2Plan->large1D = 0;
+
+ row2Plan->length.push_back(clLengths[1]);
+ row2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ row2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ row2Plan->inStride[0] = 1;
+ row2Plan->outStride[0] = 1;
+ row2Plan->iDist = fftPlan->length[0];
+ row2Plan->oDist = fftPlan->length[0];
+ row2Plan->inStride.push_back(clLengths[0]);
+ row2Plan->outStride.push_back(clLengths[0]);
+
+ OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan large1d first row plan failed" ) );
+
+ //Transpose 3
+ //tmp --> output
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
+ _T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
+
+ FFTPlan* trans3Plan = NULL;
+ lockRAII* trans3Lock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
+
+ trans3Plan->placeness = CLFFT_OUTOFPLACE;
+ trans3Plan->precision = fftPlan->precision;
+ trans3Plan->tmpBufSize = 0;
+ trans3Plan->batchsize = fftPlan->batchsize;
+ trans3Plan->envelope = fftPlan->envelope;
+ trans3Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ trans3Plan->outputLayout = fftPlan->outputLayout;
+ trans3Plan->inStride[0] = 1;
+ trans3Plan->inStride[1] = clLengths[0];
+ trans3Plan->outStride[0] = fftPlan->outStride[0];
+ trans3Plan->outStride[1] = clLengths[1];
+ trans3Plan->iDist = fftPlan->length[0];
+ trans3Plan->oDist = fftPlan->oDist;
+ trans3Plan->gen = Transpose;
+ trans3Plan->transflag = true;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan large1d trans3 plan failed" ) );
+
+ fftPlan->transflag = true;
+ fftPlan->baked = true;
+ return CLFFT_SUCCESS;
+ }
+
+ size_t length0 = clLengths[0];
+ size_t length1 = clLengths[1];
+
+ if(fftPlan->inputLayout == CLFFT_REAL)
+ {
+ if (fftPlan->tmpBufSizeRC==0 )
+ {
+ fftPlan->tmpBufSizeRC = length0 * length1 *
+ fftPlan->batchsize * fftPlan->ElementSize();
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ fftPlan->tmpBufSizeRC *= fftPlan->length[index];
+ }
+ }
+
+ // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
+ // transposed output
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
+ _T( "CreateDefaultPlan Large1d column failed" ) );
+
+ FFTPlan* colTPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // current plan is to create intermediate buffer, packed and interleave
+ // This is a column FFT, the first elements distance between each FFT is the distance of the first two
+ // elements in the original buffer. Like a transpose of the matrix
+ // we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
+
+ //this part is common to both passes
+ colTPlan->placeness = CLFFT_OUTOFPLACE;
+ colTPlan->precision = fftPlan->precision;
+ colTPlan->forwardScale = 1.0f;
+ colTPlan->backwardScale = 1.0f;
+ colTPlan->tmpBufSize = 0;
+ colTPlan->batchsize = fftPlan->batchsize;
+ colTPlan->bLdsComplex = fftPlan->bLdsComplex;
+ colTPlan->uLdsFraction = fftPlan->uLdsFraction;
+ colTPlan->ldsPadding = fftPlan->ldsPadding;
+ colTPlan->gen = fftPlan->gen;
+ colTPlan->envelope = fftPlan->envelope;
+
+ //Pass large1D flag to confirm we need multiply twiddle factor
+ colTPlan->large1D = fftPlan->length[0];
+ colTPlan->RCsimple = true;
+
+ colTPlan->length.push_back(clLengths[0]);
+
+ // first Pass
+ colTPlan->inputLayout = fftPlan->inputLayout;
+ colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colTPlan->inStride[0] = fftPlan->inStride[0] * clLengths[0];
+ colTPlan->outStride[0] = 1;
+ colTPlan->iDist = fftPlan->iDist;
+ colTPlan->oDist = length0 * length1;//fftPlan->length[0];
+ colTPlan->inStride.push_back(fftPlan->inStride[0]);
+ colTPlan->outStride.push_back(length1);//clLengths[1]);
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ colTPlan->length.push_back(fftPlan->length[index]);
+ colTPlan->inStride.push_back(fftPlan->inStride[index]);
+ // tmp buffer is tightly packed
+ colTPlan->outStride.push_back(colTPlan->oDist);
+ colTPlan->oDist *= fftPlan->length[index];
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
+
+ //another column FFT, size clLengths[0], batch clLengths[1], output without transpose
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
+ _T( "CreateDefaultPlan large1D row failed" ) );
+
+ FFTPlan* col2Plan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // This is second column fft, intermediate buffer is packed and interleaved
+ // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+
+ // common part for both passes
+ col2Plan->placeness = CLFFT_INPLACE;
+ col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+
+ col2Plan->precision = fftPlan->precision;
+ col2Plan->forwardScale = fftPlan->forwardScale;
+ col2Plan->backwardScale = fftPlan->backwardScale;
+ col2Plan->tmpBufSize = 0;
+ col2Plan->batchsize = fftPlan->batchsize;
+ col2Plan->bLdsComplex = fftPlan->bLdsComplex;
+ col2Plan->uLdsFraction = fftPlan->uLdsFraction;
+ col2Plan->ldsPadding = fftPlan->ldsPadding;
+ col2Plan->gen = fftPlan->gen;
+ col2Plan->envelope = fftPlan->envelope;
+
+ col2Plan->length.push_back(length1);
+
+ col2Plan->inStride[0] = length1;
+ col2Plan->inStride.push_back(1);
+ col2Plan->iDist = length0 * length1;
+
+ col2Plan->outStride[0] = length1;
+ col2Plan->outStride.push_back(1);
+ col2Plan->oDist = length0 * length1;
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ col2Plan->length.push_back(fftPlan->length[index]);
+ col2Plan->inStride.push_back(col2Plan->iDist);
+ col2Plan->outStride.push_back(col2Plan->oDist);
+ col2Plan->iDist *= fftPlan->length[index];
+ col2Plan->oDist *= fftPlan->length[index];
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
+
+
+ // copy plan to get back to hermitian
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0] ),
+ _T( "CreateDefaultPlan RC copy failed" ) );
+
+ FFTPlan* copyPlan = NULL;
+ lockRAII* copyLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // This is the copy pass: it reads the packed, interleaved intermediate buffer
+ // and writes to the caller's output layout, strides and distance
+
+ // common part for both passes
+ copyPlan->placeness = CLFFT_OUTOFPLACE;
+ copyPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ copyPlan->outputLayout = fftPlan->outputLayout;
+
+ copyPlan->precision = fftPlan->precision;
+ copyPlan->forwardScale = 1.0f;
+ copyPlan->backwardScale = 1.0f;
+ copyPlan->tmpBufSize = 0;
+ copyPlan->batchsize = fftPlan->batchsize;
+ copyPlan->bLdsComplex = fftPlan->bLdsComplex;
+ copyPlan->uLdsFraction = fftPlan->uLdsFraction;
+ copyPlan->ldsPadding = fftPlan->ldsPadding;
+ copyPlan->gen = Copy;
+ copyPlan->envelope = fftPlan->envelope;
+
+
+ copyPlan->inStride[0] = 1;
+ copyPlan->iDist = fftPlan->length[0];
+
+ copyPlan->outStride[0] = fftPlan->outStride[0];
+ copyPlan->oDist = fftPlan->oDist;
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ copyPlan->length.push_back(fftPlan->length[index]);
+ copyPlan->inStride.push_back(copyPlan->inStride[index-1] * fftPlan->length[index-1]);
+ copyPlan->iDist *= fftPlan->length[index];
+ copyPlan->outStride.push_back(fftPlan->outStride[index]);
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
+
+ }
+ else if(fftPlan->outputLayout == CLFFT_REAL)
+ {
+ if (fftPlan->tmpBufSizeRC==0 )
+ {
+ fftPlan->tmpBufSizeRC = length0 * length1 *
+ fftPlan->batchsize * fftPlan->ElementSize();
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ fftPlan->tmpBufSizeRC *= fftPlan->length[index];
+ }
+ }
+
+ // copy plan to convert from hermitian to full complex
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0] ),
+ _T( "CreateDefaultPlan RC copy failed" ) );
+
+ FFTPlan* copyPlan = NULL;
+ lockRAII* copyLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // This is the copy pass: it reads the caller's input layout, strides and distance
+ // and writes a packed, interleaved intermediate buffer
+
+ // common part for both passes
+ copyPlan->placeness = CLFFT_OUTOFPLACE;
+ copyPlan->inputLayout = fftPlan->inputLayout;
+ copyPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+
+ copyPlan->precision = fftPlan->precision;
+ copyPlan->forwardScale = 1.0f;
+ copyPlan->backwardScale = 1.0f;
+ copyPlan->tmpBufSize = 0;
+ copyPlan->batchsize = fftPlan->batchsize;
+ copyPlan->bLdsComplex = fftPlan->bLdsComplex;
+ copyPlan->uLdsFraction = fftPlan->uLdsFraction;
+ copyPlan->ldsPadding = fftPlan->ldsPadding;
+ copyPlan->gen = Copy;
+ copyPlan->envelope = fftPlan->envelope;
+
+ copyPlan->inStride[0] = fftPlan->inStride[0];
+ copyPlan->iDist = fftPlan->iDist;
+
+ copyPlan->outStride[0] = 1;
+ copyPlan->oDist = fftPlan->length[0];
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ copyPlan->length.push_back(fftPlan->length[index]);
+ copyPlan->outStride.push_back(copyPlan->outStride[index-1] * fftPlan->length[index-1]);
+ copyPlan->oDist *= fftPlan->length[index];
+ copyPlan->inStride.push_back(fftPlan->inStride[index]);
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
+
+ // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
+ // transposed output
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
+ _T( "CreateDefaultPlan Large1d column failed" ) );
+
+ FFTPlan* colTPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // current plan is to create intermediate buffer, packed and interleave
+ // This is a column FFT, the first elements distance between each FFT is the distance of the first two
+ // elements in the original buffer. Like a transpose of the matrix
+ // we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
+
+ //this part is common to both passes
+ colTPlan->placeness = CLFFT_INPLACE;
+ colTPlan->precision = fftPlan->precision;
+ colTPlan->forwardScale = 1.0f;
+ colTPlan->backwardScale = 1.0f;
+ colTPlan->tmpBufSize = 0;
+ colTPlan->batchsize = fftPlan->batchsize;
+ colTPlan->bLdsComplex = fftPlan->bLdsComplex;
+ colTPlan->uLdsFraction = fftPlan->uLdsFraction;
+ colTPlan->ldsPadding = fftPlan->ldsPadding;
+ colTPlan->gen = fftPlan->gen;
+ colTPlan->envelope = fftPlan->envelope;
+
+ //Pass large1D flag to confirm we need multiply twiddle factor
+ colTPlan->large1D = fftPlan->length[0];
+
+ colTPlan->length.push_back(clLengths[0]);
+
+ // first Pass
+ colTPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+
+
+ colTPlan->inStride[0] = length0;
+ colTPlan->inStride.push_back(1);
+ colTPlan->iDist = length0 * length1;
+
+ colTPlan->outStride[0] = length0;
+ colTPlan->outStride.push_back(1);
+ colTPlan->oDist = length0 * length1;
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ colTPlan->length.push_back(fftPlan->length[index]);
+ colTPlan->inStride.push_back(colTPlan->iDist);
+ colTPlan->outStride.push_back(colTPlan->oDist);
+ colTPlan->iDist *= fftPlan->length[index];
+ colTPlan->oDist *= fftPlan->length[index];
+ }
+
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
+
+ //another column FFT, size clLengths[0], batch clLengths[1], output without transpose
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
+ _T( "CreateDefaultPlan large1D row failed" ) );
+
+ FFTPlan* col2Plan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // This is second column fft, intermediate buffer is packed and interleaved
+ // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+
+ // common part for both passes
+ col2Plan->placeness = CLFFT_OUTOFPLACE;
+ col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ col2Plan->outputLayout = fftPlan->outputLayout;
+
+ col2Plan->precision = fftPlan->precision;
+ col2Plan->forwardScale = fftPlan->forwardScale;
+ col2Plan->backwardScale = fftPlan->backwardScale;
+ col2Plan->tmpBufSize = 0;
+ col2Plan->batchsize = fftPlan->batchsize;
+ col2Plan->bLdsComplex = fftPlan->bLdsComplex;
+ col2Plan->uLdsFraction = fftPlan->uLdsFraction;
+ col2Plan->ldsPadding = fftPlan->ldsPadding;
+ col2Plan->gen = fftPlan->gen;
+ col2Plan->envelope = fftPlan->envelope;
+
+ col2Plan->RCsimple = true;
+ col2Plan->length.push_back(length1);
+
+ col2Plan->inStride[0] = 1;
+ col2Plan->inStride.push_back(length0);
+ col2Plan->iDist = length0 * length1;
+
+ col2Plan->outStride[0] = length1 * fftPlan->outStride[0];
+ col2Plan->outStride.push_back(fftPlan->outStride[0]);
+ col2Plan->oDist = fftPlan->oDist;
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ col2Plan->length.push_back(fftPlan->length[index]);
+ col2Plan->inStride.push_back(col2Plan->iDist);
+ col2Plan->iDist *= fftPlan->length[index];
+ col2Plan->outStride.push_back(fftPlan->outStride[index]);
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
+ }
+ else
+ {
+ if (fftPlan->cacheSize) {
+ length0 += fftPlan->cacheSize & 0xFF;
+ length1 += (fftPlan->cacheSize >> 8) & 0xFF;
+ if (length0 * length1 > 2 * fftPlan->length[0])
+ {
+ length0 = clLengths[0];
+ length1 = clLengths[1];
+ }
+ }
+ else
+ {
+ if (fftPlan->length[0] == 131072) length1 += 1; //x0=0, y0=1 good for Cayman card
+ else if (fftPlan->length[0] == 65536) length1 += 8; //x0=0, y0=8 good for Cypress card
+ }
+
+ if (clLengths[0] > Large1DThreshold)
+ {//make no change for Huge 1D case
+ length0 = clLengths[0];
+ length1 = clLengths[1];
+ }
+
+ if (fftPlan->tmpBufSize==0 )
+ {
+ fftPlan->tmpBufSize = length0 * length1 *
+ fftPlan->batchsize * fftPlan->ElementSize();
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ fftPlan->tmpBufSize *= fftPlan->length[index];
+ }
+ }
+ else
+ {//make no change for cases passed from higher dimension
+ length0 = clLengths[0];
+ length1 = clLengths[1];
+ }
+
+ // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
+ // transposed output
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
+ _T( "CreateDefaultPlan Large1d column failed" ) );
+
+ FFTPlan* colTPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // current plan is to create intermediate buffer, packed and interleave
+ // This is a column FFT, the first elements distance between each FFT is the distance of the first two
+ // elements in the original buffer. Like a transpose of the matrix
+ // we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
+
+ //this part is common to both passes
+ colTPlan->placeness = CLFFT_OUTOFPLACE;
+ colTPlan->precision = fftPlan->precision;
+ colTPlan->forwardScale = 1.0f;
+ colTPlan->backwardScale = 1.0f;
+ colTPlan->tmpBufSize = 0;
+ colTPlan->batchsize = fftPlan->batchsize;
+ colTPlan->bLdsComplex = fftPlan->bLdsComplex;
+ colTPlan->uLdsFraction = fftPlan->uLdsFraction;
+ colTPlan->ldsPadding = fftPlan->ldsPadding;
+ colTPlan->gen = fftPlan->gen;
+ colTPlan->envelope = fftPlan->envelope;
+
+ //Pass large1D flag to confirm we need multiply twiddle factor
+ colTPlan->large1D = fftPlan->length[0];
+
+ colTPlan->length.push_back(clLengths[0]);
+
+ if (fftPlan->large1D == 0)
+ {
+ // first Pass
+ colTPlan->inputLayout = fftPlan->inputLayout;
+ colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colTPlan->inStride[0] = fftPlan->inStride[0] * clLengths[0];
+ colTPlan->outStride[0] = 1;
+ colTPlan->iDist = fftPlan->iDist;
+ colTPlan->oDist = length0 * length1;//fftPlan->length[0];
+ colTPlan->inStride.push_back(fftPlan->inStride[0]);
+ colTPlan->outStride.push_back(length1);//clLengths[1]);
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ colTPlan->length.push_back(fftPlan->length[index]);
+ colTPlan->inStride.push_back(fftPlan->inStride[index]);
+ // tmp buffer is tightly packed
+ colTPlan->outStride.push_back(colTPlan->oDist);
+ colTPlan->oDist *= fftPlan->length[index];
+ }
+ }
+ else
+ {
+ // second pass for huge 1D
+ colTPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colTPlan->outputLayout = fftPlan->outputLayout;
+ colTPlan->inStride[0] = fftPlan->length[1]*clLengths[0];
+ colTPlan->outStride[0] = fftPlan->outStride[0];
+ colTPlan->iDist = fftPlan->length[0];
+ colTPlan->oDist = fftPlan->oDist;
+ colTPlan->inStride.push_back(fftPlan->length[1]);
+ colTPlan->outStride.push_back(fftPlan->outStride[0]*clLengths[1]);
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ colTPlan->length.push_back(fftPlan->length[index]);
+ colTPlan->inStride.push_back(fftPlan->inStride[index]);
+ colTPlan->outStride.push_back(fftPlan->outStride[index]);
+ colTPlan->iDist *= fftPlan->length[index];
+ }
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
+
+ //another column FFT, size clLengths[0], batch clLengths[1], output without transpose
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
+ _T( "CreateDefaultPlan large1D row failed" ) );
+
+ FFTPlan* col2Plan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // This is second column fft, intermediate buffer is packed and interleaved
+ // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+
+ // common part for both passes
+ col2Plan->outputLayout = fftPlan->outputLayout;
+ col2Plan->precision = fftPlan->precision;
+ col2Plan->forwardScale = fftPlan->forwardScale;
+ col2Plan->backwardScale = fftPlan->backwardScale;
+ col2Plan->tmpBufSize = 0;
+ col2Plan->batchsize = fftPlan->batchsize;
+ col2Plan->oDist = fftPlan->oDist;
+ col2Plan->bLdsComplex = fftPlan->bLdsComplex;
+ col2Plan->uLdsFraction = fftPlan->uLdsFraction;
+ col2Plan->ldsPadding = fftPlan->ldsPadding;
+ col2Plan->gen = fftPlan->gen;
+ col2Plan->envelope = fftPlan->envelope;
+
+ if (clLengths[0] > Large1DThreshold)
+ //prepare for huge 1D
+ col2Plan->large1D = fftPlan->length[0];
+
+ col2Plan->length.push_back(clLengths[1]);
+ col2Plan->outStride.push_back(fftPlan->outStride[0]);
+
+ if (fftPlan->large1D == 0)
+ {
+ //first layer, large 1D from tmp buffer to output buffer
+ col2Plan->placeness = CLFFT_OUTOFPLACE;
+ col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ col2Plan->inStride[0] = length1;//clLengths[1];
+ col2Plan->outStride[0] = fftPlan->outStride[0] * clLengths[1];
+ col2Plan->iDist = length0 * length1; //fftPlan->length[0];
+ col2Plan->inStride.push_back(1);
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ col2Plan->length.push_back(fftPlan->length[index]);
+ col2Plan->inStride.push_back(col2Plan->iDist);
+ col2Plan->outStride.push_back(fftPlan->outStride[index]);
+ col2Plan->iDist *= fftPlan->length[index];
+ }
+ }
+ else
+ {
+ //second layer, huge 1D from output buffer to output buffer
+ col2Plan->placeness = CLFFT_INPLACE;
+ col2Plan->inputLayout = fftPlan->outputLayout;
+ col2Plan->inStride[0] = fftPlan->outStride[0] * clLengths[1];
+ col2Plan->outStride[0] = col2Plan->inStride[0];
+ col2Plan->iDist = fftPlan->oDist;
+ col2Plan->inStride.push_back(fftPlan->outStride[0]);
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ col2Plan->length.push_back(fftPlan->length[index]);
+ col2Plan->inStride.push_back(fftPlan->outStride[index]);
+ col2Plan->outStride.push_back(fftPlan->outStride[index]);
+ }
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
+ }
+
+ fftPlan->baked = true;
+ return CLFFT_SUCCESS;
+ }
+ }
+ break;
+ case CLFFT_2D:
+ {
+ size_t length0 = fftPlan->length[0];
+ size_t length1 = fftPlan->length[1];
+
+
+ if (fftPlan->cacheSize)
+ {
+ length0 += fftPlan->cacheSize & 0xFF;
+ length1 += (fftPlan->cacheSize >> 8) & 0xFF;
+ if (length0 * length1 > 2 * fftPlan->length[0] * fftPlan->length[1])
+ {
+ length0 = fftPlan->length[0];
+ length1 = fftPlan->length[1];
+ }
+ }
+ else
+ {
+ if (fftPlan->length[0]==256 && fftPlan->length[1]==256)
+ {
+ length0 += 8;
+ length1 += 1;
+ }
+ else if (fftPlan->length[0]==512 && fftPlan->length[1]==512)
+ {
+ length0 += 1;
+ length1 += 1;//length1 += 0;
+ }
+ else if (fftPlan->length[0]==1024 && fftPlan->length[1]==512)
+ {
+ length0 += 2;
+ length1 += 2;//length1 += 0;
+ }
+ else if (fftPlan->length[0]==1024 && fftPlan->length[1]==1024)
+ {
+ length0 += 1;
+ length1 += 1;//length1 += 0;
+ }
+ }
+
+ if (fftPlan->length[0] > Large1DThreshold ||
+ fftPlan->length[1] > Large1DThreshold)
+ fftPlan->large2D = true;
+
+ while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
+ {
+ //break;
+ if (fftPlan->transflag) //Transpose for 2D
+ {
+ OPENCL_V( fftPlan->GenerateKernel( fftRepo ), _T( "GenerateTransposeProgram() failed" ) );
+ OPENCL_V( CompileKernels( *commQueueFFT, plHandle, fftPlan->gen, fftPlan ), _T( "CompileKernels() failed" ) );
+
+ fftPlan->baked = true;
+ return CLFFT_SUCCESS;
+ }
+
+ if (fftPlan->length.size() != 2) break;
+ if (!(IsPo2(fftPlan->length[0])) || !(IsPo2(fftPlan->length[1])))
+ break;
+ if (fftPlan->length[1] < 32) break;
+ //TBD: restrict the use of large2D in the x!=y case because we will need two temp buffers
+ // (1) for 2D usage (2) for 1D large usage
+ //if (fftPlan->large2D) break;
+ //Performance measurements show the transpose path pays off once length[0] reaches 512;
+ //if the user wants the result to be transposed, we take this path even below 512.
+ if (fftPlan->length[0] < 512 && fftPlan->transposed == CLFFT_NOTRANSPOSE) break;
+ if (fftPlan->length[0] < 32) break;
+ //x!=y case, we need tmp buffer, currently temp buffer only support interleaved format
+ //if (fftPlan->length[0] != fftPlan->length[1] && fftPlan->outputLayout == CLFFT_COMPLEX_PLANAR) break;
+ if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1 ||
+ fftPlan->inStride[1] != fftPlan->length[0] || fftPlan->outStride[1] != fftPlan->length[0])
+ break;
+ //if (fftPlan->placeness != CLFFT_INPLACE || fftPlan->inputLayout != CLFFT_COMPLEX_PLANAR)
+ // break;
+ //if (fftPlan->batchsize != 1) break;
+ //if (fftPlan->precision != CLFFT_SINGLE) break;
+
+ fftPlan->transflag = true;
+
+ //create row plan,
+ // x=y & x!=y, In->In for inplace, In->out for outofplace
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
+ _T( "CreateDefaultPlan for planX failed" ) );
+
+ FFTPlan* rowPlan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+ rowPlan->inputLayout = fftPlan->inputLayout;
+ rowPlan->outputLayout = fftPlan->outputLayout;
+ rowPlan->placeness = fftPlan->placeness;
+ rowPlan->outStride[0] = fftPlan->outStride[0];
+ rowPlan->outStride.push_back(fftPlan->outStride[1]);
+ rowPlan->oDist = fftPlan->oDist;
+ rowPlan->precision = fftPlan->precision;
+ rowPlan->forwardScale = 1.0f;
+ rowPlan->backwardScale = 1.0f;
+ rowPlan->tmpBufSize = 0;
+ rowPlan->bLdsComplex = fftPlan->bLdsComplex;
+ rowPlan->uLdsFraction = fftPlan->uLdsFraction;
+ rowPlan->ldsPadding = fftPlan->ldsPadding;
+ rowPlan->gen = fftPlan->gen;
+ rowPlan->envelope = fftPlan->envelope;
+ rowPlan->batchsize = fftPlan->batchsize;
+ rowPlan->inStride[0] = fftPlan->inStride[0];
+ rowPlan->length.push_back(fftPlan->length[1]);
+ rowPlan->inStride.push_back(fftPlan->inStride[1]);
+ rowPlan->iDist = fftPlan->iDist;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan for planX failed" ) );
+
+ //Create transpose plan for first transpose
+ //x=y: inplace. x!=y inplace: in->tmp, outofplace out->tmp
+ size_t clLengths[] = { 1, 1, 0 };
+ clLengths[0] = fftPlan->length[0];
+ clLengths[1] = fftPlan->length[1];
+
+ bool xyflag = (clLengths[0]==clLengths[1]) ? false : true;
+ if (xyflag && fftPlan->tmpBufSize==0 && fftPlan->length.size()<=2)
+ {
+ // we need tmp buffer for x!=y case
+ // we assume the tmp buffer is packed interleaved
+ fftPlan->tmpBufSize = length0 * length1 *
+ fftPlan->batchsize * fftPlan->ElementSize();
+ }
+
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
+ _T( "CreateDefaultPlan for planT failed" ) );
+
+ FFTPlan* transPlanX = NULL;
+ lockRAII* transLockX = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planTX, transPlanX, transLockX ), _T( "fftRepo.getPlan failed" ) );
+
+ transPlanX->inputLayout = fftPlan->outputLayout;
+ transPlanX->precision = fftPlan->precision;
+ transPlanX->tmpBufSize = 0;
+ transPlanX->gen = Transpose;
+ transPlanX->envelope = fftPlan->envelope;
+ transPlanX->batchsize = fftPlan->batchsize;
+ transPlanX->inStride[0] = fftPlan->outStride[0];
+ transPlanX->inStride[1] = fftPlan->outStride[1];
+ transPlanX->iDist = fftPlan->oDist;
+ transPlanX->transflag = true;
+
+ if (xyflag)
+ {
+ transPlanX->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ transPlanX->placeness = CLFFT_OUTOFPLACE;
+ transPlanX->outStride[0] = 1;
+ transPlanX->outStride[1] = clLengths[0];
+ transPlanX->oDist = clLengths[0] * clLengths[1];
+ }
+ else
+ {
+ transPlanX->outputLayout = fftPlan->outputLayout;
+ transPlanX->placeness = CLFFT_INPLACE;
+ transPlanX->outStride[0] = fftPlan->outStride[0];
+ transPlanX->outStride[1] = fftPlan->outStride[1];
+ transPlanX->oDist = fftPlan->oDist;
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan for planTX failed" ) );
+
+ //create second row plan
+ //x!=y: tmp->tmp, x=y case: In->In or Out->Out
+ //if Transposed result is a choice x!=y: tmp->In or out
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
+ _T( "CreateDefaultPlan for planY failed" ) );
+
+ FFTPlan* colPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+ if (xyflag)
+ {
+ colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colPlan->inStride[0] = 1;
+ colPlan->inStride.push_back(clLengths[1]);
+ colPlan->iDist = clLengths[0] * clLengths[1];
+
+ if (fftPlan->transposed == CLFFT_NOTRANSPOSE)
+ {
+ colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colPlan->outStride[0] = 1;
+ colPlan->outStride.push_back(clLengths[1]);
+ colPlan->oDist = clLengths[0] * clLengths[1];
+ colPlan->placeness = CLFFT_INPLACE;
+ }
+ else
+ {
+ colPlan->outputLayout = fftPlan->outputLayout;
+ colPlan->outStride[0] = fftPlan->outStride[0];
+ colPlan->outStride.push_back(clLengths[1] * fftPlan->outStride[0]);
+ colPlan->oDist = fftPlan->oDist;
+ colPlan->placeness = CLFFT_OUTOFPLACE;
+ }
+ }
+ else
+ {
+ colPlan->inputLayout = fftPlan->outputLayout;
+ colPlan->outputLayout = fftPlan->outputLayout;
+ colPlan->outStride[0] = fftPlan->outStride[0];
+ colPlan->outStride.push_back(fftPlan->outStride[1]);
+ colPlan->oDist = fftPlan->oDist;
+ colPlan->inStride[0] = fftPlan->outStride[0];
+ colPlan->inStride.push_back(fftPlan->outStride[1]);
+ colPlan->iDist = fftPlan->oDist;
+ colPlan->placeness = CLFFT_INPLACE;
+ }
+
+ colPlan->precision = fftPlan->precision;
+ colPlan->forwardScale = fftPlan->forwardScale;
+ colPlan->backwardScale = fftPlan->backwardScale;
+ colPlan->tmpBufSize = 0;
+ colPlan->bLdsComplex = fftPlan->bLdsComplex;
+ colPlan->uLdsFraction = fftPlan->uLdsFraction;
+ colPlan->ldsPadding = fftPlan->ldsPadding;
+ colPlan->gen = fftPlan->gen;
+ colPlan->envelope = fftPlan->envelope;
+ colPlan->batchsize = fftPlan->batchsize;
+ colPlan->length.push_back(fftPlan->length[0]);
+
+ OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan for planY failed" ) );
+
+ if (fftPlan->transposed == CLFFT_TRANSPOSED)
+ {
+ fftPlan->baked = true;
+ return CLFFT_SUCCESS;
+ }
+
+ //Create transpose plan for second transpose
+ //x!=y case tmp->In or Out, x=y case In->In or Out->out
+ clLengths[0] = fftPlan->length[1];
+ clLengths[1] = fftPlan->length[0];
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTY, fftPlan->context, CLFFT_2D, clLengths ),
+ _T( "CreateDefaultPlan for planTY failed" ) );
+
+ FFTPlan* transPlanY = NULL;
+ lockRAII* transLockY = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planTY, transPlanY, transLockY ), _T( "fftRepo.getPlan failed" ) );
+
+ if (xyflag)
+ {
+ transPlanY->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ transPlanY->placeness = CLFFT_OUTOFPLACE;
+ transPlanY->inStride[0] = 1;
+ transPlanY->inStride[1] = clLengths[0];
+ transPlanY->iDist = clLengths[0] * clLengths[1];
+ }
+ else
+ {
+ transPlanY->inputLayout = fftPlan->outputLayout;
+ transPlanY->placeness = CLFFT_INPLACE;
+ transPlanY->inStride[0] = fftPlan->outStride[0];
+ transPlanY->inStride[1] = fftPlan->outStride[1];
+ transPlanY->iDist = fftPlan->oDist;
+ }
+ transPlanY->outputLayout = fftPlan->outputLayout;
+ transPlanY->outStride[0] = fftPlan->outStride[0];
+ transPlanY->outStride[1] = fftPlan->outStride[1];
+ transPlanY->oDist = fftPlan->oDist;
+ transPlanY->precision = fftPlan->precision;
+ transPlanY->tmpBufSize = 0;
+ transPlanY->gen = Transpose;
+ transPlanY->envelope = fftPlan->envelope;
+ transPlanY->batchsize = fftPlan->batchsize;
+ transPlanY->transflag = true;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan for planTY failed" ) );
+
+ fftPlan->baked = true;
+ return CLFFT_SUCCESS;
+ }
+
+ //check transposed
+ if (fftPlan->transposed != CLFFT_NOTRANSPOSE)
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+
+
+ if(fftPlan->inputLayout == CLFFT_REAL)
+ {
+ length0 = fftPlan->length[0];
+ length1 = fftPlan->length[1];
+
+ size_t Nt = (1 + length0/2);
+ if (fftPlan->tmpBufSize==0)
+ {
+ fftPlan->tmpBufSize = Nt * length1 * fftPlan->batchsize * fftPlan->ElementSize();
+ if(fftPlan->length.size() > 2) fftPlan->tmpBufSize *= fftPlan->length[2];
+ }
+
+ // create row plan
+ // real to hermitian
+
+ //create row plan
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
+ _T( "CreateDefaultPlan for planX failed" ) );
+
+ FFTPlan* rowPlan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+
+ rowPlan->outputLayout = fftPlan->outputLayout;
+ rowPlan->inputLayout = fftPlan->inputLayout;
+ rowPlan->placeness = fftPlan->placeness;
+ rowPlan->length.push_back(length1);
+
+ rowPlan->inStride[0] = fftPlan->inStride[0];
+ rowPlan->inStride.push_back(fftPlan->inStride[1]);
+ rowPlan->iDist = fftPlan->iDist;
+
+ rowPlan->precision = fftPlan->precision;
+ rowPlan->forwardScale = 1.0f;
+ rowPlan->backwardScale = 1.0f;
+ rowPlan->tmpBufSize = fftPlan->tmpBufSize;
+ rowPlan->bLdsComplex = fftPlan->bLdsComplex;
+ rowPlan->uLdsFraction = fftPlan->uLdsFraction;
+ rowPlan->ldsPadding = fftPlan->ldsPadding;
+ rowPlan->gen = fftPlan->gen;
+ rowPlan->envelope = fftPlan->envelope;
+
+ rowPlan->batchsize = fftPlan->batchsize;
+
+ rowPlan->outStride[0] = fftPlan->outStride[0];
+ rowPlan->outStride.push_back(fftPlan->outStride[1]);
+ rowPlan->oDist = fftPlan->oDist;
+
+ //this 2d is decomposed from 3d
+ if (fftPlan->length.size()>2)
+ {
+ rowPlan->length.push_back(fftPlan->length[2]);
+ rowPlan->inStride.push_back(fftPlan->inStride[2]);
+ rowPlan->outStride.push_back(fftPlan->outStride[2]);
+ }
+
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
+
+ // create col plan
+ // complex to complex
+
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
+ _T( "CreateDefaultPlan for planY failed" ) );
+
+ FFTPlan* colPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+ switch(fftPlan->outputLayout)
+ {
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ {
+ colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ }
+ break;
+ case CLFFT_HERMITIAN_PLANAR:
+ {
+ colPlan->outputLayout = CLFFT_COMPLEX_PLANAR;
+ colPlan->inputLayout = CLFFT_COMPLEX_PLANAR;
+ }
+ break;
+ default: assert(false);
+ }
+
+ colPlan->placeness = CLFFT_INPLACE;
+ colPlan->length.push_back(Nt);
+
+ colPlan->outStride[0] = fftPlan->outStride[1];
+ colPlan->outStride.push_back(fftPlan->outStride[0]);
+ colPlan->oDist = fftPlan->oDist;
+
+
+ colPlan->precision = fftPlan->precision;
+ colPlan->forwardScale = fftPlan->forwardScale;
+ colPlan->backwardScale = fftPlan->backwardScale;
+ colPlan->tmpBufSize = fftPlan->tmpBufSize;
+ colPlan->bLdsComplex = fftPlan->bLdsComplex;
+ colPlan->uLdsFraction = fftPlan->uLdsFraction;
+ colPlan->ldsPadding = fftPlan->ldsPadding;
+ colPlan->gen = fftPlan->gen;
+ colPlan->envelope = fftPlan->envelope;
+
+ colPlan->batchsize = fftPlan->batchsize;
+
+ colPlan->inStride[0] = rowPlan->outStride[1];
+ colPlan->inStride.push_back(rowPlan->outStride[0]);
+ colPlan->iDist = rowPlan->oDist;
+
+ //this 2d is decomposed from 3d
+ if (fftPlan->length.size()>2)
+ {
+ colPlan->length.push_back(fftPlan->length[2]);
+ colPlan->outStride.push_back(fftPlan->outStride[2]);
+ colPlan->inStride.push_back(rowPlan->outStride[2]);
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
+
+ }
+ else if(fftPlan->outputLayout == CLFFT_REAL)
+ {
+ length0 = fftPlan->length[0];
+ length1 = fftPlan->length[1];
+
+ size_t Nt = (1 + length0/2);
+ if (fftPlan->tmpBufSize==0)
+ {
+ fftPlan->tmpBufSize = Nt * length1 * fftPlan->batchsize * fftPlan->ElementSize();
+ if(fftPlan->length.size() > 2) fftPlan->tmpBufSize *= fftPlan->length[2];
+ }
+
+ // create col plan
+ // complex to complex
+
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
+ _T( "CreateDefaultPlan for planY failed" ) );
+
+ FFTPlan* colPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+ colPlan->length.push_back(Nt);
+
+ colPlan->inStride[0] = fftPlan->inStride[1];
+ colPlan->inStride.push_back(fftPlan->inStride[0]);
+ colPlan->iDist = fftPlan->iDist;
+
+
+ //this 2d is decomposed from 3d
+ if (fftPlan->length.size()>2)
+ {
+ colPlan->placeness = CLFFT_INPLACE;
+
+ colPlan->length.push_back(fftPlan->length[2]);
+ colPlan->inStride.push_back(fftPlan->inStride[2]);
+ colPlan->outStride[0] = colPlan->inStride[0];
+ colPlan->outStride.push_back(colPlan->inStride[1]);
+ colPlan->outStride.push_back(colPlan->inStride[2]);
+ colPlan->oDist = fftPlan->iDist;
+ }
+ else
+ {
+ colPlan->placeness = CLFFT_OUTOFPLACE;
+
+ colPlan->outStride[0] = Nt;
+ colPlan->outStride.push_back(1);
+ colPlan->oDist = Nt*length1;
+ }
+
+
+ switch(fftPlan->inputLayout)
+ {
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ {
+ colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ }
+ break;
+ case CLFFT_HERMITIAN_PLANAR:
+ {
+ colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colPlan->inputLayout = CLFFT_COMPLEX_PLANAR;
+ }
+ break;
+ default: assert(false);
+ }
+
+
+ colPlan->precision = fftPlan->precision;
+ colPlan->forwardScale = 1.0f;
+ colPlan->backwardScale = 1.0f;
+ colPlan->tmpBufSize = fftPlan->tmpBufSize;
+ colPlan->bLdsComplex = fftPlan->bLdsComplex;
+ colPlan->uLdsFraction = fftPlan->uLdsFraction;
+ colPlan->ldsPadding = fftPlan->ldsPadding;
+ colPlan->gen = fftPlan->gen;
+ colPlan->envelope = fftPlan->envelope;
+
+ colPlan->batchsize = fftPlan->batchsize;
+
+ if ((fftPlan->tmpBufSizeC2R==0) && (length1 > Large1DThreshold) && (fftPlan->length.size()<=2))
+ {
+ fftPlan->tmpBufSizeC2R = Nt * length1 * fftPlan->batchsize * fftPlan->ElementSize();
+ if(fftPlan->length.size() > 2) fftPlan->tmpBufSizeC2R *= fftPlan->length[2];
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
+
+ // create row plan
+ // hermitian to real
+
+ //create row plan
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
+ _T( "CreateDefaultPlan for planX failed" ) );
+
+ FFTPlan* rowPlan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+
+ rowPlan->outputLayout = fftPlan->outputLayout;
+ rowPlan->inputLayout = CLFFT_HERMITIAN_INTERLEAVED;
+ rowPlan->placeness = CLFFT_OUTOFPLACE;
+ rowPlan->length.push_back(length1);
+
+ rowPlan->inStride[0] = 1;
+ rowPlan->inStride.push_back(Nt);
+ rowPlan->iDist = colPlan->oDist;
+
+ rowPlan->precision = fftPlan->precision;
+ rowPlan->forwardScale = fftPlan->forwardScale;
+ rowPlan->backwardScale = fftPlan->backwardScale;
+ rowPlan->tmpBufSize = fftPlan->tmpBufSize;
+ rowPlan->bLdsComplex = fftPlan->bLdsComplex;
+ rowPlan->uLdsFraction = fftPlan->uLdsFraction;
+ rowPlan->ldsPadding = fftPlan->ldsPadding;
+ rowPlan->gen = fftPlan->gen;
+ rowPlan->envelope = fftPlan->envelope;
+
+ rowPlan->batchsize = fftPlan->batchsize;
+
+ rowPlan->outStride[0] = fftPlan->outStride[0];
+ rowPlan->outStride.push_back(fftPlan->outStride[1]);
+ rowPlan->oDist = fftPlan->oDist;
+
+ //this 2d is decomposed from 3d
+ if (fftPlan->length.size()>2)
+ {
+ rowPlan->length.push_back(fftPlan->length[2]);
+ rowPlan->inStride.push_back(Nt*length1);
+ rowPlan->outStride.push_back(fftPlan->outStride[2]);
+ }
+
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
+ }
+ else
+ {
+ if (fftPlan->tmpBufSize==0 && fftPlan->length.size()<=2)
+ {
+ fftPlan->tmpBufSize = length0 * length1 *
+ fftPlan->batchsize * fftPlan->ElementSize();
+ }
+
+ //create row plan
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
+ _T( "CreateDefaultPlan for planX failed" ) );
+
+ FFTPlan* rowPlan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+ rowPlan->inputLayout = fftPlan->inputLayout;
+ if (fftPlan->large2D || fftPlan->length.size()>2)
+ {
+ rowPlan->outputLayout = fftPlan->outputLayout;
+ rowPlan->placeness = fftPlan->placeness;
+ rowPlan->outStride[0] = fftPlan->outStride[0];
+ rowPlan->outStride.push_back(fftPlan->outStride[1]);
+ rowPlan->oDist = fftPlan->oDist;
+ }
+ else
+ {
+ rowPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ rowPlan->placeness = CLFFT_OUTOFPLACE;
+ rowPlan->outStride[0] = length1;//1;
+ rowPlan->outStride.push_back(1);//length0);
+ rowPlan->oDist = length0 * length1;
+ }
+ rowPlan->precision = fftPlan->precision;
+ rowPlan->forwardScale = 1.0f;
+ rowPlan->backwardScale = 1.0f;
+ rowPlan->tmpBufSize = fftPlan->tmpBufSize;
+ rowPlan->bLdsComplex = fftPlan->bLdsComplex;
+ rowPlan->uLdsFraction = fftPlan->uLdsFraction;
+ rowPlan->ldsPadding = fftPlan->ldsPadding;
+ rowPlan->gen = fftPlan->gen;
+ rowPlan->envelope = fftPlan->envelope;
+
+ // This is the row fft, the first elements distance between the first two FFTs is the distance of the first elements
+ // of the first two rows in the original buffer.
+ rowPlan->batchsize = fftPlan->batchsize;
+ rowPlan->inStride[0] = fftPlan->inStride[0];
+
+ //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
+ rowPlan->length.push_back(fftPlan->length[1]);
+ rowPlan->inStride.push_back(fftPlan->inStride[1]);
+
+ //this 2d is decomposed from 3d
+ if (fftPlan->length.size()>2)
+ {
+ rowPlan->length.push_back(fftPlan->length[2]);
+ rowPlan->inStride.push_back(fftPlan->inStride[2]);
+ rowPlan->outStride.push_back(fftPlan->outStride[2]);
+ }
+
+ rowPlan->iDist = fftPlan->iDist;
+
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
+
+ //create col plan
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
+ _T( "CreateDefaultPlan for planY failed" ) );
+
+ FFTPlan* colPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+ if (fftPlan->large2D || fftPlan->length.size()>2)
+ {
+ colPlan->inputLayout = fftPlan->outputLayout;
+ colPlan->placeness = CLFFT_INPLACE;
+ colPlan->inStride[0] = fftPlan->outStride[1];
+ colPlan->inStride.push_back(fftPlan->outStride[0]);
+ colPlan->iDist = fftPlan->oDist;
+ }
+ else
+ {
+ colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colPlan->placeness = CLFFT_OUTOFPLACE;
+ colPlan->inStride[0] = 1;//length0;
+ colPlan->inStride.push_back(length1);//1);
+ colPlan->iDist = length0 * length1;
+ }
+
+ colPlan->outputLayout = fftPlan->outputLayout;
+ colPlan->precision = fftPlan->precision;
+ colPlan->forwardScale = fftPlan->forwardScale;
+ colPlan->backwardScale = fftPlan->backwardScale;
+ colPlan->tmpBufSize = fftPlan->tmpBufSize;
+ colPlan->bLdsComplex = fftPlan->bLdsComplex;
+ colPlan->uLdsFraction = fftPlan->uLdsFraction;
+ colPlan->ldsPadding = fftPlan->ldsPadding;
+ colPlan->gen = fftPlan->gen;
+ colPlan->envelope = fftPlan->envelope;
+
+ // This is a column FFT, the first elements distance between each FFT is the distance of the first two
+ // elements in the original buffer. Like a transpose of the matrix
+ colPlan->batchsize = fftPlan->batchsize;
+ colPlan->outStride[0] = fftPlan->outStride[1];
+
+ //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
+ colPlan->length.push_back(fftPlan->length[0]);
+ colPlan->outStride.push_back(fftPlan->outStride[0]);
+ colPlan->oDist = fftPlan->oDist;
+
+ //this 2d is decomposed from 3d
+ if (fftPlan->length.size()>2)
+ {
+ //assert(fftPlan->large2D);
+ colPlan->length.push_back(fftPlan->length[2]);
+ colPlan->inStride.push_back(fftPlan->outStride[2]);
+ colPlan->outStride.push_back(fftPlan->outStride[2]);
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
+ }
+
+ fftPlan->baked = true;
+ return CLFFT_SUCCESS;
+ }
+ case CLFFT_3D:
+ {
+ if(fftPlan->inputLayout == CLFFT_REAL)
+ {
+ size_t clLengths[] = { 1, 1, 0 };
+ clLengths[0] = fftPlan->length[ DimX ];
+ clLengths[1] = fftPlan->length[ DimY ];
+
+ //create 2D xy plan
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_2D, clLengths ),
+ _T( "CreateDefaultPlan 2D planX failed" ) );
+
+ FFTPlan* xyPlan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, xyPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+ xyPlan->inputLayout = fftPlan->inputLayout;
+ xyPlan->outputLayout = fftPlan->outputLayout;
+ xyPlan->placeness = fftPlan->placeness;
+ xyPlan->precision = fftPlan->precision;
+ xyPlan->forwardScale = 1.0f;
+ xyPlan->backwardScale = 1.0f;
+ xyPlan->tmpBufSize = fftPlan->tmpBufSize;
+ xyPlan->bLdsComplex = fftPlan->bLdsComplex;
+ xyPlan->uLdsFraction = fftPlan->uLdsFraction;
+ xyPlan->ldsPadding = fftPlan->ldsPadding;
+ xyPlan->gen = fftPlan->gen;
+ xyPlan->envelope = fftPlan->envelope;
+
+ // This is the xy fft, the first elements distance between the first two FFTs is the distance of the first elements
+ // of the first two rows in the original buffer.
+ xyPlan->batchsize = fftPlan->batchsize;
+ xyPlan->inStride[0] = fftPlan->inStride[0];
+ xyPlan->inStride[1] = fftPlan->inStride[1];
+ xyPlan->outStride[0] = fftPlan->outStride[0];
+ xyPlan->outStride[1] = fftPlan->outStride[1];
+
+ //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
+ xyPlan->length.push_back(fftPlan->length[2]);
+ xyPlan->inStride.push_back(fftPlan->inStride[2]);
+ xyPlan->outStride.push_back(fftPlan->outStride[2]);
+ xyPlan->iDist = fftPlan->iDist;
+ xyPlan->oDist = fftPlan->oDist;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->2D planX failed" ) );
+
+ clLengths[0] = fftPlan->length[ DimZ ];
+ clLengths[1] = clLengths[2] = 0;
+ //create 1D col plan
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planZ, fftPlan->context, CLFFT_1D, clLengths ),
+ _T( "CreateDefaultPlan for planZ failed" ) );
+
+ FFTPlan* colPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+ switch(fftPlan->outputLayout)
+ {
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ {
+ colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ }
+ break;
+ case CLFFT_HERMITIAN_PLANAR:
+ {
+ colPlan->outputLayout = CLFFT_COMPLEX_PLANAR;
+ colPlan->inputLayout = CLFFT_COMPLEX_PLANAR;
+ }
+ break;
+ default: assert(false);
+ }
+
+ colPlan->placeness = CLFFT_INPLACE;
+ colPlan->precision = fftPlan->precision;
+ colPlan->forwardScale = fftPlan->forwardScale;
+ colPlan->backwardScale = fftPlan->backwardScale;
+ colPlan->tmpBufSize = fftPlan->tmpBufSize;
+ colPlan->bLdsComplex = fftPlan->bLdsComplex;
+ colPlan->uLdsFraction = fftPlan->uLdsFraction;
+ colPlan->ldsPadding = fftPlan->ldsPadding;
+ colPlan->gen = fftPlan->gen;
+ colPlan->envelope = fftPlan->envelope;
+
+ // This is a column FFT, the first elements distance between each FFT is the distance of the first two
+ // elements in the original buffer. Like a transpose of the matrix
+ colPlan->batchsize = fftPlan->batchsize;
+ colPlan->inStride[0] = fftPlan->outStride[2];
+ colPlan->outStride[0] = fftPlan->outStride[2];
+
+ //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
+ colPlan->length.push_back(1 + fftPlan->length[0]/2);
+ colPlan->length.push_back(fftPlan->length[1]);
+ colPlan->inStride.push_back(fftPlan->outStride[0]);
+ colPlan->inStride.push_back(fftPlan->outStride[1]);
+ colPlan->outStride.push_back(fftPlan->outStride[0]);
+ colPlan->outStride.push_back(fftPlan->outStride[1]);
+ colPlan->iDist = fftPlan->oDist;
+ colPlan->oDist = fftPlan->oDist;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->1D planZ failed" ) );
+ }
+ else if(fftPlan->outputLayout == CLFFT_REAL)
+ {
+ if (fftPlan->tmpBufSize == 0)
+ {
+ fftPlan->tmpBufSize = fftPlan->length[2] * fftPlan->length[1] * (1 + fftPlan->length[0]/2);
+ fftPlan->tmpBufSize *= fftPlan->batchsize * fftPlan->ElementSize();
+ }
+
+ size_t clLengths[] = { 1, 1, 0 };
+
+ clLengths[0] = fftPlan->length[ DimZ ];
+ clLengths[1] = clLengths[2] = 0;
+
+ //create 1D col plan
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planZ, fftPlan->context, CLFFT_1D, clLengths ),
+ _T( "CreateDefaultPlan for planZ failed" ) );
+
+ FFTPlan* colPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+ switch(fftPlan->inputLayout)
+ {
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ {
+ colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ }
+ break;
+ case CLFFT_HERMITIAN_PLANAR:
+ {
+ colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colPlan->inputLayout = CLFFT_COMPLEX_PLANAR;
+ }
+ break;
+ default: assert(false);
+ }
+
+ colPlan->placeness = CLFFT_OUTOFPLACE;
+
+ colPlan->precision = fftPlan->precision;
+ colPlan->forwardScale = 1.0f;
+ colPlan->backwardScale = 1.0f;
+ colPlan->tmpBufSize = fftPlan->tmpBufSize;
+ colPlan->bLdsComplex = fftPlan->bLdsComplex;
+ colPlan->uLdsFraction = fftPlan->uLdsFraction;
+ colPlan->ldsPadding = fftPlan->ldsPadding;
+ colPlan->gen = fftPlan->gen;
+ colPlan->envelope = fftPlan->envelope;
+
+ // This is a column FFT, the first elements distance between each FFT is the distance of the first two
+ // elements in the original buffer. Like a transpose of the matrix
+ colPlan->batchsize = fftPlan->batchsize;
+ colPlan->inStride[0] = fftPlan->inStride[2];
+ colPlan->outStride[0] = fftPlan->length[1] * (1 + fftPlan->length[0]/2);
+
+ //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
+ colPlan->length.push_back(1 + fftPlan->length[0]/2);
+ colPlan->length.push_back(fftPlan->length[1]);
+ colPlan->inStride.push_back(fftPlan->inStride[0]);
+ colPlan->inStride.push_back(fftPlan->inStride[1]);
+ colPlan->outStride.push_back(1);
+ colPlan->outStride.push_back(1 + fftPlan->length[0]/2);
+ colPlan->iDist = fftPlan->iDist;
+ colPlan->oDist = fftPlan->length[2] * fftPlan->length[1] * (1 + fftPlan->length[0]/2);
+
+ if ((fftPlan->tmpBufSizeC2R==0) && ((fftPlan->length[2] > Large1DThreshold) || (fftPlan->length[1] > Large1DThreshold)))
+ {
+ fftPlan->tmpBufSizeC2R = (1 + fftPlan->length[0]/2) * (fftPlan->length[1]) * (fftPlan->length[2]) *
+ fftPlan->batchsize * fftPlan->ElementSize();
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->1D planZ failed" ) );
+
+ clLengths[0] = fftPlan->length[ DimX ];
+ clLengths[1] = fftPlan->length[ DimY ];
+
+ //create 2D xy plan
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_2D, clLengths ),
+ _T( "CreateDefaultPlan 2D planX failed" ) );
+
+ FFTPlan* xyPlan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, xyPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+ xyPlan->inputLayout = CLFFT_HERMITIAN_INTERLEAVED;
+ xyPlan->outputLayout = fftPlan->outputLayout;
+
+ xyPlan->placeness = CLFFT_OUTOFPLACE;
+
+ xyPlan->precision = fftPlan->precision;
+ xyPlan->forwardScale = fftPlan->forwardScale;
+ xyPlan->backwardScale = fftPlan->backwardScale;
+ xyPlan->tmpBufSize = fftPlan->tmpBufSize;
+ xyPlan->bLdsComplex = fftPlan->bLdsComplex;
+ xyPlan->uLdsFraction = fftPlan->uLdsFraction;
+ xyPlan->ldsPadding = fftPlan->ldsPadding;
+ xyPlan->gen = fftPlan->gen;
+ xyPlan->envelope = fftPlan->envelope;
+
+ // This is the xy fft, the first elements distance between the first two FFTs is the distance of the first elements
+ // of the first two rows in the original buffer.
+ xyPlan->batchsize = fftPlan->batchsize;
+ xyPlan->inStride[0] = 1;
+ xyPlan->inStride[1] = (1 + fftPlan->length[0]/2);
+ xyPlan->outStride[0] = fftPlan->outStride[0];
+ xyPlan->outStride[1] = fftPlan->outStride[1];
+
+ //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
+ xyPlan->length.push_back(fftPlan->length[2]);
+ xyPlan->inStride.push_back(fftPlan->length[1] * (1 + fftPlan->length[0]/2));
+ xyPlan->outStride.push_back(fftPlan->outStride[2]);
+ xyPlan->iDist = colPlan->oDist;
+ xyPlan->oDist = fftPlan->oDist;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->2D planX failed" ) );
+ }
+ else
+ {
+ if (fftPlan->tmpBufSize==0 && (
+ fftPlan->length[0] > Large1DThreshold ||
+ fftPlan->length[1] > Large1DThreshold ||
+ fftPlan->length[2] > Large1DThreshold
+ ))
+ {
+ fftPlan->tmpBufSize = fftPlan->length[0] * fftPlan->length[1] * fftPlan->length[2] *
+ fftPlan->batchsize * fftPlan->ElementSize();
+ }
+
+ size_t clLengths[] = { 1, 1, 0 };
+ clLengths[0] = fftPlan->length[ DimX ];
+ clLengths[1] = fftPlan->length[ DimY ];
+
+ //create 2D xy plan
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_2D, clLengths ),
+ _T( "CreateDefaultPlan 2D planX failed" ) );
+
+ FFTPlan* xyPlan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, xyPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+ xyPlan->inputLayout = fftPlan->inputLayout;
+ xyPlan->outputLayout = fftPlan->outputLayout;
+ xyPlan->placeness = fftPlan->placeness;
+ xyPlan->precision = fftPlan->precision;
+ xyPlan->forwardScale = 1.0f;
+ xyPlan->backwardScale = 1.0f;
+ xyPlan->tmpBufSize = fftPlan->tmpBufSize;
+ xyPlan->bLdsComplex = fftPlan->bLdsComplex;
+ xyPlan->uLdsFraction = fftPlan->uLdsFraction;
+ xyPlan->ldsPadding = fftPlan->ldsPadding;
+ xyPlan->gen = fftPlan->gen;
+ xyPlan->envelope = fftPlan->envelope;
+
+ // This is the xy fft, the first elements distance between the first two FFTs is the distance of the first elements
+ // of the first two rows in the original buffer.
+ xyPlan->batchsize = fftPlan->batchsize;
+ xyPlan->inStride[0] = fftPlan->inStride[0];
+ xyPlan->inStride[1] = fftPlan->inStride[1];
+ xyPlan->outStride[0] = fftPlan->outStride[0];
+ xyPlan->outStride[1] = fftPlan->outStride[1];
+
+ //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
+ xyPlan->length.push_back(fftPlan->length[2]);
+ xyPlan->inStride.push_back(fftPlan->inStride[2]);
+ xyPlan->outStride.push_back(fftPlan->outStride[2]);
+ xyPlan->iDist = fftPlan->iDist;
+ xyPlan->oDist = fftPlan->oDist;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->2D planX failed" ) );
+
+ clLengths[0] = fftPlan->length[ DimZ ];
+ clLengths[1] = clLengths[2] = 0;
+ //create 1D col plan
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planZ, fftPlan->context, CLFFT_1D, clLengths ),
+ _T( "CreateDefaultPlan for planZ failed" ) );
+
+ FFTPlan* colPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+ colPlan->inputLayout = fftPlan->outputLayout;
+ colPlan->outputLayout = fftPlan->outputLayout;
+ colPlan->placeness = CLFFT_INPLACE;
+ colPlan->precision = fftPlan->precision;
+ colPlan->forwardScale = fftPlan->forwardScale;
+ colPlan->backwardScale = fftPlan->backwardScale;
+ colPlan->tmpBufSize = fftPlan->tmpBufSize;
+ colPlan->bLdsComplex = fftPlan->bLdsComplex;
+ colPlan->uLdsFraction = fftPlan->uLdsFraction;
+ colPlan->ldsPadding = fftPlan->ldsPadding;
+ colPlan->gen = fftPlan->gen;
+ colPlan->envelope = fftPlan->envelope;
+
+ // This is a column FFT, the first elements distance between each FFT is the distance of the first two
+ // elements in the original buffer. Like a transpose of the matrix
+ colPlan->batchsize = fftPlan->batchsize;
+ colPlan->inStride[0] = fftPlan->outStride[2];
+ colPlan->outStride[0] = fftPlan->outStride[2];
+
+ //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
+ colPlan->length.push_back(fftPlan->length[0]);
+ colPlan->length.push_back(fftPlan->length[1]);
+ colPlan->inStride.push_back(fftPlan->outStride[0]);
+ colPlan->inStride.push_back(fftPlan->outStride[1]);
+ colPlan->outStride.push_back(fftPlan->outStride[0]);
+ colPlan->outStride.push_back(fftPlan->outStride[1]);
+ colPlan->iDist = fftPlan->oDist;
+ colPlan->oDist = fftPlan->oDist;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->1D planZ failed" ) );
+ }
+
+ fftPlan->baked = true;
+ return CLFFT_SUCCESS;
+ }
+ }
+
+ // For the radices that we have factored, we need to load/compile and build the appropriate OpenCL kernels
+ OPENCL_V( fftPlan->GenerateKernel( fftRepo ), _T( "GenerateKernel() failed" ) );
+
+ // For the radices that we have factored, we need to load/compile and build the appropriate OpenCL kernels
+ OPENCL_V( CompileKernels( *commQueueFFT, plHandle, fftPlan->gen, fftPlan ), _T( "CompileKernels() failed" ) );
+
+ // Allocate resources
+ OPENCL_V( fftPlan->AllocateBuffers (), _T("AllocateBuffers() failed"));
+
+ fftPlan->ConstructAndEnqueueConstantBuffers( commQueueFFT );
+
+ // Record that we baked the plan
+ fftPlan->baked = true;
+
+ return CLFFT_SUCCESS;
+}
+
+/*
+ * \brief Create a new, unbaked plan that mirrors an existing plan's configuration
+ * A default plan is created in new_context from the source plan's dimension and
+ * lengths, and its handle is stored in *out_plHandle.  The generator, envelope,
+ * layout, placeness, precision, scale, stride, distance and batch settings are
+ * then copied from the source plan.  Both repo lookups and the plan creation are
+ * checked through OPENCL_V.
+ */
+clfftStatus clfftCopyPlan( clfftPlanHandle* out_plHandle, cl_context new_context, clfftPlanHandle in_plHandle )
+{
+    FFTRepo& fftRepo = FFTRepo::getInstance( );
+
+    // Look up the source plan first; its dimension and lengths seed the new plan
+    FFTPlan* in_fftPlan = NULL;
+    lockRAII* in_planLock = NULL;
+    OPENCL_V( fftRepo.getPlan( in_plHandle, in_fftPlan, in_planLock ), _T( "fftRepo.getPlan failed" ) );
+
+    OPENCL_V( clfftCreateDefaultPlan( out_plHandle, new_context, in_fftPlan->dim, &in_fftPlan->length[ 0 ] ),
+        _T( "clfftCreateDefaultPlan failed" ) );
+
+    FFTPlan* out_fftPlan = NULL;
+    lockRAII* out_planLock = NULL;
+    OPENCL_V( fftRepo.getPlan( *out_plHandle, out_fftPlan, out_planLock ), _T( "fftRepo.getPlan failed" ) );
+
+    // Take the source plan's lock before reading its settings
+    scopedLock sLock( *in_planLock, _T( "clfftCopyPlan" ) );
+
+    const FFTPlan& src = *in_fftPlan;
+    FFTPlan& dst = *out_fftPlan;
+
+    // The copy always starts out unbaked, whatever the state of the source
+    dst.baked = false;
+
+    // Generator selection and transform description
+    dst.gen = src.gen;
+    dst.envelope = src.envelope;
+    dst.dim = src.dim;
+    dst.precision = src.precision;
+    dst.placeness = src.placeness;
+    dst.inputLayout = src.inputLayout;
+    dst.outputLayout = src.outputLayout;
+
+    // Scale factors
+    dst.forwardScale = src.forwardScale;
+    dst.backwardScale = src.backwardScale;
+
+    // Geometry: lengths, strides, distances and batch count
+    dst.length = src.length;
+    dst.inStride = src.inStride;
+    dst.outStride = src.outStride;
+    dst.iDist = src.iDist;
+    dst.oDist = src.oDist;
+    dst.batchsize = src.batchsize;
+
+    return CLFFT_SUCCESS;
+}
+
+/*
+ * \brief Build the constant-buffer table and upload it to the device
+ * Entries NY..N5 receive the lengths of the dimensions above X followed by the
+ * batch count; entries ISX..IS5 and OSX..OS5 receive the input and output strides
+ * followed by iDist and oDist.  The table is written into const_buffer with
+ * clEnqueueWriteBuffer on *commQueueFFT, and that call is checked through OPENCL_V.
+ */
+clfftStatus FFTPlan::ConstructAndEnqueueConstantBuffers( cl_command_queue* commQueueFFT )
+{
+    // Every slot that is not explicitly assigned below stays zero
+    cb_t ConstantBufferParams [CLFFT_CB_SIZE];
+    memset (& ConstantBufferParams, 0, sizeof (ConstantBufferParams));
+
+    // The batch count is clamped to at least 1.  The template argument is spelled out
+    // because std::max cannot deduce one type from an int and a cl_uint argument.
+    const cl_uint batchCount = std::max< cl_uint > (1, cl_uint (batchsize));
+
+    cl_uint nY = 1;
+    cl_uint nZ = 0;
+    cl_uint nW = 0;
+    cl_uint n5 = 0;
+
+    // The batch count occupies the slot right after the last real dimension.
+    // Any other number of dimensions leaves the defaults above untouched.
+    switch( length.size() )
+    {
+    case 1:
+        nY = batchCount;
+        break;
+
+    case 2:
+        nY = (cl_uint)length[DimY];
+        nZ = batchCount;
+        break;
+
+    case 3:
+        nY = (cl_uint)length[DimY];
+        nZ = (cl_uint)length[DimZ];
+        nW = batchCount;
+        break;
+
+    case 4:
+        nY = (cl_uint)length[DimY];
+        nZ = (cl_uint)length[DimZ];
+        nW = (cl_uint)length[DimW];
+        n5 = batchCount;
+        break;
+    }
+    ConstantBufferParams[CLFFT_CB_NY ].u = nY;
+    ConstantBufferParams[CLFFT_CB_NZ ].u = nZ;
+    ConstantBufferParams[CLFFT_CB_NW ].u = nW;
+    ConstantBufferParams[CLFFT_CB_N5 ].u = n5;
+
+    assert (inStride.size() == outStride.size());
+
+    // Input side: one slot per stride, then iDist in the next slot
+    switch (inStride.size()) {
+    case 1:
+        ConstantBufferParams[CLFFT_CB_ISX].u = cl_uint (inStride[0]);
+        ConstantBufferParams[CLFFT_CB_ISY].u = cl_uint (iDist);
+        break;
+
+    case 2:
+        ConstantBufferParams[CLFFT_CB_ISX].u = cl_uint (inStride[0]);
+        ConstantBufferParams[CLFFT_CB_ISY].u = cl_uint (inStride[1]);
+        ConstantBufferParams[CLFFT_CB_ISZ].u = cl_uint (iDist);
+        break;
+
+    case 3:
+        ConstantBufferParams[CLFFT_CB_ISX].u = cl_uint (inStride[0]);
+        ConstantBufferParams[CLFFT_CB_ISY].u = cl_uint (inStride[1]);
+        ConstantBufferParams[CLFFT_CB_ISZ].u = cl_uint (inStride[2]);
+        ConstantBufferParams[CLFFT_CB_ISW].u = cl_uint (iDist);
+        break;
+
+    case 4:
+        ConstantBufferParams[CLFFT_CB_ISX].u = cl_uint (inStride[0]);
+        ConstantBufferParams[CLFFT_CB_ISY].u = cl_uint (inStride[1]);
+        ConstantBufferParams[CLFFT_CB_ISZ].u = cl_uint (inStride[2]);
+        ConstantBufferParams[CLFFT_CB_ISW].u = cl_uint (inStride[3]);
+        ConstantBufferParams[CLFFT_CB_IS5].u = cl_uint (iDist);
+        break;
+    }
+
+    // Output side: same arrangement with outStride and oDist
+    switch (outStride.size()) {
+    case 1:
+        ConstantBufferParams[CLFFT_CB_OSX].u = cl_uint (outStride[0]);
+        ConstantBufferParams[CLFFT_CB_OSY].u = cl_uint (oDist);
+        break;
+
+    case 2:
+        ConstantBufferParams[CLFFT_CB_OSX].u = cl_uint (outStride[0]);
+        ConstantBufferParams[CLFFT_CB_OSY].u = cl_uint (outStride[1]);
+        ConstantBufferParams[CLFFT_CB_OSZ].u = cl_uint (oDist);
+        break;
+
+    case 3:
+        ConstantBufferParams[CLFFT_CB_OSX].u = cl_uint (outStride[0]);
+        ConstantBufferParams[CLFFT_CB_OSY].u = cl_uint (outStride[1]);
+        ConstantBufferParams[CLFFT_CB_OSZ].u = cl_uint (outStride[2]);
+        ConstantBufferParams[CLFFT_CB_OSW].u = cl_uint (oDist);
+        break;
+
+    case 4:
+        ConstantBufferParams[CLFFT_CB_OSX].u = cl_uint (outStride[0]);
+        ConstantBufferParams[CLFFT_CB_OSY].u = cl_uint (outStride[1]);
+        ConstantBufferParams[CLFFT_CB_OSZ].u = cl_uint (outStride[2]);
+        ConstantBufferParams[CLFFT_CB_OSW].u = cl_uint (outStride[3]);
+        ConstantBufferParams[CLFFT_CB_OS5].u = cl_uint (oDist);
+        break;
+    }
+
+    // The write is blocking: ConstantBufferParams is a local array, so the
+    // transfer has to be finished before this function returns.
+    OPENCL_V(clEnqueueWriteBuffer( *commQueueFFT,
+        const_buffer,
+        1, // TODO? non-blocking write?
+        0,
+        sizeof(ConstantBufferParams),
+        &ConstantBufferParams,
+        0,
+        NULL,
+        NULL), _T("clEnqueueWriteBuffer failed") );
+
+    return CLFFT_SUCCESS;
+}
+
+//TODO caching kernel binaries for later reload
+#if 0
+// One node of the flattened plan tree: the node's name ("plan", "planX", "planXY", ...)
+// paired with the handle of the plan it refers to.
+typedef std::pair< std::string, clfftPlanHandle > plan_tree_node_t;
+// The flattened tree itself, in the order make_plan_tree visits the plans.
+typedef std::vector< plan_tree_node_t > plan_tree_t;
+
+/*
+ * \brief Flatten a plan and its sub-plans into a list of (name, handle) nodes
+ * Appends (name, handle) to tree, then recurses into each non-zero planX, planY
+ * and planZ child in that order, naming each child by appending "X", "Y" or "Z"
+ * to name.  Throws a string literal when the repo lookup does not succeed.
+ */
+void make_plan_tree( plan_tree_t & tree, std::string name, clfftPlanHandle handle )
+{
+    // The node is recorded before its handle is looked up
+    tree.push_back( plan_tree_node_t(name, handle) );
+
+    FFTPlan* plan = NULL;
+    lockRAII* lock = NULL;
+    if( FFTRepo::getInstance().getPlan( handle, plan, lock ) != CLFFT_SUCCESS )
+    {
+        throw( "make_plan_tree failure: repo.getPlan" );
+    }
+
+    const char* const axisSuffix[] = { "X", "Y", "Z" };
+    for( int axis = 0; axis < 3; ++axis )
+    {
+        // Each child handle is read only when its turn comes, after any earlier
+        // sibling subtree has been fully visited
+        const clfftPlanHandle child = ( axis == 0 ) ? plan->planX
+                                    : ( axis == 1 ) ? plan->planY
+                                    : plan->planZ;
+        if( child )
+        {
+            make_plan_tree( tree, name + axisSuffix[axis], child );
+        }
+    }
+}
+
+/*
+ * \brief Write a plan and all of its sub-plans to a file
+ * The plan tree is flattened with make_plan_tree and each node is emitted in that
+ * order as a run of space-separated "keyword value" fields.  Baked plans also get
+ * their device count and, when that count is positive, the per-device binary
+ * sizes and the binaries themselves.  The file is terminated with end_of_file.
+ * NOTE(review): the result of opening filename is not checked, so a file that
+ * cannot be opened is not reported by this function - confirm whether callers
+ * expect an error status here.
+ */
+clfftStatus clfftWritePlanToDisk( clfftPlanHandle plan_handle, const char* filename )
+{
+    plan_tree_t plan_tree;
+    make_plan_tree( plan_tree, "plan", plan_handle );
+
+    std::ofstream planfile;
+    planfile.open(filename, std::ios::binary);
+
+    FFTRepo& repo = FFTRepo::getInstance();
+
+    // Walk the flattened tree front to back
+    for( size_t n = 0; n < plan_tree.size(); ++n )
+    {
+        const plan_tree_node_t& node = plan_tree[n];
+
+        FFTPlan* plan = NULL;
+        lockRAII* lock = NULL;
+        OPENCL_V(repo.getPlan( node.second, plan, lock ), _T("getPlan failure"));
+
+        // print the name of the node (plan, planX, planXX, planY, plan XY, etc)
+        planfile << node.first << " ";
+
+        // dimensions must be listed first because clfftReadPlanFromDisk
+        // will need to use dimensions for reading in strides and such
+        planfile << "dimensions " << plan->dim << " " << plan->length.size();
+        for( size_t i = 0; i < plan->length.size(); ++i )
+        {
+            planfile << " " << plan->length[i];
+        }
+
+        planfile << " batch " << plan->batchsize;
+
+        planfile << " instride " << plan->inStride.size();
+        for( size_t i = 0; i < plan->inStride.size(); ++i )
+        {
+            planfile << " " << plan->inStride[i];
+        }
+
+        planfile << " outstride " << plan->outStride.size();
+        for( size_t i = 0; i < plan->outStride.size(); ++i )
+        {
+            planfile << " " << plan->outStride[i];
+        }
+
+        planfile << " in-out-distances " << plan->iDist << " " << plan->oDist;
+        planfile << " in-out-layouts " << plan->inputLayout << " " << plan->outputLayout;
+        planfile << " resultlocation " << plan->placeness;
+        planfile << " precision " << plan->precision;
+
+        // we need to stash scales as hex so that we don't have any roundoff error
+        // clfftReadPlanFromDisk will read the hex back in as float
+        planfile << " forwardscale " << float_as_hex(plan->forwardScale);
+        planfile << " backwardscale " << float_as_hex(plan->backwardScale);
+
+        planfile << " gen " << plan->gen;
+        planfile << " bLdsComplex " << plan->bLdsComplex;
+        planfile << " ldsPadding " << plan->ldsPadding;
+        planfile << " uLdsFraction " << plan->uLdsFraction;
+        planfile << " large1D_Xfactor " << plan->large1D_Xfactor;
+        planfile << " cacheSize " << plan->cacheSize;
+        planfile << " tmpBufSize " << plan->tmpBufSize;
+        planfile << " large1D " << plan->large1D;
+        planfile << " large2D " << plan->large2D;
+
+        // Only baked plans carry device binaries
+        if( plan->baked == true )
+        {
+            planfile << " number-of-devices " << plan->number_of_devices;
+
+            if( plan->number_of_devices > 0 )
+            {
+                planfile << " binary-sizes";
+                for( int i = 0; i < plan->number_of_devices; i++ )
+                {
+                    planfile << " " << plan->binary_sizes[i];
+                }
+
+                // Each binary is written raw, framed by the begin/end markers
+                planfile << " binaries ";
+                for( int i = 0; i < plan->number_of_devices; i++ )
+                {
+                    planfile << beginning_of_binary;
+                    planfile.write( plan->binaries[i].get(), plan->binary_sizes[i] );
+                    planfile << end_of_binary;
+                }
+            }
+            else
+            {
+                planfile << " ";
+            }
+        }
+    }
+
+    planfile << " " << end_of_file;
+    planfile.close();
+    return CLFFT_SUCCESS;
+}
+
+void FFTPlan::ResetBinarySizes() // (re)allocates the per-device binary size table
+{
+ binary_sizes.reset(new size_t[number_of_devices]); // fresh array with one slot per device; the slots are not filled in here
+}
+
+// Drops every stored binary and allocates one new, uninitialised char buffer per device.
+// Buffer i is binary_sizes[i] bytes long, so binary_sizes has to be filled in before this is called.
+void FFTPlan::ResetBinaries()
+{
+	binaries.clear();
+	for( size_t i = 0; i < number_of_devices; i++ )
+	{
+		// the element type must be spelled out: it cannot be deduced from a raw pointer,
+		// and the array form is needed so that delete[] is used on release
+		binaries.push_back( std::unique_ptr< char[] >( new char[ binary_sizes[i] ] ) );
+	}
+}
+
+// Removes the first space-delimited word from the front of str and returns it.
+// The single space that ended the word is consumed as well.  When str holds no space at all,
+// the whole string is the last word: it is returned and str is left empty.
+std::string pop_next_word( std::string & str )
+{
+	const size_t next_space = str.find( ' ' );
+
+	if( next_space == std::string::npos )
+	{
+		// npos + 1 wraps around to 0, so erase( 0, next_space + 1 ) would remove nothing
+		// and the same word would be handed out again on every call
+		std::string last_word;
+		last_word.swap( str );
+		return last_word;
+	}
+
+	std::string next_word( str, 0, next_space );
+	str.erase( 0, next_space + 1 ); // the extra +1 munches off the space
+
+	return next_word;
+}
+
+// Parses the leading integer of str (leading whitespace is skipped by the stream extraction).
+// Returns 0 when str does not start with an integer.
+int my_string_to_int( std::string str )
+{
+	int i = 0; // defined result even where a failed extraction leaves its target untouched
+	std::stringstream string_to_int( str );
+	string_to_int >> i;
+	return i;
+}
+
+// True when word opens a plan entry, i.e. when its first four characters spell "plan".
+bool start_of_a_plan( std::string word )
+{
+	// compare() clamps the range to the available characters, exactly like substr( 0, 4 )
+	return word.compare( 0, 4, "plan" ) == 0;
+}
+
+// Reads count whitespace-separated unsigned values from in and stores them in dest[0..count),
+// appending to dest whenever it is shorter than count.  Returns false as soon as a value cannot be read.
+static bool read_size_t_list( std::istream& in, size_t count, std::vector< size_t >& dest )
+{
+	for( size_t i = 0; i < count; ++i )
+	{
+		size_t value = 0;
+		if( !( in >> value ) )
+			return false;
+
+		if( i >= dest.size() )
+			dest.push_back( value );
+		else
+			dest[i] = value;
+	}
+
+	return true;
+}
+
+// Rebuilds a plan and its planX / planY / planZ sub-plans from a file written by clfftWritePlanToDisk.
+//
+// plan_handle must name an existing plan; it receives the settings of the base "plan" entry.  Every
+// longer entry name ("planX", "planXY", ...) gets a freshly created default plan that is hooked into
+// its parent, and the keywords that follow are applied to whichever plan was named last.
+//
+// Returns CLFFT_SUCCESS once the stream is exhausted, CLFFT_INVALID_ARG_VALUE for a NULL filename and
+// CLFFT_FILE_NOT_FOUND when the file cannot be opened.  Malformed lists, unknown keywords and failing
+// clfft* / repo calls are handed to the OPENCL_V macro.
+clfftStatus clfftReadPlanFromDisk( clfftPlanHandle plan_handle, const char* filename )
+{
+	FFTPlan* plan = NULL;
+	FFTRepo& repo = FFTRepo::getInstance();
+	lockRAII* lock = NULL;
+	OPENCL_V(repo.getPlan( plan_handle, plan, lock ), _T("getPlan failure"));
+
+	if( filename == NULL )
+		return CLFFT_INVALID_ARG_VALUE;
+
+	std::ifstream planfile( filename, std::ios::in | std::ios::binary );
+	if( !planfile.is_open() )
+		return CLFFT_FILE_NOT_FOUND;
+
+	plan_tree_t tree;
+	std::string next_word;
+
+	while( planfile >> next_word )
+	{
+		if( start_of_a_plan( next_word ) )
+		{
+			if( next_word.length() > 4 )
+			// if true, this is not a base plan
+			{
+				clfftDim temp_dimension = CLFFT_1D;
+				size_t temp_lengths[3] = {1,1,1};
+
+				// let's create the plan to represent the child plan
+				clfftPlanHandle child_plan;
+				OPENCL_V(clfftCreateDefaultPlan( &child_plan, plan->context, temp_dimension, temp_lengths ),
+					"clfftReadPlanFromDisk(): error calling clfftCreateDefaultPlan()");
+
+				tree.push_back( plan_tree_node_t( next_word, child_plan ) );
+
+				// we need to update the planX, Y, or Z pointer to point at the child plan
+				const char child_plan_name = *next_word.rbegin(); // this tells us if this is planX, Y, or Z
+				const std::string parent_plan_name( next_word, 0, next_word.length() - 1 ); // this tells us the parent plan
+
+				clfftPlanHandle parent_plan = 0;
+
+				for( size_t i = 0; i < tree.size(); i++ )
+				{
+					if( tree[i].first == parent_plan_name )
+					{
+						parent_plan = tree[i].second;
+					}
+				}
+
+				plan = NULL;
+				OPENCL_V(repo.getPlan( parent_plan, plan, lock ), _T("getPlan failure"));
+
+				if( child_plan_name == 'X' )
+					plan->planX = child_plan;
+				else if( child_plan_name == 'Y' )
+					plan->planY = child_plan;
+				else if( child_plan_name == 'Z' )
+					plan->planZ = child_plan;
+				else
+					OPENCL_V(CLFFT_INVALID_PLAN, "clfftReadPlanFromDisk(): could not identify child plan" );
+
+				// our child plan is now the active plan
+				plan = NULL;
+				OPENCL_V(repo.getPlan( child_plan, plan, lock ), _T("getPlan failure"));
+				plan_handle = child_plan;
+			}
+			else
+			// if this is a base plan, we don't need to do anything fancy.
+			// just add the node to the tree
+			{
+				tree.push_back( plan_tree_node_t( next_word, plan_handle ) );
+			}
+
+			plan->readFromFile = true;
+		}
+		else if( next_word == "dimensions" )
+		{
+			// read number of dimensions
+			unsigned int dimensions = 0;
+			planfile >> dimensions;
+
+			// number of length values that follow (subplans have some really strange things going on,
+			// so this might not always match the dimension of the transform)
+			size_t number_of_lengths = 0;
+			planfile >> number_of_lengths;
+
+			OPENCL_V( clfftSetPlanDim(plan_handle, static_cast< clfftDim >(dimensions)), _T("clfftReadPlanFromDisk: clfftSetPlanDim") );
+
+			// We have to explicitly set the lengths instead of using clfftSetPlanLength here,
+			// because the number of values might be greater than the number of dimensions of the plan.
+			// The values go straight into plan->length; a fixed-size scratch array would be overrun
+			// by a count taken from the file.
+			if( !read_size_t_list( planfile, number_of_lengths, plan->length ) )
+				OPENCL_V( CLFFT_INVALID_ARG_VALUE, _T("clfftReadPlanFromDisk: malformed dimensions entry") );
+		}
+		else if( next_word == "batch" )
+		{
+			unsigned int batch = 0;
+			planfile >> batch;
+
+			OPENCL_V( clfftSetPlanBatchSize(plan_handle, batch), _T("clfftReadPlanFromDisk: clfftSetPlanBatchSize") );
+		}
+		else if( next_word == "instride" )
+		{
+			// number of stride values that follow (subplans have some really strange things going on,
+			// so this might not always match the dimension of the transform)
+			size_t number_of_strides = 0;
+			planfile >> number_of_strides;
+
+			// inStride is set explicitly instead of through clfftSetPlanInStride for the same
+			// reason the lengths are: there may be more values than plan dimensions
+			if( !read_size_t_list( planfile, number_of_strides, plan->inStride ) )
+				OPENCL_V( CLFFT_INVALID_ARG_VALUE, _T("clfftReadPlanFromDisk: malformed instride entry") );
+		}
+		else if( next_word == "outstride" )
+		{
+			// number of stride values that follow (subplans have some really strange things going on,
+			// so this might not always match the dimension of the transform)
+			size_t number_of_strides = 0;
+			planfile >> number_of_strides;
+
+			// outStride is set explicitly instead of through clfftSetPlanOutStride, see instride
+			if( !read_size_t_list( planfile, number_of_strides, plan->outStride ) )
+				OPENCL_V( CLFFT_INVALID_ARG_VALUE, _T("clfftReadPlanFromDisk: malformed outstride entry") );
+		}
+		else if( next_word == "in-out-distances" )
+		{
+			size_t indistance = 0, outdistance = 0;
+			planfile >> indistance >> outdistance;
+
+			OPENCL_V( clfftSetPlanDistance( plan_handle, indistance, outdistance ), _T("clfftReadPlanFromDisk: clfftSetPlanDistance" ) );
+		}
+		else if( next_word == "in-out-layouts" )
+		{
+			size_t inlayout = 0, outlayout = 0;
+			planfile >> inlayout >> outlayout;
+
+			OPENCL_V( clfftSetLayout( plan_handle, static_cast< clfftLayout >(inlayout), static_cast< clfftLayout >(outlayout) ), _T("clfftReadPlanFromDisk: clfftSetLayout") );
+		}
+		else if( next_word == "resultlocation" )
+		{
+			size_t location = 0;
+			planfile >> location;
+
+			OPENCL_V( clfftSetResultLocation( plan_handle, static_cast< clfftResultLocation >(location) ), _T("clfftReadPlanFromDisk: clfftSetResultLocation") );
+		}
+		else if( next_word == "precision" )
+		{
+			size_t precision = 0;
+			planfile >> precision;
+
+			OPENCL_V( clfftSetPlanPrecision( plan_handle, static_cast< clfftPrecision >(precision) ), _T("clfftReadPlanFromDisk: clfftSetPlanPrecision") );
+		}
+		else if( next_word == "forwardscale" || next_word == "backwardscale" )
+		{
+			// the scale was stashed as the raw bits of a float, hex_as_float turns it back
+			size_t scale = 0;
+			planfile >> scale;
+
+			if( next_word == "forwardscale" )
+			{
+				OPENCL_V( clfftSetPlanScale( plan_handle, CLFFT_FORWARD, hex_as_float(static_cast< unsigned int >(scale)) ), _T("clfftReadPlanFromDisk: clfftSetPlanScale") );
+			}
+			else
+			{
+				OPENCL_V( clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, hex_as_float(static_cast< unsigned int >(scale)) ), _T("clfftReadPlanFromDisk: clfftSetPlanScale") );
+			}
+		}
+		else if( next_word == "gen" )
+		{
+			int gen_read = 0;
+			planfile >> gen_read;
+			plan->gen = static_cast< clfftGenerators >(gen_read);
+		}
+		else if( next_word == "bLdsComplex" )
+		{
+			planfile >> plan->bLdsComplex;
+		}
+		else if( next_word == "ldsPadding" )
+		{
+			planfile >> plan->ldsPadding;
+		}
+		else if( next_word == "uLdsFraction" )
+		{
+			planfile >> plan->uLdsFraction;
+		}
+		else if( next_word == "large1D_Xfactor" )
+		{
+			planfile >> plan->large1D_Xfactor;
+		}
+		else if( next_word == "cacheSize" )
+		{
+			planfile >> plan->cacheSize;
+		}
+		else if( next_word == "tmpBufSize" )
+		{
+			planfile >> plan->tmpBufSize;
+		}
+		else if( next_word == "large1D" )
+		{
+			planfile >> plan->large1D;
+		}
+		else if( next_word == "large2D" )
+		{
+			planfile >> plan->large2D;
+		}
+		else if( next_word == "number-of-devices" )
+		{
+			planfile >> plan->number_of_devices;
+		}
+		else if( next_word == "binary-sizes" )
+		{
+			plan->ResetBinarySizes();
+			for( size_t i = 0; i < plan->number_of_devices; i++ )
+			{
+				planfile >> plan->binary_sizes[i];
+			}
+		}
+		else if( next_word == "binaries" )
+		{
+			plan->ResetBinaries();
+
+			while( static_cast< char >(planfile.peek()) == ' ' )
+				planfile.ignore();
+
+			// clfftWritePlanToDisk frames every single binary with its own begin / end marker,
+			// so both markers are skipped once per device.  The binary itself begins with the
+			// character immediately following the begin marker.
+			for( size_t i = 0; i < plan->number_of_devices; i++ )
+			{
+				planfile.ignore( static_cast< std::streamsize >( beginning_of_binary.size() ) );
+				planfile.read( plan->binaries[i].get(), plan->binary_sizes[i] );
+				planfile.ignore( static_cast< std::streamsize >( end_of_binary.size() ) );
+			}
+		}
+		else if( next_word == end_of_file )
+		{
+			// we're at the end of the file
+		}
+		else
+		{
+			std::cout << next_word << std::endl;
+			OPENCL_V( CLFFT_INVALID_ARG_VALUE, _T("clfftReadPlanFromDisk: unrecognized parameter") );
+		}
+	}
+
+	return CLFFT_SUCCESS;
+}
+#endif
+
+// Destroys the plan referred to by *plHandle after destroying every sub-plan it refers to.
+// Status codes of the recursive calls and of deletePlan are not inspected; once the initial
+// getPlan lookup has passed OPENCL_V the function reports CLFFT_SUCCESS.
+clfftStatus clfftDestroyPlan( clfftPlanHandle* plHandle )
+{
+	FFTRepo& repository = FFTRepo::getInstance( );
+	FFTPlan* target = NULL;
+	lockRAII* targetLock = NULL;
+
+	OPENCL_V( repository.getPlan( *plHandle, target, targetLock ), _T( "fftRepo.getPlan failed" ) );
+
+	// Sub-plans, that are used for higher dimensional FFT's, go first and in this fixed order
+	clfftPlanHandle* const children[] =
+	{
+		&target->planX, &target->planY, &target->planZ,
+		&target->planTX, &target->planTY, &target->planTZ,
+		&target->planRCcopy
+	};
+
+	for( size_t c = 0; c < sizeof( children ) / sizeof( children[ 0 ] ); ++c )
+	{
+		if( *children[ c ] )
+			clfftDestroyPlan( children[ c ] );
+	}
+
+	repository.deletePlan( plHandle );
+
+	return CLFFT_SUCCESS;
+}
+
+// This routine will query the OpenCL context for its devices
+// and their hardware limitations, which we synthesize into a
+// hardware "envelope".
+// We only query the devices the first time we're called after
+// the object's context is set. On later calls the query is skipped
+// once envelope.limit_LocalMemSize is non-zero.
+// The explicit return at the end always yields CLFFT_SUCCESS.
+clfftStatus FFTPlan::SetEnvelope ()
+{
+
+ // TODO The caller has already acquired the lock on *this
+ // However, we shouldn't depend on it.
+
+ if (0 == envelope.limit_LocalMemSize) do { // do { ... } while (0) so that "break" can abandon the query early
+ // First time, query OpenCL for the device info
+ //
+ memset (&envelope, 0, sizeof(envelope));
+
+ // Get the size needed for the device list
+ //
+ size_t deviceListSize = 0;
+ OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize ),
+ _T("Getting device array size ( ::clGetContextInfo() )" ));
+ cl_uint n = cl_uint (deviceListSize / sizeof(cl_device_id));
+ if (n == 0) break; // no devices reported: the envelope stays zeroed
+
+ std::vector< cl_device_id > devices( n+1 ); // one slot more than the n devices reported
+ // Get the device list
+ //
+ OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, &devices[ 0 ], NULL ),
+ _T("Getting device array ( ::clGetContextInfo() )") );
+
+ // Get the # of devices
+ //
+ cl_uint cContextDevices = 0;
+
+ size_t deviceVersionSize = 0;
+ OPENCL_V( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
+ _T("Getting CL_DEVICE_VERSION Info string size ( ::clGetDeviceInfo() )" ));
+
+ std::vector< char > szDeviceVersion( deviceVersionSize );
+ OPENCL_V( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
+ _T("Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" ));
+
+ char openclstr[11]="OpenCL 1.0";
+
+ if (!strncmp((const char*)&szDeviceVersion[ 0 ], openclstr, 10)) // version string of the first device starts with "OpenCL 1.0"
+ {
+ cContextDevices = 1; // CL_CONTEXT_NUM_DEVICES is not queried on this path; a single device is assumed
+ }
+ else
+ {
+ OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_NUM_DEVICES, sizeof( cContextDevices ), &cContextDevices, NULL ),
+ _T("Getting number of context devices ( ::clGetContextInfo() )" ));
+ }
+
+ cContextDevices = std::min (cContextDevices, n); // never walk past the n devices actually retrieved
+ if (0 == cContextDevices)
+ break;
+
+ envelope.limit_LocalMemSize = ~0; // start every limit at all-bits-set so that std::min can only lower it
+ envelope.limit_WorkGroupSize = ~0;
+ envelope.limit_Dimensions = countOf (envelope.limit_Size);
+ for (size_t u = 0; u < countOf (envelope.limit_Size); ++u) {
+ envelope.limit_Size[u] = ~0;
+ }
+
+ for( cl_uint i = 0; i < cContextDevices; ++i ) // fold each device's limits into the running minima
+ {
+ cl_device_id devId = devices[i];
+
+ cl_ulong memsize = 0;
+ unsigned int maxdim = 0;
+ size_t temp[countOf (envelope.limit_Size)];
+ memset (&temp, 0, sizeof(temp));
+
+ OPENCL_V( ::clGetDeviceInfo( devId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( cl_ulong ), &memsize, NULL ),
+ _T("Getting CL_DEVICE_LOCAL_MEM_SIZE device info ( ::clGetDeviceInfo() )") );
+ envelope.limit_LocalMemSize = std::min (envelope.limit_LocalMemSize, memsize);
+
+ OPENCL_V( ::clGetDeviceInfo( devId, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( unsigned int ), &maxdim, NULL ),
+ _T("Getting CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS device info ( ::clGetDeviceInfo() )") );
+ BUG_CHECK (countOf (envelope.limit_Size) >= maxdim); // limit_Size must have room for every dimension the device reports
+ envelope.limit_Dimensions = std::min (envelope.limit_Dimensions, maxdim); // NOTE(review): size_t vs unsigned int arguments; an explicit std::min template argument may be missing from this text - confirm
+
+ OPENCL_V( ::clGetDeviceInfo( devId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( size_t ), &temp[0], NULL ),
+ _T("Getting CL_DEVICE_MAX_WORK_GROUP_SIZE device info ( ::clGetDeviceInfo() )") );
+ envelope.limit_WorkGroupSize = std::min (envelope.limit_WorkGroupSize, temp[0]); // temp[0] is reused as scratch for the single size_t result
+
+ OPENCL_V( ::clGetDeviceInfo( devId, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof( temp ), &temp[0], NULL ),
+ _T("Getting CL_DEVICE_MAX_WORK_ITEM_SIZES device info ( ::clGetDeviceInfo() )") );
+ for (size_t u = 0; u < envelope.limit_Dimensions; ++u) {
+ BUG_CHECK (temp[u] > 0)
+ envelope.limit_Size[u] = std::min (envelope.limit_Size[u], temp[u]);
+ }
+ }
+
+ BUG_CHECK (envelope.limit_LocalMemSize >= 1024) // sanity check on the combined local memory limit (BUG_CHECK is defined outside this chunk)
+ } while (0);
+
+ return CLFFT_SUCCESS;
+}
+
+// Creates the plan's constant buffer: a read-only cl_mem of CLFFT_CB_SIZE ints in the plan's context.
+// Buffers still held by the plan are released first.  The status written by clCreateBuffer is
+// returned, converted to clfftStatus.
+clfftStatus FFTPlan::AllocateBuffers ()
+{
+	assert (NULL == const_buffer);
+	ReleaseBuffers ();
+
+	assert(4 == sizeof(int));
+
+	cl_int createStatus = CL_SUCCESS;
+	const_buffer = clCreateBuffer (context, CL_MEM_READ_ONLY, CLFFT_CB_SIZE * sizeof (int), 0, &createStatus);
+
+	return static_cast< clfftStatus >( createStatus );
+}
+
+// Releases buf when it is set, clears it, and keeps the first non-success status in result.
+static void releaseAndClear( cl_mem& buf, clfftStatus& result )
+{
+	if( NULL == buf )
+		return;
+
+	const clfftStatus tmp = static_cast< clfftStatus >( clReleaseMemObject( buf ) );
+	buf = NULL;
+	if( CLFFT_SUCCESS == result )
+		result = tmp;
+}
+
+// Releases every OpenCL buffer the plan holds (const_buffer, intBuffer, intBufferRC, intBufferC2R)
+// and resets the members to NULL, so the call is safe to repeat.
+// Returns CLFFT_SUCCESS, or the status of the first release that failed; the remaining buffers
+// are still released in that case.
+clfftStatus FFTPlan::ReleaseBuffers ()
+{
+	clfftStatus result = CLFFT_SUCCESS;
+
+	releaseAndClear( const_buffer, result );
+	releaseAndClear( intBuffer, result );
+	releaseAndClear( intBufferRC, result );
+	// intBufferC2R is a plan-held cl_mem like the others and must not outlive the plan
+	releaseAndClear( intBufferC2R, result );
+
+	return result;
+}
+
+clfftStatus FFTPlan::GetWorkSizes (std::vector & globalws, std::vector & localws) const // forwards to GetWorkSizesPvt according to the plan's generator type (gen)
+{
+ switch(gen)
+ {
+ case Stockham: return GetWorkSizesPvt(globalws, localws); // NOTE(review): the three calls read identically here; a per-generator template argument seems to be missing from this text - confirm
+ case Transpose: return GetWorkSizesPvt(globalws, localws);
+ case Copy: return GetWorkSizesPvt(globalws, localws);
+ default: assert(false); return CLFFT_NOTIMPLEMENTED; // unknown generator: trips the assert when assertions are enabled, otherwise reports CLFFT_NOTIMPLEMENTED
+ }
+}
+
+clfftStatus FFTPlan::GetKernelGenKey (FFTKernelGenKeyParams & params) const // forwards to GetKernelGenKeyPvt according to the plan's generator type (gen)
+{
+ switch(gen)
+ {
+ case Stockham: return GetKernelGenKeyPvt(params); // NOTE(review): the three calls read identically here; a per-generator template argument seems to be missing from this text - confirm
+ case Transpose: return GetKernelGenKeyPvt(params);
+ case Copy: return GetKernelGenKeyPvt(params);
+ default: assert(false); return CLFFT_NOTIMPLEMENTED; // unknown generator: trips the assert when assertions are enabled, otherwise reports CLFFT_NOTIMPLEMENTED
+ }
+}
+
+clfftStatus FFTPlan::GenerateKernel (FFTRepo & fftRepo) const // forwards to GenerateKernelPvt according to the plan's generator type (gen)
+{
+ switch(gen)
+ {
+ case Stockham: return GenerateKernelPvt(fftRepo); // NOTE(review): the three calls read identically here; a per-generator template argument seems to be missing from this text - confirm
+ case Transpose: return GenerateKernelPvt(fftRepo);
+ case Copy: return GenerateKernelPvt(fftRepo);
+ default: assert(false); return CLFFT_NOTIMPLEMENTED; // unknown generator: trips the assert when assertions are enabled, otherwise reports CLFFT_NOTIMPLEMENTED
+ }
+}
+
+clfftStatus FFTPlan::GetMax1DLength (size_t *longest ) const // writes the longest supported 1D length for the plan's generator type into *longest
+{
+ switch(gen)
+ {
+ case Stockham: return GetMax1DLengthPvt(longest); // Stockham asks the private helper
+ //No restriction for transpose kernel
+ case Transpose: *longest = 4096; return CLFFT_SUCCESS; // a fixed 4096 is reported
+ case Copy: *longest = 4096; return CLFFT_SUCCESS; // same fixed value as Transpose
+ default: assert(false); return CLFFT_NOTIMPLEMENTED; // unknown generator: *longest is left untouched
+ }
+}
+
+// Hands out a read-only pointer to this plan's hardware envelope through *ppEnvelope.
+// Returns CLFFT_INVALID_ARG_VALUE when ppEnvelope is NULL, CLFFT_SUCCESS otherwise.
+clfftStatus FFTPlan::GetEnvelope (const FFTEnvelope ** ppEnvelope) const
+{
+	// envelope is a member, so its address is always valid; the only pointer that can
+	// really be NULL here is the caller's output argument
+	if( NULL == ppEnvelope )
+		return CLFFT_INVALID_ARG_VALUE;
+
+	*ppEnvelope = &envelope;
+	return CLFFT_SUCCESS;
+}
+
+// Size in bytes of one complex element for this plan's precision:
+// std::complex< double > for CLFFT_DOUBLE / CLFFT_DOUBLE_FAST, std::complex< float > otherwise.
+size_t FFTPlan::ElementSize() const
+{
+	const bool isDouble = ( precision == CLFFT_DOUBLE ) || ( precision == CLFFT_DOUBLE_FAST );
+	return isDouble ? sizeof( std::complex< double > ) : sizeof( std::complex< float > );
+}
+
diff --git a/src/library/plan.h b/src/library/plan.h
new file mode 100644
index 00000000..ec96fadb
--- /dev/null
+++ b/src/library/plan.h
@@ -0,0 +1,360 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( AMD_CLFFT_plan_H )
+#define AMD_CLFFT_plan_H
+#include
+#include "private.h"
+#include "lock.h"
+#include "generator.h"
+
+namespace ARBITRARY {
+ // TODO: These arbitrary parameters should be tuned for the type of GPU
+ // being used. These values are probably OK for Radeon 58xx and 68xx.
+ enum {
+ MAX_DIMS = 3,
+ // The clEnqueueNDRangeKernel accepts a multi-dimensional domain array.
+ // The # of dimensions is arbitrary, but limited by the OpenCL implementation
+ // usually to 3 dimensions (CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
+ // The kernel generator also assumes a limit on the # of dimensions.
+
+ SIMD_WIDTH = 64,
+ // Workgroup size. This is the # of work items that share
+ // local data storage (LDS). This # is best for Evergreen gpus,
+ // but might change in the future.
+
+ LDS_BANK_BITS = 5,
+ LDS_BANK_SIZE = (1 << LDS_BANK_BITS),
+ LDS_PADDING = false,//true,
+ // On AMD hardware, the low-order bits of the local_id enumerate
+ // the work items that access LDS in parallel. Ideally, we will
+ // pad our LDS arrays so that these work items access different banks
+ // of the LDS.
+ // 2 ** LDS_BANK_BITS is the number of LDS banks.
+ // If LDS_PADDING is non-zero, the kernel generator should pad the
+ // LDS arrays to reduce or eliminate bank conflicts.
+
+ LDS_FRACTION_IDEAL = 6, // i.e., 1/6th
+ LDS_FRACTION_MAX = 4, // i.e., 1/4
+ // For best performance, each workgroup should use 1/IDEAL'th the amount of LDS
+ // revealed by clGetDeviceInfo (.. CL_DEVICE_LOCAL_MEM_SIZE, ...)
+ // However, we can use up to 1/MAX'th of LDS per workgroup when necessary to
+ // perform the FFT in a single pass instead of multiple passes.
+ // This tuning parameter is a good value for Evergreen gpus,
+ // but might change in the future.
+
+ LDS_COMPLEX = false,
+ // This is the default value for FFTKernelGenKeyParams::fft_LdsComplex.
+ // The generated kernels require so many bytes of LDS for each single precision
+ // complex number in the vector.
+ // If LDS_COMPLEX, then we declare an LDS array of complex numbers (8 bytes each)
+ // and swap data between workitems with a single barrier.
+ // If ! LDS_COMPLEX, then we declare an LDS array of scalar numbers (4 bytes each)
+ // and swap data between workitems in two phases, with extra barriers.
+ // The former approach uses fewer instructions and barriers;
+ // The latter uses half as much LDS space, so twice as many wavefronts can be run
+ // in parallel.
+
+ TWIDDLE_DEE = 4,
+ // 4 bits per row of matrix.
+ };
+};
+
+enum eConstantBuffer {
+ /* Layout of a constant buffer passed to the generated kernel
+ * This needs to be known by the kernel generator and by the
+ * framework code that creates the buffer and fills it at execution time.
+ */
+
+ // [0] uint NY This is the batchsize for a 1D Array,
+ // or the 2nd (Y dimension) for a 2D.
+ // [1] uint NZ This is the batchsize for a 2D Array,
+ // or the 3rd (Z dimension) for a 3D.
+ // [2] uint NW This is the batchsize for a 3D Array,
+ // or the 4th (W dimension) for a 4D.
+ // [3] uint N5 This is the batchsize for a 4D Array,
+ //
+ CLFFT_CB_NY = 0,
+ CLFFT_CB_NZ,
+ CLFFT_CB_NW,
+ CLFFT_CB_N5,
+
+ // [4] uint ISX Input data X stride (== 1 for row-major compact data)
+ // [5] uint ISY Input data Y stride (== X for row-major compact data)
+ // [6] uint ISZ Input data Z stride (== X*Y for row-major compact data)
+ // [7] uint ISW Input data W stride (== X*Y*Z for row-major compact data)
+ // [8] uint IS5 Input data 5th stride
+ //
+ CLFFT_CB_ISX,
+ CLFFT_CB_ISY,
+ CLFFT_CB_ISZ,
+ CLFFT_CB_ISW,
+ CLFFT_CB_IS5,
+
+ // [9] uint OSX Output data X stride
+ // [10] uint OSY Output data Y stride
+ // [11] uint OSZ Output data Z stride
+ // [12] uint OSW Output data W stride
+ // [13] uint OS5 Output data 5th stride
+ //
+ CLFFT_CB_OSX,
+ CLFFT_CB_OSY,
+ CLFFT_CB_OSZ,
+ CLFFT_CB_OSW,
+ CLFFT_CB_OS5,
+
+ CLFFT_CB_SIZE = 32, // number of ints FFTPlan::AllocateBuffers reserves for the constant buffer; larger than the 14 slots named above
+};
+
+struct FFTKernelGenKeyParams {
+ /*
+ * This structure distills a subset of the fftPlan data,
+ * including all information that is used to generate the OpenCL kernel.
+ * This structure can be used as a key to reusing kernels that have already
+ * been compiled.
+ */
+ size_t fft_DataDim; // Dimensionality of the data
+ size_t fft_N[5]; // [0] is FFT size, e.g. 1024
+ // This must be <= size of LDS!
+ size_t fft_inStride [5]; // input strides
+ size_t fft_outStride[5]; // output strides
+
+ clfftResultLocation fft_placeness;
+ clfftLayout fft_inputLayout;
+ clfftLayout fft_outputLayout;
+ clfftPrecision fft_precision;
+ double fft_fwdScale;
+ double fft_backScale;
+
+ size_t fft_SIMD; // Assume this SIMD/workgroup size
+ size_t fft_LDSsize; // Limit the use of LDS to this many bytes.
+ size_t fft_R; // # of complex values to keep in working registers
+ // SIMD size * R must be <= size of LDS!
+ size_t fft_MaxRadix; // Limit the radix to this value.
+ size_t fft_MaxWorkGroupSize; // Limit for work group size
+ bool fft_LdsComplex; // If true, store complex values in LDS memory
+ // If false, store scalar values in LDS.
+ // Generally, false will provide more efficient kernels,
+ // but not always.
+ // see FFTPlan::bLdsComplex and ARBITRARY::LDS_COMPLEX
+ bool fft_ldsPadding; // default padding is false
+ bool fft_3StepTwiddle; // This is one pass of the "3-step" algorithm;
+ // so extra twiddles are applied on output.
+ bool fft_UseFMA; // *** TODO
+ bool fft_RCsimple; // presumably mirrors FFTPlan::RCsimple - TODO confirm
+};
+
+
+// Sorting operator for struct FFTKernelGenKeyParams, such that it can be used in a map
+bool operator<( const FFTKernelGenKeyParams& lhs, const FFTKernelGenKeyParams& rhs);
+
+// The "envelope" is a set of limits imposed by the hardware
+// This will depend on the GPU(s) in the OpenCL context.
+// If there are multiple devices, this should be the least
+// common denominators.
+//
+struct FFTEnvelope {
+	// Minimum of CL_DEVICE_LOCAL_MEM_SIZE
+	cl_ulong limit_LocalMemSize;
+	// Minimum of CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
+	size_t limit_Dimensions;
+	// Per-dimension minima of CL_DEVICE_MAX_WORK_ITEM_SIZES[0..n]
+	size_t limit_Size[8];
+	// Minimum of CL_DEVICE_MAX_WORK_GROUP_SIZE
+	size_t limit_WorkGroupSize;
+
+	// ?? CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
+
+	FFTEnvelope ()
+	: limit_LocalMemSize (0)
+	, limit_Dimensions (0)
+	, limit_WorkGroupSize (0)
+	{
+		for (size_t u = 0; u < sizeof (limit_Size) / sizeof (limit_Size[0]); ++u) limit_Size[u] = 0; // all limits start out as zero
+	}
+};
+
+class FFTRepo;
+
+// This class contains objects that are specific to a particular FFT transform, and the data herein is useful
+// for us to know ahead of transform time such that we can optimize for these settings
+class FFTPlan
+{
+ template // NOTE(review): the template parameter lists of these four helpers appear truncated in this text - confirm against the original header
+ clfftStatus GetWorkSizesPvt (std::vector & globalws, std::vector & localws) const;
+
+ template
+ clfftStatus GetKernelGenKeyPvt (FFTKernelGenKeyParams & params) const;
+
+ template
+ clfftStatus GenerateKernelPvt (FFTRepo& fftRepo) const;
+
+ template
+ clfftStatus GetMax1DLengthPvt (size_t *longest ) const;
+
+public:
+ bool baked; // clfftWritePlanToDisk only stores device binaries when this is true
+ bool readFromFile; // set by clfftReadPlanFromDisk on every plan it populates
+
+ // Properties provided by the user.
+ clfftDim dim;
+ clfftLayout inputLayout;
+ clfftLayout outputLayout;
+ clfftResultLocation placeness;
+ clfftResultTransposed transposed;
+ clfftPrecision precision;
+ cl_context context;
+ double forwardScale, backwardScale;
+ size_t iDist, oDist;
+ size_t batchsize;
+
+ // Devices that the user specified in the context passed to the create function
+ std::vector< cl_device_id > devices;
+
+ // Length of the FFT in each dimension
+ std::vector< size_t > length;
+
+ // Stride of the FFT in each dimension
+ std::vector< size_t > inStride, outStride;
+
+ // Hardware Limits
+ FFTEnvelope envelope; // filled in by SetEnvelope(), handed out by GetEnvelope()
+
+ // Performance Tuning parameters
+ bool bLdsComplex; // see ARBITRARY::LDS_COMPLEX
+ bool ldsPadding; // see ARBITRARY::LDS_PADDING
+ unsigned uLdsFraction; // see ARBITRARY::LDS_FRACTION_IDEAL
+
+ // Reserved copy for large 1d, 2d, and 3d plan
+ size_t tmpBufSize;
+ cl_mem intBuffer;
+
+ // for RC copies
+ size_t tmpBufSizeRC;
+ cl_mem intBufferRC;
+
+ // for C-to-R transforms with largeness in Y or Z dimension
+ size_t tmpBufSizeC2R;
+ cl_mem intBufferC2R;
+
+ //extra cache size for 2d and 3d
+ size_t cacheSize;
+ size_t large1D;
+ bool large2D;
+ size_t large1D_Xfactor;
+ clfftPlanHandle planX; // sub-plan handles; 0 means "no sub-plan" (clfftDestroyPlan skips zero handles)
+ clfftPlanHandle planY;
+ clfftPlanHandle planZ;
+
+ bool transflag;
+ clfftPlanHandle planTX;
+ clfftPlanHandle planTY;
+ clfftPlanHandle planTZ; //reserve for 3D transpose
+
+ clfftPlanHandle planRCcopy;
+
+ // Plan resources
+ //
+ cl_mem const_buffer; // created by AllocateBuffers(), released by ReleaseBuffers()
+
+ // Generator type
+ clfftGenerators gen; // selects the branch taken by GetWorkSizes / GetKernelGenKey / GenerateKernel / GetMax1DLength
+
+ // stored binaries
+ size_t number_of_devices;
+
+//TODO caching kernel binaries for later reload
+#if 0
+ std::unique_ptr binary_sizes;
+ std::vector< std::unique_ptr > binaries;
+#endif
+
+ // Real-Complex simple flag
+ // if this is set we do real to-and-from full complex using simple algorithm
+ // where imaginary of input is set to zero in forward and imaginary not written in backward
+ bool RCsimple;
+
+ FFTPlan () // NOTE: members are initialised in declaration order, which differs from the order listed below
+ : baked (false)
+ , readFromFile (false)
+ , dim (CLFFT_1D)
+ , inputLayout (CLFFT_COMPLEX_INTERLEAVED)
+ , outputLayout (CLFFT_COMPLEX_INTERLEAVED)
+ , placeness (CLFFT_INPLACE)
+ , transposed (CLFFT_NOTRANSPOSE)
+ , precision (CLFFT_SINGLE)
+ , context (NULL)
+ , forwardScale (1.0)
+ , backwardScale (1.0)
+ , iDist( 1 ), oDist( 1 )
+ , batchsize (1)
+ , tmpBufSize (0)
+ , intBuffer( NULL )
+ , tmpBufSizeRC (0)
+ , intBufferRC( NULL )
+ , tmpBufSizeC2R (0)
+ , intBufferC2R( NULL )
+ , large1D(0)
+ , large2D(false)
+ , planX( 0 )
+ , planY( 0 )
+ , planZ( 0 )
+ , transflag(false)
+ , RCsimple(false)
+ , planTX( 0 )
+ , planTY( 0 )
+ , planTZ( 0 )
+ , planRCcopy(0)
+ , const_buffer( NULL )
+ , bLdsComplex (ARBITRARY::LDS_COMPLEX)
+ , ldsPadding (ARBITRARY::LDS_PADDING)
+ , uLdsFraction (0/*ARBITRARY::LDS_FRACTION_IDEAL*/)
+ , large1D_Xfactor(0)
+ , cacheSize(0)
+ , number_of_devices(0)
+ , gen(Stockham)
+ {};
+
+
+ size_t ElementSize() const; // sizeof one complex element, chosen by precision
+
+ clfftStatus AllocateBuffers ();
+ clfftStatus ReleaseBuffers ();
+
+ clfftStatus GetWorkSizes (std::vector & globalws, std::vector & localws) const;
+ clfftStatus GetKernelGenKey (FFTKernelGenKeyParams & params) const;
+ clfftStatus GenerateKernel (FFTRepo & fftRepo) const;
+ clfftStatus GetMax1DLength (size_t *longest ) const;
+
+ void ResetBinarySizes();
+ void ResetBinaries();
+
+ clfftStatus CompressPlan();
+ clfftStatus ConstructAndEnqueueConstantBuffers( cl_command_queue* commQueueFFT );
+
+ clfftStatus GetEnvelope (const FFTEnvelope **) const;
+ clfftStatus SetEnvelope ();
+
+ ~FFTPlan () // hands the plan's OpenCL buffers back via ReleaseBuffers()
+ {
+ ReleaseBuffers ();
+ }
+};
+
+#endif // AMD_CLFFT_plan_H
+
diff --git a/src/library/private.h b/src/library/private.h
new file mode 100644
index 00000000..fe3770d7
--- /dev/null
+++ b/src/library/private.h
@@ -0,0 +1,342 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( CLFFT_private_H )
+#define CLFFT_private_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "../include/clFFT.h"
+#include "../include/unicode.compatibility.h"
+
+#if defined(_MSC_VER)
+    // Microsoft Visual C++ compiler
+    //
+#define SPRINTF(_buffer, _count, _format,...) \
+    _snprintf_s (_buffer, _count, _TRUNCATE, _format, __VA_ARGS__)
+#elif defined( __GNUC__ )
+    // Gnu G++: formats into _buffer and forces a terminator at index (_count)-1.
+    // The temporary has a library-specific name so that a caller argument called 'len' is not captured.
+#define SPRINTF(_buffer, _count, _format,...) \
+    { size_t clfftSprintfLen_ = (_count)-1; \
+      snprintf ((_buffer), clfftSprintfLen_, _format,__VA_ARGS__); \
+      (_buffer)[clfftSprintfLen_] = 0; \
+    }
+#else
+#error Unknown/unsupported C++ compiler.
+#endif
+
+// Portable definition of countof: the number of elements in a statically sized array
+#if defined( _WIN32 )
+ #define countOf _countof
+#else
+ #define countOf( arr ) ( sizeof( arr ) / sizeof( arr[ 0 ] ) ) // NOTE(review): only meaningful for true arrays, not for pointers
+#endif
+
+#if defined( _WIN32 )
+    #include <intrin.h>
+    // BSF: writes the position of the lowest set bit of 'mask' to *index ('mask' is expected to be non-zero).
+    #if defined( _WIN64 )
+    inline void BSF( unsigned long* index, size_t& mask )
+    {
+        _BitScanForward64( index, mask );
+    }
+    // AtomicAdd: atomically adds 'op' to *value and returns the value that was stored before the addition.
+    inline size_t AtomicAdd( volatile size_t* value, size_t op )
+    {
+        return _InterlockedExchangeAdd64( reinterpret_cast< volatile __int64* >( value ), op );
+    }
+    #else
+    inline void BSF( unsigned long* index, size_t& mask )
+    {
+        _BitScanForward( index, mask );
+    }
+
+    inline size_t AtomicAdd( volatile size_t* value, size_t op )
+    {
+        return _InterlockedExchangeAdd( reinterpret_cast< volatile long* >( value ), op );
+    }
+    #endif
+#elif defined( __GNUC__ )
+    inline void BSF( unsigned long * index, size_t & mask )
+    {   // 64-bit builtin: plain __builtin_ctz takes 'unsigned int' and would drop the upper bits of a 64-bit mask
+        *index = static_cast< unsigned long >( __builtin_ctzll( static_cast< unsigned long long >( mask ) ) );
+    }
+    // AtomicAdd: atomically adds 'op' to *value and returns the value that was stored before the addition.
+    inline size_t AtomicAdd( volatile size_t* value, size_t op )
+    {
+        return __sync_fetch_and_add( value, op );
+    }
+#endif
+
+// This header file is not visible to clients, and contains internal structures and functions for use
+// by the FFT library. Since this header is private to this implementation, there is no need to keep
+// strict C compliance.
+
+// Enum to help provide descriptive names to array indices, when indexing into our various vectors
+enum clfftDim_Index
+{
+ DimX, ///< 1st dimension
+ DimY, ///< 2nd dimension
+ DimZ, ///< 3rd dimension
+ DimW, ///< 4th dimension
+ ENDDIMINDEX ///< This value will always be last, and marks the length of clfftDim_Index
+};
+// Small helper that keeps a file stream together with the name it was opened on.
+// The stream is opened in the constructor and closed in the destructor; FileStreamType and
+// StringType are expected to provide open( const char* ) / close( ) and c_str( ) respectively.
+template< typename FileStreamType, typename StringType >
+class tofstreamRAII
+{
+    FileStreamType outFile;   // the wrapped stream
+    StringType fileName;      // name given at construction, or the latest setName( ) value
+
+public:
+    // Remembers 'name' and opens the stream on it.
+    tofstreamRAII( const StringType& name )
+        : fileName( name )
+    {
+        // open( ) receives the C string of the stored copy, not of the argument.
+        outFile.open( fileName.c_str( ) );
+    }
+
+    // Closes the stream.
+    ~tofstreamRAII( )
+    {
+        outFile.close( );
+    }
+
+    // Mutable access to the stored name.
+    StringType& getName( ) { return fileName; }
+
+    // Replaces the stored name only; the stream is neither closed nor reopened.
+    void setName( const StringType& name ) { fileName = name; }
+
+    // Mutable access to the wrapped stream.
+    FileStreamType& get( ) { return outFile; }
+};
+
+// True when 'length' factors completely into 2s, 3s and 5s, i.e. length == 2^a * 3^b * 5^c.
+// Note that 0 and 1 are both reported as supported, because no division is ever attempted for them.
+inline bool IsASupportedLength( size_t length )
+{
+    const size_t radices[ ] = { 2, 3, 5 };
+    const size_t numRadices = sizeof( radices ) / sizeof( radices[ 0 ] );
+
+    // Divide out every occurrence of each supported radix in turn.
+    for( size_t r = 0; r < numRadices; ++r )
+    {
+        while( ( length > 1 ) && ( length % radices[ r ] == 0 ) )
+            length /= radices[ r ];
+    }
+
+    return length <= 1;  // anything left over contains a prime factor other than 2, 3 or 5
+}
+// Maps a clfftStatus / OpenCL status code to the name of its enumerator; codes without a case yield "Error code not defined".
+inline tstring clfftErrorStatusAsString( const cl_int& status )
+{
+    switch( status ) {
+        case CLFFT_INVALID_GLOBAL_WORK_SIZE:
+            return _T( "CLFFT_INVALID_GLOBAL_WORK_SIZE" );
+        case CLFFT_INVALID_MIP_LEVEL:
+            return _T( "CLFFT_INVALID_MIP_LEVEL" );
+        case CLFFT_INVALID_BUFFER_SIZE:
+            return _T( "CLFFT_INVALID_BUFFER_SIZE" );
+        case CLFFT_INVALID_GL_OBJECT:
+            return _T( "CLFFT_INVALID_GL_OBJECT" );
+        case CLFFT_INVALID_OPERATION:
+            return _T( "CLFFT_INVALID_OPERATION" );
+        case CLFFT_INVALID_EVENT:
+            return _T( "CLFFT_INVALID_EVENT" );
+        case CLFFT_INVALID_EVENT_WAIT_LIST:
+            return _T( "CLFFT_INVALID_EVENT_WAIT_LIST" );
+        case CLFFT_INVALID_GLOBAL_OFFSET:
+            return _T( "CLFFT_INVALID_GLOBAL_OFFSET" );
+        case CLFFT_INVALID_WORK_ITEM_SIZE:
+            return _T( "CLFFT_INVALID_WORK_ITEM_SIZE" );
+        case CLFFT_INVALID_WORK_GROUP_SIZE:
+            return _T( "CLFFT_INVALID_WORK_GROUP_SIZE" );
+        case CLFFT_INVALID_WORK_DIMENSION:
+            return _T( "CLFFT_INVALID_WORK_DIMENSION" );
+        case CLFFT_INVALID_KERNEL_ARGS:
+            return _T( "CLFFT_INVALID_KERNEL_ARGS" );
+        case CLFFT_INVALID_ARG_SIZE:
+            return _T( "CLFFT_INVALID_ARG_SIZE" );
+        case CLFFT_INVALID_ARG_VALUE:
+            return _T( "CLFFT_INVALID_ARG_VALUE" );
+        case CLFFT_INVALID_ARG_INDEX:
+            return _T( "CLFFT_INVALID_ARG_INDEX" );
+        case CLFFT_INVALID_KERNEL:
+            return _T( "CLFFT_INVALID_KERNEL" );
+        case CLFFT_INVALID_KERNEL_DEFINITION:
+            return _T( "CLFFT_INVALID_KERNEL_DEFINITION" );
+        case CLFFT_INVALID_KERNEL_NAME:
+            return _T( "CLFFT_INVALID_KERNEL_NAME" );
+        case CLFFT_INVALID_PROGRAM_EXECUTABLE:
+            return _T( "CLFFT_INVALID_PROGRAM_EXECUTABLE" );
+        case CLFFT_INVALID_PROGRAM:
+            return _T( "CLFFT_INVALID_PROGRAM" );
+        case CLFFT_INVALID_BUILD_OPTIONS:
+            return _T( "CLFFT_INVALID_BUILD_OPTIONS" );
+        case CLFFT_INVALID_BINARY:
+            return _T( "CLFFT_INVALID_BINARY" );
+        case CLFFT_INVALID_SAMPLER:
+            return _T( "CLFFT_INVALID_SAMPLER" );
+        case CLFFT_INVALID_IMAGE_SIZE:
+            return _T( "CLFFT_INVALID_IMAGE_SIZE" );
+        case CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+            return _T( "CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR" );
+        case CLFFT_INVALID_MEM_OBJECT:
+            return _T( "CLFFT_INVALID_MEM_OBJECT" );
+        case CLFFT_INVALID_HOST_PTR:
+            return _T( "CLFFT_INVALID_HOST_PTR" );
+        case CLFFT_INVALID_COMMAND_QUEUE:
+            return _T( "CLFFT_INVALID_COMMAND_QUEUE" );
+        case CLFFT_INVALID_QUEUE_PROPERTIES:
+            return _T( "CLFFT_INVALID_QUEUE_PROPERTIES" );
+        case CLFFT_INVALID_CONTEXT:
+            return _T( "CLFFT_INVALID_CONTEXT" );
+        case CLFFT_INVALID_DEVICE:
+            return _T( "CLFFT_INVALID_DEVICE" );
+        case CLFFT_INVALID_PLATFORM:
+            return _T( "CLFFT_INVALID_PLATFORM" );
+        case CLFFT_INVALID_DEVICE_TYPE:
+            return _T( "CLFFT_INVALID_DEVICE_TYPE" );
+        case CLFFT_INVALID_VALUE:
+            return _T( "CLFFT_INVALID_VALUE" );
+        case CLFFT_MAP_FAILURE:
+            return _T( "CLFFT_MAP_FAILURE" );
+        case CLFFT_BUILD_PROGRAM_FAILURE:
+            return _T( "CLFFT_BUILD_PROGRAM_FAILURE" );
+        case CLFFT_IMAGE_FORMAT_NOT_SUPPORTED:
+            return _T( "CLFFT_IMAGE_FORMAT_NOT_SUPPORTED" );
+        case CLFFT_IMAGE_FORMAT_MISMATCH:
+            return _T( "CLFFT_IMAGE_FORMAT_MISMATCH" );
+        case CLFFT_MEM_COPY_OVERLAP:
+            return _T( "CLFFT_MEM_COPY_OVERLAP" );
+        case CLFFT_PROFILING_INFO_NOT_AVAILABLE:
+            return _T( "CLFFT_PROFILING_INFO_NOT_AVAILABLE" );
+        case CLFFT_OUT_OF_HOST_MEMORY:
+            return _T( "CLFFT_OUT_OF_HOST_MEMORY" );
+        case CLFFT_OUT_OF_RESOURCES:
+            return _T( "CLFFT_OUT_OF_RESOURCES" );
+        case CLFFT_MEM_OBJECT_ALLOCATION_FAILURE:
+            return _T( "CLFFT_MEM_OBJECT_ALLOCATION_FAILURE" );
+        case CLFFT_COMPILER_NOT_AVAILABLE:
+            return _T( "CLFFT_COMPILER_NOT_AVAILABLE" );
+        case CLFFT_DEVICE_NOT_AVAILABLE:
+            return _T( "CLFFT_DEVICE_NOT_AVAILABLE" );
+        case CLFFT_DEVICE_NOT_FOUND:
+            return _T( "CLFFT_DEVICE_NOT_FOUND" );
+        case CLFFT_SUCCESS:
+            return _T( "CLFFT_SUCCESS" );
+        case CLFFT_BUGCHECK:    // returned by the BUG_CHECK macro, so it needs a readable name too
+            return _T( "CLFFT_BUGCHECK" );
+        case CLFFT_NOTIMPLEMENTED:
+            return _T( "CLFFT_NOTIMPLEMENTED" );
+        case CLFFT_FILE_NOT_FOUND:
+            return _T( "CLFFT_FILE_NOT_FOUND" );
+        case CLFFT_FILE_CREATE_FAILURE:
+            return _T( "CLFFT_FILE_CREATE_FAILURE" );
+        case CLFFT_VERSION_MISMATCH:
+            return _T( "CLFFT_VERSION_MISMATCH" );
+        case CLFFT_INVALID_PLAN:
+            return _T( "CLFFT_INVALID_PLAN" );
+        default:
+            return _T( "Error code not defined" );
+    }
+}
+
+// This is used to either wrap an OpenCL function call, or to explicitly check a variable for an OpenCL error condition.
+// If an error occurs, we log it to terr and issue a return statement to exit the calling function with that status.
+#define OPENCL_V( fn, msg ) \
+{ \
+    clfftStatus vclStatus = static_cast< clfftStatus >( fn ); \
+    switch( vclStatus ) \
+    { \
+        case CL_SUCCESS: /**< No error */ \
+            break; \
+        default: \
+        { \
+            terr << _T( "OPENCL_V< " ); \
+            terr << clfftErrorStatusAsString( vclStatus ); \
+            terr << _T( " > (" )<< static_cast< unsigned >( __LINE__ ) << _T( "): " ); \
+            terr << msg << std::endl; \
+            return vclStatus; \
+        } \
+    } \
+}
+// True when u is a power of two, i.e. exactly one bit is set; 0 is not a power of two.
+static inline bool IsPo2 (size_t u) {
+    return !( (u == 0) || ((u & (u-1)) != 0) );
+}
+// Integer division of a by b rounded up (ceil( a / b ) for non-negative a and positive b).
+template< typename T >
+static inline T DivRoundingUp (T a, T b) {
+    return (a + (b-1)) / b;
+}
+// Index of the least significant set bit of n, obtained through BSF; n must be non-zero (asserted).
+static inline size_t BitScanF (size_t n) {
+    assert (0 != n);
+    unsigned long lowestSetBit = 0;
+    BSF (&lowestSetBit, n);
+    return static_cast< size_t >( lowestSetBit );
+}
+// Evaluates _proposition once, asserts on it, and when it is false returns CLFFT_INVALID_ARG_VALUE from the enclosing function.
+#define ARG_CHECK(_proposition) \
+{ bool btmp = (_proposition); assert (btmp); if (! btmp) return CLFFT_INVALID_ARG_VALUE; }
+// Same pattern as ARG_CHECK, but a failed proposition makes the enclosing function return CLFFT_BUGCHECK.
+#define BUG_CHECK(_proposition) \
+ { bool btmp = (_proposition); assert (btmp); if (! btmp) return CLFFT_BUGCHECK; }
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* NOTE(review): presumably reports through local_mem_size the local memory needed by the plan's kernels -- confirm against the implementation */
+CLFFTAPI clfftStatus clfftLocalMemSize( const clfftPlanHandle plHandle, cl_ulong* local_mem_size );
+
+/*! @brief Save to disk a file containing the contents of a baked plan.
+* @details A plan is a repository of state for calculating FFTs. Saves the details for a plan to allow the user
+* to easily recreate a plan and execute it without having to first build the kernel.
+* @param[in] plHandle Handle to the plan to be written to disk
+* @param[in] filename The desired name of the output file for the stored plan
+* @return Enum describing error condition; superset of OpenCL error codes
+*/
+CLFFTAPI clfftStatus clfftWritePlanToDisk( clfftPlanHandle plHandle, const char* filename );
+
+/*! @brief Read from disk a file containing the contents of a baked plan.
+* @details A plan is a repository of state for calculating FFTs. Reads the details for a plan from a file on disk and duplicates
+* the plan in the provided plan handle.
+* @param[out] plHandle Handle to the plan to be set to details from the file
+* @param[in] filename The name of the file containing the stored plan
+* @return Enum describing error condition; superset of OpenCL error codes
+*/
+CLFFTAPI clfftStatus clfftReadPlanFromDisk( clfftPlanHandle plHandle, const char* filename );
+
+/* internal api to set up some plan parameters; NOTE(review): the layout expected behind dataInternal is not visible here */
+CLFFTAPI clfftStatus clfftSetInternal( const clfftPlanHandle plHandle, void* dataInternal );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/library/repo.cpp b/src/library/repo.cpp
new file mode 100644
index 00000000..018dff5c
--- /dev/null
+++ b/src/library/repo.cpp
@@ -0,0 +1,320 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// repo.cpp : Implements FFTRepo, the repository that caches generated kernel sources, OpenCL programs/kernels and plans.
+//
+
+#include "stdafx.h"
+#include "repo.h"
+
+using std::map;
+using std::string;
+
+// Static initialization of the repo lock variable; every FFTRepo member function in this file takes this lock first
+lockRAII FFTRepo::lockRepo( _T( "FFTRepo" ) );
+
+// Static initialization of the plan count variable; createPlan hands out this value as the handle, so handles start at 1
+size_t FFTRepo::planCount = 1;
+
+// Handle/Address of the dynamic module that contains the timer, that we discover and load during runtime
+void* FFTRepo::timerHandle = NULL;
+GpuStatTimer* FFTRepo::pStatTimer = NULL;
+
+// Releases every kernel and program handle cached in the repo and frees all plans (with their locks)
+// that the client never destroyed, leaving the plan map empty. Always returns CLFFT_SUCCESS.
+clfftStatus FFTRepo::releaseResources( )
+{
+    scopedLock sLock( lockRepo, _T( "releaseResources" ) );
+
+    // Release all handles to Kernels
+    //
+    for(Kernel_iterator iKern = mapKernels.begin( ); iKern != mapKernels.end( ); ++iKern )
+    {
+        cl_kernel k = iKern->second.kernel_fwd;
+        iKern->second.kernel_fwd = NULL;
+        if (NULL != k)
+            clReleaseKernel( k );
+        k = iKern->second.kernel_back;
+        iKern->second.kernel_back = NULL;
+        if (NULL != k)
+            clReleaseKernel( k );
+    }
+    mapKernels.clear( );
+
+    // Release all handles to programs
+    //
+    for (fftRepo_iterator iProg = mapFFTs.begin( ); iProg != mapFFTs.end( ); ++iProg )
+    {
+        cl_program p = iProg->second.clProgram;
+        iProg->second.clProgram = NULL;
+        if (NULL != p)
+            clReleaseProgram (p);
+    }
+
+    // Free all memory allocated in the repoPlans; represents cached plans that were not destroyed by the client
+    // (delete on a null pointer is a no-op, so no explicit checks are required).
+    // NOTE(review): deletePlan( ) additionally calls clReleaseContext( ) on the plan's context before deleting it;
+    // this path does not -- confirm whether a context reference is leaked here.
+    for( repoPlansType::iterator iter = repoPlans.begin( ); iter != repoPlans.end( ); ++iter )
+    {
+        delete iter->second.first;   // the FFTPlan
+        delete iter->second.second;  // its lockRAII
+    }
+
+    // Forget the freed pointers, so that later getPlan( )/deletePlan( ) calls cannot hand out or free
+    // dangling plans and a second releaseResources( ) cannot delete them twice.
+    repoPlans.clear( );
+
+    // Reset the plan count to its initial value because we are guaranteed to have destroyed all plans
+    planCount = 1;
+
+    // Release all strings
+    mapFFTs.clear( );
+
+    return CLFFT_SUCCESS;
+}
+
+// Stores the generated kernel source for (gen, fftParam), prefixed with the AMD copyright banner.
+// An existing entry for the same key has its program string overwritten. Always returns CLFFT_SUCCESS.
+clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const std::string& kernel )
+{
+    scopedLock sLock( lockRepo, _T( "setProgramCode" ) );
+
+    // Banner layout: two newlines, the copyright comment, two newlines -- then the kernel text.
+    std::string source( "\n\n" );
+    source += "// Copyright (C) 2010-2013 Advanced Micro Devices, Inc. All Rights Reserved.";
+    source += "\n\n";
+    source += kernel;
+
+    mapFFTs[ std::make_pair( gen, fftParam ) ].ProgramString = source;
+
+    return CLFFT_SUCCESS;
+}
+
+// Copies the stored source for (gen, fftParam) into 'kernel'; CLFFT_FILE_NOT_FOUND (output untouched) if the key is unknown.
+clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, std::string& kernel )
+{
+    scopedLock sLock( lockRepo, _T( "getProgramCode" ) );
+
+    fftRepo_iterator entry = mapFFTs.find( std::make_pair( gen, fftParam ) );
+    if( mapFFTs.end( ) != entry )
+    {
+        kernel = entry->second.ProgramString;
+        return CLFFT_SUCCESS;
+    }
+    return CLFFT_FILE_NOT_FOUND;
+}
+
+// Records the forward and backward kernel entry-point names for (gen, fftParam). Always returns CLFFT_SUCCESS.
+clfftStatus FFTRepo::setProgramEntryPoints( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam,
+    const char * kernel_fwd, const char * kernel_back )
+{
+    scopedLock sLock( lockRepo, _T( "setProgramEntryPoints" ) );
+
+    // Indexing with [] means a key that is not stored yet gets an entry here.
+    fftRepoValue& entry = mapFFTs[ std::make_pair( gen, fftParam ) ];
+    entry.EntryPoint_fwd = kernel_fwd;
+    entry.EntryPoint_back = kernel_back;
+
+    return CLFFT_SUCCESS;
+}
+
+// Looks up the kernel entry-point name for the requested direction of the program stored under (gen, fftParam).
+// Returns CLFFT_FILE_NOT_FOUND when the key is unknown or the stored name is empty, and
+// CLFFT_INVALID_ARG_VALUE (after asserting) when 'dir' is neither CLFFT_FORWARD nor CLFFT_BACKWARD.
+clfftStatus FFTRepo::getProgramEntryPoint( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam,
+    clfftDirection dir, std::string& kernel )
+{
+    scopedLock sLock( lockRepo, _T( "getProgramEntryPoint" ) );
+
+    fftRepo_iterator entry = mapFFTs.find( std::make_pair( gen, fftParam ) );
+    if( mapFFTs.end( ) == entry )
+        return CLFFT_FILE_NOT_FOUND;
+
+    if( CLFFT_FORWARD == dir )
+    {
+        kernel = entry->second.EntryPoint_fwd;
+    }
+    else if( CLFFT_BACKWARD == dir )
+    {
+        kernel = entry->second.EntryPoint_back;
+    }
+    else
+    {
+        assert (false);
+        return CLFFT_INVALID_ARG_VALUE;
+    }
+
+    return kernel.empty( ) ? CLFFT_FILE_NOT_FOUND : CLFFT_SUCCESS;
+}
+
+// Stores 'prog' for (gen, fftParam); a program already held by the entry (unexpected, asserted) is released first.
+clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& prog )
+{
+    scopedLock sLock( lockRepo, _T( "setclProgram" ) );
+
+    const std::pair< clfftGenerators, FFTKernelGenKeyParams > key( gen, fftParam );
+    fftRepo_iterator entry = mapFFTs.find( key );
+    if( mapFFTs.end( ) != entry )
+    {
+        cl_program previous = entry->second.clProgram;
+        assert (NULL == previous);
+        if (NULL != previous)
+            clReleaseProgram (previous);
+        entry->second.clProgram = prog;
+    }
+    else
+        mapFFTs[ key ].clProgram = prog;  // first time this key is seen
+    return CLFFT_SUCCESS;
+}
+
+// Fetches the cl_program stored for (gen, fftParam) into 'prog'.
+// Returns CLFFT_INVALID_PROGRAM when the key is unknown or the stored program is NULL.
+clfftStatus FFTRepo::getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& prog )
+{
+    scopedLock sLock( lockRepo, _T( "getclProgram" ) );
+
+    fftRepo_iterator entry = mapFFTs.find( std::make_pair( gen, fftParam ) );
+    if( mapFFTs.end( ) == entry )
+        return CLFFT_INVALID_PROGRAM;
+
+    // The output is written even when the stored handle turns out to be NULL.
+    prog = entry->second.clProgram;
+
+    return ( NULL != prog ) ? CLFFT_SUCCESS : CLFFT_INVALID_PROGRAM;
+}
+
+// Stores 'kernel' as the forward or backward kernel of 'prog'; a kernel already in that slot (unexpected, asserted)
+// is released first. Returns CLFFT_INVALID_ARG_VALUE (after asserting) for an unknown direction, else CLFFT_SUCCESS.
+clfftStatus FFTRepo::setclKernel( cl_program prog, clfftDirection dir, const cl_kernel& kernel )
+{
+    scopedLock sLock( lockRepo, _T( "setclKernel" ) );
+
+    // Indexed before 'dir' is validated, exactly as the lookup order requires.
+    fftKernels & slots = mapKernels[ prog ];
+
+    cl_kernel * target = NULL;
+    if( CLFFT_FORWARD == dir )
+        target = & slots.kernel_fwd;
+    else if( CLFFT_BACKWARD == dir )
+        target = & slots.kernel_back;
+    if( NULL == target )
+    {
+        assert (false);
+        return CLFFT_INVALID_ARG_VALUE;
+    }
+
+    assert (NULL == *target);
+    if (NULL != *target)
+        clReleaseKernel( *target );
+    *target = kernel;
+
+    return CLFFT_SUCCESS;
+}
+
+// Retrieves the forward or backward kernel cached for 'prog' into 'kernel'.
+// Returns CLFFT_INVALID_KERNEL when 'prog' has no entry or the requested slot is NULL, and
+// CLFFT_INVALID_ARG_VALUE (after asserting) when 'dir' is not a known direction.
+clfftStatus FFTRepo::getclKernel( cl_program prog, clfftDirection dir, cl_kernel& kernel )
+{
+    scopedLock sLock( lockRepo, _T( "getclKernel" ) );
+
+    Kernel_iterator entry = mapKernels.find( prog );
+    if( mapKernels.end( ) == entry )
+        return CLFFT_INVALID_KERNEL;
+
+    // Pick the slot that matches the requested direction.
+    if( CLFFT_FORWARD == dir )
+        kernel = entry->second.kernel_fwd;
+    else if( CLFFT_BACKWARD == dir )
+        kernel = entry->second.kernel_back;
+    else
+    {
+        assert (false);
+        return CLFFT_INVALID_ARG_VALUE;
+    }
+
+    // The output has been written at this point; a NULL slot is reported as an invalid kernel.
+    return ( NULL != kernel ) ? CLFFT_SUCCESS : CLFFT_INVALID_KERNEL;
+}
+
+// Allocates a new FFTPlan together with its lock, registers both under the current plan count and
+// returns that count through 'plHandle' and the plan through 'fftPlan'. Always returns CLFFT_SUCCESS.
+clfftStatus FFTRepo::createPlan( clfftPlanHandle* plHandle, FFTPlan*& fftPlan )
+{
+    scopedLock sLock( lockRepo, _T( "insertPlan" ) );
+
+    // Both objects are tracked in repoPlans from here on: they are freed by deletePlan( ) or,
+    // for plans the client never destroys, by releaseResources( ).
+    fftPlan = new FFTPlan;
+    lockRAII* planLock = new lockRAII;
+
+    // The current plan count is this plan's unique identifier.
+    const size_t handle = planCount;
+    repoPlans[ handle ] = std::make_pair( fftPlan, planLock );
+    // Hand the identifier to the client, then advance the counter for the next plan.
+    *plHandle = handle;
+    ++planCount;
+
+    return CLFFT_SUCCESS;
+}
+
+// Resolves a client plan handle to its FFTPlan and the lock stored alongside it.
+// Returns CLFFT_INVALID_PLAN (outputs untouched) when the handle is not registered in the repo.
+clfftStatus FFTRepo::getPlan( clfftPlanHandle plHandle, FFTPlan*& fftPlan, lockRAII*& planLock )
+{
+    scopedLock sLock( lockRepo, _T( "getPlan" ) );
+
+    repoPlansType::iterator entry = repoPlans.find( plHandle );
+    if( repoPlans.end( ) != entry )
+    {
+        fftPlan = entry->second.first;
+        planLock = entry->second.second;
+        return CLFFT_SUCCESS;
+    }
+    return CLFFT_INVALID_PLAN;
+}
+
+// Destroys the plan identified by *plHandle: calls clReleaseContext on the plan's context, deletes the
+// plan and its lock, removes the repo entry and zeroes the client's handle.
+// Returns CLFFT_INVALID_PLAN when the handle is not registered in the repo.
+clfftStatus FFTRepo::deletePlan( clfftPlanHandle* plHandle )
+{
+    scopedLock sLock( lockRepo, _T( "deletePlan" ) );
+
+    repoPlansType::iterator entry = repoPlans.find( *plHandle );
+    if( repoPlans.end( ) == entry )
+        return CLFFT_INVALID_PLAN;
+
+    FFTPlan* const doomedPlan = entry->second.first;
+    lockRAII* const doomedLock = entry->second.second;
+
+    // We lock the plan object while we are in the process of deleting it
+    {
+        scopedLock planGuard( *doomedLock, _T( "clfftDestroyPlan" ) );
+        clReleaseContext( doomedPlan->context );
+        delete doomedPlan;
+    }
+
+    delete doomedLock;  // the guard above has gone out of scope, so the lock itself can be freed
+
+    // Remove the entry from our map object, then clear the client's handle to signify that the plan is gone
+    repoPlans.erase( entry );
+    *plHandle = 0;
+
+    return CLFFT_SUCCESS;
+}
diff --git a/src/library/repo.h b/src/library/repo.h
new file mode 100644
index 00000000..f68242de
--- /dev/null
+++ b/src/library/repo.h
@@ -0,0 +1,158 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( CLFFT_repo_H )
+#define CLFFT_repo_H
+#include