Cuda.cpp
Uploaded by: chinafayin
Upload date: 2022-04-05
Package size: 153k
File size: 10k
Source category: Parallel Computing
Development platform: Visual C++
/*
    FastGrid (formerly AutoGrid)

    Copyright (C) 2009 The Scripps Research Institute. All rights reserved.
    Copyright (C) 2009 Masaryk University. All rights reserved.

    AutoGrid is a Trade Mark of The Scripps Research Institute.

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation; either version 2
    of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
*/
#if defined(AG_CUDA)

#include <algorithm>
#include <cstring>
#include "Electrostatics.h"
#include "CudaEvents.h"
#include "CudaGridMap.h"
#include "CudaFloatTexture1D.h"
#include "CudaConstantMemory.h"
#include "../Exceptions.h"
#include "../openthreads/Thread"

///////////////////////////////////////////////////////////////////////////////////////////////////

typedef void (*SetupThreadBlocksProc)(int numThreads, int numGridPointsPerThread, const Vec3i &numGridPoints,
                                      Vec3i *numGridPointsPadded, dim3 *dimGrid, dim3 *dimBlock);

static void setupSliceThreadBlocks(int numThreads, int numGridPointsPerThread, const Vec3i &numGridPoints,
                                   Vec3i *numGridPointsPadded, dim3 *dimGrid, dim3 *dimBlock)
{
    if (numThreads % 16 != 0)
        throw ExitProgram(0xbad);

    // One slice at a time
    dimBlock->x = 16;
    dimBlock->y = numThreads / dimBlock->x;

    // Pad/align the grid to a size of the grid block
    numGridPointsPadded->x = align(numGridPoints.x, dimBlock->x);
    numGridPointsPadded->y = align(numGridPoints.y, dimBlock->y);
    numGridPointsPadded->z = align(numGridPoints.z, numGridPointsPerThread);
    dimGrid->x = numGridPointsPadded->x / dimBlock->x;
    dimGrid->y = numGridPointsPadded->y / dimBlock->y;
}

static void setupGridThreadBlocks(int numThreads, int numGridPointsPerThread, const Vec3i &numGridPoints,
                                  Vec3i *numGridPointsPadded, dim3 *dimGrid, dim3 *dimBlock)
{
    setupSliceThreadBlocks(numThreads, numGridPointsPerThread, numGridPoints, numGridPointsPadded, dimGrid, dimBlock);
    dimGrid->x *= numGridPointsPadded->z / numGridPointsPerThread;
}
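// A worked example of the padding arithmetic above (hypothetical grid dimensions; this
// assumes align(n, m) rounds n up to the nearest multiple of m, as the comments suggest):
// for numThreads = 128 and a 61 x 61 x 61 grid, dimBlock becomes (16, 8). With
// numGridPointsPerThread = 8, numGridPointsPadded becomes (64, 64, 64), so
// setupSliceThreadBlocks yields dimGrid = (4, 8) for one Z slice, and setupGridThreadBlocks
// multiplies dimGrid.x by 64 / 8 to cover all slices in a single launch, giving
// dimGrid = (32, 8).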
static void callKernel(CudaConstantMemory &constMem, int atomSubsetIndex, CudaKernelProc kernelProc,
                       const dim3 &dimGrid, const dim3 &dimBlock, cudaStream_t stream,
                       CudaInternalAPI &api)
{
    constMem.setAtomConstMem(atomSubsetIndex); // Move atoms to the constant memory

    // Calculate the entire grid for the given subset of atoms
    api.callKernel(kernelProc, dimGrid, dimBlock, stream);
}

static void calculateElectrostaticMapCUDA(const InputData *input, const ProgramParameters *programParams, GridMapList *gridmaps)
{
    // Set the setup function based on granularity
    SetupThreadBlocksProc setupThreadBlocks =
        programParams->calcSlicesSeparatelyCUDA() ? setupSliceThreadBlocks : setupGridThreadBlocks;

    // Determine a number of threads per block and a number of grid points calculated in each thread
    bool unroll = programParams->unrollLoopCUDA() != False; // Assume Unassigned == True
    int numThreads = 128;
    int numGridPointsPerThread = unroll ? 8 : 1;

    // Calculate the size of the padded grid and determine dimensions of thread blocks and the overall thread grid
    Vec3i numGridPointsPadded;
    dim3 dimGrid, dimBlock;
    setupThreadBlocks(numThreads, numGridPointsPerThread, input->numGridPoints,
                      &numGridPointsPadded, &dimGrid, &dimBlock);
    if (programParams->benchmarkEnabled())
        fprintf(stderr, "CUDA: gridmap = (%i, %i, %i), dimBlock = (%i, %i), dimGrid = (%i, %i)\n",
                numGridPointsPadded.x, numGridPointsPadded.y, numGridPointsPadded.z,
                dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y);
    // Knowing the size of the padded grid, we can examine how much performance we lose by padding
    if (programParams->unrollLoopCUDA() == Unassigned)
    {
        double z = input->numGridPointsPerMap;
        double zPadded = numGridPointsPadded.z;
        if (z*1.5 < zPadded)
        {
            // Disable unrolling
            unroll = false;
            numGridPointsPerThread = 1;
            setupThreadBlocks(numThreads, numGridPointsPerThread, input->numGridPoints,
                              &numGridPointsPadded, &dimGrid, &dimBlock);
        }
    }

    // Determine which DDD algorithm to use
    DielectricKind dddKind = programParams->getDDDKindCUDA();
    if (dddKind == Diel_Unassigned)
    {
        double max = Mathd::Max(numGridPointsPadded.x, Mathd::Max(numGridPointsPadded.y, numGridPointsPadded.z));
        if (max > 160)
            dddKind = DistanceDependentDiel_TextureMem;
        else
            dddKind = DistanceDependentDiel_InPlace;
    }

    // Get the CUDA internal API that is used to set variables stored in constant memory, to set up samplers, and to call kernels
    CudaInternalAPI api;
    getCudaInternalAPI(dddKind, api);

    cudaStream_t stream;
    CUDA_SAFE_CALL(cudaSetDevice(programParams->getDeviceIDCUDA()));
    CUDA_SAFE_CALL(cudaStreamCreate(&stream));

    CudaEvents events(programParams->benchmarkEnabled(), stream);
    events.recordInitialization();

    // Create a padded gridmap on the GPU
    CudaGridMap grid(input->numGridPoints, numGridPointsPadded, gridmaps->getElectrostaticMap().energies, stream);

    // This class makes the use of constant memory easier
    CudaConstantMemory constMem(stream, &api);
    constMem.setGridMapParameters(input->numGridPointsDiv2, input->gridSpacing,
                                  numGridPointsPadded, grid.getEnergiesDevicePtr());

    CudaFloatTexture1D *texture = 0;
    if (input->distDepDiel)
    {
        // Create a texture for distance-dependent dielectric if needed
        if (dddKind == DistanceDependentDiel_TextureMem)
            texture = new CudaFloatTexture1D(MAX_DIST, input->epsilon, BindToKernel, stream, &api);
        // Initialize global or constant memory for distance-dependent dielectric if needed
        else if (dddKind == DistanceDependentDiel_GlobalMem ||
                 dddKind == DistanceDependentDiel_ConstMem)
            // By default, the table is put into constant memory;
            // however, the internal API may decide to put it into global memory instead
            constMem.initDistDepDielLookUpTable(input->epsilon);
    }

    // Get a CUDA kernel function according to parameters
    CudaKernelProc kernelProc = api.getKernelProc(input->distDepDiel, dddKind,
                                                  programParams->calcSlicesSeparatelyCUDA(), unroll);

    // Initialize storage for atoms in page-locked system memory
    constMem.initAtoms(input->receptorAtom, input->numReceptorAtoms, programParams->calcSlicesSeparatelyCUDA());
    int numAtomSubsets = constMem.getNumAtomSubsets();

    events.recordStartCalculation();

    // Do the gridmap calculation...
    if (programParams->calcSlicesSeparatelyCUDA())
    {
        int step = unroll ? 8 : 1;

        // For each Z slice
        for (int z = 0; z < input->numGridPoints.z; z += step)
        {
            constMem.setZSlice(z);
            for (int i = 0; i < numAtomSubsets; i++)
                callKernel(constMem, i, kernelProc, dimGrid, dimBlock, stream, api);
        }
    }
    else // !calcSlicesSeparately
        for (int i = 0; i < numAtomSubsets; i++)
            callKernel(constMem, i, kernelProc, dimGrid, dimBlock, stream, api);

    // Finalize
    events.recordEndCalculation();
    grid.copyFromDeviceToHost();
    events.recordFinalization();

    CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
    events.printTimes(input, numGridPointsPadded.Cube());
    grid.readFromHost(gridmaps->getElectrostaticMap().energies);

    // And save the gridmap
    gridmaps->saveElectrostaticMap();

    delete texture;
    CUDA_SAFE_CALL(cudaStreamDestroy(stream));
}
///////////////////////////////////////////////////////////////////////////////////////////////////

void calculateElectrostaticMapCPU(const InputData *input, const ProgramParameters &programParams, GridMapList &gridmaps);

class CudaThread : public OpenThreads::Thread
{
public:
    CudaThread(const InputData *input, const ProgramParameters *programParams, GridMapList *gridmaps)
        : input(input), programParams(programParams), gridmaps(gridmaps) {}

    virtual void run()
    {
        calculateElectrostaticMapCUDA(input, programParams, gridmaps);
    }

private:
    const InputData *input;
    const ProgramParameters *programParams;
    GridMapList *gridmaps;
};

void *calculateElectrostaticMapAsync(const InputData *input, const ProgramParameters &programParams, GridMapList &gridmaps)
{
    if (programParams.useCUDA())
        if (programParams.useCUDAThread())
        {
            // Create and start the thread
            CudaThread *thread = new CudaThread(input, &programParams, &gridmaps);
            thread->start();
            return thread;
        }
        else
            calculateElectrostaticMapCUDA(input, &programParams, &gridmaps);
    else
        calculateElectrostaticMapCPU(input, programParams, gridmaps);
    return 0;
}
void synchronizeCalculation(void *handle)
{
    if (handle)
    {
        CudaThread *thread = (CudaThread*)handle;

        // Wait until the thread terminates
        thread->join();
        delete thread;
    }
}
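// Typical call sequence for the two entry points above (a minimal sketch; construction of
// InputData, ProgramParameters, and GridMapList happens elsewhere and is assumed here):
//
//     void *handle = calculateElectrostaticMapAsync(input, programParams, gridmaps);
//     // ... other work may overlap with the GPU calculation when the CUDA thread is used ...
//     synchronizeCalculation(handle);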
#endif