Kernels.inl
上传用户:chinafayin
上传日期:2022-04-05
资源大小:153k
文件大小:13k
源码类别:

并行计算

开发平台:

Visual C++

  1. /*
  2.     FastGrid (formerly AutoGrid)
  3.     Copyright (C) 2009 The Scripps Research Institute. All rights reserved.
  4.     Copyright (C) 2009 Masaryk University. All rights reserved.
  5.     AutoGrid is a Trade Mark of The Scripps Research Institute.
  6.     This program is free software; you can redistribute it and/or
  7.     modify it under the terms of the GNU General Public License
  8.     as published by the Free Software Foundation; either version 2
  9.     of the License, or (at your option) any later version.
  10.     This program is distributed in the hope that it will be useful,
  11.     but WITHOUT ANY WARRANTY; without even the implied warranty of
  12.     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13.     GNU General Public License for more details.
  14.     You should have received a copy of the GNU General Public License
  15.     along with this program; if not, write to the Free Software
  16.     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  17. */
  18. #include "Interface.h"
  19. #define AG_CALLCONV __device__
  20. #include "../../autogrid.h"
  21. #include <cstdio>
  22. // Static parameters
  23. static texture<float, 1, cudaReadModeElementType> epsilonTexture;
  24. static __constant__ int3 numGridPoints, numGridPointsDiv2;
  25. static __constant__ float gridSpacing;
  26. static __constant__ float *deviceEnergies;
  27. #if defined(USE_DDD_CONSTMEM)
  28. static __constant__ float epsilon[MAX_DIST];
  29. #else
  30. static __constant__ float *epsilon;
  31. #endif
  32. // Atoms
  33. static __constant__ int numAtoms;
  34. static __constant__ float4 atoms[STD_NUM_ATOMS_PER_KERNEL]; // {x, y, z, charge}
  35. // Per-slice parameters
  36. static __constant__ int zIndex; // index of slice in the Z direction
  37. // Contains a reciprocal square root and its parameter
  38. struct RsqrtDesc
  39. {
  40.     float x;
  41.     float result; // = rsqrt(x)
  42.     // notice that: sqrt(x) = x/sqrt(x) = x*result
  43.     // that is, we get sqrt with just rsqrt+mul
  44. };
  45. static inline __device__ RsqrtDesc myRsqrtf(float x)
  46. {
  47.     RsqrtDesc i;
  48.     i.x = x;
  49.     i.result = rsqrt(i.x);
  50.     return i;
  51. }
  52. // Forward declarations
  53. template<int DielectricKind>
  54. static inline __device__ float dielectric(RsqrtDesc rsqrt);
  55. template<int DielectricKind>
  56. static inline __device__ float distanceDependentDiel(RsqrtDesc rsqrt);
  57. // Get dielectric using the lookup table
  58. static inline __device__ float lookupEpsilonTable(RsqrtDesc rsqrt)
  59. {
  60.     return epsilon[min(int(A_DIVISOR * rsqrt.x * rsqrt.result), MAX_DIST-1)];
  61. }
  62. // Distance dependent dielectric - global memory
  63. template<>
  64. static inline __device__ float distanceDependentDiel<DistanceDependentDiel_GlobalMem>(RsqrtDesc rsqrt)
  65. {
  66.     return lookupEpsilonTable(rsqrt);
  67. }
  68. // Distance dependent dielectric - constant memory
  69. template<>
  70. static inline __device__ float distanceDependentDiel<DistanceDependentDiel_ConstMem>(RsqrtDesc rsqrt)
  71. {
  72.     return lookupEpsilonTable(rsqrt);
  73. }
  74. // Distance dependent dielectric - texture memory
  75. template<>
  76. static inline __device__ float distanceDependentDiel<DistanceDependentDiel_TextureMem>(RsqrtDesc rsqrt)
  77. {
  78.     return tex1D(epsilonTexture, (float(A_DIVISOR) / (MAX_DIST-1)) * rsqrt.x * rsqrt.result);
  79. }
  80. // Distance dependent dielectric - in-place calculation
  81. template<>
  82. static inline __device__ float distanceDependentDiel<DistanceDependentDiel_InPlace>(RsqrtDesc rsqrt)
  83. {
  84.     return calculateDistDepDielInv<float>(rsqrt.x * rsqrt.result);
  85. }
  86. // Constant dielectric
  87. template<>
  88. static inline __device__ float dielectric<ConstantDiel>(RsqrtDesc rsqrt)
  89. {
  90.     return fminf(rsqrt.result, 2.f);
  91. }
  92. // Distance-dependent dielectric - main function
  93. template<int DielectricKind>
  94. static inline __device__ float dielectric(RsqrtDesc rsqrt)
  95. {
  96.     float ddd = distanceDependentDiel<DielectricKind>(rsqrt);
  97.     return dielectric<ConstantDiel>(rsqrt) * ddd;
  98. }
  99. // Initializes gridPos and outputIndex based on the thread ID
  100. static inline __device__ void initialize(bool unrolling, int calcGranularity, float3 &gridPos, int &outputIndex)
  101. {
  102.     int x, y, z;
  103.     if (calcGranularity == CalcEntireGrid)
  104.     {
  105.         int gridDimZ = numGridPoints.z;
  106.         if (unrolling) gridDimZ /= 8;
  107.         x = (blockIdx.x / gridDimZ) * blockDim.x + threadIdx.x;
  108.         y = blockIdx.y * blockDim.y + threadIdx.y;
  109.         z = blockIdx.x % gridDimZ;
  110.         if (unrolling) z *= 8;
  111.     }
  112.     else
  113.     {
  114.         x = blockIdx.x * blockDim.x + threadIdx.x;
  115.         y = blockIdx.y * blockDim.y + threadIdx.y;
  116.         z = zIndex;
  117.     }
  118.     gridPos.x = (x - numGridPointsDiv2.x) * gridSpacing;
  119.     gridPos.y = (y - numGridPointsDiv2.y) * gridSpacing;
  120.     gridPos.z = (z - numGridPointsDiv2.z) * gridSpacing;
  121.     outputIndex = z * numGridPoints.y * numGridPoints.x + y * numGridPoints.x + x;
  122. }
  123. // The basic kernel, no loop unrolling
  124. template<int DielectricKind, int CalcGranularity>
  125. static __global__ void calc_1Point()
  126. {
  127.     float3 gridPos;
  128.     int outputIndex;
  129.     initialize(false, CalcGranularity, gridPos, outputIndex);
  130.     float energy = 0;
  131.     //  Do all Receptor (protein, DNA, etc.) atoms...
  132.     for (int ia = 0; ia < numAtoms; ia++)
  133.     {
  134.         float dx = gridPos.x - atoms[ia].x;
  135.         float dy = gridPos.y - atoms[ia].y;
  136.         float dz = gridPos.z - atoms[ia].z;
  137.         // The estat forcefield coefficient/weight is premultiplied in .w
  138.         energy += atoms[ia].w * dielectric<DielectricKind>(myRsqrtf(dx*dx + dy*dy + dz*dz));
  139.     }
  140.     deviceEnergies[outputIndex] += energy;
  141. }
  142. // The kernel where the loop over grid points is unrolled by 8
  143. template<int DielectricKind, int CalcGranularity>
  144. static __global__ void calc_8Points()
  145. {
  146.     float3 gridPos;
  147.     int outputIndex;
  148.     initialize(true, CalcGranularity, gridPos, outputIndex);
  149.     float energy0 = 0, energy1 = 0, energy2 = 0, energy3 = 0,
  150.           energy4 = 0, energy5 = 0, energy6 = 0, energy7 = 0;
  151.     //  Do all Receptor (protein, DNA, etc.) atoms...
  152.     for (int ia = 0; ia < numAtoms; ia++)
  153.     {
  154.         float dx = gridPos.x - atoms[ia].x;
  155.         float dy = gridPos.y - atoms[ia].y;
  156.         float dxdySq = dx*dx + dy*dy;
  157.         float dz0 = gridPos.z - atoms[ia].z;
  158.         float dz1 = dz0 + gridSpacing;
  159.         float dz2 = dz1 + gridSpacing;
  160.         float dz3 = dz2 + gridSpacing;
  161.         float dz4 = dz3 + gridSpacing;
  162.         float dz5 = dz4 + gridSpacing;
  163.         float dz6 = dz5 + gridSpacing;
  164.         float dz7 = dz6 + gridSpacing;
  165.         // The estat forcefield coefficient/weight is premultiplied in .w
  166.         energy0 += atoms[ia].w * dielectric<DielectricKind>(myRsqrtf(dxdySq + dz0*dz0));
  167.         energy1 += atoms[ia].w * dielectric<DielectricKind>(myRsqrtf(dxdySq + dz1*dz1));
  168.         energy2 += atoms[ia].w * dielectric<DielectricKind>(myRsqrtf(dxdySq + dz2*dz2));
  169.         energy3 += atoms[ia].w * dielectric<DielectricKind>(myRsqrtf(dxdySq + dz3*dz3));
  170.         energy4 += atoms[ia].w * dielectric<DielectricKind>(myRsqrtf(dxdySq + dz4*dz4));
  171.         energy5 += atoms[ia].w * dielectric<DielectricKind>(myRsqrtf(dxdySq + dz5*dz5));
  172.         energy6 += atoms[ia].w * dielectric<DielectricKind>(myRsqrtf(dxdySq + dz6*dz6));
  173.         energy7 += atoms[ia].w * dielectric<DielectricKind>(myRsqrtf(dxdySq + dz7*dz7));
  174.     }
  175.     int numGridPointsXMulY = numGridPoints.y * numGridPoints.x;
  176.     deviceEnergies[outputIndex] += energy0; outputIndex += numGridPointsXMulY;
  177.     deviceEnergies[outputIndex] += energy1; outputIndex += numGridPointsXMulY;
  178.     deviceEnergies[outputIndex] += energy2; outputIndex += numGridPointsXMulY;
  179.     deviceEnergies[outputIndex] += energy3; outputIndex += numGridPointsXMulY;
  180.     deviceEnergies[outputIndex] += energy4; outputIndex += numGridPointsXMulY;
  181.     deviceEnergies[outputIndex] += energy5; outputIndex += numGridPointsXMulY;
  182.     deviceEnergies[outputIndex] += energy6; outputIndex += numGridPointsXMulY;
  183.     deviceEnergies[outputIndex] += energy7;
  184. }
  185. void stdSetDistDepDielTexture(const cudaArray *ptr, const cudaChannelFormatDesc *desc)
  186. {
  187.     epsilonTexture.normalized = true;
  188.     epsilonTexture.filterMode = cudaFilterModePoint;
  189.     epsilonTexture.addressMode[0] = cudaAddressModeClamp;
  190.     CUDA_SAFE_CALL(cudaBindTextureToArray(&epsilonTexture, ptr, desc));
  191. }
  192. void stdSetDistDepDielLookUpTable(float **devicePtr, cudaStream_t stream)
  193. {
  194. #if defined(USE_DDD_CONSTMEM)
  195.     CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(::epsilon, *devicePtr, sizeof(float) * MAX_DIST, 0, cudaMemcpyDeviceToDevice, stream));
  196. #else
  197.     CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(::epsilon, devicePtr,  sizeof(float*),           0, cudaMemcpyHostToDevice,   stream));
  198. #endif
  199. }
  200. void stdSetGridMap(const int3 *numGridPoints, const int3 *numGridPointsDiv2,
  201.                    const float *gridSpacing, float **deviceEnergies, cudaStream_t stream)
  202. {
  203.     CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(::numGridPoints,        numGridPoints,        sizeof(int3),   0, cudaMemcpyHostToDevice, stream));
  204.     CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(::numGridPointsDiv2,    numGridPointsDiv2,    sizeof(int3),   0, cudaMemcpyHostToDevice, stream));
  205.     CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(::gridSpacing,          gridSpacing,          sizeof(float),  0, cudaMemcpyHostToDevice, stream));
  206.     CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(::deviceEnergies,       deviceEnergies,       sizeof(float*), 0, cudaMemcpyHostToDevice, stream));
  207. }
  208. void stdSetSlice(const int *zIndex, cudaStream_t stream)
  209. {
  210.     CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(::zIndex, zIndex, sizeof(int), 0, cudaMemcpyHostToDevice, stream));
  211. }
  212. void stdSetAtoms(const int *numAtoms, const float4 *atoms, cudaStream_t stream)
  213. {
  214.     CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(::atoms,    atoms,    sizeof(float4) * *numAtoms, 0, cudaMemcpyHostToDevice, stream));
  215.     CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(::numAtoms, numAtoms, sizeof(int),                0, cudaMemcpyHostToDevice, stream));
  216. }
  217. void stdCallKernel(CudaKernelProc kernel, const dim3 &grid, const dim3 &block, cudaStream_t stream)
  218. {
  219.     CUDA_SAFE_KERNEL((kernel<<<grid, block, stream>>>()));
  220. }
  221. CudaKernelProc stdGetKernelProc(bool distDepDiel, DielectricKind dddKind, bool calcSlicesSeparately, bool unrollLoop)
  222. {
  223. #if defined(USE_DDD_CONSTMEM)
  224.     if (calcSlicesSeparately)
  225.         if (unrollLoop)
  226.         {
  227.             if (distDepDiel)
  228.                 switch (dddKind)
  229.                 {
  230.                 case DistanceDependentDiel_ConstMem:
  231.                     return calc_8Points<DistanceDependentDiel_ConstMem, CalcOneSlice>;
  232.                 }
  233.         }
  234.         else
  235.         {
  236.             if (distDepDiel)
  237.                 switch (dddKind)
  238.                 {
  239.                 case DistanceDependentDiel_ConstMem:
  240.                     return calc_1Point<DistanceDependentDiel_ConstMem, CalcOneSlice>;
  241.                 }
  242.         }
  243.     else
  244.         if (unrollLoop)
  245.         {
  246.             if (distDepDiel)
  247.                 switch (dddKind)
  248.                 {
  249.                 case DistanceDependentDiel_ConstMem:
  250.                     return calc_8Points<DistanceDependentDiel_ConstMem, CalcEntireGrid>;
  251.                 }
  252.         }
  253.         else
  254.         {
  255.             if (distDepDiel)
  256.                 switch (dddKind)
  257.                 {
  258.                 case DistanceDependentDiel_ConstMem:
  259.                     return calc_1Point<DistanceDependentDiel_ConstMem, CalcEntireGrid>;
  260.                 }
  261.         }
  262. #else
  263.     if (calcSlicesSeparately)
  264.         if (unrollLoop)
  265.         {
  266.             if (distDepDiel)
  267.                 switch (dddKind)
  268.                 {
  269.                 case DistanceDependentDiel_GlobalMem:
  270.                     return calc_8Points<DistanceDependentDiel_GlobalMem, CalcOneSlice>;
  271.                 case DistanceDependentDiel_TextureMem:
  272.                     return calc_8Points<DistanceDependentDiel_TextureMem, CalcOneSlice>;
  273.                 case DistanceDependentDiel_InPlace:
  274.                     return calc_8Points<DistanceDependentDiel_InPlace, CalcOneSlice>;
  275.                 }
  276.             else
  277.                 return calc_8Points<ConstantDiel, CalcOneSlice>;
  278.         }
  279.         else
  280.         {
  281.             if (distDepDiel)
  282.                 switch (dddKind)
  283.                 {
  284.                 case DistanceDependentDiel_GlobalMem:
  285.                     return calc_1Point<DistanceDependentDiel_GlobalMem, CalcOneSlice>;
  286.                 case DistanceDependentDiel_TextureMem:
  287.                     return calc_1Point<DistanceDependentDiel_TextureMem, CalcOneSlice>;
  288.                 case DistanceDependentDiel_InPlace:
  289.                     return calc_1Point<DistanceDependentDiel_InPlace, CalcOneSlice>;
  290.                 }
  291.             else
  292.                 return calc_1Point<ConstantDiel, CalcOneSlice>;
  293.         }
  294.     else
  295.         if (unrollLoop)
  296.         {
  297.             if (distDepDiel)
  298.                 switch (dddKind)
  299.                 {
  300.                 case DistanceDependentDiel_GlobalMem:
  301.                     return calc_8Points<DistanceDependentDiel_GlobalMem, CalcEntireGrid>;
  302.                 case DistanceDependentDiel_TextureMem:
  303.                     return calc_8Points<DistanceDependentDiel_TextureMem, CalcEntireGrid>;
  304.                 case DistanceDependentDiel_InPlace:
  305.                     return calc_8Points<DistanceDependentDiel_InPlace, CalcEntireGrid>;
  306.                 }
  307.             else
  308.                 return calc_8Points<ConstantDiel, CalcEntireGrid>;
  309.         }
  310.         else
  311.         {
  312.             if (distDepDiel)
  313.                 switch (dddKind)
  314.                 {
  315.                 case DistanceDependentDiel_GlobalMem:
  316.                     return calc_1Point<DistanceDependentDiel_GlobalMem, CalcEntireGrid>;
  317.                 case DistanceDependentDiel_TextureMem:
  318.                     return calc_1Point<DistanceDependentDiel_TextureMem, CalcEntireGrid>;
  319.                 case DistanceDependentDiel_InPlace:
  320.                     return calc_1Point<DistanceDependentDiel_InPlace, CalcEntireGrid>;
  321.                 }
  322.             else
  323.                 return calc_1Point<ConstantDiel, CalcEntireGrid>;
  324.         }
  325. #endif
  326.     return 0;
  327. }