大规模计算

热传导方程的数值求解

Images

from __future__ import print_function
import numpy as np
import time

import matplotlib

matplotlib.use('Agg')
import matplotlib.pyplot as plt

# 设置色图
# plt.rcParams['image.cmap'] = 'BrBG'
plt.rcParams['image.cmap'] = 'jet'  # 可以使用其他的色图，如：colourMap = plt.cm.coolwarm
plt.figure(dpi=300)

# 设置颜色插值和色图
colorinterpolation = 100
colourMap = plt.cm.jet

# 基本参数
a = 0.1  # 扩散系数
timesteps = 10000  # 演化系统的时间步数
image_interval = 1000  # 保存png文件的频率

# 网格间距
dx = 0.01
dy = 0.01
dx2 = dx ** 2
dy2 = dy ** 2

# 时间步长的最大间隔（保持稳定性）
dt = dx2 * dy2 / (2 * a * (dx2 + dy2))

# 设置维度
lenX = lenY = 400  # 设置为矩形

# 设置网格
X, Y = np.meshgrid(np.arange(0, lenX), np.arange(0, lenY))

# 边界条件
Ttop = 100
Tbottom = 0
Tleft = 0
Tright = 30

# 内部网格的初始猜测
Tguess = 0


def init_fields():
    # 设置数组大小并用Tguess设置内部值
    field = np.empty((lenX, lenY))
    field.fill(Tguess)

    # 设置边界条件
    field[(lenY - 1):, :] = Ttop
    field[:1, :] = Tbottom
    field[:, (lenX - 1):] = Tright
    field[:, :1] = Tleft

    print("size is ", field.size)
    print(field, "\n")

    field0 = field.copy()  # 用于存储上一个时间步的温度场的数组
    return field, field0


def evolve(u, u_previous, a, dt, dx2, dy2):
    	"""
    	显式时间演化
       u:            新的温度场
       u_previous:   上一个时间步的场
       a:            扩散系数
       dt:           时间步长
       dx2:          网格间距的平方，即dx^2
      	 """

    u[1:-1, 1:-1] = u_previous[1:-1, 1:-1] + a * dt * (
            (u_previous[2:, 1:-1] - 2 * u_previous[1:-1, 1:-1] +
             u_previous[:-2, 1:-1]) / dx2 +
            (u_previous[1:-1, 2:] - 2 * u_previous[1:-1, 1:-1] +
             u_previous[1:-1, :-2]) / dy2)
    u_previous[:] = u[:]


def write_field(field, step):
    # plt.gca().clear()
    plt.cla()
    plt.clf()

    # 配置等高线图
    plt.title("Temperature Contour")
    plt.contourf(X, Y, field, colorinterpolation, cmap=colourMap)
    # 设置色条
    plt.colorbar()
    plt.axis('on')
    plt.savefig('heat_{0:03d}.png'.format(step))


def main():
    field, field0 = init_fields()
    write_field(field, 0)

    for m in range(1, timesteps + 1):
        evolve(field, field0, a, dt, dx2, dy2)
        if m % image_interval == 0:
            write_field(field, m)


if __name__ == '__main__':
    main()

Animation

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation

SAVE_ANIMATION = True  # 是否保存动画，True为保存，False为显示动画

# 设置颜色映射
plt.rcParams['image.cmap'] = 'jet'

# 基本参数
a = 0.1  # 扩散系数
timesteps = 1000  # 演化系统的时间步数
image_interval = 1000  # 生成图像文件的间隔

# 网格间距
dx = 0.01
dy = 0.01
dx2 = dx ** 2
dy2 = dy ** 2

# 为了稳定性，计算时间步长
dt = dx2 * dy2 / (2 * a * (dx2 + dy2))

# 设置网格维度
lenX = lenY = 40

# 创建网格
X, Y = np.meshgrid(np.arange(0, lenX), np.arange(0, lenY))

# 边界条件
Ttop = 100
Tbottom = 0
Tleft = 0
Tright = 30

# 内部网格的初始猜测温度
Tguess = 30


def init_fields():
    # 设置数组大小，并用Tguess填充内部网格的值
    field = np.empty((lenX, lenY))
    field.fill(Tguess)

    # 设置边界条件
    field[(lenY - 1):, :] = Ttop
    field[:1, :] = Tbottom
    field[:, (lenX - 1):] = Tright
    field[:, :1] = Tleft

    field0 = field.copy()  # 用于保存上一个时间步的温度场
    return field, field0

u, u0 = init_fields()


def evolve(u, u0, a, dt, dx2, dy2):
    # 使用向前差分时间离散化、中心差分空间离散化进行演化
    u[1:-1, 1:-1] = u0[1:-1, 1:-1] + a * dt * (
            (u0[2:, 1:-1] - 2 * u0[1:-1, 1:-1] +
             u0[:-2, 1:-1]) / dx2 +
            (u0[1:-1, 2:] - 2 * u0[1:-1, 1:-1] +
             u0[1:-1, :-2]) / dy2)
    u0[:] = u[:]

    return u, u0

fig, ax = plt.subplots()
plt.title("Contour of Temperature")
plt.contourf(X, Y, u0, 100, cmap='jet')
plt.colorbar()
plt.axis('on')


def animate(i):
    """第i次迭代的动画数据更新函数"""

    global u0, u, a, dt, dx2, dy2

    plt.cla()
    plt.clf()
    plt.title('{:.1f} ms'.format(i * dt * 1000))
    plt.contourf(X, Y, u0, 100, cmap='jet')
    plt.colorbar()
    plt.axis('on')

    u, u0 = evolve(u, u0, a, dt, dx2, dy2)

    return u0, u

if SAVE_ANIMATION:
    anim = animation.FuncAnimation(fig, animate, frames=timesteps, repeat=False, interval=50)
    anim.save("heat_equation_solution.mp4", writer='ffmpeg', fps=10)
else:
    plt.show()

安装`ffmpeg`

到ffmpeg官网https://ffmpeg.org/download.html 按照下图所示红框位置点击下载下载获得ffmpeg安装包
解压文件，进入bin目录，能看到ffmpeg.exe、ffplay.exe、ffprobe.exe三个文件，复制路径
点击“系统属性->高级系统设置->环境变量->用户变量”，选择“Path”条目，点击“编辑->新建”，把第一步的**bin文件夹路径**复制粘贴进去，然后点击确定即可
打开cmd命令行窗口，输入命令“ffmpeg –version”。窗口返回ffmpeg的版本信息，说明安装成功。接下来你就可以直接使用命令行执行ffmpeg命令进行各种媒体格式的转换

部分参数解释

lenX和lenY变量用于设置网格的维度，它们都被设置为40。这意味着在二维网格中，X和Y方向上都有40个格点，总共形成了一个40x40的矩形网格。
变量timesteps是用于指定演化系统的时间步数。在热传导方程中，通过进行离散化和迭代计算，可以模拟出温度场随时间的演化。每个时间步长代表了在离散时间上的一次更新。在此脚本中，时间步数被设置为1000，意味着模拟将进行1000次时间步的更新。

问题

在使用之前未安装ffmpeg会导致运行时使用pillow代替，倒是运行会报错，具体原因不是很清楚
前面提到的参数len以及timesteps在源代码中较大，电脑的算力不够，于是将其调小之后便可得到实验结果，查阅资料了解其影响的主要原因，但是会影响实验精准度，需要权衡利弊。
1. 网格维度（lenX和lenY）：增加网格的维度会导致网格中的节点数量增加，进而增加了计算量。在每个时间步中，需要更新更多的节点，从而增加了计算的复杂性和运行时间。因此，较大的网格维度会导致模拟的运行时间增加。
2. 时间步数（timesteps）：增加时间步数会增加模拟的总计算量。在每个时间步中，需要进行温度场的更新计算。因此，较大的时间步数会导致模拟的运行时间增加。
一些必要的依赖库使用pycharm时会自动提示安装。

MPI

`mpi`概念

MPI（Message Passing Interface）是一种用于编写并行程序的通信协议和库。它是一个标准化的消息传递编程接口，用于在并行计算中的多个进程之间进行通信和同步操作。

MPI旨在提供一种可移植、高性能的编程模型，适用于各种并行计算环境，包括多处理器系统、集群系统和分布式内存系统等。它为开发者提供了一组丰富的通信和同步操作，使得多个进程之间可以相互交换数据、协调计算任务，并实现并行计算的目标。

MPI有多个实现，其中最常见的是MPI的开源实现mpi4py（用于Python）和MPICH（用于C/C++）。这些实现提供了丰富的函数和工具，使得编写并行程序变得更加简单和高效。

使用多进程数的MPI（Message Passing Interface）有以下几个好处：

加速计算
提高扩展性
并行算法实现
灵活性和可移植性

`mpi`安装

在pycharm中直接运行从github上下载的代码会遇到mpi模块无法找的报错，在pycharm中安装mpi软件包之后代码依然报相同的错误，应该是mpi路径问题，于是又在代码文件终端使用pip安装mpi任然无效，便在mpi官网进行下载并配置好相关环境。

mpi下载地址https://www.microsoft.com/en-us/download/details.aspx?id=100593
将压缩包下载并且解压缩至相关路径
将bin目录的路径赋值并添加至环境变量当中
重启pycharm运行代码即可

`mpi`运行命令

mpiexec -n <进程数> python your_code.py

其中<进程数>是你想要使用的MPI进程数

MPI编程中，进程数是指同时运行的MPI进程的数量，每个MPI进程都是独立执行的，进程数的选择会对MPI程序的执行产生影响。

具体代码

from __future__ import print_function
import numpy as np
import time
from mpi4py import MPI

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Set the colormap
plt.rcParams['image.cmap'] = 'BrBG'

# Basic parameters
a = 0.5                # 扩散常数
timesteps = 300        # 时间步数
image_interval = 40000  # 图像写入频率

# 网格间距
dx = 0.01
dy = 0.01
dx2 = dx**2
dy2 = dy**2

# 为了稳定性，这是时间步长的最大间隔:
dt = dx2*dy2 / ( 2*a*(dx2+dy2) )

# MPI 全局变量
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

# 上/下相邻的 MPI 进程秩
up = rank - 1
if up < 0:
    up = MPI.PROC_NULL
down = rank + 1
if down > size - 1:
    down = MPI.PROC_NULL

def evolve(u, u_previous, a, dt, dx2, dy2):
    """显式时间演化。
       u:            新的温度场
       u_previous:   上一个时间步的场
       a:            扩散常数
       dt:           时间步长
       dx2:          网格间距的平方，即 dx^2
       dy2:          -- "" --          , 即 dy^2"""

    u[1:-1, 1:-1] = u_previous[1:-1, 1:-1] + a * dt * ( \
            (u_previous[2:, 1:-1] - 2*u_previous[1:-1, 1:-1] + \
             u_previous[:-2, 1:-1]) / dx2 + \
            (u_previous[1:-1, 2:] - 2*u_previous[1:-1, 1:-1] + \
                 u_previous[1:-1, :-2]) / dy2 )
    u_previous[:] = u[:]

def init_fields(filename):
    # 从文件中读取初始温度场
    field = np.loadtxt(filename)
    field0 = field.copy() # 用于存储上一个时间步的场
    return field, field0

def write_field(field, step):
    plt.gca().clear()
    plt.imshow(field)
    plt.axis('off')
    plt.savefig('heat_{0:03d}.png'.format(step))

def exchange(field):
    # 向下发送，从上接收
    sbuf = field[-2,:]
    rbuf = field[0,:]
    comm.Sendrecv(sbuf, dest=down, recvbuf=rbuf, source=up)
    # 向上发送，从下接收
    sbuf = field[1,:]
    rbuf = field[-1,:]
    comm.Sendrecv(sbuf, dest=up, recvbuf=rbuf, source=down)

def iterate(field, local_field, local_field0, timesteps, image_interval):
    for m in range(1, timesteps+1):
        exchange(local_field0)
        evolve(local_field, local_field0, a, dt, dx2, dy2)
        if m % image_interval == 0:
            comm.Gather(local_field[1:-1,:], field, root=0)
            if rank == 0:
                write_field(field, m)

def main():
    # 读取并分发初始温度场
    if rank == 0:
        field, field0 = init_fields('bottle.dat')
        shape = field.shape
        dtype = field.dtype
        comm.bcast(shape, 0)  # 广播维度
        comm.bcast(dtype, 0)  # 广播数据类型
    else:
        field = None
        shape = comm.bcast(None, 0)
        dtype = comm.bcast(None, 0)
    if shape[0] % size:
        raise ValueError('温度场行数 (' \
                + str(shape[0]) + ') 需要被 MPI 任务数 (' + str(size) + ') 整除.')
    n = int(shape[0] / size)  # 每个 MPI 任务的行数
    m = shape[1]         # 场的列数
    buff = np.zeros((n, m), dtype)
    comm.Scatter(field, buff, 0)  # 分发数据
    local_field = np.zeros((n + 2, m), dtype)  # 需要两行边界
    local_field[1:-1,:] = buff                 # 将数据复制到非边界行
    local_field0 = np.zeros_like(local_field)  # 用于存储上一个时间步的场
    # 修正边界的边界层以考虑非周期性?
    if True:
        if rank == 0:
            local_field[0,:] = local_field[1,:]
        if rank == size - 1:
            local_field[-1,:] = local_field[-2,:]
    local_field0[:] = local_field[:]

    # 绘制/保存初始场
    if rank == 0:
        write_field(field, 0)
    # 迭代
    t0 = time.time()
    iterate(field, local_field, local_field0, timesteps, image_interval)
    t1 = time.time()
    # 绘制/保存最终场
    comm.Gather(local_field[1:-1,:], field, root=0)
    if rank == 0:
        write_field(field, timesteps)
        print("运行时间: {0}".format(t1-t0))

if __name__ == '__main__':
    main()

部分代码解释

函数init_fields，用于从文件中读取初始温度场。
函数write_field，用于将当前温度场写入到图像文件中。
函数exchange，用于在不同进程之间进行温度场数据的交换。每个进程将自己的边界数据发送给相邻的进程，并接收相邻进程发送的边界数据。
函数iterate，用于执行整个时间演化过程。它循环进行温度场的更新，并在指定的时间步数上调用write_field函数写入图像文件。

CUDA

`CUDA`概念

CUDA是一种并行计算平台和编程模型，旨在利用GPU进行高性能计算。

并行计算平台：CUDA提供了一种并行计算的框架，使开发人员能够利用GPU的大规模并行处理能力。
GPU加速：CUDA充分利用了GPU的并行计算能力，将任务分解为多个并行运算，以加速计算速度。
编程模型：CUDA提供了一套编程接口和工具，使开发人员能够使用常用的编程语言（如C、C++、Fortran）来编写并行计算程序。

`CUDA`用途

科学计算：CUDA广泛应用于科学领域，如物理模拟、气象学、生物医学、量子化学等。通过利用GPU的并行计算能力，可以大幅加速复杂的数值计算和模拟。
深度学习和人工智能：CUDA在深度学习和人工智能领域发挥了重要作用。许多深度学习框架（如TensorFlow、PyTorch）都支持CUDA，通过GPU加速深度神经网络的训练和推理过程。
图像和视频处理：CUDA可以用于图像和视频处理任务，如图像滤波、图像识别、视频编码等。通过并行计算，可以提高处理速度和性能。
金融建模和数据分析：CUDA可用于金融建模、数据分析和大规模计算任务。通过利用GPU的并行计算能力，可以快速处理和分析大量的金融数据和统计模型。

安装历程

一开始先尝试使用老师的MPI+CUDA代码在pycharm中直接运行，很明显会报错。

首先我根据pycharm的智能提示去安装pycuda，但是会报错，原因是寻找不到某些依赖项。

我便根据报错提示去谷歌搜索pycharm中使用CUDA教程，根据教程我首先查看电脑是有GPU的，然后在官网下载NVIDIA CUDA Toolkit和相关的GPU驱动程序，然后根据教程安装，但是依然无法使用pycuda，我便尝试使用终端安装，报错类型相同。

我便去检查关于环境路径是否配置正确，答案是路径正确但是无法使用。

然后我之前听老师课上说使用的环境是Python3.8，我便重新配置python环境，显然还是失败。

于是接着查询资料，可能是版本不兼容问题，但是始终寻找不到相关版本兼容的资料，以失败告终。

后来我便在stack overflow上寻求解答，我便在上面找到类似问题，接着得到一个github地址，我是根据此教程配置成功的，详细内容我也说不太清楚，主要问题就出在版本兼容问题上，以及各种配置文件等，下面有教程地址，因为考虑到github可能会加载失败，我便寻求到相同内容的gitee项目，还有就是有些版本国内镜像并无下载路径，需要翻墙至官网下载对应版本。

温馨提示：一定要先确认版本再去下载，不然就和我一样，下载很多次，然后还要清理文件，特别麻烦。

在下载了几次CUDA版本之后，终于下载到适配的版本，正准备去运行时发现依然报错，原因是有库未安装，我便一一根据报错提示进行安装，其中我看教程中使用的是vs2017/vs2019，但是我的电脑安装的是vs2022，因为个人比较喜欢新产品，但是问题来了，cl.exe项目寻求不到，于是我便在文件中搜索cl.exe，应该是vs2022的问题，于是我将该目录的bin路径添加至环境变量，至此才可以得以正常运行。

但是实际过程遇到的困难远比这多，主要是因为第一次接触CUDA等新概念，以及网上教程较少，大多数错误是自己的认知不到位，但是普遍的错误应该就是如上这些。

教程地址

torch: torch (gitee.com)（详细版，应该是收费配置教程）

MPI+CUDA代码

from __future__ import print_function
import numpy as np
import time
import matplotlib
import matplotlib.pyplot as plt
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray
from pycuda.compiler import SourceModule

matplotlib.use('Agg')

# 设置颜色映射
# plt.rcParams['image.cmap'] = 'BrBG'
plt.rcParams['image.cmap'] = 'jet'

colorInterPolation = 100
colourMap = plt.cm.jet  # 你可以尝试其他的颜色映射，如：plt.cm.coolwarm

a = 0.1  # 扩散常数
timesteps = 10000  # 模拟的时间步数
image_interval = 1000  # 保存为图像的频率

# 网格间距
dx = 0.01
dy = 0.01
dx2 = dx ** 2
dy2 = dy ** 2

# 为了稳定性，这是时间步长的最大间隔：
dt = dx2 * dy2 / (2 * a * (dx2 + dy2))

# 设置维度和增量
lenX = lenY = 400  # 我们设置为矩形
delta = 1

# 边界条件
Ttop = 100
Tbottom = 0
Tleft = 0
Tright = 30

# 内部网格初始猜测
Tguess = 0

# 设置网格
X, Y = np.meshgrid(np.arange(0, lenX), np.arange(0, lenY))


def write_field(field, step):
    plt.gca().clear()
    # plt.clf()

    plt.figure(dpi=300)
    # 配置等高线图
    plt.title("Temperature Contour")
    plt.contourf(X, Y, field, colorInterPolation, cmap=colourMap)
    # 设置颜色条
    plt.colorbar()
    plt.axis('on')
    plt.savefig('heat_PyCUDA_{0:03d}.png'.format(step))


a = np.float32(a)
lenX = np.int32(lenX)
lenY = np.int32(lenY)
dx = np.float32(dx)
dy = np.float32(dy)
dx2 = np.float32(dx2)
dy2 = np.float32(dy2)
dt = np.float32(dt)
timesteps = np.int32(timesteps)
image_interval = np.int32(image_interval)


ker = SourceModule('''
/* 使用五点差分法更新温度值 */
__global__ void evolve_kernelCUDA(float *currdata, float *prevdata, float a, float dt, int nx, int ny,
                       float dx2, float dy2)
{

    /* 确定下一个时间步长的温度场
     * 由于我们有固定的边界条件，最外层的网格点不会被更新。 */
    int ind, iRight, iLeft, jUp, jDown;

    // CUDA线程按列主序排列；因此j索引来自x，i索引来自y
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;        

    if (i > 0 && j > 0 && i < nx-1 && j < ny-1) {
        ind = i * ny + j;
        iRight = (i+1)  * (ny) + j;
        iLeft = (i - 1) * (ny ) + j;
        jUp = i * (ny) + j + 1;
        jDown = i * (ny) + j - 1;
        currdata[ind] = prevdata[ind] + a * dt *
	      ((prevdata[iRight] -2.0 * prevdata[ind] + prevdata[iLeft]) / dx2 +
	      (prevdata[jUp] - 2.0 * prevdata[ind] + prevdata[jDown]) / dy2);
    }
}
''')

evolve_kernel = ker.get_function('evolve_kernelCUDA')


def main():
    # 设置数组大小，并用Tguess设置内部值
    field = np.empty((lenX, lenY))
    field.fill(Tguess)

    # 设置边界条件
    field[(lenY - 1):, :] = Ttop
    field[:1, :] = Tbottom
    field[:, (lenX - 1):] = Tright
    field[:, :1] = Tleft

    # gpuarray.to_gpu(field, allocator=None)
    # field = gpuarray.to_gpu(np.random.randn(lenX,lenX).astype(np.float32))
    # field_gpuArr = gpuarray.to_gpu(field.astype(np.float32))

    print("field's size is ", field.size)
    print(field, "\n")

    field0 = field.copy()  # 存储上一个时间步长的温度场
    print("field0's size is ", field0.size)
    print(field0, "\n")

    write_field(field, 0)

    field = field.astype(np.float32)
    field0 = field0.astype(np.float32)

    blocksize = 32  # !< CUDA线程块的维度
    dimBlock = (blocksize, blocksize, 1)
    # CUDA线程按列主序排列；因此创建ny x nx的网格
    # dimGrid = ((lenY + 2 + blocksize - 1) // blocksize,
    # (lenX + 2 + blocksize - 1) // blocksize,
    # 1)

    dimGrid = (int(lenX / blocksize + (0 if lenY % blocksize == 0 else 1)),
               int(lenY / blocksize + (0 if lenX % blocksize == 0 else 1)),
               1)
    print(dimBlock)
    print(dimGrid)
    print()

    # 在设备上分配内存
    field_gpu = cuda.mem_alloc(field.nbytes)
    field0_gpu = cuda.mem_alloc(field0.nbytes)

    # 迭代计算
    t0 = time.time()
    for m in range(1, timesteps + 1):
        # 将矩阵复制到设备内存
        cuda.memcpy_htod(field_gpu, field)
        cuda.memcpy_htod(field0_gpu, field0)

        evolve_kernel(field_gpu, field0_gpu, a, dt, lenX, lenY, dx2, dy2, block=dimBlock, grid=dimGrid)
        # cuda.cudaDeviceSynchronize()
        # cuda.Context.synchronize()

        if (m % image_interval == 0):
            # 复制结果回主机内存
            cuda.memcpy_dtoh(field, field_gpu)
            write_field(field, m);
            # print(field, "\n")

        cuda.memcpy_dtoh(field, field_gpu)
        cuda.memcpy_dtoh(field0, field0_gpu)

        # 交换当前场和上一个场，以便在下一次迭代中使用上一个场
        tmp = field
        field = field0
        field0 = tmp

    t1 = time.time()
    print("运行时间：{0}".format(t1 - t0))

    field_gpu.free()
    field0_gpu.free()


if __name__ == '__main__':
    main()

代码解释

一、导入库和模块

from __future__ import print_function
import numpy as np
import time
import matplotlib
import matplotlib.pyplot as plt
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray
from pycuda.compiler import SourceModule
matplotlib.use('Agg')

引入了代码所需的库和模块，numpy用于数值计算，time用于跟踪时间，matplotlib用于绘图，pycuda.autoinit用于初始化PyCUDA，pycuda.driver和pycuda.compiler用于与CUDA驱动程序和编译器交互。

二、绘制温度场

def write_field(field, step):
    plt.gca().clear()
    plt.figure(dpi=300)
    plt.title("Contour of Temperature")
    plt.contourf(X, Y, field, colorInterPolation, cmap=colourMap)
    plt.colorbar()
    plt.axis('on')
    plt.savefig('heat_PyCUDA_{0:03d}.png'.format(step))

使用Matplotlib库绘制温度场的等温线图，并保存为PNG文件。

三、主函数

在循环中，通过使用cuda.memcpy_htod函数将当前温度场数组和上一个时间步长的温度场数组复制到GPU内存中。然后，调用CUDA核函数evolve_kernel来计算温度场的更新。如果达到了指定的图像保存间隔，使用cuda.memcpy_dtoh函数将计算结果从GPU内存复制回主机内存，并调用write_field函数绘制温度场的图像。

四、内核函数

运行依赖

内核函数可以在Python中运行是因为代码中使用了PyCUDA库，它提供了与CUDA（Compute Unified Device Architecture）兼容的GPU编程接口。

PyCUDA允许在Python环境中编写CUDA内核函数，并通过与CUDA驱动程序的交互来在GPU上执行这些内核函数。

PyCUDA通过提供Python与CUDA之间的接口，使得在Python中能够方便地进行GPU编程。它提供了访问CUDA功能的高级抽象，使得编写和执行CUDA内核函数变得简单和灵活。

使用pycuda.autoinit初始化CUDA驱动程序

使用pycuda.driver模块来访问CUDA驱动程序的功能

使用pycuda.gpuarray类来在GPU上分配内存和传输数据

使用pycuda.compiler.SourceModule类来编译和加载CUDA内核函数

内核函数认识

内核函数是由CUDA编程模型定义的，并使用CUDA C或CUDA C++编写。

通常是针对问题的特定部分进行优化的，并且可以在许多线程上并行执行。每个线程都会执行相同的内核代码，但是处理不同的输入数据，以实现并行计算。

在GPU上执行内核函数时，可以同时启动多个线程来处理不同的网格点。每个线程负责计算一个特定网格点的温度更新，使用相邻点的温度值进行计算。通过并行执行多个线程，内核函数可以高效地计算大规模温度场的更新。

内核函数是在GPU上并行执行的一段特殊代码，用于高效地处理大规模数据并加速计算任务。它充分利用了GPU的并行处理能力，对每个线程进行相同的代码执行，并通过并行执行多个线程来实现高性能计算。

五、五点差分法

五点差分法是一种常用的数值方法，用于近似求解偏微分方程中的二阶导数。它基于对函数在某一点周围的相邻点进行差分计算来近似表示该点的导数。

在热传导方程中，五点差分法常用于计算温度场中每个点的二阶导数，以模拟热量在空间中的传播。差分法的基本思想是使用相邻点的温度值之间的差异来估计该点的二阶导数。

五点差分法是一种简单有效的数值方法，常用于模拟热传导、扩散等问题。在GPU上并行计算差分法可以显著加快计算速度，尤其对于大规模的温度场计算。

CTR

了解`kaggle`

定义

Kaggle是一个面向数据科学和机器学习社区的在线平台。它为数据科学家、机器学习工程师和其他相关专业的人们提供了一个交流、学习和竞赛的平台。

主要特点

数据集：Kaggle上有大量的公开数据集，涵盖各种领域和主题。用户可以浏览和下载这些数据集，用于自己的研究和项目。
笔记本：Kaggle提供了一个基于云端的Jupyter笔记本环境，用户可以在上面编写、运行和分享代码。这使得用户可以在不需要本地安装任何软件的情况下进行数据分析和建模工作。
内核：Kaggle内核是一种可运行的代码环境，用户可以使用内核来探索数据、实验不同的机器学习算法和模型，以及分享自己的成果和经验。
竞赛：Kaggle经常举办数据科学竞赛，吸引了全球范围内的数据科学家和机器学习专家参与。竞赛通常提供一个实际的问题和相应的数据集，参赛者需要通过建立模型和算法来解决这个问题，并在性能指标上取得最佳成绩。
讨论和交流：Kaggle的社区提供了一个讨论和交流的平台，用户可以在论坛上提问、回答问题，分享自己的经验和见解，与其他人进行交流和合作。

注册`kaggle`

第一次使用kaggle，使用的谷歌账号进行注册，需要梯子翻墙。

注册好之后，根据老师给的csdn网页打开kaggle中ctr竞赛，从中找到适合的代码副本参考优化。

可能由于竞赛时间较长（8年），部分副本数据文件有缺失，导致无法运行，个人能力无法解决。

找到正常运行的副本，新建notebook运行，由于算力有限，之前打开过几个副本，当再次打开其他副本是要求关闭其他副本。

完整代码

import numpy as np
import pandas as pd
import os
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import seaborn as sns
import gzip

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


test_file = '../input/avazu-ctr-prediction/test.gz'
samplesubmision_file = '../input/avazu-ctr-prediction/sampleSubmission.gz'

chunksize = 10 ** 6
num_of_chunk = 0
train = pd.DataFrame()

for chunk in pd.read_csv('../input/avazu-ctr-train/train.csv', chunksize=chunksize):
    num_of_chunk += 1
    train = pd.concat([train, chunk.sample(frac=.05, replace=False, random_state=123)], axis=0)
    print('Processing Chunk No. ' + str(num_of_chunk))

train.reset_index(inplace=True)
train_len = len(train)
train_len

df = pd.concat([train, pd.read_csv(test_file, compression='gzip')]).drop(['index', 'id'], axis=1)

def get_date(hour):
    y = '20'+str(hour)[:2]
    m = str(hour)[2:4]
    d = str(hour)[4:6]
    return y+'-'+m+'-'+d

df['weekday'] = pd.to_datetime(df.hour.apply(get_date)).dt.dayofweek.astype(str)

def tran_hour(x):
    x = x % 100
    while x in [23,0]:
        return '23-01'
    while x in [1,2]:
        return '01-03'
    while x in [3,4]:
        return '03-05'
    while x in [5,6]:
        return '05-07'
    while x in [7,8]:
        return '07-09'
    while x in [9,10]:
        return '09-11'
    while x in [11,12]:
        return '11-13'
    while x in [13,14]:
        return '13-15'
    while x in [15,16]:
        return '15-17'
    while x in [17,18]:
        return '17-19'
    while x in [19,20]:
        return '19-21'
    while x in [21,22]:
        return '21-23'

df['hour'] = df.hour.apply(tran_hour)

df.info()

len_of_feature_count = []
for i in df.columns[2:23].tolist():
    print(i, ':', len(df[i].astype(str).value_counts()))
    len_of_feature_count.append(len(df[i].astype(str).value_counts()))

need_tran_feature = df.columns[2:4].tolist() + df.columns[13:23].tolist()

for i in need_tran_feature:
    df[i] = df[i].astype(str)


obj_features = []

for i in range(len(len_of_feature_count)):
    if len_of_feature_count[i] > 10:
        obj_features.append(df.columns[2:23].tolist()[i])
obj_features


df_describe = df.describe()
df_describe

def obj_clean(X):
    def get_click_rate(x):

        temp = train[train[X.columns[0]] == x]
        res = round((temp.click.sum() / temp.click.count()), 3)
        return res

    def get_type(V, str):

        very_high = df_describe.loc['mean', 'click'] + 0.04
        higher = df_describe.loc['mean', 'click'] + 0.02
        lower = df_describe.loc['mean', 'click'] - 0.02
        very_low = df_describe.loc['mean', 'click'] - 0.04

        vh_type = V[V[str] > very_high].index.tolist()
        hr_type = V[(V[str] > higher) & (V[str] < very_high)].index.tolist()
        vl_type = V[V[str] < very_low].index.tolist()
        lr_type = V[(V[str] < lower) & (V[str] > very_low)].index.tolist()

        return vh_type, hr_type, vl_type, lr_type

    def clean_function(x):

        while x in type_[0]:
            return 'very_high'
        while x in type_[1]:
            return 'higher'
        while x in type_[2]:
            return 'very_low'
        while x in type_[3]:
            return 'lower'
        return 'mid'

    print('Run: ', X.columns[0])
    fq = X[X.columns[0]].value_counts()

    if len(fq) > 1000:
        fq = fq[:1000]

    fq = pd.DataFrame(fq)
    fq['new_column'] = fq.index
    fq['click_rate'] = fq.new_column.apply(get_click_rate)
    type_ = get_type(fq, 'click_rate')
    return X[X.columns[0]].apply(clean_function)


for i in obj_features:
    df[[i]] = obj_clean(df[[i]])

df

for i in df.columns:
    sns.countplot(x = i, hue = "click", data = df)
    plt.show()

df.drop(['device_id', 'C14', 'C17', 'C19', 'C20', 'C21'], axis=1, inplace=True)

df = pd.get_dummies(df)
train = df[:train_len]
test = df[train_len:]

del df

pre_X = train[train['click'] == 0].sample(n=len(train[train['click'] == 1]), random_state=111)
pre_X = pd.concat([pre_X, train[train['click'] == 1]]).sample(frac=1)
pre_y = pre_X[['click']]
pre_X.drop(['click'], axis=1, inplace=True)
test.drop(['click'], axis=1, inplace=True)

pre_X_train, pre_X_test, pre_y_train, pre_y_test = train_test_split(pre_X, pre_y, test_size=0.20, stratify=pre_y, random_state=1)

params = {"criterion":["gini", "entropy"], "max_depth":range(1,20)}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=params, scoring='roc_auc', cv=100, verbose=1, n_jobs=-1)
grid_search.fit(pre_X_train, pre_y_train)
grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_

tree = grid_search.best_estimator_
tree.fit(pre_X,pre_y)

feature_importances = pd.DataFrame(tree.feature_importances_)
feature_importances.index = pre_X_train.columns
feature_importances = feature_importances.sort_values(0,ascending=False)
feature_importances

pre_X_train = pre_X_train[feature_importances.index[:int(len(feature_importances)/3)]]
pre_X_test = pre_X_test[feature_importances.index[:int(len(feature_importances)/3)]]

params = {"criterion":["gini", "entropy"], "max_depth":range(1,12)}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=params, scoring='roc_auc', cv=100, verbose=1, n_jobs=-1)
grid_search.fit(pre_X_train, pre_y_train)
grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_

pre_X = pre_X[feature_importances.index[:int(len(feature_importances)/3)]]

tree = grid_search.best_estimator_
tree.fit(pre_X,pre_y)
feature_importances = pd.DataFrame(tree.feature_importances_)
feature_importances.index = pre_X_train.columns
feature_importances = feature_importances.sort_values(0,ascending=False)
feature_importances

feature_len = len(feature_importances[feature_importances[feature_importances.columns[0]] > 0.005])

y = train[['click']]
X = train[feature_importances[:feature_len].index]
test = test[feature_importances[:feature_len].index]

model = XGBClassifier(tree_method = 'gpu_hist', n_jobs=-1, n_estimators=500, max_depth=11)
model.fit(X,y.values.ravel())
y_pred = model.predict(X)
print("Roc_auc_score: ",roc_auc_score(y,y_pred)*100,"%")

confmat = confusion_matrix(y_true=y, y_pred=y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

plt.xlabel('Predicted label')
plt.ylabel('True label')

plt.tight_layout()
plt.show()

import seaborn as sns
sns.heatmap(confmat,annot=True,fmt='',xticklabels=[0,1],yticklabels=[0,1],linewidth=.5, cmap=sns.cubehelix_palette(as_cmap=True))

plt.xlabel('Predicted label',fontsize=15)
plt.ylabel('True label',fontsize=15)
plt.title('Confusion Matrix',fontsize=20)
plt.figure(figsize=(30.0,30.0),dpi=200)
plt.tight_layout()
plt.show()

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Plot ROC curve for each model

fpr3, tpr3, _ = roc_curve(y, y_pred)
roc_auc3 = auc(fpr3, tpr3)

plt.figure(figsize=(10,7))
plt.plot(fpr3, tpr3, color='blue', lw=2, label='XGBoost (AUC = %0.2f)' % roc_auc3)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate',fontsize=15)
plt.ylabel('True Positive Rate',fontsize=15)
plt.title('Receiver Operating Characteristic (ROC) Curve',fontsize=20)
plt.legend(loc="lower right")
plt.show()

import matplotlib.pyplot as plt
import numpy
from sklearn import metrics

from sklearn.metrics import precision_recall_fscore_support

print(precision_recall_fscore_support(y, y_pred , average='macro'))
cm=confusion_matrix(y, y_pred)
total1=sum(sum(cm))
accuracy1=(cm[0,0]+cm[1,1])/total1
print ('Accuracy : ', accuracy1)
sensitivity1 = cm[0,0]/(cm[0,0]+cm[0,1])
print('Sensitivity : ', sensitivity1 )
specificity1 = cm[1,1]/(cm[1,0]+cm[1,1])
print('Specificity : ', specificity1)
F1_score = metrics.f1_score(y, y_pred)
print('F1 Score : ', F1_score)

代码解释

白话解释

当我们在互联网上浏览网页时，经常会看到各种广告。

广告主希望能够提高广告的点击率，也就是让更多的人点击广告。这段代码的目的是根据给定的数据，帮助预测用户在看到广告后是否会点击它。

首先，代码读取了一些数据，这些数据包含了广告展示的一些信息，比如展示时间、广告位、设备信息等。

为了加快处理速度，代码只选择了其中的一部分数据进行分析。

然后，代码对数据进行了一些处理。它从展示时间中提取了日期和小时，并转换成更容易理解的格式。它还对一些特征进行了转换，将它们转换成字符串类型，方便后续处理。

接下来，代码对数据进行了探索性分析。它计算了每个特征中不同取值的数量，以帮助我们了解特征的多样性。这可以帮助我们确定哪些特征可能对预测点击率有重要影响。

然后，代码对一些特征进行了清洗。它根据点击率将特征值分成了几个类型，比如高点击率、低点击率等。这样做的目的是将特征与点击率之间的关系更明确地表示出来。

接着，代码进行了数据可视化，绘制了柱状图来展示特征与点击率之间的关系。通过这些图表，我们可以直观地看到不同特征对点击率的影响程度。

然后，代码选择了一些最相关的特征，并对它们进行了编码，将分类特征转换成了数值特征。这是为了让计算机更容易理解和处理这些特征，以便用于训练模型。

代码使用决策树模型进行训练，决策树模型可以学习特征与点击率之间的关系，并进行预测。通过调整模型的参数，代码寻找了最佳的参数组合，以提高模型的性能。

最后，代码评估了模型的性能，并计算了一些指标，比如准确率、召回率、F1分数等。这些指标可以帮助我们了解模型在预测点击率方面的表现如何。（不太了解）

一、导入库和模块

import numpy as np
import pandas as pd
import os
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import seaborn as sns
import gzip

numpy和pandas用于数据处理，os用于文件操作，sklearn中的各种模块用于建模和评估，matplotlib和seaborn用于绘图。

二、遍历使用文件

1
2
3

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

遍历指定目录下的文件，并打印出文件的路径。

三、定义路径

1 2	test_file = '../input/avazu-ctr-prediction/test.gz' samplesubmision_file = '../input/avazu-ctr-prediction/sampleSubmission.gz'

定义了测试数据文件和提交示例文件的路径。

四、训练数据分块读取

chunksize = 10 ** 6
num_of_chunk = 0
train = pd.DataFrame()

for chunk in pd.read_csv('../input/avazu-ctr-train/train.csv', chunksize=chunksize):
    num_of_chunk += 1
    train = pd.concat([train, chunk.sample(frac=.05, replace=False, random_state=123)], axis=0)
    print('Processing Chunk No. ' + str(num_of_chunk))

此段代码将训练数据分块读取，并通过随机抽样的方式选择部分数据进行训练集构建。

每个数据块的大小由chunksize定义。

num_of_chunk用于记录处理的块数，最后将所有块的数据合并成一个训练集。

五、相关数据处理

1
2
3

train.reset_index(inplace=True)
train_len = len(train)
train_len

用于重置训练集的索引，并记录训练集的长度

1	df = pd.concat([train, pd.read_csv(test_file, compression='gzip')]).drop(['index', 'id'], axis=1)

将训练集和测试数据集进行合并，并删除’index’和’id’列

六、转换日期时间

def tran_hour(x):
    x = x % 100
    while x in [23,0]:
        return '23-01'
    while x in [1,2]:
        return '01-03'
    while x in [3,4]:
        return '03-05'
    while x in [5,6]:
        return '05-07'
    while x in [7,8]:
        return '07-09'
    while x in [9,10]:
        return '09-11'
    while x in [11,12]:
        return '11-13'
    while x in [13,14]:
        return '13-15'
    while x in [15,16]:
        return '15-17'
    while x in [17,18]:
        return '17-19'
    while x in [19,20]:
        return '19-21'
    while x in [21,22]:
        return '21-23'

df['hour'] = df.hour.apply(tran_hour)

定义函数tran_hour()，用于将小时数转换为相应的时间段。将时间段添加到数据帧df中的’hour’列

def get_date(hour):
    y = '20'+str(hour)[:2]
    m = str(hour)[2:4]
    d = str(hour)[4:6]
    return y+'-'+m+'-'+d

df['weekday'] = pd.to_datetime(df.hour.apply(get_date)).dt.dayofweek.astype(str)

定义函数get_date()，用于从小时数提取日期。

将日期转换为星期几，并添加到数据帧df中。

七、信息打印

df.info()

打印出数据帧df的基本信息，包括列名、数据类型和非空值的数量。

len_of_feature_count = []
for i in df.columns[2:23].tolist():
    print(i, ':', len(df[i].astype(str).value_counts()))
    len_of_feature_count.append(len(df[i].astype(str).value_counts()))

遍历数据帧df中的列，并打印每列的唯一值数量。

将每列的唯一值数量添加到len_of_feature_count列表中。

八、数据类型转换

need_tran_feature = df.columns[2:4].tolist() + df.columns[13:23].tolist()

for i in need_tran_feature:
    df[i] = df[i].astype(str)

定义了需要进行类型转换的特征列，并将这些列的数据类型转换为字符串类型，便于处理。

九、点击率和样本数量排序处理

for i in obj_features:
    df[[i, 'click']].groupby(i).agg(['mean', 'count']).sort_values(by=[(i, 'mean')], ascending=False).head(10)
    df[[i, 'click']].groupby(i).agg(['mean', 'count']).sort_values(by=[(i, 'mean')], ascending=True).head(10)
    df[[i, 'click']].groupby(i).agg(['mean', 'count']).sort_values(by=[(i, 'count')], ascending=False).head(10)
    df[[i, 'click']].groupby(i).agg(['mean', 'count']).sort_values(by=[(i, 'count')], ascending=True).head(10)

此段代码对数据帧df中的每个分类特征i进行分组，并计算每组的点击率和样本数量。

根据点击率和样本数量对分组结果进行排序，分别以升序和降序的方式获取点击率和样本数量最高或最低的前10个组。

df[[i, 'click']].groupby(i).agg(['mean', 'count']).sort_values(by=[(i, 'mean')], ascending=False).to_csv(
    './'+i+'_mean.csv', compression='gzip')
df[[i, 'click']].groupby(i).agg(['mean', 'count']).sort_values(by=[(i, 'mean')], ascending=True).to_csv(
    './'+i+'_mean.csv', compression='gzip')
df[[i, 'click']].groupby(i).agg(['mean', 'count']).sort_values(by=[(i, 'count')], ascending=False).to_csv(
    './'+i+'_count.csv', compression='gzip')
df[[i, 'click']].groupby(i).agg(['mean', 'count']).sort_values(by=[(i, 'count')], ascending=True).to_csv(
    './'+i+'_count.csv', compression='gzip')

此段代码将排序后的结果以CSV格式保存到文件中，分别保存了点击率升序和降序、样本数量升序和降序的结果。

文件名以特征名和统计指标（mean和count）命名，并使用gzip进行压缩。

df = pd.merge(df, pd.read_csv('./'+i+'_mean.csv.gz'), on=i, how='left')
df = pd.merge(df, pd.read_csv('./'+i+'_count.csv.gz'), on=i, how='left')
df.drop([i], axis=1, inplace=True)
df.drop([(i, 'mean')], axis=1, inplace=True)
df.drop([(i, 'count')], axis=1, inplace=True)
df.drop(['index'], axis=1, inplace=True)

df = obj_clean(df)

此段代码将保存的排序结果文件读取回来，并与原始数据帧df进行合并。

删除原始数据帧中的分类特征列i、点击率和样本数量的列，以及多余的索引列。

调用obj_clean()函数对数据帧进行进一步的清洗。

十、数据特征选择

1 2	pre_X_train = pre_X_train[feature_importances.index[:int(len(feature_importances)/3)]] pre_X_test = pre_X_test[feature_importances.index[:int(len(feature_importances)/3)]]

此段代码根据特征重要性对训练集和测试集进行特征选择，只保留重要性排名前三分之一的特征，便于处理数据。

十一、实验结果图绘制

import seaborn as sns
sns.heatmap(confmat, annot=True, fmt='', xticklabels=[0, 1], yticklabels=[0, 1], linewidth=.5, cmap=sns.cubehelix_palette(as_cmap=True))

plt.xlabel('Predicted label', fontsize=15)
plt.ylabel('True label', fontsize=15)
plt.title('Confusion Matrix', fontsize=20)
plt.figure(figsize=(30.0, 30.0), dpi=200)
plt.tight_layout()
plt.show()

此段代码使用Seaborn库绘制较为美观的混淆矩阵热力图。

通过设置参数（annot=True）在每个单元格中显示对应的样本数量。

自定义标签的字体大小和标题，以及调整图像的大小和分辨率。

代码优化

一、数据加载和处理

在处理大型数据集时，可以考虑使用更高效的数据加载和处理方法。

可以使用Pandas的read_csv函数的nrows参数来限制读取的行数，从而避免读取整个文件。

还可以使用Pandas的astype方法一次转换多个列的数据类型，而不是使用循环逐列转换，可以提高效率。

二、数据采样和平衡

副本代码使用了负样本欠采样的方法来平衡数据集，可能导致信息损失。

可以尝试其他的数据采样方法，如过采样（如SMOTE）或集成采样方法，以更好地平衡数据并提高模型的性能。

大规模计算

热传导方程的数值求解

Images

相关代码过程

Animation

相关代码过程

安装ffmpeg

部分参数解释

问题

MPI

mpi概念

mpi安装

mpi运行命令

具体代码

部分代码解释

CUDA

CUDA概念

CUDA用途

安装历程

教程地址

MPI+CUDA代码

代码解释

一、导入库和模块

二、绘制温度场

三、主函数

四、内核函数

运行依赖

内核函数认识

五、五点差分法

CTR

了解kaggle

定义

主要特点

注册kaggle

完整代码

代码解释

白话解释

一、导入库和模块

二、遍历使用文件

三、定义路径

四、训练数据分块读取

五、相关数据处理

六、转换日期时间

七、信息打印

八、数据类型转换

九、点击率和样本数量排序处理

十、数据特征选择

十一、实验结果图绘制

代码优化

一、数据加载和处理

二、数据采样和平衡

安装`ffmpeg`

`mpi`概念

`mpi`安装

`mpi`运行命令

`CUDA`概念

`CUDA`用途

了解`kaggle`

注册`kaggle`