gpt4 book ai didi

linux-kernel - 无法使用 ib_create_qp 创建队列对

转载 作者:行者123 更新时间:2023-12-04 15:42:08 24 4
gpt4 key购买 nike

我正在编写一个 RDMA (InfiniBand) 内核模块。

到目前为止,我已经成功创建了保护域、发送和接收队列的完成队列。

但是,每当我尝试通过调用 ib_create_qp 创建队列对时,都无法创建队列对。我写的代码如下所示:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/err.h>
#include "myClient.h"


/* Deferred-work queue for this client — declared but not used in this file. */
struct workqueue_struct *myClient_workqueue;
/* Subnet-administration client handle — declared but not used in this file. */
struct ib_sa_client myClient_sa_client;
/*
static void myClient_add_one(struct ib_device *device);
static void myClient_remove_one(struct ib_device *device);
*/

/* RDMA resources created per-device in myClient_add_one(). */
struct ib_pd *mypd;           /* protection domain */
struct ib_cq *myrcvcq;        /* receive completion queue */
struct ib_cq *myClientsendcq; /* send completion queue */
struct ib_qp *myClientqp;     /* queue pair (UD) */

void myClient_ib_recvcompletion(struct ib_cq *cq)
{
printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}


void myClient_ib_sendcompletion(struct ib_cq *cq)
{
printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}
/*
 * Asynchronous-event handler attached to the QP via
 * ib_qp_init_attr.event_handler.  Both arguments are unused; this is a
 * placeholder that only logs that an event arrived.
 */
static void my_qp_event_handler(struct ib_event *myqpAsyncEvent, void *anyPointer)
{
	/* Fix: "occured" -> "occurred" in the log message. */
	printk(KERN_INFO "Dummy affiliated asynchronous event occurred function called\n");
}


static void myClient_add_one(struct ib_device *device)
{
union ib_gid tmp_gid;
int ret;
int hcaport = 1;
int result = -ENOMEM;
u16 port1Pkey;
struct ib_port_attr attr;

ret = ib_query_port(device,hcaport,&attr);
printk("ib query port result %d \n", ret);

// Creating the Protection Domain for RDMA
mypd = ib_alloc_pd(device);

if(IS_ERR(mypd)){
printk(KERN_INFO "Failed to allocate PD\n");
return;
}
else{
printk(KERN_INFO "1Successfully allocated the PD\n");
pdset = true;
}

// Creating the receive completion queue for RDMA
myrcvcq = ib_create_cq(device,myClient_ib_recvcompletion,NULL,NULL,myClient_recvq_size,0);
if(IS_ERR(myrcvcq)){
pr_err("%s:%d error code for receive cq%d\n", __func__, __LINE__, PTR_ERR(myrcvcq));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk("Recieve CQ successfully created in address: %x \n",myrcvcq);
}

// Creating the send completion queue for RDMA
myClientsendcq = ib_create_cq(device,myClient_ib_sendcompletion, NULL, NULL,myClient_sendq_size,0 );
if(IS_ERR(myClientsendcq)){
pr_err("%s:%d scqerror code for send cq%d\n", __func__, __LINE__, PTR_ERR(myClientsendcq));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk("1Send CQ successfully created in address: %x \n",myClientsendcq);
}

// Creating the queue pair
// Creating the queue pair

struct ib_qp_init_attr init_qpattr;

memset(&init_qpattr,0,sizeof(init_qpattr));
init_qpattr.event_handler = myClient_qp_event_handler;
init_qpattr.cap.max_send_wr = 2;
init_qpattr.cap.max_recv_wr = 2;
init_qpattr.cap.max_recv_sge = 1;
init_qpattr.cap.max_send_sge = 1;
init_qpattr.sq_sig_type = IB_SIGNAL_ALL_WR;
init_qpattr.qp_type = IB_QPT_UD;
init_qpattr.send_cq = myClientsendcq;
init_qpattr.recv_cq = myrcvcq;

myClientqp = ib_create_qp(mypd,&init_qpattr);

if(IS_ERR(myClientqp)){
pr_err("%s:%d error code %d\n", __func__, __LINE__, PTR_ERR(myClientqp));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk(KERN_INFO "1The queue pair is successfully created \n");
qpcreated = true;
}



}
/*
 * ib_client "remove" callback: the device is going away.
 * NOTE(review): currently a stub — it should destroy the QP and CQs and
 * deallocate the PD created in myClient_add_one() to avoid leaking them.
 */
static void myClient_remove_one(struct ib_device *device)
{
}

/*
 * Client registration record handed to ib_register_client(); the IB core
 * calls .add/.remove for every device that appears or disappears.
 */
static struct ib_client my_client = {
.name = "myRDMAclient",
.add = myClient_add_one,
.remove = myClient_remove_one
};


/*
 * Module init: register this ib_client with the IB core, which then
 * invokes my_client.add (myClient_add_one) for each existing IB device.
 * Returns 0 on success or the negative errno from ib_register_client().
 *
 * Fixes over the original: the success message embedded the literal text
 * "lKERN_INFO" inside a KERN_ALERT string (garbled log level); the failure
 * path silently swallowed the error (commented-out printk) and jumped to
 * an empty goto label — replaced with a logged direct return.
 */
static int __init myRDMAclient_init(void)
{
	int ret;

	ret = ib_register_client(&my_client);
	if (ret) {
		printk(KERN_ERR "Failed to register IB client: %d\n", ret);
		return ret;
	}

	printk(KERN_INFO "Successfully registered myRDMAclient module\n");
	return 0;
}


module_init(myRDMAclient_init);

除了 ib_create_qp(mypd,&init_qpattr); 之外,这里所有的查询都有效。无法创建队列对。

更新:在创建队列对之前注册内存。但它仍然显示 ib_create_qp 的无效参数错误(错误代码 -22)
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/err.h>
#include "myClient.h"


/* Deferred-work queue for this client — declared but not used in this file. */
struct workqueue_struct *myClient_workqueue;
/* Subnet-administration client handle — declared but not used in this file. */
struct ib_sa_client myClient_sa_client;
/*
static void myClient_add_one(struct ib_device *device);
static void myClient_remove_one(struct ib_device *device);
*/

/* RDMA resources created per-device in myClient_add_one(). */
struct ib_pd *mypd;           /* protection domain */
struct ib_cq *myrcvcq;        /* receive completion queue */
struct ib_cq *myClientsendcq; /* send completion queue */
struct ib_qp *myClientqp;     /* queue pair (UD) */
struct ib_mr *mymr;           /* DMA memory region */

void myClient_ib_recvcompletion(struct ib_cq *cq)
{
printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}


void myClient_ib_sendcompletion(struct ib_cq *cq)
{
printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}
/*
 * Asynchronous-event handler attached to the QP via
 * ib_qp_init_attr.event_handler.  Both arguments are unused; this is a
 * placeholder that only logs that an event arrived.
 */
static void my_qp_event_handler(struct ib_event *myqpAsyncEvent, void *anyPointer)
{
	/* Fix: "occured" -> "occurred" in the log message. */
	printk(KERN_INFO "Dummy affiliated asynchronous event occurred function called\n");
}


static void myClient_add_one(struct ib_device *device)
{
union ib_gid tmp_gid;
int ret;
int hcaport = 1;
int result = -ENOMEM;
u16 port1Pkey;
struct ib_port_attr attr;

ret = ib_query_port(device,hcaport,&attr);
printk("ib query port result %d \n", ret);

// Creating the Protection Domain for RDMA
mypd = ib_alloc_pd(device);

if(IS_ERR(mypd)){
printk(KERN_INFO "Failed to allocate PD\n");
return;
}
else{
printk(KERN_INFO "1Successfully allocated the PD\n");
pdset = true;
}
// Registering Memory
mymr = ib_get_dma_mr(mypd,IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE);
if(IS_ERR(mymr)){
printk("failed to register memory :( %d \n",PTR_ERR(mymr));
}else{
printk(KERN_INFO "Successfully registered memory region :) \n");
}
// End Registering Memory
// Creating the receive completion queue for RDMA
myrcvcq = ib_create_cq(device,myClient_ib_recvcompletion,NULL,NULL,myClient_recvq_size,0);
if(IS_ERR(myrcvcq)){
pr_err("%s:%d error code for receive cq%d\n", __func__, __LINE__, PTR_ERR(myrcvcq));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk("Recieve CQ successfully created in address: %x \n",myrcvcq);
}

// Creating the send completion queue for RDMA
myClientsendcq = ib_create_cq(device,myClient_ib_sendcompletion, NULL, NULL,myClient_sendq_size,0 );
if(IS_ERR(myClientsendcq)){
pr_err("%s:%d scqerror code for send cq%d\n", __func__, __LINE__, PTR_ERR(myClientsendcq));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk("1Send CQ successfully created in address: %x \n",myClientsendcq);
}

// Creating the queue pair
// Creating the queue pair

struct ib_qp_init_attr init_qpattr;

memset(&init_qpattr,0,sizeof(init_qpattr));
init_qpattr.event_handler = myClient_qp_event_handler;
init_qpattr.cap.max_send_wr = 2;
init_qpattr.cap.max_recv_wr = 2;
init_qpattr.cap.max_recv_sge = 1;
init_qpattr.cap.max_send_sge = 1;
init_qpattr.sq_sig_type = IB_SIGNAL_ALL_WR;
init_qpattr.qp_type = IB_QPT_UD;
init_qpattr.send_cq = myClientsendcq;
init_qpattr.recv_cq = myrcvcq;

myClientqp = ib_create_qp(mypd,&init_qpattr);

if(IS_ERR(myClientqp)){
pr_err("%s:%d error code %d\n", __func__, __LINE__, PTR_ERR(myClientqp));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk(KERN_INFO "1The queue pair is successfully created \n");
qpcreated = true;
}



}
/*
 * ib_client "remove" callback: the device is going away.
 * NOTE(review): currently a stub — it should destroy the QP and CQs,
 * dereg the MR, and deallocate the PD created in myClient_add_one().
 */
static void myClient_remove_one(struct ib_device *device)
{
}

/*
 * Client registration record handed to ib_register_client(); the IB core
 * calls .add/.remove for every device that appears or disappears.
 */
static struct ib_client my_client = {
.name = "myRDMAclient",
.add = myClient_add_one,
.remove = myClient_remove_one
};


/*
 * Module init: register this ib_client with the IB core, which then
 * invokes my_client.add (myClient_add_one) for each existing IB device.
 * Returns 0 on success or the negative errno from ib_register_client().
 *
 * Fixes over the original: the success message embedded the literal text
 * "lKERN_INFO" inside a KERN_ALERT string (garbled log level); the failure
 * path silently swallowed the error (commented-out printk) and jumped to
 * an empty goto label — replaced with a logged direct return.
 */
static int __init myRDMAclient_init(void)
{
	int ret;

	ret = ib_register_client(&my_client);
	if (ret) {
		printk(KERN_ERR "Failed to register IB client: %d\n", ret);
		return ret;
	}

	printk(KERN_INFO "Successfully registered myRDMAclient module\n");
	return 0;
}


module_init(myRDMAclient_init);

最佳答案

更新 :

根据下面评论中的讨论，我猜您是在当前发行版之上安装了 Mellanox OFED 驱动程序。查看 Mellanox OFED 内核驱动程序 3.1-1.0.3 的源代码，我发现他们通过添加一些字段更改了 struct ib_qp_init_attr 的布局。我很确定问题在于：您是针对原始 SLE 3.0.76-0.11 内核头文件构建模块的，因此您传递给 ib_create_qp 的 init_qpattr 结构中，各个字段的值并没有落在新驱动程序所期望的偏移位置上。

我不知道您是如何安装新的树外驱动程序的,所以我无法准确告诉您如何正确构建模块,但您可以尝试添加类似

    init_qpattr.qpg_type = 0;

到你设置该结构的代码处。（我知道你已经把整个结构 memset 为零了，但这样做可以确保你构建时使用的头文件确实含有新增的 qpg_type 结构成员。我认为这是 OFED 新添加的字段，原始内核头文件中并没有它；因此如果你的模块能编译通过，就说明你用的是正确的头文件。）

旧答案:

因此，我怀疑您在 mlx4 驱动程序中遇到了与创建如此小的 QP（max_send_wr == max_recv_wr == 2，max_send_sge == max_recv_sge == 1）相关的错误。我设法找到了您正在使用的 3.0.76-0.11 内核的源代码，但遗憾的是没有看到任何明显的错误。

您可以尝试帮助调试的一些事情
  • 在加载 mlx4_core 模块时添加模块参数 debug_level=1。然后把驱动程序初始化期间的全部输出贴到问题里（一堆类似“Max CQEs:”之类的行）。mlx4 驱动中有相当多的逻辑取决于初始化期间固件返回的参数，这些输出能让我们看到那些参数的具体值。
  • 就此而言,值得检查您的 HCA 固件是否是最新的 - 使用更新的固件可能会获得更好的结果(尽管驱动程序无论如何应该可以工作,但由于缺少固件功能,您可能会遇到未经测试的驱动程序代码中的错误触发不同的代码路径)。
  • 尝试修改代码，调大这些参数。例如把 max_send_sge 和 max_recv_sge 增加到 2，把 max_send_wr 和 max_recv_wr 增加到 32 或 128。（可以单独调整，也可以组合调整。）
  • 如果您知道如何启用功能跟踪器(This LWN article 很有帮助;我假设旧的 SLES 内核具有所有必需的功能),那么启用 mlx4_ib 和 mlx4_core 模块的跟踪然后加载您的模块会很棒。如果您使用跟踪更新您的问题,那么我们可以查看创建 QP 操作失败的位置 — 例如,它是否在 set_rq_size() 中失败, 到达 set_kernel_sq_size()还是在其他地方失败?
  • 关于linux-kernel - 无法使用 ib_create_qp 创建队列对,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/34788781/

    24 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com