c - Linux 模块 : performance counter does not work-6ren

c - Linux 模块 : performance counter does not work

转载作者：IT王子更新时间：2023-10-29 01:15:07

我想监控末级缓存中的缓存请求数。我根据这个教程（here）编写了一个 Linux 模块来获取该信息。

可以编译运行，但是输出结果一直是0，也就是说，我用rdmsr的时候，总是给我edx=0，eax=0。我甚至尝试了 tutorial 中的演示代码，输出还是0。

我整整一个星期都被这个问题困住了。谁能帮我指出我在程序中犯的错误？

我知道有一些现有的程序在做同样的事情，但我必须知道如何自己编写代码，因为我想在 Xen hypervisor 中监视缓存请求。我无法在 Xen 中使用这些工具，除非我将这些工具合并到 Xen 的管理程序中，而这似乎需要更多的工作。

/*
 * Record the cache miss rate of Intel Sandybridge cpu
 * To confirm the event is correctly set!
 */
#include <linux/module.h>   /* Needed by all modules */
#include <linux/kernel.h>   /* Needed for KERN_INFO */

/*
 * Event-select MSR addresses (the value loaded into %ecx for wrmsr).
 * PERFEVTSELn configures what counter PMCn counts.
 */
#define PERFEVTSEL0    0x186
#define PERFEVTSEL1    0x187
#define PERFEVTSEL2    0x188
#define PERFEVTSEL3    0x189

/*
 * Counter MSR addresses matching the selectors above, one per selector.
 * The addresses are consecutive starting at 0xc1; PMC2 must not share an
 * address with PMC1, otherwise reading "PMC2" returns the counter that is
 * driven by PERFEVTSEL1 instead of PERFEVTSEL2.
 */
#define PMC0    0xc1
#define PMC1    0xc2
#define PMC2    0xc3
#define PMC3    0xc4

/*Intel Software Developer Manual Page 2549*/ /*L1I L1D cache events has not been confirmed!*/
/*L1 Instruction Cache Performance Tuning Events*/
#define L1I_ALLHIT_EVENT    0x80
#define L1I_ALLHIT_MASK     0x01
#define L1I_ALLMISS_EVENT   0x80    /*confirmed*/
#define L1I_ALLMISS_MASK    0x02    /*confirmed*/

/*L1 Data Cache Performance Tuning Events*/ 
/*Intel does not have the ALLREQ Miss mask; have to add LD_miss and ST_miss*/
#define L1D_ALLREQ_EVENT    0x43
#define L1D_ALLREQ_MASK     0x01
#define L1D_LDMISS_EVENT    0x40
#define L1D_LDMISS_MASK     0x01
#define L1D_STMISS_EVENT    0x28
#define L1D_STMISS_MASK     0x01

/*L2 private cache for each core*/ /*confirmed*/
#define L2_ALLREQ_EVENT     0x24
#define L2_ALLREQ_MASK      L2_ALLCODEREQ_MASK  /*0xFF*/
#define L2_ALLMISS_EVENT    0x24
#define L2_ALLMISS_MASK     L2_ALLCODEMISS_MASK /*0xAA*/

#define L2_ALLCODEREQ_MASK  0x30
#define L2_ALLCODEMISS_MASK 0x20

/*L3 shared cache*/ /*confirmed*/
/*Use the last level cache event and mask*/
#define L3_ALLREQ_EVENT     0x2E
#define L3_ALLREQ_MASK      0x4F
#define L3_ALLMISS_EVENT    0x2E
#define L3_ALLMISS_MASK     0x41 

/* USR (bit 16) and OS (bit 17) flags of an event-select value. */
#define USR_BIT             (0x01UL << 16)
#define OS_BIT              (0x01UL << 17)

/*
 * Set or clear the USR / OS flag in the lvalue passed as `eax`.
 * Every macro operates on its own argument; the argument is parenthesized
 * so that any lvalue expression can be passed safely.
 */
#define SET_MSR_USR_BIT(eax)    ((eax) |= USR_BIT)
#define CLEAR_MSR_USR_BIT(eax)  ((eax) &= (~USR_BIT))
#define SET_MSR_OS_BIT(eax)     ((eax) |= OS_BIT)
#define CLEAR_MSR_OS_BIT(eax)   ((eax) &= (~OS_BIT))

/* OR the event number (bits 0-7) and the unit mask (bits 8-15) into `eax`. */
#define SET_EVENT_MASK(eax, event, umask)    ((eax) |= ((event) | ((umask) << 8)))

/* EN flag (bit 22): when set, the counter starts counting. */
#define MSR_ENFLAG      (0x1<<22)


/*
 * Write a 32-bit value to a model-specific register (32-bit insn, v3).
 *
 * eax: low 32 bits to store in the MSR; the high 32 bits are written as 0.
 * ecx: address of the MSR to write.
 *
 * The MSR is zeroed first and then loaded with `eax`, so the register never
 * holds a mix of old and new fields. Note that the register zeroed here is
 * the one named by `ecx` (an event selector in this file), not the counter
 * that the selector drives.
 *
 * The value is written exactly as given. Callers start a counter by
 * including MSR_ENFLAG in `eax` and stop it by leaving the flag out; an
 * earlier version forced the flag on, which made stopping impossible.
 *
 * wrmsr is a privileged instruction: this must run in kernel context.
 */
static inline void rtxen_write_msr(uint32_t eax, uint32_t ecx)
{
    /* Step 1: clear the target MSR (edx:eax = 0). */
    __asm__ __volatile__("wrmsr"
        : /* no outputs */
        : "c" (ecx), "a" ((uint32_t)0), "d" ((uint32_t)0)
        : "memory");

    /* Step 2: load the requested value (edx = 0, eax = value). */
    __asm__ __volatile__("wrmsr"
        : /* no outputs */
        : "c" (ecx), "a" (eax), "d" ((uint32_t)0)
        : "memory");
}

/*
 * Read a model-specific register with rdmsr.
 *
 * ecx: points to the address of the MSR to read (only read, never written).
 * eax: receives the low 32 bits of the MSR.
 * edx: receives the high 32 bits of the MSR.
 */
static inline void  rtxen_read_msr(uint32_t* ecx, uint32_t *eax, uint32_t* edx)
{
    uint32_t msr_addr = *ecx;
    uint32_t low_word;
    uint32_t high_word;

    __asm__ __volatile__("rdmsr"
        : "=a" (low_word), "=d" (high_word)
        : "c" (msr_addr));

    *eax = low_word;
    *edx = high_word;
}

/*
 * Generate a small burst of memory traffic by storing to every byte of a
 * 1000-byte stack buffer.
 *
 * The buffer is volatile: it is never read back, so without the qualifier
 * an optimizing compiler is free to drop the stores (and the whole loop),
 * leaving the caller with no workload between programming and reading a
 * counter.
 */
static inline void delay(void )
{
    volatile char tmp[1000];
    int i;

    for (i = 0; i < (int)sizeof(tmp); i++) {
        tmp[i] = (char)(i * 2);
    }
}

/* Selects which measurement init_module performs. */
enum cache_level {
    UOPS = 0,   /* demo event programmed with a raw selector value */
    L1I  = 1,   /* L1 instruction cache */
    L1D  = 2,   /* L1 data cache */
    L2   = 3,   /* L2 cache */
    L3   = 4    /* L3 cache */
};

int init_module(void)
{
    enum cache_level op;
    uint32_t eax, edx, ecx;
    uint64_t l3_all;
    op = UOPS;
    switch(op)
    {
    case UOPS:
        eax = 0x0001010E;
        eax |= MSR_ENFLAG;
        ecx = 0x187;
        printk(KERN_INFO "UOPS Demo: write_msr: eax=%#010x, ecx=%#010x\n", eax, ecx);
        rtxen_write_msr(eax, ecx);
        ecx = 0xc2;
        eax = 1;
        edx = 2;
        rtxen_read_msr(&ecx, &eax, &edx);
        printk(KERN_INFO "UOPS Demo: read_msr: edx=%#010x, eax=%#010x\n", edx, eax);
        break;
    case L3: 
        eax = 0;
        SET_MSR_USR_BIT(eax);
        SET_MSR_OS_BIT(eax);
        SET_EVENT_MASK(eax, L3_ALLREQ_EVENT, L3_ALLREQ_MASK);
        eax |= MSR_ENFLAG;
        ecx = PERFEVTSEL2;
        printk(KERN_INFO "before wrmsr: eax=%#010x, ecx=%#010x\n", eax, ecx);
        rtxen_write_msr(eax, ecx);
        printk(KERN_INFO "after wrmsr: eax=%#010x, ecx=%#010x\n", eax, ecx);
        printk(KERN_INFO "L3 all request set MSR PMC2\n");
        printk(KERN_INFO "delay by access an array\n");
        delay();
        ecx = PMC2;
        eax = 1;
        edx = 2;
        printk(KERN_INFO "rdmsr: ecx=%#010x\n", ecx);
        rtxen_read_msr(&ecx, &eax, &edx); /*need to pass into address!*/
        l3_all = ( ((uint64_t) edx << 32) | eax );
        printk(KERN_INFO "rdmsr: L3 all request is %llu (%#010lx)\n", l3_all, (unsigned long)l3_all);
        break;
    default:
        printk(KERN_INFO "operation not implemented yet\n");   
    }
    /* 
     * A non 0 return means init_module failed; module can't be loaded. 
     */
    return 0;
}

/*
 * Module exit hook: only logs a farewell message.
 * NOTE(review): it does not write to any MSR, so an event selector that
 * init_module programmed keeps the value it was given.
 */
void cleanup_module(void)
{
    printk(KERN_INFO "Goodbye world 1.\n");
}

我得到的结果是:

[ 1780.946584] UOPS Demo: write_msr: eax=0x0001010e, ecx=0x00000187
[ 1780.946590] UOPS Demo: read_msr: edx=0x00000000, eax=0x00000000
[ 1818.595055] Goodbye world 1.
[ 1821.153947] UOPS Demo: write_msr: eax=0x0041010e, ecx=0x00000187
[ 1821.153950] UOPS Demo: read_msr: edx=0x00000000, eax=0x00000000

最佳答案

我终于在@Manuel Selva 的帮助下解决了这个问题!

设置性能计数器的正确流程是:

第一步:设置msr并通过设置eax中的EN位使能计数器；

第二步:写入msr停止计数器

第三步:读取计数器

我错过了第 2 步，这就是为什么它总是给我 0。如果我想在停止之前读取计数器，报告 0 是有意义的。

switch语句的正确代码如下:

 /*
  * Corrected body of the switch in init_module: each case now programs
  * the selector, writes it a second time without the EN bit, and only
  * then reads the counter.
  * NOTE(review): verify that rtxen_write_msr writes the value as given
  * and does not OR MSR_ENFLAG back in, otherwise the "stop" writes below
  * have no effect.
  */
 switch(op)
    {
    case UOPS:
        /* Bit 22 (MSR_ENFLAG) is already set in this literal; the OR is a no-op. */
        eax = 0x0051010E;
        eax |= MSR_ENFLAG;
        ecx = 0x187; /* same value as PERFEVTSEL1 */
        printk(KERN_INFO "UOPS Demo: write_msr: eax=%#010x, ecx=%#010x\n", eax, ecx);
        rtxen_write_msr(eax, ecx);
        /* Stop counting: same selector value with bit 22 (EN) cleared. */
        eax = 0x0011010E;
        rtxen_write_msr(eax,ecx);
        ecx = 0xc2; /* same value as PMC1 */
        /* Preset outputs to 1/2, presumably to make an untouched result visible. */
        eax = 1;
        edx = 2;
        rtxen_read_msr(&ecx, &eax, &edx);
        printk(KERN_INFO "UOPS Demo: read_msr: edx=%#010x, eax=%#010x\n", edx, eax);
        break;
    case L3: 
        /* Build the selector: USR + OS flags, L3 "all requests" event, EN, INT. */
        eax = 0;
        SET_MSR_USR_BIT(eax);
        SET_MSR_OS_BIT(eax);
        SET_EVENT_MASK(eax, L3_ALLREQ_EVENT, L3_ALLREQ_MASK);
        eax |= MSR_ENFLAG;
        eax |= (1<<20); /* INT bit (bit 20): per the author, counter overflow */
        ecx = PERFEVTSEL2;
        printk(KERN_INFO "before wrmsr: eax=%#010x, ecx=%#010x\n", eax, ecx);
        rtxen_write_msr(eax, ecx);
        printk(KERN_INFO "after wrmsr: eax=%#010x, ecx=%#010x\n", eax, ecx);
        printk(KERN_INFO "L3 all request set MSR PMC2\n");
        printk(KERN_INFO "delay by access an array\n");
        /* Workload to be measured. */
        delay();
        /* Stop the counter: rewrite the selector with EN cleared. */
        eax &= (~MSR_ENFLAG);
        rtxen_write_msr(eax, ecx);
        printk(KERN_INFO "stop the counter, eax=%#010x\n", eax);
        /* NOTE(review): confirm PMC2 is the counter paired with PERFEVTSEL2. */
        ecx = PMC2;
        eax = 1;
        edx = 2;
        printk(KERN_INFO "rdmsr: ecx=%#010x\n", ecx);
        rtxen_read_msr(&ecx, &eax, &edx); /* all three arguments are passed by address */
        /* Combine edx:eax into the full 64-bit count. */
        l3_all = ( ((uint64_t) edx << 32) | eax );
        /* NOTE(review): the (unsigned long) cast keeps only the low half where long is 32 bits. */
        printk(KERN_INFO "rdmsr: L3 all request is %llu (%#010lx)\n", l3_all, (unsigned long)l3_all);
        break;
    default:
        printk(KERN_INFO "operation not implemented yet\n");   
    }

关于c - Linux 模块 : performance counter does not work，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/21652256/

文章推荐： php - laravel blade，如何附加到一个部分

文章推荐： git - 使用 Git 获取或 pull 自动修剪

文章推荐： Git:如何压缩分支上的所有提交

文章推荐： linux - Bash - if 语句结合邮件命令

performance - "performant"软件究竟是什么意思？
关闭。这个问题是opinion-based .它目前不接受答案。想改善这个问题吗？更新问题，以便可以通过 editing this post 用事实和引文回答问题. 8年前关闭。 Improve t
performance - 灿灿授权: Performance Issue
暂时忘记能力的定义，只关注能力的“检查”(使用“授权!”)，我看到 CanCan 添加了大约 400 毫秒，用于简单地检查用户是否具有特定的能力主题/模型。这是预期的吗(我假设不是)？或者，有没有可
performance - Swift 显式与推断类型 : Performance
我正在阅读有关 Swift 的教程 ( http://www.raywenderlich.com/74438/swift-tutorial-a-quick-start )，它预定义为不显式设置类型，因
performance - 编码优先级 : Performance, 可维护性、可重用性？
这主要是由于对 SQL 问题的回答。由于性能原因，有意省略了 UDF 和子查询。我没有包括可靠性并不是说它应该被视为理所当然，但代码必须工作。性能永远是第一位的吗？提供了许多以性能为主要优先事项的答
performance - Scala递归与循环: performance and runtime considerations
我已经编写了一个简单的测试平台来测量三种阶乘实现的性能:基于循环的，非尾递归的和尾递归的。 Surprisingly to me the worst performant was the loop o
performance - ui-performance 插件无法在开发模式下工作 (Grails)
我已将 ui-performance 插件应用到我的应用程序中。不幸的是，在开发模式下运行应用程序时它似乎不起作用。例如，我的 javascript 导入是用“vnull”版本呈现的。例如不会
performance - 编译 F# 引用 : performance?
我有一个我操作的 F# 引用(我在各处添加对象池以回收经常创建和删除的短期对象)。我想运行结果报价；现在我使用了 F# PowerPack，它提供了将引用转换为表达式树和委托(delegate)的方法
performance - Spark独立: SparklyR : Performance issues
我正在尝试在 Spark 服务器上运行 SparklyR 库中的机器学习算法。 1 个簇 8 核 24G内存 Ubuntu 16.04 星火2.2 独立配置 1名师傅/2名 worker 每个执行器的
performance - 架构和索引以及主键 : Differences in lookup performance?
我有一个数据库(准确地说是在 postgres 上运行)，具有以下结构: user1 (schema) | - cars (table) - airplanes (table, again) .
performance - iOS/核心动画 : Performance tuning
我的应用程序在我的 iPad 上运行。但它的表现非常糟糕——我的速度低于 15fps。谁能帮我优化一下？它基本上是一个轮子(派生自 UIView)，包含 12 个按钮(派生自 UIControl)。
performance - coursera progfun1 : scala union performance
在完成“Scala 中的函数式编程原则”@coursera 类(class)第 3 周的作业时，我发现当我实现视频类(class)中所示的函数联合时: override def union(tha
performance - Symfony2 依赖注入(inject) : performances impact
我正在重构我的一个 Controller 以使其成为一项服务，我想知道不将整个服务容器注入(inject)我的 Controller 是否会对性能产生影响。这样效率更高吗: innova.path.
performance - facelet tag performance
我有一个要显示的内容很大的文件。例如在显示用户配置文件时，中的每个 EL 表达式需要一个 userId 作为 bean 的参数，该参数取自 session 上下文。我在 xhtml 文件中将这个 u
performance - OpenGL/DirectX : How does Mipmapping improve performance?
我非常了解 mipmapping。我不明白(在硬件/驱动程序级别)是 mipmapping 如何提高应用程序的性能(至少这是经常声称的)。在执行片段着色器之前，驱动程序不知道要访问哪个 mipmap
performance - Scala 惰性值 : performance penalty? 线程安全？
这个问题在这里已经有了答案: 10年前关闭。 Possible Duplicate: What's the (hidden) cost of lazy val? (Scala) Scala 允许定义惰
java - build().perform() 和 Perform() 之间有什么区别
一些文章建议现在 build() 包含在 perform() 本身中，而其他人则建议当要链接多个操作时使用 build().perform()一起。最佳答案 build() 包含在 perform(
performance - postgres 函数 : when does IMMUTABLE hurt performance?
Postgres docs说 For best optimization results, you should label your functions with the strictest vol
performance - 零成本抽象 : performance of for-loop vs. 迭代器
阅读Zero-cost abstractions看着 Introduction to rust: a low-level language with high-level abstractions我尝
performance - MQ : CPU Performance 上的 SSL
我想在 MQ 服务器上部署 SSL，但我想知道我当前的 CPU 容量是否支持 SSL。 (我没有预算增加 CPU 内核和 MQ PVU 的数量) 我的规范: Windows 2003 服务器 SP2，
performance - Chrome Performance Profiler 中的“Timings”选项卡丢失
因此，我在 Chrome 开发者工具的性能选项卡内的时间部分成功地监控了我的 React Native 应用程序的性能。突然在应用程序的特定重新加载时，Timings 标签丢失。我已尝试重置

IT王子

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c - Linux 模块 : performance counter does not work