c++ - 在具有 constant_tsc 和 nonstop_tsc 的 cpu 上，为什么我的时间会漂移？-6ren

c++ - 在具有 constant_tsc 和 nonstop_tsc 的 cpu 上，为什么我的时间会漂移？

转载作者：IT王子更新时间：2023-10-29 00:34:27

我正在一台具有 constant_tsc 和 nonstop_tsc 标志的 CPU 上运行此测试:

$ grep -m 1 ^flags /proc/cpuinfo | sed 's/ /\n/g' | egrep "constant_tsc|nonstop_tsc"
constant_tsc
nonstop_tsc

第 1 步:计算 tsc 的滴答率:

我将 _ticks_per_ns 计算为多个观察值的中位数。我使用 rdtscp 来确保按顺序执行。

// Calibration: estimate the TSC rate in ticks per nanosecond as the median of
// `trials` independent measurements, each spanning at least 10 ms of
// CLOCK_MONOTONIC time. The result is stored in _ticks_per_ns.
// NOTE(review): the rate is calibrated against CLOCK_MONOTONIC, while the later
// comparison loop in this post reads CLOCK_REALTIME -- confirm that is intended.
static const int trials = 13;
std::array<double, trials> rates;

for (int i = 0; i < trials; ++i)
{
    timespec beg_ts, end_ts;
    uint64_t beg_tsc, end_tsc;

    // Clock reading first, TSC reading immediately after; the same order is
    // used at the end of the interval.
    clock_gettime(CLOCK_MONOTONIC, &beg_ts);
    beg_tsc = rdtscp();

    uint64_t elapsed_ns;
    do
    {
        clock_gettime(CLOCK_MONOTONIC, &end_ts);
        end_tsc = rdtscp();

        elapsed_ns = to_ns(end_ts - beg_ts); // calculates ns between two timespecs
    }
    while (elapsed_ns < 10 * 1e6); // busy spin for 10ms

    // Ticks per nanosecond observed during this trial.
    rates[i] = (double)(end_tsc - beg_tsc) / (double)elapsed_ns;
}

// Partially reorder so that index trials/2 holds the median rate.
std::nth_element(rates.begin(), rates.begin() + trials/2, rates.end());

_ticks_per_ns = rates[trials/2];

第 2 步:计算起始挂钟时间和 tsc

// Capture the reference point that pairs a TSC value (_start_tsc) with a
// CLOCK_REALTIME reading (_start_clock_time, in ns since the epoch).
uint64_t beg, end;
timespec ts;

// loop to ensure we aren't interrupted between the two tsc reads
while (1)
{
    // Bracket the clock read with two TSC reads so its duration can be checked.
    beg = rdtscp();
    clock_gettime(CLOCK_REALTIME, &ts);
    end = rdtscp();

    // Accept the sample only when the bracketed clock call took at most 2000 ticks.
    if ((end - beg) <= 2000) // max ticks per clock call
        break;
}

// The reference TSC value is the later of the two bracketing reads.
_start_tsc        = end;
_start_clock_time = to_ns(ts); // converts timespec to ns since epoch

第 3 步:创建一个可以从 tsc 返回挂钟时间的函数

// Converts a raw TSC reading into wall-clock nanoseconds since the epoch by
// linear extrapolation from the reference point (_start_tsc, _start_clock_time)
// using the calibrated rate _ticks_per_ns.
//
// tsc: TSC reading to convert; readings earlier than _start_tsc are handled
//      through the signed tick difference.
// Returns: estimated nanoseconds since the epoch.
uint64_t tsc_to_ns(uint64_t tsc)
{
    const int64_t diff = tsc - _start_tsc;

    // Truncate the elapsed nanoseconds to an integer BEFORE adding the start
    // time. Adding the double quotient directly promotes _start_clock_time
    // (about 1.5e18 ns since the epoch) to double, whose 53-bit mantissa can
    // only represent multiples of 256 ns at that magnitude, so the result was
    // quantised to 256 ns steps.
    const int64_t elapsed_ns = static_cast<int64_t>(diff / _ticks_per_ns);
    return _start_clock_time + elapsed_ns;
}

第 4 步:循环运行，分别打印由 clock_gettime 和由 rdtscp 得到的挂钟时间

// lock the test to a single core
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(6, &mask);
sched_setaffinity(0, sizeof(cpu_set_t), &mask);

while (1)
{
    timespec utc_now;
    clock_gettime(CLOCK_REALTIME, &utc_now);
    uint64_t utc_ns = to_ns(utc_now);
    uint64_t tsc_ns = tsc_to_ns(rdtscp());

    uint64_t ns_diff = tsc_ns - utc_ns;

    std::cout << "clock_gettime " << ns_to_str(utc_ns) << '\n';
    std::cout << "tsc_time      " << ns_to_str(tsc_ns) << " diff=" << ns_diff << "ns\n";

    sleep(10);
}

输出:

clock_gettime 11:55:34.824419837
tsc_time      11:55:34.824419840 diff=3ns
clock_gettime 11:55:44.826260245
tsc_time      11:55:44.826260736 diff=491ns
clock_gettime 11:55:54.826516358
tsc_time      11:55:54.826517248 diff=890ns
clock_gettime 11:56:04.826683578
tsc_time      11:56:04.826684672 diff=1094ns
clock_gettime 11:56:14.826853056
tsc_time      11:56:14.826854656 diff=1600ns
clock_gettime 11:56:24.827013478
tsc_time      11:56:24.827015424 diff=1946ns

问题:

很明显，以这两种方式计算的时间会迅速偏离。

我假设对于 constant_tsc 和 nonstop_tsc，tsc 速率是恒定的。

是板载时钟本身在漂移吗？它应该不会以这么快的速度漂移吧？
这种漂移的原因是什么？
我能做些什么来使它们保持同步(除了在第 2 步中非常频繁地重新计算 _start_tsc 和 _start_clock_time 之外)？

最佳答案

在 OP 中看到漂移的原因，至少在我的机器上，是每 ns 的 TSC 滴答数偏离了最初标定的值 _ticks_per_ns。以下结果来自这台机器:

don@HAL:~/UNIX/OS/3EZPcs/Ch06$ uname -a
Linux HAL 4.4.0-81-generic #104-Ubuntu SMP Wed Jun 14 08:17:06 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux
don@HAL:~/UNIX/OS/3EZPcs/Ch06$  cat /sys/devices/system/clocksource/clocksource0/current_clocksource
tsc

cat /proc/cpuinfo 显示 constant_tsc 和 nonstop_tsc 标志。

可以运行 viewRates.cc 来查看机器上每 ns 的当前 TSC Ticks:

rdtscp.h:

// Reads the time-stamp counter for the start of a measured region.
// Executes `cpuid` followed by `rdtsc` (plain rdtsc, despite the function
// name) and returns the counter assembled from the EDX:EAX register pair.
// NOTE(review): cpuid is presumably used as a serializing fence, following the
// Intel benchmarking paper cited in this header -- confirm against that paper.
// NOTE(review): the 32-bit shift assumes `unsigned long` is 64 bits wide.
static inline unsigned long rdtscp_start(void) {
  unsigned long var;
  unsigned int hi, lo;  // high and low 32-bit halves of the counter

  // Outputs: EAX -> lo, EDX -> hi. rbx and rcx are listed as clobbered.
  __asm volatile ("cpuid\n\t"
          "rdtsc\n\t" : "=a" (lo), "=d" (hi)
          :: "%rbx", "%rcx");

  // Combine the two halves into one 64-bit tick count.
  var = ((unsigned long)hi << 32) | lo;
  return (var);
}

// Reads the time-stamp counter for the end of a measured region.
// Executes `rdtscp`, copies EDX/EAX into the two output operands, then executes
// `cpuid`, and returns the counter assembled from the two halves.
// NOTE(review): the trailing cpuid is presumably a serializing fence, following
// the Intel benchmarking paper cited in this header -- confirm against that paper.
// NOTE(review): the 32-bit shift assumes `unsigned long` is 64 bits wide.
static inline unsigned long rdtscp_end(void) {
  unsigned long var;
  unsigned int hi, lo;  // high and low 32-bit halves of the counter

  // Outputs: %0 = lo (from EAX), %1 = hi (from EDX); the values are moved out
  // before cpuid runs. rax, rbx, rcx and rdx are listed as clobbered.
  __asm volatile ("rdtscp\n\t"
          "mov %%edx, %1\n\t"
          "mov %%eax, %0\n\t"
          "cpuid\n\t"  : "=r" (lo), "=r" (hi)
          :: "%rax", "%rbx", "%rcx", "%rdx");

  // Combine the two halves into one 64-bit tick count.
  var = ((unsigned long)hi << 32) | lo;
  return (var);
  }

/*see https://www.intel.com/content/www/us/en/embedded/training/ia-32-ia-64-benchmark-code-execution-paper.html
 */

viewRates.cc:

#include <time.h>
#include <unistd.h>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include "rdtscp.h"
using std::cout;  using std::cerr;  using std::endl;

// Clock id passed to clock_getres()/clock_gettime() throughout this program.
#define CLOCK CLOCK_REALTIME

uint64_t to_ns(const timespec &ts);   // Converts a struct timespec to ns (since epoch).
// Prints `runs` rate samples, each measured across a sleep of `sleep` seconds;
// both arguments default to 10.
void view_ticks_per_ns(int runs =10, int sleep =10);

// Entry point of viewRates.
// Usage: viewRates [ RUNS SLEEP ]
// With no arguments both values default to 10; with exactly two arguments they
// are parsed with std::atoi. Any other argument count prints the usage line to
// stderr and exits with status 1.
int main(int argc, char **argv) {
  int run_count = 10;
  int sleep_secs = 10;

  switch (argc) {
    case 1:
      // No arguments: keep the defaults.
      break;
    case 3:
      run_count = std::atoi(argv[1]);
      sleep_secs = std::atoi(argv[2]);
      break;
    default:
      cerr << "Usage: " << argv[0] << " [ RUNS SLEEP ] \n";
      exit(1);
  }

  view_ticks_per_ns(run_count, sleep_secs);
}

  void view_ticks_per_ns(int RUNS, int SLEEP) {
// Prints out stream of RUNS tsc ticks per ns, each calculated over a SLEEP secs interval.
  timespec clock_start, clock_end;
  unsigned long tsc1, tsc2, tsc_start, tsc_end;
  unsigned long elapsed_ns, elapsed_ticks;
  double rate; // ticks per ns from each run.

  clock_getres(CLOCK, &clock_start);
  cout <<  "Clock resolution: " << to_ns(clock_start) << "ns\n\n";

  cout << " tsc ticks      " << "ns      " << " tsc ticks per ns\n";
  for (int i = 0; i < RUNS; ++i) {
    tsc1 = rdtscp_start();
    clock_gettime(CLOCK, &clock_start);
    tsc2 = rdtscp_end();                      
    tsc_start = (tsc1 + tsc2) / 2;

    sleep(SLEEP);

    tsc1 = rdtscp_start();
    clock_gettime(CLOCK, &clock_end);
    tsc2 = rdtscp_end();                     
    tsc_end = (tsc1 + tsc2) / 2;

    elapsed_ticks = tsc_end - tsc_start;
    elapsed_ns = to_ns(clock_end) - to_ns(clock_start);
    rate = static_cast<double>(elapsed_ticks) / elapsed_ns;

    cout << elapsed_ticks << " " << elapsed_ns << " " << std::setprecision(12) << rate << endl;
  } 
}

可以运行 linearExtrapolator.cc 来重新创建 OP 的实验:

linearExtrapolator.cc:

#include <time.h>
#include <unistd.h>
#include <iostream>
#include <iomanip>
#include <algorithm>
#include <array>
#include "rdtscp.h"

using std::cout;  using std::endl;  using std::array;

// Clock id passed to clock_getres()/clock_gettime() throughout this program.
#define CLOCK CLOCK_REALTIME

uint64_t to_ns(const timespec &ts);   // Converts a struct timespec to ns (since epoch).
void set_ticks_per_ns(bool set_rate); // Display or set tsc ticks per ns, _ticks_per_ns.
void get_start();             // Sets the 'start' time point: _start_tsc[in ticks] and _start_clock_time[in ns].
uint64_t tsc_to_ns(uint64_t tsc);     // Convert tsc ticks since _start_tsc to ns (since epoch) linearly using
                                      // _ticks_per_ns with origin(0) at the 'start' point set by get_start().

uint64_t _start_tsc, _start_clock_time; // The 'start' time point as both tsc tick number, _start_tsc, and as
                                        // clock_gettime ns since epoch as _start_clock_time.
double _ticks_per_ns;                   // Calibrated in set_ticks_per_ns(); only assigned when set_rate is true.

int main() {
  set_ticks_per_ns(true); // Set _ticks_per_ns as the initial TSC ticks per ns.

  uint64_t tsc1, tsc2, tsc_now, tsc_ns, utc_ns;
  int64_t ns_diff;
  bool first_pass{true};
  for (int i = 0; i < 10; ++i) {
    timespec utc_now;
    if (first_pass) {
      get_start(); //Get start time in both ns since epoch (_start_clock_time), and tsc tick number(_start_tsc)
      cout << "_start_clock_time: " <<  _start_clock_time << ", _start_tsc: " << _start_tsc << endl;
      utc_ns = _start_clock_time;
      tsc_ns = tsc_to_ns(_start_tsc);   // == _start_clock_time by definition.
      tsc_now = _start_tsc;
      first_pass = false;
    } else {
      tsc1 = rdtscp_start();
      clock_gettime(CLOCK, &utc_now);
      tsc2 = rdtscp_end();
      tsc_now = (tsc1 + tsc2) / 2;
      tsc_ns = tsc_to_ns(tsc_now);
      utc_ns = to_ns(utc_now);
    }

    ns_diff = tsc_ns - (int64_t)utc_ns;

    cout << "elapsed ns: " << utc_ns - _start_clock_time << ", elapsed ticks: " << tsc_now - _start_tsc 
     << ", ns_diff: " << ns_diff << '\n' << endl;

    set_ticks_per_ns(false);  // Display current TSC ticks per ns (does not alter original _ticks_per_ns).
  }
}

void set_ticks_per_ns(bool set_rate) {
  constexpr int RUNS {1}, SLEEP{10};
  timespec clock_start, clock_end;
  uint64_t tsc1, tsc2, tsc_start, tsc_end;
  uint64_t elapsed_ns[RUNS], elapsed_ticks[RUNS];
  array<double, RUNS> rates; // ticks per ns from each run.

  if (set_rate) {
    clock_getres(CLOCK, &clock_start);
    cout <<  "Clock resolution: " << to_ns(clock_start) << "ns\n";
  }

  for (int i = 0; i < RUNS; ++i) {
    tsc1 = rdtscp_start();
    clock_gettime(CLOCK, &clock_start);
    tsc2 = rdtscp_end();                      
    tsc_start = (tsc1 + tsc2) / 2;

    sleep(SLEEP);

    tsc1 = rdtscp_start();
    clock_gettime(CLOCK, &clock_end);
    tsc2 = rdtscp_end();                     
    tsc_end = (tsc1 + tsc2) / 2;

    elapsed_ticks[i] = tsc_end - tsc_start;
    elapsed_ns[i] = to_ns(clock_end) - to_ns(clock_start);
    rates[i] = static_cast<double>(elapsed_ticks[i]) / elapsed_ns[i];
  }

  cout << " tsc ticks      " << "ns     " << "tsc ticks per ns" << endl;
  for (int i = 0; i < RUNS; ++i)
    cout << elapsed_ticks[i] << " " << elapsed_ns[i] << " " << std::setprecision(12) << rates[i] << endl;

  if (set_rate)
    _ticks_per_ns = rates[RUNS-1];
}

// Number of nanoseconds in one second.
constexpr uint64_t BILLION = 1000000000;

// Flattens a timespec into a single nanosecond count: whole seconds scaled to
// nanoseconds plus the nanosecond remainder.
uint64_t to_ns(const timespec &ts) {
  const uint64_t whole_seconds_ns = ts.tv_sec * BILLION;
  return whole_seconds_ns + ts.tv_nsec;
}

// Records the start point in both time bases: _start_tsc (TSC ticks) and
// _start_clock_time (CLOCK nanoseconds since the epoch).
void get_start() {
  timespec wall;
  uint64_t tsc_before, tsc_after;

  // Repeat until the bracketed clock call completes within 2000 ticks, so a
  // sample that was interrupted between the two TSC reads is discarded.
  do {
    tsc_before = rdtscp_start();
    clock_gettime(CLOCK, &wall);
    tsc_after = rdtscp_end();
  } while ((tsc_after - tsc_before) > 2000);

  // The midpoint of the bracketing reads is paired with the clock reading.
  _start_tsc = (tsc_after + tsc_before) / 2;
  _start_clock_time = to_ns(wall);
}

// Maps a TSC reading to absolute nanoseconds by linear extrapolation from the
// start point, where _start_tsc (ticks) corresponds to _start_clock_time (ns).
// The tick offset is scaled by _ticks_per_ns and truncated to whole ns before
// it is added to the start time.
uint64_t tsc_to_ns(uint64_t tsc) {
  const uint64_t ticks_since_start = tsc - _start_tsc;
  const double ns_since_start = ticks_since_start / _ticks_per_ns;
  return _start_clock_time + static_cast<uint64_t>(ns_since_start);
}

这是运行 viewRates 后紧跟 linearExtrapolator 的输出:

don@HAL:~/UNIX/OS/3EZPcs/Ch06$ ./viewRates 
Clock resolution: 1ns

 tsc ticks      ns       tsc ticks per ns
28070466526 10000176697 2.8069970538
28070500272 10000194599 2.80699540335
28070489661 10000196097 2.80699392179
28070404159 10000170879 2.80699245029
28070464811 10000197285 2.80699110338
28070445753 10000195177 2.80698978932
28070430538 10000194298 2.80698851457
28070427907 10000197673 2.80698730414
28070409903 10000195492 2.80698611597
28070398177 10000195328 2.80698498942
don@HAL:~/UNIX/OS/3EZPcs/Ch06$ ./linearExtrapolator
Clock resolution: 1ns
 tsc ticks      ns     tsc ticks per ns
28070385587 10000197480 2.8069831264
_start_clock_time: 1497966724156422794, _start_tsc: 4758879747559
elapsed ns: 0, elapsed ticks: 0, ns_diff: 0

 tsc ticks      ns     tsc ticks per ns
28070364084 10000193633 2.80698205596
elapsed ns: 10000247486, elapsed ticks: 28070516229, ns_diff: -3465

 tsc ticks      ns     tsc ticks per ns
28070358445 10000195130 2.80698107188
elapsed ns: 20000496849, elapsed ticks: 56141027929, ns_diff: -10419

 tsc ticks      ns     tsc ticks per ns
28070350693 10000195646 2.80698015186
elapsed ns: 30000747550, elapsed ticks: 84211534141, ns_diff: -20667

 tsc ticks      ns     tsc ticks per ns
28070324772 10000189692 2.80697923105
elapsed ns: 40000982325, elapsed ticks: 112281986547, ns_diff: -34158

 tsc ticks      ns     tsc ticks per ns
28070340494 10000198352 2.80697837242
elapsed ns: 50001225563, elapsed ticks: 140352454025, ns_diff: -50742

 tsc ticks      ns     tsc ticks per ns
28070325598 10000196057 2.80697752704
elapsed ns: 60001465937, elapsed ticks: 168422905017, ns_diff: -70335

^C

viewRates 输出显示每 ns 的 TSC 滴答数随时间相当迅速地减小，对应于上图中的某一次急剧下降。linearExtrapolator 输出显示，与 OP 中一样，clock_gettime() 报告的经过 ns 数，与用开始时获得的 _ticks_per_ns == 2.8069831264 把经过的 TSC 滴答数换算成的经过 ns 数，二者之间存在差异。在每次打印 elapsed ns、elapsed ticks、ns_diff 之间，我没有使用 sleep(10);，而是用 10 秒窗口重新运行每 ns TSC 滴答数的计算；这会打印出当前的每 ns TSC 滴答数。可以看出，viewRates 输出中观察到的每 ns TSC 滴答数减小的趋势，在 linearExtrapolator 的整个运行过程中持续存在。

将 elapsed ticks 除以 _ticks_per_ns 并减去相应的 elapsed ns 得到 ns_diff，例如:( 84211534141/2.8069831264) - 30000747550 = -20667。但这不是 0，主要是因为 TSC 每 ns 滴答的漂移。如果我们使用从最后 10 秒间隔获得的每 ns 2.80698015186 个滴答的值，结果将是:(84211534141/2.80698015186) - 30000747550 = 11125。在最后 10 秒间隔期间累积的额外误差，-20667 - -10419 = - 10248，当正确的 TSC ticks per ns 值用于该间隔时几乎消失:(84211534141 - 56141027929)/2.80698015186 - (30000747550 - 20000496849) = 349。

如果 linearExtrapolator 在每 ns 的 TSC 滴答数保持不变的时段运行，则精度将受限于(常数)_ticks_per_ns 的标定精度，此时取若干次估计的中位数之类的做法就很值得。如果 _ticks_per_ns 与真实值相差固定的十亿分之 40，则预计每 10 秒会产生约 400ns 的恒定漂移，因此 ns_diff 将每 10 秒增大/减小 400ns。

genTimeSeriesofRates.cc 可用于生成上图所用的数据:

genTimeSeriesofRates.cc:

#include <time.h>
#include <unistd.h>
#include <iostream>
#include <iomanip>
#include <algorithm>
#include <array>
#include "rdtscp.h"

using std::cout;  using std::cerr;  using std::endl;  using std::array;

double get_ticks_per_ns(long &ticks, long &ns); // Get median tsc ticks per ns, ticks and ns.
long ts_to_ns(const timespec &ts);              // Converts a struct timespec to a nanosecond count.

#define CLOCK CLOCK_REALTIME            // clock_gettime() clock to use.
#define TIMESTEP 10       // NOTE(review): presumably the seconds between successive samples -- confirm against main().
#define NSTEPS  10000     // Number of samples main() produces.
#define RUNS 5            // Number of RUNS and SLEEP interval used for each sample in get_ticks_per_ns().
#define SLEEP 1

int main() {
  timespec ts;
  clock_getres(CLOCK, &ts);
  cerr << "CLOCK resolution: " << ts_to_ns(ts) << "ns\n";

  clock_gettime(CLOCK, &ts);
  int start_time = ts.tv_sec;

  double ticks_per_ns;
  int running_elapsed_time = 0; //approx secs since start_time to center of the sampling done by get_ticks_per_ns()
  long ticks, ns;
  for (int timestep = 0; timestep < NSTEPS; ++timestep) {
    clock_gettime(CLOCK, &ts);
    ticks_per_ns = get_ticks_per_ns(ticks, ns);
    running_elapsed_time = ts.tv_sec - start_time + RUNS * SLEEP / 2;

    cout << running_elapsed_time << ' ' << ticks << ' ' << ns << ' ' 
     << std::setprecision(12) << ticks_per_ns << endl;

    sleep(10);
  }
}

double get_ticks_per_ns(long &ticks, long &ns) {
  // get the median over RUNS runs of elapsed tsc ticks, CLOCK ns, and their ratio over a SLEEP secs time interval 
  timespec clock_start, clock_end;
  long tsc_start, tsc_end;
  array<long, RUNS> elapsed_ns, elapsed_ticks;
  array<double, RUNS> rates; // arrays from each run from which to get medians.

  for (int i = 0; i < RUNS; ++i) {
    clock_gettime(CLOCK, &clock_start);
    tsc_start = rdtscp_end(); // minimizes time between clock_start and tsc_start.
    sleep(SLEEP);
    clock_gettime(CLOCK, &clock_end);
    tsc_end = rdtscp_end();

    elapsed_ticks[i] = tsc_end - tsc_start;
    elapsed_ns[i] = ts_to_ns(clock_end) - ts_to_ns(clock_start);
    rates[i] = static_cast<double>(elapsed_ticks[i]) / elapsed_ns[i];
  }

  // get medians:
  std::nth_element(elapsed_ns.begin(), elapsed_ns.begin() + RUNS/2, elapsed_ns.end());
  std::nth_element(elapsed_ticks.begin(), elapsed_ticks.begin() + RUNS/2, elapsed_ticks.end());
  std::nth_element(rates.begin(), rates.begin() + RUNS/2, rates.end());
  ticks = elapsed_ticks[RUNS/2];
  ns = elapsed_ns[RUNS/2];

  return rates[RUNS/2];
}

// Number of nanoseconds in one second.
constexpr long BILLION = 1000000000;

// Flattens a timespec into a single nanosecond count: whole seconds scaled to
// nanoseconds plus the nanosecond remainder.
long ts_to_ns(const timespec &ts) {
  const long whole_seconds_ns = ts.tv_sec * BILLION;
  return whole_seconds_ns + ts.tv_nsec;
}