gpt4 book ai didi

c++ - 如何告诉编译器不要优化内联汇编中使用的变量?

转载 作者:行者123 更新时间:2023-11-30 04:08:26 25 4
gpt4 key购买 nike



我的内联汇编使用与 C 版本相同的变量(作为输入参数传递)

asm ( "" : : "r" (arg1), "r" (arg2) :);

我的问题是,如何让编译器不要把这些变量优化掉?该函数只有在开启优化编译时才会在汇编部分出错。我试过 volatile,但行为仍然不正确。



我正在尝试为 OpenCV 实现 NEON 优化,具体是 lkpyramid.cpp 文件。问题是,在 Release 模式(开启了优化)下它无法正常工作,而在 Debug 模式下工作正常。我追踪到一个被优化掉的变量(FLT_SCALE),把它声明为 volatile 之后那一部分就正常了,但仍然存在另一处由优化引起的错误行为。

gcc 版本可能会有所不同,因为这是一个开源项目,但我目前使用的是 4.8.1。目标架构是带有 NEON 的 ARMv7。我正在测试的处理器是 ARM Cortex-A15(big.LITTLE)。

下面是我当前状态下的代码。请忽略所有注释和 volatile(它们是为了测试这个问题而加的)。这还是半成品(work in progress)。我删除了不相关的代码,以便能贴在这里。我认为问题出在最底部的 asm 代码块中,因为如果我用 if(false) 跳过它,就不会出现问题。谢谢。

#include "precomp.hpp"
#include <float.h>
#include <stdio.h>
#include "lkpyramid.hpp"

// Computes per-pixel derivative pairs of the 8-bit image `src` into `dst`.
// `dst` is created here with the same rows/cols as `src`, element type
// deriv_type, and twice the channel count of `src`.
//
// Each row is processed in two passes (formulas taken from the scalar
// fallback loops below):
//   vertical:   trow0[x] = (above[x] + below[x])*3 + centre[x]*10
//               trow1[x] = below[x] - above[x]
//   horizontal: t0 = trow0[x+cn] - trow0[x-cn]
//               t1 = (trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10
// with t0 and t1 stored interleaved in the destination row.
//
// NOTE(review): no brace-only or #endif lines appear in this listing, so the
// extents of the function body, loops, branches and #ifdef regions are not
// visible here; the comments below describe only the statements shown.
static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
using namespace cv;
using cv::detail::deriv_type;
int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn, depth = src.depth();
// Only 8-bit unsigned input is accepted.
CV_Assert(depth == CV_8U);
dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));

// NOTE(review): the statement controlled by this condition is not visible in
// the listing — presumably an early return when the tegra path handled the
// call; confirm against the original file.
if (tegra::calcSharrDeriv(src, dst))

// Two temporary rows of `delta` elements each; trow0 starts `cn` elements into
// the buffer (then aligned to 16) so that indices -cn..-1 used for the left
// border below stay inside the allocation.
int x, y, delta = (int)alignSize((cols + 2)*cn, 16);
AutoBuffer<deriv_type> _tempBuf(delta*2 + 64);
deriv_type *trow0 = alignPtr(_tempBuf + cn, 16), *trow1 = alignPtr(trow0 + delta, 16);

// Filter weights kept in variables so they can be handed to the asm blocks.
int three = 3, ten = 10;

for( y = 0; y < rows; y++ )
// Rows above/at/below y; at the top and bottom edges the neighbour index is
// reflected (row 1 or row rows-2), or row 0 when the image has a single row.
const uchar* srow0 = src.ptr<uchar>(y > 0 ? y-1 : rows > 1 ? 1 : 0);
const uchar* srow1 = src.ptr<uchar>(y);
const uchar* srow2 = src.ptr<uchar>(y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
deriv_type* drow = dst.ptr<deriv_type>(y);

// do vertical convolution
x = 0;

#ifdef CV_NEON
//assumes deriv_type is 16 bits
if(sizeof(deriv_type) == 2 && colsn >= 16)

// Broadcasts `three` into q8 (16-bit lanes) and `ten` into d18 (8-bit lanes)
// for use by the asm statement inside the loop below.
// NOTE(review): nothing here tells the compiler that q8/d18 are written, or
// that they must stay untouched until the next asm statement. GCC does not
// guarantee that register contents survive between separate asm statements,
// which would fit the "works in Debug, breaks with optimization" symptom —
// confirm against the Extended Asm documentation.
__asm__ volatile ( "vdup.16 q8, %0\n\t"
"vdup.8 d18, %1\n\t"
// NOTE(review): as listed, these operands follow the FIRST colon, which is
// the output-operand position in GCC extended asm; presumably a line holding
// only ":" is missing from this listing — confirm.
: "r" (three), "r" (ten)
: );

for( ; x <= colsn - 8; x += 8)

// Vertical pass for 8 elements at a time: reads from the three source rows
// (%0..%2) and stores the results through %3 (trow0 + x) and %4 (trow1 + x).
// NOTE(review): the stores go through pointers passed as plain "r" operands;
// no "memory" clobber and no list of modified q/d registers is visible, and
// the closing of this asm statement is absent from the listing.
__asm__ volatile ( "vld1.8 {d0}, [%0]\n\t"
"vld1.8 {d1}, [%1]\n\t"
"vld1.8 {d2}, [%2]\n\t"
"vaddl.u8 q4, d0, d2\n\t"
"vsubl.u8 q11, d2, d0\n\t"
"vmul.u16 q5, q4, q8\n\t"
"vmull.u8 q6, d1, d18\n\t"
"vadd.u16 q10, q6, q5\n\t"
"vst1.16 {q10}, [%3]\n\t"
"vst1.16 {q11}, [%4]\n\t"
: "r" (srow0 + x),
"r" (srow1 + x),
"r" (srow2 + x),
"r" (trow0 + x),
"r" (trow1 + x)

// Scalar vertical pass for the remaining elements (all of them when the NEON
// path is not taken).
for( ; x < colsn; x++ )
int t0 = (srow0[x] + srow2[x])*3 + srow1[x]*10;
int t1 = srow2[x] - srow0[x];
trow0[x] = (deriv_type)t0;
trow1[x] = (deriv_type)t1;

// make border
// x0 / x1 are the element offsets of the second and second-to-last pixel
// (or pixel 0 when there is only one column); their values are copied into
// the cn elements before index 0 and the cn elements starting at colsn.
int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols-2 : 0)*cn;
for( int k = 0; k < cn; k++ )
trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];

#ifdef CV_NEON
// Reloads the weights for the horizontal pass: q8 = three, q9 = ten, both as
// 16-bit lanes this time. The same review notes as for the first setup asm
// apply (operand position, no clobbers, register state assumed to persist).
__asm__ volatile ( "vdup.16 q8, %0\n\t"
"vdup.16 q9, %1\n\t"
: "r" (three), "r" (ten)
: );

// do horizontal convolution, interleave the results and store them to dst
x = 0;

#ifdef CV_NEON
//assumes size of deriv_type is 16 bits
if(sizeof(deriv_type) == 2 && colsn >= 16)
for( ; x <= colsn - 8; x += 8 )
// Horizontal pass for 8 elements at a time; results are written through %5
// and %6, i.e. drow + 2*x and drow + 2*x + 8.
// NOTE(review): as above, operands sit in the output position, the statement
// has no visible clobber list or terminator, and it writes memory through "r"
// operands without a "memory" clobber.
__asm__ volatile (
"vld1.16 {q0}, [%0]\n\t" //trow0[x + cn]
"vld1.16 {q1}, [%1]\n\t" //trow0[x - cn]
"vsub.i16 q5, q0, q1\n\t" //this is t0
"vld1.16 {q2}, [%2]\n\t" //trow1[x + cn]
"vld1.16 {q3}, [%3]\n\t" //trow1[x - cn]
"vadd.i16 q6, q2, q3\n\t" //this needs mult by 3
"vld1.16 {q4}, [%4]\n\t" //trow1[x]
"vmul.i16 q7, q6, q8\n\t" //this needs to add to trow1[x]*10
"vmul.i16 q10, q4, q9\n\t" //this is trow1[x]*10
"vadd.i16 q11, q7, q10\n\t" //this is t1
"vswp d22, d11\n\t"
"vst2.16 {q5}, [%5]\n\t" //interleave
"vst2.16 {q11}, [%6]\n\t" //interleave
: "r" (trow0 + x + cn), //0
"r" (trow0 + x - cn), //1
"r" (trow1 + x + cn), //2
"r" (trow1 + x - cn), //3
"r" (trow1 + x), //4
"r" (drow + (x*2)), //5
"r" (drow + (x*2)+8) //6



// Scalar horizontal pass for the remaining elements; writes the (t0, t1) pair
// for element x at drow[2*x] and drow[2*x+1].
for( ; x < colsn; x++ )
deriv_type t0 = (deriv_type)(trow0[x+cn] - trow0[x-cn]);
deriv_type t1 = (deriv_type)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
drow[x*2] = t0; drow[x*2+1] = t1;


// Parameter list and body of a constructor that stores every argument in the
// field of the same name without the leading underscore. The three Mat
// arguments are stored by address, not copied.
// NOTE(review): the line carrying the constructor's name and opening
// parenthesis is not present in this listing — presumably
// cv::detail::LKTrackerInvoker::LKTrackerInvoker( ; confirm against the
// original file.
const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
const Point2f* _prevPts, Point2f* _nextPts,
uchar* _status, float* _err,
Size _winSize, TermCriteria _criteria,
int _level, int _maxLevel, int _flags, float _minEigThreshold )
prevImg = &_prevImg;
prevDeriv = &_prevDeriv;
nextImg = &_nextImg;
prevPts = _prevPts;
nextPts = _nextPts;
status = _status;
err = _err;
winSize = _winSize;
criteria = _criteria;
level = _level;
maxLevel = _maxLevel;
flags = _flags;
minEigThreshold = _minEigThreshold;

// Processes the points with indices in [range.start, range.end).
// For each point, the visible statements:
//   1. scale the previous point to the current pyramid level and pick an
//      initial next point;
//   2. compute fixed-point bilinear weights iw00..iw11 and copy the
//      interpolated window of the previous image (IWinBuf) and of its
//      derivatives (derivIWinBuf), accumulating A11/A12/A22 from the
//      derivative products;
//   3. derive D and minEig from A11/A12/A22 and test them against
//      FLT_EPSILON / minEigThreshold;
//   4. iterate up to criteria.maxCount times, updating nextPt by `delta`;
//   5. optionally write a mean absolute difference into err[ptidx].
//
// NOTE(review): no brace-only, `else`, #endif or jump-statement lines appear
// in this listing, so block extents and branch exits are not visible; the
// comments below describe only the statements shown.
// NOTE(review): per the surrounding question text, the many `volatile`
// qualifiers were added while debugging the optimized-build failure and are
// not part of the intended design.
void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
// Offset from the window's top-left corner to its centre.
Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
const Mat& I = *prevImg;
const Mat& J = *nextImg;
const Mat& derivI = *prevDeriv;

int j, cn = I.channels(), cn2 = cn*2;
// One scratch buffer holding both windows: cn channels for the image patch
// followed by cn2 channels for the derivative patch.
cv::AutoBuffer<deriv_type> _buf(winSize.area()*(cn + cn2));
int derivDepth = DataType<deriv_type>::depth;

Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf);
Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn);

for( int ptidx = range.start; ptidx < range.end; ptidx++ )
// Scale the point from full resolution down to this pyramid level.
Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
Point2f nextPt;
// NOTE(review): as listed, the three assignments to nextPt below run one
// after another; presumably `else` lines (and a flags test) separating them
// are missing from this listing — confirm.
if( level == maxLevel )
nextPt = nextPts[ptidx]*(float)(1./(1 << level));
nextPt = prevPt;
nextPt = nextPts[ptidx]*2.f;
nextPts[ptidx] = nextPt;

Point2i iprevPt, inextPt;
// Move to the window's top-left corner and split into integer + fraction.
prevPt -= halfWin;
iprevPt.x = cvFloor(prevPt.x);
iprevPt.y = cvFloor(prevPt.y);

// Window lies entirely outside the derivative image: at level 0 the point is
// flagged as not tracked and its error is zeroed.
// NOTE(review): the jump that presumably skips the rest of this point is not
// visible in the listing.
if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols ||
iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows )
if( level == 0 )
if( status )
status[ptidx] = false;
if( err )
err[ptidx] = 0;

// Fractional offsets and the four bilinear weights in fixed point, scaled by
// 1 << W_BITS; iw11 is computed as the remainder so the weights sum exactly
// to 1 << W_BITS.
volatile float a = prevPt.x - iprevPt.x;
volatile float b = prevPt.y - iprevPt.y;
volatile const int W_BITS = 14, W_BITS1 = 14;
volatile const float FLT_SCALE = 1.f/(1 << 20); //volatile is needed because compiler will optimize this out for NEON
volatile int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
volatile int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
volatile int iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
volatile int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;

// Row strides expressed in elements rather than bytes.
volatile int dstep = (int)(derivI.step/derivI.elemSize1());
volatile int stepI = (int)(I.step/I.elemSize1());
volatile int stepJ = (int)(J.step/J.elemSize1());
volatile float A11 = 0, A12 = 0, A22 = 0;

#ifdef CV_NEON

// Four-lane integer partial sums that the NEON asm below loads, updates and
// stores back on every iteration; they are folded into A11/A12/A22 afterwards.
volatile int CV_DECL_ALIGNED(16) nA11[] = {0, 0, 0, 0}, nA12[] = {0, 0, 0, 0}, nA22[] = {0, 0, 0, 0};
volatile const int shifter1 = -(W_BITS - 5); //negative so it shifts right
volatile const int shifter2 = -(W_BITS);

if(sizeof(deriv_type) == 2)

// Broadcasts the four weights into d26..d29 (16-bit lanes) and the two shift
// counts into q11/q12 (32-bit lanes) for the per-row asm further down.
// NOTE(review): as listed, the operands follow the first colon (the
// output-operand position in GCC extended asm; presumably a ":"-only line is
// missing), no modified registers are declared, and the later asm statement
// assumes d26..d29/q11/q12 still hold these values. GCC does not guarantee
// register contents between separate asm statements — confirm against the
// Extended Asm documentation.
__asm__ volatile ( "vdup.16 d26, %0\n\t"
"vdup.16 d27, %1\n\t"
"vdup.16 d28, %2\n\t"
"vdup.16 d29, %3\n\t"
"vdup.32 q11, %4\n\t"
"vdup.32 q12, %5\n\t"
: "r" ((short)iw00),
"r" ((short)iw01),
"r" ((short)iw10),
"r" ((short)iw11),
"r" (shifter1),
"r" (shifter2)
: );



// extract the patch from the first image, compute covariation matrix of derivatives
volatile int x, y;
for( y = 0; y < winSize.height; y++ )
// NOTE(review): in the four pointer computations below the base-pointer
// operand appears to be missing from this listing (the text reads as a cast
// applied to a unary plus); presumably the data pointers of I, derivI,
// IWinBuf and derivIWinBuf — confirm against the original file.
volatile const uchar* src = (const uchar*) + (y + iprevPt.y)*stepI + iprevPt.x*cn;
volatile const deriv_type* dsrc = (const deriv_type*) + (y + iprevPt.y)*dstep + iprevPt.x*cn2;

volatile deriv_type* Iptr = (deriv_type*)( + y*IWinBuf.step);
volatile deriv_type* dIptr = (deriv_type*)( + y*derivIWinBuf.step);

x = 0;

#ifdef CV_NEON

if(sizeof(deriv_type) == 2 && winSize.width*cn >= 12)

for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )

// NEON version of the scalar loop body below, four elements per iteration:
// interpolates the image value (stored through %8), interpolates the two
// derivative components (stored interleaved through %9), and adds their
// products to the partial sums behind %10..%12 (nA11, nA12, nA22).
// NOTE(review): this statement reads and writes memory through pointers
// passed as plain "r" operands, and its trailing section is empty — no
// "memory" clobber and no list of the q/d registers it overwrites. Without
// those the compiler may assume nA11/nA12/nA22 and the window buffers are
// unchanged and may keep its own values in the NEON registers used here.
// This matches the failure described in the question — confirm against the
// Extended Asm documentation.
__asm__ volatile (
"vld1.8 {d0}, [%0]\n\t" //ignores last 4 bytes
"vmovl.u8 q0, d0\n\t" //expand to 16-bit
"vld1.8 {d2}, [%1]\n\t"
"vmovl.u8 q1, d2\n\t"

"vmull.s16 q5, d0, d26\n\t"
"vmull.s16 q6, d2, d27\n\t"

"vld1.8 {d4}, [%2]\n\t"
"vmovl.u8 q2, d4\n\t" //expand
"vld1.8 {d6}, [%3]\n\t"
"vmovl.u8 q3, d6\n\t"

"vmull.s16 q7, d4, d28\n\t"
"vmull.s16 q8, d6, d29\n\t"

"vadd.i32 q5, q5, q6\n\t"
"vadd.i32 q7, q7, q8\n\t"
"vadd.i32 q5, q5, q7\n\t"

"vld2.16 {d0, d1}, [%4]\n\t" //evens in d0 and d2
"vld2.16 {d2, d3}, [%5]\n\t"

"vqrshl.s32 q5, q5, q11\n\t"

"vmull.s16 q4, d0, d26\n\t" //q4 is mult of even 1
"vmull.s16 q6, d1, d26\n\t" //q6 is mult of odd 1

"vmovn.s32 d0, q5\n\t"

"vmull.s16 q7, d2, d27\n\t" //q7 is mult of even 2
"vmull.s16 q8, d3, d27\n\t" //q8 is mult of odd 2

"vst1.16 {d0}, [%8]\n\t"

"vld2.16 {d4, d5}, [%6]\n\t" //evens in d4 and d6
"vld2.16 {d6, d7}, [%7]\n\t"

"vadd.i32 q4, q4, q7\n\t" //this frees up q7 and q8
"vadd.i32 q6, q6, q8\n\t" //q4 is added even 1 and 2
//q6 is added odd 1 and 2

"vmull.s16 q7, d4, d28\n\t" //q7 is mult of even 3
"vmull.s16 q0, d5, d28\n\t" //q0 is mult of odd 3
"vmull.s16 q8, d6, d29\n\t" //q8 is mult of even 4
"vmull.s16 q15, d7, d29\n\t" //q15 is mult of odd 4

"vadd.i32 q7, q7, q8\n\t" //q7 is added even 3 and 4
"vadd.i32 q0, q0, q15\n\t" //q0 is added odd 3 and 4

"vadd.i32 q4, q4, q7\n\t" //q4 is added even 1,2,3,4 -- will be ixval
"vadd.i32 q6, q6, q0\n\t" //q6 is added odd 1,2,3,4 -- will be iyval

"vld1.32 {q1}, [%11]\n\t"
"vld1.32 {q2}, [%12]\n\t"
"vld1.32 {q0}, [%10]\n\t" //get the loads prepared

"vqrshl.s32 q4, q4, q12\n\t" //q4 is descaled evens added
"vqrshl.s32 q6, q6, q12\n\t" //q6 is descaled odds added

//now ixval is stored in q4 and iyval is stored in q6 and ival is in q5

"vmul.s32 q7, q4, q4\n\t"
"vmul.s32 q8, q4, q6\n\t"
"vmul.s32 q15, q6, q6\n\t"

"vadd.i32 q0, q0, q7\n\t"
"vadd.i32 q1, q1, q8\n\t"
"vadd.i32 q2, q2, q15\n\t"

"vst1.32 {q0}, [%10]\n\t"
"vst1.32 {q1}, [%11]\n\t"
"vst1.32 {q2}, [%12]\n\t"

"vmovn.i32 d8, q4\n\t" //bring ixval to short
"vmovn.i32 d12, q6\n\t" //bring iyval to short
"vswp d9, d12\n\t" //now d8 is ixval and d9 is iyval
"vst2.16 {d8, d9}, [%9]\n\t"

: "r" (src + x), //0
"r" (src + x + cn), //1
"r" (src + x + stepI), //2
"r" (src + x + stepI + cn), //3
"r" (dsrc), //4
"r" (dsrc + cn2), //5
"r" (dsrc + dstep), //6
"r" (dsrc + dstep + cn2), //7
"r" (Iptr + x), //8
"r" (dIptr), //9
"r" (nA11), //10
"r" (nA12), //11
"r" (nA22) //12
: );




// Scalar path for the remaining elements: bilinear interpolation of the image
// value (descaled by W_BITS1-5) and of both derivative components (descaled
// by W_BITS1), stored into the window buffers, with the derivative products
// accumulated into A11/A12/A22.
for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 )
int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
src[x+stepI]*iw10 + src[x+stepI+cn]*iw11, W_BITS1-5);

int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
dsrc[dstep+cn2+1]*iw11, W_BITS1);

Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;

A11 += (float)(ixval*ixval);
A12 += (float)(ixval*iyval);
A22 += (float)(iyval*iyval);

#ifdef CV_NEON
// Fold the four-lane NEON partial sums into the scalar accumulators.
A11 += (float)(nA11[0] + nA11[1] + nA11[2] + nA11[3]);
A12 += (float)(nA12[0] + nA12[1] + nA12[2] + nA12[3]);
A22 += (float)(nA22[0] + nA22[1] + nA22[2] + nA22[3]);


// Determinant of the 2x2 matrix [A11 A12; A12 A22].
volatile float D = A11*A22 - A12*A12;
// NOTE(review): the expression below is cut off in this listing — the line
// ends with a dangling "+" and the continuation line is not present.
float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +

if( err && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) != 0 )
err[ptidx] = (float)minEig;

// Reject the point when minEig is below the threshold or D is below
// FLT_EPSILON.
// NOTE(review): the jump that presumably skips the rest of this point is not
// visible in the listing.
if( minEig < minEigThreshold || D < FLT_EPSILON )
if( level == 0 && status )
status[ptidx] = false;

// From here on D holds the reciprocal of the determinant.
D = 1.f/D;

nextPt -= halfWin;
Point2f prevDelta;

// Iterative refinement of nextPt, at most criteria.maxCount passes.
for( j = 0; j < criteria.maxCount; j++ )
inextPt.x = cvFloor(nextPt.x);
inextPt.y = cvFloor(nextPt.y);

// Window has left the second image: flag the point at level 0.
// NOTE(review): the loop exit that presumably follows is not visible.
if( inextPt.x < -winSize.width || inextPt.x >= J.cols ||
inextPt.y < -winSize.height || inextPt.y >= J.rows )
if( level == 0 && status )
status[ptidx] = false;

// Recompute the bilinear weights for the current position in J.
a = nextPt.x - inextPt.x;
b = nextPt.y - inextPt.y;
iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
float b1 = 0, b2 = 0;

for( y = 0; y < winSize.height; y++ )
// NOTE(review): base-pointer operands appear to be missing here as well
// (see the note in the patch-extraction loop).
const uchar* Jptr = (const uchar*) + (y + inextPt.y)*stepJ + inextPt.x*cn;
const deriv_type* Iptr = (const deriv_type*)( + y*IWinBuf.step);
const deriv_type* dIptr = (const deriv_type*)( + y*derivIWinBuf.step);

x = 0;

// diff = interpolated J value minus the stored I window value; b1/b2
// accumulate diff times the two stored derivative components.
for( ; x < winSize.width*cn; x++, dIptr += 2 )
int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
W_BITS1-5) - Iptr[x];
b1 += (float)(diff*dIptr[0]);
b2 += (float)(diff*dIptr[1]);

b1 *= FLT_SCALE;
b2 *= FLT_SCALE;

// Update step computed from A11/A12/A22, b1/b2 and the reciprocal
// determinant D.
Point2f delta( (float)((A12*b2 - A22*b1) * D),
(float)((A12*b1 - A11*b2) * D));
//delta = -delta;

nextPt += delta;
nextPts[ptidx] = nextPt + halfWin;

// NOTE(review): the statement controlled by this convergence test is not
// visible in the listing (presumably a break).
if( delta.ddot(delta) <= criteria.epsilon )

// When this step nearly cancels the previous one (each component of the sum
// is under 0.01 in magnitude), half of the step is taken back.
if( j > 0 && std::abs(delta.x + prevDelta.x) < 0.01 &&
std::abs(delta.y + prevDelta.y) < 0.01 )
nextPts[ptidx] -= delta*0.5f;
prevDelta = delta;

// Final error measure, computed only at level 0 when min-eigenvalue output
// was not requested.
// NOTE(review): status[ptidx] is read here without the null check on `status`
// that the other uses in this function perform — confirm that callers always
// pass a non-null status.
if( status[ptidx] && err && level == 0 && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) == 0 )
Point2f nextPoint = nextPts[ptidx] - halfWin;
Point inextPoint;

inextPoint.x = cvFloor(nextPoint.x);
inextPoint.y = cvFloor(nextPoint.y);

if( inextPoint.x < -winSize.width || inextPoint.x >= J.cols ||
inextPoint.y < -winSize.height || inextPoint.y >= J.rows )
if( status )
status[ptidx] = false;

float aa = nextPoint.x - inextPoint.x;
float bb = nextPoint.y - inextPoint.y;
iw00 = cvRound((1.f - aa)*(1.f - bb)*(1 << W_BITS));
iw01 = cvRound(aa*(1.f - bb)*(1 << W_BITS));
iw10 = cvRound((1.f - aa)*bb*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
float errval = 0.f;

for( y = 0; y < winSize.height; y++ )
const uchar* Jptr = (const uchar*) + (y + inextPoint.y)*stepJ + inextPoint.x*cn;
const deriv_type* Iptr = (const deriv_type*)( + y*IWinBuf.step);

// Sum of absolute differences between the interpolated J window and the
// stored I window.
for( x = 0; x < winSize.width*cn; x++ )
int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
W_BITS1-5) - Iptr[x];
errval += std::abs((float)diff);
// Normalise by 32 times the number of window elements.
err[ptidx] = errval * 1.f/(32*winSize.width*cn*winSize.height);


请仔细阅读 GCC 文档中关于扩展内联汇编(Extended Asm)的章节。

请告诉我们更多关于您使用的 GCC 版本、目标处理器和真正的 asm 指令....

您可能需要 asm volatile 或 asm goto。


{
    // Copy the arguments into volatile locals and pass those copies to the
    // asm statement as "r" (register) input operands.
    volatile auto varg1 = arg1;
    volatile auto varg2 = arg2;
    // GCC's grammar places the qualifier after the keyword: `asm volatile`,
    // not `volatile asm`. The template is empty; only the operands matter.
    asm volatile ("" : : "r" (varg1), "r" (varg2) :);
}

并且您应该使用较新版本的 GCC,例如 GCC 4.8(截至今天,即 2014 年 2 月),以及几周后即将发布的 GCC 4.9。

也许 是一个更好的询问地点。

关于c++ - 如何告诉编译器不要优化内联汇编中使用的变量?,我们在Stack Overflow上找到一个类似的问题:

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号