今天發現一個奇怪的現象,用Linux內核的bitops.h裡面的set_bit和test_bit居然沒有不用匯編的代碼快。記錄下來,備忘。
下面是可執行程序,THUIRDB暫時沒有用bitops裡面的實現,改用我自己寫的實現了。當然bitops裡的實現,nr可以很大,而我的代碼nr只能是0-63之間的一個數,對THUIRDB的應用來說,足夠了。
#include "stdint.h"
#include <iostream>
using namespace std;
/*
 * Memory operand used by the bit instructions: the long located at x.
 * The constraint is "+m" (read and written) because bts reads the word
 * before modifying it; a write-only "=m" tells the compiler that the
 * previous contents of the word are irrelevant.
 */
#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
#define ADDR BITOP_ADDR(addr)
/*
 * LOCK_PREFIX is pasted in front of an instruction mnemonic inside an asm
 * template. With CONFIG_SMP it emits a ".smp_locks" section entry pointing
 * at label 661 and then the "lock;" prefix; otherwise it is empty.
 * _ASM_ALIGN and _ASM_PTR are not defined in this file and must be supplied
 * by the build when CONFIG_SMP is set.
 */
#ifdef CONFIG_SMP
#define LOCK_PREFIX \
".section .smp_locks,\"a\"\n" \
_ASM_ALIGN "\n" \
_ASM_PTR "661f\n" \
".previous\n" \
"661:\n\tlock; "
#else
#define LOCK_PREFIX ""
#endif
/*
 * Set bit nr in the memory at addr using the x86 bts instruction.
 * LOCK_PREFIX is prepended to the instruction; it expands to an empty
 * string unless CONFIG_SMP is defined.
 */
static inline void
set_bit(unsigned int nr, volatile unsigned long* addr)
{
/* %0 is the memory operand built by BITOP_ADDR, %1 is the bit number. */
__asm__ volatile(LOCK_PREFIX "bts %1,%0"
: BITOP_ADDR(addr) : "Ir"(nr) : "memory");
}
/*
 * Report whether bit nr is set in the memory at addr.
 *
 * bt copies the selected bit into the carry flag; "sbb reg,reg" then
 * produces 0 when carry is clear and -1 when it is set, so any non-zero
 * result means the bit was set.
 *
 * Returns true if the bit is set, false otherwise.
 */
static inline bool test_bit(int nr, volatile const unsigned long* addr)
{
    int oldbit;

    /*
     * The two instructions must be separated by "\n\t"; with forward
     * slashes they fuse into a single line the assembler cannot parse.
     * The memory operand is passed as *addr so the const/volatile
     * qualifiers are not cast away.
     */
    __asm__ volatile("bt %2,%1\n\t"
                     "sbb %0,%0"
                     : "=r"(oldbit)
                     : "m"(*addr), "Ir"(nr));
    return oldbit != 0;
}
/*
 * Set bit nr of the single 64-bit word at addr, without inline assembly.
 *
 * nr must be in the range 0..63; only addr[0] is touched.
 */
static void _set_bit(int nr, uint64_t* addr)
{
    /*
     * Build the mask from a 64-bit one: "1UL" is only 32 bits wide on
     * platforms with a 32-bit unsigned long, where shifting by 32..63
     * is undefined.
     */
    addr[0] |= ((uint64_t)1 << nr);
}
/*
 * Report whether bit nr of the single 64-bit word at addr is set, without
 * inline assembly.
 *
 * nr must be in the range 0..63; only addr[0] is read.
 * Returns true if the bit is set, false otherwise.
 */
static bool _test_bit(int nr, uint64_t* addr)
{
    /*
     * The mask is built from a 64-bit one so that bits 32..63 are
     * reachable even where unsigned long is 32 bits wide.
     */
    return (addr[0] & ((uint64_t)1 << nr)) != 0;
}
/*
 * Benchmark driver. Define exactly one of T1..T4 at compile time to pick
 * the routine that is called one million times on a single 64-bit word:
 *   T1: _set_bit   T2: set_bit   T3: _test_bit   T4: test_bit
 *
 * The bit index is the loop counter masked to 0..63, which keeps it inside
 * the one-word range _set_bit/_test_bit support. This matches the
 * "and $0x3f" that precedes bts in the -O2 listing of the T2 build.
 *
 * NOTE(review): nothing consumes bit_arr or the test results here, so an
 * optimizing compiler may drop the non-asm loops entirely.
 */
int main(void)
{
    uint64_t bit_arr = 0;
#ifdef T1
    for (int i = 0; i < 1000000; ++i) {
        const int id = i & 0x3f;
        _set_bit(id, &bit_arr);
    }
#endif
#ifdef T2
    for (int i = 0; i < 1000000; ++i) {
        const int id = i & 0x3f;
        set_bit(id, &bit_arr);
    }
#endif
#ifdef T3
    for (int i = 0; i < 1000000; ++i) {
        const int id = i & 0x3f;
        _test_bit(id, &bit_arr);
    }
#endif
#ifdef T4
    for (int i = 0; i < 1000000; ++i) {
        const int id = i & 0x3f;
        test_bit(id, &bit_arr);
    }
#endif
    return 0;
}
以下是反彙編的結果,可以看出用內核的方法,代碼短小,但效率較低,用移位與或的方法代碼多,但效率高。
000000000040067c <_ZL7set_bitjPVm>: //set_bit
40067c: 55 push %rbp
40067d: 48 89 e5 mov %rsp,%rbp
400680: 89 7d fc mov %edi,-0x4(%rbp)
400683: 48 89 75 f0 mov %rsi,-0x10(%rbp)
400687: 48 8b 45 f0 mov -0x10(%rbp),%rax
40068b: 8b 55 fc mov -0x4(%rbp),%edx
40068e: 0f ab 10 bts %edx,(%rax)
400691: c9 leaveq
400692: c3 retq
0000000000400693 <_ZL8test_bitiPVKm>://test_bit
400693: 55 push %rbp
400694: 48 89 e5 mov %rsp,%rbp
400697: 89 7d ec mov %edi,-0x14(%rbp)
40069a: 48 89 75 e0 mov %rsi,-0x20(%rbp)
40069e: 48 8b 45 e0 mov -0x20(%rbp),%rax
4006a2: 8b 55 ec mov -0x14(%rbp),%edx
4006a5: 0f a3 10 bt %edx,(%rax)
4006a8: 19 c0 sbb %eax,%eax
4006aa: 89 45 fc mov %eax,-0x4(%rbp)
4006ad: 83 7d fc 00 cmpl $0x0,-0x4(%rbp)
4006b1: 0f 95 c0 setne %al
4006b4: c9 leaveq
4006b5: c3 retq
00000000004006b6 <_ZL8_set_bitiPm>://_set_bit
4006b6: 55 push %rbp
4006b7: 48 89 e5 mov %rsp,%rbp
4006ba: 53 push %rbx
4006bb: 89 7d f4 mov %edi,-0xc(%rbp)
4006be: 48 89 75 e8 mov %rsi,-0x18(%rbp)
4006c2: 48 8b 45 e8 mov -0x18(%rbp),%rax
4006c6: 48 8b 10 mov (%rax),%rdx
4006c9: 8b 45 f4 mov -0xc(%rbp),%eax
4006cc: bb 01 00 00 00 mov $0x1,%ebx
4006d1: 48 89 de mov %rbx,%rsi
4006d4: 89 c1 mov %eax,%ecx
4006d6: 48 d3 e6 shl %cl,%rsi
4006d9: 48 89 f0 mov %rsi,%rax
4006dc: 48 09 c2 or %rax,%rdx
4006df: 48 8b 45 e8 mov -0x18(%rbp),%rax
4006e3: 48 89 10 mov %rdx,(%rax)
4006e6: 5b pop %rbx
4006e7: c9 leaveq
4006e8: c3 retq
00000000004006e9 <_ZL9_test_bitiPm>://_test_bit
4006e9: 55 push %rbp
4006ea: 48 89 e5 mov %rsp,%rbp
4006ed: 53 push %rbx
4006ee: 89 7d f4 mov %edi,-0xc(%rbp)
4006f1: 48 89 75 e8 mov %rsi,-0x18(%rbp)
4006f5: 48 8b 45 e8 mov -0x18(%rbp),%rax
4006f9: 48 8b 10 mov (%rax),%rdx
4006fc: 8b 45 f4 mov -0xc(%rbp),%eax
4006ff: 48 89 d3 mov %rdx,%rbx
400702: 89 c1 mov %eax,%ecx
400704: 48 d3 eb shr %cl,%rbx
400707: 48 89 d8 mov %rbx,%rax
40070a: 83 e0 01 and $0x1,%eax
40070d: 5b pop %rbx
40070e: c9 leaveq
40070f: c3 retq
T2用-O2優化後的代碼,set_bit被內聯到main裡面,循環體從400910到400920(之後從400922開始是輸出bit_arr的代碼):
00000000004008f0 <main>:
4008f0: 55 push %rbp
4008f1: 31 c0 xor %eax,%eax
4008f3: 53 push %rbx
4008f4: 48 83 ec 18 sub $0x18,%rsp
4008f8: 48 8d 4c 24 08 lea 0x8(%rsp),%rcx
4008fd: 48 c7 44 24 08 00 00 movq $0x0,0x8(%rsp)
400904: 00 00
400906: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
40090d: 00 00 00
400910: 89 c2 mov %eax,%edx
400912: 83 e2 3f and $0x3f,%edx
400915: 0f ab 11 bts %edx,(%rcx)
400918: 83 c0 01 add $0x1,%eax
40091b: 3d 40 42 0f 00 cmp $0xf4240,%eax
400920: 75 ee jne 400910 <main+0x20>
400922: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
400927: bf e0 0d 60 00 mov $0x600de0,%edi
40092c: e8 67 fe ff ff callq 400798 <_ZNSo9_M_insertImEERSoT_@plt>
400931: 48 89 c3 mov %rax,%rbx
400934: 48 8b 00 mov (%rax),%rax
400937: 48 8b 40 e8 mov -0x18(%rax),%rax
40093b: 48 8b ac 03 f0 00 00 mov 0xf0(%rbx,%rax,1),%rbp
400942: 00
400943: 48 85 ed test %rbp,%rbp
400946: 74 3f je 400987 <main+0x97>
400948: 80 7d 38 00 cmpb $0x0,0x38(%rbp)
40094c: 74 20 je 40096e <main+0x7e>
40094e: 0f b6 45 43 movzbl 0x43(%rbp),%eax
400952: 48 89 df mov %rbx,%rdi
400955: 0f be f0 movsbl %al,%esi
400958: e8 5b fe ff ff callq 4007b8 <_ZNSo3putEc@plt>
40095d: 48 89 c7 mov %rax,%rdi
400960: e8 23 fe ff ff callq 400788 <_ZNSo5flushEv@plt>
400965: 48 83 c4 18 add $0x18,%rsp
400969: 31 c0 xor %eax,%eax
40096b: 5b pop %rbx
40096c: 5d pop %rbp
40096d: c3 retq
40096e: 48 89 ef mov %rbp,%rdi
400971: e8 c2 fd ff ff callq 400738 <_ZNKSt5ctypeIcE13_M_widen_initEv@plt>
400976: 48 8b 45 00 mov 0x0(%rbp),%rax
40097a: be 0a 00 00 00 mov $0xa,%esi
40097f: 48 89 ef mov %rbp,%rdi
400982: ff 50 30 callq *0x30(%rax)
400985: eb cb jmp 400952 <main+0x62>
400987: e8 9c fd ff ff callq 400728 <_ZSt16__throw_bad_castv@plt>
40098c: 90 nop
40098d: 90 nop
40098e: 90 nop
40098f: 90 nop