"寫雖然是16bytes/cycle但那是througput 也就是說我們需要儘可能使每個cycle有一個寫操作進入write buffer after allocation stage從而保證 16bytes/cycle. 上面的代碼palignr 在前端阻擋(從讀取到譯碼階段)了寫操作儘快的進入decode以及allocatoin stage 所以每個循環會比下面的代碼多出 2~3個cycle",該帖子全部內容參見最後的參考閱讀。
我將Maling的代碼做了一個實現,但由於我試驗機不支持SSE3指令,在運行palignr指令會出非法操作,dmesg給出:trap invalid opcode rip:4004cc rsp:7fff74e5b728 error:0,這樣的錯誤,因此改成了三段指令拼的方法。
理解這段代碼的功能並不複雜,但為什麼快的原因,就是Maling說的那幾條,我們可以看出在亂序後性能的提升,這是一個很好的例子,我把它引在了我的系列中,代碼和原理來自Maling,我只是包裝了下實現,便於大家實際運行,並體驗。
#include<stdio.h>
#include<stdlib.h>
#include <stddef.h>
#include <stdint.h>
asm(" .text ");
asm(" .type shl_7, @function ");
asm("shl_7: push %rbp ");
asm(" mov %rsp,%rbp ");
asm("loop: sub $0x80, %rdx ");
asm(" movaps -0x07(%rsi), %xmm1 ");
asm(" movaps 0x09(%rsi), %xmm2 ");
asm(" movaps 0x19(%rsi), %xmm3 ");
asm(" movaps 0x29(%rsi), %xmm4 ");
asm(" movaps 0x39(%rsi), %xmm5 ");
asm(" movaps 0x49(%rsi), %xmm6 ");
asm(" movaps 0x59(%rsi), %xmm7 ");
asm(" movaps 0x69(%rsi), %xmm8 ");
asm(" movaps 0x79(%rsi), %xmm9 ");
asm(" lea 0x80(%rsi), %rsi ");
asm(" psrldq $7, %xmm8 ");
asm(" pslldq $9, %xmm9 ");
asm(" por %xmm8, %xmm9 ");
asm(" psrldq $7, %xmm7 ");
asm(" pslldq $9, %xmm8 ");
asm(" por %xmm7, %xmm8 ");
asm(" psrldq $7, %xmm6 ");
asm(" pslldq $9, %xmm7 ");
asm(" por %xmm6, %xmm7 ");
asm(" psrldq $7, %xmm5 ");
asm(" pslldq $9, %xmm6 ");
asm(" por %xmm5, %xmm6 ");
asm(" psrldq $7, %xmm4 ");
asm(" pslldq $9, %xmm5 ");
asm(" por %xmm4, %xmm5 ");
asm(" psrldq $7, %xmm3 ");
asm(" pslldq $9, %xmm4 ");
asm(" por %xmm3, %xmm4 ");
asm(" psrldq $7, %xmm2 ");
asm(" pslldq $9, %xmm3 ");
asm(" por %xmm2, %xmm3 ");
asm(" psrldq $7, %xmm1 ");
asm(" pslldq $9, %xmm2 ");
asm(" por %xmm1, %xmm2 ");
asm(" movaps %xmm9, 0x70(%rdi) ");
asm(" movaps %xmm8, 0x60(%rdi) ");
asm(" movaps %xmm7, 0x50(%rdi) ");
asm(" movaps %xmm6, 0x40(%rdi) ");
asm(" movaps %xmm5, 0x30(%rdi) ");
asm(" movaps %xmm4, 0x20(%rdi) ");
asm(" movaps %xmm3, 0x10(%rdi) ");
asm(" movaps %xmm2, 0x0(%rdi) ");
asm(" leaveq ");
asm(" retq ");
asm(" .text ");
asm(" .type shl_7_f, @function ");
asm("shl_7_f: push %rbp ");
asm(" mov %rsp,%rbp ");
asm("loop_f: sub $0x80, %rdx ");
asm(" movaps -0x07(%rsi), %xmm1 ");
asm(" movaps 0x09(%rsi), %xmm2 ");
asm(" movaps 0x19(%rsi), %xmm3 ");
asm(" movaps 0x29(%rsi), %xmm4 ");
asm(" movaps 0x39(%rsi), %xmm5 ");
asm(" movaps 0x49(%rsi), %xmm6 ");
asm(" movaps 0x59(%rsi), %xmm7 ");
asm(" movaps 0x69(%rsi), %xmm8 ");
asm(" movaps 0x79(%rsi), %xmm9 ");
asm(" lea 0x80(%rsi), %rsi ");
asm(" psrldq $7, %xmm8 ");
asm(" pslldq $9, %xmm9 ");
asm(" por %xmm8, %xmm9 ");
asm(" movaps %xmm9, 0x70(%rdi) ");
asm(" psrldq $7, %xmm7 ");
asm(" pslldq $9, %xmm8 ");
asm(" por %xmm7, %xmm8 ");
asm(" movaps %xmm8, 0x60(%rdi) ");
asm(" psrldq $7, %xmm6 ");
asm(" pslldq $9, %xmm7 ");
asm(" por %xmm6, %xmm7 ");
asm(" movaps %xmm7, 0x50(%rdi) ");
asm(" psrldq $7, %xmm5 ");
asm(" pslldq $9, %xmm6 ");
asm(" por %xmm5, %xmm6 ");
asm(" movaps %xmm6, 0x40(%rdi) ");
asm(" psrldq $7, %xmm4 ");
asm(" pslldq $9, %xmm5 ");
asm(" por %xmm4, %xmm5 ");
asm(" movaps %xmm5, 0x30(%rdi) ");
asm(" psrldq $7, %xmm3 ");
asm(" pslldq $9, %xmm4 ");
asm(" por %xmm3, %xmm4 ");
asm(" movaps %xmm4, 0x20(%rdi) ");
asm(" psrldq $7, %xmm2 ");
asm(" pslldq $9, %xmm3 ");
asm(" por %xmm2, %xmm3 ");
asm(" movaps %xmm3, 0x10(%rdi) ");
asm(" psrldq $7, %xmm1 ");
asm(" pslldq $9, %xmm2 ");
asm(" por %xmm1, %xmm2 ");
asm(" movaps %xmm2, 0x0(%rdi) ");
asm(" leaveq ");
asm(" retq ");
int main(void)
{
uint8_t* src = (uint8_t*)malloc(sizeof(uint8_t) * (128 + 7));
uint8_t* des = (uint8_t*)malloc(sizeof(uint8_t) * 128);
int i = 0;
for (; i < 128; ++i) {
src[i] = 0xFF;
}
src[0] = 0x0;
src[1] = 0x1;
src[2] = 0x2;
src[7] = 0x7;
src[8] = 0x8;
src += 7;
/*shl_7(des,src);
i = 0;
for(;i<128;++i)
{
printf("%d/n",des[i]);
}
i = 0;*/
for (; i < 100000000; ++i) {
#ifdef SLOW
shl_7(des, src);
#endif
#ifdef FAST
shl_7_f(des, src);
#endif
}
return 0;
}