PForDelta索引壓縮算法的實現

前日一個朋友給我發來了一個索引壓縮算法,寫得非常漂亮而且簡潔,壓縮比和解壓性能方面大大超過目前已知的一些字節對齊的算法和Pfordelta這樣的非字節對齊的算法,讓人歎為觀止,這是我看到的最好的壓縮算法,他將會以論文的形式發表,相信必將震驚世界,我之前也寫了很多Pfordelta的博客,大家對這個算法的具體實現很好奇,有幾個難點,一個是bit pack和unpack,一個是關於exception的佔位符在做鏈接的時候如果間隔超過了2的b次方如何處理等等,在以前的博客中曾應用了一個日本開發者開發的算法,但目前已經不開源了,我昨天又讀了一遍pfordelta實現方面的論文【Super-Scalar RAM-CPU Cache compression】,今天一天寫了個大概,bit pack和unpack在1,2,4,8,16...這種2的冪是比較容易處理的,其餘的比較困難,我這裡只實現了pack3和unpack3,論文【Balancing Vectorized Query Execution with Bandwidth-optimized Storage】中給出了unpack12的參考代碼,理論上應該實現從pack3到pack16的全部方法,限於博客篇幅不全寫了,有興趣讀者朋友依葫蘆畫瓢可以繼續完成。

值得注意的是:(1)PFORDelta可以對0值進行壓縮,這是非常有利的。(2)pack和unpack的函數指針數組也比較有特色,PACKbitwidth;通過bitwidth值來取相應的pack和unpack函數,提高CPU流水線的通暢性。

算法本博客不做過多解釋,有興趣的朋友可以參見我此前的博客:

http://blog.csdn.net/pennyliang/archive/2010/09/25/5905691.aspx

另外,我這位朋友想讓我做一個最快的Pfordelta算法來PK一下,會輸多少,我知道肯定會輸,而且會很大,只是想盡可能做個最合格的對手,如果有讀者朋友有更好的實現,也請發給我參考,非常感謝。

#include<stdio.h>
#include<stdlib.h>
#include <stddef.h>
#include <stdint.h>
const int N = 64;
const int MAX = 64;
uint32_t MAXCODE[] = {0,
                      1,//1U<<0 - 1
                      3,//1u<<1 - 1
                      (1U << 3) - 1,
                      (1U << 4) - 1,
                      (1U << 5) - 1,
                      (1U << 6) - 1,
                      (1U << 7) - 1,
                      (1U << 8) - 1,
                      (1U << 9) - 1,
                      (1U << 10) - 1,
                      (1U << 11) - 1,
                      (1U << 12) - 1,
                      (1U << 13) - 1,
                      (1U << 14) - 1,
                      (1U << 15) - 1,
                      (1U << 16) - 1,
                      (1U << 17) - 1,
                      (1U << 18) - 1,
                      (1U << 19) - 1,
                      (1U << 20) - 1,
                      (1U << 21) - 1,
                      (1U << 22) - 1,
                      (1U << 23) - 1,
                      (1U << 24) - 1,
                      (1U << 25) - 1,
                      (1U << 26) - 1,
                      (1U << 27) - 1,
                      (1U << 28) - 1,
                      (1U << 29) - 1,
                      (1U << 30) - 1,
                      0x7FFFFFFF,
                      0xFFFFFFFF,
                     };
typedef void(*pattern_fun)(uint32_t* code, uint32_t* data, size_t n);



void PACK3(uint32_t* code, uint32_t* data, size_t n)
{
    for (int i = 0, j = 0; j < n; i += 3, j += 32) {
        code[i] = 0;
        code[i + 1] = 0;
        code[i + 2] = 0;
        code[i] |= data[j + 0] << 0;
        code[i] |= data[j + 1] << 3;
        code[i] |= data[j + 2] << 6;
        code[i] |= data[j + 3] << 9;
        code[i] |= data[j + 4] << 12;
        code[i] |= data[j + 5] << 15;
        code[i] |= data[j + 6] << 18;
        code[i] |= data[j + 7] << 21;
        code[i] |= data[j + 8] << 24;
        code[i] |= data[j + 9] << 27;
        code[i] |= (data[j + 10] & 0x00000003) << 30;
        code[i + 1] |= (data[j + 10] & 0x00000004) >> 2;
        code[i + 1] |= data[j + 11] << 1;
        code[i + 1] |= data[j + 12] << 4;
        code[i + 1] |= data[j + 13] << 7;
        code[i + 1] |= data[j + 14] << 10;
        code[i + 1] |= data[j + 15] << 13;
        code[i + 1] |= data[j + 16] << 16;
        code[i + 1] |= data[j + 17] << 19;
        code[i + 1] |= data[j + 18] << 22;
        code[i + 1] |= data[j + 19] << 25;
        code[i + 1] |= data[j + 20] << 28;
        code[i + 1] |= (data[j + 21] & 0x00000001) << 31;
        code[i + 2] |= (data[j + 21] & 0x00000006) >> 1;
        code[i + 2] |= data[j + 22] << 2;
        code[i + 2] |= data[j + 23] << 5;
        code[i + 2] |= data[j + 24] << 8;
        code[i + 2] |= data[j + 25] << 11;
        code[i + 2] |= data[j + 26] << 14;
        code[i + 2] |= data[j + 27] << 17;
        code[i + 2] |= data[j + 28] << 20;
        code[i + 2] |= data[j + 29] << 23;
        code[i + 2] |= data[j + 30] << 26;
        code[i + 2] |= data[j + 31] << 29;
    }
}



void UNPACK3(uint32_t* data, uint32_t* code, size_t n)
{
    for (int i = 0, j = 0; i < n; i += 32, j += 3) {
        data[i + 0] = ((code[j + 0] & 0x00000007) >> 0); // 000......111
        data[i + 1] = ((code[j + 0] & 0x00000038) >> 3); // 000...111000
        data[i + 2] = ((code[j + 0] & 0x000001C0) >> 6);
        data[i + 3] = ((code[j + 0] & 0x00000E00) >> 9);
        data[i + 4] = ((code[j + 0] & 0x00007000) >> 12); //
        data[i + 5] = ((code[j + 0] & 0x00038000) >> 15); //
        data[i + 6] = ((code[j + 0] & 0x001C0000) >> 18);
        data[i + 7] = ((code[j + 0] & 0x00E00000) >> 21);
        data[i + 8] = ((code[j + 0] & 0x07000000) >> 24); //
        data[i + 9] = ((code[j + 0] & 0x38000000) >> 27); //

        data[i + 10] = ((code[j + 0] & 0xC0000000) >> 30) | (code[j + 1] & 0x00000001)
                       << 2;

        data[i + 11] = ((code[j + 1] & 0x0000000E) >> 1);
        data[i + 12] = ((code[j + 1] & 0x00000070) >> 4);
        data[i + 13] = ((code[j + 1] & 0x00000380) >> 7);
        data[i + 14] = ((code[j + 1] & 0x00001C00) >> 10);
        data[i + 15] = ((code[j + 1] & 0x0000E000) >> 13);
        data[i + 16] = ((code[j + 1] & 0x00070000) >> 16);
        data[i + 17] = ((code[j + 1] & 0x00380000) >> 19);
        data[i + 18] = ((code[j + 1] & 0x01C00000) >> 22);
        data[i + 19] = ((code[j + 1] & 0x0E000000) >> 25);
        data[i + 20] = ((code[j + 1] & 0x70000000) >> 28);

        data[i + 21] = ((code[j + 1] & 0x80000000) >> 31) | (code[j + 2] & 0x00000003)
                       << 1;

        data[i + 22] = ((code[j + 2] & 0x0000001C) >> 2);
        data[i + 23] = ((code[j + 2] & 0x000000E0) >> 5);
        data[i + 24] = ((code[j + 2] & 0x00000700) >> 8);
        data[i + 25] = ((code[j + 2] & 0x00003800) >> 11);
        data[i + 26] = ((code[j + 2] & 0x0001C000) >> 14);
        data[i + 27] = ((code[j + 2] & 0x000E0000) >> 17);
        data[i + 28] = ((code[j + 2] & 0x00700000) >> 20);
        data[i + 29] = ((code[j + 2] & 0x03800000) >> 23);
        data[i + 30] = ((code[j + 2] & 0x1C000000) >> 26);
        data[i + 31] = ((code[j + 2] & 0xE0000000) >> 29);
    }
}
pattern_fun PACK[] = {NULL, NULL, NULL, PACK3};
pattern_fun UNPACK[] = {NULL, NULL, NULL, UNPACK3};



int compress(size_t n, int bitwidth, uint32_t* input, uint32_t* code,
             uint32_t* exception, uint32_t* f_e)
{
    uint32_t miss[N], data[N], prev = 0;
    int j = 0;
    int last_e = -1;

    for (int i = 0; i < MAX; i++) {
        int val = input[i];
        data[i] = val;
        miss[j] = i;

        if ((val > MAXCODE[bitwidth])) {
            j++;
            last_e = i;
        } else if (last_e >= 0 && ((i - last_e)) > MAXCODE[bitwidth]) {
            j++;
            last_e = i;
        } else {}

    }

    if (j > 0) {
        *f_e = miss[0];
        prev = miss[0];
        exception[0] = input[prev];

        for (int i = 1; i < j; i++) {
            int cur = miss[i];
            exception[i] = input[cur];
            data[prev] = (cur - prev) - 1;
            prev = cur;
        }
    }

    PACK[bitwidth](code, data, MAX);
    return j;
};

int decompress(size_t exception_cnt, int bitwidth, uint32_t* output,
               uint32_t* input, uint32_t* exception, uint32_t* f_e)
{
    uint32_t next, code[N], cur = *f_e;
    UNPACK[bitwidth](code, input, MAX);

    for (int i = 0; i < MAX; ++i) {
        output[i] = code[i];
    }

    if (cur != MAX && exception_cnt > 0)

    {
        cur = *f_e;
        next = output[cur];
        output[cur] = exception[0];
        cur += next + 1;

        for (int i = 1, j = 1; cur < MAX && j < exception_cnt; i++, j++) {
            next = cur + code[cur] + 1 ;
            output[cur] = exception[i];
            cur = next;
        }
    }

    return MAX;
}

int main(void)
{
    uint32_t input[] = {10, 13, 14, 16, 20, 22, 25, 30, 37, 40, 44, 47, 48, 50, 54, 56, 58, 63, 70, 73, 74, 77, 78, 80, 84, 86, 89, 94, 101, 104, 106, 109, 110, 112, 115, 117, 120, 121, 123, 133, 141, 151, 152, 157, 158, 166, 168, 178, 186, 195, 196, 202, 203, 209, 299, 301, 304, 329, 336, 339, 352, 354, 357, 359};
    uint32_t* code = (uint32_t*)malloc(1024 * sizeof(uint32_t));
    uint32_t* output = (uint32_t*)malloc(1024 * sizeof(uint32_t));
    uint32_t* exception = code + 6;
    uint32_t  first_exception = MAX;
    uint32_t compressed_temp_arr[MAX] = {0};
    compressed_temp_arr[0] = input[0];

    for (int i = 1; i < MAX; ++i) {
        compressed_temp_arr[i] = input[i] - input[i - 1] - 1;
    }

    int j = compress(MAX, 3, compressed_temp_arr, code, exception,
                     &first_exception);

    decompress(j, 3, output, code, exception, &first_exception);

    for (int i = 1; i < MAX; ++i) {
        output[i] = output[i] + output[i - 1] + 1;
    }

    for (int i = 0; i < MAX; ++i) {
        printf("%d,", output[i]);
    }

    printf("/n");

    for (int i = 0; i < MAX; ++i) {
        printf("%d,", input[i]);
    }

    printf("/n");
    int ori_size = sizeof(input) / sizeof(uint32_t) * 32;
    int com_size = j + MAX * 3;
    float ratio = ori_size / com_size;
    printf("original size:%d bit,compressed size:%d bit,compressed ratio:%f/n,bit per docid:%f",
           ori_size, com_size, ratio, com_size * 1.0f / 64);
    return 0;
}

书籍推荐