memcpy优化失败

Posted by shensunbo on October 31, 2024

尝试了多种方式（逐字节拷贝、SSE 指令、多线程分块），仍然无法明显提升 memcpy 的拷贝速度；标准库的 memcpy 应该已经是高度优化过的实现了。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#include <cstring>
#include <iostream>
#include <memory>
#include <chrono>
#include <fstream>
#include <cstdlib>
#include <emmintrin.h>
#include <vector>
#include <thread>

// Alternative copy routines benchmarked against the standard memcpy in main().

// Copies src to dest in block_size-sized chunks, each chunk on its own std::thread.
void threaded_memcpy(char* dest, const char* src, size_t size, size_t block_size);
// Plain byte-by-byte copy loop; returns dest.
void* simple_memcpy(void* dest, const void* src, size_t n);
// Copy built on 128-bit (16-byte) SSE2 load/store intrinsics.
void sse_memcpy(char* dest, const char* src, size_t n);

/**
 * Benchmark driver: loads one 1920x1080 RGBA frame from RGBA_PATH, times a
 * single copy of it into a second buffer, prints the elapsed time, and writes
 * the copied frame to "output_1920x1080.rgba" so the result can be inspected.
 *
 * @return 0 on success, 1 if allocation, reading, or writing fails.
 */
int main(){
    const int width = 1920;
    const int height = 1080;
    const int comp = 4; // RGBA
    const int size = width * height * comp;

    // Owning handles so that every early return below releases the buffers.
    struct FreeDeleter {
        void operator()(char* p) const noexcept { std::free(p); }
    };
    using Buffer = std::unique_ptr<char, FreeDeleter>;

    // 64-byte alignment; size (8294400) is a multiple of 64 as aligned_alloc expects.
    Buffer data_owner(static_cast<char*>(aligned_alloc(64, size)));
    Buffer outdata_owner(static_cast<char*>(aligned_alloc(64, size)));
    if (!data_owner || !outdata_owner) {
        std::cerr << "Unable to allocate buffers" << std::endl;
        return 1;
    }
    // Raw aliases keep the benchmark lines below interchangeable.
    char *data = data_owner.get();
    char *outdata = outdata_owner.get();

    std::ifstream file(RGBA_PATH, std::ios::binary); // open the file in binary mode
    if (!file.is_open()) {
        std::cerr << "Unable to open file" << std::endl;
        return 1;
    }
    file.read(data, size); // read one full frame into the source buffer
    if (file.gcount() != size) {
        // A short read would leave part of the source buffer uninitialised.
        std::cerr << "Unable to read " << size << " bytes from input file" << std::endl;
        return 1;
    }
    file.close();

    // NOTE(review): outdata has not been touched before the timed region, so the
    // measurement presumably includes first-touch page faults on the destination;
    // confirm by pre-faulting the buffer (e.g. memset) and re-measuring.
    auto start = std::chrono::high_resolution_clock::now();
    // data size 8100KB
    memcpy(outdata, data, size); // 2.82638 ms
    // std::copy(data, data+size, outdata);// same with memcpy
    // simple_memcpy(outdata, data, size); // 14ms
    // sse_memcpy(outdata, data, size);// a bit slow than memcpy
    // threaded_memcpy(outdata, data, size, size / 4); // same with memcpy

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> elapsed = end - start;
    std::cout <<"line: "<<__LINE__ << ", Function execution time: " << elapsed.count() << " ms" << std::endl;

    std::ofstream outFile("output_1920x1080.rgba", std::ios::binary);
    if (!outFile) {
        std::cerr << "无法打开文件进行写入!" << std::endl;
        return 1;
    }

    outFile.write(outdata, size);

    if (!outFile) {
        std::cerr << "写入数据时发生错误!" << std::endl;
        return 1;
    }

    outFile.close();

    // Buffers are released by data_owner / outdata_owner on scope exit.
    return 0;
}

/**
 * Naive reference copy: moves n bytes from src to dest one byte at a time.
 *
 * @param dest destination buffer, at least n bytes
 * @param src  source buffer, at least n bytes
 * @param n    number of bytes to copy
 * @return dest
 */
void* simple_memcpy(void* dest, const void* src, size_t n) {
    auto* out = static_cast<char*>(dest);
    const auto* in = static_cast<const char*>(src);
    // Forward, ascending-address copy.
    for (size_t pos = 0; pos < n; ++pos) {
        out[pos] = in[pos];
    }
    return dest;
}

/**
 * Copies n bytes from src to dest using 128-bit SSE2 loads and stores.
 *
 * The bulk of the buffer is moved 16 bytes at a time; whatever is left
 * (n % 16 bytes) is copied byte by byte. Unaligned load/store intrinsics are
 * used so the function is valid for any pointer alignment.
 *
 * @param dest destination buffer, at least n bytes
 * @param src  source buffer, at least n bytes
 * @param n    number of bytes to copy
 */
void sse_memcpy(char* dest, const char* src, size_t n) {
    // Number of bytes covered by whole 16-byte vectors.
    const size_t vector_bytes = n - (n % 16);
    size_t offset = 0;

    // Copy 16 bytes (128 bits) per iteration.
    for (; offset < vector_bytes; offset += 16) {
        const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + offset));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + offset), chunk);
    }

    // Copy every remaining tail byte, not just the first one.
    for (; offset < n; ++offset) {
        dest[offset] = src[offset];
    }
}

/**
 * Copies size bytes from src to dest, splitting the work into block_size-sized
 * chunks that are each copied by their own std::thread.
 *
 * Any trailing bytes that do not fill a whole block (size % block_size) are
 * copied on the calling thread, so the whole range is always copied. A
 * block_size of 0 falls back to a single memcpy on the calling thread.
 *
 * One thread is started per full block, so callers should pick a block_size
 * that keeps size / block_size small.
 *
 * @param dest       destination buffer, at least size bytes
 * @param src        source buffer, at least size bytes
 * @param size       total number of bytes to copy
 * @param block_size number of bytes handled by each worker thread
 */
void threaded_memcpy(char* dest, const char* src, size_t size, size_t block_size) {
    if (size == 0) {
        return;
    }
    if (block_size == 0) {
        // Avoid dividing by zero; there is no meaningful way to split the work.
        std::memcpy(dest, src, size);
        return;
    }

    const size_t full_blocks = size / block_size;
    const size_t tail_bytes = size % block_size;

    std::vector<std::thread> threads;
    threads.reserve(full_blocks);

    for (size_t i = 0; i < full_blocks; ++i) {
        const size_t offset = i * block_size;
        threads.emplace_back([dest, src, offset, block_size]() {
            std::memcpy(dest + offset, src + offset, block_size);
        });
    }

    // Copy the remainder here while the workers handle the full blocks.
    if (tail_bytes != 0) {
        const size_t offset = full_blocks * block_size;
        std::memcpy(dest + offset, src + offset, tail_bytes);
    }

    // Wait for all worker threads to finish.
    for (auto& t : threads) {
        t.join();
    }
}