尝试了多种方式，仍然无法明显提升 memcpy 的拷贝速度——标准库的 memcpy 应该已经是高度优化过的实现了。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#include <cstring>
#include <iostream>
#include <memory>
#include <chrono>
#include <fstream>
#include <cstdlib>
#include <emmintrin.h>
#include <vector>
#include <thread>
// Copy variants that can be swapped into the timed section of main().

// Copies src to dest in block_size-byte chunks, running one std::thread per chunk.
void threaded_memcpy(char* dest, const char* src, size_t size, size_t block_size);
// Plain byte-at-a-time copy loop; returns dest.
void* simple_memcpy(void* dest, const void* src, size_t n);
// Copies 16 bytes (128 bits) per iteration using SSE2 load/store intrinsics.
void sse_memcpy(char* dest, const char* src, size_t n);
/// Benchmark driver.
///
/// Reads one 1920x1080 RGBA frame from RGBA_PATH, times a single copy of it
/// into a second buffer, prints the elapsed time in milliseconds and writes
/// the copy to "output_1920x1080.rgba".
///
/// @return 0 on success, 1 if allocation, reading or writing fails.
int main(){
    const int width = 1920;
    const int height = 1080;
    const int comp = 4; // RGBA
    const int size = width * height * comp;

    // Owns an aligned_alloc'd buffer so it is released on every exit path,
    // including the early error returns below.
    struct FreeDeleter {
        void operator()(char* p) const noexcept { std::free(p); }
    };
    using Buffer = std::unique_ptr<char, FreeDeleter>;

    // 64-byte-aligned buffers, so they are also suitable for 16-byte-aligned
    // SIMD access. size (8,294,400) is a multiple of 64.
    Buffer data_owner(static_cast<char*>(aligned_alloc(64, size)));
    Buffer outdata_owner(static_cast<char*>(aligned_alloc(64, size)));
    if (!data_owner || !outdata_owner) {
        std::cerr << "Unable to allocate buffers" << std::endl;
        return 1;
    }
    char* data = data_owner.get();
    char* outdata = outdata_owner.get();

    std::ifstream file(RGBA_PATH, std::ios::binary); // open the file in binary mode
    if (!file.is_open()) {
        std::cerr << "Unable to open file" << std::endl;
        return 1;
    }
    // A short read would leave part of `data` uninitialised, so treat it as
    // an error instead of benchmarking and writing out garbage.
    if (!file.read(data, size)) {
        std::cerr << "Unable to read " << size << " bytes from input file" << std::endl;
        return 1;
    }
    file.close();

    auto start = std::chrono::high_resolution_clock::now();
    // data size 8100KB
    memcpy(outdata, data, size); // 2.82638 ms
    // std::copy(data, data+size, outdata); // same as memcpy
    // simple_memcpy(outdata, data, size); // 14ms
    // sse_memcpy(outdata, data, size); // slightly slower than memcpy
    // threaded_memcpy(outdata, data, size, size / 4); // same as memcpy
    auto end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double, std::milli> elapsed = end - start;
    std::cout <<"line: "<<__LINE__ << ", Function execution time: " << elapsed.count() << " ms" << std::endl;

    // Writing the copy out also keeps the compiler from discarding the timed copy.
    std::ofstream outFile("output_1920x1080.rgba", std::ios::binary);
    if (!outFile) {
        std::cerr << "无法打开文件进行写入!" << std::endl;
        return 1;
    }
    outFile.write(outdata, size);
    if (!outFile) {
        std::cerr << "写入数据时发生错误!" << std::endl;
        return 1;
    }
    outFile.close();
    return 0; // buffers are freed by the Buffer destructors
}
/// Naive reference copy: moves `n` bytes from `src` to `dest` one byte at a time.
///
/// @param dest destination buffer, at least `n` bytes.
/// @param src  source buffer, at least `n` bytes.
/// @param n    number of bytes to copy.
/// @return `dest`, mirroring the memcpy convention.
void* simple_memcpy(void* dest, const void* src, size_t n) {
    auto* out = static_cast<char*>(dest);
    const auto* in = static_cast<const char*>(src);
    for (size_t idx = 0; idx < n; ++idx) {
        out[idx] = in[idx];
    }
    return dest;
}
/// Copies `n` bytes from `src` to `dest` using 128-bit SSE2 loads and stores.
///
/// The bulk of the buffer is moved 16 bytes per iteration; the final
/// `n % 16` bytes are copied one at a time. Unaligned load/store intrinsics
/// are used so the function is valid for any pointer alignment.
///
/// @param dest destination buffer, at least `n` bytes; must not overlap `src`.
/// @param src  source buffer, at least `n` bytes.
/// @param n    number of bytes to copy.
void sse_memcpy(char* dest, const char* src, size_t n) {
    constexpr size_t kVecBytes = sizeof(__m128i); // 16 bytes (128 bits)
    const size_t vec_end = n - (n % kVecBytes);

    size_t off = 0;
    // Copy 16 bytes (128 bits) at a time.
    for (; off < vec_end; off += kVecBytes) {
        const __m128i chunk =
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + off));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + off), chunk);
    }
    // Handle the remaining bytes one by one. The step must be 1 here: stepping
    // by 16 would copy a single byte and silently drop the rest of the tail.
    for (; off < n; ++off) {
        dest[off] = src[off];
    }
}
/// Copies `size` bytes from `src` to `dest`, splitting the work into
/// `block_size`-byte chunks that are each copied by their own std::thread.
///
/// Any trailing `size % block_size` bytes are copied on the calling thread
/// while the workers run. If `block_size` is 0 or not smaller than `size`,
/// the whole range is copied with a single memcpy and no threads are started.
///
/// @param dest       destination buffer, at least `size` bytes; must not overlap `src`.
/// @param src        source buffer, at least `size` bytes.
/// @param size       total number of bytes to copy.
/// @param block_size number of bytes handled by each worker thread.
void threaded_memcpy(char* dest, const char* src, size_t size, size_t block_size) {
    if (size == 0) {
        return;
    }
    // Guards the division below and covers the "one block or less" case,
    // where spawning a thread would only add overhead.
    if (block_size == 0 || block_size >= size) {
        memcpy(dest, src, size);
        return;
    }

    const size_t total_blocks = size / block_size;
    std::vector<std::thread> threads;
    threads.reserve(total_blocks);
    for (size_t i = 0; i < total_blocks; ++i) {
        const size_t offset = i * block_size;
        threads.emplace_back([dest, src, offset, block_size]() {
            memcpy(dest + offset, src + offset, block_size);
        });
    }

    // Copy the bytes that do not fill a whole block; they were previously lost.
    const size_t copied = total_blocks * block_size;
    if (copied < size) {
        memcpy(dest + copied, src + copied, size - copied);
    }

    // Wait for all worker threads to finish.
    for (auto& t : threads) {
        t.join();
    }
}