// INT8 matrix-multiply sustained-throughput benchmark (C++).
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>

// Clock used for every timing measurement. steady_clock is monotonic, so a
// measured interval cannot be distorted by wall-clock adjustments;
// high_resolution_clock gives no such guarantee.
using Clock = std::chrono::steady_clock;

// Volatile sink that bench() accumulates into after every repetition so the
// computed results are observably used.
static volatile int32_t g_sink = 0;

// Edge length of the statically allocated square buffers.
static constexpr int BLK = 512;

// Operand matrices (filled by init()) and the result matrix (written by
// bench()), each aligned to 64 bytes.
alignas(64) static int8_t  A[BLK * BLK];
alignas(64) static int8_t  B[BLK * BLK];
alignas(64) static int32_t C[BLK * BLK];

static void init()
|
|
{
|
|
for (int i = 0; i < BLK*BLK; ++i) {
|
|
A[i] = (int8_t)((i * 13 + 7) % 127 - 63);
|
|
B[i] = (int8_t)((i * 29 + 3) % 127 - 63);
|
|
}
|
|
}
|
|
|
|
// 运行足够 reps 使时长 >= target_sec
|
|
static double bench(int M, int N, int K, double target_sec)
|
|
{
|
|
auto run = [&](int reps) {
|
|
auto t0 = Clock::now();
|
|
for (int r = 0; r < reps; ++r) {
|
|
for (int i = 0; i < M; ++i) {
|
|
int8_t* __restrict ar = A + i * K;
|
|
int32_t* __restrict cr = C + i * N;
|
|
for (int j = 0; j < N; ++j) {
|
|
int32_t s = 0;
|
|
for (int k = 0; k < K; ++k)
|
|
s += (int32_t)ar[k] * (int32_t)B[k * N + j];
|
|
cr[j] = s;
|
|
}
|
|
}
|
|
g_sink += C[0];
|
|
}
|
|
auto t1 = Clock::now();
|
|
return std::chrono::duration<double>(t1 - t0).count();
|
|
};
|
|
|
|
// 短跑估计 rep
|
|
int reps = 1;
|
|
double t = run(reps);
|
|
while (t < 0.1 && reps < 10000) { reps *= 2; t = run(reps); }
|
|
|
|
// 按 target_sec 缩放 reps
|
|
int final_reps = (int)(reps * target_sec / t);
|
|
if (final_reps < 1) final_reps = 1;
|
|
|
|
// 正式跑
|
|
double t_real = run(final_reps);
|
|
if (t_real < 0.5) {
|
|
final_reps *= 10;
|
|
t_real = run(final_reps);
|
|
}
|
|
|
|
double ops = (double)M * N * K * final_reps * 2; // 1 MAC = 2 ops
|
|
double gops = ops / t_real / 1e9;
|
|
printf(" (%4dx%4dx%4d) x%6d rep %7.1f ms → %8.3f GOPS (%8.3f GMACS)\n",
|
|
M, N, K, final_reps, t_real * 1000.0, gops, gops/2.0);
|
|
return gops;
|
|
}
|
|
|
|
int main()
|
|
{
|
|
printf("═══════════════════════════════════════════\n");
|
|
printf(" INT8 算力持续压力测试 (每项 ~10s)\n");
|
|
printf("═══════════════════════════════════════════\n\n");
|
|
|
|
init();
|
|
// warmup
|
|
bench(64, 64, 64, 1.0);
|
|
|
|
const double T = 10.0; // 每项跑 10 秒
|
|
|
|
struct { int m,n,k; const char* desc; } tests[] = {
|
|
{ 16, 16, 16, "微型 16x16x16" },
|
|
{ 32, 32, 32, "小阵 32x32x32" },
|
|
{ 64, 64, 64, "中阵 64x64x64" },
|
|
{128, 64, 64, "宽中 128x64x64" },
|
|
{256, 64, 64, "宽大 256x64x64" },
|
|
{128,128, 64, "大方 128x128x64" },
|
|
{ 64,128, 64, "长条 64x128x64" },
|
|
{ 64, 64,128, "深乘 64x64x128" },
|
|
{ 32,128,128, "扁平 32x128x128" },
|
|
};
|
|
|
|
double sum = 0; int cnt = 0;
|
|
for (auto& t : tests)
|
|
{
|
|
double g = bench(t.m, t.n, t.k, T);
|
|
sum += g; cnt++;
|
|
}
|
|
|
|
double avg = sum / cnt;
|
|
printf("\n═══════════════════════════════════════════\n");
|
|
printf(" INT8 平均: %.3f GOPS\n", avg);
|
|
printf("═══════════════════════════════════════════\n");
|
|
|
|
printf("\n 模型帧率预估 (INT8):\n");
|
|
printf(" %-30s %8s %8s\n", "模型", "MACs(M)", "FPS");
|
|
printf(" ----------------------------------------------------\n");
|
|
|
|
struct { const char* n; double m; } models[] = {
|
|
{"TinyCNN 40x30x3 -> 8ch k3", 3},
|
|
{"TinyCNN 40x30x8 -> 16ch k3", 8},
|
|
{"TinyCNN 40x30x16 -> 32ch k3", 18},
|
|
{"TinyCNN 56x56x3 -> 16ch k3", 12},
|
|
{"TinyCNN 56x56x16 -> 32ch k3", 45},
|
|
{"MicroNet 32x32x8, 3层", 5},
|
|
{"MicroNet 32x32x16, 3层", 15},
|
|
{"MicroNet 48x48x8, 3层", 12},
|
|
{"MicroNet 48x48x16, 3层", 40},
|
|
{"ShuffleNetV2 0.5x", 41},
|
|
{"MobileNetV2 0.35x 96x96", 35},
|
|
{"FC-128 (分类头)", 0.1},
|
|
};
|
|
|
|
for (auto& m : models)
|
|
printf(" %-30s %8.0f %8.1f\n", m.n, m.m, avg * 1e3 / m.m);
|
|
|
|
printf("\n >>> 稳 30FPS 上限: %.0fM MACs (INT8 %.3f GOPS)\n\n", avg * 1e3 / 30.0, avg);
|
|
return 0;
|
|
}
|