Files
Loongson_2k0300_SmartCar_Fr…/bench/bench.cpp

129 lines
4.4 KiB
C++

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>
// Clock used for all interval timing. steady_clock is guaranteed monotonic;
// high_resolution_clock is permitted to be an alias of system_clock, whose
// readings can jump when the wall clock is adjusted mid-measurement.
using Clock = std::chrono::steady_clock;

// Volatile sink: bench() folds C[0] into it after every repetition so the
// computed result is observably used.
static volatile int32_t g_sink = 0;

// Side length of the square scratch buffers below (BLK*BLK elements each).
static constexpr int BLK = 512;

// int8 input operands and int32 result buffer, each aligned to 64 bytes.
alignas(64) static int8_t  A[BLK * BLK];
alignas(64) static int8_t  B[BLK * BLK];
alignas(64) static int32_t C[BLK * BLK];
// Fills the A and B operand buffers with a fixed, reproducible pattern.
// Every element ends up in the range [-63, 63].
static void init()
{
    // (idx * mul + add) mod 127 lies in [0, 126]; subtracting 63 centres it on zero.
    const auto pattern = [](int idx, int mul, int add) {
        return static_cast<int8_t>((idx * mul + add) % 127 - 63);
    };

    constexpr int total = BLK * BLK;
    for (int idx = 0; idx != total; ++idx) {
        A[idx] = pattern(idx, 13, 7);
        B[idx] = pattern(idx, 29, 3);
    }
}
// 运行足够 reps 使时长 >= target_sec
static double bench(int M, int N, int K, double target_sec)
{
auto run = [&](int reps) {
auto t0 = Clock::now();
for (int r = 0; r < reps; ++r) {
for (int i = 0; i < M; ++i) {
int8_t* __restrict ar = A + i * K;
int32_t* __restrict cr = C + i * N;
for (int j = 0; j < N; ++j) {
int32_t s = 0;
for (int k = 0; k < K; ++k)
s += (int32_t)ar[k] * (int32_t)B[k * N + j];
cr[j] = s;
}
}
g_sink += C[0];
}
auto t1 = Clock::now();
return std::chrono::duration<double>(t1 - t0).count();
};
// 短跑估计 rep
int reps = 1;
double t = run(reps);
while (t < 0.1 && reps < 10000) { reps *= 2; t = run(reps); }
// 按 target_sec 缩放 reps
int final_reps = (int)(reps * target_sec / t);
if (final_reps < 1) final_reps = 1;
// 正式跑
double t_real = run(final_reps);
if (t_real < 0.5) {
final_reps *= 10;
t_real = run(final_reps);
}
double ops = (double)M * N * K * final_reps * 2; // 1 MAC = 2 ops
double gops = ops / t_real / 1e9;
printf(" (%4dx%4dx%4d) x%6d rep %7.1f ms → %8.3f GOPS (%8.3f GMACS)\n",
M, N, K, final_reps, t_real * 1000.0, gops, gops/2.0);
return gops;
}
// Entry point: runs a warm-up, benchmarks a fixed list of matrix shapes for
// about 10 seconds each, prints the average throughput, and then prints a
// frames-per-second estimate for a table of model sizes.
int main()
{
    printf("═══════════════════════════════════════════\n");
    printf(" INT8 算力持续压力测试 (每项 ~10s)\n");
    printf("═══════════════════════════════════════════\n\n");
    init();

    // Warm-up pass; its result is not included in the average.
    bench(64, 64, 64, 1.0);

    const double T = 10.0;   // target duration of each test, in seconds

    // Shapes to benchmark. `desc` is a human-readable label kept for
    // reference; it is not printed.
    struct { int m,n,k; const char* desc; } tests[] = {
        { 16, 16, 16, "微型 16x16x16" },
        { 32, 32, 32, "小阵 32x32x32" },
        { 64, 64, 64, "中阵 64x64x64" },
        {128, 64, 64, "宽中 128x64x64" },
        {256, 64, 64, "宽大 256x64x64" },
        {128,128, 64, "大方 128x128x64" },
        { 64,128, 64, "长条 64x128x64" },
        { 64, 64,128, "深乘 64x64x128" },
        { 32,128,128, "扁平 32x128x128" },
    };

    double sum = 0;
    int cnt = 0;
    for (const auto& t : tests) {
        sum += bench(t.m, t.n, t.k, T);
        ++cnt;
    }
    const double avg = sum / cnt;   // average throughput in GOPS

    // bench() counts one MAC as two operations, while the model table below is
    // expressed in mega-MACs. Convert the average to mega-MACs per second
    // before dividing; using GOPS directly would overstate every FPS figure
    // by a factor of two.
    const double avg_mmacs_per_sec = avg / 2.0 * 1e3;

    printf("\n═══════════════════════════════════════════\n");
    printf(" INT8 平均: %.3f GOPS\n", avg);
    printf("═══════════════════════════════════════════\n");
    printf("\n 模型帧率预估 (INT8):\n");
    printf(" %-30s %8s %8s\n", "模型", "MACs(M)", "FPS");
    printf(" ----------------------------------------------------\n");

    // Model name and its cost in millions of MACs per frame.
    struct { const char* n; double m; } models[] = {
        {"TinyCNN 40x30x3 -> 8ch k3", 3},
        {"TinyCNN 40x30x8 -> 16ch k3", 8},
        {"TinyCNN 40x30x16 -> 32ch k3", 18},
        {"TinyCNN 56x56x3 -> 16ch k3", 12},
        {"TinyCNN 56x56x16 -> 32ch k3", 45},
        {"MicroNet 32x32x8, 3层", 5},
        {"MicroNet 32x32x16, 3层", 15},
        {"MicroNet 48x48x8, 3层", 12},
        {"MicroNet 48x48x16, 3层", 40},
        {"ShuffleNetV2 0.5x", 41},
        {"MobileNetV2 0.35x 96x96", 35},
        {"FC-128 (分类头)", 0.1},
    };
    for (const auto& m : models)
        printf(" %-30s %8.0f %8.1f\n", m.n, m.m, avg_mmacs_per_sec / m.m);

    // Largest per-frame MAC budget (in millions) that still reaches 30 FPS.
    printf("\n >>> 稳 30FPS 上限: %.0fM MACs (INT8 %.3f GOPS)\n\n",
           avg_mmacs_per_sec / 30.0, avg);
    return 0;
}