// INT8 matrix-multiply throughput stress test.
//
// Runs a naive int8 x int8 -> int32 GEMM on several small shapes for a fixed
// wall-clock budget each, reports GOPS/GMACS per shape, and derives a rough
// frames-per-second estimate for a list of small models from the average.
//
// NOTE(review): the original file had four #include directives whose header
// names were lost. The three below are the ones this code requires; re-add
// the fourth if anything outside this file depended on it.
#include <chrono>
#include <cstdint>
#include <cstdio>

using Clock = std::chrono::high_resolution_clock;

// Volatile sink read/written once per repetition so the timed loop has an
// observable side effect.
static volatile int32_t g_sink = 0;

// Side length of the square backing buffers; every benchmarked shape must fit.
static constexpr int BLK = 512;

alignas(64) static int8_t A[BLK * BLK];
alignas(64) static int8_t B[BLK * BLK];
alignas(64) static int32_t C[BLK * BLK];

// Upper bound on repetition counts so scaling can never overflow `int`.
static constexpr int kMaxReps = 1000000000;

// Fills A and B with a deterministic pattern of values in [-63, 63].
static void init() {
    for (int i = 0; i < BLK * BLK; ++i) {
        A[i] = static_cast<int8_t>((i * 13 + 7) % 127 - 63);
        B[i] = static_cast<int8_t>((i * 29 + 3) % 127 - 63);
    }
}

// Converts a throughput in GOPS (1 MAC = 2 ops) and a model cost in millions
// of MACs into frames per second.
static double estimate_fps(double gops, double mega_macs) {
    const double gmacs = gops / 2.0;  // MAC budgets must be compared in MACs, not ops
    return gmacs * 1e3 / mega_macs;
}

// Benchmarks an M x K by K x N int8 GEMM for roughly `target_sec` seconds.
//
// Calibrates a repetition count with short runs, scales it to the target
// duration, performs the measured run, prints one result line, and returns
// the measured throughput in GOPS (1 MAC counted as 2 ops). Returns 0.0 if the
// shape does not fit in the BLK x BLK backing buffers.
static double bench(int M, int N, int K, double target_sec) {
    constexpr long long kCapacity = static_cast<long long>(BLK) * BLK;
    if (M <= 0 || N <= 0 || K <= 0 ||
        static_cast<long long>(M) * K > kCapacity ||
        static_cast<long long>(K) * N > kCapacity ||
        static_cast<long long>(M) * N > kCapacity) {
        fprintf(stderr, "bench: shape %dx%dx%d does not fit in %dx%d buffers\n",
                M, N, K, BLK, BLK);
        return 0.0;
    }

    // Runs the kernel `reps` times and returns the elapsed time in seconds.
    auto run = [&](int reps) {
        const auto t0 = Clock::now();
        for (int r = 0; r < reps; ++r) {
            for (int i = 0; i < M; ++i) {
                const int8_t* __restrict ar = A + i * K;
                int32_t* __restrict cr = C + i * N;
                for (int j = 0; j < N; ++j) {
                    int32_t s = 0;
                    for (int k = 0; k < K; ++k)
                        s += static_cast<int32_t>(ar[k]) *
                             static_cast<int32_t>(B[k * N + j]);
                    cr[j] = s;
                }
            }
            g_sink = g_sink + C[0];
        }
        const auto t1 = Clock::now();
        // duration<double> yields seconds; without the explicit rep type the
        // result would be the clock's native integer tick count.
        return std::chrono::duration<double>(t1 - t0).count();
    };

    // Short calibration runs: double reps until one run lasts at least 0.1 s.
    int reps = 1;
    double t = run(reps);
    while (t < 0.1 && reps < 10000) {
        reps *= 2;
        t = run(reps);
    }

    // Scale reps to the target duration, clamped so the conversion to int is
    // always defined (t may be 0 on a coarse clock).
    int final_reps = kMaxReps;
    if (t > 0.0) {
        const double scaled = reps * target_sec / t;
        if (scaled < 1.0)
            final_reps = 1;
        else if (scaled < static_cast<double>(kMaxReps))
            final_reps = static_cast<int>(scaled);
    }

    // Measured run; retry with 10x the reps if it finished too quickly.
    double t_real = run(final_reps);
    if (t_real < 0.5 && final_reps <= kMaxReps / 10) {
        final_reps *= 10;
        t_real = run(final_reps);
    }

    const double ops = static_cast<double>(M) * N * K * final_reps * 2;  // 1 MAC = 2 ops
    const double gops = t_real > 0.0 ? ops / t_real / 1e9 : 0.0;
    printf(" (%4dx%4dx%4d) x%6d rep %7.1f ms → %8.3f GOPS (%8.3f GMACS)\n",
           M, N, K, final_reps, t_real * 1000.0, gops, gops / 2.0);
    return gops;
}

// Runs the warm-up, the per-shape benchmarks, and prints the summary tables.
int main() {
    printf("═══════════════════════════════════════════\n");
    printf(" INT8 算力持续压力测试 (每项 ~10s)\n");
    printf("═══════════════════════════════════════════\n\n");

    init();

    // warmup
    bench(64, 64, 64, 1.0);

    const double T = 10.0;  // seconds per test case

    struct { int m, n, k; const char* desc; } tests[] = {
        { 16,  16,  16, "微型 16x16x16" },
        { 32,  32,  32, "小阵 32x32x32" },
        { 64,  64,  64, "中阵 64x64x64" },
        {128,  64,  64, "宽中 128x64x64" },
        {256,  64,  64, "宽大 256x64x64" },
        {128, 128,  64, "大方 128x128x64" },
        { 64, 128,  64, "长条 64x128x64" },
        { 64,  64, 128, "深乘 64x64x128" },
        { 32, 128, 128, "扁平 32x128x128" },
    };

    double sum = 0;
    int cnt = 0;
    for (const auto& t : tests) {
        sum += bench(t.m, t.n, t.k, T);
        cnt++;
    }
    const double avg = sum / cnt;

    printf("\n═══════════════════════════════════════════\n");
    printf(" INT8 平均: %.3f GOPS\n", avg);
    printf("═══════════════════════════════════════════\n");

    printf("\n 模型帧率预估 (INT8):\n");
    printf(" %-30s %8s %8s\n", "模型", "MACs(M)", "FPS");
    printf(" ----------------------------------------------------\n");

    // Model name and its per-frame cost in millions of MACs.
    struct { const char* n; double m; } models[] = {
        {"TinyCNN 40x30x3 -> 8ch k3", 3},
        {"TinyCNN 40x30x8 -> 16ch k3", 8},
        {"TinyCNN 40x30x16 -> 32ch k3", 18},
        {"TinyCNN 56x56x3 -> 16ch k3", 12},
        {"TinyCNN 56x56x16 -> 32ch k3", 45},
        {"MicroNet 32x32x8, 3层", 5},
        {"MicroNet 32x32x16, 3层", 15},
        {"MicroNet 48x48x8, 3层", 12},
        {"MicroNet 48x48x16, 3层", 40},
        {"ShuffleNetV2 0.5x", 41},
        {"MobileNetV2 0.35x 96x96", 35},
        {"FC-128 (分类头)", 0.1},
    };
    for (const auto& m : models)
        printf(" %-30s %8.0f %8.1f\n", m.n, m.m, estimate_fps(avg, m.m));

    // MAC budget per frame at 30 FPS: GMACS (= GOPS / 2) * 1e3 / 30, in mega-MACs.
    printf("\n >>> 稳 30FPS 上限: %.0fM MACs (INT8 %.3f GOPS)\n\n",
           (avg / 2.0) * 1e3 / 30.0, avg);
    return 0;
}