Add async model thread, vision FPS throttle, configurable FPS, .gitignore __pycache__

2026-05-25 11:04:05 +08:00
parent 28d9c6da58
commit 610f0a7549
8 changed files with 503 additions and 55 deletions
--- a/model/model.cpp
+++ b/model/model.cpp
@@ -1,13 +1,287 @@
 #include "model.hpp"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <algorithm>

-bool model_init()
-{
-    // TODO: 加载 TFLite 模型，初始化解释器
-    return false;
+// ============================================================
+// 内存预分配
+// ============================================================
+// Activation buffers for one forward pass. Every array is a CHW float tensor
+// sized channels*height*width. The struct is allocated on the heap by
+// model_init() and member order is kept exactly as before.
+struct M {
+    float stem[8*60*80];      // stem conv output
+    float b1[16*60*80];       // block1 output
+    float b1_tmp[16*60*80];   // not referenced by the code in this file
+    float b2[28*30*40];       // block2 output
+    float b2_dw[16*30*40];    // not referenced by the code in this file
+    float b3[40*30*40];       // block3 output
+    float b3_dw[28*30*40];    // not referenced by the code in this file
+    float b4[56*15*20];       // block4 output
+    float b4_dw[40*15*20];    // not referenced by the code in this file
+    float b5[64*15*20];       // block5 output
+    float b5_dw[56*15*20];    // not referenced by the code in this file
+    float sh[24*15*20];       // shared 1x1 conv output, input of both heads
+    float cls[4*15*20];       // class head output, 4 channels per grid cell
+    float sz[2*15*20];        // size head output, 2 channels per grid cell
+    // se_block scratch: holds the depthwise output, then is reused for the
+    // skip branch once the pointwise conv has consumed it.
+    float dw_out[64*60*80];
+};
+
+// Global activation storage; created by model_init(), freed by model_deinit().
+static M* m = nullptr;
+
+// ============================================================
+// 权重表
+// ============================================================
+// One named tensor read from the weight file.
+struct WT {
+    char   n[64];  // tensor name, NUL-terminated by model_init()
+    int    nd;     // number of dimensions stored in the file
+    int    s[4];   // dimension sizes; unused trailing entries are set to 1
+    float* d;      // element data, allocated with new[] by model_init()
+};
+
+// Global weight table (gn entries), filled by model_init().
+static WT* gw = nullptr;
+static int gn = 0;
+
+// Finds a tensor by exact name in the global weight table.
+// Returns its data pointer, or nullptr after printing a MISS line when no
+// entry carries that name.
+static float* wf(const char* name) {
+    int idx = 0;
+    while (idx < gn) {
+        if (std::strcmp(gw[idx].n, name) == 0) return gw[idx].d;
+        ++idx;
+    }
+    printf("[MODEL] MISS %s\n", name);
+    return nullptr;
 }

-void model_infer(const uint8* image, int w, int h, float* output, int num_classes)
+// ============================================================
+// 基础算子
+// ============================================================
+// In-place ReLU: each negative value among the first N floats of x becomes 0.
+static void relu_f(float* x, int N) {
+    for (int k = 0; k < N; ++k) {
+        float& v = x[k];
+        if (v < 0) v = 0;
+    }
+}
+// In-place logistic sigmoid over the first N floats of x: x = 1 / (1 + e^-x).
+static void sigmoid_f(float* x, int N) {
+    for (int k = 0; k < N; ++k) {
+        const float v = x[k];
+        x[k] = 1.0f/(1.0f+std::exp(-v));
+    }
+}
+
+// Direct 2-D convolution on CHW float tensors with zero padding of K/2.
+//   o    : output, oC x (H/str) x (W/str)
+//   in   : input,  iC x H x W
+//   w    : weights laid out as [oC][iC/grp][K][K], the usual grouped-conv
+//          layout (what a PyTorch Conv2d stores) -- TODO confirm the export
+//          script does not densify grouped weights
+//   bias : one value per output channel, or nullptr for no bias
+//   str  : stride, applied to both axes
+//   grp  : group count; grp == iC == oC is a depthwise convolution
+// Returns without writing when a required pointer is null or str/grp is not
+// positive, so a weight that failed to load cannot be dereferenced here.
+static void conv2d(float* o, const float* in, const float* w, const float* bias,
+                   int H, int W, int iC, int oC, int K, int str, int grp)
 {
-    // TODO: 运行推理，填充 output[0..num_classes-1]
-    for (int i = 0; i < num_classes; ++i) output[i] = 0.0f;
+    if (!o || !in || !w || str <= 0 || grp <= 0) return;
+
+    const int Ho = H/str, Wo = W/str, pad = K/2;
+    const int icpg = iC/grp, ocpg = oC/grp;
+    for (int g=0; g<grp; ++g) {
+        for (int oc=0; oc<ocpg; ++oc) {
+            const int occ = g*ocpg + oc;
+            float* oo = o + occ*Ho*Wo;
+            const float b0 = bias ? bias[occ] : 0;
+            // Each output channel owns icpg kernels. The kernel is selected by
+            // the channel's index inside its group; indexing by the absolute
+            // input channel would run past the table whenever grp > 1.
+            const float* wo = w + occ*icpg*K*K;
+            for (int y=0; y<Ho; ++y) for (int x=0; x<Wo; ++x) {
+                float s = b0;
+                for (int ic=0; ic<icpg; ++ic) {
+                    const float* ww = wo + ic*K*K;
+                    const float* ii = in + (g*icpg + ic)*H*W;
+                    for (int ky=0; ky<K; ++ky) {
+                        const int iy = y*str + ky - pad;
+                        if (iy<0||iy>=H) continue;   // zero padding
+                        for (int kx=0; kx<K; ++kx) {
+                            const int ix = x*str + kx - pad;
+                            if (ix<0||ix>=W) continue;
+                            s += ww[ky*K+kx] * ii[iy*W+ix];
+                        }
+                    }
+                }
+                oo[y*Wo+x] = s;
+            }
+        }
+    }
 }
+
+// Inference-mode batch normalisation, applied in place to a CHW tensor.
+// For every channel ch:
+//   x = (x - mean[ch]) / sqrt(var[ch] + 1e-5) * w[ch] + b[ch]
+// A null w is treated as a scale of 1 and a null b as an offset of 0.
+//   C : channel count
+//   N : number of elements in ONE channel (height * width)
+static void batchnorm_f(float* x, const float* w, const float* b,
+                       const float* mean, const float* var, int C, int N)
+{
+    const float eps = 1e-5f;
+    for (int ch = 0; ch < C; ++ch) {
+        const float gamma = w ? w[ch] : 1;
+        const float beta = b ? b[ch] : 0;
+        const float inv_std = 1.0f/std::sqrt(var[ch]+eps);
+        float* plane = x + ch*N;
+        for (int k = 0; k < N; ++k) {
+            plane[k] = (plane[k]-mean[ch])*inv_std*gamma + beta;
+        }
+    }
+}
+
+// Convolution, then inference batch-norm, then an optional ReLU, written to o.
+// NOTE: cb is accepted but never used -- the convolution always runs without
+// a bias term.
+static void conv_bn(float* o, const float* in, int H, int W,
+                   const float* cw, const float* cb,
+                   const float* bn_w, const float* bn_b,
+                   const float* bn_m, const float* bn_v,
+                   int iC, int oC, int K, int str, int grp, bool relu_flag)
+{
+    (void)cb;
+    conv2d(o, in, cw, nullptr, H, W, iC, oC, K, str, grp);
+
+    // Elements per output channel after striding.
+    const int per_channel = (H/str)*(W/str);
+    batchnorm_f(o, bn_w, bn_b, bn_m, bn_v, oC, per_channel);
+
+    if (!relu_flag) return;
+    relu_f(o, oC*per_channel);
+}
+
+// Global average pooling: o[ch] receives the mean of the H*W values of
+// channel ch of the CHW tensor `in`.
+static void gap(float* o, const float* in, int C, int H, int W) {
+    const int area = H*W;
+    for (int ch = 0; ch < C; ++ch) {
+        const float* plane = in + ch*area;
+        float acc = 0;
+        for (int k = 0; k < area; ++k) acc += plane[k];
+        o[ch] = acc/static_cast<float>(area);
+    }
+}
+
+// In-place softmax across the channel axis of a C x N tensor: for every
+// position the C values x[c*N + pos] are replaced by probabilities summing
+// to 1. The running maximum (seeded with -1e9) is subtracted before
+// exponentiating.
+static void softmax_c(float* x, int C, int N) {
+    for (int pos = 0; pos < N; ++pos) {
+        float* col = x + pos;   // channel c of this position is col[c*N]
+
+        float peak = -1e9f;
+        for (int c = 0; c < C; ++c) peak = std::max(peak, col[c*N]);
+
+        float total = 0;
+        for (int c = 0; c < C; ++c) {
+            col[c*N] = std::exp(col[c*N]-peak);
+            total += col[c*N];
+        }
+
+        for (int c = 0; c < C; ++c) col[c*N] /= total;
+    }
+}
+
+// ============================================================
+// SE-ResDW Block
+// ============================================================
+// Looks up "<pfx>.<stage>.{weight,bias,running_mean,running_var}" one name at
+// a time and applies inference batch-norm in place to C channels of N
+// elements each. The lookups are separate statements because they share one
+// name buffer; doing them inside a single call's argument list would leave
+// the lookup order, and so the name each one sees, up to the compiler.
+// Skips the normalisation when the running statistics are missing.
+static void se_bn(float* x, const char* pfx, const char* stage, int C, int N)
+{
+    char na[128];
+    std::snprintf(na, sizeof(na), "%s.%s.weight", pfx, stage);
+    const float* gamma = wf(na);
+    std::snprintf(na, sizeof(na), "%s.%s.bias", pfx, stage);
+    const float* beta = wf(na);
+    std::snprintf(na, sizeof(na), "%s.%s.running_mean", pfx, stage);
+    const float* mean = wf(na);
+    std::snprintf(na, sizeof(na), "%s.%s.running_var", pfx, stage);
+    const float* var = wf(na);
+    if (!mean || !var) return;   // wf() has already logged the miss
+    batchnorm_f(x, gamma, beta, mean, var, C, N);
+}
+
+// One residual block: depthwise 3x3 conv -> BN -> ReLU -> pointwise 1x1 conv
+// -> BN -> ReLU, plus a skip branch, followed by squeeze-and-excitation
+// channel gating and a final ReLU.
+//   o    : output, oC x (H/str) x (W/str)
+//   in   : input,  iC x H x W
+//   str  : stride of the depthwise conv (and of the projected skip branch)
+//   pfx  : weight-name prefix, e.g. "block1"
+// Uses m->dw_out as scratch, so the block is not reentrant.
+static void se_block(float* o, const float* in, int iC, int oC, int H, int W, int str, const char* pfx)
+{
+    constexpr int kMaxC = 64;   // capacity of the SE scratch arrays below
+    if (oC > kMaxC) {
+        printf("[MODEL] %s: oC=%d exceeds SE scratch capacity\n", pfx, oC);
+        return;
+    }
+
+    const int Ho=H/str, Wo=W/str, No=Ho*Wo;
+    char na[128];
+
+    // depthwise conv + BN + ReLU (into scratch)
+    std::snprintf(na,sizeof(na),"%s.dw.0.weight",pfx);
+    conv2d(m->dw_out, in, wf(na), nullptr, H, W, iC, iC, 3, str, iC);
+    se_bn(m->dw_out, pfx, "dw.1", iC, No);   // N is per channel, not iC*No
+    relu_f(m->dw_out, iC*No);
+
+    // pointwise conv + BN + ReLU (to output)
+    std::snprintf(na,sizeof(na),"%s.pw.0.weight",pfx);
+    conv2d(o, m->dw_out, wf(na), nullptr, Ho, Wo, iC, oC, 1, 1, 1);
+    se_bn(o, pfx, "pw.1", oC, No);
+    relu_f(o, oC*No);
+
+    // skip branch (no ReLU): identity when the shape is unchanged, otherwise a
+    // strided 1x1 projection + BN written into the now-free scratch buffer
+    const float* skip = in;
+    if (!(str==1 && iC==oC)) {
+        std::snprintf(na,sizeof(na),"%s.skip.0.weight",pfx);
+        conv2d(m->dw_out, in, wf(na), nullptr, H, W, iC, oC, 1, str, 1);
+        se_bn(m->dw_out, pfx, "skip.1", oC, No);
+        skip = m->dw_out;
+    }
+
+    // add skip
+    for (int i=0; i<oC*No; ++i) o[i] += skip[i];
+
+    // SE: the input is o, i.e. the sum of the pointwise and skip branches
+    float se_avg[kMaxC];
+    gap(se_avg, o, oC, Ho, Wo);
+
+    const int rc = oC/4;   // reduced channel count, at most kMaxC/4
+    float se1[kMaxC/4], se2[kMaxC];
+    std::snprintf(na,sizeof(na),"%s.se.1.weight",pfx);
+    const float* se1w = wf(na);
+    std::snprintf(na,sizeof(na),"%s.se.1.bias",pfx);
+    const float* se1b = wf(na);
+    std::snprintf(na,sizeof(na),"%s.se.3.weight",pfx);
+    const float* se2w = wf(na);
+    std::snprintf(na,sizeof(na),"%s.se.3.bias",pfx);
+    const float* se2b = wf(na);
+
+    // Without both weight matrices the gating is skipped rather than
+    // dereferencing a null pointer; wf() has already reported the miss.
+    if (se1w && se2w) {
+        // squeeze: oC -> rc, ReLU
+        for (int i=0; i<rc; ++i) {
+            float s = se1b ? se1b[i] : 0;
+            for (int j=0; j<oC; ++j) s += se1w[i*oC+j]*se_avg[j];
+            se1[i] = s;
+        }
+        relu_f(se1, rc);
+
+        // excite: rc -> oC, sigmoid
+        for (int i=0; i<oC; ++i) {
+            float s = se2b ? se2b[i] : 0;
+            for (int j=0; j<rc; ++j) s += se2w[i*rc+j]*se1[j];
+            se2[i] = s;
+        }
+        sigmoid_f(se2, oC);
+
+        // scale every channel by its gate
+        for (int c=0; c<oC; ++c) {
+            const float sc = se2[c];
+            float* oc = o + c*No;
+            for (int i=0; i<No; ++i) oc[i] *= sc;
+        }
+    }
+    relu_f(o, oC*No);
+}
+
+// ============================================================
+// 检测后处理
+// ============================================================
+// Detection grid and decoding constants used by decode().
+static constexpr int OH = 15;   // grid rows of the head outputs
+static constexpr int OW = 20;   // grid columns of the head outputs
+static constexpr int ST = 8;    // multiplier from grid cell centre to box centre
+static constexpr int NC = 3;    // number of class channels considered for detections
+// Per-class {width, height} factors multiplied with the size-head outputs.
+static constexpr float RS[3][2] = {
+    {75.36f, 62.15f},
+    {49.79f, 38.63f},
+    {67.59f, 34.94f},
+};
+
+// Turns the head outputs in m->cls / m->sz into detection boxes.
+// Applies a 4-way softmax to m->cls in place, then for every grid cell takes
+// the best of the first NC class channels, keeps it when it reaches `th` and
+// no 3x3 neighbour scores higher for that class, and emits a box centred on
+// the cell whose size is the size-head output scaled by RS[class].
+//   boxes : output array with room for `max` entries
+//   max   : capacity of `boxes`
+//   th    : minimum class score
+// Returns the number of boxes written (0 when boxes is null or max <= 0).
+static int decode(DetectBox* boxes, int max, float th)
+{
+    const int N = OH*OW;
+    softmax_c(m->cls, 4, N);
+    if (!boxes || max <= 0) return 0;
+
+    int cnt = 0;
+    for (int gy=0; gy<OH && cnt<max; ++gy)
+    for (int gx=0; gx<OW && cnt<max; ++gx)
+    {
+        const int idx = gy*OW + gx;
+        int bc = -1; float bs = 0;
+        for (int c=0; c<NC; ++c) { const float s = m->cls[c*N+idx]; if (s>bs) { bs=s; bc=c; } }
+        // bc stays -1 when no class score is above 0 (all zero or NaN); with
+        // th <= 0 that cell would otherwise index cls/RS at -1.
+        if (bc < 0 || bs < th) continue;
+
+        // local-maximum test over the 3x3 neighbourhood, same class only
+        bool pk = true;
+        for (int dy=-1; dy<=1&&pk; ++dy)
+        for (int dx=-1; dx<=1&&pk; ++dx)
+        {
+            const int ny=gy+dy, nx=gx+dx;
+            if (ny<0||ny>=OH||nx<0||nx>=OW) continue;
+            if (m->cls[bc*N+ny*OW+nx] > bs) pk = false;
+        }
+        if (!pk) continue;
+
+        const float pw = m->sz[0*N+idx], ph = m->sz[1*N+idx];
+        boxes[cnt].cls = bc; boxes[cnt].conf = bs;
+        boxes[cnt].cx = ((float)gx+0.5f)*ST; boxes[cnt].cy = ((float)gy+0.5f)*ST;
+        boxes[cnt].w = pw*RS[bc][0]; boxes[cnt].h = ph*RS[bc][1];
+        cnt++;
+    }
+    return cnt;
+}
+
+// ============================================================
+// 前向推理
+// ============================================================
+// Runs the whole network on one 160x120 interleaved BGR frame and leaves the
+// raw head outputs in m->cls and m->sz.
+//   bgr : 120 rows of 160 pixels, 3 bytes per pixel
+static void forward(const uint8* bgr)
+{
+    constexpr int IH = 120, IW = 160;
+
+    // Static storage: 3*120*160 floats is about 225 KiB, which is a lot to
+    // place on the stack of whichever thread calls this. forward() already
+    // writes to the global m, so it was never reentrant.
+    static float in[3*IH*IW];
+
+    // preprocessing: BGR -> RGB, float in [0,1], CHW layout
+    for (int c=0; c<3; ++c) {
+        const int sc = 2-c;              // source byte for destination channel c
+        float* ch = in + c*IH*IW;
+        for (int y=0; y<IH; ++y) {
+            const uint8* row = bgr + y*IW*3;
+            for (int x=0; x<IW; ++x) ch[y*IW+x] = (float)row[x*3+sc]/255.0f;
+        }
+    }
+
+    // stem: 3x3 stride-2 conv + BN + ReLU, 3 -> 8 channels
+    conv_bn(m->stem, in, IH,IW, wf("stem.0.weight"),nullptr,
+            wf("stem.1.weight"),wf("stem.1.bias"),
+            wf("stem.1.running_mean"),wf("stem.1.running_var"),
+            3,8,3,2,1,true);
+
+    se_block(m->b1, m->stem, 8,16, 60,80, 1,"block1");
+    se_block(m->b2, m->b1,   16,28, 60,80, 2,"block2");
+    se_block(m->b3, m->b2,   28,40, 30,40, 1,"block3");
+    se_block(m->b4, m->b3,   40,56, 30,40, 2,"block4");
+    se_block(m->b5, m->b4,   56,64, 15,20, 1,"block5");
+
+    // shared 1x1 conv + BN + ReLU, 64 -> 24 channels
+    conv_bn(m->sh, m->b5, 15,20, wf("shared.0.weight"),nullptr,
+            wf("shared.1.weight"),wf("shared.1.bias"),
+            wf("shared.1.running_mean"),wf("shared.1.running_var"),
+            64,24,1,1,1,true);
+
+    // heads: 1x1 conv with bias, no BN / ReLU
+    conv2d(m->cls, m->sh, wf("cls_head.weight"), wf("cls_head.bias"), 15,20, 24,4, 1,1,1);
+    conv2d(m->sz,  m->sh, wf("size_head.weight"),wf("size_head.bias"),15,20, 24,2, 1,1,1);
+}
+
+// ============================================================
+// 接口
+// ============================================================
+// True once model_init() has loaded a weight file successfully.
+static bool g_rdy = false;
+
+// Reads one int from f; false on a short read.
+static bool rd_i32(FILE* f, int& v) { return fread(&v, sizeof(int), 1, f) == 1; }
+
+// Frees a weight table whose entries are either loaded or still value-initialised.
+static void free_weights(WT* tbl, int count) {
+    if (!tbl) return;
+    for (int i = 0; i < count; ++i) delete[] tbl[i].d;
+    delete[] tbl;
+}
+
+// Loads the weight file at `path` and allocates the activation buffers.
+// File layout: int tensor count, then per tensor: int name length, name
+// bytes, int rank, `rank` int dimensions, and the float data.
+// Every read and every size field is validated (name shorter than 64 bytes,
+// rank 0..4, non-negative dimensions, data no larger than the file), so a
+// truncated or corrupt file is rejected instead of overrunning WT::n / WT::s.
+// A previously loaded model is released only after the new file has loaded.
+// Returns true on success; on failure the previous state is left untouched.
+bool model_init(const char* path) {
+    if (!path) { printf("[MODEL] open fail: %s\n", "(null)"); return false; }
+    FILE* f = fopen(path, "rb");
+    if (!f) { printf("[MODEL] open fail: %s\n", path); return false; }
+
+    // The file size bounds every count read below.
+    long fsize = -1;
+    if (fseek(f, 0, SEEK_END) == 0) fsize = ftell(f);
+    if (fsize < 0 || fseek(f, 0, SEEK_SET) != 0) {
+        fclose(f);
+        printf("[MODEL] bad weight file: %s\n", path);
+        return false;
+    }
+
+    int count = 0;
+    // Each record needs at least two ints (name length and rank).
+    if (!rd_i32(f, count) || count <= 0 || count > fsize / 8) {
+        fclose(f);
+        printf("[MODEL] bad weight file: %s\n", path);
+        return false;
+    }
+
+    WT* tbl = new WT[count]();   // value-initialised: every d starts as nullptr
+    const long long max_elems = fsize / static_cast<long>(sizeof(float));
+    bool ok = true;
+    for (int i = 0; i < count && ok; ++i) {
+        WT& t = tbl[i];
+        int nl = 0;
+        if (!rd_i32(f, nl) || nl < 0 || nl >= static_cast<int>(sizeof(t.n))) { ok = false; break; }
+        if (fread(t.n, 1, static_cast<size_t>(nl), f) != static_cast<size_t>(nl)) { ok = false; break; }
+        t.n[nl] = 0;
+
+        if (!rd_i32(f, t.nd) || t.nd < 0 || t.nd > 4) { ok = false; break; }
+        long long tot = 1;
+        for (int d = 0; d < t.nd; ++d) {
+            if (!rd_i32(f, t.s[d]) || t.s[d] < 0) { ok = false; break; }
+            // Reject before multiplying so the product cannot overflow.
+            if (t.s[d] != 0 && tot > max_elems / t.s[d]) { ok = false; break; }
+            tot *= t.s[d];
+        }
+        if (!ok) break;
+        for (int d = t.nd; d < 4; ++d) t.s[d] = 1;
+
+        t.d = new float[static_cast<size_t>(tot)];
+        if (fread(t.d, sizeof(float), static_cast<size_t>(tot), f) != static_cast<size_t>(tot)) ok = false;
+    }
+    fclose(f);
+
+    if (!ok) {
+        free_weights(tbl, count);
+        printf("[MODEL] bad weight file: %s\n", path);
+        return false;
+    }
+
+    // Swap in the new model, releasing whatever an earlier call loaded.
+    g_rdy = false;
+    free_weights(gw, gn);
+    delete m;
+    gw = tbl;
+    gn = count;
+    m = new M();   // value-initialisation zeroes every buffer
+    g_rdy = true; printf("[MODEL] load ok: %d layers\n", gn); return true;
+}
+
+// Runs detection on one frame.
+//   bgr   : 160x120 interleaved BGR image
+//   w, h  : frame size; anything other than 160x120 is rejected
+//   boxes : output array with room for `max` entries
+//   th    : minimum class score passed to decode()
+// Returns the number of boxes written. Returns 0 when the model is not
+// loaded, the size is wrong, a pointer is null, or max <= 0.
+int model_detect(const uint8* bgr, int w, int h, DetectBox* boxes, int max, float th) {
+    if (!g_rdy || !bgr || !boxes || max <= 0) return 0;
+    if (w != 160 || h != 120) return 0;
+    forward(bgr);
+    return decode(boxes, max, th);
+}
+
+// Releases the weight table and the activation buffers and marks the model
+// as not ready. Safe to call when nothing is loaded.
+void model_deinit() {
+    g_rdy = false;
+    if (gw) {
+        for (int i = 0; i < gn; ++i) delete[] gw[i].d;
+        delete[] gw;
+        gw = nullptr;
+    }
+    gn = 0;   // keep the count in step with the (now empty) table
+    delete m;
+    m = nullptr;
+}
+
+// Reports whether model_init() has completed successfully and
+// model_deinit() has not been called since.
+bool model_ready()
+{
+    const bool loaded = g_rdy;
+    return loaded;
+}