293 lines
11 KiB
C++
293 lines
11 KiB
C++
#include "model.hpp"
|
||
#include <cstdio>
|
||
#include <cstdlib>
|
||
#include <cstring>
|
||
#include <cmath>
|
||
#include <new>
|
||
#include <algorithm>
|
||
|
||
// ============================================================
|
||
// Memory preallocation
|
||
// ============================================================
|
||
// Activation storage for one forward pass. Each array holds a float tensor
// whose size is written out as channels * height * width.
struct M {
    // Output of the stem convolution.
    float stem[8 * 60 * 80];

    // Outputs of block1..block5. The *_tmp / *_dw companions are reserved
    // scratch arrays; nothing in the visible code reads or writes them.
    float b1[16 * 60 * 80];
    float b1_tmp[16 * 60 * 80];
    float b2[28 * 30 * 40];
    float b2_dw[16 * 30 * 40];
    float b3[40 * 30 * 40];
    float b3_dw[28 * 30 * 40];
    float b4[56 * 15 * 20];
    float b4_dw[40 * 15 * 20];
    float b5[64 * 15 * 20];
    float b5_dw[56 * 15 * 20];

    // Output of the shared 1x1 convolution feeding both heads.
    float sh[24 * 15 * 20];

    // Head outputs: class scores (4 channels) and box sizes (2 channels).
    float cls[4 * 15 * 20];
    float sz[2 * 15 * 20];

    // Scratch used by se_block for the depthwise result and then for the
    // skip branch; sized for the largest user (block1 skip, 16x60x80).
    float dw_out[16 * 60 * 80];
};

// Single heap-allocated instance; null until model_init succeeds.
static M* m = nullptr;
|
||
|
||
// ============================================================
|
||
// Weight table
|
||
// ============================================================
|
||
// One named tensor loaded from the weight file.
struct WT {
    char   n[64];  // NUL-terminated tensor name used as the lookup key
    int    nd;     // number of dimensions stored in s[]
    int    s[4];   // shape; entries past nd are set to 1 by the loader
    float* d;      // tensor data (heap array owned by the table)
};

// Global weight table and the number of entries in it.
static WT* gw = nullptr;
static int gn = 0;
|
||
|
||
static float* wf(const char* name) {
|
||
for (int i = 0; i < gn; ++i) if (!std::strcmp(gw[i].n, name)) return gw[i].d;
|
||
printf("[MODEL] MISS %s\n", name); return nullptr;
|
||
}
|
||
|
||
// ============================================================
|
||
// Basic operators
|
||
// ============================================================
|
||
// In-place ReLU over x[0..N): every strictly negative element becomes 0.
static void relu_f(float* x, int N) {
    for (int k = 0; k < N; ++k) {
        float& v = x[k];
        if (v < 0) {
            v = 0;
        }
    }
}
|
||
// In-place logistic sigmoid over x[0..N): x <- 1 / (1 + exp(-x)).
static void sigmoid_f(float* x, int N) {
    for (int k = 0; k < N; ++k) {
        const float e = std::exp(-x[k]);
        x[k] = 1.0f / (1.0f + e);
    }
}
|
||
|
||
// Direct 2-D convolution on CHW float tensors with optional grouping.
//
//   o     output, oC planes of (H/str) x (W/str)
//   in    input, iC planes of H x W
//   w     weights laid out as [oC][iC/grp][K][K]
//   bias  per-output-channel bias, or nullptr for none
//   K     square kernel size; zero padding of K/2 is implied because taps
//         that fall outside the input are skipped
//   str   stride in both directions
//   grp   number of groups (1 = dense, iC = depthwise)
//
// The weight offset is computed from the channel index *within the group*.
// Indexing with the absolute input channel (as if every filter had iC input
// planes) is only correct for grp == 1 and reads past the weight tensor for
// grouped and depthwise layers.
static void conv2d(float* o, const float* in, const float* w, const float* bias,
                   int H, int W, int iC, int oC, int K, int str, int grp)
{
    const int Ho = H / str, Wo = W / str, pad = K / 2;
    const int icpg = iC / grp, ocpg = oC / grp;
    const int ksz = K * K;

    for (int g = 0; g < grp; ++g) {
        for (int oc = 0; oc < ocpg; ++oc) {
            const int occ = g * ocpg + oc;          // absolute output channel
            float* oo = o + occ * Ho * Wo;
            const float* wo = w + occ * icpg * ksz; // this filter's weights

            for (int y = 0; y < Ho; ++y) {
                for (int x = 0; x < Wo; ++x) {
                    float s = bias ? bias[occ] : 0;
                    for (int ic = 0; ic < icpg; ++ic) {
                        const int icc = g * icpg + ic;  // absolute input channel
                        const float* ww = wo + ic * ksz;
                        const float* ii = in + icc * H * W;
                        for (int ky = 0; ky < K; ++ky) {
                            const int iy = y * str + ky - pad;
                            if (iy < 0 || iy >= H) continue;
                            for (int kx = 0; kx < K; ++kx) {
                                const int ix = x * str + kx - pad;
                                if (ix < 0 || ix >= W) continue;
                                s += ww[ky * K + kx] * ii[iy * W + ix];
                            }
                        }
                    }
                    oo[y * Wo + x] = s;
                }
            }
        }
    }
}
|
||
|
||
// In-place inference-mode batch normalization on a channel-major tensor.
//
//   x     C planes of N contiguous floats each
//   w, b  per-channel scale and shift; nullptr means 1 and 0 respectively
//   mean  per-channel running mean
//   var   per-channel running variance
//   C     number of channels
//   N     number of elements per channel
//
// Each element becomes (x - mean) / sqrt(var + 1e-5) * w + b.
static void batchnorm_f(float* x, const float* w, const float* b,
                        const float* mean, const float* var, int C, int N)
{
    const float eps = 1e-5f;
    for (int ch = 0; ch < C; ++ch) {
        const float gamma = w ? w[ch] : 1;
        const float beta = b ? b[ch] : 0;
        const float inv_std = 1.0f / std::sqrt(var[ch] + eps);
        float* plane = x + ch * N;
        for (int k = 0; k < N; ++k) {
            plane[k] = (plane[k] - mean[ch]) * inv_std * gamma + beta;
        }
    }
}
|
||
|
||
// conv + BN + optional ReLU
|
||
static void conv_bn(float* o, const float* in, int H, int W,
|
||
const float* cw, const float* cb,
|
||
const float* bn_w, const float* bn_b,
|
||
const float* bn_m, const float* bn_v,
|
||
int iC, int oC, int K, int str, int grp, bool relu_flag)
|
||
{
|
||
conv2d(o, in, cw, nullptr, H, W, iC, oC, K, str, grp);
|
||
int N = (H/str)*(W/str);
|
||
batchnorm_f(o, bn_w, bn_b, bn_m, bn_v, oC, N);
|
||
if (relu_flag) relu_f(o, oC*N);
|
||
}
|
||
|
||
// Global average pooling: o[c] receives the mean of the H*W values in
// channel c of the CHW tensor `in`, for c in [0, C).
static void gap(float* o, const float* in, int C, int H, int W) {
    const int area = H * W;
    const float* plane = in;
    for (int ch = 0; ch < C; ++ch, plane += area) {
        float acc = 0;
        for (int k = 0; k < area; ++k) {
            acc += plane[k];
        }
        o[ch] = acc / static_cast<float>(area);
    }
}
|
||
|
||
// In-place softmax across the channel axis of a channel-major tensor.
//
//   x  C planes of N floats; element (c, i) lives at x[c*N + i]
//   C  number of channels (nothing is done when C <= 0)
//   N  number of positions; each position is normalized independently
//
// The running maximum is seeded from the first channel rather than a fixed
// sentinel, so columns whose values are all extremely negative still
// normalize to a valid distribution instead of dividing by zero.
static void softmax_c(float* x, int C, int N) {
    if (C <= 0) return;
    for (int i = 0; i < N; ++i) {
        float* col = x + i;

        // Subtract the column maximum before exponentiating for stability.
        float mx = col[0];
        for (int c = 1; c < C; ++c) mx = std::max(mx, col[c * N]);

        float sum = 0;
        for (int c = 0; c < C; ++c) {
            col[c * N] = std::exp(col[c * N] - mx);
            sum += col[c * N];
        }
        for (int c = 0; c < C; ++c) col[c * N] /= sum;
    }
}
|
||
|
||
// ============================================================
|
||
// SE-ResDW Block
|
||
// ============================================================
|
||
static void se_block(float* o, const float* in, int iC, int oC, int H, int W, int str, const char* pfx)
|
||
{
|
||
int Ho=H/str, Wo=W/str, No=Ho*Wo;
|
||
char na[128];
|
||
|
||
// depthwise conv + BN + ReLU
|
||
std::snprintf(na,128,"%s.dw.0.weight",pfx);
|
||
conv2d(m->dw_out, in, wf(na), nullptr, H, W, iC, iC, 3, str, iC);
|
||
std::snprintf(na,128,"%s.dw.1.weight",pfx);
|
||
batchnorm_f(m->dw_out, wf(na), wf((std::snprintf(na,128,"%s.dw.1.bias",pfx),na)),
|
||
wf((std::snprintf(na,128,"%s.dw.1.running_mean",pfx),na)),
|
||
wf((std::snprintf(na,128,"%s.dw.1.running_var",pfx),na)), iC, iC*No);
|
||
relu_f(m->dw_out, iC*No);
|
||
|
||
// pointwise conv + BN + ReLU (to output)
|
||
std::snprintf(na,128,"%s.pw.0.weight",pfx);
|
||
conv2d(o, m->dw_out, wf(na), nullptr, Ho, Wo, iC, oC, 1, 1, 1);
|
||
std::snprintf(na,128,"%s.pw.1.weight",pfx);
|
||
batchnorm_f(o, wf(na), wf((std::snprintf(na,128,"%s.pw.1.bias",pfx),na)),
|
||
wf((std::snprintf(na,128,"%s.pw.1.running_mean",pfx),na)),
|
||
wf((std::snprintf(na,128,"%s.pw.1.running_var",pfx),na)), oC, oC*No);
|
||
relu_f(o, oC*No);
|
||
|
||
// skip (no ReLU)
|
||
float* skip = m->dw_out; // reuse buffer
|
||
if (str==1 && iC==oC) {
|
||
std::memcpy(skip, in, iC*H*W*4);
|
||
} else {
|
||
std::snprintf(na,128,"%s.skip.0.weight",pfx);
|
||
conv2d(skip, in, wf(na), nullptr, H, W, iC, oC, 1, str, 1);
|
||
std::snprintf(na,128,"%s.skip.1.weight",pfx);
|
||
batchnorm_f(skip, wf(na), wf((std::snprintf(na,128,"%s.skip.1.bias",pfx),na)),
|
||
wf((std::snprintf(na,128,"%s.skip.1.running_mean",pfx),na)),
|
||
wf((std::snprintf(na,128,"%s.skip.1.running_var",pfx),na)), oC, oC*No);
|
||
}
|
||
|
||
// add skip
|
||
for (int i=0; i<oC*No; ++i) o[i] += skip[i];
|
||
|
||
// SE (输入是 o, 即 pw+skip 的和)
|
||
float se_avg[64]; gap(se_avg, o, oC, Ho, Wo);
|
||
|
||
int rc = oC/4;
|
||
float se1[16], se2[64];
|
||
std::snprintf(na,128,"%s.se.1.weight",pfx);
|
||
float* se1w=wf(na); std::snprintf(na,128,"%s.se.1.bias",pfx); float* se1b=wf(na);
|
||
for (int i=0; i<rc; ++i) { float s=se1b?se1b[i]:0; for (int j=0;j<oC;++j) s+=se1w[i*oC+j]*se_avg[j]; se1[i]=s; }
|
||
relu_f(se1, rc);
|
||
|
||
std::snprintf(na,128,"%s.se.3.weight",pfx);
|
||
float* se2w=wf(na); std::snprintf(na,128,"%s.se.3.bias",pfx); float* se2b=wf(na);
|
||
for (int i=0; i<oC; ++i) { float s=se2b?se2b[i]:0; for (int j=0;j<rc;++j) s+=se2w[i*rc+j]*se1[j]; se2[i]=s; }
|
||
sigmoid_f(se2, oC);
|
||
|
||
for (int c=0; c<oC; ++c) { float sc=se2[c]; float* oc=o+c*No; for (int i=0;i<No;++i) oc[i]*=sc; }
|
||
relu_f(o, oC*No);
|
||
}
|
||
|
||
// ============================================================
|
||
// Detection post-processing
|
||
// ============================================================
|
||
// Output grid height and width of the detection heads.
static constexpr int OH = 15;
static constexpr int OW = 20;
// Multiplier that turns a grid cell centre into box-centre coordinates.
static constexpr int ST = 8;
// Number of class channels scanned when picking the best class.
static constexpr int NC = 3;
// Per-class {width, height} multipliers applied to the size-head outputs.
static constexpr float RS[3][2] = {
    {75.36f, 62.15f},
    {49.79f, 38.63f},
    {67.59f, 34.94f},
};
|
||
|
||
// Turns the head outputs in m->cls / m->sz into detection boxes.
//
//   boxes  destination array with room for at least `max` entries
//   max    maximum number of boxes to write
//   th     minimum class probability for a cell to be reported
//
// Returns the number of boxes written (0 when boxes is null or max <= 0).
//
// m->cls is converted to probabilities in place with a 4-channel softmax.
// Only the first NC channels are candidates for the reported class; the
// remaining channel is never selected (presumably background; confirm
// against the training code). A cell is reported when its best class score
// reaches `th` and no in-bounds neighbour in the 3x3 window scores higher
// for that class.
static int decode(DetectBox* boxes, int max, float th)
{
    const int N = OH * OW;
    softmax_c(m->cls, 4, N);

    if (!boxes || max <= 0) return 0;

    int cnt = 0;
    for (int gy = 0; gy < OH && cnt < max; ++gy) {
        for (int gx = 0; gx < OW && cnt < max; ++gx) {
            const int idx = gy * OW + gx;

            // Best-scoring class at this cell.
            int bc = -1;
            float bs = 0;
            for (int c = 0; c < NC; ++c) {
                const float s = m->cls[c * N + idx];
                if (s > bs) { bs = s; bc = c; }
            }
            // bc stays -1 when no class has a positive score; skipping here
            // keeps the lookups below from indexing with -1 when th <= 0.
            if (bc < 0 || bs < th) continue;

            // Keep only local maxima of the chosen class.
            bool pk = true;
            for (int dy = -1; dy <= 1 && pk; ++dy) {
                for (int dx = -1; dx <= 1 && pk; ++dx) {
                    const int ny = gy + dy, nx = gx + dx;
                    if (ny < 0 || ny >= OH || nx < 0 || nx >= OW) continue;
                    if (m->cls[bc * N + ny * OW + nx] > bs) pk = false;
                }
            }
            if (!pk) continue;

            const float pw = m->sz[0 * N + idx];
            const float ph = m->sz[1 * N + idx];
            boxes[cnt].cls = bc;
            boxes[cnt].conf = bs;
            boxes[cnt].cx = ((float)gx + 0.5f) * ST;
            boxes[cnt].cy = ((float)gy + 0.5f) * ST;
            boxes[cnt].w = pw * RS[bc][0];
            boxes[cnt].h = ph * RS[bc][1];
            cnt++;
        }
    }
    return cnt;
}
|
||
|
||
// ============================================================
|
||
// Forward inference
|
||
// ============================================================
|
||
static void forward(const uint8* bgr)
|
||
{
|
||
// 预处理 BGR→RGB float [0,1], CHW (堆分配, 避免栈溢出)
|
||
float* in = new float[3*120*160];
|
||
for (int c=0; c<3; ++c) {
|
||
int sc=2-c; float* ch=in+c*120*160;
|
||
for (int y=0; y<120; ++y) {
|
||
const uint8* row=bgr+y*160*3;
|
||
for (int x=0; x<160; ++x) ch[y*160+x]=(float)row[x*3+sc]/255.0f;
|
||
}
|
||
}
|
||
|
||
// stem
|
||
conv_bn(m->stem, in, 120,160, wf("stem.0.weight"),nullptr,
|
||
wf("stem.1.weight"),wf("stem.1.bias"),
|
||
wf("stem.1.running_mean"),wf("stem.1.running_var"),
|
||
3,8,3,2,1,true);
|
||
|
||
se_block(m->b1, m->stem, 8,16, 60,80, 1,"block1");
|
||
se_block(m->b2, m->b1, 16,28, 60,80, 2,"block2");
|
||
se_block(m->b3, m->b2, 28,40, 30,40, 1,"block3");
|
||
se_block(m->b4, m->b3, 40,56, 30,40, 2,"block4");
|
||
se_block(m->b5, m->b4, 56,64, 15,20, 1,"block5");
|
||
|
||
// shared
|
||
conv_bn(m->sh, m->b5, 15,20, wf("shared.0.weight"),nullptr,
|
||
wf("shared.1.weight"),wf("shared.1.bias"),
|
||
wf("shared.1.running_mean"),wf("shared.1.running_var"),
|
||
64,24,1,1,1,true);
|
||
|
||
// heads (1x1 conv, 无 BN/ReLU)
|
||
conv2d(m->cls, m->sh, wf("cls_head.weight"), wf("cls_head.bias"), 15,20, 24,4, 1,1,1);
|
||
conv2d(m->sz, m->sh, wf("size_head.weight"),wf("size_head.bias"),15,20, 24,2, 1,1,1);
|
||
|
||
delete[] in;
|
||
}
|
||
|
||
// ============================================================
|
||
// Public interface
|
||
// ============================================================
|
||
static bool g_rdy = false;
|
||
|
||
bool model_init(const char* path) {
|
||
FILE* f=fopen(path,"rb");
|
||
if(!f){ printf("[MODEL] open fail: %s\n",path); return false; }
|
||
fread(&gn,sizeof(int),1,f);
|
||
gw=new WT[gn];
|
||
for(int i=0;i<gn;++i){ WT& t=gw[i]; int nl; fread(&nl,4,1,f); fread(t.n,1,nl,f); t.n[nl]=0;
|
||
fread(&t.nd,4,1,f); int tot=1; for(int d=0;d<t.nd;++d){ fread(&t.s[d],4,1,f); tot*=t.s[d]; }
|
||
for(int d=t.nd;d<4;++d) t.s[d]=1;
|
||
t.d=new float[tot]; fread(t.d,4,tot,f);
|
||
}
|
||
fclose(f);
|
||
m=new (std::nothrow) M();
|
||
if (!m) { printf("[MODEL] OOM: 无法分配推理内存\n"); fclose(f); delete[] gw; gw=nullptr; return false; }
|
||
std::memset(m,0,sizeof(M));
|
||
g_rdy=true; printf("[MODEL] load ok: %d layers\n",gn); return true;
|
||
}
|
||
|
||
int model_detect(const uint8* bgr, int w, int h, DetectBox* boxes, int max, float th) {
|
||
if(!g_rdy||w!=160||h!=120) return 0;
|
||
forward(bgr);
|
||
return decode(boxes, max, th);
|
||
}
|
||
|
||
void model_deinit() {
|
||
if(gw){ for(int i=0;i<gn;++i) delete[] gw[i].d; delete[] gw; gw=nullptr; }
|
||
delete m; m=nullptr; g_rdy=false;
|
||
}
|
||
|
||
// Reports whether model_init has completed successfully and the model has
// not been torn down since.
bool model_ready() {
    const bool ready = g_rdy;
    return ready;
}
|