// Source listing: Loongson_2k0300_SmartCar_Fr…/model/model.cpp (C++, 288 lines, 10 KiB)

#include "model.hpp"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <algorithm>
// ============================================================
// 内存预分配
// ============================================================
// Pre-allocated float buffers used during one forward pass.
// Each array is a flat set of channel planes; the extents are written as
// channels * height * width.
struct M {
    // Output of the stem convolution.
    float stem[8 * 60 * 80];

    // Outputs of block1..block5 (b1..b5). The *_tmp / *_dw companions are
    // not referenced anywhere in the visible code.
    float b1[16 * 60 * 80];
    float b1_tmp[16 * 60 * 80];
    float b2[28 * 30 * 40];
    float b2_dw[16 * 30 * 40];
    float b3[40 * 30 * 40];
    float b3_dw[28 * 30 * 40];
    float b4[56 * 15 * 20];
    float b4_dw[40 * 15 * 20];
    float b5[64 * 15 * 20];
    float b5_dw[56 * 15 * 20];

    // Output of the shared 1x1 conv that feeds both heads.
    float sh[24 * 15 * 20];
    // Classification head output (4 channels); soft-maxed in place by decode().
    float cls[4 * 15 * 20];
    // Size head output (2 channels).
    float sz[2 * 15 * 20];

    // Scratch shared by every se_block call: first the depthwise result,
    // then the skip branch. The largest use in forward() is block1's skip
    // branch (16*60*80 floats), so 64*60*80 leaves generous headroom.
    float dw_out[64 * 60 * 80];
};

// The single buffer set; allocated by model_init(), freed by model_deinit().
static M* m = nullptr;
// ============================================================
// 权重表
// ============================================================
// One named tensor read from the weight file.
struct WT {
    char   n[64];  // NUL-terminated tensor name
    int    nd;     // number of dimensions stored in s
    int    s[4];   // dimension extents
    float* d;      // tensor values
};

// Weight table and its entry count.
static WT* gw = nullptr;
static int gn = 0;

// Looks up a tensor by exact name.
// Returns its data pointer, or nullptr (after logging the missing name)
// when no entry matches.
static float* wf(const char* name) {
    int idx = 0;
    while (idx < gn) {
        WT& entry = gw[idx];
        if (std::strcmp(entry.n, name) == 0) {
            return entry.d;
        }
        ++idx;
    }
    printf("[MODEL] MISS %s\n", name);
    return nullptr;
}
// ============================================================
// 基础算子
// ============================================================
// In-place ReLU: every element of x[0..N) that is below zero becomes zero.
static void relu_f(float* x, int N) {
    int idx = 0;
    while (idx < N) {
        float& v = x[idx];
        if (v < 0) {
            v = 0;
        }
        ++idx;
    }
}
// In-place logistic sigmoid over x[0..N): x <- 1 / (1 + exp(-x)).
static void sigmoid_f(float* x, int N) {
    for (int k = 0; k != N && k < N; ++k) {
        const float e = std::exp(-x[k]);
        x[k] = 1.0f / (1.0f + e);
    }
}
// Direct 2-D convolution (optionally strided and/or grouped) over
// channel-major float planes, with implicit zero padding of K/2 on each side.
//
//   o     output: oC planes of (H/str)*(W/str) floats
//   in    input:  iC planes of H*W floats
//   w     weights laid out as [oC][iC/grp][K][K], i.e. each output channel
//         owns one K*K kernel per input channel *of its own group*
//         (the PyTorch Conv2d layout)
//   bias  one value per output channel, or nullptr for no bias
//   H, W  input height / width
//   iC,oC input / output channel counts (both divisible by grp)
//   K     square kernel size
//   str   stride
//   grp   number of groups (grp == iC == oC gives a depthwise conv)
//
// Does nothing when str or grp is not positive.
static void conv2d(float* o, const float* in, const float* w, const float* bias,
                   int H, int W, int iC, int oC, int K, int str, int grp)
{
    // A non-positive stride or group count would divide by zero below.
    if (str <= 0 || grp <= 0) return;

    const int Ho = H / str, Wo = W / str, pad = K / 2;
    const int icpg = iC / grp;  // input channels per group
    const int ocpg = oC / grp;  // output channels per group
    const int ksz = K * K;

    for (int g = 0; g < grp; ++g) {
        for (int oc = 0; oc < ocpg; ++oc) {
            const int occ = g * ocpg + oc;
            float* oo = o + occ * Ho * Wo;
            // Kernels of this output channel. The per-channel stride is
            // icpg (not iC): a grouped weight tensor only stores kernels
            // for the input channels inside the group.
            const float* wo = w + occ * icpg * ksz;
            for (int y = 0; y < Ho; ++y) {
                for (int x = 0; x < Wo; ++x) {
                    float s = bias ? bias[occ] : 0.0f;
                    for (int ic = 0; ic < icpg; ++ic) {
                        const float* ww = wo + ic * ksz;
                        const float* ii = in + (g * icpg + ic) * H * W;
                        for (int ky = 0; ky < K; ++ky) {
                            const int iy = y * str + ky - pad;
                            if (iy < 0 || iy >= H) continue;  // zero padding
                            for (int kx = 0; kx < K; ++kx) {
                                const int ix = x * str + kx - pad;
                                if (ix < 0 || ix >= W) continue;  // zero padding
                                s += ww[ky * K + kx] * ii[iy * W + ix];
                            }
                        }
                    }
                    oo[y * Wo + x] = s;
                }
            }
        }
    }
}
// In-place inference-mode batch normalisation over C channel planes of N
// floats each:
//     x <- (x - mean[c]) / sqrt(var[c] + 1e-5) * w[c] + b[c]
// w and b may be nullptr, in which case scale 1 and shift 0 are used.
// mean and var must be valid arrays of C values.
static void batchnorm_f(float* x, const float* w, const float* b,
                        const float* mean, const float* var, int C, int N)
{
    const float eps = 1e-5f;
    for (int ch = 0; ch < C; ++ch) {
        float gamma = 1;
        if (w) gamma = w[ch];
        float beta = 0;
        if (b) beta = b[ch];

        const float inv_std = 1.0f / std::sqrt(var[ch] + eps);
        float* plane = x + ch * N;
        int k = 0;
        while (k < N) {
            plane[k] = (plane[k] - mean[ch]) * inv_std * gamma + beta;
            ++k;
        }
    }
}
// Convolution followed by batch normalisation and an optional ReLU.
//
//   o, in        output / input channel-major planes (see conv2d)
//   H, W         input height / width; output is (H/str) x (W/str)
//   cw, cb       conv weights and optional conv bias (cb may be nullptr)
//   bn_w, bn_b   BN scale / shift (either may be nullptr)
//   bn_m, bn_v   BN running mean / variance
//   iC, oC, K, str, grp   forwarded to conv2d
//   relu_flag    apply ReLU to the result when true
static void conv_bn(float* o, const float* in, int H, int W,
                    const float* cw, const float* cb,
                    const float* bn_w, const float* bn_b,
                    const float* bn_m, const float* bn_v,
                    int iC, int oC, int K, int str, int grp, bool relu_flag)
{
    // Forward the conv bias instead of discarding it; callers that have no
    // bias pass nullptr and get exactly the previous behaviour.
    conv2d(o, in, cw, cb, H, W, iC, oC, K, str, grp);

    const int N = (H / str) * (W / str);  // elements per output plane
    batchnorm_f(o, bn_w, bn_b, bn_m, bn_v, oC, N);
    if (relu_flag) relu_f(o, oC * N);
}
// Global average pooling: o[c] receives the mean of the H*W values of
// channel plane c of `in`, for c in [0, C).
static void gap(float* o, const float* in, int C, int H, int W) {
    const int plane = H * W;
    const float* src = in;
    for (int ch = 0; ch < C; ++ch, src += plane) {
        float total = 0;
        for (int k = 0; k < plane; ++k) {
            total += src[k];
        }
        o[ch] = total / static_cast<float>(plane);
    }
}
// In-place softmax across the channel axis of a channel-major buffer:
// for every position i in [0, N) the C values x[c*N + i] are replaced by
// exp(x - max) / sum(exp(x - max)).
static void softmax_c(float* x, int C, int N) {
    for (int pos = 0; pos < N; ++pos) {
        float* col = x + pos;  // channel c of this position is col[c*N]

        // Subtract the running maximum before exponentiating.
        float peak = -1e9f;
        for (int c = 0; c < C; ++c) {
            peak = std::max(peak, col[c * N]);
        }

        float total = 0;
        for (int c = 0; c < C; ++c) {
            const float e = std::exp(col[c * N] - peak);
            col[c * N] = e;
            total += e;
        }

        for (int c = 0; c < C; ++c) {
            col[c * N] /= total;
        }
    }
}
// ============================================================
// SE-ResDW Block
// ============================================================
// One residual depthwise-separable block with squeeze-and-excitation.
//
//   main = ReLU(BN(conv1x1(ReLU(BN(depthwise3x3_stride(in))))))
//   skip = in                           when str == 1 and iC == oC
//        = BN(conv1x1_stride(in))       otherwise
//   sum  = main + skip
//   o    = ReLU(sum * sigmoid(fc2(ReLU(fc1(global_avg_pool(sum))))))
//
//   o      output, oC planes of (H/str)*(W/str) floats
//   in     input, iC planes of H*W floats
//   pfx    weight-name prefix; tensors are looked up as
//          "<pfx>.dw.*", "<pfx>.pw.*", "<pfx>.skip.*" and "<pfx>.se.*"
//
// Uses m->dw_out as scratch (depthwise result first, skip branch second).
static void se_block(float* o, const float* in, int iC, int oC, int H, int W, int str, const char* pfx)
{
    constexpr int kMaxC = 64;  // capacity of the per-channel SE scratch
    constexpr int kMaxR = 16;  // capacity of the reduced SE scratch

    const int Ho = H / str, Wo = W / str, No = Ho * Wo;
    const int rc = oC / 4;  // SE bottleneck width
    if (oC > kMaxC || rc > kMaxR) {
        printf("[MODEL] %s: oC=%d exceeds SE scratch capacity\n", pfx, oC);
        return;
    }

    char na[128];
    // Fetches "<pfx>.<layer>.<field>". Every lookup is a separate statement
    // so the shared name buffer is never reused inside one argument list
    // (argument evaluation order is unspecified).
    const auto lookup = [&](const char* layer, const char* field) -> float* {
        std::snprintf(na, sizeof(na), "%s.%s.%s", pfx, layer, field);
        return wf(na);
    };
    // Batch-normalises C planes of No elements with the tensors of `layer`.
    const auto bn = [&](float* x, const char* layer, int C) {
        const float* gamma = lookup(layer, "weight");
        const float* beta = lookup(layer, "bias");
        const float* mean = lookup(layer, "running_mean");
        const float* var = lookup(layer, "running_var");
        // The last argument is the per-channel element count.
        batchnorm_f(x, gamma, beta, mean, var, C, No);
    };

    // Depthwise 3x3 conv + BN + ReLU into the scratch buffer.
    conv2d(m->dw_out, in, lookup("dw.0", "weight"), nullptr, H, W, iC, iC, 3, str, iC);
    bn(m->dw_out, "dw.1", iC);
    relu_f(m->dw_out, iC * No);

    // Pointwise 1x1 conv + BN + ReLU into the output.
    conv2d(o, m->dw_out, lookup("pw.0", "weight"), nullptr, Ho, Wo, iC, oC, 1, 1, 1);
    bn(o, "pw.1", oC);
    relu_f(o, oC * No);

    // Skip branch (no ReLU). The depthwise result is no longer needed, so
    // the projected skip reuses the scratch buffer.
    const float* skip = in;
    if (!(str == 1 && iC == oC)) {
        conv2d(m->dw_out, in, lookup("skip.0", "weight"), nullptr, H, W, iC, oC, 1, str, 1);
        bn(m->dw_out, "skip.1", oC);
        skip = m->dw_out;
    }
    for (int i = 0; i < oC * No; ++i) o[i] += skip[i];

    // Squeeze-and-excitation; its input is o, i.e. the pw + skip sum.
    float se_avg[kMaxC];
    gap(se_avg, o, oC, Ho, Wo);

    float se1[kMaxR];
    const float* se1w = lookup("se.1", "weight");
    const float* se1b = lookup("se.1", "bias");
    for (int i = 0; i < rc; ++i) {
        float s = se1b ? se1b[i] : 0;
        for (int j = 0; j < oC; ++j) s += se1w[i * oC + j] * se_avg[j];
        se1[i] = s;
    }
    relu_f(se1, rc);

    float se2[kMaxC];
    const float* se2w = lookup("se.3", "weight");
    const float* se2b = lookup("se.3", "bias");
    for (int i = 0; i < oC; ++i) {
        float s = se2b ? se2b[i] : 0;
        for (int j = 0; j < rc; ++j) s += se2w[i * rc + j] * se1[j];
        se2[i] = s;
    }
    sigmoid_f(se2, oC);

    // Scale every channel plane by its gate, then apply the final ReLU.
    for (int c = 0; c < oC; ++c) {
        const float sc = se2[c];
        float* oc = o + c * No;
        for (int i = 0; i < No; ++i) oc[i] *= sc;
    }
    relu_f(o, oC * No);
}
// ============================================================
// 检测后处理
// ============================================================
// Decode-time constants:
//   OH, OW  rows / columns of the output grid walked by decode()
//   ST      factor that turns (grid index + 0.5) into a box centre coordinate
//   NC      number of class channels decode() scans for the best score
static constexpr int OH=15, OW=20, ST=8, NC=3;
// Per-class {width, height} multipliers applied to the two size-head
// channels when decode() fills a box's w and h.
static constexpr float RS[3][2] = {{75.36f,62.15f},{49.79f,38.63f},{67.59f,34.94f}};
// Turns the head outputs in m->cls / m->sz into detection boxes.
//
// Soft-maxes m->cls in place over its 4 channels, then walks the OH x OW
// grid. A cell yields a box when its best score among the first NC
// channels is at least `th` and no in-bounds 3x3 neighbour has a higher
// score for that same class.
//
//   boxes  output array with room for at least `max` entries
//   max    maximum number of boxes to write
//   th     minimum class score
// Returns the number of boxes written (0 when boxes is nullptr).
static int decode(DetectBox* boxes, int max, float th)
{
    const int N = OH * OW;
    softmax_c(m->cls, 4, N);
    if (!boxes) return 0;

    int cnt = 0;
    for (int gy = 0; gy < OH && cnt < max; ++gy) {
        for (int gx = 0; gx < OW && cnt < max; ++gx) {
            const int idx = gy * OW + gx;

            // Best-scoring class of this cell.
            int bc = -1;
            float bs = 0;
            for (int c = 0; c < NC; ++c) {
                const float s = m->cls[c * N + idx];
                if (s > bs) { bs = s; bc = c; }
            }
            // bc stays -1 when no score is positive (underflow or NaN);
            // without this check a non-positive th would let -1 be used
            // as an array index below.
            if (bc < 0 || bs < th) continue;

            // Keep only local maxima of the 3x3 neighbourhood.
            bool pk = true;
            for (int dy = -1; dy <= 1 && pk; ++dy) {
                for (int dx = -1; dx <= 1 && pk; ++dx) {
                    const int ny = gy + dy, nx = gx + dx;
                    if (ny < 0 || ny >= OH || nx < 0 || nx >= OW) continue;
                    if (m->cls[bc * N + ny * OW + nx] > bs) pk = false;
                }
            }
            if (!pk) continue;

            const float pw = m->sz[0 * N + idx];
            const float ph = m->sz[1 * N + idx];
            boxes[cnt].cls = bc;
            boxes[cnt].conf = bs;
            boxes[cnt].cx = (static_cast<float>(gx) + 0.5f) * ST;
            boxes[cnt].cy = (static_cast<float>(gy) + 0.5f) * ST;
            boxes[cnt].w = pw * RS[bc][0];
            boxes[cnt].h = ph * RS[bc][1];
            cnt++;
        }
    }
    return cnt;
}
// ============================================================
// 前向推理
// ============================================================
// Runs the whole network on one 160x120 interleaved BGR frame and leaves
// the head outputs in m->cls and m->sz.
//
//   bgr  120 rows of 160 pixels, 3 bytes per pixel
//
// Not re-entrant: it writes the shared buffers in *m and a static input
// buffer.
static void forward(const uint8* bgr)
{
    constexpr int IH = 120, IW = 160;

    // Pre-processing: BGR -> RGB, scaled to [0,1], stored channel-major.
    // Static rather than automatic: the buffer is 225 KiB, which is too
    // large to place safely on a small thread stack, and every other
    // buffer of the model is pre-allocated as well.
    static float in[3 * IH * IW];
    for (int c = 0; c < 3; ++c) {
        const int sc = 2 - c;  // source byte within the pixel (reverses B/G/R)
        float* ch = in + c * IH * IW;
        for (int y = 0; y < IH; ++y) {
            const uint8* row = bgr + y * IW * 3;
            for (int x = 0; x < IW; ++x) {
                ch[y * IW + x] = static_cast<float>(row[x * 3 + sc]) / 255.0f;
            }
        }
    }

    // Stem: 3x3 stride-2 conv + BN + ReLU, 3 -> 8 channels, 120x160 -> 60x80.
    conv_bn(m->stem, in, IH, IW, wf("stem.0.weight"), nullptr,
            wf("stem.1.weight"), wf("stem.1.bias"),
            wf("stem.1.running_mean"), wf("stem.1.running_var"),
            3, 8, 3, 2, 1, true);

    // Five SE blocks; block2 and block4 halve the resolution.
    se_block(m->b1, m->stem, 8, 16, 60, 80, 1, "block1");
    se_block(m->b2, m->b1, 16, 28, 60, 80, 2, "block2");
    se_block(m->b3, m->b2, 28, 40, 30, 40, 1, "block3");
    se_block(m->b4, m->b3, 40, 56, 30, 40, 2, "block4");
    se_block(m->b5, m->b4, 56, 64, 15, 20, 1, "block5");

    // Shared 1x1 conv + BN + ReLU feeding both heads.
    conv_bn(m->sh, m->b5, 15, 20, wf("shared.0.weight"), nullptr,
            wf("shared.1.weight"), wf("shared.1.bias"),
            wf("shared.1.running_mean"), wf("shared.1.running_var"),
            64, 24, 1, 1, 1, true);

    // Heads: plain 1x1 convs with bias, no BN / ReLU.
    conv2d(m->cls, m->sh, wf("cls_head.weight"), wf("cls_head.bias"), 15, 20, 24, 4, 1, 1, 1);
    conv2d(m->sz, m->sh, wf("size_head.weight"), wf("size_head.bias"), 15, 20, 24, 2, 1, 1, 1);
}
// ============================================================
// 接口
// ============================================================
// True between a successful model_init() and the next model_deinit();
// model_detect() returns 0 while it is false.
static bool g_rdy = false;
bool model_init(const char* path) {
FILE* f=fopen(path,"rb");
if(!f){ printf("[MODEL] open fail: %s\n",path); return false; }
fread(&gn,sizeof(int),1,f);
gw=new WT[gn];
for(int i=0;i<gn;++i){ WT& t=gw[i]; int nl; fread(&nl,4,1,f); fread(t.n,1,nl,f); t.n[nl]=0;
fread(&t.nd,4,1,f); int tot=1; for(int d=0;d<t.nd;++d){ fread(&t.s[d],4,1,f); tot*=t.s[d]; }
for(int d=t.nd;d<4;++d) t.s[d]=1;
t.d=new float[tot]; fread(t.d,4,tot,f);
}
fclose(f);
m=new M(); std::memset(m,0,sizeof(M));
g_rdy=true; printf("[MODEL] load ok: %d layers\n",gn); return true;
}
// Runs detection on one frame.
//
//   bgr    interleaved BGR image, must be 160x120
//   w, h   image size; anything other than 160x120 is rejected
//   boxes  output array with room for `max` entries
//   max    capacity of boxes
//   th     minimum class score for a detection
// Returns the number of boxes written; 0 when the model is not loaded,
// the size is unsupported, a pointer is null or max is not positive.
int model_detect(const uint8* bgr, int w, int h, DetectBox* boxes, int max, float th) {
    if (!g_rdy || w != 160 || h != 120) return 0;
    // Nothing can be read or stored in these cases, so skip the inference.
    if (!bgr || !boxes || max <= 0) return 0;
    forward(bgr);
    return decode(boxes, max, th);
}
// Releases the weight table and the inference buffers and marks the model
// as not ready. Safe to call when nothing is loaded.
void model_deinit() {
    if (gw) {
        for (int i = 0; i < gn; ++i) delete[] gw[i].d;
        delete[] gw;
        gw = nullptr;
    }
    // Keep the count in step with the (now empty) table so a later lookup
    // cannot index through the null pointer.
    gn = 0;
    delete m;
    m = nullptr;
    g_rdy = false;
}
// Reports whether a model is currently loaded and usable.
bool model_ready()
{
    const bool loaded = g_rdy;
    return loaded;
}