2 Commits

Author SHA1 Message Date
DSQ
9e3fe7150b [0.2.7.8][ci]
All checks were successful
CI / docker-ci (push) Successful in 26s
2026-03-23 11:02:00 +08:00
DSQ
c9611fa622 [0.2.7.7][ci]
All checks were successful
CI / docker-ci (push) Successful in 35s
2026-03-23 10:37:46 +08:00
3 changed files with 200 additions and 184 deletions

View File

@@ -445,6 +445,11 @@ async function generateReport() {
 try {
     const params = buildReportParams();
     const resp = await fetch(`/elastic/report/?${params.toString()}`, { credentials: 'same-origin' });
+    const ct = (resp.headers.get('content-type') || '').toLowerCase();
+    if (!ct.includes('application/json')) {
+        const text = await resp.text();
+        throw new Error(text ? String(text).slice(0, 200) : `HTTP ${resp.status}`);
+    }
     const data = await resp.json();
     if (data.status !== 'success') {
         reportBox.className = 'search-result error';

View File

@@ -494,6 +494,11 @@ uploadForm.addEventListener('submit', async (e) => {
     body: formData,
 });
 clearInterval(timer);
+const ct = (resp.headers.get('content-type') || '').toLowerCase();
+if (!ct.includes('application/json')) {
+    const text = await resp.text();
+    throw new Error(text ? String(text).slice(0, 200) : `HTTP ${resp.status}`);
+}
 const data = await resp.json();
 if (!resp.ok || data.status !== 'success') {
     throw new Error(data.message || '上传识别失败');

View File

@@ -8,6 +8,7 @@ import base64
 import json
 import csv
 import io
+from datetime import datetime, timezone, timedelta
 import tempfile
 import concurrent.futures
 from django.conf import settings
@@ -721,115 +722,117 @@ def upload_page(request):
# 上传并识别(不入库) # 上传并识别(不入库)
@require_http_methods(["POST"]) @require_http_methods(["POST"])
def upload(request): def upload(request):
if request.session.get("user_id") is None: try:
fallback_uid = request.POST.get("user_id") or request.GET.get("user_id") if request.session.get("user_id") is None:
if fallback_uid: fallback_uid = request.POST.get("user_id") or request.GET.get("user_id")
request.session["user_id"] = fallback_uid if fallback_uid:
request.session.setdefault("permission", 1) request.session["user_id"] = fallback_uid
else: request.session.setdefault("permission", 1)
return JsonResponse({"status": "error", "message": "未登录"}, status=401) else:
return JsonResponse({"status": "error", "message": "未登录"}, status=401)
files = request.FILES.getlist("file") files = request.FILES.getlist("file")
if not files: if not files:
one = request.FILES.get("file") one = request.FILES.get("file")
if one: if one:
files = [one] files = [one]
if not files: if not files:
return JsonResponse({"status": "error", "message": "未选择文件"}, status=400) return JsonResponse({"status": "error", "message": "未选择文件"}, status=400)
images_dir = os.path.join(settings.MEDIA_ROOT, "images") images_dir = os.path.join(settings.MEDIA_ROOT, "images")
os.makedirs(images_dir, exist_ok=True) os.makedirs(images_dir, exist_ok=True)
# 按照原始文件进行分组处理
file_results = []
for f in files:
group_images = [] # 存储该文件生成的所有图片路径信息 (abs_path, filename)
is_pdf = f.name.lower().endswith('.pdf')
if is_pdf: file_results = []
if not HAS_PDF_SUPPORT:
return JsonResponse({"status": "error", "message": f"服务器未安装PDF处理组件(PyMuPDF): {PDF_ERROR}"}, status=500) for f in files:
group_images = []
is_pdf = f.name.lower().endswith('.pdf')
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp: if is_pdf:
for chunk in f.chunks(): if not HAS_PDF_SUPPORT:
tmp.write(chunk) return JsonResponse({"status": "error", "message": f"服务器未安装PDF处理组件(PyMuPDF): {PDF_ERROR}"}, status=500)
tmp_path = tmp.name
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
try: for chunk in f.chunks():
doc = fitz.open(tmp_path) tmp.write(chunk)
for i in range(len(doc)): tmp_path = tmp.name
page = doc.load_page(i)
pix = page.get_pixmap(dpi=150) try:
img_filename = f"{uuid.uuid4()}_page_{i+1}.jpg" doc = fitz.open(tmp_path)
img_abs_path = os.path.join(images_dir, img_filename) for i in range(len(doc)):
pix.save(img_abs_path) page = doc.load_page(i)
group_images.append((img_abs_path, img_filename)) pix = page.get_pixmap(dpi=150)
doc.close() img_filename = f"{uuid.uuid4()}_page_{i+1}.jpg"
except Exception as e: img_abs_path = os.path.join(images_dir, img_filename)
return JsonResponse({"status": "error", "message": f"PDF {f.name} 转换失败: {str(e)}"}, status=500) pix.save(img_abs_path)
finally: group_images.append((img_abs_path, img_filename))
if os.path.exists(tmp_path): doc.close()
os.remove(tmp_path) except Exception as e:
else: return JsonResponse({"status": "error", "message": f"PDF {f.name} 转换失败: {str(e)}"}, status=500)
filename = f"{uuid.uuid4()}_{f.name}" finally:
abs_path = os.path.join(images_dir, filename) if os.path.exists(tmp_path):
with open(abs_path, "wb") as dst: os.remove(tmp_path)
for chunk in f.chunks(): else:
dst.write(chunk) filename = f"{uuid.uuid4()}_{f.name}"
group_images.append((abs_path, filename)) abs_path = os.path.join(images_dir, filename)
with open(abs_path, "wb") as dst:
for chunk in f.chunks():
dst.write(chunk)
group_images.append((abs_path, filename))
# 对该组图片并行进行 OCR 识别 def run_ocr(img_info):
def run_ocr(img_info): abs_p, fname = img_info
abs_p, fname = img_info try:
try: data = ocr_and_extract_info(abs_p)
data = ocr_and_extract_info(abs_p) return data
return data except Exception:
except Exception: return None
return None
group_data_list = [] group_data_list = []
with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(group_images), 8)) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(group_images), 8)) as executor:
futures = [executor.submit(run_ocr, img_info) for img_info in group_images] futures = [executor.submit(run_ocr, img_info) for img_info in group_images]
for future in concurrent.futures.as_completed(futures): for future in concurrent.futures.as_completed(futures):
res = future.result() res = future.result()
if res: if res:
group_data_list.append(res) group_data_list.append(res)
# 合并该文件的多页识别结果 merged_group_data = {}
merged_group_data = {} for item in group_data_list:
for item in group_data_list: if not isinstance(item, dict):
if not isinstance(item, dict): continue continue
for k, v in item.items(): for k, v in item.items():
key = str(k).strip() key = str(k).strip()
if not key: continue if not key:
if key not in merged_group_data or merged_group_data.get(key) in (None, ''): continue
merged_group_data[key] = v if key not in merged_group_data or merged_group_data.get(key) in (None, ''):
elif merged_group_data.get(key) != v: merged_group_data[key] = v
base = key elif merged_group_data.get(key) != v:
idx = 2 base = key
while f"{base}_{idx}" in merged_group_data: idx += 1 idx = 2
merged_group_data[f"{base}_{idx}"] = v while f"{base}_{idx}" in merged_group_data:
idx += 1
merged_group_data[f"{base}_{idx}"] = v
if not merged_group_data: if not merged_group_data:
# 如果没识别到,至少保留一个空结构或者包含文件名的提示 merged_group_data = {"文件名": f.name, "提示": "未识别到具体内容"}
merged_group_data = {"文件名": f.name, "提示": "未识别到具体内容"}
rel_paths = [f"images/{img[1]}" for img in group_images] rel_paths = [f"images/{img[1]}" for img in group_images]
image_urls = [request.build_absolute_uri(settings.MEDIA_URL + rp) for rp in rel_paths] image_urls = [request.build_absolute_uri(settings.MEDIA_URL + rp) for rp in rel_paths]
file_results.append({ file_results.append({
"name": f.name, "name": f.name,
"data": merged_group_data, "data": merged_group_data,
"images": rel_paths, "images": rel_paths,
"image_urls": image_urls, "image_urls": image_urls,
})
return JsonResponse({
"status": "success",
"message": f"成功处理 {len(file_results)} 个文件,请确认数据后点击录入",
"items": file_results,
}) })
except Exception as e:
return JsonResponse({ return JsonResponse({"status": "error", "message": str(e) or "上传失败"}, status=500)
"status": "success",
"message": f"成功处理 {len(file_results)} 个文件,请确认数据后点击录入",
"items": file_results,
})
# 确认并入库 # 确认并入库
@@ -1580,104 +1583,107 @@ def _dt_label(dt, interval: str):
@require_http_methods(["GET"]) @require_http_methods(["GET"])
def report_view(request): def report_view(request):
session_user_id = request.session.get("user_id") try:
if session_user_id is None: session_user_id = request.session.get("user_id")
return JsonResponse({"status": "error", "message": "未登录"}, status=401) if session_user_id is None:
is_admin = int(request.session.get("permission", 1)) == 0 return JsonResponse({"status": "error", "message": "未登录"}, status=401)
me = get_user_by_id(session_user_id) or {} is_admin = int(request.session.get("permission", 1)) == 0
has_manage_key = bool(me.get("manage_key") or []) me = get_user_by_id(session_user_id) or {}
if (not is_admin) and (not has_manage_key): has_manage_key = bool(me.get("manage_key") or [])
return JsonResponse({"status": "error", "message": "无权限"}, status=403) if (not is_admin) and (not has_manage_key):
return JsonResponse({"status": "error", "message": "无权限"}, status=403)
gte = (request.GET.get("from") or "").strip() gte = (request.GET.get("from") or "").strip()
lte = (request.GET.get("to") or "").strip() lte = (request.GET.get("to") or "").strip()
interval = (request.GET.get("interval") or "day").strip() interval = (request.GET.get("interval") or "day").strip()
key = (request.GET.get("key") or "").strip() key = (request.GET.get("key") or "").strip()
typ = (request.GET.get("type") or "").strip() typ = (request.GET.get("type") or "").strip()
gte_dt = _parse_dt(gte) if gte else None gte_dt = _parse_dt(gte) if gte else None
lte_dt = _parse_dt(lte) if lte else None lte_dt = _parse_dt(lte) if lte else None
results = search_all() results = search_all()
results = _filter_results_for_user(request, results) results = _filter_results_for_user(request, results)
filtered = list(results or []) filtered = list(results or [])
if key: if key:
selected = str(key).strip() selected = str(key).strip()
try:
users = get_all_users() or []
except Exception:
users = []
writer_keys_by_id = {}
for u in users:
try: try:
u_id = str(u.get("user_id", "")).strip() users = get_all_users() or []
except Exception: except Exception:
u_id = "" users = []
if not u_id: writer_keys_by_id = {}
continue for u in users:
try: try:
u_keys = {str(k).strip() for k in (u.get("key") or []) if str(k).strip()} u_id = str(u.get("user_id", "")).strip()
except Exception: except Exception:
u_keys = set() u_id = ""
writer_keys_by_id[u_id] = u_keys if not u_id:
continue
try:
u_keys = {str(k).strip() for k in (u.get("key") or []) if str(k).strip()}
except Exception:
u_keys = set()
writer_keys_by_id[u_id] = u_keys
tmp = [] tmp = []
for r in filtered:
writer_id = str(r.get("writer_id", "")).strip()
writer_keys = writer_keys_by_id.get(writer_id)
if writer_keys and selected in writer_keys:
tmp.append(r)
continue
if selected and selected in str(r.get("data", "")):
tmp.append(r)
filtered = tmp
if typ:
tsel = str(typ).strip()
filtered = [r for r in filtered if _extract_type_from_data(r.get("data")) == tsel]
ranged = []
for r in filtered: for r in filtered:
writer_id = str(r.get("writer_id", "")).strip() t = _parse_dt(r.get("time"))
writer_keys = writer_keys_by_id.get(writer_id) if (gte_dt or lte_dt) and (t is None):
if writer_keys and selected in writer_keys:
tmp.append(r)
continue continue
if selected and selected in str(r.get("data", "")): if gte_dt and t and t < gte_dt:
tmp.append(r) continue
filtered = tmp if lte_dt and t and t > lte_dt:
continue
rr = dict(r)
rr["_time_dt"] = t
ranged.append(rr)
if typ: by_type = {}
tsel = str(typ).strip() by_bucket = {}
filtered = [r for r in filtered if _extract_type_from_data(r.get("data")) == tsel] for r in ranged:
tval = _extract_type_from_data(r.get("data"))
if tval:
by_type[tval] = by_type.get(tval, 0) + 1
bucket_dt = _floor_dt(r.get("_time_dt"), interval)
if bucket_dt:
label = _dt_label(bucket_dt, interval)
if label:
by_bucket[label] = by_bucket.get(label, 0) + 1
ranged = [] by_type_arr = [{"type": k, "count": int(v)} for k, v in sorted(by_type.items(), key=lambda x: (-x[1], x[0]))]
for r in filtered: by_time_arr = [{"bucket": k, "count": int(v)} for k, v in sorted(by_bucket.items(), key=lambda x: x[0])]
t = _parse_dt(r.get("time"))
if (gte_dt or lte_dt) and (t is None):
continue
if gte_dt and t and t < gte_dt:
continue
if lte_dt and t and t > lte_dt:
continue
rr = dict(r)
rr["_time_dt"] = t
ranged.append(rr)
by_type = {} return JsonResponse(
by_bucket = {} {
for r in ranged: "status": "success",
tval = _extract_type_from_data(r.get("data")) "data": {
if tval: "generated_at": datetime.now(timezone.utc).isoformat(),
by_type[tval] = by_type.get(tval, 0) + 1 "range": {"from": gte or "", "to": lte or ""},
bucket_dt = _floor_dt(r.get("_time_dt"), interval) "filters": {"key": key or "", "type": typ or "", "interval": interval or "day"},
if bucket_dt: "total": len(ranged),
label = _dt_label(bucket_dt, interval) "by_type": by_type_arr,
if label: "by_time": by_time_arr,
by_bucket[label] = by_bucket.get(label, 0) + 1 },
}
by_type_arr = [{"type": k, "count": int(v)} for k, v in sorted(by_type.items(), key=lambda x: (-x[1], x[0]))] )
by_time_arr = [{"bucket": k, "count": int(v)} for k, v in sorted(by_bucket.items(), key=lambda x: x[0])] except Exception as e:
return JsonResponse({"status": "error", "message": str(e) or "生成失败"}, status=500)
return JsonResponse(
{
"status": "success",
"data": {
"generated_at": datetime.now(timezone.utc).isoformat(),
"range": {"from": gte or "", "to": lte or ""},
"filters": {"key": key or "", "type": typ or "", "interval": interval or "day"},
"total": len(ranged),
"by_type": by_type_arr,
"by_time": by_time_arr,
},
}
)
@require_http_methods(["GET"]) @require_http_methods(["GET"])