diff --git a/elastic/templates/elastic/upload.html b/elastic/templates/elastic/upload.html
index 397f3e3..0b59b79 100644
--- a/elastic/templates/elastic/upload.html
+++ b/elastic/templates/elastic/upload.html
@@ -494,6 +494,11 @@ uploadForm.addEventListener('submit', async (e) => {
             body: formData,
         });
         clearInterval(timer);
+        const ct = (resp.headers.get('content-type') || '').toLowerCase();
+        if (!ct.includes('application/json')) {
+            const text = await resp.text();
+            throw new Error(text ? String(text).slice(0, 200) : `HTTP ${resp.status}`);
+        }
         const data = await resp.json();
         if (!resp.ok || data.status !== 'success') {
             throw new Error(data.message || '上传识别失败');
diff --git a/elastic/views.py b/elastic/views.py
index 8fb674b..2fb3665 100644
--- a/elastic/views.py
+++ b/elastic/views.py
@@ -722,115 +722,132 @@ def upload_page(request):
 # 上传并识别(不入库)
 @require_http_methods(["POST"])
 def upload(request):
-    if request.session.get("user_id") is None:
-        fallback_uid = request.POST.get("user_id") or request.GET.get("user_id")
-        if fallback_uid:
-            request.session["user_id"] = fallback_uid
-            request.session.setdefault("permission", 1)
-        else:
-            return JsonResponse({"status": "error", "message": "未登录"}, status=401)
+    """Store uploaded images/PDFs, OCR them and return the merged fields as JSON.
+
+    PDF uploads are rendered to one JPEG per page. Results of all pages of a
+    file are merged into one dict in page order. Every failure is reported as
+    a JSON error response so the front end can always parse the reply.
+    """
+    try:
+        if request.session.get("user_id") is None:
+            # SECURITY(review): the client-supplied user_id is trusted without any
+            # verification, so anyone can start a session as any user - confirm intent.
+            fallback_uid = request.POST.get("user_id") or request.GET.get("user_id")
+            if fallback_uid:
+                request.session["user_id"] = fallback_uid
+                request.session.setdefault("permission", 1)
+            else:
+                return JsonResponse({"status": "error", "message": "未登录"}, status=401)
 
-    files = request.FILES.getlist("file")
-    if not files:
-        one = request.FILES.get("file")
-        if one:
-            files = [one]
-    if not files:
-        return JsonResponse({"status": "error", "message": "未选择文件"}, status=400)
+        files = request.FILES.getlist("file")
+        if not files:
+            one = request.FILES.get("file")
+            if one:
+                files = [one]
+        if not files:
+            return JsonResponse({"status": "error", "message": "未选择文件"}, status=400)
 
-    images_dir = os.path.join(settings.MEDIA_ROOT, "images")
-    os.makedirs(images_dir, exist_ok=True)
-
-    # 按照原始文件进行分组处理
-    file_results = []
-
-    for f in files:
-        group_images = []  # 存储该文件生成的所有图片路径信息 (abs_path, filename)
-        is_pdf = f.name.lower().endswith('.pdf')
+        images_dir = os.path.join(settings.MEDIA_ROOT, "images")
+        os.makedirs(images_dir, exist_ok=True)
 
-        if is_pdf:
-            if not HAS_PDF_SUPPORT:
-                return JsonResponse({"status": "error", "message": f"服务器未安装PDF处理组件(PyMuPDF): {PDF_ERROR}"}, status=500)
+        file_results = []
+
+        for f in files:
+            group_images = []
+            is_pdf = f.name.lower().endswith('.pdf')
 
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
-                for chunk in f.chunks():
-                    tmp.write(chunk)
-                tmp_path = tmp.name
-
-            try:
-                doc = fitz.open(tmp_path)
-                for i in range(len(doc)):
-                    page = doc.load_page(i)
-                    pix = page.get_pixmap(dpi=150)
-                    img_filename = f"{uuid.uuid4()}_page_{i+1}.jpg"
-                    img_abs_path = os.path.join(images_dir, img_filename)
-                    pix.save(img_abs_path)
-                    group_images.append((img_abs_path, img_filename))
-                doc.close()
-            except Exception as e:
-                return JsonResponse({"status": "error", "message": f"PDF {f.name} 转换失败: {str(e)}"}, status=500)
-            finally:
-                if os.path.exists(tmp_path):
-                    os.remove(tmp_path)
-        else:
-            filename = f"{uuid.uuid4()}_{f.name}"
-            abs_path = os.path.join(images_dir, filename)
-            with open(abs_path, "wb") as dst:
-                for chunk in f.chunks():
-                    dst.write(chunk)
-            group_images.append((abs_path, filename))
+            if is_pdf:
+                if not HAS_PDF_SUPPORT:
+                    return JsonResponse({"status": "error", "message": f"服务器未安装PDF处理组件(PyMuPDF): {PDF_ERROR}"}, status=500)
+
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
+                    for chunk in f.chunks():
+                        tmp.write(chunk)
+                    tmp_path = tmp.name
+
+                doc = None
+                try:
+                    doc = fitz.open(tmp_path)
+                    for i in range(len(doc)):
+                        page = doc.load_page(i)
+                        pix = page.get_pixmap(dpi=150)
+                        img_filename = f"{uuid.uuid4()}_page_{i+1}.jpg"
+                        img_abs_path = os.path.join(images_dir, img_filename)
+                        pix.save(img_abs_path)
+                        group_images.append((img_abs_path, img_filename))
+                except Exception as e:
+                    return JsonResponse({"status": "error", "message": f"PDF {f.name} 转换失败: {str(e)}"}, status=500)
+                finally:
+                    # Close on every path so the handle is released before the temp file is removed.
+                    if doc is not None:
+                        doc.close()
+                    if os.path.exists(tmp_path):
+                        os.remove(tmp_path)
+            else:
+                filename = f"{uuid.uuid4()}_{f.name}"
+                abs_path = os.path.join(images_dir, filename)
+                with open(abs_path, "wb") as dst:
+                    for chunk in f.chunks():
+                        dst.write(chunk)
+                group_images.append((abs_path, filename))
 
-        # 对该组图片并行进行 OCR 识别
-        def run_ocr(img_info):
-            abs_p, fname = img_info
-            try:
-                data = ocr_and_extract_info(abs_p)
-                return data
-            except Exception:
-                return None
+            def run_ocr(img_info):
+                abs_p, fname = img_info
+                try:
+                    data = ocr_and_extract_info(abs_p)
+                    return data
+                except Exception:
+                    return None
 
-        group_data_list = []
-        with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(group_images), 8)) as executor:
-            futures = [executor.submit(run_ocr, img_info) for img_info in group_images]
-            for future in concurrent.futures.as_completed(futures):
-                res = future.result()
-                if res:
-                    group_data_list.append(res)
+            group_data_list = []
+            # A zero-page PDF yields no images; ThreadPoolExecutor rejects max_workers=0.
+            # executor.map keeps page order, so the merge below is deterministic.
+            if group_images:
+                with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(group_images), 8)) as executor:
+                    for res in executor.map(run_ocr, group_images):
+                        if res:
+                            group_data_list.append(res)
 
-        # 合并该文件的多页识别结果
-        merged_group_data = {}
-        for item in group_data_list:
-            if not isinstance(item, dict): continue
-            for k, v in item.items():
-                key = str(k).strip()
-                if not key: continue
-                if key not in merged_group_data or merged_group_data.get(key) in (None, ''):
-                    merged_group_data[key] = v
-                elif merged_group_data.get(key) != v:
-                    base = key
-                    idx = 2
-                    while f"{base}_{idx}" in merged_group_data: idx += 1
-                    merged_group_data[f"{base}_{idx}"] = v
+            merged_group_data = {}
+            for item in group_data_list:
+                if not isinstance(item, dict):
+                    continue
+                for k, v in item.items():
+                    key = str(k).strip()
+                    if not key:
+                        continue
+                    if key not in merged_group_data or merged_group_data.get(key) in (None, ''):
+                        merged_group_data[key] = v
+                    elif merged_group_data.get(key) != v:
+                        base = key
+                        idx = 2
+                        while f"{base}_{idx}" in merged_group_data:
+                            idx += 1
+                        merged_group_data[f"{base}_{idx}"] = v
 
-        if not merged_group_data:
-            # 如果没识别到,至少保留一个空结构或者包含文件名的提示
-            merged_group_data = {"文件名": f.name, "提示": "未识别到具体内容"}
+            if not merged_group_data:
+                merged_group_data = {"文件名": f.name, "提示": "未识别到具体内容"}
 
-        rel_paths = [f"images/{img[1]}" for img in group_images]
-        image_urls = [request.build_absolute_uri(settings.MEDIA_URL + rp) for rp in rel_paths]
+            rel_paths = [f"images/{img[1]}" for img in group_images]
+            image_urls = [request.build_absolute_uri(settings.MEDIA_URL + rp) for rp in rel_paths]
 
-        file_results.append({
-            "name": f.name,
-            "data": merged_group_data,
-            "images": rel_paths,
-            "image_urls": image_urls,
+            file_results.append({
+                "name": f.name,
+                "data": merged_group_data,
+                "images": rel_paths,
+                "image_urls": image_urls,
+            })
+
+        return JsonResponse({
+            "status": "success",
+            "message": f"成功处理 {len(file_results)} 个文件,请确认数据后点击录入",
+            "items": file_results,
         })
-
-    return JsonResponse({
-        "status": "success",
-        "message": f"成功处理 {len(file_results)} 个文件,请确认数据后点击录入",
-        "items": file_results,
-    })
+    except Exception as e:
+        # Keep the traceback in the server log; the client only gets the message.
+        import logging
+        logging.getLogger(__name__).exception("upload failed")
+        return JsonResponse({"status": "error", "message": str(e) or "上传失败"}, status=500)
 
 
 # 确认并入库