能上传并识别PDF

This commit is contained in:
DSQ
2026-03-12 20:05:48 +08:00
parent 109c06e1d9
commit d69858434f
5 changed files with 205 additions and 85 deletions

View File

@@ -6,6 +6,8 @@ import re
import uuid
import base64
import json
import tempfile
import concurrent.futures
from django.conf import settings
from django.http import JsonResponse
from django.shortcuts import render
@@ -21,6 +23,19 @@ from .es_connect import (
analytics_recent as es_analytics_recent,
)
from PIL import Image
try:
import fitz # PyMuPDF
HAS_PDF_SUPPORT = True
PDF_ERROR = ""
except ImportError as e:
try:
import pymupdf
fitz = pymupdf
HAS_PDF_SUPPORT = True
PDF_ERROR = ""
except ImportError:
HAS_PDF_SUPPORT = False
PDF_ERROR = str(e)
def _filter_results_for_user(request, results):
@@ -58,18 +73,24 @@ def _image_ref_to_url(request, image_ref: str) -> str:
if not s:
return ''
if not s.startswith('minio:'):
return ''
if s.startswith('minio:'):
object_name = s[len('minio:'):].lstrip('/')
if not object_name:
return ''
object_name = s[len('minio:'):].lstrip('/')
if not object_name:
return ''
try:
from minio_storage.minio_connect import presigned_get_url
return presigned_get_url(object_name, expires_seconds=8 * 60 * 60)
except Exception:
return ''
try:
from minio_storage.minio_connect import presigned_get_url
return presigned_get_url(object_name, expires_seconds=8 * 60 * 60)
except Exception:
return ''
if s.startswith('local:'):
rel_path = s[len('local:'):].lstrip('/')
if not rel_path:
return ''
return request.build_absolute_uri(settings.MEDIA_URL + rel_path)
return ''
def _parse_image_refs(image_ref):
@@ -429,8 +450,16 @@ def delete_user_by_id_view(request, user_id):
return JsonResponse({"status": "success", "message": "用户删除成功"})
# 辅助JSON 转换(兼容 a.py 行为)
# 辅助JSON 转换
def json_to_string(obj):
if isinstance(obj, str):
# 如果已经是字符串,先尝试解析,判断它是有效的 JSON 还是普通文本
try:
json.loads(obj)
return obj # 是有效 JSON 字符串,直接返回
except Exception:
# 不是 JSON 字符串,可能是普通文本,将其封装成 JSON
return json.dumps(obj, ensure_ascii=False)
try:
return json.dumps(obj, ensure_ascii=False)
except Exception:
@@ -581,21 +610,73 @@ def upload(request):
rel_paths = []
image_urls = []
data_list = []
for file in files:
filename = f"{uuid.uuid4()}_{file.name}"
abs_path = os.path.join(images_dir, filename)
with open(abs_path, "wb") as dst:
for chunk in file.chunks():
dst.write(chunk)
# 预处理文件列表,处理 PDF 转换
processed_files = []
for f in files:
if f.name.lower().endswith('.pdf'):
if not HAS_PDF_SUPPORT:
return JsonResponse({"status": "error", "message": f"服务器未安装PDF处理组件(PyMuPDF): {PDF_ERROR}"}, status=500)
# 将PDF保存到临时文件
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
for chunk in f.chunks():
tmp.write(chunk)
tmp_path = tmp.name
try:
# 转换PDF为图片列表
doc = fitz.open(tmp_path)
for i in range(len(doc)):
page = doc.load_page(i)
# 将 DPI 从 200 降低到 150,在保证准确率的同时显著减小图片体积,加快上传速度
pix = page.get_pixmap(dpi=150)
img_filename = f"{uuid.uuid4()}_page_{i+1}.jpg"
img_abs_path = os.path.join(images_dir, img_filename)
pix.save(img_abs_path)
processed_files.append((img_abs_path, img_filename, f"{f.name}_p{i+1}"))
doc.close()
except Exception as e:
return JsonResponse({"status": "error", "message": f"PDF转换失败: {str(e)}"}, status=500)
finally:
if os.path.exists(tmp_path):
os.remove(tmp_path)
else:
# 普通图片处理
filename = f"{uuid.uuid4()}_{f.name}"
abs_path = os.path.join(images_dir, filename)
with open(abs_path, "wb") as dst:
for chunk in f.chunks():
dst.write(chunk)
processed_files.append((abs_path, filename, f.name))
# 使用线程池并行调用 OCR 接口以提升速度
def run_ocr(file_info):
abs_path, filename, original_name = file_info
try:
data = ocr_and_extract_info(abs_path)
return (data, filename)
except Exception as e:
return JsonResponse({"status": "error", "message": str(e)}, status=500)
if data:
data_list.append(data)
rel_path = f"images/{filename}"
rel_paths.append(rel_path)
image_urls.append(request.build_absolute_uri(settings.MEDIA_URL + rel_path))
return (e, filename)
rel_paths = []
image_urls = []
data_list = []
# 限制最大线程数为 8,避免过多并发导致 API 限制或资源耗尽
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
future_to_file = {executor.submit(run_ocr, f_info): f_info for f_info in processed_files}
for future in concurrent.futures.as_completed(future_to_file):
result, filename = future.result()
if isinstance(result, Exception):
return JsonResponse({"status": "error", "message": f"文件 {filename} 识别失败: {str(result)}"}, status=500)
if result:
data_list.append(result)
rel_path = f"images/{filename}"
rel_paths.append(rel_path)
image_urls.append(request.build_absolute_uri(settings.MEDIA_URL + rel_path))
if not data_list:
return JsonResponse({"status": "error", "message": "无法识别图片内容"}, status=400)
@@ -687,12 +768,18 @@ def confirm(request):
try:
object_name = f"images/{webp_name}"
from minio_storage.minio_connect import upload_file
upload_file(webp_abs, object_name, content_type="image/webp")
image_refs.append(f"minio:{object_name}")
temp_files_to_delete.extend([src_abs, webp_abs])
from minio_storage.minio_connect import upload_file, is_minio_configured
if is_minio_configured():
upload_file(webp_abs, object_name, content_type="image/webp")
image_refs.append(f"minio:{object_name}")
temp_files_to_delete.extend([src_abs, webp_abs])
else:
# Fallback to local storage
image_refs.append(f"local:{object_name}")
# In local case, we keep the webp file and only delete the original temporary file
temp_files_to_delete.append(src_abs)
except Exception as e:
return JsonResponse({"status": "error", "message": f"上传到MinIO失败: {e}"}, status=500)
return JsonResponse({"status": "error", "message": f"存储图片失败: {e}"}, status=500)
if len(image_refs) == 1:
image_ref_to_store = image_refs[0]
elif len(image_refs) > 1:
@@ -750,21 +837,10 @@ def manage_page(request):
raw_results.append(r)
break
results = []
for r in raw_results:
try:
r_data = string_to_json(r.get("data", "{}"))
r_data["_id"] = r["id"]
r_data["_image"] = r.get("image", "")
results.append(r_data)
except Exception:
pass
return render(
request,
"elastic/manage.html",
{
"results": results,
"is_admin": is_admin,
"user_id": session_user_id,
"username": me.get("username"),