能上传并识别PDF
This commit is contained in:
154
elastic/views.py
154
elastic/views.py
@@ -6,6 +6,8 @@ import re
|
||||
import uuid
|
||||
import base64
|
||||
import json
|
||||
import tempfile
|
||||
import concurrent.futures
|
||||
from django.conf import settings
|
||||
from django.http import JsonResponse
|
||||
from django.shortcuts import render
|
||||
@@ -21,6 +23,19 @@ from .es_connect import (
|
||||
analytics_recent as es_analytics_recent,
|
||||
)
|
||||
from PIL import Image
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
HAS_PDF_SUPPORT = True
|
||||
PDF_ERROR = ""
|
||||
except ImportError as e:
|
||||
try:
|
||||
import pymupdf
|
||||
fitz = pymupdf
|
||||
HAS_PDF_SUPPORT = True
|
||||
PDF_ERROR = ""
|
||||
except ImportError:
|
||||
HAS_PDF_SUPPORT = False
|
||||
PDF_ERROR = str(e)
|
||||
|
||||
|
||||
def _filter_results_for_user(request, results):
|
||||
@@ -58,18 +73,24 @@ def _image_ref_to_url(request, image_ref: str) -> str:
|
||||
if not s:
|
||||
return ''
|
||||
|
||||
if not s.startswith('minio:'):
|
||||
return ''
|
||||
if s.startswith('minio:'):
|
||||
object_name = s[len('minio:'):].lstrip('/')
|
||||
if not object_name:
|
||||
return ''
|
||||
|
||||
object_name = s[len('minio:'):].lstrip('/')
|
||||
if not object_name:
|
||||
return ''
|
||||
try:
|
||||
from minio_storage.minio_connect import presigned_get_url
|
||||
return presigned_get_url(object_name, expires_seconds=8 * 60 * 60)
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
try:
|
||||
from minio_storage.minio_connect import presigned_get_url
|
||||
return presigned_get_url(object_name, expires_seconds=8 * 60 * 60)
|
||||
except Exception:
|
||||
return ''
|
||||
if s.startswith('local:'):
|
||||
rel_path = s[len('local:'):].lstrip('/')
|
||||
if not rel_path:
|
||||
return ''
|
||||
return request.build_absolute_uri(settings.MEDIA_URL + rel_path)
|
||||
|
||||
return ''
|
||||
|
||||
|
||||
def _parse_image_refs(image_ref):
|
||||
@@ -429,8 +450,16 @@ def delete_user_by_id_view(request, user_id):
|
||||
return JsonResponse({"status": "success", "message": "用户删除成功"})
|
||||
|
||||
|
||||
# 辅助:JSON 转换(兼容 a.py 行为)
|
||||
# 辅助:JSON 转换
|
||||
def json_to_string(obj):
|
||||
if isinstance(obj, str):
|
||||
# 如果已经是字符串,尝试解析一下验证是否为有效 JSON,或者是普通文本
|
||||
try:
|
||||
json.loads(obj)
|
||||
return obj # 是有效 JSON 字符串,直接返回
|
||||
except Exception:
|
||||
# 不是 JSON 字符串,可能是普通文本,将其封装成 JSON
|
||||
return json.dumps(obj, ensure_ascii=False)
|
||||
try:
|
||||
return json.dumps(obj, ensure_ascii=False)
|
||||
except Exception:
|
||||
@@ -581,21 +610,73 @@ def upload(request):
|
||||
rel_paths = []
|
||||
image_urls = []
|
||||
data_list = []
|
||||
for file in files:
|
||||
filename = f"{uuid.uuid4()}_{file.name}"
|
||||
abs_path = os.path.join(images_dir, filename)
|
||||
with open(abs_path, "wb") as dst:
|
||||
for chunk in file.chunks():
|
||||
dst.write(chunk)
|
||||
|
||||
# 预处理文件列表,处理PDF转换
|
||||
processed_files = []
|
||||
for f in files:
|
||||
if f.name.lower().endswith('.pdf'):
|
||||
if not HAS_PDF_SUPPORT:
|
||||
return JsonResponse({"status": "error", "message": f"服务器未安装PDF处理组件(PyMuPDF): {PDF_ERROR}"}, status=500)
|
||||
|
||||
# 将PDF保存到临时文件
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
||||
for chunk in f.chunks():
|
||||
tmp.write(chunk)
|
||||
tmp_path = tmp.name
|
||||
|
||||
try:
|
||||
# 转换PDF为图片列表
|
||||
doc = fitz.open(tmp_path)
|
||||
for i in range(len(doc)):
|
||||
page = doc.load_page(i)
|
||||
# 降低 DPI 从 200 到 150,在保证准确率的同时显著减小图片体积,加快上传速度
|
||||
pix = page.get_pixmap(dpi=150)
|
||||
img_filename = f"{uuid.uuid4()}_page_{i+1}.jpg"
|
||||
img_abs_path = os.path.join(images_dir, img_filename)
|
||||
pix.save(img_abs_path)
|
||||
processed_files.append((img_abs_path, img_filename, f"{f.name}_p{i+1}"))
|
||||
doc.close()
|
||||
except Exception as e:
|
||||
return JsonResponse({"status": "error", "message": f"PDF转换失败: {str(e)}"}, status=500)
|
||||
finally:
|
||||
if os.path.exists(tmp_path):
|
||||
os.remove(tmp_path)
|
||||
else:
|
||||
# 普通图片处理
|
||||
filename = f"{uuid.uuid4()}_{f.name}"
|
||||
abs_path = os.path.join(images_dir, filename)
|
||||
with open(abs_path, "wb") as dst:
|
||||
for chunk in f.chunks():
|
||||
dst.write(chunk)
|
||||
processed_files.append((abs_path, filename, f.name))
|
||||
|
||||
# 使用线程池并行调用 OCR 接口以提升速度
|
||||
def run_ocr(file_info):
    """Run OCR extraction for one prepared file inside a worker thread.

    Args:
        file_info: A 3-tuple ``(abs_path, filename, original_name)`` as
            appended to ``processed_files``. Only ``abs_path`` and
            ``filename`` are used here.

    Returns:
        ``(data, filename)`` when ``ocr_and_extract_info`` succeeds, or
        ``(exception, filename)`` when it raises. The caller unpacks the
        pair and checks ``isinstance(result, Exception)`` to build the
        error response, so this helper must never return an HTTP response
        object itself and must not touch the shared result lists.
    """
    abs_path, filename, _original_name = file_info
    try:
        # NOTE(review): ocr_and_extract_info is defined outside this view;
        # presumably it returns the extracted record or a falsy value.
        return (ocr_and_extract_info(abs_path), filename)
    except Exception as e:
        # Hand the failure back as a value so the thread that owns the
        # request decides how to report it.
        return (e, filename)
|
||||
|
||||
rel_paths = []
|
||||
image_urls = []
|
||||
data_list = []
|
||||
|
||||
# 限制最大线程数为 8,避免过多并发导致 API 限制或资源耗尽
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
|
||||
future_to_file = {executor.submit(run_ocr, f_info): f_info for f_info in processed_files}
|
||||
for future in concurrent.futures.as_completed(future_to_file):
|
||||
result, filename = future.result()
|
||||
if isinstance(result, Exception):
|
||||
return JsonResponse({"status": "error", "message": f"文件 {filename} 识别失败: {str(result)}"}, status=500)
|
||||
|
||||
if result:
|
||||
data_list.append(result)
|
||||
|
||||
rel_path = f"images/{filename}"
|
||||
rel_paths.append(rel_path)
|
||||
image_urls.append(request.build_absolute_uri(settings.MEDIA_URL + rel_path))
|
||||
|
||||
if not data_list:
|
||||
return JsonResponse({"status": "error", "message": "无法识别图片内容"}, status=400)
|
||||
@@ -687,12 +768,18 @@ def confirm(request):
|
||||
|
||||
try:
|
||||
object_name = f"images/{webp_name}"
|
||||
from minio_storage.minio_connect import upload_file
|
||||
upload_file(webp_abs, object_name, content_type="image/webp")
|
||||
image_refs.append(f"minio:{object_name}")
|
||||
temp_files_to_delete.extend([src_abs, webp_abs])
|
||||
from minio_storage.minio_connect import upload_file, is_minio_configured
|
||||
if is_minio_configured():
|
||||
upload_file(webp_abs, object_name, content_type="image/webp")
|
||||
image_refs.append(f"minio:{object_name}")
|
||||
temp_files_to_delete.extend([src_abs, webp_abs])
|
||||
else:
|
||||
# Fallback to local storage
|
||||
image_refs.append(f"local:{object_name}")
|
||||
# In local case, we keep the webp file and only delete the original temporary file
|
||||
temp_files_to_delete.append(src_abs)
|
||||
except Exception as e:
|
||||
return JsonResponse({"status": "error", "message": f"上传到MinIO失败: {e}"}, status=500)
|
||||
return JsonResponse({"status": "error", "message": f"存储图片失败: {e}"}, status=500)
|
||||
if len(image_refs) == 1:
|
||||
image_ref_to_store = image_refs[0]
|
||||
elif len(image_refs) > 1:
|
||||
@@ -750,21 +837,10 @@ def manage_page(request):
|
||||
raw_results.append(r)
|
||||
break
|
||||
|
||||
results = []
|
||||
for r in raw_results:
|
||||
try:
|
||||
r_data = string_to_json(r.get("data", "{}"))
|
||||
r_data["_id"] = r["id"]
|
||||
r_data["_image"] = r.get("image", "")
|
||||
results.append(r_data)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return render(
|
||||
request,
|
||||
"elastic/manage.html",
|
||||
{
|
||||
"results": results,
|
||||
"is_admin": is_admin,
|
||||
"user_id": session_user_id,
|
||||
"username": me.get("username"),
|
||||
|
||||
Reference in New Issue
Block a user