diff --git a/elastic/templates/elastic/manage.html b/elastic/templates/elastic/manage.html index a0298e5..1107c03 100644 --- a/elastic/templates/elastic/manage.html +++ b/elastic/templates/elastic/manage.html @@ -16,8 +16,10 @@ table{width:100%;border-collapse:collapse;margin-top:20px} th,td{border-bottom:1px solid #eee;padding:12px 8px;text-align:left;vertical-align:top} th{background:#f8f9fa;font-weight:600} - .inner-table { width: 100%; margin: 0; border: 1px solid #e0e0e0; border-collapse: collapse; } - .inner-table th, .inner-table td { border: 1px solid #e0e0e0; padding: 6px; font-size: 12px; } + .inner-table { width: 100%; margin: 0; border: 1px solid #e0e0e0; border-collapse: collapse; table-layout: fixed; } + .inner-table th, .inner-table td { border: 1px solid #e0e0e0; padding: 8px; font-size: 13px; word-break: break-all; } + .inner-table td:first-child { width: 30%; background-color: #f8fafc; font-weight: 600; color: #475569; } + .inner-table td:last-child { width: 70%; background-color: #fff; } .inner-table th { background-color: #f9f9f9; } img{max-width:120px;border:1px solid #eee;border-radius:6px;cursor:pointer} .btn{padding:6px 10px;border:none;border-radius:6px;cursor:pointer;font-size:14px;margin:2px} @@ -320,27 +322,37 @@ function renderTable(data) { // 解析data字段,如果是JSON字符串则格式化显示 let displayData = item.data || ''; + let parsed = null; try { - const parsed = JSON.parse(item.data); - displayData = ` - - - ${Object.entries(parsed).map(([key, value]) => ` - - - - - `).join('')} - -
${escapeHtml(key)}${escapeHtml(typeof value === 'object' ? JSON.stringify(value, null, 2) : value)}
- `; + if (typeof displayData === 'object' && displayData !== null) { + parsed = displayData; + } else if (typeof displayData === 'string') { + parsed = JSON.parse(displayData); + } + + if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) { + displayData = ` + + + ${Object.entries(parsed).map(([key, value]) => ` + + + + + `).join('')} + +
${escapeHtml(key)}${escapeHtml(typeof value === 'object' ? JSON.stringify(value, null, 2) : value)}
+ `; + } else { + throw new Error('Not a valid JSON object'); + } } catch (e) { displayData = ` - +
原始数据${escapeHtml(displayData)}${escapeHtml(typeof displayData === 'object' ? JSON.stringify(displayData) : displayData)}
@@ -423,16 +435,24 @@ function createRow(k = '', v = '') { } function renderForm(obj){ - kvForm.innerHTML=''; + kvForm.innerHTML=` +
+
字段名
+
字段值
+
操作
+
+ `; Object.keys(obj||{}).forEach(k=> kvForm.appendChild(createRow(k, obj[k]))); - if (!kvForm.children.length) kvForm.appendChild(createRow()); + if (kvForm.querySelectorAll('div[style*="grid"]').length <= 1) kvForm.appendChild(createRow()); syncTextarea(); } function formToObject(){ const o={}; - Array.from(kvForm.children).forEach(row=>{ + Array.from(kvForm.children).forEach((row, index)=>{ + if (index === 0) return; // 跳过表头 const [kI,vI] = row.querySelectorAll('input'); + if (!kI || !vI) return; const k=(kI.value||'').trim(); if(!k) return; const raw=vI.value; try{ diff --git a/elastic/templates/elastic/upload.html b/elastic/templates/elastic/upload.html index c7539df..e6dabf3 100644 --- a/elastic/templates/elastic/upload.html +++ b/elastic/templates/elastic/upload.html @@ -49,9 +49,10 @@ .result-box {flex: 1;} .result-box h3 { margin-top: 0; color: #334155;} .form-controls { display: flex;gap: 8px;margin-bottom: 12px;flex-wrap: wrap;} - #kvForm {border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; max-height: 300px; overflow: auto;margin-bottom: 12px;background: white;} - .form-row {display: grid;grid-template-columns: 1fr 1fr auto;gap: 8px; margin-bottom: 6px; } - .form-row input {padding: 8px;border: 1px solid #cbd5e1;border-radius: 4px;} + #kvForm {border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; max-height: 400px; overflow: auto;margin-bottom: 12px;background: #f8fafc;} + .form-header { display: grid; grid-template-columns: 1fr 1fr auto; gap: 8px; margin-bottom: 8px; padding: 0 4px; font-weight: 600; color: #475569; font-size: 14px;} + .form-row {display: grid;grid-template-columns: 1fr 1fr auto;gap: 8px; margin-bottom: 6px; align-items: center;} + .form-row input {padding: 8px;border: 1px solid #cbd5e1;border-radius: 4px; width: 100%; box-sizing: border-box;} #resultBox { width: 100%;min-height: 200px;font-family: ui-monospace, SFMono-Regular, Menlo, monospace;font-size: 14px; padding: 12px; border: 1px solid #e2e8f0; border-radius: 8px; resize: vertical;box-sizing: border-box; } .status-message { padding: 10px; margin: 10px 0; border-radius: 6px; display: none; } @@ -83,17 +84,17 @@
-

图片上传与识别

-

选择图片后上传,服务端调用大模型解析为可编辑的 JSON,再确认入库。

+

图片与PDF上传识别

+

选择图片或PDF文件后上传,服务端调用大模型解析为可编辑的 JSON,再确认入库。

-

上传图片

-

点击下方按钮选择图片,或拖拽图片到此区域

+

上传文件

+

点击下方按钮选择图片或PDF文件,或拖拽文件到此区域

{% csrf_token %} - +
@@ -268,7 +269,7 @@ function updateFileHint() { } function addFiles(files) { - const incoming = Array.from(files || []).filter(f => f && f.type.startsWith('image/')); + const incoming = Array.from(files || []).filter(f => f && (f.type.startsWith('image/') || f.name.toLowerCase().endsWith('.pdf'))); const existingKeys = new Set(selectedFiles.map(f => `${f.name}|${f.size}|${f.lastModified}`)); incoming.forEach(f => { const key = `${f.name}|${f.size}|${f.lastModified}`; @@ -277,10 +278,20 @@ function addFiles(files) { selectedFiles.push(f); } }); - const urls = selectedFiles.map(f => URL.createObjectURL(f)); + const urls = selectedFiles.map(f => { + if (f.name.toLowerCase().endsWith('.pdf')) { + // 使用一个简单的 SVG PDF 图标 Data URI + return 'data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSI0OCIgaGVpZ2h0PSI0OCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9IiNlZjQ0NDQiIHN0cm9rZS13aWR0aD0iMiIgc3Ryb2tlLWxpbmVjYXA9InJvdW5kIiBzdHJva2UtbGluZWpvaW49InJvdW5kIj48cGF0aCBkPSJNMTQgMmgyYTIgMiAwIDAgMSAyIDJ2MTZhMiAyIDAgMCAxLTIgMmgtMTJhMiAyIDAgMCAxLTItMlY0YTIgMiAwIDAgMSAyLTJoMiIvPjxwYXRoIGQ9Ik0xNCAydjRjMCAxLjEgLjkgMiAyIDJoNCIvPjxwYXRoIGQ9Ik03IDloNSIvPjxwYXRoIGQ9Ik03IDEzaDUiLz48cGF0aCBkPSJNNyAxN2g4Ii8+PC9zdmc+'; + } + return URL.createObjectURL(f); + }); setPreviewList(urls); updateFileHint(); - setTimeout(() => urls.forEach(u => URL.revokeObjectURL(u)), 0); + setTimeout(() => { + urls.forEach(u => { + if (u.startsWith('blob:')) URL.revokeObjectURL(u); + }); + }, 0); } fileInput.addEventListener('change', function(e) { @@ -304,7 +315,7 @@ function createRow(k = '', v = '') { delBtn.className = 'btn btn-danger'; delBtn.textContent = '删除'; delBtn.onclick = () => { - if (kvForm.children.length > 1) { + if (kvForm.querySelectorAll('.form-row').length > 1) { kvForm.removeChild(row); } else { keyInput.value = ''; @@ -321,17 +332,24 @@ function createRow(k = '', v = '') { } function renderFormFromObject(obj) { - kvForm.innerHTML = ''; + kvForm.innerHTML = ` +
+
字段名
+
字段值
+
操作
+
+ `; Object.keys(obj || {}).forEach(k => { kvForm.appendChild(createRow(k, obj[k])); }); - if (!kvForm.children.length) kvForm.appendChild(createRow()); + if (kvForm.children.length <= 1) kvForm.appendChild(createRow()); syncTextarea(); } function objectFromForm() { const obj = {}; Array.from(kvForm.children).forEach(row => { + if (row.classList.contains('form-header')) return; const [kInput, vInput] = row.querySelectorAll('input'); const k = (kInput.value || '').trim(); if (!k) return; @@ -380,31 +398,32 @@ uploadForm.addEventListener('submit', async (e) => { resultBox.value = ''; currentImageRel = []; - const files = Array.from(selectedFiles || []).filter(f => f && f.type.startsWith('image/')); + const files = Array.from(selectedFiles || []).filter(f => f && (f.type.startsWith('image/') || f.name.toLowerCase().endsWith('.pdf'))); if (!files.length) { - uploadMsg.textContent = '请选择图片文件'; + uploadMsg.textContent = '请选择图片或PDF文件'; uploadMsg.className = 'status-message error'; uploadMsg.style.display = 'block'; return; } showProgress(); - setProgress(5, '转换为JPG'); + setProgress(5, '预处理中'); const formData = new FormData(); - const converted = []; for (let i = 0; i < files.length; i++) { const file = files[i]; - let jpegFile = file; - try { - jpegFile = await convertToJpeg(file); - } catch (_) { - jpegFile = file; + if (file.type.startsWith('image/')) { + setProgress(5 + Math.round((i/files.length)*45), '转换图片'); + try { + const jpegFile = await convertToJpeg(file); + formData.append('file', jpegFile); + } catch (_) { + formData.append('file', file); + } + } else { + // PDF 直接添加 + formData.append('file', file); } - converted.push(jpegFile); - const pct = 5 + Math.round(((i + 1) / files.length) * 45); - setProgress(pct, '转换为JPG'); } - converted.forEach(f => formData.append('file', f)); try { let prog = 50; diff --git a/elastic/views.py b/elastic/views.py index 772c3c6..ea3a5ea 100644 --- a/elastic/views.py +++ b/elastic/views.py @@ -6,6 +6,8 @@ import re import uuid import base64 import json +import tempfile +import concurrent.futures from django.conf import settings from django.http import JsonResponse from django.shortcuts import render @@ -21,6 +23,19 @@ from .es_connect import ( analytics_recent as es_analytics_recent, ) from PIL import Image +try: + import fitz # PyMuPDF + HAS_PDF_SUPPORT = True + PDF_ERROR = "" +except ImportError as e: + try: + import pymupdf + fitz = pymupdf + HAS_PDF_SUPPORT = True + PDF_ERROR = "" + except ImportError: + HAS_PDF_SUPPORT = False + PDF_ERROR = str(e) def _filter_results_for_user(request, results): @@ -58,18 +73,24 @@ def _image_ref_to_url(request, image_ref: str) -> str: if not s: return '' - if not s.startswith('minio:'): - return '' + if s.startswith('minio:'): + object_name = s[len('minio:'):].lstrip('/') + if not object_name: + return '' - object_name = s[len('minio:'):].lstrip('/') - if not object_name: - return '' + try: + from minio_storage.minio_connect import presigned_get_url + return presigned_get_url(object_name, expires_seconds=8 * 60 * 60) + except Exception: + return '' - try: - from minio_storage.minio_connect import presigned_get_url - return presigned_get_url(object_name, expires_seconds=8 * 60 * 60) - except Exception: - return '' + if s.startswith('local:'): + rel_path = s[len('local:'):].lstrip('/') + if not rel_path: + return '' + return request.build_absolute_uri(settings.MEDIA_URL + rel_path) + + return '' def _parse_image_refs(image_ref): @@ -429,8 +450,16 @@ def delete_user_by_id_view(request, user_id): return JsonResponse({"status": "success", "message": "用户删除成功"}) -# 辅助:JSON 转换(兼容 a.py 行为) +# 辅助:JSON 转换 def json_to_string(obj): + if isinstance(obj, str): + # 如果已经是字符串,尝试解析一下验证是否为有效 JSON,或者是普通文本 + try: + json.loads(obj) + return obj # 是有效 JSON 字符串,直接返回 + except Exception: + # 不是 JSON 字符串,可能是普通文本,将其封装成 JSON + return json.dumps(obj, ensure_ascii=False) try: return json.dumps(obj, ensure_ascii=False) except Exception: @@ -581,21 +610,73 @@ def upload(request): rel_paths = [] image_urls = [] data_list = [] - for file in files: - filename = f"{uuid.uuid4()}_{file.name}" - abs_path = os.path.join(images_dir, filename) - with open(abs_path, "wb") as dst: - for chunk in file.chunks(): - dst.write(chunk) + + # 预处理文件列表,处理PDF转换 + processed_files = [] + for f in files: + if f.name.lower().endswith('.pdf'): + if not HAS_PDF_SUPPORT: + return JsonResponse({"status": "error", "message": f"服务器未安装PDF处理组件(PyMuPDF): {PDF_ERROR}"}, status=500) + + # 将PDF保存到临时文件 + with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp: + for chunk in f.chunks(): + tmp.write(chunk) + tmp_path = tmp.name + + try: + # 转换PDF为图片列表 + doc = fitz.open(tmp_path) + for i in range(len(doc)): + page = doc.load_page(i) + # 降低 DPI 从 200 到 150,在保证准确率的同时显著减小图片体积,加快上传速度 + pix = page.get_pixmap(dpi=150) + img_filename = f"{uuid.uuid4()}_page_{i+1}.jpg" + img_abs_path = os.path.join(images_dir, img_filename) + pix.save(img_abs_path) + processed_files.append((img_abs_path, img_filename, f"{f.name}_p{i+1}")) + doc.close() + except Exception as e: + return JsonResponse({"status": "error", "message": f"PDF转换失败: {str(e)}"}, status=500) + finally: + if os.path.exists(tmp_path): + os.remove(tmp_path) + else: + # 普通图片处理 + filename = f"{uuid.uuid4()}_{f.name}" + abs_path = os.path.join(images_dir, filename) + with open(abs_path, "wb") as dst: + for chunk in f.chunks(): + dst.write(chunk) + processed_files.append((abs_path, filename, f.name)) + + # 使用线程池并行调用 OCR 接口以提升速度 + def run_ocr(file_info): + abs_path, filename, original_name = file_info try: data = ocr_and_extract_info(abs_path) + return (data, filename) except Exception as e: - return JsonResponse({"status": "error", "message": str(e)}, status=500) - if data: - data_list.append(data) - rel_path = f"images/{filename}" - rel_paths.append(rel_path) - image_urls.append(request.build_absolute_uri(settings.MEDIA_URL + rel_path)) + return (e, filename) + + rel_paths = [] + image_urls = [] + data_list = [] + + # 限制最大线程数为 8,避免过多并发导致 API 限制或资源耗尽 + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: + future_to_file = {executor.submit(run_ocr, f_info): f_info for f_info in processed_files} + for future in concurrent.futures.as_completed(future_to_file): + result, filename = future.result() + if isinstance(result, Exception): + return JsonResponse({"status": "error", "message": f"文件 {filename} 识别失败: {str(result)}"}, status=500) + + if result: + data_list.append(result) + + rel_path = f"images/{filename}" + rel_paths.append(rel_path) + image_urls.append(request.build_absolute_uri(settings.MEDIA_URL + rel_path)) if not data_list: return JsonResponse({"status": "error", "message": "无法识别图片内容"}, status=400) @@ -687,12 +768,18 @@ def confirm(request): try: object_name = f"images/{webp_name}" - from minio_storage.minio_connect import upload_file - upload_file(webp_abs, object_name, content_type="image/webp") - image_refs.append(f"minio:{object_name}") - temp_files_to_delete.extend([src_abs, webp_abs]) + from minio_storage.minio_connect import upload_file, is_minio_configured + if is_minio_configured(): + upload_file(webp_abs, object_name, content_type="image/webp") + image_refs.append(f"minio:{object_name}") + temp_files_to_delete.extend([src_abs, webp_abs]) + else: + # Fallback to local storage + image_refs.append(f"local:{object_name}") + # In local case, we keep the webp file and only delete the original temporary file + temp_files_to_delete.append(src_abs) except Exception as e: - return JsonResponse({"status": "error", "message": f"上传到MinIO失败: {e}"}, status=500) + return JsonResponse({"status": "error", "message": f"存储图片失败: {e}"}, status=500) if len(image_refs) == 1: image_ref_to_store = image_refs[0] elif len(image_refs) > 1: @@ -750,21 +837,10 @@ def manage_page(request): raw_results.append(r) break - results = [] - for r in raw_results: - try: - r_data = string_to_json(r.get("data", "{}")) - r_data["_id"] = r["id"] - r_data["_image"] = r.get("image", "") - results.append(r_data) - except Exception: - pass - return render( request, "elastic/manage.html", { - "results": results, "is_admin": is_admin, "user_id": session_user_id, "username": me.get("username"), diff --git a/minio_storage/minio_connect.py b/minio_storage/minio_connect.py index 650fddf..a71555b 100644 --- a/minio_storage/minio_connect.py +++ b/minio_storage/minio_connect.py @@ -63,6 +63,10 @@ def get_minio_client() -> Minio | None: ) +def is_minio_configured() -> bool: + return get_minio_client() is not None + + def get_bucket_name() -> str: return _get_env('MINIO_BUCKET', default='achievement') or 'achievement' diff --git a/requirements.txt b/requirements.txt index de05d27..3ec8791 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ whitenoise==6.6.0 django-browser-reload==1.21.0 captcha==0.7.1 cryptography==46.0.3 +pymupdf==1.25.3