能上传并识别PDF

This commit is contained in:
DSQ
2026-03-12 20:05:48 +08:00
parent 109c06e1d9
commit d69858434f
5 changed files with 205 additions and 85 deletions

View File

@@ -16,8 +16,10 @@
table{width:100%;border-collapse:collapse;margin-top:20px} table{width:100%;border-collapse:collapse;margin-top:20px}
th,td{border-bottom:1px solid #eee;padding:12px 8px;text-align:left;vertical-align:top} th,td{border-bottom:1px solid #eee;padding:12px 8px;text-align:left;vertical-align:top}
th{background:#f8f9fa;font-weight:600} th{background:#f8f9fa;font-weight:600}
.inner-table { width: 100%; margin: 0; border: 1px solid #e0e0e0; border-collapse: collapse; } .inner-table { width: 100%; margin: 0; border: 1px solid #e0e0e0; border-collapse: collapse; table-layout: fixed; }
.inner-table th, .inner-table td { border: 1px solid #e0e0e0; padding: 6px; font-size: 12px; } .inner-table th, .inner-table td { border: 1px solid #e0e0e0; padding: 8px; font-size: 13px; word-break: break-all; }
.inner-table td:first-child { width: 30%; background-color: #f8fafc; font-weight: 600; color: #475569; }
.inner-table td:last-child { width: 70%; background-color: #fff; }
.inner-table th { background-color: #f9f9f9; } .inner-table th { background-color: #f9f9f9; }
img{max-width:120px;border:1px solid #eee;border-radius:6px;cursor:pointer} img{max-width:120px;border:1px solid #eee;border-radius:6px;cursor:pointer}
.btn{padding:6px 10px;border:none;border-radius:6px;cursor:pointer;font-size:14px;margin:2px} .btn{padding:6px 10px;border:none;border-radius:6px;cursor:pointer;font-size:14px;margin:2px}
@@ -320,8 +322,15 @@ function renderTable(data) {
// 解析data字段如果是JSON字符串则格式化显示 // 解析data字段如果是JSON字符串则格式化显示
let displayData = item.data || ''; let displayData = item.data || '';
let parsed = null;
try { try {
const parsed = JSON.parse(item.data); if (typeof displayData === 'object' && displayData !== null) {
parsed = displayData;
} else if (typeof displayData === 'string') {
parsed = JSON.parse(displayData);
}
if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
displayData = ` displayData = `
<table class="inner-table"> <table class="inner-table">
<tbody> <tbody>
@@ -334,13 +343,16 @@ function renderTable(data) {
</tbody> </tbody>
</table> </table>
`; `;
} else {
throw new Error('Not a valid JSON object');
}
} catch (e) { } catch (e) {
displayData = ` displayData = `
<table class="inner-table"> <table class="inner-table">
<tbody> <tbody>
<tr> <tr>
<td>原始数据</td> <td>原始数据</td>
<td>${escapeHtml(displayData)}</td> <td>${escapeHtml(typeof displayData === 'object' ? JSON.stringify(displayData) : displayData)}</td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
@@ -423,16 +435,24 @@ function createRow(k = '', v = '') {
} }
function renderForm(obj){ function renderForm(obj){
kvForm.innerHTML=''; kvForm.innerHTML=`
<div style="display: grid; grid-template-columns: 1fr 1fr auto; gap: 8px; margin-bottom: 8px; font-weight: 600; color: #475569; font-size: 14px;">
<div>字段名</div>
<div>字段值</div>
<div>操作</div>
</div>
`;
Object.keys(obj||{}).forEach(k=> kvForm.appendChild(createRow(k, obj[k]))); Object.keys(obj||{}).forEach(k=> kvForm.appendChild(createRow(k, obj[k])));
if (!kvForm.children.length) kvForm.appendChild(createRow()); if (kvForm.querySelectorAll('div[style*="grid"]').length <= 1) kvForm.appendChild(createRow());
syncTextarea(); syncTextarea();
} }
function formToObject(){ function formToObject(){
const o={}; const o={};
Array.from(kvForm.children).forEach(row=>{ Array.from(kvForm.children).forEach((row, index)=>{
if (index === 0) return; // 跳过表头
const [kI,vI] = row.querySelectorAll('input'); const [kI,vI] = row.querySelectorAll('input');
if (!kI || !vI) return;
const k=(kI.value||'').trim(); if(!k) return; const k=(kI.value||'').trim(); if(!k) return;
const raw=vI.value; const raw=vI.value;
try{ try{

View File

@@ -49,9 +49,10 @@
.result-box {flex: 1;} .result-box {flex: 1;}
.result-box h3 { margin-top: 0; color: #334155;} .result-box h3 { margin-top: 0; color: #334155;}
.form-controls { display: flex;gap: 8px;margin-bottom: 12px;flex-wrap: wrap;} .form-controls { display: flex;gap: 8px;margin-bottom: 12px;flex-wrap: wrap;}
#kvForm {border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; max-height: 300px; overflow: auto;margin-bottom: 12px;background: white;} #kvForm {border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; max-height: 400px; overflow: auto;margin-bottom: 12px;background: #f8fafc;}
.form-row {display: grid;grid-template-columns: 1fr 1fr auto;gap: 8px; margin-bottom: 6px; } .form-header { display: grid; grid-template-columns: 1fr 1fr auto; gap: 8px; margin-bottom: 8px; padding: 0 4px; font-weight: 600; color: #475569; font-size: 14px;}
.form-row input {padding: 8px;border: 1px solid #cbd5e1;border-radius: 4px;} .form-row {display: grid;grid-template-columns: 1fr 1fr auto;gap: 8px; margin-bottom: 6px; align-items: center;}
.form-row input {padding: 8px;border: 1px solid #cbd5e1;border-radius: 4px; width: 100%; box-sizing: border-box;}
#resultBox { width: 100%;min-height: 200px;font-family: ui-monospace, SFMono-Regular, Menlo, monospace;font-size: 14px; padding: 12px; border: 1px solid #e2e8f0; #resultBox { width: 100%;min-height: 200px;font-family: ui-monospace, SFMono-Regular, Menlo, monospace;font-size: 14px; padding: 12px; border: 1px solid #e2e8f0;
border-radius: 8px; resize: vertical;box-sizing: border-box; } border-radius: 8px; resize: vertical;box-sizing: border-box; }
.status-message { padding: 10px; margin: 10px 0; border-radius: 6px; display: none; } .status-message { padding: 10px; margin: 10px 0; border-radius: 6px; display: none; }
@@ -83,17 +84,17 @@
<div class="container"> <div class="container">
<div class="header"> <div class="header">
<div> <div>
<h2>图片上传识别</h2> <h2>图片与PDF上传识别</h2>
<p>选择图片后上传,服务端调用大模型解析为可编辑的 JSON再确认入库。</p> <p>选择图片或PDF文件后上传,服务端调用大模型解析为可编辑的 JSON再确认入库。</p>
</div> </div>
</div> </div>
<div class="upload-section" id="dropArea"> <div class="upload-section" id="dropArea">
<h3>上传图片</h3> <h3>上传文件</h3>
<p>点击下方按钮选择图片,或拖拽图片到此区域</p> <p>点击下方按钮选择图片或PDF文件,或拖拽文件到此区域</p>
<form id="uploadForm" enctype="multipart/form-data"> <form id="uploadForm" enctype="multipart/form-data">
{% csrf_token %} {% csrf_token %}
<input type="file" id="fileInput" name="file" accept="image/*" multiple /> <input type="file" id="fileInput" name="file" accept="image/*,.pdf" multiple />
<span id="fileHint" class="muted"></span> <span id="fileHint" class="muted"></span>
<br> <br>
<button type="submit" class="btn btn-primary">上传并识别</button> <button type="submit" class="btn btn-primary">上传并识别</button>
@@ -268,7 +269,7 @@ function updateFileHint() {
} }
function addFiles(files) { function addFiles(files) {
const incoming = Array.from(files || []).filter(f => f && f.type.startsWith('image/')); const incoming = Array.from(files || []).filter(f => f && (f.type.startsWith('image/') || f.name.toLowerCase().endsWith('.pdf')));
const existingKeys = new Set(selectedFiles.map(f => `${f.name}|${f.size}|${f.lastModified}`)); const existingKeys = new Set(selectedFiles.map(f => `${f.name}|${f.size}|${f.lastModified}`));
incoming.forEach(f => { incoming.forEach(f => {
const key = `${f.name}|${f.size}|${f.lastModified}`; const key = `${f.name}|${f.size}|${f.lastModified}`;
@@ -277,10 +278,20 @@ function addFiles(files) {
selectedFiles.push(f); selectedFiles.push(f);
} }
}); });
const urls = selectedFiles.map(f => URL.createObjectURL(f)); const urls = selectedFiles.map(f => {
if (f.name.toLowerCase().endsWith('.pdf')) {
// 使用一个简单的 SVG PDF 图标 Data URI
return 'data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSI0OCIgaGVpZ2h0PSI0OCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9IiNlZjQ0NDQiIHN0cm9rZS13aWR0aD0iMiIgc3Ryb2tlLWxpbmVjYXA9InJvdW5kIiBzdHJva2UtbGluZWpvaW49InJvdW5kIj48cGF0aCBkPSJNMTQgMmgyYTIgMiAwIDAgMSAyIDJ2MTZhMiAyIDAgMCAxLTIgMmgtMTJhMiAyIDAgMCAxLTItMlY0YTIgMiAwIDAgMSAyLTJoMiIvPjxwYXRoIGQ9Ik0xNCAydjRjMCAxLjEgLjkgMiAyIDJoNCIvPjxwYXRoIGQ9Ik03IDloNSIvPjxwYXRoIGQ9Ik03IDEzaDUiLz48cGF0aCBkPSJNNyAxN2g4Ii8+PC9zdmc+';
}
return URL.createObjectURL(f);
});
setPreviewList(urls); setPreviewList(urls);
updateFileHint(); updateFileHint();
setTimeout(() => urls.forEach(u => URL.revokeObjectURL(u)), 0); setTimeout(() => {
urls.forEach(u => {
if (u.startsWith('blob:')) URL.revokeObjectURL(u);
});
}, 0);
} }
fileInput.addEventListener('change', function(e) { fileInput.addEventListener('change', function(e) {
@@ -304,7 +315,7 @@ function createRow(k = '', v = '') {
delBtn.className = 'btn btn-danger'; delBtn.className = 'btn btn-danger';
delBtn.textContent = '删除'; delBtn.textContent = '删除';
delBtn.onclick = () => { delBtn.onclick = () => {
if (kvForm.children.length > 1) { if (kvForm.querySelectorAll('.form-row').length > 1) {
kvForm.removeChild(row); kvForm.removeChild(row);
} else { } else {
keyInput.value = ''; keyInput.value = '';
@@ -321,17 +332,24 @@ function createRow(k = '', v = '') {
} }
function renderFormFromObject(obj) { function renderFormFromObject(obj) {
kvForm.innerHTML = ''; kvForm.innerHTML = `
<div class="form-header">
<div>字段名</div>
<div>字段值</div>
<div>操作</div>
</div>
`;
Object.keys(obj || {}).forEach(k => { Object.keys(obj || {}).forEach(k => {
kvForm.appendChild(createRow(k, obj[k])); kvForm.appendChild(createRow(k, obj[k]));
}); });
if (!kvForm.children.length) kvForm.appendChild(createRow()); if (kvForm.children.length <= 1) kvForm.appendChild(createRow());
syncTextarea(); syncTextarea();
} }
function objectFromForm() { function objectFromForm() {
const obj = {}; const obj = {};
Array.from(kvForm.children).forEach(row => { Array.from(kvForm.children).forEach(row => {
if (row.classList.contains('form-header')) return;
const [kInput, vInput] = row.querySelectorAll('input'); const [kInput, vInput] = row.querySelectorAll('input');
const k = (kInput.value || '').trim(); const k = (kInput.value || '').trim();
if (!k) return; if (!k) return;
@@ -380,31 +398,32 @@ uploadForm.addEventListener('submit', async (e) => {
resultBox.value = ''; resultBox.value = '';
currentImageRel = []; currentImageRel = [];
const files = Array.from(selectedFiles || []).filter(f => f && f.type.startsWith('image/')); const files = Array.from(selectedFiles || []).filter(f => f && (f.type.startsWith('image/') || f.name.toLowerCase().endsWith('.pdf')));
if (!files.length) { if (!files.length) {
uploadMsg.textContent = '请选择图片文件'; uploadMsg.textContent = '请选择图片或PDF文件';
uploadMsg.className = 'status-message error'; uploadMsg.className = 'status-message error';
uploadMsg.style.display = 'block'; uploadMsg.style.display = 'block';
return; return;
} }
showProgress(); showProgress();
setProgress(5, '转换为JPG'); setProgress(5, '预处理中');
const formData = new FormData(); const formData = new FormData();
const converted = [];
for (let i = 0; i < files.length; i++) { for (let i = 0; i < files.length; i++) {
const file = files[i]; const file = files[i];
let jpegFile = file; if (file.type.startsWith('image/')) {
setProgress(5 + Math.round((i/files.length)*45), '转换图片');
try { try {
jpegFile = await convertToJpeg(file); const jpegFile = await convertToJpeg(file);
formData.append('file', jpegFile);
} catch (_) { } catch (_) {
jpegFile = file; formData.append('file', file);
}
} else {
// PDF 直接添加
formData.append('file', file);
} }
converted.push(jpegFile);
const pct = 5 + Math.round(((i + 1) / files.length) * 45);
setProgress(pct, '转换为JPG');
} }
converted.forEach(f => formData.append('file', f));
try { try {
let prog = 50; let prog = 50;

View File

@@ -6,6 +6,8 @@ import re
import uuid import uuid
import base64 import base64
import json import json
import tempfile
import concurrent.futures
from django.conf import settings from django.conf import settings
from django.http import JsonResponse from django.http import JsonResponse
from django.shortcuts import render from django.shortcuts import render
@@ -21,6 +23,19 @@ from .es_connect import (
analytics_recent as es_analytics_recent, analytics_recent as es_analytics_recent,
) )
from PIL import Image from PIL import Image
try:
import fitz # PyMuPDF
HAS_PDF_SUPPORT = True
PDF_ERROR = ""
except ImportError as e:
try:
import pymupdf
fitz = pymupdf
HAS_PDF_SUPPORT = True
PDF_ERROR = ""
except ImportError:
HAS_PDF_SUPPORT = False
PDF_ERROR = str(e)
def _filter_results_for_user(request, results): def _filter_results_for_user(request, results):
@@ -58,9 +73,7 @@ def _image_ref_to_url(request, image_ref: str) -> str:
if not s: if not s:
return '' return ''
if not s.startswith('minio:'): if s.startswith('minio:'):
return ''
object_name = s[len('minio:'):].lstrip('/') object_name = s[len('minio:'):].lstrip('/')
if not object_name: if not object_name:
return '' return ''
@@ -71,6 +84,14 @@ def _image_ref_to_url(request, image_ref: str) -> str:
except Exception: except Exception:
return '' return ''
if s.startswith('local:'):
rel_path = s[len('local:'):].lstrip('/')
if not rel_path:
return ''
return request.build_absolute_uri(settings.MEDIA_URL + rel_path)
return ''
def _parse_image_refs(image_ref): def _parse_image_refs(image_ref):
if not image_ref: if not image_ref:
@@ -429,8 +450,16 @@ def delete_user_by_id_view(request, user_id):
return JsonResponse({"status": "success", "message": "用户删除成功"}) return JsonResponse({"status": "success", "message": "用户删除成功"})
# 辅助JSON 转换(兼容 a.py 行为) # 辅助JSON 转换
def json_to_string(obj): def json_to_string(obj):
if isinstance(obj, str):
# 如果已经是字符串,尝试解析一下,验证是否为有效 JSON,或者是普通文本
try:
json.loads(obj)
return obj # 是有效 JSON 字符串,直接返回
except Exception:
# 不是 JSON 字符串,可能是普通文本,将其封装成 JSON
return json.dumps(obj, ensure_ascii=False)
try: try:
return json.dumps(obj, ensure_ascii=False) return json.dumps(obj, ensure_ascii=False)
except Exception: except Exception:
@@ -581,18 +610,70 @@ def upload(request):
rel_paths = [] rel_paths = []
image_urls = [] image_urls = []
data_list = [] data_list = []
for file in files:
filename = f"{uuid.uuid4()}_{file.name}" # 预处理文件列表处理PDF转换
processed_files = []
for f in files:
if f.name.lower().endswith('.pdf'):
if not HAS_PDF_SUPPORT:
return JsonResponse({"status": "error", "message": f"服务器未安装PDF处理组件(PyMuPDF): {PDF_ERROR}"}, status=500)
# 将PDF保存到临时文件
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
for chunk in f.chunks():
tmp.write(chunk)
tmp_path = tmp.name
try:
# 转换PDF为图片列表
doc = fitz.open(tmp_path)
for i in range(len(doc)):
page = doc.load_page(i)
# 降低 DPI 从 200 到 150,在保证准确率的同时显著减小图片体积,加快上传速度
pix = page.get_pixmap(dpi=150)
img_filename = f"{uuid.uuid4()}_page_{i+1}.jpg"
img_abs_path = os.path.join(images_dir, img_filename)
pix.save(img_abs_path)
processed_files.append((img_abs_path, img_filename, f"{f.name}_p{i+1}"))
doc.close()
except Exception as e:
return JsonResponse({"status": "error", "message": f"PDF转换失败: {str(e)}"}, status=500)
finally:
if os.path.exists(tmp_path):
os.remove(tmp_path)
else:
# 普通图片处理
filename = f"{uuid.uuid4()}_{f.name}"
abs_path = os.path.join(images_dir, filename) abs_path = os.path.join(images_dir, filename)
with open(abs_path, "wb") as dst: with open(abs_path, "wb") as dst:
for chunk in file.chunks(): for chunk in f.chunks():
dst.write(chunk) dst.write(chunk)
processed_files.append((abs_path, filename, f.name))
# 使用线程池并行调用 OCR 接口以提升速度
def run_ocr(file_info):
abs_path, filename, original_name = file_info
try: try:
data = ocr_and_extract_info(abs_path) data = ocr_and_extract_info(abs_path)
return (data, filename)
except Exception as e: except Exception as e:
return JsonResponse({"status": "error", "message": str(e)}, status=500) return (e, filename)
if data:
data_list.append(data) rel_paths = []
image_urls = []
data_list = []
# 限制最大线程数为 8,避免过多并发导致 API 限制或资源耗尽
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
future_to_file = {executor.submit(run_ocr, f_info): f_info for f_info in processed_files}
for future in concurrent.futures.as_completed(future_to_file):
result, filename = future.result()
if isinstance(result, Exception):
return JsonResponse({"status": "error", "message": f"文件 {filename} 识别失败: {str(result)}"}, status=500)
if result:
data_list.append(result)
rel_path = f"images/{filename}" rel_path = f"images/{filename}"
rel_paths.append(rel_path) rel_paths.append(rel_path)
image_urls.append(request.build_absolute_uri(settings.MEDIA_URL + rel_path)) image_urls.append(request.build_absolute_uri(settings.MEDIA_URL + rel_path))
@@ -687,12 +768,18 @@ def confirm(request):
try: try:
object_name = f"images/{webp_name}" object_name = f"images/{webp_name}"
from minio_storage.minio_connect import upload_file from minio_storage.minio_connect import upload_file, is_minio_configured
if is_minio_configured():
upload_file(webp_abs, object_name, content_type="image/webp") upload_file(webp_abs, object_name, content_type="image/webp")
image_refs.append(f"minio:{object_name}") image_refs.append(f"minio:{object_name}")
temp_files_to_delete.extend([src_abs, webp_abs]) temp_files_to_delete.extend([src_abs, webp_abs])
else:
# Fallback to local storage
image_refs.append(f"local:{object_name}")
# In local case, we keep the webp file and only delete the original temporary file
temp_files_to_delete.append(src_abs)
except Exception as e: except Exception as e:
return JsonResponse({"status": "error", "message": f"上传到MinIO失败: {e}"}, status=500) return JsonResponse({"status": "error", "message": f"存储图片失败: {e}"}, status=500)
if len(image_refs) == 1: if len(image_refs) == 1:
image_ref_to_store = image_refs[0] image_ref_to_store = image_refs[0]
elif len(image_refs) > 1: elif len(image_refs) > 1:
@@ -750,21 +837,10 @@ def manage_page(request):
raw_results.append(r) raw_results.append(r)
break break
results = []
for r in raw_results:
try:
r_data = string_to_json(r.get("data", "{}"))
r_data["_id"] = r["id"]
r_data["_image"] = r.get("image", "")
results.append(r_data)
except Exception:
pass
return render( return render(
request, request,
"elastic/manage.html", "elastic/manage.html",
{ {
"results": results,
"is_admin": is_admin, "is_admin": is_admin,
"user_id": session_user_id, "user_id": session_user_id,
"username": me.get("username"), "username": me.get("username"),

View File

@@ -63,6 +63,10 @@ def get_minio_client() -> Minio | None:
) )
def is_minio_configured() -> bool:
    """Report whether a MinIO client can be obtained.

    Returns:
        True when ``get_minio_client()`` yields a client object, False when
        it yields ``None``.
    """
    client = get_minio_client()
    return client is not None
def get_bucket_name() -> str: def get_bucket_name() -> str:
return _get_env('MINIO_BUCKET', default='achievement') or 'achievement' return _get_env('MINIO_BUCKET', default='achievement') or 'achievement'

View File

@@ -14,3 +14,4 @@ whitenoise==6.6.0
django-browser-reload==1.21.0 django-browser-reload==1.21.0
captcha==0.7.1 captcha==0.7.1
cryptography==46.0.3 cryptography==46.0.3
pymupdf==1.25.3