能上传并识别PDF

This commit is contained in:
DSQ
2026-03-12 20:05:48 +08:00
parent 109c06e1d9
commit d69858434f
5 changed files with 205 additions and 85 deletions

View File

@@ -16,8 +16,10 @@
table{width:100%;border-collapse:collapse;margin-top:20px}
th,td{border-bottom:1px solid #eee;padding:12px 8px;text-align:left;vertical-align:top}
th{background:#f8f9fa;font-weight:600}
.inner-table { width: 100%; margin: 0; border: 1px solid #e0e0e0; border-collapse: collapse; }
.inner-table th, .inner-table td { border: 1px solid #e0e0e0; padding: 6px; font-size: 12px; }
.inner-table { width: 100%; margin: 0; border: 1px solid #e0e0e0; border-collapse: collapse; table-layout: fixed; }
.inner-table th, .inner-table td { border: 1px solid #e0e0e0; padding: 8px; font-size: 13px; word-break: break-all; }
.inner-table td:first-child { width: 30%; background-color: #f8fafc; font-weight: 600; color: #475569; }
.inner-table td:last-child { width: 70%; background-color: #fff; }
.inner-table th { background-color: #f9f9f9; }
img{max-width:120px;border:1px solid #eee;border-radius:6px;cursor:pointer}
.btn{padding:6px 10px;border:none;border-radius:6px;cursor:pointer;font-size:14px;margin:2px}
@@ -320,8 +322,15 @@ function renderTable(data) {
// 解析data字段如果是JSON字符串则格式化显示
let displayData = item.data || '';
let parsed = null;
try {
const parsed = JSON.parse(item.data);
if (typeof displayData === 'object' && displayData !== null) {
parsed = displayData;
} else if (typeof displayData === 'string') {
parsed = JSON.parse(displayData);
}
if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
displayData = `
<table class="inner-table">
<tbody>
@@ -334,13 +343,16 @@ function renderTable(data) {
</tbody>
</table>
`;
} else {
throw new Error('Not a valid JSON object');
}
} catch (e) {
displayData = `
<table class="inner-table">
<tbody>
<tr>
<td>原始数据</td>
<td>${escapeHtml(displayData)}</td>
<td>${escapeHtml(typeof displayData === 'object' ? JSON.stringify(displayData) : displayData)}</td>
</tr>
</tbody>
</table>
@@ -423,16 +435,24 @@ function createRow(k = '', v = '') {
}
/**
 * Rebuild the key/value editor inside `kvForm` from a plain object.
 *
 * Writes a three-column header row (field name / field value / actions),
 * appends one row per own enumerable key of `obj` using `createRow`, makes
 * sure an editable row exists, and finally calls `syncTextarea()`.
 *
 * @param {Object|null|undefined} obj - Source of the key/value pairs; a
 *   null or undefined value is treated as an empty object.
 */
function renderForm(obj){
// NOTE(review): this reset is overwritten by the assignment directly below.
kvForm.innerHTML='';
// Header row; its inline grid style is what the selector further down matches.
kvForm.innerHTML=`
<div style="display: grid; grid-template-columns: 1fr 1fr auto; gap: 8px; margin-bottom: 8px; font-weight: 600; color: #475569; font-size: 14px;">
<div>字段名</div>
<div>字段值</div>
<div>操作</div>
</div>
`;
Object.keys(obj||{}).forEach(k=> kvForm.appendChild(createRow(k, obj[k])));
// NOTE(review): the header element is always a child at this point, so this
// condition looks unreachable.
if (!kvForm.children.length) kvForm.appendChild(createRow());
// Counts descendant <div>s whose inline style contains "grid"; the header is
// one of them. Whether rows built by createRow also match depends on
// createRow, which is not visible here - TODO confirm.
if (kvForm.querySelectorAll('div[style*="grid"]').length <= 1) kvForm.appendChild(createRow());
// Defined elsewhere; presumably mirrors the form state into the JSON textarea.
syncTextarea();
}
function formToObject(){
const o={};
Array.from(kvForm.children).forEach(row=>{
Array.from(kvForm.children).forEach((row, index)=>{
if (index === 0) return; // 跳过表头
const [kI,vI] = row.querySelectorAll('input');
if (!kI || !vI) return;
const k=(kI.value||'').trim(); if(!k) return;
const raw=vI.value;
try{

View File

@@ -49,9 +49,10 @@
.result-box {flex: 1;}
.result-box h3 { margin-top: 0; color: #334155;}
.form-controls { display: flex;gap: 8px;margin-bottom: 12px;flex-wrap: wrap;}
#kvForm {border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; max-height: 300px; overflow: auto;margin-bottom: 12px;background: white;}
.form-row {display: grid;grid-template-columns: 1fr 1fr auto;gap: 8px; margin-bottom: 6px; }
.form-row input {padding: 8px;border: 1px solid #cbd5e1;border-radius: 4px;}
#kvForm {border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; max-height: 400px; overflow: auto;margin-bottom: 12px;background: #f8fafc;}
.form-header { display: grid; grid-template-columns: 1fr 1fr auto; gap: 8px; margin-bottom: 8px; padding: 0 4px; font-weight: 600; color: #475569; font-size: 14px;}
.form-row {display: grid;grid-template-columns: 1fr 1fr auto;gap: 8px; margin-bottom: 6px; align-items: center;}
.form-row input {padding: 8px;border: 1px solid #cbd5e1;border-radius: 4px; width: 100%; box-sizing: border-box;}
#resultBox { width: 100%;min-height: 200px;font-family: ui-monospace, SFMono-Regular, Menlo, monospace;font-size: 14px; padding: 12px; border: 1px solid #e2e8f0;
border-radius: 8px; resize: vertical;box-sizing: border-box; }
.status-message { padding: 10px; margin: 10px 0; border-radius: 6px; display: none; }
@@ -83,17 +84,17 @@
<div class="container">
<div class="header">
<div>
<h2>图片上传识别</h2>
<p>选择图片后上传,服务端调用大模型解析为可编辑的 JSON再确认入库。</p>
<h2>图片与PDF上传识别</h2>
<p>选择图片或PDF文件后上传,服务端调用大模型解析为可编辑的 JSON再确认入库。</p>
</div>
</div>
<div class="upload-section" id="dropArea">
<h3>上传图片</h3>
<p>点击下方按钮选择图片,或拖拽图片到此区域</p>
<h3>上传文件</h3>
<p>点击下方按钮选择图片或PDF文件,或拖拽文件到此区域</p>
<form id="uploadForm" enctype="multipart/form-data">
{% csrf_token %}
<input type="file" id="fileInput" name="file" accept="image/*" multiple />
<input type="file" id="fileInput" name="file" accept="image/*,.pdf" multiple />
<span id="fileHint" class="muted"></span>
<br>
<button type="submit" class="btn btn-primary">上传并识别</button>
@@ -268,7 +269,7 @@ function updateFileHint() {
}
function addFiles(files) {
const incoming = Array.from(files || []).filter(f => f && f.type.startsWith('image/'));
const incoming = Array.from(files || []).filter(f => f && (f.type.startsWith('image/') || f.name.toLowerCase().endsWith('.pdf')));
const existingKeys = new Set(selectedFiles.map(f => `${f.name}|${f.size}|${f.lastModified}`));
incoming.forEach(f => {
const key = `${f.name}|${f.size}|${f.lastModified}`;
@@ -277,10 +278,20 @@ function addFiles(files) {
selectedFiles.push(f);
}
});
const urls = selectedFiles.map(f => URL.createObjectURL(f));
const urls = selectedFiles.map(f => {
if (f.name.toLowerCase().endsWith('.pdf')) {
// 使用一个简单的 SVG PDF 图标 Data URI
return 'data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSI0OCIgaGVpZ2h0PSI0OCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9IiNlZjQ0NDQiIHN0cm9rZS13aWR0aD0iMiIgc3Ryb2tlLWxpbmVjYXA9InJvdW5kIiBzdHJva2UtbGluZWpvaW49InJvdW5kIj48cGF0aCBkPSJNMTQgMmgyYTIgMiAwIDAgMSAyIDJ2MTZhMiAyIDAgMCAxLTIgMmgtMTJhMiAyIDAgMCAxLTItMlY0YTIgMiAwIDAgMSAyLTJoMiIvPjxwYXRoIGQ9Ik0xNCAydjRjMCAxLjEgLjkgMiAyIDJoNCIvPjxwYXRoIGQ9Ik03IDloNSIvPjxwYXRoIGQ9Ik03IDEzaDUiLz48cGF0aCBkPSJNNyAxN2g4Ii8+PC9zdmc+';
}
return URL.createObjectURL(f);
});
setPreviewList(urls);
updateFileHint();
setTimeout(() => urls.forEach(u => URL.revokeObjectURL(u)), 0);
setTimeout(() => {
urls.forEach(u => {
if (u.startsWith('blob:')) URL.revokeObjectURL(u);
});
}, 0);
}
fileInput.addEventListener('change', function(e) {
@@ -304,7 +315,7 @@ function createRow(k = '', v = '') {
delBtn.className = 'btn btn-danger';
delBtn.textContent = '删除';
delBtn.onclick = () => {
if (kvForm.children.length > 1) {
if (kvForm.querySelectorAll('.form-row').length > 1) {
kvForm.removeChild(row);
} else {
keyInput.value = '';
@@ -321,17 +332,24 @@ function createRow(k = '', v = '') {
}
/**
 * Rebuild the key/value editor inside `kvForm` from a plain object.
 *
 * Writes a `.form-header` row (field name / field value / actions), appends
 * one row per own enumerable key of `obj` using `createRow`, guarantees that
 * at least one row follows the header, and finally calls `syncTextarea()`.
 *
 * @param {Object|null|undefined} obj - Source of the key/value pairs; a
 *   null or undefined value is treated as an empty object.
 */
function renderFormFromObject(obj) {
// NOTE(review): this reset is overwritten by the assignment directly below.
kvForm.innerHTML = '';
// Header row; it is always the first child of kvForm after this assignment.
kvForm.innerHTML = `
<div class="form-header">
<div>字段名</div>
<div>字段值</div>
<div>操作</div>
</div>
`;
Object.keys(obj || {}).forEach(k => {
kvForm.appendChild(createRow(k, obj[k]));
});
// NOTE(review): the header element is always a child at this point, so this
// condition looks unreachable.
if (!kvForm.children.length) kvForm.appendChild(createRow());
// Only the header is present (no data rows were appended): add one empty row.
if (kvForm.children.length <= 1) kvForm.appendChild(createRow());
// Defined elsewhere; presumably mirrors the form state into the JSON textarea.
syncTextarea();
}
function objectFromForm() {
const obj = {};
Array.from(kvForm.children).forEach(row => {
if (row.classList.contains('form-header')) return;
const [kInput, vInput] = row.querySelectorAll('input');
const k = (kInput.value || '').trim();
if (!k) return;
@@ -380,31 +398,32 @@ uploadForm.addEventListener('submit', async (e) => {
resultBox.value = '';
currentImageRel = [];
const files = Array.from(selectedFiles || []).filter(f => f && f.type.startsWith('image/'));
const files = Array.from(selectedFiles || []).filter(f => f && (f.type.startsWith('image/') || f.name.toLowerCase().endsWith('.pdf')));
if (!files.length) {
uploadMsg.textContent = '请选择图片文件';
uploadMsg.textContent = '请选择图片或PDF文件';
uploadMsg.className = 'status-message error';
uploadMsg.style.display = 'block';
return;
}
showProgress();
setProgress(5, '转换为JPG');
setProgress(5, '预处理中');
const formData = new FormData();
const converted = [];
for (let i = 0; i < files.length; i++) {
const file = files[i];
let jpegFile = file;
if (file.type.startsWith('image/')) {
setProgress(5 + Math.round((i/files.length)*45), '转换图片');
try {
jpegFile = await convertToJpeg(file);
const jpegFile = await convertToJpeg(file);
formData.append('file', jpegFile);
} catch (_) {
jpegFile = file;
formData.append('file', file);
}
} else {
// PDF 直接添加
formData.append('file', file);
}
converted.push(jpegFile);
const pct = 5 + Math.round(((i + 1) / files.length) * 45);
setProgress(pct, '转换为JPG');
}
converted.forEach(f => formData.append('file', f));
try {
let prog = 50;

View File

@@ -6,6 +6,8 @@ import re
import uuid
import base64
import json
import tempfile
import concurrent.futures
from django.conf import settings
from django.http import JsonResponse
from django.shortcuts import render
@@ -21,6 +23,19 @@ from .es_connect import (
analytics_recent as es_analytics_recent,
)
from PIL import Image
# Optional PDF support: bind the PyMuPDF module to the name `fitz`.
# The legacy module name `fitz` is tried first, then `pymupdf`.
# HAS_PDF_SUPPORT records whether either import worked; PDF_ERROR carries the
# failure text and is the empty string on success. Both are read by the upload
# view before it attempts a PDF conversion.
try:
import fitz # PyMuPDF
HAS_PDF_SUPPORT = True
PDF_ERROR = ""
except ImportError as e:
# `fitz` is not importable; try the `pymupdf` module name instead.
try:
import pymupdf
fitz = pymupdf
HAS_PDF_SUPPORT = True
PDF_ERROR = ""
except ImportError:
HAS_PDF_SUPPORT = False
# `e` is the error raised by `import fitz` above, not the one raised by
# `import pymupdf`, so the stored message names the first failed import.
PDF_ERROR = str(e)
def _filter_results_for_user(request, results):
@@ -58,9 +73,7 @@ def _image_ref_to_url(request, image_ref: str) -> str:
if not s:
return ''
if not s.startswith('minio:'):
return ''
if s.startswith('minio:'):
object_name = s[len('minio:'):].lstrip('/')
if not object_name:
return ''
@@ -71,6 +84,14 @@ def _image_ref_to_url(request, image_ref: str) -> str:
except Exception:
return ''
if s.startswith('local:'):
rel_path = s[len('local:'):].lstrip('/')
if not rel_path:
return ''
return request.build_absolute_uri(settings.MEDIA_URL + rel_path)
return ''
def _parse_image_refs(image_ref):
if not image_ref:
@@ -429,8 +450,16 @@ def delete_user_by_id_view(request, user_id):
return JsonResponse({"status": "success", "message": "用户删除成功"})
# 辅助JSON 转换(兼容 a.py 行为)
# 辅助JSON 转换
def json_to_string(obj):
if isinstance(obj, str):
# 如果已经是字符串,尝试解析一下验证是否为有效 JSON或者是普通文本
try:
json.loads(obj)
return obj # 是有效 JSON 字符串,直接返回
except Exception:
# 不是 JSON 字符串,可能是普通文本,将其封装成 JSON
return json.dumps(obj, ensure_ascii=False)
try:
return json.dumps(obj, ensure_ascii=False)
except Exception:
@@ -581,18 +610,70 @@ def upload(request):
rel_paths = []
image_urls = []
data_list = []
for file in files:
filename = f"{uuid.uuid4()}_{file.name}"
# 预处理文件列表处理PDF转换
processed_files = []
for f in files:
if f.name.lower().endswith('.pdf'):
if not HAS_PDF_SUPPORT:
return JsonResponse({"status": "error", "message": f"服务器未安装PDF处理组件(PyMuPDF): {PDF_ERROR}"}, status=500)
# 将PDF保存到临时文件
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
for chunk in f.chunks():
tmp.write(chunk)
tmp_path = tmp.name
try:
# 转换PDF为图片列表
doc = fitz.open(tmp_path)
for i in range(len(doc)):
page = doc.load_page(i)
# 降低 DPI 从 200 到 150在保证准确率的同时显著减小图片体积加快上传速度
pix = page.get_pixmap(dpi=150)
img_filename = f"{uuid.uuid4()}_page_{i+1}.jpg"
img_abs_path = os.path.join(images_dir, img_filename)
pix.save(img_abs_path)
processed_files.append((img_abs_path, img_filename, f"{f.name}_p{i+1}"))
doc.close()
except Exception as e:
return JsonResponse({"status": "error", "message": f"PDF转换失败: {str(e)}"}, status=500)
finally:
if os.path.exists(tmp_path):
os.remove(tmp_path)
else:
# 普通图片处理
filename = f"{uuid.uuid4()}_{f.name}"
abs_path = os.path.join(images_dir, filename)
with open(abs_path, "wb") as dst:
for chunk in file.chunks():
for chunk in f.chunks():
dst.write(chunk)
processed_files.append((abs_path, filename, f.name))
# 使用线程池并行调用 OCR 接口以提升速度
def run_ocr(file_info):
abs_path, filename, original_name = file_info
try:
data = ocr_and_extract_info(abs_path)
return (data, filename)
except Exception as e:
return JsonResponse({"status": "error", "message": str(e)}, status=500)
if data:
data_list.append(data)
return (e, filename)
rel_paths = []
image_urls = []
data_list = []
# 限制最大线程数为 8避免过多并发导致 API 限制或资源耗尽
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
future_to_file = {executor.submit(run_ocr, f_info): f_info for f_info in processed_files}
for future in concurrent.futures.as_completed(future_to_file):
result, filename = future.result()
if isinstance(result, Exception):
return JsonResponse({"status": "error", "message": f"文件 {filename} 识别失败: {str(result)}"}, status=500)
if result:
data_list.append(result)
rel_path = f"images/{filename}"
rel_paths.append(rel_path)
image_urls.append(request.build_absolute_uri(settings.MEDIA_URL + rel_path))
@@ -687,12 +768,18 @@ def confirm(request):
try:
object_name = f"images/{webp_name}"
from minio_storage.minio_connect import upload_file
from minio_storage.minio_connect import upload_file, is_minio_configured
if is_minio_configured():
upload_file(webp_abs, object_name, content_type="image/webp")
image_refs.append(f"minio:{object_name}")
temp_files_to_delete.extend([src_abs, webp_abs])
else:
# Fallback to local storage
image_refs.append(f"local:{object_name}")
# In local case, we keep the webp file and only delete the original temporary file
temp_files_to_delete.append(src_abs)
except Exception as e:
return JsonResponse({"status": "error", "message": f"上传到MinIO失败: {e}"}, status=500)
return JsonResponse({"status": "error", "message": f"存储图片失败: {e}"}, status=500)
if len(image_refs) == 1:
image_ref_to_store = image_refs[0]
elif len(image_refs) > 1:
@@ -750,21 +837,10 @@ def manage_page(request):
raw_results.append(r)
break
results = []
for r in raw_results:
try:
r_data = string_to_json(r.get("data", "{}"))
r_data["_id"] = r["id"]
r_data["_image"] = r.get("image", "")
results.append(r_data)
except Exception:
pass
return render(
request,
"elastic/manage.html",
{
"results": results,
"is_admin": is_admin,
"user_id": session_user_id,
"username": me.get("username"),

View File

@@ -63,6 +63,10 @@ def get_minio_client() -> Minio | None:
)
def is_minio_configured() -> bool:
    """Report whether a MinIO client can currently be obtained.

    Returns:
        True when get_minio_client() hands back anything other than None,
        False when it returns None.
    """
    client = get_minio_client()
    if client is None:
        return False
    return True
def get_bucket_name() -> str:
    """Return the name of the MinIO bucket to use.

    Looks up 'MINIO_BUCKET' through _get_env, passing 'achievement' as the
    default, and also substitutes 'achievement' whenever the looked-up value
    is falsy (for example an empty string).
    """
    bucket = _get_env('MINIO_BUCKET', default='achievement')
    if not bucket:
        return 'achievement'
    return bucket

View File

@@ -14,3 +14,4 @@ whitenoise==6.6.0
django-browser-reload==1.21.0
captcha==0.7.1
cryptography==46.0.3
pymupdf==1.25.3