能上传并识别PDF
This commit is contained in:
@@ -16,8 +16,10 @@
|
||||
table{width:100%;border-collapse:collapse;margin-top:20px}
|
||||
th,td{border-bottom:1px solid #eee;padding:12px 8px;text-align:left;vertical-align:top}
|
||||
th{background:#f8f9fa;font-weight:600}
|
||||
.inner-table { width: 100%; margin: 0; border: 1px solid #e0e0e0; border-collapse: collapse; }
|
||||
.inner-table th, .inner-table td { border: 1px solid #e0e0e0; padding: 6px; font-size: 12px; }
|
||||
.inner-table { width: 100%; margin: 0; border: 1px solid #e0e0e0; border-collapse: collapse; table-layout: fixed; }
|
||||
.inner-table th, .inner-table td { border: 1px solid #e0e0e0; padding: 8px; font-size: 13px; word-break: break-all; }
|
||||
.inner-table td:first-child { width: 30%; background-color: #f8fafc; font-weight: 600; color: #475569; }
|
||||
.inner-table td:last-child { width: 70%; background-color: #fff; }
|
||||
.inner-table th { background-color: #f9f9f9; }
|
||||
img{max-width:120px;border:1px solid #eee;border-radius:6px;cursor:pointer}
|
||||
.btn{padding:6px 10px;border:none;border-radius:6px;cursor:pointer;font-size:14px;margin:2px}
|
||||
@@ -320,8 +322,15 @@ function renderTable(data) {
|
||||
|
||||
// 解析data字段,如果是JSON字符串则格式化显示
|
||||
let displayData = item.data || '';
|
||||
let parsed = null;
|
||||
try {
|
||||
const parsed = JSON.parse(item.data);
|
||||
if (typeof displayData === 'object' && displayData !== null) {
|
||||
parsed = displayData;
|
||||
} else if (typeof displayData === 'string') {
|
||||
parsed = JSON.parse(displayData);
|
||||
}
|
||||
|
||||
if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
|
||||
displayData = `
|
||||
<table class="inner-table">
|
||||
<tbody>
|
||||
@@ -334,13 +343,16 @@ function renderTable(data) {
|
||||
</tbody>
|
||||
</table>
|
||||
`;
|
||||
} else {
|
||||
throw new Error('Not a valid JSON object');
|
||||
}
|
||||
} catch (e) {
|
||||
displayData = `
|
||||
<table class="inner-table">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>原始数据</td>
|
||||
<td>${escapeHtml(displayData)}</td>
|
||||
<td>${escapeHtml(typeof displayData === 'object' ? JSON.stringify(displayData) : displayData)}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
@@ -423,16 +435,24 @@ function createRow(k = '', v = '') {
|
||||
}
|
||||
|
||||
function renderForm(obj){
|
||||
kvForm.innerHTML='';
|
||||
kvForm.innerHTML=`
|
||||
<div style="display: grid; grid-template-columns: 1fr 1fr auto; gap: 8px; margin-bottom: 8px; font-weight: 600; color: #475569; font-size: 14px;">
|
||||
<div>字段名</div>
|
||||
<div>字段值</div>
|
||||
<div>操作</div>
|
||||
</div>
|
||||
`;
|
||||
Object.keys(obj||{}).forEach(k=> kvForm.appendChild(createRow(k, obj[k])));
|
||||
if (!kvForm.children.length) kvForm.appendChild(createRow());
|
||||
if (kvForm.querySelectorAll('div[style*="grid"]').length <= 1) kvForm.appendChild(createRow());
|
||||
syncTextarea();
|
||||
}
|
||||
|
||||
function formToObject(){
|
||||
const o={};
|
||||
Array.from(kvForm.children).forEach(row=>{
|
||||
Array.from(kvForm.children).forEach((row, index)=>{
|
||||
if (index === 0) return; // 跳过表头
|
||||
const [kI,vI] = row.querySelectorAll('input');
|
||||
if (!kI || !vI) return;
|
||||
const k=(kI.value||'').trim(); if(!k) return;
|
||||
const raw=vI.value;
|
||||
try{
|
||||
|
||||
@@ -49,9 +49,10 @@
|
||||
.result-box {flex: 1;}
|
||||
.result-box h3 { margin-top: 0; color: #334155;}
|
||||
.form-controls { display: flex;gap: 8px;margin-bottom: 12px;flex-wrap: wrap;}
|
||||
#kvForm {border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; max-height: 300px; overflow: auto;margin-bottom: 12px;background: white;}
|
||||
.form-row {display: grid;grid-template-columns: 1fr 1fr auto;gap: 8px; margin-bottom: 6px; }
|
||||
.form-row input {padding: 8px;border: 1px solid #cbd5e1;border-radius: 4px;}
|
||||
#kvForm {border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; max-height: 400px; overflow: auto;margin-bottom: 12px;background: #f8fafc;}
|
||||
.form-header { display: grid; grid-template-columns: 1fr 1fr auto; gap: 8px; margin-bottom: 8px; padding: 0 4px; font-weight: 600; color: #475569; font-size: 14px;}
|
||||
.form-row {display: grid;grid-template-columns: 1fr 1fr auto;gap: 8px; margin-bottom: 6px; align-items: center;}
|
||||
.form-row input {padding: 8px;border: 1px solid #cbd5e1;border-radius: 4px; width: 100%; box-sizing: border-box;}
|
||||
#resultBox { width: 100%;min-height: 200px;font-family: ui-monospace, SFMono-Regular, Menlo, monospace;font-size: 14px; padding: 12px; border: 1px solid #e2e8f0;
|
||||
border-radius: 8px; resize: vertical;box-sizing: border-box; }
|
||||
.status-message { padding: 10px; margin: 10px 0; border-radius: 6px; display: none; }
|
||||
@@ -83,17 +84,17 @@
|
||||
<div class="container">
|
||||
<div class="header">
|
||||
<div>
|
||||
<h2>图片上传与识别</h2>
|
||||
<p>选择图片后上传,服务端调用大模型解析为可编辑的 JSON,再确认入库。</p>
|
||||
<h2>图片与PDF上传识别</h2>
|
||||
<p>选择图片或PDF文件后上传,服务端调用大模型解析为可编辑的 JSON,再确认入库。</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="upload-section" id="dropArea">
|
||||
<h3>上传图片</h3>
|
||||
<p>点击下方按钮选择图片,或拖拽图片到此区域</p>
|
||||
<h3>上传文件</h3>
|
||||
<p>点击下方按钮选择图片或PDF文件,或拖拽文件到此区域</p>
|
||||
<form id="uploadForm" enctype="multipart/form-data">
|
||||
{% csrf_token %}
|
||||
<input type="file" id="fileInput" name="file" accept="image/*" multiple />
|
||||
<input type="file" id="fileInput" name="file" accept="image/*,.pdf" multiple />
|
||||
<span id="fileHint" class="muted"></span>
|
||||
<br>
|
||||
<button type="submit" class="btn btn-primary">上传并识别</button>
|
||||
@@ -268,7 +269,7 @@ function updateFileHint() {
|
||||
}
|
||||
|
||||
function addFiles(files) {
|
||||
const incoming = Array.from(files || []).filter(f => f && f.type.startsWith('image/'));
|
||||
const incoming = Array.from(files || []).filter(f => f && (f.type.startsWith('image/') || f.name.toLowerCase().endsWith('.pdf')));
|
||||
const existingKeys = new Set(selectedFiles.map(f => `${f.name}|${f.size}|${f.lastModified}`));
|
||||
incoming.forEach(f => {
|
||||
const key = `${f.name}|${f.size}|${f.lastModified}`;
|
||||
@@ -277,10 +278,20 @@ function addFiles(files) {
|
||||
selectedFiles.push(f);
|
||||
}
|
||||
});
|
||||
const urls = selectedFiles.map(f => URL.createObjectURL(f));
|
||||
const urls = selectedFiles.map(f => {
|
||||
if (f.name.toLowerCase().endsWith('.pdf')) {
|
||||
// 使用一个简单的 SVG PDF 图标 Data URI
|
||||
return 'data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSI0OCIgaGVpZ2h0PSI0OCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9IiNlZjQ0NDQiIHN0cm9rZS13aWR0aD0iMiIgc3Ryb2tlLWxpbmVjYXA9InJvdW5kIiBzdHJva2UtbGluZWpvaW49InJvdW5kIj48cGF0aCBkPSJNMTQgMmgyYTIgMiAwIDAgMSAyIDJ2MTZhMiAyIDAgMCAxLTIgMmgtMTJhMiAyIDAgMCAxLTItMlY0YTIgMiAwIDAgMSAyLTJoMiIvPjxwYXRoIGQ9Ik0xNCAydjRjMCAxLjEgLjkgMiAyIDJoNCIvPjxwYXRoIGQ9Ik03IDloNSIvPjxwYXRoIGQ9Ik03IDEzaDUiLz48cGF0aCBkPSJNNyAxN2g4Ii8+PC9zdmc+';
|
||||
}
|
||||
return URL.createObjectURL(f);
|
||||
});
|
||||
setPreviewList(urls);
|
||||
updateFileHint();
|
||||
setTimeout(() => urls.forEach(u => URL.revokeObjectURL(u)), 0);
|
||||
setTimeout(() => {
|
||||
urls.forEach(u => {
|
||||
if (u.startsWith('blob:')) URL.revokeObjectURL(u);
|
||||
});
|
||||
}, 0);
|
||||
}
|
||||
|
||||
fileInput.addEventListener('change', function(e) {
|
||||
@@ -304,7 +315,7 @@ function createRow(k = '', v = '') {
|
||||
delBtn.className = 'btn btn-danger';
|
||||
delBtn.textContent = '删除';
|
||||
delBtn.onclick = () => {
|
||||
if (kvForm.children.length > 1) {
|
||||
if (kvForm.querySelectorAll('.form-row').length > 1) {
|
||||
kvForm.removeChild(row);
|
||||
} else {
|
||||
keyInput.value = '';
|
||||
@@ -321,17 +332,24 @@ function createRow(k = '', v = '') {
|
||||
}
|
||||
|
||||
function renderFormFromObject(obj) {
|
||||
kvForm.innerHTML = '';
|
||||
kvForm.innerHTML = `
|
||||
<div class="form-header">
|
||||
<div>字段名</div>
|
||||
<div>字段值</div>
|
||||
<div>操作</div>
|
||||
</div>
|
||||
`;
|
||||
Object.keys(obj || {}).forEach(k => {
|
||||
kvForm.appendChild(createRow(k, obj[k]));
|
||||
});
|
||||
if (!kvForm.children.length) kvForm.appendChild(createRow());
|
||||
if (kvForm.children.length <= 1) kvForm.appendChild(createRow());
|
||||
syncTextarea();
|
||||
}
|
||||
|
||||
function objectFromForm() {
|
||||
const obj = {};
|
||||
Array.from(kvForm.children).forEach(row => {
|
||||
if (row.classList.contains('form-header')) return;
|
||||
const [kInput, vInput] = row.querySelectorAll('input');
|
||||
const k = (kInput.value || '').trim();
|
||||
if (!k) return;
|
||||
@@ -380,31 +398,32 @@ uploadForm.addEventListener('submit', async (e) => {
|
||||
resultBox.value = '';
|
||||
currentImageRel = [];
|
||||
|
||||
const files = Array.from(selectedFiles || []).filter(f => f && f.type.startsWith('image/'));
|
||||
const files = Array.from(selectedFiles || []).filter(f => f && (f.type.startsWith('image/') || f.name.toLowerCase().endsWith('.pdf')));
|
||||
if (!files.length) {
|
||||
uploadMsg.textContent = '请选择图片文件';
|
||||
uploadMsg.textContent = '请选择图片或PDF文件';
|
||||
uploadMsg.className = 'status-message error';
|
||||
uploadMsg.style.display = 'block';
|
||||
return;
|
||||
}
|
||||
|
||||
showProgress();
|
||||
setProgress(5, '转换为JPG');
|
||||
setProgress(5, '预处理中');
|
||||
const formData = new FormData();
|
||||
const converted = [];
|
||||
for (let i = 0; i < files.length; i++) {
|
||||
const file = files[i];
|
||||
let jpegFile = file;
|
||||
if (file.type.startsWith('image/')) {
|
||||
setProgress(5 + Math.round((i/files.length)*45), '转换图片');
|
||||
try {
|
||||
jpegFile = await convertToJpeg(file);
|
||||
const jpegFile = await convertToJpeg(file);
|
||||
formData.append('file', jpegFile);
|
||||
} catch (_) {
|
||||
jpegFile = file;
|
||||
formData.append('file', file);
|
||||
}
|
||||
} else {
|
||||
// PDF 直接添加
|
||||
formData.append('file', file);
|
||||
}
|
||||
converted.push(jpegFile);
|
||||
const pct = 5 + Math.round(((i + 1) / files.length) * 45);
|
||||
setProgress(pct, '转换为JPG');
|
||||
}
|
||||
converted.forEach(f => formData.append('file', f));
|
||||
|
||||
try {
|
||||
let prog = 50;
|
||||
|
||||
122
elastic/views.py
122
elastic/views.py
@@ -6,6 +6,8 @@ import re
|
||||
import uuid
|
||||
import base64
|
||||
import json
|
||||
import tempfile
|
||||
import concurrent.futures
|
||||
from django.conf import settings
|
||||
from django.http import JsonResponse
|
||||
from django.shortcuts import render
|
||||
@@ -21,6 +23,19 @@ from .es_connect import (
|
||||
analytics_recent as es_analytics_recent,
|
||||
)
|
||||
from PIL import Image
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
HAS_PDF_SUPPORT = True
|
||||
PDF_ERROR = ""
|
||||
except ImportError as e:
|
||||
try:
|
||||
import pymupdf
|
||||
fitz = pymupdf
|
||||
HAS_PDF_SUPPORT = True
|
||||
PDF_ERROR = ""
|
||||
except ImportError:
|
||||
HAS_PDF_SUPPORT = False
|
||||
PDF_ERROR = str(e)
|
||||
|
||||
|
||||
def _filter_results_for_user(request, results):
|
||||
@@ -58,9 +73,7 @@ def _image_ref_to_url(request, image_ref: str) -> str:
|
||||
if not s:
|
||||
return ''
|
||||
|
||||
if not s.startswith('minio:'):
|
||||
return ''
|
||||
|
||||
if s.startswith('minio:'):
|
||||
object_name = s[len('minio:'):].lstrip('/')
|
||||
if not object_name:
|
||||
return ''
|
||||
@@ -71,6 +84,14 @@ def _image_ref_to_url(request, image_ref: str) -> str:
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
if s.startswith('local:'):
|
||||
rel_path = s[len('local:'):].lstrip('/')
|
||||
if not rel_path:
|
||||
return ''
|
||||
return request.build_absolute_uri(settings.MEDIA_URL + rel_path)
|
||||
|
||||
return ''
|
||||
|
||||
|
||||
def _parse_image_refs(image_ref):
|
||||
if not image_ref:
|
||||
@@ -429,8 +450,16 @@ def delete_user_by_id_view(request, user_id):
|
||||
return JsonResponse({"status": "success", "message": "用户删除成功"})
|
||||
|
||||
|
||||
# 辅助:JSON 转换(兼容 a.py 行为)
|
||||
# 辅助:JSON 转换
|
||||
def json_to_string(obj):
|
||||
if isinstance(obj, str):
|
||||
# 如果已经是字符串,尝试解析一下验证是否为有效 JSON,或者是普通文本
|
||||
try:
|
||||
json.loads(obj)
|
||||
return obj # 是有效 JSON 字符串,直接返回
|
||||
except Exception:
|
||||
# 不是 JSON 字符串,可能是普通文本,将其封装成 JSON
|
||||
return json.dumps(obj, ensure_ascii=False)
|
||||
try:
|
||||
return json.dumps(obj, ensure_ascii=False)
|
||||
except Exception:
|
||||
@@ -581,18 +610,70 @@ def upload(request):
|
||||
rel_paths = []
|
||||
image_urls = []
|
||||
data_list = []
|
||||
for file in files:
|
||||
filename = f"{uuid.uuid4()}_{file.name}"
|
||||
|
||||
# 预处理文件列表,处理PDF转换
|
||||
processed_files = []
|
||||
for f in files:
|
||||
if f.name.lower().endswith('.pdf'):
|
||||
if not HAS_PDF_SUPPORT:
|
||||
return JsonResponse({"status": "error", "message": f"服务器未安装PDF处理组件(PyMuPDF): {PDF_ERROR}"}, status=500)
|
||||
|
||||
# 将PDF保存到临时文件
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
||||
for chunk in f.chunks():
|
||||
tmp.write(chunk)
|
||||
tmp_path = tmp.name
|
||||
|
||||
try:
|
||||
# 转换PDF为图片列表
|
||||
doc = fitz.open(tmp_path)
|
||||
for i in range(len(doc)):
|
||||
page = doc.load_page(i)
|
||||
# 降低 DPI 从 200 到 150,在保证准确率的同时显著减小图片体积,加快上传速度
|
||||
pix = page.get_pixmap(dpi=150)
|
||||
img_filename = f"{uuid.uuid4()}_page_{i+1}.jpg"
|
||||
img_abs_path = os.path.join(images_dir, img_filename)
|
||||
pix.save(img_abs_path)
|
||||
processed_files.append((img_abs_path, img_filename, f"{f.name}_p{i+1}"))
|
||||
doc.close()
|
||||
except Exception as e:
|
||||
return JsonResponse({"status": "error", "message": f"PDF转换失败: {str(e)}"}, status=500)
|
||||
finally:
|
||||
if os.path.exists(tmp_path):
|
||||
os.remove(tmp_path)
|
||||
else:
|
||||
# 普通图片处理
|
||||
filename = f"{uuid.uuid4()}_{f.name}"
|
||||
abs_path = os.path.join(images_dir, filename)
|
||||
with open(abs_path, "wb") as dst:
|
||||
for chunk in file.chunks():
|
||||
for chunk in f.chunks():
|
||||
dst.write(chunk)
|
||||
processed_files.append((abs_path, filename, f.name))
|
||||
|
||||
# 使用线程池并行调用 OCR 接口以提升速度
|
||||
def run_ocr(file_info):
|
||||
abs_path, filename, original_name = file_info
|
||||
try:
|
||||
data = ocr_and_extract_info(abs_path)
|
||||
return (data, filename)
|
||||
except Exception as e:
|
||||
return JsonResponse({"status": "error", "message": str(e)}, status=500)
|
||||
if data:
|
||||
data_list.append(data)
|
||||
return (e, filename)
|
||||
|
||||
rel_paths = []
|
||||
image_urls = []
|
||||
data_list = []
|
||||
|
||||
# 限制最大线程数为 8,避免过多并发导致 API 限制或资源耗尽
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
|
||||
future_to_file = {executor.submit(run_ocr, f_info): f_info for f_info in processed_files}
|
||||
for future in concurrent.futures.as_completed(future_to_file):
|
||||
result, filename = future.result()
|
||||
if isinstance(result, Exception):
|
||||
return JsonResponse({"status": "error", "message": f"文件 {filename} 识别失败: {str(result)}"}, status=500)
|
||||
|
||||
if result:
|
||||
data_list.append(result)
|
||||
|
||||
rel_path = f"images/{filename}"
|
||||
rel_paths.append(rel_path)
|
||||
image_urls.append(request.build_absolute_uri(settings.MEDIA_URL + rel_path))
|
||||
@@ -687,12 +768,18 @@ def confirm(request):
|
||||
|
||||
try:
|
||||
object_name = f"images/{webp_name}"
|
||||
from minio_storage.minio_connect import upload_file
|
||||
from minio_storage.minio_connect import upload_file, is_minio_configured
|
||||
if is_minio_configured():
|
||||
upload_file(webp_abs, object_name, content_type="image/webp")
|
||||
image_refs.append(f"minio:{object_name}")
|
||||
temp_files_to_delete.extend([src_abs, webp_abs])
|
||||
else:
|
||||
# Fallback to local storage
|
||||
image_refs.append(f"local:{object_name}")
|
||||
# In local case, we keep the webp file and only delete the original temporary file
|
||||
temp_files_to_delete.append(src_abs)
|
||||
except Exception as e:
|
||||
return JsonResponse({"status": "error", "message": f"上传到MinIO失败: {e}"}, status=500)
|
||||
return JsonResponse({"status": "error", "message": f"存储图片失败: {e}"}, status=500)
|
||||
if len(image_refs) == 1:
|
||||
image_ref_to_store = image_refs[0]
|
||||
elif len(image_refs) > 1:
|
||||
@@ -750,21 +837,10 @@ def manage_page(request):
|
||||
raw_results.append(r)
|
||||
break
|
||||
|
||||
results = []
|
||||
for r in raw_results:
|
||||
try:
|
||||
r_data = string_to_json(r.get("data", "{}"))
|
||||
r_data["_id"] = r["id"]
|
||||
r_data["_image"] = r.get("image", "")
|
||||
results.append(r_data)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return render(
|
||||
request,
|
||||
"elastic/manage.html",
|
||||
{
|
||||
"results": results,
|
||||
"is_admin": is_admin,
|
||||
"user_id": session_user_id,
|
||||
"username": me.get("username"),
|
||||
|
||||
@@ -63,6 +63,10 @@ def get_minio_client() -> Minio | None:
|
||||
)
|
||||
|
||||
|
||||
def is_minio_configured() -> bool:
|
||||
return get_minio_client() is not None
|
||||
|
||||
|
||||
def get_bucket_name() -> str:
|
||||
return _get_env('MINIO_BUCKET', default='achievement') or 'achievement'
|
||||
|
||||
|
||||
@@ -14,3 +14,4 @@ whitenoise==6.6.0
|
||||
django-browser-reload==1.21.0
|
||||
captcha==0.7.1
|
||||
cryptography==46.0.3
|
||||
pymupdf==1.25.3
|
||||
|
||||
Reference in New Issue
Block a user