修上传BUG
This commit is contained in:
268
elastic/views.py
268
elastic/views.py
@@ -607,107 +607,96 @@ def upload(request):
|
||||
|
||||
images_dir = os.path.join(settings.MEDIA_ROOT, "images")
|
||||
os.makedirs(images_dir, exist_ok=True)
|
||||
rel_paths = []
|
||||
image_urls = []
|
||||
data_list = []
|
||||
|
||||
# 预处理文件列表,处理PDF转换
|
||||
processed_files = []
|
||||
# 按照原始文件进行分组处理
|
||||
file_results = []
|
||||
|
||||
for f in files:
|
||||
if f.name.lower().endswith('.pdf'):
|
||||
group_images = [] # 存储该文件生成的所有图片路径信息 (abs_path, filename)
|
||||
is_pdf = f.name.lower().endswith('.pdf')
|
||||
|
||||
if is_pdf:
|
||||
if not HAS_PDF_SUPPORT:
|
||||
return JsonResponse({"status": "error", "message": f"服务器未安装PDF处理组件(PyMuPDF): {PDF_ERROR}"}, status=500)
|
||||
|
||||
# 将PDF保存到临时文件
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
||||
for chunk in f.chunks():
|
||||
tmp.write(chunk)
|
||||
tmp_path = tmp.name
|
||||
|
||||
try:
|
||||
# 转换PDF为图片列表
|
||||
doc = fitz.open(tmp_path)
|
||||
for i in range(len(doc)):
|
||||
page = doc.load_page(i)
|
||||
# 降低 DPI 从 200 到 150,在保证准确率的同时显著减小图片体积,加快上传速度
|
||||
pix = page.get_pixmap(dpi=150)
|
||||
pix = page.get_pixmap(dpi=150)
|
||||
img_filename = f"{uuid.uuid4()}_page_{i+1}.jpg"
|
||||
img_abs_path = os.path.join(images_dir, img_filename)
|
||||
pix.save(img_abs_path)
|
||||
processed_files.append((img_abs_path, img_filename, f"{f.name}_p{i+1}"))
|
||||
group_images.append((img_abs_path, img_filename))
|
||||
doc.close()
|
||||
except Exception as e:
|
||||
return JsonResponse({"status": "error", "message": f"PDF转换失败: {str(e)}"}, status=500)
|
||||
return JsonResponse({"status": "error", "message": f"PDF {f.name} 转换失败: {str(e)}"}, status=500)
|
||||
finally:
|
||||
if os.path.exists(tmp_path):
|
||||
os.remove(tmp_path)
|
||||
else:
|
||||
# 普通图片处理
|
||||
filename = f"{uuid.uuid4()}_{f.name}"
|
||||
abs_path = os.path.join(images_dir, filename)
|
||||
with open(abs_path, "wb") as dst:
|
||||
for chunk in f.chunks():
|
||||
dst.write(chunk)
|
||||
processed_files.append((abs_path, filename, f.name))
|
||||
group_images.append((abs_path, filename))
|
||||
|
||||
# 使用线程池并行调用 OCR 接口以提升速度
|
||||
def run_ocr(file_info):
|
||||
abs_path, filename, original_name = file_info
|
||||
try:
|
||||
data = ocr_and_extract_info(abs_path)
|
||||
return (data, filename)
|
||||
except Exception as e:
|
||||
return (e, filename)
|
||||
# 对该组图片并行进行 OCR 识别
|
||||
def run_ocr(img_info):
|
||||
abs_p, fname = img_info
|
||||
try:
|
||||
data = ocr_and_extract_info(abs_p)
|
||||
return data
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
rel_paths = []
|
||||
image_urls = []
|
||||
data_list = []
|
||||
group_data_list = []
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(group_images), 8)) as executor:
|
||||
futures = [executor.submit(run_ocr, img_info) for img_info in group_images]
|
||||
for future in concurrent.futures.as_completed(futures):
|
||||
res = future.result()
|
||||
if res:
|
||||
group_data_list.append(res)
|
||||
|
||||
# 限制最大线程数为 8,避免过多并发导致 API 限制或资源耗尽
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
|
||||
future_to_file = {executor.submit(run_ocr, f_info): f_info for f_info in processed_files}
|
||||
for future in concurrent.futures.as_completed(future_to_file):
|
||||
result, filename = future.result()
|
||||
if isinstance(result, Exception):
|
||||
return JsonResponse({"status": "error", "message": f"文件 {filename} 识别失败: {str(result)}"}, status=500)
|
||||
|
||||
if result:
|
||||
data_list.append(result)
|
||||
|
||||
rel_path = f"images/{filename}"
|
||||
rel_paths.append(rel_path)
|
||||
image_urls.append(request.build_absolute_uri(settings.MEDIA_URL + rel_path))
|
||||
# 合并该文件的多页识别结果
|
||||
merged_group_data = {}
|
||||
for item in group_data_list:
|
||||
if not isinstance(item, dict): continue
|
||||
for k, v in item.items():
|
||||
key = str(k).strip()
|
||||
if not key: continue
|
||||
if key not in merged_group_data or merged_group_data.get(key) in (None, ''):
|
||||
merged_group_data[key] = v
|
||||
elif merged_group_data.get(key) != v:
|
||||
base = key
|
||||
idx = 2
|
||||
while f"{base}_{idx}" in merged_group_data: idx += 1
|
||||
merged_group_data[f"{base}_{idx}"] = v
|
||||
|
||||
if not data_list:
|
||||
return JsonResponse({"status": "error", "message": "无法识别图片内容"}, status=400)
|
||||
if not merged_group_data:
|
||||
# 如果没识别到,至少保留一个空结构或者包含文件名的提示
|
||||
merged_group_data = {"文件名": f.name, "提示": "未识别到具体内容"}
|
||||
|
||||
merged = {}
|
||||
for item in data_list:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
for k, v in item.items():
|
||||
key = str(k).strip()
|
||||
if not key:
|
||||
continue
|
||||
if key not in merged or merged.get(key) in (None, ''):
|
||||
merged[key] = v
|
||||
continue
|
||||
if merged.get(key) == v:
|
||||
continue
|
||||
base = key
|
||||
idx = 2
|
||||
while f"{base}_{idx}" in merged:
|
||||
idx += 1
|
||||
merged[f"{base}_{idx}"] = v
|
||||
rel_paths = [f"images/{img[1]}" for img in group_images]
|
||||
image_urls = [request.build_absolute_uri(settings.MEDIA_URL + rp) for rp in rel_paths]
|
||||
|
||||
file_results.append({
|
||||
"name": f.name,
|
||||
"data": merged_group_data,
|
||||
"images": rel_paths,
|
||||
"image_urls": image_urls,
|
||||
})
|
||||
|
||||
return JsonResponse({
|
||||
"status": "success",
|
||||
"message": "识别成功,请确认数据后点击录入",
|
||||
"data": merged,
|
||||
"images": rel_paths,
|
||||
"image_urls": image_urls,
|
||||
"image": rel_paths[0] if rel_paths else "",
|
||||
"image_url": image_urls[0] if image_urls else "",
|
||||
"message": f"成功处理 {len(file_results)} 个文件,请确认数据后点击录入",
|
||||
"items": file_results,
|
||||
})
|
||||
|
||||
|
||||
@@ -715,7 +704,6 @@ def upload(request):
|
||||
@require_http_methods(["POST"])
|
||||
def confirm(request):
|
||||
if request.session.get("user_id") is None:
|
||||
# 允许从payload中带入user_id作为后备(便于前端已知用户时继续操作)
|
||||
try:
|
||||
payload_for_uid = json.loads(request.body.decode("utf-8"))
|
||||
except Exception:
|
||||
@@ -732,77 +720,97 @@ def confirm(request):
|
||||
except json.JSONDecodeError:
|
||||
return JsonResponse({"status": "error", "message": "JSON无效"}, status=400)
|
||||
|
||||
edited = payload.get("data") or {}
|
||||
image_rel = payload.get("image") or ""
|
||||
if not isinstance(edited, dict) or not edited:
|
||||
return JsonResponse({"status": "error", "message": "数据不能为空"}, status=400)
|
||||
# 支持单项或批量入库
|
||||
items = payload.get("items")
|
||||
if not items:
|
||||
# 兼容旧版本单项入库
|
||||
items = [{
|
||||
"data": payload.get("data"),
|
||||
"image": payload.get("image")
|
||||
}]
|
||||
|
||||
ensure_type_in_list(edited.get("数据类型"))
|
||||
image_ref_to_store = ""
|
||||
temp_files_to_delete = []
|
||||
image_rels = _parse_image_refs(image_rel)
|
||||
if image_rels:
|
||||
images_dir = os.path.join(settings.MEDIA_ROOT, "images")
|
||||
os.makedirs(images_dir, exist_ok=True)
|
||||
image_refs = []
|
||||
for rel in image_rels:
|
||||
src_abs = os.path.join(settings.MEDIA_ROOT, rel)
|
||||
if not os.path.isfile(src_abs):
|
||||
return JsonResponse({"status": "error", "message": "图片文件不存在"}, status=400)
|
||||
webp_name = f"{uuid.uuid4().hex}.webp"
|
||||
webp_abs = os.path.join(images_dir, webp_name)
|
||||
try:
|
||||
with Image.open(src_abs) as im:
|
||||
if im.mode in ("RGBA", "LA", "P"):
|
||||
im = im.convert("RGBA")
|
||||
else:
|
||||
im = im.convert("RGB")
|
||||
im.save(webp_abs, format="WEBP", quality=80)
|
||||
except Exception:
|
||||
success_count = 0
|
||||
errors = []
|
||||
|
||||
for item in items:
|
||||
edited = item.get("data") or {}
|
||||
image_rel = item.get("image") or ""
|
||||
|
||||
if not isinstance(edited, dict) or not edited:
|
||||
errors.append("数据项不能为空")
|
||||
continue
|
||||
|
||||
ensure_type_in_list(edited.get("数据类型"))
|
||||
image_ref_to_store = ""
|
||||
temp_files_to_delete = []
|
||||
image_rels = _parse_image_refs(image_rel)
|
||||
|
||||
if image_rels:
|
||||
images_dir = os.path.join(settings.MEDIA_ROOT, "images")
|
||||
os.makedirs(images_dir, exist_ok=True)
|
||||
image_refs = []
|
||||
for rel in image_rels:
|
||||
src_abs = os.path.join(settings.MEDIA_ROOT, rel)
|
||||
if not os.path.isfile(src_abs):
|
||||
errors.append(f"图片文件 {rel} 不存在")
|
||||
continue
|
||||
|
||||
webp_name = f"{uuid.uuid4().hex}.webp"
|
||||
webp_abs = os.path.join(images_dir, webp_name)
|
||||
try:
|
||||
if os.path.isfile(webp_abs):
|
||||
os.remove(webp_abs)
|
||||
with Image.open(src_abs) as im:
|
||||
if im.mode in ("RGBA", "LA", "P"):
|
||||
im = im.convert("RGBA")
|
||||
else:
|
||||
im = im.convert("RGB")
|
||||
im.save(webp_abs, format="WEBP", quality=80)
|
||||
except Exception:
|
||||
pass
|
||||
return JsonResponse({"status": "error", "message": "图片转换WEBP失败"}, status=500)
|
||||
errors.append(f"图片 {rel} 转换WEBP失败")
|
||||
continue
|
||||
|
||||
try:
|
||||
object_name = f"images/{webp_name}"
|
||||
from minio_storage.minio_connect import upload_file, is_minio_configured
|
||||
if is_minio_configured():
|
||||
upload_file(webp_abs, object_name, content_type="image/webp")
|
||||
image_refs.append(f"minio:{object_name}")
|
||||
temp_files_to_delete.extend([src_abs, webp_abs])
|
||||
else:
|
||||
# Fallback to local storage
|
||||
image_refs.append(f"local:{object_name}")
|
||||
# In local case, we keep the webp file and only delete the original temporary file
|
||||
temp_files_to_delete.append(src_abs)
|
||||
except Exception as e:
|
||||
return JsonResponse({"status": "error", "message": f"存储图片失败: {e}"}, status=500)
|
||||
if len(image_refs) == 1:
|
||||
image_ref_to_store = image_refs[0]
|
||||
elif len(image_refs) > 1:
|
||||
image_ref_to_store = json_to_string(image_refs)
|
||||
try:
|
||||
object_name = f"images/{webp_name}"
|
||||
from minio_storage.minio_connect import upload_file, is_minio_configured
|
||||
if is_minio_configured():
|
||||
upload_file(webp_abs, object_name, content_type="image/webp")
|
||||
image_refs.append(f"minio:{object_name}")
|
||||
temp_files_to_delete.extend([src_abs, webp_abs])
|
||||
else:
|
||||
image_refs.append(f"local:{object_name}")
|
||||
temp_files_to_delete.append(src_abs)
|
||||
except Exception as e:
|
||||
errors.append(f"存储图片 {rel} 失败: {e}")
|
||||
continue
|
||||
|
||||
if len(image_refs) == 1:
|
||||
image_ref_to_store = image_refs[0]
|
||||
elif len(image_refs) > 1:
|
||||
image_ref_to_store = json_to_string(image_refs)
|
||||
|
||||
to_store = {
|
||||
"writer_id": str(request.session.get("user_id")),
|
||||
"data": json_to_string(edited),
|
||||
"image": image_ref_to_store,
|
||||
}
|
||||
to_store = {
|
||||
"writer_id": str(request.session.get("user_id")),
|
||||
"data": json_to_string(edited),
|
||||
"image": image_ref_to_store,
|
||||
}
|
||||
|
||||
ok = insert_data(to_store)
|
||||
if not ok:
|
||||
return JsonResponse({"status": "error", "message": "写入ES失败"}, status=500)
|
||||
ok = insert_data(to_store)
|
||||
if ok:
|
||||
success_count += 1
|
||||
# 清理临时文件
|
||||
for p in temp_files_to_delete:
|
||||
if p and os.path.isfile(p):
|
||||
try: os.remove(p)
|
||||
except: pass
|
||||
else:
|
||||
errors.append("写入ES失败")
|
||||
|
||||
try:
|
||||
for p in temp_files_to_delete:
|
||||
if p and os.path.isfile(p):
|
||||
os.remove(p)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return JsonResponse({"status": "success", "message": "数据录入成功", "data": edited})
|
||||
if success_count > 0:
|
||||
msg = f"成功录入 {success_count} 条数据"
|
||||
if errors:
|
||||
msg += f" (遇到 {len(errors)} 个错误)"
|
||||
return JsonResponse({"status": "success", "message": msg})
|
||||
else:
|
||||
return JsonResponse({"status": "error", "message": "录入失败: " + "; ".join(errors[:3])}, status=500)
|
||||
|
||||
|
||||
@require_http_methods(["GET"])
|
||||
|
||||
Reference in New Issue
Block a user