#!/usr/bin/env python3
"""
Repair the ``upload_path`` column of legacy translation records.

Each record whose ``upload_path`` is NULL is matched, via its ``file_hash``
column, against the MD5 digest of the PDF files found under the uploads
directory.
"""

import sqlite3
import os
import hashlib

DB_PATH = 'instance/pdf_translate.db'
UPLOADS_DIR = 'uploads'

# Read size used while hashing, so large PDFs are never loaded into memory
# in one piece.
_HASH_CHUNK_SIZE = 1024 * 1024


def compute_file_hash(filepath):
    """Return the hexadecimal MD5 digest of the file at *filepath*.

    The file is read in fixed-size chunks, so memory use does not grow with
    the size of the file.

    Args:
        filepath: Path of the file to hash.

    Returns:
        The MD5 digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.md5()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(_HASH_CHUNK_SIZE), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


def _find_pdf_files(directory):
    """Return the paths of all ``.pdf`` files found below *directory*."""
    pdf_files = []
    for root, _dirs, files in os.walk(directory):
        for name in files:
            if name.endswith('.pdf'):
                pdf_files.append(os.path.join(root, name))
    return pdf_files


def _index_pdfs_by_hash(pdf_files):
    """Map each MD5 digest to the first path in *pdf_files* that has it.

    Files that cannot be read are reported and left out of the index.
    """
    hash_to_path = {}
    for pdf_path in pdf_files:
        try:
            pdf_hash = compute_file_hash(pdf_path)
        except OSError as e:
            print(f"处理 {pdf_path} 失败: {e}")
            continue
        # setdefault keeps the first file seen for a digest, which is the
        # file a linear scan over pdf_files would have matched.
        hash_to_path.setdefault(pdf_hash, pdf_path)
    return hash_to_path


def main():
    """Fill in ``upload_path`` for every translation record that lacks one.

    Reads the records with a NULL ``upload_path`` from the database at
    ``DB_PATH``, hashes every PDF under ``UPLOADS_DIR`` exactly once, and
    stores the path of the matching file on each record whose ``file_hash``
    equals a file's digest. Progress is printed to standard output.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        # Fetch every translation record whose upload_path is empty.
        cursor.execute('SELECT id, file_hash, original_filename, output_path FROM translations WHERE upload_path IS NULL')
        records = cursor.fetchall()

        print(f"找到 {len(records)} 条需要修复的记录")

        if not records:
            print("无需修复")
            return

        # Collect all uploaded PDF files.
        pdf_files = _find_pdf_files(UPLOADS_DIR)
        print(f"找到 {len(pdf_files)} 个PDF文件")

        # Hash every file once up front instead of once per record.
        hash_to_path = _index_pdfs_by_hash(pdf_files)

        # Match records to files by hash.
        fixed_count = 0
        for record_id, file_hash, _filename, _output_path in records:
            pdf_path = hash_to_path.get(file_hash)
            if pdf_path is None:
                continue
            print(f"ID {record_id}: 找到匹配 {pdf_path}")
            try:
                cursor.execute('UPDATE translations SET upload_path = ? WHERE id = ?', (pdf_path, record_id))
                # Commit per record so earlier repairs survive a later failure.
                conn.commit()
            except sqlite3.Error as e:
                # Best effort: report the failure and move on to the next record.
                print(f"处理 {pdf_path} 失败: {e}")
                continue
            fixed_count += 1
    finally:
        # Release the connection even if a query raised.
        conn.close()

    print(f"修复完成,共修复 {fixed_count} 条记录")


if __name__ == '__main__':
    main()