第一个版本

This commit is contained in:
2026-01-11 04:17:53 +08:00
commit c160320892
11 changed files with 2383 additions and 0 deletions

93
export.py Normal file
View File

@@ -0,0 +1,93 @@
import json
import argparse
from pathlib import Path
from datetime import datetime
from database import LLMDatabase
from proxy_addon import load_config
def export_training_data(output_path: str, db_path: str = "llm_data.db",
                         include_reasoning: bool = True) -> int:
    """Export the database contents to a JSONL file.

    Opens the database at ``db_path`` and hands the whole export off to
    ``LLMDatabase.export_to_jsonl``; no formatting happens here.

    Args:
        output_path: Destination file path, passed through to the exporter.
        db_path: Path of the database file to open.
        include_reasoning: Passed through to the exporter unchanged.
            Presumably controls whether reasoning content is written;
            the exporter is defined elsewhere -- confirm there.

    Returns:
        The value returned by ``export_to_jsonl`` (the command-line entry
        point reports it as the number of exported conversations).
    """
    return LLMDatabase(db_path).export_to_jsonl(output_path, include_reasoning)
def export_with_metadata(output_path: str, db_path: str = "llm_data.db") -> int:
    """Write every non-empty conversation, plus its metadata, as JSONL.

    Each output line is a JSON object with two keys: ``messages`` (the value
    returned by ``get_conversation_messages``) and ``metadata`` (the
    conversation id and its ``created_at`` / ``updated_at`` values).
    Conversations for which no messages are returned are skipped.

    Args:
        output_path: File to write; it is opened in ``'w'`` mode, so an
            existing file is overwritten. The parent directory must exist.
        db_path: Path of the database file to open.

    Returns:
        The number of lines (conversations) written.
    """
    store = LLMDatabase(db_path)
    # The conversation list is fetched before the output file is opened.
    all_conversations = store.get_all_conversations()
    written = 0

    with open(output_path, 'w', encoding='utf-8') as out:
        for entry in all_conversations:
            conv_id = entry['conversation_id']
            conv_messages = store.get_conversation_messages(conv_id)
            if conv_messages:
                record = {
                    'messages': conv_messages,
                    'metadata': {
                        'conversation_id': conv_id,
                        'created_at': entry['created_at'],
                        'updated_at': entry['updated_at'],
                    },
                }
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                out.write(json.dumps(record, ensure_ascii=False) + '\n')
                written += 1

    return written
def main():
    """Command-line entry point for exporting or inspecting the database.

    Loads the project configuration (used only for the default database
    path), parses the command line, then does exactly one of:

    * ``--stats``: print the counters returned by ``LLMDatabase.get_stats``
      and exit without exporting anything.
    * ``--with-metadata``: export via ``export_with_metadata``. Note that
      ``--no-reasoning`` has no effect in this mode, because the flag is not
      passed to that function.
    * otherwise: export via ``export_training_data``, honouring
      ``--no-reasoning``.

    The parent directory of the output file is created if it is missing.
    """
    settings = load_config()
    # The 'export' section is looked up but nothing below reads it yet.
    export_config = settings.get('export', {})
    db_settings = settings.get('database', {})

    cli = argparse.ArgumentParser(
        description='Export LLM training data to JSONL format'
    )
    # The default file name carries the current local time, so repeated runs
    # do not overwrite each other.
    cli.add_argument(
        '--output', '-o',
        type=str,
        default=f"exports/training_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl",
        help='Output file path',
    )
    cli.add_argument(
        '--db',
        type=str,
        default=db_settings.get('path', 'llm_data.db'),
        help='Database file path',
    )
    cli.add_argument(
        '--no-reasoning',
        action='store_true',
        help='Exclude reasoning content from export',
    )
    cli.add_argument(
        '--with-metadata',
        action='store_true',
        help='Include metadata in export',
    )
    cli.add_argument(
        '--stats',
        action='store_true',
        help='Show database statistics',
    )
    args = cli.parse_args()

    # Statistics mode: report and stop before touching the filesystem.
    if args.stats:
        stats = LLMDatabase(args.db).get_stats()
        print("\nDatabase Statistics:")
        print(f" Conversations: {stats['conversations']}")
        print(f" Requests: {stats['requests']}")
        print(f" Responses: {stats['responses']}")
        print(f" Total Tokens: {stats['total_tokens']}")
        return

    destination = Path(args.output)
    destination.parent.mkdir(parents=True, exist_ok=True)

    if args.with_metadata:
        exported = export_with_metadata(str(destination), args.db)
        print(f"\nExported {exported} conversations with metadata to: {destination}")
        return

    include_reasoning = not args.no_reasoning
    exported = export_training_data(str(destination), args.db, include_reasoning)
    print(f"\nExported {exported} conversations to: {destination}")
    print(
        " (Reasoning content included)"
        if include_reasoning
        else " (Reasoning content excluded)"
    )
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()