第一个版本
This commit is contained in:
93
export.py
Normal file
93
export.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import json
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from database import LLMDatabase
|
||||
from proxy_addon import load_config
|
||||
|
||||
|
||||
def export_training_data(output_path: str, db_path: str = "llm_data.db",
                         include_reasoning: bool = True) -> int:
    """Export training data by delegating to ``LLMDatabase.export_to_jsonl``.

    Args:
        output_path: Destination path, passed through to ``export_to_jsonl``.
        db_path: Path used to construct the ``LLMDatabase``.
        include_reasoning: Forwarded unchanged to ``export_to_jsonl``.

    Returns:
        The value returned by ``export_to_jsonl`` (annotated as an int count).
    """
    return LLMDatabase(db_path).export_to_jsonl(output_path, include_reasoning)
|
||||
|
||||
|
||||
def export_with_metadata(output_path: str, db_path: str = "llm_data.db") -> int:
    """Write each conversation and its metadata as one JSON object per line.

    Every line holds a ``messages`` entry (the result of
    ``get_conversation_messages``) and a ``metadata`` entry carrying the
    conversation's ``conversation_id``, ``created_at`` and ``updated_at``.
    Conversations whose message lookup returns an empty/falsy result are
    skipped and do not count towards the total.

    Args:
        output_path: File to create (or overwrite), written as UTF-8.
        db_path: Path used to construct the ``LLMDatabase``.

    Returns:
        The number of lines written.
    """
    database = LLMDatabase(db_path)
    all_conversations = database.get_all_conversations()

    written = 0
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for conversation in all_conversations:
            conversation_id = conversation['conversation_id']
            messages = database.get_conversation_messages(conversation_id)

            if messages:
                metadata = {
                    'conversation_id': conversation_id,
                    'created_at': conversation['created_at'],
                    'updated_at': conversation['updated_at'],
                }
                record = {'messages': messages, 'metadata': metadata}

                # ensure_ascii=False keeps non-ASCII text readable in the file.
                out_file.write(json.dumps(record, ensure_ascii=False) + '\n')
                written += 1

    return written
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: export conversations or print statistics.

    Loads the project configuration to obtain the default database path,
    parses the command-line flags, and then does one of:

    * ``--stats``: print the counters returned by ``LLMDatabase.get_stats``
      and exit without exporting anything.
    * ``--with-metadata``: export through ``export_with_metadata``.
    * otherwise: export through ``export_training_data``, honouring
      ``--no-reasoning``.

    The parent directory of the output file is created if it is missing.
    """
    config = load_config()
    db_config = config.get('database', {})

    parser = argparse.ArgumentParser(description='Export LLM training data to JSONL format')
    parser.add_argument('--output', '-o', type=str,
                        default=f"exports/training_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl",
                        help='Output file path')
    parser.add_argument('--db', type=str, default=db_config.get('path', 'llm_data.db'),
                        help='Database file path')
    parser.add_argument('--no-reasoning', action='store_true',
                        help='Exclude reasoning content from export')
    parser.add_argument('--with-metadata', action='store_true',
                        help='Include metadata in export')
    parser.add_argument('--stats', action='store_true',
                        help='Show database statistics')

    args = parser.parse_args()

    # Statistics mode is read-only: report and stop before touching the
    # output path.
    if args.stats:
        db = LLMDatabase(args.db)
        stats = db.get_stats()
        print("\nDatabase Statistics:")
        print(f" Conversations: {stats['conversations']}")
        print(f" Requests: {stats['requests']}")
        print(f" Responses: {stats['responses']}")
        print(f" Total Tokens: {stats['total_tokens']}")
        return

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    include_reasoning = not args.no_reasoning

    if args.with_metadata:
        # export_with_metadata takes no reasoning switch, so --no-reasoning
        # has no effect on this path and no reasoning note is printed.
        count = export_with_metadata(str(output_path), args.db)
        print(f"\nExported {count} conversations with metadata to: {output_path}")
    else:
        count = export_training_data(str(output_path), args.db, include_reasoning)
        print(f"\nExported {count} conversations to: {output_path}")

        # Only this path receives include_reasoning, so only here does the
        # note describe what was actually written.
        if include_reasoning:
            print(" (Reasoning content included)")
        else:
            print(" (Reasoning content excluded)")
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user