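"""Export captured LLM conversation data from the LLMDatabase store to JSONL.

Provides a plain training-data export (optionally excluding reasoning content)
and a metadata-enriched export that attaches the conversation ID and timestamps
to each record. Run with --stats to print database statistics instead of exporting.
"""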
import json
import argparse
from pathlib import Path
from datetime import datetime

from database import LLMDatabase
from proxy_addon import load_config


def export_training_data(output_path: str, db_path: str = "llm_data.db",
                         include_reasoning: bool = True) -> int:
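    """Export conversations to JSONL using the database's built-in exporter."""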
    db = LLMDatabase(db_path)
    count = db.export_to_jsonl(output_path, include_reasoning)
    return count


def export_with_metadata(output_path: str, db_path: str = "llm_data.db") -> int:
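    """Export conversations to JSONL, attaching conversation metadata to each record."""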
    db = LLMDatabase(db_path)
    conversations = db.get_all_conversations()
    count = 0

    with open(output_path, 'w', encoding='utf-8') as f:
        for conv in conversations:
            messages = db.get_conversation_messages(conv['conversation_id'])

            if not messages:
                continue

            data = {
                'messages': messages,
                'metadata': {
                    'conversation_id': conv['conversation_id'],
                    'created_at': conv['created_at'],
                    'updated_at': conv['updated_at']
                }
            }

            jsonl_line = json.dumps(data, ensure_ascii=False)
            f.write(jsonl_line + '\n')
            count += 1

    return count


def main():
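    """Parse CLI arguments and run the requested export or stats report."""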
    config = load_config()
    export_config = config.get('export', {})
    db_config = config.get('database', {})

    parser = argparse.ArgumentParser(description='Export LLM training data to JSONL format')
    parser.add_argument('--output', '-o', type=str,
                        default=f"exports/training_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl",
                        help='Output file path')
    parser.add_argument('--db', type=str, default=db_config.get('path', 'llm_data.db'),
                        help='Database file path')
    parser.add_argument('--no-reasoning', action='store_true',
                        help='Exclude reasoning content from export')
    parser.add_argument('--with-metadata', action='store_true',
                        help='Include metadata in export')
    parser.add_argument('--stats', action='store_true',
                        help='Show database statistics')

    args = parser.parse_args()

    if args.stats:
        db = LLMDatabase(args.db)
        stats = db.get_stats()
        print("\nDatabase Statistics:")
        print(f"  Conversations: {stats['conversations']}")
        print(f"  Requests: {stats['requests']}")
        print(f"  Responses: {stats['responses']}")
        print(f"  Total Tokens: {stats['total_tokens']}")
        return

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    include_reasoning = not args.no_reasoning

    if args.with_metadata:
        count = export_with_metadata(str(output_path), args.db)
        print(f"\nExported {count} conversations with metadata to: {output_path}")
    else:
        count = export_training_data(str(output_path), args.db, include_reasoning)
        print(f"\nExported {count} conversations to: {output_path}")

    if include_reasoning:
        print("  (Reasoning content included)")
    else:
        print("  (Reasoning content excluded)")


if __name__ == '__main__':
    main()