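"""Export captured LLM conversation data from the LLMDatabase store to JSONL.

Provides a plain training-data export (optionally excluding reasoning content)
and a metadata-enriched export that attaches the conversation ID and timestamps
to each record. Run with --stats to print database statistics instead of exporting.
"""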
import json
import argparse
from pathlib import Path
from datetime import datetime

from database import LLMDatabase
from proxy_addon import load_config


def export_training_data(output_path: str, db_path: str = "llm_data.db",
                         include_reasoning: bool = True) -> int:
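    """Export conversations to JSONL using the database's built-in exporter."""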
    db = LLMDatabase(db_path)
    count = db.export_to_jsonl(output_path, include_reasoning)
    return count


def export_with_metadata(output_path: str, db_path: str = "llm_data.db") -> int:
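    """Export conversations to JSONL, attaching conversation metadata to each record."""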
    db = LLMDatabase(db_path)
    conversations = db.get_all_conversations()
    count = 0

    with open(output_path, 'w', encoding='utf-8') as f:
        for conv in conversations:
            messages = db.get_conversation_messages(conv['conversation_id'])

            if not messages:
                continue

            data = {
                'messages': messages,
                'metadata': {
                    'conversation_id': conv['conversation_id'],
                    'created_at': conv['created_at'],
                    'updated_at': conv['updated_at']
                }
            }

            jsonl_line = json.dumps(data, ensure_ascii=False)
            f.write(jsonl_line + '\n')
            count += 1

    return count


def main():
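    """Parse CLI arguments and run the requested export or stats report."""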
    config = load_config()
    export_config = config.get('export', {})
    db_config = config.get('database', {})

    parser = argparse.ArgumentParser(description='Export LLM training data to JSONL format')
    parser.add_argument('--output', '-o', type=str,
                        default=f"exports/training_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl",
                        help='Output file path')
    parser.add_argument('--db', type=str, default=db_config.get('path', 'llm_data.db'),
                        help='Database file path')
    parser.add_argument('--no-reasoning', action='store_true',
                        help='Exclude reasoning content from export')
    parser.add_argument('--with-metadata', action='store_true',
                        help='Include metadata in export')
    parser.add_argument('--stats', action='store_true',
                        help='Show database statistics')

    args = parser.parse_args()

    if args.stats:
        db = LLMDatabase(args.db)
        stats = db.get_stats()
        print("\nDatabase Statistics:")
        print(f"  Conversations: {stats['conversations']}")
        print(f"  Requests: {stats['requests']}")
        print(f"  Responses: {stats['responses']}")
        print(f"  Total Tokens: {stats['total_tokens']}")
        return

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    include_reasoning = not args.no_reasoning

    if args.with_metadata:
        count = export_with_metadata(str(output_path), args.db)
        print(f"\nExported {count} conversations with metadata to: {output_path}")
    else:
        count = export_training_data(str(output_path), args.db, include_reasoning)
        print(f"\nExported {count} conversations to: {output_path}")

    if include_reasoning:
        print("  (Reasoning content included)")
    else:
        print("  (Reasoning content excluded)")


if __name__ == '__main__':
    main()