第一个版本

2026-01-11 04:17:53 +08:00
commit c160320892
11 changed files with 2383 additions and 0 deletions
--- a/export.py
+++ b/export.py
@@ -0,0 +1,93 @@
+import json
+import argparse
+from pathlib import Path
+from datetime import datetime
+from database import LLMDatabase
+from proxy_addon import load_config
+
+
+def export_training_data(output_path: str, db_path: str = "llm_data.db",
+                         include_reasoning: bool = True) -> int:
+    """Export stored conversations to a JSONL file.
+
+    Opens the database at ``db_path`` and delegates the actual writing to
+    ``LLMDatabase.export_to_jsonl``.
+
+    Args:
+        output_path: Destination path handed to the database exporter.
+        db_path: Path passed to the ``LLMDatabase`` constructor.
+        include_reasoning: Forwarded unchanged to ``export_to_jsonl``.
+
+    Returns:
+        Whatever ``export_to_jsonl`` returns (annotated here as a count).
+    """
+    return LLMDatabase(db_path).export_to_jsonl(output_path, include_reasoning)
+
+
+def export_with_metadata(output_path: str, db_path: str = "llm_data.db") -> int:
+    """Write every non-empty conversation, plus its metadata, as JSONL.
+
+    Each output line is a JSON object with a ``messages`` entry and a
+    ``metadata`` entry holding the conversation's ``conversation_id``,
+    ``created_at`` and ``updated_at`` values. Conversations for which the
+    database returns no messages are skipped.
+
+    Args:
+        output_path: File to create or overwrite (written as UTF-8).
+        db_path: Path passed to the ``LLMDatabase`` constructor.
+
+    Returns:
+        The number of lines (conversations) written.
+    """
+    store = LLMDatabase(db_path)
+    # Fetch the conversation list before opening the file so a failing
+    # query does not truncate an existing export.
+    all_conversations = store.get_all_conversations()
+    written = 0
+
+    with open(output_path, 'w', encoding='utf-8') as sink:
+        for conversation in all_conversations:
+            conversation_id = conversation['conversation_id']
+            messages = store.get_conversation_messages(conversation_id)
+            if not messages:
+                continue
+
+            metadata = {
+                'conversation_id': conversation_id,
+                'created_at': conversation['created_at'],
+                'updated_at': conversation['updated_at'],
+            }
+            record = {'messages': messages, 'metadata': metadata}
+
+            # ensure_ascii=False keeps non-ASCII text readable in the output.
+            sink.write(json.dumps(record, ensure_ascii=False) + '\n')
+            written += 1
+
+    return written
+
+
+def main():
+    """Command-line entry point: export conversations or print DB statistics.
+
+    Options:
+        --output / -o     Output file path. The default lives under
+                          ``exports/`` and embeds a timestamp taken when the
+                          parser is built.
+        --db              Database file path. The default comes from the
+                          ``database.path`` entry of the loaded config,
+                          falling back to ``llm_data.db``.
+        --no-reasoning    Ask the plain exporter to leave out reasoning.
+        --with-metadata   Use ``export_with_metadata`` instead of
+                          ``export_training_data``.
+        --stats           Print database statistics and exit without
+                          exporting anything.
+
+    The parent directory of the output file is created when missing.
+    """
+    config = load_config()
+    db_config = config.get('database', {})
+
+    parser = argparse.ArgumentParser(description='Export LLM training data to JSONL format')
+    parser.add_argument('--output', '-o', type=str,
+                       default=f"exports/training_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl",
+                       help='Output file path')
+    parser.add_argument('--db', type=str, default=db_config.get('path', 'llm_data.db'),
+                       help='Database file path')
+    parser.add_argument('--no-reasoning', action='store_true',
+                       help='Exclude reasoning content from export')
+    parser.add_argument('--with-metadata', action='store_true',
+                       help='Include metadata in export')
+    parser.add_argument('--stats', action='store_true',
+                       help='Show database statistics')
+
+    args = parser.parse_args()
+
+    if args.stats:
+        db = LLMDatabase(args.db)
+        stats = db.get_stats()
+        print("\nDatabase Statistics:")
+        print(f"  Conversations: {stats['conversations']}")
+        print(f"  Requests: {stats['requests']}")
+        print(f"  Responses: {stats['responses']}")
+        print(f"  Total Tokens: {stats['total_tokens']}")
+        return
+
+    output_path = Path(args.output)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    include_reasoning = not args.no_reasoning
+
+    if args.with_metadata:
+        count = export_with_metadata(str(output_path), args.db)
+        print(f"\nExported {count} conversations with metadata to: {output_path}")
+        if args.no_reasoning:
+            # export_with_metadata() has no reasoning switch, so the flag was
+            # never applied; say so instead of claiming content was excluded.
+            print("  (Warning: --no-reasoning is not applied to --with-metadata exports)")
+        else:
+            print("  (Reasoning content included)")
+        return
+
+    count = export_training_data(str(output_path), args.db, include_reasoning)
+    print(f"\nExported {count} conversations to: {output_path}")
+
+    if include_reasoning:
+        print("  (Reasoning content included)")
+    else:
+        print("  (Reasoning content excluded)")
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()