-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstore_codebase_in_memory.py
More file actions
184 lines (150 loc) · 6.24 KB
/
store_codebase_in_memory.py
File metadata and controls
184 lines (150 loc) · 6.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env python3
"""
Store Bob's codebase in pg_singularity for self-analysis
This allows Bob to understand his own architecture deeply
"""
import asyncio
import os
from pathlib import Path
from typing import List, Dict
import mimetypes
from src.memory.postgres_client import postgres_memory
class CodebaseMemorizer:
    """Store codebase files in Bob's memory for analysis.

    Walks a directory tree, reads UTF-8 text files matching a set of glob
    patterns, and persists each one via ``postgres_memory.think`` so the
    agent can later query and reason about its own source code.
    """

    def __init__(self, base_path: Path, max_file_size: int = 100000):
        """
        Args:
            base_path: Root directory of the codebase to memorize.
            max_file_size: Files whose decoded text exceeds this many
                characters are skipped. Defaults to 100000 (~100 KB).
        """
        self.base_path = base_path
        self.max_file_size = max_file_size
        # Running counters reported at the end of memorize_codebase().
        self.stats = {
            'files_processed': 0,
            'files_stored': 0,
            'errors': 0,
            'total_size': 0
        }

    async def store_file(self, file_path: Path) -> bool:
        """Store a single file in memory.

        Returns:
            True if the file was stored; False if it was skipped
            (binary, too large) or storing failed.
        """
        try:
            # Concept names use the path relative to the codebase root.
            rel_path = file_path.relative_to(self.base_path)

            try:
                content = file_path.read_text(encoding='utf-8')
            except UnicodeDecodeError:
                # Not valid UTF-8 text -> treat as binary and skip quietly.
                return False

            # Skip very large files to keep memory entries manageable.
            if len(content) > self.max_file_size:
                print(f"⚠️ Skipping large file: {rel_path}")
                return False

            # Classify by extension; anything unrecognized counts as code.
            mime_type, _ = mimetypes.guess_type(str(file_path))
            file_type = 'code'
            if file_path.suffix == '.md':
                file_type = 'documentation'
            elif file_path.suffix in ('.json', '.yml', '.yaml'):
                # Merged: JSON and YAML are both configuration.
                file_type = 'configuration'

            # Store in memory (return value intentionally unused).
            await postgres_memory.think(
                concept=f"Codebase file: {rel_path}",
                context=content,
                importance=0.6,  # Medium importance for codebase files
                metadata={
                    'type': 'codebase_file',
                    'file_type': file_type,
                    'file_path': str(rel_path),
                    'file_size': len(content),
                    'mime_type': mime_type or 'text/plain',
                    'kb_source': 'codebase_analysis'
                }
            )
            self.stats['files_stored'] += 1
            self.stats['total_size'] += len(content)
            print(f"✓ Stored: {rel_path} ({len(content)} bytes)")
            return True
        except Exception as e:
            # Best-effort pipeline: log, count the failure, keep going.
            print(f"✗ Error storing {file_path}: {e}")
            self.stats['errors'] += 1
            return False

    async def scan_directory(self, directory: Path, patterns: List[str]) -> List[Path]:
        """Recursively collect files under *directory* matching *patterns*.

        Skips well-known build/VCS/cache directories and binary or
        transient file extensions.

        Returns:
            A sorted, de-duplicated list of matching file paths.
        """
        # Directories whose contents should never be memorized.
        skip_dirs = {
            '__pycache__', '.git', 'node_modules', 'venv', 'env',
            '.pytest_cache', '.mypy_cache', 'htmlcov', 'dist', 'build'
        }
        # Extensions of binary or ephemeral files.
        skip_suffixes = {
            '.pyc', '.pyo', '.pyd', '.so', '.dylib', '.dll',
            '.log', '.pid', '.lock', '.tmp'
        }

        # Set, not list: overlapping patterns must not yield duplicates.
        files = set()
        for pattern in patterns:
            for file_path in directory.rglob(pattern):
                # Skip anything inside an ignored directory.
                if any(part in skip_dirs for part in file_path.parts):
                    continue
                # O(1) set membership instead of scanning skip_suffixes.
                if file_path.suffix in skip_suffixes:
                    continue
                if file_path.is_file():
                    files.add(file_path)
        return sorted(files)

    async def memorize_codebase(self):
        """Store entire codebase in memory and record a summary thought."""
        print(f"\n🧠 Storing codebase from: {self.base_path}")
        print("=" * 60)

        # Define file patterns to include
        patterns = [
            "*.py",    # Python files
            "*.md",    # Documentation
            "*.json",  # Configuration
            "*.yml",   # YAML configs
            "*.yaml",  # YAML configs
            "*.sh",    # Shell scripts
            "*.sql",   # SQL files
            "*.txt",   # Requirements, etc
            "*.toml",  # Config files
        ]

        # Scan for files
        print("\n📂 Scanning for files...")
        files = await self.scan_directory(self.base_path, patterns)
        print(f"Found {len(files)} files to process")

        # Process files
        print("\n💾 Storing files in memory...")
        for file_path in files:
            self.stats['files_processed'] += 1
            await self.store_file(file_path)

        # Print summary
        print("\n" + "=" * 60)
        print("📊 Summary:")
        print(f" Files processed: {self.stats['files_processed']}")
        print(f" Files stored: {self.stats['files_stored']}")
        print(f" Errors: {self.stats['errors']}")
        print(f" Total size: {self.stats['total_size']:,} bytes")

        # Record a high-importance summary thought about the run itself.
        await postgres_memory.think(
            concept="Codebase memorization complete",
            context=f"Stored {self.stats['files_stored']} files from my codebase in memory. "
            f"Total size: {self.stats['total_size']:,} bytes. "
            f"This allows me to deeply understand my own architecture and suggest improvements.",
            importance=0.8,
            metadata={
                'type': 'codebase_analysis',
                'stats': self.stats,
                'base_path': str(self.base_path)
            }
        )
        print("\n✨ Codebase stored in memory!")
async def main():
    """Store Bob's codebase in memory"""
    # Bring the memory backend online before doing anything else.
    await postgres_memory.initialize()

    # The codebase to memorize is the directory this script lives in.
    script_dir = Path(__file__).parent
    await CodebaseMemorizer(script_dir).memorize_codebase()

    # Shut the memory connection down cleanly.
    await postgres_memory.close()

    print("\n💭 Honest review incoming...")
    print(" Now I can analyze my own codebase from memory!")


if __name__ == "__main__":
    asyncio.run(main())