# Source: PDF-Assistant-RAG/.env.example (param20h/PDF-Assistant-RAG, dev branch)
#  Document AI Analyst — Environment Configuration
#  Copy this file to backend/.env and fill in your values:
#    cp .env.example backend/.env


# ── Application Config ──────────────────────────────────────────────

# Secret key for signing JWT tokens.
# Generate one: python -c "import secrets; print(secrets.token_urlsafe(32))"
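# Required in production — the value below is only a placeholder; replace it
# with a freshly generated secret before deploying.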
SECRET_KEY=change-me-in-production

# Runtime environment. Set to "production" in production.
# Optional — defaults to "development"
ENVIRONMENT=development

# Debug mode. Do NOT enable in production.
# Optional — defaults to False
# DEBUG=False

# Comma-separated list of allowed CORS origins.
# Only used when ENVIRONMENT=production.
# Optional — defaults to "http://localhost:3000,http://localhost:7860"
ALLOWED_ORIGINS=http://localhost:3000,http://localhost:7860


# ── Database ─────────────────────────────────────────────────

# SQLAlchemy database connection string.
# Default: SQLite stored at ./data/app.db
# For Postgres: postgresql://user:pass@host:5432/dbname
# Optional — defaults to sqlite:///./data/app.db
# DATABASE_URL=sqlite:///./data/app.db


# ── Field-level Encryption ─────────────────────────────────

# Dedicated key for encrypting sensitive user fields (tokens, secrets).
# Must be set in production — no default value is provided.
# Generate one: python -c "import secrets; print(secrets.token_urlsafe(32))"
# Required in production
# FIELD_ENCRYPTION_KEY=your_32_byte_base64_encoded_key


# ── Authentication ──────────────────────────────────────────

# JWT signing algorithm. Leave as default (HS256) unless you know what you're doing.
# Optional — defaults to "HS256"
# JWT_ALGORITHM=HS256

# JWT access token expiry in minutes.
# Optional — defaults to 15
# JWT_ACCESS_EXPIRY_MINUTES=15

# JWT refresh token expiry in days.
# Optional — defaults to 7
# JWT_REFRESH_EXPIRY_DAYS=7

# Google OAuth client ID for backend ID-token verification.
# Optional — required only for Google sign-in.
# GOOGLE_CLIENT_ID=your_google_oauth_client_id.apps.googleusercontent.com

# Google OAuth client secret for Drive read-only access.
# Optional — required only for Google Drive sync.
# GOOGLE_CLIENT_SECRET=your_google_oauth_client_secret

# Callback URL registered in your Google OAuth web client.
# Optional — required only for Google Drive sync.
# GOOGLE_DRIVE_REDIRECT_URI=http://localhost:8000/api/v1/auth/google-drive/callback

# Public frontend URL used to build email verification links.
# Optional — defaults to http://localhost:3000
FRONTEND_URL=http://localhost:3000

# Email verification token lifetime in hours.
# Optional — defaults to 24
# EMAIL_VERIFICATION_TOKEN_EXPIRE_HOURS=24


# ── SMTP / Email ───────────────────────────────────────────

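# SMTP settings used to send account verification emails.
# Replace the placeholder username, password, sender address, and server with
# your mail provider's values.
# The values below select STARTTLS on port 587 (MAIL_STARTTLS=True,
# MAIL_SSL_TLS=False). For a server that expects implicit TLS (commonly port
# 465), the usual setup is MAIL_STARTTLS=False and MAIL_SSL_TLS=True — confirm
# against your provider's documentation.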
MAIL_USERNAME=your_smtp_username
MAIL_PASSWORD=your_smtp_or_gmail_app_password
MAIL_FROM=your_sender_email@example.com
MAIL_SERVER=smtp.example.com
MAIL_PORT=587
MAIL_STARTTLS=True
MAIL_SSL_TLS=False


# ── Celery / Redis Background Processing ───────────────────

# Redis URL used by FastAPI to enqueue PDF processing jobs.
# Optional — defaults to redis://localhost:6379/0
# CELERY_BROKER_URL=redis://localhost:6379/0

# Redis URL used by Celery to store task results/status.
# Optional — defaults to redis://localhost:6379/1
# CELERY_RESULT_BACKEND=redis://localhost:6379/1


# ── File Upload ─────────────────────────────────────────────

# Directory where uploaded documents are stored.
# Optional — defaults to "./data/uploads"
# UPLOAD_DIR=./data/uploads

# Maximum upload file size in megabytes.
# Optional — defaults to 50
# MAX_UPLOAD_SIZE_MB=50

# Comma-separated list of allowed file extensions.
# Optional — defaults to "pdf,docx,txt,md"
# ALLOWED_EXTENSIONS=pdf,docx,txt,md


# ── HuggingFace (token required for LLM inference; OAuth optional) ──

# HuggingFace API token. Used to call the Inference API for LLM responses.
# Get yours: https://huggingface.co/settings/tokens (free tier available)
# Required (app won't generate answers without it)
HF_TOKEN=your_huggingface_token_here

# HuggingFace OAuth client ID, client secret, and redirect (callback) URI for
# native login support.
# Optional — required only for Hugging Face sign-in
# HF_CLIENT_ID=your_hf_oauth_client_id
# HF_CLIENT_SECRET=your_hf_oauth_client_secret
# HF_REDIRECT_URI=http://localhost:8000/api/v1/auth/callback/huggingface


# ── LLM Configuration ───────────────────────────────────────

# HuggingFace model ID used for answer generation.
# Optional — defaults to "Qwen/Qwen2.5-72B-Instruct"
# LLM_MODEL=Qwen/Qwen2.5-72B-Instruct

# Sampling temperature (0.0 = deterministic, 1.0 = very creative).
# Optional — defaults to 0.3
# LLM_TEMPERATURE=0.3

# Maximum number of tokens the LLM can generate per response.
# Optional — defaults to 1024
# LLM_MAX_NEW_TOKENS=1024


# ── LangSmith Tracing (Optional) ────────────────────────

# Enable LangSmith tracing for the backend RAG pipeline.
# Optional — defaults to False
# LANGSMITH_TRACING=False

# LangSmith API key.
# Optional — only needed when LANGSMITH_TRACING=True
# LANGSMITH_API_KEY=

# LangSmith API endpoint.
# Optional — defaults to "https://api.smith.langchain.com"
# LANGSMITH_ENDPOINT=https://api.smith.langchain.com

# LangSmith project name used for traced runs.
# Optional — defaults to "pdf-assistant-rag"
# LANGSMITH_PROJECT=pdf-assistant-rag


# ── Embeddings ──────────────────────────────────────────────

# SentenceTransformer model ID for generating document embeddings.
# Model is downloaded once and cached locally. No external API call.
# Optional — defaults to "sentence-transformers/all-MiniLM-L6-v2"
# EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2

# Dimension of the embedding vectors (must match the model output).
# Optional — defaults to 384
# EMBEDDING_DIMENSION=384


# ── RAG Config ──────────────────────────────────────────────

# Number of characters per document chunk.
# Optional — defaults to 1000
# CHUNK_SIZE=1000

# Character overlap between consecutive chunks.
# Optional — defaults to 200
# CHUNK_OVERLAP=200

# Number of candidate chunks retrieved during semantic search.
# Optional — defaults to 20
# TOP_K_RETRIEVAL=20

# Number of top chunks passed to the LLM after reranking.
# Optional — defaults to 8
# TOP_K_RERANK=8

# Cross-encoder model used for reranking.
# Optional — defaults to "BAAI/bge-reranker-v2-m3"
# RERANKER_MODEL=BAAI/bge-reranker-v2-m3


# ── Knowledge Graph / GraphRAG ──────────────────────────────

# Directory where GraphRAG stores per-document knowledge graphs.
# Optional — defaults to "./data/graphs"
# GRAPH_PERSIST_DIR=./data/graphs

# Maximum number of graph relationships appended to the RAG prompt.
# Optional — defaults to 12
# GRAPH_MAX_RELATIONSHIPS=12


# ── ChromaDB (Vector Store) ─────────────────────────────────

# Directory where ChromaDB persists its vector index to disk.
# Optional — defaults to "./data/chroma_db"
# CHROMA_PERSIST_DIR=./data/chroma_db


# ── Document Cleanup ────────────────────────────────────────

# Enable automatic cleanup of active (not soft-deleted) documents that have
# not been accessed recently.
# Optional — defaults to True
# DOC_CLEANUP_ENABLED=True

# Number of days without access before an active document is purged.
# Optional — defaults to 30
# DOC_CLEANUP_INACTIVE_DAYS=30

# Number of days a soft-deleted document is kept before permanent deletion.
# Optional — defaults to 90
# DOC_CLEANUP_MAX_AGE_DAYS=90


# ── Workspace Invitations ──────────────────────────────────

# Public-facing app URL used in invitation emails.
# Optional — defaults to "http://localhost:3000"
# APP_URL=http://localhost:3000

# Invitation token expiry in hours.
# Optional — defaults to 72
# INVITE_TOKEN_EXPIRY_HOURS=72


# ── Response Caching ───────────────────────────────────────

# Redis connection URL for the response cache.
# Optional — leave empty to fall back to the in-memory LRU cache.
# REDIS_URL=

# Cache entry time-to-live (TTL) in seconds; 3600 = 1 hour.
# Optional — defaults to 3600
# CACHE_TTL=3600

# Maximum number of entries kept in the in-memory LRU cache that is used when
# Redis is unavailable.
# Optional — the example value below is 128.
# CACHE_LRU_MAX_SIZE=128