-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathupload_transcripts.py
More file actions
68 lines (52 loc) · 1.72 KB
/
upload_transcripts.py
File metadata and controls
68 lines (52 loc) · 1.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
import sys
import boto3
from botocore.exceptions import ClientError
from dotenv import load_dotenv
# Force UTF-8 on both standard streams; characters that cannot be encoded are
# replaced instead of raising an encoding error while printing.
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
sys.stderr.reconfigure(encoding="utf-8", errors="replace")

# Must run before the os.environ lookups below. Presumably populates the
# process environment from a local .env file (python-dotenv) -- confirm.
load_dotenv()

# Local directory that is scanned for transcript files.
INPUT_FOLDER = "transcripts"
# Name of the destination bucket.
BUCKET_NAME = "destiny-transcripts"
# Required setting: a missing variable raises KeyError when the module loads.
R2_ACCOUNT_ID = os.environ["R2_ACCOUNT_ID"]
# Account-specific endpoint URL passed to the S3 client as endpoint_url.
R2_ENDPOINT = f"https://{R2_ACCOUNT_ID}.r2.cloudflarestorage.com"
def get_s3_client():
    """Build a boto3 S3 client that talks to the R2 endpoint.

    The access key and secret are read from the ``R2_ACCESS_KEY_ID`` and
    ``R2_SECRET_ACCESS_KEY`` environment variables; if either one is missing
    the lookup raises ``KeyError``.

    Returns:
        The client object returned by ``boto3.client``.
    """
    # Dict literals evaluate in order, so the environment lookups happen in
    # the same sequence as they would as inline keyword arguments.
    client_options = {
        "endpoint_url": R2_ENDPOINT,
        "aws_access_key_id": os.environ["R2_ACCESS_KEY_ID"],
        "aws_secret_access_key": os.environ["R2_SECRET_ACCESS_KEY"],
        "region_name": "auto",
    }
    return boto3.client("s3", **client_options)
def file_exists_in_r2(s3, key):
    """Return whether an object named *key* already exists in the bucket.

    Issues a ``head_object`` request against ``BUCKET_NAME``.

    Args:
        s3: S3 client exposing ``head_object``.
        key: Object key to look up.

    Returns:
        ``True`` if the request succeeds, ``False`` if the service reports
        that the object was not found.

    Raises:
        ClientError: for any failure other than "not found" (for example
            permission or authentication errors), re-raised unchanged.
    """
    try:
        s3.head_object(Bucket=BUCKET_NAME, Key=key)
    except ClientError as e:
        # Use tolerant lookups so an unexpectedly shaped error response
        # cannot raise KeyError here and hide the original ClientError.
        error_code = e.response.get("Error", {}).get("Code")
        http_status = e.response.get("ResponseMetadata", {}).get(
            "HTTPStatusCode"
        )
        # A missing object may be reported as the bare status string "404",
        # as a named code, or only through the HTTP status of the response.
        if error_code in ("404", "NoSuchKey", "NotFound") or http_status == 404:
            return False
        raise
    return True
def main():
    """Upload every ``.txt`` transcript in ``INPUT_FOLDER`` to the bucket.

    Files are handled in sorted filename order and the bare filename is used
    as the object key. A file whose key already exists in the bucket is
    skipped. One progress line is printed per file, followed by a summary of
    the uploaded, skipped and total counts.

    Raises:
        FileNotFoundError: if ``INPUT_FOLDER`` does not exist (raised by
            ``os.listdir``).

    Any exception raised by the existence check or by the upload itself is
    not caught here, so it aborts the run.
    """
    s3 = get_s3_client()

    # Keep regular files only: a directory whose name happens to end in
    # ".txt" cannot be uploaded and would otherwise abort the whole run.
    txt_files = sorted(
        f
        for f in os.listdir(INPUT_FOLDER)
        if f.lower().endswith(".txt")
        and os.path.isfile(os.path.join(INPUT_FOLDER, f))
    )

    total = len(txt_files)
    uploaded = 0
    skipped = 0

    print(f"Found {total} transcript files to upload")

    for i, filename in enumerate(txt_files, start=1):
        key = filename
        local_path = os.path.join(INPUT_FOLDER, filename)

        # Skip keys that are already in the bucket so reruns do not
        # re-upload them.
        if file_exists_in_r2(s3, key):
            skipped += 1
            print(f"[{i}/{total}] SKIP (exists): {filename}")
            continue

        print(f"[{i}/{total}] Uploading: {filename}")
        s3.upload_file(
            local_path, BUCKET_NAME, key, ExtraArgs={"ContentType": "text/plain"}
        )
        uploaded += 1

    print(f"\nDone. Uploaded: {uploaded}, Skipped: {skipped}, Total: {total}")
# Run the upload only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()