-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsem_alloc.py
More file actions
80 lines (66 loc) · 3.24 KB
/
sem_alloc.py
File metadata and controls
80 lines (66 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import backend.processer as processer
import json
import numpy as np
import pickle
# Maps supported image file extensions to the MIME subtype string that
# processer.process_image expects (e.g. '.jpg' -> 'jpeg').
MIME_TYPES = {'.jpg': 'jpeg', '.jpeg': 'jpeg', '.png':'png', '.webp':'webp'}
def semantic_allocation(directory_path):
    """Walk *directory_path* recursively and write cold_storage.json.

    The output file is a JSON object mapping each supported file's full
    path to a one-line text summary:
      - audio (.mp3/.mp4/.m4a): summarized via processer.process_audio
      - images (.jpg/.jpeg/.png/.webp): via processer.process_image
      - .txt: the file's own contents
    Unsupported files are skipped with a message; errors on individual
    files are reported and do not abort the walk.
    """
    summaries = {}
    for root, _, files in os.walk(directory_path):
        for file_ in files:
            full_path = os.path.join(root, file_)
            try:
                _, ext = os.path.splitext(file_)
                ext = ext.lower()
                if ext in ('.mp3', '.mp4', '.m4a'):
                    text = processer.process_audio(full_path)
                elif ext in ('.jpg', '.jpeg', '.png', '.webp'):
                    text = processer.process_image(full_path, MIME_TYPES[ext])
                elif ext == '.txt':
                    with open(full_path, 'r', encoding='utf-8') as txt_file:
                        text = txt_file.read()
                else:
                    print(f"Unsupported file type for {full_path}")
                    continue
                # Collapse newlines so each summary is a single line.
                summaries[full_path] = text.replace('\n', ' ')
                print(f"Embedding file: {full_path}")
            except Exception as e:
                print(f"Error embedding {full_path}: {e}")
    # json.dump handles quoting and escaping. The previous hand-built JSON
    # was invalid: it left a trailing comma before the closing brace and
    # never escaped quotes/backslashes inside summaries, so json.load on
    # the result would fail.
    with open("cold_storage.json", "w", encoding="utf-8") as f:
        json.dump(summaries, f, ensure_ascii=False, indent=1)
def embed_everything(cold_storage_file, batch_size=15):
    """Embed every summary in *cold_storage_file* using Gemini, in batches.

    Parameters:
        cold_storage_file: path to a JSON object mapping file path -> summary
            text (as produced by semantic_allocation).
        batch_size: number of summaries sent per processer.gemini_embed call.

    Returns:
        (embeddings, vectors) where ``embeddings`` maps each file path to
        {"array": <list of floats>, "id": <stable integer index>} and
        ``vectors`` is a 2-D numpy array of the raw embedding vectors in
        the same order.

    Side effects: writes embeddings.json, embeddings.npy and embeddings.pkl
    to the current working directory.
    """
    with open(cold_storage_file, 'r') as f:
        summaries = json.load(f)
    items = list(summaries.items())
    total = len(items)
    num_batches = (total + batch_size - 1) // batch_size
    embeddings = {}
    vectors = []
    for start in range(0, total, batch_size):
        batch = items[start:start + batch_size]
        response = processer.gemini_embed(strings=[text for _, text in batch])
        # enumerate gives each file a stable id equal to its position in
        # the original ordering (start + offset within the batch).
        for offset, ((file_path, _), emb) in enumerate(zip(batch, response.embeddings)):
            vec = np.array(emb.values)
            embeddings[file_path] = {"array": vec.tolist(), "id": start + offset}
            vectors.append(vec)
        print(f"Processed batch {start // batch_size + 1}/{num_batches}")
    with open("embeddings.json", "w") as f2:
        json.dump(embeddings, f2)
    arr = np.asarray(vectors)
    np.save('embeddings.npy', arr)
    with open("embeddings.pkl", "wb") as pkl_file:
        pickle.dump(arr, pkl_file)
    return embeddings, arr
if __name__ == "__main__":
    # Manual pipeline driver: optionally rebuild cold_storage.json from a
    # directory, then embed everything in it. Guarded so importing this
    # module no longer triggers the embedding run as a side effect.
    # directory_path = "/Users/derekzhu/Code/EmbedAnything/EmbedAnything/test"
    # semantic_allocation(directory_path)
    embed_everything("/Users/derekzhu/Code/EmbedAnything/EmbedAnything/cold_storage.json")