index_vault.py
import sys
import json

from tqdm import tqdm
from jina import Document, DocumentArray

from utils import (load_dataset, add_highlight, encode_query, get_tokens,
                   load_bm25_index, save_json, remove_old_note,
                   fast_find_document_by_name)

n_dim = 1536
ef_search = 50
max_connection = 16
ef_construction = 200
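# A note on these values (my reading, not documented in this file): n_dim
# matches the 1536-dimensional OpenAI embeddings, while ef_search,
# max_connection and ef_construction are presumably the usual HNSW tuning
# knobs of the annlite backend -- larger values trade indexing and search
# speed for recall.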
# ----------------------------------------------------------------------
json_path = sys.argv[1]
key = sys.argv[2]
plugin_path = sys.argv[3]
# ----------------------------------------------------------------------
data_path = plugin_path + '/vault_index/all_notes/'
bm25_index_filepath = plugin_path + '/BM25/bm25_index.json'
backup_path = plugin_path + '/backup'

da = load_dataset(data_path, metric='cosine', n_dim=n_dim, max_connection=max_connection, ef_search=ef_search)
bm25_index = load_bm25_index(bm25_index_filepath)
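# bm25_index maps a note name to {'id': <document id>, 'tokens': [...]},
# as written in index_vault() below.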
# If you want to back up your index to Jina Cloud, set this flag to True.
# TODO: get the user's decision through the plugin settings in Obsidian.
push_to_jina = False
def average_chunks_embedding(total_note: Document, factor: int):
    # factor is 1 if the note name is embedded, otherwise 0 to avoid an
    # extra term in the division below.
    note_embed = total_note.embedding
    for chunk in total_note.chunks:
        note_embed += chunk.embedding
    if len(total_note.chunks) > 0:
        # Divide by len(chunks) + factor because the name of the note may
        # also be included in the average.
        note_embed /= (len(total_note.chunks) + factor)
    return note_embed
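# A small worked example (hypothetical numbers): with factor=1, a name
# embedding of [2, 2] and chunk embeddings [1, 3] and [3, 5], the result is
# ([2, 2] + [1, 3] + [3, 5]) / (2 + 1) == [2.0, 3.33...]. Note that if the
# embeddings are numpy arrays, the in-place += may also mutate
# total_note.embedding itself; the caller overwrites it with the returned
# average anyway.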
def get_highlight_with_embedded_notes(openai_key, highlight, notes_list, n_dim=4096, embedd_names=True, highlight_readwise=None):
    """Take the name of a note and the text it contains and return a Jina
    Document whose text is the note name and whose chunks (sub-documents)
    are the individual sentences of the note.

    Args:
        openai_key (str): the API key passed on to encode_query
        highlight (str): the name/title of the note (becomes the document text)
        notes_list (list): the sentences inside the note to be embedded
        n_dim (int, optional): the number of dimensions the text gets
            embedded into. Defaults to 4096 (Cohere models).
        embedd_names (bool, optional): whether the note name contributes to
            the averaged embedding. Defaults to True.
        highlight_readwise (str, optional): an alias stored in the document's
            tags. Defaults to None.

    Returns:
        Document: the embedded note with its sentences as chunks
    """
    # Encode the name of the note itself so it is added to the average.
    # TODO: should the name get a special weight to emphasize or de-emphasize
    # it? Scaling the vector would preserve its meaning and only change the
    # average.
    database = encode_query(openai_key, highlight, n_dim=n_dim)
    if embedd_names:
        x = 1
    else:
        x = 0
    database.embedding = database.embedding * x
    children = DocumentArray(storage='annlite', config={
        'n_dim': n_dim, 'metric': 'cosine'})
    sentences = notes_list
    print("-- Encoding Sentences --")
    for sentence in sentences:
        child = encode_query(openai_key, sentence, n_dim)
        child.parent_id = database.id
        children.append(child)
    database.chunks = children
    # TODO: the embedding for the highlight/note title should actually be the
    # average of the sentences, not its own embedding.
    avg_note_embed = average_chunks_embedding(database, x)
    database.embedding = avg_note_embed
    if highlight_readwise:
        database.tags['alias'] = highlight_readwise
    return database
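# Usage sketch (hypothetical values; encode_query is assumed to return a Jina
# Document carrying the text and its embedding):
#
#   note = get_highlight_with_embedded_notes(
#       key, 'My Note', ['First sentence.', 'Second sentence.'],
#       n_dim=n_dim, embedd_names=True)
#   # note.text == 'My Note', note.chunks holds one Document per sentence,
#   # and note.embedding is the average computed above.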
def index_vault(files: dict):
    print("-- Embedding Files --")
    counter = 0
    count_added_files = 0
    for file_name, value in tqdm(files.items()):
        embedded_note = None
        file = value['full_path']
        is_modified = value['change_type']
        if is_modified == "deleted":
            # The target is only to delete a note from the knowledge base.
            note_name = file_name  # extract_note_title(file)
            note = Document(text=note_name)
            try:
                # Remove the entry from the index if it exists.
                bm25_index.pop(note.text)
                remove_old_note(da, note)
                print(f"Deleted file: {note.text}")
            except Exception as e:
                print(f'{file_name} was not in the index, so it could not be deleted')
                print(f'Full path: {file}')
                print(e)
            continue
        if is_modified == 'new':
            # If the document already exists and was not modified, do not
            # index it again.
            # old_note, counter = fast_find_document_by_name(da, file_name)
            try:
                old_note = bm25_index[file_name]
                if old_note:
                    print(f'{file_name}: already indexed and has not been changed since')
                    continue
            except KeyError:
                print(f'{file_name}: seems to be new')
        try:
            notes_list, highlight, highlight_readwise = add_highlight(file)
        except Exception as e:
            print(f'Error at: {file}')
            print(e)
            continue
        if notes_list and highlight:
            # Create a document with the note title as text and all
            # (embedded) sentences as chunks.
            try:
                embedded_note = get_highlight_with_embedded_notes(key, highlight, notes_list, embedd_names=True, n_dim=n_dim, highlight_readwise=highlight_readwise)
            except Exception as e:
                print(f'Error at: {file}')
                print(e)
                continue
            tokens = get_tokens(embedded_note)
            note_name = embedded_note.text
            bm25_index[note_name] = {'id': embedded_note.id, 'tokens': tokens}
            # if is_modified == 'modified':
            #     # If this is just a modified note, remove the old one from
            #     # the database.
            #     remove_old_note(da, embedded_note)
        if embedded_note:
            print(f"Encoded file: {embedded_note.text}")
            with da:
                # Always check that there is no duplicate in the database.
                if da and len(da) > 0:
                    remove_old_note(da, embedded_note)
                da.append(embedded_note)
            count_added_files += 1
        if counter % 20 == 0 and counter != 0:
            # Checkpoint reached: persist the BM25 index.
            save_json(bm25_index_filepath, bm25_index)
        counter += 1
    if files:
        print('Some changes happened in the vault since the last index')
        save_json(bm25_index_filepath, bm25_index)
        if count_added_files > 20 and push_to_jina:
            # If this is a major update to the database, create a backup.
            with da:
                # da.save_json(backup_path + '/backup.json')
                da.push('notes_all_update', show_progress=True)
    return da
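# The JSON file passed as sys.argv[1] is expected to look roughly like this
# (inferred from the keys read in index_vault(); the change_type values seen
# above are 'new', 'deleted' and, in commented-out code, 'modified'):
#
#   {
#       "My Note": {
#           "full_path": "/path/to/vault/My Note.md",
#           "change_type": "new"
#       }
#   }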
files = []
with open(json_path, 'r') as f:
    files = json.load(f)

index_vault(files)