# process_text.py
from auto_ingest_config import get_fileserver_path
import os, re, uuid, csv, json, time
from datetime import datetime
import numpy as np
from transformers import AutoTokenizer, AutoModel
from gliner import GLiNER
# Checkpoint shared by the embedding tokenizer and the embedding model.
_EMBEDDINGS_CHECKPOINT = "sentence-transformers/all-MiniLM-L6-v2"
# Checkpoint for the GLiNER entity recogniser.
_ENTITY_CHECKPOINT = "urchade/gliner_large-v2"

# All three objects are created once, at import time, and reused by the
# helper functions in this module.
imbeddings_tokenizer = AutoTokenizer.from_pretrained(_EMBEDDINGS_CHECKPOINT)
imbeddings_model = AutoModel.from_pretrained(_EMBEDDINGS_CHECKPOINT)
entity_recognition_classifier = GLiNER.from_pretrained(_ENTITY_CHECKPOINT)
def named_entity_recognizer(query, labels=None):
    """Run the module-level GLiNER classifier over *query*.

    Args:
        query: Text to scan for entities.
        labels: Entity labels to look for. When omitted (``None``), the
            labels ``["Person", "Place", "Event", "Date", "Subject"]`` are
            used.

    Returns:
        The value returned by
        ``entity_recognition_classifier.predict_entities(query, labels)``.
    """
    if labels is None:
        # Build a fresh list on every call: a list literal in the signature
        # would be created once and shared, so any mutation by the callee
        # would leak into later calls.
        labels = ["Person", "Place", "Event", "Date", "Subject"]
    return entity_recognition_classifier.predict_entities(query, labels)
# Function to generate embeddings
def get_embeddings_for_text(text):
    """Embed *text* with the module-level tokenizer and model.

    The model's ``last_hidden_state`` is averaged along ``dim=1``, weighted
    by the tokenizer's attention mask so that positions added by
    ``padding=True`` do not dilute the average.

    Args:
        text: Input handed directly to ``imbeddings_tokenizer``. The caller
            in this file passes a single string; presumably a list of
            strings is also accepted by the tokenizer — confirm before
            relying on it.

    Returns:
        The pooled tensor converted with ``.numpy().tolist()`` (a nested
        list of floats).
    """
    # Local import keeps this block self-contained; torch is already
    # required by ``return_tensors='pt'``.
    import torch

    inputs = imbeddings_tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = imbeddings_model(**inputs)

    hidden = outputs.last_hidden_state
    # 1.0 for real tokens, 0.0 for padding; broadcast over the hidden dim.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # Clamp guards against division by zero for an all-padding row.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / counts).detach().numpy().tolist()
def list_files(directory):
    """Return the sorted, de-duplicated keys of ``_R.MP4`` files in *directory*.

    A file is selected when its name contains the literal text ``_R.MP4``;
    its key is everything before the last ``_R.`` in the name.

    Args:
        directory: Path of the directory to list (not recursive).

    Returns:
        A sorted list of unique key strings; empty when nothing matches.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. missing directory).
    """
    marker = "_R.MP4"
    keys = {
        filename.rsplit('_R.', 1)[0]
        for filename in os.listdir(directory)
        # Plain substring test so "." is matched literally rather than as a
        # regex wildcard (which would also accept names like "x_RXMP4").
        if marker in filename
    }
    return sorted(keys)
def is_valid_date_structure(dir_name):
    """Tell whether *dir_name* parses as a ``%Y/%m/%d`` date.

    Args:
        dir_name: Candidate string, e.g. ``"2024/01/31"``.

    Returns:
        ``True`` when ``datetime.strptime`` accepts the string with the
        format ``%Y/%m/%d``, ``False`` when it raises ``ValueError``.
    """
    try:
        datetime.strptime(dir_name, "%Y/%m/%d")
    except ValueError:
        # Wrong shape or an impossible calendar date.
        return False
    else:
        return True
def list_directories(base_path):
    """Walk *base_path* and process transcriptions for each dated directory.

    For every directory whose path relative to *base_path* has the form
    ``YYYY/MM/DD``, the video keys found by ``list_files`` are looked up in
    the ``dashcam/transcriptions`` location. For each key that has a
    ``<key>_medium_transcription.txt`` file, the JSON content is loaded, an
    embedding of its ``"text"`` field is computed and printed, the embedding
    is saved as JSON to ``<key>_medium_transcription_imbeddings.txt``, and
    the text and each segment's ``"text"`` / ``"tokens"`` are printed.

    Args:
        base_path: Root directory to walk.

    Returns:
        None. Results are printed and written to the transcriptions folder.

    Raises:
        KeyError: If a transcription lacks ``"text"`` or ``"segments"``, or a
            segment lacks ``"text"`` or ``"tokens"``.
        json.JSONDecodeError: If a transcription file is not valid JSON.
    """
    for root, _dirs, _files in os.walk(base_path):
        # Compare the path *relative* to base_path, with "/" separators, so
        # the check works whether or not base_path ends with a separator and
        # regardless of the platform's os.sep.
        relative = os.path.relpath(root, base_path).replace(os.sep, "/")
        if not is_valid_date_structure(relative):
            continue

        print(f"Valid directory structure found: {root}")
        transcriptions = get_fileserver_path("dashcam/transcriptions")
        key_list = list_files(root)
        total = len(key_list)
        print(f"{total} Videos left to process")

        for key in key_list:
            source_path = f"{transcriptions}/{key}_medium_transcription.txt"
            if not os.path.exists(source_path):
                # No transcription for this video yet; nothing to embed.
                continue

            with open(source_path, 'r', encoding="utf-8") as file:
                # Load the content as a JSON object
                data = json.load(file)

            print(key, end="|")
            embeddings = get_embeddings_for_text(data['text'])
            print(embeddings, end="|")
            print(data['text'], end="|")
            print("")

            target_path = f"{transcriptions}/{key}_medium_transcription_imbeddings.txt"
            with open(target_path, 'w', encoding="utf-8") as file:
                # Persist the embedding; without a write the file would be
                # created empty.
                json.dump(embeddings, file)

            print(key)
            print(data['text'])
            for segment in data["segments"]:
                print(segment['text'], segment['tokens'])
# Script entry: process every dated directory under the "dashcam" location.
# One pass is sufficient; an immediate second pass over the same tree would
# only redo the same embeddings and output.
list_directories(get_fileserver_path("dashcam"))