-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_vectorizer.py
More file actions
88 lines (69 loc) · 2.52 KB
/
Copy pathtest_vectorizer.py
File metadata and controls
88 lines (69 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Test TF-IDF vectorizer.
"""
import sys
import os
project_root = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, project_root)
import numpy as np
from src.loader import DocumentLoader
from src.preprocessing import TextPreprocessor
from src.vectorizer import TFIDFVectorizer
def _print_banner(title, *, leading_blank=False, width=70):
    """Print ``title`` framed above and below by a rule of ``=`` characters.

    Args:
        title: Text printed between the two rules.
        leading_blank: When true, emit one empty line before the first rule.
        width: Number of ``=`` characters in each rule.
    """
    rule = "=" * width
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


def main():
    """Run the TF-IDF vectorizer end to end and print a human-readable report.

    Steps performed:
      1. Load documents from ``data/raw_texts`` via ``DocumentLoader``.
      2. Preprocess every document's ``content`` with ``TextPreprocessor``
         (stemming and stop-word removal both enabled).
      3. Fit ``TFIDFVectorizer`` and print the resulting matrix shape.
      4. Print up to 15 highest-scoring terms of the first document.
      5. Print vocabulary statistics.

    The function returns early (after printing an error and a hint) when
    loading raises or when the loader yields no documents. It returns ``None``
    in every case.
    """
    _print_banner("TF-IDF VECTORIZER TEST")
    print()

    # Load documents
    loader = DocumentLoader('data/raw_texts')
    try:
        documents = loader.load_documents()
    except Exception as e:
        # Top-level script boundary: report the problem and stop instead of
        # dumping a traceback on the user.
        print(f"❌ ERROR: {e}")
        print("\nMake sure you have .txt files in data/raw_texts folder")
        return

    # An empty result would otherwise crash further down: indexing the first
    # document raises IndexError and the per-document average divides by zero.
    if len(documents) == 0:
        print("❌ ERROR: No documents were loaded")
        print("\nMake sure you have .txt files in data/raw_texts folder")
        return

    print(f"\n✅ Loaded {len(documents)} documents\n")

    # Preprocess documents
    _print_banner("PREPROCESSING DOCUMENTS...")
    preprocessor = TextPreprocessor(use_stemming=True, remove_stopwords=True)
    all_contents = [doc['content'] for doc in documents]
    processed_docs = preprocessor.preprocess_documents(all_contents)
    print("✅ Preprocessing complete\n")

    # Build TF-IDF vectors
    _print_banner("BUILDING TF-IDF VECTORS...")
    vectorizer = TFIDFVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_docs)
    print(f"\n✅ TF-IDF matrix shape: {tfidf_matrix.shape}")
    print(" (documents × vocabulary size)\n")

    # Analyze first document
    doc_idx = 0
    first_title = documents[doc_idx]['title']
    _print_banner(f"TOP TERMS IN '{first_title}':")
    # NOTE(review): assumes tfidf_matrix is a dense NumPy array whose rows are
    # 1-D per-document score vectors — confirm against TFIDFVectorizer.
    doc_vector = tfidf_matrix[doc_idx]

    # Keep only terms that occur in the document, then order them by
    # descending TF-IDF score (argsort on the negated scores) and keep 15.
    nonzero_indices = np.where(doc_vector > 0)[0]
    top_indices = nonzero_indices[np.argsort(-doc_vector[nonzero_indices])][:15]

    # vectorizer.vocabulary maps term -> column index; invert it so column
    # indices can be translated back into terms.
    reverse_vocab = {idx: term for term, idx in vectorizer.vocabulary.items()}

    print()
    for rank, idx in enumerate(top_indices, 1):
        term = reverse_vocab[idx]
        score = doc_vector[idx]
        print(f" {rank:2d}. {term:20s} → {score:.4f}")

    # Show vocabulary statistics
    _print_banner("VOCABULARY STATISTICS:", leading_blank=True)
    print(f" • Total unique terms: {len(vectorizer.vocabulary):,}")
    print(f" • Documents: {vectorizer.num_documents}")
    print(f" • Average non-zero terms per document: {np.count_nonzero(tfidf_matrix) / len(documents):,.0f}")

    _print_banner("✅ Test completed successfully!", leading_blank=True)
# Run the report only when this file is executed as a script, so importing
# the module does not trigger document loading or printing.
if __name__ == "__main__":
    main()