-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_preprocessing.py
More file actions
72 lines (56 loc) · 1.93 KB
/
Copy pathtest_preprocessing.py
File metadata and controls
72 lines (56 loc) · 1.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
Test preprocessing pipeline.
"""
import sys
import os
# Put this file's directory at the front of sys.path so the `src.*` imports
# below are looked up there first, regardless of the launch directory.
# NOTE(review): presumably the `src` package sits next to this file - confirm.
project_root = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, project_root)
from src.loader import DocumentLoader
from src.preprocessing import TextPreprocessor
def main():
    """Run the preprocessing pipeline on the raw texts and print a report.

    Steps:
      1. Load documents through ``DocumentLoader('data/raw_texts')``.
      2. Preprocess the first 500 characters of the first document and
         show the original text next to the first 50 resulting tokens.
      3. Preprocess every document and print the per-document token
         counts together with the average.

    Each loaded document is indexed by the ``'content'`` and ``'title'``
    keys. Nothing is returned; all output goes to stdout. If loading
    fails, or yields no documents, an error message is printed and the
    function returns early instead of raising.
    """
    rule = "=" * 70  # horizontal separator reused by every banner

    print(rule)
    print("TEXT PREPROCESSING TEST")
    print(rule)
    print()

    # Load documents.
    # NOTE: this path is relative to the current working directory.
    loader = DocumentLoader('data/raw_texts')
    try:
        documents = loader.load_documents()
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and stop.
        print(f"❌ ERROR: {e}")
        print("\nMake sure you have .txt files in data/raw_texts folder")
        return

    # Guard against an empty result: the sample step below indexes
    # documents[0] and the statistics step divides by the document count.
    if not documents:
        print("❌ ERROR: No documents found")
        print("\nMake sure you have .txt files in data/raw_texts folder")
        return

    print(f"\n✅ Loaded {len(documents)} documents\n")

    # Initialize preprocessor
    preprocessor = TextPreprocessor(use_stemming=True, remove_stopwords=True)

    # Test on sample text
    sample_text = documents[0]['content'][:500]
    print(rule)
    print("ORIGINAL TEXT (first 500 chars):")
    print(rule)
    print(sample_text)
    print("\n" + rule + "\n")

    sample_tokens = preprocessor.preprocess(sample_text)
    print(f"PROCESSED TOKENS ({len(sample_tokens)} tokens):")
    print(rule)
    print(sample_tokens[:50])  # Show first 50 tokens
    print()

    # Process all documents
    print(rule)
    print("PROCESSING ALL DOCUMENTS...")
    print(rule)
    all_contents = [doc['content'] for doc in documents]
    processed_docs = preprocessor.preprocess_documents(all_contents)

    # Avoid ZeroDivisionError should the preprocessor return nothing.
    if processed_docs:
        avg_tokens = sum(len(d) for d in processed_docs) / len(processed_docs)
    else:
        avg_tokens = 0

    print(f"\n✅ Total documents processed: {len(processed_docs)}")
    print(f"✅ Average tokens per document: {avg_tokens:,.0f}")

    print("\n" + rule)
    print("DOCUMENT STATISTICS:")
    print(rule)
    # Distinct loop name so the sample tokens above are not shadowed.
    for doc, doc_tokens in zip(documents, processed_docs):
        print(f" • {doc['title']}: {len(doc_tokens):,} tokens")

    print("\n" + rule)
    print("✅ Test completed successfully!")
    print(rule)
# Script entry point: run the test only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()