-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquality_monitoring.py
More file actions
101 lines (85 loc) · 4.18 KB
/
quality_monitoring.py
File metadata and controls
101 lines (85 loc) · 4.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""VynFi Quality — monitor data quality scores across generation jobs."""
import json
import os
import vynfi
# Build the API client. The key is read from the VYNFI_API_KEY environment
# variable; indexing os.environ raises KeyError if it is not set.
client = vynfi.VynFi(api_key=os.environ["VYNFI_API_KEY"])

# ── Quality scores for recent jobs ──────────────────────────────────────────
#
# Every generation job is scored by VynFi on several axes:
#   - overall_score:      composite quality metric (0-100)
#   - benford_score:      fit of the leading-digit distribution to Benford's Law
#   - correlation_score:  statistical correlation quality between related fields
#   - distribution_score: how realistic the data distributions are
print("=== Quality Scores ===")
scores = client.quality.scores()
if not scores:
    # Nothing returned (empty/falsy result): tell the user how to get data.
    print(" No quality scores yet — generate some data first!")
else:
    # One multi-line report per score entry, separated by a blank line.
    for job_score in scores:
        print(f" Job {job_score.job_id}:")
        print(f" Overall: {job_score.overall_score}")
        print(f" Benford: {job_score.benford_score}")
        print(f" Correlation: {job_score.correlation_score}")
        print(f" Distribution: {job_score.distribution_score}")
        print(f" Table: {job_score.table_type}, Rows: {job_score.rows}")
        print()
# ── Quality timeline ────────────────────────────────────────────────────────
#
# Track quality trends over time (useful for regression testing).
# Each entry is printed as "<date>: <score> <bar>", where the bar holds one
# "#" per 5 score points.
print("=== Quality Timeline (30 days) ===")
timeline = client.quality.timeline(days=30)
if timeline:
    for day in timeline:
        if day.score is None:
            # A missing score cannot go through the ":5.1f" format spec
            # (formatting None raises TypeError), so show a right-aligned
            # placeholder of the same width and no bar.
            score_text = f"{'n/a':>5}"
            bar = ""
        else:
            score_text = f"{day.score:5.1f}"
            # A score of 0 yields an empty bar ("#" * 0).
            bar = "#" * int(day.score / 5)
        print(f" {day.date}: {score_text} {bar}")
else:
    print(" No timeline data yet.")
# ── Quality as DataFrame ────────────────────────────────────────────────────
#
# Optional summary via the pandas integration. The import and the call both
# sit inside the try, so an ImportError raised by either one is turned into
# an install hint instead of a traceback.
try:
    from vynfi.integrations.pandas import quality_to_dataframe

    quality_df = quality_to_dataframe(client)
    has_rows = not quality_df.empty
    if has_rows:
        print(f"\n=== Quality DataFrame ===")
        print(f" Shape: {quality_df.shape}")
        # Pull the column once and report its mean and minimum.
        overall = quality_df["overall_score"]
        print(f" Mean overall score: {overall.mean():.1f}")
        print(f" Min overall score: {overall.min():.1f}")
except ImportError:
    print("\n (Install pandas for DataFrame support: pip install vynfi[pandas])")
# ── Inline quality check using per-file download ─────────────────────────────
#
# Each archive includes data_quality_stats.json and balance_validation.json.
# Use download_file() to grab them directly — no need to download the full archive.
print("\n=== Inline Quality from Latest Job ===")
jobs = client.jobs.list(status="completed", limit=5)

# Walk the returned jobs in order and settle on the first one whose file
# listing can still be fetched; job_id stays None if none qualifies.
job_id = None
quality_files: list = []
for candidate in jobs.data:
    try:
        file_list = client.jobs.list_files(candidate.id)
    except vynfi.NotFoundError:
        continue  # archive GC'd — try the next one
    job_id = candidate.id
    # Keep only entries whose path mentions "quality" or "validation".
    quality_files = [
        entry
        for entry in file_list.files
        if any(tag in entry.path for tag in ("quality", "validation"))
    ]
    break

if job_id is None:
    # No job had a reachable archive: stop the script with a message.
    raise SystemExit("No completed jobs with accessible archives found.")

print(f" Quality-related files in job {job_id}:")
for entry in quality_files:
    print(f" {entry.path} ({entry.size_bytes:,} bytes)")
# Download individual quality files (no full archive needed).
#
# Only the download call is guarded: vynfi.NotFoundError is treated as "this
# job's archive does not contain the file". Parsing and printing happen in the
# else branch so that unrelated errors are not hidden behind that handler.
try:
    stats_raw = client.jobs.download_file(job_id, "data_quality_stats.json")
except vynfi.NotFoundError:
    print(" No data_quality_stats.json in this job")
else:
    stats = json.loads(stats_raw)
    # "duplicates" may be absent or null in the JSON; only dig into it when it
    # really is an object, otherwise fall back to 'N/A'.
    duplicates = stats.get("duplicates")
    if isinstance(duplicates, dict):
        total_duplicates = duplicates.get("total_duplicates", "N/A")
    else:
        total_duplicates = "N/A"
    print("\n Data quality stats:")
    print(f" Records with issues: {stats.get('records_with_issues', 'N/A')}")
    print(f" Duplicates: {total_duplicates}")

try:
    validation_raw = client.jobs.download_file(job_id, "balance_validation.json")
except vynfi.NotFoundError:
    print(" No balance_validation.json in this job")
else:
    validation = json.loads(validation_raw)
    # The "," thousands separator is only valid for numbers; applying it to
    # the 'N/A' fallback string raises ValueError. Format numerics only and
    # print anything else unchanged.
    entries_processed = validation.get("entries_processed", "N/A")
    if isinstance(entries_processed, (int, float)):
        entries_processed = f"{entries_processed:,}"
    print("\n Balance validation:")
    print(f" Balanced: {validation.get('is_balanced')}")
    print(f" Entries processed: {entries_processed}")
    print(f" Total debits: {validation.get('total_debits', 'N/A')}")
    print(f" Total credits: {validation.get('total_credits', 'N/A')}")