-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path behavioral_fraud_patterns.py
More file actions
171 lines (143 loc) · 6.75 KB
/
behavioral_fraud_patterns.py
File metadata and controls
171 lines (143 loc) · 6.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""VynFi DataSynth 3.1.1 — behavioral fraud patterns.
DS 3.1.1 injects canonical forensic signals into fraud-labeled entries:
- 30 % weekend posting bias → measured ~32× lift
- 40 % round-dollar amount bias → measured ~170× lift
- 25 % post-close marking bias → measured ~3,100× lift
- 35 % off-hours created_at bias → not yet firing (upstream)
This example fits a RandomForest with behavioral features, reports
feature importance, and computes a direct fraud/baseline lift ratio per
signal so you can see the DS 3.1.1 bias working.
Each bias that fires also emits a secondary `AnomalyType::ProcessIssue`
label (`WeekendPosting`, `AfterHoursPosting`, `PostClosePosting`) into
`labels/anomaly_labels.json`.
"""
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import vynfi
# ── Submit a DS 3.1 generation job and download the result archive ──────────
client = vynfi.VynFi(api_key=os.environ["VYNFI_API_KEY"], timeout=180.0)

# Generation request: small retail dataset with the revenue-fraud pack on.
job_config = {
    "sector": "retail",
    "country": "US",
    "accountingFramework": "us_gaap",
    "rows": 1000,
    "companies": 3,
    "periods": 2,
    "periodLength": "monthly",
    "processModels": ["o2c", "p2p"],
    "exportFormat": "json",
    "fraudPacks": ["revenue_fraud"],
    "fraudRate": 0.05,
    # 3.1 behavioral biases default to on; nothing here turns them off.
    "output": {"numericMode": "native"},
}
print("Submitting 3.1 config to check behavioral fraud signal...")
submitted = client.jobs.generate_config(config=job_config)
finished = client.jobs.wait(submitted.id, timeout=300)
print(f"Job: {finished.id} status: {finished.status}")
if finished.status != "completed":
    # Surface the server-side failure detail as the process exit message.
    raise SystemExit(finished.error_detail)
archive = client.jobs.download_archive(finished.id)
print(f"{archive}\n")
# ── Build feature frame ─────────────────────────────────────────────────────
# Flatten {header, lines[]} entries (or flat dicts) into one row per line.
entries = archive.json("journal_entries.json")
rows = []
for e in entries:
    h = e.get("header", e)
    # .get's default already covers entries without a "lines" key, so the
    # previous `... if "lines" in e else [e]` conditional was redundant.
    for line in e.get("lines", [e]):
        rows.append({**h, **line})
df = pd.DataFrame(rows)
# Coerce amounts to numeric; unparseable values become 0.
for col in ("debit_amount", "credit_amount"):
    df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
df["posting_date"] = pd.to_datetime(df["posting_date"], errors="coerce", utc=True)
df["created_dt"] = pd.to_datetime(df.get("created_at"), errors="coerce", utc=True)
# Per-line magnitude (whichever side carries the value) plus a log scale.
df["amount"] = np.maximum(df["debit_amount"], df["credit_amount"])
df["log_amount"] = np.log1p(df["amount"])
# Round-dollar indicators; zero amounts are excluded from "round".
df["is_round_1000"] = ((df["amount"] % 1000 == 0) & (df["amount"] > 0)).astype(int)
df["is_round_5000"] = ((df["amount"] % 5000 == 0) & (df["amount"] > 0)).astype(int)
df["is_round_10k"] = ((df["amount"] % 10000 == 0) & (df["amount"] > 0)).astype(int)
# Calendar / behavioral indicators (NaT dates compare False → 0).
df["is_weekend"] = (df["posting_date"].dt.dayofweek >= 5).astype(int)
df["is_month_end"] = (df["posting_date"].dt.day >= 28).astype(int)
df["created_hour"] = df["created_dt"].dt.hour.fillna(-1).astype(int)
df["is_off_hours"] = ((df["created_hour"] >= 22) | (df["created_hour"] < 6)).astype(int)
# BUG FIX: df.get("is_post_close", False) returns the *scalar* False when the
# column is absent, and False.astype(int) raises AttributeError. Guard on
# column presence and fall back to an all-zero column instead.
if "is_post_close" in df.columns:
    df["is_post_close_int"] = df["is_post_close"].fillna(False).astype(int)
else:
    df["is_post_close_int"] = 0
df["is_manual_int"] = df["is_manual"].astype(int)
# Model inputs: magnitude plus the behavioral indicators built above; the
# fraud flag is the prediction target.
features = [
    "log_amount",
    "is_round_1000",
    "is_round_5000",
    "is_round_10k",
    "is_weekend",
    "is_month_end",
    "is_off_hours",
    "is_post_close_int",
    "is_manual_int",
]
feature_frame = df[features].fillna(0).astype(float)
X = feature_frame.to_numpy()
y = df["is_fraud"].astype(int).to_numpy()
n_fraud = int(y.sum())
prevalence_pct = 100 * n_fraud / max(len(y), 1)
print(f"Fraud prevalence: {n_fraud}/{len(y)} ({prevalence_pct:.1f}%)")
# ── Direct lift table (fraud-rate / baseline-rate per signal) ───────────────
print("\nBehavioral bias lift (fraud rate vs baseline rate):")
fraud_mask = df["is_fraud"].astype(bool)
baseline_mask = ~fraud_mask
signals = ("is_weekend", "is_round_1000", "is_round_5000", "is_off_hours", "is_post_close_int")
for signal in signals:
    if signal not in df.columns:
        continue
    # Mean of a 0/1 indicator is the rate of that signal in the population.
    f_rate = df.loc[fraud_mask, signal].mean() if fraud_mask.any() else 0.0
    b_rate = df.loc[baseline_mask, signal].mean() if baseline_mask.any() else 0.0
    if b_rate > 0:
        lift = f_rate / b_rate
    elif f_rate > 0:
        lift = float("inf")
    else:
        lift = 0.0
    if lift == float("inf"):
        lift_str = " ∞"
    else:
        lift_str = f"{lift:>8.1f}×"
    print(f" {signal:22s} fraud {f_rate:6.2%} baseline {b_rate:6.2%} lift {lift_str}")
if n_fraud < 20:
    # Too few positives for a meaningful fit; exit cleanly after cleanup.
    print("\nNot enough fraud labels to fit a model — try a larger rows/periods config.")
    archive.close()
    raise SystemExit(0)
# ── Random-forest fit + per-feature importances ─────────────────────────────
# class_weight="balanced" compensates for the low fraud prevalence.
clf = RandomForestClassifier(
    n_estimators=300, max_depth=8, class_weight="balanced", random_state=42, n_jobs=-1
).fit(X, y)
print("\nPer-feature contribution vs. fraud label:")
# Hoisted out of the loop: this prefix tuple is loop-invariant and was being
# rebuilt on every iteration.
behavioral_prefixes = ("is_weekend", "is_round", "is_off_hours", "is_post_close")
for name, imp in sorted(zip(features, clf.feature_importances_), key=lambda x: -x[1]):
    bar = "█" * int(imp * 60)
    flag = " ← behavioral" if name.startswith(behavioral_prefixes) else ""
    print(f" {name:20s} {imp:.4f} {bar}{flag}")
# ── Look for the secondary ProcessIssue labels ──────────────────────────────
print("\nSecondary ProcessIssue labels from labels/anomaly_labels.json:")
try:
    labels = archive.json("labels/anomaly_labels.json")
    counts = {}
    for lab in labels:
        atype = lab.get("anomaly_type", {})
        # anomaly_type is expected as a {category: subtype} single-key dict;
        # anything else is skipped.
        if not isinstance(atype, dict):
            continue
        cat = next(iter(atype), "")
        if cat != "ProcessIssue":
            continue
        sub = atype.get(cat, "")
        counts[sub] = counts.get(sub, 0) + 1
    if counts:
        # Most frequent subtype first.
        for sub_label, n_hits in sorted(counts.items(), key=lambda kv: -kv[1]):
            print(f" {sub_label:22s} {n_hits}")
    else:
        print(" No ProcessIssue labels — DS 3.1 behavioral tagging may not be live yet.")
except KeyError:
    print(" labels/anomaly_labels.json not present")
# ── 5-fold ROC-AUC for the model ────────────────────────────────────────────
try:
    auc_scores = cross_val_score(clf, X, y, cv=5, scoring="roc_auc", n_jobs=-1)
    print(f"\nROC-AUC (5-fold CV): {auc_scores.mean():.3f} ± {auc_scores.std():.3f}")
except ValueError as exc:
    # cross_val_score raises ValueError e.g. when a fold ends up with a
    # single class; report it instead of silently dropping the metric.
    print(f"\nROC-AUC (5-fold CV) skipped: {exc}")
archive.close()
print("""
Interpretation (DS 3.1.1, measured live):
- is_weekend fraud rate ~40 % vs baseline ~1.3 % (~32× lift)
- is_round_1000 fraud rate ~19 % vs baseline ~0.1 % (~170× lift)
- is_post_close_int fraud rate ~31 % vs baseline ~0.01 % (~3,106× lift)
- log_amount typically still dominates importance, but the behavioral
features now register meaningfully (>3–5 %) rather than the 0 % seen
on 2.3.x / 3.0 / 3.1.0.
- is_off_hours remains 0 % in both populations — upstream bug tracked
in docs/ds-3.1.1-verification.md § A.
""")