VynFi-python/examples/native_mode.py at main · VynFi/VynFi-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""VynFi v2.3 Output Modes — native numbers and flat layout.

DataSynth 2.3 added two output options that dramatically simplify analytics:

1. ``output.numericMode = "native"`` -- decimals as JSON numbers (1729237.30)
   instead of strings ("1729237.30"). No more pd.to_numeric() boilerplate.

2. ``output.exportLayout = "flat"`` -- header fields merged onto each line.
   No more manual flattening of journal entries.

This script requests native numbers but keeps the default nested layout (the
config below does not set ``exportLayout``), and contrasts the legacy and
v2.3 workflows.
"""

import json
import os

import pandas as pd

import vynfi

# Read the API key from the environment and stop with a readable message
# (rather than a bare KeyError traceback) when it is missing or empty.
api_key = os.environ.get("VYNFI_API_KEY")
if not api_key:
    raise SystemExit(
        "VYNFI_API_KEY is not set; export it before running this example."
    )
client = vynfi.VynFi(api_key=api_key)

# ── v2.3 config: native numbers, default nested layout ──────────────────────

# Output options sent with the job. Only the numeric mode is set here.
# NOTE: exportLayout="flat" is intentionally omitted -- it is still broken
# upstream as of DS 4.1.x (tracked in docs/ds-3.1.1-verification.md § D), so
# the default "nested" layout is used and the example runs end-to-end.
output_options = {
    "numericMode": "native",  # amounts as JSON numbers rather than strings
}

# Generation request passed to client.jobs.generate_config().
config = {
    "sector": "retail",
    "country": "US",
    "accountingFramework": "us_gaap",
    "rows": 500,
    "companies": 3,
    "periods": 2,
    "periodLength": "monthly",
    "processModels": ["o2c", "p2p"],
    "exportFormat": "json",
    "fraudPacks": [],
    "fraudRate": 0.0,
    "output": output_options,
}

# Submit the generation job. The message describes the config actually sent:
# native numbers with the default nested layout ("flat" is not requested).
print("Submitting v2.3 generation job (native numbers, nested layout)...")
job = client.jobs.generate_config(config=config)
print(f"  Job: {job.id}")

# Wait for the job to finish; timeout=300 is presumably seconds -- confirm
# against the client documentation.
completed = client.jobs.wait(job.id, timeout=300)
print(f"  Status: {completed.status}")

# Anything other than "completed" is reported and ends the script with exit
# code 1, so the download below never runs against a failed job.
if completed.status != "completed":
    print(f"  Error: {completed.error_detail}")
    raise SystemExit(1)

# ── Download the archive and load journal entries ───────────────────────────

# Fetch the finished job's output archive and print its string form.
archive = client.jobs.download_archive(completed.id)
print(f"\n{archive}")

# archive.json() is used because it works for both the zip and the
# managed_blob backends (for managed_blob it fetches lazily via a presigned
# URL).
entries = archive.json("journal_entries.json")

print(f"\nLoaded {len(entries)} journal entry documents")
if entries:
    # Preview the first document: its "header" mapping when present,
    # otherwise the document itself.
    sample = entries[0]
    sample_fields = sample.get("header", sample)
    print("\nFirst record (sample):")
    # Show at most the first ten fields, each with its value and Python type.
    preview = list(sample_fields.items())[:10]
    for field_name, field_value in preview:
        print(f"  {field_name}: {field_value!r} ({type(field_value).__name__})")

# Build one row per journal line: each line dict is merged on top of its
# entry's header fields, so line values win on key clashes. An entry with no
# "header" key uses itself as the header, and an entry with no "lines" key
# contributes itself as a single line.
rows = [
    {**entry.get("header", entry), **line}
    for entry in entries
    for line in entry.get("lines", [entry])
]
df = pd.DataFrame(rows)
print(f"\nDataFrame: {df.shape[0]} rows x {df.shape[1]} cols")

# Sum the debit column when it exists. numericMode=native applies to the flat
# layout only; nested output serialises amounts as strings, so the values are
# coerced to numbers (unparseable ones counted as 0) before summing.
if "debit_amount" in df.columns:
    debit_raw = df["debit_amount"]
    debit_total = pd.to_numeric(debit_raw, errors="coerce").fillna(0).sum()
    print(f"  Total debits: {float(debit_total):,.2f}")
    # The dtype of the untouched column shows how the values arrived.
    print(f"  Raw column dtype: {debit_raw.dtype}")

# ── Compare with legacy nested + string mode ────────────────────────────────
#
# Without native mode, you'd need:
#   for col in ["debit_amount", "credit_amount"]:
#       df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
#
# Without flat mode, you'd need:
#   rows = []
#   for entry in entries:
#       header = entry["header"]
#       for line in entry["lines"]:
#           rows.append({**header, **line})
#   df = pd.DataFrame(rows)
#
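# With both options enabled, v2.3 eliminates both layers of boilerplate.
# This script still flattens and coerces above because it stays on the
# default nested layout.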

# Closing recap: legacy workflow versus the v2.3 flat + native workflow.
summary_lines = (
    "\n=== v2.3 simplification summary ===",
    "  Legacy: nested + string -> ~10 lines of flatten + numeric conversion",
    "  v2.3:   flat + native   -> 2 lines: download_file + pd.DataFrame",
)
for summary_line in summary_lines:
    print(summary_line)