Files
FPandA-Engine/testing/generators/generate_data.py
2026-03-20 21:53:55 +01:00

383 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
FP&A Test Data Generator
Generates realistic CSV data for: Budget vs Actuals, Cash Flow, P&L, Headcount
Covers 2 years (2023–2024), 4 departments, product & service revenue mix.
"""
import csv
import random
import os
from datetime import date, timedelta
from dataclasses import dataclass, fields, asdict
from typing import List
# Seed the module-level RNG once, when this module is executed, so repeated
# runs draw the same pseudo-random sequence.
random.seed(42) # reproducible data
# Output directory: ../data/csv relative to this file. It is created here,
# as a module-level side effect, if it does not exist yet.
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "..", "data", "csv")
os.makedirs(OUTPUT_DIR, exist_ok=True)
# ── Company config ────────────────────────────────────────────────────────────
COMPANY = "AcmeSaaS Inc."
DEPARTMENTS = ["Engineering", "Sales", "Marketing", "Operations"]
YEARS = [2023, 2024]
# Revenue split: product (SaaS subscriptions) vs service (consulting/support)
# NOTE(review): neither mix constant is referenced in the part of the file
# visible here; the generators start from the BASE_* amounts below instead.
PRODUCT_REVENUE_MIX = 0.70 # 70% product (recurring SaaS)
SERVICE_REVENUE_MIX = 0.30 # 30% services
# Monthly growth rates (realistic SaaS-style); applied multiplicatively to the
# running base once per generated month.
PRODUCT_MONTHLY_GROWTH = 0.025 # 2.5% MoM
SERVICE_MONTHLY_GROWTH = 0.015 # 1.5% MoM
# Base monthly revenue ($) used for the first generated month.
BASE_PRODUCT_REVENUE = 180_000
BASE_SERVICE_REVENUE = 75_000
# Variance helpers — actuals deviate from budget by ±%
def vary(value: float, pct: float = 0.08) -> float:
    """Scale ``value`` by a random factor and round the result to 2 decimals.

    The factor is ``1 + u`` where ``u`` is drawn uniformly from
    ``[-pct, pct]`` using the module-level ``random`` generator, so each
    call consumes exactly one random draw.
    """
    drift = random.uniform(-pct, pct)
    noisy = value * (1 + drift)
    return round(noisy, 2)
def months_range(years: List[int]):
    """Lazily yield ``(year, month)`` pairs for every month of each year.

    Years are visited in the order given; months run from 1 through 12
    within each year.
    """
    calendar_months = range(1, 13)
    yield from ((yr, mo) for yr in years for mo in calendar_months)
# ── 1. Revenue (Budget vs Actuals) ───────────────────────────────────────────
@dataclass
class RevenueRow:
    """One CSV record: budgeted vs actual revenue for one revenue type in one month.

    Field order defines the CSV column order.
    """
    company: str
    year: int
    month: int
    period: str # e.g. "2023-01"
    revenue_type: str # "Product" | "Service"
    budget_amount: float
    actual_amount: float
    variance: float # actual - budget
    variance_pct: float # variance / budget * 100
def generate_revenue():
    """Build budget-vs-actual revenue rows and write them to CSV.

    For every month yielded by ``months_range(YEARS)`` one "Product" and one
    "Service" row is produced (in that order). The budget is the running
    base rounded to cents, the actual is that budget perturbed by
    ``vary(..., pct=0.10)``, and both bases are compounded by their monthly
    growth rate after each month. The rows are handed to ``write_csv`` and a
    one-line summary is printed.
    """
    rows = []
    running = {
        "Product": BASE_PRODUCT_REVENUE,
        "Service": BASE_SERVICE_REVENUE,
    }
    growth = {
        "Product": PRODUCT_MONTHLY_GROWTH,
        "Service": SERVICE_MONTHLY_GROWTH,
    }
    for yr, mo in months_range(YEARS):
        label = f"{yr}-{mo:02d}"
        for stream, level in running.items():
            planned = round(level, 2)
            realised = vary(planned, pct=0.10)
            delta = round(realised - planned, 2)
            if planned:
                delta_pct = round((delta / planned) * 100, 2)
            else:
                # Guard against division by zero for an empty budget.
                delta_pct = 0
            rows.append(RevenueRow(
                company=COMPANY,
                year=yr,
                month=mo,
                period=label,
                revenue_type=stream,
                budget_amount=planned,
                actual_amount=realised,
                variance=delta,
                variance_pct=delta_pct,
            ))
        # Compound both bases once per month, after that month's rows exist.
        for stream in running:
            running[stream] *= (1 + growth[stream])
    write_csv("revenue_budget_vs_actuals.csv", rows)
    print(f" ✓ revenue_budget_vs_actuals.csv ({len(rows)} rows)")
# ── 2. Department Opex (Budget vs Actuals) ───────────────────────────────────
# Per-department opex assumptions, keyed by department name. The generators
# read element 0 as the starting monthly budget and element 1 as the rate the
# budget is compounded by after each month.
DEPT_BUDGETS = {
    # (base_monthly_opex, growth_rate)
    "Engineering": (95_000, 0.012),
    "Sales": (70_000, 0.018),
    "Marketing": (55_000, 0.015),
    "Operations": (40_000, 0.008),
}
# Spend categories that each department's monthly budget is split across.
OPEX_CATEGORIES = ["Salaries", "Software & Tools", "Travel", "Marketing Spend",
                   "Cloud Infrastructure", "Contractors", "Office & Facilities"]
@dataclass
class OpexRow:
    """One CSV record: budgeted vs actual opex for one department, category and month.

    Field order defines the CSV column order.
    """
    company: str
    department: str
    year: int
    month: int
    period: str # "YYYY-MM"
    category: str # one of OPEX_CATEGORIES
    budget_amount: float
    actual_amount: float
    variance: float # actual - budget
    variance_pct: float # variance / budget * 100
def generate_opex():
    """Build budget-vs-actual opex rows per department/category and write them to CSV.

    For every month and every department in ``DEPT_BUDGETS`` the running
    department budget is split across ``OPEX_CATEGORIES`` using freshly drawn
    random weights (normalised to sum to 1). Each category's actual is its
    budget perturbed by ``vary(..., pct=0.12)``. The department budget is then
    compounded by its growth rate. Rows go to ``write_csv`` and a summary line
    is printed.
    """
    rows = []
    running_budget = {name: cfg[0] for name, cfg in DEPT_BUDGETS.items()}
    for yr, mo in months_range(YEARS):
        label = f"{yr}-{mo:02d}"
        for dept_name, (_, monthly_growth) in DEPT_BUDGETS.items():
            dept_total = running_budget[dept_name]
            # Draw all raw weights up front, then normalise them lazily while
            # the per-category variance draws happen.
            raw = [random.uniform(0.05, 0.35) for _ in OPEX_CATEGORIES]
            raw_sum = sum(raw)
            shares = (r / raw_sum for r in raw)
            for category, share in zip(OPEX_CATEGORIES, shares):
                planned = round(dept_total * share, 2)
                spent = vary(planned, pct=0.12)
                delta = round(spent - planned, 2)
                if planned:
                    delta_pct = round((delta / planned) * 100, 2)
                else:
                    # Guard against division by zero for an empty budget.
                    delta_pct = 0
                rows.append(OpexRow(
                    company=COMPANY,
                    department=dept_name,
                    year=yr,
                    month=mo,
                    period=label,
                    category=category,
                    budget_amount=planned,
                    actual_amount=spent,
                    variance=delta,
                    variance_pct=delta_pct,
                ))
            # Grow this department's budget for the following month.
            running_budget[dept_name] *= (1 + monthly_growth)
    write_csv("opex_budget_vs_actuals.csv", rows)
    print(f" ✓ opex_budget_vs_actuals.csv ({len(rows)} rows)")
# ── 3. P&L / Income Statement ─────────────────────────────────────────────────
@dataclass
class PLRow:
    """One CSV record: a monthly income-statement summary.

    Field order defines the CSV column order.
    """
    company: str
    year: int
    month: int
    period: str # "YYYY-MM"
    product_revenue: float
    service_revenue: float
    total_revenue: float
    cogs_product: float # ~25% of product rev
    cogs_service: float # ~45% of service rev (labor-heavy)
    total_cogs: float
    gross_profit: float # total_revenue - total_cogs
    gross_margin_pct: float # gross_profit / total_revenue * 100
    total_opex: float
    ebitda: float # gross_profit - total_opex
    ebitda_margin_pct: float # ebitda / total_revenue * 100
    net_income: float # after ~25% tax estimate
def generate_pl():
    """Build one income-statement row per month and write them to CSV.

    Revenue, COGS and opex are simulated from the running product/service
    bases and per-department budgets, each of which is compounded by its
    growth rate after every month. Rows are passed to ``write_csv`` and a
    one-line summary is printed.

    The COGS ratios are drawn unrounded. Passing a ratio such as 0.25
    through ``vary`` would round it to two decimals and collapse it to a
    few discrete values (0.24 / 0.25 / 0.26), so the noise is applied with
    ``random.uniform`` directly. The bounds match what ``vary`` would have
    used, so the random stream consumed by this function is unchanged.
    """
    rows = []
    prod_base = BASE_PRODUCT_REVENUE
    svc_base = BASE_SERVICE_REVENUE
    dept_bases = {d: v[0] for d, v in DEPT_BUDGETS.items()}
    for year, month in months_range(YEARS):
        period = f"{year}-{month:02d}"
        # vary() already rounds to cents, so revenues need no further rounding.
        prod_rev = vary(prod_base, 0.08)
        svc_rev = vary(svc_base, 0.10)
        total_rev = round(prod_rev + svc_rev, 2)

        # Cost ratios: ~25% of product revenue, ~45% of service revenue,
        # each with a small relative wobble kept at full precision.
        prod_cogs_ratio = 0.25 * (1 + random.uniform(-0.05, 0.05))
        svc_cogs_ratio = 0.45 * (1 + random.uniform(-0.06, 0.06))
        cogs_prod = round(prod_rev * prod_cogs_ratio, 2)
        cogs_svc = round(svc_rev * svc_cogs_ratio, 2)
        total_cogs = round(cogs_prod + cogs_svc, 2)

        gross_profit = round(total_rev - total_cogs, 2)
        # Margins fall back to 0.0 when there is no revenue to divide by.
        gm_pct = round((gross_profit / total_rev) * 100, 2) if total_rev else 0.0
        total_opex = round(
            sum(vary(dept_bases[d], 0.10) for d in DEPARTMENTS), 2
        )
        ebitda = round(gross_profit - total_opex, 2)
        ebitda_pct = round((ebitda / total_rev) * 100, 2) if total_rev else 0.0
        # NOTE(review): the 0.75 factor is applied to negative EBITDA too,
        # which shrinks losses by 25% — confirm that a tax benefit on losses
        # is intended.
        net_income = round(ebitda * 0.75, 2)  # rough 25% tax
        rows.append(PLRow(
            COMPANY, year, month, period,
            prod_rev, svc_rev, total_rev,
            cogs_prod, cogs_svc, total_cogs,
            gross_profit, gm_pct,
            total_opex, ebitda, ebitda_pct, net_income,
        ))
        # Compound every base for the following month.
        prod_base *= (1 + PRODUCT_MONTHLY_GROWTH)
        svc_base *= (1 + SERVICE_MONTHLY_GROWTH)
        for d, (_, g) in DEPT_BUDGETS.items():
            dept_bases[d] *= (1 + g)
    write_csv("pl_income_statement.csv", rows)
    print(f" ✓ pl_income_statement.csv ({len(rows)} rows)")
# ── 4. Cash Flow ──────────────────────────────────────────────────────────────
@dataclass
class CashFlowRow:
    """One CSV record: a monthly cash-flow statement summary.

    Field order defines the CSV column order.
    """
    company: str
    year: int
    month: int
    period: str # "YYYY-MM"
    # Operating
    cash_collected_product: float # product ARR collections (may lag revenue)
    cash_collected_service: float
    cash_paid_opex: float
    cash_paid_cogs: float
    net_operating_cash_flow: float # collections minus opex and COGS paid
    # Investing
    capex: float # infra / hardware
    net_investing_cash_flow: float # negative of capex
    # Financing
    loan_repayment: float
    equity_raised: float
    net_financing_cash_flow: float # equity raised minus loan repayment
    # Summary
    net_change_in_cash: float # operating + investing + financing
    closing_cash_balance: float # running balance after this month
def generate_cashflow():
    """Build one cash-flow row per month and write them to CSV.

    Starting from a 1,200,000 opening balance, each month simulates
    operating collections and payments, capex (drawn when
    ``random.random() > 0.4``), a loan repayment every third month, and a
    single 500,000 equity raise in 2023-06. The closing balance is carried
    forward month to month. Rows are passed to ``write_csv`` and a one-line
    summary is printed.

    Months without capex report a net investing cash flow of ``0.0``.
    Negating a zero capex would give ``-0.0``, which is written to the CSV
    as "-0.0".
    """
    rows = []
    cash_balance = 1_200_000.0  # starting cash (seed round runway)
    prod_base = BASE_PRODUCT_REVENUE
    svc_base = BASE_SERVICE_REVENUE
    dept_bases = {d: v[0] for d, v in DEPT_BUDGETS.items()}
    for year, month in months_range(YEARS):
        period = f"{year}-{month:02d}"

        # Operating — collections slightly lag invoicing (DSO ~30 days effect)
        cash_prod = vary(prod_base * 0.95, 0.06)
        cash_svc = vary(svc_base * 0.90, 0.08)  # services collect slower
        opex_paid = sum(vary(dept_bases[d], 0.08) for d in DEPARTMENTS)
        cogs_paid = vary((prod_base * 0.25) + (svc_base * 0.45), 0.07)
        net_op = round(cash_prod + cash_svc - opex_paid - cogs_paid, 2)

        # Investing — capex is drawn only when the coin flip succeeds; the
        # flip itself always consumes one random number.
        capex = vary(8_000, 0.40) if random.random() > 0.4 else 0.0
        # Avoid negative zero when there was no capex this month.
        net_inv = round(-capex, 2) if capex else 0.0

        # Financing — loan repayment in months 3, 6, 9 and 12
        loan = vary(5_000, 0.20) if month % 3 == 0 else 0.0
        equity = 0.0
        if year == 2023 and month == 6:
            equity = 500_000.0  # Series A mid-2023
        net_fin = round(equity - loan, 2)

        net_change = round(net_op + net_inv + net_fin, 2)
        cash_balance = round(cash_balance + net_change, 2)
        rows.append(CashFlowRow(
            COMPANY, year, month, period,
            round(cash_prod, 2), round(cash_svc, 2),
            round(opex_paid, 2), round(cogs_paid, 2), net_op,
            round(capex, 2), net_inv,
            round(loan, 2), round(equity, 2), net_fin,
            net_change, cash_balance,
        ))

        # Compound every base for the following month.
        prod_base *= (1 + PRODUCT_MONTHLY_GROWTH)
        svc_base *= (1 + SERVICE_MONTHLY_GROWTH)
        for d, (_, g) in DEPT_BUDGETS.items():
            dept_bases[d] *= (1 + g)
    write_csv("cash_flow.csv", rows)
    print(f" ✓ cash_flow.csv ({len(rows)} rows)")
# ── 5. Headcount & Workforce ──────────────────────────────────────────────────
# Role catalogue per department: each entry is a (role title, annual salary)
# pair. The headcount generator picks from these at random when creating
# employees.
ROLES = {
    "Engineering": [("Software Engineer", 120_000), ("Senior Engineer", 160_000),
                    ("Engineering Manager", 180_000), ("DevOps Engineer", 130_000)],
    "Sales": [("Account Executive", 90_000), ("Sales Manager", 140_000),
              ("SDR", 65_000)],
    "Marketing": [("Marketing Manager", 110_000), ("Content Strategist", 80_000),
                  ("Growth Analyst", 95_000)],
    "Operations": [("Operations Manager", 115_000), ("Customer Success", 75_000),
                   ("Finance Analyst", 95_000)],
}
@dataclass
class HeadcountRow:
    """One CSV record: a monthly snapshot of a single employee.

    Field order defines the CSV column order.
    """
    company: str
    employee_id: str
    department: str
    role: str
    hire_date: str # date rendered with str()
    termination_date: str # empty if active
    status: str # Active | Terminated
    annual_salary_budget: float
    actual_salary_paid_ytd: float # YTD for the given year
    year: int
    month: int
    period: str # "YYYY-MM"
    headcount_fte: float # 1.0 full time, 0.5 contractor etc.
def _months_paid_ytd(hire_date, term_date, year, month):
    """Return the number of payroll months in ``year`` up to and including ``month``.

    Pay starts in the hire month (or January when hired in an earlier year).
    ``term_date`` is stamped as the first day of the leaving month, so the
    last paid month is the one before it; an employee terminated in an
    earlier year has no paid months in ``year``.
    """
    if hire_date.year > year:
        return 0
    first_paid = hire_date.month if hire_date.year == year else 1
    last_paid = month
    if term_date is not None:
        if term_date.year < year:
            return 0
        if term_date.year == year:
            last_paid = min(last_paid, term_date.month - 1)
    return max(0, last_paid - first_paid + 1)


def generate_headcount():
    """Simulate hiring/attrition and write one row per employee per month to CSV.

    Two to four employees per department are seeded with 2022 hire dates.
    Then, for each month in ``YEARS``, a new hire may be added and (once
    more than six people are active) one active employee may leave. Every
    employee hired on or before the first of the month gets a snapshot row,
    including terminated employees, who are reported with status
    "Terminated".

    Year-to-date salary stops accruing at termination: the leaving month and
    all later months are unpaid, and the YTD figure is 0 in years after the
    termination year. The payroll noise factor is drawn unrounded with
    ``random.uniform``, because passing 1.0 through ``vary`` would quantize
    the factor to 1% steps. Each row still consumes one random draw, so the
    random stream is unchanged.
    """
    rows = []
    emp_id = 1000
    employees = []

    # Seed initial employees at start of 2023
    for dept, role_list in ROLES.items():
        # Start with 2-4 per department
        count = random.randint(2, 4)
        for _ in range(count):
            role, salary = random.choice(role_list)
            # Day is capped at 28 so the date is valid in every month.
            hire_date = date(2022, random.randint(1, 12), random.randint(1, 28))
            employees.append({
                "id": f"EMP{emp_id}",
                "dept": dept, "role": role, "salary": salary,
                "hire_date": hire_date, "term_date": None,
                "fte": 1.0,
            })
            emp_id += 1

    for year, month in months_range(YEARS):
        period = f"{year}-{month:02d}"
        current = date(year, month, 1)

        # Random hiring each month
        if random.random() > 0.55:
            dept = random.choice(DEPARTMENTS)
            role, salary = random.choice(ROLES[dept])
            employees.append({
                "id": f"EMP{emp_id}",
                "dept": dept, "role": role, "salary": salary,
                "hire_date": current, "term_date": None,
                "fte": random.choice([1.0, 1.0, 1.0, 0.5]),  # mostly FT
            })
            emp_id += 1

        # Occasional attrition; the random draw only happens when more than
        # six employees are active (short-circuit `and`).
        active = [e for e in employees if e["term_date"] is None]
        if len(active) > 6 and random.random() > 0.85:
            leaver = random.choice(active)
            leaver["term_date"] = current

        # Snapshot each employee for this month
        for emp in employees:
            if emp["hire_date"] > current:
                continue  # not hired yet
            term_date = emp["term_date"]
            is_active = term_date is None or term_date > current
            status = "Active" if is_active else "Terminated"
            months_paid = _months_paid_ytd(emp["hire_date"], term_date, year, month)
            # ±2% payroll noise, kept at full precision until the final round.
            noise = 1 + random.uniform(-0.02, 0.02)
            ytd_paid = round((emp["salary"] / 12) * months_paid * noise, 2)
            rows.append(HeadcountRow(
                COMPANY, emp["id"], emp["dept"], emp["role"],
                str(emp["hire_date"]),
                str(term_date) if term_date else "",
                status, emp["salary"], ytd_paid,
                year, month, period, emp["fte"],
            ))
    write_csv("headcount_workforce.csv", rows)
    print(f" ✓ headcount_workforce.csv ({len(rows)} rows)")
# ── CSV writer ────────────────────────────────────────────────────────────────
def write_csv(filename: str, rows: list):
    """Write dataclass instances to ``OUTPUT_DIR/filename`` as a CSV with a header.

    Args:
        filename: Name of the CSV file to create inside ``OUTPUT_DIR``.
        rows: Dataclass instances of a single type. The header is taken from
            the fields of the first row.

    Nothing is written when ``rows`` is empty, because the column names
    cannot be derived; an existing file is left untouched in that case.
    """
    if not rows:
        return
    path = os.path.join(OUTPUT_DIR, filename)
    column_names = [field.name for field in fields(rows[0])]
    # Explicit UTF-8 keeps the output independent of the platform's default
    # locale encoding; newline="" is what the csv module requires so that it
    # controls line endings itself.
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=column_names)
        writer.writeheader()
        # Stream the rows instead of materialising a second full list.
        writer.writerows(asdict(r) for r in rows)
# ── Entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Script entry point: print a banner, run every generator in a fixed
    # order (they share one seeded RNG, so the order affects the data), then
    # report where the files were written.
    # The month count is derived from YEARS (12 months per listed year) so
    # the banner stays correct if the year list changes.
    month_count = len(YEARS) * 12
    print(f"\n🏗 Generating FP&A test data for {COMPANY}")
    print(f" Periods : {YEARS[0]}-01 → {YEARS[-1]}-12 ({month_count} months)")
    print(f" Depts : {', '.join(DEPARTMENTS)}\n")
    generate_revenue()
    generate_opex()
    generate_pl()
    generate_cashflow()
    generate_headcount()
    print(f"\n✅ All CSV files written to: {os.path.abspath(OUTPUT_DIR)}\n")