Star Schema DDL
-- =====================================================
-- FINVU DATA WAREHOUSE - STAR SCHEMA DDL
-- =====================================================
-- Business Process: Consent Journey Funnel Analysis
-- Grain: One row per Consent Handle per Day
-- Performance Target: <3 second drill-down capability
-- Data Retention: 7+ years for regulatory compliance
-- =====================================================
-- =====================================================
-- TIER 1 DIMENSIONS - IMMEDIATE IMPLEMENTATION
-- =====================================================
-- -----------------------------------------------------
-- DIM_FIU: Client Performance Analysis
-- Business Context: Revenue optimization per FIU client
-- MD-Space: "FIU_WISE_BREAKDOWN = [Navi, Cred, Axis, ICICI, Bajaj Finance, RKB Finance]"
-- -----------------------------------------------------
CREATE TABLE gold.dim_fiu (
fiu_key Int32,
fiu_name LowCardinality(String) COMMENT 'Client name: navi, cred, axis, icici, bajaj_finance, rkb_finance',
fiu_id String COMMENT 'Original FIU identifier from events',
client_type LowCardinality(String) COMMENT 'LENDING, PFM, MIXED based on purpose code patterns',
revenue_tier LowCardinality(String) COMMENT 'HIGH, MEDIUM, LOW based on monthly volume',
revenue_model LowCardinality(String) COMMENT 'PER_FETCH, PER_USER_MONTHLY, HYBRID',
onboarding_date Date COMMENT 'Client onboarding date for tenure analysis',
monthly_target_volume UInt32 COMMENT 'Target consents per month for deficit tracking',
is_active Bool COMMENT 'Current active status',
created_at DateTime64(3) DEFAULT now64(3),
updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (fiu_key)
SETTINGS index_granularity = 8192
COMMENT 'FIU dimension for client performance analysis and revenue attribution';
-- -----------------------------------------------------
-- DIM_PURPOSE_CODE: Revenue Attribution
-- Business Context: Billing model determination (₹3 vs ₹0.10)
-- MD-Space: "PURPOSE_CODE_CORRELATION = 103 (lending), 101/102 (PFM)"
-- -----------------------------------------------------
CREATE TABLE gold.dim_purpose_code (
purpose_code_key Int32,
purpose_code LowCardinality(String) COMMENT 'RBI purpose codes: 101, 102, 103, 104, 105',
description String COMMENT 'Human readable purpose description',
billing_model LowCardinality(String) COMMENT 'PER_FETCH, PER_USER_MONTHLY',
billing_rate Decimal(10,2) COMMENT 'Rate in INR: 3.00 for lending, 0.10 for PFM',
revenue_type LowCardinality(String) COMMENT 'LENDING, PFM, ACCOUNT_QUERY',
frequency_type LowCardinality(String) COMMENT 'ONE_TIME, RECURRING, REGULAR',
is_regulated Bool COMMENT 'Whether purpose code requires special compliance',
sahamati_category LowCardinality(String) COMMENT 'Category for Sahamati reporting',
created_at DateTime64(3) DEFAULT now64(3),
updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (purpose_code_key)
SETTINGS index_granularity = 8192
COMMENT 'Purpose code dimension for revenue attribution and billing model determination';
-- -----------------------------------------------------
-- DIM_FIP: FIP Health Attribution
-- Business Context: Separate FIP issues from AA performance
-- MD-Space: "FIP_HEALTH_ATTRIBUTION = separate FIP vs AA performance issues"
-- -----------------------------------------------------
CREATE TABLE gold.dim_fip (
fip_key Int32,
fip_name LowCardinality(String) COMMENT 'Bank/FIP name: axis_bank, hdfc_bank, icici_bank',
fip_id String COMMENT 'Original FIP identifier from events',
fip_type LowCardinality(String) COMMENT 'BANK, NBFC, MUTUAL_FUND, INSURANCE',
bank_category LowCardinality(String) COMMENT 'PUBLIC, PRIVATE, COOPERATIVE, FOREIGN',
uptime_score Decimal(5,2) COMMENT 'Historical uptime percentage (0-100)',
success_rate Decimal(5,2) COMMENT 'Historical success rate percentage (0-100)',
avg_response_time_ms UInt32 COMMENT 'Average API response time in milliseconds',
performance_tier LowCardinality(String) COMMENT 'HIGH, MEDIUM, LOW based on reliability metrics',
is_active Bool COMMENT 'Current active status',
onboarding_date Date COMMENT 'FIP onboarding date',
created_at DateTime64(3) DEFAULT now64(3),
updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (fip_key)
SETTINGS index_granularity = 8192
COMMENT 'FIP dimension for health attribution and performance analysis';
-- -----------------------------------------------------
-- DIM_DATE: Time Intelligence
-- Business Context: Daily/MTD tracking and growth analysis
-- MD-Space: "DAY_ON_DAY_TRACKING = short-term trend identification"
-- -----------------------------------------------------
CREATE TABLE gold.dim_date (
date_key Date,
year UInt16,
month UInt8,
day UInt8,
quarter UInt8,
month_name LowCardinality(String),
day_of_week LowCardinality(String),
day_of_year UInt16,
week_of_year UInt8,
is_business_day Bool COMMENT 'Indian business day (Mon-Fri, excluding holidays)',
is_month_end Bool,
is_quarter_end Bool,
is_year_end Bool,
fiscal_year UInt16 COMMENT 'Indian fiscal year (Apr-Mar)',
fiscal_quarter UInt8,
fiscal_month UInt8,
days_in_month UInt8,
created_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (date_key)
SETTINGS index_granularity = 8192
COMMENT 'Date dimension for time intelligence and trend analysis';
-- -----------------------------------------------------
-- DIM_LICENSE_TYPE: Regulatory Categorization
-- Business Context: Automated Sahamati/RBI reporting
-- MD-Space: "LICENSE_TYPE_DIMENSION = SEBI, RBI, other regulatory categorization"
-- -----------------------------------------------------
CREATE TABLE gold.dim_license_type (
license_type_key Int32,
license_category LowCardinality(String) COMMENT 'SEBI, RBI, INSURANCE, UNREGULATED',
regulatory_body LowCardinality(String) COMMENT 'Governing regulatory authority',
entity_type LowCardinality(String) COMMENT 'Regulated_FIU, LSP, TSP',
compliance_level LowCardinality(String) COMMENT 'FULL, PARTIAL, NONE',
is_regulated Bool COMMENT 'Whether entity requires regulatory reporting',
reporting_frequency LowCardinality(String) COMMENT 'DAILY, MONTHLY, QUARTERLY',
sahamati_category LowCardinality(String) COMMENT 'Category for Sahamati reporting',
rbi_category LowCardinality(String) COMMENT 'Category for RBI reporting',
created_at DateTime64(3) DEFAULT now64(3),
updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (license_type_key)
SETTINGS index_granularity = 8192
COMMENT 'License type dimension for regulatory categorization and compliance reporting';
-- -----------------------------------------------------
-- DIM_USER_TYPE: User Behavior Analysis
-- Business Context: New vs existing user performance comparison
-- MD-Space: "USER_CLASSIFICATION_LOGIC: NEW_USERS vs EXISTING_USERS"
-- -----------------------------------------------------
CREATE TABLE gold.dim_user_type (
user_type_key Int32,
user_category LowCardinality(String) COMMENT 'NEW, EXISTING, RETURNING',
experience_level LowCardinality(String) COMMENT 'FIRST_TIME, EXPERIENCED, EXPERT',
has_linked_accounts Bool COMMENT 'Whether user has previously linked bank accounts',
journey_complexity LowCardinality(String) COMMENT 'FULL_FUNNEL, ABBREVIATED_FUNNEL',
expected_success_rate Decimal(5,2) COMMENT 'Expected success rate based on user type',
typical_drop_off_stage LowCardinality(String) COMMENT 'Most common drop-off point',
support_required Bool COMMENT 'Whether user type typically requires assistance',
created_at DateTime64(3) DEFAULT now64(3),
updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (user_type_key)
SETTINGS index_granularity = 8192
COMMENT 'User type dimension for behavior analysis and funnel optimization';
-- =====================================================
-- TIER 2 DIMENSIONS - PHASE 2 IMPLEMENTATION
-- =====================================================
-- -----------------------------------------------------
-- DIM_JOURNEY_TYPE: Performance Comparison
-- Business Context: Assisted vs DIY journey analysis
-- MD-Space: "ASSISTED_JOURNEYS = bank branch RMs, higher success rates"
-- -----------------------------------------------------
CREATE TABLE gold.dim_journey_type (
journey_type_key Int32,
journey_category LowCardinality(String) COMMENT 'ASSISTED, DIY, HYBRID',
assistance_level LowCardinality(String) COMMENT 'FULL_SUPPORT, PARTIAL_SUPPORT, SELF_SERVICE',
channel_type LowCardinality(String) COMMENT 'BRANCH, MOBILE_APP, WEB, API',
target_audience LowCardinality(String) COMMENT 'BRANCH_CUSTOMERS, TECH_SAVVY, GENERAL',
typical_success_rate Decimal(5,2) COMMENT 'Historical success rate for this journey type',
avg_completion_time_minutes UInt16 COMMENT 'Average time to complete journey',
created_at DateTime64(3) DEFAULT now64(3),
updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (journey_type_key)
SETTINGS index_granularity = 8192
COMMENT 'Journey type dimension for assisted vs DIY performance comparison';
-- -----------------------------------------------------
-- DIM_FI_TYPE: Regulatory Reporting
-- Business Context: Sahamati FI type breakdown requirements
-- MD-Space: "FI_TYPE_BREAKDOWN = deposit, mutual fund, insurance, etc."
-- -----------------------------------------------------
CREATE TABLE gold.dim_fi_type (
fi_type_key Int32,
fi_category LowCardinality(String) COMMENT 'DEPOSIT, INVESTMENT, INSURANCE, CREDIT',
data_type LowCardinality(String) COMMENT 'Specific FI data type from Sahamati spec',
regulatory_classification LowCardinality(String) COMMENT 'RBI, SEBI, IRDAI classification',
requires_special_handling Bool COMMENT 'Whether FI type needs special processing',
sahamati_category LowCardinality(String) COMMENT 'Official Sahamati category for reporting',
data_sensitivity LowCardinality(String) COMMENT 'HIGH, MEDIUM, LOW',
encryption_required Bool COMMENT 'Whether additional encryption is needed',
created_at DateTime64(3) DEFAULT now64(3),
updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (fi_type_key)
SETTINGS index_granularity = 8192
COMMENT 'FI type dimension for regulatory reporting and Sahamati compliance';
-- -----------------------------------------------------
-- DIM_TSP: Technical Service Provider Segmentation
-- Business Context: Integration performance analysis
-- MD-Space: "TSP_DIMENSION = technical service provider segmentation"
-- -----------------------------------------------------
CREATE TABLE gold.dim_tsp (
tsp_key Int32,
tsp_name LowCardinality(String) COMMENT 'Technical service provider name',
tsp_id String COMMENT 'Original TSP identifier',
integration_type LowCardinality(String) COMMENT 'DIRECT, PARTNER, AGGREGATOR',
service_tier LowCardinality(String) COMMENT 'PREMIUM, STANDARD, BASIC',
performance_score Decimal(5,2) COMMENT 'Overall performance score (0-100)',
partnership_start Date COMMENT 'Start date of partnership',
is_active Bool COMMENT 'Current active status',
created_at DateTime64(3) DEFAULT now64(3),
updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (tsp_key)
SETTINGS index_granularity = 8192
COMMENT 'TSP dimension for technical service provider performance analysis';
-- =====================================================
-- CENTRAL FACT TABLE
-- =====================================================
-- -----------------------------------------------------
-- FACT_CONSENT_JOURNEY: Core Business Metrics
-- Business Context: Consent funnel analysis with <3 second drill-down
-- Grain: One row per Consent Handle per Day
-- MD-Space: "DRILL_DOWN_CAPABILITY = 5-stage → 8-stage → error analysis"
-- -----------------------------------------------------
CREATE TABLE gold.fact_consent_journey (
-- Grain Definition
consent_handle String COMMENT 'Primary business identifier for consent journey',
event_date Date COMMENT 'Date of consent journey activity',
-- Foreign Keys to Dimensions
fiu_key Int32 COMMENT 'FK to dim_fiu',
purpose_code_key Int32 COMMENT 'FK to dim_purpose_code',
fip_key Int32 COMMENT 'FK to dim_fip',
date_key Date COMMENT 'FK to dim_date',
license_type_key Int32 COMMENT 'FK to dim_license_type',
user_type_key Int32 COMMENT 'FK to dim_user_type',
-- Phase 2 Foreign Keys
journey_type_key Nullable(Int32) COMMENT 'FK to dim_journey_type (Phase 2)',
fi_type_key Nullable(Int32) COMMENT 'FK to dim_fi_type (Phase 2)',
tsp_key Nullable(Int32) COMMENT 'FK to dim_tsp (Phase 2)',
-- ADDITIVE FACTS (Can be summed across all dimensions)
-- Funnel Stage Counts - MD-Space: "CI→CA→CF→DRI→DRF"
consent_initiated_count UInt32 DEFAULT 0 COMMENT 'CI: Consents initiated',
consent_approved_count UInt32 DEFAULT 0 COMMENT 'CA: Consents approved by user',
consent_fulfilled_count UInt32 DEFAULT 0 COMMENT 'CF: Consents fulfilled (first data fetch)',
data_fetch_initiated_count UInt32 DEFAULT 0 COMMENT 'DRI: Data fetch requests initiated',
data_fetch_fulfilled_count UInt32 DEFAULT 0 COMMENT 'DRF: Data fetch requests fulfilled',
-- Revenue Facts - MD-Space: "REVENUE_ATTRIBUTION = ₹3/fetch vs ₹0.10/month"
revenue_amount Decimal(10,2) DEFAULT 0 COMMENT 'Revenue in INR based on purpose code and billing model',
-- Target Tracking Facts - MD-Space: "DAILY_DEFICIT_TRACKING"
target_consents_initiated UInt32 DEFAULT 0 COMMENT 'Target CI for this day',
target_consents_fulfilled UInt32 DEFAULT 0 COMMENT 'Target CF for this day',
deficit_from_target Decimal(10,2) DEFAULT 0 COMMENT 'Shortfall from daily targets',
-- Error and Failure Tracking
consent_rejected_count UInt32 DEFAULT 0 COMMENT 'Consents rejected by user',
consent_failed_count UInt32 DEFAULT 0 COMMENT 'Consents failed due to technical issues',
data_fetch_failed_count UInt32 DEFAULT 0 COMMENT 'Data fetch requests that failed',
fip_timeout_count UInt32 DEFAULT 0 COMMENT 'Requests that timed out at FIP level',
-- SEMI-ADDITIVE FACTS (Can be averaged, not summed across time)
journey_duration_minutes UInt32 DEFAULT 0 COMMENT 'Total journey time from CI to CF',
fip_response_time_ms UInt32 DEFAULT 0 COMMENT 'Average FIP API response time',
-- Regulatory Compliance Facts - MD-Space: "SAHAMATI_REPORTING"
avg_timelag_fi_notification_ms UInt32 DEFAULT 0 COMMENT 'Average notification timelag for Sahamati',
-- Customer Value Facts - MD-Space: "CUSTOMER_IDENTITY_RESOLUTION_ENHANCED"
customer_lifetime_value_estimate Decimal(10,2) DEFAULT 0 COMMENT 'Estimated CLV based on user behavior',
-- Market Analysis Facts - MD-Space: "GROWTH_RATE_TRACKING = 2x market rate"
market_growth_rate Decimal(8,4) DEFAULT 0 COMMENT 'Market growth rate for comparison',
-- NON-ADDITIVE FACTS (Calculated at query time using other facts)
-- These are stored for performance but should be calculated
conversion_rate_ci_to_cf Decimal(8,6) DEFAULT 0 COMMENT 'CI to CF conversion rate',
overall_success_rate Decimal(8,6) DEFAULT 0 COMMENT 'End-to-end success rate',
fi_ready_to_request_ratio Decimal(8,6) DEFAULT 0 COMMENT 'FI ready to FI request ratio',
fip_attribution_ratio Decimal(8,6) DEFAULT 0 COMMENT 'Ratio of failures attributed to FIP',
aa_reliability_score Decimal(8,6) DEFAULT 0 COMMENT 'AA reliability excluding FIP issues',
competitive_position_score Decimal(8,4) DEFAULT 0 COMMENT 'Position vs OnemoneyAA',
-- Metadata
created_at DateTime64(3) DEFAULT now64(3),
updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(event_date)
ORDER BY (event_date, fiu_key, purpose_code_key, consent_handle)
SETTINGS index_granularity = 8192
COMMENT 'Central fact table for consent journey funnel analysis - Grain: One row per consent handle per day';
-- =====================================================
-- PERFORMANCE OPTIMIZATION
-- =====================================================
-- Data Skipping Index for faster FIP health queries
ALTER TABLE gold.fact_consent_journey
ADD INDEX idx_fip_performance (fip_key, fip_response_time_ms) TYPE minmax GRANULARITY 4;
-- Data Skipping Index for revenue analysis
ALTER TABLE gold.fact_consent_journey
ADD INDEX idx_revenue (purpose_code_key, revenue_amount) TYPE minmax GRANULARITY 4;
-- Data Skipping Index for success rate analysis
ALTER TABLE gold.fact_consent_journey
ADD INDEX idx_success_rates (overall_success_rate, aa_reliability_score) TYPE minmax GRANULARITY 4;
-- =====================================================
-- MATERIALIZED VIEWS FOR COMMON AGGREGATIONS
-- =====================================================
-- Daily Summary for Executive Dashboard (5-stage funnel)
CREATE MATERIALIZED VIEW gold.mv_daily_funnel_summary
ENGINE = SummingMergeTree()
ORDER BY (event_date, fiu_key)
POPULATE
AS SELECT
event_date,
fiu_key,
sum(consent_initiated_count) as total_ci,
sum(consent_approved_count) as total_ca,
sum(consent_fulfilled_count) as total_cf,
sum(data_fetch_initiated_count) as total_dri,
sum(data_fetch_fulfilled_count) as total_drf,
sum(revenue_amount) as total_revenue,
avg(overall_success_rate) as avg_success_rate
FROM gold.fact_consent_journey
GROUP BY event_date, fiu_key;
-- FIP Health Summary for Operations Team
CREATE MATERIALIZED VIEW gold.mv_fip_health_summary
ENGINE = ReplacingMergeTree()
ORDER BY (event_date, fip_key)
POPULATE
AS SELECT
event_date,
fip_key,
sum(fip_timeout_count) as total_timeouts,
avg(fip_response_time_ms) as avg_response_time,
avg(fip_attribution_ratio) as avg_fip_attribution,
sum(consent_initiated_count) as total_attempts,
sum(consent_failed_count) as total_failures
FROM gold.fact_consent_journey
GROUP BY event_date, fip_key;
-- =====================================================
-- DATA QUALITY CONSTRAINTS
-- =====================================================
-- Ensure referential integrity through CHECK constraints where possible
-- Note: ClickHouse doesn't enforce FK constraints, so these are documentation
-- Example data quality checks (can be run as monitoring queries)
/*
-- Daily data quality check for fact table completeness
SELECT
event_date,
count(*) as total_records,
sum(consent_initiated_count) as total_ci,
countIf(consent_initiated_count = 0) as zero_ci_records,
countIf(fiu_key = 0) as missing_fiu_key
FROM gold.fact_consent_journey
WHERE event_date = today() - 1
GROUP BY event_date;
*/
-- =====================================================
-- COMMENTS AND DOCUMENTATION
-- =====================================================
/*
BUSINESS CONTEXT SUMMARY:
- Supports <3 second drill-down from executive funnel to individual logs
- Enables revenue attribution based on purpose codes (₹3 vs ₹0.10)
- Separates FIP health issues from AA performance problems
- Supports automated Sahamati and RBI regulatory reporting
- Tracks progress toward 2x market growth rate target
- Grain allows aggregation up to monthly and drill-down to events
PERFORMANCE CONSIDERATIONS:
- Monthly partitioning for efficient query pruning
- Ordered by date, fiu, purpose_code for optimal compression
- Data skipping indices for common query patterns
- Materialized views for frequently accessed aggregations
- Semi-additive facts stored for performance but calculable from atomic facts
MAINTENANCE:
- Partition drops for old data retention (7+ years for compliance)
- Regular OPTIMIZE TABLE operations for better compression
- Monitor materialized view refresh performance
- Update dimension tables via SCD Type 1 (overwrite) or Type 2 (versioning)
*/ ```