Finvu Data Warehouse Documentation

Finvu Home

Star Schema DDL

-- =====================================================
-- FINVU DATA WAREHOUSE - STAR SCHEMA DDL
-- =====================================================
-- Business Process: Consent Journey Funnel Analysis
-- Grain: One row per Consent Handle per Day
-- Performance Target: <3 second drill-down capability
-- Data Retention: 7+ years for regulatory compliance
-- =====================================================

-- =====================================================
-- TIER 1 DIMENSIONS - IMMEDIATE IMPLEMENTATION
-- =====================================================

-- -----------------------------------------------------
-- DIM_FIU: Client Performance Analysis
-- Business Context: Revenue optimization per FIU client
-- MD-Space: "FIU_WISE_BREAKDOWN = [Navi, Cred, Axis, ICICI, Bajaj Finance, RKB Finance]"
-- -----------------------------------------------------
CREATE TABLE gold.dim_fiu (
    fiu_key Int32,
    fiu_name LowCardinality(String) COMMENT 'Client name: navi, cred, axis, icici, bajaj_finance, rkb_finance',
    fiu_id String COMMENT 'Original FIU identifier from events',
    client_type LowCardinality(String) COMMENT 'LENDING, PFM, MIXED based on purpose code patterns',
    revenue_tier LowCardinality(String) COMMENT 'HIGH, MEDIUM, LOW based on monthly volume',
    revenue_model LowCardinality(String) COMMENT 'PER_FETCH, PER_USER_MONTHLY, HYBRID',
    onboarding_date Date COMMENT 'Client onboarding date for tenure analysis',
    monthly_target_volume UInt32 COMMENT 'Target consents per month for deficit tracking',
    is_active Bool COMMENT 'Current active status',
    created_at DateTime64(3) DEFAULT now64(3),
    updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (fiu_key)
SETTINGS index_granularity = 8192
COMMENT 'FIU dimension for client performance analysis and revenue attribution';

-- -----------------------------------------------------
-- DIM_PURPOSE_CODE: Revenue Attribution
-- Business Context: Billing model determination (₹3 vs ₹0.10)
-- MD-Space: "PURPOSE_CODE_CORRELATION = 103 (lending), 101/102 (PFM)"
-- -----------------------------------------------------
CREATE TABLE gold.dim_purpose_code (
    purpose_code_key Int32,
    purpose_code LowCardinality(String) COMMENT 'RBI purpose codes: 101, 102, 103, 104, 105',
    description String COMMENT 'Human readable purpose description',
    billing_model LowCardinality(String) COMMENT 'PER_FETCH, PER_USER_MONTHLY',
    billing_rate Decimal(10,2) COMMENT 'Rate in INR: 3.00 for lending, 0.10 for PFM',
    revenue_type LowCardinality(String) COMMENT 'LENDING, PFM, ACCOUNT_QUERY',
    frequency_type LowCardinality(String) COMMENT 'ONE_TIME, RECURRING, REGULAR',
    is_regulated Bool COMMENT 'Whether purpose code requires special compliance',
    sahamati_category LowCardinality(String) COMMENT 'Category for Sahamati reporting',
    created_at DateTime64(3) DEFAULT now64(3),
    updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (purpose_code_key)
SETTINGS index_granularity = 8192
COMMENT 'Purpose code dimension for revenue attribution and billing model determination';

-- -----------------------------------------------------
-- DIM_FIP: FIP Health Attribution  
-- Business Context: Separate FIP issues from AA performance
-- MD-Space: "FIP_HEALTH_ATTRIBUTION = separate FIP vs AA performance issues"
-- -----------------------------------------------------
CREATE TABLE gold.dim_fip (
    fip_key Int32,
    fip_name LowCardinality(String) COMMENT 'Bank/FIP name: axis_bank, hdfc_bank, icici_bank',
    fip_id String COMMENT 'Original FIP identifier from events',
    fip_type LowCardinality(String) COMMENT 'BANK, NBFC, MUTUAL_FUND, INSURANCE',
    bank_category LowCardinality(String) COMMENT 'PUBLIC, PRIVATE, COOPERATIVE, FOREIGN',
    uptime_score Decimal(5,2) COMMENT 'Historical uptime percentage (0-100)',
    success_rate Decimal(5,2) COMMENT 'Historical success rate percentage (0-100)',
    avg_response_time_ms UInt32 COMMENT 'Average API response time in milliseconds',
    performance_tier LowCardinality(String) COMMENT 'HIGH, MEDIUM, LOW based on reliability metrics',
    is_active Bool COMMENT 'Current active status',
    onboarding_date Date COMMENT 'FIP onboarding date',
    created_at DateTime64(3) DEFAULT now64(3),
    updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (fip_key)
SETTINGS index_granularity = 8192
COMMENT 'FIP dimension for health attribution and performance analysis';

-- -----------------------------------------------------
-- DIM_DATE: Time Intelligence
-- Business Context: Daily/MTD tracking and growth analysis
-- MD-Space: "DAY_ON_DAY_TRACKING = short-term trend identification"
-- -----------------------------------------------------
CREATE TABLE gold.dim_date (
    date_key Date,
    year UInt16,
    month UInt8,
    day UInt8,
    quarter UInt8,
    month_name LowCardinality(String),
    day_of_week LowCardinality(String),
    day_of_year UInt16,
    week_of_year UInt8,
    is_business_day Bool COMMENT 'Indian business day (Mon-Fri, excluding holidays)',
    is_month_end Bool,
    is_quarter_end Bool,
    is_year_end Bool,
    fiscal_year UInt16 COMMENT 'Indian fiscal year (Apr-Mar)',
    fiscal_quarter UInt8,
    fiscal_month UInt8,
    days_in_month UInt8,
    created_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (date_key)
SETTINGS index_granularity = 8192
COMMENT 'Date dimension for time intelligence and trend analysis';

-- -----------------------------------------------------
-- DIM_LICENSE_TYPE: Regulatory Categorization
-- Business Context: Automated Sahamati/RBI reporting
-- MD-Space: "LICENSE_TYPE_DIMENSION = SEBI, RBI, other regulatory categorization"
-- -----------------------------------------------------
CREATE TABLE gold.dim_license_type (
    license_type_key Int32,
    license_category LowCardinality(String) COMMENT 'SEBI, RBI, INSURANCE, UNREGULATED',
    regulatory_body LowCardinality(String) COMMENT 'Governing regulatory authority',
    entity_type LowCardinality(String) COMMENT 'Regulated_FIU, LSP, TSP',
    compliance_level LowCardinality(String) COMMENT 'FULL, PARTIAL, NONE',
    is_regulated Bool COMMENT 'Whether entity requires regulatory reporting',
    reporting_frequency LowCardinality(String) COMMENT 'DAILY, MONTHLY, QUARTERLY',
    sahamati_category LowCardinality(String) COMMENT 'Category for Sahamati reporting',
    rbi_category LowCardinality(String) COMMENT 'Category for RBI reporting',
    created_at DateTime64(3) DEFAULT now64(3),
    updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (license_type_key)
SETTINGS index_granularity = 8192
COMMENT 'License type dimension for regulatory categorization and compliance reporting';

-- -----------------------------------------------------
-- DIM_USER_TYPE: User Behavior Analysis
-- Business Context: New vs existing user performance comparison
-- MD-Space: "USER_CLASSIFICATION_LOGIC: NEW_USERS vs EXISTING_USERS"
-- -----------------------------------------------------
CREATE TABLE gold.dim_user_type (
    user_type_key Int32,
    user_category LowCardinality(String) COMMENT 'NEW, EXISTING, RETURNING',
    experience_level LowCardinality(String) COMMENT 'FIRST_TIME, EXPERIENCED, EXPERT',
    has_linked_accounts Bool COMMENT 'Whether user has previously linked bank accounts',
    journey_complexity LowCardinality(String) COMMENT 'FULL_FUNNEL, ABBREVIATED_FUNNEL',
    expected_success_rate Decimal(5,2) COMMENT 'Expected success rate based on user type',
    typical_drop_off_stage LowCardinality(String) COMMENT 'Most common drop-off point',
    support_required Bool COMMENT 'Whether user type typically requires assistance',
    created_at DateTime64(3) DEFAULT now64(3),
    updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (user_type_key)
SETTINGS index_granularity = 8192
COMMENT 'User type dimension for behavior analysis and funnel optimization';

-- =====================================================
-- TIER 2 DIMENSIONS - PHASE 2 IMPLEMENTATION
-- =====================================================

-- -----------------------------------------------------
-- DIM_JOURNEY_TYPE: Performance Comparison
-- Business Context: Assisted vs DIY journey analysis
-- MD-Space: "ASSISTED_JOURNEYS = bank branch RMs, higher success rates"
-- -----------------------------------------------------
CREATE TABLE gold.dim_journey_type (
    journey_type_key Int32,
    journey_category LowCardinality(String) COMMENT 'ASSISTED, DIY, HYBRID',
    assistance_level LowCardinality(String) COMMENT 'FULL_SUPPORT, PARTIAL_SUPPORT, SELF_SERVICE',
    channel_type LowCardinality(String) COMMENT 'BRANCH, MOBILE_APP, WEB, API',
    target_audience LowCardinality(String) COMMENT 'BRANCH_CUSTOMERS, TECH_SAVVY, GENERAL',
    typical_success_rate Decimal(5,2) COMMENT 'Historical success rate for this journey type',
    avg_completion_time_minutes UInt16 COMMENT 'Average time to complete journey',
    created_at DateTime64(3) DEFAULT now64(3),
    updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (journey_type_key)
SETTINGS index_granularity = 8192
COMMENT 'Journey type dimension for assisted vs DIY performance comparison';

-- -----------------------------------------------------
-- DIM_FI_TYPE: Regulatory Reporting
-- Business Context: Sahamati FI type breakdown requirements
-- MD-Space: "FI_TYPE_BREAKDOWN = deposit, mutual fund, insurance, etc."
-- -----------------------------------------------------
CREATE TABLE gold.dim_fi_type (
    fi_type_key Int32,
    fi_category LowCardinality(String) COMMENT 'DEPOSIT, INVESTMENT, INSURANCE, CREDIT',
    data_type LowCardinality(String) COMMENT 'Specific FI data type from Sahamati spec',
    regulatory_classification LowCardinality(String) COMMENT 'RBI, SEBI, IRDAI classification',
    requires_special_handling Bool COMMENT 'Whether FI type needs special processing',
    sahamati_category LowCardinality(String) COMMENT 'Official Sahamati category for reporting',
    data_sensitivity LowCardinality(String) COMMENT 'HIGH, MEDIUM, LOW',
    encryption_required Bool COMMENT 'Whether additional encryption is needed',
    created_at DateTime64(3) DEFAULT now64(3),
    updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (fi_type_key)
SETTINGS index_granularity = 8192
COMMENT 'FI type dimension for regulatory reporting and Sahamati compliance';

-- -----------------------------------------------------
-- DIM_TSP: Technical Service Provider Segmentation
-- Business Context: Integration performance analysis
-- MD-Space: "TSP_DIMENSION = technical service provider segmentation"
-- -----------------------------------------------------
CREATE TABLE gold.dim_tsp (
    tsp_key Int32,
    tsp_name LowCardinality(String) COMMENT 'Technical service provider name',
    tsp_id String COMMENT 'Original TSP identifier',
    integration_type LowCardinality(String) COMMENT 'DIRECT, PARTNER, AGGREGATOR',
    service_tier LowCardinality(String) COMMENT 'PREMIUM, STANDARD, BASIC',
    performance_score Decimal(5,2) COMMENT 'Overall performance score (0-100)',
    partnership_start Date COMMENT 'Start date of partnership',
    is_active Bool COMMENT 'Current active status',
    created_at DateTime64(3) DEFAULT now64(3),
    updated_at DateTime64(3) DEFAULT now64(3)
) ENGINE = MergeTree()
ORDER BY (tsp_key)
SETTINGS index_granularity = 8192
COMMENT 'TSP dimension for technical service provider performance analysis';

-- =====================================================
-- CENTRAL FACT TABLE
-- =====================================================

-- -----------------------------------------------------
-- FACT_CONSENT_JOURNEY: Core Business Metrics
-- Business Context: Consent funnel analysis with <3 second drill-down
-- Grain: One row per Consent Handle per Day
-- MD-Space: "DRILL_DOWN_CAPABILITY = 5-stage → 8-stage → error analysis"
-- -----------------------------------------------------
CREATE TABLE gold.fact_consent_journey (
    -- Grain Definition
    consent_handle String COMMENT 'Primary business identifier for consent journey',
    event_date Date COMMENT 'Date of consent journey activity',
    
    -- Foreign Keys to Dimensions
    fiu_key Int32 COMMENT 'FK to dim_fiu',
    purpose_code_key Int32 COMMENT 'FK to dim_purpose_code', 
    fip_key Int32 COMMENT 'FK to dim_fip',
    date_key Date COMMENT 'FK to dim_date',
    license_type_key Int32 COMMENT 'FK to dim_license_type',
    user_type_key Int32 COMMENT 'FK to dim_user_type',
    
    -- Phase 2 Foreign Keys
    journey_type_key Nullable(Int32) COMMENT 'FK to dim_journey_type (Phase 2)',
    fi_type_key Nullable(Int32) COMMENT 'FK to dim_fi_type (Phase 2)',
    tsp_key Nullable(Int32) COMMENT 'FK to dim_tsp (Phase 2)',
    
    -- ADDITIVE FACTS (Can be summed across all dimensions)
    -- Funnel Stage Counts - MD-Space: "CI→CA→CF→DRI→DRF"
    consent_initiated_count UInt32 DEFAULT 0 COMMENT 'CI: Consents initiated',
    consent_approved_count UInt32 DEFAULT 0 COMMENT 'CA: Consents approved by user',
    consent_fulfilled_count UInt32 DEFAULT 0 COMMENT 'CF: Consents fulfilled (first data fetch)',
    data_fetch_initiated_count UInt32 DEFAULT 0 COMMENT 'DRI: Data fetch requests initiated',
    data_fetch_fulfilled_count UInt32 DEFAULT 0 COMMENT 'DRF: Data fetch requests fulfilled',
    
    -- Revenue Facts - MD-Space: "REVENUE_ATTRIBUTION = ₹3/fetch vs ₹0.10/month"
    revenue_amount Decimal(10,2) DEFAULT 0 COMMENT 'Revenue in INR based on purpose code and billing model',
    
    -- Target Tracking Facts - MD-Space: "DAILY_DEFICIT_TRACKING"
    target_consents_initiated UInt32 DEFAULT 0 COMMENT 'Target CI for this day',
    target_consents_fulfilled UInt32 DEFAULT 0 COMMENT 'Target CF for this day',
    deficit_from_target Decimal(10,2) DEFAULT 0 COMMENT 'Shortfall from daily targets',
    
    -- Error and Failure Tracking
    consent_rejected_count UInt32 DEFAULT 0 COMMENT 'Consents rejected by user',
    consent_failed_count UInt32 DEFAULT 0 COMMENT 'Consents failed due to technical issues',
    data_fetch_failed_count UInt32 DEFAULT 0 COMMENT 'Data fetch requests that failed',
    fip_timeout_count UInt32 DEFAULT 0 COMMENT 'Requests that timed out at FIP level',
    
    -- SEMI-ADDITIVE FACTS (Can be averaged, not summed across time)
    journey_duration_minutes UInt32 DEFAULT 0 COMMENT 'Total journey time from CI to CF',
    fip_response_time_ms UInt32 DEFAULT 0 COMMENT 'Average FIP API response time',
    
    -- Regulatory Compliance Facts - MD-Space: "SAHAMATI_REPORTING"
    avg_timelag_fi_notification_ms UInt32 DEFAULT 0 COMMENT 'Average notification timelag for Sahamati',
    
    -- Customer Value Facts - MD-Space: "CUSTOMER_IDENTITY_RESOLUTION_ENHANCED"
    customer_lifetime_value_estimate Decimal(10,2) DEFAULT 0 COMMENT 'Estimated CLV based on user behavior',
    
    -- Market Analysis Facts - MD-Space: "GROWTH_RATE_TRACKING = 2x market rate"
    market_growth_rate Decimal(8,4) DEFAULT 0 COMMENT 'Market growth rate for comparison',
    
    -- NON-ADDITIVE FACTS (Calculated at query time using other facts)
    -- These are stored for performance but should be calculated
    conversion_rate_ci_to_cf Decimal(8,6) DEFAULT 0 COMMENT 'CI to CF conversion rate',
    overall_success_rate Decimal(8,6) DEFAULT 0 COMMENT 'End-to-end success rate',
    fi_ready_to_request_ratio Decimal(8,6) DEFAULT 0 COMMENT 'FI ready to FI request ratio',
    fip_attribution_ratio Decimal(8,6) DEFAULT 0 COMMENT 'Ratio of failures attributed to FIP',
    aa_reliability_score Decimal(8,6) DEFAULT 0 COMMENT 'AA reliability excluding FIP issues',
    competitive_position_score Decimal(8,4) DEFAULT 0 COMMENT 'Position vs OnemoneyAA',
    
    -- Metadata
    created_at DateTime64(3) DEFAULT now64(3),
    updated_at DateTime64(3) DEFAULT now64(3)
    
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(event_date) 
ORDER BY (event_date, fiu_key, purpose_code_key, consent_handle)
SETTINGS index_granularity = 8192
COMMENT 'Central fact table for consent journey funnel analysis - Grain: One row per consent handle per day';

-- =====================================================
-- PERFORMANCE OPTIMIZATION
-- =====================================================

-- Data Skipping Index for faster FIP health queries
ALTER TABLE gold.fact_consent_journey 
ADD INDEX idx_fip_performance (fip_key, fip_response_time_ms) TYPE minmax GRANULARITY 4;

-- Data Skipping Index for revenue analysis
ALTER TABLE gold.fact_consent_journey 
ADD INDEX idx_revenue (purpose_code_key, revenue_amount) TYPE minmax GRANULARITY 4;

-- Data Skipping Index for success rate analysis  
ALTER TABLE gold.fact_consent_journey
ADD INDEX idx_success_rates (overall_success_rate, aa_reliability_score) TYPE minmax GRANULARITY 4;

-- =====================================================
-- MATERIALIZED VIEWS FOR COMMON AGGREGATIONS
-- =====================================================

-- Daily Summary for Executive Dashboard (5-stage funnel)
CREATE MATERIALIZED VIEW gold.mv_daily_funnel_summary
ENGINE = SummingMergeTree()
ORDER BY (event_date, fiu_key)
POPULATE
AS SELECT
    event_date,
    fiu_key,
    sum(consent_initiated_count) as total_ci,
    sum(consent_approved_count) as total_ca, 
    sum(consent_fulfilled_count) as total_cf,
    sum(data_fetch_initiated_count) as total_dri,
    sum(data_fetch_fulfilled_count) as total_drf,
    sum(revenue_amount) as total_revenue,
    avg(overall_success_rate) as avg_success_rate
FROM gold.fact_consent_journey
GROUP BY event_date, fiu_key;

-- FIP Health Summary for Operations Team
CREATE MATERIALIZED VIEW gold.mv_fip_health_summary  
ENGINE = ReplacingMergeTree()
ORDER BY (event_date, fip_key)
POPULATE
AS SELECT
    event_date,
    fip_key,
    sum(fip_timeout_count) as total_timeouts,
    avg(fip_response_time_ms) as avg_response_time,
    avg(fip_attribution_ratio) as avg_fip_attribution,
    sum(consent_initiated_count) as total_attempts,
    sum(consent_failed_count) as total_failures
FROM gold.fact_consent_journey  
GROUP BY event_date, fip_key;

-- =====================================================
-- DATA QUALITY CONSTRAINTS
-- =====================================================

-- Ensure referential integrity through CHECK constraints where possible
-- Note: ClickHouse doesn't enforce FK constraints, so these are documentation

-- Example data quality checks (can be run as monitoring queries)
/*
-- Daily data quality check for fact table completeness
SELECT 
    event_date,
    count(*) as total_records,
    sum(consent_initiated_count) as total_ci,
    countIf(consent_initiated_count = 0) as zero_ci_records,
    countIf(fiu_key = 0) as missing_fiu_key
FROM gold.fact_consent_journey 
WHERE event_date = today() - 1
GROUP BY event_date;
*/

-- =====================================================
-- COMMENTS AND DOCUMENTATION
-- =====================================================

/*
BUSINESS CONTEXT SUMMARY:
- Supports <3 second drill-down from executive funnel to individual logs
- Enables revenue attribution based on purpose codes (₹3 vs ₹0.10)  
- Separates FIP health issues from AA performance problems
- Supports automated Sahamati and RBI regulatory reporting
- Tracks progress toward 2x market growth rate target
- Grain allows aggregation up to monthly and drill-down to events

PERFORMANCE CONSIDERATIONS:
- Monthly partitioning for efficient query pruning
- Ordered by date, fiu, purpose_code for optimal compression
- Data skipping indices for common query patterns
- Materialized views for frequently accessed aggregations
- Semi-additive facts stored for performance but calculable from atomic facts

MAINTENANCE:
- Partition drops for old data retention (7+ years for compliance)
- Regular OPTIMIZE TABLE operations for better compression
- Monitor materialized view refresh performance
- Update dimension tables via SCD Type 1 (overwrite) or Type 2 (versioning)
*/ ```