# ingest.py
# Generates schema_chunks.json from your PostgreSQL database metadata

import json
import os
import sys
from contextlib import closing
from pathlib import Path

import pandas as pd
import psycopg2

# ────────────────────────────────────────────────
# Connection parameters
# ────────────────────────────────────────────────
# SECURITY NOTE(review): credentials were previously hard-coded in source.
# They can now be supplied via environment variables; the literal values are
# kept only as backward-compatible fallbacks. Rotate this password and drop
# the fallbacks once the environment is configured.
conn_params = {
    'dbname':   os.getenv('DB_NAME',     'four_s_india'),
    'user':     os.getenv('DB_USER',     'postgres'),
    'password': os.getenv('DB_PASSWORD', 'gQube1_#@96740'),
    'host':     os.getenv('DB_HOST',     '139.59.87.82'),
    'port':     os.getenv('DB_PORT',     '5432'),
}

# Destination file for the generated schema chunks (overridable via env).
output_json = os.getenv('SCHEMA_CHUNKS_OUT', "schema_chunks.json")

# ────────────────────────────────────────────────
# Custom notes for key tables (expand as needed)
# ────────────────────────────────────────────────
# Maps a bare table name (no schema prefix) to a free-text hint that is
# appended to that table's chunk under "Important notes:".  Tables not
# listed here fall back to a generic auto-generated note (see the chunk
# builder below).  The hints are aimed at a downstream text-to-SQL /
# retrieval consumer, hence the emphasis on types and join keys.
table_notes = {
    # Core farmer registry — the hint stresses that 'state' is stored as
    # text, so generated SQL must compare against quoted strings.
    'upload_farmer': (
        "Main table for REGISTERED FARMERS. "
        "Use this for counting farmers, registration stats, or farmer lists. "
        "Column 'state' is character varying(255) → string/text. "
        "Likely values: 'West Bengal', 'WB', or numeric ID stored as string '4'. "
        "Do NOT compare to integer directly — use quotes: state = 'West Bengal' or state = '4'. "
        "userType or similar column may indicate farmer (value 2)."
    ),
    # General user accounts; role is encoded in user_type.
    'agri_users': (
        "General user profiles. "
        "user_type indicates role (1=supervisor, 2=farmer, etc.). "
        "state_id is likely numeric ID for location."
    ),
    # Child table of upload_farmer — documents the join key.
    'upload_farmer_details': (
        "Farmer plot and crop details. "
        "Join to upload_farmer on contactNumber or user_id."
    ),
    # Transaction fact table; categories live in agri_account_heads.
    'agri_account_transactions': (
        "Financial transactions. "
        "Join to agri_account_heads on head_id for meaningful category names."
    ),
    # Lookup/master table referenced by head_id.
    'agri_account_heads': (
        "Master table for account categories (e.g. head_name = 'Fertilizer Cost', 'Transport'). "
        "Reference via head_id in other tables."
    ),
    # Add more tables here with specific hints
}

# ────────────────────────────────────────────────
# Fetch metadata
# ────────────────────────────────────────────────
# One row per column across all user schemas, ordered so that a groupby on
# (table_schema, table_name) yields columns in their declared order
# (ordinal_position).  System schemas are excluded; udt_name gives the
# underlying type for domains/user-defined types.
query = """
SELECT 
    table_schema,
    table_name,
    column_name,
    ordinal_position,
    column_default,
    is_nullable,
    data_type,
    character_maximum_length,
    numeric_precision,
    numeric_scale,
    udt_name
FROM information_schema.columns
WHERE table_schema NOT IN ('pg_catalog', 'information_schema')
-- AND table_schema = 'public'  -- uncomment if you want only public schema
ORDER BY table_schema, table_name, ordinal_position;
"""

print("Connecting to database...")
try:
    # closing() guarantees conn.close() even if the query itself raises —
    # the original only closed on the success path and leaked otherwise.
    # (psycopg2's own `with conn:` manages transactions, NOT closing.)
    with closing(psycopg2.connect(**conn_params)) as conn:
        df = pd.read_sql_query(query, conn)
    print(f"Fetched metadata for {df['table_name'].nunique()} tables, {len(df)} columns total.")
except Exception as e:
    # Broad catch is intentional at this top-level boundary: report and abort.
    print(f"Connection failed: {e}")
    sys.exit(1)  # sys.exit is reliable in all run modes; bare exit() is a site-module convenience

# ────────────────────────────────────────────────
# Build chunks
# ────────────────────────────────────────────────
def _friendly_hint(dtype):
    """Map a postgres data_type to a human-friendly suffix ('' if none)."""
    if 'char' in dtype or 'text' in dtype:
        return " (string/text)"
    if 'int' in dtype or 'numeric' in dtype:
        return " (number)"
    if 'timestamp' in dtype or 'date' in dtype:
        return " (date/time)"
    if 'bool' in dtype:
        return " (boolean)"
    return ""

chunks = []

# One chunk per table: a readable text description plus lookup metadata.
for (schema_name, tbl), cols in df.groupby(['table_schema', 'table_name']):
    # Per-type column counts give the consumer a quick shape of the table.
    type_counts = cols['data_type'].value_counts().to_dict()
    summary = ", ".join(f"{t}: {n}" for t, n in type_counts.items())

    body = [
        f"Table: {schema_name}.{tbl}",
        f"Column types summary: {summary}",
        "Columns:",
    ]

    # Columns arrive in ordinal_position order thanks to the query's ORDER BY.
    for _, col in cols.iterrows():
        dtype = col['data_type']
        entry = f"  - {col['column_name']} : {dtype}" + _friendly_hint(dtype)

        # Length/precision decorations, only where the catalog provides them.
        if pd.notna(col['character_maximum_length']):
            entry += f" ({int(col['character_maximum_length'])} chars)"
        if pd.notna(col['numeric_precision']):
            precision = int(col['numeric_precision'])
            scale = int(col['numeric_scale']) if pd.notna(col['numeric_scale']) else 0
            entry += f" ({precision},{scale})"

        entry += " NULL" if col['is_nullable'] == 'YES' else " NOT NULL"

        if pd.notna(col['column_default']):
            default_txt = str(col['column_default']).replace('\n', ' ').strip()
            entry += f" DEFAULT {default_txt}"

        body.append(entry)

    # Curated note if we have one, otherwise a generic auto-generated one.
    fallback = f"Agriculture-related table for {tbl.replace('agri_', '').replace('_', ' ')}."
    body.append("")
    body.append(f"Important notes: {table_notes.get(tbl, fallback)}")

    chunks.append({
        "text": "\n".join(body),
        "metadata": {
            "schema": schema_name,
            "table": tbl,
            "full_table_name": f"{schema_name}.{tbl}",
        },
    })

    print(f"  ✔ {schema_name}.{tbl} ({len(cols)} columns)")

# ────────────────────────────────────────────────
# Save to file
# ────────────────────────────────────────────────
try:
    # Serialize first, then write in one shot via pathlib.
    payload = json.dumps(chunks, indent=2, ensure_ascii=False)
    Path(output_json).write_text(payload, encoding="utf-8")
    print(f"\n✅ Done! {len(chunks)} table chunks saved to {output_json}")
except Exception as e:
    print(f"Failed to save JSON: {e}")