"""Run all source crawlers and then deduplicate into the provider database."""

import sys
import time
from pathlib import Path

from base import get_db


def run_all(gathered_here_limit: int | None = None) -> None:
    """Run every source crawler in sequence, then print a crawl summary.

    The crawlers run in a fixed order (NFDA, Funerals Australia, VIC
    register, Gathered Here). No crawler call is wrapped in error handling,
    so an exception raised by one of them propagates and the remaining
    crawlers and the summary are skipped.

    After crawling, the ``source_record`` table is queried for per-source
    totals and for the number of rows with a non-NULL ``matched_brand_id``,
    and the results are printed to stdout.

    Args:
        gathered_here_limit: Forwarded as ``limit=`` to
            ``crawl_gathered_here.run``. ``None`` is passed through
            unchanged (presumably "no limit" -- confirm in that module).
    """
    banner = "=" * 60
    print(banner)
    print("PROVIDER DISCOVERY PIPELINE")
    print(banner)

    # Crawler modules are imported lazily, inside the function, so importing
    # this module does not pull them in.
    import crawl_nfda
    import crawl_funerals_australia
    import crawl_vic_register
    import crawl_gathered_here

    # Run in order: fast API sources first, then slower HTML scraping
    print("\n--- 1/4: NFDA Directory ---")
    crawl_nfda.run()

    print("\n--- 2/4: Funerals Australia ---")
    crawl_funerals_australia.run()

    print("\n--- 3/4: VIC Consumer Affairs Register ---")
    crawl_vic_register.run()

    print("\n--- 4/4: Gathered Here ---")
    crawl_gathered_here.run(limit=gathered_here_limit)

    # Summary
    # NOTE(review): rows are indexed by column name below, so get_db() is
    # assumed to return a connection with a mapping-style row factory.
    db = get_db()
    try:
        print("\n" + banner)
        print("CRAWL SUMMARY")
        print(banner)

        rows = db.execute(
            """SELECT source_name,
                      COUNT(*) as total,
                      SUM(CASE WHEN matched_brand_id IS NOT NULL THEN 1 ELSE 0 END) as matched
               FROM source_record
               GROUP BY source_name"""
        ).fetchall()

        for row in rows:
            print(f"  {row['source_name']:25s} {row['total']:5d} records "
                  f"({row['matched']} matched)")

        total = db.execute("SELECT COUNT(*) as n FROM source_record").fetchone()["n"]
        print(f"  {'TOTAL':25s} {total:5d} records")
    finally:
        # Release the connection even if a query or the formatting fails.
        db.close()


if __name__ == "__main__":
    # CLI handling:
    #   --test        -> limit Gathered Here to 10 profiles
    #   <integer>     -> first positional argument is used as the limit
    #   anything else -> limit stays None
    limit = None
    if "--test" in sys.argv:
        limit = 10
        print("TEST MODE: Gathered Here limited to 10 profiles")
    elif len(sys.argv) > 1:
        try:
            limit = int(sys.argv[1])
        except ValueError:
            # Keep the best-effort fallback (limit=None), but report it so a
            # mistyped argument does not silently start an unlimited run.
            print(
                f"WARNING: ignoring non-integer limit argument "
                f"{sys.argv[1]!r}; using limit=None",
                file=sys.stderr,
            )

    run_all(gathered_here_limit=limit)