"""Run all source crawlers and report what landed in the provider database.

Command-line usage:
    (no arguments)  run every crawler without a limit
    --test          limit the Gathered Here crawler to 10 profiles
    N               limit the Gathered Here crawler to N profiles (integer)

NOTE(review): no separate deduplication step is invoked from this file; the
``matched_brand_id`` column read by the summary is presumably populated by the
crawlers themselves -- confirm against the crawler modules.
"""

import sys
import time
from pathlib import Path

from base import get_db


def _print_summary(db) -> None:
    """Print per-source and overall record counts from ``source_record``.

    Args:
        db: Open database connection as returned by ``get_db()``. Rows are
            accessed by column name, so the connection is assumed to use a
            name-addressable row factory -- TODO confirm in ``base.get_db``.
    """
    print("\n" + "=" * 60)
    print("CRAWL SUMMARY")
    print("=" * 60)

    rows = db.execute(
        """SELECT source_name, COUNT(*) as total,
                  SUM(CASE WHEN matched_brand_id IS NOT NULL THEN 1 ELSE 0 END) as matched
           FROM source_record
           GROUP BY source_name"""
    ).fetchall()

    for row in rows:
        print(f"  {row['source_name']:25s}  {row['total']:5d} records  "
              f"({row['matched']} matched)")

    total = db.execute("SELECT COUNT(*) as n FROM source_record").fetchone()["n"]
    print(f"  {'TOTAL':25s}  {total:5d} records")


def run_all(gathered_here_limit: int | None = None):
    """Run all crawlers sequentially, then print a crawl summary.

    Args:
        gathered_here_limit: Forwarded as ``limit`` to
            ``crawl_gathered_here.run``; ``None`` means no limit is passed on.

    Any exception raised by a crawler propagates and aborts the pipeline
    before the summary is printed.
    """
    print("=" * 60)
    print("PROVIDER DISCOVERY PIPELINE")
    print("=" * 60)

    # Crawlers are imported lazily so importing this module stays cheap and
    # has no side effects beyond the definitions above.
    import crawl_nfda
    import crawl_funerals_australia
    import crawl_vic_register
    import crawl_gathered_here

    # Run in order: fast API sources first, then slower HTML scraping
    print("\n--- 1/4: NFDA Directory ---")
    crawl_nfda.run()

    print("\n--- 2/4: Funerals Australia ---")
    crawl_funerals_australia.run()

    print("\n--- 3/4: VIC Consumer Affairs Register ---")
    crawl_vic_register.run()

    print("\n--- 4/4: Gathered Here ---")
    crawl_gathered_here.run(limit=gathered_here_limit)

    # Summary: always release the connection, even if a query or the
    # formatting of a row fails part-way through the report.
    db = get_db()
    try:
        _print_summary(db)
    finally:
        db.close()


if __name__ == "__main__":
    limit = None
    if "--test" in sys.argv:
        limit = 10
        print("TEST MODE: Gathered Here limited to 10 profiles")
    elif len(sys.argv) > 1:
        try:
            limit = int(sys.argv[1])
        except ValueError:
            # Still run without a limit (best-effort), but say so instead of
            # silently starting a full crawl on a mistyped argument.
            print(
                f"WARNING: ignoring non-integer limit {sys.argv[1]!r}; "
                "running without a limit",
                file=sys.stderr,
            )
    run_all(gathered_here_limit=limit)