Initial commit: funeral provider discovery pipeline

- Python crawlers for VIC Register, Funerals Australia, NFDA
- n8n workflows for scheduled discovery and enrichment
- SQLite schema and seeded dev database (1,463 providers)
- End-to-end process documentation in n8n/PROCESS.md
2026-04-24 10:27:08 +10:00
commit cc91427789
30 changed files with 4706 additions and 0 deletions
--- a/crawlers/crawl_all.py
+++ b/crawlers/crawl_all.py
@@ -0,0 +1,70 @@
+"""Run all source crawlers and then deduplicate into the provider database."""
+
+import sys
+import time
+from pathlib import Path
+
+from base import get_db
+
+
+def run_all(gathered_here_limit: int | None = None):
+    """Run the four source crawlers in sequence, then print per-source record counts.
+
+    Args:
+        gathered_here_limit: Forwarded as ``limit`` to ``crawl_gathered_here.run``.
+
+    Any exception raised by a crawler propagates and stops the run.
+    """
+    banner = "=" * 60
+    print(banner)
+    print("PROVIDER DISCOVERY PIPELINE")
+    print(banner)
+    # Crawler modules are imported here, so they load only when the pipeline runs.
+    import crawl_nfda
+    import crawl_funerals_australia
+    import crawl_vic_register
+    import crawl_gathered_here
+
+    # Run in order: fast API sources first, then slower HTML scraping
+    print("\n--- 1/4: NFDA Directory ---")
+    crawl_nfda.run()
+    print("\n--- 2/4: Funerals Australia ---")
+    crawl_funerals_australia.run()
+    print("\n--- 3/4: VIC Consumer Affairs Register ---")
+    crawl_vic_register.run()
+    print("\n--- 4/4: Gathered Here ---")
+    crawl_gathered_here.run(limit=gathered_here_limit)
+
+    db = get_db()
+    try:  # always release the connection, even if a summary query or print raises
+        print("\n" + banner)
+        print("CRAWL SUMMARY")
+        print(banner)
+        rows = db.execute(
+            """SELECT source_name,
+                      COUNT(*) as total,
+                      SUM(CASE WHEN matched_brand_id IS NOT NULL THEN 1 ELSE 0 END) as matched
+               FROM source_record
+               GROUP BY source_name"""
+        ).fetchall()
+        for row in rows:
+            print(f"  {row['source_name']:25s} {row['total']:5d} records "
+                  f"({row['matched']} matched)")
+        total = db.execute("SELECT COUNT(*) as n FROM source_record").fetchone()["n"]
+        print(f"  {'TOTAL':25s} {total:5d} records")
+    finally:
+        db.close()
+
+
+if __name__ == "__main__":
+    limit = None  # stays None unless --test or an integer first argument is given
+    if "--test" in sys.argv:
+        limit = 10
+        print("TEST MODE: Gathered Here limited to 10 profiles")
+    elif len(sys.argv) > 1:
+        try:
+            limit = int(sys.argv[1])
+        except ValueError:  # keep going with limit=None, but report it instead of hiding it
+            print(f"Ignoring non-integer limit argument {sys.argv[1]!r}", file=sys.stderr)
+
+    run_all(gathered_here_limit=limit)