Initial commit: funeral provider discovery pipeline

Python crawlers for VIC Register, Funerals Australia, NFDA
n8n workflows for scheduled discovery and enrichment
SQLite schema and seeded dev database (1,463 providers)
End-to-end process documentation in n8n/PROCESS.md
This commit is contained in:
Richie
2026-04-24 10:27:08 +10:00
commit cc91427789
30 changed files with 4706 additions and 0 deletions

70
crawlers/crawl_all.py Normal file
View File

@@ -0,0 +1,70 @@
"""Run all source crawlers and then deduplicate into the provider database."""
import sys
import time
from pathlib import Path
from base import get_db
def run_all(gathered_here_limit: int | None = None):
    """Run all crawlers sequentially, then print a per-source record summary.

    Args:
        gathered_here_limit: Forwarded unchanged as ``limit`` to
            ``crawl_gathered_here.run``. ``None`` is passed through as-is.

    Any exception raised by a crawler propagates and stops the run before
    the summary is printed. The database handle opened for the summary is
    always closed, even if a summary query fails.
    """
    print("=" * 60)
    print("PROVIDER DISCOVERY PIPELINE")
    print("=" * 60)

    # Crawler modules are imported inside the function, so merely importing
    # this file does not import them.
    import crawl_nfda
    import crawl_funerals_australia
    import crawl_vic_register
    import crawl_gathered_here

    # Run in order: fast API sources first, then slower HTML scraping
    print("\n--- 1/4: NFDA Directory ---")
    crawl_nfda.run()
    print("\n--- 2/4: Funerals Australia ---")
    crawl_funerals_australia.run()
    print("\n--- 3/4: VIC Consumer Affairs Register ---")
    crawl_vic_register.run()
    print("\n--- 4/4: Gathered Here ---")
    crawl_gathered_here.run(limit=gathered_here_limit)

    # Summary
    db = get_db()
    try:
        print("\n" + "=" * 60)
        print("CRAWL SUMMARY")
        print("=" * 60)
        # Per-source totals, plus how many rows have a non-NULL
        # matched_brand_id. Text is identical to the original query.
        summary_sql = (
            "SELECT source_name,\n"
            "COUNT(*) as total,\n"
            "SUM(CASE WHEN matched_brand_id IS NOT NULL THEN 1 ELSE 0 END) as matched\n"
            "FROM source_record\n"
            "GROUP BY source_name"
        )
        rows = db.execute(summary_sql).fetchall()
        for row in rows:
            # Rows are indexed by column name, so get_db() must return a
            # connection whose rows support that.
            print(f" {row['source_name']:25s} {row['total']:5d} records "
                  f"({row['matched']} matched)")
        total = db.execute("SELECT COUNT(*) as n FROM source_record").fetchone()["n"]
        print(f" {'TOTAL':25s} {total:5d} records")
    finally:
        # Release the handle even when a query or formatting step raises.
        db.close()
if __name__ == "__main__":
    # Command line handling:
    #   --test (anywhere in argv)  -> limit of 10
    #   <int> as first argument    -> that integer as the limit
    #   anything else              -> limit stays None
    limit = None
    if "--test" in sys.argv:
        limit = 10
        print("TEST MODE: Gathered Here limited to 10 profiles")
    elif len(sys.argv) > 1:
        try:
            limit = int(sys.argv[1])
        except ValueError:
            # Same fallback as before (limit stays None), but tell the
            # operator instead of silently discarding a mistyped argument.
            print(
                f"WARNING: ignoring non-integer limit argument {sys.argv[1]!r}; "
                "running with limit=None",
                file=sys.stderr,
            )
    run_all(gathered_here_limit=limit)