Initial commit: funeral provider discovery pipeline
Python crawlers for VIC Register, Funerals Australia, NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
This commit is contained in:
70
crawlers/crawl_all.py
Normal file
70
crawlers/crawl_all.py
Normal file
@@ -0,0 +1,70 @@
|
||||
"""Run all source crawlers and then deduplicate into the provider database."""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from base import get_db
|
||||
|
||||
|
||||
def run_all(gathered_here_limit: int | None = None):
    """Run every source crawler in sequence, then print per-source counts.

    The crawlers run in a fixed order (NFDA, Funerals Australia, VIC
    register, Gathered Here). Afterwards the ``source_record`` table is
    summarised per ``source_name`` and the totals are printed to stdout.

    Args:
        gathered_here_limit: Forwarded as ``limit`` to
            ``crawl_gathered_here.run``. ``None`` is passed through
            unchanged.
    """
    banner = "=" * 60
    print(banner)
    print("PROVIDER DISCOVERY PIPELINE")
    print(banner)

    # Import crawlers
    import crawl_nfda
    import crawl_funerals_australia
    import crawl_vic_register
    import crawl_gathered_here

    # Run in order: fast API sources first, then slower HTML scraping
    print("\n--- 1/4: NFDA Directory ---")
    crawl_nfda.run()

    print("\n--- 2/4: Funerals Australia ---")
    crawl_funerals_australia.run()

    print("\n--- 3/4: VIC Consumer Affairs Register ---")
    crawl_vic_register.run()

    print("\n--- 4/4: Gathered Here ---")
    crawl_gathered_here.run(limit=gathered_here_limit)

    # Summary
    db = get_db()
    try:
        print("\n" + banner)
        print("CRAWL SUMMARY")
        print(banner)

        # Rows are read by column name below, so get_db() presumably
        # configures a name-addressable row factory -- confirm in base.py.
        rows = db.execute(
            """SELECT source_name,
                      COUNT(*) as total,
                      SUM(CASE WHEN matched_brand_id IS NOT NULL THEN 1 ELSE 0 END) as matched
               FROM source_record
               GROUP BY source_name"""
        ).fetchall()

        for row in rows:
            print(f"  {row['source_name']:25s} {row['total']:5d} records "
                  f"({row['matched']} matched)")

        total = db.execute("SELECT COUNT(*) as n FROM source_record").fetchone()["n"]
        print(f"  {'TOTAL':25s} {total:5d} records")
    finally:
        # Release the connection even if a summary query or format fails.
        db.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI handling:
    #   --test      -> cap the Gathered Here crawl at 10 profiles
    #   <integer>   -> first positional argument is used as the cap
    #   (nothing)   -> no cap
    limit = None
    if "--test" in sys.argv:
        limit = 10
        print("TEST MODE: Gathered Here limited to 10 profiles")
    elif len(sys.argv) > 1:
        try:
            limit = int(sys.argv[1])
        except ValueError:
            # Keep the best-effort fallback (run without a limit), but make
            # it visible: a typo here otherwise starts a full crawl silently.
            print(
                f"WARNING: ignoring non-integer limit {sys.argv[1]!r}; "
                "running Gathered Here without a limit",
                file=sys.stderr,
            )

    run_all(gathered_here_limit=limit)
|
||||
Reference in New Issue
Block a user