Python crawlers for VIC Register, Funerals Australia, and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
71 lines
1.8 KiB
Python
71 lines
1.8 KiB
Python
"""Run all source crawlers and then deduplicate into the provider database."""
|
|
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from base import get_db
|
|
|
|
|
|
def run_all(gathered_here_limit: int | None = None):
    """Run every source crawler in sequence, then print a per-source summary.

    The four crawler modules are invoked one after another; an exception in
    any of them propagates and stops the pipeline. Afterwards the
    ``source_record`` table is queried and a count of records (and of records
    with a non-NULL ``matched_brand_id``) is printed for each ``source_name``.

    Args:
        gathered_here_limit: Forwarded unchanged as ``limit`` to
            ``crawl_gathered_here.run``.
    """
    banner = "=" * 60

    print(banner)
    print("PROVIDER DISCOVERY PIPELINE")
    print(banner)

    # Crawler modules are imported here, at call time, rather than at
    # module import time.
    import crawl_nfda
    import crawl_funerals_australia
    import crawl_vic_register
    import crawl_gathered_here

    # Run in order: fast API sources first, then slower HTML scraping
    print("\n--- 1/4: NFDA Directory ---")
    crawl_nfda.run()

    print("\n--- 2/4: Funerals Australia ---")
    crawl_funerals_australia.run()

    print("\n--- 3/4: VIC Consumer Affairs Register ---")
    crawl_vic_register.run()

    print("\n--- 4/4: Gathered Here ---")
    crawl_gathered_here.run(limit=gathered_here_limit)

    # Summary
    db = get_db()
    try:
        print("\n" + banner)
        print("CRAWL SUMMARY")
        print(banner)

        rows = db.execute(
            """SELECT source_name,
                      COUNT(*) as total,
                      SUM(CASE WHEN matched_brand_id IS NOT NULL THEN 1 ELSE 0 END) as matched
               FROM source_record
               GROUP BY source_name"""
        ).fetchall()

        # Rows are indexed by column name, so get_db() is assumed to return
        # a connection with a mapping-style row factory.
        for row in rows:
            print(f" {row['source_name']:25s} {row['total']:5d} records "
                  f"({row['matched']} matched)")

        total = db.execute("SELECT COUNT(*) as n FROM source_record").fetchone()["n"]
        print(f" {'TOTAL':25s} {total:5d} records")
    finally:
        # Release the connection even when a summary query or a format
        # call above raises.
        db.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI handling:
    #   --test        -> pass a limit of 10 to the Gathered Here crawler
    #   <integer>     -> pass that integer (first argument) as the limit
    #   anything else -> no limit is passed (limit stays None)
    limit = None

    if "--test" in sys.argv:
        limit = 10
        print("TEST MODE: Gathered Here limited to 10 profiles")
    elif len(sys.argv) > 1:
        try:
            limit = int(sys.argv[1])
        except ValueError:
            # Keep the best-effort fallback (limit stays None), but report
            # it: a mistyped limit would otherwise start an uncapped run
            # with no indication that the argument was discarded.
            print(
                f"WARNING: ignoring non-integer limit argument {sys.argv[1]!r}; "
                "no Gathered Here limit will be applied",
                file=sys.stderr,
            )

    run_all(gathered_here_limit=limit)
|