Files
Provider-Crawl/crawlers/run_overnight.sh
Richie cc91427789 Initial commit: funeral provider discovery pipeline
Python crawlers for VIC Register, Funerals Australia, NFDA
n8n workflows for scheduled discovery and enrichment
SQLite schema and seeded dev database (1,463 providers)
End-to-end process documentation in n8n/PROCESS.md
2026-04-24 10:27:08 +10:00

112 lines
4.8 KiB
Bash
Executable File

#!/bin/bash
# Full pipeline overnight run
# Usage: ./run_overnight.sh
#
# Before running:
# 1. Add your Serper API key to config.json
# 2. Optionally add your Anthropic API key for AI extraction
#
# This script runs all steps sequentially and logs everything to
# ../logs/overnight_<timestamp>.log (relative to this script's directory).
#
# Shell options:
#   -e           abort on the first failing command
#   -u           treat use of an unset variable as an error
#   -o pipefail  make `cmd | tee` fail when cmd fails; without it each
#                pipeline reports tee's exit status, so -e never fires
#                for a failing step
set -euo pipefail
# Run from the script's own directory so the relative paths used below
# (config.json, ../logs, the crawler scripts) resolve regardless of the
# caller's working directory.
cd "$(dirname "$0")"
LOG="../logs/overnight_$(date +%Y%m%d_%H%M%S).log"
mkdir -p ../logs
# Record where an aborted run stopped; otherwise the log would simply end.
trap 'echo "ERROR: pipeline aborted at line $LINENO (exit $?)" | tee -a "$LOG" >&2' ERR
# The first write uses plain tee (no -a) to create the log file.
echo "=== OVERNIGHT PIPELINE RUN ===" | tee "$LOG"
echo "Started: $(date)" | tee -a "$LOG"
echo "" | tee -a "$LOG"
# Check config
# Print the value stored under key $1 in config.json, or an empty line when
# the key is missing or its value is falsy (null, "", 0, ...).
config_value() {
  python3 -c "import json, sys; print(json.load(open('config.json')).get(sys.argv[1]) or '')" "$1"
}
SERPER_KEY=$(config_value serper_api_key)
ANTHROPIC_KEY=$(config_value anthropic_api_key)
# Only whether each key is present is reported; the key values themselves
# are never written to the log.
if [ -n "$SERPER_KEY" ]; then
  echo "Serper API key: configured" | tee -a "$LOG"
else
  echo "WARNING: No Serper API key — website discovery will use DDG (slower, lower hit rate)" | tee -a "$LOG"
fi
if [ -n "$ANTHROPIC_KEY" ]; then
  echo "Anthropic API key: configured" | tee -a "$LOG"
else
  echo "WARNING: No Anthropic API key — AI extraction will be skipped" | tee -a "$LOG"
fi
echo "" | tee -a "$LOG"
# Step 1: Source crawlers
echo "=== STEP 1: Source Crawlers ===" | tee -a "$LOG"
# Each entry is "<display name>:<script file>"; crawlers run in list order.
crawlers=(
  "VIC Register:crawl_vic_register.py"
  "Funerals Australia:crawl_funerals_australia.py"
  "NFDA:crawl_nfda.py"
)
for entry in "${crawlers[@]}"; do
  label=${entry%%:*}
  script=${entry#*:}
  echo "[$(date +%H:%M:%S)] Running ${label} crawler..." | tee -a "$LOG"
  # stderr is merged into stdout so crawler errors land in the log too.
  python3 "$script" 2>&1 | tee -a "$LOG"
done
# Step 2: Deduplication
# The three banner lines share one tee; their content and order are the same
# as writing them individually.
{
  echo ""
  echo "=== STEP 2: Deduplication ==="
  echo "[$(date +%H:%M:%S)] Running dedup..."
} | tee -a "$LOG"
python3 dedup.py 2>&1 | tee -a "$LOG"
# Step 3: Website discovery (all providers without one)
echo "" | tee -a "$LOG"
echo "=== STEP 3: Website Discovery ===" | tee -a "$LOG"
# Count unverified providers that have no website recorded yet.
NEED_WEBSITE=$(python3 -c "from base import get_db; db=get_db(); print(db.execute('SELECT COUNT(*) FROM funeral_brand WHERE website IS NULL AND verified=0').fetchone()[0])")
echo "[$(date +%H:%M:%S)] Providers needing websites: $NEED_WEBSITE" | tee -a "$LOG"
# The count drives integer arithmetic below. If it is anything other than a
# plain non-negative integer (for example because something else was printed
# to stdout alongside it), say so explicitly instead of letting the loop
# condition error out and skip discovery without explanation.
if ! [[ "$NEED_WEBSITE" =~ ^(0|[1-9][0-9]*)$ ]]; then
  echo "WARNING: could not read provider count (got '$NEED_WEBSITE') — skipping website discovery" | tee -a "$LOG"
  NEED_WEBSITE=0
fi
# Process in batches of 200 to avoid issues
# NOTE(review): OFFSET is only a local progress counter and is never passed
# to discover_websites.py, so each batch presumably relies on that script
# picking providers it has not handled yet — confirm against the script.
BATCH=200
OFFSET=0
while (( OFFSET < NEED_WEBSITE )); do
  REMAINING=$((NEED_WEBSITE - OFFSET))
  # The last batch may be smaller than BATCH.
  CURRENT=$((REMAINING < BATCH ? REMAINING : BATCH))
  echo "[$(date +%H:%M:%S)] Discovering websites batch $((OFFSET/BATCH + 1)) ($CURRENT providers)..." | tee -a "$LOG"
  python3 discover_websites.py --limit="$CURRENT" 2>&1 | tee -a "$LOG"
  OFFSET=$((OFFSET + BATCH))
  # Brief pause between batches; nothing follows the last one, so skip it there.
  if (( OFFSET < NEED_WEBSITE )); then
    sleep 5
  fi
done
# Step 4: Website enrichment (all with website, not yet enriched)
echo "" | tee -a "$LOG"
echo "=== STEP 4: Website Enrichment ===" | tee -a "$LOG"
# Count unverified providers that have a website and are still pending
# enrichment. 'pending' is written as a single-quoted SQL string literal:
# a double-quoted "pending" is an identifier in SQL, and SQLite only falls
# back to treating it as a string when no such column exists and the
# double-quoted-string compatibility quirk has not been disabled.
NEED_ENRICH=$(python3 -c "from base import get_db; db=get_db(); print(db.execute(\"SELECT COUNT(*) FROM funeral_brand WHERE website IS NOT NULL AND enrichment_status='pending' AND verified=0\").fetchone()[0])")
echo "[$(date +%H:%M:%S)] Providers needing enrichment: $NEED_ENRICH" | tee -a "$LOG"
# A single run covers every pending provider, so the limit is the full count.
python3 enrich_websites.py --limit="$NEED_ENRICH" 2>&1 | tee -a "$LOG"
# Step 5: Compute tiers
# One printf emits the blank separator line followed by the step banner.
printf '\n%s\n' "=== STEP 5: Compute Tiers ===" | tee -a "$LOG"
python3 compute_tiers.py 2>&1 | tee -a "$LOG"
# Final summary
echo "" | tee -a "$LOG"
echo "=== FINAL SUMMARY ===" | tee -a "$LOG"
# The report program is passed to Python through a quoted here-document, so
# the shell performs no expansion or unescaping on it: the '$.has_pricing'
# JSON paths reach SQLite literally and no backslash-escaped quotes are
# needed. SQL text is built outside the f-string replacement fields because
# backslashes inside those fields are a SyntaxError before Python 3.12.
python3 - <<'PY' 2>&1 | tee -a "$LOG"
from base import get_db

db = get_db()


def count(sql):
    """Run a query that yields a single number and return that number."""
    return db.execute(sql).fetchone()[0]


print('Database Status:')
# (label, WHERE clause appended to the base provider count query)
provider_counts = [
    ('Total providers', ''),
    ('With phone', ' WHERE phone IS NOT NULL'),
    ('With email', ' WHERE email IS NOT NULL'),
    ('With website', ' WHERE website IS NOT NULL'),
    ('With description', ' WHERE description IS NOT NULL'),
]
for label, where in provider_counts:
    total = count('SELECT COUNT(*) FROM funeral_brand' + where)
    print(f' {label}: {total}')
print()
print('Listing Tiers:')
tier_sql = (
    'SELECT listing_tier, COUNT(*) as n FROM funeral_brand '
    'GROUP BY listing_tier ORDER BY n DESC'
)
for tier, n in db.execute(tier_sql):
    # str() keeps the row printable if listing_tier is NULL for some providers.
    print(f' {str(tier):12s} {n:>6d}')
print()
print('Pricing Pages:')
crawled_sql = "SELECT COUNT(*) FROM source_record WHERE source_name='website_crawl'"
pricing_sql = crawled_sql + " AND json_extract(raw_data, '$.has_pricing')=1"
pdf_sql = crawled_sql + " AND json_extract(raw_data, '$.pdf_links') != '[]'"
print(f' Total crawled: {count(crawled_sql)}')
print(f' With pricing: {count(pricing_sql)}')
print(f' With PDF links: {count(pdf_sql)}')
PY
echo "" | tee -a "$LOG"
echo "Finished: $(date)" | tee -a "$LOG"
# Printed to the terminal only; the log does not need to name itself.
echo "Log saved to: $LOG"