#!/bin/bash
# Full pipeline overnight run
# Usage: ./run_overnight.sh
#
# Before running:
#   1. Add your Serper API key to config.json
#   2. Optionally add your Anthropic API key for AI extraction
#
# This script runs all steps sequentially and logs everything.
#
# Failure policy: every step is piped through tee, so `pipefail` is required
# for `set -e` to notice a failing Python step (without it the pipeline's
# status is always tee's). On any non-zero exit an ERROR line is appended to
# the log so an aborted overnight run is visible the next morning.

set -euo pipefail

# All relative paths below (config.json, the *.py steps, ../logs) are resolved
# from the directory that contains this script.
cd -- "$(dirname -- "$0")"

LOG="../logs/overnight_$(date +%Y%m%d_%H%M%S).log"
mkdir -p ../logs
: >"$LOG"

# Print a message to stdout and append it to the log file.
log() {
  printf '%s\n' "$*" | tee -a "$LOG"
}

# Current wall-clock time, used as the prefix of progress lines.
stamp() {
  date +%H:%M:%S
}

# Run a command with stdout+stderr mirrored to the console and the log.
# Thanks to pipefail the command's own failure status is propagated.
run_logged() {
  "$@" 2>&1 | tee -a "$LOG"
}

# Print the value stored under key $1 in config.json ('' if missing or falsy).
config_value() {
  python3 -c "import json, sys; c = json.load(open('config.json')); print(c.get(sys.argv[1]) or '')" "$1"
}

# Print the first column of the first row returned by the SQL statement $1.
# get_db comes from the project's base module; the SQL is passed via argv so
# it needs no shell-level escaping.
db_count() {
  python3 -c "import sys; from base import get_db; print(get_db().execute(sys.argv[1]).fetchone()[0])" "$1"
}

# Abort unless $2 is a plain non-negative integer ($1 names it in the error).
# Guards the arithmetic below against unexpected extra output from db_count.
require_count() {
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    log "ERROR: expected an integer for $1, got: '$2'"
    exit 1
  fi
}

# Record an abnormal termination in the log (no-op on success).
on_exit() {
  local status=$?
  if (( status != 0 )); then
    printf 'ERROR: pipeline aborted with exit status %d at %s\n' \
      "$status" "$(date)" | tee -a "$LOG" >&2
  fi
}
trap on_exit EXIT

log "=== OVERNIGHT PIPELINE RUN ==="
log "Started: $(date)"
log ""

# Check config
SERPER_KEY=$(config_value serper_api_key)
ANTHROPIC_KEY=$(config_value anthropic_api_key)

if [[ -z "$SERPER_KEY" ]]; then
  log "WARNING: No Serper API key — website discovery will use DDG (slower, lower hit rate)"
else
  log "Serper API key: configured"
fi

if [[ -z "$ANTHROPIC_KEY" ]]; then
  log "WARNING: No Anthropic API key — AI extraction will be skipped"
else
  log "Anthropic API key: configured"
fi
log ""

# Step 1: Source crawlers
log "=== STEP 1: Source Crawlers ==="
log "[$(stamp)] Running VIC Register crawler..."
run_logged python3 crawl_vic_register.py

log "[$(stamp)] Running Funerals Australia crawler..."
run_logged python3 crawl_funerals_australia.py

log "[$(stamp)] Running NFDA crawler..."
run_logged python3 crawl_nfda.py

# Step 2: Deduplication
log ""
log "=== STEP 2: Deduplication ==="
log "[$(stamp)] Running dedup..."
run_logged python3 dedup.py

# Step 3: Website discovery (all providers without one)
log ""
log "=== STEP 3: Website Discovery ==="
NEED_WEBSITE=$(db_count 'SELECT COUNT(*) FROM funeral_brand WHERE website IS NULL AND verified=0')
require_count "providers needing websites" "$NEED_WEBSITE"
log "[$(stamp)] Providers needing websites: $NEED_WEBSITE"

# Process in batches of 200 to avoid issues.
# NOTE(review): this assumes discover_websites.py selects the next providers
# that still lack a website on each call — confirm against that script.
BATCH=200
OFFSET=0
while (( OFFSET < NEED_WEBSITE )); do
  REMAINING=$(( NEED_WEBSITE - OFFSET ))
  CURRENT=$(( REMAINING < BATCH ? REMAINING : BATCH ))
  log "[$(stamp)] Discovering websites batch $(( OFFSET / BATCH + 1 )) ($CURRENT providers)..."
  run_logged python3 discover_websites.py --limit="$CURRENT"
  OFFSET=$(( OFFSET + BATCH ))
  # Brief pause between batches
  sleep 5
done

# Step 4: Website enrichment (all with website, not yet enriched)
log ""
log "=== STEP 4: Website Enrichment ==="
NEED_ENRICH=$(db_count 'SELECT COUNT(*) FROM funeral_brand WHERE website IS NOT NULL AND enrichment_status="pending" AND verified=0')
require_count "providers needing enrichment" "$NEED_ENRICH"
log "[$(stamp)] Providers needing enrichment: $NEED_ENRICH"
run_logged python3 enrich_websites.py --limit="$NEED_ENRICH"

# Step 5: Compute tiers
log ""
log "=== STEP 5: Compute Tiers ==="
run_logged python3 compute_tiers.py

# Final summary
log ""
log "=== FINAL SUMMARY ==="
# The heredoc delimiter is quoted so the shell leaves `$.has_pricing` and the
# quotes alone, and the SQL lives in plain strings rather than inside f-string
# replacement fields: backslash-escaped quotes there are a SyntaxError before
# Python 3.12.
python3 - <<'PY' 2>&1 | tee -a "$LOG"
from base import get_db

db = get_db()


def count(sql):
    """Return the first column of the first row produced by sql."""
    return db.execute(sql).fetchone()[0]


CRAWLED = "SELECT COUNT(*) FROM source_record WHERE source_name='website_crawl'"

print('Database Status:')
print(f' Total providers: {count("SELECT COUNT(*) FROM funeral_brand")}')
print(f' With phone: {count("SELECT COUNT(*) FROM funeral_brand WHERE phone IS NOT NULL")}')
print(f' With email: {count("SELECT COUNT(*) FROM funeral_brand WHERE email IS NOT NULL")}')
print(f' With website: {count("SELECT COUNT(*) FROM funeral_brand WHERE website IS NOT NULL")}')
print(f' With description: {count("SELECT COUNT(*) FROM funeral_brand WHERE description IS NOT NULL")}')
print()
print('Listing Tiers:')
tier_rows = db.execute(
    'SELECT listing_tier, COUNT(*) as n FROM funeral_brand '
    'GROUP BY listing_tier ORDER BY n DESC'
)
for tier, n in tier_rows:
    # str() keeps the width format working when listing_tier is NULL (None).
    print(f' {str(tier):12s} {n:>6d}')
print()
print('Pricing Pages:')
print(f' Total crawled: {count(CRAWLED)}')
with_pricing = count(CRAWLED + " AND json_extract(raw_data, '$.has_pricing')=1")
print(f' With pricing: {with_pricing}')
with_pdf = count(CRAWLED + " AND json_extract(raw_data, '$.pdf_links') != '[]'")
print(f' With PDF links: {with_pdf}')
PY

log ""
log "Finished: $(date)"
echo "Log saved to: $LOG"