Initial commit: funeral provider discovery pipeline
Python crawlers for VIC Register, Funerals Australia, and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
This commit is contained in:
111
crawlers/run_overnight.sh
Executable file
111
crawlers/run_overnight.sh
Executable file
@@ -0,0 +1,111 @@
|
||||
#!/bin/bash
#
# Full pipeline overnight run.
#
# Usage: ./run_overnight.sh
#
# Before running:
#   1. Add your Serper API key to config.json
#   2. Optionally add your Anthropic API key for AI extraction
#
# This script runs all steps sequentially and logs everything: console output
# is mirrored through tee into a timestamped file under ../logs.

# -e alone is not enough for this script: almost every step has the shape
# "python3 ... | tee", and without pipefail the status of such a pipeline is
# tee's, so a crashed step would be ignored. -u turns a mistyped variable
# name into an error instead of an empty expansion.
set -euo pipefail

# Work from the script's own directory so relative paths resolve regardless
# of where the script was launched from.
cd "$(dirname "$0")"

# Create the log directory before anything tries to write into it.
mkdir -p ../logs
LOG="../logs/overnight_$(date +%Y%m%d_%H%M%S).log"

# The first write truncates/creates the log; every later write appends.
echo "=== OVERNIGHT PIPELINE RUN ===" | tee "$LOG"
echo "Started: $(date)" | tee -a "$LOG"
echo "" | tee -a "$LOG"
|
||||
|
||||
# Check config
# Read both API keys from config.json. Each lookup prints the stored value,
# or an empty line when the key is absent or falsy. Only the presence of a
# key is reported; the value itself is never written to the log.
SERPER_KEY=$(python3 -c "import json; print(json.load(open('config.json')).get('serper_api_key') or '')")
ANTHROPIC_KEY=$(python3 -c "import json; print(json.load(open('config.json')).get('anthropic_api_key') or '')")

if [[ -n "$SERPER_KEY" ]]; then
  echo "Serper API key: configured" | tee -a "$LOG"
else
  echo "WARNING: No Serper API key — website discovery will use DDG (slower, lower hit rate)" | tee -a "$LOG"
fi

if [[ -n "$ANTHROPIC_KEY" ]]; then
  echo "Anthropic API key: configured" | tee -a "$LOG"
else
  echo "WARNING: No Anthropic API key — AI extraction will be skipped" | tee -a "$LOG"
fi
echo "" | tee -a "$LOG"
|
||||
|
||||
# Step 1: Source crawlers
# The crawlers run one after another in the order listed; the two arrays are
# parallel (same index = same crawler). Each crawler's stdout and stderr are
# appended to the run log.
echo "=== STEP 1: Source Crawlers ===" | tee -a "$LOG"

crawler_labels=("VIC Register" "Funerals Australia" "NFDA")
crawler_scripts=(crawl_vic_register.py crawl_funerals_australia.py crawl_nfda.py)

for idx in "${!crawler_scripts[@]}"; do
  echo "[$(date +%H:%M:%S)] Running ${crawler_labels[$idx]} crawler..." | tee -a "$LOG"
  python3 "${crawler_scripts[$idx]}" 2>&1 | tee -a "$LOG"
done
|
||||
|
||||
# Step 2: Deduplication
# A single tee receives the blank separator, the step banner, the timestamped
# status line and everything dedup.py prints (stdout and stderr).
{
  echo ""
  echo "=== STEP 2: Deduplication ==="
  echo "[$(date +%H:%M:%S)] Running dedup..."
  python3 dedup.py 2>&1
} | tee -a "$LOG"
|
||||
|
||||
# Step 3: Website discovery (all providers without one)
echo "" | tee -a "$LOG"
echo "=== STEP 3: Website Discovery ===" | tee -a "$LOG"

# Number of unverified providers that have no website recorded.
NEED_WEBSITE=$(python3 -c "from base import get_db; db=get_db(); print(db.execute('SELECT COUNT(*) FROM funeral_brand WHERE website IS NULL AND verified=0').fetchone()[0])")

# The count drives the loop arithmetic below. If anything other than a bare
# non-negative integer came back (for example extra text printed on stdout),
# stop loudly instead of letting the loop condition error out and silently
# skip discovery.
if ! [[ "$NEED_WEBSITE" =~ ^[0-9]+$ ]]; then
  echo "ERROR: could not determine how many providers need websites (got: '$NEED_WEBSITE')" | tee -a "$LOG" >&2
  exit 1
fi
echo "[$(date +%H:%M:%S)] Providers needing websites: $NEED_WEBSITE" | tee -a "$LOG"

# Process in batches of 200 to avoid issues
# NOTE(review): no offset is passed to discover_websites.py, so this assumes
# each invocation picks up providers that earlier batches have not already
# handled — confirm against discover_websites.py.
BATCH=200
OFFSET=0
while (( OFFSET < NEED_WEBSITE )); do
  REMAINING=$(( NEED_WEBSITE - OFFSET ))
  CURRENT=$(( REMAINING < BATCH ? REMAINING : BATCH ))
  echo "[$(date +%H:%M:%S)] Discovering websites batch $(( OFFSET / BATCH + 1 )) ($CURRENT providers)..." | tee -a "$LOG"
  python3 discover_websites.py --limit="$CURRENT" 2>&1 | tee -a "$LOG"
  OFFSET=$(( OFFSET + BATCH ))
  # Brief pause between batches; there is nothing to wait for after the last.
  if (( OFFSET < NEED_WEBSITE )); then
    sleep 5
  fi
done
|
||||
|
||||
# Step 4: Website enrichment (all with website, not yet enriched)
echo "" | tee -a "$LOG"
echo "=== STEP 4: Website Enrichment ===" | tee -a "$LOG"

# Number of unverified providers that have a website and are still pending
# enrichment. The SQL string literal is single-quoted ('pending'): in SQLite
# double quotes denote identifiers, and a double-quoted string is accepted
# only through a legacy fallback that a build may disable.
NEED_ENRICH=$(python3 -c "from base import get_db; db=get_db(); print(db.execute(\"SELECT COUNT(*) FROM funeral_brand WHERE website IS NOT NULL AND enrichment_status='pending' AND verified=0\").fetchone()[0])")
echo "[$(date +%H:%M:%S)] Providers needing enrichment: $NEED_ENRICH" | tee -a "$LOG"

# NOTE(review): when the count is 0 this still runs enrich_websites.py with
# --limit=0 — confirm the script treats that as "nothing to do".
python3 enrich_websites.py --limit="$NEED_ENRICH" 2>&1 | tee -a "$LOG"
|
||||
|
||||
# Step 5: Compute tiers
# One printf emits the blank separator line followed by the step banner;
# compute_tiers.py's stdout and stderr are then appended to the run log.
printf '\n=== STEP 5: Compute Tiers ===\n' | tee -a "$LOG"
python3 compute_tiers.py 2>&1 | tee -a "$LOG"
|
||||
|
||||
# Final summary
echo "" | tee -a "$LOG"
echo "=== FINAL SUMMARY ===" | tee -a "$LOG"

# The report is passed to Python on stdin through a quoted here-document, so
# the shell performs no expansion or escape processing on it. That keeps the
# SQL free of backslash escapes (backslashes inside f-string replacement
# fields are a SyntaxError before Python 3.12) and keeps '$.has_pricing' away
# from shell variable expansion.
python3 - <<'PY' 2>&1 | tee -a "$LOG"
from base import get_db

db = get_db()


def count(sql):
    """Return the first column of the first row produced by `sql`."""
    return db.execute(sql).fetchone()[0]


print('Database Status:')
print(' Total providers:', count("SELECT COUNT(*) FROM funeral_brand"))
print(' With phone:', count("SELECT COUNT(*) FROM funeral_brand WHERE phone IS NOT NULL"))
print(' With email:', count("SELECT COUNT(*) FROM funeral_brand WHERE email IS NOT NULL"))
print(' With website:', count("SELECT COUNT(*) FROM funeral_brand WHERE website IS NOT NULL"))
print(' With description:', count("SELECT COUNT(*) FROM funeral_brand WHERE description IS NOT NULL"))
print()
print('Listing Tiers:')
for row in db.execute('SELECT listing_tier, COUNT(*) as n FROM funeral_brand GROUP BY listing_tier ORDER BY n DESC'):
    # str() keeps the row printable when listing_tier is NULL; None does not
    # accept the '12s' format spec.
    print(f' {str(row[0]):12s} {row[1]:>6d}')
print()
print('Pricing Pages:')
print(' Total crawled:', count("SELECT COUNT(*) FROM source_record WHERE source_name='website_crawl'"))
print(' With pricing:', count("SELECT COUNT(*) FROM source_record WHERE source_name='website_crawl' AND json_extract(raw_data, '$.has_pricing')=1"))
print(' With PDF links:', count("SELECT COUNT(*) FROM source_record WHERE source_name='website_crawl' AND json_extract(raw_data, '$.pdf_links') != '[]'"))
PY

echo "" | tee -a "$LOG"
echo "Finished: $(date)" | tee -a "$LOG"
echo "Log saved to: $LOG"
|
||||
Reference in New Issue
Block a user