Python crawlers for VIC Register, Funerals Australia, NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
147 lines
8.8 KiB
JSON
147 lines
8.8 KiB
JSON
{
|
|
"name": "3. Daily Website Enrichment",
|
|
"nodes": [
|
|
{
  "id": "schedule",
  "name": "Daily Schedule",
  "type": "n8n-nodes-base.scheduleTrigger",
  "typeVersion": 1.2,
  "position": [200, 300],
  "parameters": {
    "rule": {
      "interval": [
        {
          "field": "days",
          "daysInterval": 1,
          "triggerAtHour": 6
        }
      ]
    }
  }
},
|
|
{
  "id": "enrich",
  "name": "Crawl & Extract (batch 50)",
  "type": "n8n-nodes-base.executeCommand",
  "typeVersion": 1,
  "position": [450, 300],
  "executeOnce": true,
  "parameters": {
    "command": "cd /opt/crawlers && python3 enrich_websites.py --limit=50 2>&1"
  }
},
|
|
{
  "parameters": {
    "command": "cd /opt/crawlers && python3 -c \"\nimport json, sqlite3\ndb = sqlite3.connect('/opt/database/providers.db')\ndb.row_factory = sqlite3.Row\nrows = db.execute('''\n    SELECT sr.id, sr.source_url, sr.matched_brand_id,\n           json_extract(sr.raw_data, '$.pricing_text') AS pricing_text,\n           json_extract(sr.raw_data, '$.has_pricing') AS has_pricing\n    FROM source_record sr\n    WHERE sr.source_name = 'website_crawl'\n      AND sr.processed_at IS NULL\n      AND json_extract(sr.raw_data, '$.has_pricing') = 1\n    ORDER BY sr.id\n    LIMIT 20\n''').fetchall()\nresult = [{'id': r['id'], 'brand_id': r['matched_brand_id'], 'url': r['source_url'], 'text_length': len(r['pricing_text'] or '')} for r in rows]\nprint(json.dumps(result))\ndb.close()\n\" 2>&1"
  },
  "id": "get_queue",
  "name": "Get Pricing Pages Queue",
  "type": "n8n-nodes-base.executeCommand",
  "typeVersion": 1,
  "position": [700, 300]
},
|
|
{
  "parameters": {
    "jsCode": "const output = String($input.first().json.stdout || '').trim();\n\nlet items = null;\ntry {\n  items = JSON.parse(output);\n} catch (e) {\n  items = null;\n}\n\n// Anything other than a JSON array (empty stdout, a Python traceback, ...) becomes a single\n// placeholder item; text_length 0 gives the following IF node a number to compare.\nif (!Array.isArray(items)) {\n  return [{ json: { error: 'No pricing pages to process', raw: output, text_length: 0 } }];\n}\n\nreturn items.map(item => ({ json: item }));"
  },
  "id": "parse_queue",
  "name": "Parse Queue Items",
  "type": "n8n-nodes-base.code",
  "typeVersion": 2,
  "position": [950, 300]
},
|
|
{
  "parameters": {
    "conditions": {
      "options": {
        "caseSensitive": true,
        "leftValue": "",
        "typeValidation": "strict",
        "version": 2
      },
      "conditions": [
        {
          "id": "has_text",
          "leftValue": "={{ $json.text_length }}",
          "rightValue": 100,
          "operator": { "type": "number", "operation": "gt" }
        }
      ],
      "combinator": "and"
    },
    "options": {}
  },
  "id": "has_text",
  "name": "Has Pricing Text?",
  "type": "n8n-nodes-base.if",
  "typeVersion": 2.2,
  "position": [1200, 300]
},
|
|
{
  "parameters": {
    "command": "={{ 'cd /opt/crawlers && python3 -c \"import sqlite3, sys; db=sqlite3.connect(\\'/opt/database/providers.db\\'); r=db.execute(\\'SELECT json_extract(raw_data, ?) FROM source_record WHERE id=?\\', (\\'$.pricing_text\\', int(sys.argv[1]))).fetchone(); print(r[0][:6000] if r and r[0] else \\'\\')\" ' + parseInt($json.id, 10) }}"
  },
  "id": "get_text",
  "name": "Get Pricing Text",
  "type": "n8n-nodes-base.executeCommand",
  "typeVersion": 1,
  "position": [1450, 240]
},
|
|
{
  "parameters": {
    "method": "POST",
    "url": "https://api.anthropic.com/v1/messages",
    "sendHeaders": true,
    "headerParameters": {
      "parameters": [
        { "name": "x-api-key", "value": "={{ $env.ANTHROPIC_API_KEY }}" },
        { "name": "anthropic-version", "value": "2023-06-01" },
        { "name": "content-type", "value": "application/json" }
      ]
    },
    "sendBody": true,
    "specifyBody": "json",
    "jsonBody": "={{ JSON.stringify({ model: 'claude-haiku-4-5-20251001', max_tokens: 2048, messages: [{ role: 'user', content: 'Extract funeral packages and pricing from this funeral director\\'s pricing page. Return ONLY valid JSON matching this schema:\\n\\n{\\n \"packages\": [\\n {\\n \"name\": \"Package name\",\\n \"funeralType\": \"one of: Service & Cremation, Service & Burial, Cremation Only, Graveside Burial\",\\n \"price\": 0,\\n \"inclusions\": [\\n {\"item\": \"Inclusion name\", \"price\": 0, \"optional\": false, \"complimentary\": false}\\n ]\\n }\\n ]\\n}\\n\\nUse these inclusion type names where possible: Professional Service Fee, Transportation Service Fee, Professional Mortuary Care, Death Registration Certificate, Cremation Certificate/Permit, Government Levy, Accommodation, Viewing Fee, Coffin, Cremation Fee, Saturday Service Fee, Dressing Fee, Embalming, Digital Recording, Webstreaming, After Hours Transfer Surcharge.\\n\\nIf a price cannot be determined, use null. If no packages/pricing found, return {\"packages\": []}.\\n\\nPricing page text:\\n' + ($json.stdout || '').substring(0, 5000) }] }) }}"
  },
  "id": "ai_extract",
  "name": "AI Extract (Claude Haiku)",
  "type": "n8n-nodes-base.httpRequest",
  "typeVersion": 4.2,
  "position": [1700, 240]
},
|
|
{
  "parameters": {
    "mode": "runOnceForEachItem",
    "jsCode": "const response = $input.item.json;\nconst queueItem = $('Parse Queue Items').item.json;\nconst sourceId = queueItem.id;\nconst brandId = queueItem.brand_id;\n\nlet packages = [];\nlet parseError = null;\ntry {\n  const content = response.content[0].text;\n  // Extract JSON from the response (may be wrapped in markdown)\n  const jsonMatch = content.match(/\\{[\\s\\S]*\\}/);\n  if (jsonMatch) {\n    const parsed = JSON.parse(jsonMatch[0]);\n    packages = Array.isArray(parsed.packages) ? parsed.packages : [];\n  } else {\n    parseError = 'No JSON object found in AI response';\n  }\n} catch (e) {\n  // Continue with an empty package list, but record why parsing failed\n  parseError = e.message;\n}\n\nreturn { json: { sourceId, brandId, packages, packageCount: packages.length, parseError } };"
  },
  "id": "parse_ai",
  "name": "Parse AI Response",
  "type": "n8n-nodes-base.code",
  "typeVersion": 2,
  "position": [1950, 240]
},
|
|
{
  "parameters": {
    "command": "={{ 'cd /opt/crawlers && python3 -c \"\\nimport json, sqlite3, sys\\n\\npayload = json.loads(sys.argv[1])\\npackages = payload.get(\\'packages\\') or []\\nbrand_id = payload.get(\\'brand_id\\')\\nsource_id = payload.get(\\'source_id\\')\\n\\ndb = sqlite3.connect(\\'/opt/database/providers.db\\')\\nfor pkg in packages:\\n    if not pkg.get(\\'price\\'):\\n        continue\\n    cur = db.execute(\\n        \\'INSERT INTO package (title, funeral_type, brand_id, source_url, extraction_confidence) VALUES (?, ?, ?, ?, ?)\\',\\n        (pkg[\\'name\\'], pkg.get(\\'funeralType\\'), brand_id, \\'ai_extraction\\', 0.7)\\n    )\\n    pkg_id = cur.lastrowid\\n    for inc in pkg.get(\\'inclusions\\') or []:\\n        if inc.get(\\'price\\') is not None:\\n            db.execute(\\n                \\'INSERT INTO package_inclusion (price, optional, complimentary, inclusion_type_title, package_id) VALUES (?, ?, ?, ?, ?)\\',\\n                (inc[\\'price\\'], 1 if inc.get(\\'optional\\') else 0, 1 if inc.get(\\'complimentary\\') else 0, inc[\\'item\\'], pkg_id)\\n            )\\n\\ndb.execute(\\'UPDATE source_record SET processed_at=datetime(\\\\\\'now\\\\\\') WHERE id=?\\', (source_id,))\\ndb.execute(\\'UPDATE funeral_brand SET enrichment_status=\\\\\\'complete\\\\\\', last_enriched_at=datetime(\\\\\\'now\\\\\\') WHERE id=?\\', (brand_id,))\\ndb.commit()\\nprint(f\\'{len(packages)} packages saved for brand {brand_id}\\')\\n\" ' + \"'\" + JSON.stringify({ packages: $json.packages || [], brand_id: $json.brandId, source_id: $json.sourceId }).split(\"'\").join(\"'\\\\''\") + \"'\" + ' 2>&1' }}"
  },
  "id": "save_packages",
  "name": "Save Packages to DB",
  "type": "n8n-nodes-base.executeCommand",
  "typeVersion": 1,
  "position": [2200, 240]
},
|
|
{
  "parameters": {
    "command": "cd /opt/crawlers && python3 compute_tiers.py 2>&1"
  },
  "id": "recompute_tiers",
  "name": "Recompute Listing Tiers",
  "type": "n8n-nodes-base.executeCommand",
  "typeVersion": 1,
  "position": [2450, 300],
  "executeOnce": true
}
|
|
],
|
|
"connections": {
  "Daily Schedule": {
    "main": [
      [
        { "node": "Crawl & Extract (batch 50)", "type": "main", "index": 0 }
      ]
    ]
  },
  "Crawl & Extract (batch 50)": {
    "main": [
      [
        { "node": "Get Pricing Pages Queue", "type": "main", "index": 0 }
      ]
    ]
  },
  "Get Pricing Pages Queue": {
    "main": [
      [
        { "node": "Parse Queue Items", "type": "main", "index": 0 }
      ]
    ]
  },
  "Parse Queue Items": {
    "main": [
      [
        { "node": "Has Pricing Text?", "type": "main", "index": 0 }
      ]
    ]
  },
  "Has Pricing Text?": {
    "main": [
      [
        { "node": "Get Pricing Text", "type": "main", "index": 0 }
      ],
      [
        { "node": "Recompute Listing Tiers", "type": "main", "index": 0 }
      ]
    ]
  },
  "Get Pricing Text": {
    "main": [
      [
        { "node": "AI Extract (Claude Haiku)", "type": "main", "index": 0 }
      ]
    ]
  },
  "AI Extract (Claude Haiku)": {
    "main": [
      [
        { "node": "Parse AI Response", "type": "main", "index": 0 }
      ]
    ]
  },
  "Parse AI Response": {
    "main": [
      [
        { "node": "Save Packages to DB", "type": "main", "index": 0 }
      ]
    ]
  },
  "Save Packages to DB": {
    "main": [
      [
        { "node": "Recompute Listing Tiers", "type": "main", "index": 0 }
      ]
    ]
  }
},
|
|
"settings": { "executionOrder": "v1" },
|
|
"tags": [{ "name": "funeral-arranger" }]
|
|
}
|