Provider-Crawl/n8n/workflows/3_daily_enrichment.json

{
  "name": "3. Daily Website Enrichment",
  "nodes": [
    {
      "id": "schedule",
      "name": "Daily Schedule",
      "type": "n8n-nodes-base.scheduleTrigger",
      "typeVersion": 1.2,
      "position": [200, 300],
      "parameters": {
        "rule": {
          "interval": [
            {
              "field": "days",
              "daysInterval": 1,
              "triggerAtHour": 6
            }
          ]
        }
      }
    },
    {
      "id": "enrich",
      "name": "Crawl & Extract (batch 50)",
      "type": "n8n-nodes-base.executeCommand",
      "typeVersion": 1,
      "position": [450, 300],
      "executeOnce": true,
      "parameters": {
        "command": "cd /opt/crawlers && python3 enrich_websites.py --limit=50 2>&1"
      }
    },
    {
      "id": "get_queue",
      "name": "Get Pricing Pages Queue",
      "type": "n8n-nodes-base.executeCommand",
      "typeVersion": 1,
      "position": [700, 300],
      "parameters": {
        "command": "cd /opt/crawlers && python3 -c \"\nimport json, sqlite3\ndb = sqlite3.connect('/opt/database/providers.db')\ndb.row_factory = sqlite3.Row\nrows = db.execute('''\n    SELECT sr.id, sr.source_url, sr.matched_brand_id,\n           json_extract(sr.raw_data, \\\"$.pricing_text\\\") as pricing_text,\n           json_extract(sr.raw_data, \\\"$.has_pricing\\\") as has_pricing\n    FROM source_record sr\n    WHERE sr.source_name = 'website_crawl'\n      AND sr.processed_at IS NULL\n      AND json_extract(sr.raw_data, \\\"$.has_pricing\\\") = 1\n    LIMIT 20\n''').fetchall()\nresult = [{'id': r['id'], 'brand_id': r['matched_brand_id'], 'url': r['source_url'], 'text_length': len(r['pricing_text'] or '')} for r in rows]\nprint(json.dumps(result))\n\" 2>&1"
      }
    },
    {
      "id": "parse_queue",
      "name": "Parse Queue Items",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [950, 300],
      "parameters": {
        "jsCode": "// Turn the queue command's stdout (expected: a JSON array) into one n8n item per record.\nconst stdoutText = $input.first().json.stdout.trim();\nlet result;\ntry {\n  const records = JSON.parse(stdoutText);\n  result = records.map((record) => ({ json: record }));\n} catch (err) {\n  // Anything that is not a JSON array yields a single placeholder item carrying the raw output.\n  result = [{ json: { error: 'No pricing pages to process', raw: stdoutText } }];\n}\nreturn result;"
      }
    },
    {
      "id": "has_text",
      "name": "Has Pricing Text?",
      "type": "n8n-nodes-base.if",
      "typeVersion": 2.2,
      "position": [1200, 300],
      "parameters": {
        "conditions": {
          "conditions": [
            {
              "id": "has_text",
              "operator": {
                "type": "number",
                "operation": "gt"
              },
              "leftValue": "={{ $json.text_length }}",
              "rightValue": 100
            }
          ]
        }
      }
    },
    {
      "id": "get_text",
      "name": "Get Pricing Text",
      "type": "n8n-nodes-base.executeCommand",
      "typeVersion": 1,
      "position": [1450, 240],
      "parameters": {
        "command": "={{ 'cd /opt/crawlers && python3 -c \"import json, sqlite3; db=sqlite3.connect(\\'/opt/database/providers.db\\'); r=db.execute(\\'SELECT json_extract(raw_data, \\\\\\\"$.pricing_text\\\\\\\") as t FROM source_record WHERE id=' + $json.id + '\\').fetchone(); print(r[0][:6000] if r and r[0] else \\'\\')\"' }}"
      }
    },
    {
      "parameters": {
        "method": "POST",
        "url": "https://api.anthropic.com/v1/messages",
        "sendHeaders": true,
        "headerParameters": {
          "parameters": [
            { "name": "x-api-key", "value": "={{ $env.ANTHROPIC_API_KEY }}" },
            { "name": "anthropic-version", "value": "2023-06-01" },
            { "name": "content-type", "value": "application/json" }
          ]
        },
        "sendBody": true,
        "specifyBody": "json",
        "jsonBody": "={{ JSON.stringify({ model: 'claude-haiku-4-5-20251001', max_tokens: 2048, messages: [{ role: 'user', content: 'Extract funeral packages and pricing from this funeral director\\'s pricing page. Return ONLY valid JSON matching this schema:\\n\\n{\\n  \"packages\": [\\n    {\\n      \"name\": \"Package name\",\\n      \"funeralType\": \"one of: Service & Cremation, Service & Burial, Cremation Only, Graveside Burial\",\\n      \"price\": 0,\\n      \"inclusions\": [\\n        {\"item\": \"Inclusion name\", \"price\": 0, \"optional\": false, \"complimentary\": false}\\n      ]\\n    }\\n  ]\\n}\\n\\nUse these inclusion type names where possible: Professional Service Fee, Transportation Service Fee, Professional Mortuary Care, Death Registration Certificate, Cremation Certificate/Permit, Government Levy, Accommodation, Viewing Fee, Coffin, Cremation Fee, Saturday Service Fee, Dressing Fee, Embalming, Digital Recording, Webstreaming, After Hours Transfer Surcharge.\\n\\nIf a price cannot be determined, use null. If no packages/pricing found, return {\"packages\": []}.\\n\\nPricing page text:\\n' + ($json.stdout || '').substring(0, 5000) }] }) }}"
      },
      "id": "ai_extract",
      "name": "AI Extract (Claude Haiku)",
      "type": "n8n-nodes-base.httpRequest",
      "typeVersion": 4.2,
      "position": [1700, 240]
    },
    {
      "parameters": {
        "mode": "runOnceForEachItem",
        "jsCode": "// Runs once per item so each AI response is paired with its own queue record.\nconst response = $input.item.json;\nconst queueItem = $('Parse Queue Items').item.json;\nconst sourceId = queueItem.id;\nconst brandId = queueItem.brand_id;\n\nlet packages = [];\nlet parseError = null;\ntry {\n  const content = response.content[0].text;\n  // Extract JSON from the response (may be wrapped in markdown)\n  const jsonMatch = content.match(/\\{[\\s\\S]*\\}/);\n  if (jsonMatch) {\n    const parsed = JSON.parse(jsonMatch[0]);\n    packages = Array.isArray(parsed.packages) ? parsed.packages : [];\n  } else {\n    parseError = 'No JSON object found in AI response';\n  }\n} catch (e) {\n  // AI response wasn't valid JSON; keep the reason instead of discarding it\n  parseError = e.message;\n}\n\nreturn { json: { sourceId, brandId, packages, packageCount: packages.length, parseError } };"
      },
      "id": "parse_ai",
      "name": "Parse AI Response",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [1950, 240]
    },
    {
      "parameters": {
        "command": "={{ 'cd /opt/crawlers && python3 -c \"\\nimport json, sqlite3, urllib.parse\\ndb = sqlite3.connect(\\'/opt/database/providers.db\\')\\n# Payload is percent-encoded in n8n so AI-generated text cannot break shell or Python quoting\\npayload = json.loads(urllib.parse.unquote(\\\\\\\"' + encodeURIComponent(JSON.stringify({ packages: $json.packages, brandId: $json.brandId, sourceId: $json.sourceId })) + '\\\\\\\"))\\npackages = payload.get(\\'packages\\') or []\\nbrand_id = payload.get(\\'brandId\\')\\nsource_id = payload.get(\\'sourceId\\')\\n\\nfor pkg in packages:\\n    if not pkg.get(\\'price\\'):\\n        continue\\n    cur = db.execute(\\n        \\'INSERT INTO package (title, funeral_type, brand_id, source_url, extraction_confidence) VALUES (?, ?, ?, ?, ?)\\',\\n        (pkg[\\'name\\'], pkg.get(\\'funeralType\\'), brand_id, \\'ai_extraction\\', 0.7)\\n    )\\n    pkg_id = cur.lastrowid\\n    for inc in pkg.get(\\'inclusions\\', []):\\n        if inc.get(\\'price\\') is not None:\\n            db.execute(\\n                \\'INSERT INTO package_inclusion (price, optional, complimentary, inclusion_type_title, package_id) VALUES (?, ?, ?, ?, ?)\\',\\n                (inc[\\'price\\'], 1 if inc.get(\\'optional\\') else 0, 1 if inc.get(\\'complimentary\\') else 0, inc[\\'item\\'], pkg_id)\\n            )\\n\\ndb.execute(\\'UPDATE source_record SET processed_at=datetime(\\\\\\'now\\\\\\') WHERE id=?\\', (source_id,))\\ndb.execute(\\'UPDATE funeral_brand SET enrichment_status=\\\\\\'complete\\\\\\', last_enriched_at=datetime(\\\\\\'now\\\\\\') WHERE id=?\\', (brand_id,))\\ndb.commit()\\nprint(f\\'{len(packages)} packages saved for brand {brand_id}\\')\\n\" 2>&1' }}"
      },
      "id": "save_packages",
      "name": "Save Packages to DB",
      "type": "n8n-nodes-base.executeCommand",
      "typeVersion": 1,
      "position": [2200, 240]
    },
    {
      "parameters": {
        "command": "cd /opt/crawlers && python3 compute_tiers.py 2>&1"
      },
      "id": "recompute_tiers",
      "name": "Recompute Listing Tiers",
      "type": "n8n-nodes-base.executeCommand",
      "typeVersion": 1,
      "position": [2450, 300],
      "executeOnce": true
    }
  ],
  "connections": {
    "Daily Schedule": { "main": [[ { "node": "Crawl & Extract (batch 50)", "type": "main", "index": 0 } ]] },
    "Crawl & Extract (batch 50)": { "main": [[ { "node": "Get Pricing Pages Queue", "type": "main", "index": 0 } ]] },
    "Get Pricing Pages Queue": { "main": [[ { "node": "Parse Queue Items", "type": "main", "index": 0 } ]] },
    "Parse Queue Items": { "main": [[ { "node": "Has Pricing Text?", "type": "main", "index": 0 } ]] },
    "Has Pricing Text?": {
      "main": [
        [{ "node": "Get Pricing Text", "type": "main", "index": 0 }],
        [{ "node": "Recompute Listing Tiers", "type": "main", "index": 0 }]
      ]
    },
    "Get Pricing Text": { "main": [[ { "node": "AI Extract (Claude Haiku)", "type": "main", "index": 0 } ]] },
    "AI Extract (Claude Haiku)": { "main": [[ { "node": "Parse AI Response", "type": "main", "index": 0 } ]] },
    "Parse AI Response": { "main": [[ { "node": "Save Packages to DB", "type": "main", "index": 0 } ]] },
    "Save Packages to DB": { "main": [[ { "node": "Recompute Listing Tiers", "type": "main", "index": 0 } ]] }
  },
  "settings": { "executionOrder": "v1" },
  "tags": [{ "name": "funeral-arranger" }]
}