Initial commit: funeral provider discovery pipeline
Python crawlers for VIC Register, Funerals Australia, NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
This commit is contained in:
199
crawlers/lookup_abn.py
Normal file
199
crawlers/lookup_abn.py
Normal file
@@ -0,0 +1,199 @@
|
||||
"""ABN Lookup module via the Australian Business Register (ABR) API.
|
||||
|
||||
Enriches providers with their ABN (strongest dedup key) and validates
|
||||
that they are active registered businesses.
|
||||
|
||||
The ABR API is FREE. Requires a GUID (authentication token) from:
|
||||
https://abr.business.gov.au/Tools/WebServices
|
||||
|
||||
Configuration:
|
||||
Set the ABR_GUID env var, or add an "abr_guid" entry to config.json.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import urllib.parse
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from base import fetch_url, get_db, CRAWL_DELAY
|
||||
|
||||
# Resolve the ABR authentication GUID. The ABR_GUID environment variable
# takes precedence; if it is unset or empty, fall back to the "abr_guid"
# key of an optional config.json located next to this module.
ABR_GUID = os.environ.get("ABR_GUID")
if not ABR_GUID:
    config_path = os.path.join(os.path.dirname(__file__), "config.json")
    if os.path.exists(config_path):
        # JSON is UTF-8 by specification; do not depend on the locale's
        # default encoding when reading it.
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
        ABR_GUID = config.get("abr_guid")

# Base URL of the ABR XML search web service; method names are appended.
ABR_BASE = "https://abr.business.gov.au/abrxmlsearch/AbrXmlSearch.asmx"
|
||||
|
||||
|
||||
def _first_found(record, paths, ns):
    """Return the first element matched by any of *paths*, or None."""
    for path in paths:
        elem = record.find(path, ns)
        # Compare against None explicitly: an Element without children is
        # falsy, so truthiness would discard a perfectly valid leaf match.
        if elem is not None:
            return elem
    return None


def _text_or_none(elem):
    """Return the element's text, or None when the element is missing."""
    return elem.text if elem is not None else None


def _score_of(elem) -> int:
    """Return the element's text as an int, or 0 if missing/non-numeric."""
    if elem is None:
        return 0
    try:
        return int(elem.text)
    except (TypeError, ValueError):
        return 0


def search_by_name(name: str, state: str | None = None,
                   postcode: str | None = None) -> list[dict]:
    """Search ABR by business name. Returns matching records.

    Args:
        name: Business name to search for.
        state: Optional state/territory code (e.g. "VIC"). When given, only
            that state is flagged "Y" in the request; otherwise all are.
        postcode: Optional postcode; sent as an empty string when missing.

    Returns:
        A list of dicts with the keys "abn", "status", "name", "state",
        "postcode" and "score". "score" is an int (0 when absent or not
        numeric); the other values are element text or None. The list is
        empty when ABR_GUID is not configured, the request raises, or the
        response is not well-formed XML.
    """
    if not ABR_GUID:
        print(" WARNING: ABR_GUID not configured. Skipping ABN lookup.")
        return []

    params = {
        "name": name,
        "postcode": postcode or "",
        "legalName": "Y",
        "tradingName": "Y",
    }
    # One Y/N flag per state or territory; restrict to `state` when provided.
    for code in ("NSW", "SA", "ACT", "VIC", "WA", "NT", "QLD", "TAS"):
        params[code] = "Y" if not state or code == state else "N"
    params["authenticationGuid"] = ABR_GUID

    url = f"{ABR_BASE}/ABRSearchByNameSimpleProtocol"
    try:
        text = fetch_url(url, method="GET", data=params, timeout=15)
    except Exception as e:
        # Best-effort lookup: report and carry on. Only the exception type
        # is printed so the authentication GUID cannot leak via a URL that
        # might be embedded in the error message.
        print(f" WARNING: ABR lookup failed for {name!r} ({type(e).__name__})")
        return []

    try:
        root = ET.fromstring(text)
    except ET.ParseError:
        return []

    # The ABR response uses a default namespace
    ns = {"abr": "http://abr.business.gov.au/ABRXMLSearch/"}
    name_paths = (
        ".//abr:mainName/abr:organisationName",
        ".//abr:mainTradingName/abr:organisationName",
        ".//abr:businessName/abr:organisationName",
    )

    results = []
    for record in root.findall(".//abr:searchResultsRecord", ns):
        abn_elem = record.find(".//abr:ABN/abr:identifierValue", ns)
        if abn_elem is None:
            # Records without an ABN are of no use to the caller.
            continue

        address = ".//abr:mainBusinessPhysicalAddress"
        results.append({
            "abn": abn_elem.text,
            "status": _text_or_none(
                record.find(".//abr:ABN/abr:identifierStatus", ns)),
            "name": _text_or_none(_first_found(record, name_paths, ns)),
            "state": _text_or_none(
                record.find(f"{address}/abr:stateCode", ns)),
            "postcode": _text_or_none(
                record.find(f"{address}/abr:postcode", ns)),
            "score": _score_of(record.find(".//abr:nameScore", ns)),
        })

    return results
|
||||
|
||||
|
||||
def find_best_match(name: str, state: str | None = None,
                    postcode: str | None = None) -> dict | None:
    """Pick the strongest active ABR record for a business name.

    Runs search_by_name with the given arguments, discards every record
    whose status is not "Active", and returns the record with the highest
    score provided that score is at least 80. Returns None otherwise.
    """
    candidates = [
        rec for rec in search_by_name(name, state, postcode)
        if rec.get("status") == "Active"
    ]
    if not candidates:
        return None

    # max() yields the first record among equal top scores, which is the
    # same record a stable descending sort would place first.
    top = max(candidates, key=lambda rec: rec.get("score", 0))

    # Only accept the winner when its score is reasonable.
    if top.get("score", 0) < 80:
        return None
    return top
|
||||
|
||||
|
||||
def run(limit: int | None = None, state_filter: str | None = None):
    """Look up ABNs for all providers that don't have one.

    Selects rows from funeral_brand where abn IS NULL and verified = 0,
    asks find_best_match for each one, and writes the matched ABN back.
    Progress is committed every 50 providers and once more at the end.

    Args:
        limit: Maximum number of providers to process. None or 0 means
            no limit.
        state_filter: When given, only providers whose business_state
            equals this value are processed.

    The connection from get_db() is closed on every exit path, including
    the early return when ABR_GUID is not configured and any exception.
    """
    db = get_db()
    try:
        query = """
            SELECT id, title, business_state, business_postcode
            FROM funeral_brand
            WHERE abn IS NULL AND verified = 0
        """
        params = []

        if state_filter:
            query += " AND business_state = ?"
            params.append(state_filter)

        query += " ORDER BY id"

        if limit:
            # Bind the limit as a parameter instead of formatting it into
            # the SQL text.
            query += " LIMIT ?"
            params.append(int(limit))

        providers = db.execute(query, params).fetchall()
        print(f"Providers without ABN: {len(providers)}")

        if not ABR_GUID:
            print("ERROR: ABR_GUID not configured.")
            print(" Register at: https://abr.business.gov.au/Tools/WebServices")
            print(" Then set ABR_GUID env var or add 'abr_guid' to config.json")
            return

        found = 0
        not_found = 0

        for i, prov in enumerate(providers):
            # Progress line for the first provider and every 20th after.
            if (i + 1) % 20 == 0 or i == 0:
                print(f" [{i+1}/{len(providers)}] {prov['title']}")

            match = find_best_match(
                prov["title"],
                prov["business_state"],
                prov["business_postcode"],
            )

            if match:
                db.execute(
                    "UPDATE funeral_brand SET abn = ?, updated_at = datetime('now') WHERE id = ?",
                    (match["abn"], prov["id"]),
                )
                found += 1
            else:
                not_found += 1

            # Persist in batches so a late failure loses at most 50 rows.
            if (i + 1) % 50 == 0:
                db.commit()

            time.sleep(0.5)  # Be gentle with the government API

        db.commit()
        print(f"\nDone: {found} ABNs found, {not_found} not found")

        total = found + not_found
        if total:
            print(f" Success rate: {found / total * 100:.1f}%")
    finally:
        db.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Command-line usage: [--state=CODE] [--limit=N] [N]
    # No row cap and no state restriction unless the arguments say so.
    limit, state = None, None

    for arg in sys.argv[1:]:
        if arg.startswith("--state="):
            state = arg.split("=")[1]
            continue

        if arg.startswith("--limit="):
            limit = int(arg.split("=")[1])
            continue

        # A bare argument is taken as the limit when it parses as an
        # integer; anything else is ignored.
        try:
            limit = int(arg)
        except ValueError:
            pass

    run(limit=limit, state_filter=state)
|
||||
Reference in New Issue
Block a user