Files
Provider-Crawl/crawlers/lookup_abn.py
Richie cc91427789 Initial commit: funeral provider discovery pipeline
Python crawlers for VIC Register, Funerals Australia, NFDA
n8n workflows for scheduled discovery and enrichment
SQLite schema and seeded dev database (1,463 providers)
End-to-end process documentation in n8n/PROCESS.md
2026-04-24 10:27:08 +10:00

200 lines
6.1 KiB
Python

"""ABN Lookup module via the Australian Business Register (ABR) API.
Enriches providers with their ABN (strongest dedup key) and validates
that they are active registered businesses.
The ABR API is FREE. Requires a GUID (authentication token) from:
https://abr.business.gov.au/Tools/WebServices
Configuration:
Set the ABR_GUID environment variable, or add an 'abr_guid' key to the config.json file located next to this module.
"""
import json
import os
import re
import time
import urllib.parse
import xml.etree.ElementTree as ET
from base import fetch_url, get_db, CRAWL_DELAY
# Load the ABR GUID: the ABR_GUID environment variable wins; otherwise fall
# back to the "abr_guid" key of a config.json sitting next to this module.
ABR_GUID = os.environ.get("ABR_GUID")
if not ABR_GUID:
    config_path = os.path.join(os.path.dirname(__file__), "config.json")
    if os.path.exists(config_path):
        # Read as UTF-8 explicitly so decoding does not depend on the
        # platform's locale default.
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
        ABR_GUID = config.get("abr_guid")

# Base URL of the ABR XML search web service; method names are appended to it.
ABR_BASE = "https://abr.business.gov.au/abrxmlsearch/AbrXmlSearch.asmx"
def _find_first(record: ET.Element, paths: list[str],
                ns: dict[str, str]) -> ET.Element | None:
    """Return the first element matched by any of *paths*, or None.

    Uses explicit ``is not None`` checks: an Element without children is
    falsy, so chaining ``find(...) or find(...)`` would skip leaf elements
    that were actually found.
    """
    for path in paths:
        elem = record.find(path, ns)
        if elem is not None:
            return elem
    return None


def _elem_text(elem: ET.Element | None) -> str | None:
    """Return the element's text, or None when the element is missing."""
    return elem.text if elem is not None else None


def _parse_score(elem: ET.Element | None) -> int:
    """Return the element's text as an int; 0 when missing or non-numeric."""
    if elem is None or elem.text is None:
        return 0
    try:
        return int(elem.text)
    except ValueError:
        return 0


def search_by_name(name: str, state: str | None = None,
                   postcode: str | None = None) -> list[dict]:
    """Search ABR by business name. Returns matching records.

    Args:
        name: Business name to search for.
        state: Optional state/territory code. It is stripped and upper-cased
            before use; when given, only the flag for that code is set to
            "Y" and every other state flag is set to "N".
        postcode: Optional postcode; sent as an empty string when not given.

    Returns:
        One dict per result record that carries an ABN, with the keys
        "abn", "status", "name", "state", "postcode" and "score". An empty
        list is returned when ABR_GUID is not configured, when the request
        raises, or when the response is not well-formed XML.
    """
    if not ABR_GUID:
        print(" WARNING: ABR_GUID not configured. Skipping ABN lookup.")
        return []

    state_codes = ["NSW", "SA", "ACT", "VIC", "WA", "NT", "QLD", "TAS"]
    params = {
        "name": name,
        "postcode": postcode or "",
        "legalName": "Y",
        "tradingName": "Y",
        "NSW": "Y", "SA": "Y", "ACT": "Y", "VIC": "Y",
        "WA": "Y", "NT": "Y", "QLD": "Y", "TAS": "Y",
        "authenticationGuid": ABR_GUID,
    }

    # If state specified, only search that state. Normalise first so that
    # values such as "vic" or "VIC " still select the intended flag.
    if state:
        wanted = state.strip().upper()
        for code in state_codes:
            params[code] = "Y" if code == wanted else "N"

    url = f"{ABR_BASE}/ABRSearchByNameSimpleProtocol"
    try:
        text = fetch_url(url, method="GET", data=params, timeout=15)
    except Exception as e:
        # Best effort: a failed lookup must not abort the caller's batch,
        # but report it instead of failing silently.
        print(f" WARNING: ABR lookup failed for {name!r}: {e}")
        return []

    # Parse XML response
    try:
        root = ET.fromstring(text)
    except ET.ParseError:
        return []

    # The ABR response uses a default namespace
    ns = {"abr": "http://abr.business.gov.au/ABRXMLSearch/"}
    name_paths = [
        ".//abr:mainName/abr:organisationName",
        ".//abr:mainTradingName/abr:organisationName",
        ".//abr:businessName/abr:organisationName",
    ]

    results = []
    for record in root.findall(".//abr:searchResultsRecord", ns):
        abn_elem = record.find(".//abr:ABN/abr:identifierValue", ns)
        if abn_elem is None:
            # Records without an ABN are of no use as a dedup key.
            continue
        address = ".//abr:mainBusinessPhysicalAddress"
        results.append({
            "abn": abn_elem.text,
            "status": _elem_text(
                record.find(".//abr:ABN/abr:identifierStatus", ns)),
            "name": _elem_text(_find_first(record, name_paths, ns)),
            "state": _elem_text(
                record.find(f"{address}/abr:stateCode", ns)),
            "postcode": _elem_text(
                record.find(f"{address}/abr:postcode", ns)),
            "score": _parse_score(record.find(".//abr:nameScore", ns)),
        })
    return results
def find_best_match(name: str, state: str | None = None,
                    postcode: str | None = None) -> dict | None:
    """Pick the strongest active ABR record for a business name.

    Runs search_by_name with the given arguments, keeps only records whose
    "status" is "Active", and returns the one with the highest "score"
    provided that score is at least 80. Returns None otherwise.
    """
    candidates = [
        rec
        for rec in search_by_name(name, state, postcode)
        if rec.get("status") == "Active"
    ]
    if not candidates:
        return None

    # max() returns the first of several equally-scored records, which is
    # the same record a stable descending sort would place first.
    top = max(candidates, key=lambda rec: rec.get("score", 0))

    # Only accept the match when the name score is reasonable.
    return top if top.get("score", 0) >= 80 else None
def run(limit: int | None = None, state_filter: str | None = None):
    """Look up ABNs for all providers that don't have one.

    Selects funeral_brand rows with ``abn IS NULL AND verified = 0``, asks
    find_best_match for each one, and stores the ABN of every match.
    Updates are committed every 50 providers and once more at the end.

    Args:
        limit: Maximum number of providers to process. None (or 0) means
            no limit.
        state_filter: When given, only providers whose business_state
            equals this value are processed.

    Returns:
        None. Progress and a summary are printed to stdout.
    """
    # NOTE(review): get_db() presumably returns a sqlite3 connection whose
    # rows support access by column name - confirm against base.get_db.
    db = get_db()
    try:
        query = """
            SELECT id, title, business_state, business_postcode
            FROM funeral_brand
            WHERE abn IS NULL AND verified = 0
        """
        params: list = []
        if state_filter:
            query += " AND business_state = ?"
            params.append(state_filter)
        query += " ORDER BY id"
        if limit:
            # Bind the limit instead of interpolating it into the SQL text.
            query += " LIMIT ?"
            params.append(int(limit))

        providers = db.execute(query, params).fetchall()
        print(f"Providers without ABN: {len(providers)}")

        if not ABR_GUID:
            print("ERROR: ABR_GUID not configured.")
            print(" Register at: https://abr.business.gov.au/Tools/WebServices")
            print(" Then set ABR_GUID env var or add 'abr_guid' to config.json")
            return

        found = 0
        not_found = 0
        total = len(providers)
        for i, prov in enumerate(providers):
            # Show the first provider and then every 20th as progress.
            if (i + 1) % 20 == 0 or i == 0:
                print(f" [{i+1}/{total}] {prov['title']}")

            match = find_best_match(
                prov["title"],
                prov["business_state"],
                prov["business_postcode"],
            )
            if match:
                db.execute(
                    "UPDATE funeral_brand SET abn = ?, updated_at = datetime('now') WHERE id = ?",
                    (match["abn"], prov["id"]),
                )
                found += 1
            else:
                not_found += 1

            # Commit in batches so an interruption loses at most 50 updates.
            if (i + 1) % 50 == 0:
                db.commit()
            time.sleep(0.5)  # Be gentle with the government API

        db.commit()
        print(f"\nDone: {found} ABNs found, {not_found} not found")
        processed = found + not_found
        if processed > 0:
            print(f" Success rate: {found/processed*100:.1f}%")
    finally:
        # Release the connection on every exit path, including the early
        # return above and any exception raised while processing.
        db.close()
if __name__ == "__main__":
    # Command line: [--state=CODE] [--limit=N] [N]
    # A bare integer argument is accepted as the limit; any other
    # unrecognised argument is ignored.
    import sys

    limit = None
    state = None
    for arg in sys.argv[1:]:
        if arg.startswith("--state="):
            # Take everything after the prefix, even if it contains "=".
            state = arg.removeprefix("--state=")
        elif arg.startswith("--limit="):
            raw_limit = arg.removeprefix("--limit=")
            try:
                limit = int(raw_limit)
            except ValueError:
                # Exit with a readable message rather than a traceback.
                sys.exit(f"Invalid --limit value: {raw_limit!r} (expected an integer)")
        else:
            try:
                limit = int(arg)
            except ValueError:
                pass
    run(limit=limit, state_filter=state)