Python crawlers for VIC Register, Funerals Australia, NFDA n8n workflows for scheduled discovery and enrichment SQLite schema and seeded dev database (1,463 providers) End-to-end process documentation in n8n/PROCESS.md
200 lines
6.1 KiB
Python
200 lines
6.1 KiB
Python
"""ABN Lookup module via the Australian Business Register (ABR) API.
|
|
|
|
Enriches providers with their ABN (strongest dedup key) and validates
|
|
that they are active registered businesses.
|
|
|
|
The ABR API is free to use, but requires a GUID (authentication token), available from:
|
|
https://abr.business.gov.au/Tools/WebServices
|
|
|
|
Configuration:
|
|
Set the ABR_GUID environment variable, or add an "abr_guid" key to config.json.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import time
|
|
import urllib.parse
|
|
import xml.etree.ElementTree as ET
|
|
|
|
from base import fetch_url, get_db, CRAWL_DELAY
|
|
|
|
# Resolve the ABR authentication GUID. The ABR_GUID environment variable takes
# precedence; when it is unset or empty, fall back to the "abr_guid" key of a
# config.json file located next to this module. ABR_GUID stays None/empty when
# neither source provides a value, and callers check for that before querying.
ABR_GUID = os.environ.get("ABR_GUID")
if not ABR_GUID:
    config_path = os.path.join(os.path.dirname(__file__), "config.json")
    if os.path.exists(config_path):
        # Read the JSON config as UTF-8 explicitly rather than relying on the
        # platform's locale encoding.
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
        ABR_GUID = config.get("abr_guid")

# Base URL of the ABR XML search web service; the method name is appended to it.
ABR_BASE = "https://abr.business.gov.au/abrxmlsearch/AbrXmlSearch.asmx"
|
|
|
|
|
|
def search_by_name(name: str, state: str | None = None,
                   postcode: str | None = None) -> list[dict]:
    """Search the ABR by business name and return the matching records.

    Args:
        name: Business name to look up. Both legal and trading names are
            searched.
        state: Optional state/territory code (e.g. "VIC"). When it matches one
            of the eight known codes (case-insensitively), only that state is
            searched. Any other value leaves all states enabled.
        postcode: Optional postcode filter; sent as an empty string when
            omitted.

    Returns:
        A list of dicts with the keys "abn", "status", "name", "state",
        "postcode" (text values, or None when the element is absent) and
        "score" (int, 0 when absent or not numeric). Records without an ABN
        element are skipped. An empty list is returned when ABR_GUID is not
        configured, the request fails, or the response is not well-formed XML.
    """
    if not ABR_GUID:
        print(" WARNING: ABR_GUID not configured. Skipping ABN lookup.")
        return []

    states = ("NSW", "SA", "ACT", "VIC", "WA", "NT", "QLD", "TAS")

    # Normalise the requested state. An unrecognised value must not disable
    # every state flag, which would make the search return nothing.
    wanted = state.strip().upper() if state else None
    if wanted not in states:
        wanted = None

    params = {
        "name": name,
        "postcode": postcode or "",
        "legalName": "Y",
        "tradingName": "Y",
    }
    for code in states:
        params[code] = "Y" if wanted is None or code == wanted else "N"
    params["authenticationGuid"] = ABR_GUID

    url = f"{ABR_BASE}/ABRSearchByNameSimpleProtocol"
    try:
        text = fetch_url(url, method="GET", data=params, timeout=15)
    except Exception as e:
        # Best effort: a single failed lookup must not abort a batch run, but
        # the failure should be visible instead of silently swallowed.
        print(f" WARNING: ABR lookup failed for {name!r}: {e}")
        return []

    # NOTE(review): assumes fetch_url returns the response body as text;
    # treat an empty/None body as "no results" rather than crashing the parser.
    if not text:
        return []

    try:
        root = ET.fromstring(text)
    except ET.ParseError:
        return []

    # The ABR response uses a default namespace.
    ns = {"abr": "http://abr.business.gov.au/ABRXMLSearch/"}

    # Name elements in order of preference; the first one present wins.
    name_paths = (
        ".//abr:mainName/abr:organisationName",
        ".//abr:mainTradingName/abr:organisationName",
        ".//abr:businessName/abr:organisationName",
    )

    def text_of(elem):
        """Return the element's text, or None when the element is missing."""
        return elem.text if elem is not None else None

    results = []
    for record in root.findall(".//abr:searchResultsRecord", ns):
        abn_elem = record.find(".//abr:ABN/abr:identifierValue", ns)
        if abn_elem is None:
            continue

        # Compare against None explicitly: an Element without children is
        # falsy, so chaining find() calls with `or` discards leaf elements
        # that were actually found.
        name_elem = None
        for path in name_paths:
            name_elem = record.find(path, ns)
            if name_elem is not None:
                break

        # A missing, empty or non-numeric score counts as 0 rather than
        # raising out of the whole search.
        score_text = text_of(record.find(".//abr:nameScore", ns))
        try:
            score = int(score_text) if score_text is not None else 0
        except ValueError:
            score = 0

        results.append({
            "abn": abn_elem.text,
            "status": text_of(
                record.find(".//abr:ABN/abr:identifierStatus", ns)),
            "name": text_of(name_elem),
            "state": text_of(
                record.find(".//abr:mainBusinessPhysicalAddress/abr:stateCode", ns)),
            "postcode": text_of(
                record.find(".//abr:mainBusinessPhysicalAddress/abr:postcode", ns)),
            "score": score,
        })

    return results
|
|
|
|
|
|
def find_best_match(name: str, state: str | None = None,
                    postcode: str | None = None) -> dict | None:
    """Pick the most plausible ABR record for a business name.

    Runs search_by_name() with the given arguments, keeps only records whose
    status is "Active", and returns the one with the highest name score
    provided that score is at least 80. Returns None when there is no active
    record or the best score is below that threshold.
    """
    candidates = [
        rec for rec in search_by_name(name, state, postcode)
        if rec.get("status") == "Active"
    ]
    if not candidates:
        return None

    # max() keeps the first record among equal scores, the same record a
    # stable descending sort would place first.
    top = max(candidates, key=lambda rec: rec.get("score", 0))
    if top.get("score", 0) < 80:
        return None
    return top
|
|
|
|
|
|
def run(limit: int | None = None, state_filter: str | None = None):
    """Look up ABNs for unverified providers that do not have one yet.

    Selects rows from funeral_brand where abn IS NULL and verified = 0, asks
    find_best_match() for each one, and stores the matched ABN on the row.
    Progress is printed for the first and every 20th provider, and updates are
    committed every 50 providers and once more when the loop ends.

    Args:
        limit: Maximum number of providers to process. None or 0 means no
            limit.
        state_filter: When given, only providers whose business_state equals
            this value are processed.

    Returns:
        None. Results are written to the database and a summary is printed.
    """
    db = get_db()
    # try/finally so the connection is closed on every exit path, including
    # the early return when no GUID is configured.
    try:
        query = """
            SELECT id, title, business_state, business_postcode
            FROM funeral_brand
            WHERE abn IS NULL AND verified = 0
        """
        params = []

        if state_filter:
            query += " AND business_state = ?"
            params.append(state_filter)

        query += " ORDER BY id"

        if limit:
            # Bind the limit as a parameter instead of formatting it into SQL.
            query += " LIMIT ?"
            params.append(int(limit))

        providers = db.execute(query, params).fetchall()
        total = len(providers)
        print(f"Providers without ABN: {total}")

        if not ABR_GUID:
            print("ERROR: ABR_GUID not configured.")
            print(" Register at: https://abr.business.gov.au/Tools/WebServices")
            print(" Then set ABR_GUID env var or add 'abr_guid' to config.json")
            return

        found = 0
        not_found = 0

        try:
            # NOTE(review): rows are accessed by column name, which assumes
            # get_db() configures a mapping-style row factory.
            for position, prov in enumerate(providers, start=1):
                if position % 20 == 0 or position == 1:
                    print(f" [{position}/{total}] {prov['title']}")

                match = find_best_match(
                    prov["title"],
                    prov["business_state"],
                    prov["business_postcode"],
                )

                if match:
                    db.execute(
                        "UPDATE funeral_brand SET abn = ?, updated_at = datetime('now') WHERE id = ?",
                        (match["abn"], prov["id"]),
                    )
                    found += 1
                else:
                    not_found += 1

                if position % 50 == 0:
                    db.commit()

                time.sleep(0.5)  # Be gentle with the government API
        finally:
            # Persist matches found since the last periodic commit, even when
            # the loop is interrupted part-way through.
            db.commit()

        print(f"\nDone: {found} ABNs found, {not_found} not found")
        attempted = found + not_found
        if attempted > 0:
            print(f" Success rate: {found/attempted*100:.1f}%")
    finally:
        db.close()
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Command line: [--state=CODE] [--limit=N] [N]
    # A bare integer is treated as the limit; the last limit given wins.
    limit = None
    state = None

    for arg in sys.argv[1:]:
        if arg.startswith("--state="):
            # removeprefix keeps the whole value, even if it contains "=".
            state = arg.removeprefix("--state=")
        elif arg.startswith("--limit="):
            raw_limit = arg.removeprefix("--limit=")
            try:
                limit = int(raw_limit)
            except ValueError:
                sys.exit(f"Invalid --limit value: {raw_limit!r} (expected an integer)")
        else:
            try:
                limit = int(arg)
            except ValueError:
                # Still ignored, but no longer silently.
                print(f"Ignoring unrecognised argument: {arg!r}", file=sys.stderr)

    run(limit=limit, state_filter=state)
|