houseshop-ro (static XML, single feed)
Reference AI-product-sync parser. Single static XML feed with <Product> elements. Downloaded once on first call, then iter-parsed for memory-stable streaming. Demonstrates the simplest single-feed shape.
Reference parser for the aiGenerated product-sync provider. Use it as a starting point when the source is a single static XML feed that you can download once and stream end-to-end. This is the simplest shape — if the feed is one file, start here.
Source shape: static-xml-single-feed
Runtime helper note. The _download helper uses a try/except that falls back to fetch_via_proxy() — a globally-injected function that routes the request through Octocom's HTTP proxy IP (74.242.171.127). Use it when the origin blocks the Azure sandbox IP (symptoms: HTTPError 403, HTTPError 429, unexplained urlopen timeouts). Signature:
fetch_via_proxy(url, *, headers=None, timeout=60, method='GET', data=None) -> bytes
Full source
"""Houseshop Romania parser. Single static XML feed; one <Product>
element per product. We download the file once on first call, then
stream <Product> elements via ET.iterparse so memory stays flat
no matter how big the feed grows.
"""
import json
import re
import unicodedata
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
# -----------------------------------------------------------------------------
# Helpers (slugify ported from text.utils.ts; cleanHtml from houseshopRo.util.ts)
# -----------------------------------------------------------------------------
def slugify(text, max_length=None):
    """Turn ``text`` into a lowercase, hyphen-separated slug.

    The input is trimmed and lowercased, accents are removed (NFD
    decomposition followed by dropping non-spacing marks), and every run of
    non-word characters or underscores becomes a single hyphen. A falsy
    ``text`` yields an empty string.

    When ``max_length`` is truthy the slug is cut to that many characters
    and any hyphen left dangling at the end by the cut is removed.
    """
    decomposed = unicodedata.normalize("NFD", (text or "").strip().lower())
    # Category "Mn" covers the combining marks NFD split off the base letters.
    slug = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    # Applied in order: trim separator runs at both ends, turn inner
    # separator runs into hyphens, then squash repeated hyphens.
    for pattern, replacement in (
        (r"^[\W_]+|[\W_]+$", ""),
        (r"[\W_]+", "-"),
        (r"-+", "-"),
    ):
        slug = re.sub(pattern, replacement, slug)
    slug = slug.strip("-")
    if not max_length:
        return slug
    return slug[:max_length].rstrip("-")
def clean_html(text):
    """Strip tag-like ``<...>`` spans from ``text`` and trim whitespace.

    Returns an empty string for falsy input. Only the markup is removed;
    the text between tags is kept as-is (entities are not decoded).
    """
    if text:
        without_tags = re.sub(r"<[^>]+>", "", text)
        return without_tags.strip()
    return ""
def _nonempty(v):
    """Return ``str(v)`` stripped of surrounding whitespace, or ``None``.

    ``None`` is returned both for a ``None`` input and for any value whose
    string form is empty after stripping.
    """
    if v is None:
        return None
    stripped = str(v).strip()
    return stripped or None
# -----------------------------------------------------------------------------
# Source feed
# -----------------------------------------------------------------------------
# URL of the XML feed that fetch_next_chunk() downloads on its first call.
FEED_URL = "https://www.houseshop.com.ro/export_feeds/houseshop.xml"
# Local file the feed is written to and then stream-parsed from.
LOCAL_PATH = "/mnt/data/houseshopRo.xml"
# Value of the User-Agent header sent by _download().
USER_AGENT = "Mozilla/5.0 (compatible; OctocomSync/1.0)"
# Upper bound on the number of products returned per fetch_next_chunk() call.
CHUNK_SIZE = 500
def _download(url, dest_path):
    """Stream a URL to disk, falling back to the runtime proxy on failure.

    Tries direct urllib first (streaming, lowest memory). On network
    errors falls back to `fetch_via_proxy()`, the runtime-provided
    helper that routes through Octocom's HTTP proxy IP. Use the proxy
    when the origin blocks the Azure sandbox IP — symptoms include
    `HTTPError 403/429` and unexplained `urlopen` timeouts.

    The fallback is triggered by:
      * `urllib.error.URLError` (which includes `HTTPError`),
      * `socket.timeout` — raised, unwrapped, when the timeout expires
        while waiting for the response or while reading the body,
      * `ConnectionError` — resets/aborts in the middle of the transfer.

    Args:
        url: Address of the resource to download.
        dest_path: File path the response body is written to. Any partial
            content left by a failed direct attempt is overwritten by the
            proxy attempt.

    Raises:
        Whatever `fetch_via_proxy()` or the final file write raises when
        the fallback itself fails.

    Runtime helper signature:
        fetch_via_proxy(url, *, headers=None, timeout=60,
                        method='GET', data=None) -> bytes
    """
    # Local import: only the exception type is needed, and only here.
    import socket

    headers = {"User-Agent": USER_AGENT}
    request = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(request, timeout=180) as response:
            with open(dest_path, "wb") as out:
                while True:
                    chunk = response.read(1024 * 256)
                    if not chunk:
                        break
                    out.write(chunk)
        return
    except (urllib.error.URLError, socket.timeout, ConnectionError):
        # Deliberate fall-through: the direct attempt failed at the network
        # level, so retry once through the proxy below.
        pass
    body = fetch_via_proxy(  # noqa: F821 - runtime-provided
        url, headers=headers, timeout=200
    )
    with open(dest_path, "wb") as out:
        out.write(body)
def _xml_product_to_dict(elem):
    """Flatten the direct children of a <Product> element into a dict.

    Each child tag maps to its stripped text, or ``None`` when the child
    has no text. A tag that occurs more than once (e.g.
    <additional_imageurl>) maps to a list of its values in document order.
    """
    record: dict = {}
    for node in elem:
        value = node.text.strip() if node.text else None
        tag = node.tag
        if tag not in record:
            record[tag] = value
            continue
        # Second or later occurrence: promote the stored scalar to a list.
        if not isinstance(record[tag], list):
            record[tag] = [record[tag]]
        record[tag].append(value)
    return record
def _make_product_iter(path):
    """Yield raw <Product> elements one at a time from a streaming parse.

    The document root is captured from the first "start" event and emptied
    each time the consumer comes back for the next product. Without this,
    the root would keep a reference to every already-processed <Product>
    shell and memory would grow with the size of the feed.

    Args:
        path: Path of the XML file to parse.

    Yields:
        Each element whose tag is exactly ``Product``, fully parsed.
    """
    events = ET.iterparse(path, events=("start", "end"))
    first = next(events, None)
    if first is None:
        return
    _, root = first
    for event, elem in events:
        if event == "end" and elem.tag == "Product":
            yield elem
            # Resumes only after the consumer has finished with ``elem``.
            # Detach processed children from the root so they can be freed;
            # skip this if the root itself was the yielded product.
            if elem is not root:
                root.clear()
# -----------------------------------------------------------------------------
# Contract
# -----------------------------------------------------------------------------
def fetch_next_chunk(state):
    """Return the next batch of raw product dicts from the feed.

    On the first call (no ``initialized`` flag under ``state["parser"]``)
    the feed is downloaded to LOCAL_PATH and a streaming product iterator
    is stored in ``state["parser"]["iter"]``. Every call then pulls up to
    CHUNK_SIZE products from that iterator, converts each to a dict and
    clears the element. An empty list means the iterator is exhausted.
    """
    parser_state = state.setdefault("parser", {})
    if not parser_state.get("initialized"):
        _download(FEED_URL, LOCAL_PATH)
        parser_state["iter"] = _make_product_iter(LOCAL_PATH)
        parser_state["initialized"] = True

    products = parser_state["iter"]
    exhausted = object()  # unique marker for "iterator has no more items"
    batch = []
    while len(batch) < CHUNK_SIZE:
        elem = next(products, exhausted)
        if elem is exhausted:
            break
        batch.append(_xml_product_to_dict(elem))
        # Release the element's contents once they have been copied out.
        elem.clear()
    return batch
def map_one(raw, context):
    """Map one raw feed dict to the product structure returned to the sync.

    Args:
        raw: Dict of child-tag -> text for one <Product>; repeated tags are
            lists.
        context: Not used by this function.

    Returns:
        ``None`` when the product has no non-blank ``sku``; otherwise a dict
        with name, uniqueId, slug, longDescription, url, images,
        collections, a single variant, metafields and the JSON-encoded raw
        product.
    """
    sku = _nonempty(raw.get("sku"))
    if not sku:
        return None
    name = raw.get("name") or ""
    description = clean_html(raw.get("description") or "")
    # quantity: numeric in the source; coerce. OverflowError covers
    # infinite values, which float() accepts but int() rejects.
    try:
        quantity = int(float(raw.get("quantity") or 0))
    except (ValueError, TypeError, OverflowError):
        quantity = 0
    try:
        price = float(raw.get("price_with_vat") or 0)
    except (ValueError, TypeError):
        price = 0.0
    in_stock = (raw.get("instock") or "").strip().upper() == "Y"
    # A single <additional_imageurl> arrives as a scalar, several as a list.
    # Empty/None entries are dropped in both shapes so no image is emitted
    # with an empty URL.
    additional_images_raw = raw.get("additional_imageurl")
    if not isinstance(additional_images_raw, list):
        additional_images_raw = [additional_images_raw]
    additional_images = [u for u in additional_images_raw if u]
    category = _nonempty(raw.get("category"))
    category_id = _nonempty(raw.get("category_id"))
    # Only fields with a non-blank value become metafields.
    metafields = []
    for label, key in [
        ("Manufacturer", "manufacturer"),
        ("availability", "availability"),
        ("ean", "ean"),
        ("weight", "weight"),
        ("category_id", "category_id"),
    ]:
        v = _nonempty(raw.get(key))
        if v:
            metafields.append({"key": label, "value": v})
    return {
        "name": name,
        "uniqueId": sku,
        "slug": slugify(name),
        "longDescription": description or None,
        "url": raw.get("link") or "",
        "images": [
            {
                "url": raw.get("image") or "",
                "altText": name,
                "isPrimary": True,
            }
        ]
        + [{"url": u, "altText": name} for u in additional_images],
        "collections": (
            [
                {
                    "name": category,
                    "slug": slugify(category),
                    "url": (
                        f"https://www.houseshop.com.ro/?route=product/category"
                        f"&path={category_id}"
                        if category_id
                        else None
                    ),
                }
            ]
            if category
            else []
        ),
        "variants": [
            {
                "title": name,
                "sku": sku,
                "price": {
                    "currentPrice": price,
                    "onSale": False,
                },
                "stockQuantity": quantity,
                "inStock": in_stock,
            }
        ],
        "metafields": metafields,
        "rawProduct": json.dumps(raw, ensure_ascii=False),
    }