AI Knowledge & Logic / Product Sync Parsers

houseshop-ro (static XML, single feed)

Reference AI-product-sync parser. Single static XML feed with <Product> elements. Downloaded once on first call, then iter-parsed for memory-stable streaming. Demonstrates the simplest single-feed shape.

Reference parser for the aiGenerated product-sync provider. Use it as a starting point when the source is a single static XML feed that you can download once and stream end-to-end. This is the simplest shape — if the feed is one file, start here.

Source shape: static-xml-single-feed

Runtime helper note. The _download helper uses a try/except that falls back to fetch_via_proxy() — a globally-injected function that routes the request through Octocom's HTTP proxy IP (74.242.171.127). Use it when the origin blocks the Azure sandbox IP (symptoms: HTTPError 403, HTTPError 429, unexplained urlopen timeouts). Signature:

fetch_via_proxy(url, *, headers=None, timeout=60, method='GET', data=None) -> bytes

Full source

"""Houseshop Romania parser. Single static XML feed; one <Product>
element per product. We download the file once on first call, then
stream <Product> elements via ET.iterparse so memory stays flat
no matter how big the feed grows.
"""

import json
import re
import unicodedata
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET

# -----------------------------------------------------------------------------
# Helpers (slugify ported from text.utils.ts; cleanHtml from houseshopRo.util.ts)
# -----------------------------------------------------------------------------


def slugify(text, max_length=None):
    """Turn *text* into a lowercase, hyphen-separated slug.

    The input is trimmed and lowercased, combining marks (diacritics)
    are dropped after NFD decomposition, and every run of non-word
    characters or underscores becomes a single hyphen. Leading and
    trailing separators are removed. A falsy *text* yields "".

    When *max_length* is truthy the slug is cut to that many characters
    and any hyphen left dangling at the cut is stripped.
    """
    lowered = (text or "").strip().lower()
    decomposed = unicodedata.normalize("NFD", lowered)
    # "Mn" is the Unicode category of the combining marks NFD splits off.
    unaccented = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    trimmed = re.sub(r"^[\W_]+|[\W_]+$", "", unaccented)
    hyphenated = re.sub(r"[\W_]+", "-", trimmed)
    slug = re.sub(r"-+", "-", hyphenated).strip("-")
    if not max_length:
        return slug
    return slug[:max_length].rstrip("-")


def clean_html(text):
    """Strip anything that looks like an HTML tag from *text*.

    Returns "" for falsy input; otherwise the text with every
    ``<...>`` span removed and surrounding whitespace trimmed.
    Character entities are left as they are.
    """
    if text:
        without_tags = re.sub(r"<[^>]+>", "", text)
        return without_tags.strip()
    return ""


def _nonempty(v):
    if v is None:
        return None
    s = str(v).strip()
    return s if s else None


# -----------------------------------------------------------------------------
# Source feed
# -----------------------------------------------------------------------------

# URL of the single static XML feed; fetch_next_chunk downloads it on its
# first call.
FEED_URL = "https://www.houseshop.com.ro/export_feeds/houseshop.xml"
# Local file the feed is written to and then stream-parsed from.
LOCAL_PATH = "/mnt/data/houseshopRo.xml"
# User-Agent header _download sends with the feed request.
USER_AGENT = "Mozilla/5.0 (compatible; OctocomSync/1.0)"
# Maximum number of products fetch_next_chunk returns per call.
CHUNK_SIZE = 500


def _download(url, dest_path):
    """Stream a URL to disk, falling back to the proxy on network errors.

    Tries direct urllib first (streaming, lowest memory). If the direct
    attempt fails with a network-level error, the whole body is fetched
    through `fetch_via_proxy()`, the runtime-provided helper that routes
    through Octocom's HTTP proxy IP, and written to *dest_path* (any
    partially written file from the direct attempt is overwritten).

    Network-level errors that trigger the fallback:
      * `urllib.error.URLError` (includes `HTTPError`, e.g. 403/429);
      * read timeouts, which urllib raises as a bare `socket.timeout` /
        `TimeoutError` rather than wrapping them in `URLError`;
      * `ConnectionError` (resets, remote disconnects);
      * `http.client.HTTPException` (e.g. `IncompleteRead`).

    Errors from the proxy call or from writing the file propagate to
    the caller.

    Runtime helper signature:
        fetch_via_proxy(url, *, headers=None, timeout=60,
                        method='GET', data=None) -> bytes
    """
    # Local imports: only this function needs these modules.
    import http.client
    import socket

    headers = {"User-Agent": USER_AGENT}
    direct_failures = (
        urllib.error.URLError,  # HTTPError is a subclass
        socket.timeout,  # alias of TimeoutError on Python 3.10+
        ConnectionError,
        http.client.HTTPException,
    )
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=180) as r:
            with open(dest_path, "wb") as f:
                while True:
                    chunk = r.read(1024 * 256)
                    if not chunk:
                        break
                    f.write(chunk)
        return
    except direct_failures:
        # Deliberate best-effort: a blocked or flaky direct request is
        # expected here, so swallow it and retry through the proxy.
        pass
    body = fetch_via_proxy(  # noqa: F821 - runtime-provided
        url, headers=headers, timeout=200
    )
    with open(dest_path, "wb") as f:
        f.write(body)


def _xml_product_to_dict(elem):
    """Convert one <Product> element to a dict.

    Repeated child tags (e.g. <additional_imageurl>) collapse to a list.
    """
    item: dict = {}
    for child in elem:
        text = (child.text or "").strip() if child.text else None
        if child.tag in item:
            existing = item[child.tag]
            if not isinstance(existing, list):
                item[child.tag] = [existing]
            item[child.tag].append(text)
        else:
            item[child.tag] = text
    return item


def _make_product_iter(path):
    """Yield raw <Product> elements one at a time from a streaming parse."""
    for _, elem in ET.iterparse(path, events=("end",)):
        if elem.tag == "Product":
            yield elem


# -----------------------------------------------------------------------------
# Contract
# -----------------------------------------------------------------------------


def fetch_next_chunk(state):
    """Return the next batch of raw product dicts from the feed.

    On the first call (no truthy ``state["parser"]["initialized"]``)
    the feed is downloaded to LOCAL_PATH and a streaming product
    iterator is stored in ``state["parser"]["iter"]``. Every call then
    pulls up to CHUNK_SIZE products from that iterator, converts each
    to a dict and clears the element afterwards. An empty list means
    the iterator is exhausted.
    """
    parser = state.setdefault("parser", {})

    already_set_up = parser.get("initialized")
    if not already_set_up:
        _download(FEED_URL, LOCAL_PATH)
        parser["iter"] = _make_product_iter(LOCAL_PATH)
        parser["initialized"] = True

    products = parser["iter"]
    batch = []
    while len(batch) < CHUNK_SIZE:
        # Compare against None explicitly: an Element's truthiness
        # depends on whether it has children.
        elem = next(products, None)
        if elem is None:
            break
        batch.append(_xml_product_to_dict(elem))
        elem.clear()
    return batch


def map_one(raw, context):
    """Map one raw feed record to a product dict.

    Args:
        raw: dict of feed fields for one product (the shape produced by
            _xml_product_to_dict: tag name -> text, None, or list).
        context: accepted for the parser contract; not used here.

    Returns:
        None when the record has no usable ``sku``. Otherwise a dict
        with name, uniqueId, slug, longDescription, url, images,
        collections, a single variant, metafields and ``rawProduct``
        (the raw record serialized as JSON).

    Unparseable ``quantity`` / ``price_with_vat`` values fall back to
    0 / 0.0 instead of raising.
    """
    sku = _nonempty(raw.get("sku"))
    if not sku:
        return None

    name = raw.get("name") or ""
    description = clean_html(raw.get("description") or "")

    # quantity: numeric in the source; coerce. OverflowError covers
    # non-finite values such as "inf", which float() accepts but int()
    # rejects.
    try:
        quantity = int(float(raw.get("quantity") or 0))
    except (ValueError, TypeError, OverflowError):
        quantity = 0

    try:
        price = float(raw.get("price_with_vat") or 0)
    except (ValueError, TypeError):
        price = 0.0

    in_stock = (raw.get("instock") or "").strip().upper() == "Y"

    # A single <additional_imageurl> arrives as a scalar, repeated ones as
    # a list. Normalize to a list first so empty values are dropped in
    # both cases.
    additional_images_raw = raw.get("additional_imageurl")
    if not isinstance(additional_images_raw, list):
        additional_images_raw = [additional_images_raw]
    additional_images = [u for u in additional_images_raw if u]

    category = _nonempty(raw.get("category"))
    category_id = _nonempty(raw.get("category_id"))

    # (metafield key, raw feed key); blank values are skipped.
    metafield_sources = [
        ("Manufacturer", "manufacturer"),
        ("availability", "availability"),
        ("ean", "ean"),
        ("weight", "weight"),
        ("category_id", "category_id"),
    ]
    metafields = []
    for label, key in metafield_sources:
        v = _nonempty(raw.get(key))
        if v:
            metafields.append({"key": label, "value": v})

    return {
        "name": name,
        "uniqueId": sku,
        "slug": slugify(name),
        "longDescription": description or None,
        "url": raw.get("link") or "",
        # The primary image entry is always emitted, even without a URL.
        "images": [
            {
                "url": raw.get("image") or "",
                "altText": name,
                "isPrimary": True,
            }
        ]
        + [{"url": u, "altText": name} for u in additional_images],
        "collections": (
            [
                {
                    "name": category,
                    "slug": slugify(category),
                    "url": (
                        f"https://www.houseshop.com.ro/?route=product/category"
                        f"&path={category_id}"
                        if category_id
                        else None
                    ),
                }
            ]
            if category
            else []
        ),
        # The feed has one <Product> per product, so exactly one variant.
        "variants": [
            {
                "title": name,
                "sku": sku,
                "price": {
                    "currentPrice": price,
                    "onSale": False,
                },
                "stockQuantity": quantity,
                "inStock": in_stock,
            }
        ],
        "metafields": metafields,
        "rawProduct": json.dumps(raw, ensure_ascii=False),
    }

On this page