AI Knowledge & Logic › Product Sync Parsers

superhome (paginated JSON API)

Reference AI-product-sync parser. Paginated JSON API. fetch_next_chunk returns one page per call; cursor (page number) lives in state['parser']['page']. Demonstrates the natural shape for paginated REST sources where one HTTP call gives you one chunk.

Reference parser for the aiGenerated product-sync provider. Use it as a starting point when the source is a paginated REST API that returns one chunk of products per request. Demonstrates the natural pattern: fetch_next_chunk makes one HTTP call and bumps the page cursor in state['parser'].

Source shape: paginated-json-api

Runtime helper note. The _fetch_page helper uses a try/except that falls back to fetch_via_proxy() — a globally-injected function that routes the request through Octocom's HTTP proxy IP (74.242.171.127). Use it when the origin blocks the Azure sandbox IP (symptoms: HTTPError 403, HTTPError 429, unexplained urlopen timeouts). Signature:

fetch_via_proxy(url, *, headers=None, timeout=60, method='GET', data=None) -> bytes

Full source

"""Superhome (superhome.com.cy) parser. Paginated JSON API.

Demonstrates the natural shape for paginated sources:
  fetch_next_chunk() = one page (or batch of pages) per call.
  state['parser']['page'] holds the cursor across calls.

The kernel persists `state` between executePython calls, so this just
works without serializing/deserializing the cursor through the wire.
"""

import json
import re
import unicodedata
import urllib.error
import urllib.parse
import urllib.request

# -----------------------------------------------------------------------------
# Helpers (slugify, decode_greek_text, sanitize_description)
# -----------------------------------------------------------------------------

# Maps HTML named character references to the literal character they stand
# for. Keys must be the *escaped* entity names (e.g. "&Alpha;"); a table whose
# keys are already-decoded characters replaces nothing. "&nbsp;" is mapped to
# a plain space rather than U+00A0.
GREEK_ENTITIES = {
    # Uppercase Greek letters
    "&Alpha;": "Α", "&Beta;": "Β", "&Gamma;": "Γ", "&Delta;": "Δ",
    "&Epsilon;": "Ε", "&Zeta;": "Ζ", "&Eta;": "Η", "&Theta;": "Θ",
    "&Iota;": "Ι", "&Kappa;": "Κ", "&Lambda;": "Λ", "&Mu;": "Μ",
    "&Nu;": "Ν", "&Xi;": "Ξ", "&Omicron;": "Ο", "&Pi;": "Π",
    "&Rho;": "Ρ", "&Sigma;": "Σ", "&Tau;": "Τ", "&Upsilon;": "Υ",
    "&Phi;": "Φ", "&Chi;": "Χ", "&Psi;": "Ψ", "&Omega;": "Ω",
    # Lowercase Greek letters (including final sigma)
    "&alpha;": "α", "&beta;": "β", "&gamma;": "γ", "&delta;": "δ",
    "&epsilon;": "ε", "&zeta;": "ζ", "&eta;": "η", "&theta;": "θ",
    "&iota;": "ι", "&kappa;": "κ", "&lambda;": "λ", "&mu;": "μ",
    "&nu;": "ν", "&xi;": "ξ", "&omicron;": "ο", "&pi;": "π",
    "&rho;": "ρ", "&sigmaf;": "ς", "&sigma;": "σ", "&tau;": "τ",
    "&upsilon;": "υ", "&phi;": "φ", "&chi;": "χ", "&psi;": "ψ",
    # Miscellaneous entities
    "&omega;": "ω", "&nbsp;": " ", "&amp;": "&", "&acute;": "´",
    "&uml;": "¨",
}


def slugify(text, max_length=None):
    """Turn *text* into a lowercase, hyphen-separated slug.

    Combining marks (accents) are dropped, every run of non-word characters
    or underscores becomes a single hyphen, and leading/trailing separators
    are removed. A falsy *text* yields an empty string.

    When *max_length* is truthy the slug is cut to that many characters and
    any hyphen left dangling at the cut is trimmed.
    """
    decomposed = unicodedata.normalize("NFD", (text or "").strip().lower())
    # Category "Mn" = nonspacing marks, i.e. the accents split off by NFD.
    unaccented = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    trimmed = re.sub(r"^[\W_]+|[\W_]+$", "", unaccented)
    hyphenated = re.sub(r"[\W_]+", "-", trimmed)
    slug = re.sub(r"-+", "-", hyphenated).strip("-")
    if not max_length:
        return slug
    return slug[:max_length].rstrip("-")


def decode_greek_text(text):
    """Decode HTML entities in *text* and normalise its whitespace.

    Steps, in order:
      1. Un-escape ``&amp;`` to ``&`` so that double-encoded entities
         (``&amp;alpha;``) become ordinary ones (``&alpha;``).
      2. Replace every key of ``GREEK_ENTITIES`` with its mapped character.
      3. Strip any remaining alphabetic named entity (``&name;``).
      4. Collapse whitespace runs to one space and trim the ends.

    A falsy *text* (``None`` or ``""``) yields an empty string.
    """
    # Replacing "&" with "&" would be a no-op; the escaped form "&amp;" is
    # what has to be unwound here.
    decoded = (text or "").replace("&amp;", "&")
    for entity, char in GREEK_ENTITIES.items():
        decoded = decoded.replace(entity, char)
    # Any named entity not covered by the table is dropped, not decoded.
    decoded = re.sub(r"&[a-zA-Z]+;", "", decoded)
    return re.sub(r"\s+", " ", decoded).strip()


# Upper bound on the length of a sanitized description.
MAX_DESC_LEN = 10_000

# Ordered (pattern, replacement) cleanup passes applied by
# sanitize_description().
_DESC_CLEANUP_STEPS = (
    # Inline base64 data URIs.
    (re.compile(r"data:[a-zA-Z0-9+/.\-]+;base64,[A-Za-z0-9+/=\s]+"), ""),
    # Empty src attributes (e.g. what remains once a data URI is removed).
    (re.compile(r"src\s*=\s*[\"']\s*[\"']"), ""),
    # Runs of two or more whitespace characters.
    (re.compile(r"\s{2,}"), " "),
)


def sanitize_description(html):
    """Strip embedded base64 payloads from *html* and cap its length.

    Removes inline ``data:...;base64,...`` URIs and empty ``src``
    attributes, squeezes whitespace runs down to one space, trims the ends
    and returns at most ``MAX_DESC_LEN`` characters.
    """
    text = html
    for pattern, replacement in _DESC_CLEANUP_STEPS:
        text = pattern.sub(replacement, text)
    return text.strip()[:MAX_DESC_LEN]


# -----------------------------------------------------------------------------
# Source
# -----------------------------------------------------------------------------

# Base endpoint of the JSON feed; _fetch_page() appends the query string.
FEED_URL = "https://superhome.com.cy/dwapi/Feeds/GetFeedOutput"
PAGE_SIZE = 100  # Superhome's API accepts large page sizes; bigger = fewer round-trips
# Sent as the User-Agent header on every feed request.
USER_AGENT = "Mozilla/5.0 (compatible; OctocomSync/1.0)"

# URL-encoded Greek "προιοντα" path segment, used when constructing
# product URLs back into the storefront.
GREEK_PATH = "%CF%80%CF%81%CE%BF%CE%B9%CE%BF%CE%BD%CF%84%CE%B1"


def _fetch_page(page_num):
    """GET one page of the JSON feed and return its list of raw products.

    Tries direct urllib first. On any network-level failure falls back to
    `fetch_via_proxy()`, the runtime-provided helper that routes
    through Octocom's HTTP proxy IP. Use the proxy when the origin
    blocks the Azure sandbox IP — symptoms include `HTTPError 403/429`
    and unexplained `urlopen` timeouts.

    Runtime helper signature:
        fetch_via_proxy(url, *, headers=None, timeout=60,
                        method='GET', data=None) -> bytes

    Args:
        page_num: Page number sent to the feed as ``PageNum``.

    Returns:
        The value of the response's ``Products`` key, or an empty list when
        that key is missing or null.

    Raises:
        Whatever `fetch_via_proxy()` raises if the fallback also fails, and
        ``json.JSONDecodeError`` if the body is not valid JSON.
    """
    params = {
        "Id": 6,
        "languageId": "LANG17",
        "currencyId": "EUR",
        "shopId": "SHOP1",
        "LoadVariantInfoOnVariants": "false",
        "PageSize": PAGE_SIZE,
        "PageNum": page_num,
    }
    url = f"{FEED_URL}?{urllib.parse.urlencode(params)}"
    headers = {"User-Agent": USER_AGENT, "Accept": "application/json"}
    request = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            body = response.read()
    except OSError:
        # URLError (and its subclass HTTPError) derive from OSError, but a
        # timeout or connection reset that happens while *reading* the body
        # surfaces as a plain OSError subclass that is not a URLError.
        # Catching the common base makes the documented timeout fallback
        # actually trigger.
        body = fetch_via_proxy(  # noqa: F821 - runtime-provided
            url, headers=headers, timeout=60
        )
    # `or []` also covers a feed that sends `"Products": null`.
    return json.loads(body).get("Products") or []


# -----------------------------------------------------------------------------
# Contract
# -----------------------------------------------------------------------------


def fetch_next_chunk(state):
    """Fetch the next page of raw products and advance the page cursor.

    The cursor lives in ``state['parser']['page']``; the ``parser`` dict is
    created on first use and the page defaults to 1. Returns whatever
    ``_fetch_page`` returns for the current page.
    """
    cursor = state.setdefault("parser", {})
    current_page = cursor.get("page", 1)
    chunk = _fetch_page(current_page)
    # Bumped only after the fetch returned, so an exception leaves the
    # cursor pointing at the page that still needs fetching.
    cursor["page"] = current_page + 1
    return chunk


def _clean_description(raw_text):
    """Decode entities in *raw_text* and sanitize the result ('' if falsy)."""
    # Decoded twice on purpose — presumably the feed can carry entities that
    # are escaped more than once.
    return sanitize_description(
        decode_greek_text(decode_greek_text(raw_text or ""))
    )


def _collect_images(raw, alt_text):
    """Build the image list: default image first (primary), then extras.

    Entries without a usable URL are skipped so that no image with an empty
    ``url`` is emitted.
    """
    images = []
    default_image = raw.get("DefaultImage")
    default_url = (
        default_image.get("Value") if isinstance(default_image, dict) else None
    )
    if default_url:
        images.append(
            {"url": default_url, "altText": alt_text, "isPrimary": True}
        )
    for extra in raw.get("ImagePatternImages") or []:
        extra_url = extra.get("Value") if isinstance(extra, dict) else None
        if extra_url:
            images.append({"url": extra_url, "altText": alt_text})
    return images


def _collect_metafields(raw):
    """Build metafields from ``ProductFields`` plus an optional weight."""
    metafields = []
    for field in (raw.get("ProductFields") or {}).values():
        if not isinstance(field, dict):
            continue  # tolerate null / malformed field entries
        label = field.get("Name")
        if label:
            metafields.append(
                {
                    "key": label,
                    "value": json.dumps(
                        field.get("Value"), ensure_ascii=False
                    ),
                }
            )
    weight = raw.get("Weight")
    if weight is not None:
        metafields.append({"key": "weight", "value": str(weight)})
    return metafields


def _build_urls(pid, groups):
    """Build storefront URLs: canonical el/en, then per-group el, then en."""
    group_ids = [
        g["Id"] for g in groups if isinstance(g, dict) and g.get("Id")
    ]
    urls = [
        {
            "url": f"https://superhome.com.cy/greek/{GREEK_PATH}/{pid}",
            "language": "el",
            "salesChannel": "web",
        },
        {
            "url": f"https://superhome.com.cy/english/products/{pid}",
            "language": "en",
            "salesChannel": "web",
        },
    ]
    urls.extend(
        {
            "url": f"https://superhome.com.cy/greek/{GREEK_PATH}?GroupId={gid}&ProductId={pid}",
            "language": "el",
            "salesChannel": "web",
        }
        for gid in group_ids
    )
    urls.extend(
        {
            "url": f"https://superhome.com.cy/english/products?GroupId={gid}&ProductId={pid}",
            "language": "en",
            "salesChannel": "web",
        }
        for gid in group_ids
    )
    return urls


def map_one(raw, context):
    """Map one raw feed product to the sync product dict.

    Args:
        raw: One product dict as returned by the feed.
        context: Part of the parser contract; not used by this parser.

    Returns:
        ``None`` when the product is not ``Active`` or has no ``Id``;
        otherwise a dict with name, slug, uniqueId, descriptions, urls,
        collections, a single "default" variant, images, metafields and the
        JSON-serialized raw product.
    """
    if not raw.get("Active"):
        return None
    pid = raw.get("Id")
    if not pid:
        return None
    name = raw.get("Name") or ""

    groups = raw.get("Groups") or []
    short_desc = _clean_description(raw.get("ShortDescription"))
    long_desc = _clean_description(raw.get("LongDescription"))

    # Price comes wrapped: {"Price": 12.34}
    price_obj = raw.get("Price") or {}
    price = price_obj.get("Price") if isinstance(price_obj, dict) else None

    stock_level = raw.get("StockLevel")
    try:
        stock_quantity = int(stock_level) if stock_level is not None else None
    except (ValueError, TypeError):
        stock_quantity = None

    # "Σε απόθεμα" is Greek for "In stock".
    in_stock = raw.get("StockStatus") == "Σε απόθεμα"

    collections = [
        {"slug": slugify(g["Name"]), "name": g["Name"]}
        for path in (raw.get("GroupPaths") or [])
        for g in (path or [])
        if isinstance(g, dict) and g.get("Name")
    ]

    return {
        "name": name,
        "slug": slugify(name) + "-" + str(pid),
        "uniqueId": str(pid),
        "shortDescription": short_desc or None,
        "longDescription": long_desc or None,
        "url": _build_urls(pid, groups),
        "collections": collections,
        "variants": [
            {
                "title": "default",
                "price": {
                    "currentPrice": price,
                    "onSale": False,
                },
                "stockQuantity": stock_quantity,
                "inStock": in_stock,
            }
        ],
        "images": _collect_images(raw, name),
        "metafields": _collect_metafields(raw),
        "rawProduct": json.dumps(raw, ensure_ascii=False),
    }

On this page