fix: parse __NEXT_DATA__ JSON for notification slugs
Some checks are pending
ZIPAIR Singapore Sale Monitor / check (push) Has started running

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-17 16:47:08 +08:00
parent 408de08743
commit 2801757e34

View File

@@ -81,17 +81,31 @@ def get_notification_slugs() -> list[str]:
if not html: if not html:
return [] return []
# Match /en/notification/some-slug or /en/notification/123 # Strategy 1: parse __NEXT_DATA__ JSON (Next.js SSR)
# Try both double and single quotes, and also JSON-style escaped URLs paths = []
slugs = re.findall(r'["\'](/(?:en|ja|ko|th|zh-tw|zh-cn)/notification/([^"\'?#/]+))["\']', html) m = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
if m:
try:
nd = json.loads(m.group(1))
# flatten all string values and grep for /notification/ paths
raw_json = json.dumps(nd)
paths = re.findall(r'(?:\\?/(?:en|ja|ko|th|zh-tw|zh-cn)\\?/notification\\?/([^"\\/?#]+))', raw_json)
except Exception as e:
print(f" __NEXT_DATA__ parse error: {e}", file=sys.stderr)
# dedupe by slug # Strategy 2: any href / quoted path in raw HTML
if not paths:
paths = [s for _, s in re.findall(
r'["\'](/(?:en|ja|ko|th|zh-tw|zh-cn)/notification/([^"\'?#/]+))["\']', html
)]
# dedupe, preserving order
seen = set() seen = set()
result = [] result = []
for path, slug in slugs: for slug in paths:
if slug and slug not in seen: if slug and slug not in seen:
seen.add(slug) seen.add(slug)
result.append((slug, ZIPAIR_NOTIF_BASE + path)) result.append((slug, f"{ZIPAIR_NOTIF_BASE}/en/notification/{slug}"))
print(f" Found {len(result)} notification(s) on listing page.") print(f" Found {len(result)} notification(s) on listing page.")
return result # list of (slug, full_url) return result # list of (slug, full_url)