From 2801757e342c2b9feb4c3ea0155217feebe3c420 Mon Sep 17 00:00:00 2001 From: isky Date: Sun, 17 May 2026 16:47:08 +0800 Subject: [PATCH] fix: parse __NEXT_DATA__ JSON for notification slugs Co-Authored-By: Claude Sonnet 4.6 --- check_zipair.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/check_zipair.py b/check_zipair.py index 63df00d..d2b17d5 100644 --- a/check_zipair.py +++ b/check_zipair.py @@ -81,17 +81,31 @@ def get_notification_slugs() -> list[str]: if not html: return [] - # Match /en/notification/some-slug or /en/notification/123 - # Try both double and single quotes, and also JSON-style escaped URLs - slugs = re.findall(r'["\'](/(?:en|ja|ko|th|zh-tw|zh-cn)/notification/([^"\'?#/]+))["\']', html) + # Strategy 1: parse __NEXT_DATA__ JSON (Next.js SSR) + paths = [] + m = re.search(r'', html, re.DOTALL) + if m: + try: + nd = json.loads(m.group(1)) + # flatten all string values and grep for /notification/ paths + raw_json = json.dumps(nd) + paths = re.findall(r'(?:\\?/(?:en|ja|ko|th|zh-tw|zh-cn)\\?/notification\\?/([^"\\/?#]+))', raw_json) + except Exception as e: + print(f" __NEXT_DATA__ parse error: {e}", file=sys.stderr) - # dedupe by slug + # Strategy 2: any href / quoted path in raw HTML + if not paths: + paths = [s for _, s in re.findall( + r'["\'](/(?:en|ja|ko|th|zh-tw|zh-cn)/notification/([^"\'?#/]+))["\']', html + )] + + # dedupe, preserving order seen = set() result = [] - for path, slug in slugs: + for slug in paths: if slug and slug not in seen: seen.add(slug) - result.append((slug, ZIPAIR_NOTIF_BASE + path)) + result.append((slug, f"{ZIPAIR_NOTIF_BASE}/en/notification/{slug}")) print(f" Found {len(result)} notification(s) on listing page.") return result # list of (slug, full_url)