fix: parse __NEXT_DATA__ JSON for notification slugs
Some checks are pending
ZIPAIR Singapore Sale Monitor / check (push) Has started running
Some checks are pending
ZIPAIR Singapore Sale Monitor / check (push) Has started running
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -81,17 +81,31 @@ def get_notification_slugs() -> list[str]:
|
|||||||
if not html:
|
if not html:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Match /en/notification/some-slug or /en/notification/123
|
# Strategy 1: parse __NEXT_DATA__ JSON (Next.js SSR)
|
||||||
# Try both double and single quotes, and also JSON-style escaped URLs
|
paths = []
|
||||||
slugs = re.findall(r'["\'](/(?:en|ja|ko|th|zh-tw|zh-cn)/notification/([^"\'?#/]+))["\']', html)
|
m = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
|
||||||
|
if m:
|
||||||
|
try:
|
||||||
|
nd = json.loads(m.group(1))
|
||||||
|
# re-serialize the parsed JSON into one string and regex-scan it for /notification/ paths
|
||||||
|
raw_json = json.dumps(nd)
|
||||||
|
paths = re.findall(r'(?:\\?/(?:en|ja|ko|th|zh-tw|zh-cn)\\?/notification\\?/([^"\\/?#]+))', raw_json)
|
||||||
|
except Exception as e:
|
||||||
|
print(f" __NEXT_DATA__ parse error: {e}", file=sys.stderr)
|
||||||
|
|
||||||
# dedupe by slug
|
# Strategy 2: any href / quoted path in raw HTML
|
||||||
|
if not paths:
|
||||||
|
paths = [s for _, s in re.findall(
|
||||||
|
r'["\'](/(?:en|ja|ko|th|zh-tw|zh-cn)/notification/([^"\'?#/]+))["\']', html
|
||||||
|
)]
|
||||||
|
|
||||||
|
# dedupe, preserving order
|
||||||
seen = set()
|
seen = set()
|
||||||
result = []
|
result = []
|
||||||
for path, slug in slugs:
|
for slug in paths:
|
||||||
if slug and slug not in seen:
|
if slug and slug not in seen:
|
||||||
seen.add(slug)
|
seen.add(slug)
|
||||||
result.append((slug, ZIPAIR_NOTIF_BASE + path))
|
result.append((slug, f"{ZIPAIR_NOTIF_BASE}/en/notification/{slug}"))
|
||||||
print(f" Found {len(result)} notification(s) on listing page.")
|
print(f" Found {len(result)} notification(s) on listing page.")
|
||||||
return result # list of (slug, full_url)
|
return result # list of (slug, full_url)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user