From e9d7fdf7cc2c0f75b1f4167fcf936bee16f11354 Mon Sep 17 00:00:00 2001
From: isky
Date: Sun, 17 May 2026 16:12:20 +0800
Subject: [PATCH] fix: ntfy URL fallback, drop Sec-Fetch headers, probe IDs when sitemap blocked

- Use `or` so empty NTFY_URL env var falls back to hardcoded default
- Remove Sec-Fetch-* headers that can trigger Cloudflare bot detection
- Try sitemap_index.xml as second sitemap attempt
- When both sitemaps are blocked, probe notification IDs sequentially above last_seen as a last resort

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (placed after the three-dash line, so they are not part of
the commit message):

* Formatting: this copy of the patch had lost its line breaks and runs
  of spaces. The layout below was rebuilt from the hunk headers and the
  Python syntax; every hunk's line counts and the 60/41 diffstat agree
  with it. Indentation and the column alignment of the constants are
  reconstructed, not original. NOTE(review): compare against the real
  commit before applying.
* Scope: the HEADERS hunk also drops the "Referer" header, and the
  get_notification_ids_from_sitemap() hunk removes the listing-page
  fallback together with ZIPAIR_NOTIF_LIST. Neither removal is listed
  in the commit message above.
* probe_for_new_ids() breaks out of its loop at the first ID whose page
  is empty or does not contain "/notification/<id>". PROBE_AHEAD is
  therefore only an upper bound, and one missing or unfetchable ID ends
  the probe. NOTE(review): confirm that stopping at the first gap is
  intended.
* The existence test in probe_for_new_ids() is a plain substring check,
  so "/notification/12" is also found inside "/notification/123".
  NOTE(review): consider requiring a non-digit after the ID.
* As shown in this copy, the sub-sitemap regex is not anchored to a
  <loc> element. If the real file reads the same way, it would also
  match the sitemaps.org xmlns URL. Angle-bracketed text appears to
  have been stripped from this copy, so this may be an artifact.
  TODO confirm.
* HEADERS still advertises "br" in Accept-Encoding, while the part of
  fetch() visible in this patch only attempts gzip.decompress().
  NOTE(review): confirm that brotli-encoded responses are handled.

 check_zipair.py | 101 ++++++++++++++++++++++++++++--------------------
 1 file changed, 60 insertions(+), 41 deletions(-)

diff --git a/check_zipair.py b/check_zipair.py
index 019c748..1c756fb 100644
--- a/check_zipair.py
+++ b/check_zipair.py
@@ -17,13 +17,16 @@ import urllib.error
 from datetime import datetime

 # ── Config (set via environment variables / Gitea secrets) ──────────────────
-NTFY_URL   = os.environ.get("NTFY_URL", "https://ntfy.isky-homelab.com/zipair")
+NTFY_URL   = os.environ.get("NTFY_URL") or "https://ntfy.isky-homelab.com/zipair"
 NTFY_TOKEN = os.environ.get("NTFY_TOKEN", "")  # optional, if your ntfy requires auth
 STATE_FILE = os.environ.get("STATE_FILE", "last_seen.txt")

-ZIPAIR_SITEMAP    = "https://www.zipair.net/sitemap.xml"
-ZIPAIR_NOTIF_LIST = "https://www.zipair.net/en/notification"
-ZIPAIR_NOTIF      = "https://www.zipair.net/en/notification/{id}"
+ZIPAIR_SITEMAP       = "https://www.zipair.net/sitemap.xml"
+ZIPAIR_SITEMAP_INDEX = "https://www.zipair.net/sitemap_index.xml"
+ZIPAIR_NOTIF         = "https://www.zipair.net/en/notification/{id}"
+
+# How many IDs above last_seen to probe when sitemap is unavailable
+PROBE_AHEAD = 20

 # Keywords that must ALL appear (case-insensitive) in a notification page
 # to trigger an alert. Tune these as needed.
@@ -40,10 +43,6 @@ HEADERS = {
     "Accept-Language": "en-US,en;q=0.9,ja;q=0.8",
     "Accept-Encoding": "gzip, deflate, br",
     "Connection": "keep-alive",
-    "Referer": "https://www.zipair.net/",
-    "Sec-Fetch-Dest": "document",
-    "Sec-Fetch-Mode": "navigate",
-    "Sec-Fetch-Site": "same-origin",
     "Upgrade-Insecure-Requests": "1",
 }

@@ -55,7 +54,6 @@ def fetch(url: str, timeout: int = 15) -> str:
     try:
         with urllib.request.urlopen(req, timeout=timeout) as resp:
             raw = resp.read()
-            # handle gzip transparently (urlopen usually does, but just in case)
             try:
                 import gzip
                 return gzip.decompress(raw).decode("utf-8", errors="replace")
@@ -70,26 +68,42 @@ def fetch(url: str, timeout: int = 15) -> str:


 def get_notification_ids_from_sitemap() -> list[int]:
-    """Parse the ZIPAIR sitemap and return all notification IDs found."""
-    print("Fetching sitemap …")
-    xml = fetch(ZIPAIR_SITEMAP)
-    if xml:
-        ids = [int(m) for m in re.findall(r"/notification/(\d+)", xml)]
-        if ids:
-            ids = sorted(set(ids))
-            print(f" Found {len(ids)} notification IDs in sitemap (max={ids[-1]})")
-            return ids
+    """Try sitemap.xml then sitemap_index.xml; return sorted notification IDs."""
+    for sitemap_url in (ZIPAIR_SITEMAP, ZIPAIR_SITEMAP_INDEX):
+        print(f"Fetching {sitemap_url} …")
+        xml = fetch(sitemap_url)
+        if xml:
+            # sitemap index may reference sub-sitemaps — fetch those too
+            sub_sitemaps = re.findall(r"(https?://[^<]*sitemap[^<]*)", xml)
+            for sub in sub_sitemaps:
+                if sub not in (ZIPAIR_SITEMAP, ZIPAIR_SITEMAP_INDEX):
+                    print(f" Fetching sub-sitemap {sub} …")
+                    xml += fetch(sub)
+                    time.sleep(0.5)
+            ids = [int(m) for m in re.findall(r"/notification/(\d+)", xml)]
+            if ids:
+                ids = sorted(set(ids))
+                print(f" Found {len(ids)} notification IDs (max={ids[-1]})")
+                return ids

-    # Fallback: scrape the notification listing page
-    print(" Sitemap unavailable, trying notification listing page …")
-    html = fetch(ZIPAIR_NOTIF_LIST)
-    if not html:
-        print(" Notification listing page also unavailable.", file=sys.stderr)
-        return []
-    ids = [int(m) for m in re.findall(r"/notification/(\d+)", html)]
-    ids = sorted(set(ids))
-    print(f" Found {len(ids)} notification IDs from listing page (max={ids[-1] if ids else 'n/a'})")
-    return ids
+    return []
+
+
+def probe_for_new_ids(last_seen: int) -> list[int]:
+    """When sitemap is unavailable, probe notification pages above last_seen."""
+    print(f" Sitemap unavailable — probing IDs {last_seen+1} to {last_seen+PROBE_AHEAD} …")
+    found = []
+    for nid in range(last_seen + 1, last_seen + PROBE_AHEAD + 1):
+        url = ZIPAIR_NOTIF.format(id=nid)
+        html = fetch(url)
+        time.sleep(0.5)
+        if html and f"/notification/{nid}" in html:
+            print(f" ID {nid} exists.")
+            found.append(nid)
+        else:
+            print(f" ID {nid} not found, stopping probe.")
+            break
+    return found


 def read_last_seen() -> int:
@@ -136,7 +150,6 @@ def send_ntfy(notif_id: int, snippet: str):
         }],
     }).encode()

-    # Build the POST request to the ntfy server base URL
     base_url = NTFY_URL.rstrip("/").rsplit("/", 1)[0]
     req = urllib.request.Request(
         f"{base_url}/",
@@ -166,15 +179,19 @@ def main():
     print(f" Last seen notification ID: {last_seen}")

     ids = get_notification_ids_from_sitemap()
-    if not ids:
-        print("No notification IDs found; exiting.")
-        sys.exit(0)

-    new_ids = [i for i in ids if i > last_seen]
-    if not new_ids:
-        print("No new notifications since last check. All good.")
-        write_last_seen(max(ids))
-        sys.exit(0)
+    if not ids:
+        # Sitemap completely blocked — probe directly
+        new_ids = probe_for_new_ids(last_seen)
+        if not new_ids:
+            print("No new notifications found via probe either.")
+            sys.exit(0)
+    else:
+        new_ids = [i for i in ids if i > last_seen]
+        if not new_ids:
+            print("No new notifications since last check. All good.")
+            write_last_seen(max(ids))
+            sys.exit(0)

     print(f" {len(new_ids)} new notification(s) to check: {new_ids}")
     found_match = None
@@ -183,7 +200,7 @@ def main():
         url = ZIPAIR_NOTIF.format(id=nid)
         print(f" Fetching notification #{nid} …")
         text = fetch(url)
-        time.sleep(1)  # be polite
+        time.sleep(1)

         if not text:
             print(f" Could not fetch #{nid}, skipping.")
@@ -191,7 +208,6 @@ def main():

         if matches_keywords(text):
             print(f" ✅ MATCH in notification #{nid}!")
-            # Grab a short snippet for context
             lower = text.lower()
             pos = lower.find("singapore")
             snippet = text[max(0, pos - 50): pos + 200].strip()
@@ -200,8 +216,11 @@ def main():
         else:
             print(f" No match in #{nid}.")

-    # Always advance the state to the latest ID we've seen
-    write_last_seen(max(ids))
+    # Advance state to highest ID we've confirmed exists
+    if ids:
+        write_last_seen(max(ids))
+    elif new_ids:
+        write_last_seen(max(new_ids))

     if found_match:
         nid, snippet = found_match