fix: ntfy URL fallback, drop Sec-Fetch headers, probe IDs when sitemap blocked

- Use `or` so an empty NTFY_URL env var falls back to the hardcoded default
- Remove Sec-Fetch-* headers that can trigger Cloudflare bot detection
- Try sitemap_index.xml as a second sitemap attempt
- When both sitemaps are blocked, probe notification IDs sequentially above
  last_seen as a last resort

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-17 16:12:20 +08:00
parent ec71d6a504
commit e9d7fdf7cc
1 changed files with 60 additions and 41 deletions
--- a/check_zipair.py
+++ b/check_zipair.py
@@ -17,13 +17,16 @@ import urllib.error
 from datetime import datetime

 # ── Config (set via environment variables / Gitea secrets) ──────────────────
-NTFY_URL    = os.environ.get("NTFY_URL", "https://ntfy.isky-homelab.com/zipair")
+NTFY_URL    = os.environ.get("NTFY_URL") or "https://ntfy.isky-homelab.com/zipair"
 NTFY_TOKEN  = os.environ.get("NTFY_TOKEN", "")   # optional, if your ntfy requires auth
 STATE_FILE  = os.environ.get("STATE_FILE", "last_seen.txt")

-ZIPAIR_SITEMAP   = "https://www.zipair.net/sitemap.xml"
-ZIPAIR_NOTIF_LIST = "https://www.zipair.net/en/notification"
-ZIPAIR_NOTIF     = "https://www.zipair.net/en/notification/{id}"
+ZIPAIR_SITEMAP       = "https://www.zipair.net/sitemap.xml"
+ZIPAIR_SITEMAP_INDEX = "https://www.zipair.net/sitemap_index.xml"
+ZIPAIR_NOTIF         = "https://www.zipair.net/en/notification/{id}"
+
+# How many IDs above last_seen to probe when sitemap is unavailable
+PROBE_AHEAD = 20

 # Keywords that must ALL appear (case-insensitive) in a notification page
 # to trigger an alert. Tune these as needed.
@@ -40,10 +43,6 @@ HEADERS = {
    "Accept-Language": "en-US,en;q=0.9,ja;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection":      "keep-alive",
-    "Referer":         "https://www.zipair.net/",
-    "Sec-Fetch-Dest":  "document",
-    "Sec-Fetch-Mode":  "navigate",
-    "Sec-Fetch-Site":  "same-origin",
    "Upgrade-Insecure-Requests": "1",
 }

@@ -55,7 +54,6 @@ def fetch(url: str, timeout: int = 15) -> str:
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
-            # handle gzip transparently (urlopen usually does, but just in case)
            try:
                import gzip
                return gzip.decompress(raw).decode("utf-8", errors="replace")
@@ -70,26 +68,42 @@ def fetch(url: str, timeout: int = 15) -> str:


 def get_notification_ids_from_sitemap() -> list[int]:
-    """Parse the ZIPAIR sitemap and return all notification IDs found."""
-    print("Fetching sitemap …")
-    xml = fetch(ZIPAIR_SITEMAP)
-    if xml:
-        ids = [int(m) for m in re.findall(r"/notification/(\d+)", xml)]
-        if ids:
-            ids = sorted(set(ids))
-            print(f"  Found {len(ids)} notification IDs in sitemap (max={ids[-1]})")
-            return ids
+    """Try sitemap.xml then sitemap_index.xml; return sorted notification IDs."""
+    for sitemap_url in (ZIPAIR_SITEMAP, ZIPAIR_SITEMAP_INDEX):
+        print(f"Fetching {sitemap_url} …")
+        xml = fetch(sitemap_url)
+        if xml:
+            # sitemap index may reference sub-sitemaps — fetch those too
+            sub_sitemaps = re.findall(r"<loc>(https?://[^<]*sitemap[^<]*)</loc>", xml)
+            for sub in sub_sitemaps:
+                if sub not in (ZIPAIR_SITEMAP, ZIPAIR_SITEMAP_INDEX):
+                    print(f"  Fetching sub-sitemap {sub} …")
+                    xml += fetch(sub)
+                    time.sleep(0.5)
+            ids = [int(m) for m in re.findall(r"/notification/(\d+)", xml)]
+            if ids:
+                ids = sorted(set(ids))
+                print(f"  Found {len(ids)} notification IDs (max={ids[-1]})")
+                return ids

-    # Fallback: scrape the notification listing page
-    print("  Sitemap unavailable, trying notification listing page …")
-    html = fetch(ZIPAIR_NOTIF_LIST)
-    if not html:
-        print("  Notification listing page also unavailable.", file=sys.stderr)
-        return []
-    ids = [int(m) for m in re.findall(r"/notification/(\d+)", html)]
-    ids = sorted(set(ids))
-    print(f"  Found {len(ids)} notification IDs from listing page (max={ids[-1] if ids else 'n/a'})")
-    return ids
+    return []
+
+
+def probe_for_new_ids(last_seen: int) -> list[int]:
+    """When sitemap is unavailable, probe notification pages above last_seen."""
+    print(f"  Sitemap unavailable — probing IDs {last_seen+1} to {last_seen+PROBE_AHEAD} …")
+    found = []
+    for nid in range(last_seen + 1, last_seen + PROBE_AHEAD + 1):
+        url = ZIPAIR_NOTIF.format(id=nid)
+        html = fetch(url)
+        time.sleep(0.5)
+        if html and f"/notification/{nid}" in html:
+            print(f"    ID {nid} exists.")
+            found.append(nid)
+        else:
+            print(f"    ID {nid} not found, stopping probe.")
+            break
+    return found


 def read_last_seen() -> int:
@@ -136,7 +150,6 @@ def send_ntfy(notif_id: int, snippet: str):
        }],
    }).encode()

-    # Build the POST request to the ntfy server base URL
    base_url = NTFY_URL.rstrip("/").rsplit("/", 1)[0]
    req = urllib.request.Request(
        f"{base_url}/",
@@ -166,15 +179,19 @@ def main():
    print(f"  Last seen notification ID: {last_seen}")

    ids = get_notification_ids_from_sitemap()
-    if not ids:
-        print("No notification IDs found; exiting.")
-        sys.exit(0)

-    new_ids = [i for i in ids if i > last_seen]
-    if not new_ids:
-        print("No new notifications since last check. All good.")
-        write_last_seen(max(ids))
-        sys.exit(0)
+    if not ids:
+        # Sitemap completely blocked — probe directly
+        new_ids = probe_for_new_ids(last_seen)
+        if not new_ids:
+            print("No new notifications found via probe either.")
+            sys.exit(0)
+    else:
+        new_ids = [i for i in ids if i > last_seen]
+        if not new_ids:
+            print("No new notifications since last check. All good.")
+            write_last_seen(max(ids))
+            sys.exit(0)

    print(f"  {len(new_ids)} new notification(s) to check: {new_ids}")
    found_match = None
@@ -183,7 +200,7 @@ def main():
        url = ZIPAIR_NOTIF.format(id=nid)
        print(f"  Fetching notification #{nid} …")
        text = fetch(url)
-        time.sleep(1)  # be polite
+        time.sleep(1)

        if not text:
            print(f"    Could not fetch #{nid}, skipping.")
@@ -191,7 +208,6 @@ def main():

        if matches_keywords(text):
            print(f"  ✅ MATCH in notification #{nid}!")
-            # Grab a short snippet for context
            lower = text.lower()
            pos = lower.find("singapore")
            snippet = text[max(0, pos - 50): pos + 200].strip()
@@ -200,8 +216,11 @@ def main():
        else:
            print(f"    No match in #{nid}.")

-    # Always advance the state to the latest ID we've seen
-    write_last_seen(max(ids))
+    # Advance state to highest ID we've confirmed exists
+    if ids:
+        write_last_seen(max(ids))
+    elif new_ids:
+        write_last_seen(max(new_ids))

    if found_match:
        nid, snippet = found_match