#!/usr/bin/env python3
"""
VURSYL feed pipeline

Generates:
  data/news.json          — positive news archive (append-only)
  data/media.json         — video/podcast archive (append-only)
  data/reality-check.json — auto-generated Reality Check entries

Env vars required:
  NEWSAPI_KEY       - newsapi.org key
  ANTHROPIC_API_KEY - Claude API key
  YOUTUBE_API_KEY   - YouTube Data API v3 key

Run: python scripts/update_feed.py
"""
import hashlib
import json
import os
import time
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta, timezone


def _env(name):
    """Return the required environment variable *name*.

    Executed as a script, a missing variable aborts the run immediately.
    When the module is only imported (e.g. by a test), a missing variable
    yields "" so that the pure helpers below remain importable.
    """
    value = os.environ.get(name)
    if value is None:
        if __name__ == "__main__":
            raise SystemExit(f"missing required environment variable: {name}")
        return ""
    return value


NEWSAPI_KEY = _env("NEWSAPI_KEY")
ANTHROPIC_KEY = _env("ANTHROPIC_API_KEY")
YOUTUBE_KEY = _env("YOUTUBE_API_KEY")

OUT_NEWS = "data/news.json"
OUT_MEDIA = "data/media.json"
OUT_RC = "data/reality-check.json"

MAX_NEWS_ITEMS = 5000
MAX_MEDIA_ITEMS = 2000
MAX_RC_ITEMS = 50
LOOKBACK_HOURS = 48

MODEL_FILTER = "claude-haiku-4-5-20251001"
MODEL_RC = "claude-sonnet-4-6"

CATEGORIES = ["ai", "quantum", "health", "robotics", "compute", "breakthroughs"]

RC_TAGS = {
    "energy": ["energy", "water", "electricity", "environment", "carbon", "emissions", "power"],
    "jobs": ["jobs", "employment", "unemployment", "workers", "labor", "layoffs", "replace"],
    "safety": ["dangerous", "risk", "alignment", "AGI", "existential", "regulation", "ban", "halt"],
    "health": ["medical", "hospital", "diagnosis", "FDA", "clinical", "patient", "drug"],
    "quantum": ["quantum", "encryption", "cryptography", "qubits", "supremacy"],
    "privacy": ["privacy", "data", "surveillance", "personal", "tracking", "GDPR"],
}

# ---------------- SOURCES ----------------
NEWSAPI_QUERIES = {
    "ai": '"artificial intelligence" OR "large language model" OR LLM OR "AI model" OR "AI breakthrough"',
    "quantum": '"quantum computing" OR "quantum computer" OR "quantum chip" OR "error correction" qubit',
    "health": '"AI" AND (drug OR diagnosis OR cancer OR FDA OR clinical OR longevity OR CRISPR OR biotech)',
    "robotics": 'humanoid robot OR robotics OR SpaceX OR "space technology" OR autonomous',
    "compute": 'semiconductor OR "data center" OR GPU OR NVIDIA OR "chip breakthrough" OR supercomputer',
}

RC_QUERIES = [
    '"AI" AND (dangerous OR risk OR threat OR ban OR regulation OR harm OR replace OR jobs OR water OR energy)',
    '"artificial intelligence" AND (concern OR warning OR fear OR problem OR dangerous OR crisis)',
    '"data center" AND (water OR energy OR environment OR electricity OR bills)',
    '"quantum" AND (encryption OR security OR threat OR break OR hack)',
]

RSS_FEEDS = [
    ("OpenAI", "https://openai.com/news/rss.xml"),
    ("Anthropic", "https://www.anthropic.com/news/rss"),
    ("Google DeepMind", "https://deepmind.google/blog/rss.xml"),
    ("NVIDIA", "https://blogs.nvidia.com/feed/"),
    ("Microsoft Research", "https://www.microsoft.com/en-us/research/feed/"),
    ("MIT News AI", "https://news.mit.edu/topic/mitartificial-intelligence2-rss.xml"),
    ("IBM Research", "https://research.ibm.com/blog/feed.rss"),
    ("NIH News", "https://www.nih.gov/news-events/news-releases/feed.xml"),
]

YOUTUBE_CHANNELS = [
    "TwoMinutePapers", "lexfridman", "DwarkeshPatel", "PeterDiamandis",
    "a16z", "ycombinator", "anthropic-ai", "OpenAI", "GoogleDeepMind",
    "NVIDIA", "qiskit", "TED",
]


# ---------------- HTTP HELPERS ----------------
def http_get(url, headers=None, timeout=30):
    """GET *url* and return the body decoded as UTF-8 (bad bytes replaced).

    When *headers* is empty or None a default ``User-Agent`` is sent.
    Network and HTTP errors from ``urllib`` propagate to the caller.
    """
    req = urllib.request.Request(url, headers=headers or {"User-Agent": "VursylBot/1.0"})
    with urllib.request.urlopen(req, timeout=timeout) as r:
        return r.read().decode("utf-8", errors="replace")


def http_json(url, headers=None):
    """GET *url* via :func:`http_get` and parse the body as JSON."""
    return json.loads(http_get(url, headers))


def post_json(url, payload, headers):
    """POST *payload* as JSON to *url* and return the parsed JSON response.

    Uses a 120 second timeout; ``urllib`` and JSON errors propagate.
    """
    data = json.dumps(payload).encode("utf-8")
    req = urllib.request.Request(url, data=data, headers=headers, method="POST")
    with urllib.request.urlopen(req, timeout=120) as r:
        return json.loads(r.read().decode("utf-8"))


def iso(dt_str):
    """Normalise a feed/API timestamp to a UTC ISO-8601 string.

    Tries each known format in turn; timestamps without zone information are
    taken to be UTC. If nothing matches (including ``None`` or "") the current
    UTC time is returned, so undated items are treated as just published.
    """
    fmts = [
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S%z",
        "%a, %d %b %Y %H:%M:%S %z",
        "%a, %d %b %Y %H:%M:%S %Z",
        "%Y-%m-%dT%H:%M:%S.%fZ",
        # Fractional seconds followed by a numeric offset, e.g. Atom feeds.
        "%Y-%m-%dT%H:%M:%S.%f%z",
    ]
    for f in fmts:
        try:
            d = datetime.strptime(dt_str.strip(), f)
        except (ValueError, AttributeError):
            # ValueError: format mismatch; AttributeError: dt_str is not a str.
            continue
        if d.tzinfo is None:
            d = d.replace(tzinfo=timezone.utc)
        return d.astimezone(timezone.utc).isoformat()
    return datetime.now(timezone.utc).isoformat()


def recent(iso_str, hours=LOOKBACK_HOURS):
    """Return True when *iso_str* is newer than *hours* hours ago.

    Accepts ISO-8601 strings with a trailing ``Z`` or a numeric offset. A
    string without zone information is interpreted as UTC. Anything that
    cannot be parsed (including non-strings) is reported as not recent.
    """
    try:
        d = datetime.fromisoformat(iso_str.replace("Z", "+00:00"))
    except (ValueError, TypeError, AttributeError):
        return False
    if d.tzinfo is None:
        # Comparing naive and aware datetimes raises TypeError; assume UTC.
        d = d.replace(tzinfo=timezone.utc)
    return d > datetime.now(timezone.utc) - timedelta(hours=hours)


# ---------------- ARCHIVE MERGE ----------------
def _title_key(title):
    """Return a case-insensitive fingerprint of *title* used for de-duplication."""
    return hashlib.md5(title.lower().encode()).hexdigest()


def load_existing(path):
    """Return the ``items`` list stored in the JSON archive at *path*.

    A missing or unparsable file yields an empty list.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data.get("items", [])
    except (FileNotFoundError, json.JSONDecodeError):
        return []


def merge(existing, new_items, max_items):
    """Merge *new_items* in front of the *existing* archive.

    New items come first, de-duplicated among themselves by URL and by
    case-insensitive title. A new item whose URL is already archived is not
    added again; it only promotes the archived entry's ``breakthrough`` flag.
    Every archived item is then kept (once per URL). Afterwards only the
    first item carrying ``breakthrough`` keeps it, and the result is cut to
    *max_items*.

    Note: dictionaries from both inputs are mutated in place (flag changes).
    """
    existing_by_url = {i["url"]: i for i in existing if i.get("url")}
    seen_urls, seen_titles, merged = set(), set(), []

    for item in new_items:
        url = item.get("url", "")
        title_key = _title_key(item.get("title", ""))
        if url in seen_urls or title_key in seen_titles:
            continue
        seen_urls.add(url)
        seen_titles.add(title_key)
        if url in existing_by_url:
            # Already archived: refresh the flag, the archived copy is
            # appended by the loop below.
            if item.get("breakthrough"):
                existing_by_url[url]["breakthrough"] = True
            continue
        merged.append(item)

    # Archived items are tracked in their own set. Re-using ``seen_urls`` here
    # would drop every archived item that was fetched again in this run.
    archived_urls = set()
    for item in existing:
        url = item.get("url", "")
        if url in archived_urls:
            continue
        archived_urls.add(url)
        merged.append(item)

    # At most one hero: the first (newest) flagged item wins.
    hero_found = False
    for item in merged:
        if item.get("breakthrough"):
            if hero_found:
                item.pop("breakthrough", None)
            else:
                hero_found = True
    return merged[:max_items]


# ---------------- COLLECTORS ----------------
def _newsapi_search(query, since, sort_by, page_size, tag):
    """Run one NewsAPI ``/v2/everything`` search and return normalised articles.

    Each result is a dict with ``title``, ``url``, ``source`` and ``published``.
    Articles without a title or URL, and "[Removed]" placeholders, are skipped.
    Best effort: on any error the failure is printed under ``[newsapi:<tag>]``
    and whatever was gathered so far is returned.
    """
    url = "https://newsapi.org/v2/everything?" + urllib.parse.urlencode({
        "q": query, "from": since, "language": "en",
        "sortBy": sort_by, "pageSize": page_size, "apiKey": NEWSAPI_KEY,
    })
    found = []
    try:
        for a in http_json(url).get("articles", []):
            title, link = a.get("title"), a.get("url")
            if not title or title == "[Removed]" or not link:
                continue
            found.append({
                "title": title.strip(),
                "url": link,
                "source": (a.get("source") or {}).get("name", "News"),
                "published": iso(a.get("publishedAt", "")),
            })
    except Exception as e:
        print(f"[newsapi:{tag}] {e}")
    return found


def collect_newsapi():
    """Collect candidate news items for every category in NEWSAPI_QUERIES.

    Looks back LOOKBACK_HOURS hours and pauses one second between queries.
    """
    items = []
    frm = (datetime.now(timezone.utc) - timedelta(hours=LOOKBACK_HOURS)).strftime("%Y-%m-%dT%H:%M:%S")
    for cat, q in NEWSAPI_QUERIES.items():
        for art in _newsapi_search(q, frm, "publishedAt", 25, cat):
            items.append({
                "title": art["title"], "url": art["url"],
                "source": art["source"],
                "category": cat, "published": art["published"],
            })
        time.sleep(1)
    return items


def collect_rc_candidates():
    """Collect potentially doom-framed headlines for Reality Check processing.

    Runs every query in RC_QUERIES over the last 72 hours, de-duplicates by
    case-insensitive title and returns at most 30 items.
    """
    items = []
    frm = (datetime.now(timezone.utc) - timedelta(hours=72)).strftime("%Y-%m-%dT%H:%M:%S")
    for q in RC_QUERIES:
        items.extend(_newsapi_search(q, frm, "popularity", 15, "rc"))
        time.sleep(1)

    seen, deduped = set(), []
    for i in items:
        key = _title_key(i["title"])
        if key not in seen:
            seen.add(key)
            deduped.append(i)
    return deduped[:30]


def collect_rss():
    """Collect the newest entries (up to 8 per feed) from RSS_FEEDS.

    Handles RSS ``<item>`` and Atom ``<entry>`` elements. Every item is filed
    under category "ai". Only items published within the last 96 hours are
    returned; because :func:`iso` substitutes the current time for missing or
    unparsable dates, such items always pass this filter. A failing feed is
    reported and skipped.
    """
    items = []
    for name, feed in RSS_FEEDS:
        try:
            root = ET.fromstring(http_get(feed))
            ns = {"atom": "http://www.w3.org/2005/Atom"}
            entries = root.findall(".//item") or root.findall(".//atom:entry", ns)
            for e in entries[:8]:
                title = (e.findtext("title") or e.findtext("atom:title", namespaces=ns) or "").strip()
                link = e.findtext("link") or ""
                if not link:
                    # Atom keeps the target in the href attribute of <link>.
                    ln = e.find("atom:link", ns)
                    link = ln.get("href") if ln is not None else ""
                pub = (e.findtext("pubDate")
                       or e.findtext("atom:published", namespaces=ns)
                       or e.findtext("atom:updated", namespaces=ns)
                       or "")
                if title and link:
                    items.append({"title": title, "url": link.strip(), "source": name,
                                  "category": "ai", "published": iso(pub)})
        except Exception as ex:
            print(f"[rss:{name}] {ex}")
    return [i for i in items if recent(i["published"], hours=96)]


def collect_youtube():
    """Collect recent uploads (last 120 hours) from YOUTUBE_CHANNELS.

    For each handle the channel's uploads playlist is resolved and its five
    newest entries inspected. Every item is filed under category "ai".
    Failures are reported per handle and do not stop the run.
    """
    items = []
    for handle in YOUTUBE_CHANNELS:
        try:
            ch = http_json("https://www.googleapis.com/youtube/v3/channels?" + urllib.parse.urlencode({
                "part": "contentDetails", "forHandle": handle, "key": YOUTUBE_KEY}))
            chans = ch.get("items", [])
            if not chans:
                print(f"[yt:{handle}] handle not found")
                continue
            uploads = chans[0]["contentDetails"]["relatedPlaylists"]["uploads"]
            pl = http_json("https://www.googleapis.com/youtube/v3/playlistItems?" + urllib.parse.urlencode({
                "part": "snippet", "playlistId": uploads, "maxResults": 5, "key": YOUTUBE_KEY}))
            for v in pl.get("items", []):
                s = v["snippet"]
                vid = s["resourceId"]["videoId"]
                pub = iso(s["publishedAt"])
                if not recent(pub, hours=120):
                    continue
                items.append({
                    "title": s["title"].strip(),
                    "url": f"https://www.youtube.com/watch?v={vid}",
                    "source": s["channelTitle"],
                    "thumbnail": (s.get("thumbnails", {}).get("medium") or {}).get("url", ""),
                    "category": "ai", "published": pub,
                })
        except Exception as e:
            print(f"[yt:{handle}] {e}")
        time.sleep(0.5)
    return items


# ---------------- CLAUDE FILTER ----------------
# NOTE(review): in the copy of this file under review the response schema read
# {"accepted":[{"i":,"category":"","hero":}]} — the placeholders appear to have
# been stripped. They are restored below as <index>/<category>/<true|false>;
# confirm the wording against version control.
FILTER_PROMPT = """You are the editorial filter for VURSYL, a relentlessly positive news hub covering AI, quantum computing, and emerging technology.

Vursyl publishes ONLY positive, constructive, optimistic content: breakthroughs, milestones, launches, approvals, records, discoveries, and wins.

REJECT any item that is: doom-framed, fear-based, about layoffs, lawsuits, bans, failures, scandals, warnings, risks, culture-war angles, or stock-drop news. Neutral technical explainers and factual announcements are ACCEPTED. When in doubt, REJECT.

For each ACCEPTED item assign exactly one category:
- ai (AI models, LLMs, research, agents, products)
- quantum (quantum computing, qubits, error correction)
- health (AI/tech in medicine, biotech, longevity, CRISPR, FDA)
- robotics (robots, humanoids, drones, space, autonomous)
- compute (chips, GPUs, data centers, infrastructure)
- breakthroughs (exceptional cross-cutting milestone — reserve for the truly remarkable)

Pick AT MOST ONE item as "hero": the single most exciting positive item in the batch.

Respond ONLY with JSON, no markdown fences:
{"accepted":[{"i":<index>,"category":"<category>","hero":<true|false>}]}

Items:
"""

# NOTE(review): the file under review ends in the middle of claude_filter().
# The visible fragment is preserved verbatim below rather than completed by
# guesswork; restore the full definition (and anything that followed it) from
# version control before running the pipeline.
#
# def claude_filter(items):
#     if not items: return []
#     listing = "\n".join(f'{n}. [{i["source"]}] {i["title"]}' for n,i in enumerate(items))
#     try:
#         resp = post_json("https://api.anthropic.com/v1/messages",{
#             "model":MODEL_FILTER,"max_tokens":4000,
#             "messages":[{"role":"user","content":FILTER_PROMPT+listing}],
#         },{"Content-Type":"application/json","x-api-key":ANTHROPIC_KEY,"anthropic-version":"2023-06-01"})
#         text = "".join(b.get("text","") for b in resp.get("content",[]))
#         text = text.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()
#         decisions = json.loads(text).get("accepted",[])
#     except Exception as e:
#         print(f"[claude:filter] {e} — failing closed")
#         return []
#     out = []
#     for d in decisions:
#         idx,cat = d.get("i"),d.get("category")
#         if isinstance(idx,int) and 0<=idx   <-- source ends here