#!/usr/bin/env python3 """ Download all videos from a SVT Play category page. Features: - Extracts program listings from SVT Play category pages (embedded JSON) - Downloads movies and series episodes via svtplay-dl - Tracks movies, episodes, series state, and errors across runs - Downloads cover images (poster.jpg) for Jellyfin - Graceful stop on Ctrl+C (finishes current download) - Suggests marking stale series as complete Usage examples: python3 svtplay-dl-category.py python3 svtplay-dl-category.py --url https://www.svtplay.se/kategori/serier?tab=all python3 svtplay-dl-category.py --dry-run python3 svtplay-dl-category.py --mark-complete https://www.svtplay.se/show-name python3 svtplay-dl-category.py --unmark-complete https://www.svtplay.se/show-name """ import argparse import json import os import re import signal import subprocess import sys import time from datetime import datetime from html.parser import HTMLParser from urllib.parse import urljoin, urlparse from urllib.request import Request, urlopen DEFAULT_CATEGORY_URL = "https://www.svtplay.se/kategori/filmer?tab=all" INFO_SEARCH_EXPR = r']*>({.+})' # --------------------------------------------------------------------------- # Graceful stop # --------------------------------------------------------------------------- stop_requested = False current_child: subprocess.Popen | None = None def _signal_handler(signum, frame): global stop_requested if stop_requested: print("\nForce quit!", file=sys.stderr) if current_child is not None: current_child.terminate() sys.exit(1) print("\nGraceful stop requested. Finishing current download...", file=sys.stderr) stop_requested = True signal.signal(signal.SIGINT, _signal_handler) signal.signal(signal.SIGTERM, _signal_handler) # --------------------------------------------------------------------------- # HTTP helpers # --------------------------------------------------------------------------- _HTTP_HEADERS = { "User-Agent": ( "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) " "Gecko/20100101 Firefox/128.0" ), "Accept": ( "text/html,application/xhtml+xml,application/xml;" "q=0.9,*/*;q=0.8" ), "Accept-Language": "sv-SE,sv;q=0.9,en-US;q=0.8,en;q=0.7", } def fetch_html(url: str, timeout: int = 30) -> str: req = Request(url, headers=_HTTP_HEADERS) with urlopen(req, timeout=timeout) as resp: charset = resp.headers.get_content_charset() or "utf-8" return resp.read().decode(charset, errors="replace") def download_file(url: str, dest_path: str, timeout: int = 60) -> bool: req = Request(url, headers={ "User-Agent": _HTTP_HEADERS["User-Agent"], }) try: with urlopen(req, timeout=timeout) as resp: with open(dest_path, "wb") as f: while True: chunk = resp.read(8192) if not chunk: break f.write(chunk) return True except Exception as e: print(f" WARNING: Image download failed: {e}", file=sys.stderr) if os.path.exists(dest_path): os.remove(dest_path) return False # --------------------------------------------------------------------------- # SVT Play JSON extraction # --------------------------------------------------------------------------- def extract_page_json(html: str) -> dict | None: match = re.search(INFO_SEARCH_EXPR, html) if not match: return None try: return json.loads(match.group(1)) except json.JSONDecodeError: return None def _iter_urql_entries(page_json: dict): for entry in page_json.get("props", {}).get("urqlState", {}).values(): if "data" in entry: try: yield json.loads(entry["data"]) except (json.JSONDecodeError, TypeError): continue # --------------------------------------------------------------------------- # Category page parsing # --------------------------------------------------------------------------- def get_category_name(page_json: dict, url: str) -> str: for entry in _iter_urql_entries(page_json): for key, data in entry.items(): if key == "categoryPage" and isinstance(data, dict): for field in ("heading", "name"): if data.get(field): return data[field] path = urlparse(url).path parts = [p for p in path.split("/") if p] if len(parts) >= 2 and parts[0] == "kategori": return parts[1].replace("-", " ").title() return "Unknown" def get_category_items(page_json: dict) -> list[dict]: items: list[dict] = [] for entry in _iter_urql_entries(page_json): for key, data in entry.items(): if key != "categoryPage" or not isinstance(data, dict): continue for tab in data.get("lazyLoadedTabs", []): if tab.get("slug") != "all": continue for module in tab.get("modules", []): sel = module.get("selection") if sel: items.extend(sel.get("items", [])) return items # --------------------------------------------------------------------------- # Detail page parsing — metadata # --------------------------------------------------------------------------- def _find_details(page_json: dict) -> dict | None: # Prefer entries that have smartStart (like svtplay-dl does) for entry in _iter_urql_entries(page_json): for key, data in entry.items(): if (key == "detailsPageByPath" and isinstance(data, dict) and "smartStart" in data): return data for entry in _iter_urql_entries(page_json): for key, data in entry.items(): if (key == "detailsPageByPath" and isinstance(data, dict) and "item" in data): return data return None def get_video_metadata(html: str) -> tuple[str | None, str | None, str | None]: """Return (name, year, image_url) from a detail page.""" page_json = extract_page_json(html) if not page_json: return None, None, None details = _find_details(page_json) if not details: return None, None, None name = _safe_get(details, "item", "parent", "name") if not name: name = _safe_get(details, "item", "name") year = _safe_get(details, "moreDetails", "productionYear") if year is not None: year = str(year) image_url = _image_from_json(details) or _image_from_html(html) return name, year, image_url def _safe_get(d, *keys): for k in keys: if not isinstance(d, dict): return None d = d.get(k) return d def _image_url_from_dict(img: dict) -> str | None: if "id" in img and "changed" in img: return ( f"https://www.svtstatic.se/image/original/default/" f"{img['id']}/{img['changed']}?format=auto&quality=100" ) return None def _image_from_json(details: dict) -> str | None: img = _safe_get(details, "item", "parent", "image", "wide") if isinstance(img, dict): return _image_url_from_dict(img) if isinstance(img, str) and img: return img img = _safe_get(details, "images", "wide") if isinstance(img, dict): return _image_url_from_dict(img) if isinstance(img, str) and img: return img return None # --------------------------------------------------------------------------- # Detail page parsing — cover image HTML fallback # --------------------------------------------------------------------------- class _ImageSrcsetExtractor(HTMLParser): def __init__(self): super().__init__() self._in_container = False self.image_url: str | None = None def handle_starttag(self, tag, attrs): attr = dict(attrs) if tag == "div" and attr.get("data-css-selector") == "imageContainer": self._in_container = True if tag == "img" and self._in_container and self.image_url is None: srcset = attr.get("srcset", "") if srcset: best_url, best_w = None, 0 for part in srcset.split(","): part = part.strip() pieces = part.rsplit(" ", 1) if len(pieces) == 2: try: w = int(pieces[1].rstrip("w")) except ValueError: continue if w > best_w: best_w = w best_url = pieces[0] self.image_url = best_url or attr.get("src") else: self.image_url = attr.get("src") def handle_endtag(self, tag): if tag == "div": self._in_container = False def _image_from_html(html: str) -> str | None: p = _ImageSrcsetExtractor() p.feed(html) return p.image_url # --------------------------------------------------------------------------- # Episode discovery (replaces svtplay-dl -A) # --------------------------------------------------------------------------- def discover_episode_urls(html: str) -> list[str]: page_json = extract_page_json(html) if not page_json: return [] details = _find_details(page_json) if not details: return [] # If this is a Single, return its own URL parent_type = _safe_get(details, "item", "parent", "__typename") if parent_type == "Single": path = _safe_get(details, "item", "urls", "svtplay") if path: return [urljoin("https://www.svtplay.se", path)] return [] videos: list[str] = [] for module in details.get("modules", []): mod_id = module.get("id", "") if mod_id in ("upcoming", "related") or mod_id.startswith("details"): continue if "clips" in mod_id: continue sel = module.get("selection") if not sel: continue for item in sel.get("items", []): path = _safe_get(item, "item", "urls", "svtplay") if path: full = urljoin("https://www.svtplay.se", path) if full not in videos: videos.append(full) return videos # --------------------------------------------------------------------------- # Tracking files (seen_urls.txt / seen_episodes.txt) # --------------------------------------------------------------------------- def load_seen(path: str) -> set[str]: if not path or not os.path.exists(path): return set() with open(path, "r", encoding="utf-8") as f: return {line.strip() for line in f if line.strip()} def append_seen(path: str, url: str) -> None: if not path: return with open(path, "a", encoding="utf-8") as f: f.write(url + "\n") def remove_from_seen(path: str, url: str) -> bool: if not path or not os.path.exists(path): return False with open(path, "r", encoding="utf-8") as f: lines = f.readlines() filtered = [l for l in lines if l.strip() != url] if len(filtered) == len(lines): return False with open(path, "w", encoding="utf-8") as f: f.writelines(filtered) return True # --------------------------------------------------------------------------- # JSON state files (series_state.json / errors.json) # --------------------------------------------------------------------------- def load_json_state(path: str) -> dict: if not path or not os.path.exists(path): return {} try: with open(path, "r", encoding="utf-8") as f: return json.load(f) except (json.JSONDecodeError, OSError): return {} def save_json_state(path: str, data: dict) -> None: if not path: return tmp = path + ".tmp" with open(tmp, "w", encoding="utf-8") as f: json.dump(data, f, indent=2, ensure_ascii=False) os.replace(tmp, path) # --------------------------------------------------------------------------- # Error tracking # --------------------------------------------------------------------------- def is_permanent_error(errors: dict, url: str) -> bool: return errors.get(url, {}).get("permanent", False) def record_error(errors: dict, url: str, return_code: int, errors_file: str) -> None: entry = errors.get(url, {"fail_count": 0, "permanent": False}) entry["fail_count"] = entry.get("fail_count", 0) + 1 entry["last_error"] = f"svtplay-dl exited with code {return_code}" entry["last_failure"] = datetime.now().isoformat() if entry["fail_count"] > 2: entry["permanent"] = True errors[url] = entry save_json_state(errors_file, errors) # --------------------------------------------------------------------------- # Series state # --------------------------------------------------------------------------- def update_series_state(state: dict, show_url: str, found_new: bool, show_name: str, state_file: str) -> None: entry = state.get(show_url, { "name": show_name, "check_count": 0, "last_new_episode_date": None, }) entry["name"] = show_name or entry.get("name", show_url) if found_new: entry["check_count"] = 0 entry["last_new_episode_date"] = datetime.now().isoformat() else: entry["check_count"] = entry.get("check_count", 0) + 1 state[show_url] = entry save_json_state(state_file, state) def find_stale_series(state: dict, stale_days: int): now = datetime.now() for url, entry in state.items(): checks = entry.get("check_count", 0) if checks < 2: continue raw = entry.get("last_new_episode_date") if raw: try: days = (now - datetime.fromisoformat(raw)).days except ValueError: days = 9999 else: days = 9999 if days >= stale_days: yield url, entry.get("name", url), days, checks # --------------------------------------------------------------------------- # svtplay-dl invocation # --------------------------------------------------------------------------- def run_svtplay_dl(url: str, output_dir: str, dry_run: bool) -> int: global current_child cmd = ["svtplay-dl", "-S", "-o", output_dir, url] print(f" >> {' '.join(cmd)}") if dry_run: return 0 try: current_child = subprocess.Popen(cmd, start_new_session=True) rc = current_child.wait() current_child = None return rc except FileNotFoundError: current_child = None print("ERROR: svtplay-dl not found in PATH.", file=sys.stderr) return 127 def download_with_retry(url: str, output_dir: str, dry_run: bool, errors: dict, errors_file: str) -> bool: """Attempt download with one immediate retry. Returns True on success.""" if is_permanent_error(errors, url): print(f" SKIP (permanent error): {url} — see errors.json") return False rc = run_svtplay_dl(url, output_dir, dry_run) if rc == 0: if url in errors: del errors[url] save_json_state(errors_file, errors) return True print(f" Retrying {url} ...") rc = run_svtplay_dl(url, output_dir, dry_run) if rc == 0: if url in errors: del errors[url] save_json_state(errors_file, errors) return True record_error(errors, url, rc, errors_file) entry = errors.get(url, {}) if entry.get("permanent"): print(f" PERMANENT ERROR: {url} " f"(failed {entry['fail_count']} times total)", file=sys.stderr) else: print(f" ERROR: {url} " f"(will retry next run, " f"{entry.get('fail_count', 0)} failures total)", file=sys.stderr) return False # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def sanitize_filename(name: str) -> str: name = re.sub(r'[<>:"/\\|?*]', "", name) return name.strip(". ") # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- def main(): ap = argparse.ArgumentParser( description="Download all videos from a SVT Play category page.", ) ap.add_argument( "--url", default=DEFAULT_CATEGORY_URL, help="Category page URL (default: %(default)s)", ) ap.add_argument( "--output-dir", default="Downloads", help="Base output directory (default: %(default)s)", ) ap.add_argument( "--seen-file", default="seen_urls.txt", help="Tracks completed movie / series URLs (default: %(default)s)", ) ap.add_argument( "--seen-episodes-file", default="seen_episodes.txt", help="Tracks downloaded episode URLs (default: %(default)s)", ) ap.add_argument( "--series-state-file", default="series_state.json", help="Tracks series check history (default: %(default)s)", ) ap.add_argument( "--errors-file", default="errors.json", help="Tracks download errors (default: %(default)s)", ) ap.add_argument( "--sleep", type=float, default=1.0, help="Delay between downloads in seconds (default: %(default)s)", ) ap.add_argument( "--stale-days", type=int, default=365, help="Days w/o new episodes before suggesting completion " "(default: %(default)s)", ) ap.add_argument( "--max-dl", type=int, default=0, metavar="N", help="Stop after N successful downloads (0 = no limit)", ) ap.add_argument( "--dry-run", action="store_true", help="Print commands without actually downloading", ) ap.add_argument( "--mark-complete", metavar="URL", help="Mark a series URL as complete and exit", ) ap.add_argument( "--unmark-complete", metavar="URL", help="Unmark a series URL as complete and exit", ) args = ap.parse_args() # ---- mark / unmark ---- if args.mark_complete: seen = load_seen(args.seen_file) if args.mark_complete in seen: print(f"Already marked as complete: {args.mark_complete}") else: append_seen(args.seen_file, args.mark_complete) print(f"Marked as complete: {args.mark_complete}") return if args.unmark_complete: if remove_from_seen(args.seen_file, args.unmark_complete): print(f"Unmarked (will be re-checked): {args.unmark_complete}") else: print(f"Not found in seen file: {args.unmark_complete}") return # ---- load state ---- seen = load_seen(args.seen_file) seen_episodes = load_seen(args.seen_episodes_file) series_state = load_json_state(args.series_state_file) errors = load_json_state(args.errors_file) # ---- fetch category page ---- print(f"Fetching category page: {args.url}") try: cat_html = fetch_html(args.url) except Exception as e: print(f"ERROR: Failed to fetch category page: {e}", file=sys.stderr) sys.exit(1) page_json = extract_page_json(cat_html) if not page_json: print("ERROR: Could not extract JSON data from category page.", file=sys.stderr) sys.exit(1) category_name = get_category_name(page_json, args.url) items = get_category_items(page_json) print(f"Category: {category_name}") print(f"Found {len(items)} items in category listing.") if not items: print("No items found. The page structure may have changed.", file=sys.stderr) sys.exit(1) # ---- process items ---- stats = dict( movies_downloaded=0, episodes_downloaded=0, series_checked=0, skipped_seen=0, skipped_permanent=0, errors_this_run=0, ) def dl_limit_reached(): if args.max_dl <= 0: return False total = stats["movies_downloaded"] + stats["episodes_downloaded"] return total >= args.max_dl for idx, item_data in enumerate(items): if stop_requested or dl_limit_reached(): if dl_limit_reached(): print(f"\nReached --max-dl={args.max_dl}. Stopping.") else: print("\nStopping as requested.") break try: item = item_data["item"] url_path = item["urls"]["svtplay"] item_url = urljoin("https://www.svtplay.se", url_path) is_single = item.get("__typename") == "Single" name_hint = url_path.rstrip("/").rsplit("/", 1)[-1] except (KeyError, TypeError) as e: print(f"\n WARNING: Skipping malformed item #{idx}: {e}") continue kind = "Movie" if is_single else "Series" print(f"\n[{idx + 1}/{len(items)}] {kind}: {name_hint}") if item_url in seen: print(" Skipped (in seen file)") stats["skipped_seen"] += 1 continue # ---- fetch detail page ---- print(f" Fetching: {item_url}") try: detail_html = fetch_html(item_url) except Exception as e: print(f" ERROR fetching detail page: {e}", file=sys.stderr) stats["errors_this_run"] += 1 continue name, year, image_url = get_video_metadata(detail_html) if not name: name = name_hint folder_name = (f"{sanitize_filename(name)} ({year})" if year else sanitize_filename(name)) folder_path = os.path.join( args.output_dir, sanitize_filename(category_name), folder_name, ) if not args.dry_run: os.makedirs(folder_path, exist_ok=True) print(f" -> {folder_path}") # ---- poster image ---- poster_path = os.path.join(folder_path, "poster.jpg") if image_url and not os.path.exists(poster_path): print(" Downloading poster...") if args.dry_run: print(f" >> (dry-run) download poster -> {poster_path}") else: download_file(image_url, poster_path) # ---- download ---- if is_single: if is_permanent_error(errors, item_url): print(f" SKIP (permanent error) — see errors.json") stats["skipped_permanent"] += 1 continue if download_with_retry(item_url, folder_path, args.dry_run, errors, args.errors_file): if not args.dry_run: append_seen(args.seen_file, item_url) seen.add(item_url) stats["movies_downloaded"] += 1 else: stats["errors_this_run"] += 1 else: stats["series_checked"] += 1 episode_urls = discover_episode_urls(detail_html) total_eps = len(episode_urls) new_eps = [ ep for ep in episode_urls if ep not in seen_episodes and not is_permanent_error(errors, ep) ] perm_skipped = sum( 1 for ep in episode_urls if is_permanent_error(errors, ep) ) if perm_skipped: stats["skipped_permanent"] += perm_skipped print(f" Episodes: {total_eps} total, {len(new_eps)} new" + (f", {perm_skipped} permanently failed" if perm_skipped else "")) found_new = len(new_eps) > 0 for ep_i, ep_url in enumerate(new_eps): if stop_requested or dl_limit_reached(): break print(f" Episode [{ep_i + 1}/{len(new_eps)}]: {ep_url}") if download_with_retry(ep_url, folder_path, args.dry_run, errors, args.errors_file): if not args.dry_run: append_seen(args.seen_episodes_file, ep_url) seen_episodes.add(ep_url) stats["episodes_downloaded"] += 1 else: stats["errors_this_run"] += 1 if (args.sleep > 0 and ep_i < len(new_eps) - 1 and not stop_requested): time.sleep(args.sleep) if not args.dry_run: update_series_state(series_state, item_url, found_new, name, args.series_state_file) if args.sleep > 0 and idx < len(items) - 1 and not stop_requested: time.sleep(args.sleep) # ---- stale series suggestions ---- stale = list(find_stale_series(series_state, args.stale_days)) stale = [(u, n, d, c) for u, n, d, c in stale if u not in seen] if stale: print(f"\n{'=' * 60}") print("STALE SERIES — consider marking as complete:") print(f"{'=' * 60}") for url, name, days, checks in stale: print(f'\n "{name}" — no new episodes for {days} days ' f"(checked {checks} times)") print(f" python3 {sys.argv[0]} --mark-complete {url}") # ---- summary ---- print(f"\n{'=' * 60}") print("SUMMARY") print(f"{'=' * 60}") print(f" Movies downloaded: {stats['movies_downloaded']}") print(f" Episodes downloaded: {stats['episodes_downloaded']}") print(f" Series checked: {stats['series_checked']}") print(f" Skipped (already seen): {stats['skipped_seen']}") print(f" Skipped (perm. error): {stats['skipped_permanent']}") print(f" Errors this run: {stats['errors_this_run']}") if stop_requested: print(" (Run was interrupted by user)") if __name__ == "__main__": main()