diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fe25c71 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +seen_urls.txt +downloads/ +*.mp4 diff --git a/README.md b/README.md index a248169..b3dbff3 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,63 @@ # filmarkivet-dl -Downloads all videos from a given category page on filmarkivet.se using svtplay-dl. + +Downloads all videos from a given category page on [filmarkivet.se](https://www.filmarkivet.se/) using [svtplay-dl](https://github.com/spaam/svtplay-dl). + +Automatically paginates through all category pages, deduplicates URLs, and supports resuming interrupted sessions via a seen-file. + +## Requirements + +- Python 3.10+ +- [svtplay-dl](https://github.com/spaam/svtplay-dl) installed and available in `PATH` + +## Usage + +```bash +python3 filmarkivet-dl.py +``` + +By default this downloads all videos from the [reklamfilm](https://www.filmarkivet.se/category/reklamfilm/) category. + +### Options + +| Flag | Default | Description | +|------|---------|-------------| +| `--start-url URL` | `https://www.filmarkivet.se/category/reklamfilm/` | Category page to start from | +| `--sleep SECONDS` | `0.5` | Delay between downloads | +| `--page-sleep SECONDS` | `0.2` | Delay between page fetches | +| `--max-pages N` | `0` (no limit) | Stop after N pages | +| `--seen-file PATH` | `seen_urls.txt` | File tracking already-processed URLs | +| `--output-dir DIR` | `downloads` | Directory to save downloaded videos in | +| `--dry-run` | — | Print commands without running svtplay-dl | + +### Examples + +Preview what would be downloaded without actually downloading: + +```bash +python3 filmarkivet-dl.py --dry-run +``` + +Download from a different category: + +```bash +python3 filmarkivet-dl.py --start-url "https://www.filmarkivet.se/category/dokumentar/" +``` + +Save videos into a subdirectory: + +```bash +python3 filmarkivet-dl.py --output-dir reklamfilm +``` + +Limit to the first 3 pages with a 2-second delay 
between downloads: + +```bash +python3 filmarkivet-dl.py --max-pages 3 --sleep 2.0 +``` + +## How it works + +1. Fetches the category page HTML and extracts all `/movies/` links. +2. Detects the "next page" link via `rel="next"`, CSS class, or link text ("Nästa" / "Next"). +3. For each new video URL, runs `svtplay-dl -S <url>` to download all available subtitles and the best quality stream. +4. Records processed URLs in `seen_urls.txt` so re-running the script skips already-downloaded videos. diff --git a/filmarkivet-dl.py b/filmarkivet-dl.py new file mode 100644 index 0000000..e81b2f0 --- /dev/null +++ b/filmarkivet-dl.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python3 +""" +Download all videos listed under Filmarkivet "reklamfilm" category pages +by running: svtplay-dl -S <url> + +Features: +- Auto-pagination +- Deduping +- Resume via --seen-file +- Optional --dry-run and rate limiting + +Usage examples: + python3 filmarkivet-dl.py + python3 filmarkivet-dl.py --dry-run + python3 filmarkivet-dl.py --sleep 1.0 + python3 filmarkivet-dl.py --max-pages 5 +""" + +import argparse +import os +import re +import subprocess +import sys +import time +from html.parser import HTMLParser +from urllib.parse import urljoin, urlparse +from urllib.request import Request, urlopen + + +DEFAULT_START_URL = "https://www.filmarkivet.se/category/reklamfilm/" + + +class LinkExtractor(HTMLParser): + def __init__(self, base_url: str): + super().__init__() + self.base_url = base_url + self.movie_links = set() + self.next_link = None + self._current_a_href = None + + def handle_starttag(self, tag, attrs): + if tag.lower() != "a": + return + + attr = dict(attrs) + href = attr.get("href") + if not href: + return + + abs_url = urljoin(self.base_url, href) + self._current_a_href = abs_url + + if "/movies/" in urlparse(abs_url).path: + self.movie_links.add(abs_url) + + rel = (attr.get("rel") or "") + cls = (attr.get("class") or "") + + rel_str = " 
".join(rel) if isinstance(rel, (list, tuple)) else str(rel) + cls_str = " ".join(cls) if isinstance(cls, (list, tuple)) else str(cls) + + is_next = False + if "next" in rel_str.lower(): + is_next = True + if re.search(r"\bnext\b", cls_str.lower()): + is_next = True + + if is_next: + self.next_link = abs_url + + def handle_endtag(self, tag): + if tag.lower() == "a": + self._current_a_href = None + + def handle_data(self, data): + if self._current_a_href is None: + return + text = data.strip().lower() + if text in ("nästa", "next", "next »", "nästa »"): + self.next_link = self._current_a_href + + +def fetch_html(url: str, referer: str = None, timeout: int = 30) -> str: + headers = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "sv-SE,sv;q=0.9,en-US;q=0.8,en;q=0.7", + } + if referer: + headers["Referer"] = referer + req = Request(url, headers=headers) + with urlopen(req, timeout=timeout) as resp: + charset = resp.headers.get_content_charset() or "utf-8" + return resp.read().decode(charset, errors="replace") + + +def load_seen(path: str) -> set[str]: + if not path or not os.path.exists(path): + return set() + with open(path, "r", encoding="utf-8") as f: + return set(line.strip() for line in f if line.strip()) + + +def append_seen(path: str, url: str) -> None: + if not path: + return + with open(path, "a", encoding="utf-8") as f: + f.write(url + "\n") + + +def run_svtplay_dl(url: str, dry_run: bool, output_dir: str = None) -> int: + cmd = ["svtplay-dl", "-S"] + if output_dir: + cmd += ["-o", output_dir] + cmd.append(url) + print(">>", " ".join(cmd)) + if dry_run: + return 0 + try: + return subprocess.call(cmd) + except FileNotFoundError: + print("ERROR: svtplay-dl not found in PATH. 
Install it and try again.", file=sys.stderr) + return 127 + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--start-url", default=DEFAULT_START_URL) + ap.add_argument("--sleep", type=float, default=0.5, help="Sleep between downloads (seconds)") + ap.add_argument("--page-sleep", type=float, default=0.2, help="Sleep between page fetches (seconds)") + ap.add_argument("--max-pages", type=int, default=0, help="0 = no limit") + ap.add_argument("--seen-file", default="seen_urls.txt", help="File to persist processed video URLs") + ap.add_argument("--output-dir", default="downloads", help="Directory to save downloaded videos in") + ap.add_argument("--dry-run", action="store_true", help="Print commands but do not run svtplay-dl") + args = ap.parse_args() + + if args.output_dir and not args.dry_run: + os.makedirs(args.output_dir, exist_ok=True) + + seen = load_seen(args.seen_file) + visited_pages = set() + + page_url = args.start_url + prev_url = None + page_count = 0 + total_found = 0 + total_attempted = 0 + + while page_url: + if page_url in visited_pages: + print(f"Stopping: already visited page {page_url}") + break + visited_pages.add(page_url) + + page_count += 1 + if args.max_pages and page_count > args.max_pages: + print(f"Stopping: reached --max-pages={args.max_pages}") + break + + print(f"\n=== Page {page_count}: {page_url} ===") + try: + html = fetch_html(page_url, referer=prev_url) + except Exception as e: + print(f"ERROR fetching {page_url}: {e}", file=sys.stderr) + break + + parser = LinkExtractor(page_url) + parser.feed(html) + + movie_links = sorted(parser.movie_links) + total_found += len(movie_links) + print(f"Found {len(movie_links)} movie links on this page.") + + for video_url in movie_links: + if video_url in seen: + continue + total_attempted += 1 + rc = run_svtplay_dl(video_url, args.dry_run, args.output_dir) + append_seen(args.seen_file, video_url) + seen.add(video_url) + + if rc != 0: + print(f"WARNING: svtplay-dl returned {rc} for 
{video_url}", file=sys.stderr) + + if args.sleep > 0: + time.sleep(args.sleep) + + next_url = parser.next_link + + # Fallback: if rel/class/text detection failed, try a regex for rel="next" + if not next_url: + for pattern in [ + r'rel=["\']next["\']\s+href=["\']([^"\']+)["\']', + r'href=["\']([^"\']+)["\']\s+rel=["\']next["\']', + ]: + m = re.search(pattern, html, flags=re.I) + if m: + next_url = urljoin(page_url, m.group(1)) + break + + prev_url = page_url + if next_url: + if args.page_sleep > 0: + time.sleep(args.page_sleep) + page_url = next_url + else: + page_url = None + + print("\n=== Done ===") + print(f"Pages visited: {page_count}") + print(f"Movie links found (sum across pages, incl duplicates): {total_found}") + print(f"New downloads attempted: {total_attempted}") + print(f"Seen file: {args.seen_file}") + + +if __name__ == "__main__": + main() \ No newline at end of file