Readme and script added
This commit is contained in:
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
seen_urls.txt
|
||||||
|
downloads/
|
||||||
|
*.mp4
|
||||||
63
README.md
63
README.md
@@ -1,2 +1,63 @@
|
|||||||
# filmarkivet-dl
|
# filmarkivet-dl
|
||||||
Downloads all videos from a given category page on filmarkivet.se using svtplay-dl.
|
|
||||||
|
Downloads all videos from a given category page on [filmarkivet.se](https://www.filmarkivet.se/) using [svtplay-dl](https://github.com/spaam/svtplay-dl).
|
||||||
|
|
||||||
|
Automatically paginates through all category pages, deduplicates URLs, and supports resuming interrupted sessions via a seen-file.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Python 3.10+
|
||||||
|
- [svtplay-dl](https://github.com/spaam/svtplay-dl) installed and available in `PATH`
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 filmarkivet-dl.py
|
||||||
|
```
|
||||||
|
|
||||||
|
By default this downloads all videos from the [reklamfilm](https://www.filmarkivet.se/category/reklamfilm/) category.
|
||||||
|
|
||||||
|
### Options
|
||||||
|
|
||||||
|
| Flag | Default | Description |
|
||||||
|
|------|---------|-------------|
|
||||||
|
| `--start-url URL` | `https://www.filmarkivet.se/category/reklamfilm/` | Category page to start from |
|
||||||
|
| `--sleep SECONDS` | `0.5` | Delay between downloads |
|
||||||
|
| `--page-sleep SECONDS` | `0.2` | Delay between page fetches |
|
||||||
|
| `--max-pages N` | `0` (no limit) | Stop after N pages |
|
||||||
|
| `--seen-file PATH` | `seen_urls.txt` | File tracking already-processed URLs |
|
||||||
|
| `--output-dir DIR` | `downloads` | Directory to save downloaded videos in |
|
||||||
|
| `--dry-run` | — | Print commands without running svtplay-dl |
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
|
||||||
|
Preview what would be downloaded without actually downloading:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 filmarkivet-dl.py --dry-run
|
||||||
|
```
|
||||||
|
|
||||||
|
Download from a different category:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 filmarkivet-dl.py --start-url "https://www.filmarkivet.se/category/dokumentar/"
|
||||||
|
```
|
||||||
|
|
||||||
|
Save videos into a subdirectory:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 filmarkivet-dl.py --output-dir reklamfilm
|
||||||
|
```
|
||||||
|
|
||||||
|
Limit to the first 3 pages with a 2-second delay between downloads:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 filmarkivet-dl.py --max-pages 3 --sleep 2.0
|
||||||
|
```
|
||||||
|
|
||||||
|
## How it works
|
||||||
|
|
||||||
|
1. Fetches the category page HTML and extracts all `/movies/` links.
|
||||||
|
2. Detects the "next page" link via `rel="next"`, CSS class, or link text ("Nästa" / "Next").
|
||||||
|
3. For each new video URL, runs `svtplay-dl -S <url>` to download all available subtitles and the best quality stream.
|
||||||
|
4. Records processed URLs in `seen_urls.txt` so re-running the script skips already-downloaded videos.
|
||||||
|
|||||||
217
filmarkivet-dl.py
Normal file
217
filmarkivet-dl.py
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
#!/usr/bin/env python3
"""
Download all videos listed under Filmarkivet "reklamfilm" category pages
by running: svtplay-dl -S <video_url>

Features:
- Auto-pagination
- Deduping
- Resume via --seen-file
- Optional --dry-run and rate limiting

Usage examples:
    python3 filmarkivet-dl.py
    python3 filmarkivet-dl.py --dry-run
    python3 filmarkivet-dl.py --sleep 1.0
    python3 filmarkivet-dl.py --max-pages 5
"""

import argparse
import os
import re
import subprocess
import sys
import time
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
from urllib.request import Request, urlopen


# Category page the crawl starts from unless --start-url overrides it.
DEFAULT_START_URL = "https://www.filmarkivet.se/category/reklamfilm/"
|
||||||
|
|
||||||
|
|
||||||
|
class LinkExtractor(HTMLParser):
    """Collect absolute ``/movies/`` links and the pagination "next" link
    from one category page's HTML.

    After ``feed()``, ``movie_links`` holds the video URLs found and
    ``next_link`` the next page's URL (or None if no next page was detected).
    """

    def __init__(self, base_url: str):
        super().__init__()
        self.base_url = base_url          # page URL used to absolutize relative hrefs
        self.movie_links = set()          # absolute URLs whose path contains /movies/
        self.next_link = None             # absolute URL of the next category page, if any
        self._current_a_href = None       # href of the <a> element currently open

    def handle_starttag(self, tag, attrs):
        if tag.lower() != "a":
            return

        attributes = dict(attrs)
        href = attributes.get("href")
        if not href:
            return

        absolute = urljoin(self.base_url, href)
        self._current_a_href = absolute

        if "/movies/" in urlparse(absolute).path:
            self.movie_links.add(absolute)

        rel_value = attributes.get("rel") or ""
        class_value = attributes.get("class") or ""

        # html.parser yields attribute values as strings, but tolerate
        # list/tuple values just in case (mirrors the original behavior).
        if isinstance(rel_value, (list, tuple)):
            rel_value = " ".join(rel_value)
        else:
            rel_value = str(rel_value)
        if isinstance(class_value, (list, tuple)):
            class_value = " ".join(class_value)
        else:
            class_value = str(class_value)

        # Pagination detection: rel="next" or a CSS class containing "next".
        if "next" in rel_value.lower() or re.search(r"\bnext\b", class_value.lower()):
            self.next_link = absolute

    def handle_endtag(self, tag):
        if tag.lower() == "a":
            self._current_a_href = None

    def handle_data(self, data):
        # Third pagination heuristic: the link text itself says "next".
        if self._current_a_href is None:
            return
        label = data.strip().lower()
        if label in ("nästa", "next", "next »", "nästa »"):
            self.next_link = self._current_a_href
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_html(url: str, referer: str = None, timeout: int = 30) -> str:
    """Fetch *url* over HTTP(S) and return the response body as text.

    Sends browser-like headers (plus a Referer when given). The body is
    decoded with the charset declared by the response, falling back to
    UTF-8, with undecodable bytes replaced.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "sv-SE,sv;q=0.9,en-US;q=0.8,en;q=0.7",
    }
    if referer:
        request_headers["Referer"] = referer

    request = Request(url, headers=request_headers)
    with urlopen(request, timeout=timeout) as response:
        encoding = response.headers.get_content_charset() or "utf-8"
        return response.read().decode(encoding, errors="replace")
|
||||||
|
|
||||||
|
|
||||||
|
def load_seen(path: str) -> set[str]:
    """Return the set of previously processed URLs stored in *path*.

    A falsy path or a missing file yields an empty set. Blank lines and
    surrounding whitespace are ignored.
    """
    if not path or not os.path.exists(path):
        return set()
    with open(path, "r", encoding="utf-8") as handle:
        return {stripped for line in handle if (stripped := line.strip())}
|
||||||
|
|
||||||
|
|
||||||
|
def append_seen(path: str, url: str) -> None:
    """Record *url* by appending it as one line to *path*.

    Does nothing when *path* is falsy (seen-file tracking disabled).
    """
    if not path:
        return
    with open(path, "a", encoding="utf-8") as handle:
        print(url, file=handle)
|
||||||
|
|
||||||
|
|
||||||
|
def run_svtplay_dl(url: str, dry_run: bool, output_dir: str = None) -> int:
    """Invoke ``svtplay-dl -S`` for *url* and return its exit code.

    The command is printed first. In dry-run mode nothing is executed and
    0 is returned; a missing svtplay-dl binary yields 127.
    """
    command = ["svtplay-dl", "-S"]
    if output_dir:
        command.extend(["-o", output_dir])
    command.append(url)

    print(">>", " ".join(command))
    if dry_run:
        return 0

    try:
        return subprocess.call(command)
    except FileNotFoundError:
        print("ERROR: svtplay-dl not found in PATH. Install it and try again.", file=sys.stderr)
        return 127
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Crawl filmarkivet.se category pages and download each new video.

    Paginates from --start-url, deduplicates against --seen-file, and
    shells out to svtplay-dl for every URL not yet seen.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--start-url", default=DEFAULT_START_URL)
    ap.add_argument("--sleep", type=float, default=0.5, help="Sleep between downloads (seconds)")
    ap.add_argument("--page-sleep", type=float, default=0.2, help="Sleep between page fetches (seconds)")
    ap.add_argument("--max-pages", type=int, default=0, help="0 = no limit")
    ap.add_argument("--seen-file", default="seen_urls.txt", help="File to persist processed video URLs")
    ap.add_argument("--output-dir", default="downloads", help="Directory to save downloaded videos in")
    ap.add_argument("--dry-run", action="store_true", help="Print commands but do not run svtplay-dl")
    args = ap.parse_args()

    # Dry runs must leave the filesystem untouched, so only create the
    # output directory for real runs.
    if args.output_dir and not args.dry_run:
        os.makedirs(args.output_dir, exist_ok=True)

    seen = load_seen(args.seen_file)
    visited_pages = set()  # guards against pagination loops

    page_url = args.start_url
    prev_url = None        # sent as Referer for the next page fetch
    page_count = 0
    total_found = 0
    total_attempted = 0

    while page_url:
        if page_url in visited_pages:
            print(f"Stopping: already visited page {page_url}")
            break
        visited_pages.add(page_url)

        page_count += 1
        if args.max_pages and page_count > args.max_pages:
            print(f"Stopping: reached --max-pages={args.max_pages}")
            break

        print(f"\n=== Page {page_count}: {page_url} ===")
        try:
            html = fetch_html(page_url, referer=prev_url)
        except Exception as e:
            print(f"ERROR fetching {page_url}: {e}", file=sys.stderr)
            break

        parser = LinkExtractor(page_url)
        parser.feed(html)

        movie_links = sorted(parser.movie_links)
        total_found += len(movie_links)
        print(f"Found {len(movie_links)} movie links on this page.")

        for video_url in movie_links:
            if video_url in seen:
                continue
            total_attempted += 1
            rc = run_svtplay_dl(video_url, args.dry_run, args.output_dir)

            # BUGFIX: svtplay-dl missing entirely (rc 127 from the
            # FileNotFoundError path) used to let the loop continue and mark
            # every remaining URL as seen without downloading anything.
            # Abort instead, without recording this URL.
            if rc == 127:
                sys.exit(127)

            # BUGFIX: --dry-run used to persist previewed URLs to the
            # seen-file, making a later real run skip them all. Dry runs now
            # dedupe in memory only.
            if not args.dry_run:
                append_seen(args.seen_file, video_url)
            seen.add(video_url)

            # NOTE(review): other non-zero exit codes are still recorded as
            # seen (original behavior) — delete lines from the seen-file to
            # retry a failed download.
            if rc != 0:
                print(f"WARNING: svtplay-dl returned {rc} for {video_url}", file=sys.stderr)

            if args.sleep > 0:
                time.sleep(args.sleep)

        next_url = parser.next_link

        # Fallback: if rel/class/text detection failed, try a regex for rel="next"
        if not next_url:
            for pattern in [
                r'rel=["\']next["\']\s+href=["\']([^"\']+)["\']',
                r'href=["\']([^"\']+)["\']\s+rel=["\']next["\']',
            ]:
                m = re.search(pattern, html, flags=re.I)
                if m:
                    next_url = urljoin(page_url, m.group(1))
                    break

        prev_url = page_url
        if next_url:
            if args.page_sleep > 0:
                time.sleep(args.page_sleep)
            page_url = next_url
        else:
            page_url = None

    print("\n=== Done ===")
    print(f"Pages visited: {page_count}")
    print(f"Movie links found (sum across pages, incl duplicates): {total_found}")
    print(f"New downloads attempted: {total_attempted}")
    print(f"Seen file: {args.seen_file}")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user