commit 4ca7f9667aa9765b6b1c6785bd6612193f0794da Author: Iain Bradley Date: Tue Oct 7 12:55:34 2025 +0100 First commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cdacc79 --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ +# Environment variables (contains passwords) +.env + +# Downloaded files +downloads/ + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python + +# Virtual environment +venv/ +env/ +ENV/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..180c2b4 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ +FROM python:3.11-slim + +# Install Firefox and dependencies +RUN apt-get update && apt-get install -y \ + firefox-esr \ + wget \ + && rm -rf /var/lib/apt/lists/* + +# Set working directory +WORKDIR /app + +# Copy requirements and install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the download script +COPY download_cbz.py . 
+ +# Create downloads directory +RUN mkdir -p /app/downloads + +# Set environment variables (can be overridden at runtime) +ENV EMAIL="" +ENV PASSWORD="" +ENV OUTPUT_DIR="/app/downloads" + +# Run the script +CMD python download_cbz.py diff --git a/cbz_downloader.py b/cbz_downloader.py new file mode 100644 index 0000000..330351b --- /dev/null +++ b/cbz_downloader.py @@ -0,0 +1,333 @@ +import requests +from bs4 import BeautifulSoup +import os +import time +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.firefox.service import Service +from selenium.common.exceptions import TimeoutException +from webdriver_manager.firefox import GeckoDriverManager + +def setup_driver(headless=True): + """ + Set up Selenium WebDriver with Firefox (auto-installs GeckoDriver). + + Args: + headless: Run browser in headless mode (no visible window) + + Returns: + WebDriver instance + """ + firefox_options = Options() + if headless: + firefox_options.add_argument('--headless') + firefox_options.set_preference('general.useragent.override', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0') + + # Auto-install GeckoDriver + service = Service(GeckoDriverManager().install()) + driver = webdriver.Firefox(service=service, options=firefox_options) + return driver + +def login_with_selenium(driver, email, password): + """ + Log in to shop.2000ad.com using Selenium. 
+ + Args: + driver: Selenium WebDriver instance + email: Your email address + password: Your password + + Returns: + True if login successful, False otherwise + """ + login_url = "https://shop.2000ad.com/account/sign-in" + + print("šŸ” Navigating to login page...") + driver.get(login_url) + + try: + # Wait for and fill in email field + email_field = WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.NAME, "email")) + ) + email_field.send_keys(email) + + # Fill in password field + password_field = driver.find_element(By.NAME, "password") + password_field.send_keys(password) + + # Submit the form + print("šŸ” Logging in...") + password_field.submit() + + # Wait for redirect after login + time.sleep(3) + + # Check if login was successful + if 'login' not in driver.current_url.lower(): + print("āœ… Login successful!") + return True + else: + print("āŒ Login failed - check your credentials") + return False + + except TimeoutException: + print("āŒ Login form not found - page may have changed") + return False + except Exception as e: + print(f"āŒ Login error: {e}") + return False + +def scroll_and_load_all_items(driver, downloads_url): + """ + Navigate to downloads page and scroll to load all items. 
+ + Args: + driver: Selenium WebDriver instance + downloads_url: URL of the downloads page + + Returns: + HTML content with all items loaded + """ + print(f"\nšŸ“„ Loading downloads page: {downloads_url}") + driver.get(downloads_url) + + # Wait for initial content to load + try: + WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.CLASS_NAME, "product")) + ) + except TimeoutException: + print("āš ļø Warning: No products found on page") + return driver.page_source + + print("šŸ“œ Scrolling to load all items...") + + last_height = driver.execute_script("return document.body.scrollHeight") + items_count = 0 + no_change_count = 0 + + while True: + # Count current items + current_items = len(driver.find_elements(By.CLASS_NAME, "product")) + + if current_items != items_count: + print(f" Loaded {current_items} items so far...") + items_count = current_items + no_change_count = 0 + else: + no_change_count += 1 + + # Scroll to bottom + driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") + + # Wait for new content to load + time.sleep(2) + + # Calculate new scroll height + new_height = driver.execute_script("return document.body.scrollHeight") + + # Check if we've stopped loading new content + if new_height == last_height and no_change_count >= 3: + print(f"āœ… Finished loading - found {items_count} total items") + break + + last_height = new_height + + # Safety limit to prevent infinite loops + if no_change_count >= 10: + print(f"āš ļø Stopped scrolling after no changes - found {items_count} items") + break + + return driver.page_source + +def transfer_cookies_to_requests(driver, session): + """ + Transfer cookies from Selenium to requests Session. 
+ + Args: + driver: Selenium WebDriver instance + session: requests.Session instance + """ + selenium_cookies = driver.get_cookies() + for cookie in selenium_cookies: + session.cookies.set(cookie['name'], cookie['value'], domain=cookie['domain']) + print(f"āœ… Transferred {len(selenium_cookies)} cookies to requests session") + +def download_cbz_files(email, password, output_dir='downloads', headless=True): + """ + Log in, load all downloads, and download all CBZ files. + + Args: + email: Your shop.2000ad.com email + password: Your shop.2000ad.com password + output_dir: Directory where files will be saved (default: 'downloads') + headless: Run browser in headless mode (default: True) + """ + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + + # Set up Selenium + print("🌐 Starting browser...") + driver = setup_driver(headless=headless) + + try: + # Log in + if not login_with_selenium(driver, email, password): + print("\nāŒ Cannot proceed without successful login") + return + + # Load downloads page with infinite scroll + downloads_url = "https://shop.2000ad.com/account/downloads" + html_content = scroll_and_load_all_items(driver, downloads_url) + + # Parse the HTML + soup = BeautifulSoup(html_content, 'html.parser') + products = soup.find_all('li', class_='product') + + print(f"\nšŸ“š Found {len(products)} products to process\n") + + # Create requests session and transfer cookies + session = requests.Session() + session.headers.update({ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Referer': 'https://shop.2000ad.com/' + }) + + transfer_cookies_to_requests(driver, session) + + # Close browser - we don't need it anymore + driver.quit() + print("āœ… Browser closed\n") + + print(f"{'='*50}") + print("Starting 
downloads...\n") + + downloaded = 0 + skipped = 0 + failed = 0 + + for product in products: + # Get product name for better logging + product_name = product.get('data-name', 'Unknown') + + # Get publication date + release_date = product.get('data-released', '') + date_str = '' + if release_date: + # Format: YYYYMMDDHHMMSS -> YYYY-MM-DD + try: + date_str = f"{release_date[0:4]}-{release_date[4:6]}-{release_date[6:8]}" + except: + date_str = '' + + # Determine subdirectory based on product name + if 'megazine' in product_name.lower(): + product_output_dir = os.path.join(output_dir, 'Megazine') + else: + product_output_dir = os.path.join(output_dir, '2000ad') + + # Create subdirectory if it doesn't exist + os.makedirs(product_output_dir, exist_ok=True) + + # Find all forms within this product + forms = product.find_all('form') + + for form in forms: + # Check if this form is for a CBZ download + button = form.find('button', type='submit') + if button and 'CBZ' in button.get_text(): + # Get the download URL + download_url = form.get('action') + + if download_url: + # Create a safe filename with date + if date_str: + filename = f"{date_str} - {product_name}.cbz" + else: + filename = f"{product_name}.cbz" + + filename = filename.replace('/', '-').replace('\\', '-').replace(':', '-') + filepath = os.path.join(product_output_dir, filename) + + # Check if file already exists + if os.path.exists(filepath): + subdir = 'Megazine' if 'megazine' in product_name.lower() else '2000ad' + print(f"ā­ļø Skipping (already exists): {subdir}/{filename}") + skipped += 1 + continue + + try: + subdir = 'Megazine' if 'megazine' in product_name.lower() else '2000ad' + print(f"šŸ“„ Downloading to {subdir}/: {filename}") + + # Download the file + response = session.get(download_url, stream=True, allow_redirects=True) + + # Check if we got HTML (login page) instead of CBZ + content_type = response.headers.get('Content-Type', '') + if 'text/html' in content_type: + print(f"āš ļø Warning: Got 
HTML response instead of file") + print(f" This might be a permission issue or the file isn't available") + failed += 1 + continue + + response.raise_for_status() + + # Save the file + with open(filepath, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + file_size = os.path.getsize(filepath) + print(f"āœ… Saved to {subdir}/: {filename} ({file_size / 1024 / 1024:.2f} MB)") + downloaded += 1 + + # Be polite - add a small delay between downloads + time.sleep(1) + + except requests.exceptions.RequestException as e: + print(f"āŒ Error downloading {filename}: {e}") + failed += 1 + except Exception as e: + print(f"āŒ Error saving {filename}: {e}") + failed += 1 + + print(f"\n{'='*50}") + print(f"Download complete!") + print(f"āœ… Successfully downloaded: {downloaded}") + print(f"ā­ļø Skipped (already exist): {skipped}") + print(f"āŒ Failed: {failed}") + print(f"šŸ“ Files saved to: {os.path.abspath(output_dir)}") + + except Exception as e: + print(f"\nāŒ Fatal error: {e}") + import traceback + traceback.print_exc() + finally: + # Make sure browser is closed + try: + driver.quit() + except: + pass + +if __name__ == "__main__": + # Configuration - can be set via environment variables or directly + email = os.environ.get('EMAIL', 'your_email@example.com') + password = os.environ.get('PASSWORD', 'your_password') + output_dir = os.environ.get('OUTPUT_DIR', 'downloads') + + if email == 'your_email@example.com' or password == 'your_password': + print("āš ļø Warning: Please set EMAIL and PASSWORD environment variables") + print(" Example: EMAIL=your@email.com PASSWORD=yourpass python download_cbz.py") + exit(1) + + # Run the download + # Set headless=False if you want to see the browser window + download_cbz_files(email, password, output_dir=output_dir, headless=True) \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..97d567b --- /dev/null +++ b/docker-compose.yml @@ 
-0,0 +1,17 @@ +version: '3.8' + +services: + cbz-downloader: + build: . + container_name: 2000ad-downloader + environment: + - EMAIL=${EMAIL:-your_email@example.com} + - PASSWORD=${PASSWORD:-your_password} + - OUTPUT_DIR=/app/downloads + volumes: + - ./downloads:/app/downloads + # Optional: Mount the script for easy editing without rebuilding + - ./download_cbz.py:/app/download_cbz.py + restart: "no" + # Uncomment below to run on a schedule (once every 24 hours) instead of just once + # command: sh -c "while true; do python download_cbz.py && sleep 86400; done" diff --git a/env-template.sh b/env-template.sh new file mode 100644 index 0000000..5ae0aad --- /dev/null +++ b/env-template.sh @@ -0,0 +1,5 @@ +# Copy this file to .env and fill in your credentials: +# cp env-template.sh .env + +EMAIL=your_email@example.com +PASSWORD=your_password diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..f47fddd --- /dev/null +++ b/readme.md @@ -0,0 +1,87 @@ +# 2000 AD CBZ Downloader + +Automatically downloads all CBZ files from your shop.2000ad.com account. + +## Setup + +1. **Create a `.env` file with your credentials:** + ```bash + cp env-template.sh .env + ``` + Then edit `.env` and add your email and password. + +2. **Build the container:** + ```bash + docker-compose build + ``` + +## Usage + +### Run once (download all new files): +```bash +docker-compose up +``` + +### Run in background: +```bash +docker-compose up -d +``` + +### View logs: +```bash +docker-compose logs -f +``` + +### Stop the container: +```bash +docker-compose down +``` + +## File Structure + +``` +. 
+├── docker-compose.yml +├── Dockerfile +├── requirements.txt +├── download_cbz.py +├── .env (your credentials - not committed to git) +├── env-template.sh (template) +└── downloads/ + └── (your CBZ files will be downloaded here) +``` + +## Scheduled Downloads + +To run automatically every day, edit `docker-compose.yml` and uncomment the `command` line: + +```yaml +command: sh -c "while true; do python download_cbz.py && sleep 86400; done" +``` + +Then change `restart: "no"` to `restart: unless-stopped`. + +This will: +- Run the downloader immediately +- Wait 24 hours (86400 seconds) +- Run again +- Repeat forever + +## Troubleshooting + +**If downloads fail:** +- Check your credentials in `.env` +- Run with logs visible: `docker-compose up` (without `-d`) +- Check that Firefox is working: The script will show login progress + +**If you want to see the browser:** +- Edit `download_cbz.py` and change `headless=True` to `headless=False` +- Rebuild: `docker-compose build` +- You'll need X11 forwarding for this in Docker + +## Security Note + +The `.env` file contains your password. Make sure to: +- Add `.env` to `.gitignore` if using git +- Never commit credentials to version control +- Keep file permissions restricted: `chmod 600 .env` diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4f23b50 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +selenium==4.15.2 +beautifulsoup4==4.12.2 +requests==2.31.0 +webdriver-manager==4.0.1