Files
2025-10-13 10:11:22 +01:00

340 lines
13 KiB
Python

import requests
from bs4 import BeautifulSoup
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.common.exceptions import TimeoutException
from webdriver_manager.firefox import GeckoDriverManager
def setup_driver(headless=True):
    """
    Build a Firefox WebDriver whose GeckoDriver binary is fetched automatically.

    Args:
        headless: When true, Firefox is started with the ``--headless`` flag
            so no browser window is shown.

    Returns:
        The started ``webdriver.Firefox`` instance.
    """
    # Fixed user-agent string that overrides the browser's own.
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) '
        'Gecko/20100101 Firefox/120.0'
    )

    options = Options()
    if headless:
        options.add_argument('--headless')
    options.set_preference('general.useragent.override', user_agent)

    # webdriver_manager downloads (or reuses) GeckoDriver and returns its path.
    gecko_path = GeckoDriverManager().install()
    return webdriver.Firefox(service=Service(gecko_path), options=options)
def login_with_selenium(driver, email, password, timeout=10):
    """
    Log in to shop.2000ad.com using Selenium.

    The form is located by its ``email`` / ``password`` field names and
    submitted; the login counts as successful once the browser has left the
    sign-in page.

    Args:
        driver: Selenium WebDriver instance
        email: Your email address
        password: Your password
        timeout: Seconds to wait for the form to appear, and again for the
            post-login redirect (default: 10)

    Returns:
        True if login successful, False otherwise
    """
    login_url = "https://shop.2000ad.com/account/sign-in"
    # The login page lives at ".../sign-in", so testing only for "login" in
    # the URL would report success even when the page never left the form.
    login_markers = ('sign-in', 'login')

    def left_login_page(drv):
        current = drv.current_url.lower()
        return not any(marker in current for marker in login_markers)

    print("🔐 Navigating to login page...")
    driver.get(login_url)
    try:
        # Wait for and fill in email field
        try:
            email_field = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.NAME, "email"))
            )
        except TimeoutException:
            print("❌ Login form not found - page may have changed")
            return False
        email_field.send_keys(email)

        # Fill in password field
        password_field = driver.find_element(By.NAME, "password")
        password_field.send_keys(password)

        # Submit the form
        print("🔐 Logging in...")
        password_field.submit()

        # Wait for the redirect away from the sign-in page instead of
        # sleeping a fixed time; staying on the page means the login failed.
        # NOTE(review): assumes a successful login always navigates to a URL
        # without "sign-in"/"login" in it - confirm against the live site.
        try:
            WebDriverWait(driver, timeout).until(left_login_page)
        except TimeoutException:
            print("❌ Login failed - check your credentials")
            return False

        print("✅ Login successful!")
        return True
    except Exception as e:
        # Best-effort boundary: any other Selenium failure is reported as a
        # failed login rather than crashing the caller.
        print(f"❌ Login error: {e}")
        return False
def scroll_and_load_all_items(driver, downloads_url, scroll_pause=2,
                              stable_rounds=3, max_stalled_rounds=10,
                              timeout=10):
    """
    Navigate to downloads page and scroll to load all items.

    The page is scrolled to the bottom repeatedly. Scrolling stops once the
    page height did not change on the latest scroll and the number of
    ``product`` elements has been unchanged for ``stable_rounds`` checks.

    Args:
        driver: Selenium WebDriver instance
        downloads_url: URL of the downloads page
        scroll_pause: Seconds to wait after each scroll for new content
            (default: 2)
        stable_rounds: Consecutive checks with an unchanged item count
            required, together with an unchanged page height, to stop
            (default: 3)
        max_stalled_rounds: Safety limit - stop after this many consecutive
            checks with an unchanged item count even if the page height keeps
            changing (default: 10)
        timeout: Seconds to wait for the first product to appear (default: 10)

    Returns:
        HTML content with all items loaded
    """
    print(f"\n📄 Loading downloads page: {downloads_url}")
    driver.get(downloads_url)

    # Wait for initial content to load
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "product"))
        )
    except TimeoutException:
        print("⚠️ Warning: No products found on page")
        return driver.page_source

    print("📜 Scrolling to load all items...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    items_count = 0
    no_change_count = 0

    while True:
        # Count current items; any change resets the stall counter.
        current_items = len(driver.find_elements(By.CLASS_NAME, "product"))
        if current_items != items_count:
            print(f" Loaded {current_items} items so far...")
            items_count = current_items
            no_change_count = 0
        else:
            no_change_count += 1

        # Scroll to bottom, then give lazy-loaded content time to arrive.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)

        new_height = driver.execute_script("return document.body.scrollHeight")

        # Finished: the page stopped growing and the item count is stable.
        if new_height == last_height and no_change_count >= stable_rounds:
            print(f"✅ Finished loading - found {items_count} total items")
            break
        last_height = new_height

        # Safety limit to prevent infinite loops when the height keeps
        # changing without any new items appearing.
        if no_change_count >= max_stalled_rounds:
            print(f"⚠️ Stopped scrolling after no changes - found {items_count} items")
            break

    return driver.page_source
def transfer_cookies_to_requests(driver, session):
    """
    Transfer cookies from Selenium to requests Session.

    Each cookie's ``domain``, ``path`` and ``secure`` attributes are carried
    over when the browser reports them, so the requests session scopes the
    cookie as the browser did. Attributes the browser did not report are
    left to the cookie jar's defaults.

    Args:
        driver: Selenium WebDriver instance
        session: requests.Session instance
    """
    # Attribute names shared by Selenium cookie dicts and the keyword
    # arguments accepted by the requests cookie jar's ``set``.
    scoped_attributes = ('domain', 'path', 'secure')

    selenium_cookies = driver.get_cookies()
    for cookie in selenium_cookies:
        scope = {key: cookie[key] for key in scoped_attributes if key in cookie}
        session.cookies.set(cookie['name'], cookie['value'], **scope)
    print(f"✅ Transferred {len(selenium_cookies)} cookies to requests session")
def _format_release_date(release_date):
    """Return ``YYYY-MM-DD`` for a ``YYYYMMDD...`` stamp, or '' if it is not one."""
    stamp = release_date[:8]
    if len(stamp) == 8 and stamp.isdigit():
        return f"{stamp[:4]}-{stamp[4:6]}-{stamp[6:]}"
    return ''


def _stream_to_file(session, url, filepath):
    """
    Stream ``url`` into ``filepath``; return False if the server sent HTML.

    The body is written to ``<filepath>.part`` and renamed into place only
    after it has been fully received, so an interrupted download never leaves
    a truncated file that a later run would skip as "already exists".

    Raises:
        requests.exceptions.RequestException: on network or HTTP errors.
        OSError: when the file cannot be written.
    """
    part_path = filepath + '.part'
    # (connect, read) timeouts keep a stalled server from hanging the run;
    # the context manager releases the streamed connection on every path.
    with session.get(url, stream=True, allow_redirects=True,
                     timeout=(10, 60)) as response:
        # An HTML body means we were served a page (e.g. the login page)
        # rather than the CBZ archive.
        if 'text/html' in response.headers.get('Content-Type', ''):
            return False
        response.raise_for_status()
        try:
            with open(part_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            os.replace(part_path, filepath)
        finally:
            # Only still present when the download or rename failed.
            if os.path.exists(part_path):
                os.remove(part_path)
    return True


def download_cbz_files(email, password, output_dir='downloads', headless=True):
    """
    Log in, load all downloads, and download all CBZ files.

    Products whose name contains "megazine" are saved under
    ``<output_dir>/Megazine``, everything else under ``<output_dir>/2000ad``.
    Files that already exist are skipped. Errors are printed rather than
    raised; the function returns None in every case.

    Args:
        email: Your shop.2000ad.com email
        password: Your shop.2000ad.com password
        output_dir: Directory where files will be saved (default: 'downloads')
        headless: Run browser in headless mode (default: True)
    """
    from urllib.parse import urljoin

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Set up Selenium
    print("🌐 Starting browser...")
    driver = setup_driver(headless=headless)
    browser_open = True
    try:
        # Log in
        if not login_with_selenium(driver, email, password):
            print("\n❌ Cannot proceed without successful login")
            return

        # Load downloads page with infinite scroll
        downloads_url = "https://shop.2000ad.com/account/downloads"
        html_content = scroll_and_load_all_items(driver, downloads_url)

        # Parse the HTML
        soup = BeautifulSoup(html_content, 'html.parser')
        products = soup.find_all('li', class_='product')
        print(f"\n📚 Found {len(products)} products to process\n")

        # Create requests session and transfer cookies. The session reuses
        # the browser's own User-Agent so the cookies are replayed by a
        # client that looks like the one that logged in.
        session = requests.Session()
        session.headers.update({
            'User-Agent': driver.execute_script("return navigator.userAgent"),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://shop.2000ad.com/'
        })
        transfer_cookies_to_requests(driver, session)

        # Close browser - we don't need it anymore
        driver.quit()
        browser_open = False
        print("✅ Browser closed\n")

        print(f"{'='*50}")
        print("Starting downloads...\n")
        downloaded = 0
        skipped = 0
        failed = 0

        for product in products:
            # Get product name for better logging
            product_name = product.get('data-name', 'Unknown')

            # Skip products with placeholder/error text
            if 'productVersion' in product_name:
                print(f"⏭️ Skipping (placeholder/error in name): {product_name}")
                skipped += 1
                continue

            # Publication date prefix (empty when missing or malformed)
            date_str = _format_release_date(product.get('data-released', ''))

            # Determine subdirectory based on product name
            subdir = 'Megazine' if 'megazine' in product_name.lower() else '2000ad'
            product_output_dir = os.path.join(output_dir, subdir)
            os.makedirs(product_output_dir, exist_ok=True)

            for form in product.find_all('form'):
                # Only forms whose submit button mentions CBZ are downloads
                # we care about.
                button = form.find('button', type='submit')
                if not (button and 'CBZ' in button.get_text()):
                    continue
                action = form.get('action')
                if not action:
                    continue
                # A relative form action is resolved against the page URL.
                # NOTE(review): the form's method attribute is ignored and
                # the action is always fetched with GET - confirm the site
                # accepts that.
                download_url = urljoin(downloads_url, action)

                # Create a safe filename with date
                if date_str:
                    filename = f"{date_str} - {product_name}.cbz"
                else:
                    filename = f"{product_name}.cbz"
                filename = filename.replace('/', '-').replace('\\', '-').replace(':', '-')
                filepath = os.path.join(product_output_dir, filename)

                # Check if file already exists
                if os.path.exists(filepath):
                    print(f"⏭️ Skipping (already exists): {subdir}/{filename}")
                    skipped += 1
                    continue

                try:
                    print(f"📥 Downloading to {subdir}/: {filename}")
                    if not _stream_to_file(session, download_url, filepath):
                        print(f"⚠️ Warning: Got HTML response instead of file")
                        print(f" This might be a permission issue or the file isn't available")
                        failed += 1
                        continue
                    file_size = os.path.getsize(filepath)
                    print(f"✅ Saved to {subdir}/: {filename} ({file_size / 1024 / 1024:.2f} MB)")
                    downloaded += 1
                    # Be polite - add a small delay between downloads
                    time.sleep(1)
                except requests.exceptions.RequestException as e:
                    print(f"❌ Error downloading {filename}: {e}")
                    failed += 1
                except Exception as e:
                    print(f"❌ Error saving {filename}: {e}")
                    failed += 1

        print(f"\n{'='*50}")
        print(f"Download complete!")
        print(f"✅ Successfully downloaded: {downloaded}")
        print(f"⏭️ Skipped (already exist): {skipped}")
        print(f"❌ Failed: {failed}")
        print(f"📁 Files saved to: {os.path.abspath(output_dir)}")
    except Exception as e:
        # Top-level boundary: report the failure with its traceback instead
        # of propagating it to the caller.
        print(f"\n❌ Fatal error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Make sure browser is closed, but only if it has not been already.
        if browser_open:
            try:
                driver.quit()
            except Exception as e:
                print(f"⚠️ Could not close browser cleanly: {e}")
if __name__ == "__main__":
    # Configuration is read from the EMAIL, PASSWORD and OUTPUT_DIR
    # environment variables; the placeholder defaults below can also be
    # edited directly.
    email = os.environ.get('EMAIL', 'your_email@example.com')
    password = os.environ.get('PASSWORD', 'your_password')
    output_dir = os.environ.get('OUTPUT_DIR', 'downloads')

    # Refuse to run while either credential is still the placeholder value.
    if email == 'your_email@example.com' or password == 'your_password':
        print("⚠️ Warning: Please set EMAIL and PASSWORD environment variables")
        print(" Example: EMAIL=your@email.com PASSWORD=yourpass python download_cbz.py")
        # SystemExit instead of the site-provided exit(), which is intended
        # for the interactive shell and is not always defined.
        raise SystemExit(1)

    # Run the download
    # Set headless=False if you want to see the browser window
    download_cbz_files(email, password, output_dir=output_dir, headless=True)