If anyone cares, this will download the full-size images from a list of gelbooru tabs. Don't remove the sleep, pls don't be cunts
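The input file is just post URLs, one per line. Something like this (made-up ids, but that's the usual gelbooru post URL shape):

https://gelbooru.com/index.php?page=post&s=view&id=1234567
https://gelbooru.com/index.php?page=post&s=view&id=7654321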
import requests
from bs4 import BeautifulSoup
import os
import time
FILE = '.txt'      # text file with one gelbooru post URL per line
DEST_FOLDER = ...  # fill in wherever you want the images saved
def scrape_source(url):
    # Fetch the post page and work out the full-size image URL
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # ensure the request was successful
    soup = BeautifulSoup(response.content, 'html.parser')
    img = soup.find('img', id='image')  # the main post image
    if img is None:
        return None  # post is gone or the page layout changed
    img_src = img['src']
    if 'sample' not in img_src:
        return img_src  # already the full-size file
    # It's a downscaled sample: rebuild the original URL from the hash path
    # plus the real file extension stored on the note-container section
    elems = soup.find_all('section', class_='note-container')
    if not elems:
        return img_src  # can't recover the extension, settle for the sample
    extension = elems[0]['data-file-ext']
    if not extension.startswith('.'):
        extension = '.' + extension  # the attribute may omit the dot
    first_part = img_src.split('//samples/')[0]
    hash_path = "/".join(img_src.split('/')[-3:])  # e.g. ab/cd/sample_<hash>.jpg
    hash_path = hash_path.replace("sample_", "").split(".")[-2]  # -> ab/cd/<hash>
    return f'{first_part}/images/{hash_path}{extension}'
def download_file(url, dest_folder):
    os.makedirs(dest_folder, exist_ok=True)
    try:
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()  # check for HTTP errors
        filename = os.path.join(dest_folder, url.split("/")[-1])
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    except requests.exceptions.RequestException as e:
        print(f'failed to download {url}: {e}')  # skip it instead of dying silently
def download_from_file(file_path, dest_folder='.'):
    with open(file_path, 'r') as file:
        urls = file.readlines()
    for url in urls:
        url = url.strip()  # remove any leading/trailing whitespace
        if not url:
            continue  # skip blank lines
        img_url = scrape_source(url)
        if img_url:
            download_file(img_url, dest_folder)
        print('sleeping...')
        time.sleep(0.5)  # rate limit between posts, leave this in
if __name__ == "__main__":
    download_from_file(FILE, DEST_FOLDER)
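If you start getting 403s, it might be the default python-requests user agent getting blocked. Not sure gelbooru does this right now, but plenty of boorus do, and a browser-ish header is the usual fix (the UA string below is just an example, anything browser-like works):

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # example UA string, swap in whatever
response = requests.get(url, headers=HEADERS, timeout=30)

Pass headers=HEADERS in both requests.get calls if you need it.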