Name: Anonymous 2011-07-26 4:31
Hello /prog/, I'm a complete beginner and I just wrote the longest script I have yet written. It's not finished, but would you please critique it and tell me if it's ugly or not?
#########################################
# g.e-hentai.org Gallery Scraper v0.1 #
# by VitamnP #
# ENTERPRISE QUALITY #
#########################################
import urllib2
import time
from math import ceil
from BeautifulSoup import BeautifulSoup
def requestPage(url):
    """Fetch `url` and return the response body parsed as a BeautifulSoup object.

    Sends a browser-like User-Agent header so the server does not
    reject the request as coming from a bot.
    """
    # Identify as a real browser so we don't get banned.
    agent = {'User-Agent': 'Uzbl (Webkit 1.3) (Linux i686 [i686])'}
    request = urllib2.Request(url, data=None, headers=agent)
    # FIX: the original never closed the response and kept an unused
    # `info = response.info()` local; close the connection explicitly.
    response = urllib2.urlopen(request)
    try:
        page = response.read()
    finally:
        response.close()
    return BeautifulSoup(page)
def getInfo(url):
    """Scrape a gallery's front page for its size info and thumbnail pages.

    url -- the gallery's base URL.
    Returns a dict with:
        'pages_list'  -- thumbnail-page URLs, page 0 first
        'info_string' -- the raw text after the 'Images:' label
                         (image count and total size)
    """
    soup = requestPage(url)
    # The text node immediately following the 'Images:' label holds
    # the image count and total size.
    image_string = soup.find(text='Images:')
    info_string = image_string.findNext(text=True)
    # Galleries show 20 thumbnails per page. FIX: math.ceil returns a
    # float in Python 2, and the original fed it straight to str(),
    # producing broken URLs like '?p=2.0'; cast to int first. Also
    # build the list in ascending page order instead of counting down.
    num_images = float(info_string.split(' ')[0])
    num_pages = int(ceil(num_images / 20.0))
    pages_list = [url + '?p=' + str(p) for p in range(num_pages)]
    return {'pages_list': pages_list, 'info_string': info_string}
def getImagePageLinks(pages_list):
    """Collect the URL of every image page linked from the thumbnail pages.

    pages_list -- thumbnail-page URLs (from getInfo()['pages_list']).
    Returns a flat list of image-page URLs, in page order.
    """
    all_page_links = []
    # FIX: the original reused the name `link` for both the outer and
    # inner loop variables, shadowing the page URL; use distinct names.
    for page_url in pages_list:
        soup = requestPage(page_url)
        # Each thumbnail sits in a <div class="gdtm"> wrapping an <a>
        # that points at the full image page.
        for thumb in soup.findAll('div', {"class": "gdtm"}):
            all_page_links.append(thumb.findNext('a')['href'])
    return all_page_links
def getAllImageLinks(all_page_links, wait_time_secs):
    """Visit each image page and return the direct URLs of the images.

    all_page_links -- image-page URLs (from getImagePageLinks).
    wait_time_secs -- seconds to sleep between requests, to be polite
                      to the server.
    """
    all_image_links = []
    # FIX: the original iterated over the empty result list
    # (all_image_links) instead of the parameter, and appended to an
    # undefined name (all_images) — so it always returned [].
    for link in all_page_links:
        soup = requestPage(link)
        # The full-size <img> is the first one after the top <iframe>.
        topframe = soup.find('iframe')
        imgurl = topframe.findNext('img')['src']
        all_image_links.append(imgurl)
        time.sleep(wait_time_secs)
    return all_image_links
def download(image_links, save_dir):
    """Download every image in image_links into save_dir.

    image_links -- direct image URLs (from getAllImageLinks).
    save_dir    -- destination directory; concatenated as-is, so it
                   should end with a path separator.
    Filenames are taken from the last path component of each URL.
    """
    for image in image_links:
        # FIX: the original wrote an undefined name `data` and never
        # actually fetched the image from the server.
        response = urllib2.urlopen(image)
        try:
            data = response.read()
        finally:
            response.close()
        filename = save_dir + image.split('/')[-1]
        # `with` guarantees the file is closed even if the write fails.
        with open(filename, 'wb') as f:
            f.write(data)
def main():
print 'g.e-hentai.org Gallery Scraper v0.1'
print "by VitamnP"
gallery_url = raw_input("Enter gallery URL: \n")
save_dir = raw_input("Enter save directory: \n")
wait_time_secs = input("Seconds to wait between page loads: \n")
info = getInfo(gallery_url)
print "Gallery contains ", info['info_string']
all_page_links = getImagePageLinks(info['pages_list'])
all_image_links = getAllImageLinks(all_page_links, wait_time_secs)
download(all_image_links, save_dir)
if __name__ == "__main__":
main()