/prog/ - Critique my code

Name: Anonymous 2011-07-26 4:31

Hello /prog/, I'm a complete beginner and I just wrote the longest script I have yet written. It's not finished, but would you please critique it and tell me if it's ugly or not?



#########################################

# g.e-hentai.org Gallery Scraper v0.1   #   

# by VitamnP                            #

# ENTERPRISE QUALITY                    #

#########################################



import urllib2

import time

from math import ceil

from BeautifulSoup import BeautifulSoup



def requestPage(url):

    # Define user agent to be used in requests

    agent = {'User-Agent':'Uzbl (Webkit 1.3) (Linux i686 [i686])'}

    

    # Request page from server for parsing, identifying as a browser so we don't

    # get banned

    request = urllib2.Request(url, data=None, headers=agent)

    response = urllib2.urlopen(request)

    info = response.info()

    page = response.read()

    

    # Create BeautifulSoup object from page for parsing if the page type is HTML

    soup = BeautifulSoup(page)



    return soup



def getInfo(url):

    # Request page and prepare for parsing

    soup = requestPage(url)    

    

    # Get number of images and total size in MB

    image_string = soup.find(text='Images:')

    info_string = image_string.findNext(text=True)

    

    # Determine number of thumbnail pages and generate URLs

    num_images = float(info_string.split(' ')[0])

    num_pages = ceil(num_images / 20.0) - 1

    pages_list = []

    while num_pages >= 0:

        pages_list.append(url + '?p=' + str(num_pages))

        num_pages -= 1

    

    return {'pages_list' : pages_list, 'info_string' : info_string}

    

def getImagePageLinks(pages_list):

    # Create a list to store image page URLs

    all_page_links = []

    

    # Go through each thumbnail page link in the list, extract the links to

    # the image pages, and store them in a list

    for link in pages_list:

        soup = requestPage(link)

        image_links = soup.findAll('div', {"class" : "gdtm"})

        for link in image_links:

            all_page_links.append(link.findNext('a')['href'])



    return all_page_links

    

def getAllImageLinks(all_page_links, wait_time_secs):

    # Create a list to store the image URLs

    all_image_links = []



    # For each URL in all_image_links, request the page from the server and save

    # the image on the page to a file in the directory specified by the user

    for link in all_image_links:

        soup = requestPage(link)

        topframe = soup.find('iframe')

        imgurl = topframe.findNext('img')['src']

        all_images.append(imgurl)

        time.sleep(wait_time_secs)



    return all_image_links



def download(image_links, save_dir):

    

    for image in image_links:

        filename = save_dir + image.split('/')[-1]

        f = open(filename, 'wb')

        f.write(data)

        f.close()

        

        

    

        

def main():

    print 'g.e-hentai.org Gallery Scraper v0.1'

    print "by VitamnP"

    

    gallery_url = raw_input("Enter gallery URL: \n")

    save_dir = raw_input("Enter save directory: \n")

    wait_time_secs = input("Seconds to wait between page loads: \n")

    

    info = getInfo(gallery_url)

    print "Gallery contains ", info['info_string']



    all_page_links = getImagePageLinks(info['pages_list'])

    all_image_links = getAllImageLinks(all_page_links, wait_time_secs)



    download(all_image_links, save_dir)

    

if __name__ == "__main__":

    main()

Name: Anonymous 2011-07-26 8:37

>>1

At requestPage:
Why assign soup? Why not just return BeautifulSoup(page) ?

Also:





def getAllImageLinks(all_page_links, wait_time_secs):

    # Create a list to store the image URLs

    all_image_links = []



    # For each URL in all_image_links, request the page from the server and save

    # the image on the page to a file in the directory specified by the user

    for link in all_image_links:

        soup = requestPage(link)

        topframe = soup.find('iframe')

        imgurl = topframe.findNext('img')['src']

        all_images.append(imgurl)

        time.sleep(wait_time_secs)



    return all_image_links

But all_image_links is empty, thus it won't return shite.

Critique my code

1 Name: Anonymous 2011-07-26 4:31

15 Name: Anonymous 2011-07-26 8:37

Name: Anonymous 2011-07-26 4:31

Name: Anonymous 2011-07-26 8:37