Return Styles: Pseud0ch, Terminal, Valhalla, NES, Geocities, Blue Moon. Entire thread

Archive webpage using Python

Name: Anonymous 2009-03-30 11:06

Hey guys, I'm currently trying to fetch a whole webpage with a Python script. So far nothing except the search for objects I will need to download is implemented. Did I forget something important?

#!/usr/bin/env python

import os
import sys

import html5lib
import mechanize
# NOTE: despite the name, this is an html5lib HTMLParser *instance*, not the
# BeautifulSoup class. It is built with the "beautifulsoup" tree builder;
# main() calls .parse() on it and then findAll() on the result, so the parsed
# tree is presumably a BeautifulSoup-style document.
BeautifulSoup = html5lib.HTMLParser(
    tree=html5lib.treebuilders.getTreeBuilder("beautifulsoup"))

# File extensions of linked objects that main() reports. They are upper-case
# because main() compares them against ext.upper().
SUFFIXES = (".PNG", ".JPEG", ".JPG", ".GIF", ".BMP")

def main():
    """Print the URLs of the resources referenced by a web page.

    The page URL is read from the first command-line argument. The page is
    fetched with mechanize and parsed with the module-level ``BeautifulSoup``
    parser; the ``href``/``src`` value of every stylesheet, alternate
    stylesheet, external script, image, link whose extension is listed in
    ``SUFFIXES``, and ``<embed>`` element is then printed, one per line.
    Values are printed exactly as they appear in the document.

    Exits with a usage message when no URL argument is given.
    """
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else "archive"
        sys.exit("usage: %s URL" % prog)

    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.set_handle_gzip(True)
    # ``addheaders`` (no underscore) is the header list mechanize sends with
    # each request; assigning to ``add_headers`` would only create an unused
    # attribute and leave the default headers in place.
    browser.addheaders = []

    soup = BeautifulSoup.parse(browser.open(sys.argv[1]))

    # Cascading Style Sheets
    # "href": True skips <link> elements without a target, which would
    # otherwise raise KeyError on the lookup below.
    for link in soup.findAll(
            "link", {"rel": "stylesheet", "type": "text/css", "href": True}):
        print(link["href"])

    # Alternative Cascading Style Sheets
    for link in soup.findAll(
            "link",
            {"rel": "alternate stylesheet", "type": "text/css", "href": True}):
        print(link["href"])

    # JavaScript (external scripts only: a src attribute is required)
    for script in soup.findAll(
            "script", {"type": "text/javascript", "src": True}):
        print(script["src"])

    # Images
    for img in soup.findAll("img", {"src": True}):
        print(img["src"])

    # Linked Objects: anchors whose target ends in one of SUFFIXES
    for a in soup.findAll("a", {"href": True}):
        ext = os.path.splitext(a["href"])[1]
        # SUFFIXES is upper-case, so normalise before comparing.
        if ext.upper() in SUFFIXES:
            print(a["href"])

    # Embedded Objects
    for embed in soup.findAll("embed", {"src": True}):
        print(embed["src"])

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Name: Anonymous 2009-03-30 11:28

You seem to have forgotten wget.

Name: Anonymous 2009-03-30 11:33

>>2
Nah, I didn't completely forget it, I guess; I just wanted to implement it in Python. Thanks for the tip, though!

Name: Anonymous 2009-03-30 14:58

why is BeautifulSoup highlighted?

Name: Anonymous 2009-03-30 15:09

>>4
Take a Guess you Faggot

Name: Anonymous 2009-03-30 15:28

>>5
Stop calling me a faggot, faggot

Name: Anonymous 2010-11-26 4:22

Name: Anonymous 2010-11-29 21:01


#!/usr/bin/env python

import os
import sys

import html5lib
import mechanize
# NOTE: despite the name, this is an html5lib HTMLParser *instance*, not the
# BeautifulSoup class. It is built with the "beautifulsoup" tree builder;
# main() calls .parse() on it and then findAll() on the result, so the parsed
# tree is presumably a BeautifulSoup-style document.
BeautifulSoup = html5lib.HTMLParser(
    tree=html5lib.treebuilders.getTreeBuilder("beautifulsoup"))

# File extensions of linked objects that main() reports. They are upper-case
# because main() compares them against ext.upper().
SUFFIXES = (".PNG", ".JPEG", ".JPG", ".GIF", ".BMP")

def main():
    """Print the URLs of the resources referenced by a web page.

    The page URL is read from the first command-line argument. The page is
    fetched with mechanize and parsed with the module-level ``BeautifulSoup``
    parser; the ``href``/``src`` value of every stylesheet, alternate
    stylesheet, external script, image, link whose extension is listed in
    ``SUFFIXES``, and ``<embed>`` element is then printed, one per line.
    Values are printed exactly as they appear in the document.

    Exits with a usage message when no URL argument is given.
    """
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else "archive"
        sys.exit("usage: %s URL" % prog)

    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.set_handle_gzip(True)
    # ``addheaders`` (no underscore) is the header list mechanize sends with
    # each request; assigning to ``add_headers`` would only create an unused
    # attribute and leave the default headers in place.
    browser.addheaders = []

    soup = BeautifulSoup.parse(browser.open(sys.argv[1]))

    # Cascading Style Sheets
    # "href": True skips <link> elements without a target, which would
    # otherwise raise KeyError on the lookup below.
    for link in soup.findAll(
            "link", {"rel": "stylesheet", "type": "text/css", "href": True}):
        print(link["href"])

    # Alternative Cascading Style Sheets
    for link in soup.findAll(
            "link",
            {"rel": "alternate stylesheet", "type": "text/css", "href": True}):
        print(link["href"])

    # JavaScript (external scripts only: a src attribute is required)
    for script in soup.findAll(
            "script", {"type": "text/javascript", "src": True}):
        print(script["src"])

    # Images
    for img in soup.findAll("img", {"src": True}):
        print(img["src"])

    # Linked Objects: anchors whose target ends in one of SUFFIXES
    for a in soup.findAll("a", {"href": True}):
        ext = os.path.splitext(a["href"])[1]
        # SUFFIXES is upper-case, so normalise before comparing.
        if ext.upper() in SUFFIXES:
            print(a["href"])

    # Embedded Objects
    for embed in soup.findAll("embed", {"src": True}):
        print(embed["src"])

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Name: Anonymous 2010-12-22 8:38

Newer Posts
Don't change these.
Name: Email:
Entire Thread Thread List