Name: Anonymous 2009-03-30 11:06
Hey guys, I'm currently trying to fetch a whole webpage with a Python script. So far, only the search for the objects I will need to download is implemented. Did I forget something important?
#!/usr/bin/env python
import os
import sys
import html5lib
import mechanize
# One shared html5lib parser configured with the "beautifulsoup" tree builder.
# Despite its name, this is a parser instance rather than the BeautifulSoup
# class: call BeautifulSoup.parse(stream) to obtain a searchable tree.
_soup_tree_builder = html5lib.treebuilders.getTreeBuilder("beautifulsoup")
BeautifulSoup = html5lib.HTMLParser(tree=_soup_tree_builder)

# Upper-case file extensions; main() compares the upper-cased extension of
# each <a href> target against this tuple to pick out linked images.
SUFFIXES = (".PNG", ".JPEG", ".JPG", ".GIF", ".BMP")
def main():
    """Print the URL of every resource referenced by the page at sys.argv[1].

    The page is fetched with mechanize and parsed with the module-level
    html5lib parser.  References are printed one per line, in this order:
    stylesheets, alternate stylesheets, external scripts, images, hyperlinks
    whose target has one of the SUFFIXES extensions, and embedded objects.
    Nothing is downloaded besides the page itself.

    Exits with a usage message (via SystemExit) when no URL argument is given.
    """
    if len(sys.argv) < 2:
        # Give a readable message instead of an IndexError traceback.
        sys.exit("usage: %s URL" % sys.argv[0])

    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.set_handle_gzip(True)
    # mechanize takes its extra request headers from ``addheaders``; the old
    # ``add_headers`` spelling merely created an unused attribute.  Assigning
    # an empty list clears the library's default headers.
    browser.addheaders = []
    soup = BeautifulSoup.parse(browser.open(sys.argv[1]))

    # Cascading Style Sheets
    for link in soup.findAll("link", {"rel": "stylesheet", "type": "text/css"}):
        print(link["href"])

    # Alternative Cascading Style Sheets
    for link in soup.findAll(
            "link", {"rel": "alternate stylesheet", "type": "text/css"}):
        print(link["href"])

    # JavaScript (external scripts only: a src attribute is required)
    for script in soup.findAll(
            "script", {"type": "text/javascript", "src": True}):
        print(script["src"])

    # Images
    for img in soup.findAll("img", {"src": True}):
        print(img["src"])

    # Linked Objects
    for a in soup.findAll("a", {"href": True}):
        href = a["href"]
        # Ignore any "#fragment" / "?query" part so that a target such as
        # "pic.png?size=2" is still recognised by its extension.
        path = href.split("#", 1)[0].split("?", 1)[0]
        ext = os.path.splitext(path)[1]
        if ext.upper() in SUFFIXES:
            print(href)

    # Embedded Objects
    for embed in soup.findAll("embed", {"src": True}):
        print(embed["src"])
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()