Return Styles: Pseud0ch, Terminal, Valhalla, NES, Geocities, Blue Moon. Entire thread

[FAST]>TURBO/prog/SCRAPE<

Name: Anonymous 2010-06-21 0:59

I hate /prog/, I hate Python, I hate Gweedo, I hate Xarn, but most of all I hate myself. Worst 4 hours of my life.

Changelog:
- Raised database busy timeout
- Added several database PRAGMAs
- Added HTTP gzip support
- Added automatic retry support
- Request only new posts
- VACUUM the database when done
- Multithreaded thread fetching
- Flush database only every 5s

Database format is unchanged and therefore it's fully interchangeable with the unmodified version.

Enjoy the turd, I sure as hell won't be touching this shit ever again.

Part 1/2 (append both parts)


#!/usr/bin/python -u

# ``Constants''
#
# Module-level configuration; everything below (and part 2) reads these
# names, so do not rename them.

db_name  = 'prog.db'                            # SQLite DB file, created on first run
prog_url = 'http://dis.4chan.org/prog/'         # board root; subject.txt lives here
read_url = 'http://dis.4chan.org/read/prog/'    # prefix for per-thread read pages
threads  = 12                                   # presumably the fetcher thread-pool size (used in part 2) -- confirm

# Make sure we're using a compatible version

from sys import version, exit

if version[0] != '2':
    print "Your version of Python is not supported at this time.",\
          "Please use Python 2.5 or 2.6."
    exit(1)


# Set up the database connection first

import sqlite3

# Second argument is the busy timeout in seconds (changelog: "Raised
# database busy timeout") -- waits up to 300s on a locked DB.
db_conn = sqlite3.connect(db_name, 300.0)
db = db_conn.cursor()

try:
    # PRAGMAs from the changelog: keep the journal file around between
    # transactions, relax fsync behaviour, and use 4 KiB pages.
    db.execute('PRAGMA journal_mode = PERSIST')
    db.execute('PRAGMA synchronous = normal')
    db.execute('PRAGMA page_size = 4096')
    # One row per thread; last_post tracks the newest post time we have
    # stored, so later runs can request only new posts.
    db.execute("""
        CREATE TABLE IF NOT EXISTS threads (
            thread INTEGER PRIMARY KEY,
            title TEXT,
            last_post INTEGER
        )""")
    # One row per post, keyed by (thread, post number within thread).
    db.execute("""
        CREATE TABLE IF NOT EXISTS posts (
            thread INTEGER REFERENCES threads(thread),
            id INTEGER,
            author TEXT,
            email TEXT,
            trip TEXT,
            time INTEGER,
            body TEXT,
            PRIMARY KEY (thread, id)
        )""")
    db_conn.commit()

except sqlite3.DatabaseError:
    # Specified DB file exists, but isn't an SQLite DB file.
    print "Use a different filename for your DB."
    raise


# Try to fetch subject.txt

import urllib2, re, StringIO, gzip, time, sys

# Running transfer statistics; the first two are updated by
# request_compressed() below.
stat_cbytes = 0   # compressed bytes received over the wire
stat_ubytes = 0   # bytes after gzip decompression
stat_threads = 0  # presumably threads fetched, updated in part 2 -- confirm
stat_skips = 0    # presumably items skipped, updated in part 2 -- confirm

def request_compressed(url):
    global stat_cbytes
    global stat_ubytes
    wait = 0.5
    while(True):
        try:
            request = urllib2.Request(url)
            request.add_header('Accept-encoding', 'gzip')
            opener = urllib2.build_opener()
            f = opener.open(request)
            c = f.read()
            stat_cbytes += len(c)
            serverstream = StringIO.StringIO(c)
            if f.headers.get('Content-Encoding') == 'gzip':
                decstream = gzip.GzipFile(fileobj=serverstream)
                u = decstream.read()
                stat_ubytes += len(u)
                return StringIO.StringIO(u)
            else:
                stat_ubytes += len(c)
                return serverstream
        except KeyboardInterrupt:
            print "KeyboardInterrupt, aborting..."
            raise
        except:
            print "Network fetch error:", sys.exc_info()[0]
            print "*** Trying again in", wait,
            print "seconds (press CTRL-C to abort)"
            time.sleep(wait)
            wait = wait * 2 if wait < 15 else 30


# subject.txt is the board index: one line per thread.  The trailing
# comma keeps the status message on one line with "Got it." below.
print "Fetching subject.txt...",

subjecttxt = request_compressed(prog_url + 'subject.txt')

print "Got it."


# Parse each line, check with DB, keep a list of all threads to be updated

regex = re.compile(u"""
    ^(.*?)      # Subject
    <>
    (.*?)       # Name
    <>
    (.*?)       # E-mail
    <>
    (-?\d*)     # Time posted/thread ID
    <>
    (\d*)       # Number of replies
    <>
    (.*?)       # ???
    <>
    (\d*)       # Time of last post
    \\n$""", re.VERBOSE)
to_update = []

for line in subjecttxt.readlines():
    parsed = regex.search(unicode(line,"iso-8859-1"))
    try:
        data = parsed.groups()
        result = db.execute('SELECT last_post FROM threads WHERE thread = ?', (unicode(data[3]), )).fetchone()
        if result is None:
            db.execute('INSERT INTO threads VALUES (?, ?, ?)', (unicode(data[3]), unicode(data[0]), 0))
            to_update.append((unicode(data[3]), unicode(data[6])))
        elif int(result[0]) < int(data[6]):
            to_update.append((unicode(data[3]), unicode(data[6])))

    except:
        # Failed to parse line; skip it
        print "subjects.txt fail:", line

print "%d threads to update." % len(to_update)

Name: Anonymous 2010-06-23 2:17

>>16
Hello, Xarn!

Newer Posts
Don't change these.
Name: Email:
Entire Thread Thread List