Name: Anonymous 2010-06-21 0:59
I hate /prog/, I hate Python, I hate Gweedo, I hate Xarn, but most of all I hate myself. Worst 4 hours of my life.
Changelog:
- Raised database busy timeout
- Added several database PRAGMAs
- Added HTTP gzip support
- Added automatic retry support
- Request only new posts
- VACUUM the database when done
- Multithreaded thread fetching
- Flush database only every 5s
Database format is unchanged and therefore it's fully interchangeable with the unmodified version.
Enjoy the turd, I sure as hell won't be touching this shit ever again.
Part 1/2 (append both parts)
Changelog:
- Raised database busy timeout
- Added several database PRAGMAs
- Added HTTP gzip support
- Added automatic retry support
- Request only new posts
- VACUUM the database when done
- Multithreaded thread fetching
- Flush database only every 5s
Database format is unchanged and therefore it's fully interchangeable with the unmodified version.
Enjoy the turd, I sure as hell won't be touching this shit ever again.
Part 1/2 (append both parts)
#!/usr/bin/python -u
# ``Constants''
db_name = 'prog.db'
prog_url = 'http://dis.4chan.org/prog/'
read_url = 'http://dis.4chan.org/read/prog/'
threads = 12
# Make sure we're using a compatible version
from sys import version, exit
if version[0] != '2':
print "Your version of Python is not supported at this time.",\
"Please use Python 2.5 or 2.6."
exit(1)
# Set up the database connection first
import sqlite3
db_conn = sqlite3.connect(db_name, 300.0)
db = db_conn.cursor()
try:
db.execute('PRAGMA journal_mode = PERSIST')
db.execute('PRAGMA synchronous = normal')
db.execute('PRAGMA page_size = 4096')
db.execute("""
CREATE TABLE IF NOT EXISTS threads (
thread INTEGER PRIMARY KEY,
title TEXT,
last_post INTEGER
)""")
db.execute("""
CREATE TABLE IF NOT EXISTS posts (
thread INTEGER REFERENCES threads(thread),
id INTEGER,
author TEXT,
email TEXT,
trip TEXT,
time INTEGER,
body TEXT,
PRIMARY KEY (thread, id)
)""")
db_conn.commit()
except sqlite3.DatabaseError:
# Specified DB file exists, but isn't an SQLite DB file.
print "Use a different filename for your DB."
raise
# Try to fetch subject.txt
import urllib2, re, StringIO, gzip, time, sys
# Transfer statistics.  stat_cbytes/stat_ubytes (compressed/uncompressed
# bytes received) are updated by request_compressed() below.
stat_cbytes = 0
stat_ubytes = 0
# NOTE(review): stat_threads and stat_skips are only initialised in this
# chunk — presumably updated by the thread-fetching code in part 2.
stat_threads = 0
stat_skips = 0
def request_compressed(url):
global stat_cbytes
global stat_ubytes
wait = 0.5
while(True):
try:
request = urllib2.Request(url)
request.add_header('Accept-encoding', 'gzip')
opener = urllib2.build_opener()
f = opener.open(request)
c = f.read()
stat_cbytes += len(c)
serverstream = StringIO.StringIO(c)
if f.headers.get('Content-Encoding') == 'gzip':
decstream = gzip.GzipFile(fileobj=serverstream)
u = decstream.read()
stat_ubytes += len(u)
return StringIO.StringIO(u)
else:
stat_ubytes += len(c)
return serverstream
except KeyboardInterrupt:
print "KeyboardInterrupt, aborting..."
raise
except:
print "Network fetch error:", sys.exc_info()[0]
print "*** Trying again in", wait,
print "seconds (press CTRL-C to abort)"
time.sleep(wait)
wait = wait * 2 if wait < 15 else 30
# Download the board index (subject.txt); request_compressed() handles
# gzip decoding and retries, so by the time this returns we have a body.
print "Fetching subject.txt...",
subjecttxt = request_compressed(prog_url + 'subject.txt')
print "Got it."
# Parse each line, check with DB, keep a list of all threads to be updated
# Each subject.txt line is seven '<>'-separated fields.  The pattern is a
# non-raw string: '\\n' reaches the regex engine as '\n' (newline) and the
# '\d' escapes pass through unchanged.  '#' comments inside the string are
# stripped by re.VERBOSE, not by Python.
regex = re.compile(u"""
^(.*?) # Subject
<>
(.*?) # Name
<>
(.*?) # E-mail
<>
(-?\d*) # Time posted/thread ID
<>
(\d*) # Number of replies
<>
(.*?) # ???
<>
(\d*) # Time of last post
\\n$""", re.VERBOSE)
to_update = []
for line in subjecttxt.readlines():
parsed = regex.search(unicode(line,"iso-8859-1"))
try:
data = parsed.groups()
result = db.execute('SELECT last_post FROM threads WHERE thread = ?', (unicode(data[3]), )).fetchone()
if result is None:
db.execute('INSERT INTO threads VALUES (?, ?, ?)', (unicode(data[3]), unicode(data[0]), 0))
to_update.append((unicode(data[3]), unicode(data[6])))
elif int(result[0]) < int(data[6]):
to_update.append((unicode(data[3]), unicode(data[6])))
except:
# Failed to parse line; skip it
print "subjects.txt fail:", line
print "%d threads to update." % len(to_update)