Return Styles: Pseud0ch, Terminal, Valhalla, NES, Geocities, Blue Moon. Entire thread

/prog/ stats

Name: Anonymous 2008-05-01 11:58

As of some time two days ago:

5242 threads.
100M data.
8.1M compressed (tar and bz2).

Don't ask me how I collected this data or moot will be pissed off and hax my anus.

Name: Anonymous 2008-05-01 13:21

import os
import sqlite3
import urllib2
import datetime
import re
import time
import sys

# TODO: better error handling
#       parse html to determine both plain text and original bbcode
#       (posts are currently stored with empty bbcode/textonly values)

# Startup banner followed by an empty line.
print '*** world4ch archiver ***'
print ''

def geturl(url):
    """Download *url* and return the raw response body as a string.

    The request carries a User-Agent header identifying the archiver.
    Errors raised by urllib2 (for example urllib2.URLError) are not
    caught here and propagate to the caller.
    """
    req = urllib2.Request(url)
    req.add_header("User-Agent", "Mozilla/4.0 (compatible; world4ch archiving robot; anonymous)")
    res = urllib2.urlopen(req)
    try:
        return res.read()
    finally:
        # Close the response explicitly instead of waiting for garbage
        # collection; the script issues one request per thread, so leaked
        # connections would otherwise pile up over a long run.
        res.close()

def totimestamp(dt):
    """Convert the datetime *dt* to a float of seconds since the epoch.

    The whole-second part comes from time.mktime() applied to
    dt.timetuple(); the microseconds of *dt* are added back as a fraction.
    """
    whole_seconds = time.mktime(dt.timetuple())
    fraction = dt.microsecond/1e6
    return whole_seconds + fraction

if not os.path.exists("world4ch_archive.db"):
    print 'creating archive database'
    db = sqlite3.connect("world4ch_archive.db")
    db.execute("""
        create table boards (
            board_name text not null primary key
        )
    """)
    db.execute("""
        create table threads (
            board_name text not null,
            thread_no integer not null,
            subject text not null,
            post_count integer not null,
            highest_post integer not null,
            time_of_last_post integer not null,
            primary key ( board_name, thread_no )
        )
    """)
    db.execute("""
        create table posts (
            board_name test not null,
            thread_no integer not null,
            post_no integer not null,
            date_time text not null,
            name text not null,
            trip text,
            email text,
            id text,
            html text not null,
            bbcode text not null,
            textonly text not null,
            primary key ( board_name, thread_no, post_no )
        )
    """)
    boards = "anime,book,carcom,comp,food,games,img,lang,lounge,music,newnew,newpol,prog,sci,sjis,sports,tech,tele,vip"
    boards = [(board_name,) for board_name in boards.split(',')]

    db.executemany("insert into boards values (?)", boards)
    db.commit()

# Working connection: an in-memory database that holds the subject.txt
# listing of the board currently being processed.
print 'creating in-memory database'
db = sqlite3.connect(":memory:")
# Have sqlite3 hand TEXT values back as plain str objects.
db.text_factory = str
# One row per thread listed in a board's subject.txt. board_name is supplied
# by the script; the remaining seven columns are filled from the fields of
# each subject.txt line.
db.execute("""
  create table subject_txt (
    board_name text,
    subject text,
    name text,
    icon text,
    thread_no integer,
    highest_post integer,
    nothing text,
    time_of_last_post integer,
    primary key (board_name, thread_no)
  )
""")

# NOTE(review): this index covers the same columns as the table's primary
# key, so it looks redundant -- confirm before removing.
db.execute("""
  create unique index pk_subject_txt on subject_txt ( board_name, thread_no )
""")

# Attach the on-disk archive to the same connection so its tables can be
# addressed as archive.<table> and joined against subject_txt.
print 'attaching archive database'
db.execute(r"attach database 'world4ch_archive.db' as archive")

# Thread page pattern (re.DOTALL lets "." span newlines). Groups:
#   1. contents of the <h2> element (used as the thread subject)
#   2. everything between <div class="thread"> and <div class="bottomnav">
#   3. contents of the "postnum" span inside the "postfieldleft" cell
re_thread = re.compile(r'<h2>(.*?)</h2>.*?<div class="thread">(.*?)<div class="bottomnav">.*?<td class="postfieldleft"><span class="postnum">(.*?)</span></td>',re.DOTALL)
# Single post pattern, applied to group 2 of re_thread. Groups:
#   1. link text inside the "postnum" span   2. "postername" span contents
#   3. "postertrip" span contents            4. "posterdate" span contents
#   5. "id" span contents                    6. <blockquote> contents
re_post = re.compile(r'<h3><span class="postnum"><a .*?>(.*?)</a>.*?<span class="postername">(.*?)</span>.*?<span class="postertrip">(.*?)</span>.*?<span class="posterdate">(.*?)</span>.*?<span class="id">(.*?)</span>.*?</h3>.*?<blockquote>(.*?)</blockquote>',re.DOTALL)
# mailto link pattern. Groups: 1. the address after "mailto:", 2. the link text.
re_email = re.compile(r'<a href="mailto:(.*?)">(.*?)</a>')

def get_new_posts():

    threads = db.execute("""
      select A.board_name, A.thread_no, B.highest_post+1, B.post_count
      from subject_txt A
      left join archive.threads B on
        A.thread_no = B.thread_no and
        A.board_name = B.board_name
      where
       (B.thread_no is null or
        A.highest_post > B.highest_post) and
            A.highest_post > 0
    """).fetchall()

    for i in xrange(len(threads)):
        board_thread = threads[i]
        board_name = board_thread[0]
        thread_no = board_thread[1]
        url = "http://dis.4chan.org/read/"+board_name+"/"+str(thread_no)
        if board_thread[2]==None:
            from_post = 1
            current_post_count = 0
        else:
            from_post = board_thread[2]
            url += "/"+str(from_post)+"-"
            current_post_count = board_thread[3]
        print board_name, "("+str(i+1)+"/"+str(len(threads))+")", thread_no, "from post", from_post,
        html_page = geturl(url)
        post_search = re_thread.findall(html_page)
        subject = post_search[0][0].strip().replace("&gt;",">").replace("&lt;",">").replace("&quot;",'"').replace("'","'").replace("&amp;","&")
        posts =  post_search[0][1].strip()
        highest_post = int(post_search[0][2])-1
        post_count = current_post_count
        post_no = 0
        time_of_last_post = 0
        for post in re_post.findall(posts):
            post_count += 1
            if post_no > 0:
                print '\b'*(len(str(post_no))+2),
            post_no = int(post[0])
            print post_no,
            trip = post[2]
            if trip == '':
                trip = None
            name_email = re_email.match(post[1])
            if name_email == None:
                name = post[1]
                email = None
            else:
                name = name_email.groups()[1]
                email = name_email.groups()[0]
            # NO! this breaks on img/1104652020/21! store as string instead.
            # date_time = int(totimestamp(datetime.datetime.strptime(post[3], "%Y-%m-%d %H:%M")))
            date_time = post[3].strip()
            time_of_last_post = date_time
            id = post[4].strip()
            if id == '':
                id = None
            html = post[5]
            ### HTML PARSING WILL GO HERE ###
            row = (board_name, thread_no, post_no, date_time, name, trip, email, id, html, '', '')
            cc = db.execute('replace into archive.posts values (?,?,?,?,?,?,?,?,?,?,?)', row)
        row = (board_name, thread_no, subject, post_count, highest_post, time_of_last_post)
        cc = db.execute('replace into archive.threads values (?,?,?,?,?,?)', row)
        cc = db.commit()
        print 'highest post is now',highest_post,'~'
        time.sleep(2)


# main board loop
if len(sys.argv) >= 2:
    comparator = "="
    if len(sys.argv) == 3:
        if sys.argv[2]=="-":
            comparator = ">="
    boards = db.execute("select board_name from archive.boards where board_name "+comparator+" ?", (sys.argv[1],)).fetchall()
else:
    boards = db.execute("select board_name from archive.boards").fetchall()

for board in boards:
    board_name = board[0]
    print 'getting subject list for /'+board_name+'/'
    subject_txt = geturl("http://dis.4chan.org/"+board_name+"/subject.txt")
    subject_txt = [tuple(line.rsplit("<>",6)) for line in subject_txt.split("\n") if line != ""]
    db.execute("delete from subject_txt")
    db.executemany("insert into subject_txt values ('"+board_name+"',?,?,?,?,?,?,?)", subject_txt)
    print 'retrieving new posts'
    get_new_posts()
    time.sleep(5)

Newer Posts
Don't change these.
Name: Email:
Entire Thread Thread List