/prog/ - /prog/ stats

Name: Anonymous 2008-05-01 13:21

import os

import sqlite3

import urllib2

import datetime

import re

import time

import sys



# TODO: better error handling

# TODO: parse html to determine both plain text and original bbcode



print '*** world4ch archiver ***'

print ''



def geturl(url):
    """Fetch *url* and return the response body as read from the server.

    The request carries the archiver's own User-Agent header.  Errors raised
    by urllib2 are not caught here; they propagate to the caller.
    """
    req = urllib2.Request(url)
    req.add_header("User-Agent", "Mozilla/4.0 (compatible; world4ch archiving robot; anonymous)")
    res = urllib2.urlopen(req)
    try:
        return res.read()
    finally:
        # Close the response explicitly so the connection is released even
        # when read() fails, instead of waiting for garbage collection.
        res.close()



def totimestamp(dt):
    """Return *dt* as a float number of seconds, including microseconds.

    The whole-second part comes from time.mktime() applied to the datetime's
    time tuple; the microsecond field is added as a fraction of a second.
    """
    whole_seconds = time.mktime(dt.timetuple())
    fraction = dt.microsecond / 1e6
    return whole_seconds + fraction



if not os.path.exists("world4ch_archive.db"):

    print 'creating archive database'

    db = sqlite3.connect("world4ch_archive.db")

    db.execute("""

        create table boards (

            board_name text not null primary key

        )

    """)

    db.execute("""

        create table threads (

            board_name text not null,

            thread_no integer not null,

            subject text not null,

            post_count integer not null,

            highest_post integer not null,

            time_of_last_post integer not null,

            primary key ( board_name, thread_no )

        )

    """)

    db.execute("""

        create table posts (

            board_name test not null,

            thread_no integer not null,

            post_no integer not null,

            date_time text not null,

            name text not null,

            trip text,

            email text,

            id text,

            html text not null,

            bbcode text not null,

            textonly text not null,

            primary key ( board_name, thread_no, post_no )

        )

    """)

    boards = "anime,book,carcom,comp,food,games,img,lang,lounge,music,newnew,newpol,prog,sci,sjis,sports,tech,tele,vip"

    boards = [(board_name,) for board_name in boards.split(',')]



    db.executemany("insert into boards values (?)", boards)

    db.commit()



# Working connection for the rest of the script: an in-memory database that
# holds the most recently downloaded subject.txt listing, so it can be joined
# against the archive with plain SQL.
print 'creating in-memory database'

db = sqlite3.connect(":memory:")

# Hand TEXT values back as str rather than unicode objects.
db.text_factory = str

# One row per line of a board's subject.txt.  The main loop supplies
# board_name itself and fills the remaining seven columns, in this order,
# from the "<>"-separated fields of each line.
db.execute("""

  create table subject_txt (

    board_name text,

    subject text,

    name text,

    icon text,

    thread_no integer,

    highest_post integer,

    nothing text,

    time_of_last_post integer,

    primary key (board_name, thread_no)

  )

""")



# NOTE(review): the primary key above already covers (board_name, thread_no),
# so this extra unique index looks redundant -- confirm before removing.
db.execute("""

  create unique index pk_subject_txt on subject_txt ( board_name, thread_no )

""")



# Make the on-disk archive reachable from this connection under the schema
# name "archive" (used as archive.boards, archive.threads, archive.posts).
print 'attaching archive database'

db.execute(r"attach database 'world4ch_archive.db' as archive")



# Thread page as a whole.  Groups: (1) contents of the <h2> heading,
# (2) everything between the "thread" div and the "bottomnav" div,
# (3) the post number shown in the "postfieldleft" cell.
re_thread = re.compile(
    r'<h2>(.*?)</h2>.*?'
    r'<div class="thread">(.*?)<div class="bottomnav">.*?'
    r'<td class="postfieldleft"><span class="postnum">(.*?)</span></td>',
    re.DOTALL,
)

# A single post.  Groups: (1) post number, (2) poster name markup, (3) trip,
# (4) date, (5) id, (6) contents of the post's <blockquote>.
re_post = re.compile(
    r'<h3><span class="postnum"><a .*?>(.*?)</a>.*?'
    r'<span class="postername">(.*?)</span>.*?'
    r'<span class="postertrip">(.*?)</span>.*?'
    r'<span class="posterdate">(.*?)</span>.*?'
    r'<span class="id">(.*?)</span>.*?</h3>.*?'
    r'<blockquote>(.*?)</blockquote>',
    re.DOTALL,
)

# Poster name wrapped in a mailto link.  Groups: (1) address, (2) link text.
re_email = re.compile(r'<a href="mailto:(.*?)">(.*?)</a>')



def get_new_posts():



    threads = db.execute("""

      select A.board_name, A.thread_no, B.highest_post+1, B.post_count

      from subject_txt A 

      left join archive.threads B on 

        A.thread_no = B.thread_no and

        A.board_name = B.board_name

      where

       (B.thread_no is null or

        A.highest_post > B.highest_post) and

            A.highest_post > 0

    """).fetchall()



    for i in xrange(len(threads)):

        board_thread = threads[i]

        board_name = board_thread[0]

        thread_no = board_thread[1]

        url = "http://dis.4chan.org/read/"+board_name+"/"+str(thread_no)

        if board_thread[2]==None:

            from_post = 1

            current_post_count = 0

        else:

            from_post = board_thread[2]

            url += "/"+str(from_post)+"-"

            current_post_count = board_thread[3]

        print board_name, "("+str(i+1)+"/"+str(len(threads))+")", thread_no, "from post", from_post, 

        html_page = geturl(url)

        post_search = re_thread.findall(html_page)

        subject = post_search[0][0].strip().replace("&gt;",">").replace("&lt;",">").replace("&quot;",'"').replace("'","'").replace("&amp;","&")

        posts =  post_search[0][1].strip()

        highest_post = int(post_search[0][2])-1

        post_count = current_post_count

        post_no = 0

        time_of_last_post = 0

        for post in re_post.findall(posts):

            post_count += 1

            if post_no > 0:

                print '\b'*(len(str(post_no))+2),

            post_no = int(post[0])

            print post_no,

            trip = post[2]

            if trip == '':

                trip = None

            name_email = re_email.match(post[1])

            if name_email == None:

                name = post[1]

                email = None

            else:

                name = name_email.groups()[1]

                email = name_email.groups()[0]

            # NO! this breaks on img/1104652020/21! store as string instead.

            # date_time = int(totimestamp(datetime.datetime.strptime(post[3], "%Y-%m-%d %H:%M")))

            date_time = post[3].strip()

            time_of_last_post = date_time

            id = post[4].strip()

            if id == '':

                id = None

            html = post[5]

            ### HTML PARSING WILL GO HERE ###

            row = (board_name, thread_no, post_no, date_time, name, trip, email, id, html, '', '')

            cc = db.execute('replace into archive.posts values (?,?,?,?,?,?,?,?,?,?,?)', row)

        row = (board_name, thread_no, subject, post_count, highest_post, time_of_last_post)

        cc = db.execute('replace into archive.threads values (?,?,?,?,?,?)', row)

        cc = db.commit()

        print 'highest post is now',highest_post,'~'

        time.sleep(2)





# main board loop

if len(sys.argv) >= 2:

    comparator = "="

    if len(sys.argv) == 3:

        if sys.argv[2]=="-":

            comparator = ">="

    boards = db.execute("select board_name from archive.boards where board_name "+comparator+" ?", (sys.argv[1],)).fetchall()

else:

    boards = db.execute("select board_name from archive.boards").fetchall()



for board in boards:

    board_name = board[0]

    print 'getting subject list for /'+board_name+'/'

    subject_txt = geturl("http://dis.4chan.org/"+board_name+"/subject.txt")

    subject_txt = [tuple(line.rsplit("<>",6)) for line in subject_txt.split("\n") if line != ""]

    db.execute("delete from subject_txt")

    db.executemany("insert into subject_txt values ('"+board_name+"',?,?,?,?,?,?,?)", subject_txt)

    print 'retrieving new posts'

    get_new_posts()

    time.sleep(5)
/prog/ stats

1 Name: Anonymous 2008-05-01 11:58

10 Name: Anonymous 2008-05-01 13:21

Name: Anonymous 2008-05-01 11:58

Name: Anonymous 2008-05-01 13:21