
[FAST]>TURBO/prog/SCRAPE<

Name: Anonymous 2010-06-21 0:59

I hate /prog/, I hate Python, I hate Gweedo, I hate Xarn, but most of all I hate myself. Worst 4 hours of my life.

Changelog:
- Raised database busy timeout
- Added several database PRAGMAs
- Added HTTP gzip support
- Added automatic retry support
- Request only new posts
- VACUUM the database when done
- Multithreaded thread fetching
- Flush database only every 5s

The database format is unchanged, so the output is fully interchangeable with the unmodified version.

Enjoy the turd, I sure as hell won't be touching this shit ever again.

Part 1/2 (append both parts)


#!/usr/bin/python -u

# ``Constants''

db_name  = 'prog.db'
prog_url = 'http://dis.4chan.org/prog/'
read_url = 'http://dis.4chan.org/read/prog/'
threads  = 12


# Make sure we're using a compatible version

from sys import version, exit

if version[0] != '2':
    print "Your version of Python is not supported at this time.",\
          "Please use Python 2.5 or 2.6."
    exit(1)


# Set up the database connection first

import sqlite3

db_conn = sqlite3.connect(db_name, timeout=300.0)  # generous busy timeout, in seconds
db = db_conn.cursor()

try:
    db.execute('PRAGMA journal_mode = PERSIST')
    db.execute('PRAGMA synchronous = normal')
    db.execute('PRAGMA page_size = 4096')
    db.execute("""
        CREATE TABLE IF NOT EXISTS threads (
            thread INTEGER PRIMARY KEY,
            title TEXT,
            last_post INTEGER
        )""")
    db.execute("""
        CREATE TABLE IF NOT EXISTS posts (
            thread INTEGER REFERENCES threads(thread),
            id INTEGER,
            author TEXT,
            email TEXT,
            trip TEXT,
            time INTEGER,
            body TEXT,
            PRIMARY KEY (thread, id)
        )""")
    db_conn.commit()
   
except sqlite3.DatabaseError:
    # Specified DB file exists, but isn't an SQLite DB file.
    print "Use a different filename for your DB."
    raise


# Try to fetch subject.txt

import urllib2, re, StringIO, gzip, time, sys

stat_cbytes = 0
stat_ubytes = 0
stat_threads = 0
stat_skips = 0

def request_compressed(url):
    global stat_cbytes
    global stat_ubytes
    wait = 0.5
    while True:
        try:
            request = urllib2.Request(url)
            request.add_header('Accept-encoding', 'gzip')
            opener = urllib2.build_opener()
            f = opener.open(request)
            c = f.read()
            stat_cbytes += len(c)
            serverstream = StringIO.StringIO(c)
            if f.headers.get('Content-Encoding') == 'gzip':
                decstream = gzip.GzipFile(fileobj=serverstream)
                u = decstream.read()
                stat_ubytes += len(u)
                return StringIO.StringIO(u)
            else:
                stat_ubytes += len(c)
                return serverstream
        except KeyboardInterrupt:
            print "KeyboardInterrupt, aborting..."
            raise
        except:
            print "Network fetch error:", sys.exc_info()[0]
            print "*** Trying again in", wait,
            print "seconds (press CTRL-C to abort)"
            time.sleep(wait)
            wait = wait * 2 if wait < 15 else 30


print "Fetching subject.txt...",

subjecttxt = request_compressed(prog_url + 'subject.txt')

print "Got it."


# Parse each line, check with DB, keep a list of all threads to be updated

regex = re.compile(u"""
    ^(.*?)      # Subject
    <>
    (.*?)       # Name
    <>
    (.*?)       # E-mail
    <>
    (-?\d*)     # Time posted/thread ID
    <>
    (\d*)       # Number of replies
    <>
    (.*?)       # ???
    <>
    (\d*)       # Time of last post
    \\n$""", re.VERBOSE)
to_update = []

for line in subjecttxt.readlines():
    parsed = regex.search(unicode(line,"iso-8859-1"))
    try:
        data = parsed.groups()
        result = db.execute('SELECT last_post FROM threads WHERE thread = ?', (unicode(data[3]), )).fetchone()
        if result is None:
            db.execute('INSERT INTO threads VALUES (?, ?, ?)', (unicode(data[3]), unicode(data[0]), 0))
            to_update.append((unicode(data[3]), unicode(data[6])))
        elif int(result[0]) < int(data[6]):
            to_update.append((unicode(data[3]), unicode(data[6])))

    except:
        # Failed to parse line; skip it
        print "subject.txt fail:", line

print "%d threads to update." % len(to_update)

Name: Anonymous 2010-06-21 1:00

Part 2/2



# Fetch new posts

import threading, time, datetime as dt, Queue

class fetch_thread(threading.Thread):
    def __init__ (self, thread, first, l, q):
        threading.Thread.__init__(self)
        self.thread = thread
        self.first = first
        self.l = l
        self.q = q
    def run(self):
        #print "Updating thread %s..." % self.thread[0]

        page = request_compressed(read_url + self.thread[0] + '/' + str(self.first) + '-').read()
       
        ids, authors, emails, trips, times, posts, starts, ends = [], [], [], [], [], [], [], []
        for a in enumerate(page):
            try:
                if a[1] == '<':
                    if page[a[0] : a[0] + 22] == '<span class="postnum">':
                        i = 48
                        while page[a[0] + i] != ',':
                            i += 1
                        ids.append(page[a[0] + 48 : a[0] + i])
                    if page[a[0] : a[0] + 25] == '<span class="postername">':
                        i = 25
                        while page[a[0] + i : a[0] + i + 7] != '</span>':
                            i += 1
                        auth = page[a[0] + 25 : a[0] + i]
                        if len(auth) > 1 and auth[:2] == '<a':
                            i = 16
                            while auth[i] != '"':
                                i += 1
                            emails.append(auth[16:i])
                            auth = auth[i + 2 : -4]
                        else:
                            emails.append('')
                        authors.append(auth)
                       
                    elif page[a[0] : a[0] + 25] == '<span class="postertrip">':
                        i = 25
                        while page[a[0] + i] != '<':
                            i += 1
                        trips.append(page[a[0] + 25 : a[0] + i])
               
                    elif page[a[0] : a[0] + 25] == '<span class="posterdate">':
                        i = 25
                        while page[a[0] + i] != '<':
                            i += 1
                   
                        d = page[a[0] + 25 : a[0] + i]
                        d = int(time.mktime(dt.datetime(int(d[:4]),
                                                        int(d[5:7]),
                                                        int(d[8:10]),
                                                        int(d[11:13]),
                                                        int(d[14:16])).timetuple()))
                        times.append(d)
               
                    elif page[a[0] : a[0] + 12] == '<blockquote>':
                        starts.append(a[0] + 18)
                   
                    elif page[a[0] : a[0] + 13] == '</blockquote>':
                        ends.append(a[0] - 7)
       
            except:
                print "! Broken post in thread %s" % self.thread[0]
                lens = map(len, [ids, authors, emails, trips, times, posts, starts, ends])
                minl = min(lens)
                if max(lens) != minl:
                    for lst in [ids, authors, emails, trips, times, posts, starts, ends]:
                        if len(lst) > minl:
                            del lst[-1]  # truncate in place; rebinding the loop variable would change nothing

        for i in xrange(len(starts)):
            posts.append(page[starts[i] : ends[i]])


        b1 = []
        for a in zip(ids, authors, emails, trips, times, posts):
            if a[4] > self.l:
                b = [unicode(self.thread[0])]
               
                for y in a:
                    if isinstance(y,str): b.append(unicode(y,"utf-8","replace"))
                    else: b.append(y)
                                  
                b1.append(b)
               
        self.q.put([self, b1, (unicode(self.thread[1]), unicode(self.thread[0]))])
       


last_commit = start = time.time()


def start_thread(t):
    global stat_skips
    result = db.execute('SELECT MAX(id) FROM posts WHERE thread = ?', (unicode(t[0]), )).fetchone()
    if result[0] is None:
        first = 1
    else:
        first = result[0] + 1
    stat_skips += first - 1

    l = db.execute('SELECT MAX(time) FROM posts WHERE thread = ?', (unicode(t[0]),)).fetchone()
    l = None if l == None else l[0]
       
    th = fetch_thread(t, first, l, q)
    th.start()


def commit(force):
    global last_commit
    if force or last_commit + 5 < time.time():
        db_conn.commit()
        last_commit = time.time()
        elapsed = last_commit - start
        print "%d%% - Done %d threads (%d bytes, %d%% compression) (%d th/s, %d b/s)" % \
        (stat_threads*100/len(to_update), stat_threads,         stat_cbytes, \
        100-stat_cbytes*100/stat_ubytes,  stat_threads/elapsed, stat_cbytes/elapsed)


def finish_thread(t):
    global stat_threads
    t[0].join()
    for b in t[1]:
        db.execute(u'INSERT INTO posts (thread, id, author, email, trip, time, body) VALUES (?, ?, ?, ?, ?, ?, ?)', b)
    db.execute(u'UPDATE threads SET last_post = ? WHERE thread = ?', t[2])
    stat_threads += 1
    commit(False)
   

q = Queue.Queue()
active_threads = 0

for thread in to_update:

    start_thread(thread)
    active_threads += 1
   
    if active_threads >= threads:
        finish_thread(q.get())
        active_threads -= 1


while active_threads > 0:
    finish_thread(q.get())
    active_threads -= 1

commit(True)
print "Avoided fetching %d already known posts" % stat_skips


db.execute('PRAGMA journal_mode = DELETE')
print "Optimizing database layout..."
db.execute('VACUUM')

print "All done!"

Name: Anonymous 2010-06-21 1:12

Enjoy your automated anti-DOS ban.

Name: Anonymous 2010-06-21 1:28

I don't think you know what VACUUM does, and your pragmas aren't nearly as clever as you think they are.
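
For the record, all VACUUM does is rewrite the database file compactly, reclaiming pages freed by DELETE; it doesn't speed up queries by itself. A minimal sketch (hypothetical table, nothing to do with the script's schema):

```python
import os
import sqlite3
import tempfile

# Demo: deleting rows frees pages *inside* the file, but the file itself
# doesn't shrink until VACUUM rebuilds it without the dead pages.
path = os.path.join(tempfile.mkdtemp(), 'demo.db')
conn = sqlite3.connect(path)
conn.execute('CREATE TABLE posts (id INTEGER, body TEXT)')
conn.executemany('INSERT INTO posts VALUES (?, ?)',
                 [(i, 'x' * 1024) for i in range(2000)])
conn.commit()

conn.execute('DELETE FROM posts')    # frees pages within the file...
conn.commit()
size_before = os.path.getsize(path)  # ...but the file stays big

conn.execute('VACUUM')               # rewrite the file compactly
size_after = os.path.getsize(path)
conn.close()

print(size_before, size_after)
```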

Name: Anonymous 2010-06-21 1:44

good job, you made /prog/scrape even worse!

Name: Anonymous 2010-06-21 2:08

Alright, you guys, ease up. It's a good effort, and some of the ideas are good. I can see gzip support, at least, making it into Xarn's progscrape.
The proper way to do this would have been to fork the Github repository, though.

Name: Anonymous 2010-06-21 4:58

FORK MY ANUS

Name: Anonymous 2010-06-21 7:56

>>7
Can I spork your dick as well?

Name: Anonymous 2010-06-21 11:54

>>3
There's no anti-DOS ban. 4chan sometimes starts ignoring requests (like when I try to fetch an entire board), but I don't think that world4ch does even that.
>>1
You're not using the JSON interface. Your program is slower than mine. Yes, I'm jaded.

Name: Anonymous 2010-06-21 14:06

>>9
> You're not using the JSON interface.
You know there are cons to using the JSON interface, right?

Name: Anonymous 2010-06-21 14:32

>>10
Yes, but every cons has a silver cudder.

Name: Anonymous 2010-06-21 14:48

>>10
=> select count(*) from prog_posts;
 count
--------
 474917
(1 row)

=> select count(*) from prog_posts where trip != '' or name like '%!%';
 count
-------
 11148
(1 row)

Yes, in 2.35% of cases you'd need to fetch the HTML version of particular posts to verify the tripcode.
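
(A quick sanity check on that figure, just the arithmetic on the two counts above:)

```python
# Percentage of posts with a tripcode or a '!' in the name field,
# from the two SELECT counts above.
tripcode_posts = 11148
total_posts = 474917
pct = round(tripcode_posts * 100.0 / total_posts, 2)
print(pct)  # 2.35
```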

Name: Anonymous 2010-06-21 14:49

>>11
I always imagined cudders as pastel navy, myself.

Name: Anonymous 2010-06-21 16:20

>>13
I always imagined cudders as pasty navel, myself.

Name: Anonymous 2010-06-21 19:44

>>11
Fuck, I was trying to setup a different joke but you ruined it.

Oh well, it was slightly better but twice as bad if you know what I mean.

Name: Anonymous 2010-06-23 1:52

Name: Anonymous 2010-06-23 2:17

>>16
Hello, Xarn!

Name: Anonymous 2010-06-23 2:24

>>17
If >>16 was Xarn he probably would have posted seven hours ago, when he updated. Clearly >>16 must be ijp or Krade.

Name: Anonymous 2010-06-23 16:06

By the way, Xarn made parsing slightly less braindead in progscrape: http://github.com/Cairnarvon/progscrape/commit/75d58558385c76e73684643291ef02ba71dd4877

Name: Anonymous 2010-06-23 16:20

On the topic of the JSON interface:

23:21:11 <~Cairnarvon> I guess this would be easier with the JSON interface, but then I can't pretend it's a general Shiichan scraper anymore.
23:21:26 < Storlek> Because there are so many other boards running Shiichan.
23:21:44 < Storlek> especially ones that are worthwhile in any way
23:22:25 <~Cairnarvon> That's not the point.
23:22:32 < Storlek> What is?
23:22:48 <~Cairnarvon> The concept of other software conforming to Shiichan is funny to me.
23:23:54 <~Cairnarvon> Python 2.5 doesn't have a JSON library, so if I used it I'd get even more people whining about not being able to figure out how to use it, too.
23:24:07 < Storlek> simplejson
23:24:18 < Storlek> It's widely known and easy to install on any distro.
23:24:30 < Storlek> And it's the one that comes in 2.6+ by default.
23:24:50 <~Cairnarvon> Windows doesn't have a package manager.
23:25:07 <~Cairnarvon> I never get any emails from Linux users.


I don't know if this means it's a definite no.
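
For what it's worth, the standard workaround for the 2.5-vs-2.6 split is an import fallback; a sketch, assuming simplejson is installed wherever the stdlib module is missing (the JSON payload here is made up):

```python
# Use the stdlib json (Python 2.6+) and fall back to the API-compatible
# third-party simplejson on older interpreters.
try:
    import json
except ImportError:
    import simplejson as json

thread = json.loads('{"title": "[FAST]>TURBO/prog/SCRAPE<", "replies": 24}')
print(thread["replies"])
```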

Name: Anonymous 2010-06-23 17:24

>>20
ice to meet you Storlek

Name: Storlek !desu/4y/Xg 2010-06-23 17:30

>>21
Incorrect.

Name: Anonymous 2010-06-23 19:55

>>19
Now progscrape isn't the scraper Shiichan deserves anymore. :(

Name: Anonymous 2010-06-24 1:48

omg you've made progress on the project i started!
I AM PROUD
