Name: Anonymous 2010-06-05 11:56
Is there a web accessible /progscrape that'll allow me to scrape world4ch without downloading and updating the database at http://cairnarvon.rotahall.org/2008/11/30/progscrape/?
import os
import sqlite3
import urllib2
import datetime
import re
import time
import sys
# TODO: better error handling
# parse html to determine both plain text and original bbcode

# Startup banner (one blank line after it, as before).
sys.stdout.write('*** world4ch archiver ***\n\n')
def geturl(url):
ok = True
retrycount = 0
retrymax = 5
data = ''
while True:
try:
ok = True
data = geturl2(url)
except:
ok = False
retrycount += 1
print ' *',
if retrycount >= retrymax:
raise
if ok:
return data
def geturl2(url):
    """Single fetch attempt: return the body of *url* as a string.

    Sends a fixed robot User-Agent; any network error propagates to the
    caller (geturl handles retries).
    """
    headers = {"User-Agent": "Mozilla/4.0 (compatible; world4ch archiving robot; anonymous)"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    return response.read()
def totimestamp(dt):
    """Return the Unix timestamp (float seconds) for naive datetime *dt*.

    Interprets *dt* in the local timezone via time.mktime; sub-second
    precision comes from the microsecond field.
    """
    whole_seconds = time.mktime(dt.timetuple())
    fraction = dt.microsecond / 1e6
    return whole_seconds + fraction
# First run only: create the on-disk archive database and seed the board
# list.  Existing databases skip this branch entirely.
if not os.path.exists("world4ch_archive.db"):
    print('creating archive database')
    db = sqlite3.connect("world4ch_archive.db")
    db.execute("""
    create table boards (
    board_name text not null primary key
    )
    """)
    db.execute("""
    create table threads (
    board_name text not null,
    thread_no integer not null,
    subject text not null,
    post_count integer not null,
    highest_post integer not null,
    time_of_last_post integer not null,
    primary key ( board_name, thread_no )
    )
    """)
    # FIX: board_name was declared as 'test' (typo for 'text'), which
    # gave the column NUMERIC affinity in SQLite instead of TEXT.  Only
    # freshly created databases are affected.
    db.execute("""
    create table posts (
    board_name text not null,
    thread_no integer not null,
    post_no integer not null,
    date_time text not null,
    name text not null,
    trip text,
    email text,
    id text,
    html text not null,
    bbcode text not null,
    textonly text not null,
    primary key ( board_name, thread_no, post_no )
    )
    """)
    boards = "anime,book,carcom,comp,food,games,img,lang,lounge,music,newnew,newpol,prog,sci,sjis,sports,tech,tele,vip"
    boards = [(board_name,) for board_name in boards.split(',')]
    db.executemany("insert into boards values (?)", boards)
    db.commit()
print 'creating in-memory database'
db = sqlite3.connect(":memory:")
db.text_factory = str
db.execute("""
create table subject_txt (
board_name text,
subject text,
name text,
icon text,
thread_no integer,
highest_post integer,
nothing text,
time_of_last_post integer
)
""")
# removed ,
# primary key (board_name, thread_no)
# due to broken /vip/ subject.txt
print 'attaching archive database'
db.execute(r"attach database 'world4ch_archive.db' as archive")
re_thread = re.compile(r'<h2>(.*?)</h2>.*?<div class="thread">(.*?)<div class="bottomnav">.*?<td class="postfieldleft"><span class="postnum">(.*?)</span></td>',re.DOTALL)
re_post = re.compile(r'<h3><span class="postnum"><a .*?>(.*?)</a>.*?<span class="postername">(.*?)</span>.*?<span class="postertrip">(.*?)</span>.*?<span class="posterdate">(.*?)</span>.*?<span class="id">(.*?)</span>.*?</h3>.*?<blockquote>(.*?)</blockquote>',re.DOTALL)
re_email = re.compile(r'<a href="mailto:(.*?)">(.*?)</a>')
def get_new_posts():
threads = db.execute("""
select A.board_name, A.thread_no, B.highest_post+1, B.post_count
from subject_txt A
left join archive.threads B on
A.thread_no = B.thread_no and
A.board_name = B.board_name
where
(B.thread_no is null or
A.highest_post > B.highest_post) and
A.highest_post > 0
""").fetchall()
for i in xrange(len(threads)):
board_thread = threads[i]
board_name = board_thread[0]
thread_no = board_thread[1]
url = "http://dis.4chan.org/read/"+board_name+"/"+str(thread_no)
if board_thread[2]==None:
from_post = 1
current_post_count = 0
else:
from_post = board_thread[2]
url += "/"+str(from_post)+"-"
current_post_count = board_thread[3]
print board_name, "("+str(i+1)+"/"+str(len(threads))+")", thread_no, "from post", from_post,
html_page = geturl(url)
post_search = re_thread.findall(html_page)
subject = post_search[0][0].strip().replace(">",">").replace("<",">").replace(""",'"').replace("'","'").replace("&","&")
posts = post_search[0][1].strip()
highest_post = int(post_search[0][2])-1
post_count = current_post_count
post_no = 0
time_of_last_post = 0
for post in re_post.findall(posts):
post_count += 1
if post_no > 0:
print '\b'*(len(str(post_no))+2),
post_no = int(post[0])
print post_no,
trip = post[2]
if trip == '':
trip = None
name_email = re_email.match(post[1])
if name_email == None:
name = post[1]
email = None
else:
name = name_email.groups()[1]
email = name_email.groups()[0]
# this breaks on img/1104652020/21! store as string instead.
# date_time = int(totimestamp(datetime.datetime.strptime(post[3], "%Y-%m-%d %H:%M")))
date_time = post[3].strip()
time_of_last_post = date_time
id = post[4].strip()
if id == '':
id = None
html = post[5]
### HTML PARSING WILL GO HERE ###
row = (board_name, thread_no, post_no, date_time, name, trip, email, id, html, '', '')
cc = db.execute('replace into archive.posts values (?,?,?,?,?,?,?,?,?,?,?)', row)
row = (board_name, thread_no, subject, post_count, highest_post, time_of_last_post)
cc = db.execute('replace into archive.threads values (?,?,?,?,?,?)', row)
cc = db.commit()
print 'highest post is now',highest_post,'~'
time.sleep(2)
# main board loop: pick which boards to scan (all boards, a single
# board, or "board -" meaning that board onward), then snapshot each
# board's subject.txt into the scratch table and pull new posts.
if len(sys.argv) >= 2:
    comparator = "="
    if len(sys.argv) == 3:
        if sys.argv[2] == "-":
            comparator = ">="
    # comparator can only ever be "=" or ">=", so this concatenation is
    # safe; the board name itself is bound as a parameter.
    boards = db.execute("select board_name from archive.boards where board_name " + comparator + " ?", (sys.argv[1],)).fetchall()
else:
    boards = db.execute("select board_name from archive.boards").fetchall()
for board in boards:
    board_name = board[0]
    print('getting subject list for /' + board_name + '/')
    subject_txt = geturl("http://dis.4chan.org/" + board_name + "/subject.txt")
    subject_txt = [tuple(line.rsplit("<>", 6)) for line in subject_txt.split("\n") if line != ""]
    # because of /sci/ LordRiordan<><>1166499052<>*Casts ressurect*<> <>70.41.253.15
    # <>Jesus Christ<><>1137752954<>2<>LordRiordan<>1166503893 :
    # a subject containing "<>" yields the wrong field count, so drop
    # anything that is not exactly 7 fields.
    subject_txt = [t for t in subject_txt if len(t) == 7]
    db.execute("delete from subject_txt")
    # FIX: bind the board name as a parameter instead of splicing it
    # into the SQL string (was: values ('"+board_name+"',?,...)).
    rows = [(board_name,) + t for t in subject_txt]
    db.executemany("insert into subject_txt values (?,?,?,?,?,?,?,?)", rows)
    print('retrieving new posts')
    get_new_posts()
    time.sleep(5)
sqlite3 prog.db 'select count(*) from (select thread from posts where thread in (select distinct thread from posts where body glob "*Xarn*") group by thread);'
like (which, unlike glob, is case-insensitive) is still missing a lot of them.
File "x.py", line 102, in <module>
page = urllib.urlopen(read_url + thread[0] + '/1-').read()
File "/usr/lib/python2.5/urllib.py", line 82, in urlopen
return opener.open(url)
File "/usr/lib/python2.5/urllib.py", line 190, in open
return getattr(self, name)(url)
File "/usr/lib/python2.5/urllib.py", line 328, in open_http
errcode, errmsg, headers = h.getreply()
File "/usr/lib/python2.5/httplib.py", line 1199, in getreply
response = self._conn.getresponse()
File "/usr/lib/python2.5/httplib.py", line 928, in getresponse
response.begin()
File "/usr/lib/python2.5/httplib.py", line 385, in begin
version, status, reason = self._read_status()
File "/usr/lib/python2.5/httplib.py", line 343, in _read_status
line = self.fp.readline()
File "/usr/lib/python2.5/socket.py", line 372, in readline
data = recv(1)
IOError: [Errno socket error] (104, 'Connection reset by peer')
Fetching subject.txt... Got it.
subjects.txt fail: Anonymous<><>1220909598<><a href="read/prog/1220718054/17">>>17</a><br/>Don't ask me, aniki.<> <>131.116.254.199<><><>1220718054<>6<><>1231185336
8896 threads to update.
Updating thread 1275783543...
Updating thread 1275783433...
Updating thread 1202944294...
Traceback (most recent call last):
File "progscrape.py", line 186, in <module>
db.execute(u'INSERT INTO posts (thread, id, author, email, trip, time, body) VALUES (?, ?, ?, ?, ?, ?, ?)', b)
sqlite3.IntegrityError: columns thread, id are not unique
scrape from scratch a while ago and it's been running smoothly.
cron this on their server and write a web 2.0 database searcher