Name: Anonymous 2010-06-05 11:56
Is there a web accessible /progscrape that'll allow me to scrape world4ch without downloading and updating the database at http://cairnarvon.rotahall.org/2008/11/30/progscrape/?
import os
import sqlite3
import urllib2
import datetime
import re
import time
import sys
# TODO: better error handling
# parse html to determine both plain text and original bbcode

# Startup banner (one blank line after it, as before).
sys.stdout.write('*** world4ch archiver ***\n\n')
def geturl(url):
ok = True
retrycount = 0
retrymax = 5
data = ''
while True:
try:
ok = True
data = geturl2(url)
except:
ok = False
retrycount += 1
print ' *',
if retrycount >= retrymax:
raise
if ok:
return data
def geturl2(url):
    """Single fetch attempt: return the body of *url* as a string.

    Sends a fixed robot User-Agent; any network error propagates to the
    caller (geturl handles retries).
    """
    headers = {"User-Agent": "Mozilla/4.0 (compatible; world4ch archiving robot; anonymous)"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    return response.read()
def totimestamp(dt):
    """Return the Unix timestamp (float seconds) for naive datetime *dt*.

    Interprets *dt* in the local timezone via time.mktime; sub-second
    precision comes from the microsecond field.
    """
    whole_seconds = time.mktime(dt.timetuple())
    fraction = dt.microsecond / 1e6
    return whole_seconds + fraction
# First run only: create the on-disk archive database and seed the board
# list.  Existing databases skip this branch entirely.
if not os.path.exists("world4ch_archive.db"):
    print('creating archive database')
    db = sqlite3.connect("world4ch_archive.db")
    db.execute("""
    create table boards (
    board_name text not null primary key
    )
    """)
    db.execute("""
    create table threads (
    board_name text not null,
    thread_no integer not null,
    subject text not null,
    post_count integer not null,
    highest_post integer not null,
    time_of_last_post integer not null,
    primary key ( board_name, thread_no )
    )
    """)
    # FIX: board_name was declared as 'test' (typo for 'text'), which
    # gave the column NUMERIC affinity in SQLite instead of TEXT.  Only
    # freshly created databases are affected.
    db.execute("""
    create table posts (
    board_name text not null,
    thread_no integer not null,
    post_no integer not null,
    date_time text not null,
    name text not null,
    trip text,
    email text,
    id text,
    html text not null,
    bbcode text not null,
    textonly text not null,
    primary key ( board_name, thread_no, post_no )
    )
    """)
    boards = "anime,book,carcom,comp,food,games,img,lang,lounge,music,newnew,newpol,prog,sci,sjis,sports,tech,tele,vip"
    boards = [(board_name,) for board_name in boards.split(',')]
    db.executemany("insert into boards values (?)", boards)
    db.commit()
print 'creating in-memory database'
db = sqlite3.connect(":memory:")
db.text_factory = str
db.execute("""
create table subject_txt (
board_name text,
subject text,
name text,
icon text,
thread_no integer,
highest_post integer,
nothing text,
time_of_last_post integer
)
""")
# removed ,
# primary key (board_name, thread_no)
# due to broken /vip/ subject.txt
print 'attaching archive database'
db.execute(r"attach database 'world4ch_archive.db' as archive")
re_thread = re.compile(r'<h2>(.*?)</h2>.*?<div class="thread">(.*?)<div class="bottomnav">.*?<td class="postfieldleft"><span class="postnum">(.*?)</span></td>',re.DOTALL)
re_post = re.compile(r'<h3><span class="postnum"><a .*?>(.*?)</a>.*?<span class="postername">(.*?)</span>.*?<span class="postertrip">(.*?)</span>.*?<span class="posterdate">(.*?)</span>.*?<span class="id">(.*?)</span>.*?</h3>.*?<blockquote>(.*?)</blockquote>',re.DOTALL)
re_email = re.compile(r'<a href="mailto:(.*?)">(.*?)</a>')
def get_new_posts():
threads = db.execute("""
select A.board_name, A.thread_no, B.highest_post+1, B.post_count
from subject_txt A
left join archive.threads B on
A.thread_no = B.thread_no and
A.board_name = B.board_name
where
(B.thread_no is null or
A.highest_post > B.highest_post) and
A.highest_post > 0
""").fetchall()
for i in xrange(len(threads)):
board_thread = threads[i]
board_name = board_thread[0]
thread_no = board_thread[1]
url = "http://dis.4chan.org/read/"+board_name+"/"+str(thread_no)
if board_thread[2]==None:
from_post = 1
current_post_count = 0
else:
from_post = board_thread[2]
url += "/"+str(from_post)+"-"
current_post_count = board_thread[3]
print board_name, "("+str(i+1)+"/"+str(len(threads))+")", thread_no, "from post", from_post,
html_page = geturl(url)
post_search = re_thread.findall(html_page)
subject = post_search[0][0].strip().replace(">",">").replace("<",">").replace(""",'"').replace("'","'").replace("&","&")
posts = post_search[0][1].strip()
highest_post = int(post_search[0][2])-1
post_count = current_post_count
post_no = 0
time_of_last_post = 0
for post in re_post.findall(posts):
post_count += 1
if post_no > 0:
print '\b'*(len(str(post_no))+2),
post_no = int(post[0])
print post_no,
trip = post[2]
if trip == '':
trip = None
name_email = re_email.match(post[1])
if name_email == None:
name = post[1]
email = None
else:
name = name_email.groups()[1]
email = name_email.groups()[0]
# this breaks on img/1104652020/21! store as string instead.
# date_time = int(totimestamp(datetime.datetime.strptime(post[3], "%Y-%m-%d %H:%M")))
date_time = post[3].strip()
time_of_last_post = date_time
id = post[4].strip()
if id == '':
id = None
html = post[5]
### HTML PARSING WILL GO HERE ###
row = (board_name, thread_no, post_no, date_time, name, trip, email, id, html, '', '')
cc = db.execute('replace into archive.posts values (?,?,?,?,?,?,?,?,?,?,?)', row)
row = (board_name, thread_no, subject, post_count, highest_post, time_of_last_post)
cc = db.execute('replace into archive.threads values (?,?,?,?,?,?)', row)
cc = db.commit()
print 'highest post is now',highest_post,'~'
time.sleep(2)
# main board loop: pick which boards to scan (all boards, a single
# board, or "board -" meaning that board onward), then snapshot each
# board's subject.txt into the scratch table and pull new posts.
if len(sys.argv) >= 2:
    comparator = "="
    if len(sys.argv) == 3:
        if sys.argv[2] == "-":
            comparator = ">="
    # comparator can only ever be "=" or ">=", so this concatenation is
    # safe; the board name itself is bound as a parameter.
    boards = db.execute("select board_name from archive.boards where board_name " + comparator + " ?", (sys.argv[1],)).fetchall()
else:
    boards = db.execute("select board_name from archive.boards").fetchall()
for board in boards:
    board_name = board[0]
    print('getting subject list for /' + board_name + '/')
    subject_txt = geturl("http://dis.4chan.org/" + board_name + "/subject.txt")
    subject_txt = [tuple(line.rsplit("<>", 6)) for line in subject_txt.split("\n") if line != ""]
    # because of /sci/ LordRiordan<><>1166499052<>*Casts ressurect*<> <>70.41.253.15
    # <>Jesus Christ<><>1137752954<>2<>LordRiordan<>1166503893 :
    # a subject containing "<>" yields the wrong field count, so drop
    # anything that is not exactly 7 fields.
    subject_txt = [t for t in subject_txt if len(t) == 7]
    db.execute("delete from subject_txt")
    # FIX: bind the board name as a parameter instead of splicing it
    # into the SQL string (was: values ('"+board_name+"',?,...)).
    rows = [(board_name,) + t for t in subject_txt]
    db.executemany("insert into subject_txt values (?,?,?,?,?,?,?,?)", rows)
    print('retrieving new posts')
    get_new_posts()
    time.sleep(5)
sqlite3 prog.db 'select count(*) from (select thread from posts where thread in (select distinct thread from posts where body glob "*Xarn*") group by thread);'
like (which, unlike glob, is case-insensitive) is still missing a lot of them.
File "x.py", line 102, in <module>
page = urllib.urlopen(read_url + thread[0] + '/1-').read()
File "/usr/lib/python2.5/urllib.py", line 82, in urlopen
return opener.open(url)
File "/usr/lib/python2.5/urllib.py", line 190, in open
return getattr(self, name)(url)
File "/usr/lib/python2.5/urllib.py", line 328, in open_http
errcode, errmsg, headers = h.getreply()
File "/usr/lib/python2.5/httplib.py", line 1199, in getreply
response = self._conn.getresponse()
File "/usr/lib/python2.5/httplib.py", line 928, in getresponse
response.begin()
File "/usr/lib/python2.5/httplib.py", line 385, in begin
version, status, reason = self._read_status()
File "/usr/lib/python2.5/httplib.py", line 343, in _read_status
line = self.fp.readline()
File "/usr/lib/python2.5/socket.py", line 372, in readline
data = recv(1)
IOError: [Errno socket error] (104, 'Connection reset by peer')
Fetching subject.txt... Got it.
subjects.txt fail: Anonymous<><>1220909598<><a href="read/prog/1220718054/17">>>17</a><br/>Don't ask me, aniki.<> <>131.116.254.199<><><>1220718054<>6<><>1231185336
8896 threads to update.
Updating thread 1275783543...
Updating thread 1275783433...
Updating thread 1202944294...
Traceback (most recent call last):
File "progscrape.py", line 186, in <module>
db.execute(u'INSERT INTO posts (thread, id, author, email, trip, time, body) VALUES (?, ?, ?, ?, ?, ?, ?)', b)
sqlite3.IntegrityError: columns thread, id are not unique
scrape from scratch a while ago and it's been running smoothly.
cron this on their server and write a web 2.0 database searcher