# One-time bootstrap: when the archive file does not exist yet, create it
# together with its three tables and the fixed list of boards.
if not os.path.exists("world4ch_archive.db"):
    # A single parenthesised argument prints the same text whether `print`
    # is a statement or a function.
    print('creating archive database')
    db = sqlite3.connect("world4ch_archive.db")

    # One row per board that the archiver walks.
    db.execute("""
        create table boards (
            board_name text not null primary key
        )
    """)

    # One row per thread, keyed by board and thread number.
    db.execute("""
        create table threads (
            board_name text not null,
            thread_no integer not null,
            subject text not null,
            post_count integer not null,
            highest_post integer not null,
            time_of_last_post integer not null,
            primary key ( board_name, thread_no )
        )
    """)

    # One row per post.  board_name was declared with the type "test", a
    # typo that gives the column NUMERIC affinity in SQLite; it is declared
    # as text here, like the same column in the other two tables.
    db.execute("""
        create table posts (
            board_name text not null,
            thread_no integer not null,
            post_no integer not null,
            date_time text not null,
            name text not null,
            trip text,
            email text,
            id text,
            html text not null,
            bbcode text not null,
            textonly text not null,
            primary key ( board_name, thread_no, post_no )
        )
    """)

    boards = "anime,book,carcom,comp,food,games,img,lang,lounge,music,newnew,newpol,prog,sci,sjis,sports,tech,tele,vip"
    # executemany() wants one parameter tuple per inserted row.
    boards = [(board_name,) for board_name in boards.split(',')]
    db.executemany("insert into boards values (?)", boards)
    db.commit()
# Entity names are stored without their leading "&" and glued together at
# run time, so that passing this source through an HTML renderer cannot
# silently decode the literals again.
# NOTE(review): which spelling the board uses for the apostrophe is not
# visible here, so the common variants are all accepted - confirm.
_SUBJECT_ENTITIES = (
    ("gt;", ">"),
    ("lt;", "<"),
    ("quot;", '"'),
    ("#39;", "'"),
    ("#039;", "'"),
    ("apos;", "'"),
    ("amp;", "&"),  # must stay last so escaped ampersands decode only once
)


def _decode_subject_entities(text):
    """Return text with the HTML entities listed above replaced by characters."""
    for entity_name, char in _SUBJECT_ENTITIES:
        text = text.replace("&" + entity_name, char)
    return text


def get_new_posts():
    """Download and archive the posts that the archive does not hold yet.

    Joins the ``subject_txt`` table against ``archive.threads`` and, for
    every thread that is missing from the archive or has a higher
    ``highest_post`` than the archived row, fetches the thread page, writes
    each parsed post to ``archive.posts`` and rewrites the thread's row in
    ``archive.threads``.  Every thread is committed on its own and followed
    by a two second pause.

    Relies on names defined outside this block: the ``db`` connection, the
    ``geturl`` helper and the compiled patterns ``re_thread``, ``re_post``
    and ``re_email``.
    """
    threads = db.execute("""
        select A.board_name, A.thread_no, B.highest_post+1, B.post_count
        from subject_txt A
        left join archive.threads B on
            A.thread_no = B.thread_no and
            A.board_name = B.board_name
        where
            (B.thread_no is null or
             A.highest_post > B.highest_post) and
            A.highest_post > 0
    """).fetchall()

    total = len(threads)
    for index, (board_name, thread_no, next_post, archived_count) in enumerate(threads):
        url = "http://dis.4chan.org/read/" + board_name + "/" + str(thread_no)
        if next_post is None:
            # The left join found no archived row: fetch the whole thread.
            from_post = 1
            current_post_count = 0
        else:
            # Only request the posts after the highest archived one.
            from_post = next_post
            url += "/" + str(from_post) + "-"
            current_post_count = archived_count

        # Progress is written piecewise on a single line; the newline comes
        # with the final "highest post" message.
        sys.stdout.write("%s (%d/%d) %s from post %s"
                         % (board_name, index + 1, total, thread_no, from_post))

        html_page = geturl(url)
        post_search = re_thread.findall(html_page)
        if not post_search:
            # Nothing is written for this thread, so it is selected again
            # on the next run instead of aborting the whole run here.
            sys.stdout.write(" page did not match the thread pattern, skipping\n")
            time.sleep(2)
            continue

        thread_match = post_search[0]
        subject = _decode_subject_entities(thread_match[0].strip())
        posts_html = thread_match[1].strip()
        # presumably the third group is the number the next post would get;
        # verify against re_thread
        highest_post = int(thread_match[2]) - 1

        post_count = current_post_count
        post_no = 0
        time_of_last_post = 0
        for post in re_post.findall(posts_html):
            post_count += 1
            if post_no > 0:
                # Step back over the previously shown " <post_no>" so the
                # counter is overwritten in place: one separating space,
                # then the digits plus two backspaces.
                sys.stdout.write(" " + "\b" * (len(str(post_no)) + 2))
            post_no = int(post[0])
            sys.stdout.write(" " + str(post_no))

            trip = post[2]
            if trip == '':
                trip = None

            name_email = re_email.match(post[1])
            if name_email is None:
                name = post[1]
                email = None
            else:
                email, name = name_email.groups()[0], name_email.groups()[1]

            # Kept as the raw string: converting it with
            # strptime("%Y-%m-%d %H:%M") broke on img/1104652020/21.
            date_time = post[3].strip()
            time_of_last_post = date_time

            poster_id = post[4].strip()
            if poster_id == '':
                poster_id = None

            html = post[5]
            ### HTML PARSING WILL GO HERE ###
            # The last two columns (bbcode, textonly) are left empty.
            row = (board_name, thread_no, post_no, date_time, name, trip,
                   email, poster_id, html, '', '')
            db.execute('replace into archive.posts values (?,?,?,?,?,?,?,?,?,?,?)', row)

        row = (board_name, thread_no, subject, post_count, highest_post, time_of_last_post)
        db.execute('replace into archive.threads values (?,?,?,?,?,?)', row)
        db.commit()
        sys.stdout.write(" highest post is now %s ~\n" % (highest_post,))
        time.sleep(2)
# main board loop
#
# Command line, as handled below:
#   (no arguments)  every board in archive.boards
#   BOARD           only that board
#   BOARD -         that board and every board whose name compares >= BOARD
# NOTE(review): this expects `db` to have the archive attached under the
# name `archive` and a `subject_txt` table with eight columns; that setup
# is not visible in this part of the file.
if len(sys.argv) >= 2:
    comparator = "="
    if len(sys.argv) == 3 and sys.argv[2] == "-":
        comparator = ">="
    # comparator is one of the two literals above; the board name itself is
    # always passed as a bound parameter.
    boards = db.execute(
        "select board_name from archive.boards where board_name " + comparator + " ?",
        (sys.argv[1],)).fetchall()
else:
    boards = db.execute("select board_name from archive.boards").fetchall()

for board in boards:
    board_name = board[0]
    print('getting subject list for /' + board_name + '/')
    subject_txt = geturl("http://dis.4chan.org/" + board_name + "/subject.txt")

    rows = []
    skipped = 0
    for line in subject_txt.split("\n"):
        if line == "":
            continue
        # Split from the right so that any "<>" in the leftmost field stays
        # part of that field.
        fields = line.rsplit("<>", 6)
        if len(fields) != 7:
            # A short line would make executemany() fail on the number of
            # bindings and abort the whole run, so it is counted and skipped.
            skipped += 1
            continue
        rows.append((board_name,) + tuple(fields))
    if skipped:
        print('skipped ' + str(skipped) + ' malformed subject.txt line(s)')

    # subject_txt only ever holds the listing of the board being processed.
    db.execute("delete from subject_txt")
    db.executemany("insert into subject_txt values (?,?,?,?,?,?,?,?)", rows)
    print('retrieving new posts')
    get_new_posts()
    time.sleep(5)
>>12
It's a bad idea to let random people without any knowledge use these tools. Right, they must download and install FIOC, but it's still easier. So, sage
OK, the archive is done... now what file sharing service should I use? I feel like using RapidShit to troll you all, but it's probably pretty painful for an uploader as well.
Now
-excerpt the 3% that's useful
-set up a better forum with ranking, searching, category organization, and filtering out idiot users as we find them
-load 3% onto better forum
-Call Adult Friend Finder
-Post link to new forum
>>25
The world4ch_archiver.py script is meant to be run with Python. The world4ch_archive.db.gz file should be gunzipped first; it can then be opened with SQLite (http://www.sqlite.org/download.html)
e.g.
$ sqlite3 world4ch_archive.db
SQLite version 3.5.8
Enter ".help" for instructions
sqlite> .tables
boards posts threads
sqlite> select max(date_time) from posts where board_name = 'prog';
2008-05-01 15:08
sqlite> select thread_no, post_no from posts where board_name = 'prog' and html like '%forced indentation%' order by date_time limit 1;
1138460471|12
sqlite> select board_name, count(*) from threads group by board_name order by count(*) desc;
lounge|23382
comp|7879
vip|7762
prog|5266
newpol|3662
games|3212
lang|2271
sci|2234
music|1995
anime|1977
tech|1835
newnew|1257
book|1179
img|1155
tele|875
food|640
sports|556
carcom|441
sjis|324
sqlite>