[python][テキストマイニング] 2chからテキストデータを収集する。

取得したいカテゴリ(板)を選び、その板の現行スレッド一覧ページ(subback.html)のURLを url という変数に代入して使う。

#!/usr/bin/python
# coding: UTF-8
import sys,lxml.html,re,urllib2,time,os,re
import feedparser
# Python 2 only: force the interpreter-wide default string encoding to
# Shift_JIS. NOTE(review): the reload/setdefaultencoding trick is a hack that
# alters every implicit str<->unicode conversion in the process; it is kept
# here because code outside this section may rely on it.
reload(sys)
sys.setdefaultencoding('shift_jis')

# Downloaded threads are stored in ./2ch; create it on first run so the
# chdir below does not abort with OSError.
if not os.path.isdir("2ch"):
    os.makedirs("2ch")
os.chdir("2ch")

# Thread index ("subback") page of the board to harvest.
url = "http://kamome.2ch.net/anime/subback.html"

# Plain GET. urlopen's second positional argument is the request body, so
# passing "shift_jis" there turned the request into a POST rather than
# selecting an encoding.
fp = urllib2.urlopen(url)
try:
    html = fp.read()
finally:
    # Release the connection even if the read fails part-way.
    fp.close()

# Each thread is linked as href="<digits>/l50"; capture only the digits
# (the thread id) instead of slicing the prefix/suffix off afterwards.
links = re.findall(r'href="([0-9]+?)/l50', html)

for w in links:
    split = url.split("/")
    link = "/".join(split[:3]) + "/test/read.cgi/" + split[3] + "/" + str(w)
    filename = "_".join(link.split("/")[2:])
    ls = os.listdir(".")
    if filename in ls:
        print link,"は既に取得しています"
        continue
    else:
        print link
    try:
        fp = urllib2.urlopen(link)
        html = fp.read()
        fp.close()
    except:
        continue
    
    if len(w) < 150:
        f = open(filename,'w')
        f.write(html)
        f.close()
    else:
        filename = filename[:50] + filename[-100:]
        f = open(filename,'w')
        f.write(html)
        f.close()