[python][テキストマイニング] googleニュースをRSSから取得する。

#!/usr/bin/python
# coding: UTF-8
import sys,lxml.html,re,urllib2,time,os,re
import feedparser
# Python 2-only idiom: sys.setdefaultencoding is removed from the module at
# interpreter startup, so sys is reloaded to get the function back, and the
# process-wide default codec used for implicit str<->unicode conversion is
# then switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

os.chdir("googlenews")
url = "http://news.google.com/news?ned=us&ie=UTF-8&oe=UTF-8&q=&output=atom&num=30&hl=ja"

llog = feedparser.parse(url)

for post in llog.entries:
    content = post.content[0].value
    links = re.findall(r'url=http.+?"',content)
    links = [w[4:-1] for w in links]

    for link in links:
        filename = "_".join(link.split("/")[2:])
        ls = os.listdir(".")
        if filename in ls:
            print filename,"は既に取得しています"
            continue
        else:
            print link
        try:
            fp = urllib2.urlopen(link)
            html = fp.read()
            fp.close()
        except:
            continue

        if len(w) < 150:
            f = open(filename,'w')
            f.write(html)
            f.close()
        else:
            filename = filename[:50] + filename[-100:]
            f = open(filename,'w')
            f.write(html)
            f.close()