import sys,lxml.html,re,urllib2,time,os,re
import feedparser
# Python 2 script: save the pages linked from the Google News Atom feed
# into the local "googlenews" directory, one file per linked URL.

# Make UTF-8 the default codec for implicit str/unicode conversions
# (Python 2 only; reload() restores the hidden sys.setdefaultencoding).
reload(sys)
sys.setdefaultencoding('utf-8')

# Files are written to, and looked up in, this directory. os.chdir raises
# OSError if it does not exist.
os.chdir("googlenews")

url = "http://news.google.com/news?ned=us&ie=UTF-8&oe=UTF-8&q=&output=atom&num=30&hl=ja"

# Names of this length or longer are shortened to their first 50 plus
# last 100 characters.
MAX_FILENAME_LEN = 150


def _local_filename(link):
    """Return the file name under which *link* is stored locally.

    Everything before the host part is dropped and the remaining
    "/"-separated pieces are joined with "_".  Names of
    MAX_FILENAME_LEN characters or more keep only their first 50 and
    last 100 characters.
    """
    name = "_".join(link.split("/")[2:])
    if len(name) >= MAX_FILENAME_LEN:
        name = name[:50] + name[-100:]
    return name


def _download(link):
    """Return the raw body of *link*, or None if it could not be fetched.

    Failures are reported on stderr and otherwise ignored so that one bad
    link does not stop the run.  Only Exception is caught, so Ctrl-C and
    SystemExit still terminate the script.
    """
    try:
        fp = urllib2.urlopen(link)
        try:
            return fp.read()
        finally:
            # Close the connection even when read() fails.
            fp.close()
    except Exception as err:
        sys.stderr.write("skipping %s: %s\n" % (link, err))
        return None


llog = feedparser.parse(url)
for post in llog.entries:
    content = post.content[0].value
    # Pull every `url=http..."` occurrence out of the entry body and strip
    # the leading 'url=' and the trailing double quote.
    links = [match[4:-1] for match in re.findall(r'url=http.+?"', content)]
    for link in links:
        # Compute the final (possibly shortened) name first, so the
        # "already fetched" check looks for the name actually written.
        filename = _local_filename(link)
        if filename in ("", ".", ".."):
            # Not usable as a file name; open() would fail on it.
            continue
        if os.path.exists(filename):
            # Message means "<filename> has already been fetched".
            print("%s %s" % (filename, "は既に取得しています"))
            continue
        print(link)
        html = _download(link)
        if html is None:
            continue
        # Binary mode: store the body exactly as it was received.
        with open(filename, 'wb') as f:
            f.write(html)