[python] htmlファイルから、日本語のテキストを取り出す
#!/usr/bin/python
# coding: UTF-8
"""Extract the Japanese words from text files with MeCab.

Every path given on the command line is tokenized with MeCab, and each token
that looks Japanese is written, followed by a single space, to
``<path>_output.txt``.
"""
import sys

# Python 3's open() decodes text and would otherwise fall back to the locale's
# encoding; Python 2's open() hands back raw bytes and accepts no ``encoding``
# argument.  UTF-8 matches what japanese() assumes for byte strings.
_OPEN_KWARGS = {"encoding": "utf-8"} if sys.version_info[0] >= 3 else {}


def japanese(tl):
    """Tell whether a token looks like Japanese text.

    The check is an encoding heuristic: text that fits entirely in Latin-1 is
    treated as non-Japanese, and anything else counts as Japanese only if the
    whole string can be encoded as Shift-JIS.

    Args:
        tl: The token to classify, either text or a UTF-8 byte string.

    Returns:
        1 if the token is classified as Japanese, otherwise 0 (this includes
        the empty string and byte strings that are not valid UTF-8).
    """
    if isinstance(tl, bytes):
        # Decode explicitly instead of relying on a process-wide
        # sys.setdefaultencoding('utf-8') override.
        try:
            tl = tl.decode("utf-8")
        except UnicodeDecodeError:
            return 0

    try:
        tl.encode("ISO8859")
    except UnicodeEncodeError:
        pass  # not plain Latin-1, so it may be Japanese
    else:
        return 0

    try:
        tl.encode("shift-jis")
    except UnicodeEncodeError:
        return 0
    return 1


def main(argv=None):
    """Write the Japanese tokens of each input file to ``<path>_output.txt``.

    Args:
        argv: Argument vector whose first element is the program name.
            Defaults to ``sys.argv``.
    """
    # Imported here so that japanese() stays usable without MeCab installed.
    import MeCab

    if argv is None:
        argv = sys.argv

    # One tagger serves every file; constructing it is loop-invariant.
    tagger = MeCab.Tagger('-Ochasen')

    for path in argv[1:]:
        print(path)
        try:
            with open(path, **_OPEN_KWARGS) as source:
                text = source.read()
        except (IOError, OSError, UnicodeDecodeError) as err:
            # Best effort: one unreadable argument must not abort the rest.
            sys.stderr.write("skipping %s: %s\n" % (path, err))
            continue

        # NOTE(review): assumes the MeCab dictionary is UTF-8 too -- confirm.
        node = tagger.parseToNode(text)
        # The first node is skipped; presumably it is MeCab's
        # beginning-of-sentence marker -- TODO confirm.
        node = node.next

        with open(path + "_" + "output.txt", "w", **_OPEN_KWARGS) as output:
            while node:
                surface = node.surface
                if japanese(surface):
                    output.write(surface + " ")
                node = node.next


if __name__ == "__main__":
    main()