[python] htmlファイルから、日本語のテキストを取り出す

#!/usr/bin/python
# coding: UTF-8
import sys
import MeCab
# Python 2 only idiom: sys.setdefaultencoding is removed from the sys module
# at interpreter start-up, so the module is reloaded to get it back. Making
# UTF-8 the default lets implicit byte-string <-> unicode conversions (such as
# calling .encode() on a byte string) decode as UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')

def japanese(tl):
    """Heuristically decide whether the text *tl* looks Japanese.

    The text is treated as Japanese when it cannot be encoded as
    ISO-8859-1 (so it is not plain ASCII/Latin-1) but can be encoded as
    Shift-JIS.

    NOTE: any non-Latin-1 text that Shift-JIS can represent passes this
    check, not only kana/kanji.

    Args:
        tl: Text to classify. Must be a string; under Python 2 a byte
            string is implicitly decoded with the default encoding before
            being re-encoded.

    Returns:
        1 if the text looks Japanese, otherwise 0. The empty string and
        pure Latin-1 text yield 0.
    """
    # Only encoding/decoding failures are meaningful here; a bare ``except``
    # would also swallow KeyboardInterrupt, SystemExit and real bugs.
    try:
        tl.encode("ISO8859")
    except UnicodeError:
        pass
    else:
        # Fits in Latin-1, so it contains no Japanese characters.
        return 0

    try:
        tl.encode("shift-jis")
    except UnicodeError:
        # Neither Latin-1 nor Shift-JIS can represent it.
        return 0
    return 1

# Script driver: for every path given on the command line, tokenize the file
# with MeCab and write the tokens that japanese() accepts, separated by
# spaces, to "<path>_output.txt".
argvs = sys.argv
argc = len(argvs)

# The tagger does not depend on the input file, so build it once and reuse it
# for every file. It is skipped entirely when no paths were given.
tagger = MeCab.Tagger('-Ochasen') if argc > 1 else None

for w in argvs[1:]:
    # Parenthesized form prints the same thing as ``print w`` in Python 2.
    print(w)
    try:
        f = open(w)
    except (IOError, OSError) as err:
        # Best effort: an unreadable path is reported and skipped so the
        # remaining files are still processed.
        sys.stderr.write("skipping %s: %s\n" % (w, err))
        continue
    with f:
        text = f.read()

    n = tagger.parseToNode(text)
    # Step past the first node returned by parseToNode (presumably MeCab's
    # beginning-of-sentence marker -- TODO confirm against the MeCab docs).
    n = n.next

    # ``with`` guarantees the output file is closed even if a write or the
    # japanese() check raises part-way through the node list.
    with open(w + "_" + "output.txt", "w") as output:
        while n:
            if japanese(n.surface):
                output.write(n.surface + " ")
            n = n.next