Export all of the Japanese text in the HTML file (the contents of its `<p>` elements) into a text file:
import codecs

# Load the entire HTML document into memory, decoded as UTF-8.
with codecs.open('kantai_collection.html', 'r', 'utf-8') as html_file:
    html_text = html_file.read()

# Splitting on the opening tag yields one piece per '<p>' occurrence; the
# piece at index 0 is whatever preceded the first '<p>', so it is discarded.
paragraph_chunks = html_text.split('<p>')[1:]

# Write one line per paragraph: the text up to (not including) the first
# '</p>' in each piece, or the whole piece when no closing tag is present.
with codecs.open('kantai_collection.txt', 'w', 'utf-8') as text_file:
    for chunk in paragraph_chunks:
        paragraph_text = chunk.partition('</p>')[0]
        text_file.write(paragraph_text + '\n')