Find all the Japanese characters in the HTML file.
import re

# Character classes matched, in order: CJK symbols and punctuation,
# hiragana, katakana, fullwidth/halfwidth forms, CJK unified ideographs,
# and CJK unified ideographs extension A.
_JAPANESE_RE = re.compile(
    r'[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uff9f\u4e00-\u9faf\u3400-\u4dbf]'
)


def find_japanese(file, encoding="utf-8"):
    """Return the whitespace-separated tokens of a file that contain Japanese text.

    The file is read in full and split on runs of whitespace. A token is kept
    when at least one of its characters falls in the ranges of
    ``_JAPANESE_RE``. Tokens are returned whole (surrounding markup or ASCII
    attached to the Japanese characters is kept), in file order, with
    duplicates preserved.

    Args:
        file: Path of the text/HTML file to scan (anything ``open`` accepts).
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale; pass e.g.
            ``"shift_jis"`` for legacy pages.

    Returns:
        A list of matching tokens; empty when nothing matches.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid for ``encoding``.
    """
    # The context manager guarantees the handle is closed even if decoding fails.
    with open(file, encoding=encoding) as handle:
        content = handle.read()

    # str.split() with no argument already discards leading/trailing whitespace.
    return [token for token in content.split() if _JAPANESE_RE.search(token)]


if __name__ == "__main__":
    # Only run the sample scan when executed as a script, not on import.
    find_japanese('html_file.txt')