"""Character "alphabets": named lists of Unicode character entries.

An alphabet can be loaded from a CSV file, loaded from an HTML table, or
generated from a list of Unicode code points, and exported back to CSV.
"""
import csv
import os

try:
    from bs4 import BeautifulSoup
except ImportError:
    # bs4 is only needed by Alphabet.import_from_html; CSV import/export and
    # generation from code points keep working without it.
    BeautifulSoup = None


class Alphabet:
    """A named collection of character entries.

    Every entry is a dict. Entries built by this class (see
    ``generate_from_numbers`` and ``append_entry``) have the keys ``'Char'``,
    ``'Unicode (hex)'``, ``'Decimal'`` and ``'Learning Order'``. Entries
    imported from a file use the cells of the file's first row as keys.
    """

    def __init__(self, name: str, file_path: str = "", data: list = None,
                 *, encoding: str = None):
        """
        Create an alphabet from in-memory data or from a file

        :param name: display name of the alphabet
        :param file_path: optional path to a ``.csv`` or ``.html`` file
        :param data: optional ready-made list of entries; when given, the
            file is not read
        :param encoding: text encoding used to read ``file_path`` and to
            write exports; ``None`` keeps the platform default
        :raises FileNotFoundError: ``file_path`` is non-empty and not a file
        :raises ValueError: the file extension is neither .csv nor .html
        """
        if file_path != "" and not os.path.isfile(file_path):
            raise FileNotFoundError("File has not been found")

        self._name = name
        self._file_path = file_path
        self._encoding = encoding
        self._parsed_html = None
        self._legend = None

        if data is not None:
            self._data = data
            return

        if file_path == "":
            self._data = []
            print("Warning: Empty Alphabet!")
            return

        # Pick the importer before touching the file so that an unsupported
        # extension never leaves an open handle behind.
        extension = os.path.splitext(file_path)[1]
        if extension == '.html':
            self._data = self.import_from_html()
        elif extension == '.csv':
            self._data = self.import_from_csv()
        else:
            raise ValueError("Unknown file extension!")

    def __len__(self):
        """Return the number of entries."""
        return len(self._data)

    def get_data(self) -> list:
        """Return the list of entries (the internal list, not a copy)."""
        return self._data

    def get_name(self) -> str:
        """Return the alphabet's name."""
        return self._name

    def get_unique(self):
        """
        Build an alphabet in which every code point appears once

        Uses the 'Decimal' value of each entry for comparison. Entries are
        regenerated from the code points, so 'Learning Order' is reset to
        None in the result.

        :return: a new Alphabet with the same name, sorted by code point
        """
        # File imports store 'Decimal' as text, so normalise to int first.
        numbers = {int(entry['Decimal']) for entry in self._data}
        return Alphabet.generate_from_numbers(self._name, sorted(numbers))

    @staticmethod
    def _make_entry(number: int) -> dict:
        """Build the entry dict for a single Unicode code point."""
        return {
            'Char': chr(number),
            'Unicode (hex)': "U+" + hex(number)[2:],
            'Decimal': number,
            'Learning Order': None,
        }

    @staticmethod
    def generate_from_numbers(name: str, numbers: list):
        """
        Generate alphabet from a list of unicode numbers

        :param name: name of the new alphabet
        :param numbers: iterable of integer code points, kept in order
        :return: a new Alphabet with one entry per number
        """
        data = [Alphabet._make_entry(number) for number in numbers]
        return Alphabet(name=name, data=data)

    def import_from_csv(self) -> list:
        """
        Import file from csv

        The first row becomes the legend (the keys of every entry). Cells
        beyond the legend's width are ignored; short rows yield entries with
        fewer keys. An empty file yields an empty list.

        :return: list of entries read from ``self._file_path``
        """
        # newline="" lets the csv module handle line endings itself.
        with open(self._file_path, "r", newline="",
                  encoding=self._encoding) as stream:
            reader = csv.reader(stream)
            legend = next(reader, None)
            if legend is None:
                self._legend = []
                return []
            self._legend = legend
            return [dict(zip(legend, row)) for row in reader]

    def import_from_html(self) -> list:
        """
        Import file as html table of characters

        The 'td' cells of the first 'tr' row form the legend; every further
        'tr' row becomes one entry keyed by that legend. A document without
        'tr' rows yields an empty list.

        :return: list of entries read from ``self._file_path``
        :raises ImportError: BeautifulSoup (bs4) is not installed
        """
        if BeautifulSoup is None:
            raise ImportError("BeautifulSoup (bs4) is required to import HTML")

        with open(self._file_path, "r", encoding=self._encoding) as stream:
            html = stream.read()

        self._parsed_html = BeautifulSoup(html, features="html.parser")
        rows = self._parsed_html.find_all('tr')
        if not rows:
            self._legend = []
            return []

        self._legend = [cell.string for cell in rows[0].find_all('td')]
        return [
            dict(zip(self._legend, (cell.string for cell in row.find_all('td'))))
            for row in rows[1:]
        ]

    def export_csv(self, file_path: str):
        """
        Write all entries to a csv file

        The header row is the union of the entries' keys in first-seen
        order. A missing key or a None value is written as an empty cell.
        An empty alphabet writes only the legend it was loaded with, if any.

        :param file_path: destination; its extension is replaced by '.csv'
        :raises ValueError: the destination is the file this alphabet was
            loaded from
        """
        base, extension = os.path.splitext(file_path)
        target = file_path if extension == '.csv' else base + '.csv'
        # Compare the path that will really be written as well as the one
        # passed in, so "kanji" cannot clobber a source named "kanji.csv".
        if self._file_path in (file_path, target):
            raise ValueError("Refusing to overwrite the source file")

        data = self.get_data()
        header = list(dict.fromkeys(key for entry in data for key in entry))
        if not header and self._legend:
            header = list(self._legend)

        with open(target, 'w', newline='', encoding=self._encoding) as stream:
            writer = csv.writer(stream)
            if header:
                writer.writerow(header)
            writer.writerows(
                [entry.get(key) for key in header] for entry in data
            )

    def append_entry(self, unicode_num: int):
        """
        Append the entry for one code point and report it on stdout

        :param unicode_num: integer Unicode code point
        """
        uni_hex = hex(unicode_num)
        char = chr(unicode_num)
        print("Appending Chr:{0} Num:{1} Hex:{2} to {3}".format(
            char, unicode_num, uni_hex, self._name))
        self._data.append(Alphabet._make_entry(unicode_num))

    def append_entry_hex(self, uni_hex: str):
        """
        Append the entry for a code point given as text

        :param uni_hex: any literal accepted by ``int(text, 0)`` such as
            "0x4e00", or the "U+4e00" form used in the 'Unicode (hex)' field
        """
        if uni_hex[:2].upper() == "U+":
            number = int(uni_hex[2:], 16)
        else:
            number = int(uni_hex, 0)
        self.append_entry(number)


def main():
    """Load the bundled alphabets, print their sizes and build Kanji2."""
    kanji = Alphabet("Kanji", "kanji.csv")
    hiragana = Alphabet("Hiragana", "hiragana.csv")
    katakana = Alphabet("Katakana", "katakana.csv")
    print("Kanji:{0} Hiragana:{1} Katakana:{2}".format(
        len(kanji), len(hiragana), len(katakana)))

    # Generate kanji2 from ranges (the end of each range is exclusive)
    numbers = set(range(0x4e00, 0x9fa0)) | set(range(0x3400, 0x4dbf))
    k2 = Alphabet.generate_from_numbers("Kanji2", sorted(numbers))
    print("Kanji2: {0}".format(len(k2)))
    # To save the generated alphabet, call k2.export_csv("kanji2.csv")


if __name__ == "__main__":
    main()