You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

187 lines
4.8 KiB
Python

from bs4 import BeautifulSoup
import csv
import os
class Alphabet:
    """
    A named collection of character entries.

    Every entry is a dict. Entries created by this class carry the keys
    'Char', 'Unicode (hex)', 'Decimal' and 'Learning Order'; entries loaded
    from a file carry whatever column names the file's first row declares.
    """

    def __init__(self, name: str, file_path: str = "", data: list = None):
        """
        Create an alphabet from explicit data, from a file, or empty.

        :param name: display name of the alphabet
        :param file_path: optional path to a '.csv' or '.html' source file
        :param data: optional ready-made list of entry dicts; when given it
                     is used as-is and the file is not read
        :raises FileNotFoundError: if file_path is non-empty and not a file
        :raises ValueError: if the file extension is neither .csv nor .html
        """
        if file_path != "" and not os.path.isfile(file_path):
            raise FileNotFoundError("File has not been found")
        self._name = name
        self._file_path = file_path
        self._parsed_html = None
        self._legend = None
        if data is not None:
            self._data = data
            return
        if file_path == "":
            self._data = []
            print("Warning: Empty Alphabet!")
            return
        # The importers open (and close) the file themselves, so no file
        # handle outlives the constructor, even when the extension is bad.
        extension = os.path.splitext(file_path)[1].lower()
        if extension == '.html':
            self._data = self.import_from_html()
        elif extension == '.csv':
            self._data = self.import_from_csv()
        else:
            # ValueError is an Exception subclass, so callers that caught
            # the former generic Exception keep working.
            raise ValueError("Unknown file extension!")

    def __len__(self):
        """Return the number of entries."""
        return len(self._data)

    def get_data(self) -> list:
        """Return the underlying list of entry dicts (not a copy)."""
        return self._data

    def get_name(self) -> str:
        """Return the alphabet's name."""
        return self._name

    def get_unique(self) -> "Alphabet":
        """
        Build a new alphabet containing each code point only once.

        Uses the 'Decimal' value of every entry for comparison. Values are
        converted with int() because entries loaded from CSV hold strings.
        The first occurrence of each code point determines its position.

        :return: a new Alphabet with the same name and regenerated entries
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        unique_numbers = dict.fromkeys(int(entry['Decimal']) for entry in self._data)
        return Alphabet.generate_from_numbers(self._name, list(unique_numbers))

    @staticmethod
    def _make_entry(number: int) -> dict:
        """Build the entry dict for a single unicode code point."""
        return {
            'Char': chr(number),
            'Unicode (hex)': "U+" + hex(number)[2:],
            'Decimal': number,
            'Learning Order': None,
        }

    @staticmethod
    def generate_from_numbers(name: str, numbers: list) -> "Alphabet":
        """
        Generate alphabet from a list of unicode numbers

        :param name: name of the new alphabet
        :param numbers: iterable of integer code points
        :return: a new Alphabet holding one entry per given number
        """
        data = [Alphabet._make_entry(n) for n in numbers]
        return Alphabet(name=name, data=data)

    def import_from_csv(self) -> list:
        """
        Import entries from the CSV file at this alphabet's file path.

        The first row is stored as the legend (column names); every other
        row becomes a dict keyed by the legend. Cells beyond the legend's
        length are ignored. An empty file yields an empty legend and no
        entries.

        :return: list of entry dicts
        """
        # newline="" is what the csv module asks for; utf-8-sig reads plain
        # UTF-8 and also drops a leading byte-order mark if one is present.
        with open(self._file_path, "r", encoding="utf-8-sig", newline="") as stream:
            reader = csv.reader(stream)
            header = next(reader, None)
            if header is None:
                self._legend = []
                return []
            self._legend = header
            return [dict(zip(header, row)) for row in reader]

    def import_from_html(self) -> list:
        """
        Import entries from an HTML table of characters.

        The cells of the first <tr> form the legend (both <th> and <td>
        cells are accepted there); the <td> cells of every following <tr>
        become one entry keyed by the legend. A document without any <tr>
        yields an empty legend and no entries.

        :return: list of entry dicts
        """
        # Hand BeautifulSoup the raw bytes so it can work out the document
        # encoding itself instead of relying on the platform default.
        with open(self._file_path, "rb") as stream:
            self._parsed_html = BeautifulSoup(stream, features="html.parser")
        rows = self._parsed_html.find_all('tr')
        if not rows:
            self._legend = []
            return []
        self._legend = [cell.string for cell in rows[0].find_all(['th', 'td'])]
        data = []
        for row in rows[1:]:
            cells = (cell.string for cell in row.find_all('td'))
            data.append(dict(zip(self._legend, cells)))
        return data

    def export_csv(self, file_path: str):
        """
        Write all entries to a CSV file.

        A file_path whose extension is not '.csv' has its extension replaced
        by '.csv'. The header row is the union of all entry keys in
        first-seen order; a key missing from an entry is written as an empty
        cell. With no entries, only the legend (if any) is written.

        :param file_path: destination path
        :raises ValueError: if the destination is the file this alphabet
                            was loaded from
        """
        requested_path = file_path
        root, extension = os.path.splitext(file_path)
        if extension != '.csv':
            file_path = root + '.csv'
        # Checked with a real exception (asserts vanish under -O) and also
        # against the normalised path, which is the file actually written.
        source = os.path.abspath(self._file_path)
        if source in (os.path.abspath(requested_path), os.path.abspath(file_path)):
            raise ValueError("Refusing to overwrite the source file: " + file_path)

        entries = self.get_data()
        if entries:
            header = list(dict.fromkeys(key for entry in entries for key in entry))
        else:
            header = list(self._legend or [])

        with open(file_path, 'w', encoding="utf-8", newline="") as stream:
            writer = csv.writer(stream)
            if header:
                writer.writerow(header)
            for entry in entries:
                writer.writerow([entry.get(key) for key in header])

    def append_entry(self, unicode_num: int):
        """
        Append the entry for one unicode code point and report it on stdout.

        :param unicode_num: integer code point
        """
        entry = Alphabet._make_entry(unicode_num)
        print("Appending Chr:{0} Num:{1} Hex:{2} to {3}".format(
            entry['Char'], unicode_num, hex(unicode_num), self._name))
        self._data.append(entry)

    def append_entry_hex(self, uni_hex: str):
        """
        Append the entry for a code point given as text.

        Accepts anything int(text, 0) understands (e.g. "0x4e00") and, in
        addition, the "U+4e00" notation used in the 'Unicode (hex)' column.

        :param uni_hex: textual code point
        :raises ValueError: if the text cannot be parsed as a number
        """
        text = uni_hex.strip()
        if text[:2].upper() == "U+":
            number = int(text[2:], 16)
        else:
            number = int(text, 0)
        self.append_entry(number)
if __name__ == "__main__":
    # Load the three alphabets from CSV files expected in the working directory.
    kanji = Alphabet("Kanji", "kanji.csv")
    hiragana = Alphabet("Hiragana", "hiragana.csv")
    katakana = Alphabet("Katakana", "katakana.csv")
    counts = (
        len(kanji.get_data()),
        len(hiragana.get_data()),
        len(katakana.get_data()),
    )
    print("Kanji:{0} Hiragana:{1} Katakana:{2}".format(*counts))

    # Generate kanji2 from ranges
    # NOTE(review): range() excludes its end value, so 0x9fa0 and 0x4dbf
    # themselves are not generated -- confirm that is intended.
    start = 0x4e00
    end = 0x9fa0
    n = list(range(start, end))
    start = 0x3400
    end = 0x4dbf
    n.extend(range(start, end))
    # Round-trip through a set to drop duplicate code points.
    n = list(set(n))
    k2 = Alphabet.generate_from_numbers("Kanji2", n)
    print("Kanji2: {0}".format(len(k2)))
    #k2.export_csv("kanji2.csv")