You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
187 lines
4.8 KiB
Python
187 lines
4.8 KiB
Python
5 years ago
|
from bs4 import BeautifulSoup
|
||
|
|
||
|
import csv
|
||
|
import os
|
||
|
|
||
|
|
||
|
class Alphabet:
    """
    A named list of character entries.

    Every entry is a dict. Entries created by this class itself
    (``generate_from_numbers`` / ``append_entry``) carry the keys ``'Char'``,
    ``'Unicode (hex)'``, ``'Decimal'`` and ``'Learning Order'``. Entries
    loaded from a ``.csv`` or ``.html`` file use whatever column names the
    first row of that file provides.
    """

    def __init__(self, name: str, file_path: str = "", data: list = None):
        """
        Create an alphabet from in-memory data, from a file, or empty.

        :param name: Display name of the alphabet.
        :param file_path: Path to a ``.csv`` or ``.html`` file to load. An
            empty string means "no file".
        :param data: Ready-made list of entry dicts. When given it is stored
            as-is (not copied) and the file is not read.
        :raises FileNotFoundError: If ``file_path`` is non-empty and is not an
            existing file (checked even when ``data`` is supplied).
        :raises ValueError: If the file extension is neither ``.html`` nor
            ``.csv``.
        """
        if not os.path.isfile(file_path) and file_path != "":
            raise FileNotFoundError("File has not been found")

        self._name = name
        self._file_path = file_path
        self._parsed_html = None
        self._legend = None

        if data is not None:
            self._data = data
            return

        if file_path == "":
            self._data = []
            print("Warning: Empty Alphabet!")
            return

        # Decide on the importer before touching the file so that an
        # unsupported extension cannot leave an open handle behind.
        _, file_extension = os.path.splitext(self._file_path)
        if file_extension == '.html':
            self._data = self.import_from_html()
        elif file_extension == '.csv':
            self._data = self.import_from_csv()
        else:
            # ValueError is a subclass of Exception, so callers that caught
            # the previous bare Exception keep working.
            raise ValueError("Unknown file extension!")

    def __len__(self):
        """Return the number of entries in the alphabet."""
        return len(self._data)

    def get_data(self) -> list:
        """Return the internal list of entry dicts (the live list, not a copy)."""
        return self._data

    def get_name(self) -> str:
        """Return the name given at construction time."""
        return self._name

    def get_unique(self):
        """
        Build a new alphabet that contains every code point only once.

        Uses the ``'Decimal'`` value of each entry for comparison. Values are
        converted with ``int()`` first because entries loaded from CSV hold
        them as strings. First-seen order is preserved.

        :return: A new ``Alphabet`` with the same name and regenerated entries.
        :raises KeyError: If an entry has no ``'Decimal'`` key.
        :raises ValueError: If a ``'Decimal'`` value is not a valid integer.
        """
        # dict.fromkeys de-duplicates while keeping insertion order.
        unique_numbers = list(
            dict.fromkeys(int(entry['Decimal']) for entry in self._data)
        )
        return Alphabet.generate_from_numbers(self._name, unique_numbers)

    @staticmethod
    def _make_entry(unicode_num: int) -> dict:
        """Build the standard entry dict for a single Unicode code point."""
        return {
            'Char': chr(unicode_num),
            'Unicode (hex)': "U+" + hex(unicode_num)[2:],
            'Decimal': unicode_num,
            'Learning Order': None,
        }

    @staticmethod
    def generate_from_numbers(name: str, numbers: list):
        """
        Generate alphabet from a list of unicode numbers

        :param name: Name of the new alphabet.
        :param numbers: Iterable of integer code points, one entry is created
            per element in the given order.
        :return: A new ``Alphabet`` holding the generated entries.
        """
        data = [Alphabet._make_entry(n) for n in numbers]
        return Alphabet(name=name, data=data)

    def import_from_csv(self) -> list:
        """
        Import file from csv

        Reads ``self._file_path``. The first row is stored as the legend
        (column names); every following row becomes a dict mapping column
        name to cell text. Cells beyond the legend width are ignored, short
        rows simply yield fewer keys, and an empty file yields no entries.

        :return: List of entry dicts, one per data row.
        """
        # newline="" is what the csv module asks for so that quoted fields
        # containing line breaks are parsed correctly.
        with open(self._file_path, "r", newline="") as stream:
            reader = csv.reader(stream)
            legend = next(reader, None)
            if legend is None:
                # Completely empty file: no header, no rows.
                self._legend = []
                return []

            self._legend = legend
            return [dict(zip(legend, row)) for row in reader]

    def import_from_html(self) -> list:
        """
        Import file as html table of characters

        Reads ``self._file_path`` and parses it with BeautifulSoup. The
        ``<td>`` cells of the first ``<tr>`` are used as the legend; each
        later ``<tr>`` becomes a dict mapping legend name to the cell's
        ``.string`` value. A document without any ``<tr>`` yields no entries.

        :return: List of entry dicts, one per table row after the first.
        """
        with open(self._file_path, "r") as stream:
            html = stream.read()

        self._parsed_html = BeautifulSoup(html, features="html.parser")
        rows = self._parsed_html.find_all('tr')
        if not rows:
            self._legend = []
            return []

        # NOTE(review): the header row is read from <td>, not <th>; tables
        # that use <th> for headings would produce an empty legend - confirm
        # against the real input files.
        self._legend = [cell.string for cell in rows[0].find_all('td')]

        data = []
        for row in rows[1:]:
            cells = row.find_all('td')
            # zip stops at the shorter side, so surplus cells are ignored.
            data.append(
                dict(zip(self._legend, (cell.string for cell in cells)))
            )
        return data

    def export_csv(self, file_path: str):
        """
        Write all entries to a CSV file.

        The target always gets a ``.csv`` extension: any other extension in
        ``file_path`` is replaced. The header row is taken from the keys of
        the first entry; for an alphabet without entries the stored legend is
        used if there is one, otherwise the file is left empty.

        :param file_path: Destination path.
        :raises ValueError: If the destination is the file this alphabet was
            loaded from (checked before and after the extension is
            normalised, so the source can never be overwritten).
        """
        requested_path = file_path
        file_name, file_extension = os.path.splitext(file_path)
        if file_extension != '.csv':
            file_path = file_name + '.csv'

        source_path = self._file_path
        targets_source = requested_path == source_path or (
            source_path != ""
            and os.path.abspath(file_path) == os.path.abspath(source_path)
        )
        if targets_source:
            # An explicit raise survives "python -O", unlike an assert.
            raise ValueError(
                "Refusing to export over the file the alphabet was loaded from"
            )

        entries = self.get_data()
        if entries:
            header = list(entries[0].keys())
        else:
            header = list(self._legend or [])

        # The context manager closes the file even if a row fails to write.
        with open(file_path, 'w', newline='') as stream:
            writer = csv.writer(stream)
            if header:
                writer.writerow(header)
            for entry in entries:
                writer.writerow(entry.values())

    def append_entry(self, unicode_num: int):
        """
        Append the entry for one Unicode code point and log it to stdout.

        :param unicode_num: Integer code point accepted by ``chr()``.
        :raises ValueError: If ``unicode_num`` is outside the range of
            ``chr()``; nothing is printed or appended in that case.
        """
        entry = Alphabet._make_entry(unicode_num)

        print("Appending Chr:{0} Num:{1} Hex:{2} to {3}".format(
            entry['Char'], unicode_num, hex(unicode_num), self._name))

        self._data.append(entry)

    def append_entry_hex(self, uni_hex: str):
        """
        Append the entry for a code point given as a string.

        :param uni_hex: Integer literal such as ``"0x4e00"``; it is parsed
            with ``int(uni_hex, 0)``, so the prefix selects the base.
        """
        self.append_entry(int(uni_hex, 0))
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Load the three alphabets from CSV files in the working directory,
    # in the same order as before: kanji, hiragana, katakana.
    kanji, hiragana, katakana = (
        Alphabet(title, csv_file)
        for title, csv_file in (
            ("Kanji", "kanji.csv"),
            ("Hiragana", "hiragana.csv"),
            ("Katakana", "katakana.csv"),
        )
    )

    # Report how many entries each alphabet holds.
    sizes = [len(alphabet.get_data()) for alphabet in (kanji, hiragana, katakana)]
    print("Kanji:{0} Hiragana:{1} Katakana:{2}".format(*sizes))

    # Generate kanji2 from ranges
    # range() excludes its end value, so 0x9fa0 and 0x4dbf themselves are
    # not part of the generated set.
    # NOTE(review): presumably the CJK Unified Ideographs block followed by
    # Extension A - confirm whether the exclusive upper bounds are intended.
    code_points = [*range(0x4e00, 0x9fa0), *range(0x3400, 0x4dbf)]

    # De-duplicate the combined list.
    code_points = list(set(code_points))

    k2 = Alphabet.generate_from_numbers("Kanji2", code_points)
    print("Kanji2: {0}".format(len(k2)))
    # Export is intentionally disabled:
    # k2.export_csv("kanji2.csv")
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|