Basic Working Version
Rudimentary version, controlled with in-code statements. Fetched the contents of kanji.csv from www.tonypottier.info and converted them into a CSV file. The other CSV files are based on Unicode ranges from Wikipedia.
master
parent
a32d18c6d9
commit
c94a2ea418
@ -0,0 +1,186 @@
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import csv
|
||||
import os
|
||||
|
||||
|
||||
class Alphabet:
    """A named list of character entries.

    Every entry is a dict that maps a column name to a value. Entries built
    by this class itself use the keys ``'Char'``, ``'Unicode (hex)'``,
    ``'Decimal'`` and ``'Learning Order'``; entries imported from a file use
    whatever column names the file's first row declares.
    """

    def __init__(self, name: str, file_path: str = "", data: list = None,
                 encoding: str = "utf-8"):
        """
        Create an alphabet from in-memory data or from a ``.csv``/``.html`` file

        :param name: display name of the alphabet
        :param file_path: file to import; ignored when ``data`` is given
        :param data: ready-made list of entry dicts (stored as-is, not copied)
        :param encoding: text encoding used for reading and writing files
        :raises FileNotFoundError: if ``file_path`` is non-empty and not a file
        :raises Exception: if the file extension is neither .html nor .csv
        """
        if file_path != "" and not os.path.isfile(file_path):
            raise FileNotFoundError("File has not been found")

        self._name = name
        self._file_path = file_path
        self._encoding = encoding
        self._parsed_html = None
        self._legend = None

        if data is not None:
            self._data = data
            return

        if file_path == "":
            self._data = []
            print("Warning: Empty Alphabet!")
            return

        # The extension is checked before any file is opened, and the import
        # methods open/close the file themselves, so no handle can leak.
        file_extension = os.path.splitext(self._file_path)[1]
        if file_extension == '.html':
            self._data = self.import_from_html()
        elif file_extension == '.csv':
            self._data = self.import_from_csv()
        else:
            raise Exception("Unknown file extension!")

    def __len__(self):
        """Number of entries in the alphabet"""
        return len(self._data)

    def get_data(self) -> list:
        """Return the internal list of entry dicts (not a copy)"""
        return self._data

    def get_name(self) -> str:
        """Return the alphabet's name"""
        return self._name

    def get_unique(self):
        """
        Build an alphabet that contains every code point of this one once

        Uses the 'Decimal' column for comparison. Values are converted with
        ``int`` first because entries imported from a file hold them as text.

        :return: new Alphabet with the same name, ordered by code point
        """
        code_points = {int(entry['Decimal']) for entry in self._data}
        return Alphabet.generate_from_numbers(self._name, sorted(code_points))

    @staticmethod
    def _make_entry(number: int) -> dict:
        """Build the entry dict for a single unicode code point"""
        return {
            'Char': chr(number),
            'Unicode (hex)': "U+" + hex(number)[2:],
            'Decimal': number,
            'Learning Order': None,
        }

    @staticmethod
    def generate_from_numbers(name: str, numbers: list):
        """
        Generate alphabet from a list of unicode numbers

        :param name: name of the new alphabet
        :param numbers: iterable of integer code points
        :return: new Alphabet holding one entry per number, in input order
        """
        data = [Alphabet._make_entry(n) for n in numbers]
        return Alphabet(name=name, data=data)

    def import_from_csv(self) -> list:
        """
        Import the alphabet's file as csv

        The first row is stored as the legend (column names). Each following
        row becomes a dict keyed by the legend; cells beyond the legend's
        length are dropped. An empty file yields an empty list.

        :return: list of entry dicts
        """
        # newline="" is what the csv module expects for files it reads
        with open(self._file_path, "r", encoding=self._encoding,
                  newline="") as stream:
            reader = csv.reader(stream)
            legend = next(reader, None)
            if legend is None:
                self._legend = []
                return []
            self._legend = legend
            return [dict(zip(legend, row)) for row in reader]

    def import_from_html(self) -> list:
        """
        Import the alphabet's file as html table of characters

        The ``td`` cells of the first ``tr`` row are stored as the legend;
        every further row becomes a dict keyed by the legend. A document
        without any ``tr`` row yields an empty list.

        :return: list of entry dicts
        """
        with open(self._file_path, "r", encoding=self._encoding) as stream:
            html = stream.read()

        self._parsed_html = BeautifulSoup(html, features="html.parser")
        rows = self._parsed_html.find_all('tr')
        if not rows:
            self._legend = []
            return []

        self._legend = [cell.string for cell in rows[0].find_all('td')]
        return [
            dict(zip(self._legend,
                     (cell.string for cell in row.find_all('td'))))
            for row in rows[1:]
        ]

    def export_csv(self, file_path: str):
        """
        Write all entries to a csv file

        The target always gets a ``.csv`` extension. The header row is taken
        from the keys of the first entry (or from the imported legend when
        there are no entries).

        :param file_path: destination path
        :raises AssertionError: if the destination is the file this alphabet
            was imported from
        """
        requested_path = file_path
        file_name, file_extension = os.path.splitext(file_path)
        if file_extension != '.csv':
            file_path = file_name + '.csv'

        # Compare after the extension has been normalised as well, otherwise
        # e.g. "kanji.txt" would silently overwrite a "kanji.csv" source.
        overwrites_source = requested_path == self._file_path or (
            self._file_path != ""
            and os.path.abspath(file_path) == os.path.abspath(self._file_path)
        )
        if overwrites_source:
            # Raised explicitly (not via `assert`) so the guard survives
            # `python -O`; the exception type is kept for existing callers.
            raise AssertionError("Refusing to overwrite the source file")

        entries = self.get_data()
        with open(file_path, 'w', encoding=self._encoding,
                  newline='') as stream:
            writer = csv.writer(stream)
            if entries:
                writer.writerow(entries[0].keys())
            elif self._legend:
                writer.writerow(self._legend)
            for entry in entries:
                writer.writerow(entry.values())

    def append_entry(self, unicode_num: int):
        """
        Append the entry for one code point and print what was appended

        :param unicode_num: integer unicode code point
        """
        entry = Alphabet._make_entry(unicode_num)

        print("Appending Chr:{0} Num:{1} Hex:{2} to {3}".format(
            entry['Char'], unicode_num, hex(unicode_num), self._name))

        self._data.append(entry)

    def append_entry_hex(self, uni_hex: str):
        """
        Append the entry for a code point given as hexadecimal text

        Accepts "0x3042", "U+3042" (the notation used in the
        'Unicode (hex)' column) and bare digits such as "3042".

        :param uni_hex: hexadecimal code point
        :raises ValueError: if the text is not a hexadecimal number
        """
        digits = uni_hex.strip()
        if digits[:2].upper() == "U+":
            digits = digits[2:]
        # int(..., 16) still accepts an optional "0x" prefix
        self.append_entry(int(digits, 16))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Load the three alphabets from csv files in the working directory
    kanji = Alphabet("Kanji", "kanji.csv")
    hiragana = Alphabet("Hiragana", "hiragana.csv")
    katakana = Alphabet("Katakana", "katakana.csv")

    sizes = (
        len(kanji.get_data()),
        len(hiragana.get_data()),
        len(katakana.get_data()),
    )
    print("Kanji:{0} Hiragana:{1} Katakana:{2}".format(*sizes))

    # Generate kanji2 from ranges
    # NOTE(review): range() excludes its end value, so 0x9fa0 and 0x4dbf
    # themselves are not generated -- confirm that this is intended.
    n = []
    for start, end in ((0x4e00, 0x9fa0), (0x3400, 0x4dbf)):
        n += list(range(start, end))

    # Drop duplicate code points
    n = list(set(n))

    k2 = Alphabet.generate_from_numbers("Kanji2", n)
    print("Kanji2: {0}".format(len(k2)))
    #k2.export_csv("kanji2.csv")
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,93 @@
|
||||
from Alphabet import Alphabet
|
||||
|
||||
|
||||
class Detector:
    """Sorts the characters of a text by the alphabets that contain them.

    An alphabet is any object offering ``get_name()`` and ``get_data()``,
    where ``get_data()`` returns entry dicts with a ``'Char'`` key.
    """

    def __init__(self):
        # Start with an empty list so match_text works before any alphabet
        # has been added.
        self._alphs = []

    def add_alphabet(self, alph: "Alphabet"):
        """
        Register an alphabet to match against

        :param alph: alphabet to add
        """
        if self._alphs is None:
            self._alphs = []
        self._alphs.append(alph)

    def match_text(self, text: str) -> dict:
        """
        Determine if a given text uses some characters from any alphabet

        Whitespace characters are skipped and do not count towards the
        total. A character found in several alphabets is listed under each
        of them; a character found in none is listed under 'Unknown'.

        :param text: text to analyse
        :return: dict mapping each alphabet name (plus 'Unknown') to
            ``[matches, fraction]`` where ``matches`` is the list of matched
            characters in text order and ``fraction`` is
            ``len(matches) / number of non-whitespace characters``
            (0.0 when there are none)
        """
        # One set of characters per alphabet: constant-time lookups, and a
        # character that is listed twice in an alphabet is matched only once.
        char_sets = [
            (alph.get_name(), {entry['Char'] for entry in alph.get_data()})
            for alph in (self._alphs or [])
        ]

        results = {name: [[]] for name, _ in char_sets}  # [matches], percentage
        results['Unknown'] = [[]]

        counted = 0
        for c in text:
            if c.isspace():  # ignoring whitespace
                continue
            counted += 1

            found = False
            for name, chars in char_sets:
                if c in chars:
                    results[name][0].append(c)
                    found = True

            if not found:
                results['Unknown'][0].append(c)

        # Create statistics
        for value in results.values():
            value.append(len(value[0]) / counted if counted else 0.0)
        return results

    @staticmethod
    def pretty_result(result: dict, small: bool = False):
        """
        Format a match_text result as aligned text, one line per alphabet

        The given result is not modified.

        :param result: dict as returned by match_text
        :param small: if True, print only name and percentage per line
        :return: formatted string ("" for an empty result)
        """
        if not result:
            return ""

        longest_name = max(len(key) for key in result)
        longest_match = max(len(value[0]) for value in result.values())

        lines = []
        for key, value in result.items():
            # "<name>: " padded so that all following columns line up
            name = (key + ": ").ljust(longest_name + 3)
            percentage = round(value[1] * 100, 2)

            if small:
                lines.append(name + str(percentage) + "\n")
                continue

            match_list = value[0]
            # Pad in a local string; the caller's list stays untouched
            padding = " " * (longest_match + 1 - len(match_list))
            matches = "".join(match_list) + padding
            lines.append(name + "Matches: " + matches
                         + " MatchPercent: " + str(percentage) + "\n")

        return "".join(lines)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Load the reference alphabets from csv files in the working directory
    kanji = Alphabet("Kanji (small)", "kanji.csv")
    kanji2 = Alphabet("Kanji (big)", "kanji2.csv")
    hiragana = Alphabet("Hiragana", "hiragana.csv")
    katakana = Alphabet("Katakana", "katakana.csv")
    print(kanji.get_name(), len(kanji.get_data()))
    print(kanji2.get_name(), len(kanji2.get_data()))
    print(hiragana.get_name(), len(hiragana.get_data()))
    print(katakana.get_name(), len(katakana.get_data()))
    print()

    d = Detector()
    d.add_alphabet(kanji)
    d.add_alphabet(kanji2)
    d.add_alphabet(hiragana)
    d.add_alphabet(katakana)

    # Read the sample in one go; the context manager closes the file and the
    # explicit encoding keeps the Japanese text readable on every platform.
    with open("./sample-text.txt", 'r', encoding="utf-8") as sample:
        text = sample.read()

    print("Matches: \n{0}".format(d.pretty_result(d.match_text(text), small=True)))
|
@ -0,0 +1,90 @@
|
||||
Char,Unicode (hex),Decimal,LearningOrder
|
||||
ぁ,U+3041,12353,
|
||||
あ,U+3042,12354,
|
||||
ぃ,U+3043,12355,
|
||||
い,U+3044,12356,
|
||||
ぅ,U+3045,12357,
|
||||
う,U+3046,12358,
|
||||
ぇ,U+3047,12359,
|
||||
え,U+3048,12360,
|
||||
ぉ,U+3049,12361,
|
||||
お,U+304a,12362,
|
||||
か,U+304b,12363,
|
||||
が,U+304c,12364,
|
||||
き,U+304d,12365,
|
||||
ぎ,U+304e,12366,
|
||||
く,U+304f,12367,
|
||||
ぐ,U+3050,12368,
|
||||
け,U+3051,12369,
|
||||
げ,U+3052,12370,
|
||||
こ,U+3053,12371,
|
||||
ご,U+3054,12372,
|
||||
さ,U+3055,12373,
|
||||
ざ,U+3056,12374,
|
||||
し,U+3057,12375,
|
||||
す,U+3059,12377,
|
||||
ず,U+305a,12378,
|
||||
せ,U+305b,12379,
|
||||
ぜ,U+305c,12380,
|
||||
そ,U+305d,12381,
|
||||
ぞ,U+305e,12382,
|
||||
た,U+305f,12383,
|
||||
だ,U+3060,12384,
|
||||
ち,U+3061,12385,
|
||||
ぢ,U+3062,12386,
|
||||
っ,U+3063,12387,
|
||||
つ,U+3064,12388,
|
||||
づ,U+3065,12389,
|
||||
て,U+3066,12390,
|
||||
で,U+3067,12391,
|
||||
ど,U+3069,12393,
|
||||
な,U+306a,12394,
|
||||
に,U+306b,12395,
|
||||
ぬ,U+306c,12396,
|
||||
ね,U+306d,12397,
|
||||
の,U+306e,12398,
|
||||
は,U+306f,12399,
|
||||
ば,U+3070,12400,
|
||||
ぱ,U+3071,12401,
|
||||
ひ,U+3072,12402,
|
||||
び,U+3073,12403,
|
||||
ぴ,U+3074,12404,
|
||||
ふ,U+3075,12405,
|
||||
ぶ,U+3076,12406,
|
||||
ぷ,U+3077,12407,
|
||||
べ,U+3079,12409,
|
||||
ぺ,U+307a,12410,
|
||||
ほ,U+307b,12411,
|
||||
ぼ,U+307c,12412,
|
||||
ぽ,U+307d,12413,
|
||||
ま,U+307e,12414,
|
||||
み,U+307f,12415,
|
||||
む,U+3080,12416,
|
||||
め,U+3081,12417,
|
||||
も,U+3082,12418,
|
||||
ゃ,U+3083,12419,
|
||||
や,U+3084,12420,
|
||||
ゅ,U+3085,12421,
|
||||
ゆ,U+3086,12422,
|
||||
ょ,U+3087,12423,
|
||||
ら,U+3089,12425,
|
||||
り,U+308a,12426,
|
||||
る,U+308b,12427,
|
||||
れ,U+308c,12428,
|
||||
ろ,U+308d,12429,
|
||||
ゎ,U+308e,12430,
|
||||
わ,U+308f,12431,
|
||||
ゐ,U+3090,12432,
|
||||
ゑ,U+3091,12433,
|
||||
を,U+3092,12434,
|
||||
ん,U+3093,12435,
|
||||
ゔ,U+3094,12436,
|
||||
ゕ,U+3095,12437,
|
||||
ゖ,U+3096,12438,
|
||||
゙,U+3099,12441,
|
||||
゚,U+309a,12442,
|
||||
゛,U+309b,12443,
|
||||
゜,U+309c,12444,
|
||||
ゝ,U+309d,12445,
|
||||
ゞ,U+309e,12446,
|
||||
ゟ,U+309f,12447,
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,5 @@
|
||||
会市屋詩
|
||||
|
||||
ヷムペヺ
|
||||
|
||||
ぅぴ
|
Loading…
Reference in New Issue