Basic Working Version

Rudimentary version controlled with in-code statements.
Fetched contents of kanji.csv from www.tonypottier.info
and converted them into a CSV. The other CSVs are based on
Unicode ranges from Wikipedia.
master
Peery 6 years ago
parent a32d18c6d9
commit c94a2ea418

@ -0,0 +1,186 @@
from bs4 import BeautifulSoup
import csv
import os
class Alphabet:
    """
    A named collection of characters, stored as a list of dict entries.

    Entries built by this class carry the keys 'Char', 'Unicode (hex)',
    'Decimal' and 'Learning Order'. Entries imported from a file use
    whatever column names the first row of that file declares, and all of
    their values are strings.
    """

    def __init__(self, name: str, file_path: str = "", data: list = None):
        """
        Create an alphabet from in-memory data or from a .csv / .html file.

        :param name: display name of the alphabet
        :param file_path: optional path to a '.csv' or '.html' source file
        :param data: optional ready-made list of entries; when given, the
                     file is not read
        :raises FileNotFoundError: if file_path is non-empty but not a file
        :raises ValueError: if the file extension is neither .csv nor .html
        """
        if file_path != "" and not os.path.isfile(file_path):
            raise FileNotFoundError("File has not been found")
        self._name = name
        self._file_path = file_path
        self._parsed_html = None
        self._legend = None
        if data is not None:
            self._data = data
            return
        if file_path == "":
            self._data = []
            print("Warning: Empty Alphabet!")
            return
        # The importers open (and close) the file themselves, so no file
        # handle outlives the constructor.
        file_extension = os.path.splitext(file_path)[1].lower()
        if file_extension == '.html':
            self._data = self.import_from_html()
        elif file_extension == '.csv':
            self._data = self.import_from_csv()
        else:
            # ValueError is an Exception subclass, so existing handlers
            # written against the former bare Exception keep working.
            raise ValueError("Unknown file extension!")

    def __len__(self):
        """Return the number of entries in the alphabet."""
        return len(self._data)

    def get_data(self) -> list:
        """Return the list of entries (the live list, not a copy)."""
        return self._data

    def get_name(self) -> str:
        """Return the display name of the alphabet."""
        return self._name

    def get_unique(self):
        """
        Build an alphabet in which every code point occurs only once.

        Entries are compared by their 'Decimal' value, which may be an int
        (generated entries) or a numeric string (imported entries). The
        result is regenerated from the code points, so columns other than
        the four standard ones are not carried over.

        :return: new Alphabet with the same name, in first-occurrence order
        """
        seen = set()
        numbers = []
        for entry in self._data:
            number = int(entry['Decimal'])
            if number not in seen:
                seen.add(number)
                numbers.append(number)
        return Alphabet.generate_from_numbers(self._name, numbers)

    @staticmethod
    def _make_entry(number: int) -> dict:
        """Build the standard entry dict for one unicode code point."""
        # NOTE(review): the CSV headers shipped with this commit spell the
        # last column 'LearningOrder' - confirm which spelling is intended.
        return {
            'Char': chr(number),
            'Unicode (hex)': "U+" + hex(number)[2:],
            'Decimal': number,
            'Learning Order': None,
        }

    @staticmethod
    def generate_from_numbers(name: str, numbers: list):
        """
        Generate alphabet from a list of unicode numbers

        :param name: display name of the new alphabet
        :param numbers: unicode code points as ints
        :return: new Alphabet holding one entry per number, in input order
        """
        data = [Alphabet._make_entry(n) for n in numbers]
        return Alphabet(name=name, data=data)

    def import_from_csv(self) -> list:
        """
        Import file from csv

        The first row is taken as the legend (column names); every further
        non-blank row becomes one entry keyed by the legend. Cells beyond
        the legend's length are ignored.

        :return: list of entries; empty if the file has no rows at all
        """
        # 'utf-8-sig' also reads plain UTF-8 and keeps a BOM out of the
        # first column name; newline="" is what the csv module expects.
        with open(self._file_path, "r", encoding="utf-8-sig", newline="") as stream:
            reader = csv.reader(stream)
            legend = next(reader, None)
            if legend is None:
                self._legend = []
                return []
            self._legend = legend
            return [dict(zip(legend, row)) for row in reader if row]

    def import_from_html(self) -> list:
        """
        Import file as html table of characters

        The cells of the first table row are taken as the legend; each
        following row becomes one entry keyed by the legend.

        :return: list of entries; empty if the document has no table rows
        """
        with open(self._file_path, "r", encoding="utf-8") as stream:
            html = stream.read()
        self._parsed_html = BeautifulSoup(html, features="html.parser")
        rows = self._parsed_html.find_all('tr')
        if not rows:
            self._legend = []
            return []
        self._legend = [cell.string for cell in rows[0].find_all('td')]
        data = []
        for row in rows[1:]:
            cells = [cell.string for cell in row.find_all('td')]
            data.append(dict(zip(self._legend, cells)))
        return data

    def export_csv(self, file_path: str):
        """
        Write the alphabet to a csv file.

        A '.csv' extension is enforced on the target path. The header row
        is taken from the keys of the first entry.

        :param file_path: where to write; must not be the source file
        :raises ValueError: if the target is the file this alphabet was
                            loaded from
        """
        file_name, file_extension = os.path.splitext(file_path)
        target = file_path if file_extension == '.csv' else file_name + '.csv'
        # Compare the path that will really be written, so that e.g.
        # export_csv("kanji") cannot overwrite the source "kanji.csv".
        same_as_source = file_path == self._file_path or (
            self._file_path != ""
            and os.path.abspath(target) == os.path.abspath(self._file_path)
        )
        if same_as_source:
            raise ValueError("Refusing to overwrite the alphabet's source file")
        with open(target, 'w', encoding="utf-8", newline="") as stream:
            writer = csv.writer(stream)
            if self._data:
                writer.writerow(self._data[0].keys())
            elif self._legend:
                # No entries: still emit the known column names.
                writer.writerow(self._legend)
            for entry in self._data:
                writer.writerow(entry.values())

    def append_entry(self, unicode_num: int):
        """
        Append the character with the given unicode code point.

        :param unicode_num: unicode code point as int
        """
        entry = Alphabet._make_entry(unicode_num)
        print("Appending Chr:{0} Num:{1} Hex:{2} to {3}".format(
            entry['Char'], unicode_num, hex(unicode_num), self._name))
        self._data.append(entry)

    def append_entry_hex(self, uni_hex: str):
        """
        Append a character given its code point as text.

        :param uni_hex: a Python integer literal such as '0x3042', or the
                        'U+3042' form used in the 'Unicode (hex)' column
        """
        text = uni_hex.strip()
        if text[:2].upper() == "U+":
            self.append_entry(int(text[2:], 16))
        else:
            self.append_entry(int(text, 0))
if __name__ == "__main__":
    # Load the three alphabets from the csv files in the working directory
    # and report how many entries each one holds.
    kanji = Alphabet("Kanji", "kanji.csv")
    hiragana = Alphabet("Hiragana", "hiragana.csv")
    katakana = Alphabet("Katakana", "katakana.csv")
    sizes = [len(alphabet.get_data()) for alphabet in (kanji, hiragana, katakana)]
    print("Kanji:{0} Hiragana:{1} Katakana:{2}".format(*sizes))

    # Generate kanji2 from two code point ranges. range() excludes its end
    # value, so 0x9fa0 and 0x4dbf themselves are not part of the result.
    first_block = list(range(0x4e00, 0x9fa0))
    second_block = list(range(0x3400, 0x4dbf))
    n = list(set(first_block + second_block))
    k2 = Alphabet.generate_from_numbers("Kanji2", n)
    print("Kanji2: {0}".format(len(k2)))
    # Export is disabled; enable the next line to write kanji2.csv.
    #k2.export_csv("kanji2.csv")

@ -0,0 +1,93 @@
from Alphabet import Alphabet
class Detector:
    """
    Sort the characters of a text by the alphabets they belong to.

    Alphabets are any objects offering get_name() and get_data(), where
    get_data() returns dict entries that hold the character under 'Char'.
    """

    def __init__(self):
        # Start with an empty list so match_text also works before any
        # alphabet has been added.
        self._alphs = []

    def add_alphabet(self, alph: "Alphabet"):
        """
        Register an alphabet to match against.

        :param alph: alphabet to add
        """
        if self._alphs is None:
            self._alphs = []
        self._alphs.append(alph)

    def match_text(self, text: str) -> dict:
        """
        Determine if a given text uses some characters from any alphabet

        A character found in several alphabets is counted for each of them;
        a character found in none is counted under 'Unknown'. Newlines are
        skipped but still count towards the length used for the share.

        :param text: text to analyse
        :return: dict mapping each alphabet name (plus 'Unknown') to
                 [list of matched characters, share of len(text)]; the
                 share is 0.0 for an empty text
        """
        results = {}
        lookups = []
        for alph in self._alphs:
            name = alph.get_name()
            results[name] = [[]]  # [matches], percentage
            # One set per alphabet: membership is then a constant-time
            # test, and duplicate entries no longer double-count a match.
            chars = {entry.get('Char') for entry in alph.get_data()}
            lookups.append((name, chars))
        results['Unknown'] = [[]]
        for c in text:
            if c == "\n":  # ignoring newlines
                continue
            found = False
            for name, chars in lookups:
                if c in chars:
                    results[name][0].append(c)
                    found = True
            if not found:
                results['Unknown'][0].append(c)
        # Create statistics
        total = len(text)
        for value in results.values():
            value.append(len(value[0]) / float(total) if total else 0.0)
        return results

    @staticmethod
    def pretty_result(result: dict, small: bool = False):
        """
        Format the output of match_text as aligned text, one line per key.

        The given result is not modified.

        :param result: dict as returned by match_text
        :param small: if True, print only name and percentage per line
        :return: the formatted lines, each ending in a newline; an empty
                 string for an empty result
        """
        if not result:
            return ""
        longest_name = max(len(key) for key in result)
        longest_match = max(len(value[0]) for value in result.values())
        lines = []
        for key, value in result.items():
            # Name column: "<key>: " padded to three more than the longest key.
            name = (key + ": ").ljust(longest_name + 3)
            percentage = round(value[1] * 100, 2)
            if small:
                lines.append(name + str(percentage) + "\n")
                continue
            match_list = value[0]
            # Pad with one space per missing match, plus one, so that the
            # percentage column lines up.
            padding = " " * (longest_match + 1 - len(match_list))
            matches = "".join(match_list) + padding
            lines.append(name + "Matches: " + matches
                         + " MatchPercent: " + str(percentage) + "\n")
        return "".join(lines)
if __name__ == "__main__":
    # Load every alphabet from its csv file and report its size.
    kanji = Alphabet("Kanji (small)", "kanji.csv")
    kanji2 = Alphabet("Kanji (big)", "kanji2.csv")
    hiragana = Alphabet("Hiragana", "hiragana.csv")
    katakana = Alphabet("Katakana", "katakana.csv")
    print(kanji.get_name(), len(kanji.get_data()))
    print(kanji2.get_name(), len(kanji2.get_data()))
    print(hiragana.get_name(), len(hiragana.get_data()))
    print(katakana.get_name(), len(katakana.get_data()))
    print()
    d = Detector()
    d.add_alphabet(kanji)
    d.add_alphabet(kanji2)
    d.add_alphabet(hiragana)
    d.add_alphabet(katakana)
    # Read the sample in one go; the with-block closes the file, and the
    # explicit encoding keeps the Japanese text readable on any platform.
    with open("./sample-text.txt", 'r', encoding="utf-8") as sample_file:
        text = sample_file.read()
    print("Matches: \n{0}".format(d.pretty_result(d.match_text(text), small=True)))

@ -0,0 +1,90 @@
Char,Unicode (hex),Decimal,LearningOrder
ぁ,U+3041,12353,
あ,U+3042,12354,
ぃ,U+3043,12355,
い,U+3044,12356,
ぅ,U+3045,12357,
う,U+3046,12358,
ぇ,U+3047,12359,
え,U+3048,12360,
ぉ,U+3049,12361,
お,U+304a,12362,
か,U+304b,12363,
が,U+304c,12364,
き,U+304d,12365,
ぎ,U+304e,12366,
く,U+304f,12367,
ぐ,U+3050,12368,
け,U+3051,12369,
げ,U+3052,12370,
こ,U+3053,12371,
ご,U+3054,12372,
さ,U+3055,12373,
ざ,U+3056,12374,
し,U+3057,12375,
す,U+3059,12377,
ず,U+305a,12378,
せ,U+305b,12379,
ぜ,U+305c,12380,
そ,U+305d,12381,
ぞ,U+305e,12382,
た,U+305f,12383,
だ,U+3060,12384,
ち,U+3061,12385,
ぢ,U+3062,12386,
っ,U+3063,12387,
つ,U+3064,12388,
づ,U+3065,12389,
て,U+3066,12390,
で,U+3067,12391,
ど,U+3069,12393,
な,U+306a,12394,
に,U+306b,12395,
ぬ,U+306c,12396,
ね,U+306d,12397,
の,U+306e,12398,
は,U+306f,12399,
ば,U+3070,12400,
ぱ,U+3071,12401,
ひ,U+3072,12402,
び,U+3073,12403,
ぴ,U+3074,12404,
ふ,U+3075,12405,
ぶ,U+3076,12406,
ぷ,U+3077,12407,
べ,U+3079,12409,
ぺ,U+307a,12410,
ほ,U+307b,12411,
ぼ,U+307c,12412,
ぽ,U+307d,12413,
ま,U+307e,12414,
み,U+307f,12415,
む,U+3080,12416,
め,U+3081,12417,
も,U+3082,12418,
ゃ,U+3083,12419,
や,U+3084,12420,
ゅ,U+3085,12421,
ゆ,U+3086,12422,
ょ,U+3087,12423,
ら,U+3089,12425,
り,U+308a,12426,
る,U+308b,12427,
れ,U+308c,12428,
ろ,U+308d,12429,
ゎ,U+308e,12430,
わ,U+308f,12431,
ゐ,U+3090,12432,
ゑ,U+3091,12433,
を,U+3092,12434,
ん,U+3093,12435,
ゔ,U+3094,12436,
ゕ,U+3095,12437,
ゖ,U+3096,12438,
゙,U+3099,12441,
゚,U+309a,12442,
゛,U+309b,12443,
゜,U+309c,12444,
ゝ,U+309d,12445,
ゞ,U+309e,12446,
ゟ,U+309f,12447,
1 Char Unicode (hex) Decimal LearningOrder
2 U+3041 12353
3 U+3042 12354
4 U+3043 12355
5 U+3044 12356
6 U+3045 12357
7 U+3046 12358
8 U+3047 12359
9 U+3048 12360
10 U+3049 12361
11 U+304a 12362
12 U+304b 12363
13 U+304c 12364
14 U+304d 12365
15 U+304e 12366
16 U+304f 12367
17 U+3050 12368
18 U+3051 12369
19 U+3052 12370
20 U+3053 12371
21 U+3054 12372
22 U+3055 12373
23 U+3056 12374
24 U+3057 12375
25 U+3059 12377
26 U+305a 12378
27 U+305b 12379
28 U+305c 12380
29 U+305d 12381
30 U+305e 12382
31 U+305f 12383
32 U+3060 12384
33 U+3061 12385
34 U+3062 12386
35 U+3063 12387
36 U+3064 12388
37 U+3065 12389
38 U+3066 12390
39 U+3067 12391
40 U+3069 12393
41 U+306a 12394
42 U+306b 12395
43 U+306c 12396
44 U+306d 12397
45 U+306e 12398
46 U+306f 12399
47 U+3070 12400
48 U+3071 12401
49 U+3072 12402
50 U+3073 12403
51 U+3074 12404
52 U+3075 12405
53 U+3076 12406
54 U+3077 12407
55 U+3079 12409
56 U+307a 12410
57 U+307b 12411
58 U+307c 12412
59 U+307d 12413
60 U+307e 12414
61 U+307f 12415
62 U+3080 12416
63 U+3081 12417
64 U+3082 12418
65 U+3083 12419
66 U+3084 12420
67 U+3085 12421
68 U+3086 12422
69 U+3087 12423
70 U+3089 12425
71 U+308a 12426
72 U+308b 12427
73 U+308c 12428
74 U+308d 12429
75 U+308e 12430
76 U+308f 12431
77 U+3090 12432
78 U+3091 12433
79 U+3092 12434
80 U+3093 12435
81 U+3094 12436
82 U+3095 12437
83 U+3096 12438
84 U+3099 12441
85 U+309a 12442
86 U+309b 12443
87 U+309c 12444
88 U+309d 12445
89 U+309e 12446
90 U+309f 12447

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,97 @@
Char,Unicode (hex),Decimal,LearningOrder
゠,U+30a0,12448,
ァ,U+30a1,12449,
ア,U+30a2,12450,
ィ,U+30a3,12451,
イ,U+30a4,12452,
ゥ,U+30a5,12453,
ウ,U+30a6,12454,
ェ,U+30a7,12455,
エ,U+30a8,12456,
ォ,U+30a9,12457,
オ,U+30aa,12458,
カ,U+30ab,12459,
ガ,U+30ac,12460,
キ,U+30ad,12461,
ギ,U+30ae,12462,
ク,U+30af,12463,
グ,U+30b0,12464,
ケ,U+30b1,12465,
ゲ,U+30b2,12466,
コ,U+30b3,12467,
ゴ,U+30b4,12468,
サ,U+30b5,12469,
ザ,U+30b6,12470,
シ,U+30b7,12471,
ジ,U+30b8,12472,
ス,U+30b9,12473,
ズ,U+30ba,12474,
セ,U+30bb,12475,
ゼ,U+30bc,12476,
ソ,U+30bd,12477,
ゾ,U+30be,12478,
タ,U+30bf,12479,
ダ,U+30c0,12480,
チ,U+30c1,12481,
ヂ,U+30c2,12482,
ッ,U+30c3,12483,
ツ,U+30c4,12484,
ヅ,U+30c5,12485,
テ,U+30c6,12486,
デ,U+30c7,12487,
ト,U+30c8,12488,
ド,U+30c9,12489,
ナ,U+30ca,12490,
ニ,U+30cb,12491,
ヌ,U+30cc,12492,
ネ,U+30cd,12493,
ノ,U+30ce,12494,
ハ,U+30cf,12495,
バ,U+30d0,12496,
パ,U+30d1,12497,
ヒ,U+30d2,12498,
ビ,U+30d3,12499,
ピ,U+30d4,12500,
フ,U+30d5,12501,
ブ,U+30d6,12502,
プ,U+30d7,12503,
ヘ,U+30d8,12504,
ベ,U+30d9,12505,
ペ,U+30da,12506,
ホ,U+30db,12507,
ボ,U+30dc,12508,
ポ,U+30dd,12509,
マ,U+30de,12510,
ミ,U+30df,12511,
ム,U+30e0,12512,
メ,U+30e1,12513,
モ,U+30e2,12514,
ャ,U+30e3,12515,
ヤ,U+30e4,12516,
ュ,U+30e5,12517,
ユ,U+30e6,12518,
ョ,U+30e7,12519,
ヨ,U+30e8,12520,
ラ,U+30e9,12521,
リ,U+30ea,12522,
ル,U+30eb,12523,
レ,U+30ec,12524,
ロ,U+30ed,12525,
ヮ,U+30ee,12526,
ワ,U+30ef,12527,
ヰ,U+30f0,12528,
ヱ,U+30f1,12529,
ヲ,U+30f2,12530,
ン,U+30f3,12531,
ヴ,U+30f4,12532,
ヵ,U+30f5,12533,
ヶ,U+30f6,12534,
ヷ,U+30f7,12535,
ヸ,U+30f8,12536,
ヹ,U+30f9,12537,
ヺ,U+30fa,12538,
・,U+30fb,12539,
ー,U+30fc,12540,
ヽ,U+30fd,12541,
ヾ,U+30fe,12542,
ヿ,U+30ff,12543,
1 Char Unicode (hex) Decimal LearningOrder
2 U+30a0 12448
3 U+30a1 12449
4 U+30a2 12450
5 U+30a3 12451
6 U+30a4 12452
7 U+30a5 12453
8 U+30a6 12454
9 U+30a7 12455
10 U+30a8 12456
11 U+30a9 12457
12 U+30aa 12458
13 U+30ab 12459
14 U+30ac 12460
15 U+30ad 12461
16 U+30ae 12462
17 U+30af 12463
18 U+30b0 12464
19 U+30b1 12465
20 U+30b2 12466
21 U+30b3 12467
22 U+30b4 12468
23 U+30b5 12469
24 U+30b6 12470
25 U+30b7 12471
26 U+30b8 12472
27 U+30b9 12473
28 U+30ba 12474
29 U+30bb 12475
30 U+30bc 12476
31 U+30bd 12477
32 U+30be 12478
33 U+30bf 12479
34 U+30c0 12480
35 U+30c1 12481
36 U+30c2 12482
37 U+30c3 12483
38 U+30c4 12484
39 U+30c5 12485
40 U+30c6 12486
41 U+30c7 12487
42 U+30c8 12488
43 U+30c9 12489
44 U+30ca 12490
45 U+30cb 12491
46 U+30cc 12492
47 U+30cd 12493
48 U+30ce 12494
49 U+30cf 12495
50 U+30d0 12496
51 U+30d1 12497
52 U+30d2 12498
53 U+30d3 12499
54 U+30d4 12500
55 U+30d5 12501
56 U+30d6 12502
57 U+30d7 12503
58 U+30d8 12504
59 U+30d9 12505
60 U+30da 12506
61 U+30db 12507
62 U+30dc 12508
63 U+30dd 12509
64 U+30de 12510
65 U+30df 12511
66 U+30e0 12512
67 U+30e1 12513
68 U+30e2 12514
69 U+30e3 12515
70 U+30e4 12516
71 U+30e5 12517
72 U+30e6 12518
73 U+30e7 12519
74 U+30e8 12520
75 U+30e9 12521
76 U+30ea 12522
77 U+30eb 12523
78 U+30ec 12524
79 U+30ed 12525
80 U+30ee 12526
81 U+30ef 12527
82 U+30f0 12528
83 U+30f1 12529
84 U+30f2 12530
85 U+30f3 12531
86 U+30f4 12532
87 U+30f5 12533
88 U+30f6 12534
89 U+30f7 12535
90 U+30f8 12536
91 U+30f9 12537
92 U+30fa 12538
93 U+30fb 12539
94 U+30fc 12540
95 U+30fd 12541
96 U+30fe 12542
97 U+30ff 12543

@ -0,0 +1,5 @@
会市屋詩
ヷムペヺ
ぅぴ
Loading…
Cancel
Save