Basic Working Version
Rudimentary version, controlled with in-code statements. Fetched the contents of kanji.csv from www.tonypottier.info and converted them into a CSV. The other CSVs are based on Unicode ranges from Wikipedia.
master
parent
a32d18c6d9
commit
c94a2ea418
@ -0,0 +1,186 @@
|
|||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
class Alphabet:
    """A named collection of character entries.

    Every entry is a dict describing one character. Entries created by this
    class itself carry the keys ``'Char'``, ``'Unicode (hex)'``,
    ``'Decimal'`` and ``'Learning Order'``; entries loaded from a file use
    the column names found in that file's header row.
    """

    def __init__(self, name: str, file_path: str = "", data: list = None):
        """Create an alphabet from in-memory data or from a file.

        :param name: display name of the alphabet
        :param file_path: path to a ``.csv`` or ``.html`` file to load;
            ignored when ``data`` is given
        :param data: ready-made list of entry dicts; stored as-is (not copied)
        :raises FileNotFoundError: if ``file_path`` is non-empty and is not
            an existing file
        :raises ValueError: if the file extension is neither ``.csv`` nor
            ``.html`` (compared case-insensitively)
        """
        if not os.path.isfile(file_path) and file_path != "":
            raise FileNotFoundError("File has not been found")

        self._name = name
        self._file_path = file_path
        self._parsed_html = None
        self._legend = None
        # Only set while a file is being loaded; see the end of __init__.
        self._stream = None

        if data is not None:
            self._data = data
            return

        if file_path == "":
            self._data = []
            print("Warning: Empty Alphabet!")
            return

        # Pick the loader before opening the file so an unsupported
        # extension never leaves an open handle behind.
        extension = os.path.splitext(file_path)[1].lower()
        if extension == '.html':
            loader = self.import_from_html
        elif extension == '.csv':
            loader = self.import_from_csv
        else:
            raise ValueError("Unknown file extension!")

        # Explicit UTF-8 keeps the non-ASCII characters intact regardless of
        # the platform default; newline="" is what the csv module expects.
        with open(file_path, "r", encoding="utf-8", newline="") as stream:
            self._stream = stream
            try:
                self._data = loader()
            finally:
                # Do not keep a reference to a handle that is about to close.
                self._stream = None

    def __len__(self):
        """Return the number of entries."""
        return len(self._data)

    def get_data(self) -> list:
        """Return the list of entry dicts (the internal list, not a copy)."""
        return self._data

    def get_name(self) -> str:
        """Return the alphabet's name."""
        return self._name

    def get_unique(self):
        """Build a new alphabet containing each code point of this one once.

        Uses the ``'Decimal'`` value of every entry for comparison. Values
        are converted with ``int`` because entries loaded from CSV hold them
        as strings. First-seen order is kept.

        :return: a new ``Alphabet`` with the same name and unique entries
        """
        code_points = dict.fromkeys(int(entry['Decimal']) for entry in self._data)
        return Alphabet.generate_from_numbers(self._name, list(code_points))

    @staticmethod
    def _make_entry(number: int) -> dict:
        """Return the entry dict describing the code point ``number``."""
        return {
            'Char': chr(number),
            'Unicode (hex)': "U+" + hex(number)[2:],
            'Decimal': number,
            'Learning Order': None,
        }

    @staticmethod
    def generate_from_numbers(name: str, numbers: list):
        """Generate an alphabet from a list of Unicode code points.

        :param name: name of the new alphabet
        :param numbers: integer code points, one entry is created per item
        :return: the new ``Alphabet``
        """
        data = [Alphabet._make_entry(n) for n in numbers]
        return Alphabet(name=name, data=data)

    def import_from_csv(self) -> list:
        """Read entries from the CSV text stream in ``self._stream``.

        The first row is stored as the legend (column names). Every further
        non-blank row becomes a dict pairing legend names with cell values;
        cells beyond the legend's length are ignored.

        :return: list of entry dicts, empty if the stream has no rows
        """
        reader = csv.reader(self._stream)
        legend = next(reader, None)
        if legend is None:
            # Completely empty file: nothing to parse.
            self._legend = []
            return []

        self._legend = legend
        # csv.reader yields [] for blank lines; skip them so no empty entry
        # (lacking 'Char') ends up in the data.
        return [dict(zip(legend, row)) for row in reader if row]

    def import_from_html(self) -> list:
        """Read entries from an HTML table in ``self._stream``.

        The ``td`` cells of the first ``tr`` are stored as the legend; each
        later ``tr`` becomes a dict pairing legend names with the ``.string``
        of its ``td`` cells.

        :return: list of entry dicts, empty if the document has no ``tr``
        """
        html = self._stream.read()
        self._parsed_html = BeautifulSoup(html, features="html.parser")
        rows = self._parsed_html.find_all('tr')
        if not rows:
            self._legend = []
            return []

        self._legend = [cell.string for cell in rows[0].find_all('td')]
        return [
            dict(zip(self._legend, (cell.string for cell in row.find_all('td'))))
            for row in rows[1:]
        ]

    def export_csv(self, file_path: str):
        """Write all entries to a CSV file.

        The extension of ``file_path`` is replaced by ``.csv`` when it is
        anything else. The header row is taken from the keys of the first
        entry and every row is written in that column order. An alphabet
        without entries produces an empty file.

        :param file_path: destination path
        :raises ValueError: if ``file_path`` is empty or the resulting path
            is the file this alphabet was loaded from
        """
        if not file_path:
            raise ValueError("No output path given")

        file_name, file_extension = os.path.splitext(file_path)
        if file_extension != '.csv':
            file_path = file_name + '.csv'

        # Compare after the extension fix-up, otherwise "kanji" would slip
        # through and overwrite "kanji.csv".
        if self._file_path and \
                os.path.abspath(file_path) == os.path.abspath(self._file_path):
            raise ValueError("Refusing to overwrite the source file: " + file_path)

        entries = self.get_data()
        header = list(entries[0].keys()) if entries else []

        with open(file_path, 'w', encoding="utf-8", newline="") as stream:
            writer = csv.writer(stream)
            if header:
                writer.writerow(header)
                writer.writerows(
                    [entry.get(key, "") for key in header] for entry in entries
                )

    def append_entry(self, unicode_num: int):
        """Append the entry for one code point and report it on stdout.

        :param unicode_num: integer Unicode code point
        """
        entry = Alphabet._make_entry(unicode_num)

        print("Appending Chr:{0} Num:{1} Hex:{2} to {3}".format(
            entry['Char'], unicode_num, hex(unicode_num), self._name))

        self._data.append(entry)

    def append_entry_hex(self, uni_hex: str):
        """Append the entry for a code point given as text.

        :param uni_hex: either a prefixed integer literal accepted by
            ``int(text, 0)`` (for example ``"0x3042"``) or the ``U+3042``
            form used in the ``'Unicode (hex)'`` column
        """
        if uni_hex[:2].upper() == "U+":
            self.append_entry(int(uni_hex[2:], 16))
        else:
            self.append_entry(int(uni_hex, 0))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Load the three CSV-backed alphabets and report their sizes.
    kanji = Alphabet("Kanji", "kanji.csv")
    hiragana = Alphabet("Hiragana", "hiragana.csv")
    katakana = Alphabet("Katakana", "katakana.csv")

    sizes = [len(alphabet.get_data()) for alphabet in (kanji, hiragana, katakana)]
    print("Kanji:{0} Hiragana:{1} Katakana:{2}".format(*sizes))

    # Generate kanji2 from two hard-coded code point ranges.
    # NOTE(review): range() excludes its end value, so 0x9fa0 and 0x4dbf
    # themselves are not generated - confirm this is intended.
    code_points = []
    for first, stop in ((0x4e00, 0x9fa0), (0x3400, 0x4dbf)):
        code_points.extend(range(first, stop))

    # Drop any duplicates before building the alphabet.
    code_points = list(set(code_points))

    k2 = Alphabet.generate_from_numbers("Kanji2", code_points)
    print("Kanji2: {0}".format(len(k2)))
    # Export is disabled; re-enable to regenerate the CSV:
    #k2.export_csv("kanji2.csv")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,93 @@
|
|||||||
|
from Alphabet import Alphabet
|
||||||
|
|
||||||
|
|
||||||
|
class Detector:
    """Sort the characters of a text by the alphabets that contain them."""

    def __init__(self):
        # Start with an empty list so match_text works before any alphabet
        # has been added.
        self._alphs = []

    def add_alphabet(self, alph: "Alphabet"):
        """Register an alphabet to match against.

        :param alph: object offering ``get_name()`` and ``get_data()``, where
            the data is a list of dicts with a ``'Char'`` key
        """
        self._alphs.append(alph)

    def match_text(self, text: str) -> dict:
        """Determine which characters of a text belong to which alphabet.

        Newline characters are skipped. A character found in several
        alphabets is recorded for each of them; a character found in none is
        recorded under ``'Unknown'``.

        :param text: text to analyse
        :return: dict mapping each alphabet name (plus ``'Unknown'``) to
            ``[matches, fraction]`` where ``matches`` is the list of matched
            characters in text order and ``fraction`` is
            ``len(matches) / len(text)`` (``0.0`` for an empty text)
        """
        # Build one character set per alphabet up front: membership tests
        # are then O(1) and duplicated entries cannot count a character twice.
        lookups = [
            (alph.get_name(), {entry['Char'] for entry in alph.get_data()})
            for alph in self._alphs
        ]

        results = {name: [[]] for name, _ in lookups}
        results['Unknown'] = [[]]

        for c in text:
            if c == "\n":  # newlines are not classified
                continue
            found = False
            for name, chars in lookups:
                if c in chars:
                    results[name][0].append(c)
                    found = True
            if not found:
                results['Unknown'][0].append(c)

        # Create statistics. The denominator is the full text length,
        # newlines included; guard against an empty text.
        total = len(text)
        for stats in results.values():
            stats.append(len(stats[0]) / float(total) if total else 0.0)
        return results

    @staticmethod
    def pretty_result(result: dict, small: bool = False):
        """Format a ``match_text`` result as aligned text, one line per key.

        The ``result`` argument is not modified.

        :param result: dict as returned by ``match_text``
        :param small: when true, print only name and percentage per line;
            otherwise also list the matched characters
        :return: the formatted multi-line string, ``""`` for an empty result
        """
        if not result:
            return ""

        # Names are padded to the longest "name: " plus one space; match
        # columns to the longest match list plus one space.
        name_width = max(len(key) for key in result) + 3
        match_width = max(len(stats[0]) for stats in result.values()) + 1

        lines = []
        for key, stats in result.items():
            name = (key + ": ").ljust(name_width)
            percentage = round(stats[1] * 100, 2)

            if small:
                lines.append(name + str(percentage) + "\n")
                continue

            # Pad in a new string instead of appending to the caller's list.
            padding = " " * (match_width - len(stats[0]))
            matches = "".join(stats[0]) + padding
            lines.append(name + "Matches: " + matches
                         + " MatchPercent: " + str(percentage) + "\n")

        return "".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Load every alphabet the detector should know about.
    kanji = Alphabet("Kanji (small)", "kanji.csv")
    kanji2 = Alphabet("Kanji (big)", "kanji2.csv")
    hiragana = Alphabet("Hiragana", "hiragana.csv")
    katakana = Alphabet("Katakana", "katakana.csv")

    alphabets = (kanji, kanji2, hiragana, katakana)
    for alphabet in alphabets:
        print(alphabet.get_name(), len(alphabet.get_data()))
    print()

    d = Detector()
    for alphabet in alphabets:
        d.add_alphabet(alphabet)

    # Read the sample in one go; the context manager closes the file and the
    # explicit encoding keeps the Japanese text readable on every platform.
    with open("./sample-text.txt", 'r', encoding="utf-8") as sample:
        text = sample.read()

    print("Matches: \n{0}".format(d.pretty_result(d.match_text(text), small=True)))
|
@ -0,0 +1,90 @@
|
|||||||
|
Char,Unicode (hex),Decimal,LearningOrder
|
||||||
|
ぁ,U+3041,12353,
|
||||||
|
あ,U+3042,12354,
|
||||||
|
ぃ,U+3043,12355,
|
||||||
|
い,U+3044,12356,
|
||||||
|
ぅ,U+3045,12357,
|
||||||
|
う,U+3046,12358,
|
||||||
|
ぇ,U+3047,12359,
|
||||||
|
え,U+3048,12360,
|
||||||
|
ぉ,U+3049,12361,
|
||||||
|
お,U+304a,12362,
|
||||||
|
か,U+304b,12363,
|
||||||
|
が,U+304c,12364,
|
||||||
|
き,U+304d,12365,
|
||||||
|
ぎ,U+304e,12366,
|
||||||
|
く,U+304f,12367,
|
||||||
|
ぐ,U+3050,12368,
|
||||||
|
け,U+3051,12369,
|
||||||
|
げ,U+3052,12370,
|
||||||
|
こ,U+3053,12371,
|
||||||
|
ご,U+3054,12372,
|
||||||
|
さ,U+3055,12373,
|
||||||
|
ざ,U+3056,12374,
|
||||||
|
し,U+3057,12375,
|
||||||
|
す,U+3059,12377,
|
||||||
|
ず,U+305a,12378,
|
||||||
|
せ,U+305b,12379,
|
||||||
|
ぜ,U+305c,12380,
|
||||||
|
そ,U+305d,12381,
|
||||||
|
ぞ,U+305e,12382,
|
||||||
|
た,U+305f,12383,
|
||||||
|
だ,U+3060,12384,
|
||||||
|
ち,U+3061,12385,
|
||||||
|
ぢ,U+3062,12386,
|
||||||
|
っ,U+3063,12387,
|
||||||
|
つ,U+3064,12388,
|
||||||
|
づ,U+3065,12389,
|
||||||
|
て,U+3066,12390,
|
||||||
|
で,U+3067,12391,
|
||||||
|
ど,U+3069,12393,
|
||||||
|
な,U+306a,12394,
|
||||||
|
に,U+306b,12395,
|
||||||
|
ぬ,U+306c,12396,
|
||||||
|
ね,U+306d,12397,
|
||||||
|
の,U+306e,12398,
|
||||||
|
は,U+306f,12399,
|
||||||
|
ば,U+3070,12400,
|
||||||
|
ぱ,U+3071,12401,
|
||||||
|
ひ,U+3072,12402,
|
||||||
|
び,U+3073,12403,
|
||||||
|
ぴ,U+3074,12404,
|
||||||
|
ふ,U+3075,12405,
|
||||||
|
ぶ,U+3076,12406,
|
||||||
|
ぷ,U+3077,12407,
|
||||||
|
べ,U+3079,12409,
|
||||||
|
ぺ,U+307a,12410,
|
||||||
|
ほ,U+307b,12411,
|
||||||
|
ぼ,U+307c,12412,
|
||||||
|
ぽ,U+307d,12413,
|
||||||
|
ま,U+307e,12414,
|
||||||
|
み,U+307f,12415,
|
||||||
|
む,U+3080,12416,
|
||||||
|
め,U+3081,12417,
|
||||||
|
も,U+3082,12418,
|
||||||
|
ゃ,U+3083,12419,
|
||||||
|
や,U+3084,12420,
|
||||||
|
ゅ,U+3085,12421,
|
||||||
|
ゆ,U+3086,12422,
|
||||||
|
ょ,U+3087,12423,
|
||||||
|
ら,U+3089,12425,
|
||||||
|
り,U+308a,12426,
|
||||||
|
る,U+308b,12427,
|
||||||
|
れ,U+308c,12428,
|
||||||
|
ろ,U+308d,12429,
|
||||||
|
ゎ,U+308e,12430,
|
||||||
|
わ,U+308f,12431,
|
||||||
|
ゐ,U+3090,12432,
|
||||||
|
ゑ,U+3091,12433,
|
||||||
|
を,U+3092,12434,
|
||||||
|
ん,U+3093,12435,
|
||||||
|
ゔ,U+3094,12436,
|
||||||
|
ゕ,U+3095,12437,
|
||||||
|
ゖ,U+3096,12438,
|
||||||
|
゙,U+3099,12441,
|
||||||
|
゚,U+309a,12442,
|
||||||
|
゛,U+309b,12443,
|
||||||
|
゜,U+309c,12444,
|
||||||
|
ゝ,U+309d,12445,
|
||||||
|
ゞ,U+309e,12446,
|
||||||
|
ゟ,U+309f,12447,
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,5 @@
|
|||||||
|
会市屋詩
|
||||||
|
|
||||||
|
ヷムペヺ
|
||||||
|
|
||||||
|
ぅぴ
|
Loading…
Reference in New Issue