Browse Source

Basic Working Version

Rudimentary version, controlled with in-code statements.
Fetched the contents of kanji.csv from www.tonypottier.info
and converted them into a CSV. The other CSVs are based on
Unicode ranges from Wikipedia.
master
Peery 10 months ago
parent
commit
c94a2ea418
7 changed files with 29906 additions and 0 deletions
  1. +186
    -0
      Alphabet.py
  2. +93
    -0
      Detector.py
  3. +90
    -0
      hiragana.csv
  4. +1947
    -0
      kanji.csv
  5. +27488
    -0
      kanji2.csv
  6. +97
    -0
      katakana.csv
  7. +5
    -0
      sample-text.txt

+ 186
- 0
Alphabet.py View File

@@ -0,0 +1,186 @@
from bs4 import BeautifulSoup

import csv
import os


class Alphabet:
    """
    A named collection of character entries.

    Every entry is a dict keyed by column name. Entries created by this class
    use the keys 'Char', 'Unicode (hex)', 'Decimal' and 'Learning Order'.
    Entries imported from a file use the file's own header as keys, and all
    imported values are strings (or whatever BeautifulSoup's ``.string``
    yields for HTML cells).
    """

    def __init__(self, name: str, file_path: str = "", data: list = None):
        """
        Create an alphabet from explicit data or from a .csv / .html file

        :param name: display name of the alphabet
        :param file_path: path to a .csv or .html file; "" means no file
        :param data: ready-made list of entry dicts; when given, the file is
                     not read
        :raises FileNotFoundError: file_path is non-empty but is not a file
        :raises ValueError: the file extension is neither .csv nor .html
        """
        if file_path != "" and not os.path.isfile(file_path):
            raise FileNotFoundError("File has not been found")

        self._name = name
        self._file_path = file_path
        self._parsed_html = None
        self._legend = None

        if data is not None:
            self._data = data
            return

        if file_path == "":
            self._data = []
            print("Warning: Empty Alphabet!")
            return

        # Decide on the importer before touching the file, so an unsupported
        # extension never leaves a file handle open.
        file_extension = os.path.splitext(self._file_path)[1].lower()
        if file_extension == '.html':
            self._data = self.import_from_html()
        elif file_extension == '.csv':
            self._data = self.import_from_csv()
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError("Unknown file extension!")

    def __len__(self):
        """Return the number of entries."""
        return len(self._data)

    def get_data(self) -> list:
        """Return the list of entry dicts (the live list, not a copy)."""
        return self._data

    def get_name(self) -> str:
        """Return the display name of the alphabet."""
        return self._name

    def get_unique(self):
        """
        Build an alphabet that contains every code point only once

        Uses the 'Decimal' value of each entry for comparison. Values read
        from a file are strings, so they are converted to int first. The
        order of first appearance is kept.

        :return: a new Alphabet with the same name and regenerated entries
        :raises KeyError: an entry has no 'Decimal' key
        :raises ValueError: a 'Decimal' value is not an integer
        """
        seen = set()
        numbers = []
        for entry in self._data:
            number = int(entry['Decimal'])
            if number not in seen:
                seen.add(number)
                numbers.append(number)

        return Alphabet.generate_from_numbers(self._name, numbers)

    @staticmethod
    def _make_entry(unicode_num: int) -> dict:
        """Build the entry dict for a single unicode number."""
        # NOTE(review): the hiragana/katakana CSV headers spell the last
        # column 'LearningOrder' (no space) - confirm which spelling is meant.
        return {
            'Char': chr(unicode_num),
            'Unicode (hex)': "U+" + hex(unicode_num)[2:],
            'Decimal': unicode_num,
            'Learning Order': None,
        }

    @staticmethod
    def generate_from_numbers(name: str, numbers: list):
        """
        Generate alphabet from a list of unicode numbers

        :param name: name of the new alphabet
        :param numbers: iterable of integer code points
        :return: a new Alphabet with one entry per number, in the given order
        """
        data = [Alphabet._make_entry(n) for n in numbers]
        return Alphabet(name=name, data=data)

    def import_from_csv(self) -> list:
        """
        Import the entries of the alphabet's csv file

        The first row is stored as the legend and used as the keys of every
        entry. Blank lines are skipped; cells beyond the legend are ignored.

        :return: list of entry dicts ([] for an empty file)
        """
        # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which
        # would otherwise end up inside the first header name.
        with open(self._file_path, "r", encoding="utf-8-sig", newline="") as stream:
            reader = csv.reader(stream)
            self._legend = next(reader, None) or []
            return [dict(zip(self._legend, row)) for row in reader if row]

    def import_from_html(self) -> list:
        """
        Import the entries of the alphabet's html table of characters

        The <td> cells of the first <tr> form the legend; every further <tr>
        becomes one entry.

        :return: list of entry dicts ([] when the file contains no <tr>)
        """
        with open(self._file_path, "r", encoding="utf-8-sig") as stream:
            html = stream.read()

        self._parsed_html = BeautifulSoup(html, features="html.parser")
        rows = self._parsed_html.find_all('tr')
        if not rows:
            self._legend = []
            return []

        self._legend = [cell.string for cell in rows[0].find_all('td')]

        data = []
        for row in rows[1:]:
            cells = row.find_all('td')
            data.append({key: cell.string for key, cell in zip(self._legend, cells)})

        return data

    def export_csv(self, file_path: str):
        """
        Write all entries to a csv file

        The extension of file_path is replaced by '.csv' when it differs.
        The header is taken from the keys of the first entry (or from the
        imported legend when there are no entries).

        :param file_path: destination path
        :raises ValueError: the destination is the file the alphabet was
                            loaded from
        """
        file_name, file_extension = os.path.splitext(file_path)
        if file_extension != '.csv':
            file_path = file_name + '.csv'

        # Compare after the extension fix-up, otherwise export_csv("kanji")
        # would silently overwrite the source "kanji.csv".
        if self._file_path != "":
            target = os.path.normcase(os.path.abspath(file_path))
            source = os.path.normcase(os.path.abspath(self._file_path))
            if target == source:
                raise ValueError("Refusing to overwrite the source file")

        entries = self.get_data()
        header = list(entries[0].keys()) if entries else list(self._legend or [])

        # newline="" is what the csv module expects; it avoids blank rows
        # on platforms that translate line endings.
        with open(file_path, 'w', encoding="utf-8", newline="") as stream:
            writer = csv.writer(stream)
            if header:
                writer.writerow(header)
            writer.writerows([entry.get(key, "") for key in header] for entry in entries)

    def append_entry(self, unicode_num: int):
        """
        Append the character with the given unicode number

        :param unicode_num: integer code point
        """
        entry = Alphabet._make_entry(unicode_num)

        print("Appending Chr:{0} Num:{1} Hex:{2} to {3}".format(
            entry['Char'], unicode_num, hex(unicode_num), self._name))

        self._data.append(entry)

    def append_entry_hex(self, uni_hex: str):
        """
        Append the character given as a textual number

        Accepts everything int(text, 0) accepts (e.g. "0x3042", "12354") and
        additionally the "U+3042" notation used in the 'Unicode (hex)' column.

        :param uni_hex: textual code point
        :raises ValueError: the text cannot be parsed as a number
        """
        text = uni_hex.strip()
        if text[:2].upper() == "U+":
            number = int(text[2:], 16)
        else:
            number = int(text, 0)
        self.append_entry(number)


if __name__ == "__main__":
    # Load the three bundled alphabets and report their sizes.
    kanji = Alphabet("Kanji", "kanji.csv")
    hiragana = Alphabet("Hiragana", "hiragana.csv")
    katakana = Alphabet("Katakana", "katakana.csv")

    sizes = (
        len(kanji.get_data()),
        len(hiragana.get_data()),
        len(katakana.get_data()),
    )
    print("Kanji:{0} Hiragana:{1} Katakana:{2}".format(*sizes))

    # Generate kanji2 from ranges.
    # range() excludes its end value, so 0x9fa0 and 0x4dbf themselves are
    # not part of the generated list.
    code_points = list(range(0x4e00, 0x9fa0))
    code_points += list(range(0x3400, 0x4dbf))

    # Drop duplicates (the resulting order is whatever the set yields).
    code_points = list(set(code_points))

    k2 = Alphabet.generate_from_numbers("Kanji2", code_points)
    print("Kanji2: {0}".format(len(k2)))
    #k2.export_csv("kanji2.csv")






+ 93
- 0
Detector.py View File

@@ -0,0 +1,93 @@
from Alphabet import Alphabet


class Detector:
    """
    Sort the characters of a text by the alphabets they belong to.

    Alphabets are objects offering get_name() and get_data(); get_data()
    must return entry dicts whose 'Char' value is the character.
    """

    def __init__(self):
        # Start with an empty list so match_text works before any alphabet
        # has been added.
        self._alphs = []

    def add_alphabet(self, alph: "Alphabet"):
        """
        Register an alphabet to match against

        :param alph: the alphabet to add
        """
        if self._alphs is None:
            self._alphs = []
        self._alphs.append(alph)

    def match_text(self, text: str) -> dict:
        """
        Determine if a given text uses some characters from any alphabet

        Whitespace is ignored completely: it is neither matched nor counted.
        A character found in several alphabets is listed under each of them;
        a character found in none is listed under 'Unknown'.

        :param text: the text to analyse
        :return: dict mapping each alphabet name (plus 'Unknown') to
                 [list of matched characters, share of the counted
                 characters as a float between 0 and 1]; all shares are 0.0
                 when the text has no non-whitespace characters
        """
        # Build one character set per alphabet up front: membership tests
        # become O(1) and duplicate entries can no longer be counted twice.
        alphabets = [
            (alph.get_name(), {entry.get('Char') for entry in alph.get_data()})
            for alph in (self._alphs or [])
        ]

        results = {}
        for name, _ in alphabets:
            results[name] = [[]]  # [matches], percentage
        results['Unknown'] = [[]]

        counted = 0
        for c in text:
            if c.isspace():  # ignoring whitespace
                continue
            counted += 1

            found = False
            for name, chars in alphabets:
                if c in chars:
                    results[name][0].append(c)
                    found = True

            if not found:
                results['Unknown'][0].append(c)

        # Create statistics relative to the characters actually looked at.
        for stats in results.values():
            share = len(stats[0]) / float(counted) if counted else 0.0
            stats.append(share)
        return results

    @staticmethod
    def pretty_result(result: dict, small: bool = False):
        """
        Format a match_text result as aligned text, one line per alphabet

        The result passed in is not modified.

        :param result: dict as returned by match_text
        :param small: when True only name and percentage are printed,
                      otherwise the matched characters are included
        :return: the formatted text ("" for an empty result)
        """
        if not result:
            return ""

        # Names are padded to the longest "name: " plus one extra space,
        # match columns to the longest match list plus one extra space.
        name_width = max(len(key) for key in result) + 3
        match_width = max(len(stats[0]) for stats in result.values()) + 1

        lines = []
        for key, stats in result.items():
            name = (key + ": ").ljust(name_width)
            percentage = round(stats[1] * 100, 2)

            if small:
                lines.append(name + str(percentage) + "\n")
                continue

            match_list = stats[0]
            matches = "".join(match_list) + " " * (match_width - len(match_list))
            lines.append(name + "Matches: " + matches
                         + " MatchPercent: " + str(percentage) + "\n")

        return "".join(lines)


if __name__ == "__main__":
    # Load the bundled alphabets and report how many entries each one has.
    kanji = Alphabet("Kanji (small)", "kanji.csv")
    kanji2 = Alphabet("Kanji (big)", "kanji2.csv")
    hiragana = Alphabet("Hiragana", "hiragana.csv")
    katakana = Alphabet("Katakana", "katakana.csv")
    for alphabet in (kanji, kanji2, hiragana, katakana):
        print(alphabet.get_name(), len(alphabet.get_data()))
    print()

    d = Detector()
    d.add_alphabet(kanji)
    d.add_alphabet(kanji2)
    d.add_alphabet(hiragana)
    d.add_alphabet(katakana)

    # Read the sample explicitly as UTF-8 (it contains Japanese text) and
    # close the file as soon as it has been read.
    with open("./sample-text.txt", 'r', encoding="utf-8") as sample:
        text = sample.read()

    print("Matches: \n{0}".format(d.pretty_result(d.match_text(text), small=True)))

+ 90
- 0
hiragana.csv View File

@@ -0,0 +1,90 @@
Char,Unicode (hex),Decimal,LearningOrder
ぁ,U+3041,12353,
あ,U+3042,12354,
ぃ,U+3043,12355,
い,U+3044,12356,
ぅ,U+3045,12357,
う,U+3046,12358,
ぇ,U+3047,12359,
え,U+3048,12360,
ぉ,U+3049,12361,
お,U+304a,12362,
か,U+304b,12363,
が,U+304c,12364,
き,U+304d,12365,
ぎ,U+304e,12366,
く,U+304f,12367,
ぐ,U+3050,12368,
け,U+3051,12369,
げ,U+3052,12370,
こ,U+3053,12371,
ご,U+3054,12372,
さ,U+3055,12373,
ざ,U+3056,12374,
し,U+3057,12375,
じ,U+3058,12376,
す,U+3059,12377,
ず,U+305a,12378,
せ,U+305b,12379,
ぜ,U+305c,12380,
そ,U+305d,12381,
ぞ,U+305e,12382,
た,U+305f,12383,
だ,U+3060,12384,
ち,U+3061,12385,
ぢ,U+3062,12386,
っ,U+3063,12387,
つ,U+3064,12388,
づ,U+3065,12389,
て,U+3066,12390,
で,U+3067,12391,
と,U+3068,12392,
ど,U+3069,12393,
な,U+306a,12394,
に,U+306b,12395,
ぬ,U+306c,12396,
ね,U+306d,12397,
の,U+306e,12398,
は,U+306f,12399,
ば,U+3070,12400,
ぱ,U+3071,12401,
ひ,U+3072,12402,
び,U+3073,12403,
ぴ,U+3074,12404,
ふ,U+3075,12405,
ぶ,U+3076,12406,
ぷ,U+3077,12407,
へ,U+3078,12408,
べ,U+3079,12409,
ぺ,U+307a,12410,
ほ,U+307b,12411,
ぼ,U+307c,12412,
ぽ,U+307d,12413,
ま,U+307e,12414,
み,U+307f,12415,
む,U+3080,12416,
め,U+3081,12417,
も,U+3082,12418,
ゃ,U+3083,12419,
や,U+3084,12420,
ゅ,U+3085,12421,
ゆ,U+3086,12422,
ょ,U+3087,12423,
よ,U+3088,12424,
ら,U+3089,12425,
り,U+308a,12426,
る,U+308b,12427,
れ,U+308c,12428,
ろ,U+308d,12429,
ゎ,U+308e,12430,
わ,U+308f,12431,
ゐ,U+3090,12432,
ゑ,U+3091,12433,
を,U+3092,12434,
ん,U+3093,12435,
ゔ,U+3094,12436,
ゕ,U+3095,12437,
ゖ,U+3096,12438,
゙,U+3099,12441,
゚,U+309a,12442,
゛,U+309b,12443,
゜,U+309c,12444,
ゝ,U+309d,12445,
ゞ,U+309e,12446,
ゟ,U+309f,12447,

+ 1947
- 0
kanji.csv
File diff suppressed because it is too large
View File


+ 27488
- 0
kanji2.csv
File diff suppressed because it is too large
View File


+ 97
- 0
katakana.csv View File

@@ -0,0 +1,97 @@
Char,Unicode (hex),Decimal,LearningOrder
゠,U+30a0,12448,
ァ,U+30a1,12449,
ア,U+30a2,12450,
ィ,U+30a3,12451,
イ,U+30a4,12452,
ゥ,U+30a5,12453,
ウ,U+30a6,12454,
ェ,U+30a7,12455,
エ,U+30a8,12456,
ォ,U+30a9,12457,
オ,U+30aa,12458,
カ,U+30ab,12459,
ガ,U+30ac,12460,
キ,U+30ad,12461,
ギ,U+30ae,12462,
ク,U+30af,12463,
グ,U+30b0,12464,
ケ,U+30b1,12465,
ゲ,U+30b2,12466,
コ,U+30b3,12467,
ゴ,U+30b4,12468,
サ,U+30b5,12469,
ザ,U+30b6,12470,
シ,U+30b7,12471,
ジ,U+30b8,12472,
ス,U+30b9,12473,
ズ,U+30ba,12474,
セ,U+30bb,12475,
ゼ,U+30bc,12476,
ソ,U+30bd,12477,
ゾ,U+30be,12478,
タ,U+30bf,12479,
ダ,U+30c0,12480,
チ,U+30c1,12481,
ヂ,U+30c2,12482,
ッ,U+30c3,12483,
ツ,U+30c4,12484,
ヅ,U+30c5,12485,
テ,U+30c6,12486,
デ,U+30c7,12487,
ト,U+30c8,12488,
ド,U+30c9,12489,
ナ,U+30ca,12490,
ニ,U+30cb,12491,
ヌ,U+30cc,12492,
ネ,U+30cd,12493,
ノ,U+30ce,12494,
ハ,U+30cf,12495,
バ,U+30d0,12496,
パ,U+30d1,12497,
ヒ,U+30d2,12498,
ビ,U+30d3,12499,
ピ,U+30d4,12500,
フ,U+30d5,12501,
ブ,U+30d6,12502,
プ,U+30d7,12503,
ヘ,U+30d8,12504,
ベ,U+30d9,12505,
ペ,U+30da,12506,
ホ,U+30db,12507,
ボ,U+30dc,12508,
ポ,U+30dd,12509,
マ,U+30de,12510,
ミ,U+30df,12511,
ム,U+30e0,12512,
メ,U+30e1,12513,
モ,U+30e2,12514,
ャ,U+30e3,12515,
ヤ,U+30e4,12516,
ュ,U+30e5,12517,
ユ,U+30e6,12518,
ョ,U+30e7,12519,
ヨ,U+30e8,12520,
ラ,U+30e9,12521,
リ,U+30ea,12522,
ル,U+30eb,12523,
レ,U+30ec,12524,
ロ,U+30ed,12525,
ヮ,U+30ee,12526,
ワ,U+30ef,12527,
ヰ,U+30f0,12528,
ヱ,U+30f1,12529,
ヲ,U+30f2,12530,
ン,U+30f3,12531,
ヴ,U+30f4,12532,
ヵ,U+30f5,12533,
ヶ,U+30f6,12534,
ヷ,U+30f7,12535,
ヸ,U+30f8,12536,
ヹ,U+30f9,12537,
ヺ,U+30fa,12538,
・,U+30fb,12539,
ー,U+30fc,12540,
ヽ,U+30fd,12541,
ヾ,U+30fe,12542,
ヿ,U+30ff,12543,

+ 5
- 0
sample-text.txt View File

@@ -0,0 +1,5 @@
会市屋詩

ヷムペヺ

ぅぴ

Loading…
Cancel
Save