Basic Working Version
Rudimentary version, controlled with in-code statements. Fetched the contents of kanji.csv from www.tonypottier.info and converted them into a CSV. The other CSVs are based on Unicode ranges from Wikipedia.
master
							parent
							
								
									a32d18c6d9
								
							
						
					
					
						commit
						c94a2ea418
					
				| @ -0,0 +1,186 @@ | ||||
| from bs4 import BeautifulSoup | ||||
| 
 | ||||
| import csv | ||||
| import os | ||||
| 
 | ||||
| 
 | ||||
class Alphabet:
    """
    A named collection of characters, stored as one dict per character.

    Entries built by this class (``generate_from_numbers``, ``append_entry``)
    carry the keys 'Char', 'Unicode (hex)', 'Decimal' and 'Learning Order'.
    Entries imported from a file carry whatever column names the first row of
    that file declares, and all of their values are strings.
    """

    def __init__(self, name: str, file_path: str = "", data: list = None):
        """
        Create an alphabet from ready-made data, from a file, or empty.

        :param name: display name of the alphabet
        :param file_path: optional path to a '.csv' or '.html' file to import
        :param data: optional list of entry dicts; takes precedence over the file
        :raises FileNotFoundError: if file_path is given but is not a file
        :raises ValueError: if the file extension is neither '.csv' nor '.html'
        """
        if file_path != "" and not os.path.isfile(file_path):
            raise FileNotFoundError("File has not been found")

        self._name = name
        self._file_path = file_path
        self._parsed_html = None
        self._legend = None
        self._stream = None

        if data is not None:
            self._data = data
            return

        if file_path == "":
            self._data = []
            print("Warning: Empty Alphabet!")
            return

        # Validate the extension before touching the file so that nothing is
        # opened (and left open) for an unsupported format.
        file_extension = os.path.splitext(file_path)[1]
        if file_extension == '.html':
            importer = self.import_from_html
        elif file_extension == '.csv':
            importer = self.import_from_csv
        else:
            raise ValueError("Unknown file extension!")

        # The csv module asks for newline=""; HTML keeps the default handling.
        newline = "" if file_extension == '.csv' else None
        # Explicit UTF-8: the data is Japanese text and must not depend on the
        # platform's locale encoding.
        with open(file_path, "r", encoding="utf-8", newline=newline) as stream:
            self._stream = stream
            self._data = importer()

    def __len__(self):
        """Return the number of entries."""
        return len(self._data)

    def get_data(self) -> list:
        """Return the list of entry dicts (the live list, not a copy)."""
        return self._data

    def get_name(self) -> str:
        """Return the display name of the alphabet."""
        return self._name

    def get_unique(self):
        """
        Build a new alphabet that contains every character only once.

        Uses the Unicode number ('Decimal') for comparison. The value is
        converted with int() because entries imported from CSV hold it as a
        string. The first occurrence of each number keeps its position.

        :return: a new Alphabet with the same name and no duplicates
        """
        numbers = dict.fromkeys(int(entry['Decimal']) for entry in self._data)
        return Alphabet.generate_from_numbers(self._name, list(numbers))

    @staticmethod
    def _make_entry(number: int) -> dict:
        """Build the entry dict for a single Unicode code point."""
        # NOTE(review): the CSV header visible in this commit spells the last
        # column 'LearningOrder' (no space) - confirm which spelling is meant.
        return {
            'Char': chr(number),
            'Unicode (hex)': "U+" + hex(number)[2:],
            'Decimal': number,
            'Learning Order': None,
        }

    @staticmethod
    def generate_from_numbers(name: str, numbers: list):
        """
        Generate alphabet from a list of unicode numbers

        :param name: display name of the new alphabet
        :param numbers: integer code points, one entry is created per item
        :return: the new Alphabet
        """
        data = [Alphabet._make_entry(n) for n in numbers]
        return Alphabet(name=name, data=data)

    def import_from_csv(self) -> list:
        """
        Import file from csv

        Reads from the stream opened by the constructor. The first row becomes
        the legend (column names); every further row becomes one dict. An
        empty file yields an empty list. Cells beyond the legend are ignored.

        :return: list of entry dicts
        """
        reader = csv.reader(self._stream)
        self._legend = next(reader, None)
        if self._legend is None:
            self._legend = []
            return []

        return [dict(zip(self._legend, row)) for row in reader]

    def import_from_html(self) -> list:
        """
        Import file as html table of characters

        Reads from the stream opened by the constructor. The 'td' cells of the
        first 'tr' row become the legend; every further row becomes one dict.
        A document without any 'tr' yields an empty list.

        :return: list of entry dicts
        """
        html = self._stream.read()

        self._parsed_html = BeautifulSoup(html, features="html.parser")
        rows = self._parsed_html.find_all('tr')
        if not rows:
            self._legend = []
            return []

        self._legend = [cell.string for cell in rows[0].find_all('td')]

        data = []
        for row in rows[1:]:
            cells = row.find_all('td')
            data.append({key: cell.string
                         for key, cell in zip(self._legend, cells)})

        return data

    def export_csv(self, file_path: str):
        """
        Write the alphabet to a csv file.

        The extension of file_path is replaced by '.csv' if it is anything
        else. The header row is taken from the first entry (or from the
        imported legend when there are no entries); every entry is written in
        that column order.

        :param file_path: destination path
        :raises ValueError: if the destination is the file this alphabet was
            loaded from (checked before and after the extension is fixed)
        """
        file_name, file_extension = os.path.splitext(file_path)
        target = file_path if file_extension == '.csv' else file_name + '.csv'

        # A real exception instead of assert: asserts vanish under "python -O".
        if self._file_path in (file_path, target):
            raise ValueError("Refusing to overwrite the source file")

        entries = self.get_data()
        if entries:
            header = list(entries[0].keys())
        else:
            header = list(self._legend or [])

        with open(target, 'w', encoding="utf-8", newline="") as stream:
            writer = csv.writer(stream)
            if header:
                writer.writerow(header)
            writer.writerows([entry.get(key) for key in header]
                             for entry in entries)

    def append_entry(self, unicode_num: int):
        """
        Append the character with the given code point and report it on stdout.

        :param unicode_num: integer Unicode code point
        """
        uni_hex = hex(unicode_num)
        char = chr(unicode_num)

        print("Appending Chr:{0} Num:{1} Hex:{2} to {3}".format(char, unicode_num, uni_hex, self._name))

        self._data.append(Alphabet._make_entry(unicode_num))

    def append_entry_hex(self, uni_hex: str):
        """
        Append a character given as text.

        :param uni_hex: either a prefixed literal understood by int(x, 0),
            such as '0x3042', or the 'U+3042' notation this class writes
        """
        if uni_hex[:2].upper() == "U+":
            self.append_entry(int(uni_hex[2:], 16))
        else:
            self.append_entry(int(uni_hex, 0))
| 
 | ||||
| 
 | ||||
if __name__ == "__main__":
    # Load the three alphabets that ship as csv files and report their sizes.
    kanji, hiragana, katakana = [
        Alphabet(label, source)
        for label, source in (("Kanji", "kanji.csv"),
                              ("Hiragana", "hiragana.csv"),
                              ("Katakana", "katakana.csv"))
    ]
    sizes = [len(alphabet.get_data()) for alphabet in (kanji, hiragana, katakana)]
    print("Kanji:{0} Hiragana:{1} Katakana:{2}".format(*sizes))

    # Generate kanji2 from two code point ranges.
    # NOTE(review): range() excludes its end value, so 0x9fa0 and 0x4dbf
    # themselves are not generated - confirm that this is intended.
    n = list(range(0x4e00, 0x9fa0))
    n += list(range(0x3400, 0x4dbf))
    n = list(set(n))

    k2 = Alphabet.generate_from_numbers("Kanji2", n)
    print("Kanji2: {0}".format(len(k2)))
    # Export is switched off; enable the next line to write the csv file.
    # k2.export_csv("kanji2.csv")
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| @ -0,0 +1,93 @@ | ||||
| from Alphabet import Alphabet | ||||
| 
 | ||||
| 
 | ||||
class Detector:
    """
    Sorts the characters of a text by the alphabets they belong to.

    Alphabets are registered with ``add_alphabet``; each must offer
    ``get_name()`` and ``get_data()``, where the data is a list of dicts with
    a 'Char' key.
    """

    def __init__(self):
        # Start with an empty list so match_text works before any alphabet
        # has been registered.
        self._alphs = []

    def add_alphabet(self, alph: "Alphabet"):
        """
        Register an alphabet to match against.

        :param alph: the alphabet to add
        """
        if self._alphs is None:
            self._alphs = []
        self._alphs.append(alph)

    def match_text(self, text: str) -> dict:
        """
        Determine if a given text uses some characters from any alphabet

        Newline characters are skipped and do not count towards the total.
        A character is listed once under every alphabet that contains it;
        characters found in no alphabet are listed under 'Unknown'.

        :param text: the text to analyse
        :return: dict mapping each alphabet name (plus 'Unknown') to
            ``[matches, share]``, where matches is the list of matched
            characters in text order and share is
            ``len(matches) / number of non-newline characters``
            (0.0 for a text without such characters)
        """
        alphabets = self._alphs or []

        results = {alph.get_name(): [[]] for alph in alphabets}  # [matches], percentage
        results['Unknown'] = [[]]

        # Build one lookup set per alphabet up front instead of scanning every
        # entry for every character of the text.
        lookups = [(alph.get_name(), {entry['Char'] for entry in alph.get_data()})
                   for alph in alphabets]

        considered = 0
        for c in text:
            if c == "\n":  # ignoring newlines
                continue
            considered += 1

            found = False
            for name, chars in lookups:
                if c in chars:
                    results[name][0].append(c)
                    found = True

            if not found:
                results['Unknown'][0].append(c)

        # Create statistics
        for stats in results.values():
            share = len(stats[0]) / float(considered) if considered else 0.0
            stats.append(share)
        return results

    @staticmethod
    def pretty_result(result: dict, small: bool=False):
        """
        Format the output of match_text as aligned text, one line per key.

        The given result is not modified.

        :param result: dict as returned by match_text
        :param small: if True only name and percentage are printed, otherwise
            the matched characters are included as well
        :return: the formatted string, empty for an empty result
        """
        if not result:
            return ""

        longest_name = max(len(key) for key in result)
        longest_match = max(len(m[0]) for m in result.values())

        lines = []
        for key, stats in result.items():
            # Name column: "<key>: " padded to one space past the longest name.
            name = (key + ": ").ljust(longest_name + 3)
            percentage = round(stats[1] * 100, 2)

            if small:
                lines.append(name + str(percentage) + "\n")
                continue

            # Match column: padded to one entry past the longest match list.
            padding = " " * (longest_match + 1 - len(stats[0]))
            matches = "".join(stats[0]) + padding
            lines.append(name + "Matches: " + matches + " MatchPercent: " + str(percentage) + "\n")

        return "".join(lines)
| 
 | ||||
| 
 | ||||
if __name__ == "__main__":
    # Load every alphabet and print its name and size.
    kanji = Alphabet("Kanji (small)", "kanji.csv")
    kanji2 = Alphabet("Kanji (big)", "kanji2.csv")
    hiragana = Alphabet("Hiragana", "hiragana.csv")
    katakana = Alphabet("Katakana", "katakana.csv")
    print(kanji.get_name(), len(kanji.get_data()))
    print(kanji2.get_name(), len(kanji2.get_data()))
    print(hiragana.get_name(), len(hiragana.get_data()))
    print(katakana.get_name(), len(katakana.get_data()))
    print()

    d = Detector()
    d.add_alphabet(kanji)
    d.add_alphabet(kanji2)
    d.add_alphabet(hiragana)
    d.add_alphabet(katakana)

    # Read the sample in one go; the context manager closes the file and the
    # explicit encoding keeps the Japanese text independent of the locale.
    with open("./sample-text.txt", 'r', encoding="utf-8") as sample:
        text = sample.read()

    print("Matches: \n{0}".format(d.pretty_result(d.match_text(text), small=True)))
| @ -0,0 +1,90 @@ | ||||
| Char,Unicode (hex),Decimal,LearningOrder | ||||
| ぁ,U+3041,12353, | ||||
| あ,U+3042,12354, | ||||
| ぃ,U+3043,12355, | ||||
| い,U+3044,12356, | ||||
| ぅ,U+3045,12357, | ||||
| う,U+3046,12358, | ||||
| ぇ,U+3047,12359, | ||||
| え,U+3048,12360, | ||||
| ぉ,U+3049,12361, | ||||
| お,U+304a,12362, | ||||
| か,U+304b,12363, | ||||
| が,U+304c,12364, | ||||
| き,U+304d,12365, | ||||
| ぎ,U+304e,12366, | ||||
| く,U+304f,12367, | ||||
| ぐ,U+3050,12368, | ||||
| け,U+3051,12369, | ||||
| げ,U+3052,12370, | ||||
| こ,U+3053,12371, | ||||
| ご,U+3054,12372, | ||||
| さ,U+3055,12373, | ||||
| ざ,U+3056,12374, | ||||
| し,U+3057,12375, | ||||
| す,U+3059,12377, | ||||
| ず,U+305a,12378, | ||||
| せ,U+305b,12379, | ||||
| ぜ,U+305c,12380, | ||||
| そ,U+305d,12381, | ||||
| ぞ,U+305e,12382, | ||||
| た,U+305f,12383, | ||||
| だ,U+3060,12384, | ||||
| ち,U+3061,12385, | ||||
| ぢ,U+3062,12386, | ||||
| っ,U+3063,12387, | ||||
| つ,U+3064,12388, | ||||
| づ,U+3065,12389, | ||||
| て,U+3066,12390, | ||||
| で,U+3067,12391, | ||||
| ど,U+3069,12393, | ||||
| な,U+306a,12394, | ||||
| に,U+306b,12395, | ||||
| ぬ,U+306c,12396, | ||||
| ね,U+306d,12397, | ||||
| の,U+306e,12398, | ||||
| は,U+306f,12399, | ||||
| ば,U+3070,12400, | ||||
| ぱ,U+3071,12401, | ||||
| ひ,U+3072,12402, | ||||
| び,U+3073,12403, | ||||
| ぴ,U+3074,12404, | ||||
| ふ,U+3075,12405, | ||||
| ぶ,U+3076,12406, | ||||
| ぷ,U+3077,12407, | ||||
| べ,U+3079,12409, | ||||
| ぺ,U+307a,12410, | ||||
| ほ,U+307b,12411, | ||||
| ぼ,U+307c,12412, | ||||
| ぽ,U+307d,12413, | ||||
| ま,U+307e,12414, | ||||
| み,U+307f,12415, | ||||
| む,U+3080,12416, | ||||
| め,U+3081,12417, | ||||
| も,U+3082,12418, | ||||
| ゃ,U+3083,12419, | ||||
| や,U+3084,12420, | ||||
| ゅ,U+3085,12421, | ||||
| ゆ,U+3086,12422, | ||||
| ょ,U+3087,12423, | ||||
| ら,U+3089,12425, | ||||
| り,U+308a,12426, | ||||
| る,U+308b,12427, | ||||
| れ,U+308c,12428, | ||||
| ろ,U+308d,12429, | ||||
| ゎ,U+308e,12430, | ||||
| わ,U+308f,12431, | ||||
| ゐ,U+3090,12432, | ||||
| ゑ,U+3091,12433, | ||||
| を,U+3092,12434, | ||||
| ん,U+3093,12435, | ||||
| ゔ,U+3094,12436, | ||||
| ゕ,U+3095,12437, | ||||
| ゖ,U+3096,12438, | ||||
| ゙,U+3099,12441, | ||||
| ゚,U+309a,12442, | ||||
| ゛,U+309b,12443, | ||||
| ゜,U+309c,12444, | ||||
| ゝ,U+309d,12445, | ||||
| ゞ,U+309e,12446, | ||||
| ゟ,U+309f,12447, | ||||
| 
 | 
											
												
													File diff suppressed because it is too large
													Load Diff
												
											
										
									
								| 
 | 
| @ -0,0 +1,5 @@ | ||||
| 会市屋詩 | ||||
| 
 | ||||
| ヷムペヺ | ||||
| 
 | ||||
| ぅぴ | ||||
					Loading…
					
					
				
		Reference in New Issue