You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

94 lines
2.7 KiB
Python

from Alphabet import Alphabet
class Detector:
    """Classify the characters of a text by the alphabets they occur in.

    Alphabets are registered with :meth:`add_alphabet`. :meth:`match_text`
    then reports, per registered alphabet, which characters of a text are
    listed in it, and :meth:`pretty_result` renders that report as text.
    """

    def __init__(self):
        # Registered alphabets, in registration order. Starts as an empty
        # list so match_text() also works before any alphabet was added.
        self._alphs = []

    def add_alphabet(self, alph: "Alphabet"):
        """
        Register an alphabet to match against.

        :param alph: object exposing ``get_name()`` and ``get_data()``;
            every entry yielded by ``get_data()`` is indexed with ``'Char'``.
        """
        self._alphs.append(alph)

    def match_text(self, text: str) -> dict:
        """
        Determine which characters of a text belong to the registered alphabets.

        :param text: text to analyse; newline characters are skipped.
        :return: dict mapping every alphabet name, plus ``'Unknown'``, to a
            two-element list ``[matches, ratio]``. ``matches`` is the list of
            characters found in that alphabet (``'Unknown'`` collects the
            characters found in none). ``ratio`` is
            ``len(matches) / len(text)``, or ``0.0`` for an empty text.
        """
        results = {}
        tables = []
        for alph in self._alphs:
            name = alph.get_name()
            results[name] = [[]]  # [matches]; the ratio is appended below
            # Count how often each character is listed in this alphabet, so
            # that every text character needs one lookup instead of a scan
            # over all entries. A character listed k times is still recorded
            # k times per occurrence, exactly like the entry-by-entry scan.
            occurrences = {}
            for entry in alph.get_data():
                char = entry['Char']
                occurrences[char] = occurrences.get(char, 0) + 1
            tables.append((name, occurrences))
        results['Unknown'] = [[]]

        # Resolve the target lists only after every key has been created, so
        # alphabets sharing a name (or named 'Unknown') share one list.
        buckets = [(results[name][0], occurrences) for name, occurrences in tables]
        unknown = results['Unknown'][0]

        for c in text:
            if c == "\n":  # newlines are not classified
                continue
            found = False
            for bucket, occurrences in buckets:
                hits = occurrences.get(c, 0)
                if hits:
                    bucket.extend([c] * hits)
                    found = True
            if not found:
                unknown.append(c)

        # Create statistics. The denominator is the full text length
        # (skipped newlines included); an empty text yields 0.0 instead of
        # dividing by zero.
        total = len(text)
        for entry in results.values():
            entry.append(len(entry[0]) / total if total else 0.0)
        return results

    @staticmethod
    def pretty_result(result: dict, small: bool=False) -> str:
        """
        Render a :meth:`match_text` result as aligned text, one line per key.

        The ``result`` argument is only read; its match lists are not modified.

        :param result: dict mapping a name to ``[matches, ratio]``.
        :param small: when true, print only the name and the percentage;
            otherwise also print the matched characters.
        :return: the formatted report (every line ends with a newline), or
            an empty string when ``result`` is empty.
        """
        if not result:
            return ""

        # The name column is the longest "key: " plus one padding space.
        name_width = max(len(key) for key in result) + 3
        # The matches column holds one more slot than the longest match list.
        match_slots = max(len(entry[0]) for entry in result.values()) + 1

        lines = []
        for key, entry in result.items():
            name = (key + ": ").ljust(name_width)
            percentage = str(round(entry[1] * 100, 2))
            if small:
                lines.append(name + percentage + "\n")
            else:
                found = entry[0]
                # Pad with one space per missing slot instead of appending
                # to the caller's list.
                matches = "".join(found) + " " * (match_slots - len(found))
                lines.append(name + "Matches: " + matches + " MatchPercent: " + percentage + "\n")
        return "".join(lines)
if __name__ == "__main__":
    # Demo run: load the alphabets, print how many entries each one has,
    # then report which share of the sample text falls into each of them.
    alphabets = [
        Alphabet("Kanji (small)", "kanji.csv"),
        Alphabet("Kanji (big)", "kanji2.csv"),
        Alphabet("Hiragana", "hiragana.csv"),
        Alphabet("Katakana", "katakana.csv"),
    ]
    for alphabet in alphabets:
        print(alphabet.get_name(), len(alphabet.get_data()))
    print()

    d = Detector()
    for alphabet in alphabets:
        d.add_alphabet(alphabet)

    # Read the sample inside a `with` block so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    # NOTE(review): the platform default encoding is kept; the sample
    # presumably holds Japanese text, so confirm its encoding and pass it
    # explicitly.
    with open("./sample-text.txt", 'r') as sample:
        text = sample.read()

    print("Matches: \n{0}".format(d.pretty_result(d.match_text(text), small=True)))